diff --git a/app/graphql/types/detectors_type.rb b/app/graphql/types/detectors_type.rb index 02f2f6a..d312776 100644 --- a/app/graphql/types/detectors_type.rb +++ b/app/graphql/types/detectors_type.rb @@ -16,11 +16,11 @@ def journals end def lcsh - Detector::Lcsh.new(@object).identifiers.map(&:last) + Detector::Lcsh.new(@object).detections.map(&:last) end def standard_identifiers - Detector::StandardIdentifiers.new(@object).identifiers.map do |identifier| + Detector::StandardIdentifiers.new(@object).detections.map do |identifier| { kind: identifier.first, value: identifier.last } end end diff --git a/app/models/detector/lcsh.rb b/app/models/detector/lcsh.rb index 0d97e53..0cf5121 100644 --- a/app/models/detector/lcsh.rb +++ b/app/models/detector/lcsh.rb @@ -4,13 +4,16 @@ class Detector # Detector::LCSH is a very rudimentary detector for the separator between levels of a Library of Congress Subject # Heading (LCSH). These subject headings follow this pattern: "Social security beneficiaries -- United States" class Lcsh - attr_reader :identifiers + attr_reader :detections + + # shared instance methods + include Detector::PatternChecker # For now the initialize method just needs to run the pattern checker. A space for future development would be to # write additional methods to look up the detected LCSH for more information, and to confirm that the phrase is # actually an LCSH. def initialize(term) - @identifiers = {} + @detections = {} term_pattern_checker(term) end @@ -25,7 +28,7 @@ def initialize(term) def self.record(term) results = Detector::Lcsh.new(term.phrase) - results.identifiers.each_key do + results.detections.each_key do Detection.find_or_create_by( term:, detector: Detector.where(name: 'LCSH').first, @@ -38,23 +41,10 @@ def self.record(term) private - def term_pattern_checker(term) - subject_patterns.each_pair do |type, pattern| - @identifiers[type.to_sym] = match(pattern, term) if match(pattern, term).present? - end - end - - # This implementation will only detect the first match of a pattern in a long string. For the separator pattern this - # is fine, as we only need to find one (and finding multiples wouldn't change the outcome). If a pattern does come - # along where match counts matter, this should be reconsidered. - def match(pattern, term) - pattern.match(term).to_s.strip - end - - # subject_patterns are regex patterns that can be applied to indicate whether a search string is looking for an LCSH + # term_patterns are regex patterns that can be applied to indicate whether a search string is looking for an LCSH # string. At the moment there is only one - for the separator character " -- " - but others might be possible if # there are regex-able vocabulary quirks which might separate subject values from non-subject values. - def subject_patterns + def term_patterns { separator: /(.*)\s--\s(.*)/ } diff --git a/app/models/detector/pattern_checker.rb b/app/models/detector/pattern_checker.rb new file mode 100644 index 0000000..d1dbedd --- /dev/null +++ b/app/models/detector/pattern_checker.rb @@ -0,0 +1,22 @@ +# frozen_string_literal: true + +class Detector + # PatternChecker is intended to be added to Detectors via `include Detector::PatternChecker` to make + # these methods available to instances of the class + module PatternChecker + def term_pattern_checker(term) + term_patterns.each_pair do |type, pattern| + @detections[type.to_sym] = match(pattern, term) if match(pattern, term).present? + end + end + + # Note on the limitations of this implementation + # We only detect the first match of each pattern, so a search of "1234-5678 5678-1234" will not return two ISSNs as + # might be expected, but just "1234-5678". Using ruby's string.scan(pattern) may be worthwhile if we want to detect + # all possible matches instead of just the first. That may require a larger refactor though as initial tests of doing + # that change did result in unintended results so it was backed out for now. + def match(pattern, term) + pattern.match(term).to_s.strip + end + end +end diff --git a/app/models/detector/standard_identifiers.rb b/app/models/detector/standard_identifiers.rb index f200473..3455309 100644 --- a/app/models/detector/standard_identifiers.rb +++ b/app/models/detector/standard_identifiers.rb @@ -4,14 +4,17 @@ class Detector # Detector::StandardIdentifiers detects the identifiers DOI, ISBN, ISSN, PMID. # See /docs/reference/pattern_detection_and_enhancement.md for details. class StandardIdentifiers - attr_reader :identifiers + attr_reader :detections def self.table_name_prefix 'detector_' end + # shared instance methods + include Detector::PatternChecker + def initialize(term) - @identifiers = {} + @detections = {} term_pattern_checker(term) strip_invalid_issns end @@ -27,7 +30,7 @@ def initialize(term) def self.record(term) si = Detector::StandardIdentifiers.new(term.phrase) - si.identifiers.each_key do |k| + si.detections.each_key do |k| Detection.find_or_create_by( term:, detector: Detector.where(name: k.to_s.upcase).first, @@ -40,21 +43,6 @@ def self.record(term) private - def term_pattern_checker(term) - term_patterns.each_pair do |type, pattern| - @identifiers[type.to_sym] = match(pattern, term) if match(pattern, term).present? - end - end - - # Note on the limitations of this implementation - # We only detect the first match of each pattern, so a search of "1234-5678 5678-1234" will not return two ISSNs as - # might be expected, but just "1234-5678". Using ruby's string.scan(pattern) may be worthwhile if we want to detect - # all possible matches instead of just the first. That may require a larger refactor though as initial tests of doing - # that change did result in unintended results so it was backed out for now. - def match(pattern, term) - pattern.match(term).to_s.strip - end - # term_patterns are regex patterns to be applied to the basic search box input def term_patterns { @@ -66,9 +54,9 @@ def term_patterns end def strip_invalid_issns - return unless @identifiers[:issn] + return unless @detections[:issn] - @identifiers.delete(:issn) unless validate_issn(@identifiers[:issn]) + @detections.delete(:issn) unless validate_issn(@detections[:issn]) end # validate_issn is only called when the regex for an ISSN has indicated an ISSN diff --git a/app/models/metrics/algorithms.rb b/app/models/metrics/algorithms.rb index 0ccb421..387c95b 100644 --- a/app/models/metrics/algorithms.rb +++ b/app/models/metrics/algorithms.rb @@ -82,7 +82,7 @@ def event_matches(event, matches) suggested_resource_exact = process_suggested_resources(event, matches) lcshs = match_lcsh(event, matches) - matches[:unmatched] += 1 if ids.identifiers.blank? && lcshs.identifiers.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero? + matches[:unmatched] += 1 if ids.detections.blank? && lcshs.detections.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero? end # Checks for LCSH matches @@ -95,7 +95,7 @@ def match_lcsh(event, matches) ids = Detector::Lcsh.new(event.term.phrase) known_ids.each do |id| - matches[:lcsh] += 1 if ids.identifiers[id].present? + matches[:lcsh] += 1 if ids.detections[id].present? end ids end @@ -110,7 +110,7 @@ def match_standard_identifiers(event, matches) ids = Detector::StandardIdentifiers.new(event.term.phrase) known_ids.each do |id| - matches[id] += 1 if ids.identifiers[id].present? + matches[id] += 1 if ids.detections[id].present? end ids end diff --git a/test/models/detector/lcsh_test.rb b/test/models/detector/lcsh_test.rb index 3372601..6e6b1cd 100644 --- a/test/models/detector/lcsh_test.rb +++ b/test/models/detector/lcsh_test.rb @@ -11,7 +11,7 @@ class LcshTest < ActiveSupport::TestCase ] true_samples.each do |term| - actual = Detector::Lcsh.new(term).identifiers + actual = Detector::Lcsh.new(term).detections assert_includes(actual, :separator) end @@ -26,7 +26,7 @@ class LcshTest < ActiveSupport::TestCase ] false_samples.each do |term| - actual = Detector::Lcsh.new(term).identifiers + actual = Detector::Lcsh.new(term).detections assert_not_includes(actual, :separator) end diff --git a/test/models/detector/standard_identifiers_test.rb b/test/models/detector/standard_identifiers_test.rb index 11b2093..20e36bc 100644 --- a/test/models/detector/standard_identifiers_test.rb +++ b/test/models/detector/standard_identifiers_test.rb @@ -5,7 +5,7 @@ class Detector class StandardIdentifiersTest < ActiveSupport::TestCase test 'ISBN detected in a string' do - actual = Detector::StandardIdentifiers.new('test 978-3-16-148410-0 test').identifiers + actual = Detector::StandardIdentifiers.new('test 978-3-16-148410-0 test').detections assert_equal('978-3-16-148410-0', actual[:isbn]) end @@ -17,7 +17,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase '0-9752298-0-X'] samples.each do |isbn| - actual = Detector::StandardIdentifiers.new(isbn).identifiers + actual = Detector::StandardIdentifiers.new(isbn).detections assert_equal(isbn, actual[:isbn]) end @@ -29,7 +29,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase '978-0-943396-04-2', '979-0-9752298-0-X'] samples.each do |isbn| - actual = Detector::StandardIdentifiers.new(isbn).identifiers + actual = Detector::StandardIdentifiers.new(isbn).detections assert_equal(isbn, actual[:isbn]) end @@ -39,7 +39,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here'] samples.each do |notisbn| - actual = Detector::StandardIdentifiers.new(notisbn).identifiers + actual = Detector::StandardIdentifiers.new(notisbn).detections assert_nil(actual[:isbn]) end @@ -52,14 +52,14 @@ class StandardIdentifiersTest < ActiveSupport::TestCase # consider whether we care in the future as we look for incorrect real-world matches. samples.each do |notisbn| - actual = Detector::StandardIdentifiers.new(notisbn).identifiers + actual = Detector::StandardIdentifiers.new(notisbn).detections assert_nil(actual[:isbn]) end end test 'ISSNs detected in a string' do - actual = Detector::StandardIdentifiers.new('test 0250-6335 test').identifiers + actual = Detector::StandardIdentifiers.new('test 0250-6335 test').detections assert_equal('0250-6335', actual[:issn]) end @@ -68,7 +68,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase samples = %w[0250-6335 0000-0019 1864-0761 1877-959X 0973-7758 1877-5683 1440-172X 1040-5631] samples.each do |issn| - actual = Detector::StandardIdentifiers.new(issn).identifiers + actual = Detector::StandardIdentifiers.new(issn).detections assert_equal(issn, actual[:issn]) end @@ -78,14 +78,14 @@ class StandardIdentifiersTest < ActiveSupport::TestCase samples = ['orange cats like popcorn', '12346798', 'another ISSN not found here', '99921-58-10-7'] samples.each do |notissn| - actual = Detector::StandardIdentifiers.new(notissn).identifiers + actual = Detector::StandardIdentifiers.new(notissn).detections assert_nil(actual[:issn]) end end test 'ISSNs need boundaries' do - actual = Detector::StandardIdentifiers.new('12345-5678 1234-56789').identifiers + actual = Detector::StandardIdentifiers.new('12345-5678 1234-56789').detections assert_nil(actual[:issn]) end @@ -117,7 +117,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase 0250-633X ] samples.each do |notissn| - actual = Detector::StandardIdentifiers.new(notissn).identifiers + actual = Detector::StandardIdentifiers.new(notissn).detections assert_nil(actual[:issn]) end @@ -131,7 +131,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase 0973-7758 ] samples.each do |issn| - actual = Detector::StandardIdentifiers.new(issn).identifiers + actual = Detector::StandardIdentifiers.new(issn).detections assert_equal(issn, actual[:issn]) end @@ -139,7 +139,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase test 'doi detected in string' do actual = Detector::StandardIdentifiers.new('"Quantum tomography: Measured measurement", Markus Aspelmeyer, nature physics "\ - "January 2009, Volume 5, No 1, pp11-12; [ doi:10.1038/nphys1170 ]').identifiers + "January 2009, Volume 5, No 1, pp11-12; [ doi:10.1038/nphys1170 ]').detections assert_equal('10.1038/nphys1170', actual[:doi]) end @@ -149,7 +149,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase 10.1594/PANGAEA.667386 10.3207/2959859860 10.3866/PKU.WHXB201112303 10.1430/8105 10.1392/BC1.0] samples.each do |doi| - actual = Detector::StandardIdentifiers.new(doi).identifiers + actual = Detector::StandardIdentifiers.new(doi).detections assert_equal(doi, actual[:doi]) end @@ -159,14 +159,14 @@ class StandardIdentifiersTest < ActiveSupport::TestCase samples = ['orange cats like popcorn', '10.1234 almost doi', 'another doi not found here', '99921-58-10-7'] samples.each do |notdoi| - actual = Detector::StandardIdentifiers.new(notdoi).identifiers + actual = Detector::StandardIdentifiers.new(notdoi).detections assert_nil(actual[:notdoi]) end end test 'pmid detected in string' do - actual = Detector::StandardIdentifiers.new('Citation and stuff PMID: 35648703 more stuff.').identifiers + actual = Detector::StandardIdentifiers.new('Citation and stuff PMID: 35648703 more stuff.').detections assert_equal('PMID: 35648703', actual[:pmid]) end @@ -175,7 +175,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase samples = ['PMID: 35648703', 'pmid: 1234567', 'PMID:35648703'] samples.each do |pmid| - actual = Detector::StandardIdentifiers.new(pmid).identifiers + actual = Detector::StandardIdentifiers.new(pmid).detections assert_equal(pmid, actual[:pmid]) end @@ -185,7 +185,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase samples = ['orange cats like popcorn', 'pmid:almost', 'PMID: asdf', '99921-58-10-7'] samples.each do |notpmid| - actual = Detector::StandardIdentifiers.new(notpmid).identifiers + actual = Detector::StandardIdentifiers.new(notpmid).detections assert_nil(actual[:pmid]) end