Skip to content

Commit

Permalink
Merge pull request #120 from MITLibraries/common-detector-logic
Browse files Browse the repository at this point in the history
Common detector logic
  • Loading branch information
JPrevost authored Oct 15, 2024
2 parents 4225521 + 921e411 commit 93d3fbd
Show file tree
Hide file tree
Showing 13 changed files with 71 additions and 63 deletions.
4 changes: 2 additions & 2 deletions app/graphql/types/detectors_type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ def journals
end

def lcsh
Detector::Lcsh.new(@object).identifiers.map(&:last)
Detector::Lcsh.new(@object).detections.map(&:last)
end

def standard_identifiers
Detector::StandardIdentifiers.new(@object).identifiers.map do |identifier|
Detector::StandardIdentifiers.new(@object).detections.map do |identifier|
{ kind: identifier.first, value: identifier.last }
end
end
Expand Down
26 changes: 8 additions & 18 deletions app/models/detector/lcsh.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@ class Detector
# Detector::LCSH is a very rudimentary detector for the separator between levels of a Library of Congress Subject
# Heading (LCSH). These subject headings follow this pattern: "Social security beneficiaries -- United States"
class Lcsh
attr_reader :identifiers
attr_reader :detections

# shared instance methods
include Detector::PatternChecker

# For now the initialize method just needs to run the pattern checker. A space for future development would be to
# write additional methods to look up the detected LCSH for more information, and to confirm that the phrase is
# actually an LCSH.
def initialize(term)
@identifiers = {}
@detections = {}
term_pattern_checker(term)
end

Expand All @@ -25,7 +28,7 @@ def initialize(term)
def self.record(term)
results = Detector::Lcsh.new(term.phrase)

results.identifiers.each_key do
results.detections.each_key do
Detection.find_or_create_by(
term:,
detector: Detector.where(name: 'LCSH').first,
Expand All @@ -38,23 +41,10 @@ def self.record(term)

private

def term_pattern_checker(term)
subject_patterns.each_pair do |type, pattern|
@identifiers[type.to_sym] = match(pattern, term) if match(pattern, term).present?
end
end

# This implementation will only detect the first match of a pattern in a long string. For the separator pattern this
# is fine, as we only need to find one (and finding multiples wouldn't change the outcome). If a pattern does come
# along where match counts matter, this should be reconsidered.
def match(pattern, term)
pattern.match(term).to_s.strip
end

# subject_patterns are regex patterns that can be applied to indicate whether a search string is looking for an LCSH
# term_patterns are regex patterns that can be applied to indicate whether a search string is looking for an LCSH
# string. At the moment there is only one - for the separator character " -- " - but others might be possible if
# there are regex-able vocabulary quirks which might separate subject values from non-subject values.
def subject_patterns
def term_patterns
{
separator: /(.*)\s--\s(.*)/
}
Expand Down
22 changes: 22 additions & 0 deletions app/models/detector/pattern_checker.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# frozen_string_literal: true

class Detector
# PatternChecker is intended to be added to Detectors via `include Detector::PatternChecker` to make
# these methods available to instances of the class
module PatternChecker
def term_pattern_checker(term)
term_patterns.each_pair do |type, pattern|
@detections[type.to_sym] = match(pattern, term) if match(pattern, term).present?
end
end

# Note on the limitations of this implementation
# We only detect the first match of each pattern, so a search of "1234-5678 5678-1234" will not return two ISSNs as
# might be expected, but just "1234-5678". Using ruby's string.scan(pattern) may be worthwhile if we want to detect
# all possible matches instead of just the first. That may require a larger refactor though as initial tests of doing
# that change did result in unintended results so it was backed out for now.
def match(pattern, term)
pattern.match(term).to_s.strip
end
end
end
28 changes: 8 additions & 20 deletions app/models/detector/standard_identifiers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@ class Detector
# Detector::StandardIdentifiers detects the identifiers DOI, ISBN, ISSN, PMID.
# See /docs/reference/pattern_detection_and_enhancement.md for details.
class StandardIdentifiers
attr_reader :identifiers
attr_reader :detections

def self.table_name_prefix
'detector_'
end

# shared instance methods
include Detector::PatternChecker

def initialize(term)
@identifiers = {}
@detections = {}
term_pattern_checker(term)
strip_invalid_issns
end
Expand All @@ -27,7 +30,7 @@ def initialize(term)
def self.record(term)
si = Detector::StandardIdentifiers.new(term.phrase)

si.identifiers.each_key do |k|
si.detections.each_key do |k|
Detection.find_or_create_by(
term:,
detector: Detector.where(name: k.to_s.upcase).first,
Expand All @@ -40,21 +43,6 @@ def self.record(term)

private

def term_pattern_checker(term)
term_patterns.each_pair do |type, pattern|
@identifiers[type.to_sym] = match(pattern, term) if match(pattern, term).present?
end
end

# Note on the limitations of this implementation
# We only detect the first match of each pattern, so a search of "1234-5678 5678-1234" will not return two ISSNs as
# might be expected, but just "1234-5678". Using ruby's string.scan(pattern) may be worthwhile if we want to detect
# all possible matches instead of just the first. That may require a larger refactor though as initial tests of doing
# that change did result in unintended results so it was backed out for now.
def match(pattern, term)
pattern.match(term).to_s.strip
end

# term_patterns are regex patterns to be applied to the basic search box input
def term_patterns
{
Expand All @@ -66,9 +54,9 @@ def term_patterns
end

def strip_invalid_issns
return unless @identifiers[:issn]
return unless @detections[:issn]

@identifiers.delete(:issn) unless validate_issn(@identifiers[:issn])
@detections.delete(:issn) unless validate_issn(@detections[:issn])
end

# validate_issn is only called when the regex for an ISSN has indicated an ISSN
Expand Down
6 changes: 3 additions & 3 deletions app/models/metrics/algorithms.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def event_matches(event, matches)
suggested_resource_exact = process_suggested_resources(event, matches)
lcshs = match_lcsh(event, matches)

matches[:unmatched] += 1 if ids.identifiers.blank? && lcshs.identifiers.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero?
matches[:unmatched] += 1 if ids.detections.blank? && lcshs.detections.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero?
end

# Checks for LCSH matches
Expand All @@ -95,7 +95,7 @@ def match_lcsh(event, matches)
ids = Detector::Lcsh.new(event.term.phrase)

known_ids.each do |id|
matches[:lcsh] += 1 if ids.identifiers[id].present?
matches[:lcsh] += 1 if ids.detections[id].present?
end
ids
end
Expand All @@ -110,7 +110,7 @@ def match_standard_identifiers(event, matches)
ids = Detector::StandardIdentifiers.new(event.term.phrase)

known_ids.each do |id|
matches[id] += 1 if ids.identifiers[id].present?
matches[id] += 1 if ids.detections[id].present?
end
ids
end
Expand Down
2 changes: 1 addition & 1 deletion test/controllers/graphql_controller_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest
json = response.parsed_body

assert_equal('nature', json['data']['logSearchEvent']['detectors']['journals'].first['title'])
assert_equal({"issns"=>["0028-0836", "1476-4687"]},
assert_equal({ 'issns' => %w[0028-0836 1476-4687] },
json['data']['logSearchEvent']['detectors']['journals'].first['additionalInfo'])
end

Expand Down
2 changes: 2 additions & 0 deletions test/controllers/report_controller_test.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require 'test_helper'

class ReportControllerTest < ActionDispatch::IntegrationTest
Expand Down
2 changes: 2 additions & 0 deletions test/controllers/static_controller_test.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require 'test_helper'

class StaticControllerTest < ActionDispatch::IntegrationTest
Expand Down
2 changes: 2 additions & 0 deletions test/helpers/metrics_helper_test.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require 'test_helper'

class MetricsHelperTest < ActionView::TestCase
Expand Down
1 change: 1 addition & 0 deletions test/models/category_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class CategoryTest < ActiveSupport::TestCase
test 'duplicate Categories are not allowed' do
initial_count = Category.count
Category.create!(name: 'Example')

assert_equal(initial_count + 1, Category.count)

assert_raises(ActiveRecord::RecordNotUnique) do
Expand Down
4 changes: 2 additions & 2 deletions test/models/detector/lcsh_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class LcshTest < ActiveSupport::TestCase
]

true_samples.each do |term|
actual = Detector::Lcsh.new(term).identifiers
actual = Detector::Lcsh.new(term).detections

assert_includes(actual, :separator)
end
Expand All @@ -26,7 +26,7 @@ class LcshTest < ActiveSupport::TestCase
]

false_samples.each do |term|
actual = Detector::Lcsh.new(term).identifiers
actual = Detector::Lcsh.new(term).detections

assert_not_includes(actual, :separator)
end
Expand Down
34 changes: 17 additions & 17 deletions test/models/detector/standard_identifiers_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
class Detector
class StandardIdentifiersTest < ActiveSupport::TestCase
test 'ISBN detected in a string' do
actual = Detector::StandardIdentifiers.new('test 978-3-16-148410-0 test').identifiers
actual = Detector::StandardIdentifiers.new('test 978-3-16-148410-0 test').detections

assert_equal('978-3-16-148410-0', actual[:isbn])
end
Expand All @@ -17,7 +17,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
'0-9752298-0-X']

samples.each do |isbn|
actual = Detector::StandardIdentifiers.new(isbn).identifiers
actual = Detector::StandardIdentifiers.new(isbn).detections

assert_equal(isbn, actual[:isbn])
end
Expand All @@ -29,7 +29,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
'978-0-943396-04-2', '979-0-9752298-0-X']

samples.each do |isbn|
actual = Detector::StandardIdentifiers.new(isbn).identifiers
actual = Detector::StandardIdentifiers.new(isbn).detections

assert_equal(isbn, actual[:isbn])
end
Expand All @@ -39,7 +39,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here']

samples.each do |notisbn|
actual = Detector::StandardIdentifiers.new(notisbn).identifiers
actual = Detector::StandardIdentifiers.new(notisbn).detections

assert_nil(actual[:isbn])
end
Expand All @@ -52,14 +52,14 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
# consider whether we care in the future as we look for incorrect real-world matches.

samples.each do |notisbn|
actual = Detector::StandardIdentifiers.new(notisbn).identifiers
actual = Detector::StandardIdentifiers.new(notisbn).detections

assert_nil(actual[:isbn])
end
end

test 'ISSNs detected in a string' do
actual = Detector::StandardIdentifiers.new('test 0250-6335 test').identifiers
actual = Detector::StandardIdentifiers.new('test 0250-6335 test').detections

assert_equal('0250-6335', actual[:issn])
end
Expand All @@ -68,7 +68,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
samples = %w[0250-6335 0000-0019 1864-0761 1877-959X 0973-7758 1877-5683 1440-172X 1040-5631]

samples.each do |issn|
actual = Detector::StandardIdentifiers.new(issn).identifiers
actual = Detector::StandardIdentifiers.new(issn).detections

assert_equal(issn, actual[:issn])
end
Expand All @@ -78,14 +78,14 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
samples = ['orange cats like popcorn', '12346798', 'another ISSN not found here', '99921-58-10-7']

samples.each do |notissn|
actual = Detector::StandardIdentifiers.new(notissn).identifiers
actual = Detector::StandardIdentifiers.new(notissn).detections

assert_nil(actual[:issn])
end
end

test 'ISSNs need boundaries' do
actual = Detector::StandardIdentifiers.new('12345-5678 1234-56789').identifiers
actual = Detector::StandardIdentifiers.new('12345-5678 1234-56789').detections

assert_nil(actual[:issn])
end
Expand Down Expand Up @@ -117,7 +117,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
0250-633X
]
samples.each do |notissn|
actual = Detector::StandardIdentifiers.new(notissn).identifiers
actual = Detector::StandardIdentifiers.new(notissn).detections

assert_nil(actual[:issn])
end
Expand All @@ -131,15 +131,15 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
0973-7758
]
samples.each do |issn|
actual = Detector::StandardIdentifiers.new(issn).identifiers
actual = Detector::StandardIdentifiers.new(issn).detections

assert_equal(issn, actual[:issn])
end
end

test 'doi detected in string' do
actual = Detector::StandardIdentifiers.new('"Quantum tomography: Measured measurement", Markus Aspelmeyer, nature physics "\
"January 2009, Volume 5, No 1, pp11-12; [ doi:10.1038/nphys1170 ]').identifiers
"January 2009, Volume 5, No 1, pp11-12; [ doi:10.1038/nphys1170 ]').detections

assert_equal('10.1038/nphys1170', actual[:doi])
end
Expand All @@ -149,7 +149,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
10.1594/PANGAEA.667386 10.3207/2959859860 10.3866/PKU.WHXB201112303 10.1430/8105 10.1392/BC1.0]

samples.each do |doi|
actual = Detector::StandardIdentifiers.new(doi).identifiers
actual = Detector::StandardIdentifiers.new(doi).detections

assert_equal(doi, actual[:doi])
end
Expand All @@ -159,14 +159,14 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
samples = ['orange cats like popcorn', '10.1234 almost doi', 'another doi not found here', '99921-58-10-7']

samples.each do |notdoi|
actual = Detector::StandardIdentifiers.new(notdoi).identifiers
actual = Detector::StandardIdentifiers.new(notdoi).detections

assert_nil(actual[:notdoi])
end
end

test 'pmid detected in string' do
actual = Detector::StandardIdentifiers.new('Citation and stuff PMID: 35648703 more stuff.').identifiers
actual = Detector::StandardIdentifiers.new('Citation and stuff PMID: 35648703 more stuff.').detections

assert_equal('PMID: 35648703', actual[:pmid])
end
Expand All @@ -175,7 +175,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
samples = ['PMID: 35648703', 'pmid: 1234567', 'PMID:35648703']

samples.each do |pmid|
actual = Detector::StandardIdentifiers.new(pmid).identifiers
actual = Detector::StandardIdentifiers.new(pmid).detections

assert_equal(pmid, actual[:pmid])
end
Expand All @@ -185,7 +185,7 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
samples = ['orange cats like popcorn', 'pmid:almost', 'PMID: asdf', '99921-58-10-7']

samples.each do |notpmid|
actual = Detector::StandardIdentifiers.new(notpmid).identifiers
actual = Detector::StandardIdentifiers.new(notpmid).detections

assert_nil(actual[:pmid])
end
Expand Down
1 change: 1 addition & 0 deletions test/models/detector_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class DetectorTest < ActiveSupport::TestCase
test 'duplicate Detectors are not allowed' do
detector_count = Detector.count
Detector.create!(name: 'Example')

assert_equal(detector_count + 1, Detector.count)

assert_raises(ActiveRecord::RecordNotUnique) do
Expand Down

0 comments on commit 93d3fbd

Please sign in to comment.