Skip to content

Commit

Permalink
Move StandardIdentifiers to Detectors module
Browse files Browse the repository at this point in the history
Why these changes are being introduced:

Since we created the `StandardIdentifiers` model, we started
namespacing our detection models.

Relevant ticket(s):

Tangentially related to [TCO-50](https://mitlibraries.atlassian.net/browse/TCO-50)

How this addresses that need:

This adds the `Detectors` namespace to `StandardIdentifiers` to
match our other detection models (`Journal` and `SuggestedResource`).

Side effects of this change:

Some cascading changes were made in Metrics::Algorithms and
the SearchEventType GraphQL.
  • Loading branch information
jazairi committed Aug 16, 2024
1 parent 806ecc3 commit 09b51c5
Show file tree
Hide file tree
Showing 6 changed files with 272 additions and 267 deletions.
2 changes: 1 addition & 1 deletion app/graphql/types/search_event_type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def phrase
end

def standard_identifiers
StandardIdentifiers.new(@object.term.phrase).identifiers.map do |identifier|
Detector::StandardIdentifiers.new(@object.term.phrase).identifiers.map do |identifier|
{ kind: identifier.first, value: identifier.last }
end
end
Expand Down
76 changes: 76 additions & 0 deletions app/models/detector/standard_identifiers.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# frozen_string_literal: true

# StandardIdentifiers is a PatternDectector implementation that detects the identifiers DOI, ISBN, ISSN, PMID.
# See /docs/reference/pattern_detection_and_enhancement.md for details.

module Detector
class StandardIdentifiers
attr_reader :identifiers

def initialize(term)
@identifiers = {}
term_pattern_checker(term)
strip_invalid_issns
end

private

def term_pattern_checker(term)
term_patterns.each_pair do |type, pattern|
@identifiers[type.to_sym] = match(pattern, term) if match(pattern, term).present?
end
end

# Note on the limitations of this implementation
# We only detect the first match of each pattern, so a search of "1234-5678 5678-1234" will not return two ISSNs as
# might be expected, but just "1234-5678". Using ruby's string.scan(pattern) may be worthwhile if we want to detect
# all possible matches instead of just the first. That may require a larger refactor though as initial tests of doing
# that change did result in unintended results so it was backed out for now.
def match(pattern, term)
pattern.match(term).to_s.strip
end

# term_patterns are regex patterns to be applied to the basic search box input
def term_patterns
{
isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/,
pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/,
doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b}
}
end

def strip_invalid_issns
return unless @identifiers[:issn]

@identifiers.delete(:issn) unless validate_issn(@identifiers[:issn])
end

# validate_issn is only called when the regex for an ISSN has indicated an ISSN
# of sufficient format is present - but the regex does not attempt to
# validate that the check digit in the ISSN spec is correct. This method
# does that calculation, so we do not returned falsely detected ISSNs,
# like "2015-2019".
#
# The algorithm is defined at
# https://datatracker.ietf.org/doc/html/rfc3044#section-2.2
# An example calculation is shared at
# https://en.wikipedia.org/wiki/International_Standard_Serial_Number#Code_format
def validate_issn(candidate)
digits = candidate.delete('-')[..6].chars
check_digit = candidate.last.downcase
sum = 0

digits.each_with_index do |digit, idx|
sum += digit.to_i * (8 - idx.to_i)
end

actual_digit = 11 - sum.modulo(11)
actual_digit = 'x' if actual_digit == 10

return true if actual_digit.to_s == check_digit.to_s

false
end
end
end
2 changes: 1 addition & 1 deletion app/models/metrics/algorithms.rb
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def event_matches(event, matches)
# @return [Array] an array of matched StandardIdentifiers
def match_standard_identifiers(event, matches)
known_ids = %i[unmatched pmid isbn issn doi]
ids = StandardIdentifiers.new(event.term.phrase)
ids = Detector::StandardIdentifiers.new(event.term.phrase)

known_ids.each do |id|
matches[id] += 1 if ids.identifiers[id].present?
Expand Down
73 changes: 0 additions & 73 deletions app/models/standard_identifiers.rb

This file was deleted.

194 changes: 194 additions & 0 deletions test/models/detector/standard_identifiers_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
# frozen_string_literal: true

require 'test_helper'

module Detector
class StandardIdentifiersTest < ActiveSupport::TestCase
test 'ISBN detected in a string' do
actual = Detector::StandardIdentifiers.new('test 978-3-16-148410-0 test').identifiers

assert_equal('978-3-16-148410-0', actual[:isbn])
end

test 'ISBN-10 examples' do
# from wikipedia
samples = ['99921-58-10-7', '9971-5-0210-0', '960-425-059-0', '80-902734-1-6', '85-359-0277-5',
'1-84356-028-3', '0-684-84328-5', '0-8044-2957-X', '0-85131-041-9', '93-86954-21-4', '0-943396-04-2',
'0-9752298-0-X']

samples.each do |isbn|
actual = Detector::StandardIdentifiers.new(isbn).identifiers

assert_equal(isbn, actual[:isbn])
end
end

test 'ISBN-13 examples' do
samples = ['978-99921-58-10-7', '979-9971-5-0210-0', '978-960-425-059-0', '979-80-902734-1-6', '978-85-359-0277-5',
'979-1-84356-028-3', '978-0-684-84328-5', '979-0-8044-2957-X', '978-0-85131-041-9', '979-93-86954-21-4',
'978-0-943396-04-2', '979-0-9752298-0-X']

samples.each do |isbn|
actual = Detector::StandardIdentifiers.new(isbn).identifiers

assert_equal(isbn, actual[:isbn])
end
end

test 'not ISBNs' do
samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here']

samples.each do |notisbn|
actual = Detector::StandardIdentifiers.new(notisbn).identifiers

assert_nil(actual[:isbn])
end
end

test 'ISBNs need boundaries' do
samples = ['990026671500206761', '979-0-9752298-0-XYZ']
# note, there is a theoretical case of `asdf979-0-9752298-0-X` returning as an ISBN 10 even though it doesn't
# have a word boundary because the `-` is treated as a boundary so `0-9752298-0-X` would be an ISBN10. We can
# consider whether we care in the future as we look for incorrect real-world matches.

samples.each do |notisbn|
actual = Detector::StandardIdentifiers.new(notisbn).identifiers

assert_nil(actual[:isbn])
end
end

test 'ISSNs detected in a string' do
actual = Detector::StandardIdentifiers.new('test 0250-6335 test').identifiers

assert_equal('0250-6335', actual[:issn])
end

test 'ISSN examples' do
samples = %w[0250-6335 0000-0019 1864-0761 1877-959X 0973-7758 1877-5683 1440-172X 1040-5631]

samples.each do |issn|
actual = Detector::StandardIdentifiers.new(issn).identifiers

assert_equal(issn, actual[:issn])
end
end

test 'not ISSN examples' do
samples = ['orange cats like popcorn', '12346798', 'another ISSN not found here', '99921-58-10-7']

samples.each do |notissn|
actual = Detector::StandardIdentifiers.new(notissn).identifiers

assert_nil(actual[:issn])
end
end

test 'ISSNs need boundaries' do
actual = Detector::StandardIdentifiers.new('12345-5678 1234-56789').identifiers

assert_nil(actual[:issn])
end

test 'ISSN validate rejects ISSNs with wrong check digit' do
samples = %w[
1234-5678
2015-2016
1460-2441
1460-2442
1460-2443
1460-2444
1460-2445
1460-2446
1460-2447
1460-2448
1460-2449
1460-2440
0250-6331
0250-6332
0250-6333
0250-6334
0250-6336
0250-6337
0250-6338
0250-6339
0250-6330
0250-633x
0250-633X
]
samples.each do |notissn|
actual = Detector::StandardIdentifiers.new(notissn).identifiers

assert_nil(actual[:issn])
end
end

test 'ISSN validate method accepts ISSNs with correct check digit' do
samples = %w[
1460-244X
2015-223x
0250-6335
0973-7758
]
samples.each do |issn|
actual = Detector::StandardIdentifiers.new(issn).identifiers

assert_equal(issn, actual[:issn])
end
end

test 'doi detected in string' do
actual = Detector::StandardIdentifiers.new('"Quantum tomography: Measured measurement", Markus Aspelmeyer, nature physics "\
"January 2009, Volume 5, No 1, pp11-12; [ doi:10.1038/nphys1170 ]').identifiers

assert_equal('10.1038/nphys1170', actual[:doi])
end

test 'doi examples' do
samples = %w[10.1038/nphys1170 10.1002/0470841559.ch1 10.1594/PANGAEA.726855 10.1594/GFZ.GEOFON.gfz2009kciu
10.1594/PANGAEA.667386 10.3207/2959859860 10.3866/PKU.WHXB201112303 10.1430/8105 10.1392/BC1.0]

samples.each do |doi|
actual = Detector::StandardIdentifiers.new(doi).identifiers

assert_equal(doi, actual[:doi])
end
end

test 'not doi examples' do
samples = ['orange cats like popcorn', '10.1234 almost doi', 'another doi not found here', '99921-58-10-7']

samples.each do |notdoi|
actual = Detector::StandardIdentifiers.new(notdoi).identifiers

assert_nil(actual[:notdoi])
end
end

test 'pmid detected in string' do
actual = Detector::StandardIdentifiers.new('Citation and stuff PMID: 35648703 more stuff.').identifiers

assert_equal('PMID: 35648703', actual[:pmid])
end

test 'pmid examples' do
samples = ['PMID: 35648703', 'pmid: 1234567', 'PMID:35648703']

samples.each do |pmid|
actual = Detector::StandardIdentifiers.new(pmid).identifiers

assert_equal(pmid, actual[:pmid])
end
end

test 'not pmid examples' do
samples = ['orange cats like popcorn', 'pmid:almost', 'PMID: asdf', '99921-58-10-7']

samples.each do |notpmid|
actual = Detector::StandardIdentifiers.new(notpmid).identifiers

assert_nil(actual[:pmid])
end
end
end
end
Loading

0 comments on commit 09b51c5

Please sign in to comment.