# frozen_string_literal: true

# app/models/detector/citation.rb

class Detector
  # Detector::Citation attempts to identify citations based on the prevalence of individual sub-patterns. It is not
  # targeted at a particular citation format, but was designed based on characteristics of five formats: APA, MLA,
  # Chicago, Turabian, and IEEE.
  #
  # It receives a Term object, which is parsed in various ways en route to calculating a final score. Terms with a
  # higher score are more citation-like, while a score of 0 indicates a Term that has no hallmarks of being a citation.
  # Terms whose score meets or exceeds the REQUIRED_SCORE value can be registered as a Detection.
  class Citation
    attr_reader :score, :subpatterns, :summary

    # Citation patterns are regular expressions which attempt to identify structures that are part of many citations.
    # This object is used as part of the pattern_checker method. Some of these patterns may get promoted to the Detector
    # model if they prove useful beyond a Citation context.
    CITATION_PATTERNS = {
      apa_volume_issue: /\d+\(\d+\)/,
      no: /no\.\s\d+/,
      pages: /\d+-+\d+/,
      pp: /pp\.\s\d+/,
      vol: /vol\.\s\d+/,
      year_parens: /\(\d{4}\)/,
      brackets: /\[.*?\]/,
      lastnames: /[A-Z][a-z]+[.,]/,
      quotes: /".*?"/
    }.freeze

    # The required score value is the threshold a Term must meet or exceed to be officially recorded with a Detection.
    REQUIRED_SCORE = 6

    # Summary thresholds are used by the calculate_score method. This class counts the number of occurrences of specific
    # characters in the @summary instance variable. The thresholds here determine whether any of those counts are high
    # enough to contribute to the Term's citation score.
    SUMMARY_THRESHOLDS = {
      characters: 25,
      colons: 2,
      commas: 2,
      periods: 2,
      semicolons: 2,
      words: 5
    }.freeze

    # Convenience predicate: is the calculated @score high enough to qualify as a citation?
    #
    # @return [Boolean] true when @score is greater than or equal to REQUIRED_SCORE
    def detection?
      @score >= REQUIRED_SCORE
    end

    # The initializer handles the parsing of a Term object, and subsequent population of the @subpatterns, @summary,
    # and @score instance variables. @subpatterns contains all the citation components which have been flagged by the
    # CITATION_PATTERNS hash. @summary contains counts of how often certain characters or words appear in the Term.
    # Finally, the @score value is a summary of how many elements in the subpatterns or summary report were detected.
    #
    # @note This method can be called directly via Detector::Citation.new(term) when a Detection record is not
    #   desired. It is also called indirectly via the Detector::Citation.record(term) class method.
    #
    # @param term [#phrase] the object whose phrase is inspected; only its phrase reader is used here
    def initialize(term)
      # A nil phrase is treated as an empty string, which produces empty reports and a score of 0.
      phrase = term.phrase.to_s

      @subpatterns = {}
      @summary = {}
      pattern_checker(phrase)
      summarize(phrase)
      @score = calculate_score
    end

    # The record method first runs all of the parsers by building a new Citation. If the resulting score meets or
    # exceeds the REQUIRED_SCORE value, then a Detection is found or created for the term, the 'Citation' Detector, and
    # the current DETECTOR_VERSION environment value ('unset' when the variable is not defined).
    #
    # @param term [#phrase] the term to evaluate and, if citation-like, to register a Detection for
    # @return [nil]
    def self.record(term)
      return unless new(term).detection?

      Detection.find_or_create_by(
        term:,
        detector: Detector.where(name: 'Citation').first,
        detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
      )

      nil
    end

    private

    # This combines the two reports generated by the Citation detector (subpatterns and summary), and calculates the
    # final score value from their contents.
    #
    # Any detected subpattern is counted toward the score (multiple detections do not get counted twice). For example,
    # if the brackets pattern finds two matches, it still only adds one to the final score.
    #
    # For the summary report, each value is compared with a threshold value in the SUMMARY_THRESHOLDS constant. The
    # number of values which meet or exceed their threshold are added to the score. As an example, if a search term has
    # five words, this value is compared to the word threshold (also five). Because the threshold is met, the score gets
    # incremented by one.
    #
    # @return [Integer]
    def calculate_score
      summary_score = @summary.count do |key, value|
        SUMMARY_THRESHOLDS.key?(key) && value >= SUMMARY_THRESHOLDS[key]
      end

      summary_score + @subpatterns.length
    end

    # This calculates the number of characters (not bytes) in the search phrase. It is called by the summarize method.
    def characters(phrase)
      phrase.length
    end

    # This counts the number of colons that appear in the search phrase, because they tend to appear more often in
    # citations than in other searches. It is called by the summarize method.
    def colons(phrase)
      phrase.count(':')
    end

    # This counts the number of commas in the search phrase. It is called by the summarize method.
    def commas(phrase)
      phrase.count(',')
    end

    # This builds one of the two main components of the Citation detector - the subpattern report. It runs each of the
    # regular expressions in the CITATION_PATTERNS constant once, and stores the matches under the pattern's key. A
    # pattern with no matches gets no key at all, which is what lets calculate_score count keys.
    #
    # @return [Hash{Symbol => Array<String>}] the populated @subpatterns report
    def pattern_checker(phrase)
      CITATION_PATTERNS.each_pair do |type, pattern|
        matches = scan(pattern, phrase)
        @subpatterns[type] = matches unless matches.empty?
      end

      @subpatterns
    end

    # This counts the number of periods in the search phrase. It is called by the summarize method.
    def periods(phrase)
      phrase.count('.')
    end

    # This is a convenience wrapper around String#scan, which is used by pattern_checker. Each match is stripped of
    # surrounding whitespace.
    def scan(pattern, phrase)
      phrase.scan(pattern).map(&:strip)
    end

    # This counts the semicolons in the search phrase. It is called by the summarize method.
    def semicolons(phrase)
      phrase.count(';')
    end

    # This builds one of the two main components of the Citation detector - the summary report. It calls each of the
    # counting methods named in the first line - which all return integers - and puts the result as a key-value pair in
    # the @summary instance variable. The names are a fixed literal list, so the send call never sees outside input.
    #
    # @return [Hash{Symbol => Integer}] the populated @summary report
    def summarize(phrase)
      %i[characters colons commas periods semicolons words].each do |check|
        @summary[check] = send(check, phrase)
      end

      @summary
    end

    # This counts the number of whitespace-separated words in the search phrase. It is called by the summarize method.
    def words(phrase)
      phrase.split.length
    end
  end
end

# ----------------------------------------------------------------------------------------------------------------------
# Remaining hunks carried on the same source lines (they belong to other files, and are kept here for reference):
#
# app/models/term.rb - new first call inside Term#record_detections:
#     Detector::Citation.record(self)
#
# db/seeds.rb - new seed records:
#     Detector.find_or_create_by(name: 'Citation')
#     DetectorCategory.find_or_create_by(
#       detector: Detector.find_by(name: 'Citation'),
#       category: Category.find_by(name: 'Transactional'),
#       confidence: 0.3
#     )
#
# test/fixtures/detector_categories.yml - new fixture:
#     seven:
#       detector: citation
#       category: transactional
#       confidence: 0.3
# ----------------------------------------------------------------------------------------------------------------------
# frozen_string_literal: true

# test/models/detector/citation_test.rb
#
# Fixtures carried on the same source lines (they belong to other files, and are kept here for reference):
#
# test/fixtures/detectors.yml - new fixture:
#     citation:
#       name: 'Citation'
#
# test/fixtures/terms.yml - new fixture. The inner quotation marks are written as &quot; entities so that the
# double-quoted YAML scalar stays valid; a bare " inside it would end the scalar early:
#     citation:
#       phrase: "A. Altun, &quot;Understanding hypertext in the context of reading on the web: Language learners' experience,&quot; Current Issues in Education, vol. 6, no. 12, July, 2005. [Online serial]. Available: http://cie.ed.asu.edu/volume6/number12/. [Accessed Dec. 2, 2007]."

require 'test_helper'

class Detector
  # Exercises Detector::Citation: the summary counts, each entry of CITATION_PATTERNS, the resulting score, and the
  # record class method which registers a Detection.
  class CitationTest < ActiveSupport::TestCase
    test 'detector::citation exposes three instance variables' do
      result = Detector::Citation.new(terms('citation'))

      assert_predicate result.score, :present?

      assert_predicate result.summary, :present?

      assert_predicate result.subpatterns, :present?
    end

    test 'detector::citation generates certain summary counts always' do
      result = Detector::Citation.new(terms('hi'))
      expected = %i[characters colons commas periods semicolons words]

      assert_equal expected, result.summary.keys
    end

    test 'summary includes a character count' do
      assert_equal 1, citation_for('a').summary[:characters]

      # Multibyte character
      assert_equal 1, citation_for('あ').summary[:characters]

      # Twelve thousand characters? No problem...
      assert_equal 12_345, citation_for('a' * 12_345).summary[:characters]
    end

    test 'summary includes a count of colons in term' do
      assert_equal 0, citation_for('No colons here').summary[:colons]

      assert_equal 3, citation_for('Three: colons :: here').summary[:colons]
    end

    test 'summary includes a count of commas in term' do
      assert_equal 0, citation_for('No commas here').summary[:commas]

      result = citation_for('Please, buy, apples, mac, and, cheese, milk, and, bread,.')

      assert_equal 9, result.summary[:commas]
    end

    test 'summary includes a count of periods in term' do
      assert_equal 0, citation_for('No periods here').summary[:periods]

      assert_equal 2, citation_for('This has periods. There are two of them.').summary[:periods]

      # The ellipsis character (…) is a single code point, and is not counted as a period.
      result = citation_for('This ends with an ellipses, which does not count, but no periods…')

      assert_equal 0, result.summary[:periods]
    end

    test 'summary includes a count of semicolons in term' do
      assert_equal 0, citation_for('No semicolons here').summary[:semicolons]

      assert_equal 1, citation_for('This has one semicolon;').summary[:semicolons]

      # Each &quot; entity ends in a semicolon, so an entity-encoded pair of quotes contributes two.
      assert_equal 2, citation_for('&quot;HTML entities are counted&quot;').summary[:semicolons]
    end

    test 'summary includes a word count' do
      assert_equal 1, citation_for('brief').summary[:words]

      assert_equal 1, citation_for(' extra ').summary[:words]

      assert_equal 2, citation_for('less brief').summary[:words]

      assert_equal 1, citation_for('hyphenated-word').summary[:words]
    end

    test 'summary word count handles non-space separators' do
      assert_equal 3, citation_for("tabs\tdo\tcount").summary[:words]

      assert_equal 3, citation_for("newlines\nalso\ncount").summary[:words]
    end

    test 'subpatterns are empty by default' do
      assert_empty citation_for('nothing here').subpatterns
    end

    test 'subpatterns flag all APA-style "volume(issue)" sequences' do
      result = citation_for('Weinstein, J. (2009). Classical Philology, 104(4), 439-458.')

      assert_equal ['104(4)'], result.subpatterns[:apa_volume_issue]
    end

    test 'subpatterns flag all "no." instances with a number' do
      result = citation_for('Yes or no. vol. 6, no. 12, pp. 314')

      assert_equal ['no. 12'], result.subpatterns[:no]
    end

    test 'subpatterns flag page ranges without spaces' do
      assert_equal ['1-100'], citation_for('Read from pages 1-100').subpatterns[:pages]

      assert_empty citation_for('1 - 100').subpatterns
    end

    test 'subpatterns flag all "pp." instances with a number' do
      result = citation_for('I love this app. vol. 6, no. 12, pp. 314')

      assert_equal ['pp. 314'], result.subpatterns[:pp]
    end

    test 'subpatterns flag all "vol." instances with a number' do
      result = citation_for('This is frivol. vol. 6, no. 12, pp. 314')

      assert_equal ['vol. 6'], result.subpatterns[:vol]
    end

    test 'subpatterns flag all years in parentheses' do
      result = citation_for('Only two (2) four-digit years (1996) (1997) here since 2024.')

      assert_equal ['(1996)', '(1997)'], result.subpatterns[:year_parens]
    end

    test 'subpatterns flag phrases in square brackets' do
      result = citation_for('Artificial intelligence. [Online serial].')

      assert_equal ['[Online serial]'], result.subpatterns[:brackets]
    end

    # This is pretty rough.
    test 'subpatterns attempts to flag names as they appear in author lists' do
      result = citation_for('Sadava, D. E., D. M. Hillis, et al. Life: The Science of Biology. 11th ed. W. H. Freeman, 2016. ISBN: 9781319145446')

      # This is also catching the last word of the title.
      assert_equal ['Sadava,', 'Hillis,', 'Biology.', 'Freeman,'], result.subpatterns[:lastnames]
    end

    test 'subpatterns flag phrases in quotes' do
      result = citation_for('"Principles of Materials Science and Engineering" by William F. Smith and Javad Hashemi')

      assert_equal ['"Principles of Materials Science and Engineering"'], result.subpatterns[:quotes]

      # Need two to catch anything
      result = citation_for('Principles of Materials Science and Engineering" by William F. Smith and Javad Hashemi')

      assert_empty result.subpatterns
    end

    test 'citation score increases as phrase gets more citation-like' do
      assert_equal 0, citation_for('simple search phrase').score

      result = citation_for('Science Education and Cultural Diversity: Mapping the Field. Studies in Science Education, 24(1), 49–73.')

      assert_operator result.score, :>, 0
    end

    test 'detection? convenience method returns true for obvious citations' do
      result = Detector::Citation.new(terms('citation'))

      assert_predicate result, :detection?
    end

    test 'detection? convenience method returns false for obvious non-citations' do
      result = Detector::Citation.new(terms('hi'))

      assert_not result.detection?
    end

    test 'record method does relevant work' do
      assert_difference 'Detection.count', 1 do
        Detector::Citation.record(terms('citation'))
      end
    end

    test 'record method does nothing when not needed' do
      assert_no_difference 'Detection.count' do
        Detector::Citation.record(terms('hi'))
      end
    end

    test 'record method respects changes to the DETECTOR_VERSION value' do
      # Create a relevant detection
      t = terms('citation')
      Detector::Citation.record(t)

      # Calling the record method again doesn't do anything, but does not error.
      assert_no_difference 'Detection.count' do
        Detector::Citation.record(t)
      end

      # Calling the record method after DETECTOR_VERSION is incremented results in a new Detection.
      ClimateControl.modify DETECTOR_VERSION: 'updated' do
        assert_difference 'Detection.count', 1 do
          Detector::Citation.record(t)
        end
      end
    end

    private

    # Builds a Citation detector for an unsaved Term wrapping the given phrase.
    def citation_for(phrase)
      Detector::Citation.new(Term.new(phrase:))
    end
  end
end