From f6ea1f57dd519fb9e9f80d04403ab21c59e60515 Mon Sep 17 00:00:00 2001 From: Matthew Bernhardt Date: Fri, 6 Sep 2024 16:49:30 -0400 Subject: [PATCH 1/2] Implement detector for LCSH values ** Why are these changes being introduced: We have noticed a significant volume of search traffic that looks like multi-level LCSH headings, like "Geology -- Massachusetts". These likely come from the Bento UI, which makes subject values like this clickable. It makes sense to try and write a detector for this pattern, especially as it would be the first detector which would resolve to an Informational (or subject-based) search. ** Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/tco-71 ** How does this address that need: This writes a new Detector::Lcsh class, which uses a regex to look for a ' -- ' separator. The class is patterned off of the StandardIdentifier class. I initially wrote this as part of that class, but this doesn't really belong there - the pattern isn't an identifier in that sense, and further work to identify subjects (particularly single-level subjects like "Geology" rather than just "Geology -- Massachusetts") will go beyond just using a regex for detections. Adding this class has follow-on changes to the Term, Metrics, and GraphQL areas of the application. Outside the app code, there are a variety of tests, changes to the db seeds, and a new migration to record the item counts that come from the metrics work. There will be a future ticket to look up the detected string in the set of subject headings, to return more than just a string in the GraphQL. Right now the GraphQL response is pretty useless, just sending the search string back. It would be good to include include something else. ** Document any side effects to this change: The Detectors Type file has its methods alphabetized. I'm not sure if Detectors::Lcsh should instead be Detectors::LCSH? --- app/graphql/types/detectors_type.rb | 17 +++-- app/models/detector/lcsh.rb | 62 +++++++++++++++++++ app/models/metrics/algorithms.rb | 21 ++++++- app/models/term.rb | 1 + ...001205152_add_lcsh_to_metrics_algorithm.rb | 5 ++ db/schema.rb | 3 +- db/seeds.rb | 6 ++ docs/reference/classes.md | 5 ++ test/controllers/graphql_controller_test.rb | 29 +++++++++ test/fixtures/detector_categories.yml | 5 ++ test/fixtures/detectors.yml | 3 + test/fixtures/search_events.yml | 7 +++ test/fixtures/terms.yml | 3 + test/models/detector/lcsh_test.rb | 53 ++++++++++++++++ test/models/metrics/algorithms_test.rb | 19 ++++++ 15 files changed, 230 insertions(+), 9 deletions(-) create mode 100644 app/models/detector/lcsh.rb create mode 100644 db/migrate/20241001205152_add_lcsh_to_metrics_algorithm.rb create mode 100644 test/models/detector/lcsh_test.rb diff --git a/app/graphql/types/detectors_type.rb b/app/graphql/types/detectors_type.rb index 0626fb8..02f2f6a 100644 --- a/app/graphql/types/detectors_type.rb +++ b/app/graphql/types/detectors_type.rb @@ -5,21 +5,26 @@ class DetectorsType < Types::BaseObject description 'Provides all available search term detectors' field :journals, [Types::JournalsType], description: 'Information about journals detected in the search term' + field :lcsh, [String], description: 'Library of Congress Subject Heading information' field :standard_identifiers, [Types::StandardIdentifiersType], description: 'Currently supported: ISBN, ISSN, PMID, DOI' field :suggested_resources, [Types::SuggestedResourcesType], description: 'Suggested resources detected in the search term' - def standard_identifiers - Detector::StandardIdentifiers.new(@object).identifiers.map do |identifier| - { kind: identifier.first, value: identifier.last } - end - end - def journals Detector::Journal.full_term_match(@object).map do |journal| { title: journal.name, additional_info: journal.additional_info } end end + def lcsh + Detector::Lcsh.new(@object).identifiers.map(&:last) + end + + def standard_identifiers + Detector::StandardIdentifiers.new(@object).identifiers.map do |identifier| + { kind: identifier.first, value: identifier.last } + end + end + def suggested_resources Detector::SuggestedResource.full_term_match(@object).map do |suggested_resource| { title: suggested_resource.title, url: suggested_resource.url } diff --git a/app/models/detector/lcsh.rb b/app/models/detector/lcsh.rb new file mode 100644 index 0000000..b2c5d05 --- /dev/null +++ b/app/models/detector/lcsh.rb @@ -0,0 +1,62 @@ +# frozen_string_literal: true + +class Detector + # Detector::LCSH is a very rudimentary detector for the separator between levels of a Library of Congress Subject + # Heading (LCSH). These subject headings follow this pattern: "Social security beneficiaries -- United States" + class Lcsh + attr_reader :identifiers + + # For now the initialize method just needs to run the pattern checker. A space for future development would be to + # write additional methods to look up the detected LCSH for more information, and to confirm that the phrase is + # actually an LCSH. + def initialize(term) + @identifiers = {} + term_pattern_checker(term) + end + + # The record method will consult the set of regex-based detectors that are defined in Detector::Lcsh. Any matches + # will be registered as Detection records. + # + # @note While there is currently only one check within the Detector::Lcsh class, the method is build to anticipate + # additional checks in the future. Every such check would be capable of generating a separate Detection record + # (although a single check finding multiple matches would still only result in one Detection). + # + # @return nil + def self.record(term) + results = Detector::Lcsh.new(term.phrase) + + results.identifiers.each_key do + Detection.find_or_create_by( + term:, + detector: Detector.where(name: 'LCSH').first + ) + end + + nil + end + + private + + def term_pattern_checker(term) + subject_patterns.each_pair do |type, pattern| + @identifiers[type.to_sym] = match(pattern, term) if match(pattern, term).present? + end + end + + # This implementation will only detect the first match of a pattern in a long string. For the separator pattern this + # is fine, as we only need to find one (and finding multiples wouldn't change the outcome). If a pattern does come + # along where match counts matter, this should be reconsidered. + def match(pattern, term) + pattern.match(term).to_s.strip + end + + # subject_patterns are regex patterns that can be applied to indicate whether a search string is looking for an LCSH + # string. At the moment there is only one - for the separator character " -- " - but others might be possible if + # there are regex-able vocabulary quirks which might separate subject values from non-subject values. + def subject_patterns + { + separator: /(.*)\s--\s(.*)/ + } + end + end +end diff --git a/app/models/metrics/algorithms.rb b/app/models/metrics/algorithms.rb index 3368641..0ccb421 100644 --- a/app/models/metrics/algorithms.rb +++ b/app/models/metrics/algorithms.rb @@ -9,6 +9,7 @@ # doi :integer # issn :integer # isbn :integer +# lcsh :integer # pmid :integer # unmatched :integer # created_at :datetime not null @@ -48,7 +49,7 @@ def generate(month = nil) count_matches(SearchEvent.includes(:term)) end Metrics::Algorithms.create(month:, doi: matches[:doi], issn: matches[:issn], isbn: matches[:isbn], - pmid: matches[:pmid], journal_exact: matches[:journal_exact], + lcsh: matches[:lcsh], pmid: matches[:pmid], journal_exact: matches[:journal_exact], suggested_resource_exact: matches[:suggested_resource_exact], unmatched: matches[:unmatched]) end @@ -79,8 +80,24 @@ def event_matches(event, matches) ids = match_standard_identifiers(event, matches) journal_exact = process_journals(event, matches) suggested_resource_exact = process_suggested_resources(event, matches) + lcshs = match_lcsh(event, matches) - matches[:unmatched] += 1 if ids.identifiers.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero? + matches[:unmatched] += 1 if ids.identifiers.blank? && lcshs.identifiers.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero? + end + + # Checks for LCSH matches + # + # @param event [SearchEvent] an individual search event to check for matches + # @param matches [Hash] a Hash that keeps track of how many of each algorithm we match + # @return [Array] an array of matched LCSH sub-patterns + def match_lcsh(event, matches) + known_ids = %i[separator] + ids = Detector::Lcsh.new(event.term.phrase) + + known_ids.each do |id| + matches[:lcsh] += 1 if ids.identifiers[id].present? + end + ids end # Checks for StandardIdentifer matches diff --git a/app/models/term.rb b/app/models/term.rb index 703869c..3e2a220 100644 --- a/app/models/term.rb +++ b/app/models/term.rb @@ -24,6 +24,7 @@ class Term < ApplicationRecord def record_detections Detector::StandardIdentifiers.record(self) Detector::Journal.record(self) + Detector::Lcsh.record(self) Detector::SuggestedResource.record(self) nil diff --git a/db/migrate/20241001205152_add_lcsh_to_metrics_algorithm.rb b/db/migrate/20241001205152_add_lcsh_to_metrics_algorithm.rb new file mode 100644 index 0000000..784f2a2 --- /dev/null +++ b/db/migrate/20241001205152_add_lcsh_to_metrics_algorithm.rb @@ -0,0 +1,5 @@ +class AddLcshToMetricsAlgorithm < ActiveRecord::Migration[7.1] + def change + add_column :metrics_algorithms, :lcsh, :integer + end +end diff --git a/db/schema.rb b/db/schema.rb index e99e358..c20d917 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[7.1].define(version: 2024_09_23_182249) do +ActiveRecord::Schema[7.1].define(version: 2024_10_01_205152) do create_table "categories", force: :cascade do |t| t.string "name" t.text "description" @@ -93,6 +93,7 @@ t.datetime "updated_at", null: false t.integer "journal_exact" t.integer "suggested_resource_exact" + t.integer "lcsh" end create_table "search_events", force: :cascade do |t| diff --git a/db/seeds.rb b/db/seeds.rb index b144734..acc5b8f 100644 --- a/db/seeds.rb +++ b/db/seeds.rb @@ -28,6 +28,7 @@ Detector.find_or_create_by(name: 'DOI') Detector.find_or_create_by(name: 'ISBN') Detector.find_or_create_by(name: 'ISSN') +Detector.find_or_create_by(name: 'LCSH') Detector.find_or_create_by(name: 'PMID') Detector.find_or_create_by(name: 'Journal') Detector.find_or_create_by(name: 'SuggestedResource') @@ -48,6 +49,11 @@ category: Category.find_by(name: 'Transactional'), confidence: 0.6 ) +DetectorCategory.find_or_create_by( + detector: Detector.find_by(name: 'LCSH'), + category: Category.find_by(name: 'Informational'), + confidence: 0.7 +) DetectorCategory.find_or_create_by( detector: Detector.find_by(name: 'PMID'), category: Category.find_by(name: 'Transactional'), diff --git a/docs/reference/classes.md b/docs/reference/classes.md index ee6c486..99071e7 100644 --- a/docs/reference/classes.md +++ b/docs/reference/classes.md @@ -80,6 +80,9 @@ classDiagram DetectorJournal: partial_term_match() DetectorJournal: record() + class DetectorLcsh + DetectorLcsh: record() + class DetectorStandardIdentifier DetectorStandardIdentifier: record() @@ -105,6 +108,7 @@ classDiagram namespace Detectors { class Detector class DetectorJournal["Detector::Journal"] + class DetectorLcsh["Detector::Lcsh"] class DetectorStandardIdentifier["Detector::StandardIdentifiers"] class DetectorSuggestedResource["Detector::SuggestedResource"] } @@ -116,6 +120,7 @@ classDiagram style DetectorCategory fill:#000,stroke:#fc8d62,color:#fc8d62 style Detector fill:#000,stroke:#fc8d62,color:#fc8d62 style DetectorJournal fill:#000,stroke:#fc8d62,color:#fc8d62 + style DetectorLcsh fill:#000,stroke:#fc8d62,color:#fc8d62 style DetectorStandardIdentifier fill:#000,stroke:#fc8d62,color:#fc8d62 style DetectorSuggestedResource fill:#000,stroke:#fc8d62,color:#fc8d62 diff --git a/test/controllers/graphql_controller_test.rb b/test/controllers/graphql_controller_test.rb index 34c92e5..418d503 100644 --- a/test/controllers/graphql_controller_test.rb +++ b/test/controllers/graphql_controller_test.rb @@ -111,6 +111,20 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest json['data']['logSearchEvent']['detectors']['suggestedResources'].first['url'] end + test 'search event query can return detected library of congress subject headings' do + post '/graphql', params: { query: '{ + logSearchEvent(sourceSystem: "timdex", searchTerm: "Maryland -- Geography") { + detectors { + lcsh + } + } + }' } + json = response.parsed_body + + assert_equal 'Maryland -- Geography', + json['data']['logSearchEvent']['detectors']['lcsh'].first + end + test 'search event query can return phrase from logged term' do post '/graphql', params: { query: '{ logSearchEvent(sourceSystem: "timdex", searchTerm: "10.1038/nphys1170") { @@ -170,6 +184,21 @@ class GraphqlControllerTest < ActionDispatch::IntegrationTest assert_in_delta 0.95, json['data']['logSearchEvent']['categories'].first['confidence'] end + test 'term lookup query can return detected library of congress subject headings' do + post '/graphql', params: { query: '{ + lookupTerm(searchTerm: "Geology -- Massachusetts") { + detectors { + lcsh + } + } + }' } + + json = response.parsed_body + + assert_equal('Geology -- Massachusetts', + json['data']['lookupTerm']['detectors']['lcsh'].first) + end + test 'term lookup query can return categorization details for searches that trip a detector' do post '/graphql', params: { query: '{ lookupTerm(searchTerm: "10.1016/j.physio.2010.12.004") { diff --git a/test/fixtures/detector_categories.yml b/test/fixtures/detector_categories.yml index 113fc20..9ddee4c 100644 --- a/test/fixtures/detector_categories.yml +++ b/test/fixtures/detector_categories.yml @@ -33,3 +33,8 @@ five: detector: journal category: transactional confidence: 0.5 + +six: + detector: lcsh + category: informational + confidence: 0.7 diff --git a/test/fixtures/detectors.yml b/test/fixtures/detectors.yml index 81271f2..d9c837c 100644 --- a/test/fixtures/detectors.yml +++ b/test/fixtures/detectors.yml @@ -16,6 +16,9 @@ isbn: issn: name: 'ISSN' +lcsh: + name: 'LCSH' + pmid: name: 'PMID' diff --git a/test/fixtures/search_events.yml b/test/fixtures/search_events.yml index 168250f..2060684 100644 --- a/test/fixtures/search_events.yml +++ b/test/fixtures/search_events.yml @@ -31,6 +31,13 @@ current_month_doi: current_month_isbn: term: isbn_9781319145446 source: test +current_month_lcsh: + term: lcsh + source: test +old_month_lcsh: + term: lcsh + source: test + created_at: <%= 1.year.ago %> current_month_nature_medicine: term: journal_nature_medicine source: test diff --git a/test/fixtures/terms.yml b/test/fixtures/terms.yml index cb88d9c..20b81ee 100644 --- a/test/fixtures/terms.yml +++ b/test/fixtures/terms.yml @@ -17,6 +17,9 @@ hi: pmid_38908367: phrase: 'TERT activation targets DNA methylation and multiple aging hallmarks. Shim HS, et al. Cell. 2024. PMID: 38908367' +lcsh: + phrase: 'Geology -- Massachusetts' + issn_1075_8623: phrase: 1075-8623 diff --git a/test/models/detector/lcsh_test.rb b/test/models/detector/lcsh_test.rb new file mode 100644 index 0000000..15d56e4 --- /dev/null +++ b/test/models/detector/lcsh_test.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +require 'test_helper' + +class Detector + class LcshTest < ActiveSupport::TestCase + test 'lcsh detector activates when a separator is found' do + true_samples = [ + 'Geology -- Massachusetts', + 'Space vehicles -- Materials -- Congresses' + ] + + true_samples.each do |term| + actual = Detector::Lcsh.new(term).identifiers + + assert_includes(actual, :separator) + end + end + + test 'lcsh detector does nothing in most cases' do + false_samples = [ + 'orange cats like popcorn', + 'hyphenated names like Lin-Manuel Miranda do nothing', + 'dashes used as an aside - like this one - do nothing', + 'This one should--also not work' + ] + + false_samples.each do |term| + actual = Detector::Lcsh.new(term).identifiers + + assert_not_includes(actual, :separator) + end + end + + test 'record method does relevant work' do + detection_count = Detection.count + t = terms('lcsh') + + Detector::Lcsh.record(t) + + assert_equal(detection_count + 1, Detection.count) + end + + test 'record does nothing when not needed' do + detection_count = Detection.count + t = terms('isbn_9781319145446') + + Detector::Lcsh.record(t) + + assert_equal(detection_count, Detection.count) + end + end +end diff --git a/test/models/metrics/algorithms_test.rb b/test/models/metrics/algorithms_test.rb index e01c367..7428096 100644 --- a/test/models/metrics/algorithms_test.rb +++ b/test/models/metrics/algorithms_test.rb @@ -9,6 +9,7 @@ # doi :integer # issn :integer # isbn :integer +# lcsh :integer # pmid :integer # unmatched :integer # created_at :datetime not null @@ -38,6 +39,12 @@ class Algorithms < ActiveSupport::TestCase assert_equal 1, aggregate.isbn end + test 'lcsh counts are included in monthly aggregation' do + aggregate = Metrics::Algorithms.new.generate(DateTime.now) + + assert_equal 1, aggregate.lcsh + end + test 'pmids counts are included in monthly aggregation' do aggregate = Metrics::Algorithms.new.generate(DateTime.now) @@ -93,6 +100,11 @@ class Algorithms < ActiveSupport::TestCase SearchEvent.create(term: terms(:isbn_9781319145446), source: 'test') end + lcsh_expected_count = rand(1...100) + lcsh_expected_count.times do + SearchEvent.create(term: terms(:lcsh), source: 'test') + end + pmid_expected_count = rand(1...100) pmid_expected_count.times do SearchEvent.create(term: terms(:pmid_38908367), source: 'test') @@ -108,6 +120,7 @@ class Algorithms < ActiveSupport::TestCase assert_equal doi_expected_count, aggregate.doi assert_equal issn_expected_count, aggregate.issn assert_equal isbn_expected_count, aggregate.isbn + assert_equal lcsh_expected_count, aggregate.lcsh assert_equal pmid_expected_count, aggregate.pmid assert_equal unmatched_expected_count, aggregate.unmatched end @@ -131,6 +144,12 @@ class Algorithms < ActiveSupport::TestCase assert_equal 1, aggregate.isbn end + test 'lcsh counts are included in total aggregation' do + aggregate = Metrics::Algorithms.new.generate + + assert_equal 2, aggregate.lcsh + end + test 'pmids counts are included in total aggregation' do aggregate = Metrics::Algorithms.new.generate From 77626cbb1fccac4d48a4d5e1d2ca2b901e8b3094 Mon Sep 17 00:00:00 2001 From: Matthew Bernhardt Date: Fri, 4 Oct 2024 15:30:09 -0400 Subject: [PATCH 2/2] Add bugfix and test for LCSH from TCO-90 --- app/models/detector/lcsh.rb | 3 ++- test/models/detector/lcsh_test.rb | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/app/models/detector/lcsh.rb b/app/models/detector/lcsh.rb index b2c5d05..0d97e53 100644 --- a/app/models/detector/lcsh.rb +++ b/app/models/detector/lcsh.rb @@ -28,7 +28,8 @@ def self.record(term) results.identifiers.each_key do Detection.find_or_create_by( term:, - detector: Detector.where(name: 'LCSH').first + detector: Detector.where(name: 'LCSH').first, + detector_version: ENV.fetch('DETECTOR_VERSION', 'unset') ) end diff --git a/test/models/detector/lcsh_test.rb b/test/models/detector/lcsh_test.rb index 15d56e4..3372601 100644 --- a/test/models/detector/lcsh_test.rb +++ b/test/models/detector/lcsh_test.rb @@ -49,5 +49,24 @@ class LcshTest < ActiveSupport::TestCase assert_equal(detection_count, Detection.count) end + + test 'record respects changes to the DETECTOR_VERSION value' do + # Create a relevant detection + Detector::Lcsh.record(terms('lcsh')) + + detection_count = Detection.count + + # Calling the record method again doesn't do anything, but does not error. + Detector::Lcsh.record(terms('lcsh')) + + assert_equal(detection_count, Detection.count) + + # Calling the record method after DETECTOR_VERSION is incremented results in a new Detection + ClimateControl.modify DETECTOR_VERSION: 'updated' do + Detector::Lcsh.record(terms('lcsh')) + + assert_equal detection_count + 1, Detection.count + end + end end end