Skip to content

Commit

Permalink
Merge pull request #84 from MITLibraries/tco-25-historical-hints
Browse files Browse the repository at this point in the history
Extend Metrics::Algorithms feature to include Detector::SuggestedResource matches
  • Loading branch information
matt-bernhardt authored Aug 14, 2024
2 parents 806ecc3 + d0dbd12 commit 7476eeb
Show file tree
Hide file tree
Showing 9 changed files with 108 additions and 4 deletions.
24 changes: 22 additions & 2 deletions app/models/detector/suggested_resource.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,20 @@ module Detector
class SuggestedResource < ApplicationRecord
before_save :update_fingerprint

# This exists for the before_save lifecycle hook to call the calculate_fingerprint method, to ensure that these
# records always have a correctly-calculated fingerprint. It has no arguments and returns nothing.
def update_fingerprint
  # calculate_fingerprint is defined on the class (def self.calculate_fingerprint), so it has to be invoked through
  # the class; calling it as a bare instance method would raise NoMethodError during the before_save hook.
  self.fingerprint = Detector::SuggestedResource.calculate_fingerprint(phrase)
end

# This implements the OpenRefine fingerprinting algorithm. See
# https://openrefine.org/docs/technical-reference/clustering-in-depth#fingerprint
def calculate_fingerprint(old_phrase)
#
# @param old_phrase [String] A text string which needs to have its fingerprint calculated. This could either be the
# "phrase" field on the SuggestedResource record, or an incoming search term received from a contributing system.
#
# @return [String] A string of all words in the input, downcased, normalized, and alphabetized.
def self.calculate_fingerprint(old_phrase)
modified_phrase = old_phrase
modified_phrase = modified_phrase.strip
modified_phrase = modified_phrase.downcase
Expand Down Expand Up @@ -76,5 +83,18 @@ def self.bulk_replace(input)
record.save
end
end

# Identify any SuggestedResource record whose pre-calculated fingerprint matches the fingerprint of the incoming
# phrase.
#
# @note There is a uniqueness constraint on the SuggestedResource fingerprint field, so there should only ever be
# one match (if any).
#
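# @param phrase [String] A string representation of a search term (not an actual Term object)
#
# @return [ActiveRecord::Relation] The Detector::SuggestedResource records whose fingerprint matches that of the
#   search term. This is a relation rather than a single record, so callers inspect it with .count / .first.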
def self.full_term_match(phrase)
  # Reduce the incoming phrase to its fingerprint first, then look that value up in the stored fingerprint column.
  wanted_fingerprint = calculate_fingerprint(phrase)
  SuggestedResource.where(fingerprint: wanted_fingerprint)
end
end
end
20 changes: 19 additions & 1 deletion app/models/metrics/algorithms.rb
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def generate(month = nil)
end
Metrics::Algorithms.create(month:, doi: matches[:doi], issn: matches[:issn], isbn: matches[:isbn],
pmid: matches[:pmid], journal_exact: matches[:journal_exact],
suggested_resource_exact: matches[:suggested_resource_exact],
unmatched: matches[:unmatched])
end

Expand Down Expand Up @@ -73,8 +74,9 @@ def count_matches(events)
def event_matches(event, matches)
  # Each helper below increments its own counter(s) in `matches` and hands back what it found, so the results can be
  # inspected here to decide whether the event matched nothing at all.
  ids = match_standard_identifiers(event, matches)
  journal_exact = process_journals(event, matches)
  suggested_resource_exact = process_suggested_resources(event, matches)

  # An event is unmatched only when every detector came back empty. This must be a single check: testing the
  # identifier/journal results separately would count an event twice, and would also count an event that matched
  # only a suggested resource.
  nothing_matched = ids.identifiers.blank? && journal_exact.count.zero? && suggested_resource_exact.count.zero?
  matches[:unmatched] += 1 if nothing_matched
end

# Checks for StandardIdentifier matches
Expand Down Expand Up @@ -107,5 +109,21 @@ def process_journals(event, matches)
matches[:journal_exact] += 1 if journal_exact.count.positive?
journal_exact
end

# Checks for SuggestedResource matches
#
# @note This only checks for exact matches of the search term, so any extra or missing words will result in no
# match.
#
# @param event [SearchEvent] an individual search event to check for matches
# @param matches [Hash] a Hash that keeps track of how many of each algorithm we match
# @return [ActiveRecord::Relation] the result of Detector::SuggestedResource.full_term_match for the search phrase,
#   i.e. the record(s) whose fingerprint matches that of the search phrase (if one exists). The uniqueness
#   constraint on the fingerprint should mean there is only ever one matched record.
def process_suggested_resources(event, matches)
  # Look up the event's search phrase, bump the counter when anything was found, and return the lookup result
  # unchanged so the caller can inspect it.
  Detector::SuggestedResource.full_term_match(event.term.phrase).tap do |found|
    matches[:suggested_resource_exact] += 1 if found.count.positive?
  end
end
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Adds the suggested_resource_exact column to the metrics_algorithms table. Metrics::Algorithms writes its count of
# SuggestedResource matches into this column when it creates an aggregate row.
class AddSuggestedResourceExactToMetricsAlgorithm < ActiveRecord::Migration[7.1]
# Written as `change` with a plain add_column, so no separate down step is spelled out here.
# The column is a bare integer: no default and no null constraint are specified.
def change
add_column :metrics_algorithms, :suggested_resource_exact, :integer
end
end
3 changes: 2 additions & 1 deletion db/schema.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions test/fixtures/detector/suggested_resources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,9 @@ web_of_knowledge:
url: https://libguides.mit.edu/webofsci
phrase: web of knowledge
fingerprint: knowledge of web

nobel_laureate:
title: Professor Moungi Bawendi
url: https://news.mit.edu/2023/mit-chemist-moungi-bawendi-shares-nobel-prize-chemistry-1004
phrase: moungi bawendi
fingerprint: bawendi moungi
7 changes: 7 additions & 0 deletions test/fixtures/search_events.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,10 @@ old_month_nature_medicine:
term: journal_nature_medicine
source: test
created_at: <%= 1.year.ago %>
suggested_resource_jstor:
term: suggested_resource_jstor
source: test
old_suggested_resource_jstor:
term: suggested_resource_jstor
source: test
created_at: <%= 1.year.ago %>
3 changes: 3 additions & 0 deletions test/fixtures/terms.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,6 @@ isbn_9781319145446:

journal_nature_medicine:
phrase: 'nature medicine'

suggested_resource_jstor:
phrase: 'jstor'
26 changes: 26 additions & 0 deletions test/models/detector/suggested_resource_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,31 @@ class SuggestedResourceTest < ActiveSupport::TestCase

assert_equal 'delta gamma', resource.fingerprint
end

test 'fingerprint matches on search term' do
  # The 'jstor' fixture is expected to be the one and only record returned for the term 'jstor'.
  jstor_fixture = detector_suggested_resources('jstor')
  found = Detector::SuggestedResource.full_term_match('jstor')

  assert_equal 1, found.count
  assert_equal jstor_fixture, found.first
end

test 'fingerprint matches on any word order or punctuation' do
  # The nobel_laureate fixture stores the fingerprint 'bawendi moungi'; both spellings of the name below should
  # reduce to that same fingerprint regardless of capitalisation, ordering, or the comma.
  laureate = detector_suggested_resources('nobel_laureate')
  given_name_first = Detector::SuggestedResource.full_term_match('Moungi Bawendi')
  family_name_first = Detector::SuggestedResource.full_term_match('Bawendi, Moungi')

  assert_equal 1, given_name_first.count
  assert_equal laureate, given_name_first.first
  assert_equal given_name_first.first, family_name_first.first
end

test 'partial fingerprint matches do not count' do
  # Matching is on the whole fingerprint, so a term with a word missing or a word added must find nothing.
  # NOTE(review): presumably aimed at a 'web of science' fixture that is not visible here - confirm.
  missing_a_word = Detector::SuggestedResource.full_term_match('science web')
  with_extra_word = Detector::SuggestedResource.full_term_match('the web of science')

  assert_predicate missing_a_word.count, :zero?
  assert_predicate with_extra_word.count, :zero?
end
end
end
18 changes: 18 additions & 0 deletions test/models/metrics/algorithms_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ class Algorithms < ActiveSupport::TestCase
assert_equal 1, aggregate.journal_exact
end

test 'suggested_resource exact counts are included in monthly aggregation' do
  # The expected count of 1 presumably comes from the single 'jstor' search event fixture that has no explicit
  # created_at; its sibling fixture is back-dated by a year. Confirm against the fixtures if this count changes.
  monthly = Metrics::Algorithms.new.generate(DateTime.now)

  assert_equal 1, monthly.suggested_resource_exact
end

test 'unmatched counts are included are included in monthly aggregation' do
aggregate = Metrics::Algorithms.new.generate(DateTime.now)

Expand Down Expand Up @@ -124,6 +130,12 @@ class Algorithms < ActiveSupport::TestCase
assert_equal 2, aggregate.journal_exact
end

test 'suggested_resource exact counts are included in total aggregation' do
  # Called without a month, generate is expected to aggregate over every search event, so both 'jstor' search
  # event fixtures (the current one and the one back-dated by a year) should be counted.
  overall = Metrics::Algorithms.new.generate

  assert_equal 2, overall.suggested_resource_exact
end

test 'unmatched counts are included are included in total aggregation' do
aggregate = Metrics::Algorithms.new.generate

Expand Down Expand Up @@ -159,6 +171,11 @@ class Algorithms < ActiveSupport::TestCase
SearchEvent.create(term: terms(:journal_nature_medicine), source: 'test')
end

suggested_resource_exact_count = rand(1...100)
suggested_resource_exact_count.times do
SearchEvent.create(term: terms(:suggested_resource_jstor), source: 'test')
end

unmatched_expected_count = rand(1...100)
unmatched_expected_count.times do
SearchEvent.create(term: terms(:hi), source: 'test')
Expand All @@ -171,6 +188,7 @@ class Algorithms < ActiveSupport::TestCase
assert_equal isbn_expected_count, aggregate.isbn
assert_equal pmid_expected_count, aggregate.pmid
assert_equal journal_exact_count, aggregate.journal_exact
assert_equal suggested_resource_exact_count, aggregate.suggested_resource_exact
assert_equal unmatched_expected_count, aggregate.unmatched
end
end

0 comments on commit 7476eeb

Please sign in to comment.