From 20e3bdd2ee477db368ecb41815d55ab24550e0bf Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Sun, 31 Dec 2023 23:08:25 +0100 Subject: [PATCH] only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7 --- CHANGELOG.md | 2 ++ src/lib.rs | 24 ++++++++++++++---------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4b9b52..f38f562 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ This project attempts to adhere to [Semantic Versioning](http://semver.org). - reduce runtime in our own benchmark by more than `70%` - reduce binary size by more than `25%` +- only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7 + ### Fixed - Fix transposition counting in Jaro and Jaro-Winkler. diff --git a/src/lib.rs b/src/lib.rs index 9aff1e9..8118277 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -194,16 +194,20 @@ where &'b Iter2: IntoIterator, Elem1: PartialEq, { - let jaro_distance = generic_jaro(a, b); + let sim = generic_jaro(a, b); - let prefix_length = a - .into_iter() - .take(4) - .zip(b) - .take_while(|(a_elem, b_elem)| a_elem == b_elem) - .count(); + if sim > 0.7 { + let prefix_length = a + .into_iter() + .take(4) + .zip(b) + .take_while(|(a_elem, b_elem)| a_elem == b_elem) + .count(); - return jaro_distance + 0.1 * prefix_length as f64 * (1.0 - jaro_distance); + sim + 0.1 * prefix_length as f64 * (1.0 - sim) + } else { + sim + } } /// Like Jaro but gives a boost to strings that have a common prefix. @@ -953,7 +957,7 @@ mod tests { #[test] fn jaro_winkler_names() { assert_delta!( - 0.562, + 0.452, jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre"), 0.001 ); @@ -977,7 +981,7 @@ mod tests { #[test] fn jaro_winkler_very_long_prefix() { assert_delta!( - 0.985, + 0.98519, jaro_winkler("thequickbrownfoxjumpedoverx", "thequickbrownfoxjumpedovery") ); }