Skip to content

Commit

Permalink
only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Jan 5, 2024
1 parent f6a7593 commit 20e3bdd
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ This project attempts to adhere to [Semantic Versioning](http://semver.org).
- reduce runtime in our own benchmark by more than `70%`
- reduce binary size by more than `25%`

- only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7

### Fixed

- Fix transposition counting in Jaro and Jaro-Winkler.
Expand Down
24 changes: 14 additions & 10 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,16 +194,20 @@ where
&'b Iter2: IntoIterator<Item = Elem2>,
Elem1: PartialEq<Elem2>,
{
let jaro_distance = generic_jaro(a, b);
let sim = generic_jaro(a, b);

let prefix_length = a
.into_iter()
.take(4)
.zip(b)
.take_while(|(a_elem, b_elem)| a_elem == b_elem)
.count();
if sim > 0.7 {
let prefix_length = a
.into_iter()
.take(4)
.zip(b)
.take_while(|(a_elem, b_elem)| a_elem == b_elem)
.count();

return jaro_distance + 0.1 * prefix_length as f64 * (1.0 - jaro_distance);
sim + 0.1 * prefix_length as f64 * (1.0 - sim)
} else {
sim
}
}

/// Like Jaro but gives a boost to strings that have a common prefix.
Expand Down Expand Up @@ -953,7 +957,7 @@ mod tests {
#[test]
fn jaro_winkler_names() {
assert_delta!(
0.562,
0.452,
jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre"),
0.001
);
Expand All @@ -977,7 +981,7 @@ mod tests {
#[test]
fn jaro_winkler_very_long_prefix() {
assert_delta!(
0.985,
0.98519,
jaro_winkler("thequickbrownfoxjumpedoverx", "thequickbrownfoxjumpedovery")
);
}
Expand Down

0 comments on commit 20e3bdd

Please sign in to comment.