From ec976640bd57e56f988d6d81ef088613bba99c49 Mon Sep 17 00:00:00 2001 From: Omar MHAIMDAT Date: Sat, 24 Feb 2024 17:29:10 +0100 Subject: [PATCH] Run aho corasick to also find overlapping substrings --- Cargo.toml | 2 +- quickner-core/Cargo.toml | 2 +- quickner-core/src/quickner.rs | 4 ++-- tests/performance.py | 3 ++- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b5a93a7..9188808 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "quickner" -version = "0.0.1-alpha.19" +version = "0.0.1-alpha.20" edition = "2021" authors = ["Omar MHAIMDAT"] license = "Mozilla Public License 2.0" diff --git a/quickner-core/Cargo.toml b/quickner-core/Cargo.toml index 94549a9..a0ab049 100644 --- a/quickner-core/Cargo.toml +++ b/quickner-core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "quickner-core" -version = "0.0.1-alpha.19" +version = "0.0.1-alpha.20" edition = "2021" authors = ["Omar MHAIMDAT"] license-file = "LICENSE" diff --git a/quickner-core/src/quickner.rs b/quickner-core/src/quickner.rs index e16782a..8fba067 100644 --- a/quickner-core/src/quickner.rs +++ b/quickner-core/src/quickner.rs @@ -125,7 +125,7 @@ impl Quickner { return None; } let mut annotations = Vec::new(); - for mat in aho_corasick.find_iter(&text) { + for mat in aho_corasick.find_overlapping_iter(&text) { let start = mat.start(); // convert byte index to char index (assuming utf8) let start = text[..start].chars().count(); @@ -141,7 +141,7 @@ impl Quickner { annotations.push((start, end, label)); continue; } - // if text == "python was created by guido van rossum" { + // if text == "monty python and the holy grail: the ultimate quiz http://bit.ly/pd3ms i got 42/50. can't believe i missed the name of lancelot's page " { // println!("Start: {}, End: {}, text_len: {}, End + 1: {}", start, end, text.len(), text.chars().nth(end + 1).unwrap_or('N')); // } // println!("Start: {}, End: {}, text_len: {}", start, end, char_len); diff --git a/tests/performance.py b/tests/performance.py index c3aaeff..ca3afad 100644 --- a/tests/performance.py +++ b/tests/performance.py @@ -26,7 +26,8 @@ def main(): quick.process() end = time.perf_counter() quick.to_jsonl("data/output.jsonl") - # docs = quick.find_documents_by_entity("twitter") + print(quick) + docs = quick.find_documents_by_entity("Twitter") print(f"Time elapsed: {end - start} seconds")