From ba243a8cb287cdfa38a19681d1d922a7420c9710 Mon Sep 17 00:00:00 2001 From: bvandercar-vt Date: Thu, 10 Oct 2024 11:04:11 -0600 Subject: [PATCH 1/5] reduce len() calls --- bench/benchmark_cdist.py | 10 ++++++---- bench/benchmark_cpdist.py | 11 +++++++---- bench/benchmark_scorer.py | 10 ++++++---- src/rapidfuzz/fuzz_py.py | 9 +++++---- src/rapidfuzz/process_cpp_impl.pyx | 10 ++++++---- src/rapidfuzz/process_py.py | 7 +++++-- 6 files changed, 35 insertions(+), 22 deletions(-) diff --git a/bench/benchmark_cdist.py b/bench/benchmark_cdist.py index 171e9b72..2264bc12 100644 --- a/bench/benchmark_cdist.py +++ b/bench/benchmark_cdist.py @@ -58,13 +58,15 @@ def get_platform(): def benchmark(): words = ["".join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) for _ in range(10000)] - sample_rate = len(words) // 100 + len_words = len(words) + sample_rate = len_words // 100 sample = words[::sample_rate] - total = len(words) * len(sample) + len_sample = len(sample) + total = len_words * len_sample print("System:", get_platform()) - print("Words :", len(words)) - print("Sample:", len(sample)) + print("Words :", len_words) + print("Sample:", len_sample) print("Total : %s calls\n" % total) def wrap_cdist(scorer, processor): diff --git a/bench/benchmark_cpdist.py b/bench/benchmark_cpdist.py index 8e67b8ae..68c243f9 100644 --- a/bench/benchmark_cpdist.py +++ b/bench/benchmark_cpdist.py @@ -47,14 +47,17 @@ def get_platform(): def benchmark(): words = ["".join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) for _ in range(1000000)] - sample_rate = len(words) // 2 + len_words = len(words) + sample_rate = len_words // 2 words1 = words[:sample_rate] words2 = words[sample_rate::] - total = len(words1) + len_words1 = len(words1) + len_words2 = len(words2) + total = len_words1 print("System:", get_platform()) - print("Words :", len(words1)) - print("Sample:", len(words2)) + print("Words :", len_words1) + print("Sample:", len_words2) print("Total : %s calls\n" % total) def wrap_cpdist(scorer): diff --git a/bench/benchmark_scorer.py b/bench/benchmark_scorer.py index ae3b98e0..55884291 100644 --- a/bench/benchmark_scorer.py +++ b/bench/benchmark_scorer.py @@ -47,14 +47,16 @@ def get_platform(): def benchmark(): words = ["".join(random.choice(string.ascii_letters + string.digits) for _ in range(10)) for _ in range(10000)] - sample_rate = len(words) // 100 + len_words = len(words) + sample_rate = len_words // 100 sample = words[::sample_rate] + len_sample = len(sample) - total = len(words) * len(sample) + total = len_words * len_sample print("System:", get_platform()) - print("Words :", len(words)) - print("Sample:", len(sample)) + print("Words :", len_words) + print("Sample:", len_sample) print("Total : %s calls\n" % total) def wrap(f): diff --git a/src/rapidfuzz/fuzz_py.py b/src/rapidfuzz/fuzz_py.py index 36eda853..3f8fab3d 100644 --- a/src/rapidfuzz/fuzz_py.py +++ b/src/rapidfuzz/fuzz_py.py @@ -316,9 +316,10 @@ def partial_ratio_alignment( if not s1 and not s2: return ScoreAlignment(100.0, 0, 0, 0, 0) - s1, s2 = conv_sequences(s1, s2) - if len(s1) <= len(s2): + len1 = len(s1) + len2 = len(s2) + if len1 <= len2: shorter = s1 longer = s2 else: @@ -326,7 +327,7 @@ def partial_ratio_alignment( longer = s1 res = _partial_ratio_impl(shorter, longer, score_cutoff / 100) - if res.score != 100 and len(s1) == len(s2): + if res.score != 100 and len1 == len2: score_cutoff = max(score_cutoff, res.score) res2 = _partial_ratio_impl(longer, shorter, score_cutoff / 100) if res2.score > res.score: @@ -335,7 +336,7 @@ def partial_ratio_alignment( if res.score < score_cutoff: return None - if len(s1) <= len(s2): + if len1 <= len2: return res return ScoreAlignment(res.score, res.dest_start, res.dest_end, res.src_start, res.src_end) diff --git a/src/rapidfuzz/process_cpp_impl.pyx b/src/rapidfuzz/process_cpp_impl.pyx index 90bae64d..9c4bf218 100644 --- a/src/rapidfuzz/process_cpp_impl.pyx +++ b/src/rapidfuzz/process_cpp_impl.pyx @@ -1208,6 +1208,7 @@ def extract(query, choices, *, scorer=WRatio, processor=None, limit=5, score_cut cdef RF_Scorer* scorer_context = NULL cdef RF_ScorerFlags scorer_flags cdef int64_t c_limit + cdef int64_t choices_len = len(choices) scorer_kwargs = scorer_kwargs.copy() if scorer_kwargs else {} setupPandas() @@ -1216,14 +1217,15 @@ def extract(query, choices, *, scorer=WRatio, processor=None, limit=5, score_cut return [] try: - if limit is None or limit > len(choices): - limit = len(choices) + if limit is None or limit > choices_len: + limit = choices_len except TypeError: # handle generators. In Theory we could retrieve the length later on while # preprocessing the choices, but this is good enough for now choices = list(choices) - if limit is None or limit > len(choices): - limit = len(choices) + choices_len = len(choices) + if limit is None or limit > choices_len: + limit = choices_len c_limit = limit if c_limit == 1: diff --git a/src/rapidfuzz/process_py.py b/src/rapidfuzz/process_py.py index 69e1f49f..4d482dda 100644 --- a/src/rapidfuzz/process_py.py +++ b/src/rapidfuzz/process_py.py @@ -643,14 +643,17 @@ def cpdist( """ import numpy as np - if len(queries) != len(choices): + len_queries = len(queries) + len_choices = len(choices) + + if len_queries != len_choices: error_message = "Length of queries and choices must be the same!" raise ValueError(error_message) _ = workers, score_hint scorer_kwargs = scorer_kwargs or {} dtype = _dtype_to_type_num(dtype, scorer, scorer_kwargs) - results = np.zeros((len(queries),), dtype=dtype) + results = np.zeros((len_queries,), dtype=dtype) setupPandas() From a011807fd3695eaf1f3c4f74a1cb7be73a021641 Mon Sep 17 00:00:00 2001 From: Blake Vandercar Date: Mon, 6 Jan 2025 11:34:53 -0700 Subject: [PATCH 2/5] style: revert bench folder changes --- bench/benchmark_cdist.py | 10 ++++------ bench/benchmark_cpdist.py | 11 ++++------- bench/benchmark_scorer.py | 10 ++++------ 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/bench/benchmark_cdist.py b/bench/benchmark_cdist.py index 2264bc12..171e9b72 100644 --- a/bench/benchmark_cdist.py +++ b/bench/benchmark_cdist.py @@ -58,15 +58,13 @@ def get_platform(): def benchmark(): words = ["".join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) for _ in range(10000)] - len_words = len(words) - sample_rate = len_words // 100 + sample_rate = len(words) // 100 sample = words[::sample_rate] - len_sample = len(sample) - total = len_words * len_sample + total = len(words) * len(sample) print("System:", get_platform()) - print("Words :", len_words) - print("Sample:", len_sample) + print("Words :", len(words)) + print("Sample:", len(sample)) print("Total : %s calls\n" % total) def wrap_cdist(scorer, processor): diff --git a/bench/benchmark_cpdist.py b/bench/benchmark_cpdist.py index 68c243f9..8e67b8ae 100644 --- a/bench/benchmark_cpdist.py +++ b/bench/benchmark_cpdist.py @@ -47,17 +47,14 @@ def get_platform(): def benchmark(): words = ["".join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) for _ in range(1000000)] - len_words = len(words) - sample_rate = len_words // 2 + sample_rate = len(words) // 2 words1 = words[:sample_rate] words2 = words[sample_rate::] - len_words1 = len(words1) - len_words2 = len(words2) - total = len_words1 + total = len(words1) print("System:", get_platform()) - print("Words :", len_words1) - print("Sample:", len_words2) + print("Words :", len(words1)) + print("Sample:", len(words2)) print("Total : %s calls\n" % total) def wrap_cpdist(scorer): diff --git a/bench/benchmark_scorer.py b/bench/benchmark_scorer.py index 55884291..ae3b98e0 100644 --- a/bench/benchmark_scorer.py +++ b/bench/benchmark_scorer.py @@ -47,16 +47,14 @@ def get_platform(): def benchmark(): words = ["".join(random.choice(string.ascii_letters + string.digits) for _ in range(10)) for _ in range(10000)] - len_words = len(words) - sample_rate = len_words // 100 + sample_rate = len(words) // 100 sample = words[::sample_rate] - len_sample = len(sample) - total = len_words * len_sample + total = len(words) * len(sample) print("System:", get_platform()) - print("Words :", len_words) - print("Sample:", len_sample) + print("Words :", len(words)) + print("Sample:", len(sample)) print("Total : %s calls\n" % total) def wrap(f): From 74b248d5fcbbaef6cce1bff7a95841cadd198d47 Mon Sep 17 00:00:00 2001 From: Blake Vandercar Date: Mon, 6 Jan 2025 11:38:01 -0700 Subject: [PATCH 3/5] move len to try block --- src/rapidfuzz/process_cpp_impl.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/rapidfuzz/process_cpp_impl.pyx b/src/rapidfuzz/process_cpp_impl.pyx index 9c4bf218..442c1f8e 100644 --- a/src/rapidfuzz/process_cpp_impl.pyx +++ b/src/rapidfuzz/process_cpp_impl.pyx @@ -1208,7 +1208,7 @@ def extract(query, choices, *, scorer=WRatio, processor=None, limit=5, score_cut cdef RF_Scorer* scorer_context = NULL cdef RF_ScorerFlags scorer_flags cdef int64_t c_limit - cdef int64_t choices_len = len(choices) + cdef int64_t choices_len scorer_kwargs = scorer_kwargs.copy() if scorer_kwargs else {} setupPandas() @@ -1217,6 +1217,7 @@ def extract(query, choices, *, scorer=WRatio, processor=None, limit=5, score_cut return [] try: + choices_len = len(choices) if limit is None or limit > choices_len: limit = choices_len except TypeError: From c8cfbca47fa81137d6f32fbeba184081c101ab10 Mon Sep 17 00:00:00 2001 From: Blake Vandercar Date: Mon, 6 Jan 2025 11:43:32 -0700 Subject: [PATCH 4/5] refactor: move common code --- src/rapidfuzz/process_cpp_impl.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/rapidfuzz/process_cpp_impl.pyx b/src/rapidfuzz/process_cpp_impl.pyx index 442c1f8e..e365726d 100644 --- a/src/rapidfuzz/process_cpp_impl.pyx +++ b/src/rapidfuzz/process_cpp_impl.pyx @@ -1218,15 +1218,14 @@ def extract(query, choices, *, scorer=WRatio, processor=None, limit=5, score_cut try: choices_len = len(choices) - if limit is None or limit > choices_len: - limit = choices_len except TypeError: # handle generators. In Theory we could retrieve the length later on while # preprocessing the choices, but this is good enough for now choices = list(choices) choices_len = len(choices) - if limit is None or limit > choices_len: - limit = choices_len + + if limit is None or limit > choices_len: + limit = choices_len c_limit = limit if c_limit == 1: From 4f41086da3c3d1bf3823a73d02a9b60f99d2da0c Mon Sep 17 00:00:00 2001 From: Blake Vandercar Date: Wed, 8 Jan 2025 12:03:03 -0700 Subject: [PATCH 5/5] implement change --- src/rapidfuzz/process_cpp_impl.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rapidfuzz/process_cpp_impl.pyx b/src/rapidfuzz/process_cpp_impl.pyx index e365726d..96b4e776 100644 --- a/src/rapidfuzz/process_cpp_impl.pyx +++ b/src/rapidfuzz/process_cpp_impl.pyx @@ -1224,10 +1224,10 @@ def extract(query, choices, *, scorer=WRatio, processor=None, limit=5, score_cut choices = list(choices) choices_len = len(choices) - if limit is None or limit > choices_len: - limit = choices_len + c_limit = choices_len + if limit is not None: + c_limit = min(c_limit, limit) - c_limit = limit if c_limit == 1: res = extractOne( query,