diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9892567..5047865 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -11,8 +11,6 @@ jobs:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
test-cmd: [pytest]
include:
- #- python-version: pyp-y3.8
- # test-cmd: pytest test_thefuzz.py test_thefuzz_pytest.py
- python-version: "3.7"
test-cmd: python setup.py check --restructuredtext --strict --metadata
- python-version: "3.10"
@@ -26,7 +24,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
- pip install pytest pycodestyle docutils Pygments hypothesis python-Levenshtein
+ pip install pytest pycodestyle docutils Pygments hypothesis
+
+ - name: Install project
+ run: pip install .
+
- name: Test with pytest
- run: |
- ${{ matrix.test-cmd }}
+ run: ${{ matrix.test-cmd }}
diff --git a/README.rst b/README.rst
index 158c833..028244b 100644
--- a/README.rst
+++ b/README.rst
@@ -10,9 +10,7 @@ Requirements
============
- Python 3.7 or higher
-- difflib
-- `python-Levenshtein `_ (optional, provides a 4-10x speedup in String
- Matching, though may result in `differing results for certain cases `_)
+- `rapidfuzz <https://github.com/maxbachmann/rapidfuzz>`_
For testing
~~~~~~~~~~~
@@ -29,12 +27,6 @@ Using PIP via PyPI
pip install thefuzz
-or the following to install `python-Levenshtein` too
-
-.. code:: bash
-
- pip install thefuzz[speedup]
-
Using PIP via Github
@@ -110,7 +102,7 @@ Partial Token Sort Ratio
84
>>> fuzz.partial_token_sort_ratio("fuzzy was a bear", "wuzzy fuzzy was a bear")
100
-
+
Process
~~~~~~~
diff --git a/benchmarks.py b/benchmarks.py
index a8688a4..ce6d836 100644
--- a/benchmarks.py
+++ b/benchmarks.py
@@ -39,7 +39,6 @@
]
common_setup = "from thefuzz import fuzz, utils; "
-basic_setup = "from thefuzz.string_processing import StringProcessor;"
def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
@@ -55,16 +54,10 @@ def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
duration, avg_duration * (1000 ** -thousands), units[-thousands]))
-for s in choices:
- print('Test validate_string for: "%s"' % s)
- print_result_from_timeit('utils.validate_string(\'%s\')' % s, common_setup, number=iterations)
-
-print('')
-
for s in mixed_strings + cirque_strings + choices:
print('Test full_process for: "%s"' % s)
print_result_from_timeit('utils.full_process(u\'%s\')' % s,
- common_setup + basic_setup, number=iterations)
+ common_setup, number=iterations)
# benchmarking the core matching methods...
@@ -72,31 +65,31 @@ def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
print('Test fuzz.ratio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.ratio(u\'cirque du soleil\', u\'%s\')' % s,
- common_setup + basic_setup, number=iterations / 100)
+ common_setup, number=iterations / 100)
for s in cirque_strings:
print('Test fuzz.partial_ratio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.partial_ratio(u\'cirque du soleil\', u\'%s\')'
- % s, common_setup + basic_setup, number=iterations / 100)
+ % s, common_setup, number=iterations / 100)
for s in cirque_strings:
print('Test fuzz.WRatio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.WRatio(u\'cirque du soleil\', u\'%s\')' % s,
- common_setup + basic_setup, number=iterations / 100)
+ common_setup, number=iterations / 100)
print('Test process.extract(scorer = fuzz.QRatio) for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('process.extract(u\'cirque du soleil\', choices, scorer = fuzz.QRatio)',
- common_setup + basic_setup + " from thefuzz import process; import string,random; random.seed(18);"
+ common_setup + " from thefuzz import process; import string,random; random.seed(18);"
" choices = [\'\'.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(30)) for s in range(5000)]",
number=10)
print('Test process.extract(scorer = fuzz.WRatio) for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('process.extract(u\'cirque du soleil\', choices, scorer = fuzz.WRatio)',
- common_setup + basic_setup + " from thefuzz import process; import string,random; random.seed(18);"
+ common_setup + " from thefuzz import process; import string,random; random.seed(18);"
" choices = [\'\'.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(30)) for s in range(5000)]",
number=10)
@@ -114,6 +107,4 @@ def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
print('-------------------------------')
test += 'prepared_ratio = functools.partial(fuzz.ratio, "%s")\n' % s
test += 'titles.sort(key=prepared_ratio)\n'
-print_result_from_timeit(test,
- common_setup + basic_setup,
- number=100)
+print_result_from_timeit(test, common_setup, number=100)
diff --git a/setup.py b/setup.py
index a4e94bb..a3b0e1d 100644
--- a/setup.py
+++ b/setup.py
@@ -5,17 +5,10 @@
# This file is part of thefuzz.
from thefuzz import __version__
-import os
-
-try:
- from setuptools import setup
-except ImportError:
- from distutils.core import setup
-
-
-def open_file(fname):
- return open(os.path.join(os.path.dirname(__file__), fname))
+from setuptools import setup
+with open('README.rst') as f:
+ long_description = f.read()
setup(
name='thefuzz',
@@ -23,7 +16,9 @@ def open_file(fname):
author='Adam Cohen',
author_email='adam@seatgeek.com',
packages=['thefuzz'],
- extras_require={'speedup': ['python-levenshtein>=0.12']},
+ # keep for backwards compatibility of projects depending on `thefuzz[speedup]`
+ extras_require={'speedup': []},
+    install_requires=['rapidfuzz>=3.0.0, < 4.0.0'],
url='https://github.com/seatgeek/thefuzz',
license="GPLv2",
classifiers=[
@@ -39,6 +34,7 @@ def open_file(fname):
'Programming Language :: Python :: 3 :: Only',
],
description='Fuzzy string matching in python',
- long_description=open_file('README.rst').read(),
+ long_description=long_description,
zip_safe=True,
+ python_requires='>=3.7'
)
diff --git a/test_thefuzz.py b/test_thefuzz.py
index f0e3bb3..40d3940 100644
--- a/test_thefuzz.py
+++ b/test_thefuzz.py
@@ -5,15 +5,26 @@
from thefuzz import fuzz
from thefuzz import process
from thefuzz import utils
-from thefuzz.string_processing import StringProcessor
+scorers = [
+ fuzz.ratio,
+ fuzz.partial_ratio,
+ fuzz.token_sort_ratio,
+ fuzz.token_set_ratio,
+ fuzz.partial_token_sort_ratio,
+ fuzz.partial_token_set_ratio,
+ fuzz.QRatio,
+ fuzz.UQRatio,
+ fuzz.WRatio,
+ fuzz.UWRatio,
+]
class StringProcessingTest(unittest.TestCase):
def test_replace_non_letters_non_numbers_with_whitespace(self):
strings = ["new york mets - atlanta braves", "Cães danados",
"New York //// Mets $$$", "Ça va?"]
for string in strings:
- proc_string = StringProcessor.replace_non_letters_non_numbers_with_whitespace(string)
+ proc_string = utils.full_process(string)
regex = re.compile(r"(?ui)[\W]")
for expr in regex.finditer(proc_string):
self.assertEqual(expr.group(), " ")
@@ -21,9 +32,13 @@ def test_replace_non_letters_non_numbers_with_whitespace(self):
def test_dont_condense_whitespace(self):
s1 = "new york mets - atlanta braves"
s2 = "new york mets atlanta braves"
- p1 = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s1)
- p2 = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s2)
- self.assertNotEqual(p1, p2)
+ s3 = "new york mets atlanta braves"
+ p1 = utils.full_process(s1)
+ p2 = utils.full_process(s2)
+ p3 = utils.full_process(s3)
+ self.assertEqual(p1, s3)
+ self.assertEqual(p2, s2)
+ self.assertEqual(p3, s3)
class UtilsTest(unittest.TestCase):
@@ -120,7 +135,8 @@ def testPartialTokenSortRatio(self):
self.assertEqual(fuzz.partial_token_sort_ratio(self.s8, self.s8a, full_process=False), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s9, self.s9a, full_process=True), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s9, self.s9a, full_process=False), 100)
- self.assertEqual(fuzz.partial_token_sort_ratio(self.s10, self.s10a, full_process=False), 50)
+ self.assertEqual(fuzz.partial_token_sort_ratio(self.s10, self.s10a, full_process=False), 67)
+ self.assertEqual(fuzz.partial_token_sort_ratio(self.s10a, self.s10, full_process=False), 67)
def testTokenSetRatio(self):
self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5), 100)
@@ -243,58 +259,44 @@ def testQRatioForceAscii(self):
score = fuzz.WRatio(s1, s2, force_ascii=False)
self.assertLess(score, 100)
- def testTokenSetForceAscii(self):
+ def testPartialTokenSetRatioForceAscii(self):
s1 = "ABCD\u00C1 HELP\u00C1"
s2 = "ABCD HELP"
- score = fuzz._token_set(s1, s2, force_ascii=True)
+ score = fuzz.partial_token_set_ratio(s1, s2, force_ascii=True)
self.assertEqual(score, 100)
- score = fuzz._token_set(s1, s2, force_ascii=False)
+ score = fuzz.partial_token_set_ratio(s1, s2, force_ascii=False)
self.assertLess(score, 100)
- def testTokenSortForceAscii(self):
+ def testPartialTokenSortRatioForceAscii(self):
s1 = "ABCD\u00C1 HELP\u00C1"
s2 = "ABCD HELP"
- score = fuzz._token_sort(s1, s2, force_ascii=True)
+ score = fuzz.partial_token_sort_ratio(s1, s2, force_ascii=True)
self.assertEqual(score, 100)
- score = fuzz._token_sort(s1, s2, force_ascii=False)
+ score = fuzz.partial_token_sort_ratio(s1, s2, force_ascii=False)
self.assertLess(score, 100)
-
-class ValidatorTest(unittest.TestCase):
- def setUp(self):
- self.testFunc = lambda *args, **kwargs: (args, kwargs)
-
def testCheckForNone(self):
- invalid_input = [
- (None, None),
- ('Some', None),
- (None, 'Some')
- ]
- decorated_func = utils.check_for_none(self.testFunc)
- for i in invalid_input:
- self.assertEqual(decorated_func(*i), 0)
+ for scorer in scorers:
+ self.assertEqual(scorer(None, None), 0)
+ self.assertEqual(scorer('Some', None), 0)
+ self.assertEqual(scorer(None, 'Some'), 0)
- valid_input = ('Some', 'Some')
- actual = decorated_func(*valid_input)
- self.assertNotEqual(actual, 0)
+ self.assertNotEqual(scorer('Some', 'Some'), 0)
def testCheckEmptyString(self):
- invalid_input = [
- ('', ''),
- ('Some', ''),
- ('', 'Some')
- ]
- decorated_func = utils.check_empty_string(self.testFunc)
- for i in invalid_input:
- self.assertEqual(decorated_func(*i), 0)
-
- valid_input = ('Some', 'Some')
- actual = decorated_func(*valid_input)
- self.assertNotEqual(actual, 0)
+ for scorer in scorers:
+ if scorer in {fuzz.token_set_ratio, fuzz.partial_token_set_ratio, fuzz.WRatio, fuzz.UWRatio, fuzz.QRatio, fuzz.UQRatio}:
+ self.assertEqual(scorer('', ''), 0)
+ else:
+ self.assertEqual(scorer('', ''), 100)
+
+ self.assertEqual(scorer('Some', ''), 0)
+ self.assertEqual(scorer('', 'Some'), 0)
+ self.assertNotEqual(scorer('Some', 'Some'), 0)
class ProcessTest(unittest.TestCase):
diff --git a/test_thefuzz_hypothesis.py b/test_thefuzz_hypothesis.py
index a99a061..b4b2630 100644
--- a/test_thefuzz_hypothesis.py
+++ b/test_thefuzz_hypothesis.py
@@ -2,7 +2,7 @@
from functools import partial
from string import ascii_letters, digits, punctuation
-from hypothesis import given, assume, settings
+from hypothesis import given, assume, settings, HealthCheck
import hypothesis.strategies as st
import pytest
@@ -62,7 +62,7 @@ def full_scorers_processors():
@pytest.mark.parametrize('scorer,processor',
scorers_processors())
@given(data=st.data())
-@settings(max_examples=20, deadline=5000)
+@settings(max_examples=20, deadline=5000, suppress_health_check=[HealthCheck.data_too_large])
def test_identical_strings_extracted(scorer, processor, data):
"""
Test that identical strings will always return a perfect match.
diff --git a/thefuzz/StringMatcher.py b/thefuzz/StringMatcher.py
deleted file mode 100644
index bcf88d9..0000000
--- a/thefuzz/StringMatcher.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/env python
-"""
-StringMatcher.py
-
-ported from python-Levenshtein
-[https://github.com/miohtama/python-Levenshtein]
-License available here: https://github.com/miohtama/python-Levenshtein/blob/master/COPYING
-"""
-
-from Levenshtein import *
-from warnings import warn
-
-
-class StringMatcher:
- """A SequenceMatcher-like class built on the top of Levenshtein"""
-
- def _reset_cache(self):
- self._ratio = self._distance = None
- self._opcodes = self._editops = self._matching_blocks = None
-
- def __init__(self, isjunk=None, seq1='', seq2=''):
- if isjunk:
- warn("isjunk not NOT implemented, it will be ignored")
- self._str1, self._str2 = seq1, seq2
- self._reset_cache()
-
- def set_seqs(self, seq1, seq2):
- self._str1, self._str2 = seq1, seq2
- self._reset_cache()
-
- def set_seq1(self, seq1):
- self._str1 = seq1
- self._reset_cache()
-
- def set_seq2(self, seq2):
- self._str2 = seq2
- self._reset_cache()
-
- def get_opcodes(self):
- if not self._opcodes:
- if self._editops:
- self._opcodes = opcodes(self._editops, self._str1, self._str2)
- else:
- self._opcodes = opcodes(self._str1, self._str2)
- return self._opcodes
-
- def get_editops(self):
- if not self._editops:
- if self._opcodes:
- self._editops = editops(self._opcodes, self._str1, self._str2)
- else:
- self._editops = editops(self._str1, self._str2)
- return self._editops
-
- def get_matching_blocks(self):
- if not self._matching_blocks:
- self._matching_blocks = matching_blocks(self.get_opcodes(),
- self._str1, self._str2)
- return self._matching_blocks
-
- def ratio(self):
- if not self._ratio:
- self._ratio = ratio(self._str1, self._str2)
- return self._ratio
-
- def quick_ratio(self):
- # This is usually quick enough :o)
- if not self._ratio:
- self._ratio = ratio(self._str1, self._str2)
- return self._ratio
-
- def real_quick_ratio(self):
- len1, len2 = len(self._str1), len(self._str2)
- return 2.0 * min(len1, len2) / (len1 + len2)
-
- def distance(self):
- if not self._distance:
- self._distance = distance(self._str1, self._str2)
- return self._distance
diff --git a/thefuzz/StringMatcher.pyi b/thefuzz/StringMatcher.pyi
deleted file mode 100644
index b0b2286..0000000
--- a/thefuzz/StringMatcher.pyi
+++ /dev/null
@@ -1,26 +0,0 @@
-from typing import Optional, Tuple, List
-
-OpcodeT = Tuple[str, int, int, int, int]
-EditOpcodeT = Tuple[str, int, int]
-MatchingBlocksT = List[Tuple[int, int, int]]
-
-
-class StringMatcher:
- def _reset_cache(self) -> None:
- self._ratio: Optional[float] = None
- self._distance: Optional[int] = None
- self._opcodes: Optional[OpcodeT] = None
- self._editops: Optional[EditOpcodeT] = None
- self._matching_blocks: Optional[MatchingBlocksT] = None
-
- def __init__(self, isjunk: Optional[bool] = ..., seq1: str = ..., seq2: str = ...) -> None: ...
- def set_seqs(self, seq1: str, seq2: str) -> None: ...
- def set_seq1(self, seq1: str) -> None: ...
- def set_seq2(self, seq2: str) -> None: ...
- def get_opcodes(self) -> OpcodeT: ...
- def get_editops(self) -> EditOpcodeT: ...
- def get_matching_blocks(self) -> MatchingBlocksT: ...
- def ratio(self) -> float: ...
- def quick_ratio(self) -> float: ...
- def real_quick_ratio(self) -> float: ...
- def distance(self) -> int: ...
diff --git a/thefuzz/fuzz.py b/thefuzz/fuzz.py
index 36cbbc3..7d86e48 100644
--- a/thefuzz/fuzz.py
+++ b/thefuzz/fuzz.py
@@ -1,174 +1,83 @@
#!/usr/bin/env python
-import platform
-import warnings
-try:
- from .StringMatcher import StringMatcher as SequenceMatcher
-except ImportError:
- if platform.python_implementation() != "PyPy":
- warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
- from difflib import SequenceMatcher
+from rapidfuzz.fuzz import (
+ ratio as _ratio,
+ partial_ratio as _partial_ratio,
+ token_set_ratio as _token_set_ratio,
+ token_sort_ratio as _token_sort_ratio,
+ partial_token_set_ratio as _partial_token_set_ratio,
+ partial_token_sort_ratio as _partial_token_sort_ratio,
+ WRatio as _WRatio,
+ QRatio as _QRatio,
+)
from . import utils
-
###########################
# Basic Scoring Functions #
###########################
-@utils.check_for_none
-@utils.check_for_equivalence
-@utils.check_empty_string
-def ratio(s1, s2):
- s1, s2 = utils.make_type_consistent(s1, s2)
- m = SequenceMatcher(None, s1, s2)
- return utils.intr(100 * m.ratio())
+def _rapidfuzz_scorer(scorer, s1, s2, force_ascii, full_process):
+ """
+ wrapper around rapidfuzz function to be compatible with the API of thefuzz
+ """
+ if full_process:
+ if s1 is None or s2 is None:
+ return 0
+
+ s1 = utils.full_process(s1, force_ascii=force_ascii)
+ s2 = utils.full_process(s2, force_ascii=force_ascii)
+
+ return int(round(scorer(s1, s2)))
+
+
+def ratio(s1, s2):
+ return _rapidfuzz_scorer(_ratio, s1, s2, False, False)
-@utils.check_for_none
-@utils.check_for_equivalence
-@utils.check_empty_string
def partial_ratio(s1, s2):
- """"Return the ratio of the most similar substring
- as a number between 0 and 100."""
- s1, s2 = utils.make_type_consistent(s1, s2)
-
- if len(s1) <= len(s2):
- shorter = s1
- longer = s2
- else:
- shorter = s2
- longer = s1
-
- m = SequenceMatcher(None, shorter, longer)
- blocks = m.get_matching_blocks()
-
- # each block represents a sequence of matching characters in a string
- # of the form (idx_1, idx_2, len)
- # the best partial match will block align with at least one of those blocks
- # e.g. shorter = "abcd", longer = XXXbcdeEEE
- # block = (1,3,3)
- # best score === ratio("abcd", "Xbcd")
- scores = []
- for block in blocks:
- long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
- long_end = long_start + len(shorter)
- long_substr = longer[long_start:long_end]
-
- m2 = SequenceMatcher(None, shorter, long_substr)
- r = m2.ratio()
- if r > .995:
- return 100
- else:
- scores.append(r)
-
- return utils.intr(100 * max(scores))
+ """
+ Return the ratio of the most similar substring
+ as a number between 0 and 100.
+ """
+ return _rapidfuzz_scorer(_partial_ratio, s1, s2, False, False)
##############################
# Advanced Scoring Functions #
##############################
-def _process_and_sort(s, force_ascii, full_process=True):
- """Return a cleaned string with token sorted."""
- # pull tokens
- ts = utils.full_process(s, force_ascii=force_ascii) if full_process else s
- tokens = ts.split()
-
- # sort tokens and join
- sorted_string = " ".join(sorted(tokens))
- return sorted_string.strip()
-
-
# Sorted Token
# find all alphanumeric tokens in the string
# sort those tokens and take ratio of resulting joined strings
# controls for unordered string elements
-@utils.check_for_none
-def _token_sort(s1, s2, partial=True, force_ascii=True, full_process=True):
- sorted1 = _process_and_sort(s1, force_ascii, full_process=full_process)
- sorted2 = _process_and_sort(s2, force_ascii, full_process=full_process)
-
- if partial:
- return partial_ratio(sorted1, sorted2)
- else:
- return ratio(sorted1, sorted2)
-
-
def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
- """Return a measure of the sequences' similarity between 0 and 100
+ """
+ Return a measure of the sequences' similarity between 0 and 100
but sorting the token before comparing.
"""
- return _token_sort(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)
+ return _rapidfuzz_scorer(_token_sort_ratio, s1, s2, force_ascii, full_process)
def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
- """Return the ratio of the most similar substring as a number between
+ """
+ Return the ratio of the most similar substring as a number between
0 and 100 but sorting the token before comparing.
"""
- return _token_sort(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)
-
-
-@utils.check_for_none
-def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
- """Find all alphanumeric tokens in each string...
- - treat them as a set
- - construct two strings of the form:
-
- - take ratios of those two strings
- - controls for unordered partial matches"""
-
- if not full_process and s1 == s2:
- return 100
-
- p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
- p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2
-
- if not utils.validate_string(p1):
- return 0
- if not utils.validate_string(p2):
- return 0
-
- # pull tokens
- tokens1 = set(p1.split())
- tokens2 = set(p2.split())
-
- intersection = tokens1.intersection(tokens2)
- diff1to2 = tokens1.difference(tokens2)
- diff2to1 = tokens2.difference(tokens1)
-
- sorted_sect = " ".join(sorted(intersection))
- sorted_1to2 = " ".join(sorted(diff1to2))
- sorted_2to1 = " ".join(sorted(diff2to1))
-
- combined_1to2 = sorted_sect + " " + sorted_1to2
- combined_2to1 = sorted_sect + " " + sorted_2to1
-
- # strip
- sorted_sect = sorted_sect.strip()
- combined_1to2 = combined_1to2.strip()
- combined_2to1 = combined_2to1.strip()
-
- if partial:
- ratio_func = partial_ratio
- else:
- ratio_func = ratio
-
- pairwise = [
- ratio_func(sorted_sect, combined_1to2),
- ratio_func(sorted_sect, combined_2to1),
- ratio_func(combined_1to2, combined_2to1)
- ]
- return max(pairwise)
+ return _rapidfuzz_scorer(
+ _partial_token_sort_ratio, s1, s2, force_ascii, full_process
+ )
def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
- return _token_set(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)
+ return _rapidfuzz_scorer(_token_set_ratio, s1, s2, force_ascii, full_process)
def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
- return _token_set(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)
+ return _rapidfuzz_scorer(
+ _partial_token_set_ratio, s1, s2, force_ascii, full_process
+ )
###################
@@ -189,20 +98,7 @@ def QRatio(s1, s2, force_ascii=True, full_process=True):
:full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
:return: similarity ratio
"""
-
- if full_process:
- p1 = utils.full_process(s1, force_ascii=force_ascii)
- p2 = utils.full_process(s2, force_ascii=force_ascii)
- else:
- p1 = s1
- p2 = s2
-
- if not utils.validate_string(p1):
- return 0
- if not utils.validate_string(p2):
- return 0
-
- return ratio(p1, p2)
+ return _rapidfuzz_scorer(_QRatio, s1, s2, force_ascii, full_process)
def UQRatio(s1, s2, full_process=True):
@@ -253,52 +149,12 @@ def WRatio(s1, s2, force_ascii=True, full_process=True):
:full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
:return:
"""
-
- if full_process:
- p1 = utils.full_process(s1, force_ascii=force_ascii)
- p2 = utils.full_process(s2, force_ascii=force_ascii)
- else:
- p1 = s1
- p2 = s2
-
- if not utils.validate_string(p1):
- return 0
- if not utils.validate_string(p2):
- return 0
-
- # should we look at partials?
- try_partial = True
- unbase_scale = .95
- partial_scale = .90
-
- base = ratio(p1, p2)
- len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))
-
- # if strings are similar length, don't use partials
- if len_ratio < 1.5:
- try_partial = False
-
- # if one string is much much shorter than the other
- if len_ratio > 8:
- partial_scale = .6
-
- if try_partial:
- partial = partial_ratio(p1, p2) * partial_scale
- ptsor = partial_token_sort_ratio(p1, p2, full_process=False) \
- * unbase_scale * partial_scale
- ptser = partial_token_set_ratio(p1, p2, full_process=False) \
- * unbase_scale * partial_scale
-
- return utils.intr(max(base, partial, ptsor, ptser))
- else:
- tsor = token_sort_ratio(p1, p2, full_process=False) * unbase_scale
- tser = token_set_ratio(p1, p2, full_process=False) * unbase_scale
-
- return utils.intr(max(base, tsor, tser))
+ return _rapidfuzz_scorer(_WRatio, s1, s2, force_ascii, full_process)
def UWRatio(s1, s2, full_process=True):
- """Return a measure of the sequences' similarity between 0 and 100,
+ """
+ Return a measure of the sequences' similarity between 0 and 100,
using different algorithms. Same as WRatio but preserving unicode.
"""
return WRatio(s1, s2, force_ascii=False, full_process=full_process)
diff --git a/thefuzz/fuzz.pyi b/thefuzz/fuzz.pyi
index 6dcc6d1..86916a2 100644
--- a/thefuzz/fuzz.pyi
+++ b/thefuzz/fuzz.pyi
@@ -1,10 +1,7 @@
def ratio(s1: str, s2: str) -> int: ...
def partial_ratio(s1: str, s2: str) -> int: ...
-def _process_and_sort(s: str, force_ascii: bool, full_process: bool = ...) -> str: ...
-def _token_sort(s1: str, s2: str, partial: bool = ..., force_ascii: bool = ..., full_process: bool = ...) -> int: ...
def token_sort_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
def partial_token_sort_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
-def _token_set(s1: str, s2: str, partial: bool = ..., force_ascii: bool = ..., full_process: bool = ...) -> int: ...
def token_set_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
def partial_token_set_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
def QRatio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
diff --git a/thefuzz/process.py b/thefuzz/process.py
index 89ffa34..f6b15ea 100644
--- a/thefuzz/process.py
+++ b/thefuzz/process.py
@@ -1,22 +1,81 @@
#!/usr/bin/env python
from . import fuzz
from . import utils
-import heapq
import logging
-from functools import partial
-
+from rapidfuzz import fuzz as rfuzz
+from rapidfuzz import process as rprocess
_logger = logging.getLogger(__name__)
-
default_scorer = fuzz.WRatio
+default_processor = utils.full_process
-default_processor = utils.full_process
+def _get_processor(processor, scorer):
+ """
+ thefuzz runs both the default preprocessing of the function and the preprocessing
+ function passed into process.* while rapidfuzz only runs the one passed into
+ process.*. This function wraps the processor to mimic this behavior
+ """
+ if scorer not in (fuzz.WRatio, fuzz.QRatio,
+ fuzz.token_set_ratio, fuzz.token_sort_ratio,
+ fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
+ fuzz.UWRatio, fuzz.UQRatio):
+ return processor
+
+ if not processor:
+ return utils.full_process
+
+ def wrapper(s):
+ return utils.full_process(processor(s))
+
+ return wrapper
+
+
+# this allows lowering the scorers back to the scorers used in rapidfuzz
+# this allows rapidfuzz to perform more optimizations behind the scenes.
+# These mapped scorers are the same with two exceptions
+# - default processor
+# - result is not rounded
+# these two exceptions need to be taken into account in the implementation
+_scorer_lowering = {
+ fuzz.ratio: rfuzz.ratio,
+ fuzz.partial_ratio: rfuzz.partial_ratio,
+ fuzz.token_set_ratio: rfuzz.token_set_ratio,
+ fuzz.token_sort_ratio: rfuzz.token_sort_ratio,
+ fuzz.partial_token_set_ratio: rfuzz.partial_token_set_ratio,
+ fuzz.partial_token_sort_ratio: rfuzz.partial_token_sort_ratio,
+ fuzz.WRatio: rfuzz.WRatio,
+ fuzz.QRatio: rfuzz.QRatio,
+ fuzz.UWRatio: rfuzz.WRatio,
+ fuzz.UQRatio: rfuzz.QRatio,
+}
+
+
+def _get_scorer(scorer):
+ """
+ rapidfuzz scorers require the score_cutoff argument to be available
+ This generates a compatible wrapper function
+ """
+ def wrapper(s1, s2, score_cutoff=0):
+ return scorer(s1, s2)
+
+ return _scorer_lowering.get(scorer, wrapper)
+
+
+def _preprocess_query(query, processor):
+ processed_query = processor(query) if processor else query
+ if len(processed_query) == 0:
+ _logger.warning("Applied processor reduces input query to empty string, "
+ "all comparisons will have score 0. "
+ f"[Query: \'{query}\']")
+
+ return processed_query
def extractWithoutOrder(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
- """Select the best match in a list or dictionary of choices.
+ """
+ Select the best match in a list or dictionary of choices.
Find best matches in a list or dictionary of choices, return a
generator of tuples containing the match and its score. If a dictionary
@@ -61,68 +120,27 @@ def extractWithoutOrder(query, choices, processor=default_processor, scorer=defa
('train', 22, 'bard'), ('man', 0, 'dog')
"""
- # Catch generators without lengths
- def no_process(x):
- return x
+ is_mapping = hasattr(choices, "items")
+ is_lowered = scorer in _scorer_lowering
- try:
- if choices is None or len(choices) == 0:
- return
- except TypeError:
- pass
+ query = _preprocess_query(query, processor)
+ it = rprocess.extract_iter(
+ query, choices,
+ processor=_get_processor(processor, scorer),
+ scorer=_get_scorer(scorer),
+ score_cutoff=score_cutoff
+ )
- # If the processor was removed by setting it to None
- # perform a noop as it still needs to be a function
- if processor is None:
- processor = no_process
+ for choice, score, key in it:
+ if is_lowered:
+ score = int(round(score))
- # Run the processor on the input query.
- processed_query = processor(query)
-
- if len(processed_query) == 0:
- _logger.warning("Applied processor reduces input query to empty string, "
- "all comparisons will have score 0. "
- f"[Query: \'{query}\']")
-
- # Don't run full_process twice
- if scorer in [fuzz.WRatio, fuzz.QRatio,
- fuzz.token_set_ratio, fuzz.token_sort_ratio,
- fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
- fuzz.UWRatio, fuzz.UQRatio] \
- and processor == utils.full_process:
- processor = no_process
-
- # Only process the query once instead of for every choice
- if scorer in [fuzz.UWRatio, fuzz.UQRatio]:
- pre_processor = partial(utils.full_process, force_ascii=False)
- scorer = partial(scorer, full_process=False)
- elif scorer in [fuzz.WRatio, fuzz.QRatio,
- fuzz.token_set_ratio, fuzz.token_sort_ratio,
- fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio]:
- pre_processor = partial(utils.full_process, force_ascii=True)
- scorer = partial(scorer, full_process=False)
- else:
- pre_processor = no_process
- processed_query = pre_processor(processed_query)
-
- try:
- # See if choices is a dictionary-like object.
- for key, choice in choices.items():
- processed = pre_processor(processor(choice))
- score = scorer(processed_query, processed)
- if score >= score_cutoff:
- yield (choice, score, key)
- except AttributeError:
- # It's a list; just iterate over it.
- for choice in choices:
- processed = pre_processor(processor(choice))
- score = scorer(processed_query, processed)
- if score >= score_cutoff:
- yield (choice, score)
+ yield (choice, score, key) if is_mapping else (choice, score)
def extract(query, choices, processor=default_processor, scorer=default_scorer, limit=5):
- """Select the best match in a list or dictionary of choices.
+ """
+ Select the best match in a list or dictionary of choices.
Find best matches in a list or dictionary of choices, return a
list of tuples containing the match and its score. If a dictionary
@@ -166,13 +184,12 @@ def extract(query, choices, processor=default_processor, scorer=default_scorer,
[('train', 22, 'bard'), ('man', 0, 'dog')]
"""
- sl = extractWithoutOrder(query, choices, processor, scorer)
- return heapq.nlargest(limit, sl, key=lambda i: i[1]) if limit is not None else \
- sorted(sl, key=lambda i: i[1], reverse=True)
+ return extractBests(query, choices, processor=processor, scorer=scorer, limit=limit)
def extractBests(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0, limit=5):
- """Get a list of the best matches to a collection of choices.
+ """
+ Get a list of the best matches to a collection of choices.
Convenience function for getting the choices with best scores.
@@ -190,14 +207,30 @@ def extractBests(query, choices, processor=default_processor, scorer=default_sco
Returns: A a list of (match, score) tuples.
"""
+ is_mapping = hasattr(choices, "items")
+ is_lowered = scorer in _scorer_lowering
- best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
- return heapq.nlargest(limit, best_list, key=lambda i: i[1]) if limit is not None else \
- sorted(best_list, key=lambda i: i[1], reverse=True)
+ query = _preprocess_query(query, processor)
+ results = rprocess.extract(
+ query, choices,
+ processor=_get_processor(processor, scorer),
+ scorer=_get_scorer(scorer),
+ score_cutoff=score_cutoff,
+ limit=limit
+ )
+
+ for i, (choice, score, key) in enumerate(results):
+ if is_lowered:
+ score = int(round(score))
+
+ results[i] = (choice, score, key) if is_mapping else (choice, score)
+
+ return results
def extractOne(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
- """Find the single best match above a score in a list of choices.
+ """
+ Find the single best match above a score in a list of choices.
This is a convenience method which returns the single best choice.
See extract() for the full arguments list.
@@ -217,22 +250,38 @@ def extractOne(query, choices, processor=default_processor, scorer=default_score
A tuple containing a single match and its score, if a match
was found that was above score_cutoff. Otherwise, returns None.
"""
- best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
- try:
- return max(best_list, key=lambda i: i[1])
- except ValueError:
- return None
+ is_mapping = hasattr(choices, "items")
+ is_lowered = scorer in _scorer_lowering
+
+ query = _preprocess_query(query, processor)
+ res = rprocess.extractOne(
+ query, choices,
+ processor=_get_processor(processor, scorer),
+ scorer=_get_scorer(scorer),
+ score_cutoff=score_cutoff
+ )
+
+ if res is None:
+ return res
+
+ choice, score, key = res
+
+ if is_lowered:
+ score = int(round(score))
+
+ return (choice, score, key) if is_mapping else (choice, score)
def dedupe(contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
- """This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
- and remove duplicates. Specifically, it uses the process.extract to identify duplicates that
+ """
+ This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
+ and remove duplicates. Specifically, it uses process.extract to identify duplicates that
score greater than a user defined threshold. Then, it looks for the longest item in the duplicate list
since we assume this item contains the most entity information and returns that. It breaks string
length ties on an alphabetical sort.
Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the
- returned deduplicated list will likely be shorter. Raise the threshold for fuzzy_dedupe to be less
+ returned deduplicated list will likely be shorter. Raise the threshold for dedupe to be less
sensitive.
Args:
@@ -249,39 +298,12 @@ def dedupe(contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
A deduplicated list. For example:
In: contains_dupes = ['Frodo Baggin', 'Frodo Baggins', 'F. Baggins', 'Samwise G.', 'Gandalf', 'Bilbo Baggins']
- In: fuzzy_dedupe(contains_dupes)
+ In: dedupe(contains_dupes)
Out: ['Frodo Baggins', 'Samwise G.', 'Bilbo Baggins', 'Gandalf']
- """
-
- extractor = []
-
- # iterate over items in *contains_dupes*
+ """
+ deduped = set()
for item in contains_dupes:
- # return all duplicate matches found
- matches = extract(item, contains_dupes, limit=None, scorer=scorer)
- # filter matches based on the threshold
- filtered = [x for x in matches if x[1] > threshold]
- # if there is only 1 item in *filtered*, no duplicates were found so append to *extracted*
- if len(filtered) == 1:
- extractor.append(filtered[0][0])
-
- else:
- # alpha sort
- filtered = sorted(filtered, key=lambda x: x[0])
- # length sort
- filter_sort = sorted(filtered, key=lambda x: len(x[0]), reverse=True)
- # take first item as our 'canonical example'
- extractor.append(filter_sort[0][0])
-
- # uniquify *extractor* list
- keys = {}
- for e in extractor:
- keys[e] = 1
- extractor = keys.keys()
-
- # check that extractor differs from contain_dupes (e.g. duplicates were found)
- # if not, then return the original list
- if len(extractor) == len(contains_dupes):
- return contains_dupes
- else:
- return extractor
+ matches = extractBests(item, contains_dupes, scorer=scorer, score_cutoff=threshold, limit=None)
+ deduped.add(max(matches, key=lambda x: (len(x[0]), x[0]))[0])
+
+ return list(deduped) if len(deduped) != len(contains_dupes) else contains_dupes
diff --git a/thefuzz/string_processing.py b/thefuzz/string_processing.py
deleted file mode 100644
index 0d8ac29..0000000
--- a/thefuzz/string_processing.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import re
-
-
-class StringProcessor:
- """
- This class defines method to process strings in the most
- efficient way. Ideally all the methods below use unicode strings
- for both input and output.
- """
-
- regex = re.compile(r"(?ui)\W")
-
- @classmethod
- def replace_non_letters_non_numbers_with_whitespace(cls, a_string):
- """
- This function replaces any sequence of non letters and non
- numbers with a single white space.
- """
- return cls.regex.sub(" ", a_string)
diff --git a/thefuzz/string_processing.pyi b/thefuzz/string_processing.pyi
deleted file mode 100644
index 6733043..0000000
--- a/thefuzz/string_processing.pyi
+++ /dev/null
@@ -1,3 +0,0 @@
-class StringProcessor(object):
- @classmethod
- def replace_non_letters_non_numbers_with_whitespace(cls, a_string: str) -> str: ...
diff --git a/thefuzz/utils.py b/thefuzz/utils.py
index 1caf781..bcda741 100644
--- a/thefuzz/utils.py
+++ b/thefuzz/utils.py
@@ -1,81 +1,22 @@
-import functools
+from rapidfuzz.utils import default_process as _default_process
-from thefuzz.string_processing import StringProcessor
-
-
-def validate_string(s):
- """
- Check input has length and that length > 0
-
- :param s:
- :return: True if len(s) > 0 else False
- """
- try:
- return len(s) > 0
- except TypeError:
- return False
-
-
-def check_for_equivalence(func):
- @functools.wraps(func)
- def decorator(*args, **kwargs):
- if args[0] == args[1]:
- return 100
- return func(*args, **kwargs)
- return decorator
-
-
-def check_for_none(func):
- @functools.wraps(func)
- def decorator(*args, **kwargs):
- if args[0] is None or args[1] is None:
- return 0
- return func(*args, **kwargs)
- return decorator
-
-
-def check_empty_string(func):
- @functools.wraps(func)
- def decorator(*args, **kwargs):
- if len(args[0]) == 0 or len(args[1]) == 0:
- return 0
- return func(*args, **kwargs)
- return decorator
-
-
-bad_chars = "".join([chr(i) for i in range(128, 256)]) # ascii dammit!
-translation_table = {ord(c): None for c in bad_chars}
+translation_table = {i: None for i in range(128, 256)} # ascii dammit!
def ascii_only(s):
return s.translate(translation_table)
-def make_type_consistent(s1, s2):
- """If objects aren't both string instances force them to strings"""
- if isinstance(s1, str) and isinstance(s2, str):
- return s1, s2
-
- else:
- return str(s1), str(s2)
-
-
def full_process(s, force_ascii=False):
- """Process string by
- -- removing all but letters and numbers
- -- trim whitespace
- -- force to lower case
- if force_ascii == True, force convert to ascii"""
+ """
+ Process string by
+ -- removing all but letters and numbers
+ -- trim whitespace
+ -- force to lower case
+ if force_ascii == True, force convert to ascii
+ """
if force_ascii:
s = ascii_only(str(s))
- # Keep only Letters and Numbers (see Unicode docs).
- string_out = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)
- # Remove leading and trailing whitespaces and force into lowercase.
- string_out = string_out.strip().lower()
- return string_out
-
-def intr(n):
- '''Returns a correctly rounded integer'''
- return int(round(n))
+ return _default_process(s)
diff --git a/thefuzz/utils.pyi b/thefuzz/utils.pyi
index 45dc6dc..2c15b14 100644
--- a/thefuzz/utils.pyi
+++ b/thefuzz/utils.pyi
@@ -1,14 +1,3 @@
-from typing import Any, Tuple, Union, Callable, TypeVar
-TCallable = TypeVar("TCallable", bound=Callable[..., Any])
-
-
-def validate_string(s: str) -> bool: ...
-def check_for_equivalence(func: TCallable) -> TCallable: ...
-def check_for_none(func: TCallable) -> TCallable: ...
-def check_empty_string(func: TCallable) -> TCallable: ...
-def asciionly(s: str) -> str: ...
-def asciidammit(s: Union[str, bytes]) -> str: ...
-def make_type_consistent(s1: str, s2: str) -> Tuple[str, str]: ...
+def ascii_only(s: str) -> str: ...
def full_process(s: str, force_ascii: bool = ...) -> str: ...
-def intr(n: float) -> int: ...