Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace python-Levenshtein with rapidfuzz #10

Merged
merged 10 commits into from
Aug 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ jobs:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
test-cmd: [pytest]
include:
#- python-version: pyp-y3.8
# test-cmd: pytest test_thefuzz.py test_thefuzz_pytest.py
- python-version: "3.7"
test-cmd: python setup.py check --restructuredtext --strict --metadata
- python-version: "3.10"
Expand All @@ -26,7 +24,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
pip install pytest pycodestyle docutils Pygments hypothesis python-Levenshtein
pip install pytest pycodestyle docutils Pygments hypothesis

- name: Install project
run: pip install .

- name: Test with pytest
run: |
${{ matrix.test-cmd }}
run: ${{ matrix.test-cmd }}
12 changes: 2 additions & 10 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ Requirements
============

- Python 3.7 or higher
- difflib
- `python-Levenshtein <https://github.com/ztane/python-Levenshtein/>`_ (optional, provides a 4-10x speedup in String
Matching, though may result in `differing results for certain cases <https://github.com/seatgeek/fuzzywuzzy/issues/128>`_)
- `rapidfuzz <https://github.com/maxbachmann/RapidFuzz/>`_

For testing
~~~~~~~~~~~
Expand All @@ -29,12 +27,6 @@ Using PIP via PyPI

pip install thefuzz

or the following to install `python-Levenshtein` too

.. code:: bash

pip install thefuzz[speedup]


Using PIP via Github

Expand Down Expand Up @@ -110,7 +102,7 @@ Partial Token Sort Ratio
84
>>> fuzz.partial_token_sort_ratio("fuzzy was a bear", "wuzzy fuzzy was a bear")
100

Process
~~~~~~~

Expand Down
23 changes: 7 additions & 16 deletions benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
]

common_setup = "from thefuzz import fuzz, utils; "
basic_setup = "from thefuzz.string_processing import StringProcessor;"


def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
Expand All @@ -55,48 +54,42 @@ def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
duration, avg_duration * (1000 ** -thousands), units[-thousands]))


for s in choices:
print('Test validate_string for: "%s"' % s)
print_result_from_timeit('utils.validate_string(\'%s\')' % s, common_setup, number=iterations)

print('')

for s in mixed_strings + cirque_strings + choices:
print('Test full_process for: "%s"' % s)
print_result_from_timeit('utils.full_process(u\'%s\')' % s,
common_setup + basic_setup, number=iterations)
common_setup, number=iterations)

# benchmarking the core matching methods...

for s in cirque_strings:
print('Test fuzz.ratio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.ratio(u\'cirque du soleil\', u\'%s\')' % s,
common_setup + basic_setup, number=iterations / 100)
common_setup, number=iterations / 100)

for s in cirque_strings:
print('Test fuzz.partial_ratio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.partial_ratio(u\'cirque du soleil\', u\'%s\')'
% s, common_setup + basic_setup, number=iterations / 100)
% s, common_setup, number=iterations / 100)

for s in cirque_strings:
print('Test fuzz.WRatio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.WRatio(u\'cirque du soleil\', u\'%s\')' % s,
common_setup + basic_setup, number=iterations / 100)
common_setup, number=iterations / 100)

print('Test process.extract(scorer = fuzz.QRatio) for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('process.extract(u\'cirque du soleil\', choices, scorer = fuzz.QRatio)',
common_setup + basic_setup + " from thefuzz import process; import string,random; random.seed(18);"
common_setup + " from thefuzz import process; import string,random; random.seed(18);"
" choices = [\'\'.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(30)) for s in range(5000)]",
number=10)

print('Test process.extract(scorer = fuzz.WRatio) for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('process.extract(u\'cirque du soleil\', choices, scorer = fuzz.WRatio)',
common_setup + basic_setup + " from thefuzz import process; import string,random; random.seed(18);"
common_setup + " from thefuzz import process; import string,random; random.seed(18);"
" choices = [\'\'.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(30)) for s in range(5000)]",
number=10)

Expand All @@ -114,6 +107,4 @@ def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
print('-------------------------------')
test += 'prepared_ratio = functools.partial(fuzz.ratio, "%s")\n' % s
test += 'titles.sort(key=prepared_ratio)\n'
print_result_from_timeit(test,
common_setup + basic_setup,
number=100)
print_result_from_timeit(test, common_setup, number=100)
20 changes: 8 additions & 12 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,20 @@
# This file is part of thefuzz.

from thefuzz import __version__
import os

try:
from setuptools import setup
except ImportError:
from distutils.core import setup


def open_file(fname):
return open(os.path.join(os.path.dirname(__file__), fname))
from setuptools import setup

with open('README.rst') as f:
long_description = f.read()

setup(
name='thefuzz',
version=__version__,
author='Adam Cohen',
author_email='[email protected]',
packages=['thefuzz'],
extras_require={'speedup': ['python-levenshtein>=0.12']},
# keep for backwards compatibility of projects depending on `thefuzz[speedup]`
extras_require={'speedup': []},
install_requires= ['rapidfuzz>=3.0.0, < 4.0.0'],
url='https://github.com/seatgeek/thefuzz',
license="GPLv2",
maxbachmann marked this conversation as resolved.
Show resolved Hide resolved
classifiers=[
Expand All @@ -39,6 +34,7 @@ def open_file(fname):
'Programming Language :: Python :: 3 :: Only',
],
description='Fuzzy string matching in python',
long_description=open_file('README.rst').read(),
long_description=long_description,
zip_safe=True,
python_requires='>=3.7'
)
82 changes: 42 additions & 40 deletions test_thefuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,40 @@
from thefuzz import fuzz
from thefuzz import process
from thefuzz import utils
from thefuzz.string_processing import StringProcessor

scorers = [
fuzz.ratio,
fuzz.partial_ratio,
fuzz.token_sort_ratio,
fuzz.token_set_ratio,
fuzz.partial_token_sort_ratio,
fuzz.partial_token_set_ratio,
fuzz.QRatio,
fuzz.UQRatio,
fuzz.WRatio,
fuzz.UWRatio,
]

class StringProcessingTest(unittest.TestCase):
def test_replace_non_letters_non_numbers_with_whitespace(self):
strings = ["new york mets - atlanta braves", "Cães danados",
"New York //// Mets $$$", "Ça va?"]
for string in strings:
proc_string = StringProcessor.replace_non_letters_non_numbers_with_whitespace(string)
proc_string = utils.full_process(string)
regex = re.compile(r"(?ui)[\W]")
for expr in regex.finditer(proc_string):
self.assertEqual(expr.group(), " ")

def test_dont_condense_whitespace(self):
s1 = "new york mets - atlanta braves"
s2 = "new york mets atlanta braves"
p1 = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s1)
p2 = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s2)
self.assertNotEqual(p1, p2)
s3 = "new york mets atlanta braves"
p1 = utils.full_process(s1)
p2 = utils.full_process(s2)
p3 = utils.full_process(s3)
self.assertEqual(p1, s3)
self.assertEqual(p2, s2)
self.assertEqual(p3, s3)


class UtilsTest(unittest.TestCase):
Expand Down Expand Up @@ -120,7 +135,8 @@ def testPartialTokenSortRatio(self):
self.assertEqual(fuzz.partial_token_sort_ratio(self.s8, self.s8a, full_process=False), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s9, self.s9a, full_process=True), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s9, self.s9a, full_process=False), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s10, self.s10a, full_process=False), 50)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s10, self.s10a, full_process=False), 67)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s10a, self.s10, full_process=False), 67)
maxbachmann marked this conversation as resolved.
Show resolved Hide resolved
Comment on lines +138 to +139
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original implementation gave different results for these two tests (50 / 67), since it only allowed alignments behind the string, but not in front of it. This is fixed here.


def testTokenSetRatio(self):
self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5), 100)
Expand Down Expand Up @@ -243,58 +259,44 @@ def testQRatioForceAscii(self):
score = fuzz.WRatio(s1, s2, force_ascii=False)
self.assertLess(score, 100)

def testTokenSetForceAscii(self):
def testPartialTokenSetRatioForceAscii(self):
s1 = "ABCD\u00C1 HELP\u00C1"
s2 = "ABCD HELP"

score = fuzz._token_set(s1, s2, force_ascii=True)
score = fuzz.partial_token_set_ratio(s1, s2, force_ascii=True)
self.assertEqual(score, 100)

score = fuzz._token_set(s1, s2, force_ascii=False)
score = fuzz.partial_token_set_ratio(s1, s2, force_ascii=False)
self.assertLess(score, 100)

def testTokenSortForceAscii(self):
def testPartialTokenSortRatioForceAscii(self):
s1 = "ABCD\u00C1 HELP\u00C1"
s2 = "ABCD HELP"

score = fuzz._token_sort(s1, s2, force_ascii=True)
score = fuzz.partial_token_sort_ratio(s1, s2, force_ascii=True)
self.assertEqual(score, 100)

score = fuzz._token_sort(s1, s2, force_ascii=False)
score = fuzz.partial_token_sort_ratio(s1, s2, force_ascii=False)
self.assertLess(score, 100)


class ValidatorTest(unittest.TestCase):
maxbachmann marked this conversation as resolved.
Show resolved Hide resolved
def setUp(self):
self.testFunc = lambda *args, **kwargs: (args, kwargs)

def testCheckForNone(self):
invalid_input = [
(None, None),
('Some', None),
(None, 'Some')
]
decorated_func = utils.check_for_none(self.testFunc)
for i in invalid_input:
self.assertEqual(decorated_func(*i), 0)
for scorer in scorers:
self.assertEqual(scorer(None, None), 0)
self.assertEqual(scorer('Some', None), 0)
self.assertEqual(scorer(None, 'Some'), 0)

valid_input = ('Some', 'Some')
actual = decorated_func(*valid_input)
self.assertNotEqual(actual, 0)
self.assertNotEqual(scorer('Some', 'Some'), 0)

def testCheckEmptyString(self):
invalid_input = [
('', ''),
('Some', ''),
('', 'Some')
]
decorated_func = utils.check_empty_string(self.testFunc)
for i in invalid_input:
self.assertEqual(decorated_func(*i), 0)

valid_input = ('Some', 'Some')
actual = decorated_func(*valid_input)
self.assertNotEqual(actual, 0)
for scorer in scorers:
if scorer in {fuzz.token_set_ratio, fuzz.partial_token_set_ratio, fuzz.WRatio, fuzz.UWRatio, fuzz.QRatio, fuzz.UQRatio}:
self.assertEqual(scorer('', ''), 0)
else:
self.assertEqual(scorer('', ''), 100)

self.assertEqual(scorer('Some', ''), 0)
self.assertEqual(scorer('', 'Some'), 0)
self.assertNotEqual(scorer('Some', 'Some'), 0)


class ProcessTest(unittest.TestCase):
Expand Down
4 changes: 2 additions & 2 deletions test_thefuzz_hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import partial
from string import ascii_letters, digits, punctuation

from hypothesis import given, assume, settings
from hypothesis import given, assume, settings, HealthCheck
import hypothesis.strategies as st
import pytest

Expand Down Expand Up @@ -62,7 +62,7 @@ def full_scorers_processors():
@pytest.mark.parametrize('scorer,processor',
scorers_processors())
@given(data=st.data())
@settings(max_examples=20, deadline=5000)
@settings(max_examples=20, deadline=5000, suppress_health_check=[HealthCheck.data_too_large])
maxbachmann marked this conversation as resolved.
Show resolved Hide resolved
def test_identical_strings_extracted(scorer, processor, data):
"""
Test that identical strings will always return a perfect match.
Expand Down
Loading