diff --git a/README.md b/README.md index 2f07def..99eea3e 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,9 @@ Just use 'pip install cleanco' if you have pip installed (as most systems do). O ## How does it work? Let's look at some sample code. To get the base name of a business without legal suffix: - >>> from cleanco import prepare_terms, basename + >>> from cleanco import prepare_default_terms, basename >>> business_name = "Some Big Pharma, LLC" - >>> terms = prepare_terms() + >>> terms = prepare_default_terms() >>> basename(business_name, terms, prefix=False, middle=False, suffix=True) >>> 'Some Big Pharma' diff --git a/cleanco/__init__.py b/cleanco/__init__.py index e761487..8b2de14 100644 --- a/cleanco/__init__.py +++ b/cleanco/__init__.py @@ -1,3 +1,3 @@ from .cleanco import cleanco -from .clean import prepare_terms, basename +from .clean import prepare_default_terms, basename from .classify import typesources, countrysources, matches diff --git a/cleanco/clean.py b/cleanco/clean.py index 78c510f..0454c1c 100644 --- a/cleanco/clean.py +++ b/cleanco/clean.py @@ -5,7 +5,7 @@ Basic usage: ->> terms = prepare_terms() +>> terms = prepare_default_terms() >> basename("Daddy & Sons, Ltd.", terms, prefix=True, middle=True, suffix=True) Daddy & Sons @@ -63,7 +63,7 @@ def normalized(text): return remove_accents(text) -def prepare_terms(): +def prepare_default_terms(): "construct an optimized term structure for basename extraction" terms = get_unique_terms() nterms = normalize_terms(terms) @@ -73,7 +73,7 @@ def prepare_terms(): return [(len(tp), tp) for tp in sntermparts] -def basename(name, terms, suffix=True, prefix=False, middle=False, **kwargs): +def custom_basename(name, terms, suffix=True, prefix=False, middle=False, **kwargs): "return cleaned base version of the business name" name = strip_tail(name) @@ -113,3 +113,5 @@ def basename(name, terms, suffix=True, prefix=False, middle=False, **kwargs): return strip_tail(" ".join(nparts)) +# convenience for most common use cases that don't parametrize base name extraction +basename = functools.partial(custom_basename, terms=prepare_default_terms()) diff --git a/cleanco/cleanco.py b/cleanco/cleanco.py index 0dcc733..1322124 100644 --- a/cleanco/cleanco.py +++ b/cleanco/cleanco.py @@ -1,4 +1,4 @@ -from .clean import prepare_terms, basename +from .clean import basename from .classify import typesources, countrysources, matches @@ -9,10 +9,9 @@ def __init__(self, name): self._name = name self._types = typesources() self._countries = countrysources() - self._terms = prepare_terms() def clean_name(self): - return basename(self._name, self._terms) + return basename(self._name) def country(self): return matches(self._name, self._countries) diff --git a/tests/test_cleanname.py b/tests/test_cleanname.py index 16f048c..1386973 100644 --- a/tests/test_cleanname.py +++ b/tests/test_cleanname.py @@ -1,22 +1,17 @@ # encoding: utf-8 import pytest -from cleanco import prepare_terms, basename - - -@pytest.fixture -def terms(): - return prepare_terms() +from cleanco import basename def test_deterministic_terms(monkeypatch): - """prepare_terms should always return the same list (even for different ordering in get_unique_terms)""" + """prepare_default_terms should always return the same list (even for different ordering in get_unique_terms)""" from cleanco import clean with monkeypatch.context() as m: mock_terms = ["aaa", "bbb", "ccc"] m.setattr(clean, "get_unique_terms", lambda: mock_terms) - res1 = clean.prepare_terms() + res1 = clean.prepare_default_terms() m.setattr(clean, "get_unique_terms", lambda: reversed(mock_terms)) - res2 = clean.prepare_terms() + res2 = clean.prepare_default_terms() assert res1 == res2 @@ -31,11 +26,11 @@ def test_deterministic_terms(monkeypatch): "name w/ ws suffix dot ws": " Hello World ltd. ", } -def test_basic_cleanups(terms): +def test_basic_cleanups(): expected = "Hello World" errmsg = "cleanup of %s failed" for testname, variation in basic_cleanup_tests.items(): - assert basename(variation, terms) == expected, errmsg % testname + assert basename(variation) == expected, errmsg % testname multi_cleanup_tests = { "name + suffix": "Hello World Oy", @@ -47,11 +42,11 @@ def test_basic_cleanups(terms): "name w/ mid + suffix": "Hello Oy World Ab" } -def test_multi_type_cleanups(terms): +def test_multi_type_cleanups(): expected = "Hello World" errmsg = "cleanup of %s failed" for testname, variation in multi_cleanup_tests.items(): - result = basename(variation, terms, prefix=True, suffix=True, middle=True) + result = basename(variation, prefix=True, suffix=True, middle=True) assert result == expected, errmsg % testname @@ -63,12 +58,12 @@ def test_multi_type_cleanups(terms): "name + two in middle": "Hello Ab Oy World" } -def test_double_cleanups(terms): +def test_double_cleanups(): expected = "Hello World" errmsg = "cleanup of %s failed" for testname, variation in multi_cleanup_tests.items(): - result = basename(variation, terms, prefix=True, suffix=True, middle=True) - final = basename(result, terms, prefix=True, suffix=True, middle=True) + result = basename(variation, prefix=True, suffix=True, middle=True) + final = basename(result, prefix=True, suffix=True, middle=True) assert final == expected, errmsg % testname @@ -79,10 +74,10 @@ def test_double_cleanups(terms): "name with dot": ("Hello. World, Oy", "Hello. World") } -def test_preserving_cleanups(terms): +def test_preserving_cleanups(): errmsg = "preserving cleanup of %s failed" for testname, (variation, expected) in preserving_cleanup_tests.items(): - assert basename(variation, terms) == expected, errmsg % testname + assert basename(variation) == expected, errmsg % testname # Test umlauts @@ -97,10 +92,10 @@ def test_preserving_cleanups(terms): } -def test_with_unicode_umlauted_name(terms): +def test_with_unicode_umlauted_name(): errmsg = "preserving cleanup of %s failed" for testname, (variation, expected) in unicode_umlaut_tests.items(): - assert basename(variation, terms, prefix=True) == expected, errmsg % testname + assert basename(variation, prefix=True) == expected, errmsg % testname terms_with_accents_tests = { @@ -108,7 +103,7 @@ def test_with_unicode_umlauted_name(terms): "term with ł incorrect spelling": ("Łoś spolka z o.o", "Łoś"), } -def test_terms_with_accents(terms): +def test_terms_with_accents(): errmsg = "preserving cleanup of %s failed" for testname, (variation, expected) in terms_with_accents_tests.items(): - assert basename(variation, terms, suffix=True) == expected, errmsg % testname + assert basename(variation, suffix=True) == expected, errmsg % testname