Skip to content

Commit

Permalink
Merge pull request #75 from psolin/use_default_terms
Browse files Browse the repository at this point in the history
Have `basename` use the default terms by default. This simplifies usage and tests, while still providing a means of using custom terms if someone so wishes.
  • Loading branch information
petri authored Nov 16, 2021
2 parents e19125a + 9fee065 commit 6391604
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 31 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ Just use 'pip install cleanco' if you have pip installed (as most systems do). O
## How does it work?
Let's look at some sample code. To get the base name of a business without legal suffix:

>>> from cleanco import prepare_terms, basename
>>> from cleanco import prepare_default_terms, basename
>>> business_name = "Some Big Pharma, LLC"
>>> terms = prepare_terms()
>>> terms = prepare_default_terms()
>>> basename(business_name, terms, prefix=False, middle=False, suffix=True)
>>> 'Some Big Pharma'

Expand Down
2 changes: 1 addition & 1 deletion cleanco/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .cleanco import cleanco
from .clean import prepare_terms, basename
from .clean import prepare_default_terms, basename
from .classify import typesources, countrysources, matches
8 changes: 5 additions & 3 deletions cleanco/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
Basic usage:
>> terms = prepare_terms()
>> terms = prepare_default_terms()
>> basename("Daddy & Sons, Ltd.", terms, prefix=True, middle=True, suffix=True)
Daddy & Sons
Expand Down Expand Up @@ -63,7 +63,7 @@ def normalized(text):
return remove_accents(text)


def prepare_terms():
def prepare_default_terms():
"construct an optimized term structure for basename extraction"
terms = get_unique_terms()
nterms = normalize_terms(terms)
Expand All @@ -73,7 +73,7 @@ def prepare_terms():
return [(len(tp), tp) for tp in sntermparts]


def basename(name, terms, suffix=True, prefix=False, middle=False, **kwargs):
def custom_basename(name, terms, suffix=True, prefix=False, middle=False, **kwargs):
"return cleaned base version of the business name"

name = strip_tail(name)
Expand Down Expand Up @@ -113,3 +113,5 @@ def basename(name, terms, suffix=True, prefix=False, middle=False, **kwargs):
return strip_tail(" ".join(nparts))


# convenience for most common use cases that don't parametrize base name extraction
basename = functools.partial(custom_basename, terms=prepare_default_terms())
5 changes: 2 additions & 3 deletions cleanco/cleanco.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .clean import prepare_terms, basename
from .clean import basename
from .classify import typesources, countrysources, matches


Expand All @@ -9,10 +9,9 @@ def __init__(self, name):
self._name = name
self._types = typesources()
self._countries = countrysources()
self._terms = prepare_terms()

def clean_name(self):
return basename(self._name, self._terms)
return basename(self._name)

def country(self):
return matches(self._name, self._countries)
Expand Down
39 changes: 17 additions & 22 deletions tests/test_cleanname.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,17 @@
# encoding: utf-8
import pytest
from cleanco import prepare_terms, basename


@pytest.fixture
def terms():
return prepare_terms()
from cleanco import basename


def test_deterministic_terms(monkeypatch):
"""prepare_terms should always return the same list (even for different ordering in get_unique_terms)"""
"""prepare_default_terms should always return the same list (even for different ordering in get_unique_terms)"""
from cleanco import clean
with monkeypatch.context() as m:
mock_terms = ["aaa", "bbb", "ccc"]
m.setattr(clean, "get_unique_terms", lambda: mock_terms)
res1 = clean.prepare_terms()
res1 = clean.prepare_default_terms()
m.setattr(clean, "get_unique_terms", lambda: reversed(mock_terms))
res2 = clean.prepare_terms()
res2 = clean.prepare_default_terms()
assert res1 == res2


Expand All @@ -31,11 +26,11 @@ def test_deterministic_terms(monkeypatch):
"name w/ ws suffix dot ws": " Hello World ltd. ",
}

def test_basic_cleanups(terms):
def test_basic_cleanups():
expected = "Hello World"
errmsg = "cleanup of %s failed"
for testname, variation in basic_cleanup_tests.items():
assert basename(variation, terms) == expected, errmsg % testname
assert basename(variation) == expected, errmsg % testname

multi_cleanup_tests = {
"name + suffix": "Hello World Oy",
Expand All @@ -47,11 +42,11 @@ def test_basic_cleanups(terms):
"name w/ mid + suffix": "Hello Oy World Ab"
}

def test_multi_type_cleanups(terms):
def test_multi_type_cleanups():
expected = "Hello World"
errmsg = "cleanup of %s failed"
for testname, variation in multi_cleanup_tests.items():
result = basename(variation, terms, prefix=True, suffix=True, middle=True)
result = basename(variation, prefix=True, suffix=True, middle=True)
assert result == expected, errmsg % testname


Expand All @@ -63,12 +58,12 @@ def test_multi_type_cleanups(terms):
"name + two in middle": "Hello Ab Oy World"
}

def test_double_cleanups(terms):
def test_double_cleanups():
expected = "Hello World"
errmsg = "cleanup of %s failed"
for testname, variation in multi_cleanup_tests.items():
result = basename(variation, terms, prefix=True, suffix=True, middle=True)
final = basename(result, terms, prefix=True, suffix=True, middle=True)
result = basename(variation, prefix=True, suffix=True, middle=True)
final = basename(result, prefix=True, suffix=True, middle=True)

assert final == expected, errmsg % testname

Expand All @@ -79,10 +74,10 @@ def test_double_cleanups(terms):
"name with dot": ("Hello. World, Oy", "Hello. World")
}

def test_preserving_cleanups(terms):
def test_preserving_cleanups():
errmsg = "preserving cleanup of %s failed"
for testname, (variation, expected) in preserving_cleanup_tests.items():
assert basename(variation, terms) == expected, errmsg % testname
assert basename(variation) == expected, errmsg % testname

# Test umlauts

Expand All @@ -97,18 +92,18 @@ def test_preserving_cleanups(terms):

}

def test_with_unicode_umlauted_name(terms):
def test_with_unicode_umlauted_name():
errmsg = "preserving cleanup of %s failed"
for testname, (variation, expected) in unicode_umlaut_tests.items():
assert basename(variation, terms, prefix=True) == expected, errmsg % testname
assert basename(variation, prefix=True) == expected, errmsg % testname


terms_with_accents_tests = {
"term with ł correct spelling": ("Łoś spółka z o.o", "Łoś"),
"term with ł incorrect spelling": ("Łoś spolka z o.o", "Łoś"),
}

def test_terms_with_accents(terms):
def test_terms_with_accents():
errmsg = "preserving cleanup of %s failed"
for testname, (variation, expected) in terms_with_accents_tests.items():
assert basename(variation, terms, suffix=True) == expected, errmsg % testname
assert basename(variation, suffix=True) == expected, errmsg % testname

0 comments on commit 6391604

Please sign in to comment.