diff --git a/dtoolkit/accessor/series/textdistance.py b/dtoolkit/accessor/series/textdistance.py index 5fef142e3..d776ebf5c 100644 --- a/dtoolkit/accessor/series/textdistance.py +++ b/dtoolkit/accessor/series/textdistance.py @@ -1,7 +1,6 @@ from __future__ import annotations from functools import lru_cache -from functools import wraps from typing import Callable from warnings import warn @@ -21,6 +20,7 @@ def textdistance( other: None | str | pd.Series = None, method: Callable = None, align: bool = True, + **kwargs, ) -> pd.Series: """ Return a ``Series`` containing the text distance to aligned ``other``. @@ -60,10 +60,6 @@ def textdistance( -------- textdistance_matrix - Notes - ----- - The distance of any value compared to nan or None is 0. - Examples -------- >>> import dtoolkit @@ -87,11 +83,12 @@ def textdistance( raise TypeError(f"Expected string dtype, but got {s.dtype!r}.") if method is None: - method = __import__("rapidfuzz.fuzz").fuzz.ratio - method = lru_cache(check_none(check_nan(method))) + method = __import__("rapidfuzz").fuzz.ratio + method = lru_cache(method) if isinstance(other, str): - return s.apply(method, args=(other,)) + return s.apply(method, args=(other,), **kwargs) + elif isinstance(other, pd.Series): if not is_string_dtype(other): raise TypeError(f"Expected Series(string), but got {other.dtype!r}.") @@ -104,37 +101,9 @@ def textdistance( raise ValueError(f"{s.size=} != {other.size=}.") return pd.Series( - (method(*xy) for xy in zip(s, other)), + (method(*xy, **kwargs) for xy in zip(s, other)), name=s.name, index=s.index, ) - elif other is None or (not is_list_like(other) and pd.isna(other)): - # NOTE: - # - pd.na(Series) returns array-like of bool - # to make sure pd.isna(other) returns bool - # need to other is not array-like - # - compare to None or nan always returns 0 - # the behavior is following rapidfuzz.fuzz.ratio - return pd.Series(np.zeros(s.size), name=s.name, index=s.index) raise TypeError(f"Expected Series(string), but got {type(other).__name__!r}.") - - -def check_none(func): - @wraps(func) - def decorator(*args, **kwargs): - # NOTE: compare to None always returns 0 - # the behavior is following rapidfuzz.fuzz.ratio - return 0 if args[0] is None or args[1] is None else func(*args, **kwargs) - - return decorator - - -def check_nan(func): - @wraps(func) - def decorator(*args, **kwargs): - # NOTE: compare to nan always returns 0 - # the behavior is following rapidfuzz.fuzz.ratio - return 0 if pd.isna(args[0]) or pd.isna(args[1]) else func(*args, **kwargs) - - return decorator diff --git a/dtoolkit/accessor/series/textdistance_matrix.py b/dtoolkit/accessor/series/textdistance_matrix.py index 8458a7234..605f541cc 100644 --- a/dtoolkit/accessor/series/textdistance_matrix.py +++ b/dtoolkit/accessor/series/textdistance_matrix.py @@ -49,10 +49,6 @@ def textdistance_matrix( -------- textdistance - Notes - ----- - Can't handle nan or None type value. - Examples -------- >>> import dtoolkit diff --git a/test/accessor/series/test_textdistance_matrix.py b/test/accessor/series/test_textdistance_matrix.py index 37128dc71..e58f0eb56 100644 --- a/test/accessor/series/test_textdistance_matrix.py +++ b/test/accessor/series/test_textdistance_matrix.py @@ -36,13 +36,6 @@ rapidfuzz.string_metric.levenshtein, pd.DataFrame([[4, 9], [6, 9]]), ), - # other elements contain None or nan - ( - pd.Series(["hello", "world", "!"]), - pd.Series(["hi!", None, float("nan")]), - None, - pd.DataFrame([[25, 0, 0], [0, 0, 0], [50, 0, 0]]), - ), ], ) def test_work(s, other, method, expected):