diff --git a/maha/cleaners/functions/remove_fn.py b/maha/cleaners/functions/remove_fn.py index 76491dd2..2e642a05 100644 --- a/maha/cleaners/functions/remove_fn.py +++ b/maha/cleaners/functions/remove_fn.py @@ -260,11 +260,11 @@ def remove( def reduce_repeated_substring( - text: str, min_repeated: int = 3, reduce_to: int = 2 + text: str, min_repeated: int = 4, reduce_to: int = 3 ) -> str: """Reduces consecutive substrings that are repeated at least ``min_repeated`` times to ``reduce_to`` times. For example with the default arguments, 'hhhhhh' is - reduced to 'hh' + reduced to 'hhh' TODO: Maybe change the implemention for 50x speed https://stackoverflow.com/questions/29481088/how-can-i-tell-if-a-string-repeats-itself-in-python/29489919#29489919 @@ -274,9 +274,9 @@ def reduce_repeated_substring( text : str Text to process min_repeated : int, optional - Minimum number of consecutive repeated substring to consider, by default 3 + Minimum number of consecutive repeated substring to consider, by default 4 reduce_to : int, optional - Number of substring to keep, by default 2 + Number of substring to keep, by default 3 Returns ------- @@ -297,7 +297,7 @@ def reduce_repeated_substring( >>> from maha.cleaners.functions import reduce_repeated_substring >>> text = "ههههههههههههههه" >>> reduce_repeated_substring(text) - 'هه' + 'ههه' ..code:: pycon diff --git a/maha/processors/base_processor.py b/maha/processors/base_processor.py index 3df7773f..fc2b3cd1 100644 --- a/maha/processors/base_processor.py +++ b/maha/processors/base_processor.py @@ -224,7 +224,7 @@ def replace_pairs(self, keys: List[str], values: List[str]): self.apply(partial(replace_pairs, **self._arguments_except_self(locals()))) return self - def reduce_repeated_substring(self, min_repeated: int = 3, reduce_to: int = 2): + def reduce_repeated_substring(self, min_repeated: int = 4, reduce_to: int = 3): """Applies :func:`~.reduce_repeated_substring` to each line""" self.apply( partial(reduce_repeated_substring, **self._arguments_except_self(locals()))