Skip to content

Commit

Permalink
Closes Bears-R-Us#2714: Add regex argument to categorical substring s…
Browse files Browse the repository at this point in the history
…earch (Bears-R-Us#2717)

This PR (closes Bears-R-Us#2714) adds the `regex` argument to the categorical substring search methods (`contains`, `startswith`, and `endswith`).

Co-authored-by: Pierce Hayes <[email protected]>
  • Loading branch information
stress-tess and Pierce Hayes authored Aug 28, 2023
1 parent 04386f0 commit e1052f6
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 46 deletions.
94 changes: 59 additions & 35 deletions arkouda/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,14 +492,18 @@ def reset_categories(self) -> Categorical:
)

@typechecked
def contains(self, substr: str) -> pdarray:
def contains(self, substr: Union[bytes, str_scalars], regex: bool = False) -> pdarray:
"""
Check whether each element contains the given substring.
Parameters
----------
substr : str
substr : Union[bytes, str_scalars]
The substring to search for
regex: bool
Indicates whether substr is a regular expression
Note: only handles regular expressions supported by re2
(does not support lookaheads/lookbehinds)
Returns
-------
Expand All @@ -509,86 +513,106 @@ def contains(self, substr: str) -> pdarray:
Raises
------
TypeError
Raised if substr is not a str
Raised if the substr parameter is not bytes or str_scalars
ValueError
Raised if substr is not a valid regex
RuntimeError
Raised if there is a server-side error thrown
See Also
--------
Categorical.startswith, Categorical.endswith
Notes
-----
This method can be significantly faster than the corresponding method
on Strings objects, because it searches the unique category labels
instead of the full array.
See Also
--------
Categorical.startswith, Categorical.endswith
"""
categoriescontains = self.categories.contains(substr)
return categoriescontains[self.codes]
categories_contains = self.categories.contains(substr, regex)
return categories_contains[self.codes]

@typechecked
def startswith(self, substr: str) -> pdarray:
def startswith(self, substr: Union[bytes, str_scalars], regex: bool = False) -> pdarray:
"""
Check whether each element starts with the given substring.
Parameters
----------
substr : str
substr : Union[bytes, str_scalars]
The substring to search for
regex: bool
Indicates whether substr is a regular expression
Note: only handles regular expressions supported by re2
(does not support lookaheads/lookbehinds)
Returns
-------
pdarray, bool
True for elements that start with substr, False otherwise
Raises
------
TypeError
Raised if substr is not a str
Raised if the substr parameter is not bytes or str_scalars
ValueError
Raised if substr is not a valid regex
RuntimeError
Raised if there is a server-side error thrown
Returns
-------
pdarray, bool
True for elements that contain substr, False otherwise
See Also
--------
Categorical.contains, Categorical.endswith
Notes
-----
This method can be significantly faster than the corresponding
method on Strings objects, because it searches the unique category
labels instead of the full array.
See Also
--------
Categorical.contains, Categorical.endswith
"""
categoriesstartswith = self.categories.startswith(substr)
return categoriesstartswith[self.codes]
categories_ends_with = self.categories.startswith(substr, regex)
return categories_ends_with[self.codes]

@typechecked
def endswith(self, substr: str) -> pdarray:
def endswith(self, substr: Union[bytes, str_scalars], regex: bool = False) -> pdarray:
"""
Check whether each element ends with the given substring.
Parameters
----------
substr : str
substr : Union[bytes, str_scalars]
The substring to search for
regex: bool
Indicates whether substr is a regular expression
Note: only handles regular expressions supported by re2
(does not support lookaheads/lookbehinds)
Returns
-------
pdarray, bool
True for elements that end with substr, False otherwise
Raises
------
TypeError
Raised if substr is not a str
Raised if the substr parameter is not bytes or str_scalars
ValueError
Raised if substr is not a valid regex
RuntimeError
Raised if there is a server-side error thrown
Returns
-------
pdarray, bool
True for elements that contain substr, False otherwise
See Also
--------
Categorical.startswith, Categorical.contains
Notes
-----
This method can be significantly faster than the corresponding method
on Strings objects, because it searches the unique category labels
instead of the full array.
See Also
--------
Categorical.startswith, Categorical.contains
"""
categoriesendswith = self.categories.endswith(substr)
return categoriesendswith[self.codes]
categories_ends_with = self.categories.endswith(substr, regex)
return categories_ends_with[self.codes]

@typechecked
def in1d(self, test: Union[Strings, Categorical]) -> pdarray:
Expand Down
20 changes: 9 additions & 11 deletions tests/categorical_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,15 @@ def testCategoricalFromCodesAndCategories(self):
self.assertListEqual(codes.to_list(), cat.codes.to_list())
self.assertListEqual(categories.to_list(), cat.categories.to_list())

def testContains(self):
cat = self._getCategorical()
self.assertTrue(cat.contains("string").all())

def testEndsWith(self):
cat = self._getCategorical()
self.assertTrue(cat.endswith("1").any())

def testStartsWith(self):
cat = self._getCategorical()
self.assertTrue(cat.startswith("string").all())
def test_substring_search(self):
cat = ak.Categorical(ak.array([f"{i} string {i}" for i in range(10)]))
self.assertTrue(cat.contains("tri").all())
self.assertTrue(cat.endswith("ing 1").any())
self.assertTrue(cat.startswith("1 str").any())

self.assertTrue(cat.contains("\\w", regex=True).all())
self.assertTrue(cat.endswith("ing \\d", regex=True).all())
self.assertTrue(cat.startswith("\\d str", regex=True).all())

def testGroup(self):
group = self._getRandomizedCategorical().group()
Expand Down

0 comments on commit e1052f6

Please sign in to comment.