Skip to content

Commit

Permalink
Closes Bears-R-Us#3009: indexof1d to handle null values (Bears-R-Us…
Browse files Browse the repository at this point in the history
…#3169)

* Closes Bears-R-Us#3009: `indexof1d` to handle null values

This PR (closes Bears-R-Us#3009) refactors `indexof1d` to use `find` since they have similar functionality and `find` is fairly optimized and correctly handles null values (once we pass `dropna=False` to the `GroupBy`). The two major differences are how missing values are handled and how many indices get returned when there are duplicates in the search space. `find` would only return the index of the first occurrence and use `-1` to denote missing values, but `indexof1d` returns the indices of all occurrences and removes missing values. To enable this, I added the flags `all_occurrences` and `remove_missing` to `find`

The approach I took involved adding a segmented `mink/maxk`, which I went back and forth on whether it should be user facing. I implemented this by permuting the values and calling the existing `mink/maxk`. I'm not sure if this is the most efficient approach, but my goal was to focus on correctness first and we can optimize later if needed.

Wrote tests for `indexof1d` both for the reproducer and in general.

* update and add examples in response to PR feedback

---------

Co-authored-by: Tess Hayes <[email protected]>
  • Loading branch information
stress-tess and stress-tess authored May 22, 2024
1 parent f0d559f commit c05c599
Show file tree
Hide file tree
Showing 8 changed files with 383 additions and 168 deletions.
53 changes: 49 additions & 4 deletions PROTO_tests/tests/setops_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
import pytest

import arkouda as ak
Expand Down Expand Up @@ -50,8 +51,8 @@ def make_np_arrays_cross_type(dtype1, dtype2):
a = np.array([-1, -3, 0, 1, 2, 3]).astype(dtype1)
c = np.array([-1, 0, 0, 7, 8, 3]).astype(dtype1)
elif dtype1 == ak.bigint:
a = np.array([-1, -3, 0, 1, 2, 3]).astype(ak.uint64) + 2 ** 200
c = np.array([-1, 0, 0, 7, 8, 3]).astype(ak.uint64) + 2 ** 200
a = np.array([-1, -3, 0, 1, 2, 3]).astype(ak.uint64) + 2**200
c = np.array([-1, 0, 0, 7, 8, 3]).astype(ak.uint64) + 2**200
elif dtype1 == ak.bool:
a = np.array([True, False, False, True, True])
c = np.array([True, True, False, False, True])
Expand All @@ -62,8 +63,8 @@ def make_np_arrays_cross_type(dtype1, dtype2):
b = np.array([-1, -11, 0, 4, 5, 3]).astype(dtype2)
d = np.array([-1, -4, 0, 7, 8, 3]).astype(dtype2)
elif dtype2 == ak.bigint:
b = np.array([-1, -11, 0, 4, 5, 3]).astype(ak.uint64) + 2 ** 200
d = np.array([-1, -4, 0, 7, 8, 3]).astype(ak.uint64) + 2 ** 200
b = np.array([-1, -11, 0, 4, 5, 3]).astype(ak.uint64) + 2**200
d = np.array([-1, -4, 0, 7, 8, 3]).astype(ak.uint64) + 2**200
elif dtype2 == ak.bool:
b = np.array([True, True, False, False, True])
d = np.array([True, True, False, False, True])
Expand Down Expand Up @@ -674,3 +675,47 @@ def test_multiarray_validation(self):
x = [ak.arange(3, dtype=ak.uint64), ak.arange(3)]
with pytest.raises(TypeError):
ak.pdarraysetops.multiarray_setop_validation(x, y)

def test_index_of(self):
    """Exercise ak.indexof1d: null handling, agreement with ak.find on unique
    search spaces, and all-occurrence semantics for duplicate search spaces
    (checked against pandas Series label-based getitem).
    """
    # index of nan (reproducer from #3009): indexof1d must locate a NaN key
    # in a float index rather than dropping/missing it
    s = ak.Series(ak.array([1, 2, 3]), index=ak.array([1, 2, np.nan]))
    assert ak.indexof1d(ak.array([np.nan]), s.index.values).to_list() == [2]

    # three dtype flavors of search space: int, float, and strings
    # NOTE(review): randint/random_strings_uniform calls are unseeded, so the
    # concrete values differ per run — the assertions are invariants, not fixtures
    select_from_list = [
        ak.randint(-(2**32), 2**32, 10),
        ak.linspace(-(2**32), 2**32, 10),
        ak.random_strings_uniform(1, 16, 10),
    ]
    for select_from in select_from_list:
        # query of 20 items drawn (with repeats) from the 10-element space
        arr1 = select_from[ak.randint(0, select_from.size, 20)]

        # test unique search space, this should be identical to find
        # be sure to test when all items are present and when there are items missing
        for arr2 in select_from, select_from[:5], select_from[5:]:
            found_in_second = ak.in1d(arr1, arr2)
            idx_of_first_in_second = ak.indexof1d(arr1, arr2)

            # ensure we match find
            assert (idx_of_first_in_second == ak.find(arr1, arr2, remove_missing=True)).all()

            # if an element of arr1 is found in arr2, return the index of that item in arr2
            assert (arr2[idx_of_first_in_second] == arr1[found_in_second]).all()

        # test duplicate items in search space, the easiest way I can think
        # of to do this is to compare against pandas series getitem
        arr2 = select_from[ak.randint(0, select_from.size, 20)]
        pd_s = pd.Series(index=arr1.to_ndarray(), data=arr2.to_ndarray())
        ak_s = ak.Series(index=arr1, data=arr2)

        # only compare keys present in both, so every lookup below succeeds
        arr1_keys = ak.GroupBy(arr1).unique_keys
        arr2_keys = ak.GroupBy(arr2).unique_keys
        in_both = ak.intersect1d(arr1_keys, arr2_keys)

        for i in in_both.to_list():
            pd_i = pd_s[i]
            ak_i = ak_s[i]
            # a duplicated key yields a Series from both libraries; a unique
            # key yields a scalar — the two cases compare differently
            if isinstance(pd_i, pd.Series):
                assert isinstance(ak_i, ak.Series)
                assert pd_i.values.tolist() == ak_i.values.to_list()
            else:
                assert pd_i == ak_i
144 changes: 129 additions & 15 deletions arkouda/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
import numpy as np # type: ignore

from arkouda.categorical import Categorical
from arkouda.client import generic_msg
from arkouda.dtypes import bigint
from arkouda.dtypes import float64 as akfloat64
from arkouda.dtypes import int64 as akint64
from arkouda.dtypes import uint64 as akuint64
from arkouda.groupbyclass import GroupBy, broadcast, unique
from arkouda.numeric import where
from arkouda.pdarrayclass import pdarray
from arkouda.numeric import cumsum, where
from arkouda.pdarrayclass import create_pdarray, pdarray
from arkouda.pdarraycreation import arange, full, ones, zeros
from arkouda.pdarraysetops import concatenate, in1d
from arkouda.sorting import argsort, coargsort
Expand Down Expand Up @@ -109,23 +110,102 @@ class NonUniqueError(ValueError):
pass


def find(query, space):
def find(query, space, all_occurrences=False, remove_missing=False):
"""
Return indices of query items in a search list of items (-1 if not found).
Return indices of query items in a search list of items.
Parameters
----------
query : (sequence of) array-like
The items to search for. If multiple arrays, each "row" is an item.
space : (sequence of) array-like
The set of items in which to search. Must have same shape/dtype as query.
all_occurrences: bool
When duplicate terms are present in search space, if all_occurrences is True,
return all occurrences found as a SegArray, otherwise return only the first
occurrences as a pdarray. Defaults to only finding the first occurrence.
Finding all occurrences is not yet supported on sequences of arrays
remove_missing: bool
If False, return -1 for any items in query not found in space. If True,
remove these and only return indices of items that are found.
Returns
-------
indices : pdarray, int64
For each item in query, its index in space or -1 if not found.
"""
indices : pdarray or SegArray
For each item in query, its index in space. If remove_missing is True,
exclued missing values otherwise return -1. If all_occurrences is False,
the return will be a pdarray of the first index where each value in the
query appears in the space. if all_occurrences is True, the return will be
a SegArray containing every index where each value in the query appears in
the space.
Examples
--------
>>> select_from = ak.arange(10)
>>> arr1 = select_from[ak.randint(0, select_from.size, 20, seed=10)]
>>> arr2 = select_from[ak.randint(0, select_from.size, 20, seed=11)]
# remove some values to ensure we have some values
# which don't appear in the search space
>>> arr2 = arr2[arr2 != 9]
>>> arr2 = arr2[arr2 != 3]
# find with defaults (all_occurrences and remove_missing both False)
>>> ak.find(arr1, arr2)
array([-1 -1 -1 0 1 -1 -1 -1 2 -1 5 -1 8 -1 5 -1 -1 11 5 0])
# set remove_missing to True, only difference from default
# is missing values are excluded
>>> ak.find(arr1, arr2, remove_missing=True)
array([0 1 2 5 8 5 11 5 0])
# set all_occurrences to True, the first index of each list
# is the first occurence and should match the default
>>> ak.find(arr1, arr2, all_occurrences=True).to_list()
[[-1],
[-1],
[-1],
[0, 4],
[1, 3, 10],
[-1],
[-1],
[-1],
[2, 6, 12, 13],
[-1],
[5, 7],
[-1],
[8, 9, 14],
[-1],
[5, 7],
[-1],
[-1],
[11, 15],
[5, 7],
[0, 4]]
# set both remove_missing and all_occurrences to True, missing values
# will be empty segments
>>> ak.find(arr1, arr2, remove_missing=True, all_occurrences=True).to_list()
[[],
[],
[],
[0, 4],
[1, 3, 10],
[],
[],
[],
[2, 6, 12, 13],
[],
[5, 7],
[],
[8, 9, 14],
[],
[5, 7],
[],
[],
[11, 15],
[5, 7],
[0, 4]]
"""
# Concatenate the space and query in fast (block interleaved) mode
if isinstance(query, (pdarray, Strings, Categorical)):
if type(query) is not type(space):
Expand All @@ -151,15 +231,48 @@ def find(query, space):
# All space indices are less than all query indices
i = concatenate((arange(spacesize), arange(spacesize, spacesize + querysize)), ordered=False)
# Group on terms
g = GroupBy(c)
g = GroupBy(c, dropna=False)
# For each term, count how many times it appears in the search space
space_multiplicity = g.sum(i < spacesize)[1]
# Warn of any duplicate terms in space
if (space_multiplicity > 1).any():
warn(
"Duplicate terms present in search space. Only first instance of each query term\
will be reported."
)
has_duplicates = (space_multiplicity > 1).any()
# handle duplicate terms in space
if has_duplicates:
if all_occurrences:
if isinstance(query, Sequence):
raise TypeError("finding all_occurrences is not yet supported on sequences of arrays")

from arkouda.segarray import SegArray

# use segmented mink to select space_multiplicity number of elements
# and create a segarray which contains all the indices
# in our query space, instead of just the min for each segment

# only calculate where to place the negatives if remove_missing is false
negative_at = "" if remove_missing else space_multiplicity == 0
repMsg = generic_msg(
cmd="segmentedExtremaK",
args={
"vals": i[g.permutation],
"segs": g.segments,
"segLens": g.size()[1],
"kArray": space_multiplicity,
"isMin": True,
"removeMissing": remove_missing,
"negativeAt": negative_at,
},
)
min_k_vals = create_pdarray(repMsg)
seg_idx = g.broadcast(arange(g.segments.size))[i >= spacesize]
if not remove_missing:
space_multiplicity += negative_at
min_k_segs = cumsum(space_multiplicity) - space_multiplicity
sa = SegArray(min_k_segs, min_k_vals)
return sa[seg_idx]
else:
warn(
"Duplicate terms present in search space. Only first instance of each query term"
" will be reported. To return all occurrences, set all_occurrences=True."
)
# For query terms in the space, the min combined index will be the first index of that
# term in the space
uspaceidx = g.min(i)[1]
Expand All @@ -169,7 +282,8 @@ def find(query, space):
# Broadcast unique term indices to combined list of space and query terms
spaceidx = g.broadcast(uspaceidx)
# Return only the indices of the query terms (remove the search space)
return spaceidx[i >= spacesize]
pda = spaceidx[i >= spacesize]
return pda[pda != -1] if remove_missing else pda


def lookup(keys, values, arguments, fillvalue=-1):
Expand Down
Loading

0 comments on commit c05c599

Please sign in to comment.