Skip to content

Commit

Permalink
Closes Bears-R-Us#2853: df.merge on a mix of String and integer col…
Browse files Browse the repository at this point in the history
…umns (Bears-R-Us#2859)

This PR (closes Bears-R-Us#2853) adds `df.merge` on a mix of String and integer columns and updates the associated multi-column tests.

Co-authored-by: Pierce Hayes <[email protected]>
  • Loading branch information
stress-tess and Pierce Hayes authored Nov 21, 2023
1 parent 8215275 commit d5925c0
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 90 deletions.
58 changes: 35 additions & 23 deletions PROTO_tests/tests/dataframe_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import itertools

import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -674,26 +676,36 @@ def test_multi_col_merge(self):
b = ak.randint(-size // 10, size // 10, size, seed=seed + 1)
c = ak.randint(-size // 10, size // 10, size, seed=seed + 2)
d = ak.randint(-size // 10, size // 10, size, seed=seed + 3)
left_df = ak.DataFrame({"first": a, "second": b, "third": ak.ones(size, int)})
right_df = ak.DataFrame(
{"first": c, "second": d, "third": ak.cast(ak.arange(size) % 2 == 0, int)}
)
l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas()

for how in "inner", "left", "right":
for on in "first", "second", "third", ["first", "third"], ["second", "third"], None:
ak_merge = ak.merge(left_df, right_df, on=on, how=how)
pd_merge = pd.merge(l_pd, r_pd, on=on, how=how)

sorted_columns = sorted(ak_merge.columns)
assert sorted_columns == sorted(pd_merge.columns.to_list())
sorted_ak = ak_merge.sort_values(sorted_columns).reset_index()
sorted_pd = pd_merge.sort_values(sorted_columns).reset_index(drop=True)
for col in sorted_columns:
assert np.allclose(
sorted_ak[col].to_ndarray(), sorted_pd[col].to_numpy(), equal_nan=True
)
# TODO arkouda sometimes seems to convert columns to floats on a right merge
# when pandas doesn't. Eventually we want to test frame_equal, not just value equality
# from pandas.testing import assert_frame_equal
# assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns])
ones = ak.ones(size, int)
altr = ak.cast(ak.arange(size) % 2 == 0, int)
for truth in itertools.product([True, False], repeat=3):
left_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([a, b, ones], truth)]
right_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([c, d, altr], truth)]
left_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], left_arrs)})
right_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], right_arrs)})
l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas()

for how in "inner", "left", "right":
for on in "first", "second", "third", ["first", "third"], ["second", "third"], None:
ak_merge = ak.merge(left_df, right_df, on=on, how=how)
pd_merge = pd.merge(l_pd, r_pd, on=on, how=how)

sorted_columns = sorted(ak_merge.columns)
assert sorted_columns == sorted(pd_merge.columns.to_list())
for col in sorted_columns:
from_ak = ak_merge[col].to_ndarray()
from_pd = pd_merge[col].to_numpy()
if isinstance(ak_merge[col], ak.pdarray):
assert np.allclose(np.sort(from_ak), np.sort(from_pd), equal_nan=True)
else:
# we have to cast to str because pandas arrays converted to numpy
# have dtype object and have float NANs in line with the str values
assert (np.sort(from_ak) == np.sort(from_pd.astype(str))).all()
# TODO arkouda sometimes seems to convert columns to floats on a right merge
# when pandas doesn't. Eventually we want to test frame_equal, not just value equality
# from pandas.testing import assert_frame_equal
# assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns])


def pda_to_str_helper(pda):
    """Return ``ak.array`` built from a "str <value>" label for each element of ``pda``.

    The elements are read through ``pda.to_list()`` and formatted one by one,
    so the result has one label per input element, in the same order.
    """
    labels = []
    for value in pda.to_list():
        labels.append(f"str {value}")
    return ak.array(labels)
50 changes: 36 additions & 14 deletions PROTO_tests/tests/join_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,20 +87,38 @@ def test_multi_array_inner_join(self):
seed = 1
a = ak.randint(-size // 10, size // 10, size, seed=seed)
b = ak.randint(-size // 10, size // 10, size, seed=seed + 1)
left = [a, ak.ones(size, int)]
right = [b, ak.cast(ak.arange(size) % 2 == 0, int)]

# test with no where args
l_ind, r_ind = ak.join.inner_join(left, right)
for lf, rt in zip(left, right):
assert (lf[l_ind] == rt[r_ind]).all()

# test with where args
def where_func(x, y):
return (x[0] % 2 == 0) | (y[0] % 2 == 0)

l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right))
assert where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all()
ones = ak.ones(size, int)
altr = ak.cast(ak.arange(size) % 2 == 0, int)
left_lists = [
[a, ones],
[pda_to_str_helper(a), ones],
[a, pda_to_str_helper(ones)],
[pda_to_str_helper(a), pda_to_str_helper(ones)],
]
right_list = [
[b, altr],
[pda_to_str_helper(b), altr],
[b, pda_to_str_helper(altr)],
[pda_to_str_helper(b), pda_to_str_helper(altr)],
]
for left, right in zip(left_lists, right_list):
# test with no where args
l_ind, r_ind = ak.join.inner_join(left, right)
for lf, rt in zip(left, right):
assert (lf[l_ind] == rt[r_ind]).all()

# test with where args
def where_func(x, y):
    """Return a boolean mask that is true where either side's first array is "even".

    ``x`` and ``y`` are the left and right sequences of join arrays; only
    element 0 of each sequence is inspected.  When that element is an
    ``ak.pdarray`` its values are tested for evenness; otherwise (the
    string-converted columns used in this test) the per-element lengths from
    ``get_lengths()`` are tested instead.
    """

    def first_is_even(arrays):
        # Evenness of the values for pdarrays, of the lengths for anything else.
        first = arrays[0]
        if isinstance(first, ak.pdarray):
            return first % 2 == 0
        return first.get_lengths() % 2 == 0

    # Each side must be evaluated on its own first array: the right-hand mask
    # has to come from y[0], not x[0], or y is never actually examined.
    return first_is_even(x) | first_is_even(y)

l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right))
assert where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all()

def test_str_inner_join(self):
int_left = ak.arange(50)
Expand Down Expand Up @@ -222,3 +240,7 @@ def test_error_handling(self):

def join_where(left, right):
    """Return a boolean mask selecting the even positions ``0, 2, 4, ...`` of ``left``.

    Only ``left.size`` is used; ``right`` is accepted but not read.
    """
    positions = ak.arange(left.size)
    remainders = positions % 2
    return remainders == 0


def pda_to_str_helper(pda):
    """Return ``ak.array`` built from a "str <value>" label for each element of ``pda``.

    The elements are read through ``pda.to_list()`` and formatted one by one,
    so the result has one label per input element, in the same order.
    """
    labels = []
    for value in pda.to_list():
        labels.append(f"str {value}")
    return ak.array(labels)
5 changes: 4 additions & 1 deletion arkouda/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2921,7 +2921,10 @@ def merge(
on = on if on is not None else col_intersect

if not isinstance(on, str):
if not all(isinstance(left[col], pdarray) and isinstance(right[col], pdarray) for col in on):
if not all(
isinstance(left[col], (pdarray, Strings)) and isinstance(right[col], (pdarray, Strings))
for col in on
):
raise ValueError("All columns of a multi-column merge must be pdarrays")

if how == "inner":
Expand Down
16 changes: 4 additions & 12 deletions arkouda/join.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,12 +180,12 @@ def compute_join_size(a: pdarray, b: pdarray) -> Tuple[int, int]:

@typechecked
def inner_join(
left: Union[pdarray, Strings, Categorical, Sequence[pdarray]],
right: Union[pdarray, Strings, Categorical, Sequence[pdarray]],
left: Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
right: Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
wherefunc: Callable = None,
whereargs: Tuple[
Union[pdarray, Strings, Categorical, Sequence[pdarray]],
Union[pdarray, Strings, Categorical, Sequence[pdarray]],
Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
] = None,
) -> Tuple[pdarray, pdarray]:
"""Perform inner join on values in <left> and <right>,
Expand Down Expand Up @@ -230,10 +230,6 @@ def inner_join(
left, right = l.codes, r.codes

if is_sequence:
if any(not isinstance(lf, pdarray) for lf in left) or any(
not isinstance(rt, pdarray) for rt in right
):
raise TypeError("All elements of Multi-array arguments must be pdarrays")
if len(left) != len(right):
raise ValueError("Left must have same num arrays as right")
left_size, right_size = left[0].size, right[0].size
Expand All @@ -251,10 +247,6 @@ def inner_join(
if whereargs is None or len(whereargs) != 2:
raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
if is_sequence:
if any(not isinstance(wa, pdarray) for wa in whereargs[0]) or any(
not isinstance(wa, pdarray) for wa in whereargs[1]
):
raise TypeError("All elements of Multi-array arguments must be pdarrays")
if len(whereargs[0]) != len(whereargs[1]):
raise ValueError("Left must have same num arrays as right")
first_wa_size, second_wa_size = whereargs[0][0].size, whereargs[1][0].size
Expand Down
65 changes: 39 additions & 26 deletions tests/dataframe_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import itertools
import os
import random
import string
Expand Down Expand Up @@ -821,29 +822,41 @@ def test_multi_col_merge(self):
b = ak.randint(-size // 10, size // 10, size, seed=seed + 1)
c = ak.randint(-size // 10, size // 10, size, seed=seed + 2)
d = ak.randint(-size // 10, size // 10, size, seed=seed + 3)
left_df = ak.DataFrame({"first": a, "second": b, "third": ak.ones(size, int)})
right_df = ak.DataFrame(
{"first": c, "second": d, "third": ak.cast(ak.arange(size) % 2 == 0, int)}
)
l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas()

for how in "inner", "left", "right":
for on in "first", "second", "third", ["first", "third"], ["second", "third"], None:
ak_merge = ak.merge(left_df, right_df, on=on, how=how)
pd_merge = pd.merge(l_pd, r_pd, on=on, how=how)

sorted_columns = sorted(ak_merge.columns)
self.assertListEqual(sorted_columns, sorted(pd_merge.columns.to_list()))
sorted_ak = ak_merge.sort_values(sorted_columns).reset_index()
sorted_pd = pd_merge.sort_values(sorted_columns).reset_index(drop=True)
for col in sorted_columns:
self.assertTrue(
np.allclose(
sorted_ak[col].to_ndarray(), sorted_pd[col].to_numpy(), equal_nan=True
)
)

# TODO arkouda sometimes seems to convert columns to floats on a right merge
# when pandas doesn't. Eventually we want to test frame_equal, not just value equality
# from pandas.testing import assert_frame_equal
# assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns])
ones = ak.ones(size, int)
altr = ak.cast(ak.arange(size) % 2 == 0, int)
for truth in itertools.product([True, False], repeat=3):
left_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([a, b, ones], truth)]
right_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([c, d, altr], truth)]
left_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], left_arrs)})
right_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], right_arrs)})
l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas()

for how in "inner", "left", "right":
for on in "first", "second", "third", ["first", "third"], ["second", "third"], None:
ak_merge = ak.merge(left_df, right_df, on=on, how=how)
pd_merge = pd.merge(l_pd, r_pd, on=on, how=how)

sorted_columns = sorted(ak_merge.columns)
self.assertListEqual(sorted_columns, sorted(pd_merge.columns.to_list()))
for col in sorted_columns:
from_ak = ak_merge[col].to_ndarray()
from_pd = pd_merge[col].to_numpy()
if isinstance(ak_merge[col], ak.pdarray):
self.assertTrue(
np.allclose(np.sort(from_ak), np.sort(from_pd), equal_nan=True)
)
else:
# we have to cast to str because pandas arrays converted to numpy
# have dtype object and have float NANs in line with the str values
self.assertTrue((np.sort(from_ak) == np.sort(from_pd.astype(str))).all())

# TODO arkouda sometimes seems to convert columns to floats on a right merge
# when pandas doesn't. Eventually we want to test frame_equal, not just value equality
# from pandas.testing import assert_frame_equal
# sorted_ak = ak_merge.sort_values(sorted_columns).reset_index()
# sorted_pd = pd_merge.sort_values(sorted_columns).reset_index(drop=True)
# assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns])


def pda_to_str_helper(pda):
    """Return ``ak.array`` built from a "str <value>" label for each element of ``pda``.

    The elements are read through ``pda.to_list()`` and formatted one by one,
    so the result has one label per input element, in the same order.
    """
    labels = []
    for value in pda.to_list():
        labels.append(f"str {value}")
    return ak.array(labels)
50 changes: 36 additions & 14 deletions tests/join_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,20 +119,38 @@ def test_multi_array_inner_join(self):
seed = 1
a = ak.randint(-size // 10, size // 10, size, seed=seed)
b = ak.randint(-size // 10, size // 10, size, seed=seed + 1)
left = [a, ak.ones(size, int)]
right = [b, ak.cast(ak.arange(size) % 2 == 0, int)]

# test with no where args
l_ind, r_ind = ak.join.inner_join(left, right)
for lf, rt in zip(left, right):
self.assertTrue((lf[l_ind] == rt[r_ind]).all())

# test with where args
def where_func(x, y):
return (x[0] % 2 == 0) | (y[0] % 2 == 0)

l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right))
self.assertTrue(where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all())
ones = ak.ones(size, int)
altr = ak.cast(ak.arange(size) % 2 == 0, int)
left_lists = [
[a, ones],
[pda_to_str_helper(a), ones],
[a, pda_to_str_helper(ones)],
[pda_to_str_helper(a), pda_to_str_helper(ones)],
]
right_list = [
[b, altr],
[pda_to_str_helper(b), altr],
[b, pda_to_str_helper(altr)],
[pda_to_str_helper(b), pda_to_str_helper(altr)],
]
for left, right in zip(left_lists, right_list):
# test with no where args
l_ind, r_ind = ak.join.inner_join(left, right)
for lf, rt in zip(left, right):
self.assertTrue((lf[l_ind] == rt[r_ind]).all())

# test with where args
def where_func(x, y):
    """Return a boolean mask that is true where either side's first array is "even".

    ``x`` and ``y`` are the left and right sequences of join arrays; only
    element 0 of each sequence is inspected.  When that element is an
    ``ak.pdarray`` its values are tested for evenness; otherwise (the
    string-converted columns used in this test) the per-element lengths from
    ``get_lengths()`` are tested instead.
    """

    def first_is_even(arrays):
        # Evenness of the values for pdarrays, of the lengths for anything else.
        first = arrays[0]
        if isinstance(first, ak.pdarray):
            return first % 2 == 0
        return first.get_lengths() % 2 == 0

    # Each side must be evaluated on its own first array: the right-hand mask
    # has to come from y[0], not x[0], or y is never actually examined.
    return first_is_even(x) | first_is_even(y)

l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right))
self.assertTrue(where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all())

def test_str_inner_join(self):
intLeft = ak.arange(50)
Expand Down Expand Up @@ -248,3 +266,7 @@ def test_error_handling(self):

def join_where(L, R):
    """Return a boolean mask selecting the even positions ``0, 2, 4, ...`` of ``L``.

    Only ``L.size`` is used; ``R`` is accepted but not read.
    """
    positions = ak.arange(L.size)
    remainders = positions % 2
    return remainders == 0


def pda_to_str_helper(pda):
    """Return ``ak.array`` built from a "str <value>" label for each element of ``pda``.

    The elements are read through ``pda.to_list()`` and formatted one by one,
    so the result has one label per input element, in the same order.
    """
    labels = []
    for value in pda.to_list():
        labels.append(f"str {value}")
    return ak.array(labels)

0 comments on commit d5925c0

Please sign in to comment.