From d5925c02d2041b2697fe19de36673ee300437b65 Mon Sep 17 00:00:00 2001 From: pierce <48131946+pierce314159@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:14:00 -0500 Subject: [PATCH] Closes #2853: `df.merge` on a mix of String and integer columns (#2859) This PR (closes #2853) adds `df.merge` on a mix of String and integer columns and updates the associated multi-column tests. Co-authored-by: Pierce Hayes --- PROTO_tests/tests/dataframe_test.py | 58 +++++++++++++++---------- PROTO_tests/tests/join_test.py | 50 +++++++++++++++------- arkouda/dataframe.py | 5 ++- arkouda/join.py | 16 ++----- tests/dataframe_test.py | 65 +++++++++++++++++------------ tests/join_test.py | 50 +++++++++++++++------- 6 files changed, 154 insertions(+), 90 deletions(-) diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py index 67f307d8d4..9b16df460c 100644 --- a/PROTO_tests/tests/dataframe_test.py +++ b/PROTO_tests/tests/dataframe_test.py @@ -1,3 +1,5 @@ +import itertools + import numpy as np import pandas as pd import pytest @@ -674,26 +676,36 @@ def test_multi_col_merge(self): b = ak.randint(-size // 10, size // 10, size, seed=seed + 1) c = ak.randint(-size // 10, size // 10, size, seed=seed + 2) d = ak.randint(-size // 10, size // 10, size, seed=seed + 3) - left_df = ak.DataFrame({"first": a, "second": b, "third": ak.ones(size, int)}) - right_df = ak.DataFrame( - {"first": c, "second": d, "third": ak.cast(ak.arange(size) % 2 == 0, int)} - ) - l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas() - - for how in "inner", "left", "right": - for on in "first", "second", "third", ["first", "third"], ["second", "third"], None: - ak_merge = ak.merge(left_df, right_df, on=on, how=how) - pd_merge = pd.merge(l_pd, r_pd, on=on, how=how) - - sorted_columns = sorted(ak_merge.columns) - assert sorted_columns == sorted(pd_merge.columns.to_list()) - sorted_ak = ak_merge.sort_values(sorted_columns).reset_index() - sorted_pd = 
pd_merge.sort_values(sorted_columns).reset_index(drop=True) - for col in sorted_columns: - assert np.allclose( - sorted_ak[col].to_ndarray(), sorted_pd[col].to_numpy(), equal_nan=True - ) - # TODO arkouda seems to be sometimes convert columns to floats on a right merge - # when pandas doesnt. Eventually we want to test frame_equal, not just value equality - # from pandas.testing import assert_frame_equal - # assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns]) + ones = ak.ones(size, int) + altr = ak.cast(ak.arange(size) % 2 == 0, int) + for truth in itertools.product([True, False], repeat=3): + left_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([a, b, ones], truth)] + right_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([c, d, altr], truth)] + left_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], left_arrs)}) + right_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], right_arrs)}) + l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas() + + for how in "inner", "left", "right": + for on in "first", "second", "third", ["first", "third"], ["second", "third"], None: + ak_merge = ak.merge(left_df, right_df, on=on, how=how) + pd_merge = pd.merge(l_pd, r_pd, on=on, how=how) + + sorted_columns = sorted(ak_merge.columns) + assert sorted_columns == sorted(pd_merge.columns.to_list()) + for col in sorted_columns: + from_ak = ak_merge[col].to_ndarray() + from_pd = pd_merge[col].to_numpy() + if isinstance(ak_merge[col], ak.pdarray): + assert np.allclose(np.sort(from_ak), np.sort(from_pd), equal_nan=True) + else: + # we have to cast to str because pandas arrays converted to numpy + # have dtype object and have float NANs in line with the str values + assert (np.sort(from_ak) == np.sort(from_pd.astype(str))).all() + # TODO arkouda seems to be sometimes convert columns to floats on a right merge + # when pandas doesnt. 
Eventually we want to test frame_equal, not just value equality + # from pandas.testing import assert_frame_equal + # assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns]) + + +def pda_to_str_helper(pda): + return ak.array([f"str {i}" for i in pda.to_list()]) diff --git a/PROTO_tests/tests/join_test.py b/PROTO_tests/tests/join_test.py index d77fbc5112..72b89e3807 100755 --- a/PROTO_tests/tests/join_test.py +++ b/PROTO_tests/tests/join_test.py @@ -87,20 +87,38 @@ def test_multi_array_inner_join(self): seed = 1 a = ak.randint(-size // 10, size // 10, size, seed=seed) b = ak.randint(-size // 10, size // 10, size, seed=seed + 1) - left = [a, ak.ones(size, int)] - right = [b, ak.cast(ak.arange(size) % 2 == 0, int)] - - # test with no where args - l_ind, r_ind = ak.join.inner_join(left, right) - for lf, rt in zip(left, right): - assert (lf[l_ind] == rt[r_ind]).all() - - # test with where args - def where_func(x, y): - return (x[0] % 2 == 0) | (y[0] % 2 == 0) - - l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right)) - assert where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all() + ones = ak.ones(size, int) + altr = ak.cast(ak.arange(size) % 2 == 0, int) + left_lists = [ + [a, ones], + [pda_to_str_helper(a), ones], + [a, pda_to_str_helper(ones)], + [pda_to_str_helper(a), pda_to_str_helper(ones)], + ] + right_list = [ + [b, altr], + [pda_to_str_helper(b), altr], + [b, pda_to_str_helper(altr)], + [pda_to_str_helper(b), pda_to_str_helper(altr)], + ] + for left, right in zip(left_lists, right_list): + # test with no where args + l_ind, r_ind = ak.join.inner_join(left, right) + for lf, rt in zip(left, right): + assert (lf[l_ind] == rt[r_ind]).all() + + # test with where args + def where_func(x, y): + x_bool = ( + (x[0] % 2 == 0) if isinstance(x[0], ak.pdarray) else (x[0].get_lengths() % 2 == 0) + ) + y_bool = ( + (y[0] % 2 == 0) if isinstance(y[0], ak.pdarray) else (y[0].get_lengths() % 2 == 0) + ) + return
x_bool | y_bool + + l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right)) + assert where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all() def test_str_inner_join(self): int_left = ak.arange(50) @@ -222,3 +240,7 @@ def test_error_handling(self): def join_where(left, right): return ak.arange(left.size) % 2 == 0 + + +def pda_to_str_helper(pda): + return ak.array([f"str {i}" for i in pda.to_list()]) diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py index 87b1c7c941..32e7aa7748 100644 --- a/arkouda/dataframe.py +++ b/arkouda/dataframe.py @@ -2921,7 +2921,10 @@ def merge( on = on if on is not None else col_intersect if not isinstance(on, str): - if not all(isinstance(left[col], pdarray) and isinstance(right[col], pdarray) for col in on): + if not all( + isinstance(left[col], (pdarray, Strings)) and isinstance(right[col], (pdarray, Strings)) + for col in on + ): raise ValueError("All columns of a multi-column merge must be pdarrays") if how == "inner": diff --git a/arkouda/join.py b/arkouda/join.py index 6b213c38fa..cf7b558264 100644 --- a/arkouda/join.py +++ b/arkouda/join.py @@ -180,12 +180,12 @@ def compute_join_size(a: pdarray, b: pdarray) -> Tuple[int, int]: @typechecked def inner_join( - left: Union[pdarray, Strings, Categorical, Sequence[pdarray]], - right: Union[pdarray, Strings, Categorical, Sequence[pdarray]], + left: Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]], + right: Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]], wherefunc: Callable = None, whereargs: Tuple[ - Union[pdarray, Strings, Categorical, Sequence[pdarray]], - Union[pdarray, Strings, Categorical, Sequence[pdarray]], + Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]], + Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]], ] = None, ) -> Tuple[pdarray, pdarray]: """Perform inner join on values in and , @@ -230,10 +230,6 @@ def inner_join( left, right = l.codes, 
r.codes if is_sequence: - if any(not isinstance(lf, pdarray) for lf in left) or any( - not isinstance(rt, pdarray) for rt in right - ): - raise TypeError("All elements of Multi-array arguments must be pdarrays") if len(left) != len(right): raise ValueError("Left must have same num arrays as right") left_size, right_size = left[0].size, right[0].size @@ -251,10 +247,6 @@ def inner_join( if whereargs is None or len(whereargs) != 2: raise ValueError("whereargs must be a 2-tuple with left and right arg arrays") if is_sequence: - if any(not isinstance(wa, pdarray) for wa in whereargs[0]) or any( - not isinstance(wa, pdarray) for wa in whereargs[1] - ): - raise TypeError("All elements of Multi-array arguments must be pdarrays") if len(whereargs[0]) != len(whereargs[1]): raise ValueError("Left must have same num arrays as right") first_wa_size, second_wa_size = whereargs[0][0].size, whereargs[1][0].size diff --git a/tests/dataframe_test.py b/tests/dataframe_test.py index 6d88bbb27e..c3888d6963 100644 --- a/tests/dataframe_test.py +++ b/tests/dataframe_test.py @@ -1,4 +1,5 @@ import glob +import itertools import os import random import string @@ -821,29 +822,41 @@ def test_multi_col_merge(self): b = ak.randint(-size // 10, size // 10, size, seed=seed + 1) c = ak.randint(-size // 10, size // 10, size, seed=seed + 2) d = ak.randint(-size // 10, size // 10, size, seed=seed + 3) - left_df = ak.DataFrame({"first": a, "second": b, "third": ak.ones(size, int)}) - right_df = ak.DataFrame( - {"first": c, "second": d, "third": ak.cast(ak.arange(size) % 2 == 0, int)} - ) - l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas() - - for how in "inner", "left", "right": - for on in "first", "second", "third", ["first", "third"], ["second", "third"], None: - ak_merge = ak.merge(left_df, right_df, on=on, how=how) - pd_merge = pd.merge(l_pd, r_pd, on=on, how=how) - - sorted_columns = sorted(ak_merge.columns) - self.assertListEqual(sorted_columns, sorted(pd_merge.columns.to_list())) - 
sorted_ak = ak_merge.sort_values(sorted_columns).reset_index() - sorted_pd = pd_merge.sort_values(sorted_columns).reset_index(drop=True) - for col in sorted_columns: - self.assertTrue( - np.allclose( - sorted_ak[col].to_ndarray(), sorted_pd[col].to_numpy(), equal_nan=True - ) - ) - - # TODO arkouda seems to be sometimes convert columns to floats on a right merge - # when pandas doesnt. Eventually we want to test frame_equal, not just value equality - # from pandas.testing import assert_frame_equal - # assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns]) + ones = ak.ones(size, int) + altr = ak.cast(ak.arange(size) % 2 == 0, int) + for truth in itertools.product([True, False], repeat=3): + left_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([a, b, ones], truth)] + right_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([c, d, altr], truth)] + left_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], left_arrs)}) + right_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], right_arrs)}) + l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas() + + for how in "inner", "left", "right": + for on in "first", "second", "third", ["first", "third"], ["second", "third"], None: + ak_merge = ak.merge(left_df, right_df, on=on, how=how) + pd_merge = pd.merge(l_pd, r_pd, on=on, how=how) + + sorted_columns = sorted(ak_merge.columns) + self.assertListEqual(sorted_columns, sorted(pd_merge.columns.to_list())) + for col in sorted_columns: + from_ak = ak_merge[col].to_ndarray() + from_pd = pd_merge[col].to_numpy() + if isinstance(ak_merge[col], ak.pdarray): + self.assertTrue( + np.allclose(np.sort(from_ak), np.sort(from_pd), equal_nan=True) + ) + else: + # we have to cast to str because pandas arrays converted to numpy + # have dtype object and have float NANs in line with the str values + self.assertTrue((np.sort(from_ak) == np.sort(from_pd.astype(str))).all()) + + # TODO arkouda seems to be 
sometimes convert columns to floats on a right merge + # when pandas doesnt. Eventually we want to test frame_equal, not just value equality + # from pandas.testing import assert_frame_equal + # sorted_ak = ak_merge.sort_values(sorted_columns).reset_index() + # sorted_pd = pd_merge.sort_values(sorted_columns).reset_index(drop=True) + # assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns]) + + +def pda_to_str_helper(pda): + return ak.array([f"str {i}" for i in pda.to_list()]) diff --git a/tests/join_test.py b/tests/join_test.py index b6d3acc2c8..4edcd9f156 100755 --- a/tests/join_test.py +++ b/tests/join_test.py @@ -119,20 +119,38 @@ def test_multi_array_inner_join(self): seed = 1 a = ak.randint(-size // 10, size // 10, size, seed=seed) b = ak.randint(-size // 10, size // 10, size, seed=seed + 1) - left = [a, ak.ones(size, int)] - right = [b, ak.cast(ak.arange(size) % 2 == 0, int)] - - # test with no where args - l_ind, r_ind = ak.join.inner_join(left, right) - for lf, rt in zip(left, right): - self.assertTrue((lf[l_ind] == rt[r_ind]).all()) - - # test with where args - def where_func(x, y): - return (x[0] % 2 == 0) | (y[0] % 2 == 0) - - l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right)) - self.assertTrue(where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all()) + ones = ak.ones(size, int) + altr = ak.cast(ak.arange(size) % 2 == 0, int) + left_lists = [ + [a, ones], + [pda_to_str_helper(a), ones], + [a, pda_to_str_helper(ones)], + [pda_to_str_helper(a), pda_to_str_helper(ones)], + ] + right_list = [ + [b, altr], + [pda_to_str_helper(b), altr], + [b, pda_to_str_helper(altr)], + [pda_to_str_helper(b), pda_to_str_helper(altr)], + ] + for left, right in zip(left_lists, right_list): + # test with no where args + l_ind, r_ind = ak.join.inner_join(left, right) + for lf, rt in zip(left, right): + self.assertTrue((lf[l_ind] == rt[r_ind]).all()) + + # test with where args + def where_func(x, y): + x_bool = 
( + (x[0] % 2 == 0) if isinstance(x[0], ak.pdarray) else (x[0].get_lengths() % 2 == 0) + ) + y_bool = ( + (y[0] % 2 == 0) if isinstance(y[0], ak.pdarray) else (y[0].get_lengths() % 2 == 0) + ) + return x_bool | y_bool + + l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right)) + self.assertTrue(where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all()) def test_str_inner_join(self): intLeft = ak.arange(50) @@ -248,3 +266,7 @@ def test_error_handling(self): def join_where(L, R): return ak.arange(L.size) % 2 == 0 + + +def pda_to_str_helper(pda): + return ak.array([f"str {i}" for i in pda.to_list()])