Closes Bears-R-Us#2853: df.merge on a mix of String and integer columns (Bears-R-Us#2859)

This PR (closes Bears-R-Us#2853) adds `df.merge` on a mix of String and integer columns and updates the associated multi-column tests.

Co-authored-by: Pierce Hayes <[email protected]>
bmcdonald3 · Nov 21, 2023 · d5925c0 · d5925c0
1 parent 8215275
commit d5925c0
Show file tree

Hide file tree

Showing 6 changed files with 154 additions and 90 deletions.
diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
@@ -1,3 +1,5 @@
+import itertools
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -674,26 +676,36 @@ def test_multi_col_merge(self):
         b = ak.randint(-size // 10, size // 10, size, seed=seed + 1)
         c = ak.randint(-size // 10, size // 10, size, seed=seed + 2)
         d = ak.randint(-size // 10, size // 10, size, seed=seed + 3)
-        left_df = ak.DataFrame({"first": a, "second": b, "third": ak.ones(size, int)})
-        right_df = ak.DataFrame(
-            {"first": c, "second": d, "third": ak.cast(ak.arange(size) % 2 == 0, int)}
-        )
-        l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas()
-
-        for how in "inner", "left", "right":
-            for on in "first", "second", "third", ["first", "third"], ["second", "third"], None:
-                ak_merge = ak.merge(left_df, right_df, on=on, how=how)
-                pd_merge = pd.merge(l_pd, r_pd, on=on, how=how)
-
-                sorted_columns = sorted(ak_merge.columns)
-                assert sorted_columns == sorted(pd_merge.columns.to_list())
-                sorted_ak = ak_merge.sort_values(sorted_columns).reset_index()
-                sorted_pd = pd_merge.sort_values(sorted_columns).reset_index(drop=True)
-                for col in sorted_columns:
-                    assert np.allclose(
-                        sorted_ak[col].to_ndarray(), sorted_pd[col].to_numpy(), equal_nan=True
-                    )
-                # TODO arkouda seems to be sometimes convert columns to floats on a right merge
-                #  when pandas doesnt. Eventually we want to test frame_equal, not just value equality
-                # from pandas.testing import assert_frame_equal
-                # assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns])
+        ones = ak.ones(size, int)
+        altr = ak.cast(ak.arange(size) % 2 == 0, int)
+        for truth in itertools.product([True, False], repeat=3):
+            left_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([a, b, ones], truth)]
+            right_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([c, d, altr], truth)]
+            left_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], left_arrs)})
+            right_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], right_arrs)})
+            l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas()
+
+            for how in "inner", "left", "right":
+                for on in "first", "second", "third", ["first", "third"], ["second", "third"], None:
+                    ak_merge = ak.merge(left_df, right_df, on=on, how=how)
+                    pd_merge = pd.merge(l_pd, r_pd, on=on, how=how)
+
+                    sorted_columns = sorted(ak_merge.columns)
+                    assert sorted_columns == sorted(pd_merge.columns.to_list())
+                    for col in sorted_columns:
+                        from_ak = ak_merge[col].to_ndarray()
+                        from_pd = pd_merge[col].to_numpy()
+                        if isinstance(ak_merge[col], ak.pdarray):
+                            assert np.allclose(np.sort(from_ak), np.sort(from_pd), equal_nan=True)
+                        else:
+                            # we have to cast to str because pandas arrays converted to numpy
+                            # have dtype object and have float NANs in line with the str values
+                            assert (np.sort(from_ak) == np.sort(from_pd.astype(str))).all()
+                    # TODO arkouda sometimes seems to convert columns to floats on a right merge when
+                    #  pandas doesn't. Eventually we want to test frame_equal, not just value equality
+                    # from pandas.testing import assert_frame_equal
+                    # assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns])
+
+
+def pda_to_str_helper(pda):  # test helper: render each element of pda as "str <element>"
+    return ak.array([f"str {i}" for i in pda.to_list()])
diff --git a/PROTO_tests/tests/join_test.py b/PROTO_tests/tests/join_test.py
@@ -87,20 +87,38 @@ def test_multi_array_inner_join(self):
         seed = 1
         a = ak.randint(-size // 10, size // 10, size, seed=seed)
         b = ak.randint(-size // 10, size // 10, size, seed=seed + 1)
-        left = [a, ak.ones(size, int)]
-        right = [b, ak.cast(ak.arange(size) % 2 == 0, int)]
-
-        # test with no where args
-        l_ind, r_ind = ak.join.inner_join(left, right)
-        for lf, rt in zip(left, right):
-            assert (lf[l_ind] == rt[r_ind]).all()
-
-        # test with where args
-        def where_func(x, y):
-            return (x[0] % 2 == 0) | (y[0] % 2 == 0)
-
-        l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right))
-        assert where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all()
+        ones = ak.ones(size, int)
+        altr = ak.cast(ak.arange(size) % 2 == 0, int)
+        left_lists = [
+            [a, ones],
+            [pda_to_str_helper(a), ones],
+            [a, pda_to_str_helper(ones)],
+            [pda_to_str_helper(a), pda_to_str_helper(ones)],
+        ]
+        right_list = [
+            [b, altr],
+            [pda_to_str_helper(b), altr],
+            [b, pda_to_str_helper(altr)],
+            [pda_to_str_helper(b), pda_to_str_helper(altr)],
+        ]
+        for left, right in zip(left_lists, right_list):
+            # test with no where args
+            l_ind, r_ind = ak.join.inner_join(left, right)
+            for lf, rt in zip(left, right):
+                assert (lf[l_ind] == rt[r_ind]).all()
+
+            # test with where args
+            def where_func(x, y):
+                # Keep a pair when the first key on either side is "even": an even value
+                # for a pdarray, otherwise an even string length (via get_lengths()).
+                def is_even(arr):
+                    vals = arr if isinstance(arr, ak.pdarray) else arr.get_lengths()
+                    return vals % 2 == 0
+
+                return is_even(x[0]) | is_even(y[0])
+
+            l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right))
+            assert where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all()
 
     def test_str_inner_join(self):
         int_left = ak.arange(50)
@@ -222,3 +240,7 @@ def test_error_handling(self):
 
 def join_where(left, right):
     return ak.arange(left.size) % 2 == 0
+
+
+def pda_to_str_helper(pda):  # test helper: render each element of pda as "str <element>"
+    return ak.array([f"str {i}" for i in pda.to_list()])
diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py
@@ -2921,7 +2921,10 @@ def merge(
     on = on if on is not None else col_intersect
 
     if not isinstance(on, str):
-        if not all(isinstance(left[col], pdarray) and isinstance(right[col], pdarray) for col in on):
+        if not all(
+            isinstance(left[col], (pdarray, Strings)) and isinstance(right[col], (pdarray, Strings))
+            for col in on
+        ):
             raise ValueError("All columns of a multi-column merge must be pdarrays")
 
     if how == "inner":

diff --git a/arkouda/join.py b/arkouda/join.py
@@ -180,12 +180,12 @@ def compute_join_size(a: pdarray, b: pdarray) -> Tuple[int, int]:
 
 @typechecked
 def inner_join(
-    left: Union[pdarray, Strings, Categorical, Sequence[pdarray]],
-    right: Union[pdarray, Strings, Categorical, Sequence[pdarray]],
+    left: Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
+    right: Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
     wherefunc: Callable = None,
     whereargs: Tuple[
-        Union[pdarray, Strings, Categorical, Sequence[pdarray]],
-        Union[pdarray, Strings, Categorical, Sequence[pdarray]],
+        Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
+        Union[pdarray, Strings, Categorical, Sequence[Union[pdarray, Strings]]],
     ] = None,
 ) -> Tuple[pdarray, pdarray]:
     """Perform inner join on values in <left> and <right>,
@@ -230,10 +230,6 @@ def inner_join(
         left, right = l.codes, r.codes
 
     if is_sequence:
-        if any(not isinstance(lf, pdarray) for lf in left) or any(
-            not isinstance(rt, pdarray) for rt in right
-        ):
-            raise TypeError("All elements of Multi-array arguments must be pdarrays")
         if len(left) != len(right):
             raise ValueError("Left must have same num arrays as right")
         left_size, right_size = left[0].size, right[0].size
@@ -251,10 +247,6 @@ def inner_join(
         if whereargs is None or len(whereargs) != 2:
             raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
         if is_sequence:
-            if any(not isinstance(wa, pdarray) for wa in whereargs[0]) or any(
-                not isinstance(wa, pdarray) for wa in whereargs[1]
-            ):
-                raise TypeError("All elements of Multi-array arguments must be pdarrays")
             if len(whereargs[0]) != len(whereargs[1]):
                 raise ValueError("Left must have same num arrays as right")
             first_wa_size, second_wa_size = whereargs[0][0].size, whereargs[1][0].size

diff --git a/tests/dataframe_test.py b/tests/dataframe_test.py
@@ -1,4 +1,5 @@
 import glob
+import itertools
 import os
 import random
 import string
@@ -821,29 +822,41 @@ def test_multi_col_merge(self):
         b = ak.randint(-size // 10, size // 10, size, seed=seed + 1)
         c = ak.randint(-size // 10, size // 10, size, seed=seed + 2)
         d = ak.randint(-size // 10, size // 10, size, seed=seed + 3)
-        left_df = ak.DataFrame({"first": a, "second": b, "third": ak.ones(size, int)})
-        right_df = ak.DataFrame(
-            {"first": c, "second": d, "third": ak.cast(ak.arange(size) % 2 == 0, int)}
-        )
-        l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas()
-
-        for how in "inner", "left", "right":
-            for on in "first", "second", "third", ["first", "third"], ["second", "third"], None:
-                ak_merge = ak.merge(left_df, right_df, on=on, how=how)
-                pd_merge = pd.merge(l_pd, r_pd, on=on, how=how)
-
-                sorted_columns = sorted(ak_merge.columns)
-                self.assertListEqual(sorted_columns, sorted(pd_merge.columns.to_list()))
-                sorted_ak = ak_merge.sort_values(sorted_columns).reset_index()
-                sorted_pd = pd_merge.sort_values(sorted_columns).reset_index(drop=True)
-                for col in sorted_columns:
-                    self.assertTrue(
-                        np.allclose(
-                            sorted_ak[col].to_ndarray(), sorted_pd[col].to_numpy(), equal_nan=True
-                        )
-                    )
-
-                # TODO arkouda seems to be sometimes convert columns to floats on a right merge
-                #  when pandas doesnt. Eventually we want to test frame_equal, not just value equality
-                # from pandas.testing import assert_frame_equal
-                # assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns])
+        ones = ak.ones(size, int)
+        altr = ak.cast(ak.arange(size) % 2 == 0, int)
+        for truth in itertools.product([True, False], repeat=3):
+            left_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([a, b, ones], truth)]
+            right_arrs = [pda if t else pda_to_str_helper(pda) for pda, t in zip([c, d, altr], truth)]
+            left_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], left_arrs)})
+            right_df = ak.DataFrame({k: v for k, v in zip(["first", "second", "third"], right_arrs)})
+            l_pd, r_pd = left_df.to_pandas(), right_df.to_pandas()
+
+            for how in "inner", "left", "right":
+                for on in "first", "second", "third", ["first", "third"], ["second", "third"], None:
+                    ak_merge = ak.merge(left_df, right_df, on=on, how=how)
+                    pd_merge = pd.merge(l_pd, r_pd, on=on, how=how)
+
+                    sorted_columns = sorted(ak_merge.columns)
+                    self.assertListEqual(sorted_columns, sorted(pd_merge.columns.to_list()))
+                    for col in sorted_columns:
+                        from_ak = ak_merge[col].to_ndarray()
+                        from_pd = pd_merge[col].to_numpy()
+                        if isinstance(ak_merge[col], ak.pdarray):
+                            self.assertTrue(
+                                np.allclose(np.sort(from_ak), np.sort(from_pd), equal_nan=True)
+                            )
+                        else:
+                            # we have to cast to str because pandas arrays converted to numpy
+                            # have dtype object and have float NANs in line with the str values
+                            self.assertTrue((np.sort(from_ak) == np.sort(from_pd.astype(str))).all())
+
+                    # TODO arkouda sometimes seems to convert columns to floats on a right merge when
+                    #  pandas doesn't. Eventually we want to test frame_equal, not just value equality
+                    # from pandas.testing import assert_frame_equal
+                    # sorted_ak = ak_merge.sort_values(sorted_columns).reset_index()
+                    # sorted_pd = pd_merge.sort_values(sorted_columns).reset_index(drop=True)
+                    # assert_frame_equal(sorted_ak.to_pandas()[sorted_columns], sorted_pd[sorted_columns])
+
+
+def pda_to_str_helper(pda):  # test helper: render each element of pda as "str <element>"
+    return ak.array([f"str {i}" for i in pda.to_list()])
diff --git a/tests/join_test.py b/tests/join_test.py
@@ -119,20 +119,38 @@ def test_multi_array_inner_join(self):
         seed = 1
         a = ak.randint(-size // 10, size // 10, size, seed=seed)
         b = ak.randint(-size // 10, size // 10, size, seed=seed + 1)
-        left = [a, ak.ones(size, int)]
-        right = [b, ak.cast(ak.arange(size) % 2 == 0, int)]
-
-        # test with no where args
-        l_ind, r_ind = ak.join.inner_join(left, right)
-        for lf, rt in zip(left, right):
-            self.assertTrue((lf[l_ind] == rt[r_ind]).all())
-
-        # test with where args
-        def where_func(x, y):
-            return (x[0] % 2 == 0) | (y[0] % 2 == 0)
-
-        l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right))
-        self.assertTrue(where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all())
+        ones = ak.ones(size, int)
+        altr = ak.cast(ak.arange(size) % 2 == 0, int)
+        left_lists = [
+            [a, ones],
+            [pda_to_str_helper(a), ones],
+            [a, pda_to_str_helper(ones)],
+            [pda_to_str_helper(a), pda_to_str_helper(ones)],
+        ]
+        right_list = [
+            [b, altr],
+            [pda_to_str_helper(b), altr],
+            [b, pda_to_str_helper(altr)],
+            [pda_to_str_helper(b), pda_to_str_helper(altr)],
+        ]
+        for left, right in zip(left_lists, right_list):
+            # test with no where args
+            l_ind, r_ind = ak.join.inner_join(left, right)
+            for lf, rt in zip(left, right):
+                self.assertTrue((lf[l_ind] == rt[r_ind]).all())
+
+            # test with where args
+            def where_func(x, y):
+                # Keep a pair when the first key on either side is "even": an even value
+                # for a pdarray, otherwise an even string length (via get_lengths()).
+                def is_even(arr):
+                    vals = arr if isinstance(arr, ak.pdarray) else arr.get_lengths()
+                    return vals % 2 == 0
+
+                return is_even(x[0]) | is_even(y[0])
+
+            l_ind, r_ind = ak.join.inner_join(left, right, where_func, (left, right))
+            self.assertTrue(where_func([lf[l_ind] for lf in left], [rt[r_ind] for rt in right]).all())
 
     def test_str_inner_join(self):
         intLeft = ak.arange(50)
@@ -248,3 +266,7 @@ def test_error_handling(self):
 
 def join_where(L, R):
     return ak.arange(L.size) % 2 == 0
+
+
+def pda_to_str_helper(pda):  # test helper: render each element of pda as "str <element>"
+    return ak.array([f"str {i}" for i in pda.to_list()])