Skip to content

Commit

Permalink
Reverts DataFrame Indexing Changes (Bears-R-Us#3323)
Browse files Browse the repository at this point in the history
* Revert "missed an iloc call that was affecting benchmark performance"

This reverts commit d664ac9.

* Revert "missing iloc usage"

This reverts commit 31d3185.

* stragglers for reverting dataframe indexing

* updates to remove .values calls

* flake changes
  • Loading branch information
brandon-neth authored Jun 13, 2024
1 parent 3043f38 commit 416cb90
Show file tree
Hide file tree
Showing 10 changed files with 242 additions and 2,464 deletions.
978 changes: 20 additions & 958 deletions PROTO_tests/tests/dataframe_test.py

Large diffs are not rendered by default.

22 changes: 11 additions & 11 deletions PROTO_tests/tests/io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ def test_read_nested(self):
assert "idx" in data
assert "seg" in data
assert df["idx"].to_list() == data["idx"].to_list()
assert df["seg"].values.to_list() == data["seg"].to_list()
assert df["seg"].to_list() == data["seg"].to_list()

# test read with read_nested=false and no supplied datasets
data = ak.read_parquet(f"{file_name}*", read_nested=False)["idx"]
Expand All @@ -495,7 +495,7 @@ def test_read_nested(self):
assert "idx" in data
assert "seg" in data
assert df["idx"].to_list() == data["idx"].to_list()
assert df["seg"].values.to_list() == data["seg"].to_list()
assert df["seg"].to_list() == data["seg"].to_list()

@pytest.mark.parametrize("comp", COMPRESSIONS)
def test_ipv4_columns(self, comp):
Expand Down Expand Up @@ -524,7 +524,7 @@ def test_ipv4_columns(self, comp):

# test replacement of IPv4 with uint representation
df = ak.DataFrame({"a": ak.IPv4(ak.arange(10))})
df["a"] = df["a"].values.export_uint()
df["a"] = df["a"].export_uint()
assert ak.arange(10).to_list() == df["a"].to_list()

def test_empty_segs_segarray(self):
Expand Down Expand Up @@ -562,7 +562,7 @@ def test_empty_segs_segarray(self):
pddf.to_parquet(file_path)
akdf = ak.DataFrame(ak.read_parquet(file_path))

to_pd = pd.Series(akdf["rand"].values.to_list())
to_pd = pd.Series(akdf["rand"].to_list())
# raises an error if the two series aren't equal
# we can't use np.allclose(pddf['rand'].to_list(), akdf['rand'].to_list()) since these
# are lists of lists. assert_series_equal handles this and properly handles nans.
Expand Down Expand Up @@ -702,7 +702,7 @@ def test_read_and_write_with_dict(self):
for col_name in akdf.columns.values:
gen_arr = ak.read_hdf(f"{file_name}*", datasets=[col_name])[col_name]
if akdf[col_name].dtype != ak.float64:
assert akdf[col_name].values.to_list() == gen_arr.to_list()
assert akdf[col_name].to_list() == gen_arr.to_list()
else:
a = akdf[col_name].to_ndarray()
b = gen_arr.to_ndarray()
Expand Down Expand Up @@ -742,7 +742,7 @@ def test_read_and_write_with_dict(self):
# verify generic load works
gen_arr = ak.load(path_prefix=file_name, dataset=col_name)[col_name]
if akdf[col_name].dtype != ak.float64:
assert akdf[col_name].values.to_list() == gen_arr.to_list()
assert akdf[col_name].to_list() == gen_arr.to_list()
else:
a = akdf[col_name].to_ndarray()
b = gen_arr.to_ndarray()
Expand All @@ -754,7 +754,7 @@ def test_read_and_write_with_dict(self):
# verify generic load works with file_format parameter
gen_arr = ak.load(path_prefix=file_name, dataset=col_name, file_format="HDF5")[col_name]
if akdf[col_name].dtype != ak.float64:
assert akdf[col_name].values.to_list() == gen_arr.to_list()
assert akdf[col_name].to_list() == gen_arr.to_list()
else:
a = akdf[col_name].to_ndarray()
b = gen_arr.to_ndarray()
Expand Down Expand Up @@ -1229,7 +1229,7 @@ def test_hdf_overwrite_dataframe(self):
data = ak.read_hdf(f"{file_name}*")
odf_keys = list(odf.keys())
for key in df.keys():
assert (data[key] == (odf[key].values if key in odf_keys else df[key].values)).all()
assert (data[key] == (odf[key] if key in odf_keys else df[key])).all()

def test_overwrite_segarray(self):
sa1 = ak.SegArray(ak.arange(0, 1000, 5), ak.arange(1000))
Expand Down Expand Up @@ -1429,9 +1429,9 @@ def test_special_objtype(self):
assert isinstance(rd_df["ip"], ak.IPv4)
assert isinstance(rd_df["datetime"], ak.Datetime)
assert isinstance(rd_df["timedelta"], ak.Timedelta)
assert df["ip"].values.to_list() == rd_df["ip"].to_list()
assert df["datetime"].values.to_list() == rd_df["datetime"].to_list()
assert df["timedelta"].values.to_list() == rd_df["timedelta"].to_list()
assert df["ip"].to_list() == rd_df["ip"].to_list()
assert df["datetime"].to_list() == rd_df["datetime"].to_list()
assert df["timedelta"].to_list() == rd_df["timedelta"].to_list()


class TestCSV:
Expand Down
9 changes: 0 additions & 9 deletions PROTO_tests/tests/series_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,12 +341,3 @@ def test_fillna(self):

fill_values3 = 100.0
assert data.fillna(fill_values3).to_list() == [1.0, 100.0, 3.0, 100.0, 5.0]

def test_series_segarray_to_pandas(self):
# reproducer for issue #3222
sa = ak.SegArray(ak.arange(0, 30, 3), ak.arange(30))
akdf = ak.DataFrame({"test": sa})
pddf = pd.DataFrame({"test": sa.to_list()})

assert_frame_equal(akdf.to_pandas(), pddf)
assert_series_equal(akdf['test'].to_pandas(), pddf['test'], check_names=False)
Loading

0 comments on commit 416cb90

Please sign in to comment.