Make random data in Python tests deterministic #14071

Open · wants to merge 11 commits into base: branch-23.12
python/cudf/cudf/core/groupby/groupby.py (1 addition & 1 deletion)
@@ -950,7 +950,7 @@ def sample(
         frac: Optional[float] = None,
         replace: bool = False,
         weights: Union[abc.Sequence, "cudf.Series", None] = None,
-        random_state: Union[np.random.RandomState, int, None] = None,
+        random_state: Union[np.random.RandomState, int, None] = 1,
Contributor comment:
issue: I am not sure I like this change: it means that user code that previously worked by drawing a sequence of independent samples from groupby objects will now return the same result for every sample.
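A minimal sketch of the behavior change described above (hypothetical data, assuming the new default of random_state=1):

```python
import cudf

df = cudf.DataFrame({"key": [0, 0, 0, 1, 1, 1], "val": [1, 2, 3, 4, 5, 6]})
gb = df.groupby("key")

# Previously random_state defaulted to None, so each call drew fresh rows.
# With random_state=1 as the default, repeated draws are no longer independent:
first = gb.sample(n=1)
second = gb.sample(n=1)
# first and second now always contain the same rows unless the caller passes
# an explicit random_state.
```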

     ):
         """Return a random sample of items in each group.

python/cudf/cudf/core/indexed_frame.py (2 additions & 2 deletions)
@@ -3346,7 +3346,7 @@ def sample(
         frac=None,
         replace=False,
         weights=None,
-        random_state=None,
+        random_state=1,
Contributor comment:
issue: Similarly here, I don't think we should set a specific seed as the default argument to sample. This also creates a difference in the default API with respect to pandas, which defaults to None (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html).
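A short sketch of the API divergence being flagged (hypothetical frame, assuming the new default):

```python
import pandas as pd

import cudf

pdf = pd.DataFrame({"a": list(range(100))})
gdf = cudf.from_pandas(pdf)

# pandas: random_state defaults to None, so these two samples almost
# certainly differ.
p1, p2 = pdf.sample(n=5), pdf.sample(n=5)

# cudf with this change: random_state defaults to 1, so these two samples are
# identical; callers would have to pass random_state=None explicitly to get
# pandas-like behavior.
g1, g2 = gdf.sample(n=5), gdf.sample(n=5)
```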

         axis=None,
         ignore_index=False,
     ):
@@ -3387,7 +3387,7 @@ def sample(
             equal to the number of rows to sample from, and will be normalized
             to have a sum of 1. Unlike pandas, index alignment is not currently
             performed.
-        random_state : int, numpy/cupy RandomState, or None, default None
+        random_state : int, numpy/cupy RandomState, or None, default 1
             If None, default cupy random state is chosen.
             If int, the seed for the default cupy random state.
             If RandomState, rows-to-sample are generated from the RandomState.
python/cudf/cudf/datasets.py (3 additions & 3 deletions)
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.

 import numpy as np
 import pandas as pd
@@ -18,7 +18,7 @@ def timeseries(
     freq="1s",
     dtypes=None,
     nulls_frequency=0,
-    seed=None,
+    seed=1,
 ):
     """Create timeseries dataframe with random data

@@ -80,7 +80,7 @@ def timeseries(
     return gdf


-def randomdata(nrows=10, dtypes=None, seed=None):
+def randomdata(nrows=10, dtypes=None, seed=1):
Contributor comment:
note (non-blocking): I am on the fence about these defaults; I suppose they are OK. Perhaps it would be better to flip this to a no-default, keyword-only argument, forcing the caller to specify a seed:

Suggested change:
-def randomdata(nrows=10, dtypes=None, seed=1):
+def randomdata(nrows=10, dtypes=None, *, seed):
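For illustration, a sketch of what the suggested keyword-only signature would enforce (hypothetical body; not part of this PR):

```python
def randomdata(nrows=10, dtypes=None, *, seed):
    """Every caller must now state its seed explicitly."""
    ...


randomdata(20, seed=42)  # OK: the seed is visible at the call site
# randomdata(20)  # TypeError: missing 1 required keyword-only argument: 'seed'
```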

"""Create a dataframe with random data

Parameters
Expand Down
python/cudf/cudf/testing/_utils.py (1 addition & 1 deletion)
@@ -49,7 +49,7 @@
 ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES))


-def set_random_null_mask_inplace(series, null_probability=0.5, seed=None):
+def set_random_null_mask_inplace(series, null_probability=0.5, seed=1):
     """Randomly nullify elements in series with the provided probability."""
     probs = [null_probability, 1 - null_probability]
     rng = np.random.default_rng(seed=seed)
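For context, the seeding pattern this helper relies on, reduced to a standalone sketch (simplified; not the actual cudf implementation):

```python
import numpy as np


def random_null_mask(n, null_probability=0.5, seed=1):
    # With a fixed seed, default_rng is reproducible: every call returns the
    # same mask. With seed=None it would draw fresh OS entropy each time.
    rng = np.random.default_rng(seed=seed)
    return rng.choice(
        [True, False], size=n, p=[null_probability, 1 - null_probability]
    )


assert (random_null_mask(8) == random_null_mask(8)).all()
```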
python/cudf/cudf/testing/dataset_generator.py (3 additions & 5 deletions)
@@ -71,15 +71,15 @@ class Parameters:
         Number of rows to generate
     column_parameters : List[ColumnParams]
         ColumnParams for each column
-    seed : int or None, default None
+    seed : int or None, default 1
         Seed for random data generation
     """

     def __init__(
         self,
         num_rows=2048,
         column_parameters=None,
-        seed=None,
+        seed=1,
     ):
         self.num_rows = num_rows
         if column_parameters is None:
@@ -312,9 +312,7 @@ def get_dataframe(parameters, use_threads):
     return tbl


-def rand_dataframe(
-    dtypes_meta, rows, seed=random.randint(0, 2**32 - 1), use_threads=True
-):
+def rand_dataframe(dtypes_meta, rows, seed=1, use_threads=True):
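The old default is worth a note: `seed=random.randint(0, 2**32 - 1)` is evaluated once, when the function is defined, not on each call, so every call that omitted `seed` silently shared one process-wide seed anyway. A minimal sketch of that pitfall:

```python
import random


def old_style(seed=random.randint(0, 2**32 - 1)):
    # The default was fixed at definition time, not drawn per call.
    return seed


assert old_style() == old_style()  # the "random" default never changes
```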
"""
Generates a random table.

Expand Down
python/cudf/cudf/tests/test_array_ufunc.py (6 additions & 6 deletions)
@@ -181,8 +181,8 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed):
     # Converting nullable integer cudf.Series to pandas will produce a
     # float pd.Series, so instead we replace nulls with an arbitrary
     # integer value, precompute the mask, and then reapply it afterwards.
-    for arg in args:
-        set_random_null_mask_inplace(arg)
+    for idx, arg in enumerate(args):
+        set_random_null_mask_inplace(arg, seed=idx)
Contributor (author) comment:
seed=idx ensures different null masks for different columns.
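A small sketch of the intent (using plain numpy Generator semantics rather than the cudf helper):

```python
import numpy as np


def mask(seed, n=6, p=0.5):
    # Deterministic for a given seed; different seeds give different masks.
    return np.random.default_rng(seed).random(n) < p


assert (mask(0) == mask(0)).all()  # reproducible across runs
# mask(0) and mask(1) generally differ, so each column gets its own mask.
```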

     pandas_args = [arg.fillna(0) for arg in args]

     # Note: Different indexes must be aligned before the mask is computed.
@@ -261,8 +261,8 @@ def test_binary_ufunc_series_array(
     # Converting nullable integer cudf.Series to pandas will produce a
     # float pd.Series, so instead we replace nulls with an arbitrary
     # integer value, precompute the mask, and then reapply it afterwards.
-    for arg in args:
-        set_random_null_mask_inplace(arg)
+    for idx, arg in enumerate(args):
+        set_random_null_mask_inplace(arg, seed=idx)

     # Cupy doesn't support nulls, so we fill with nans before converting.
     args[1] = args[1].fillna(cp.nan)
@@ -403,8 +403,8 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed):
     # Converting nullable integer cudf.Series to pandas will produce a
     # float pd.Series, so instead we replace nulls with an arbitrary
     # integer value, precompute the mask, and then reapply it afterwards.
-    for arg in args:
-        set_random_null_mask_inplace(arg["foo"])
+    for idx, arg in enumerate(args):
+        set_random_null_mask_inplace(arg["foo"], seed=idx)
     pandas_args = [arg.copy() for arg in args]
     for arg in pandas_args:
         arg["foo"] = arg["foo"].fillna(0)
python/cudf/cudf/tests/test_datasets.py (3 additions & 4 deletions)
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.

 import numpy as np

@@ -8,10 +8,10 @@

 def test_dataset_timeseries():
     gdf1 = gd.datasets.timeseries(
-        dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1
+        dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3
     )
     gdf2 = gd.datasets.timeseries(
-        dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1
+        dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3
     )

     assert_eq(gdf1, gdf2)
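With the explicit seed=1 arguments dropped, this assertion now leans on the new default seed inside timeseries. The principle it depends on, as a one-line numpy sketch:

```python
import numpy as np

# Two generators built from the same fixed seed produce identical streams,
# which is what assert_eq(gdf1, gdf2) relies on above.
assert (
    np.random.default_rng(1).random(5) == np.random.default_rng(1).random(5)
).all()
```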
@@ -26,7 +26,7 @@ def test_dataset_timeseries():
         freq="2H",
         dtypes={"value": float, "name": "category", "id": int},
         nulls_frequency=0.7,
-        seed=1,
     )

     assert gdf["value"].head().dtype == float

python/cudf/cudf/tests/test_groupby.py (0 additions & 3 deletions)
@@ -2550,7 +2550,6 @@ def test_groupby_fillna_multi_value(nelem):
         ],
         rows=nelem,
         use_threads=False,
-        seed=0,
     )
     key_col = "0"
     value_cols = ["1", "2", "3", "4", "5", "6"]
@@ -2595,7 +2594,6 @@ def test_groupby_fillna_multi_value_df(nelem):
         ],
         rows=nelem,
         use_threads=False,
-        seed=0,
     )
     key_col = "0"
     value_cols = ["1", "2", "3", "4", "5"]
@@ -2671,7 +2669,6 @@ def test_groupby_fillna_method(nelem, method):
         ],
         rows=nelem,
         use_threads=False,
-        seed=0,
     )
     key_col = "0"
     value_cols = ["1", "2", "3", "4", "5", "6", "7", "8"]

python/cudf/cudf/tests/test_orc.py (3 additions & 3 deletions)
@@ -468,7 +468,7 @@ def test_chunked_orc_writer(
 def test_orc_writer_strings(tmpdir, dtypes):
     gdf_fname = tmpdir.join("gdf_strings.orc")

-    expect = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
+    expect = cudf.datasets.randomdata(nrows=10, dtypes=dtypes)
     expect.to_orc(gdf_fname)
     got = pd.read_orc(gdf_fname)

@@ -487,7 +487,7 @@ def test_chunked_orc_writer_strings(tmpdir, dtypes):
 def test_chunked_orc_writer_strings(tmpdir, dtypes):
     gdf_fname = tmpdir.join("chunked_gdf_strings.orc")

-    gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
+    gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes)
     pdf = gdf.to_pandas()
     expect = pd.concat([pdf, pdf]).reset_index(drop=True)
     writer = ORCWriter(gdf_fname)
@@ -1661,7 +1661,7 @@ def test_writer_protobuf_large_rowindexentry():
 @pytest.mark.parametrize("compression", ["ZLIB", "ZSTD"])
 def test_orc_writer_nvcomp(compression):
     expected = cudf.datasets.randomdata(
-        nrows=12345, dtypes={"a": int, "b": str, "c": float}, seed=1
+        nrows=12345, dtypes={"a": int, "b": str, "c": float}
     )

     buff = BytesIO()

python/cudf/cudf/tests/test_parquet.py (0 additions & 2 deletions)
@@ -1316,7 +1316,6 @@ def test_delta_binary(nrows, add_nulls, tmpdir):
             },
         ],
         rows=nrows,
-        seed=0,
         use_threads=False,
     )
     # Roundabout conversion to pandas to preserve nulls/data types
@@ -1469,7 +1468,6 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf):


 def test_multifile_parquet_folder(tmpdir):
-
     test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2)
     test_pdf2 = make_pdf(nrows=20)
     expect = pd.concat([test_pdf1, test_pdf2])

python/dask_cudf/dask_cudf/io/tests/test_orc.py (2 additions & 4 deletions)
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.

 import glob
 import os
@@ -85,7 +85,6 @@ def test_read_orc_filtered(tmpdir, engine, predicate, expected_len):


 def test_read_orc_first_file_empty(tmpdir):
-
     # Write a 3-file dataset where the first file is empty
     # See: https://github.com/rapidsai/cudf/issues/8011
     path = str(tmpdir)
@@ -112,9 +111,8 @@ def test_read_orc_first_file_empty(tmpdir):
     ],
 )
 def test_to_orc(tmpdir, dtypes, compression, compute):
-
     # Create cudf and dask_cudf dataframes
-    df = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
+    df = cudf.datasets.randomdata(nrows=10, dtypes=dtypes)
     df = df.set_index("index").sort_index()
     ddf = dask_cudf.from_cudf(df, npartitions=3)