Make random data in Python tests deterministic #14071

Open · wants to merge 11 commits into base: branch-23.12
python/cudf/cudf/core/groupby/groupby.py (1 addition & 1 deletion)
@@ -950,7 +950,7 @@ def sample(
         frac: Optional[float] = None,
         replace: bool = False,
         weights: Union[abc.Sequence, "cudf.Series", None] = None,
-        random_state: Union[np.random.RandomState, int, None] = None,
+        random_state: Union[np.random.RandomState, int, None] = 1,
Contributor comment:
issue: I am not sure I like this change: it means that user code that previously worked by drawing a sequence of independent samples from groupby objects will now return the same result for every sample.
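A minimal sketch of the behavior change described above (hypothetical data, assuming the new default of random_state=1):

```python
import cudf

df = cudf.DataFrame({"key": [0, 0, 0, 1, 1, 1], "val": [1, 2, 3, 4, 5, 6]})
gb = df.groupby("key")

# Previously random_state defaulted to None, so each call drew fresh rows.
# With random_state=1 as the default, repeated draws are no longer independent:
first = gb.sample(n=1)
second = gb.sample(n=1)
# first and second now always contain the same rows unless the caller passes
# an explicit random_state.
```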

     ):
         """Return a random sample of items in each group.

python/cudf/cudf/core/indexed_frame.py (2 additions & 2 deletions)
@@ -3346,7 +3346,7 @@ def sample(
         frac=None,
         replace=False,
         weights=None,
-        random_state=None,
+        random_state=1,
Contributor comment:
issue: Similarly here, I don't think we should set a specific seed as the default argument to sample. This also creates a difference in the default API with respect to pandas, which defaults to None (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html).
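A short sketch of the API divergence being flagged (hypothetical frame, assuming the new default):

```python
import pandas as pd

import cudf

pdf = pd.DataFrame({"a": list(range(100))})
gdf = cudf.from_pandas(pdf)

# pandas: random_state defaults to None, so these two samples almost
# certainly differ.
p1, p2 = pdf.sample(n=5), pdf.sample(n=5)

# cudf with this change: random_state defaults to 1, so these two samples are
# identical; callers would have to pass random_state=None explicitly to get
# pandas-like behavior.
g1, g2 = gdf.sample(n=5), gdf.sample(n=5)
```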

         axis=None,
         ignore_index=False,
     ):
@@ -3387,7 +3387,7 @@ def sample(
             equal to the number of rows to sample from, and will be normalized
             to have a sum of 1. Unlike pandas, index alignment is not currently
             performed.
-        random_state : int, numpy/cupy RandomState, or None, default None
+        random_state : int, numpy/cupy RandomState, or None, default 1
             If None, default cupy random state is chosen.
             If int, the seed for the default cupy random state.
             If RandomState, rows-to-sample are generated from the RandomState.
python/cudf/cudf/datasets.py (3 additions & 3 deletions)
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.

 import numpy as np
 import pandas as pd
@@ -18,7 +18,7 @@ def timeseries(
     freq="1s",
     dtypes=None,
     nulls_frequency=0,
-    seed=None,
+    seed=1,
 ):
     """Create timeseries dataframe with random data

@@ -80,7 +80,7 @@ def timeseries(
     return gdf


-def randomdata(nrows=10, dtypes=None, seed=None):
+def randomdata(nrows=10, dtypes=None, seed=1):
Contributor comment:
note (non-blocking): I am on the fence about these defaults; I suppose they are OK. Perhaps it would be better to flip this to a no-default, keyword-only argument, forcing the caller to specify a seed:

Suggested change:
-def randomdata(nrows=10, dtypes=None, seed=1):
+def randomdata(nrows=10, dtypes=None, *, seed):
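For illustration, a sketch of what the suggested keyword-only signature would enforce (hypothetical body; not part of this PR):

```python
def randomdata(nrows=10, dtypes=None, *, seed):
    """Every caller must now state its seed explicitly."""
    ...


randomdata(20, seed=42)  # OK: the seed is visible at the call site
# randomdata(20)  # TypeError: missing 1 required keyword-only argument: 'seed'
```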

"""Create a dataframe with random data

Parameters
Expand Down
python/cudf/cudf/testing/_utils.py (1 addition & 1 deletion)
@@ -49,7 +49,7 @@
 ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES))


-def set_random_null_mask_inplace(series, null_probability=0.5, seed=None):
+def set_random_null_mask_inplace(series, null_probability=0.5, seed=1):
     """Randomly nullify elements in series with the provided probability."""
     probs = [null_probability, 1 - null_probability]
     rng = np.random.default_rng(seed=seed)
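For context, the seeding pattern this helper relies on, reduced to a standalone sketch (simplified; not the actual cudf implementation):

```python
import numpy as np


def random_null_mask(n, null_probability=0.5, seed=1):
    # With a fixed seed, default_rng is reproducible: every call returns the
    # same mask. With seed=None it would draw fresh OS entropy each time.
    rng = np.random.default_rng(seed=seed)
    return rng.choice(
        [True, False], size=n, p=[null_probability, 1 - null_probability]
    )


assert (random_null_mask(8) == random_null_mask(8)).all()
```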
python/cudf/cudf/testing/dataset_generator.py (3 additions & 5 deletions)
@@ -71,15 +71,15 @@ class Parameters:
         Number of rows to generate
     column_parameters : List[ColumnParams]
         ColumnParams for each column
-    seed : int or None, default None
+    seed : int or None, default 1
         Seed for random data generation
     """

     def __init__(
         self,
         num_rows=2048,
         column_parameters=None,
-        seed=None,
+        seed=1,
     ):
         self.num_rows = num_rows
         if column_parameters is None:
@@ -312,9 +312,7 @@ def get_dataframe(parameters, use_threads):
     return tbl


-def rand_dataframe(
-    dtypes_meta, rows, seed=random.randint(0, 2**32 - 1), use_threads=True
-):
+def rand_dataframe(dtypes_meta, rows, seed=1, use_threads=True):
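The old default is worth a note: `seed=random.randint(0, 2**32 - 1)` is evaluated once, when the function is defined, not on each call, so every call that omitted `seed` silently shared one process-wide seed anyway. A minimal sketch of that pitfall:

```python
import random


def old_style(seed=random.randint(0, 2**32 - 1)):
    # The default was fixed at definition time, not drawn per call.
    return seed


assert old_style() == old_style()  # the "random" default never changes
```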
"""
Generates a random table.

Expand Down
python/cudf/cudf/tests/test_array_ufunc.py (6 additions & 6 deletions)
@@ -181,8 +181,8 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed):
     # Converting nullable integer cudf.Series to pandas will produce a
     # float pd.Series, so instead we replace nulls with an arbitrary
     # integer value, precompute the mask, and then reapply it afterwards.
-    for arg in args:
-        set_random_null_mask_inplace(arg)
+    for idx, arg in enumerate(args):
+        set_random_null_mask_inplace(arg, seed=idx)
Contributor (author) comment:
seed=idx ensures different null masks for different columns.
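A small sketch of the intent (using plain numpy Generator semantics rather than the cudf helper):

```python
import numpy as np


def mask(seed, n=6, p=0.5):
    # Deterministic for a given seed; different seeds give different masks.
    return np.random.default_rng(seed).random(n) < p


assert (mask(0) == mask(0)).all()  # reproducible across runs
# mask(0) and mask(1) generally differ, so each column gets its own mask.
```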

     pandas_args = [arg.fillna(0) for arg in args]

     # Note: Different indexes must be aligned before the mask is computed.
@@ -261,8 +261,8 @@ def test_binary_ufunc_series_array(
     # Converting nullable integer cudf.Series to pandas will produce a
     # float pd.Series, so instead we replace nulls with an arbitrary
     # integer value, precompute the mask, and then reapply it afterwards.
-    for arg in args:
-        set_random_null_mask_inplace(arg)
+    for idx, arg in enumerate(args):
+        set_random_null_mask_inplace(arg, seed=idx)

     # Cupy doesn't support nulls, so we fill with nans before converting.
     args[1] = args[1].fillna(cp.nan)
@@ -403,8 +403,8 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed):
     # Converting nullable integer cudf.Series to pandas will produce a
     # float pd.Series, so instead we replace nulls with an arbitrary
     # integer value, precompute the mask, and then reapply it afterwards.
-    for arg in args:
-        set_random_null_mask_inplace(arg["foo"])
+    for idx, arg in enumerate(args):
+        set_random_null_mask_inplace(arg["foo"], seed=idx)
     pandas_args = [arg.copy() for arg in args]
     for arg in pandas_args:
         arg["foo"] = arg["foo"].fillna(0)
python/cudf/cudf/tests/test_datasets.py (3 additions & 4 deletions)
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.

 import numpy as np

@@ -8,10 +8,10 @@

 def test_dataset_timeseries():
     gdf1 = gd.datasets.timeseries(
-        dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1
+        dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3
     )
     gdf2 = gd.datasets.timeseries(
-        dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1
+        dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3
     )

     assert_eq(gdf1, gdf2)
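With the explicit seed=1 arguments dropped, this assertion now leans on the new default seed inside timeseries. The principle it depends on, as a one-line numpy sketch:

```python
import numpy as np

# Two generators built from the same fixed seed produce identical streams,
# which is what assert_eq(gdf1, gdf2) relies on above.
assert (
    np.random.default_rng(1).random(5) == np.random.default_rng(1).random(5)
).all()
```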
@@ -26,7 +26,7 @@ def test_dataset_timeseries():
         freq="2H",
         dtypes={"value": float, "name": "category", "id": int},
         nulls_frequency=0.7,
-        seed=1,
     )

     assert gdf["value"].head().dtype == float

python/cudf/cudf/tests/test_groupby.py (0 additions & 3 deletions)
@@ -2550,7 +2550,6 @@ def test_groupby_fillna_multi_value(nelem):
         ],
         rows=nelem,
         use_threads=False,
-        seed=0,
     )
     key_col = "0"
     value_cols = ["1", "2", "3", "4", "5", "6"]
@@ -2595,7 +2594,6 @@ def test_groupby_fillna_multi_value_df(nelem):
         ],
         rows=nelem,
         use_threads=False,
-        seed=0,
     )
     key_col = "0"
     value_cols = ["1", "2", "3", "4", "5"]
@@ -2671,7 +2669,6 @@ def test_groupby_fillna_method(nelem, method):
         ],
         rows=nelem,
         use_threads=False,
-        seed=0,
     )
     key_col = "0"
     value_cols = ["1", "2", "3", "4", "5", "6", "7", "8"]

python/cudf/cudf/tests/test_orc.py (3 additions & 3 deletions)
@@ -468,7 +468,7 @@ def test_chunked_orc_writer(
 def test_orc_writer_strings(tmpdir, dtypes):
     gdf_fname = tmpdir.join("gdf_strings.orc")

-    expect = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
+    expect = cudf.datasets.randomdata(nrows=10, dtypes=dtypes)
     expect.to_orc(gdf_fname)
     got = pd.read_orc(gdf_fname)

@@ -487,7 +487,7 @@ def test_chunked_orc_writer_strings(tmpdir, dtypes):
 def test_chunked_orc_writer_strings(tmpdir, dtypes):
     gdf_fname = tmpdir.join("chunked_gdf_strings.orc")

-    gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
+    gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes)
     pdf = gdf.to_pandas()
     expect = pd.concat([pdf, pdf]).reset_index(drop=True)
     writer = ORCWriter(gdf_fname)
@@ -1661,7 +1661,7 @@ def test_writer_protobuf_large_rowindexentry():
 @pytest.mark.parametrize("compression", ["ZLIB", "ZSTD"])
 def test_orc_writer_nvcomp(compression):
     expected = cudf.datasets.randomdata(
-        nrows=12345, dtypes={"a": int, "b": str, "c": float}, seed=1
+        nrows=12345, dtypes={"a": int, "b": str, "c": float}
     )

     buff = BytesIO()

python/cudf/cudf/tests/test_parquet.py (0 additions & 2 deletions)
@@ -1316,7 +1316,6 @@ def test_delta_binary(nrows, add_nulls, tmpdir):
             },
         ],
         rows=nrows,
-        seed=0,
         use_threads=False,
     )
     # Roundabout conversion to pandas to preserve nulls/data types
@@ -1469,7 +1468,6 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf):


 def test_multifile_parquet_folder(tmpdir):
-
     test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2)
     test_pdf2 = make_pdf(nrows=20)
     expect = pd.concat([test_pdf1, test_pdf2])

python/dask_cudf/dask_cudf/io/tests/test_orc.py (2 additions & 4 deletions)
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.

 import glob
 import os
@@ -85,7 +85,6 @@ def test_read_orc_filtered(tmpdir, engine, predicate, expected_len):


 def test_read_orc_first_file_empty(tmpdir):
-
     # Write a 3-file dataset where the first file is empty
     # See: https://github.com/rapidsai/cudf/issues/8011
     path = str(tmpdir)
@@ -112,9 +111,8 @@ def test_read_orc_first_file_empty(tmpdir):
     ],
 )
 def test_to_orc(tmpdir, dtypes, compression, compute):
-
     # Create cudf and dask_cudf dataframes
-    df = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1)
+    df = cudf.datasets.randomdata(nrows=10, dtypes=dtypes)
     df = df.set_index("index").sort_index()
     ddf = dask_cudf.from_cudf(df, npartitions=3)