diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx index e8ab51e4dd..3d6be3abf2 100644 --- a/python/cuml/cuml/cluster/kmeans.pyx +++ b/python/cuml/cuml/cluster/kmeans.pyx @@ -20,6 +20,7 @@ from cuml.internals.safe_imports import cpu_only_import np = cpu_only_import('numpy') from cuml.internals.safe_imports import gpu_only_import rmm = gpu_only_import('rmm') +from cuml.internals.safe_imports import safe_import_from, return_false import typing IF GPUBUILD == 1: @@ -46,7 +47,10 @@ from cuml.common import input_to_cuml_array from cuml.internals.api_decorators import device_interop_preparation from cuml.internals.api_decorators import enable_device_interop -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +# from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +_openmp_effective_n_threads = safe_import_from( + "sklearn.utils._openmp_helpers", "_openmp_effective_n_threads", alt=return_false +) class KMeans(UniversalBase, @@ -235,7 +239,10 @@ class KMeans(UniversalBase, self.cluster_centers_ = None # For sklearn interoperability - self._n_threads = _openmp_effective_n_threads() + if _openmp_effective_n_threads(): + self._n_threads = _openmp_effective_n_threads() + else: + self._n_threads = 1 # cuPy does not allow comparing with string. See issue #2372 init_str = init if isinstance(init, str) else None diff --git a/python/cuml/cuml/internals/array.py b/python/cuml/cuml/internals/array.py index e61d84ab83..c30d609563 100644 --- a/python/cuml/cuml/internals/array.py +++ b/python/cuml/cuml/internals/array.py @@ -1251,13 +1251,14 @@ def array_to_memory_order(arr, default="C"): return arr.order except AttributeError: pass - try: - array_interface = arr.__cuda_array_interface__ - except AttributeError: - try: - array_interface = arr.__array_interface__ - except AttributeError: - return array_to_memory_order(CumlArray.from_input(arr, order="K")) + array_interface = getattr( + arr, + "__cuda_array_interface__", + getattr(arr, "__array_interface__", False), + ) + if not array_interface: + return array_to_memory_order(CumlArray.from_input(arr, order="K")) + strides = array_interface.get("strides", None) if strides is None: try: diff --git a/python/cuml/cuml/model_selection/_split.py b/python/cuml/cuml/model_selection/_split.py index 0727f82c82..227f0eb297 100644 --- a/python/cuml/cuml/model_selection/_split.py +++ b/python/cuml/cuml/model_selection/_split.py @@ -265,8 +265,18 @@ def train_test_split( string" ) - x_order = array_to_memory_order(X) - X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order) + all_numeric = True + if isinstance(X, cudf.DataFrame): + all_numeric = all( + cudf.api.types.is_numeric_dtype(X[col]) for col in X.columns + ) + + if all_numeric: + x_order = array_to_memory_order(X) + X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order) + else: + x_order = "F" + X_arr, X_row = X, X.shape[0] if y is not None: y_order = array_to_memory_order(y) y_arr, y_row, *_ = input_to_cuml_array(y, order=y_order) @@ -363,55 +373,53 @@ def train_test_split( train_indices = range(0, train_size) test_indices = range(-1 * test_size, 0) - # Gather from indices - X_train = X_arr[train_indices] - X_test = X_arr[test_indices] - if y is not None: - y_train = y_arr[train_indices] - y_test = y_arr[test_indices] - - # Coerce output to original input type - if ty := determine_df_obj_type(X): - x_type = ty - else: - x_type = determine_array_type(X) - - if ty := determine_df_obj_type(y): - y_type = ty - else: - y_type = determine_array_type(y) - - if x_type in ("series", "dataframe"): - X_train = output_to_df_obj_like(X_train, X, x_type) - X_test = output_to_df_obj_like(X_test, X, x_type) - - if determine_array_type(X.index) == "pandas": - if isinstance(train_indices, cp.ndarray): - train_indices = train_indices.get() - if isinstance(test_indices, cp.ndarray): - test_indices = test_indices.get() + if all_numeric: + # Gather from indices + X_train = X_arr[train_indices] + X_test = X_arr[test_indices] + if y is not None: + y_train = y_arr[train_indices] + y_test = y_arr[test_indices] + + # Coerce output to original input type + x_type = determine_df_obj_type(X) or determine_array_type(X) + if y is not None: + y_type = determine_df_obj_type(y) or determine_array_type(y) + + def _process_df_objs( + df, df_type, df_train, df_test, train_indices, test_indices + ): + if df_type in {"series", "dataframe"}: + df_train = output_to_df_obj_like(df_train, df, df_type) + df_test = output_to_df_obj_like(df_test, df, df_type) + + if determine_array_type(df.index) == "pandas": + if isinstance(train_indices, cp.ndarray): + train_indices = train_indices.get() + if isinstance(test_indices, cp.ndarray): + test_indices = test_indices.get() + + df_train.index = df.index[train_indices] + df_test.index = df.index[test_indices] + else: + df_train = df_train.to_output(df_type) + df_test = df_test.to_output(df_type) + return df_train, df_test + + X_train, X_test = _process_df_objs( + X, x_type, X_train, X_test, train_indices, test_indices + ) + if y is not None: + y_train, y_test = _process_df_objs( + y, y_type, y_train, y_test, train_indices, test_indices + ) - X_train.index = X.index[train_indices] - X_test.index = X.index[test_indices] else: - X_train = X_train.to_output(x_type) - X_test = X_test.to_output(x_type) - - if y_type in ("series", "dataframe"): - y_train = output_to_df_obj_like(y_train, y, y_type) - y_test = output_to_df_obj_like(y_test, y, y_type) - - if determine_array_type(y.index) == "pandas": - if isinstance(train_indices, cp.ndarray): - train_indices = train_indices.get() - if isinstance(test_indices, cp.ndarray): - test_indices = test_indices.get() - - y_train.index = y.index[train_indices] - y_test.index = y.index[test_indices] - elif y_type is not None: - y_train = y_train.to_output(y_type) - y_test = y_test.to_output(y_type) + X_train = X_arr.iloc[train_indices] + X_test = X_arr.iloc[test_indices] + if y is not None: + y_train = y_arr[train_indices] + y_test = y_arr[test_indices] if y is not None: return X_train, X_test, y_train, y_test