From 21f9882762807dbdd73f8fa5c379dceeaaa7ced0 Mon Sep 17 00:00:00 2001
From: Ray Douglass <3107146+raydouglass@users.noreply.github.com>
Date: Thu, 26 Sep 2024 14:42:42 -0400
Subject: [PATCH 1/6] Add license file to conda packages (#6061)

Just adds the license file itself to the conda packages.

Authors:
  - Ray Douglass (https://github.com/raydouglass)
  - Dante Gama Dessavre (https://github.com/dantegd)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuml/pull/6061
---
 conda/recipes/cuml-cpu/meta.yaml | 2 +-
 conda/recipes/cuml/meta.yaml     | 2 +-
 conda/recipes/libcuml/meta.yaml  | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/conda/recipes/cuml-cpu/meta.yaml b/conda/recipes/cuml-cpu/meta.yaml
index 97e5cdd813..a4c6950171 100644
--- a/conda/recipes/cuml-cpu/meta.yaml
+++ b/conda/recipes/cuml-cpu/meta.yaml
@@ -45,5 +45,5 @@ tests:  # [linux64]
 about:
   home: https://rapids.ai/
   license: Apache-2.0
-  # license_file: LICENSE
+  license_file: LICENSE
   summary: cuML-CPU library
diff --git a/conda/recipes/cuml/meta.yaml b/conda/recipes/cuml/meta.yaml
index 74ba26ea14..94e26003bc 100644
--- a/conda/recipes/cuml/meta.yaml
+++ b/conda/recipes/cuml/meta.yaml
@@ -101,5 +101,5 @@ tests:
 about:
   home: https://rapids.ai/
   license: Apache-2.0
-  # license_file: LICENSE
+  license_file: LICENSE
   summary: cuML library
diff --git a/conda/recipes/libcuml/meta.yaml b/conda/recipes/libcuml/meta.yaml
index 0737da6969..ea1b935f01 100644
--- a/conda/recipes/libcuml/meta.yaml
+++ b/conda/recipes/libcuml/meta.yaml
@@ -122,6 +122,7 @@ outputs:
     about:
       home: https://rapids.ai/
       license: Apache-2.0
+      license_file: LICENSE
       summary: libcuml library
   - name: libcuml-tests
     version: {{ version }}
@@ -155,4 +156,5 @@ outputs:
     about:
       home: https://rapids.ai/
       license: Apache-2.0
+      license_file: LICENSE
       summary: libcuml test & benchmark executables

From 9284ca34ec5b8192fa41f57ac98ce318f5fe51d6 Mon Sep 17 00:00:00 2001
From: Jinfeng Li
Date: Tue, 1 Oct 2024 07:02:26 -0700
Subject: [PATCH 2/6] Support all-zeroes feature vectors for MG sparse
 logistic regression (#6082)

Authors:
  - Jinfeng Li (https://github.com/lijinf2)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/6082
---
 cpp/src/glm/qn/mg/standardization.cuh         | 10 +++++
 .../dask/test_dask_logistic_regression.py     | 41 +++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/cpp/src/glm/qn/mg/standardization.cuh b/cpp/src/glm/qn/mg/standardization.cuh
index 1b623a188f..a64f477848 100644
--- a/cpp/src/glm/qn/mg/standardization.cuh
+++ b/cpp/src/glm/qn/mg/standardization.cuh
@@ -162,6 +162,16 @@ void mean_stddev(const raft::handle_t& handle,
 {
   auto stream = handle.get_stream();
   int D = X.n;
+
+  if (X.nnz == 0) {
+    SimpleVec<T> meanVec(mean_vector, D);
+    meanVec.fill(0., stream);
+
+    SimpleVec<T> stddevVec(stddev_vector, D);
+    stddevVec.fill(0., stream);
+    return;
+  }
+
   mean(handle, X, n_samples, mean_vector);
 
   // calculate stdev.S
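With `nnz == 0` there are no stored values to reduce over, so every column mean and standard deviation is zero by definition; the new branch writes those zeros directly instead of running the usual reductions. A host-side sketch of the same statistics in SciPy — illustration only, not the cuML CUDA path — mirroring the 4x2 all-zero CSR matrix the new test below uses:

```python
# Illustration only (SciPy on the host, not the cuML CUDA path).
import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.zeros((4, 2), dtype=np.float32))
assert X.nnz == 0  # no stored values at all

# With nothing stored, the column means and stddevs are exactly zero --
# precisely what the guard above fills in before returning early.
mean = np.asarray(X.mean(axis=0)).ravel()
stddev = np.zeros(X.shape[1], dtype=X.dtype)
print(mean, stddev)  # [0. 0.] [0. 0.]
```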
diff --git a/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py b/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py
index dc6b63428d..2d07329b4f 100644
--- a/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py
+++ b/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py
@@ -1119,3 +1119,44 @@ def make_classification_with_nnz(
     )
 
     assert lr_on.dtype == datatype
+
+
+@pytest.mark.parametrize("standardization", [False, True])
+@pytest.mark.parametrize("fit_intercept", [False, True])
+def test_sparse_all_zeroes(standardization, fit_intercept, client):
+    n_parts = 2
+    datatype = "float32"
+
+    X = np.array([(0, 0), (0, 0), (0, 0), (0, 0)], datatype)
+    y = np.array([1.0, 1.0, 0.0, 0.0], datatype)
+    X = csr_matrix(X)
+    X_da_csr, y_da = _prep_training_data_sparse(client, X, y, n_parts)
+
+    from cuml.dask.linear_model import LogisticRegression as cumlLBFGS_dask
+
+    mg = cumlLBFGS_dask(
+        fit_intercept=fit_intercept,
+        verbose=True,
+        standardization=standardization,
+    )
+    mg.fit(X_da_csr, y_da)
+    mg_preds = mg.predict(X_da_csr).compute()
+
+    from sklearn.linear_model import LogisticRegression
+
+    cpu_lr = LogisticRegression(fit_intercept=fit_intercept)
+    cpu_lr.fit(X, y)
+    cpu_preds = cpu_lr.predict(X)
+
+    assert array_equal(mg_preds, cpu_preds)
+
+    assert array_equal(
+        mg.coef_,
+        cpu_lr.coef_,
+        with_sign=True,
+    )
+    assert array_equal(
+        mg.intercept_,
+        cpu_lr.intercept_,
+        with_sign=True,
+    )

From 8ed7bda0b21aeb0b8b20d0b785911c3fa8e74cb8 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre
Date: Thu, 3 Oct 2024 09:48:21 -0500
Subject: [PATCH 3/6] Remove old dask-glm based logistic regression (#6028)

closes #6025

Authors:
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Bradley Dice (https://github.com/bdice)
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/cuml/pull/6028
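For anyone migrating off the removed wrapper: the supported multi-node multi-GPU implementation lives in `cuml.dask.linear_model`, the same class the test added in the previous patch exercises. A minimal sketch, assuming a CUDA-capable machine with `dask-cuda` and `dask_cudf` installed; the toy data is illustrative only:

```python
# Hedged sketch of the supported MNMG path; requires a GPU plus dask-cuda.
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

import cudf
import dask_cudf
from cuml.dask.linear_model import LogisticRegression

if __name__ == "__main__":
    client = Client(LocalCUDACluster())

    X = cudf.DataFrame({"f0": [0.0, 1.0, 2.0, 3.0], "f1": [1.0, 0.0, 1.0, 0.0]})
    y = cudf.Series([0.0, 0.0, 1.0, 1.0])
    X_df = dask_cudf.from_cudf(X, npartitions=2)
    y_df = dask_cudf.from_cudf(y, npartitions=2)

    model = LogisticRegression(fit_intercept=True, max_iter=100)
    model.fit(X_df, y_df)
    print(model.predict(X_df).compute())
```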
---
 .../all_cuda-118_arch-x86_64.yaml                  |   3 -
 .../all_cuda-125_arch-x86_64.yaml                  |   3 -
 dependencies.yaml                                  |   8 -
 python/cuml/cuml/dask/extended/__init__.py         |   0
 .../dask/extended/linear_model/__init__.py         |  27 ---
 .../linear_model/logistic_regression.py            | 219 ------------------
 .../dask/test_dask_logistic_regression.py          |  86 -------
 python/cuml/pyproject.toml                         |   1 -
 8 files changed, 347 deletions(-)
 delete mode 100644 python/cuml/cuml/dask/extended/__init__.py
 delete mode 100644 python/cuml/cuml/dask/extended/linear_model/__init__.py
 delete mode 100644 python/cuml/cuml/dask/extended/linear_model/logistic_regression.py

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 0540f469d8..406789fffc 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -50,7 +50,6 @@ dependencies:
 - numpydoc
 - nvcc_linux-64=11.8
 - packaging
-- pip
 - pydata-sphinx-theme!=0.14.2
 - pylibraft==24.10.*,>=0.0.0a0
 - pynndescent
@@ -78,6 +77,4 @@ dependencies:
 - sysroot_linux-64==2.17
 - treelite==4.3.0
 - umap-learn==0.5.6
-- pip:
-  - dask-glm==0.3.0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index ad8d12f1a3..28c9197192 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -46,7 +46,6 @@ dependencies:
 - numpy>=1.23,<3.0a0
 - numpydoc
 - packaging
-- pip
 - pydata-sphinx-theme!=0.14.2
 - pylibraft==24.10.*,>=0.0.0a0
 - pynndescent
@@ -74,6 +73,4 @@ dependencies:
 - sysroot_linux-64==2.17
 - treelite==4.3.0
 - umap-learn==0.5.6
-- pip:
-  - dask-glm==0.3.0
 name: all_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index d176e382ad..687c0bd9aa 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -518,14 +518,6 @@ dependencies:
           - umap-learn==0.5.6
           - pynndescent
          - setuptools # Needed on Python 3.12 for dask-glm, which requires pkg_resources but Python 3.12 doesn't have setuptools by default
-      - output_types: conda
-        packages:
-          - pip
-          - pip:
-              - dask-glm==0.3.0
-      - output_types: pyproject
-        packages:
-          - dask-glm==0.3.0
   test_notebooks:
     common:
       - output_types: [conda, requirements]
diff --git a/python/cuml/cuml/dask/extended/__init__.py b/python/cuml/cuml/dask/extended/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/python/cuml/cuml/dask/extended/linear_model/__init__.py b/python/cuml/cuml/dask/extended/linear_model/__init__.py
deleted file mode 100644
index 8f8cba28a1..0000000000
--- a/python/cuml/cuml/dask/extended/linear_model/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from cuml.internals.import_utils import has_daskglm
-import warnings
-
-if has_daskglm():
-    from cuml.dask.extended.linear_model.logistic_regression import (
-        LogisticRegression,
-    )
-else:
-    warnings.warn(
-        "Dask-glm not found. Multi-GPU logistic regression is disabled."
-    )
diff --git a/python/cuml/cuml/dask/extended/linear_model/logistic_regression.py b/python/cuml/cuml/dask/extended/linear_model/logistic_regression.py
deleted file mode 100644
index cbaccdc193..0000000000
--- a/python/cuml/cuml/dask/extended/linear_model/logistic_regression.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from dask.utils import is_dataframe_like, is_series_like, is_arraylike
-from cuml.internals.safe_imports import cpu_only_import
-from cuml.dask.common.base import BaseEstimator
-from cuml.common import with_cupy_rmm
-from cuml.internals.import_utils import has_daskglm
-
-from cuml.internals.safe_imports import gpu_only_import
-
-cp = gpu_only_import("cupy")
-np = cpu_only_import("numpy")
-cudf = gpu_only_import("cudf")
-
-
-class LogisticRegression(BaseEstimator):
-    """
-    Distributed Logistic Regression for Binary classification.
-
-
-    Parameters
-    ----------
-    fit_intercept: boolean (default = True)
-        If True, the model tries to correct for the global mean of y.
-        If False, the model expects that you have centered the data.
-    solver : 'admm'
-        Solver to use. Only admm is supported currently.
-    penalty : {'l1', 'l2', 'elastic_net'} (default = 'l2')
-        Regularization technique for the solver.
-    C: float (default = 1.0)
-        Inverse of regularization strength; must be a positive float.
-    max_iter: int (default = 100)
-        Maximum number of iterations taken for the solvers to converge.
-    verbose : int or boolean (default=False)
-        Sets logging level. It must be one of `cuml.common.logger.level_*`.
-        See :ref:`verbosity-levels` for more info.
-
-    Attributes
-    ----------
-    coef_: device array (n_features, 1)
-        The estimated coefficients for the logistic regression model.
-    intercept_: device array (1,)
-        The independent term. If `fit_intercept` is False, will be 0.
-    solver: string
-        Algorithm to use in the optimization process. Currently only `admm` is
-        supported.
-
-    Notes
-    ------
-
-    This estimator is a wrapper class around Dask-GLM's
-    Logistic Regression estimator. Several methods in this wrapper class
-    duplicate code from Dask-GLM to create a user-friendly namespace.
-    """
-
-    def __init__(
-        self,
-        *,
-        client=None,
-        fit_intercept=True,
-        solver="admm",
-        penalty="l2",
-        C=1.0,
-        max_iter=100,
-        verbose=False,
-        **kwargs,
-    ):
-        super(LogisticRegression, self).__init__(
-            client=client, verbose=verbose, **kwargs
-        )
-
-        if not has_daskglm("0.2.1.dev"):
-            raise ImportError(
-                "dask-glm >= 0.2.1.dev was not found, please install it"
-                " to use multi-GPU logistic regression."
-            )
-
-        from dask_glm.estimators import (
-            LogisticRegression as LogisticRegressionGLM,
-        )
-
-        self.fit_intercept = fit_intercept
-        self.solver = solver
-        self.penalty = penalty
-        self.C = C
-        self.max_iter = max_iter
-
-        if self.penalty not in ("l2", "l1", "elastic_net"):
-            raise TypeError(
-                "Only l2, l1, and elastic_net penalties are"
-                " currently supported."
-            )
-
-        self.solver_model = LogisticRegressionGLM(
-            solver=self.solver,
-            fit_intercept=self.fit_intercept,
-            regularizer=self.penalty,
-            max_iter=self.max_iter,
-            lamduh=1 / self.C,
-        )
-
-    @with_cupy_rmm
-    def fit(self, X, y):
-        """
-        Fit the model with X and y.
-
-        Parameters
-        ----------
-        X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)
-            Features for regression
-        y : Dask cuDF Series or CuPy backed Dask Array (n_rows,)
-            Label (outcome values)
-        """
-
-        X = self._input_to_dask_cupy_array(X)
-        y = self._input_to_dask_cupy_array(y)
-        self.solver_model.fit(X, y)
-        self._finalize_coefs()
-        return self
-
-    @with_cupy_rmm
-    def predict(self, X):
-        """
-        Predicts the ŷ for X.
-
-        Parameters
-        ----------
-        X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)
-            Distributed dense matrix (floats or doubles) of shape
-            (n_samples, n_features).
-
-        Returns
-        -------
-        y : Dask cuDF Series or CuPy backed Dask Array (n_rows,)
-        """
-        return self.predict_proba(X) > 0.5
-
-    @with_cupy_rmm
-    def predict_proba(self, X):
-        from dask_glm.utils import sigmoid
-
-        X = self._input_to_dask_cupy_array(X)
-        return sigmoid(self.decision_function(X))
-
-    @with_cupy_rmm
-    def decision_function(self, X):
-        X = self._input_to_dask_cupy_array(X)
-        X_ = self._maybe_add_intercept(X)
-        return np.dot(X_, self._coef)
-
-    @with_cupy_rmm
-    def score(self, X, y):
-        from dask_glm.utils import accuracy_score
-
-        X = self._input_to_dask_cupy_array(X)
-        y = self._input_to_dask_cupy_array(y)
-        return accuracy_score(y, self.predict(X))
-
-    @with_cupy_rmm
-    def _finalize_coefs(self):
-        # _coef contains coefficients and (potentially) intercept
-        self._coef = cp.asarray(self.solver_model._coef)
-        if self.fit_intercept:
-            self.coef_ = self._coef[:-1]
-            self.intercept_ = self.solver_model._coef[-1]
-        else:
-            self.coef_ = self._coef
-
-    @with_cupy_rmm
-    def _maybe_add_intercept(self, X):
-        from dask_glm.utils import add_intercept
-
-        if self.fit_intercept:
-            return add_intercept(X)
-        else:
-            return X
-
-    @with_cupy_rmm
-    def _input_to_dask_cupy_array(self, X):
-        if (is_dataframe_like(X) or is_series_like(X)) and hasattr(X, "dask"):
-
-            if not isinstance(X._meta, (cudf.Series, cudf.DataFrame)):
-                raise TypeError(
-                    "Please convert your Dask DataFrame"
-                    " to a Dask-cuDF DataFrame using dask_cudf."
-                )
-            X = X.values
-            X._meta = cp.asarray(X._meta)
-
-        elif is_arraylike(X) and hasattr(X, "dask"):
-            if not isinstance(X._meta, cp.ndarray):
-                raise TypeError(
-                    "Please convert your CPU Dask Array"
-                    " to a GPU Dask Array using"
-                    " arr.map_blocks(cp.asarray)."
-                )
-        else:
-            raise TypeError(
-                "Please pass a GPU backed Dask DataFrame" " or Dask Array."
-            )
-
-        X.compute_chunk_sizes()
-        return X
-
-    def get_param_names(self):
-        return list(self.kwargs.keys())
diff --git a/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py b/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py
index 2d07329b4f..f208d5a330 100644
--- a/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py
+++ b/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py
@@ -103,92 +103,6 @@ def make_classification_dataset(
     return X, y
 
 
-def select_sk_solver(cuml_solver):
-    if cuml_solver == "newton":
-        return "newton-cg"
-    elif cuml_solver in ["admm", "lbfgs"]:
-        return "lbfgs"
-    else:
-        pytest.xfail("No matched sklearn solver")
-
-
-@pytest.mark.mg
-@pytest.mark.parametrize("nrows", [1e5])
-@pytest.mark.parametrize("ncols", [20])
-@pytest.mark.parametrize("n_parts", [2, 6])
-@pytest.mark.parametrize("fit_intercept", [False, True])
-@pytest.mark.parametrize("datatype", [np.float32, np.float64])
-@pytest.mark.parametrize("gpu_array_input", [False, True])
-@pytest.mark.parametrize(
-    "solver", ["admm", "gradient_descent", "newton", "lbfgs", "proximal_grad"]
-)
-def test_lr_fit_predict_score(
-    nrows,
-    ncols,
-    n_parts,
-    fit_intercept,
-    datatype,
-    gpu_array_input,
-    solver,
-    client,
-):
-    sk_solver = select_sk_solver(cuml_solver=solver)
-
-    def imp():
-        import cuml.comm.serialize  # NOQA
-
-    client.run(imp)
-
-    from cuml.dask.extended.linear_model import (
-        LogisticRegression as cumlLR_dask,
-    )
-
-    n_info = 5
-    nrows = int(nrows)
-    ncols = int(ncols)
-    X, y = make_classification_dataset(datatype, nrows, ncols, n_info)
-
-    gX, gy = _prep_training_data(client, X, y, n_parts)
-
-    if gpu_array_input:
-        gX = gX.values
-        gX._meta = cp.asarray(gX._meta)
-        gy = gy.values
-        gy._meta = cp.asarray(gy._meta)
-
-    cuml_model = cumlLR_dask(
-        fit_intercept=fit_intercept, solver=solver, max_iter=10
-    )
-
-    # test fit and predict
-    cuml_model.fit(gX, gy)
-    cu_preds = cuml_model.predict(gX)
-    accuracy_cuml = accuracy_score(y, cu_preds.compute().get())
-
-    sk_model = skLR(fit_intercept=fit_intercept, solver=sk_solver, max_iter=10)
-    sk_model.fit(X, y)
-    sk_preds = sk_model.predict(X)
-    accuracy_sk = accuracy_score(y, sk_preds)
-
-    assert (accuracy_cuml >= accuracy_sk) | (
-        np.abs(accuracy_cuml - accuracy_sk) < 1e-3
-    )
-
-    # score
-    accuracy_cuml = cuml_model.score(gX, gy).compute().item()
-    accuracy_sk = sk_model.score(X, y)
-
-    assert (accuracy_cuml >= accuracy_sk) | (
-        np.abs(accuracy_cuml - accuracy_sk) < 1e-3
-    )
-
-    # predicted probabilities should differ by <= 5%
-    # even with different solvers (arbitrary)
-    probs_cuml = cuml_model.predict_proba(gX).compute()
-    probs_sk = sk_model.predict_proba(X)[:, 1]
-    assert np.abs(probs_sk - probs_cuml.get()).max() <= 0.05
-
-
 @pytest.mark.mg
 @pytest.mark.parametrize("n_parts", [2])
 @pytest.mark.parametrize("datatype", [np.float32, np.float64])
diff --git a/python/cuml/pyproject.toml b/python/cuml/pyproject.toml
index 8934a0f226..228cb92b5c 100644
--- a/python/cuml/pyproject.toml
+++ b/python/cuml/pyproject.toml
@@ -111,7 +111,6 @@ classifiers = [
 [project.optional-dependencies]
 test = [
     "cython>=3.0.0",
-    "dask-glm==0.3.0",
     "dask-ml",
     "hdbscan>=0.8.38,<0.8.39",
     "hypothesis>=6.0,<7",

From 4df6a48a753bc91744265d8cb689b3ea8a042083 Mon Sep 17 00:00:00 2001
From: Kyle Edwards
Date: Thu, 3 Oct 2024 12:59:16 -0400
Subject: [PATCH 4/6] Prune workflows based on changed files (#6094)

Contributes to https://github.com/rapidsai/build-planning/issues/94

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuml/pull/6094
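As context for the `files_yaml` groups added in the diff below: each group starts from `'**'` (match everything) and then carves out paths that should not trigger that test suite. A toy Python model of the intent — not the glob matcher the changed-files workflow actually uses:

```python
# Toy model of the test_cpp group's intent; the real action uses glob patterns.
EXCLUDED_PREFIXES = (".devcontainer/", "docs/", "img/", "notebooks/",
                     "python/", "thirdparty/LICENSES/")
EXCLUDED_FILES = {".pre-commit-config.yaml", "CONTRIBUTING.md", "README.md"}

def triggers_cpp_tests(path: str) -> bool:
    # '**' opts everything in; the '!' patterns opt these paths back out.
    return path not in EXCLUDED_FILES and not path.startswith(EXCLUDED_PREFIXES)

assert triggers_cpp_tests("cpp/src/glm/qn/mg/standardization.cuh")
assert not triggers_cpp_tests("python/cuml/cuml/internals/array.py")
assert not triggers_cpp_tests("README.md")
```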
---
 .github/workflows/pr.yaml | 61 +++++++++++++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 84aecb0df0..844d611804 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -12,6 +12,7 @@ concurrency:
 jobs:
   pr-builder:
     needs:
+      - changed-files
       - checks
       - clang-tidy
       - conda-cpp-build
       - conda-cpp-tests
       - conda-cpp-checks
       - conda-python-build
       - conda-python-tests-singlegpu
       - conda-python-tests-dask
       - conda-notebook-tests
       - docs-build
       - wheel-build-cuml
       - wheel-tests-cuml
       - devcontainer
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
+    if: always()
+    with:
+      needs: ${{ toJSON(needs) }}
+  changed-files:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12
+    with:
+      files_yaml: |
+        test_cpp:
+          - '**'
+          - '!.devcontainer/**'
+          - '!.pre-commit-config.yaml'
+          - '!CONTRIBUTING.md'
+          - '!README.md'
+          - '!docs/**'
+          - '!img/**'
+          - '!notebooks/**'
+          - '!python/**'
+          - '!thirdparty/LICENSES/**'
+          # TODO: Remove before merging
+          - '!.github/workflows/**'
+        test_notebooks:
+          - '**'
+          - '!.devcontainer/**'
+          - '!.pre-commit-config.yaml'
+          - '!CONTRIBUTING.md'
+          - '!README.md'
+          - '!thirdparty/LICENSES/**'
+          # TODO: Remove before merging
+          - '!.github/workflows/**'
+        test_python:
+          - '**'
+          - '!.devcontainer/**'
+          - '!.pre-commit-config.yaml'
+          - '!CONTRIBUTING.md'
+          - '!README.md'
+          - '!docs/**'
+          - '!img/**'
+          - '!notebooks/**'
+          - '!thirdparty/LICENSES/**'
+          # TODO: Remove before merging
+          - '!.github/workflows/**'
   checks:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
@@ -51,9 +94,10 @@ jobs:
     with:
       build_type: pull-request
   conda-cpp-tests:
-    needs: conda-cpp-build
+    needs: [conda-cpp-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
   conda-cpp-checks:
@@ -71,31 +115,35 @@ jobs:
     with:
       build_type: pull-request
   conda-python-tests-singlegpu:
-    needs: conda-python-build
+    needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: "ci/test_python_singlegpu.sh"
   optional-job-conda-python-tests-cudf-pandas-integration:
-    needs: conda-python-build
+    needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
    with:
       matrix_filter: map(select(.ARCH == "amd64"))
       build_type: pull-request
       script: "ci/test_python_integration.sh"
   conda-python-tests-dask:
-    needs: conda-python-build
+    needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: "ci/test_python_dask.sh"
   conda-notebook-tests:
-    needs: conda-python-build
+    needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -123,9 +171,10 @@ jobs:
       extra-repo-sha: branch-24.12
       extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY
   wheel-tests-cuml:
-    needs: wheel-build-cuml
+    needs: [wheel-build-cuml, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel.sh

From c78a74874ff8636180ad1576b876bb60d0888436 Mon Sep 17 00:00:00 2001
From: Kyle Edwards
Date: Thu, 3 Oct 2024 13:01:32 -0400
Subject: [PATCH 5/6] Remove temporary exclusion of .github/workflows/ (#6097)

Follow-up to https://github.com/rapidsai/cuml/pull/6094
---
 .github/workflows/pr.yaml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 844d611804..1ca4589500 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -47,8 +47,6 @@ jobs:
           - '!notebooks/**'
           - '!python/**'
           - '!thirdparty/LICENSES/**'
-          # TODO: Remove before merging
-          - '!.github/workflows/**'
         test_notebooks:
           - '**'
           - '!.devcontainer/**'
@@ -56,8 +54,6 @@ jobs:
           - '!CONTRIBUTING.md'
           - '!README.md'
           - '!thirdparty/LICENSES/**'
-          # TODO: Remove before merging
-          - '!.github/workflows/**'
         test_python:
           - '**'
          - '!.devcontainer/**'
@@ -68,8 +64,6 @@ jobs:
          - '!CONTRIBUTING.md'
          - '!README.md'
          - '!docs/**'
          - '!img/**'
          - '!notebooks/**'
          - '!thirdparty/LICENSES/**'
-          # TODO: Remove before merging
-          - '!.github/workflows/**'
   checks:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12

From 65a02f6ef223390f4424767376d902f18617c1be Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre
Date: Thu, 3 Oct 2024 15:47:24 -0500
Subject: [PATCH 6/6] Fix train_test_split for string columns (#6088)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #5834

Before the fix, this was an issue:

```python
import cudf
from cuml.model_selection import train_test_split

SEED = 1

df_a = cudf.DataFrame({'a': [0, 1, 2, 3, 4],
                       'b': [5, 6, 7, 8, 9],
                       'c': ['High', 'Low', 'High', 'High', 'Low']
                       })
target = cudf.Series([1, 1, 1, 0, 0])

# breakpoint()
all_numeric = all(cudf.api.types.is_numeric_dtype(df_a[col]) for col in df_a.columns)
print(all_numeric)

tr, te, ytr, yte = train_test_split(X=df_a,
                                    y=target,
                                    test_size=0.3,
                                    random_state=SEED,
                                    stratify=target)

print(tr)
```

would result in multiple errors of the type:

```python
  File "/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/utils/performance_tracking.py", line 51, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/core/frame.py", line 358, in _get_columns_by_label
    return self._from_data_like_self(self._data.select_by_label(labels))
                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/core/column_accessor.py", line 401, in select_by_label
    return self._select_by_label_grouped(key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/coder/.conda/envs/rapids/lib/python3.12/site-packages/cudf/core/column_accessor.py", line 563, in _select_by_label_grouped
    result = self._grouped_data[key]
             ~~~~~~~~~~~~~~~~~~^^^^^
KeyError: '__cuda_array_interface__'
```

After the fix, train_test_split works for cuDF string columns:

```
(rapids) coder ➜ ~ $ python cudfstr.py
   a  b     c
3  3  8  High
4  4  9   Low
2  2  7  High
1  1  6   Low
```

Need to add a test and probably do a small fix for cudf.pandas. There is
some redundancy in the code, which can be cleaned up as a follow-up in a
later release, to get this in for 24.10.
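The core of the fix is a dispatch on whether every column of a cuDF input is numeric; frames with string columns skip the `CumlArray` round-trip (whose `__cuda_array_interface__` lookup raised the `KeyError` above) and are split with cuDF `.iloc` instead. A condensed sketch of that check, with names following the `_split.py` diff below (requires cuDF, i.e. a GPU):

```python
# Condensed sketch of the dispatch the fix introduces (see _split.py below).
import cudf

df = cudf.DataFrame({"a": [0, 1], "c": ["High", "Low"]})
all_numeric = all(
    cudf.api.types.is_numeric_dtype(df[col]) for col in df.columns
)
print(all_numeric)  # False -> split via cuDF .iloc instead of CumlArray
```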
---
 python/cuml/cuml/cluster/kmeans.pyx        |  11 ++-
 python/cuml/cuml/internals/array.py        |  15 +--
 python/cuml/cuml/model_selection/_split.py | 106 +++++++++++----------
 3 files changed, 74 insertions(+), 58 deletions(-)

diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx
index e8ab51e4dd..3d6be3abf2 100644
--- a/python/cuml/cuml/cluster/kmeans.pyx
+++ b/python/cuml/cuml/cluster/kmeans.pyx
@@ -20,6 +20,7 @@ from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
 from cuml.internals.safe_imports import gpu_only_import
 rmm = gpu_only_import('rmm')
+from cuml.internals.safe_imports import safe_import_from, return_false
 import typing
 
 IF GPUBUILD == 1:
@@ -46,7 +47,10 @@ from cuml.common import input_to_cuml_array
 from cuml.internals.api_decorators import device_interop_preparation
 from cuml.internals.api_decorators import enable_device_interop
 
-from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
+# from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
+_openmp_effective_n_threads = safe_import_from(
+    "sklearn.utils._openmp_helpers", "_openmp_effective_n_threads", alt=return_false
+)
 
 
 class KMeans(UniversalBase,
@@ -235,7 +239,10 @@ class KMeans(UniversalBase,
         self.cluster_centers_ = None
 
         # For sklearn interoperability
-        self._n_threads = _openmp_effective_n_threads()
+        if _openmp_effective_n_threads():
+            self._n_threads = _openmp_effective_n_threads()
+        else:
+            self._n_threads = 1
 
         # cuPy does not allow comparing with string. See issue #2372
         init_str = init if isinstance(init, str) else None
diff --git a/python/cuml/cuml/internals/array.py b/python/cuml/cuml/internals/array.py
index e61d84ab83..c30d609563 100644
--- a/python/cuml/cuml/internals/array.py
+++ b/python/cuml/cuml/internals/array.py
@@ -1251,13 +1251,14 @@ def array_to_memory_order(arr, default="C"):
         return arr.order
     except AttributeError:
         pass
-    try:
-        array_interface = arr.__cuda_array_interface__
-    except AttributeError:
-        try:
-            array_interface = arr.__array_interface__
-        except AttributeError:
-            return array_to_memory_order(CumlArray.from_input(arr, order="K"))
+    array_interface = getattr(
+        arr,
+        "__cuda_array_interface__",
+        getattr(arr, "__array_interface__", False),
+    )
+    if not array_interface:
+        return array_to_memory_order(CumlArray.from_input(arr, order="K"))
+
     strides = array_interface.get("strides", None)
     if strides is None:
         try:
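The rewritten helper above probes `__cuda_array_interface__` first and falls back to `__array_interface__` via a `getattr` chain. A plain NumPy sketch (no GPU required) of what the strides entry it inspects looks like for a Fortran-ordered array:

```python
# NumPy-only sketch of the interface probing above; C-contiguous arrays
# report strides=None, so a non-None tuple signals Fortran ('F') layout.
import numpy as np

arr = np.asfortranarray(np.ones((3, 4)))
interface = getattr(
    arr, "__cuda_array_interface__", getattr(arr, "__array_interface__", False)
)
print(interface["strides"])  # (8, 24): column-major strides for float64
```

diff --git a/python/cuml/cuml/model_selection/_split.py b/python/cuml/cuml/model_selection/_split.py
index 0727f82c82..227f0eb297 100644
--- a/python/cuml/cuml/model_selection/_split.py
+++ b/python/cuml/cuml/model_selection/_split.py
@@ -265,8 +265,18 @@ def train_test_split(
             string"
         )
 
-    x_order = array_to_memory_order(X)
-    X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order)
+    all_numeric = True
+    if isinstance(X, cudf.DataFrame):
+        all_numeric = all(
+            cudf.api.types.is_numeric_dtype(X[col]) for col in X.columns
+        )
+
+    if all_numeric:
+        x_order = array_to_memory_order(X)
+        X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order)
+    else:
+        x_order = "F"
+        X_arr, X_row = X, X.shape[0]
     if y is not None:
         y_order = array_to_memory_order(y)
         y_arr, y_row, *_ = input_to_cuml_array(y, order=y_order)
@@ -363,55 +373,53 @@ def train_test_split(
         train_indices = range(0, train_size)
         test_indices = range(-1 * test_size, 0)
 
-    # Gather from indices
-    X_train = X_arr[train_indices]
-    X_test = X_arr[test_indices]
-    if y is not None:
-        y_train = y_arr[train_indices]
-        y_test = y_arr[test_indices]
-
-    # Coerce output to original input type
-    if ty := determine_df_obj_type(X):
-        x_type = ty
-    else:
-        x_type = determine_array_type(X)
-
-    if ty := determine_df_obj_type(y):
-        y_type = ty
-    else:
-        y_type = determine_array_type(y)
-
-    if x_type in ("series", "dataframe"):
-        X_train = output_to_df_obj_like(X_train, X, x_type)
-        X_test = output_to_df_obj_like(X_test, X, x_type)
-
-        if determine_array_type(X.index) == "pandas":
-            if isinstance(train_indices, cp.ndarray):
-                train_indices = train_indices.get()
-            if isinstance(test_indices, cp.ndarray):
-                test_indices = test_indices.get()
+    if all_numeric:
+        # Gather from indices
+        X_train = X_arr[train_indices]
+        X_test = X_arr[test_indices]
+        if y is not None:
+            y_train = y_arr[train_indices]
+            y_test = y_arr[test_indices]
+
+        # Coerce output to original input type
+        x_type = determine_df_obj_type(X) or determine_array_type(X)
+        if y is not None:
+            y_type = determine_df_obj_type(y) or determine_array_type(y)
+
+        def _process_df_objs(
+            df, df_type, df_train, df_test, train_indices, test_indices
+        ):
+            if df_type in {"series", "dataframe"}:
+                df_train = output_to_df_obj_like(df_train, df, df_type)
+                df_test = output_to_df_obj_like(df_test, df, df_type)
+
+                if determine_array_type(df.index) == "pandas":
+                    if isinstance(train_indices, cp.ndarray):
+                        train_indices = train_indices.get()
+                    if isinstance(test_indices, cp.ndarray):
+                        test_indices = test_indices.get()
+
+                df_train.index = df.index[train_indices]
+                df_test.index = df.index[test_indices]
+            else:
+                df_train = df_train.to_output(df_type)
+                df_test = df_test.to_output(df_type)
+            return df_train, df_test
+
+        X_train, X_test = _process_df_objs(
+            X, x_type, X_train, X_test, train_indices, test_indices
+        )
+        if y is not None:
+            y_train, y_test = _process_df_objs(
+                y, y_type, y_train, y_test, train_indices, test_indices
+            )
 
-        X_train.index = X.index[train_indices]
-        X_test.index = X.index[test_indices]
     else:
-        X_train = X_train.to_output(x_type)
-        X_test = X_test.to_output(x_type)
-
-    if y_type in ("series", "dataframe"):
-        y_train = output_to_df_obj_like(y_train, y, y_type)
-        y_test = output_to_df_obj_like(y_test, y, y_type)
-
-        if determine_array_type(y.index) == "pandas":
-            if isinstance(train_indices, cp.ndarray):
-                train_indices = train_indices.get()
-            if isinstance(test_indices, cp.ndarray):
-                test_indices = test_indices.get()
-
-        y_train.index = y.index[train_indices]
-        y_test.index = y.index[test_indices]
-    elif y_type is not None:
-        y_train = y_train.to_output(y_type)
-        y_test = y_test.to_output(y_type)
+        X_train = X_arr.iloc[train_indices]
+        X_test = X_arr.iloc[test_indices]
+        if y is not None:
+            y_train = y_arr[train_indices]
+            y_test = y_arr[test_indices]
 
     if y is not None:
         return X_train, X_test, y_train, y_test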