From 2299144e854170c1ac6fb02a82f2a208e6563046 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre
Date: Wed, 4 Oct 2023 15:58:44 -0500
Subject: [PATCH] Fixes for timeouts in tests (#5598)

Timeouts in CI were caused by CUDA context creation in some Dask
estimator docstrings; this PR fixes that by avoiding the context
creation. It also fixes a typo in the coordinate descent solver that
caused the intercept to be calculated incorrectly in some cases.

Authors:
  - Dante Gama Dessavre (https://github.com/dantegd)

Approvers:
  - Simon Adorf (https://github.com/csadorf)

URL: https://github.com/rapidsai/cuml/pull/5598
---
 python/cuml/internals/available_devices.py  |  14 ++-
 python/cuml/solvers/cd.pyx                  |   2 +-
 python/cuml/tests/test_nearest_neighbors.py |   2 +
 python/cuml/tests/test_no_cuinit.py         | 109 ++++++++++++++++++++
 4 files changed, 124 insertions(+), 3 deletions(-)
 create mode 100644 python/cuml/tests/test_no_cuinit.py

diff --git a/python/cuml/internals/available_devices.py b/python/cuml/internals/available_devices.py
index ec58fac0df..8110f1b5d1 100644
--- a/python/cuml/internals/available_devices.py
+++ b/python/cuml/internals/available_devices.py
@@ -24,12 +24,22 @@
 cache = lru_cache(maxsize=None)
 
 
-get_cuda_count = gpu_only_import_from("rmm._cuda.gpu", "getDeviceCount")
+def gpu_available_no_context_creation():
+    """
+    Check whether GPUs are available on the system without creating a
+    CUDA context. A successful CuPy import is used as a proxy for that.
+    """
+    try:
+        import cupy  # noqa: F401  (the import succeeding is the signal)
+
+        return True
+    except ImportError:
+        return False
 
 
 @cache
 def is_cuda_available():
     try:
-        return GPU_ENABLED and get_cuda_count() >= 1
+        return GPU_ENABLED and gpu_available_no_context_creation()
     except UnavailableError:
         return False

diff --git a/python/cuml/solvers/cd.pyx b/python/cuml/solvers/cd.pyx
index 816d5f1955..c9c22fd0f6 100644
--- a/python/cuml/solvers/cd.pyx
+++ b/python/cuml/solvers/cd.pyx
@@ -296,7 +296,7 @@ class CD(Base,
                   self.tol,
                   sample_weight_ptr)
 
-        self.intercept_ = _c_intercept2_f64
+            self.intercept_ = _c_intercept2_f64
 
         self.handle.sync()
         del X_m

diff --git a/python/cuml/tests/test_nearest_neighbors.py b/python/cuml/tests/test_nearest_neighbors.py
index 85548fc9f5..b4bed52d27 100644
--- a/python/cuml/tests/test_nearest_neighbors.py
+++ b/python/cuml/tests/test_nearest_neighbors.py
@@ -256,6 +256,8 @@ def test_ivfflat_pred(nrows, ncols, n_neighbors, nlist):
 def test_ivfpq_pred(
     nrows, ncols, n_neighbors, nlist, M, n_bits, usePrecomputedTables
 ):
+    if ncols == 512 and usePrecomputedTables is True:
+        pytest.skip("https://github.com/rapidsai/cuml/issues/5603")
     algo_params = {
         "nlist": nlist,
         "nprobe": int(nlist * 0.2),

diff --git a/python/cuml/tests/test_no_cuinit.py b/python/cuml/tests/test_no_cuinit.py
new file mode 100644
index 0000000000..661e496dfc
--- /dev/null
+++ b/python/cuml/tests/test_no_cuinit.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+import os
+import subprocess
+import sys
+from shutil import which
+
+import pytest
+
+GDB_COMMANDS = """
+set confirm off
+set breakpoint pending on
+break cuInit
+run
+exit
+"""
+
+
+@pytest.fixture(scope="module")
+def cuda_gdb(request):
+    gdb = which("cuda-gdb")
+    if gdb is None:
+        request.applymarker(
+            pytest.mark.xfail(reason="No cuda-gdb found, can't detect cuInit"),
+        )
+        return gdb
+    else:
+        output = subprocess.run(
+            [gdb, "--version"], capture_output=True, text=True
+        )
+        if output.returncode != 0:
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=(
+                        "cuda-gdb not working on this platform, "
+                        f"can't detect cuInit: {output.stderr}"
+                    )
+                ),
+            )
+        return gdb
+
+
+def test_cuml_import_no_cuinit(cuda_gdb):
+    # When RAPIDS_NO_INITIALIZE is set, importing cuml should _not_
+    # create a CUDA context (i.e. cuInit should not be called).
+    # Intercepting the call to cuInit programmatically is tricky since
+    # the way it is resolved from dynamic libraries by
+    # cuda-python/numba/cupy is multitudinous (see the discussion at
+    # https://github.com/rapidsai/cudf/pull/12361, which does this but
+    # needs to provide hooks that override dlsym, cuGetProcAddress,
+    # and cuInit).
+    # Instead, we just run under GDB and see if we hit a breakpoint.
+    env = os.environ.copy()
+    env["RAPIDS_NO_INITIALIZE"] = "1"
+    output = subprocess.run(
+        [
+            cuda_gdb,
+            "-x",
+            "-",
+            "--args",
+            sys.executable,
+            "-c",
+            "import cuml",
+        ],
+        input=GDB_COMMANDS,
+        env=env,
+        capture_output=True,
+        text=True,
+    )
+
+    cuInit_called = output.stdout.find("in cuInit ()")
+    print("Command output:\n")
+    print("*** STDOUT ***")
+    print(output.stdout)
+    print("*** STDERR ***")
+    print(output.stderr)
+    assert output.returncode == 0
+    assert cuInit_called < 0
+
+
+def test_cuml_create_estimator_cuinit(cuda_gdb):
+    # This tests that our gdb scripting correctly identifies cuInit
+    # when it definitely should have been called.
+    env = os.environ.copy()
+    env["RAPIDS_NO_INITIALIZE"] = "1"
+    output = subprocess.run(
+        [
+            cuda_gdb,
+            "-x",
+            "-",
+            "--args",
+            sys.executable,
+            "-c",
+            "import cupy as cp; a = cp.ones(10)",
+        ],
+        input=GDB_COMMANDS,
+        env=env,
+        capture_output=True,
+        text=True,
+    )
+
+    cuInit_called = output.stdout.find("in cuInit ()")
+    print("Command output:\n")
+    print("*** STDOUT ***")
+    print(output.stdout)
+    print("*** STDERR ***")
+    print(output.stderr)
+    assert output.returncode == 0
+    assert cuInit_called >= 0
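
Both new tests drive cuda-gdb the same way: feed it a script on stdin (the
"-x -" flags) that sets a pending breakpoint on cuInit (pending, since
libcuda is only loaded at runtime), run the Python one-liner, and search
gdb's stdout for the breakpoint hit. A sketch of that shared pattern as a
single helper; `calls_cuinit` is a hypothetical name, not part of the patch:

    # Sketch of the cuda-gdb driving logic shared by both tests above.
    # Hypothetical refactor; not part of this PR.
    import os
    import subprocess
    import sys

    GDB_COMMANDS = """
    set confirm off
    set breakpoint pending on
    break cuInit
    run
    exit
    """  # gdb ignores leading whitespace in command scripts

    def calls_cuinit(cuda_gdb, python_code):
        """Return True if executing `python_code` hits the cuInit breakpoint."""
        env = os.environ.copy()
        env["RAPIDS_NO_INITIALIZE"] = "1"
        output = subprocess.run(
            [cuda_gdb, "-x", "-", "--args", sys.executable, "-c", python_code],
            input=GDB_COMMANDS,  # "-x -" makes gdb read the script from stdin
            env=env,
            capture_output=True,
            text=True,
        )
        assert output.returncode == 0
        return "in cuInit ()" in output.stdout

    # Usage mirroring the two tests:
    #   assert not calls_cuinit(gdb_path, "import cuml")
    #   assert calls_cuinit(gdb_path, "import cupy as cp; a = cp.ones(10)")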
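
On the availability probe itself: importing CuPy does not call cuInit, which
is exactly the distinction the tests above rely on, since the probe imports
CuPy yet must stay context-free while `cp.ones(10)` does trigger cuInit. A
successful import is therefore a cheap proxy for a usable GPU stack. If one
also wanted to confirm that a device is physically present without creating
a context, NVML can report the device count. A minimal sketch, assuming the
optional `pynvml` package, which this patch does not use; `gpu_count_no_context`
is a hypothetical name:

    # Sketch only: NVML queries the driver's monitoring interface and does
    # not call cuInit, so no CUDA context is created. Assumes the optional
    # `pynvml` package; the PR itself only treats a successful
    # `import cupy` as the availability proxy.
    def gpu_count_no_context():
        try:
            import pynvml
        except ImportError:
            return 0  # no NVML bindings; report no visible devices
        try:
            pynvml.nvmlInit()
            try:
                return pynvml.nvmlDeviceGetCount()
            finally:
                pynvml.nvmlShutdown()
        except pynvml.NVMLError:
            return 0  # no driver or no GPU on this machine

Unlike the CuPy-import proxy, this distinguishes "CuPy installed but no GPU"
from "GPU present", at the cost of an extra optional dependency.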