cuda.parallel: Add optional stream argument to reduce_into() #3348

Open · wants to merge 2 commits into main
@@ -6,7 +6,7 @@
 from __future__ import annotations  # TODO: required for Python 3.7 docs env

 import ctypes
-from typing import Callable
+from typing import Callable, Optional

 import numba
 import numpy as np
@@ -46,6 +46,30 @@ def _dtype_validation(dt1, dt2):
raise TypeError(f"dtype mismatch: __init__={dt1}, __call__={dt2}")


def _validate_and_get_stream(stream) -> Optional[int]:
Contributor:
In terms of where this function should live, here are a couple of suggestions:

  • Move it to _utils/stream.py
  • Rename _utils/cai.py to _utils/protocols.py and move it there (this module could be general utilities for working with protocol objects like __cuda_array_interface__ and __cuda_stream__)

+    # null stream is allowed
+    if stream is None:
+        return None
Comment on lines +50 to +52
leofang (Member) commented on Jan 11, 2025:
So I think this would be a common source of bugs. The first naive question is: would bindings.cccl_device_reduce() take None for the stream argument? But more importantly, I expect that by the time of integration (with CuPy & co) we must take an explicit stream, e.g.

reduce_into(..., stream=cp.cuda.get_current_stream())

in order to preserve the respective library's stream ordering. If that's the case, we should probably explicitly forbid stream=None (as we do in cuda.core, btw) to avoid the mistake where someone integrating the code forgets to fetch the library stream and pass it to cuda.par.

Member:

FYI, in nvmath-python stream=None is used to mean "look up the input arrays' library and take the library's current stream (or whatever default it uses)," which has the same semantics as the snippet above, but is achieved in a much more complex and laborious way that we hope to simplify with __cuda_stream__.

Contributor:

> but is achieved in a much more complex and laborious way that we hope to simplify with __cuda_stream__.

Could you expand on this a bit? How would __cuda_stream__ help here?

Member:

I guess I see what you're asking. For interpreting a provided stream, the protocol will help. For understanding the stream semantics of each array library (e.g., does it have the notion of a current stream?), no, the protocol would not help.
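For the first part, a minimal sketch of what protocol-based interpretation could look like on the consumer side (assuming only that the stream object returns a (version, handle) tuple from __cuda_stream__, as discussed in this thread):

def get_stream_handle(stream):
    # Any conforming stream object exposes its driver-level handle the
    # same way, regardless of which library created it.
    version, handle = stream.__cuda_stream__
    return handle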

shwina (Contributor) commented on Jan 12, 2025:

> The first naive question is: would bindings.cccl_device_reduce() take None for the stream argument?

Yes - this would match the API of the corresponding C++ function where the stream argument is explicitly optional and defaults to 0.


> we should probably explicitly forbid stream=None (as we do in cuda.core, btw) to avoid the mistake where someone integrating the code forgets to fetch the library stream and pass it to cuda.par.

Please correct me if I'm wrong, but my understanding is that your concern is about the use of APIs like CuPy's Stream or PyTorch's stream context manager, which both set a library-specific "current stream":

s = torch.cuda.Stream()
with torch.cuda.stream(s):
    torch.some_function(...)  # no need to pass a stream explicitly; uses `s` implicitly

The above works great as long as I'm only using PyTorch, but if I want to combine PyTorch with, e.g., CuPy:

s = torch.cuda.Stream()
with torch.cuda.stream(s):
    torch.some_function(...)  # no need to pass a stream explicitly; uses `s` implicitly

    # need to pass the stream explicitly, as CuPy doesn't know about
    # PyTorch's "current stream":
    cupy.some_other_function(..., s)

I certainly agree that what you're describing is a concern, but I don't feel that the API decisions of downstream libraries like CuPy or PyTorch should influence the APIs of upstream libraries like cuda.core or cuda.parallel.

The default stream is a reasonable default and 'just works' across the ecosystem for the majority of users who don't necessarily want to use CUDA streams. I would prefer we keep things easy for them.

If someone opts in to using streams, then I think it's fine to require that they take additional care to pass streams appropriately to the various functions they use across libraries (which is something they need to do already).


+    if not hasattr(stream, "__cuda_stream__"):
+        raise TypeError(
+            f"stream argument {stream} does not implement the '__cuda_stream__' protocol"
+        )
+
+    stream_property = stream.__cuda_stream__
Comment on lines +54 to +59
Contributor:

In general, EAFP is more idiomatic and, as discussed in NVIDIA/cuda-python#348, much faster:

So instead of:

if not hasattr(obj, "attr"):
    raise TypeError(...)
attr = obj.attr

Prefer:

try:
    attr = obj.attr
except AttributeError:
    raise TypeError(...)

As a rule of thumb, we should avoid hasattr in particular as it can be very slow.

(side note: IIRC there's at least one other place in the codebase that we're using hasattr, and likely that needs to be changed as well)
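For reference, a minimal EAFP sketch of the hasattr check in this diff, using the same names (raising 'from None' to suppress the chained AttributeError is one possible choice):

try:
    stream_property = stream.__cuda_stream__
except AttributeError:
    raise TypeError(
        f"stream argument {stream} does not implement the '__cuda_stream__' protocol"
    ) from None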

+    if (
+        isinstance(stream_property, tuple)
+        and len(stream_property) == 2
+        and all(isinstance(i, int) for i in stream_property)
+    ):
+        version, handle = stream_property
+        return handle
+
+    raise TypeError(
+        f"__cuda_stream__ property of '{stream}' must return a 'Tuple[int, int]'; got {stream_property} instead"
+    )
Comment on lines +60 to +70
Contributor:

nit: Personally, I'd be a bit more lax here and really only ensure that handle is an int. If someone returns a list rather than a tuple, that's probably not relevant to us:

Suggested change:

-    if (
-        isinstance(stream_property, tuple)
-        and len(stream_property) == 2
-        and all(isinstance(i, int) for i in stream_property)
-    ):
-        version, handle = stream_property
-        return handle
-    raise TypeError(
-        f"__cuda_stream__ property of '{stream}' must return a 'Tuple[int, int]'; got {stream_property} instead"
-    )
+    _, handle = stream_property  # (version, handle)
+    if not isinstance(handle, int):
+        raise TypeError(...)
+    return handle

Member:

We should also check that version is 0, because we could change it to, say, a 3-tuple in a future version of the protocol.
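A sketch of what that could look like, assuming version 0 is the only version defined so far (the helper name here is hypothetical):

def _handle_from_stream_property(stream_property):
    # Check the version before unpacking, since a future protocol
    # version might return, say, a 3-tuple.
    version = stream_property[0]
    if version != 0:
        raise TypeError(f"unsupported __cuda_stream__ protocol version: {version}")
    _, handle = stream_property
    return handle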



 class _Reduce:
     # TODO: constructor shouldn't require concrete `d_in`, `d_out`:
     def __init__(
@@ -85,7 +109,9 @@ def __init__(
         if error != enums.CUDA_SUCCESS:
             raise ValueError("Error building reduce")

-    def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray):
+    def __call__(
+        self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray, stream=None
Member:
I don't have a strong opinion on this, just FYI: in some places in cuda.core we have the following signature

def func(..., *, stream):

so that stream must be passed as a keyword argument but has no default value. The consideration back then was to ensure users pass a stream explicitly.
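Applied to this PR's __call__, a hypothetical keyword-only variant (not what this PR implements) would look like:

def __call__(
    self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray, *, stream
):
    ...

Callers would then be forced to spell out stream=... at every call site.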

+    ):
         d_in_cccl = cccl.to_cccl_iter(d_in)
         if d_in_cccl.type.value == cccl.IteratorKind.ITERATOR:
             assert num_items is not None
@@ -101,6 +127,7 @@ def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray
         )
         _dtype_validation(self._ctor_d_out_dtype, d_out.dtype)
         _dtype_validation(self._ctor_init_dtype, h_init.dtype)
+        stream_handle = _validate_and_get_stream(stream)
         bindings = get_bindings()
         if temp_storage is None:
             temp_storage_bytes = ctypes.c_size_t()
@@ -120,7 +147,7 @@ def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray
             ctypes.c_ulonglong(num_items),
             self.op_wrapper.handle(),
             cccl.host_array_to_value(h_init),
-            None,
+            stream_handle,
         )
         if error != enums.CUDA_SUCCESS:
             raise ValueError("Error reducing")
python/cuda_parallel/tests/test_reduce.py (85 additions, 0 deletions)
@@ -550,3 +550,88 @@ def binary_op(x, y):
     d_in = cp.zeros(size)[::2]
     with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."):
         _ = algorithms.reduce_into(d_in, d_out, binary_op, h_init)


+def test_reduce_with_stream():
+    # Simple cupy stream wrapper that implements the __cuda_stream__ protocol for the purposes of this test
+    class Stream:
+        def __init__(self, cp_stream):
+            self.cp_stream = cp_stream
+
+        @property
+        def __cuda_stream__(self):
+            return (0, self.cp_stream.ptr)
+
+    def add_op(x, y):
+        return x + y
+
+    h_init = np.asarray([0], dtype=np.int32)
+    h_in = random_int(5, np.int32)
+
+    stream = cp.cuda.Stream()
+    with stream:
+        d_in = cp.asarray(h_in)
+        d_out = cp.empty(1, dtype=np.int32)
+
+    stream_wrapper = Stream(stream)
+    reduce_into = algorithms.reduce_into(
+        d_in=d_in, d_out=d_out, op=add_op, h_init=h_init
+    )
+    temp_storage_size = reduce_into(
+        None,
+        d_in=d_in,
+        d_out=d_out,
+        num_items=d_in.size,
+        h_init=h_init,
+        stream=stream_wrapper,
+    )
+    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)
+
+    reduce_into(d_temp_storage, d_in, d_out, d_in.size, h_init, stream=stream_wrapper)
+    np.testing.assert_allclose(d_in.sum().get(), d_out.get())
Comment on lines +590 to +591
Contributor:

We should call stream.synchronize() after the call to reduce_into.

Perhaps our wrapper Stream type should have a .synchronize() method that calls self.cp_stream.synchronize().
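A sketch of that suggestion, extending the wrapper class defined in the test above:

class Stream:
    def __init__(self, cp_stream):
        self.cp_stream = cp_stream

    @property
    def __cuda_stream__(self):
        return (0, self.cp_stream.ptr)

    def synchronize(self):
        # Delegate to the wrapped CuPy stream.
        self.cp_stream.synchronize()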

leofang (Member) commented on Jan 11, 2025:

Wearing my CuPy hat: Just call

cupy.asnumpy(..., stream=stream, blocking=True)

or

with stream:
    cupy.asnumpy(..., blocking=True)

to perform a stream-ordered, blocking copy to host. No need to add .synchronize() this way.

leofang (Member) commented on Jan 11, 2025:

Even better: call cp.testing.assert_allclose(...). Then there's no need to copy; it does the copy internally for us. We just need to stream-order it:

with stream:
    cp.testing.assert_allclose(...)
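Applied to this test, that would read something like:

with stream:
    cp.testing.assert_allclose(d_in.sum(), d_out)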

Contributor:

> it does the copy internally for us. We just need to stream-order it:

In general, is it recommended that we rely on this?

Member:

Yes, it's public API.



+def test_reduce_invalid_stream():
+    # Invalid stream that doesn't implement __cuda_stream__
+    class Stream1:
+        def __init__(self):
+            pass
+
+    # Invalid stream that implements __cuda_stream__ but returns the wrong type
+    class Stream2:
+        def __init__(self):
+            pass
+
+        @property
+        def __cuda_stream__(self):
+            return None
+
+    def add_op(x, y):
+        return x + y
+
+    d_out = cp.empty(1)
+    h_init = np.empty(1)
+    d_in = cp.empty(1)
+    reduce_into = algorithms.reduce_into(d_in, d_out, add_op, h_init)
+
+    with pytest.raises(
+        TypeError, match="does not implement the '__cuda_stream__' protocol"
+    ):
+        _ = reduce_into(
+            None,
+            d_in=d_in,
+            d_out=d_out,
+            num_items=d_in.size,
+            h_init=h_init,
+            stream=Stream1(),
+        )
+
+    with pytest.raises(TypeError, match="must return a 'Tuple\\[int, int\\]';"):
+        _ = reduce_into(
+            None,
+            d_in=d_in,
+            d_out=d_out,
+            num_items=d_in.size,
+            h_init=h_init,
+            stream=Stream2(),
+        )