From 2ff1bd0aca76afda5daaa2f01faa76fa4cb2612d Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Wed, 13 Sep 2023 10:54:08 +1000
Subject: [PATCH] Fix stream_ordered_memory_resource attempt to record event
 in stream from another device (#1333)

As described in #1306, `stream_ordered_memory_resource` kept a single event in
thread-local storage (TLS) to synchronize the per-thread default stream (PTDS).
This caused the event to be recorded on the wrong stream when allocations were
made from memory resources associated with different devices: the first
allocation on the PTDS initialized the TLS event for that device, and pools on
other devices then reused the already-initialized event.

This PR adds a new test that runs only on multidevice systems (more precisely,
it does nothing on single-device systems). The test creates two per-device
pools, and creates and destroys a device buffer on each. It also fixes
`stream_ordered_memory_resource` to store the ID of the device that is current
when it is constructed, and to keep a vector of one event per device in TLS
rather than a single event. When the PTDS is passed to `get_event`, the event
for the MR's stored device ID is used. This should correctly support PTDS with
multiple threads and multiple devices (still one MR per device). A minimal
sketch of the new TLS layout appears after the first file diff below.

The PR also includes some changes to the device ID utilities in
`cuda_device.hpp`. There is a new RAII device helper class and a
`get_num_cuda_devices()` function.

Finally, there is a small addition to .clang-tidy to disable warnings about
`do...while` loops inside RMM error checking macros.

- Fixes #1306

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: https://github.com/rapidsai/rmm/pull/1333
---
 .clang-tidy                                   |  2 +
 .gitignore                                    |  1 +
 README.md                                     | 31 +++++++++--
 include/rmm/cuda_device.hpp                   | 54 +++++++++++++++++--
 include/rmm/detail/dynamic_load_runtime.hpp   |  4 +-
 .../mr/device/cuda_async_memory_resource.hpp  |  2 +-
 .../cuda_async_view_memory_resource.hpp       |  2 +-
 .../detail/stream_ordered_memory_resource.hpp | 35 +++++++-----
 include/rmm/mr/device/per_device_resource.hpp |  4 +-
 tests/mr/device/cuda_async_view_mr_tests.cpp  |  4 +-
 tests/mr/device/pool_mr_tests.cpp             | 39 ++++++++++++++
 11 files changed, 148 insertions(+), 30 deletions(-)

diff --git a/.clang-tidy b/.clang-tidy
index 04689c330..9b3f844c9 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -64,4 +64,6 @@ CheckOptions:
     value: '1'
   - key: readability-magic-numbers.IgnorePowersOf2IntegerValues
     value: '1'
+  - key: cppcoreguidelines-avoid-do-while.IgnoreMacros
+    value: 'true'
 ...
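The heart of the fix is the switch from one thread-local PTDS event to one per device. Below is a minimal standalone sketch of that layout, not the patch's literal code: `event_wrapper` stands in for RMM's internal event type, and `get_ptds_event` is a hypothetical helper used here only for illustration.

```c++
#include <memory>
#include <vector>

// Stand-in for RMM's internal event wrapper (it owns a cudaEvent_t in the real code).
struct event_wrapper {};

// Hypothetical helper: returns the calling thread's event for `device_id`.
std::shared_ptr<event_wrapper> get_ptds_event(int device_id, int num_devices)
{
  // After the fix: one thread-local slot per device, so an MR constructed on
  // device 1 can no longer pick up the event created for device 0. The vector
  // is sized on this thread's first call.
  thread_local std::vector<std::shared_ptr<event_wrapper>> events_tls(num_devices);
  if (!events_tls[device_id]) { events_tls[device_id] = std::make_shared<event_wrapper>(); }
  return events_tls[device_id];
}
```

Before the fix, the equivalent of `events_tls` was a single `thread_local` shared pointer, initialized by whichever device happened to allocate first on the per-thread default stream.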
diff --git a/.gitignore b/.gitignore
index 1ab57e4d4..ad6c8ebf7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ DartConfiguration.tcl
 .DS_Store
 *.manifest
 *.spec
+compile_commands.json
 
 ## Python build directories & artifacts
 dist/
diff --git a/README.md b/README.md
index 8d65b7d33..2fa4761df 100644
--- a/README.md
+++ b/README.md
@@ -354,11 +354,32 @@ objects for each device and sets them as the per-device resource for that device
 ```c++
 std::vector<unique_ptr<pool_memory_resource<cuda_memory_resource>>> per_device_pools;
 for(int i = 0; i < N; ++i) {
-    cudaSetDevice(i); // set device i before creating MR
-    // Use a vector of unique_ptr to maintain the lifetime of the MRs
-    per_device_pools.push_back(std::make_unique<pool_memory_resource>());
-    // Set the per-device resource for device i
-    set_per_device_resource(cuda_device_id{i}, &per_device_pools.back());
+  cudaSetDevice(i); // set device i before creating MR
+  // Use a vector of unique_ptr to maintain the lifetime of the MRs
+  per_device_pools.push_back(std::make_unique<pool_memory_resource>());
+  // Set the per-device resource for device i
+  set_per_device_resource(cuda_device_id{i}, &per_device_pools.back());
+}
+```
+
+Note that the CUDA device that is current when creating a `device_memory_resource` must also be
+current any time that `device_memory_resource` is used to deallocate memory, including in a
+destructor. This affects RAII classes like `rmm::device_buffer` and `rmm::device_uvector`. Here's an
+(incorrect) example that assumes the above example loop has been run to create a
+`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(0)` on the
+line of the error comment.
+
+```c++
+{
+  RMM_CUDA_TRY(cudaSetDevice(0));
+  rmm::device_buffer buf_a(16);
+
+  {
+    RMM_CUDA_TRY(cudaSetDevice(1));
+    rmm::device_buffer buf_b(16);
+  }
+
+  // Error: when buf_a is destroyed, the current device must be 0, but it is 1
 }
 ```
 
diff --git a/include/rmm/cuda_device.hpp b/include/rmm/cuda_device.hpp
index 81d35dc3c..8d355ee23 100644
--- a/include/rmm/cuda_device.hpp
+++ b/include/rmm/cuda_device.hpp
@@ -42,7 +42,6 @@ struct cuda_device_id {
   value_type id_;
 };
 
-namespace detail {
 /**
  * @brief Returns a `cuda_device_id` for the current device
  *
@@ -50,11 +49,56 @@ namespace detail {
  * @return `cuda_device_id` for the current device
  */
-inline cuda_device_id current_device()
+inline cuda_device_id get_current_cuda_device()
 {
-  int dev_id{};
-  RMM_CUDA_TRY(cudaGetDevice(&dev_id));
+  cuda_device_id::value_type dev_id{-1};
+  RMM_ASSERT_CUDA_SUCCESS(cudaGetDevice(&dev_id));
   return cuda_device_id{dev_id};
 }
-}  // namespace detail
+
+/**
+ * @brief Returns the number of CUDA devices in the system
+ *
+ * @return Number of CUDA devices in the system
+ */
+inline int get_num_cuda_devices()
+{
+  cuda_device_id::value_type num_dev{-1};
+  RMM_ASSERT_CUDA_SUCCESS(cudaGetDeviceCount(&num_dev));
+  return num_dev;
+}
+
+/**
+ * @brief RAII class that sets the current CUDA device to the specified device on construction
+ * and restores the previous device on destruction.
+ */
+struct cuda_set_device_raii {
+  /**
+   * @brief Construct a new cuda_set_device_raii object and set the current CUDA device to `dev_id`
+   *
+   * @param dev_id The device to set as the current CUDA device
+   */
+  explicit cuda_set_device_raii(cuda_device_id dev_id)
+    : old_device_{get_current_cuda_device()}, needs_reset_{old_device_.value() != dev_id.value()}
+  {
+    if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value()));
+  }
+  /**
+   * @brief Reactivates the previous CUDA device
+   */
+  ~cuda_set_device_raii() noexcept
+  {
+    if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value()));
+  }
+
+  cuda_set_device_raii(cuda_set_device_raii const&)            = delete;
+  cuda_set_device_raii& operator=(cuda_set_device_raii const&) = delete;
+  cuda_set_device_raii(cuda_set_device_raii&&)                 = delete;
+  cuda_set_device_raii& operator=(cuda_set_device_raii&&)      = delete;
+
+ private:
+  cuda_device_id old_device_;
+  bool needs_reset_;
+};
+
 }  // namespace rmm
diff --git a/include/rmm/detail/dynamic_load_runtime.hpp b/include/rmm/detail/dynamic_load_runtime.hpp
index 28121e6a8..b45dbae25 100644
--- a/include/rmm/detail/dynamic_load_runtime.hpp
+++ b/include/rmm/detail/dynamic_load_runtime.hpp
@@ -115,7 +115,7 @@ struct async_alloc {
       int cuda_pool_supported{};
       auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
                                            cudaDevAttrMemoryPoolsSupported,
-                                           rmm::detail::current_device().value());
+                                           rmm::get_current_cuda_device().value());
       return result == cudaSuccess and cuda_pool_supported == 1;
     }()};
     return runtime_supports_pool and driver_supports_pool;
@@ -139,7 +139,7 @@ struct async_alloc {
     if (cudaMemHandleTypeNone != handle_type) {
       auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask,
                                                  cudaDevAttrMemoryPoolSupportedHandleTypes,
-                                                 rmm::detail::current_device().value());
+                                                 rmm::get_current_cuda_device().value());
 
       // Don't throw on cudaErrorInvalidValue
       auto const unsupported_runtime = (result == cudaErrorInvalidValue);
diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp
index d41eae63e..329d8f29a 100644
--- a/include/rmm/mr/device/cuda_async_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp
@@ -98,7 +98,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
       RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
                   "Requested IPC memory handle type not supported");
     pool_props.location.type = cudaMemLocationTypeDevice;
-    pool_props.location.id   = rmm::detail::current_device().value();
+    pool_props.location.id   = rmm::get_current_cuda_device().value();
     cudaMemPool_t cuda_pool_handle{};
     RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
     pool_ = cuda_async_view_memory_resource{cuda_pool_handle};
diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
index 806ace807..c685cd75f 100644
--- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -60,7 +60,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
     }()}
   {
     // Check if cudaMallocAsync Memory pool supported
-    auto const device = rmm::detail::current_device();
+    auto const device = rmm::get_current_cuda_device();
     int cuda_pool_supported{};
     auto result =
       cudaDeviceGetAttribute(&cuda_pool_supported, cudaDevAttrMemoryPoolsSupported, device.value());
diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
index 53575e5ce..f071717c0 100644
--- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
+++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
@@ -15,15 +15,16 @@
  */
 #pragma once
 
+#include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/error.hpp>
 #include <rmm/logger.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
-#include <cuda_runtime_api.h>
-
 #include <fmt/core.h>
 
+#include <cuda_runtime_api.h>
+
 #include <cstddef>
 #include <map>
 #include <mutex>
@@ -288,17 +289,25 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
   stream_event_pair get_event(cuda_stream_view stream)
   {
     if (stream.is_per_thread_default()) {
-      // Create a thread-local shared event wrapper. Shared pointers in the thread and in each MR
-      // instance ensures it is cleaned up only after all are finished with it.
-      thread_local auto event_tls = std::make_shared<event_wrapper>();
-      default_stream_events.insert(event_tls);
-      return stream_event_pair{stream.value(), event_tls->event};
+      // Create a thread-local shared event wrapper for each device. Shared pointers in the thread
+      // and in each MR instance ensure the wrappers are destroyed only after all are finished
+      // with them.
+      thread_local std::vector<std::shared_ptr<event_wrapper>> events_tls(
+        rmm::get_num_cuda_devices());
+      auto event = [&, device_id = this->device_id_]() {
+        if (events_tls[device_id.value()]) { return events_tls[device_id.value()]->event; }
+
+        auto event = std::make_shared<event_wrapper>();
+        this->default_stream_events.insert(event);
+        return (events_tls[device_id.value()] = std::move(event))->event;
+      }();
+      return stream_event_pair{stream.value(), event};
     }
     // We use cudaStreamLegacy as the event map key for the default stream for consistency between
     // PTDS and non-PTDS mode. In PTDS mode, the cudaStreamLegacy map key will only exist if the
     // user explicitly passes it, so it is used as the default location for the free list
-    // at construction. For consistency, the same key is used for null stream free lists in non-PTDS
-    // mode.
+    // at construction. For consistency, the same key is used for null stream free lists in
+    // non-PTDS mode.
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-cstyle-cast)
     auto* const stream_to_store = stream.is_default() ? cudaStreamLegacy : stream.value();
 
@@ -496,11 +505,13 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
   // bidirectional mapping between non-default streams and events
   std::unordered_map<cudaStream_t, stream_event_pair> stream_events_;
 
-  // shared pointers to events keeps the events alive as long as either the thread that created them
-  // or the MR that is using them exists.
+  // shared pointers to events keep the events alive as long as either the thread that created
+  // them or the MR that is using them exists.
   std::set<std::shared_ptr<event_wrapper>> default_stream_events;
 
   std::mutex mtx_;  // mutex for thread-safe access
-};  // namespace detail
+
+  rmm::cuda_device_id device_id_{rmm::get_current_cuda_device()};
+};  // namespace detail
 
 }  // namespace rmm::mr::detail
diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp
index 371c97fdf..aa7217758 100644
--- a/include/rmm/mr/device/per_device_resource.hpp
+++ b/include/rmm/mr/device/per_device_resource.hpp
@@ -202,7 +202,7 @@ inline device_memory_resource* set_per_device_resource(cuda_device_id device_id,
  */
 inline device_memory_resource* get_current_device_resource()
 {
-  return get_per_device_resource(rmm::detail::current_device());
+  return get_per_device_resource(rmm::get_current_cuda_device());
 }
 
 /**
@@ -231,6 +231,6 @@ inline device_memory_resource* get_current_device_resource()
  */
 inline device_memory_resource* set_current_device_resource(device_memory_resource* new_mr)
 {
-  return set_per_device_resource(rmm::detail::current_device(), new_mr);
+  return set_per_device_resource(rmm::get_current_cuda_device(), new_mr);
 }
 }  // namespace rmm::mr
diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp
index 86cb6f106..209429b4b 100644
--- a/tests/mr/device/cuda_async_view_mr_tests.cpp
+++ b/tests/mr/device/cuda_async_view_mr_tests.cpp
@@ -31,7 +31,7 @@ TEST(PoolTest, UsePool)
 {
   cudaMemPool_t memPool{};
   RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool(
-    &memPool, rmm::detail::current_device().value()));
+    &memPool, rmm::get_current_cuda_device().value()));
 
   const auto pool_init_size{100};
   cuda_async_view_mr mr{memPool};
@@ -44,7 +44,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
 {
   cudaMemPoolProps poolProps = {};
   poolProps.allocType        = cudaMemAllocationTypePinned;
-  poolProps.location.id      = rmm::detail::current_device().value();
+  poolProps.location.id      = rmm::get_current_cuda_device().value();
   poolProps.location.type    = cudaMemLocationTypeDevice;
 
   cudaMemPool_t memPool{};
diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp
index c5df1951c..6bcd90527 100644
--- a/tests/mr/device/pool_mr_tests.cpp
+++ b/tests/mr/device/pool_mr_tests.cpp
@@ -14,10 +14,12 @@
  * limitations under the License.
  */
 
+#include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/aligned.hpp>
 #include <rmm/detail/error.hpp>
 #include <rmm/device_buffer.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/limiting_resource_adaptor.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
@@ -150,5 +152,42 @@ TEST(PoolTest, UpstreamDoesntSupportMemInfo)
   mr2.deallocate(ptr, 1024);
 }
 
+TEST(PoolTest, MultidevicePool)
+{
+  using MemoryResource = rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>;
+
+  // Get the number of cuda devices
+  int num_devices = rmm::get_num_cuda_devices();
+
+  // only run on multidevice systems
+  if (num_devices >= 2) {
+    rmm::mr::cuda_memory_resource general_mr;
+
+    // initializing pool_memory_resource of multiple devices
+    int devices      = 2;
+    size_t pool_size = 1024;
+    std::vector<std::shared_ptr<MemoryResource>> mrs;
+
+    for (int i = 0; i < devices; ++i) {
+      RMM_CUDA_TRY(cudaSetDevice(i));
+      auto mr = std::make_shared<MemoryResource>(&general_mr, pool_size, pool_size);
+      rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, mr.get());
+      mrs.emplace_back(mr);
+    }
+
+    {
+      RMM_CUDA_TRY(cudaSetDevice(0));
+      rmm::device_buffer buf_a(16, rmm::cuda_stream_per_thread, mrs[0].get());
+
+      {
+        RMM_CUDA_TRY(cudaSetDevice(1));
+        rmm::device_buffer buf_b(16, rmm::cuda_stream_per_thread, mrs[1].get());
+      }
+
+      RMM_CUDA_TRY(cudaSetDevice(0));
+    }
+  }
+}
+
 }  // namespace
 }  // namespace rmm::test
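For completeness, here is a corrected counterpart to the (incorrect) README example earlier in this patch. This is a minimal sketch, assuming the per-device pools `mrs` from the `MultidevicePool` test above are in scope; using the `cuda_set_device_raii` helper introduced by this patch to save and restore the current device is one option, not the only correct fix.

```c++
{
  RMM_CUDA_TRY(cudaSetDevice(0));
  rmm::device_buffer buf_a(16, rmm::cuda_stream_per_thread, mrs[0].get());

  {
    // Make device 1 current for this scope; the previous device (0) is
    // restored when dev_one is destroyed, after buf_b has been freed.
    rmm::cuda_set_device_raii dev_one{rmm::cuda_device_id{1}};
    rmm::device_buffer buf_b(16, rmm::cuda_stream_per_thread, mrs[1].get());
  }

  // buf_a is destroyed with device 0 current, as its memory resource requires.
}
```

Note the destruction order in the inner scope: `buf_b` (declared last) is destroyed first, while device 1 is still current; only then does `dev_one` restore device 0.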