From b8dcd9b8876c3ef6fede6d045151b2557f8439fa Mon Sep 17 00:00:00 2001
From: Eyal Rozenberg <eyalroz1@gmx.com>
Date: Sun, 20 Oct 2024 19:33:16 +0300
Subject: [PATCH] Regards #690: We've extracted the `free()` calls out of the
 async sub-namespace + some comment tweaks and redundancy removals

---
 src/cuda/api/memory.hpp                     | 59 +++++++++++++--------
 src/cuda/api/multi_wrapper_impls/memory.hpp | 19 +++++--
 src/cuda/api/stream.hpp                     |  6 +--
 3 files changed, 56 insertions(+), 28 deletions(-)
diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp
index 3e09d490..ec906af9 100644
--- a/src/cuda/api/memory.hpp
+++ b/src/cuda/api/memory.hpp
@@ -229,41 +229,58 @@ inline region_t allocate(
 	return allocate_in_current_context(size_in_bytes, stream_handle);
 }
 
-} // namespace detail_
-
-/// Free a region of device-side memory (regardless of how it was allocated)
-inline void free(void* ptr)
+#if CUDA_VERSION >= 11020
+inline void free(
+	context::handle_t          context_handle,
+	void*                      allocated_region_start,
+	optional<stream::handle_t> stream_handle = {}) // when engaged, the freeing is scheduled on this stream
+#else
+inline void free(
+	context::handle_t          context_handle,
+	void*                      allocated_region_start)
+#endif
 {
-	auto result = cuMemFree(address(ptr));
+#if CUDA_VERSION >= 11020
+	if (stream_handle) { // a stream was given: schedule the freeing with cuMemFreeAsync instead of calling cuMemFree
+		auto status = cuMemFreeAsync(device::address(allocated_region_start), *stream_handle);
+		throw_if_error_lazy(status,
+			"Failed scheduling an asynchronous freeing of the global memory region starting at "
+			+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
+			+ stream::detail_::identify(*stream_handle, context_handle));
+		return; // the cuMemFree() path below must not run as well
+	}
+#endif
+	auto result = cuMemFree(address(allocated_region_start)); // reached only when no stream handle was given
 #ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
 	if (result == status::success) { return; }
 #else
 	if (result == status::success or result == status::context_is_destroyed) { return; }
 #endif
-	throw runtime_error(result, "Freeing device memory at " + cuda::detail_::ptr_as_hex(ptr));
+	throw runtime_error(result, "Freeing device memory at " + cuda::detail_::ptr_as_hex(allocated_region_start));
 }
 
-/// @copydoc free(void*)
-inline void free(region_t region) { free(region.start()); }
+} // namespace detail_
 
+/// Free a region of device-side memory (regardless of how it was allocated); when a stream is passed (only possible with CUDA_VERSION >= 11020), the freeing is scheduled on that stream
 #if CUDA_VERSION >= 11020
-namespace async {
-
-namespace detail_ {
+inline void free(void* region_start, optional_ref<const stream_t> stream = {});
+#else
+inline void free(void* ptr);
+#endif
 
-inline void free(
-	context::handle_t  context_handle,
-	stream::handle_t   stream_handle,
-	void*              allocated_region_start)
+/// @copydoc free(void*, optional_ref<const stream_t>)
+#if CUDA_VERSION >= 11020
+inline void free(region_t region, optional_ref<const stream_t> stream = {})
 {
-	auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
-	throw_if_error_lazy(status,
-		"Failed scheduling an asynchronous freeing of the global memory region starting at "
-		+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
-		+ stream::detail_::identify(stream_handle, context_handle) );
+	free(region.start(), stream);
 }
+#else // this variant has no stream parameter, so it needs its own body forwarding only the region start
+inline void free(region_t region) { free(region.start()); }
+#endif
 
-} // namespace detail_
+#if CUDA_VERSION >= 11020
+
+namespace async {
 
 /**
  * Schedule a de-allocation of device-side memory on a CUDA stream.
diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp
index b6635340..57d0926b 100644
--- a/src/cuda/api/multi_wrapper_impls/memory.hpp
+++ b/src/cuda/api/multi_wrapper_impls/memory.hpp
@@ -121,13 +121,24 @@ inline region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stre
 		detail_::allocate_in_current_context(size_in_bytes);
 }
 
-namespace async {
+#endif // CUDA_VERSION >= 11020
 
-inline void free(const stream_t& stream, void* region_start)
+#if CUDA_VERSION >= 11020
+inline void free(void* region_start, optional_ref<const stream_t> stream)
+#else
+inline void free(void* region_start)
+#endif // CUDA_VERSION >= 11020
 {
-	return detail_::free(stream.context().handle(), stream.handle(), region_start);
+	auto cch = context::current::detail_::get_handle();
+#if CUDA_VERSION >= 11020
+	// A stream was given: schedule the freeing on it and return, so the region is not freed a second time below
+	if (stream) { return detail_::free(cch, region_start, stream->handle()); }
+#endif
+	// No stream given (or this build has no stream parameter): use the non-stream freeing path
+	detail_::free(cch, region_start);
 }
-#endif // CUDA_VERSION >= 11020
+
+namespace async {
 
 template <typename T>
 inline void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream)
diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index 953c034d..a7e3f0cb 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -601,14 +601,14 @@ class stream_t {
 		///@{
 		void free(void* region_start) const
 		{
-			memory::device::async::free(associated_stream, region_start);
+			memory::device::free(region_start, associated_stream); // passing the stream makes this a freeing scheduled on associated_stream
 		}
 
 		void free(memory::region_t region) const
 		{
-			memory::device::async::free(associated_stream, region);
+			memory::device::free(region, associated_stream); // region overload; forwards the region's start address together with the stream
 		}
-#endif
+#endif // CUDA_VERSION >= 11020
 
 		/**
 		 * Sets the attachment of a region of managed memory (i.e. in the address space visible