From b8dcd9b8876c3ef6fede6d045151b2557f8439fa Mon Sep 17 00:00:00 2001
From: Eyal Rozenberg
Date: Sun, 20 Oct 2024 19:33:16 +0300
Subject: [PATCH] Regards #690: We've extracted the `free()` calls out of the
 async sub-namespace + some comment tweaks and redundancy removals

---
Review amendment (this note area is dropped by `git am`):
 * free(region_t) only has a `stream` parameter when CUDA_VERSION >= 11020,
   so its body now forwards `stream` only in that configuration; the
   pre-11.2 build previously referenced an undeclared identifier.
 * Line breaks and template arguments lost in transit were restored;
   indentation of context lines is reconstructed and may need
   whitespace-tolerant application.

 src/cuda/api/memory.hpp                     | 63 ++++++++++++++-------
 src/cuda/api/multi_wrapper_impls/memory.hpp | 19 +++++--
 src/cuda/api/stream.hpp                     |  6 +--
 3 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp
index 3e09d490..ec906af9 100644
--- a/src/cuda/api/memory.hpp
+++ b/src/cuda/api/memory.hpp
@@ -229,41 +229,62 @@ inline region_t allocate(
 	return allocate_in_current_context(size_in_bytes, stream_handle);
 }
 
-} // namespace detail_
-
-/// Free a region of device-side memory (regardless of how it was allocated)
-inline void free(void* ptr)
+#if CUDA_VERSION >= 11020
+inline void free(
+	context::handle_t           context_handle,
+	void*                       allocated_region_start,
+	optional<stream::handle_t>  stream_handle = {})
+#else
+inline void free(
+	context::handle_t           context_handle,
+	void*                       allocated_region_start)
+#endif
 {
-	auto result = cuMemFree(address(ptr));
+#if CUDA_VERSION >= 11020
+	if (stream_handle) {
+		auto status = cuMemFreeAsync(device::address(allocated_region_start), *stream_handle);
+		throw_if_error_lazy(status,
+			"Failed scheduling an asynchronous freeing of the global memory region starting at "
+			+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
+			+ stream::detail_::identify(*stream_handle, context_handle));
+		return;
+	}
+#endif
+	auto result = cuMemFree(address(allocated_region_start));
 #ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
 	if (result == status::success) { return; }
 #else
 	if (result == status::success or result == status::context_is_destroyed) { return; }
 #endif
-	throw runtime_error(result, "Freeing device memory at " + cuda::detail_::ptr_as_hex(ptr));
+	throw runtime_error(result, "Freeing device memory at " + cuda::detail_::ptr_as_hex(allocated_region_start));
 }
 
-/// @copydoc free(void*)
-inline void free(region_t region) { free(region.start()); }
+} // namespace detail_
 
+/// Free a region of device-side memory (regardless of how it was allocated)
 #if CUDA_VERSION >= 11020
-namespace async {
-
-namespace detail_ {
+inline void free(void* region_start, optional_ref<const stream_t> stream = {});
+#else
+inline void free(void* ptr);
+#endif
 
-inline void free(
-	context::handle_t  context_handle,
-	stream::handle_t   stream_handle,
-	void*              allocated_region_start)
+/// @copydoc free(void*, optional_ref<const stream_t>)
+#if CUDA_VERSION >= 11020
+inline void free(region_t region, optional_ref<const stream_t> stream = {})
+#else
+inline void free(region_t region)
+#endif
 {
-	auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
-	throw_if_error_lazy(status,
-		"Failed scheduling an asynchronous freeing of the global memory region starting at "
-		+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
-		+ stream::detail_::identify(stream_handle, context_handle) );
+#if CUDA_VERSION >= 11020
+	free(region.start(), stream);
+#else // no `stream` parameter exists in this configuration
+	free(region.start());
+#endif
 }
 
-} // namespace detail_
+#if CUDA_VERSION >= 11020
+
+namespace async {
 
 /**
  * Schedule a de-allocation of device-side memory on a CUDA stream.
diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp
index b6635340..57d0926b 100644
--- a/src/cuda/api/multi_wrapper_impls/memory.hpp
+++ b/src/cuda/api/multi_wrapper_impls/memory.hpp
@@ -121,13 +121,25 @@ inline region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stre
 		detail_::allocate_in_current_context(size_in_bytes);
 }
 
-namespace async {
+#endif // CUDA_VERSION >= 11020
 
-inline void free(const stream_t& stream, void* region_start)
+#if CUDA_VERSION >= 11020
+inline void free(void* region_start, optional_ref<const stream_t> stream)
+#else
+inline void free(void* region_start)
+#endif // CUDA_VERSION >= 11020
 {
-	return detail_::free(stream.context().handle(), stream.handle(), region_start);
+	auto cch = context::current::detail_::get_handle();
+#if CUDA_VERSION >= 11020
+	if (stream) {
+		detail_::free(cch, region_start, stream->handle());
+		return; // the free was handed to the stream; falling through would free the region a second time
+	}
+#endif
+	detail_::free(cch,region_start);
 }
-#endif // CUDA_VERSION >= 11020
+
+namespace async {
 
 template <typename T>
 inline void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream)
diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index 953c034d..a7e3f0cb 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -601,14 +601,14 @@ class stream_t {
 		///@{
 		void free(void* region_start) const
 		{
-			memory::device::async::free(associated_stream, region_start);
+			memory::device::free(region_start, associated_stream);
 		}
 
 		void free(memory::region_t region) const
 		{
-			memory::device::async::free(associated_stream, region);
+			memory::device::free(region, associated_stream);
 		}
-#endif
+#endif // CUDA_VERSION >= 11020
 
 		/**
 		 * Sets the attachment of a region of managed memory (i.e. in the address space visible