Skip to content

Commit

Permalink
Regards #564: CUDA 12 launch configuration structure and attributes s…
Browse files Browse the repository at this point in the history
…upport

* Now supporting CUDA-12-introduced launch attributes, including remote memory space, programmatic completion, launch completion events and clusters
* Avoiding a bit of code duplication in kernel launching (but an increase in duplication due to the unavailability of `cuLaunchKernelEx()` with attribute support before CUDA 12)
* New multi-wrapper-impl file for launch configurations - for marshalling launch attributes
  • Loading branch information
eyalroz committed Feb 3, 2024
1 parent bba75d4 commit 88a96b8
Show file tree
Hide file tree
Showing 6 changed files with 240 additions and 52 deletions.
1 change: 1 addition & 0 deletions src/cuda/api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
#include "api/multi_wrapper_impls/apriori_compiled_kernel.hpp"
#include "api/multi_wrapper_impls/module.hpp"
#include "api/multi_wrapper_impls/ipc.hpp"
#include "api/multi_wrapper_impls/launch_configuration.hpp"

#include "api/launch_config_builder.hpp"

Expand Down
39 changes: 21 additions & 18 deletions src/cuda/api/kernel_launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,19 @@ void enqueue_launch(
launch_configuration_t launch_configuration,
KernelParameters&&... parameters);

inline void enqueue_kernel_launch_by_handle_in_current_context(
kernel::handle_t kernel_function_handle,
device::id_t device_id,
context::handle_t context_handle,
stream::handle_t stream_handle,
launch_configuration_t launch_config,
const void** marshalled_arguments);

template<typename KernelFunction, typename... KernelParameters>
void enqueue_raw_kernel_launch_in_current_context(
KernelFunction&& kernel_function,
device::id_t device_id,
context::handle_t context_handle,
stream::handle_t stream_handle,
launch_configuration_t launch_configuration,
KernelParameters&&... parameters)
Expand All @@ -171,7 +181,7 @@ void enqueue_raw_kernel_launch_in_current_context(
#ifndef NDEBUG
validate(launch_configuration);
#endif
if (launch_configuration.block_cooperation == thread_blocks_may_not_cooperate) {
if (not launch_configuration.has_nondefault_attributes()) {
// regular plain vanilla launch
kernel_function <<<
launch_configuration.dimensions.grid,
Expand All @@ -184,13 +194,9 @@ void enqueue_raw_kernel_launch_in_current_context(
else {
#if CUDA_VERSION < 9000
throw cuda::runtime_error(status::not_supported,
"Only CUDA versions 9.0 and later support launching kernels \"cooperatively\"");
"Only CUDA versions 9.0 and later support launching kernels with additional"
"arguments, e.g block cooperation");
#else
// Cooperative launches cannot be made using the triple-chevron syntax,
// nor is there a variadic-template of the launch API call, so we need to
// a bit of useless work here. We could have done exactly the same thing
// for the non-cooperative case, mind you.

// The following hack is due to C++ not supporting arrays of length 0 -
// but such an array being necessary for collect_argument_addresses with
// multiple parameters. Other workarounds are possible, but would be
Expand All @@ -204,27 +210,24 @@ void enqueue_raw_kernel_launch_in_current_context(
detail_::collect_argument_addresses(argument_ptrs, ::std::forward<KernelParameters>(parameters)...);
#if CUDA_VERSION >= 11000
kernel::handle_t kernel_function_handle = kernel::apriori_compiled::detail_::get_handle( (const void*) kernel_function);
auto status = cuLaunchCooperativeKernel(
enqueue_kernel_launch_by_handle_in_current_context(
kernel_function_handle,
launch_configuration.dimensions.grid.x,
launch_configuration.dimensions.grid.y,
launch_configuration.dimensions.grid.z,
launch_configuration.dimensions.block.x,
launch_configuration.dimensions.block.y,
launch_configuration.dimensions.block.z,
launch_configuration.dynamic_shared_memory_size,
device_id,
context_handle,
stream_handle,
argument_ptrs);
#else
launch_configuration,
const_cast<const void**>(argument_ptrs));

#else // CUDA_VERSION is at least 9000 but under 11000
auto status = cudaLaunchCooperativeKernel(
(const void *) kernel_function,
(dim3)(uint3)launch_configuration.dimensions.grid,
(dim3)(uint3)launch_configuration.dimensions.block,
&argument_ptrs[0],
(size_t)launch_configuration.dynamic_shared_memory_size,
cudaStream_t(stream_handle));
throw_if_error_lazy(status, "Kernel launch failed");
#endif // CUDA_VERSION >= 11000
throw_if_error_lazy(status, "Cooperative kernel launch failed");
#endif // CUDA_VERSION >= 9000
}
}
Expand Down
9 changes: 5 additions & 4 deletions src/cuda/api/launch_config_builder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,11 @@ class launch_config_builder_t {
public:
launch_configuration_t build() const
{
auto composite_dims = get_composite_dimensions();
auto dynamic_shmem_size = get_dynamic_shared_memory_size(composite_dims.block);

return launch_configuration_t{composite_dims, dynamic_shmem_size, thread_block_cooperation};
auto result = launch_configuration_t{ get_composite_dimensions() };
result.dynamic_shared_memory_size = get_dynamic_shared_memory_size(result.dimensions.block);
result.block_cooperation = thread_block_cooperation;
// TODO: More fields!
return result;
}

protected:
Expand Down
134 changes: 110 additions & 24 deletions src/cuda/api/launch_configuration.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
/**
* @file
*
* @brief Contains the @ref launch_configuration_t class and some auxiliary
* functions for it.
* @brief Contains the class @ref `launch_configuration_t`, an enhanced child class of the
* CUlaunchConfig struct of CUDA 12.x and later, with an emulation of it for earlier CUDA
* versions.
*
* @note Launch configurations are used mostly in @ref kernel_launch.hpp .
*/
Expand Down Expand Up @@ -46,14 +47,22 @@ void validate_compatibility(const device_t &device, memory::shared::size_t share

} // namespace detail_

#if CUDA_VERSION >= 12000
/**
 * Policy for scheduling the thread blocks of a block cluster onto the GPU's
 * multiprocessors — a thin wrapper around the CUDA driver's
 * `CUclusterSchedulingPolicy` enum (introduced in CUDA 12).
 */
enum class cluster_scheduling_policy_t {
default_ = CU_CLUSTER_SCHEDULING_POLICY_DEFAULT, ///< let the driver choose the scheduling policy
spread = CU_CLUSTER_SCHEDULING_POLICY_SPREAD, ///< presumably spreads a cluster's blocks across multiprocessors — see CUDA driver docs
load_balance = CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING ///< presumably schedules so as to balance multiprocessor load — see CUDA driver docs
};
#endif

struct launch_configuration_t {
grid::composite_dimensions_t dimensions {0 , 0 };
grid::composite_dimensions_t dimensions { grid::dimensions_t{ 0u, 0u, 0u }, grid::block_dimensions_t{ 0u, 0u, 0u } };

/**
* The number of bytes each grid block may use, in addition to the statically-allocated
* shared memory data inherent in the compiled kernel.
*/
memory::shared::size_t dynamic_shared_memory_size { 0u };
memory::shared::size_t dynamic_shared_memory_size { 0u };

/**
* When true, CUDA's "cooperative launch" mechanism will be used, enabling
Expand All @@ -62,7 +71,77 @@ struct launch_configuration_t {
* groups", but you should ignore those, as they are simply C++ library constructs and do not
exist in the compiled code).
*/
bool block_cooperation { false };
bool block_cooperation { false };

#if CUDA_VERSION >= 12000
/**
* A kernel thus launched, will not await the completion of any previous launched kernel
* before its blocks begin to be scheduled. Rather, its threads will be able to use the
* `griddepcontrol.wait` PTX instruction (a.k.a. `cudaGridDependencySynchronize()`) -
* at some point during their execution - to actually wait for all previous in-flight
* kernels (= kernel grids) to conclude. This allows such a subsequent kernel to
* perform independent "preamble" tasks concurrently with the execution of its
* antecedents on the stream.
*/
bool programmatically_dependent_launch { true };

/**
* If this is specified, the pointed-to event will trigger once all kernel threads
* have issued the `griddepcontrol.launch_dependents` instruction (a.k.a. the
* `cudaTriggerProgrammaticLaunchCompletion()` function).
*
* @note This is a non-owning pointer; no @ref `event_t` is allocated or released
* while using this class. Also, the actual CUDA event must be valid and not be reused
* or destroyed until the kernel concludes and the event fires.
*
* @note this field is independent of @ref programmatically_dependent_launch , as it
* regards the _conclusion_ of the launched kernel, and future kernels which may depend
* on it, rather than the beginning of scheduling of the launched kernel and its
* dependence on antecedents.
*/
struct {
event_t* event { nullptr };
// unsigned flags; WHAT ABOUT THE FLAGS?
bool trigger_event_at_block_start { true };
} programmatic_completion;

/**
* When set to true, a GPU-scope memory synchronization will not be sufficient
* to establish memory activity order between this kernel and kernels in the default,
* or any other, memory synchronization domain - even if those kernels are launched
* on the same GPU.
*/
bool in_remote_memory_synchronization_domain { false };

/**
* Dimensions of each part in the partition of the grid blocks into clusters, which
* can pool their shared memory together.
*/
struct {
grid::dimensions_t cluster_dimensions { 1, 1, 1 };
cluster_scheduling_policy_t scheduling_policy { cluster_scheduling_policy_t::default_ };
} clustering;
#endif // CUDA_VERSION >= 12000

public: // non-mutators

/**
 * Determine whether the configuration includes launch attributes different than the default
 * values.
 *
 * @note The grid dimensions, block dimensions, and dynamic shared memory size are not
 * considered launch attributes, and their settings do not affect the result of this method.
 *
 * @return true if any launch attribute (block cooperation, and - for CUDA 12 and later -
 * programmatic dependent launch, a programmatic-completion event, remote memory
 * synchronization domain membership, or non-trivial clustering) differs from its default.
 */
bool has_nondefault_attributes() const
{
	if (block_cooperation) { return true; }
#if CUDA_VERSION >= 12000
	// The default clustering is a single 1 x 1 x 1 "point"; any OTHER cluster dimensions
	// constitute a non-default attribute. (Bug fix: this previously compared with ==,
	// inverting the check, so that default cluster dimensions were reported as
	// non-default and vice versa.)
	//
	// NOTE(review): programmatically_dependent_launch is declared above with a default
	// of { true }, which makes a default-constructed configuration report non-default
	// attributes here — verify that field's intended default value.
	return programmatically_dependent_launch
		or (programmatic_completion.event != nullptr)
		or in_remote_memory_synchronization_domain
		or clustering.cluster_dimensions != grid::dimensions_t::point();
#else
	return false;
#endif
}

// In C++11, an inline initializer for a struct's field costs us a lot
// of its defaulted constructors; but - we must initialize the shared
Expand All @@ -77,36 +156,27 @@ struct launch_configuration_t {

constexpr launch_configuration_t(
grid::composite_dimensions_t grid_and_block_dimensions,
memory::shared::size_t dynamic_shared_mem = 0u,
bool thread_block_cooperation = false
memory::shared::size_t dynamic_shared_mem = 0u
) :
dimensions{grid_and_block_dimensions},
dynamic_shared_memory_size{dynamic_shared_mem},
block_cooperation{thread_block_cooperation}
dynamic_shared_memory_size{dynamic_shared_mem}
{ }

constexpr launch_configuration_t(
grid::dimensions_t grid_dims,
grid::dimensions_t block_dims,
memory::shared::size_t dynamic_shared_mem = 0u,
bool thread_block_cooperation = false
) : launch_configuration_t(
{grid_dims, block_dims},
dynamic_shared_mem,
thread_block_cooperation)
{ }
memory::shared::size_t dynamic_shared_mem = 0u
) : launch_configuration_t( {grid_dims, block_dims}, dynamic_shared_mem) { }

// A "convenience" delegating ctor to avoid narrowing-conversion warnings
constexpr launch_configuration_t(
int grid_dims,
int block_dims,
memory::shared::size_t dynamic_shared_mem = 0u,
bool thread_block_cooperation = false
memory::shared::size_t dynamic_shared_mem = 0u
) : launch_configuration_t(
grid::dimensions_t(grid_dims),
grid::block_dimensions_t(block_dims),
dynamic_shared_mem,
thread_block_cooperation)
dynamic_shared_mem)
{ }

// These can be made constexpr in C++14
Expand All @@ -117,9 +187,15 @@ struct launch_configuration_t {
constexpr bool operator==(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept
{
return
lhs.dimensions == rhs.dimensions and
lhs.dynamic_shared_memory_size == rhs.dynamic_shared_memory_size and
lhs.block_cooperation == rhs.block_cooperation;
lhs.dimensions == rhs.dimensions
and lhs.dynamic_shared_memory_size == rhs.dynamic_shared_memory_size
and lhs.block_cooperation == rhs.block_cooperation
#if CUDA_VERSION >= 12000
and lhs.programmatically_dependent_launch == rhs.programmatically_dependent_launch
and lhs.programmatic_completion.event == rhs.programmatic_completion.event
and lhs.in_remote_memory_synchronization_domain == rhs.in_remote_memory_synchronization_domain
#endif // CUDA_VERSION >= 12000
;
}

constexpr bool operator!=(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept
Expand Down Expand Up @@ -157,8 +233,18 @@ inline void validate_compatibility(
// validate_grid_dimension_compatibility(kernel, launch_config.dimensions.grid);
}

} // namespace detail_
using launch_attribute_index_t = unsigned int;

// ensure we have the same number here as the number of attribute insertions in marshal()
constexpr launch_attribute_index_t maximum_possible_kernel_launch_attributes = 7;

// Note: The attribute_storage must have a capacity of at least maximum_possible_kernel_launch_attributes+1
CUlaunchConfig marshal(
const launch_configuration_t& config,
const stream::handle_t stream_handle,
span<CUlaunchAttribute> attribute_storage) noexcept(true);

} // namespace detail_

} // namespace cuda

Expand Down
26 changes: 20 additions & 6 deletions src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ void enqueue_launch_helper<kernel::apriori_compiled_t, KernelParameters...>::ope

detail_::enqueue_raw_kernel_launch_in_current_context(
unwrapped_kernel_function,
stream.device_id(),
stream.context_handle(),
stream.handle(),
launch_configuration,
::std::forward<KernelParameters>(parameters)...);
Expand All @@ -140,7 +142,7 @@ marshal_dynamic_kernel_arguments(KernelParameters&&... parameters)


// Note: The last (valid) element of marshalled_arguments must be null
inline void launch_type_erased_in_current_context(
inline void enqueue_kernel_launch_by_handle_in_current_context(
kernel::handle_t kernel_function_handle,
device::id_t device_id,
context::handle_t context_handle,
Expand All @@ -153,7 +155,19 @@ inline void launch_type_erased_in_current_context(
#endif
status_t status;
const auto&lc = launch_config; // alias for brevity
if (launch_config.block_cooperation)
#if CUDA_VERSION >= 12000
CUlaunchAttribute launch_attributes[detail_::maximum_possible_kernel_launch_attributes+1];
auto launch_attributes_span = span<CUlaunchAttribute>{
launch_attributes, sizeof(launch_attributes)/sizeof(launch_attributes[0])
};
CUlaunchConfig full_launch_config = detail_::marshal(lc, stream_handle, launch_attributes_span);
status = cuLaunchKernelEx(
&full_launch_config,
kernel_function_handle,
const_cast<void**>(marshalled_arguments),
nullptr);
#else
if (launch_config.has_nondefault_attributes())
status = cuLaunchCooperativeKernel(
kernel_function_handle,
lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z,
Expand All @@ -175,8 +189,8 @@ inline void launch_type_erased_in_current_context(
no_arguments_in_alternative_format
);
}
#endif // CUDA_VERSION >= 12000
throw_if_error_lazy(status,
(lc.block_cooperation ? "Cooperative " : "") +
::std::string(" kernel launch failed for ") + kernel::detail_::identify(kernel_function_handle)
+ " on " + stream::detail_::identify(stream_handle, context_handle, device_id));
}
Expand All @@ -198,7 +212,7 @@ struct enqueue_launch_helper<kernel_t, KernelParameters...> {
validate_compatibility(wrapped_kernel, launch_config);
#endif

launch_type_erased_in_current_context(
enqueue_kernel_launch_by_handle_in_current_context(
function_handle, stream.device_id(), stream.context_handle(),
stream.handle(), launch_config, marshalled_arguments.data());
}
Expand All @@ -223,7 +237,7 @@ void enqueue_launch(
// validating the configuration without the device should happen within the next function...
#endif
detail_::enqueue_raw_kernel_launch_in_current_context<RawKernelFunction, KernelParameters...>(
kernel_function, stream.handle(), launch_configuration,
kernel_function, stream.device_id(), stream.context_handle(), stream.handle(), launch_configuration,
::std::forward<KernelParameters>(parameters)...);
}

Expand Down Expand Up @@ -292,7 +306,7 @@ inline void launch_type_erased(
}
#endif
CAW_SET_SCOPE_CONTEXT(stream.context_handle());
return detail_::launch_type_erased_in_current_context(
return detail_::enqueue_kernel_launch_by_handle_in_current_context(
kernel.handle(),
stream.device_id(),
stream.context_handle(),
Expand Down
Loading

0 comments on commit 88a96b8

Please sign in to comment.