Skip to content

Commit

Permalink
Regards #564: CUDA 12 launch configuration structure and attributes s…
Browse files Browse the repository at this point in the history
…upport

* Now supporting CUDA-12-introduced launch attributes, including remote memory space, programmatic completion, launch completion events and clusters
* Avoiding a bit of code duplication in kernel launching (but an increase in duplication due to the unavailability of `cuLaunchKernelEx()` with attribute support before CUDA 12)
* New multi-wrapper-impl file for launch configurations - for marshalling launch attributes
  • Loading branch information
eyalroz committed Feb 3, 2024
1 parent bba75d4 commit 88a96b8
Show file tree
Hide file tree
Showing 6 changed files with 240 additions and 52 deletions.
1 change: 1 addition & 0 deletions src/cuda/api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
#include "api/multi_wrapper_impls/apriori_compiled_kernel.hpp"
#include "api/multi_wrapper_impls/module.hpp"
#include "api/multi_wrapper_impls/ipc.hpp"
#include "api/multi_wrapper_impls/launch_configuration.hpp"

#include "api/launch_config_builder.hpp"

Expand Down
39 changes: 21 additions & 18 deletions src/cuda/api/kernel_launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,19 @@ void enqueue_launch(
launch_configuration_t launch_configuration,
KernelParameters&&... parameters);

inline void enqueue_kernel_launch_by_handle_in_current_context(
kernel::handle_t kernel_function_handle,
device::id_t device_id,
context::handle_t context_handle,
stream::handle_t stream_handle,
launch_configuration_t launch_config,
const void** marshalled_arguments);

template<typename KernelFunction, typename... KernelParameters>
void enqueue_raw_kernel_launch_in_current_context(
KernelFunction&& kernel_function,
device::id_t device_id,
context::handle_t context_handle,
stream::handle_t stream_handle,
launch_configuration_t launch_configuration,
KernelParameters&&... parameters)
Expand All @@ -171,7 +181,7 @@ void enqueue_raw_kernel_launch_in_current_context(
#ifndef NDEBUG
validate(launch_configuration);
#endif
if (launch_configuration.block_cooperation == thread_blocks_may_not_cooperate) {
if (not launch_configuration.has_nondefault_attributes()) {
// regular plain vanilla launch
kernel_function <<<
launch_configuration.dimensions.grid,
Expand All @@ -184,13 +194,9 @@ void enqueue_raw_kernel_launch_in_current_context(
else {
#if CUDA_VERSION < 9000
throw cuda::runtime_error(status::not_supported,
"Only CUDA versions 9.0 and later support launching kernels \"cooperatively\"");
"Only CUDA versions 9.0 and later support launching kernels with additional"
"arguments, e.g block cooperation");
#else
// Cooperative launches cannot be made using the triple-chevron syntax,
// nor is there a variadic-template of the launch API call, so we need to
// a bit of useless work here. We could have done exactly the same thing
// for the non-cooperative case, mind you.

// The following hack is due to C++ not supporting arrays of length 0 -
// but such an array being necessary for collect_argument_addresses with
// multiple parameters. Other workarounds are possible, but would be
Expand All @@ -204,27 +210,24 @@ void enqueue_raw_kernel_launch_in_current_context(
detail_::collect_argument_addresses(argument_ptrs, ::std::forward<KernelParameters>(parameters)...);
#if CUDA_VERSION >= 11000
kernel::handle_t kernel_function_handle = kernel::apriori_compiled::detail_::get_handle( (const void*) kernel_function);
auto status = cuLaunchCooperativeKernel(
enqueue_kernel_launch_by_handle_in_current_context(
kernel_function_handle,
launch_configuration.dimensions.grid.x,
launch_configuration.dimensions.grid.y,
launch_configuration.dimensions.grid.z,
launch_configuration.dimensions.block.x,
launch_configuration.dimensions.block.y,
launch_configuration.dimensions.block.z,
launch_configuration.dynamic_shared_memory_size,
device_id,
context_handle,
stream_handle,
argument_ptrs);
#else
launch_configuration,
const_cast<const void**>(argument_ptrs));

#else // CUDA_VERSION is at least 9000 but under 11000
auto status = cudaLaunchCooperativeKernel(
(const void *) kernel_function,
(dim3)(uint3)launch_configuration.dimensions.grid,
(dim3)(uint3)launch_configuration.dimensions.block,
&argument_ptrs[0],
(size_t)launch_configuration.dynamic_shared_memory_size,
cudaStream_t(stream_handle));
throw_if_error_lazy(status, "Kernel launch failed");
#endif // CUDA_VERSION >= 11000
throw_if_error_lazy(status, "Cooperative kernel launch failed");
#endif // CUDA_VERSION >= 9000
}
}
Expand Down
9 changes: 5 additions & 4 deletions src/cuda/api/launch_config_builder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,11 @@ class launch_config_builder_t {
public:
launch_configuration_t build() const
{
auto composite_dims = get_composite_dimensions();
auto dynamic_shmem_size = get_dynamic_shared_memory_size(composite_dims.block);

return launch_configuration_t{composite_dims, dynamic_shmem_size, thread_block_cooperation};
auto result = launch_configuration_t{ get_composite_dimensions() };
result.dynamic_shared_memory_size = get_dynamic_shared_memory_size(result.dimensions.block);
result.block_cooperation = thread_block_cooperation;
// TODO: More fields!
return result;
}

protected:
Expand Down
134 changes: 110 additions & 24 deletions src/cuda/api/launch_configuration.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
/**
* @file
*
* @brief Contains the @ref launch_configuration_t class and some auxiliary
* functions for it.
* @brief Contains the class @ref `launch_configuration_t`, an enhanced child class of the
* CUlaunchConfig struct of CUDA 12.x and later, with an emulation of it for earlier CUDA
* versions.
*
* @note Launch configurations are used mostly in @ref kernel_launch.hpp .
*/
Expand Down Expand Up @@ -46,14 +47,22 @@ void validate_compatibility(const device_t &device, memory::shared::size_t share

} // namespace detail_

#if CUDA_VERSION >= 12000
/**
 * Policy for scheduling the thread blocks of a block cluster onto the GPU's
 * multiprocessors — a thin wrapper around the CUDA driver's
 * `CUclusterSchedulingPolicy` enum (introduced in CUDA 12).
 */
enum class cluster_scheduling_policy_t {
default_ = CU_CLUSTER_SCHEDULING_POLICY_DEFAULT, ///< let the driver choose the scheduling policy
spread = CU_CLUSTER_SCHEDULING_POLICY_SPREAD, ///< presumably spreads a cluster's blocks across multiprocessors — see CUDA driver docs
load_balance = CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING ///< presumably schedules so as to balance multiprocessor load — see CUDA driver docs
};
#endif

struct launch_configuration_t {
grid::composite_dimensions_t dimensions {0 , 0 };
grid::composite_dimensions_t dimensions { grid::dimensions_t{ 0u, 0u, 0u }, grid::block_dimensions_t{ 0u, 0u, 0u } };

/**
* The number of bytes each grid block may use, in addition to the statically-allocated
* shared memory data inherent in the compiled kernel.
*/
memory::shared::size_t dynamic_shared_memory_size { 0u };
memory::shared::size_t dynamic_shared_memory_size { 0u };

/**
* When true, CUDA's "cooperative launch" mechanism will be used, enabling
Expand All @@ -62,7 +71,77 @@ struct launch_configuration_t {
* groups", but you should ignore those, as they are simply C++ library constructs and do not
exist in the compiled code).
*/
bool block_cooperation { false };
bool block_cooperation { false };

#if CUDA_VERSION >= 12000
/**
* A kernel thus launched, will not await the completion of any previous launched kernel
* before its blocks begin to be scheduled. Rather, its threads will be able to use the
* `griddepcontrol.wait` PTX instruction (a.k.a. `cudaGridDependencySynchronize()`) -
* at some point during their execution - to actually wait for all previous in-flight
* kernels (= kernel grids) to conclude. This allows such a subsequent kernel to
* perform independent "preamble" tasks concurrently with the execution of its
* antecedents on the stream.
*/
bool programmatically_dependent_launch { true };

/**
* If this is specified, the pointed-to event will trigger once all kernel threads
* have issued the `griddepcontrol.launch_dependents` instruction (a.k.a. the
* `cudaTriggerProgrammaticLaunchCompletion()` function).
*
* @note This is a non-owning pointer; no @ref `event_t` is allocated or released
* while using this class. Also, the actual CUDA event must be valid and not be reused
* or destroyed until the kernel concludes and the event fires.
*
* @note this field is independent of @ref programmatically_dependent_launch , as it
* regards the _conclusion_ of the launched kernel, and future kernels which may depend
* on it, rather than the beginning of scheduling of the launched kernel and its
* dependence on antecedents.
*/
struct {
event_t* event { nullptr };
// unsigned flags; WHAT ABOUT THE FLAGS?
bool trigger_event_at_block_start { true };
} programmatic_completion;

/**
* When set to true, a GPU-scope memory synchronization will not be sufficient
* to establish memory activity order between this kernel and kernels in the default,
* or any other, memory synchronization domain - even if those kernels are launched
* on the same GPU.
*/
bool in_remote_memory_synchronization_domain { false };

/**
* Dimensions of each part in the partition of the grid blocks into clusters, which
* can pool their shared memory together.
*/
struct {
grid::dimensions_t cluster_dimensions { 1, 1, 1 };
cluster_scheduling_policy_t scheduling_policy { cluster_scheduling_policy_t::default_ };
} clustering;
#endif // CUDA_VERSION >= 12000

public: // non-mutators

/**
 * Determine whether the configuration includes launch attributes different than the default
 * values.
 *
 * @note The grid dimensions, block dimensions, and dynamic shared memory size are not
 * considered launch attributes, and their settings do not affect the result of this method.
 *
 * @return true if any launch attribute (block cooperation, and - for CUDA 12 and later -
 * programmatic dependent launch, a programmatic-completion event, remote memory
 * synchronization domain membership, or non-trivial clustering) differs from its default.
 */
bool has_nondefault_attributes() const
{
	if (block_cooperation) { return true; }
#if CUDA_VERSION >= 12000
	// The default clustering is a single 1 x 1 x 1 "point"; any OTHER cluster dimensions
	// constitute a non-default attribute. (Bug fix: this previously compared with ==,
	// inverting the check, so that default cluster dimensions were reported as
	// non-default and vice versa.)
	//
	// NOTE(review): programmatically_dependent_launch is declared above with a default
	// of { true }, which makes a default-constructed configuration report non-default
	// attributes here — verify that field's intended default value.
	return programmatically_dependent_launch
		or (programmatic_completion.event != nullptr)
		or in_remote_memory_synchronization_domain
		or clustering.cluster_dimensions != grid::dimensions_t::point();
#else
	return false;
#endif
}

// In C++11, an inline initializer for a struct's field costs us a lot
// of its defaulted constructors; but - we must initialize the shared
Expand All @@ -77,36 +156,27 @@ struct launch_configuration_t {

constexpr launch_configuration_t(
grid::composite_dimensions_t grid_and_block_dimensions,
memory::shared::size_t dynamic_shared_mem = 0u,
bool thread_block_cooperation = false
memory::shared::size_t dynamic_shared_mem = 0u
) :
dimensions{grid_and_block_dimensions},
dynamic_shared_memory_size{dynamic_shared_mem},
block_cooperation{thread_block_cooperation}
dynamic_shared_memory_size{dynamic_shared_mem}
{ }

constexpr launch_configuration_t(
grid::dimensions_t grid_dims,
grid::dimensions_t block_dims,
memory::shared::size_t dynamic_shared_mem = 0u,
bool thread_block_cooperation = false
) : launch_configuration_t(
{grid_dims, block_dims},
dynamic_shared_mem,
thread_block_cooperation)
{ }
memory::shared::size_t dynamic_shared_mem = 0u
) : launch_configuration_t( {grid_dims, block_dims}, dynamic_shared_mem) { }

// A "convenience" delegating ctor to avoid narrowing-conversion warnings
constexpr launch_configuration_t(
int grid_dims,
int block_dims,
memory::shared::size_t dynamic_shared_mem = 0u,
bool thread_block_cooperation = false
memory::shared::size_t dynamic_shared_mem = 0u
) : launch_configuration_t(
grid::dimensions_t(grid_dims),
grid::block_dimensions_t(block_dims),
dynamic_shared_mem,
thread_block_cooperation)
dynamic_shared_mem)
{ }

// These can be made constexpr in C++14
Expand All @@ -117,9 +187,15 @@ struct launch_configuration_t {
constexpr bool operator==(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept
{
return
lhs.dimensions == rhs.dimensions and
lhs.dynamic_shared_memory_size == rhs.dynamic_shared_memory_size and
lhs.block_cooperation == rhs.block_cooperation;
lhs.dimensions == rhs.dimensions
and lhs.dynamic_shared_memory_size == rhs.dynamic_shared_memory_size
and lhs.block_cooperation == rhs.block_cooperation
#if CUDA_VERSION >= 12000
and lhs.programmatically_dependent_launch == rhs.programmatically_dependent_launch
and lhs.programmatic_completion.event == rhs.programmatic_completion.event
and lhs.in_remote_memory_synchronization_domain == rhs.in_remote_memory_synchronization_domain
#endif // CUDA_VERSION >= 12000
;
}

constexpr bool operator!=(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept
Expand Down Expand Up @@ -157,8 +233,18 @@ inline void validate_compatibility(
// validate_grid_dimension_compatibility(kernel, launch_config.dimensions.grid);
}

} // namespace detail_
using launch_attribute_index_t = unsigned int;

// ensure we have the same number here as the number of attribute insertions in marshal()
constexpr launch_attribute_index_t maximum_possible_kernel_launch_attributes = 7;

// Note: The attribute_storage must have a capacity of at least maximum_possible_kernel_launch_attributes+1
CUlaunchConfig marshal(
const launch_configuration_t& config,
const stream::handle_t stream_handle,
span<CUlaunchAttribute> attribute_storage) noexcept(true);

} // namespace detail_

} // namespace cuda

Expand Down
26 changes: 20 additions & 6 deletions src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ void enqueue_launch_helper<kernel::apriori_compiled_t, KernelParameters...>::ope

detail_::enqueue_raw_kernel_launch_in_current_context(
unwrapped_kernel_function,
stream.device_id(),
stream.context_handle(),
stream.handle(),
launch_configuration,
::std::forward<KernelParameters>(parameters)...);
Expand All @@ -140,7 +142,7 @@ marshal_dynamic_kernel_arguments(KernelParameters&&... parameters)


// Note: The last (valid) element of marshalled_arguments must be null
inline void launch_type_erased_in_current_context(
inline void enqueue_kernel_launch_by_handle_in_current_context(
kernel::handle_t kernel_function_handle,
device::id_t device_id,
context::handle_t context_handle,
Expand All @@ -153,7 +155,19 @@ inline void launch_type_erased_in_current_context(
#endif
status_t status;
const auto&lc = launch_config; // alias for brevity
if (launch_config.block_cooperation)
#if CUDA_VERSION >= 12000
CUlaunchAttribute launch_attributes[detail_::maximum_possible_kernel_launch_attributes+1];
auto launch_attributes_span = span<CUlaunchAttribute>{
launch_attributes, sizeof(launch_attributes)/sizeof(launch_attributes[0])
};
CUlaunchConfig full_launch_config = detail_::marshal(lc, stream_handle, launch_attributes_span);
status = cuLaunchKernelEx(
&full_launch_config,
kernel_function_handle,
const_cast<void**>(marshalled_arguments),
nullptr);
#else
if (launch_config.has_nondefault_attributes())
status = cuLaunchCooperativeKernel(
kernel_function_handle,
lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z,
Expand All @@ -175,8 +189,8 @@ inline void launch_type_erased_in_current_context(
no_arguments_in_alternative_format
);
}
#endif // CUDA_VERSION >= 12000
throw_if_error_lazy(status,
(lc.block_cooperation ? "Cooperative " : "") +
::std::string(" kernel launch failed for ") + kernel::detail_::identify(kernel_function_handle)
+ " on " + stream::detail_::identify(stream_handle, context_handle, device_id));
}
Expand All @@ -198,7 +212,7 @@ struct enqueue_launch_helper<kernel_t, KernelParameters...> {
validate_compatibility(wrapped_kernel, launch_config);
#endif

launch_type_erased_in_current_context(
enqueue_kernel_launch_by_handle_in_current_context(
function_handle, stream.device_id(), stream.context_handle(),
stream.handle(), launch_config, marshalled_arguments.data());
}
Expand All @@ -223,7 +237,7 @@ void enqueue_launch(
// validating the configuration without the device should happen within the next function...
#endif
detail_::enqueue_raw_kernel_launch_in_current_context<RawKernelFunction, KernelParameters...>(
kernel_function, stream.handle(), launch_configuration,
kernel_function, stream.device_id(), stream.context_handle(), stream.handle(), launch_configuration,
::std::forward<KernelParameters>(parameters)...);
}

Expand Down Expand Up @@ -292,7 +306,7 @@ inline void launch_type_erased(
}
#endif
CAW_SET_SCOPE_CONTEXT(stream.context_handle());
return detail_::launch_type_erased_in_current_context(
return detail_::enqueue_kernel_launch_by_handle_in_current_context(
kernel.handle(),
stream.device_id(),
stream.context_handle(),
Expand Down
Loading

0 comments on commit 88a96b8

Please sign in to comment.