From 31e49d476c02fb33652e6214ca883e5472e278eb Mon Sep 17 00:00:00 2001
From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
Date: Wed, 8 Jan 2025 22:49:07 -0800
Subject: [PATCH 01/31] Make CUB NVRTC commandline arguments come from a cmake
 template (#3292)

---
 cub/test/CMakeLists.txt | 6 ++----
 cub/test/catch2_test_nvrtc.cu | 1 +
 cub/test/cmake/nvrtc_args.h.in | 6 ++++++
 3 files changed, 9 insertions(+), 4 deletions(-)
 create mode 100644 cub/test/cmake/nvrtc_args.h.in

diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt
index 17201c4704f..c86d24754de 100644
--- a/cub/test/CMakeLists.txt
+++ b/cub/test/CMakeLists.txt
@@ -227,10 +227,8 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id)
   endif() # CUB_SEPARATE_CATCH2

   if ("${test_target}" MATCHES "nvrtc")
-    target_compile_definitions(${test_target} PRIVATE NVRTC_CUB_PATH="-I${CMAKE_SOURCE_DIR}/cub")
-    target_compile_definitions(${test_target} PRIVATE NVRTC_THRUST_PATH="-I${CMAKE_SOURCE_DIR}/thrust")
-    target_compile_definitions(${test_target} PRIVATE NVRTC_LIBCUDACXX_PATH="-I${CMAKE_SOURCE_DIR}/libcudacxx/include")
-    target_compile_definitions(${test_target} PRIVATE NVRTC_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}")
+    configure_file("cmake/nvrtc_args.h.in" ${CMAKE_CURRENT_BINARY_DIR}/nvrtc_args.h)
+    target_include_directories(${test_target} PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   endif()

   if ("${test_target}" MATCHES "test.iterator")
diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu
index 01f39027ce0..71187ecc83a 100644
--- a/cub/test/catch2_test_nvrtc.cu
+++ b/cub/test/catch2_test_nvrtc.cu
@@ -31,6 +31,7 @@
 #include
 #include
+#include
 TEST_CASE("Test nvrtc", "[test][nvrtc]")
 {
diff --git a/cub/test/cmake/nvrtc_args.h.in b/cub/test/cmake/nvrtc_args.h.in
new file mode 100644
index 00000000000..215804ad0f0
--- /dev/null
+++ b/cub/test/cmake/nvrtc_args.h.in
@@ -0,0 +1,6 @@
+#pragma once
+
+const char* NVRTC_CUB_PATH = "-I@CMAKE_SOURCE_DIR@/cub";
+const char* NVRTC_THRUST_PATH = "-I@CMAKE_SOURCE_DIR@/thrust";
+const char* NVRTC_LIBCUDACXX_PATH = "-I@CMAKE_SOURCE_DIR@/libcudacxx/include";
+const char* NVRTC_CTK_PATH = "-I@CUDAToolkit_INCLUDE_DIRS@";

From 58d8893dee7e53fc034589b7b99ab67814a618da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= <158148890+caugonnet@users.noreply.github.com>
Date: Thu, 9 Jan 2025 09:42:37 +0100
Subject: [PATCH 02/31] Propose the same components (thrust, cub, libc++,
 cudax, cuda.parallel,...) in the bug report template as in the feature
 request template (#3295)

---
 .github/ISSUE_TEMPLATE/bug_report.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index 725009e6256..74de63e7a94 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -37,6 +37,11 @@ body:
        - Thrust
        - CUB
        - libcu++
+       - CUDA Experimental (cudax)
+       - cuda.cooperative (Python)
+       - cuda.parallel (Python)
+       - General CCCL
+       - Infrastructure
        - Not sure
    validations:
      required: true

From 466c0d3cefe554d884c53ac242d95b4b598da5e6 Mon Sep 17 00:00:00 2001
From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
Date: Thu, 9 Jan 2025 01:30:16 -0800
Subject: [PATCH 03/31] Use process isolation instead of default hyper-v for
 Windows.
 (#3294)

Try improving build times by using process isolation instead of hyper-v

Co-authored-by: Michael Schellenberger Costa
---
 .github/actions/workflow-run-job-windows/action.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/workflow-run-job-windows/action.yml b/.github/actions/workflow-run-job-windows/action.yml
index 805beff3446..1b5289a5a7d 100644
--- a/.github/actions/workflow-run-job-windows/action.yml
+++ b/.github/actions/workflow-run-job-windows/action.yml
@@ -50,6 +50,7 @@ runs:
         docker run \
           --mount type=bind,source="${{steps.paths.outputs.HOST_REPO}}",target="${{steps.paths.outputs.MOUNT_REPO}}" \
           --workdir "${{steps.paths.outputs.MOUNT_REPO}}" \
+          --isolation=process \
          ${{ inputs.image }} \
          powershell -c "
           [System.Environment]::SetEnvironmentVariable('AWS_ACCESS_KEY_ID','${{env.AWS_ACCESS_KEY_ID}}');

From f43dc54f8d54999d540f380b627314379ba7316d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 9 Jan 2025 11:21:10 +0100
Subject: [PATCH 04/31] [pre-commit.ci] pre-commit autoupdate (#3248)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [pre-commit.ci] pre-commit autoupdate

updates:
- [github.com/pre-commit/mirrors-clang-format: v18.1.8 → v19.1.6](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.8...v19.1.6)
- [github.com/astral-sh/ruff-pre-commit: v0.8.3 → v0.8.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.3...v0.8.6)
- [github.com/pre-commit/mirrors-mypy: v1.13.0 → v1.14.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.13.0...v1.14.1)

Co-authored-by: Michael Schellenberger Costa
---
 .pre-commit-config.yaml | 6 +-
 cub/cub/agent/agent_histogram.cuh | 2 +-
 cub/cub/agent/agent_reduce.cuh | 8 +-
 cub/cub/block/block_radix_rank.cuh | 3 +-
 cub/cub/detail/strong_load.cuh | 153 +++-------
 cub/cub/detail/strong_store.cuh | 215 ++++---------
 .../device/dispatch/dispatch_transform.cuh | 21 +-
 .../tuning/tuning_three_way_partition.cuh | 5 +-
 cub/cub/thread/thread_operators.cuh | 4 +-
 cub/cub/thread/thread_reduce.cuh | 8 +-
 .../catch2_test_device_for_each_in_extents.cu | 4 +-
 cub/test/catch2_test_device_transform.cu | 4 +-
 cub/test/test_block_radix_rank.cu | 2 +-
 .../__async/sender/basic_sender.cuh | 12 +-
 .../__async/sender/completion_signatures.cuh | 81 ++---
 .../__async/sender/continue_on.cuh | 4 +-
 .../cuda/experimental/__async/sender/cpos.cuh | 8 +-
 .../experimental/__async/sender/let_value.cuh | 5 +-
 .../__async/sender/stop_token.cuh | 4 +-
 .../experimental/__async/sender/tuple.cuh | 4 +-
 .../__memory_resource/any_resource.cuh | 8 +-
 .../__utility/basic_any/basic_any_from.cuh | 8 +-
 .../__utility/basic_any/basic_any_ptr.cuh | 11 +-
 .../__utility/basic_any/interfaces.cuh | 12 +-
 .../experimental/__utility/basic_any/iset.cuh | 4 +-
 .../experimental/__utility/basic_any/rtti.cuh | 8 +-
 .../__utility/basic_any/virtual_ptrs.cuh | 6 +-
 .../__utility/basic_any/virtual_tables.cuh | 4 +-
 cudax/test/stf/error_checks/ctx_mismatch.cu | 3 +-
 .../error_checks/data_interface_mismatch.cu | 3 +-
 .../test/stf/error_checks/double_finalize.cu | 3 +-
 cudax/test/stf/error_checks/erase_frozen.cu | 3 +-
 .../error_checks/misformed_tasks_dbl_end.cu | 3 +-
 .../error_checks/misformed_tasks_dbl_start.cu | 3 +-
 .../test/stf/error_checks/non_managed_data.cu | 3 +-
 .../stf/error_checks/slice_check_bounds.cu | 3 +-
 .../stf/error_checks/uninitialized_data.cu | 3 +-
 .../stf/error_checks/unsatisfiable_spec.cu | 3 +-
cudax/test/stf/error_checks/write_frozen.cu | 3 +- .../cuda/__barrier/barrier_block_scope.h | 57 ++-- .../cuda/__barrier/barrier_expect_tx.h | 7 +- .../cuda/__functional/address_stability.h | 4 +- .../__memcpy_async/cp_async_shared_global.h | 20 +- .../cuda/__memcpy_async/memcpy_completion.h | 8 +- .../instructions/generated/barrier_cluster.h | 25 +- .../instructions/generated/cp_async_bulk.h | 28 +- .../generated/cp_async_bulk_commit_group.h | 5 +- .../generated/cp_async_bulk_multicast.h | 13 +- .../generated/cp_async_bulk_tensor.h | 135 ++++----- .../cp_async_bulk_tensor_multicast.h | 85 +++--- .../generated/cp_async_bulk_wait_group.h | 10 +- .../generated/cp_reduce_async_bulk.h | 271 +++++++---------- .../generated/cp_reduce_async_bulk_bf16.h | 24 +- .../generated/cp_reduce_async_bulk_f16.h | 24 +- .../generated/fence_mbarrier_init.h | 5 +- .../generated/fence_proxy_alias.h | 5 +- .../generated/fence_proxy_async.h | 5 +- .../__ptx/instructions/generated/get_sreg.h | 185 ++---------- .../__ptx/instructions/generated/getctarank.h | 5 +- .../instructions/generated/mbarrier_arrive.h | 27 +- .../generated/mbarrier_arrive_expect_tx.h | 7 +- .../generated/mbarrier_arrive_no_complete.h | 7 +- .../instructions/generated/mbarrier_init.h | 5 +- .../generated/mbarrier_test_wait.h | 6 +- .../generated/mbarrier_test_wait_parity.h | 6 +- .../generated/mbarrier_try_wait.h | 13 +- .../generated/mbarrier_try_wait_parity.h | 13 +- .../__ptx/instructions/generated/red_async.h | 103 ++++--- .../__ptx/instructions/generated/st_async.h | 16 +- .../generated/tensormap_replace.h | 132 ++++---- libcudacxx/include/cuda/pipeline | 13 +- .../include/cuda/std/__atomic/types/base.h | 36 +-- .../include/cuda/std/__atomic/types/common.h | 4 +- .../include/cuda/std/__atomic/types/locked.h | 28 +- .../include/cuda/std/__atomic/types/small.h | 36 +-- .../cuda/std/__concepts/concept_macros.h | 5 +- libcudacxx/include/cuda/std/__cstddef/types.h | 2 +- libcudacxx/include/cuda/std/__cuda/chrono.h | 3 +- .../include/cuda/std/__functional/function.h | 2 +- .../include/cuda/std/__functional/mem_fn.h | 2 +- .../include/cuda/std/__iterator/access.h | 16 +- libcudacxx/include/cuda/std/__iterator/data.h | 8 +- .../include/cuda/std/__iterator/empty.h | 4 +- .../include/cuda/std/__iterator/iter_move.h | 5 +- .../cuda/std/__iterator/iterator_traits.h | 11 +- .../cuda/std/__iterator/reverse_access.h | 16 +- .../include/cuda/std/__mdspan/extents.h | 3 +- libcudacxx/include/cuda/std/__mdspan/macros.h | 10 +- libcudacxx/include/cuda/std/__mdspan/mdspan.h | 15 +- .../cuda/std/__memory/allocator_traits.h | 4 +- .../cuda/std/__memory/pointer_traits.h | 4 +- libcudacxx/include/cuda/std/__ranges/access.h | 5 +- libcudacxx/include/cuda/std/__ranges/data.h | 5 +- libcudacxx/include/cuda/std/__ranges/rend.h | 5 +- .../include/cuda/std/__ranges/subrange.h | 4 +- .../cuda/std/__thread/threading_support.h | 4 +- .../std/__thread/threading_support_cuda.h | 3 +- .../cuda/std/__type_traits/type_list.h | 4 +- .../cuda/std/detail/libcxx/include/span | 4 +- .../cuda/std/detail/libcxx/include/variant | 20 +- .../atomic.ext/atomic_fetch_max.pass.cpp | 3 +- .../atomic.ext/atomic_fetch_min.pass.cpp | 3 +- .../cuda/atomics/atomic.ext/atomic_helpers.h | 9 +- .../barrier/cp_async_bulk_tensor_1d.pass.cpp | 5 +- .../barrier/cp_async_bulk_tensor_2d.pass.cpp | 5 +- .../barrier/cp_async_bulk_tensor_3d.pass.cpp | 5 +- .../barrier/cp_async_bulk_tensor_4d.pass.cpp | 5 +- .../barrier/cp_async_bulk_tensor_5d.pass.cpp | 5 +- .../test/libcudacxx/cuda/memcpy_async.h | 18 +- 
.../cuda/memcpy_async/group_memcpy_async.h | 18 +- ...ne_memcpy_async_producer_consumer.pass.cpp | 3 +- ...peline_memcpy_async_thread_scope_generic.h | 15 +- .../atomics.types.generic/bool.pass.cpp | 3 +- .../floating_point.pass.cpp | 3 +- .../floating_point_ref.pass.cpp | 3 +- .../floating_point_ref_constness.pass.cpp | 3 +- .../integral/1b_integral_cuda.pass.cpp | 3 +- .../integral/1b_integral_std.pass.cpp | 3 +- .../integral/2b_integral_cuda.pass.cpp | 3 +- .../integral/2b_integral_std.pass.cpp | 3 +- .../integral/4b_integral_cuda.pass.cpp | 3 +- .../integral/4b_integral_std.pass.cpp | 3 +- .../integral/8b_integral_cuda.pass.cpp | 3 +- .../integral/8b_integral_std.pass.cpp | 3 +- .../integral/integral_ref.pass.cpp | 3 +- .../integral/integral_ref_constness.pass.cpp | 3 +- .../atomic_helpers.h | 15 +- .../equality_comparable.compile.pass.cpp | 8 +- .../equality_comparable_with.compile.pass.cpp | 284 ++++++++---------- .../totally_ordered.pass.cpp | 8 +- .../totally_ordered_with.pass.cpp | 258 ++++++++-------- .../concepts.object/copyable.compile.pass.cpp | 8 +- .../concepts.object/movable.compile.pass.cpp | 8 +- .../concepts.object/regular.compile.pass.cpp | 16 +- .../semiregular.compile.pass.cpp | 8 +- .../array/array.creation/to_array.pass.cpp | 2 +- .../incrementable_traits.compile.pass.cpp | 12 +- .../indirectly_readable.compile.pass.cpp | 6 +- .../weakly_incrementable.compile.pass.cpp | 12 +- .../thread/thread.barrier/completion.pass.cpp | 3 +- .../func.bind_front/bind_front.pass.cpp | 4 +- .../func.invoke/invoke.pass.cpp | 4 +- .../refwrap.invoke/invoke.compile.fail.cpp | 2 +- .../meta.trans.other/common_type.pass.cpp | 3 +- .../meta.trans.other/result_of.pass.cpp | 2 +- .../meta.trans.other/result_of11.pass.cpp | 2 +- .../bitset.members/to_ullong.pass.cpp | 2 +- .../bitset.members/to_ulong.pass.cpp | 2 +- .../tuple.apply/apply_extended_types.pass.cpp | 2 +- .../utility/utility.swap/swap.pass.cpp | 4 +- .../utility/utility.swap/swap_array.pass.cpp | 4 +- .../variant.swap/swap.pass.cpp | 4 +- libcudacxx/test/support/archetypes.h | 3 +- .../test/support/charconv_test_helpers.h | 4 +- libcudacxx/test/support/concurrent_agents.h | 3 +- libcudacxx/test/support/counting_predicates.h | 8 +- libcudacxx/test/support/cuda_space_selector.h | 3 +- libcudacxx/test/support/is_transparent.h | 20 +- libcudacxx/test/support/rapid-cxx-test.h | 144 ++++----- libcudacxx/test/support/test_convertible.h | 4 +- thrust/testing/async_transform.cu | 18 +- thrust/testing/cuda/transform.cu | 8 +- thrust/testing/unittest/testframework.h | 6 +- thrust/thrust/detail/functional/actor.h | 4 +- thrust/thrust/detail/functional/operators.h | 24 +- thrust/thrust/detail/tuple_transform.h | 3 +- .../detail/type_traits/pointer_traits.h | 12 +- thrust/thrust/functional.h | 16 +- thrust/thrust/optional.h | 10 +- thrust/thrust/system/cuda/detail/transform.h | 4 +- .../type_traits/is_contiguous_iterator.h | 4 +- 171 files changed, 1361 insertions(+), 1940 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7dd411ba39b..d317e931e78 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.8 + rev: v19.1.6 hooks: - id: clang-format types_or: [file] @@ -39,7 +39,7 @@ repos: # TODO/REMINDER: add the Ruff vscode extension to the devcontainers # Ruff, the Python auto-correcting linter/formatter written in Rust - repo: 
https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.3 + rev: v0.8.6 hooks: - id: ruff # linter - id: ruff-format # formatter @@ -57,7 +57,7 @@ repos: - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.13.0' + rev: 'v1.14.1' hooks: - id: mypy additional_dependencies: [types-cachetools, numpy] diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index 21a487828ca..e454dc837b1 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -629,7 +629,7 @@ struct AgentHistogram // Set valid flags MarkValid( - is_valid, valid_samples, Int2Type{}); + is_valid, valid_samples, Int2Type < AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED > {}); // Accumulate samples if (prefer_smem) diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index 2e0d94b219c..d5e3514f369 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -382,8 +382,8 @@ struct AgentReduce even_share.template BlockInit(block_offset, block_end); return (IsAligned(d_in + block_offset, Int2Type())) - ? ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) - : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); + ? ConsumeRange(even_share, Int2Type()) + : ConsumeRange(even_share, Int2Type()); } /** @@ -396,8 +396,8 @@ struct AgentReduce even_share.template BlockInit(); return (IsAligned(d_in, Int2Type())) - ? ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) - : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); + ? ConsumeRange(even_share, Int2Type()) + : ConsumeRange(even_share, Int2Type()); } private: diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 490abb86bda..92605b5168d 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -606,8 +606,7 @@ private: { volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; - } - aliasable; + } aliasable; }; #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/detail/strong_load.cuh b/cub/cub/detail/strong_load.cuh index 61693d808e2..b6ba4bb5fc8 100644 --- a/cub/cub/detail/strong_load.cuh +++ b/cub/cub/detail/strong_load.cuh @@ -59,14 +59,14 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint4 load_relaxed(uint4 const* ptr) uint4 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), + "=r"(retval.y), + "=r"(retval.z), + "=r"(retval.w) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), + "=r"(retval.y), + "=r"(retval.z), + "=r"(retval.w) : "l"(ptr) : "memory");)); return retval; } @@ -75,14 +75,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 load_relaxed(ulonglong2 const* ulonglong2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, 
[%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");)); return retval; } @@ -91,14 +85,14 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ushort4 load_relaxed(ushort4 const* ptr) ushort4 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];" - : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];" - : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), + "=h"(retval.y), + "=h"(retval.z), + "=h"(retval.w) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), + "=h"(retval.y), + "=h"(retval.z), + "=h"(retval.w) : "l"(ptr) : "memory");)); return retval; } @@ -107,46 +101,26 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_relaxed(uint2 const* ptr) uint2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned long long load_relaxed(unsigned long long const* ptr) { unsigned long long retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u64 %0, [%1];" - : "=l"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u64 %0, [%1];" - : "=l"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u64 %0, [%1];" : "=l"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u64 %0, [%1];" : "=l"(retval) : "l"(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_relaxed(unsigned int const* ptr) { unsigned int retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");)); return retval; } @@ -154,16 +128,9 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_relaxed(unsigned int con static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned short load_relaxed(unsigned short const* ptr) { unsigned short retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u16 %0, [%1];" - : "=h"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u16 %0, [%1];" - : "=h"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u16 %0, [%1];" : "=h"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u16 %0, [%1];" : "=h"(retval) : "l"(ptr) : "memory");)); return retval; } @@ -172,24 +139,16 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned char load_relaxed(unsigned char c unsigned short retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile( - "{" - 
" .reg .u8 datum;" - " ld.relaxed.gpu.u8 datum, [%1];" - " cvt.u16.u8 %0, datum;" - "}" - : "=h"(retval) - : "l"(ptr) - : "memory");), - (asm volatile( - "{" - " .reg .u8 datum;" - " ld.cg.u8 datum, [%1];" - " cvt.u16.u8 %0, datum;" - "}" - : "=h"(retval) - : "l"(ptr) - : "memory");)); + (asm volatile("{" + " .reg .u8 datum;" + " ld.relaxed.gpu.u8 datum, [%1];" + " cvt.u16.u8 %0, datum;" + "}" : "=h"(retval) : "l"(ptr) : "memory");), + (asm volatile("{" + " .reg .u8 datum;" + " ld.cg.u8 datum, [%1];" + " cvt.u16.u8 %0, datum;" + "}" : "=h"(retval) : "l"(ptr) : "memory");)); return (unsigned char) retval; } @@ -198,14 +157,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 load_acquire(ulonglong2 const* ulonglong2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory"); + (asm volatile("ld.acquire.gpu.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory"); __threadfence();)); return retval; } @@ -215,14 +168,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_acquire(uint2 const* ptr) uint2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory"); + (asm volatile("ld.acquire.gpu.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory"); __threadfence();)); return retval; } @@ -230,17 +177,9 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_acquire(uint2 const* ptr) static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_acquire(unsigned int const* ptr) { unsigned int retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory"); - __threadfence();)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.acquire.gpu.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory"); __threadfence();)); return retval; } diff --git a/cub/cub/detail/strong_store.cuh b/cub/cub/detail/strong_store.cuh index 9b8091738db..cc0e8f60e71 100644 --- a/cub/cub/detail/strong_store.cuh +++ b/cub/cub/detail/strong_store.cuh @@ -56,98 +56,61 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(uint4* ptr, uint4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");), - (asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");)); + (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "r"(val.x), + "r"(val.y), + "r"(val.z), + "r"(val.w) : "memory");), + (asm volatile( + "st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(ulonglong2* ptr, ulonglong2 val) { - 
NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");), - (asm volatile("st.cg.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");), + (asm volatile("st.cg.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(ushort4* ptr, ushort4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");), - (asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");)); + (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "h"(val.x), + "h"(val.y), + "h"(val.z), + "h"(val.w) : "memory");), + (asm volatile( + "st.cg.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(uint2* ptr, uint2 val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");), - (asm volatile("st.cg.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");), + (asm volatile("st.cg.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned long long* ptr, unsigned long long val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");), - (asm volatile("st.cg.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");), + (asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned int* ptr, unsigned int val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");), - (asm volatile("st.cg.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");), + (asm volatile("st.cg.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned short* ptr, unsigned short val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");), - (asm volatile("st.cg.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");), + (asm volatile("st.cg.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned char* ptr, unsigned char val) @@ -158,123 +121,77 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned char* ptr, uns " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " 
st.relaxed.gpu.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");), + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");), (asm volatile("{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.cg.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");)); + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(uint4* ptr, uint4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");)); + (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "r"(val.x), + "r"(val.y), + "r"(val.z), + "r"(val.w) : "memory");), + (__threadfence(); asm volatile( + "st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(ulonglong2* ptr, ulonglong2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");)); + (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(ushort4* ptr, ushort4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");)); + (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "h"(val.x), + "h"(val.y), + "h"(val.z), + "h"(val.w) : "memory");), + (__threadfence(); asm volatile( + "st.cg.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(uint2* ptr, uint2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");)); + (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned long long* ptr, unsigned long long val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void 
store_release(unsigned int* ptr, unsigned int val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned short* ptr, unsigned short val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned char val) @@ -285,19 +202,15 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned c " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.release.gpu.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");), + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");), (__threadfence(); asm volatile( "{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.cg.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");)); + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");)); } } // namespace detail diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 386a6276dfa..fa4fa80d0ef 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -169,11 +169,10 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::st } template -_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) - -> decltype(poor_apply_impl( - ::cuda::std::forward(f), - ::cuda::std::forward(t), - ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::remove_reference_t>::value>{})) +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) -> decltype(poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::remove_reference_t>::value>{})) { return poor_apply_impl( ::cuda::std::forward(f), @@ -473,8 +472,9 @@ using needs_aligned_ptr_t = #ifdef _CUB_HAS_TRANSFORM_UBLKCP template ::value, int> = 0> -_CCCL_DEVICE _CCCL_FORCEINLINE auto select_kernel_arg( - ::cuda::std::integral_constant, kernel_arg&& arg) -> aligned_base_ptr>&& +_CCCL_DEVICE _CCCL_FORCEINLINE auto +select_kernel_arg(::cuda::std::integral_constant, kernel_arg&& arg) + -> aligned_base_ptr>&& { return ::cuda::std::move(arg.aligned_ptr); } @@ -660,10 +660,9 @@ struct dispatch_t - CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() - -> PoorExpected< - ::cuda::std:: - tuple> + CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() -> PoorExpected< + ::cuda::std:: + tuple> { using policy_t = typename ActivePolicy::algo_policy; constexpr int block_dim = policy_t::block_threads; diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh 
b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh index c6894ccbc86..3645e4b9ed7 100644 --- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh @@ -255,9 +255,8 @@ struct policy_hub typename Tuning::delay_constructor>; template - static auto select_agent_policy(long) -> - typename DefaultPolicy< - default_delay_constructor_t::pack_t>>::ThreeWayPartitionPolicy; + static auto select_agent_policy(long) -> typename DefaultPolicy< + default_delay_constructor_t::pack_t>>::ThreeWayPartitionPolicy; struct Policy800 : ChainedPolicy<800, Policy800, Policy350> { diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 7af32df392c..feef89776a9 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -391,8 +391,8 @@ struct CCCL_DEPRECATED BinaryFlip {} template - _CCCL_DEVICE auto - operator()(T&& t, U&& u) -> decltype(binary_op(::cuda::std::forward(u), ::cuda::std::forward(t))) + _CCCL_DEVICE auto operator()(T&& t, U&& u) + -> decltype(binary_op(::cuda::std::forward(u), ::cuda::std::forward(t))) { return binary_op(::cuda::std::forward(u), ::cuda::std::forward(t)); } diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 294bc449e31..d3850051ca7 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -543,8 +543,8 @@ ThreadReduceTernaryTree(const Input& input, ReductionOp reduction_op) // never reached. Protect instantion of ThreadReduceSimd with arbitrary types and operators _CCCL_TEMPLATE(typename Input, typename ReductionOp) _CCCL_REQUIRES((!cub::internal::enable_generic_simd_reduction())) -_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto -ThreadReduceSimd(const Input& input, ReductionOp) -> ::cuda::std::remove_cvref_t +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp) + -> ::cuda::std::remove_cvref_t { assert(false); return input[0]; @@ -552,8 +552,8 @@ ThreadReduceSimd(const Input& input, ReductionOp) -> ::cuda::std::remove_cvref_t _CCCL_TEMPLATE(typename Input, typename ReductionOp) _CCCL_REQUIRES((cub::internal::enable_generic_simd_reduction())) -_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto -ThreadReduceSimd(const Input& input, ReductionOp reduction_op) -> ::cuda::std::remove_cvref_t +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp reduction_op) + -> ::cuda::std::remove_cvref_t { using cub::detail::unsafe_bitcast; using T = ::cuda::std::remove_cvref_t; diff --git a/cub/test/catch2_test_device_for_each_in_extents.cu b/cub/test/catch2_test_device_for_each_in_extents.cu index 8ad75a1d0cb..3e5a6c6689a 100644 --- a/cub/test/catch2_test_device_for_each_in_extents.cu +++ b/cub/test/catch2_test_device_for_each_in_extents.cu @@ -135,8 +135,8 @@ using dimensions = cuda::std::index_sequence<3, 2, 5, 4>>; template -auto build_static_extents(IndexType, - cuda::std::index_sequence) -> cuda::std::extents +auto build_static_extents(IndexType, cuda::std::index_sequence) + -> cuda::std::extents { return {}; } diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 06f2b7c31a7..95c4794b8cf 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -166,8 +166,8 @@ struct alignas(Alignment) overaligned_addable_t return a.value == b.value; } - _CCCL_HOST_DEVICE friend auto - 
operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) -> overaligned_addable_t + _CCCL_HOST_DEVICE friend auto operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) + -> overaligned_addable_t { check(a); check(b); diff --git a/cub/test/test_block_radix_rank.cu b/cub/test/test_block_radix_rank.cu index 8c1df1a80c7..c53c6b179e3 100644 --- a/cub/test/test_block_radix_rank.cu +++ b/cub/test/test_block_radix_rank.cu @@ -310,7 +310,7 @@ void Test() Test(); Test(); - Test(cub::Int2Type<(BlockThreads % 32) == 0>{}); + Test(cub::Int2Type < (BlockThreads % 32) == 0 > {}); } int main(int argc, char** argv) diff --git a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh index 459beddee22..ae8ad239d46 100644 --- a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh +++ b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh @@ -60,8 +60,8 @@ struct receiver_defaults } template - _CUDAX_TRIVIAL_API static auto - set_stopped(__ignore, _Rcvr& __rcvr) noexcept -> __async::completion_signatures<__async::set_stopped_t()> + _CUDAX_TRIVIAL_API static auto set_stopped(__ignore, _Rcvr& __rcvr) noexcept + -> __async::completion_signatures<__async::set_stopped_t()> { __async::set_stopped(static_cast<_Rcvr&&>(__rcvr)); return {}; @@ -198,15 +198,15 @@ _CUDAX_TRIVIAL_API auto __make_opstate(_Sndr __sndr, _Rcvr __rcvr) } template -_CUDAX_TRIVIAL_API auto -__get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept -> decltype(__data.get_attrs(__sndrs...)) +_CUDAX_TRIVIAL_API auto __get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept + -> decltype(__data.get_attrs(__sndrs...)) { return __data.get_attrs(__sndrs...); } template -_CUDAX_TRIVIAL_API auto -__get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept -> decltype(__async::get_env(__sndrs...)) +_CUDAX_TRIVIAL_API auto __get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept + -> decltype(__async::get_env(__sndrs...)) { return __async::get_env(__sndrs...); } diff --git a/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh b/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh index 25d5ef04d76..868c911b1da 100644 --- a/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh +++ b/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh @@ -76,48 +76,36 @@ template class _Vy, template class _ using __transform_sig_t = decltype(__transform_sig<_Sig, _Vy, _Ey, _Sy>()); template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... _More> extern _DIAGNOSTIC<_Sigs> __transform_completion_signatures_v; template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... _More> extern __fn_t<_ERROR<_What...>>* __transform_completion_signatures_v<_ERROR<_What...>, _Vy, _Ey, _Sy, _Variant, _More...>; template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... 
_More> extern __fn_t<_Variant<__transform_sig_t<_Sigs, _Vy, _Ey, _Sy>..., _More...>>* __transform_completion_signatures_v, _Vy, _Ey, _Sy, _Variant, _More...>; template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... _More> using __transform_completion_signatures = decltype(__transform_completion_signatures_v<_Sigs, _Vy, _Ey, _Sy, _Variant, _More...>()); @@ -129,12 +117,9 @@ template <> struct __gather_sigs_fn { template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __call = __transform_completion_signatures< _Sigs, @@ -149,12 +134,9 @@ template <> struct __gather_sigs_fn { template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __call = __transform_completion_signatures< _Sigs, @@ -169,12 +151,9 @@ template <> struct __gather_sigs_fn { template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __call = __transform_completion_signatures< _Sigs, @@ -187,12 +166,9 @@ struct __gather_sigs_fn template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __gather_completion_signatures = typename __gather_sigs_fn<_WantedTag>::template __call<_Sigs, _Then, _Else, _Variant, _More...>; @@ -404,13 +380,12 @@ template auto completion(_Tag, _Args&&...) -> __csig::__sigs<_Tag(_Args...)>&; template -auto completions_of(_Sndr&&, - _Rcvr = {}) -> decltype(__csig::__to_sigs(__declval&>())); +auto completions_of(_Sndr&&, _Rcvr = {}) + -> decltype(__csig::__to_sigs(__declval&>())); template -auto eptr_completion_if() - -> _CUDA_VSTD:: - conditional_t<_PotentiallyThrowing, __csig::__sigs, __csig::__sigs<>>&; +auto eptr_completion_if() -> _CUDA_VSTD:: + conditional_t<_PotentiallyThrowing, __csig::__sigs, __csig::__sigs<>>&; } // namespace meta } // namespace cuda::experimental::__async diff --git a/cudax/include/cuda/experimental/__async/sender/continue_on.cuh b/cudax/include/cuda/experimental/__async/sender/continue_on.cuh index 9a0c142e21c..8da87a443a3 100644 --- a/cudax/include/cuda/experimental/__async/sender/continue_on.cuh +++ b/cudax/include/cuda/experimental/__async/sender/continue_on.cuh @@ -267,8 +267,8 @@ struct continue_on_t::__sndr_t }; template -_CUDAX_API auto -continue_on_t::operator()(_Sndr __sndr, _Sch __sch) const noexcept -> continue_on_t::__sndr_t<_Sndr, _Sch> +_CUDAX_API auto continue_on_t::operator()(_Sndr __sndr, _Sch __sch) const noexcept + -> continue_on_t::__sndr_t<_Sndr, _Sch> { return __sndr_t<_Sndr, _Sch>{{}, __sch, static_cast<_Sndr&&>(__sndr)}; } diff --git a/cudax/include/cuda/experimental/__async/sender/cpos.cuh b/cudax/include/cuda/experimental/__async/sender/cpos.cuh index 7f1fb383a71..dab62e7ac10 100644 --- a/cudax/include/cuda/experimental/__async/sender/cpos.cuh +++ b/cudax/include/cuda/experimental/__async/sender/cpos.cuh @@ -110,8 +110,8 @@ _CCCL_GLOBAL_CONSTANT struct set_error_t _CCCL_GLOBAL_CONSTANT struct set_stopped_t { template - _CUDAX_TRIVIAL_API auto - operator()(_Rcvr&& __rcvr) const noexcept -> decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped()) + _CUDAX_TRIVIAL_API auto operator()(_Rcvr&& 
__rcvr) const noexcept + -> decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped()) { static_assert(_CUDA_VSTD::is_same_v(__rcvr).set_stopped()), void>); static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_stopped())); @@ -119,8 +119,8 @@ _CCCL_GLOBAL_CONSTANT struct set_stopped_t } template - _CUDAX_TRIVIAL_API auto - operator()(_Rcvr* __rcvr) const noexcept -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_stopped()) + _CUDAX_TRIVIAL_API auto operator()(_Rcvr* __rcvr) const noexcept + -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_stopped()) { static_assert(_CUDA_VSTD::is_same_v(*__rcvr).set_stopped()), void>); static_assert(noexcept(static_cast<_Rcvr&&>(*__rcvr).set_stopped())); diff --git a/cudax/include/cuda/experimental/__async/sender/let_value.cuh b/cudax/include/cuda/experimental/__async/sender/let_value.cuh index 7d06e071fe0..6742a1c1d6c 100644 --- a/cudax/include/cuda/experimental/__async/sender/let_value.cuh +++ b/cudax/include/cuda/experimental/__async/sender/let_value.cuh @@ -243,8 +243,9 @@ private: _Sndr __sndr_; template - _CUDAX_API auto connect(_Rcvr __rcvr) && noexcept( - __nothrow_constructible<__opstate_t<_Rcvr, _Sndr, _Fn>, _Sndr, _Fn, _Rcvr>) -> __opstate_t<_Rcvr, _Sndr, _Fn> + _CUDAX_API auto + connect(_Rcvr __rcvr) && noexcept(__nothrow_constructible<__opstate_t<_Rcvr, _Sndr, _Fn>, _Sndr, _Fn, _Rcvr>) + -> __opstate_t<_Rcvr, _Sndr, _Fn> { return __opstate_t<_Rcvr, _Sndr, _Fn>( static_cast<_Sndr&&>(__sndr_), static_cast<_Fn&&>(__fn_), static_cast<_Rcvr&&>(__rcvr)); diff --git a/cudax/include/cuda/experimental/__async/sender/stop_token.cuh b/cudax/include/cuda/experimental/__async/sender/stop_token.cuh index 35e6d4d164a..693816dbb45 100644 --- a/cudax/include/cuda/experimental/__async/sender/stop_token.cuh +++ b/cudax/include/cuda/experimental/__async/sender/stop_token.cuh @@ -369,8 +369,8 @@ _CUDAX_API inline void inplace_stop_source::__unlock(uint8_t __old_state) const (void) __state_.store(__old_state, _CUDA_VSTD::memory_order_release); } -_CUDAX_API inline auto -inplace_stop_source::__try_lock_unless_stop_requested(bool __set_stop_requested) const noexcept -> bool +_CUDAX_API inline auto inplace_stop_source::__try_lock_unless_stop_requested(bool __set_stop_requested) const noexcept + -> bool { __stok::__spin_wait __spin; auto __old_state = __state_.load(_CUDA_VSTD::memory_order_relaxed); diff --git a/cudax/include/cuda/experimental/__async/sender/tuple.cuh b/cudax/include/cuda/experimental/__async/sender/tuple.cuh index 98a1d0997f1..0229ed8b9c7 100644 --- a/cudax/include/cuda/experimental/__async/sender/tuple.cuh +++ b/cudax/include/cuda/experimental/__async/sender/tuple.cuh @@ -65,8 +65,8 @@ struct __tupl<_CUDA_VSTD::index_sequence<_Idx...>, _Ts...> : __box<_Idx, _Ts>... template _CUDAX_TRIVIAL_API static auto __for_each(_Fn&& __fn, _Self&& __self, _Us&&... 
__us) // - noexcept((__nothrow_callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> - && ...)) -> _CUDA_VSTD::enable_if_t<(__callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)> + noexcept((__nothrow_callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)) + -> _CUDA_VSTD::enable_if_t<(__callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)> { return ( static_cast<_Fn&&>(__fn)(static_cast<_Us&&>(__us)..., static_cast<_Self&&>(__self).__box<_Idx, _Ts>::__value_), diff --git a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh index 8a42bab40ca..0e1dceff19b 100644 --- a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh @@ -80,8 +80,8 @@ struct __with_property template struct __iproperty : interface<__iproperty> { - _CUDAX_HOST_API friend auto - get_property([[maybe_unused]] const __iproperty& __obj, _Property) -> __property_result_t<_Property> + _CUDAX_HOST_API friend auto get_property([[maybe_unused]] const __iproperty& __obj, _Property) + -> __property_result_t<_Property> { if constexpr (!_CUDA_VSTD::is_same_v<__property_result_t<_Property>, void>) { @@ -268,8 +268,8 @@ template struct __with_try_get_property { template - _CUDAX_HOST_API _CCCL_NODISCARD_FRIEND auto - try_get_property(const _Derived& __self, _Property) noexcept -> __try_property_result_t<_Property> + _CUDAX_HOST_API _CCCL_NODISCARD_FRIEND auto try_get_property(const _Derived& __self, _Property) noexcept + -> __try_property_result_t<_Property> { auto __prop = __cudax::dynamic_any_cast*>(&__self); if constexpr (_CUDA_VSTD::is_same_v<__property_result_t<_Property>, void>) diff --git a/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh b/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh index 5b64dbc531d..bd481b3dea2 100644 --- a/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh +++ b/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh @@ -50,8 +50,8 @@ _CCCL_NODISCARD _CUDAX_TRIVIAL_HOST_API auto basic_any_from(_Interface<_Super>& } template