From 31e49d476c02fb33652e6214ca883e5472e278eb Mon Sep 17 00:00:00 2001
From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
Date: Wed, 8 Jan 2025 22:49:07 -0800
Subject: [PATCH 01/31] Make CUB NVRTC commandline arguments come from a cmake
 template (#3292)

---
 cub/test/CMakeLists.txt | 6 ++----
 cub/test/catch2_test_nvrtc.cu | 1 +
 cub/test/cmake/nvrtc_args.h.in | 6 ++++++
 3 files changed, 9 insertions(+), 4 deletions(-)
 create mode 100644 cub/test/cmake/nvrtc_args.h.in

diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt
index 17201c4704f..c86d24754de 100644
--- a/cub/test/CMakeLists.txt
+++ b/cub/test/CMakeLists.txt
@@ -227,10 +227,8 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id)
   endif() # CUB_SEPARATE_CATCH2

   if ("${test_target}" MATCHES "nvrtc")
-    target_compile_definitions(${test_target} PRIVATE NVRTC_CUB_PATH="-I${CMAKE_SOURCE_DIR}/cub")
-    target_compile_definitions(${test_target} PRIVATE NVRTC_THRUST_PATH="-I${CMAKE_SOURCE_DIR}/thrust")
-    target_compile_definitions(${test_target} PRIVATE NVRTC_LIBCUDACXX_PATH="-I${CMAKE_SOURCE_DIR}/libcudacxx/include")
-    target_compile_definitions(${test_target} PRIVATE NVRTC_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}")
+    configure_file("cmake/nvrtc_args.h.in" ${CMAKE_CURRENT_BINARY_DIR}/nvrtc_args.h)
+    target_include_directories(${test_target} PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   endif()

   if ("${test_target}" MATCHES "test.iterator")
diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu
index 01f39027ce0..71187ecc83a 100644
--- a/cub/test/catch2_test_nvrtc.cu
+++ b/cub/test/catch2_test_nvrtc.cu
@@ -31,6 +31,7 @@
 #include
 #include
+#include
 TEST_CASE("Test nvrtc", "[test][nvrtc]")
 {
diff --git a/cub/test/cmake/nvrtc_args.h.in b/cub/test/cmake/nvrtc_args.h.in
new file mode 100644
index 00000000000..215804ad0f0
--- /dev/null
+++ b/cub/test/cmake/nvrtc_args.h.in
@@ -0,0 +1,6 @@
+#pragma once
+
+const char* NVRTC_CUB_PATH = "-I@CMAKE_SOURCE_DIR@/cub";
+const char* NVRTC_THRUST_PATH = "-I@CMAKE_SOURCE_DIR@/thrust";
+const char* NVRTC_LIBCUDACXX_PATH = "-I@CMAKE_SOURCE_DIR@/libcudacxx/include";
+const char* NVRTC_CTK_PATH = "-I@CUDAToolkit_INCLUDE_DIRS@";

From 58d8893dee7e53fc034589b7b99ab67814a618da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= <158148890+caugonnet@users.noreply.github.com>
Date: Thu, 9 Jan 2025 09:42:37 +0100
Subject: [PATCH 02/31] Propose the same components (thrust, cub, libc++,
 cudax, cuda.parallel,...) in the bug report template as in the feature
 request template (#3295)

---
 .github/ISSUE_TEMPLATE/bug_report.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index 725009e6256..74de63e7a94 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -37,6 +37,11 @@ body:
        - Thrust
        - CUB
        - libcu++
+       - CUDA Experimental (cudax)
+       - cuda.cooperative (Python)
+       - cuda.parallel (Python)
+       - General CCCL
+       - Infrastructure
        - Not sure
    validations:
      required: true

From 466c0d3cefe554d884c53ac242d95b4b598da5e6 Mon Sep 17 00:00:00 2001
From: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
Date: Thu, 9 Jan 2025 01:30:16 -0800
Subject: [PATCH 03/31] Use process isolation instead of default hyper-v for
 Windows.
 (#3294)

Try improving build times by using process isolation instead of hyper-v

Co-authored-by: Michael Schellenberger Costa
---
 .github/actions/workflow-run-job-windows/action.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/workflow-run-job-windows/action.yml b/.github/actions/workflow-run-job-windows/action.yml
index 805beff3446..1b5289a5a7d 100644
--- a/.github/actions/workflow-run-job-windows/action.yml
+++ b/.github/actions/workflow-run-job-windows/action.yml
@@ -50,6 +50,7 @@ runs:
         docker run \
           --mount type=bind,source="${{steps.paths.outputs.HOST_REPO}}",target="${{steps.paths.outputs.MOUNT_REPO}}" \
           --workdir "${{steps.paths.outputs.MOUNT_REPO}}" \
+          --isolation=process \
          ${{ inputs.image }} \
          powershell -c "
           [System.Environment]::SetEnvironmentVariable('AWS_ACCESS_KEY_ID','${{env.AWS_ACCESS_KEY_ID}}');

From f43dc54f8d54999d540f380b627314379ba7316d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 9 Jan 2025 11:21:10 +0100
Subject: [PATCH 04/31] [pre-commit.ci] pre-commit autoupdate (#3248)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [pre-commit.ci] pre-commit autoupdate

updates:
- [github.com/pre-commit/mirrors-clang-format: v18.1.8 → v19.1.6](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.8...v19.1.6)
- [github.com/astral-sh/ruff-pre-commit: v0.8.3 → v0.8.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.3...v0.8.6)
- [github.com/pre-commit/mirrors-mypy: v1.13.0 → v1.14.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.13.0...v1.14.1)

Co-authored-by: Michael Schellenberger Costa
---
 .pre-commit-config.yaml | 6 +-
 cub/cub/agent/agent_histogram.cuh | 2 +-
 cub/cub/agent/agent_reduce.cuh | 8 +-
 cub/cub/block/block_radix_rank.cuh | 3 +-
 cub/cub/detail/strong_load.cuh | 153 +++-------
 cub/cub/detail/strong_store.cuh | 215 ++++---------
 .../device/dispatch/dispatch_transform.cuh | 21 +-
 .../tuning/tuning_three_way_partition.cuh | 5 +-
 cub/cub/thread/thread_operators.cuh | 4 +-
 cub/cub/thread/thread_reduce.cuh | 8 +-
 .../catch2_test_device_for_each_in_extents.cu | 4 +-
 cub/test/catch2_test_device_transform.cu | 4 +-
 cub/test/test_block_radix_rank.cu | 2 +-
 .../__async/sender/basic_sender.cuh | 12 +-
 .../__async/sender/completion_signatures.cuh | 81 ++---
 .../__async/sender/continue_on.cuh | 4 +-
 .../cuda/experimental/__async/sender/cpos.cuh | 8 +-
 .../experimental/__async/sender/let_value.cuh | 5 +-
 .../__async/sender/stop_token.cuh | 4 +-
 .../experimental/__async/sender/tuple.cuh | 4 +-
 .../__memory_resource/any_resource.cuh | 8 +-
 .../__utility/basic_any/basic_any_from.cuh | 8 +-
 .../__utility/basic_any/basic_any_ptr.cuh | 11 +-
 .../__utility/basic_any/interfaces.cuh | 12 +-
 .../experimental/__utility/basic_any/iset.cuh | 4 +-
 .../experimental/__utility/basic_any/rtti.cuh | 8 +-
 .../__utility/basic_any/virtual_ptrs.cuh | 6 +-
 .../__utility/basic_any/virtual_tables.cuh | 4 +-
 cudax/test/stf/error_checks/ctx_mismatch.cu | 3 +-
 .../error_checks/data_interface_mismatch.cu | 3 +-
 .../test/stf/error_checks/double_finalize.cu | 3 +-
 cudax/test/stf/error_checks/erase_frozen.cu | 3 +-
 .../error_checks/misformed_tasks_dbl_end.cu | 3 +-
 .../error_checks/misformed_tasks_dbl_start.cu | 3 +-
 .../test/stf/error_checks/non_managed_data.cu | 3 +-
 .../stf/error_checks/slice_check_bounds.cu | 3 +-
 .../stf/error_checks/uninitialized_data.cu | 3 +-
 .../stf/error_checks/unsatisfiable_spec.cu | 3 +-
cudax/test/stf/error_checks/write_frozen.cu | 3 +- .../cuda/__barrier/barrier_block_scope.h | 57 ++-- .../cuda/__barrier/barrier_expect_tx.h | 7 +- .../cuda/__functional/address_stability.h | 4 +- .../__memcpy_async/cp_async_shared_global.h | 20 +- .../cuda/__memcpy_async/memcpy_completion.h | 8 +- .../instructions/generated/barrier_cluster.h | 25 +- .../instructions/generated/cp_async_bulk.h | 28 +- .../generated/cp_async_bulk_commit_group.h | 5 +- .../generated/cp_async_bulk_multicast.h | 13 +- .../generated/cp_async_bulk_tensor.h | 135 ++++----- .../cp_async_bulk_tensor_multicast.h | 85 +++--- .../generated/cp_async_bulk_wait_group.h | 10 +- .../generated/cp_reduce_async_bulk.h | 271 +++++++---------- .../generated/cp_reduce_async_bulk_bf16.h | 24 +- .../generated/cp_reduce_async_bulk_f16.h | 24 +- .../generated/fence_mbarrier_init.h | 5 +- .../generated/fence_proxy_alias.h | 5 +- .../generated/fence_proxy_async.h | 5 +- .../__ptx/instructions/generated/get_sreg.h | 185 ++---------- .../__ptx/instructions/generated/getctarank.h | 5 +- .../instructions/generated/mbarrier_arrive.h | 27 +- .../generated/mbarrier_arrive_expect_tx.h | 7 +- .../generated/mbarrier_arrive_no_complete.h | 7 +- .../instructions/generated/mbarrier_init.h | 5 +- .../generated/mbarrier_test_wait.h | 6 +- .../generated/mbarrier_test_wait_parity.h | 6 +- .../generated/mbarrier_try_wait.h | 13 +- .../generated/mbarrier_try_wait_parity.h | 13 +- .../__ptx/instructions/generated/red_async.h | 103 ++++--- .../__ptx/instructions/generated/st_async.h | 16 +- .../generated/tensormap_replace.h | 132 ++++---- libcudacxx/include/cuda/pipeline | 13 +- .../include/cuda/std/__atomic/types/base.h | 36 +-- .../include/cuda/std/__atomic/types/common.h | 4 +- .../include/cuda/std/__atomic/types/locked.h | 28 +- .../include/cuda/std/__atomic/types/small.h | 36 +-- .../cuda/std/__concepts/concept_macros.h | 5 +- libcudacxx/include/cuda/std/__cstddef/types.h | 2 +- libcudacxx/include/cuda/std/__cuda/chrono.h | 3 +- .../include/cuda/std/__functional/function.h | 2 +- .../include/cuda/std/__functional/mem_fn.h | 2 +- .../include/cuda/std/__iterator/access.h | 16 +- libcudacxx/include/cuda/std/__iterator/data.h | 8 +- .../include/cuda/std/__iterator/empty.h | 4 +- .../include/cuda/std/__iterator/iter_move.h | 5 +- .../cuda/std/__iterator/iterator_traits.h | 11 +- .../cuda/std/__iterator/reverse_access.h | 16 +- .../include/cuda/std/__mdspan/extents.h | 3 +- libcudacxx/include/cuda/std/__mdspan/macros.h | 10 +- libcudacxx/include/cuda/std/__mdspan/mdspan.h | 15 +- .../cuda/std/__memory/allocator_traits.h | 4 +- .../cuda/std/__memory/pointer_traits.h | 4 +- libcudacxx/include/cuda/std/__ranges/access.h | 5 +- libcudacxx/include/cuda/std/__ranges/data.h | 5 +- libcudacxx/include/cuda/std/__ranges/rend.h | 5 +- .../include/cuda/std/__ranges/subrange.h | 4 +- .../cuda/std/__thread/threading_support.h | 4 +- .../std/__thread/threading_support_cuda.h | 3 +- .../cuda/std/__type_traits/type_list.h | 4 +- .../cuda/std/detail/libcxx/include/span | 4 +- .../cuda/std/detail/libcxx/include/variant | 20 +- .../atomic.ext/atomic_fetch_max.pass.cpp | 3 +- .../atomic.ext/atomic_fetch_min.pass.cpp | 3 +- .../cuda/atomics/atomic.ext/atomic_helpers.h | 9 +- .../barrier/cp_async_bulk_tensor_1d.pass.cpp | 5 +- .../barrier/cp_async_bulk_tensor_2d.pass.cpp | 5 +- .../barrier/cp_async_bulk_tensor_3d.pass.cpp | 5 +- .../barrier/cp_async_bulk_tensor_4d.pass.cpp | 5 +- .../barrier/cp_async_bulk_tensor_5d.pass.cpp | 5 +- .../test/libcudacxx/cuda/memcpy_async.h | 18 +- 
.../cuda/memcpy_async/group_memcpy_async.h | 18 +- ...ne_memcpy_async_producer_consumer.pass.cpp | 3 +- ...peline_memcpy_async_thread_scope_generic.h | 15 +- .../atomics.types.generic/bool.pass.cpp | 3 +- .../floating_point.pass.cpp | 3 +- .../floating_point_ref.pass.cpp | 3 +- .../floating_point_ref_constness.pass.cpp | 3 +- .../integral/1b_integral_cuda.pass.cpp | 3 +- .../integral/1b_integral_std.pass.cpp | 3 +- .../integral/2b_integral_cuda.pass.cpp | 3 +- .../integral/2b_integral_std.pass.cpp | 3 +- .../integral/4b_integral_cuda.pass.cpp | 3 +- .../integral/4b_integral_std.pass.cpp | 3 +- .../integral/8b_integral_cuda.pass.cpp | 3 +- .../integral/8b_integral_std.pass.cpp | 3 +- .../integral/integral_ref.pass.cpp | 3 +- .../integral/integral_ref_constness.pass.cpp | 3 +- .../atomic_helpers.h | 15 +- .../equality_comparable.compile.pass.cpp | 8 +- .../equality_comparable_with.compile.pass.cpp | 284 ++++++++---------- .../totally_ordered.pass.cpp | 8 +- .../totally_ordered_with.pass.cpp | 258 ++++++++-------- .../concepts.object/copyable.compile.pass.cpp | 8 +- .../concepts.object/movable.compile.pass.cpp | 8 +- .../concepts.object/regular.compile.pass.cpp | 16 +- .../semiregular.compile.pass.cpp | 8 +- .../array/array.creation/to_array.pass.cpp | 2 +- .../incrementable_traits.compile.pass.cpp | 12 +- .../indirectly_readable.compile.pass.cpp | 6 +- .../weakly_incrementable.compile.pass.cpp | 12 +- .../thread/thread.barrier/completion.pass.cpp | 3 +- .../func.bind_front/bind_front.pass.cpp | 4 +- .../func.invoke/invoke.pass.cpp | 4 +- .../refwrap.invoke/invoke.compile.fail.cpp | 2 +- .../meta.trans.other/common_type.pass.cpp | 3 +- .../meta.trans.other/result_of.pass.cpp | 2 +- .../meta.trans.other/result_of11.pass.cpp | 2 +- .../bitset.members/to_ullong.pass.cpp | 2 +- .../bitset.members/to_ulong.pass.cpp | 2 +- .../tuple.apply/apply_extended_types.pass.cpp | 2 +- .../utility/utility.swap/swap.pass.cpp | 4 +- .../utility/utility.swap/swap_array.pass.cpp | 4 +- .../variant.swap/swap.pass.cpp | 4 +- libcudacxx/test/support/archetypes.h | 3 +- .../test/support/charconv_test_helpers.h | 4 +- libcudacxx/test/support/concurrent_agents.h | 3 +- libcudacxx/test/support/counting_predicates.h | 8 +- libcudacxx/test/support/cuda_space_selector.h | 3 +- libcudacxx/test/support/is_transparent.h | 20 +- libcudacxx/test/support/rapid-cxx-test.h | 144 ++++----- libcudacxx/test/support/test_convertible.h | 4 +- thrust/testing/async_transform.cu | 18 +- thrust/testing/cuda/transform.cu | 8 +- thrust/testing/unittest/testframework.h | 6 +- thrust/thrust/detail/functional/actor.h | 4 +- thrust/thrust/detail/functional/operators.h | 24 +- thrust/thrust/detail/tuple_transform.h | 3 +- .../detail/type_traits/pointer_traits.h | 12 +- thrust/thrust/functional.h | 16 +- thrust/thrust/optional.h | 10 +- thrust/thrust/system/cuda/detail/transform.h | 4 +- .../type_traits/is_contiguous_iterator.h | 4 +- 171 files changed, 1361 insertions(+), 1940 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7dd411ba39b..d317e931e78 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.8 + rev: v19.1.6 hooks: - id: clang-format types_or: [file] @@ -39,7 +39,7 @@ repos: # TODO/REMINDER: add the Ruff vscode extension to the devcontainers # Ruff, the Python auto-correcting linter/formatter written in Rust - repo: 
https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.3 + rev: v0.8.6 hooks: - id: ruff # linter - id: ruff-format # formatter @@ -57,7 +57,7 @@ repos: - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.13.0' + rev: 'v1.14.1' hooks: - id: mypy additional_dependencies: [types-cachetools, numpy] diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index 21a487828ca..e454dc837b1 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -629,7 +629,7 @@ struct AgentHistogram // Set valid flags MarkValid( - is_valid, valid_samples, Int2Type{}); + is_valid, valid_samples, Int2Type < AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED > {}); // Accumulate samples if (prefer_smem) diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index 2e0d94b219c..d5e3514f369 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -382,8 +382,8 @@ struct AgentReduce even_share.template BlockInit(block_offset, block_end); return (IsAligned(d_in + block_offset, Int2Type())) - ? ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) - : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); + ? ConsumeRange(even_share, Int2Type()) + : ConsumeRange(even_share, Int2Type()); } /** @@ -396,8 +396,8 @@ struct AgentReduce even_share.template BlockInit(); return (IsAligned(d_in, Int2Type())) - ? ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) - : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); + ? ConsumeRange(even_share, Int2Type()) + : ConsumeRange(even_share, Int2Type()); } private: diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 490abb86bda..92605b5168d 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -606,8 +606,7 @@ private: { volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; - } - aliasable; + } aliasable; }; #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/detail/strong_load.cuh b/cub/cub/detail/strong_load.cuh index 61693d808e2..b6ba4bb5fc8 100644 --- a/cub/cub/detail/strong_load.cuh +++ b/cub/cub/detail/strong_load.cuh @@ -59,14 +59,14 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint4 load_relaxed(uint4 const* ptr) uint4 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), + "=r"(retval.y), + "=r"(retval.z), + "=r"(retval.w) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), + "=r"(retval.y), + "=r"(retval.z), + "=r"(retval.w) : "l"(ptr) : "memory");)); return retval; } @@ -75,14 +75,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 load_relaxed(ulonglong2 const* ulonglong2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, 
[%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");)); return retval; } @@ -91,14 +85,14 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ushort4 load_relaxed(ushort4 const* ptr) ushort4 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];" - : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];" - : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), + "=h"(retval.y), + "=h"(retval.z), + "=h"(retval.w) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), + "=h"(retval.y), + "=h"(retval.z), + "=h"(retval.w) : "l"(ptr) : "memory");)); return retval; } @@ -107,46 +101,26 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_relaxed(uint2 const* ptr) uint2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned long long load_relaxed(unsigned long long const* ptr) { unsigned long long retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u64 %0, [%1];" - : "=l"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u64 %0, [%1];" - : "=l"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u64 %0, [%1];" : "=l"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u64 %0, [%1];" : "=l"(retval) : "l"(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_relaxed(unsigned int const* ptr) { unsigned int retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");)); return retval; } @@ -154,16 +128,9 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_relaxed(unsigned int con static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned short load_relaxed(unsigned short const* ptr) { unsigned short retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u16 %0, [%1];" - : "=h"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u16 %0, [%1];" - : "=h"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u16 %0, [%1];" : "=h"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u16 %0, [%1];" : "=h"(retval) : "l"(ptr) : "memory");)); return retval; } @@ -172,24 +139,16 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned char load_relaxed(unsigned char c unsigned short retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile( - "{" - 
" .reg .u8 datum;" - " ld.relaxed.gpu.u8 datum, [%1];" - " cvt.u16.u8 %0, datum;" - "}" - : "=h"(retval) - : "l"(ptr) - : "memory");), - (asm volatile( - "{" - " .reg .u8 datum;" - " ld.cg.u8 datum, [%1];" - " cvt.u16.u8 %0, datum;" - "}" - : "=h"(retval) - : "l"(ptr) - : "memory");)); + (asm volatile("{" + " .reg .u8 datum;" + " ld.relaxed.gpu.u8 datum, [%1];" + " cvt.u16.u8 %0, datum;" + "}" : "=h"(retval) : "l"(ptr) : "memory");), + (asm volatile("{" + " .reg .u8 datum;" + " ld.cg.u8 datum, [%1];" + " cvt.u16.u8 %0, datum;" + "}" : "=h"(retval) : "l"(ptr) : "memory");)); return (unsigned char) retval; } @@ -198,14 +157,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 load_acquire(ulonglong2 const* ulonglong2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory"); + (asm volatile("ld.acquire.gpu.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory"); __threadfence();)); return retval; } @@ -215,14 +168,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_acquire(uint2 const* ptr) uint2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory"); + (asm volatile("ld.acquire.gpu.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory"); __threadfence();)); return retval; } @@ -230,17 +177,9 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_acquire(uint2 const* ptr) static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_acquire(unsigned int const* ptr) { unsigned int retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory"); - __threadfence();)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.acquire.gpu.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory"); __threadfence();)); return retval; } diff --git a/cub/cub/detail/strong_store.cuh b/cub/cub/detail/strong_store.cuh index 9b8091738db..cc0e8f60e71 100644 --- a/cub/cub/detail/strong_store.cuh +++ b/cub/cub/detail/strong_store.cuh @@ -56,98 +56,61 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(uint4* ptr, uint4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");), - (asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");)); + (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "r"(val.x), + "r"(val.y), + "r"(val.z), + "r"(val.w) : "memory");), + (asm volatile( + "st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(ulonglong2* ptr, ulonglong2 val) { - 
NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");), - (asm volatile("st.cg.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");), + (asm volatile("st.cg.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(ushort4* ptr, ushort4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");), - (asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");)); + (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "h"(val.x), + "h"(val.y), + "h"(val.z), + "h"(val.w) : "memory");), + (asm volatile( + "st.cg.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(uint2* ptr, uint2 val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");), - (asm volatile("st.cg.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");), + (asm volatile("st.cg.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned long long* ptr, unsigned long long val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");), - (asm volatile("st.cg.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");), + (asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned int* ptr, unsigned int val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");), - (asm volatile("st.cg.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");), + (asm volatile("st.cg.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned short* ptr, unsigned short val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");), - (asm volatile("st.cg.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");), + (asm volatile("st.cg.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned char* ptr, unsigned char val) @@ -158,123 +121,77 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned char* ptr, uns " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " 
st.relaxed.gpu.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");), + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");), (asm volatile("{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.cg.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");)); + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(uint4* ptr, uint4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");)); + (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "r"(val.x), + "r"(val.y), + "r"(val.z), + "r"(val.w) : "memory");), + (__threadfence(); asm volatile( + "st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(ulonglong2* ptr, ulonglong2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");)); + (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(ushort4* ptr, ushort4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");)); + (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "h"(val.x), + "h"(val.y), + "h"(val.z), + "h"(val.w) : "memory");), + (__threadfence(); asm volatile( + "st.cg.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(uint2* ptr, uint2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");)); + (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned long long* ptr, unsigned long long val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void 
store_release(unsigned int* ptr, unsigned int val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned short* ptr, unsigned short val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned char val) @@ -285,19 +202,15 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned c " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.release.gpu.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");), + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");), (__threadfence(); asm volatile( "{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.cg.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");)); + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");)); } } // namespace detail diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 386a6276dfa..fa4fa80d0ef 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -169,11 +169,10 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::st } template -_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) - -> decltype(poor_apply_impl( - ::cuda::std::forward(f), - ::cuda::std::forward(t), - ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::remove_reference_t>::value>{})) +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) -> decltype(poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::remove_reference_t>::value>{})) { return poor_apply_impl( ::cuda::std::forward(f), @@ -473,8 +472,9 @@ using needs_aligned_ptr_t = #ifdef _CUB_HAS_TRANSFORM_UBLKCP template ::value, int> = 0> -_CCCL_DEVICE _CCCL_FORCEINLINE auto select_kernel_arg( - ::cuda::std::integral_constant, kernel_arg&& arg) -> aligned_base_ptr>&& +_CCCL_DEVICE _CCCL_FORCEINLINE auto +select_kernel_arg(::cuda::std::integral_constant, kernel_arg&& arg) + -> aligned_base_ptr>&& { return ::cuda::std::move(arg.aligned_ptr); } @@ -660,10 +660,9 @@ struct dispatch_t - CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() - -> PoorExpected< - ::cuda::std:: - tuple> + CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() -> PoorExpected< + ::cuda::std:: + tuple> { using policy_t = typename ActivePolicy::algo_policy; constexpr int block_dim = policy_t::block_threads; diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh 
b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh index c6894ccbc86..3645e4b9ed7 100644 --- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh @@ -255,9 +255,8 @@ struct policy_hub typename Tuning::delay_constructor>; template - static auto select_agent_policy(long) -> - typename DefaultPolicy< - default_delay_constructor_t::pack_t>>::ThreeWayPartitionPolicy; + static auto select_agent_policy(long) -> typename DefaultPolicy< + default_delay_constructor_t::pack_t>>::ThreeWayPartitionPolicy; struct Policy800 : ChainedPolicy<800, Policy800, Policy350> { diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 7af32df392c..feef89776a9 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -391,8 +391,8 @@ struct CCCL_DEPRECATED BinaryFlip {} template - _CCCL_DEVICE auto - operator()(T&& t, U&& u) -> decltype(binary_op(::cuda::std::forward(u), ::cuda::std::forward(t))) + _CCCL_DEVICE auto operator()(T&& t, U&& u) + -> decltype(binary_op(::cuda::std::forward(u), ::cuda::std::forward(t))) { return binary_op(::cuda::std::forward(u), ::cuda::std::forward(t)); } diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 294bc449e31..d3850051ca7 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -543,8 +543,8 @@ ThreadReduceTernaryTree(const Input& input, ReductionOp reduction_op) // never reached. Protect instantion of ThreadReduceSimd with arbitrary types and operators _CCCL_TEMPLATE(typename Input, typename ReductionOp) _CCCL_REQUIRES((!cub::internal::enable_generic_simd_reduction())) -_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto -ThreadReduceSimd(const Input& input, ReductionOp) -> ::cuda::std::remove_cvref_t +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp) + -> ::cuda::std::remove_cvref_t { assert(false); return input[0]; @@ -552,8 +552,8 @@ ThreadReduceSimd(const Input& input, ReductionOp) -> ::cuda::std::remove_cvref_t _CCCL_TEMPLATE(typename Input, typename ReductionOp) _CCCL_REQUIRES((cub::internal::enable_generic_simd_reduction())) -_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto -ThreadReduceSimd(const Input& input, ReductionOp reduction_op) -> ::cuda::std::remove_cvref_t +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp reduction_op) + -> ::cuda::std::remove_cvref_t { using cub::detail::unsafe_bitcast; using T = ::cuda::std::remove_cvref_t; diff --git a/cub/test/catch2_test_device_for_each_in_extents.cu b/cub/test/catch2_test_device_for_each_in_extents.cu index 8ad75a1d0cb..3e5a6c6689a 100644 --- a/cub/test/catch2_test_device_for_each_in_extents.cu +++ b/cub/test/catch2_test_device_for_each_in_extents.cu @@ -135,8 +135,8 @@ using dimensions = cuda::std::index_sequence<3, 2, 5, 4>>; template -auto build_static_extents(IndexType, - cuda::std::index_sequence) -> cuda::std::extents +auto build_static_extents(IndexType, cuda::std::index_sequence) + -> cuda::std::extents { return {}; } diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 06f2b7c31a7..95c4794b8cf 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -166,8 +166,8 @@ struct alignas(Alignment) overaligned_addable_t return a.value == b.value; } - _CCCL_HOST_DEVICE friend auto - 
operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) -> overaligned_addable_t + _CCCL_HOST_DEVICE friend auto operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) + -> overaligned_addable_t { check(a); check(b); diff --git a/cub/test/test_block_radix_rank.cu b/cub/test/test_block_radix_rank.cu index 8c1df1a80c7..c53c6b179e3 100644 --- a/cub/test/test_block_radix_rank.cu +++ b/cub/test/test_block_radix_rank.cu @@ -310,7 +310,7 @@ void Test() Test(); Test(); - Test(cub::Int2Type<(BlockThreads % 32) == 0>{}); + Test(cub::Int2Type < (BlockThreads % 32) == 0 > {}); } int main(int argc, char** argv) diff --git a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh index 459beddee22..ae8ad239d46 100644 --- a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh +++ b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh @@ -60,8 +60,8 @@ struct receiver_defaults } template - _CUDAX_TRIVIAL_API static auto - set_stopped(__ignore, _Rcvr& __rcvr) noexcept -> __async::completion_signatures<__async::set_stopped_t()> + _CUDAX_TRIVIAL_API static auto set_stopped(__ignore, _Rcvr& __rcvr) noexcept + -> __async::completion_signatures<__async::set_stopped_t()> { __async::set_stopped(static_cast<_Rcvr&&>(__rcvr)); return {}; @@ -198,15 +198,15 @@ _CUDAX_TRIVIAL_API auto __make_opstate(_Sndr __sndr, _Rcvr __rcvr) } template -_CUDAX_TRIVIAL_API auto -__get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept -> decltype(__data.get_attrs(__sndrs...)) +_CUDAX_TRIVIAL_API auto __get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept + -> decltype(__data.get_attrs(__sndrs...)) { return __data.get_attrs(__sndrs...); } template -_CUDAX_TRIVIAL_API auto -__get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept -> decltype(__async::get_env(__sndrs...)) +_CUDAX_TRIVIAL_API auto __get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept + -> decltype(__async::get_env(__sndrs...)) { return __async::get_env(__sndrs...); } diff --git a/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh b/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh index 25d5ef04d76..868c911b1da 100644 --- a/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh +++ b/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh @@ -76,48 +76,36 @@ template class _Vy, template class _ using __transform_sig_t = decltype(__transform_sig<_Sig, _Vy, _Ey, _Sy>()); template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... _More> extern _DIAGNOSTIC<_Sigs> __transform_completion_signatures_v; template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... _More> extern __fn_t<_ERROR<_What...>>* __transform_completion_signatures_v<_ERROR<_What...>, _Vy, _Ey, _Sy, _Variant, _More...>; template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... 
_More> extern __fn_t<_Variant<__transform_sig_t<_Sigs, _Vy, _Ey, _Sy>..., _More...>>* __transform_completion_signatures_v, _Vy, _Ey, _Sy, _Variant, _More...>; template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... _More> using __transform_completion_signatures = decltype(__transform_completion_signatures_v<_Sigs, _Vy, _Ey, _Sy, _Variant, _More...>()); @@ -129,12 +117,9 @@ template <> struct __gather_sigs_fn { template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __call = __transform_completion_signatures< _Sigs, @@ -149,12 +134,9 @@ template <> struct __gather_sigs_fn { template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __call = __transform_completion_signatures< _Sigs, @@ -169,12 +151,9 @@ template <> struct __gather_sigs_fn { template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __call = __transform_completion_signatures< _Sigs, @@ -187,12 +166,9 @@ struct __gather_sigs_fn template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __gather_completion_signatures = typename __gather_sigs_fn<_WantedTag>::template __call<_Sigs, _Then, _Else, _Variant, _More...>; @@ -404,13 +380,12 @@ template auto completion(_Tag, _Args&&...) -> __csig::__sigs<_Tag(_Args...)>&; template -auto completions_of(_Sndr&&, - _Rcvr = {}) -> decltype(__csig::__to_sigs(__declval&>())); +auto completions_of(_Sndr&&, _Rcvr = {}) + -> decltype(__csig::__to_sigs(__declval&>())); template -auto eptr_completion_if() - -> _CUDA_VSTD:: - conditional_t<_PotentiallyThrowing, __csig::__sigs, __csig::__sigs<>>&; +auto eptr_completion_if() -> _CUDA_VSTD:: + conditional_t<_PotentiallyThrowing, __csig::__sigs, __csig::__sigs<>>&; } // namespace meta } // namespace cuda::experimental::__async diff --git a/cudax/include/cuda/experimental/__async/sender/continue_on.cuh b/cudax/include/cuda/experimental/__async/sender/continue_on.cuh index 9a0c142e21c..8da87a443a3 100644 --- a/cudax/include/cuda/experimental/__async/sender/continue_on.cuh +++ b/cudax/include/cuda/experimental/__async/sender/continue_on.cuh @@ -267,8 +267,8 @@ struct continue_on_t::__sndr_t }; template -_CUDAX_API auto -continue_on_t::operator()(_Sndr __sndr, _Sch __sch) const noexcept -> continue_on_t::__sndr_t<_Sndr, _Sch> +_CUDAX_API auto continue_on_t::operator()(_Sndr __sndr, _Sch __sch) const noexcept + -> continue_on_t::__sndr_t<_Sndr, _Sch> { return __sndr_t<_Sndr, _Sch>{{}, __sch, static_cast<_Sndr&&>(__sndr)}; } diff --git a/cudax/include/cuda/experimental/__async/sender/cpos.cuh b/cudax/include/cuda/experimental/__async/sender/cpos.cuh index 7f1fb383a71..dab62e7ac10 100644 --- a/cudax/include/cuda/experimental/__async/sender/cpos.cuh +++ b/cudax/include/cuda/experimental/__async/sender/cpos.cuh @@ -110,8 +110,8 @@ _CCCL_GLOBAL_CONSTANT struct set_error_t _CCCL_GLOBAL_CONSTANT struct set_stopped_t { template - _CUDAX_TRIVIAL_API auto - operator()(_Rcvr&& __rcvr) const noexcept -> decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped()) + _CUDAX_TRIVIAL_API auto operator()(_Rcvr&& 
__rcvr) const noexcept + -> decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped()) { static_assert(_CUDA_VSTD::is_same_v(__rcvr).set_stopped()), void>); static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_stopped())); @@ -119,8 +119,8 @@ _CCCL_GLOBAL_CONSTANT struct set_stopped_t } template - _CUDAX_TRIVIAL_API auto - operator()(_Rcvr* __rcvr) const noexcept -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_stopped()) + _CUDAX_TRIVIAL_API auto operator()(_Rcvr* __rcvr) const noexcept + -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_stopped()) { static_assert(_CUDA_VSTD::is_same_v(*__rcvr).set_stopped()), void>); static_assert(noexcept(static_cast<_Rcvr&&>(*__rcvr).set_stopped())); diff --git a/cudax/include/cuda/experimental/__async/sender/let_value.cuh b/cudax/include/cuda/experimental/__async/sender/let_value.cuh index 7d06e071fe0..6742a1c1d6c 100644 --- a/cudax/include/cuda/experimental/__async/sender/let_value.cuh +++ b/cudax/include/cuda/experimental/__async/sender/let_value.cuh @@ -243,8 +243,9 @@ private: _Sndr __sndr_; template - _CUDAX_API auto connect(_Rcvr __rcvr) && noexcept( - __nothrow_constructible<__opstate_t<_Rcvr, _Sndr, _Fn>, _Sndr, _Fn, _Rcvr>) -> __opstate_t<_Rcvr, _Sndr, _Fn> + _CUDAX_API auto + connect(_Rcvr __rcvr) && noexcept(__nothrow_constructible<__opstate_t<_Rcvr, _Sndr, _Fn>, _Sndr, _Fn, _Rcvr>) + -> __opstate_t<_Rcvr, _Sndr, _Fn> { return __opstate_t<_Rcvr, _Sndr, _Fn>( static_cast<_Sndr&&>(__sndr_), static_cast<_Fn&&>(__fn_), static_cast<_Rcvr&&>(__rcvr)); diff --git a/cudax/include/cuda/experimental/__async/sender/stop_token.cuh b/cudax/include/cuda/experimental/__async/sender/stop_token.cuh index 35e6d4d164a..693816dbb45 100644 --- a/cudax/include/cuda/experimental/__async/sender/stop_token.cuh +++ b/cudax/include/cuda/experimental/__async/sender/stop_token.cuh @@ -369,8 +369,8 @@ _CUDAX_API inline void inplace_stop_source::__unlock(uint8_t __old_state) const (void) __state_.store(__old_state, _CUDA_VSTD::memory_order_release); } -_CUDAX_API inline auto -inplace_stop_source::__try_lock_unless_stop_requested(bool __set_stop_requested) const noexcept -> bool +_CUDAX_API inline auto inplace_stop_source::__try_lock_unless_stop_requested(bool __set_stop_requested) const noexcept + -> bool { __stok::__spin_wait __spin; auto __old_state = __state_.load(_CUDA_VSTD::memory_order_relaxed); diff --git a/cudax/include/cuda/experimental/__async/sender/tuple.cuh b/cudax/include/cuda/experimental/__async/sender/tuple.cuh index 98a1d0997f1..0229ed8b9c7 100644 --- a/cudax/include/cuda/experimental/__async/sender/tuple.cuh +++ b/cudax/include/cuda/experimental/__async/sender/tuple.cuh @@ -65,8 +65,8 @@ struct __tupl<_CUDA_VSTD::index_sequence<_Idx...>, _Ts...> : __box<_Idx, _Ts>... template _CUDAX_TRIVIAL_API static auto __for_each(_Fn&& __fn, _Self&& __self, _Us&&... 
__us) // - noexcept((__nothrow_callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> - && ...)) -> _CUDA_VSTD::enable_if_t<(__callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)> + noexcept((__nothrow_callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)) + -> _CUDA_VSTD::enable_if_t<(__callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)> { return ( static_cast<_Fn&&>(__fn)(static_cast<_Us&&>(__us)..., static_cast<_Self&&>(__self).__box<_Idx, _Ts>::__value_), diff --git a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh index 8a42bab40ca..0e1dceff19b 100644 --- a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh @@ -80,8 +80,8 @@ struct __with_property template struct __iproperty : interface<__iproperty> { - _CUDAX_HOST_API friend auto - get_property([[maybe_unused]] const __iproperty& __obj, _Property) -> __property_result_t<_Property> + _CUDAX_HOST_API friend auto get_property([[maybe_unused]] const __iproperty& __obj, _Property) + -> __property_result_t<_Property> { if constexpr (!_CUDA_VSTD::is_same_v<__property_result_t<_Property>, void>) { @@ -268,8 +268,8 @@ template struct __with_try_get_property { template - _CUDAX_HOST_API _CCCL_NODISCARD_FRIEND auto - try_get_property(const _Derived& __self, _Property) noexcept -> __try_property_result_t<_Property> + _CUDAX_HOST_API _CCCL_NODISCARD_FRIEND auto try_get_property(const _Derived& __self, _Property) noexcept + -> __try_property_result_t<_Property> { auto __prop = __cudax::dynamic_any_cast*>(&__self); if constexpr (_CUDA_VSTD::is_same_v<__property_result_t<_Property>, void>) diff --git a/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh b/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh index 5b64dbc531d..bd481b3dea2 100644 --- a/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh +++ b/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh @@ -50,8 +50,8 @@ _CCCL_NODISCARD _CUDAX_TRIVIAL_HOST_API auto basic_any_from(_Interface<_Super>& } template