NVIDIA · bernhardmgruber · Jan 30, 2025 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
@@ -21,3 +21,8 @@ Multicast
 ---------
 
 .. include:: generated/cp_async_bulk_tensor_multicast.rst
+
+Scatter / Gather
+----------------
+
+.. include:: generated/cp_async_bulk_tensor_gather_scatter.rst
@@ -13,6 +13,11 @@ fence
 
 .. include:: generated/fence.rst
 
+fence.sync_restrict
+-------------------
+
+.. include:: generated/fence_sync_restrict.rst
+
 fence.mbarrier_init
 -------------------
 
@@ -29,6 +34,11 @@ fence.proxy.async
 
 .. include:: generated/fence_proxy_async.rst
 
+fence.proxy.async.sync_restrict
+-------------------------------
+
+.. include:: generated/fence_proxy_async_generic_sync_restrict.rst
+
 fence.proxy.tensormap
 ---------------------
 

@@ -33,6 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
 #include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h>
+#include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h>
 #include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h>
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX

@@ -36,7 +36,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 #include <cuda/__ptx/instructions/generated/fence_mbarrier_init.h>
 #include <cuda/__ptx/instructions/generated/fence_proxy_alias.h>
 #include <cuda/__ptx/instructions/generated/fence_proxy_async.h>
+#include <cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h>
 #include <cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h>
+#include <cuda/__ptx/instructions/generated/fence_sync_restrict.h>
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
 

@@ -0,0 +1,37 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_MBARRIER_EXPECT_TX_H_
+#define _CUDA_PTX_MBARRIER_EXPECT_TX_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/ptx_dot_variants.h>
+#include <cuda/__ptx/ptx_helper_functions.h>
+#include <cuda/std/cstdint>
+
+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#include <cuda/__ptx/instructions/generated/mbarrier_expect_tx.h> // NOTE(review): included between the BEGIN/END namespace macros, so the generated declarations presumably land inside that namespace - confirm
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_MBARRIER_EXPECT_TX_H_
@@ -80,6 +80,7 @@
 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/__ptx/instructions/getctarank.h>
 #include <cuda/__ptx/instructions/mbarrier_arrive.h>
+#include <cuda/__ptx/instructions/mbarrier_expect_tx.h>
 #include <cuda/__ptx/instructions/mbarrier_init.h>
 #include <cuda/__ptx/instructions/mbarrier_wait.h>
 #include <cuda/__ptx/instructions/red_async.h>

@@ -17,6 +17,7 @@
 #include "nvrtc_workaround.h"
 // above header needs to be included before the generated test header
 #include "generated/cp_async_bulk_tensor.h"
+#include "generated/cp_async_bulk_tensor_gather_scatter.h"
 
 int main(int, char**)
 {

@@ -20,7 +20,9 @@
 #include "generated/fence_mbarrier_init.h"
 #include "generated/fence_proxy_alias.h"
 #include "generated/fence_proxy_async.h"
+#include "generated/fence_proxy_async_generic_sync_restrict.h"
 #include "generated/fence_proxy_tensormap_generic.h"
+#include "generated/fence_sync_restrict.h"
 
 int main(int, char**)
 {

@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+#include "generated/mbarrier_expect_tx.h"
+
+int main(int, char**) // both parameters are unnamed and unused
+{
+  return 0; // no runtime work here; presumably the test only needs the generated header included above to compile - confirm
+}