Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PTX: Update existing instructions #3584

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,8 @@ Multicast
---------

.. include:: generated/cp_async_bulk_tensor_multicast.rst

Scatter / Gather
----------------

.. include:: generated/cp_async_bulk_tensor_gather_scatter.rst
10 changes: 10 additions & 0 deletions docs/libcudacxx/ptx/instructions/fence.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ fence

.. include:: generated/fence.rst

fence.sync_restrict
-------------------

.. include:: generated/fence_sync_restrict.rst

fence.mbarrier_init
-------------------

Expand All @@ -29,6 +34,11 @@ fence.proxy.async

.. include:: generated/fence_proxy_async.rst

fence.proxy.async.sync_restrict
-------------------------------

.. include:: generated/fence_proxy_async_generic_sync_restrict.rst

fence.proxy.tensormap
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
// 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
#include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h>
#include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h>
#include <cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h>

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
Expand Down
2 changes: 2 additions & 0 deletions libcudacxx/include/cuda/__ptx/instructions/fence.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
#include <cuda/__ptx/instructions/generated/fence_mbarrier_init.h>
#include <cuda/__ptx/instructions/generated/fence_proxy_alias.h>
#include <cuda/__ptx/instructions/generated/fence_proxy_async.h>
#include <cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h>
#include <cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h>
#include <cuda/__ptx/instructions/generated/fence_sync_restrict.h>

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

Expand Down
37 changes: 37 additions & 0 deletions libcudacxx/include/cuda/__ptx/instructions/mbarrier_expect_tx.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_PTX_MBARRIER_EXPECT_TX_H_
#define _CUDA_PTX_MBARRIER_EXPECT_TX_H_

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/__ptx/ptx_dot_variants.h>
#include <cuda/__ptx/ptx_helper_functions.h>
#include <cuda/std/cstdint>

#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

#include <cuda/__ptx/instructions/generated/mbarrier_expect_tx.h>

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

#endif // _CUDA_PTX_MBARRIER_EXPECT_TX_H_
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/ptx
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/__ptx/instructions/getctarank.h>
#include <cuda/__ptx/instructions/mbarrier_arrive.h>
#include <cuda/__ptx/instructions/mbarrier_expect_tx.h>
#include <cuda/__ptx/instructions/mbarrier_init.h>
#include <cuda/__ptx/instructions/mbarrier_wait.h>
#include <cuda/__ptx/instructions/red_async.h>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "nvrtc_workaround.h"
// above header needs to be included before the generated test header
#include "generated/cp_async_bulk_tensor.h"
#include "generated/cp_async_bulk_tensor_gather_scatter.h"

int main(int, char**)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
#include "generated/fence_mbarrier_init.h"
#include "generated/fence_proxy_alias.h"
#include "generated/fence_proxy_async.h"
#include "generated/fence_proxy_async_generic_sync_restrict.h"
#include "generated/fence_proxy_tensormap_generic.h"
#include "generated/fence_sync_restrict.h"

int main(int, char**)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

#include "generated/mbarrier_expect_tx.h"

int main(int, char**)
{
return 0;
}
Loading