diff --git a/.devcontainer/cuda11.1-gcc6/devcontainer.json b/.devcontainer/cuda11.1-gcc6/devcontainer.json deleted file mode 100644 index 6311f6a882b..00000000000 --- a/.devcontainer/cuda11.1-gcc6/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-gcc6-cuda11.1", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.1-gcc6", - "CCCL_CUDA_VERSION": "11.1", - "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "6", - "CCCL_BUILD_INFIX": "cuda11.1-gcc6", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda11.1-gcc6" -} diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json deleted file mode 100644 index 9d711be5f66..00000000000 --- a/.devcontainer/cuda11.1-gcc9/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-gcc9-cuda11.1", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.1-gcc9", - "CCCL_CUDA_VERSION": "11.1", - "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda11.1-gcc9", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": 
"/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda11.1-gcc9" -} diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json deleted file mode 100644 index e39eb910443..00000000000 --- a/.devcontainer/cuda11.1-llvm9/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm9-cuda11.1", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.1-llvm9", - "CCCL_CUDA_VERSION": "11.1", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda11.1-llvm9", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda11.1-llvm9" -} diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json deleted file mode 100644 index 
87098679264..00000000000 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-gcc11-cuda11.8", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.8-gcc11", - "CCCL_CUDA_VERSION": "11.8", - "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda11.8-gcc11", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda11.8-gcc11" -} diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda12.0-gcc7/devcontainer.json similarity index 90% rename from .devcontainer/cuda11.1-gcc7/devcontainer.json rename to .devcontainer/cuda12.0-gcc7/devcontainer.json index e7d2a6572f8..96a32136eb1 100644 --- a/.devcontainer/cuda11.1-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc7/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-gcc7-cuda11.1", + "image": "rapidsai/devcontainers:25.02-cpp-gcc7-cuda12.0", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.1-gcc7", - "CCCL_CUDA_VERSION": "11.1", + "DEVCONTAINER_NAME": "cuda12.0-gcc7", + "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "7", - "CCCL_BUILD_INFIX": "cuda11.1-gcc7", + "CCCL_BUILD_INFIX": "cuda12.0-gcc7", "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", @@ -50,5 +50,5 @@ } } }, - "name": "cuda11.1-gcc7" + "name": "cuda12.0-gcc7" } diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda12.0-gcc8/devcontainer.json 
similarity index 90% rename from .devcontainer/cuda11.1-gcc8/devcontainer.json rename to .devcontainer/cuda12.0-gcc8/devcontainer.json index f590606adef..9cfe4709e07 100644 --- a/.devcontainer/cuda11.1-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc8/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-gcc8-cuda11.1", + "image": "rapidsai/devcontainers:25.02-cpp-gcc8-cuda12.0", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.1-gcc8", - "CCCL_CUDA_VERSION": "11.1", + "DEVCONTAINER_NAME": "cuda12.0-gcc8", + "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "8", - "CCCL_BUILD_INFIX": "cuda11.1-gcc8", + "CCCL_BUILD_INFIX": "cuda12.0-gcc8", "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", @@ -50,5 +50,5 @@ } } }, - "name": "cuda11.1-gcc8" + "name": "cuda12.0-gcc8" } diff --git a/.devcontainer/cuda12.0-llvm10/devcontainer.json b/.devcontainer/cuda12.0-llvm10/devcontainer.json deleted file mode 100644 index 6f75525f808..00000000000 --- a/.devcontainer/cuda12.0-llvm10/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm10-cuda12.0", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.0-llvm10", - "CCCL_CUDA_VERSION": "12.0", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.0-llvm10", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.0-llvm10" -} diff --git a/.devcontainer/cuda12.0-llvm11/devcontainer.json 
b/.devcontainer/cuda12.0-llvm11/devcontainer.json deleted file mode 100644 index fd21f30fbd3..00000000000 --- a/.devcontainer/cuda12.0-llvm11/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm11-cuda12.0", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.0-llvm11", - "CCCL_CUDA_VERSION": "12.0", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.0-llvm11", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.0-llvm11" -} diff --git a/.devcontainer/cuda12.0-llvm12/devcontainer.json b/.devcontainer/cuda12.0-llvm12/devcontainer.json deleted file mode 100644 index b402063c837..00000000000 --- a/.devcontainer/cuda12.0-llvm12/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm12-cuda12.0", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.0-llvm12", - "CCCL_CUDA_VERSION": "12.0", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.0-llvm12", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": 
"/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.0-llvm12" -} diff --git a/.devcontainer/cuda12.0-llvm13/devcontainer.json b/.devcontainer/cuda12.0-llvm13/devcontainer.json deleted file mode 100644 index 40187a60e6c..00000000000 --- a/.devcontainer/cuda12.0-llvm13/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm13-cuda12.0", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.0-llvm13", - "CCCL_CUDA_VERSION": "12.0", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.0-llvm13", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.0-llvm13" -} diff --git a/.devcontainer/cuda12.0-llvm9/devcontainer.json b/.devcontainer/cuda12.0-llvm9/devcontainer.json deleted file mode 100644 index 
e72c6da2fdd..00000000000 --- a/.devcontainer/cuda12.0-llvm9/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm9-cuda12.0", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.0-llvm9", - "CCCL_CUDA_VERSION": "12.0", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.0-llvm9", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.0-llvm9" -} diff --git a/.devcontainer/cuda12.6-llvm10/devcontainer.json b/.devcontainer/cuda12.6-llvm10/devcontainer.json deleted file mode 100644 index ef06f7cf9a2..00000000000 --- a/.devcontainer/cuda12.6-llvm10/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm10-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-llvm10", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.6-llvm10", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": 
"source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-llvm10" -} diff --git a/.devcontainer/cuda12.6-llvm11/devcontainer.json b/.devcontainer/cuda12.6-llvm11/devcontainer.json deleted file mode 100644 index 38c13841ee6..00000000000 --- a/.devcontainer/cuda12.6-llvm11/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm11-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-llvm11", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.6-llvm11", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-llvm11" -} diff --git a/.devcontainer/cuda12.6-llvm12/devcontainer.json b/.devcontainer/cuda12.6-llvm12/devcontainer.json deleted file mode 100644 index 8898d216573..00000000000 --- 
a/.devcontainer/cuda12.6-llvm12/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm12-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-llvm12", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.6-llvm12", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-llvm12" -} diff --git a/.devcontainer/cuda12.6-llvm13/devcontainer.json b/.devcontainer/cuda12.6-llvm13/devcontainer.json deleted file mode 100644 index 8d713720c51..00000000000 --- a/.devcontainer/cuda12.6-llvm13/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm13-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-llvm13", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.6-llvm13", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": 
"source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-llvm13" -} diff --git a/.devcontainer/cuda12.6-llvm9/devcontainer.json b/.devcontainer/cuda12.6-llvm9/devcontainer.json deleted file mode 100644 index be41e2506c5..00000000000 --- a/.devcontainer/cuda12.6-llvm9/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm9-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-llvm9", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.6-llvm9", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-llvm9" -} diff --git a/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json deleted file mode 100644 index 7c54383deeb..00000000000 --- 
a/.devcontainer/cuda12.6-oneapi2023.2.0/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-oneapi2023.2.0-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-oneapi2023.2.0", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "oneapi", - "CCCL_HOST_COMPILER_VERSION": "2023.2.0", - "CCCL_BUILD_INFIX": "cuda12.6-oneapi2023.2.0", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-oneapi2023.2.0" -} diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 725009e6256..74de63e7a94 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -37,6 +37,11 @@ body: - Thrust - CUB - libcu++ + - CUDA Experimental (cudax) + - cuda.cooperative (Python) + - cuda.parallel (Python) + - General CCCL + - Infrastructure - Not sure validations: required: true diff --git a/.github/actions/workflow-build/build-workflow.py b/.github/actions/workflow-build/build-workflow.py index 15d91a7ab60..62b90f1472d 100755 --- a/.github/actions/workflow-build/build-workflow.py +++ b/.github/actions/workflow-build/build-workflow.py @@ -62,8 +62,8 @@ import re import struct import sys -import yaml +import yaml matrix_yaml = None diff --git a/.github/actions/workflow-run-job-windows/action.yml b/.github/actions/workflow-run-job-windows/action.yml index 805beff3446..1b5289a5a7d 100644 --- a/.github/actions/workflow-run-job-windows/action.yml +++ b/.github/actions/workflow-run-job-windows/action.yml @@ -50,6 +50,7 @@ runs: docker run \ --mount type=bind,source="${{steps.paths.outputs.HOST_REPO}}",target="${{steps.paths.outputs.MOUNT_REPO}}" \ --workdir "${{steps.paths.outputs.MOUNT_REPO}}" \ + --isolation=process \ ${{ inputs.image }} \ powershell -c " 
[System.Environment]::SetEnvironmentVariable('AWS_ACCESS_KEY_ID','${{env.AWS_ACCESS_KEY_ID}}'); diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 27e4a3ec4ea..d317e931e78 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.8 + rev: v19.1.6 hooks: - id: clang-format types_or: [file] @@ -39,13 +39,10 @@ repos: # TODO/REMINDER: add the Ruff vscode extension to the devcontainers # Ruff, the Python auto-correcting linter/formatter written in Rust - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.3 + rev: v0.8.6 hooks: - - id: ruff - args: ["--fix", "--show-fixes"] - exclude: "^docs/tools/" - - id: ruff-format - exclude: "^docs/tools/" + - id: ruff # linter + - id: ruff-format # formatter - repo: https://github.com/codespell-project/codespell rev: v2.3.0 hooks: @@ -60,7 +57,7 @@ repos: - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.13.0' + rev: 'v1.14.1' hooks: - id: mypy additional_dependencies: [types-cachetools, numpy] diff --git a/CMakePresets.json b/CMakePresets.json index 2519ca09adf..bd10a95200b 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -22,7 +22,6 @@ "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, "CCCL_ENABLE_C": false, - "CCCL_SUPPRESS_ICC_DEPRECATION_WARNING": true, "CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING": true, "libcudacxx_ENABLE_INSTALL_RULES": true, "CUB_ENABLE_INSTALL_RULES": true, @@ -70,8 +69,6 @@ "CUB_ENABLE_TESTING": true, "CUB_ENABLE_EXAMPLES": true, "CUB_SEPARATE_CATCH2": true, - "CUB_ENABLE_DIALECT_CPP11": true, - "CUB_ENABLE_DIALECT_CPP14": true, "CUB_ENABLE_DIALECT_CPP17": true, "CUB_ENABLE_DIALECT_CPP20": true, "THRUST_ENABLE_MULTICONFIG": true, @@ -233,30 +230,10 @@ "CUB_ENABLE_TESTING": true, "CUB_ENABLE_EXAMPLES": true, "CUB_SEPARATE_CATCH2": true, - "CUB_ENABLE_DIALECT_CPP11": false, - "CUB_ENABLE_DIALECT_CPP14": false, "CUB_ENABLE_DIALECT_CPP17": false, "CUB_ENABLE_DIALECT_CPP20": false } }, - { - "name": "cub-cpp11", - "displayName": "CUB: C++11", - "inherits": "cub-base", - "cacheVariables": { - "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, - "CUB_ENABLE_DIALECT_CPP11": true - } - }, - { - "name": "cub-cpp14", - "displayName": "CUB: C++14", - "inherits": "cub-base", - "cacheVariables": { - "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, - "CUB_ENABLE_DIALECT_CPP14": true - } - }, { "name": "cub-cpp17", "displayName": "CUB: C++17", @@ -290,24 +267,6 @@ "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP20": false } }, - { - "name": "thrust-cpp11", - "displayName": "Thrust: C++11", - "inherits": "thrust-base", - "cacheVariables": { - "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11": true - } - }, - { - "name": "thrust-cpp14", - "displayName": "Thrust: C++14", - "inherits": "thrust-base", - "cacheVariables": { - "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP14": true - } - }, { "name": "thrust-cpp17", "displayName": "Thrust: C++17", @@ -521,14 +480,6 @@ "libcudacxx-base" ] }, - { - "name": "cub-cpp11", - "configurePreset": "cub-cpp11" - }, - { - "name": "cub-cpp14", - "configurePreset": "cub-cpp14" - }, { "name": "cub-cpp17", "configurePreset": "cub-cpp17" @@ -537,14 +488,6 @@ "name": "cub-cpp20", "configurePreset": "cub-cpp20" }, - { - "name": "thrust-cpp11", - "configurePreset": "thrust-cpp11" - }, - { - "name": "thrust-cpp14", - "configurePreset": "thrust-cpp14" - }, 
{ "name": "thrust-cpp17", "configurePreset": "thrust-cpp17" @@ -737,16 +680,6 @@ } } }, - { - "name": "cub-nolid-cpp11", - "configurePreset": "cub-cpp11", - "inherits": "cub-nolid-base" - }, - { - "name": "cub-nolid-cpp14", - "configurePreset": "cub-cpp14", - "inherits": "cub-nolid-base" - }, { "name": "cub-nolid-cpp17", "configurePreset": "cub-cpp17", @@ -757,16 +690,6 @@ "configurePreset": "cub-cpp20", "inherits": "cub-nolid-base" }, - { - "name": "cub-lid0-cpp11", - "configurePreset": "cub-cpp11", - "inherits": "cub-lid0-base" - }, - { - "name": "cub-lid0-cpp14", - "configurePreset": "cub-cpp14", - "inherits": "cub-lid0-base" - }, { "name": "cub-lid0-cpp17", "configurePreset": "cub-cpp17", @@ -777,16 +700,6 @@ "configurePreset": "cub-cpp20", "inherits": "cub-lid0-base" }, - { - "name": "cub-lid1-cpp11", - "configurePreset": "cub-cpp11", - "inherits": "cub-lid1-base" - }, - { - "name": "cub-lid1-cpp14", - "configurePreset": "cub-cpp14", - "inherits": "cub-lid1-base" - }, { "name": "cub-lid1-cpp17", "configurePreset": "cub-cpp17", @@ -797,16 +710,6 @@ "configurePreset": "cub-cpp20", "inherits": "cub-lid1-base" }, - { - "name": "cub-lid2-cpp11", - "configurePreset": "cub-cpp11", - "inherits": "cub-lid2-base" - }, - { - "name": "cub-lid2-cpp14", - "configurePreset": "cub-cpp14", - "inherits": "cub-lid2-base" - }, { "name": "cub-lid2-cpp17", "configurePreset": "cub-cpp17", @@ -817,16 +720,6 @@ "configurePreset": "cub-cpp20", "inherits": "cub-lid2-base" }, - { - "name": "cub-cpp11", - "configurePreset": "cub-cpp11", - "inherits": "cub-base" - }, - { - "name": "cub-cpp14", - "configurePreset": "cub-cpp14", - "inherits": "cub-base" - }, { "name": "cub-cpp17", "configurePreset": "cub-cpp17", @@ -867,16 +760,6 @@ } } }, - { - "name": "thrust-gpu-cpp11", - "configurePreset": "thrust-cpp11", - "inherits": "thrust-gpu-base" - }, - { - "name": "thrust-gpu-cpp14", - "configurePreset": "thrust-cpp14", - "inherits": "thrust-gpu-base" - }, { "name": "thrust-gpu-cpp17", "configurePreset": "thrust-cpp17", @@ -887,16 +770,6 @@ "configurePreset": "thrust-cpp20", "inherits": "thrust-gpu-base" }, - { - "name": "thrust-cpu-cpp11", - "configurePreset": "thrust-cpp11", - "inherits": "thrust-cpu-base" - }, - { - "name": "thrust-cpu-cpp14", - "configurePreset": "thrust-cpp14", - "inherits": "thrust-cpu-base" - }, { "name": "thrust-cpu-cpp17", "configurePreset": "thrust-cpp17", @@ -907,16 +780,6 @@ "configurePreset": "thrust-cpp20", "inherits": "thrust-cpu-base" }, - { - "name": "thrust-cpp11", - "configurePreset": "thrust-cpp11", - "inherits": "thrust-base" - }, - { - "name": "thrust-cpp14", - "configurePreset": "thrust-cpp14", - "inherits": "thrust-base" - }, { "name": "thrust-cpp17", "configurePreset": "thrust-cpp17", diff --git a/README.md b/README.md index b9795ea2f06..358adadc87b 100644 --- a/README.md +++ b/README.md @@ -219,18 +219,16 @@ CCCL users are encouraged to capitalize on the latest enhancements and ["live at For a seamless experience, you can upgrade CCCL independently of the entire CUDA Toolkit. This is possible because CCCL maintains backward compatibility with the latest patch release of every minor CTK release from both the current and previous major version series. In some exceptional cases, the minimum supported minor version of the CUDA Toolkit release may need to be newer than the oldest release within its major version series. -For instance, CCCL requires a minimum supported version of 11.1 from the 11.x series due to an unavoidable compiler issue present in CTK 11.0. 
When a new major CTK is released, we drop support for the oldest supported major version. | CCCL Version | Supports CUDA Toolkit Version | |--------------|------------------------------------------------| | 2.x | 11.1 - 11.8, 12.x (only latest patch releases) | -| 3.x (Future) | 12.x, 13.x (only latest patch releases) | +| 3.x | 12.x, 13.x (only latest patch releases) | [Well-behaved code](#compatibility-guidelines) using the latest CCCL should compile and run successfully with any supported CTK version. Exceptions may occur for new features that depend on new CTK features, so those features would not work on older versions of the CTK. -For example, C++20 support was not added to `nvcc` until CUDA 12.0, so CCCL features that depend on C++20 would not work with CTK 11.x. Users can integrate a newer version of CCCL into an older CTK, but not the other way around. This means an older version of CCCL is not compatible with a newer CTK. @@ -260,6 +258,8 @@ Unless otherwise specified, CCCL supports the same host compilers as the latest - [Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#host-compiler-support-policy) - [Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#system-requirements) +On Linux, GCC 7 or newer is required. + When using older CUDA Toolkits, we also only support the host compilers of the latest CUDA Toolkit, but at least the most recent host compiler of any supported older CUDA Toolkit. @@ -269,8 +269,8 @@ But we will not invest significant time in triaging or fixing issues for older c In the spirit of "You only support what you test", see our [CI Overview](https://github.com/NVIDIA/cccl/blob/main/ci-overview.md) for more information on exactly what we test. ### C++ Dialects -- C++11 (Deprecated in Thrust/CUB, to be removed in next major version) -- C++14 (Deprecated in Thrust/CUB, to be removed in next major version) +- C++11 (only libcu++) +- C++14 (only libcu++) - C++17 - C++20 @@ -285,7 +285,7 @@ Note that some features may only support certain architectures/Compute Capabilit CCCL's testing strategy strikes a balance between testing as many configurations as possible and maintaining reasonable CI times. For CUDA Toolkit versions, testing is done against both the oldest and the newest supported versions. -For instance, if the latest version of the CUDA Toolkit is 12.3, tests are conducted against 11.1 and 12.3. +For instance, if the latest version of the CUDA Toolkit is 12.6, tests are conducted against 12.0 and 12.6. For each CUDA version, builds are completed against all supported host compilers with all supported C++ dialects. The testing strategy and matrix are constantly evolving.
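[Editorial note, not part of the patch] The compatibility table above implies that code meant to "live at head" should guard against being compiled with an older CCCL picked up from the CTK instead of the independently upgraded copy. A minimal sketch of such a guard — assuming the <cuda/version> header and the CCCL_MAJOR_VERSION macro shipped since CCCL 2.x remain available in 3.x:

    // Fail the build early if an older CCCL is found on the include path.
    // CCCL_MAJOR_VERSION is assumed to come from <cuda/version> as in CCCL 2.x.
    #include <cuda/version>

    static_assert(CCCL_MAJOR_VERSION >= 3,
                  "This translation unit requires CCCL 3.x (CUDA 12.x/13.x support).");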
diff --git a/benchmarks/scripts/analyze.py b/benchmarks/scripts/analyze.py index 39223f20dac..f3eceb258ad 100755 --- a/benchmarks/scripts/analyze.py +++ b/benchmarks/scripts/analyze.py @@ -1,16 +1,17 @@ #!/usr/bin/env python3 +import argparse +import functools +import itertools +import json +import math import os import re -import json + import cccl -import math -import argparse -import itertools -import functools +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import matplotlib.pyplot as plt from scipy.stats import mannwhitneyu from scipy.stats.mstats import hdquantiles @@ -330,8 +331,8 @@ def coverage(args): def parallel_coordinates_plot(df, title): # Parallel coordinates plot adaptation of https://stackoverflow.com/a/69411450 import matplotlib.cm as cm - from matplotlib.path import Path import matplotlib.patches as patches + from matplotlib.path import Path # Variables (the first variable must be categoric): my_vars = df.columns.tolist() diff --git a/benchmarks/scripts/cccl/bench/__init__.py b/benchmarks/scripts/cccl/bench/__init__.py index a01f59b7b5c..6f3755648db 100644 --- a/benchmarks/scripts/cccl/bench/__init__.py +++ b/benchmarks/scripts/cccl/bench/__init__.py @@ -1,6 +1,6 @@ -from .config import * # noqa: F403 -from .storage import * # noqa: F403 from .bench import Bench # noqa: F401 from .cmake import CMake # noqa: F401 +from .config import * # noqa: F403 from .score import * # noqa: F403 from .search import * # noqa: F403 +from .storage import * # noqa: F403 diff --git a/benchmarks/scripts/cccl/bench/bench.py b/benchmarks/scripts/cccl/bench/bench.py index e3c2de7cddb..a4e8c34f7cc 100644 --- a/benchmarks/scripts/cccl/bench/bench.py +++ b/benchmarks/scripts/cccl/bench/bench.py @@ -1,17 +1,18 @@ -import os +import itertools import json -import time -import fpzip +import os import signal -import itertools import subprocess +import time + +import fpzip import numpy as np from .cmake import CMake from .config import BasePoint, Config -from .storage import Storage, get_bench_table_name -from .score import compute_axes_ids, compute_weight_matrices, get_workload_weight from .logger import Logger +from .score import compute_axes_ids, compute_weight_matrices, get_workload_weight +from .storage import Storage, get_bench_table_name def first_val(my_dict): diff --git a/benchmarks/scripts/cccl/bench/cmake.py b/benchmarks/scripts/cccl/bench/cmake.py index db72f979709..65f3e786ef1 100644 --- a/benchmarks/scripts/cccl/bench/cmake.py +++ b/benchmarks/scripts/cccl/bench/cmake.py @@ -1,12 +1,12 @@ import os -import time import signal import subprocess +import time from .build import Build from .config import Config -from .storage import Storage from .logger import Logger +from .storage import Storage def create_builds_table(conn): diff --git a/benchmarks/scripts/cccl/bench/config.py b/benchmarks/scripts/cccl/bench/config.py index 6c3792f8a3e..0d1a724a422 100644 --- a/benchmarks/scripts/cccl/bench/config.py +++ b/benchmarks/scripts/cccl/bench/config.py @@ -1,6 +1,6 @@ import os -import sys import random +import sys def randomized_cartesian_product(list_of_lists): diff --git a/benchmarks/scripts/cccl/bench/score.py b/benchmarks/scripts/cccl/bench/score.py index 7102db940c8..5a2ab099213 100644 --- a/benchmarks/scripts/cccl/bench/score.py +++ b/benchmarks/scripts/cccl/bench/score.py @@ -1,4 +1,5 @@ import math + import numpy as np diff --git a/benchmarks/scripts/cccl/bench/search.py b/benchmarks/scripts/cccl/bench/search.py index 9573dd73d5e..7419bcd40a7 100644 --- 
a/benchmarks/scripts/cccl/bench/search.py +++ b/benchmarks/scripts/cccl/bench/search.py @@ -1,11 +1,12 @@ -import re import argparse +import re + import numpy as np -from .bench import Bench, BaseBench +from .bench import BaseBench, Bench +from .cmake import CMake from .config import Config from .storage import Storage -from .cmake import CMake def list_benches(algnames): diff --git a/benchmarks/scripts/cccl/bench/storage.py b/benchmarks/scripts/cccl/bench/storage.py index d3cafca625c..f4008c88efe 100644 --- a/benchmarks/scripts/cccl/bench/storage.py +++ b/benchmarks/scripts/cccl/bench/storage.py @@ -1,10 +1,10 @@ import os -import fpzip import sqlite3 + +import fpzip import numpy as np import pandas as pd - db_name = "cccl_meta_bench.db" diff --git a/benchmarks/scripts/compare.py b/benchmarks/scripts/compare.py index 443eb5f9e1d..64428b37400 100755 --- a/benchmarks/scripts/compare.py +++ b/benchmarks/scripts/compare.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 +import argparse import os + import cccl -import argparse import numpy as np import pandas as pd - from colorama import Fore diff --git a/benchmarks/scripts/run.py b/benchmarks/scripts/run.py index e8cdd9adb45..6bdd2fad789 100755 --- a/benchmarks/scripts/run.py +++ b/benchmarks/scripts/run.py @@ -1,8 +1,9 @@ #!/usr/bin/env python3 +import math import os import sys -import math + import cccl.bench diff --git a/benchmarks/scripts/search.py b/benchmarks/scripts/search.py index 8d5d2d5a65b..9d5ba0af0bb 100755 --- a/benchmarks/scripts/search.py +++ b/benchmarks/scripts/search.py @@ -2,7 +2,6 @@ import cccl.bench as bench - # TODO: # - driver version # - host compiler + version diff --git a/benchmarks/scripts/sol.py b/benchmarks/scripts/sol.py index 7cc26c30d21..e93175f07a6 100755 --- a/benchmarks/scripts/sol.py +++ b/benchmarks/scripts/sol.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 +import argparse import os + import cccl -import argparse +import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns -import matplotlib.pyplot as plt def is_finite(x): diff --git a/benchmarks/scripts/verify.py b/benchmarks/scripts/verify.py index a1c4c39623f..7a98243016e 100755 --- a/benchmarks/scripts/verify.py +++ b/benchmarks/scripts/verify.py @@ -1,7 +1,8 @@ #!/usr/bin/env python3 -import sys import argparse +import sys + import cccl.bench diff --git a/cccl-version.json b/cccl-version.json index fc6b155463e..d274eaaa5f3 100644 --- a/cccl-version.json +++ b/cccl-version.json @@ -1,6 +1,6 @@ { - "full": "2.8.0", - "major": 2, - "minor": 8, + "full": "3.0.0", + "major": 3, + "minor": 0, "patch": 0 } diff --git a/ci/matrix.yaml b/ci/matrix.yaml index fd7f2f079c4..881f553f65d 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -5,18 +5,19 @@ workflows: # # Example: # override: - # - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'llvm16']} + # - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']} # override: pull_request: - # Old CTK - - {jobs: ['build'], std: 'minmax', ctk: '11.1', cxx: ['gcc6', 'gcc9', 'clang9', 'msvc2017']} + # Old CTK/compiler + - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc7', 'gcc9', 'clang14', 'msvc2019']} # Current CTK build-only - - {jobs: ['build'], std: [11, 14], cxx: ['gcc7', 'clang9']} + - {jobs: ['build'], std: [11, 14], cxx: ['gcc7', 'clang14'], project: 'libcudacxx'} + - {jobs: ['build'], std: [17], cxx: ['gcc7', 'clang14']} - {jobs: ['build'], std: 'max', cxx: ['gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']} - - {jobs: 
['build'], std: 'max', cxx: ['clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']} - - {jobs: ['build'], std: 'max', cxx: ['intel', 'msvc2019']} + - {jobs: ['build'], std: 'max', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} + - {jobs: ['build'], std: 'max', cxx: ['msvc2019']} - {jobs: ['build'], std: [17, 20], cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']} @@ -41,10 +42,8 @@ workflows: # verify-codegen: - {jobs: ['verify_codegen'], project: 'libcudacxx'} # cudax has different CTK reqs: - - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 17, cxx: ['gcc9', 'clang9']} - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20, cxx: ['msvc14.36']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc10', 'gcc11', 'gcc12']} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['clang10', 'clang11', 'clang12', 'clang13']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], project: 'cudax', ctk: ['12.5'], std: [17, 20], cxx: ['nvhpc']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['msvc2022']} @@ -55,7 +54,6 @@ workflows: # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'} # cccl-infra: - - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc6', 'clang9']} - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14']} - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} @@ -64,13 +62,13 @@ workflows: - {jobs: ['limited'], project: 'cub', std: 17} - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit'} - # Old CTK - - {jobs: ['build'], std: 'all', ctk: '11.1', cxx: ['gcc6', 'gcc7', 'gcc8', 'gcc9', 'clang9', 'msvc2017']} - - {jobs: ['build'], std: 'all', ctk: '11.8', cxx: ['gcc11'], sm: '60;70;80;90'} + # Old CTK/compiler + - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'clang14', 'msvc2019']} + - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc11'], sm: '60;70;80;90'} # Current CTK build-only - {jobs: ['build'], std: 'all', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']} - - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']} - - {jobs: ['build'], std: 'all', cxx: ['intel', 'msvc2019']} + - {jobs: ['build'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} + - {jobs: ['build'], std: 'all', cxx: ['msvc2019']} # Test current CTK - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']} # Modded builds: @@ -83,7 +81,6 @@ workflows: - {jobs: ['build'], project: 'libcudacxx', std: 'all', cudacxx: 'clang', cxx: 'clang', sm: '90a'} # cudax - {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc9', 'gcc10', 'gcc11']} - - {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13']} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], project: 'cudax', ctk: [ '12.5'], std: 'all', cxx: ['nvhpc']} - {jobs: ['build'], project: 'cudax', ctk: ['12.0', ], std: 'all', cxx: ['msvc14.36']} @@ 
-96,12 +93,12 @@ workflows: - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']} # # These are waiting on the NVKS nodes: -# - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]} -# - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17]} +# - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc7', std: [11]} +# - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang14', std: [17]} # - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]} # - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]} # - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'} -# - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]} +# - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang14', std: [11]} # # H100 runners are currently flakey, only build since those use CPU-only runners: # - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} # - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]} @@ -115,10 +112,7 @@ workflows: # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows. exclude: # GPU runners are not available on Windows. - - {jobs: ['test', 'test_gpu', 'test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'], cxx: ['msvc2017', 'msvc2019', 'msvc14.36', 'msvc2022']} - # Ubuntu 18.04 is EOL and we only use it to get access to CTK 11.1 containers for CUDA testing. - # Disable non-CUDA tests on this platform. - - {jobs: ['test_cpu'], ctk: '11.1'} + - {jobs: ['test', 'test_gpu', 'test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'], cxx: ['msvc2019', 'msvc14.36', 'msvc2022']} ############################################################################################# @@ -131,8 +125,6 @@ devcontainer_version: '25.02' all_stds: [11, 14, 17, 20] ctk_versions: - 11.1: { stds: [11, 14, 17, ] } - 11.8: { stds: [11, 14, 17, ] } 12.0: { stds: [11, 14, 17, 20] } 12.5: { stds: [11, 14, 17, 20] } 12.6: { stds: [11, 14, 17, 20], aka: 'curr' } @@ -151,7 +143,6 @@ host_compilers: container_tag: 'gcc' exe: 'g++' versions: - 6: { stds: [11, 14, ] } 7: { stds: [11, 14, 17, ] } 8: { stds: [11, 14, 17, ] } 9: { stds: [11, 14, 17, ] } @@ -164,11 +155,6 @@ host_compilers: container_tag: 'llvm' exe: 'clang++' versions: - 9: { stds: [11, 14, 17, ] } - 10: { stds: [11, 14, 17, ] } - 11: { stds: [11, 14, 17, 20] } - 12: { stds: [11, 14, 17, 20] } - 13: { stds: [11, 14, 17, 20] } 14: { stds: [11, 14, 17, 20] } 15: { stds: [11, 14, 17, 20] } 16: { stds: [11, 14, 17, 20] } @@ -179,16 +165,9 @@ host_compilers: container_tag: 'cl' exe: cl versions: - 14.16: { stds: [ 14, ], aka: '2017' } 14.29: { stds: [ 14, 17, ], aka: '2019' } 14.36: { stds: [ 14, 17, 20] } 14.39: { stds: [ 14, 17, 20], aka: '2022' } - intel: - name: 'Intel' - container_tag: 'oneapi' - exe: icpc - versions: - 2023.2.0: { stds: [11, 14, 17, ] } nvhpc: name: 'NVHPC' container_tag: 'nvhpc' @@ -261,11 +240,11 @@ projects: stds: [11, 14, 17, 20] cub: name: 'CUB' - stds: [11, 14, 17, 20] + stds: [17, 20] job_map: { test: ['test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'] } thrust: name: 'Thrust' - stds: [11, 14, 17, 20] + stds: [17, 20] job_map: { test: ['test_cpu', 'test_gpu'] } cudax: stds: [17, 20] diff --git a/cmake/CCCLBuildCompilerTargets.cmake b/cmake/CCCLBuildCompilerTargets.cmake index 906e287e439..98644b69779 100644 --- 
a/cmake/CCCLBuildCompilerTargets.cmake +++ b/cmake/CCCLBuildCompilerTargets.cmake @@ -23,7 +23,6 @@ set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded) option(CCCL_ENABLE_EXCEPTIONS "Enable exceptions within CCCL libraries." ON) option(CCCL_ENABLE_RTTI "Enable RTTI within CCCL libraries." ON) option(CCCL_ENABLE_WERROR "Treat warnings as errors for CCCL targets." ON) -option(CCCL_SUPPRESS_ICC_DEPRECATION_WARNING "Suppress Intel Compiler deprecation warnings" OFF) option(CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING "Suppress Visual Studio 2017 deprecation warnings" OFF) function(cccl_build_compiler_interface interface_target cuda_compile_options cxx_compile_options compile_defs) @@ -69,10 +68,6 @@ function(cccl_build_compiler_targets) list(APPEND cxx_compile_definitions "CCCL_DISABLE_RTTI") endif() - if (CCCL_SUPPRESS_ICC_DEPRECATION_WARNING) - list(APPEND cxx_compile_definitions "CCCL_SUPPRESS_ICC_DEPRECATION_WARNING") - endif() - if (CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING) list(APPEND cxx_compile_definitions "CCCL_SUPPRESS_MSVC2017_DEPRECATION_WARNING") endif() @@ -160,16 +155,6 @@ function(cccl_build_compiler_targets) endif() endif() - if ("Intel" STREQUAL "${CMAKE_CXX_COMPILER_ID}") - # Do not flush denormal floats to zero - append_option_if_available("-no-ftz" cxx_compile_options) - # Disable warning that inlining is inhibited by compiler thresholds. - append_option_if_available("-diag-disable=11074" cxx_compile_options) - append_option_if_available("-diag-disable=11076" cxx_compile_options) - # Disable warning about deprecated classic compiler - append_option_if_available("-diag-disable=10441" cxx_compile_options) - endif() - cccl_build_compiler_interface(cccl.compiler_interface "${cuda_compile_options}" "${cxx_compile_options}" diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cu b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cu index cf41124b2df..87beac9adee 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cu +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cu @@ -752,7 +752,9 @@ void do_not_optimize(const void* ptr) seed_t, cuda::std::span, std::size_t, std::size_t) INSTANTIATE(int32_t); +INSTANTIATE(uint32_t); INSTANTIATE(int64_t); +INSTANTIATE(uint64_t); #undef INSTANTIATE diff --git a/cub/cmake/CubBuildTargetList.cmake b/cub/cmake/CubBuildTargetList.cmake index 27dedd68210..7c6f59e8856 100644 --- a/cub/cmake/CubBuildTargetList.cmake +++ b/cub/cmake/CubBuildTargetList.cmake @@ -150,16 +150,6 @@ function(cub_build_target_list) cmake_minimum_required(VERSION 3.18.3) endif() - # Supported versions of MSVC do not distinguish between C++11 and C++14. - # Warn the user that they may be generating a ton of redundant targets. - if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND - CUB_ENABLE_DIALECT_CPP11) - message(WARNING - "Supported versions of MSVC (2017+) do not distinguish between C++11 " - "and C++14. The requested C++11 targets will be built with C++14." 
- ) - endif() - # Generic config flags: macro(add_flag_option flag docstring default) set(opt "CCCL_${flag}") diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index 21a487828ca..e454dc837b1 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -629,7 +629,7 @@ struct AgentHistogram // Set valid flags MarkValid( - is_valid, valid_samples, Int2Type{}); + is_valid, valid_samples, Int2Type < AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED > {}); // Accumulate samples if (prefer_smem) diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index 2e0d94b219c..d5e3514f369 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -382,8 +382,8 @@ struct AgentReduce even_share.template BlockInit(block_offset, block_end); return (IsAligned(d_in + block_offset, Int2Type())) - ? ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) - : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); + ? ConsumeRange(even_share, Int2Type()) + : ConsumeRange(even_share, Int2Type()); } /** @@ -396,8 +396,8 @@ struct AgentReduce even_share.template BlockInit(); return (IsAligned(d_in, Int2Type())) - ? ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) - : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); + ? ConsumeRange(even_share, Int2Type()) + : ConsumeRange(even_share, Int2Type()); } private: diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh index 3a07944d4e2..eec24057163 100644 --- a/cub/cub/agent/agent_three_way_partition.cuh +++ b/cub/cub/agent/agent_three_way_partition.cuh @@ -175,7 +175,8 @@ template + typename OffsetT, + typename StreamingContextT> struct AgentThreeWayPartition { //--------------------------------------------------------------------- @@ -251,6 +252,9 @@ struct AgentThreeWayPartition SelectSecondPartOp select_second_part_op; OffsetT num_items; ///< Total number of input items + // Note: This is a const reference because we have seen double-digit percentage perf regressions otherwise + const StreamingContextT& streaming_context; ///< Context for the current partition + //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- @@ -264,7 +268,8 @@ struct AgentThreeWayPartition UnselectedOutputIteratorT d_unselected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, - OffsetT num_items) + OffsetT num_items, + const StreamingContextT& streaming_context) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_first_part_out(d_first_part_out) @@ -273,6 +278,7 @@ struct AgentThreeWayPartition , select_first_part_op(select_first_part_op) , select_second_part_op(select_second_part_op) , num_items(num_items) + , streaming_context(streaming_context) {} //--------------------------------------------------------------------- @@ -350,6 +356,11 @@ struct AgentThreeWayPartition CTA_SYNC(); // Gather items from shared memory and scatter to global + auto first_base = + d_first_part_out + (streaming_context.num_previously_selected_first() + num_first_selections_prefix); + auto second_base = + d_second_part_out + (streaming_context.num_previously_selected_second() + num_second_selections_prefix); + auto unselected_base = d_unselected_out + (streaming_context.num_previously_rejected() + num_rejected_prefix); for (int 
ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; @@ -360,16 +371,16 @@ struct AgentThreeWayPartition if (item_idx < first_item_end) { - d_first_part_out[num_first_selections_prefix + item_idx] = item; + first_base[item_idx] = item; } else if (item_idx < second_item_end) { - d_second_part_out[num_second_selections_prefix + item_idx - first_item_end] = item; + second_base[item_idx - first_item_end] = item; } else { - int rejection_idx = item_idx - second_item_end; - d_unselected_out[num_rejected_prefix + rejection_idx] = item; + int rejection_idx = item_idx - second_item_end; + unselected_base[rejection_idx] = item; } } } @@ -400,11 +411,12 @@ struct AgentThreeWayPartition // Load items if (IS_LAST_TILE) { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + BlockLoadT(temp_storage.load_items) + .Load(d_in + streaming_context.input_offset() + tile_offset, items, num_tile_items); } else { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + BlockLoadT(temp_storage.load_items).Load(d_in + streaming_context.input_offset() + tile_offset, items); } // Initialize selection_flags @@ -464,11 +476,12 @@ struct AgentThreeWayPartition // Load items if (IS_LAST_TILE) { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + BlockLoadT(temp_storage.load_items) + .Load(d_in + streaming_context.input_offset() + tile_offset, items, num_tile_items); } else { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + BlockLoadT(temp_storage.load_items).Load(d_in + streaming_context.input_offset() + tile_offset, items); } // Initialize selection_flags @@ -551,7 +564,7 @@ struct AgentThreeWayPartition { // Blocks are launched in increasing order, so just assign one tile per block // Current tile index - const int tile_idx = static_cast((blockIdx.x * gridDim.y) + blockIdx.y); + const int tile_idx = blockIdx.x; // Global offset for the current tile const OffsetT tile_offset = tile_idx * TILE_ITEMS; @@ -572,9 +585,9 @@ struct AgentThreeWayPartition if (threadIdx.x == 0) { - // Output the total number of items selection_flags - d_num_selected_out[0] = AccumPackHelperT::first(accum); - d_num_selected_out[1] = AccumPackHelperT::second(accum); + // Update the number of selected items with this partition's selections + streaming_context.update_num_selected( + d_num_selected_out, AccumPackHelperT::first(accum), AccumPackHelperT::second(accum), num_items); } } } diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 490abb86bda..92605b5168d 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -606,8 +606,7 @@ private: { volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; - } - aliasable; + } aliasable; }; #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/config.cuh b/cub/cub/config.cuh index 16e7edd4905..d05078cdd2d 100644 --- a/cub/cub/config.cuh +++ b/cub/cub/config.cuh @@ -44,7 +44,6 @@ #endif // no system header #include // IWYU pragma: export -#include // IWYU pragma: export #include // IWYU pragma: export #include // IWYU pragma: export #include // IWYU pragma: export diff --git a/cub/cub/detail/detect_cuda_runtime.cuh b/cub/cub/detail/detect_cuda_runtime.cuh index d83b2c1179a..7666f9b2d23 100644 --- a/cub/cub/detail/detect_cuda_runtime.cuh +++ b/cub/cub/detail/detect_cuda_runtime.cuh @@ -73,40 +73,15 @@ 
*/ # define CUB_RUNTIME_FUNCTION -/** - * \def CUB_RUNTIME_ENABLED - * - * Whether or not the active compiler pass is allowed to invoke device kernels - * or methods from the CUDA runtime API. - * - * This macro should not be used in CUB, as it depends on `__CUDA_ARCH__` - * and is not compatible with `NV_IF_TARGET`. It is provided for legacy - * purposes only. - * - * Replace any usages with `CUB_RDC_ENABLED` and `NV_IF_TARGET`. - */ -# define CUB_RUNTIME_ENABLED - #else // Non-doxygen pass: # ifndef CUB_RUNTIME_FUNCTION - # if defined(__CUDACC_RDC__) && !defined(CUB_DISABLE_CDP) - # define CUB_RDC_ENABLED # define CUB_RUNTIME_FUNCTION _CCCL_HOST_DEVICE - # else // RDC disabled: - # define CUB_RUNTIME_FUNCTION _CCCL_HOST - # endif // RDC enabled - -# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) -// Legacy only -- do not use in new code. -# define CUB_RUNTIME_ENABLED -# endif - # endif // CUB_RUNTIME_FUNCTION predefined # ifdef CUB_RDC_ENABLED diff --git a/cub/cub/detail/fast_modulo_division.cuh b/cub/cub/detail/fast_modulo_division.cuh index 2ae65400e71..4a5f2048e32 100644 --- a/cub/cub/detail/fast_modulo_division.cuh +++ b/cub/cub/detail/fast_modulo_division.cuh @@ -109,8 +109,6 @@ multiply_extract_higher_bits(T value, R multiplier) { static_assert(supported_integral::value, "unsupported type"); static_assert(supported_integral::value, "unsupported type"); - _CCCL_DIAG_PUSH - _CCCL_DIAG_SUPPRESS_ICC(186) // pointless comparison of unsigned integer with zero _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(::cuda::std::is_signed, T)) { _CCCL_ASSERT(value >= 0, "value must be non-negative"); @@ -119,7 +117,6 @@ multiply_extract_higher_bits(T value, R multiplier) { _CCCL_ASSERT(multiplier >= 0, "multiplier must be non-negative"); } - _CCCL_DIAG_POP static constexpr int NumBits = sizeof(DivisorType) * CHAR_BIT; using unsigned_t = unsigned_implicit_prom_t; using larger_t = larger_unsigned_type_t; diff --git a/cub/cub/detail/strong_load.cuh b/cub/cub/detail/strong_load.cuh index 61693d808e2..b6ba4bb5fc8 100644 --- a/cub/cub/detail/strong_load.cuh +++ b/cub/cub/detail/strong_load.cuh @@ -59,14 +59,14 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint4 load_relaxed(uint4 const* ptr) uint4 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), + "=r"(retval.y), + "=r"(retval.z), + "=r"(retval.w) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), + "=r"(retval.y), + "=r"(retval.z), + "=r"(retval.w) : "l"(ptr) : "memory");)); return retval; } @@ -75,14 +75,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 load_relaxed(ulonglong2 const* ulonglong2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");)); return retval; } @@ -91,14 +85,14 @@ static 
_CCCL_DEVICE _CCCL_FORCEINLINE ushort4 load_relaxed(ushort4 const* ptr) ushort4 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];" - : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];" - : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), + "=h"(retval.y), + "=h"(retval.z), + "=h"(retval.w) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), + "=h"(retval.y), + "=h"(retval.z), + "=h"(retval.w) : "l"(ptr) : "memory");)); return retval; } @@ -107,46 +101,26 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_relaxed(uint2 const* ptr) uint2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned long long load_relaxed(unsigned long long const* ptr) { unsigned long long retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u64 %0, [%1];" - : "=l"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u64 %0, [%1];" - : "=l"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u64 %0, [%1];" : "=l"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u64 %0, [%1];" : "=l"(retval) : "l"(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_relaxed(unsigned int const* ptr) { unsigned int retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");)); return retval; } @@ -154,16 +128,9 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_relaxed(unsigned int con static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned short load_relaxed(unsigned short const* ptr) { unsigned short retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u16 %0, [%1];" - : "=h"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u16 %0, [%1];" - : "=h"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u16 %0, [%1];" : "=h"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u16 %0, [%1];" : "=h"(retval) : "l"(ptr) : "memory");)); return retval; } @@ -172,24 +139,16 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned char load_relaxed(unsigned char c unsigned short retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile( - "{" - " .reg .u8 datum;" - " ld.relaxed.gpu.u8 datum, [%1];" - " cvt.u16.u8 %0, datum;" - "}" - : "=h"(retval) - : "l"(ptr) - : "memory");), - (asm volatile( - "{" - " .reg .u8 datum;" - " ld.cg.u8 datum, [%1];" - " 
cvt.u16.u8 %0, datum;" - "}" - : "=h"(retval) - : "l"(ptr) - : "memory");)); + (asm volatile("{" + " .reg .u8 datum;" + " ld.relaxed.gpu.u8 datum, [%1];" + " cvt.u16.u8 %0, datum;" + "}" : "=h"(retval) : "l"(ptr) : "memory");), + (asm volatile("{" + " .reg .u8 datum;" + " ld.cg.u8 datum, [%1];" + " cvt.u16.u8 %0, datum;" + "}" : "=h"(retval) : "l"(ptr) : "memory");)); return (unsigned char) retval; } @@ -198,14 +157,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 load_acquire(ulonglong2 const* ulonglong2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory"); + (asm volatile("ld.acquire.gpu.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory"); __threadfence();)); return retval; } @@ -215,14 +168,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_acquire(uint2 const* ptr) uint2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory"); + (asm volatile("ld.acquire.gpu.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory"); __threadfence();)); return retval; } @@ -230,17 +177,9 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_acquire(uint2 const* ptr) static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_acquire(unsigned int const* ptr) { unsigned int retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory"); - __threadfence();)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.acquire.gpu.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory"); __threadfence();)); return retval; } diff --git a/cub/cub/detail/strong_store.cuh b/cub/cub/detail/strong_store.cuh index 9b8091738db..cc0e8f60e71 100644 --- a/cub/cub/detail/strong_store.cuh +++ b/cub/cub/detail/strong_store.cuh @@ -56,98 +56,61 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(uint4* ptr, uint4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");), - (asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");)); + (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "r"(val.x), + "r"(val.y), + "r"(val.z), + "r"(val.w) : "memory");), + (asm volatile( + "st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(ulonglong2* ptr, ulonglong2 val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");), - (asm volatile("st.cg.v2.u64 [%0], {%1, %2};" - : - : 
"l"(ptr), "l"(val.x), "l"(val.y) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");), + (asm volatile("st.cg.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(ushort4* ptr, ushort4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");), - (asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");)); + (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "h"(val.x), + "h"(val.y), + "h"(val.z), + "h"(val.w) : "memory");), + (asm volatile( + "st.cg.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(uint2* ptr, uint2 val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");), - (asm volatile("st.cg.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");), + (asm volatile("st.cg.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned long long* ptr, unsigned long long val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");), - (asm volatile("st.cg.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");), + (asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned int* ptr, unsigned int val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");), - (asm volatile("st.cg.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");), + (asm volatile("st.cg.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned short* ptr, unsigned short val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");), - (asm volatile("st.cg.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");), + (asm volatile("st.cg.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned char* ptr, unsigned char val) @@ -158,123 +121,77 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned char* ptr, uns " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.relaxed.gpu.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");), + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");), (asm volatile("{" " .reg .u8 datum;" " cvt.u8.u16 
datum, %1;" " st.cg.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");)); + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(uint4* ptr, uint4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");)); + (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "r"(val.x), + "r"(val.y), + "r"(val.z), + "r"(val.w) : "memory");), + (__threadfence(); asm volatile( + "st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(ulonglong2* ptr, ulonglong2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");)); + (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(ushort4* ptr, ushort4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");)); + (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "h"(val.x), + "h"(val.y), + "h"(val.z), + "h"(val.w) : "memory");), + (__threadfence(); asm volatile( + "st.cg.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(uint2* ptr, uint2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");)); + (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned long long* ptr, unsigned long long val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned int* ptr, unsigned int val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");), - (__threadfence(); - asm 
volatile("st.cg.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned short* ptr, unsigned short val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned char val) @@ -285,19 +202,15 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned c " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.release.gpu.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");), + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");), (__threadfence(); asm volatile( "{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.cg.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");)); + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");)); } } // namespace detail diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index c68f6cf4d61..1b9eef947fa 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -445,7 +445,8 @@ private: typename UnselectedOutputIteratorT, typename NumSelectedIteratorT, typename SelectFirstPartOp, - typename SelectSecondPartOp> + typename SelectSecondPartOp, + typename NumItemsT> CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t IfNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, @@ -454,12 +455,13 @@ private: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + NumItemsT num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) { - using OffsetT = int; + using ChooseOffsetT = detail::choose_signed_offset; + using OffsetT = typename ChooseOffsetT::type; using DispatchThreeWayPartitionIfT = DispatchThreeWayPartitionIf< InputIteratorT, FirstOutputIteratorT, @@ -470,6 +472,14 @@ private: SelectSecondPartOp, OffsetT>; + // Signed integer type for global offsets + // Check if the number of items exceeds the range covered by the selected signed offset type + cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items); + if (error) + { + return error; + } + return DispatchThreeWayPartitionIfT::Dispatch( d_temp_storage, temp_storage_bytes, @@ -625,6 +635,9 @@ public: //! @tparam SelectSecondPartOp //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. 
@@ -670,7 +683,8 @@ public: typename UnselectedOutputIteratorT, typename NumSelectedIteratorT, typename SelectFirstPartOp, - typename SelectSecondPartOp> + typename SelectSecondPartOp, + typename NumItemsT> CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, std::size_t& temp_storage_bytes, @@ -679,7 +693,7 @@ public: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + NumItemsT num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) @@ -706,7 +720,8 @@ public: typename UnselectedOutputIteratorT, typename NumSelectedIteratorT, typename SelectFirstPartOp, - typename SelectSecondPartOp> + typename SelectSecondPartOp, + typename NumItemsT> CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, std::size_t& temp_storage_bytes, @@ -715,7 +730,7 @@ public: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + NumItemsT num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream, @@ -729,7 +744,8 @@ public: UnselectedOutputIteratorT, NumSelectedIteratorT, SelectFirstPartOp, - SelectSecondPartOp>( + SelectSecondPartOp, + NumItemsT>( d_temp_storage, temp_storage_bytes, d_in, diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 90295f2c06f..fc259499b85 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -46,17 +46,97 @@ #include -#include #include #include CUB_NAMESPACE_BEGIN +namespace detail +{ + +namespace three_way_partition +{ +// Offset type used to instantiate the stream three-way-partition-kernel and agent to index the items within one +// partition +using per_partition_offset_t = ::cuda::std::int32_t; + +template +class streaming_context_t +{ +private: + bool first_partition = true; + bool last_partition = false; + TotalNumItemsT total_previous_num_items{}; + + // We use a double-buffer for keeping track of the number of previously selected items + TotalNumItemsT* d_num_selected_in = nullptr; + TotalNumItemsT* d_num_selected_out = nullptr; + +public: + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE + streaming_context_t(TotalNumItemsT* d_num_selected_in, TotalNumItemsT* d_num_selected_out, bool is_last_partition) + : last_partition(is_last_partition) + , d_num_selected_in(d_num_selected_in) + , d_num_selected_out(d_num_selected_out) + {} + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void advance(TotalNumItemsT num_items, bool next_partition_is_the_last) + { + ::cuda::std::swap(d_num_selected_in, d_num_selected_out); + first_partition = false; + last_partition = next_partition_is_the_last; + total_previous_num_items += num_items; + }; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT input_offset() const + { + return first_partition ? TotalNumItemsT{0} : total_previous_num_items; + }; + + _CCCL_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected_first() const + { + return first_partition ? TotalNumItemsT{0} : d_num_selected_in[0]; + }; + + _CCCL_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected_second() const + { + return first_partition ? 
TotalNumItemsT{0} : d_num_selected_in[1]; + }; + + _CCCL_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() const + { + return first_partition ? TotalNumItemsT{0} : d_num_selected_in[2]; + ; + }; + + template + _CCCL_DEVICE _CCCL_FORCEINLINE void update_num_selected( + NumSelectedIteratorT user_num_selected_out_it, + TotalNumItemsT num_selected_first, + TotalNumItemsT num_selected_second, + TotalNumItemsT num_items_in_partition) const + { + if (last_partition) + { + user_num_selected_out_it[0] = num_previously_selected_first() + num_selected_first; + user_num_selected_out_it[1] = num_previously_selected_second() + num_selected_second; + } + else + { + d_num_selected_out[0] = num_previously_selected_first() + num_selected_first; + d_num_selected_out[1] = num_previously_selected_second() + num_selected_second; + d_num_selected_out[2] = + num_previously_rejected() + (num_items_in_partition - num_selected_second - num_selected_first); + } + } +}; +} // namespace three_way_partition +} // namespace detail + /****************************************************************************** * Kernel entry points *****************************************************************************/ - template + typename OffsetT, + typename StreamingContextT> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionKernel( InputIteratorT d_in, @@ -78,7 +159,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, - int num_tiles) + int num_tiles, + _CCCL_GRID_CONSTANT const StreamingContextT streaming_context) { using AgentThreeWayPartitionPolicyT = typename ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy; @@ -91,7 +173,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO UnselectedOutputIteratorT, SelectFirstPartOp, SelectSecondPartOp, - OffsetT>; + OffsetT, + StreamingContextT>; // Shared memory for AgentThreeWayPartition __shared__ typename AgentThreeWayPartitionT::TempStorage temp_storage; @@ -105,7 +188,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO d_unselected_out, select_first_part_op, select_second_part_op, - num_items) + num_items, + streaming_context) .ConsumeRange(num_tiles, tile_status, d_num_selected_out); } @@ -160,14 +244,23 @@ template , OffsetT>> + typename PolicyHub = detail::three_way_partition:: + policy_hub, detail::three_way_partition::per_partition_offset_t>> struct DispatchThreeWayPartitionIf { /***************************************************************************** * Types and constants ****************************************************************************/ - using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; + // Offset type used to instantiate the three-way partition-kernel and agent to index the items within one partition + using per_partition_offset_t = detail::three_way_partition::per_partition_offset_t; + + // Type used to provide streaming information about each partition's context + static constexpr per_partition_offset_t partition_size = ::cuda::std::numeric_limits::max(); + + using streaming_context_t = detail::three_way_partition::streaming_context_t; + + using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; using AccumPackT = typename AccumPackHelperT::pack_t; using ScanTileStateT = cub::ScanTileState; @@ 
-222,64 +315,77 @@ struct DispatchThreeWayPartitionIf constexpr int block_threads = ActivePolicyT::ThreeWayPartitionPolicy::BLOCK_THREADS; constexpr int items_per_thread = ActivePolicyT::ThreeWayPartitionPolicy::ITEMS_PER_THREAD; + constexpr int tile_size = block_threads * items_per_thread; - do - { - // Get device ordinal - int device_ordinal; - error = CubDebug(cudaGetDevice(&device_ordinal)); - if (cudaSuccess != error) - { - break; - } + // The maximum number of items for which we will ever invoke the kernel (i.e. largest partition size) + auto const max_partition_size = + static_cast(::cuda::std::min(static_cast(num_items), static_cast(partition_size))); - // Number of input tiles - int tile_size = block_threads * items_per_thread; - int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + // The number of partitions required to "iterate" over the total input + auto const num_partitions = + (max_partition_size == 0) ? OffsetT{1} : ::cuda::ceil_div(num_items, max_partition_size); - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; // bytes needed for tile status descriptors + // The maximum number of tiles for which we will ever invoke the kernel + auto const max_num_tiles_per_invocation = static_cast(::cuda::ceil_div(max_partition_size, tile_size)); - error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); - if (cudaSuccess != error) - { - break; - } + // For streaming invocations, we need two sets (for double-buffering) of three counters each + constexpr ::cuda::std::size_t num_counters_per_pass = 3; + constexpr ::cuda::std::size_t num_streaming_counters = 2 * num_counters_per_pass; + ::cuda::std::size_t streaming_selection_storage_bytes = + (num_partitions > 1) ? num_streaming_counters * sizeof(OffsetT) : ::cuda::std::size_t{0}; - // Compute allocation pointers into the single storage blob (or compute - // the necessary size of the blob) - void* allocations[1] = {}; + // Specify temporary storage allocation requirements + size_t allocation_sizes[2] = {0ULL, streaming_selection_storage_bytes}; - error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - if (cudaSuccess != error) - { - break; - } + error = + CubDebug(ScanTileStateT::AllocationSize(static_cast(max_num_tiles_per_invocation), allocation_sizes[0])); + if (cudaSuccess != error) + { + return error; + } - if (d_temp_storage == nullptr) - { - // Return if the caller is simply requesting the size of the storage - // allocation - break; - } + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[2] = {}; - // Return if empty problem - if (num_items == 0) - { - break; - } + error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + return error; + } + + if (d_temp_storage == nullptr) + { + // Return if the caller is simply requesting the size of the storage + // allocation + return cudaSuccess; + } + + // Initialize the streaming context with the temporary storage for double-buffering the previously selected items + // and the total number (across all partitions) of items + OffsetT* tmp_num_selected_out = static_cast(allocations[1]); + streaming_context_t streaming_context{ + tmp_num_selected_out, (tmp_num_selected_out + num_counters_per_pass), (num_partitions <= 1)}; + + // Iterate over the partitions until all input is processed + for (OffsetT 
partition_idx = 0; partition_idx < num_partitions; partition_idx++) + { + OffsetT current_partition_offset = partition_idx * max_partition_size; + OffsetT current_num_items = + (partition_idx + 1 == num_partitions) ? (num_items - current_partition_offset) : max_partition_size; // Construct the tile status interface - ScanTileStateT tile_status; + const auto current_num_tiles = static_cast(::cuda::ceil_div(current_num_items, tile_size)); - error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); + // Construct the tile status interface + ScanTileStateT tile_status; + error = CubDebug(tile_status.Init(current_num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { - break; + return error; } // Log three_way_partition_init_kernel configuration - int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); + int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(current_num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking three_way_partition_init_kernel<<<%d, %d, 0, %lld>>>()\n", @@ -290,36 +396,29 @@ struct DispatchThreeWayPartitionIf // Invoke three_way_partition_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) - .doit(three_way_partition_init_kernel, tile_status, num_tiles, d_num_selected_out); + .doit(three_way_partition_init_kernel, tile_status, current_num_tiles, d_num_selected_out); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { - break; + return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { - break; + return error; } - // Get max x-dimension of grid - int max_dim_x; - error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); - if (cudaSuccess != error) + // No more items to process (note, we do not want to return early for num_items==0, because we need to make sure + // that `three_way_partition_init_kernel` has written '0' to d_num_selected_out) + if (current_num_items == 0) { - break; + return cudaSuccess; } - // Get grid size for scanning tiles - dim3 scan_grid_size; - scan_grid_size.z = 1; - scan_grid_size.y = ::cuda::ceil_div(num_tiles, max_dim_x); - scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); - // Log select_if_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG { @@ -330,14 +429,12 @@ struct DispatchThreeWayPartitionIf block_threads)); if (cudaSuccess != error) { - break; + return error; } - _CubLog("Invoking three_way_partition_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d " + _CubLog("Invoking three_way_partition_kernel<<<%d, %d, 0, %lld>>>(), %d " "items per thread, %d SM occupancy\n", - scan_grid_size.x, - scan_grid_size.y, - scan_grid_size.z, + current_num_tiles, block_threads, reinterpret_cast(stream), items_per_thread, @@ -346,7 +443,7 @@ struct DispatchThreeWayPartitionIf #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke select_if_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(current_num_tiles, block_threads, 0, stream) .doit(three_way_partition_kernel, d_in, d_first_part_out, @@ -356,23 +453,27 @@ struct DispatchThreeWayPartitionIf tile_status, select_first_part_op, select_second_part_op, - num_items, - num_tiles); + static_cast(current_num_items), + current_num_tiles, + 
streaming_context); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { - break; + return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { - break; + return error; } - } while (0); + + // Prepare streaming context for next partition (swap double buffers, advance number of processed items, etc.) + streaming_context.advance(current_num_items, (partition_idx + OffsetT{2} == num_partitions)); + } return error; } @@ -393,7 +494,8 @@ struct DispatchThreeWayPartitionIf ScanTileStateT, SelectFirstPartOp, SelectSecondPartOp, - OffsetT>); + per_partition_offset_t, + streaming_context_t>); } /** diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 386a6276dfa..fa4fa80d0ef 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -169,11 +169,10 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::st } template -_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) - -> decltype(poor_apply_impl( - ::cuda::std::forward(f), - ::cuda::std::forward(t), - ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::remove_reference_t>::value>{})) +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) -> decltype(poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::remove_reference_t>::value>{})) { return poor_apply_impl( ::cuda::std::forward(f), @@ -473,8 +472,9 @@ using needs_aligned_ptr_t = #ifdef _CUB_HAS_TRANSFORM_UBLKCP template ::value, int> = 0> -_CCCL_DEVICE _CCCL_FORCEINLINE auto select_kernel_arg( - ::cuda::std::integral_constant, kernel_arg&& arg) -> aligned_base_ptr>&& +_CCCL_DEVICE _CCCL_FORCEINLINE auto +select_kernel_arg(::cuda::std::integral_constant, kernel_arg&& arg) + -> aligned_base_ptr>&& { return ::cuda::std::move(arg.aligned_ptr); } @@ -660,10 +660,9 @@ struct dispatch_t - CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() - -> PoorExpected< - ::cuda::std:: - tuple> + CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() -> PoorExpected< + ::cuda::std:: + tuple> { using policy_t = typename ActivePolicy::algo_policy; constexpr int block_dim = policy_t::block_threads; diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 992398c5cfb..02bfb443fc1 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -613,6 +613,7 @@ struct policy_hub static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(KeyT), sizeof(AccumT))); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(AccumT); + template struct DefaultPolicy { static constexpr int nominal_4B_items_per_thread = 6; @@ -627,13 +628,13 @@ struct policy_hub AgentReduceByKeyPolicy<128, items_per_thread, BLOCK_LOAD_DIRECT, - LOAD_LDG, + LoadModifier, BLOCK_SCAN_WARP_SCANS, default_reduce_by_key_delay_constructor_t>; }; struct Policy350 - : DefaultPolicy + : DefaultPolicy , ChainedPolicy<350, Policy350, Policy350> {}; @@ -648,7 +649,7 @@ struct policy_hub typename Tuning::delay_constructor>; template - static auto select_agent_policy(long) -> typename 
DefaultPolicy::ReduceByKeyPolicyT; + static auto select_agent_policy(long) -> typename DefaultPolicy::ReduceByKeyPolicyT; struct Policy800 : ChainedPolicy<800, Policy800, Policy350> { @@ -657,7 +658,7 @@ struct policy_hub }; struct Policy860 - : DefaultPolicy + : DefaultPolicy , ChainedPolicy<860, Policy860, Policy800> {}; diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index 783da6820d5..33771f6882f 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -239,6 +239,7 @@ struct policy_hub static constexpr int max_input_bytes = static_cast(::cuda::std::max(sizeof(KeyT), sizeof(LengthT))); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(LengthT); + template struct DefaultPolicy { static constexpr int nominal_4B_items_per_thread = 6; @@ -252,14 +253,14 @@ struct policy_hub AgentReduceByKeyPolicy<128, items, BLOCK_LOAD_DIRECT, - LOAD_LDG, + LoadModifier, BLOCK_SCAN_WARP_SCANS, default_reduce_by_key_delay_constructor_t>; }; // SM35 struct Policy350 - : DefaultPolicy + : DefaultPolicy , ChainedPolicy<350, Policy350, Policy350> {}; @@ -273,7 +274,7 @@ struct policy_hub BLOCK_SCAN_WARP_SCANS, typename Tuning::delay_constructor>; template - static auto select_agent_policy(long) -> typename DefaultPolicy::ReduceByKeyPolicyT; + static auto select_agent_policy(long) -> typename DefaultPolicy::ReduceByKeyPolicyT; // SM80 struct Policy800 : ChainedPolicy<800, Policy800, Policy350> @@ -283,7 +284,7 @@ struct policy_hub // SM86 struct Policy860 - : DefaultPolicy + : DefaultPolicy , ChainedPolicy<860, Policy860, Policy800> {}; @@ -433,7 +434,7 @@ struct sm90_tuning struct policy_hub { - template + template struct DefaultPolicy { static constexpr int nominal_4B_items_per_thread = 15; @@ -444,7 +445,7 @@ struct policy_hub AgentRlePolicy<96, ITEMS_PER_THREAD, BlockLoad, - LOAD_LDG, + LoadModifier, true, BLOCK_SCAN_WARP_SCANS, default_reduce_by_key_delay_constructor_t>; @@ -452,7 +453,7 @@ struct policy_hub // SM35 struct Policy350 - : DefaultPolicy // TODO(bgruber): I think we want `LengthT` instead of `int` + : DefaultPolicy // TODO(bgruber): I think we want `LengthT` instead of `int` , ChainedPolicy<350, Policy350, Policy350> {}; @@ -467,7 +468,8 @@ struct policy_hub BLOCK_SCAN_WARP_SCANS, typename Tuning::delay_constructor>; template - static auto select_agent_policy(long) -> typename DefaultPolicy::RleSweepPolicyT; + static auto select_agent_policy(long) -> + typename DefaultPolicy::RleSweepPolicyT; // SM80 struct Policy800 : ChainedPolicy<800, Policy800, Policy350> @@ -477,7 +479,7 @@ struct policy_hub // SM86 struct Policy860 - : DefaultPolicy // TODO(bgruber): I think we want `LengthT` instead of `int` + : DefaultPolicy // TODO(bgruber): I think we want `LengthT` instead of `int` , ChainedPolicy<860, Policy860, Policy800> {}; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index ac5dbfc5868..2efa551d4c6 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -108,14 +108,6 @@ constexpr accum_size classify_accum_size() : accum_size::unknown; } -template -struct tuning -{ - static constexpr int threads = Threads; - static constexpr int items = Items; - using delay_constructor = fixed_delay_constructor_t; -}; - template (), @@ -205,17 +197,31 @@ template ()> struct sm90_tuning; +template +struct 
sm90_tuning_vals +{ + static constexpr int threads = Threads; + static constexpr int items = Items; + using delay_constructor = fixed_delay_constructor_t; + // same logic as default policy: + static constexpr bool large_values = sizeof(AccumT) > 128; + static constexpr BlockLoadAlgorithm load_algorithm = + large_values ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; + static constexpr BlockStoreAlgorithm store_algorithm = + large_values ? BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; +}; + // clang-format off -template struct sm90_tuning : tuning<192, 22, 168, 1140> {}; -template struct sm90_tuning : tuning<512, 12, 376, 1125> {}; -template struct sm90_tuning : tuning<128, 24, 648, 1245> {}; -template struct sm90_tuning : tuning<224, 24, 632, 1290> {}; +template struct sm90_tuning : sm90_tuning_vals {}; +template struct sm90_tuning : sm90_tuning_vals {}; +template struct sm90_tuning : sm90_tuning_vals {}; +template struct sm90_tuning : sm90_tuning_vals {}; -template <> struct sm90_tuning : tuning<128, 24, 688, 1140> {}; -template <> struct sm90_tuning : tuning<224, 24, 576, 1215> {}; +template <> struct sm90_tuning : sm90_tuning_vals {}; +template <> struct sm90_tuning : sm90_tuning_vals {}; #if CUB_IS_INT128_ENABLED -template <> struct sm90_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : tuning<576, 21, 860, 630> {}; +template <> struct sm90_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : sm90_tuning_vals<__int128_t, 576, 21, 860, 630> {}; template <> struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : sm90_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh index c6894ccbc86..3645e4b9ed7 100644 --- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh @@ -255,9 +255,8 @@ struct policy_hub typename Tuning::delay_constructor>; template - static auto select_agent_policy(long) -> - typename DefaultPolicy< - default_delay_constructor_t::pack_t>>::ThreeWayPartitionPolicy; + static auto select_agent_policy(long) -> typename DefaultPolicy< + default_delay_constructor_t::pack_t>>::ThreeWayPartitionPolicy; struct Policy800 : ChainedPolicy<800, Policy800, Policy350> { diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh index 74ff85d6153..f2ae69fc091 100644 --- a/cub/cub/grid/grid_barrier.cuh +++ b/cub/cub/grid/grid_barrier.cuh @@ -50,8 +50,10 @@ CUB_NAMESPACE_BEGIN /** * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid + * + * deprecated [Since 2.9.0] */ -class GridBarrier +class CCCL_DEPRECATED_BECAUSE("Use the APIs from cooperative groups instead") GridBarrier { protected: using SyncFlag = unsigned int; @@ -131,8 +133,11 @@ public: * * Uses RAII for lifetime, i.e., device resources are reclaimed when * the destructor is called. 
+ * + * deprecated [Since 2.9.0] */ -class GridBarrierLifetime : public GridBarrier +_CCCL_SUPPRESS_DEPRECATED_PUSH +class CCCL_DEPRECATED_BECAUSE("Use the APIs from cooperative groups instead") GridBarrierLifetime : public GridBarrier { protected: // Number of bytes backed by d_sync @@ -211,5 +216,6 @@ public: return retval; } }; +_CCCL_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_END diff --git a/cub/cub/host/mutex.cuh b/cub/cub/host/mutex.cuh deleted file mode 100644 index efffa159ff1..00000000000 --- a/cub/cub/host/mutex.cuh +++ /dev/null @@ -1,70 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Simple portable mutex - */ - -#pragma once - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include - -CUB_NAMESPACE_BEGIN - -/** - * Wraps std::mutex - * deprecated [Since CUB 2.1.0] The `cub::Mutex` is deprecated and will be removed - * in a future release. Use `std::mutex` instead. 
- */ -struct CCCL_DEPRECATED Mutex -{ - std::mutex mtx; - - void Lock() - { - mtx.lock(); - } - - void Unlock() - { - mtx.unlock(); - } -}; - -CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 7af32df392c..feef89776a9 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -391,8 +391,8 @@ struct CCCL_DEPRECATED BinaryFlip {} template <typename T, typename U> - _CCCL_DEVICE auto - operator()(T&& t, U&& u) -> decltype(binary_op(::cuda::std::forward<U>(u), ::cuda::std::forward<T>(t))) + _CCCL_DEVICE auto operator()(T&& t, U&& u) + -> decltype(binary_op(::cuda::std::forward<U>(u), ::cuda::std::forward<T>(t))) { return binary_op(::cuda::std::forward<U>(u), ::cuda::std::forward<T>(t)); } diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 5727b395b04..d3850051ca7 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -543,8 +543,8 @@ ThreadReduceTernaryTree(const Input& input, ReductionOp reduction_op) // never reached. Protect instantiation of ThreadReduceSimd with arbitrary types and operators _CCCL_TEMPLATE(typename Input, typename ReductionOp) _CCCL_REQUIRES((!cub::internal::enable_generic_simd_reduction<Input, ReductionOp>())) -_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto -ThreadReduceSimd(const Input& input, ReductionOp) -> ::cuda::std::remove_cvref_t<decltype(input[0])> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp) + -> ::cuda::std::remove_cvref_t<decltype(input[0])> { assert(false); return input[0]; @@ -552,8 +552,8 @@ ThreadReduceSimd(const Input& input, ReductionOp) -> ::cuda::std::remove_cvref_t _CCCL_TEMPLATE(typename Input, typename ReductionOp) _CCCL_REQUIRES((cub::internal::enable_generic_simd_reduction<Input, ReductionOp>())) -_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto -ThreadReduceSimd(const Input& input, ReductionOp reduction_op) -> ::cuda::std::remove_cvref_t<decltype(input[0])> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp reduction_op) + -> ::cuda::std::remove_cvref_t<decltype(input[0])> { using cub::detail::unsafe_bitcast; using T = ::cuda::std::remove_cvref_t<decltype(input[0])>; @@ -627,7 +627,8 @@ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& ::cuda::minimum<>, ::cuda::minimum<ValueT>, cub::internal::SimdMin<ValueT>, - cub::internal::SimdMax<ValueT>>()) + cub::internal::SimdMax<ValueT>>() + || sizeof(ValueT) >= 8) { return cub::internal::ThreadReduceSequential<AccumT>(input, reduction_op); } diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index 1d6d7289b78..b1da6a03b5d 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -54,9 +54,6 @@ CUB_NAMESPACE_BEGIN #ifndef _CCCL_DOXYGEN_INVOKED // Do not document -// \deprecated [Since 2.1.0] -# define CUB_USE_COOPERATIVE_GROUPS /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are /// compiling. In host code, CUB_PTX_ARCH's value is implementation defined. # ifndef CUB_PTX_ARCH # if # endif # endif -// These definitions were intended for internal use only and are now obsolete. -// If you relied on them, consider porting your code to use the functionality -// in libcu++'s <nv/target> header. -// For a temporary workaround, define CUB_PROVIDE_LEGACY_ARCH_MACROS to make -// them available again. These should be considered deprecated and will be -// fully removed in a future version.
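The removed block above points developers at libcu++'s <nv/target> header as the replacement for the legacy CUB_IS_DEVICE_CODE / CUB_IS_HOST_CODE macro family. A minimal sketch of the replacement pattern follows; the function name and bodies are illustrative only (not from this patch), while NV_IF_ELSE_TARGET and NV_IS_DEVICE are the documented <nv/target> facilities:

#include <nv/target>
#include <cstdio>

__host__ __device__ void where_am_i()
{
  // One branch is selected at compile time for each compilation pass,
  // instead of branching on the removed CUB_IS_DEVICE_CODE macro.
  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
                    (printf("device pass\n");),
                    (printf("host pass\n");))
}

Unlike the old macros, this works uniformly under NVCC, NVC++, and CUDA-capable Clang, which is why the CUB_PROVIDE_LEGACY_ARCH_MACROS escape hatch was removed outright.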
-# ifdef CUB_PROVIDE_LEGACY_ARCH_MACROS -# ifndef CUB_IS_DEVICE_CODE -# if defined(_NVHPC_CUDA) -# define CUB_IS_DEVICE_CODE __builtin_is_device_code() -# define CUB_IS_HOST_CODE (!__builtin_is_device_code()) -# define CUB_INCLUDE_DEVICE_CODE 1 -# define CUB_INCLUDE_HOST_CODE 1 -# elif CUB_PTX_ARCH > 0 -# define CUB_IS_DEVICE_CODE 1 -# define CUB_IS_HOST_CODE 0 -# define CUB_INCLUDE_DEVICE_CODE 1 -# define CUB_INCLUDE_HOST_CODE 0 -# else -# define CUB_IS_DEVICE_CODE 0 -# define CUB_IS_HOST_CODE 1 -# define CUB_INCLUDE_DEVICE_CODE 0 -# define CUB_INCLUDE_HOST_CODE 1 -# endif -# endif -# endif // CUB_PROVIDE_LEGACY_ARCH_MACROS - /// Maximum number of devices supported. # ifndef CUB_MAX_DEVICES # define CUB_MAX_DEVICES (128) diff --git a/cub/cub/util_compiler.cuh b/cub/cub/util_compiler.cuh index b34a889fd21..8279c6e1fbd 100644 --- a/cub/cub/util_compiler.cuh +++ b/cub/cub/util_compiler.cuh @@ -42,69 +42,3 @@ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header - -// enumerate host compilers we know about -//! deprecated [Since 2.7] -#define CUB_HOST_COMPILER_UNKNOWN 0 -//! deprecated [Since 2.7] -#define CUB_HOST_COMPILER_MSVC 1 -//! deprecated [Since 2.7] -#define CUB_HOST_COMPILER_GCC 2 -//! deprecated [Since 2.7] -#define CUB_HOST_COMPILER_CLANG 3 - -// enumerate device compilers we know about -//! deprecated [Since 2.7] -#define CUB_DEVICE_COMPILER_UNKNOWN 0 -//! deprecated [Since 2.7] -#define CUB_DEVICE_COMPILER_MSVC 1 -//! deprecated [Since 2.7] -#define CUB_DEVICE_COMPILER_GCC 2 -//! deprecated [Since 2.7] -#define CUB_DEVICE_COMPILER_NVCC 3 -//! deprecated [Since 2.7] -#define CUB_DEVICE_COMPILER_CLANG 4 - -// figure out which host compiler we're using -#if _CCCL_COMPILER(MSVC) -//! deprecated [Since 2.7] -# define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC -//! deprecated [Since 2.7] -# define CUB_MSVC_VERSION _MSC_VER -//! deprecated [Since 2.7] -# define CUB_MSVC_VERSION_FULL _MSC_FULL_VER -#elif _CCCL_COMPILER(CLANG) -//! deprecated [Since 2.7] -# define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG -//! deprecated [Since 2.7] -# define CUB_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) -#elif _CCCL_COMPILER(GCC) -//! deprecated [Since 2.7] -# define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC -//! deprecated [Since 2.7] -# define CUB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#endif - -// figure out which device compiler we're using -#if _CCCL_CUDA_COMPILER(NVCC) || _CCCL_CUDA_COMPILER(NVHPC) -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC -#elif _CCCL_COMPILER(MSVC) -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC -#elif _CCCL_COMPILER(GCC) -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC -#elif _CCCL_COMPILER(CLANG) -// CUDA-capable clang should behave similar to NVCC. -# if _CCCL_CUDA_COMPILER(NVCC) -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC -# else -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG -# endif -#else -//! 
deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN -#endif diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index 6a85b971884..a6eee36539c 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -40,23 +40,13 @@ # pragma system_header #endif // no system header -#include // IWYU pragma: export - #ifndef _CCCL_DOXYGEN_INVOKED // Do not document // Deprecation warnings may be silenced by defining the following macros. These // may be combined. -// - CCCL_IGNORE_DEPRECATED_CPP_DIALECT: -// Ignore all deprecated C++ dialects and outdated compilers. -// - CCCL_IGNORE_DEPRECATED_CPP_11: -// Ignore deprecation warnings when compiling with C++11. C++03 and outdated -// compilers will still issue warnings. -// - CCCL_IGNORE_DEPRECATED_CPP_14: -// Ignore deprecation warnings when compiling with C++14. C++03 and outdated -// compilers will still issue warnings. // - CCCL_IGNORE_DEPRECATED_COMPILER // Ignore deprecation warnings when using deprecated compilers. Compiling -// with C++03, C++11 and C++14 will still issue warnings. +// with deprecated C++ dialects will still issue warnings. # define CUB_CPP_DIALECT _CCCL_STD_VER @@ -67,6 +57,7 @@ # define CUB_COMP_DEPR_IMPL(msg) _CCCL_PRAGMA(GCC warning #msg) # endif +// Compiler checks: // clang-format off # define CUB_COMPILER_DEPRECATION(REQ) \ CUB_COMP_DEPR_IMPL(CUB requires at least REQ. Define CCCL_IGNORE_DEPRECATED_COMPILER to suppress this message.) @@ -74,14 +65,12 @@ # define CUB_COMPILER_DEPRECATION_SOFT(REQ, CUR) \ CUB_COMP_DEPR_IMPL( \ CUB requires at least REQ. CUR is deprecated but still supported. CUR support will be removed in a \ - future release. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) + future release. Define CCCL_IGNORE_DEPRECATED_COMPILER to suppress this message.) // clang-format on # ifndef CCCL_IGNORE_DEPRECATED_COMPILER - -// Compiler checks: -# if _CCCL_COMPILER(GCC, <, 5) -CUB_COMPILER_DEPRECATION(GCC 5.0); +# if _CCCL_COMPILER(GCC, <, 7) +CUB_COMPILER_DEPRECATION(GCC 7.0); # elif _CCCL_COMPILER(CLANG, <, 7) CUB_COMPILER_DEPRECATION(Clang 7.0); # elif _CCCL_COMPILER(MSVC, <, 19, 10) @@ -91,24 +80,18 @@ CUB_COMPILER_DEPRECATION(MSVC 2019(19.20 / 16.0 / 14.20)); // >=2017, <2019. Soft deprecation message: CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); # endif - # endif // CCCL_IGNORE_DEPRECATED_COMPILER -# if _CCCL_STD_VER < 2011 -// = 2017 - # undef CUB_COMPILER_DEPRECATION_SOFT # undef CUB_COMPILER_DEPRECATION + +// C++17 dialect check: +# ifndef CCCL_IGNORE_DEPRECATED_CPP_DIALECT +# if _CCCL_STD_VER < 2017 +CUB_COMP_DEPR_IMPL(CUB requires at least C++ 17. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) 
+# endif // _CCCL_STD_VER >= 2017 +# endif + # undef CUB_COMP_DEPR_IMPL -# undef CUB_COMP_DEPR_IMPL0 -# undef CUB_COMP_DEPR_IMPL1 #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index ae42e5fe875..c58d90682e1 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -112,12 +112,6 @@ _CCCL_DIAG_SUPPRESS_CLANG("-Wattributes") # if !_CCCL_CUDA_COMPILER(NVHPC) _CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) # endif // !_CCCL_CUDA_COMPILER(NVHPC) -# if _CCCL_COMPILER(ICC) -# pragma nv_diag_suppress 1407 // the "__visibility__" attribute can only appear on functions and - // variables with external linkage' -# pragma warning(disable : 1890) // the "__visibility__" attribute can only appear on functions and - // variables with external linkage' -# endif // _CCCL_COMPILER(ICC) #endif // !CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION #ifndef CUB_DEFINE_KERNEL_GETTER diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index c6be2a5209f..4d1db99a821 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -82,9 +82,9 @@ CUB_NAMESPACE_BEGIN # endif // !defined(__CUDACC_RTC_INT128__) # else // !defined(__CUDACC_RTC__) # if _CCCL_CUDACC_AT_LEAST(11, 5) -# if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(ICC) || _CCCL_COMPILER(NVHPC) +# if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) # define CUB_IS_INT128_ENABLED 1 -# endif // GCC || CLANG || ICC || NVHPC +# endif // GCC || CLANG || NVHPC # endif // _CCCL_CUDACC_AT_LEAST(11, 5) # endif // !defined(__CUDACC_RTC__) #endif // !defined(CUB_IS_INT128_ENABLED) diff --git a/cub/cub/version.cuh b/cub/cub/version.cuh index 2d5232939c8..19024741926 100644 --- a/cub/cub/version.cuh +++ b/cub/cub/version.cuh @@ -58,7 +58,7 @@ * CUB_VERSION / 100 % 1000 is the minor version. * CUB_VERSION / 100000 is the major version. */ -#define CUB_VERSION 200800 // macro expansion with ## requires this to be a single value +#define CUB_VERSION 300000 // macro expansion with ## requires this to be a single value /*! 
\def CUB_MAJOR_VERSION * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 17201c4704f..c86d24754de 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -227,10 +227,8 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) endif() # CUB_SEPARATE_CATCH2 if ("${test_target}" MATCHES "nvrtc") - target_compile_definitions(${test_target} PRIVATE NVRTC_CUB_PATH="-I${CMAKE_SOURCE_DIR}/cub") - target_compile_definitions(${test_target} PRIVATE NVRTC_THRUST_PATH="-I${CMAKE_SOURCE_DIR}/thrust") - target_compile_definitions(${test_target} PRIVATE NVRTC_LIBCUDACXX_PATH="-I${CMAKE_SOURCE_DIR}/libcudacxx/include") - target_compile_definitions(${test_target} PRIVATE NVRTC_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}") + configure_file("cmake/nvrtc_args.h.in" ${CMAKE_CURRENT_BINARY_DIR}/nvrtc_args.h) + target_include_directories(${test_target} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) endif() if ("${test_target}" MATCHES "test.iterator") diff --git a/cub/test/catch2_segmented_sort_helper.cuh b/cub/test/catch2_segmented_sort_helper.cuh index 3e93b3f0d62..0852921bebf 100644 --- a/cub/test/catch2_segmented_sort_helper.cuh +++ b/cub/test/catch2_segmented_sort_helper.cuh @@ -1388,11 +1388,11 @@ struct generate_edge_case_offsets_dispatch static constexpr int a_bunch_of = 42; static constexpr int a_lot_of = 420; - int small_segment_max_segment_size; - int items_per_small_segment; - int medium_segment_max_segment_size; - int single_thread_segment_size; - int large_cached_segment_max_segment_size; + int small_segment_max_segment_size{}; + int items_per_small_segment{}; + int medium_segment_max_segment_size{}; + int single_thread_segment_size{}; + int large_cached_segment_max_segment_size{}; template CUB_RUNTIME_FUNCTION cudaError_t Invoke() diff --git a/cub/test/catch2_test_device_for_each_in_extents.cu b/cub/test/catch2_test_device_for_each_in_extents.cu index 6f11810101c..3e5a6c6689a 100644 --- a/cub/test/catch2_test_device_for_each_in_extents.cu +++ b/cub/test/catch2_test_device_for_each_in_extents.cu @@ -55,7 +55,7 @@ static auto fill_linear_impl(c2h::host_vector& vector, const ExtentType&, siz _CCCL_TRAILING_REQUIRES(void)((Rank == ExtentType::rank())) { vector[pos++] = {indices...}; - return void(); // Intel and nvc++ require a return statement + return void(); // nvc++ requires a return statement } template @@ -67,7 +67,7 @@ static auto fill_linear_impl(c2h::host_vector& vector, const ExtentType& ext, { fill_linear_impl(vector, ext, pos, indices..., i); } - return void(); // Intel and nvc++ require a return statement + return void(); // nvc++ requires a return statement } template @@ -135,8 +135,8 @@ using dimensions = cuda::std::index_sequence<3, 2, 5, 4>>; template -auto build_static_extents(IndexType, - cuda::std::index_sequence) -> cuda::std::extents +auto build_static_extents(IndexType, cuda::std::index_sequence) + -> cuda::std::extents { return {}; } diff --git a/cub/test/catch2_test_device_three_way_partition.cu b/cub/test/catch2_test_device_three_way_partition.cu index 8c6524adf7c..3b5f96c8d60 100644 --- a/cub/test/catch2_test_device_three_way_partition.cu +++ b/cub/test/catch2_test_device_three_way_partition.cu @@ -30,6 +30,10 @@ #include +#include +#include +#include +#include #include #include #include @@ -38,6 +42,8 @@ #include +#include "catch2_large_problem_helper.cuh" +#include "catch2_test_device_select_common.cuh" #include "catch2_test_launch_helper.h" #include 
"cub/util_type.cuh" #include @@ -48,20 +54,8 @@ DECLARE_LAUNCH_WRAPPER(cub::DevicePartition::If, partition); using types = c2h::type_list; -template -struct less_than_t -{ - T compare; - - explicit __host__ less_than_t(T compare) - : compare(compare) - {} - - __device__ bool operator()(const T& a) const - { - return a < compare; - } -}; +// List of offset types to be used for testing large number of items +using offset_types = c2h::type_list; template struct equal_to_t @@ -103,6 +97,29 @@ struct count_to_pair_t } }; +template +struct mod_equal_to +{ + T mod; + T val; + __host__ __device__ bool operator()(T x) const + { + return x % mod == val; + } +}; + +template +struct multiply_and_add +{ + T mul; + T add; + + __host__ __device__ T operator()(T x) const + { + return x * mul + add; + } +}; + C2H_TEST("Device three-way partition can handle empty problems", "[partition][device]", types) { using type = typename c2h::get<0, TestType>; @@ -113,12 +130,15 @@ C2H_TEST("Device three-way partition can handle empty problems", "[partition][de type* d_first_part_out{}; type* d_second_part_out{}; type* d_unselected_out{}; - type* d_num_selected_out{}; + c2h::device_vector num_selected_out{42, 42}; + type* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); less_than_t le(type{0}); greater_or_equal_t ge(type{1}); partition(in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, num_items, le, ge); + REQUIRE(num_selected_out[0] == 0); + REQUIRE(num_selected_out[1] == 0); } template @@ -440,3 +460,60 @@ C2H_TEST("Device three-way partition handles single output", "[partition][device second_part_val); REQUIRE(actual_num_items_in_second_part == num_items_in_second_part); } + +C2H_TEST("Device three-way partition works for very large number of items", "[device][partition]", offset_types) +try +{ + using offset_t = typename c2h::get<0, TestType>; + + auto num_items_max_ull = + std::min(static_cast(::cuda::std::numeric_limits::max()), + ::cuda::std::numeric_limits::max() + static_cast(2000000ULL)); + offset_t num_items_max = static_cast(num_items_max_ull); + offset_t num_items_min = + num_items_max_ull > 10000 ? 
static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; + offset_t num_items = GENERATE_COPY( + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); + + auto in = thrust::make_counting_iterator(offset_t{0}); + + auto first_selector = mod_equal_to{3, 0}; + auto second_selector = mod_equal_to{3, 1}; + + offset_t expected_first = num_items / offset_t{3} + (num_items % offset_t{3} >= 1); + offset_t expected_second = num_items / offset_t{3} + (num_items % offset_t{3} >= 2); + offset_t expected_third = num_items / offset_t{3}; + + auto expected_first_it = thrust::make_transform_iterator(in, multiply_and_add{3, 0}); + auto expected_second_it = thrust::make_transform_iterator(in, multiply_and_add{3, 1}); + auto expected_third_it = thrust::make_transform_iterator(in, multiply_and_add{3, 2}); + + // Prepare tabulate output iterators to verify results in a memory-efficient way + auto check_first_partition_helper = detail::large_problem_test_helper(expected_first); + auto check_first_it = check_first_partition_helper.get_flagging_output_iterator(expected_first_it); + auto check_second_partition_helper = detail::large_problem_test_helper(expected_second); + auto check_second_it = check_second_partition_helper.get_flagging_output_iterator(expected_second_it); + auto check_third_partition_helper = detail::large_problem_test_helper(expected_third); + auto check_third_it = check_third_partition_helper.get_flagging_output_iterator(expected_third_it); + + // Needs to be device accessible + c2h::device_vector num_selected_out{0, 0}; + offset_t* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + partition( + in, check_first_it, check_second_it, check_third_it, d_num_selected_out, num_items, first_selector, second_selector); + + // Ensure that we created the correct output + REQUIRE(num_selected_out[0] == expected_first); + REQUIRE(num_selected_out[1] == expected_second); + check_first_partition_helper.check_all_results_correct(); + check_second_partition_helper.check_all_results_correct(); + check_third_partition_helper.check_all_results_correct(); +} +catch (std::bad_alloc&) +{ + // Exceeding memory is not a failure. 
+} diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 06f2b7c31a7..95c4794b8cf 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -166,8 +166,8 @@ struct alignas(Alignment) overaligned_addable_t return a.value == b.value; } - _CCCL_HOST_DEVICE friend auto - operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) -> overaligned_addable_t + _CCCL_HOST_DEVICE friend auto operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) + -> overaligned_addable_t { check(a); check(b); diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu index 01f39027ce0..71187ecc83a 100644 --- a/cub/test/catch2_test_nvrtc.cu +++ b/cub/test/catch2_test_nvrtc.cu @@ -31,6 +31,7 @@ #include #include +#include TEST_CASE("Test nvrtc", "[test][nvrtc]") { diff --git a/cub/test/cmake/check_source_files.cmake b/cub/test/cmake/check_source_files.cmake index 1554a2256e1..1fba8476f67 100644 --- a/cub/test/cmake/check_source_files.cmake +++ b/cub/test/cmake/check_source_files.cmake @@ -83,24 +83,6 @@ if (NOT valid_count EQUAL 5) "Matched ${valid_count} times, expected 5.") endif() -################################################################################ -# Legacy macro checks. -# Check all files in CUB to make sure that they aren't using the legacy -# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros. -# -# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET. -# They are provided for legacy purposes and should be replaced with -# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code. -# -# -set(legacy_macro_header_exclusions - # This header defines a legacy CUDART macro: - cub/detail/detect_cuda_runtime.cuh -) - -set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED") -set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__") - ################################################################################ # Read source files: foreach(src ${cub_srcs}) @@ -156,21 +138,6 @@ foreach(src ${cub_srcs}) set(found_errors 1) endif() endif() - - if (NOT ${src} IN_LIST legacy_macro_header_exclusions) - count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count) - count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count) - - if (NOT thrust_count EQUAL 0) - message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.") - set(found_errors 1) - endif() - - if (NOT cub_count EQUAL 0) - message("'${src}' uses CUB_RUNTIME_ENABLED. 
Replace with CUB_RDC_ENABLED and NV_IF_TARGET.") - set(found_errors 1) - endif() - endif() endforeach() if (NOT found_errors EQUAL 0) diff --git a/cub/test/cmake/nvrtc_args.h.in b/cub/test/cmake/nvrtc_args.h.in new file mode 100644 index 00000000000..215804ad0f0 --- /dev/null +++ b/cub/test/cmake/nvrtc_args.h.in @@ -0,0 +1,6 @@ +#pragma once + +const char* NVRTC_CUB_PATH = "-I@CMAKE_SOURCE_DIR@/cub"; +const char* NVRTC_THRUST_PATH = "-I@CMAKE_SOURCE_DIR@/thrust"; +const char* NVRTC_LIBCUDACXX_PATH = "-I@CMAKE_SOURCE_DIR@/libcudacxx/include"; +const char* NVRTC_CTK_PATH = "-I@CUDAToolkit_INCLUDE_DIRS@"; diff --git a/cub/test/test_block_radix_rank.cu b/cub/test/test_block_radix_rank.cu index 8c1df1a80c7..c53c6b179e3 100644 --- a/cub/test/test_block_radix_rank.cu +++ b/cub/test/test_block_radix_rank.cu @@ -310,7 +310,7 @@ void Test() Test(); Test(); - Test(cub::Int2Type<(BlockThreads % 32) == 0>{}); + Test(cub::Int2Type < (BlockThreads % 32) == 0 > {}); } int main(int argc, char** argv) diff --git a/cub/test/test_grid_barrier.cu b/cub/test/test_grid_barrier.cu index 2f5ecfa3ebb..e763b48d1e2 100644 --- a/cub/test/test_grid_barrier.cu +++ b/cub/test/test_grid_barrier.cu @@ -47,7 +47,9 @@ using namespace cub; /** * Kernel that iterates through the specified number of software global barriers */ -__global__ void Kernel(GridBarrier global_barrier, int iterations) +_CCCL_SUPPRESS_DEPRECATED_PUSH +__global__ void Kernel(GridBarrier global_barrier, int iterations) // + _CCCL_SUPPRESS_DEPRECATED_POP { for (int i = 0; i < iterations; i++) { @@ -126,7 +128,9 @@ int main(int argc, char** argv) fflush(stdout); // Init global barrier + _CCCL_SUPPRESS_DEPRECATED_PUSH GridBarrierLifetime global_barrier; + _CCCL_SUPPRESS_DEPRECATED_POP global_barrier.Setup(grid_size); // Time kernel diff --git a/cub/test/test_warning_suppression.cuh b/cub/test/test_warning_suppression.cuh index 46c6080fed7..448230343f3 100644 --- a/cub/test/test_warning_suppression.cuh +++ b/cub/test/test_warning_suppression.cuh @@ -27,7 +27,6 @@ #pragma once -#include #include // C4127: conditional expression is constant diff --git a/cudax/examples/stf/void_data_interface.cu b/cudax/examples/stf/void_data_interface.cu index 72ac76e6fe1..9e7b5096cd7 100644 --- a/cudax/examples/stf/void_data_interface.cu +++ b/cudax/examples/stf/void_data_interface.cu @@ -23,14 +23,23 @@ int main() { context ctx; - auto ltask_res = ctx.logical_data(shape_of()); - ctx.task(ltask_res.write())->*[](cudaStream_t, auto) { + auto token = ctx.logical_data(shape_of()); + ctx.task(token.write())->*[](cudaStream_t, auto) { }; void_interface sync; - auto ltask2_res = ctx.logical_data(sync); - ctx.task(ltask2_res.write(), ltask_res.read())->*[](cudaStream_t, auto, auto) { + auto token2 = ctx.logical_data(sync); + + auto token3 = ctx.logical_token(); + ctx.task(token2.write(), token.read())->*[](cudaStream_t, auto, auto) { + + }; + + // Do not pass useless arguments by removing void_interface arguments + // Note that the rw() access is possible even if there was no prior write() + // or actual underlying data. 
+ ctx.task(token3.rw(), token.read())->*[](cudaStream_t) { }; diff --git a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh index 459beddee22..ae8ad239d46 100644 --- a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh +++ b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh @@ -60,8 +60,8 @@ struct receiver_defaults } template - _CUDAX_TRIVIAL_API static auto - set_stopped(__ignore, _Rcvr& __rcvr) noexcept -> __async::completion_signatures<__async::set_stopped_t()> + _CUDAX_TRIVIAL_API static auto set_stopped(__ignore, _Rcvr& __rcvr) noexcept + -> __async::completion_signatures<__async::set_stopped_t()> { __async::set_stopped(static_cast<_Rcvr&&>(__rcvr)); return {}; @@ -198,15 +198,15 @@ _CUDAX_TRIVIAL_API auto __make_opstate(_Sndr __sndr, _Rcvr __rcvr) } template -_CUDAX_TRIVIAL_API auto -__get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept -> decltype(__data.get_attrs(__sndrs...)) +_CUDAX_TRIVIAL_API auto __get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept + -> decltype(__data.get_attrs(__sndrs...)) { return __data.get_attrs(__sndrs...); } template -_CUDAX_TRIVIAL_API auto -__get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept -> decltype(__async::get_env(__sndrs...)) +_CUDAX_TRIVIAL_API auto __get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept + -> decltype(__async::get_env(__sndrs...)) { return __async::get_env(__sndrs...); } diff --git a/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh b/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh index 25d5ef04d76..868c911b1da 100644 --- a/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh +++ b/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh @@ -76,48 +76,36 @@ template class _Vy, template class _ using __transform_sig_t = decltype(__transform_sig<_Sig, _Vy, _Ey, _Sy>()); template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... _More> extern _DIAGNOSTIC<_Sigs> __transform_completion_signatures_v; template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... _More> extern __fn_t<_ERROR<_What...>>* __transform_completion_signatures_v<_ERROR<_What...>, _Vy, _Ey, _Sy, _Variant, _More...>; template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... _More> extern __fn_t<_Variant<__transform_sig_t<_Sigs, _Vy, _Ey, _Sy>..., _More...>>* __transform_completion_signatures_v, _Vy, _Ey, _Sy, _Variant, _More...>; template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... _More> using __transform_completion_signatures = decltype(__transform_completion_signatures_v<_Sigs, _Vy, _Ey, _Sy, _Variant, _More...>()); @@ -129,12 +117,9 @@ template <> struct __gather_sigs_fn { template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... 
_More> using __call = __transform_completion_signatures< _Sigs, @@ -149,12 +134,9 @@ template <> struct __gather_sigs_fn { template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __call = __transform_completion_signatures< _Sigs, @@ -169,12 +151,9 @@ template <> struct __gather_sigs_fn { template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __call = __transform_completion_signatures< _Sigs, @@ -187,12 +166,9 @@ struct __gather_sigs_fn template - class _Then, - template - class _Else, - template - class _Variant, + template class _Then, + template class _Else, + template class _Variant, class... _More> using __gather_completion_signatures = typename __gather_sigs_fn<_WantedTag>::template __call<_Sigs, _Then, _Else, _Variant, _More...>; @@ -404,13 +380,12 @@ template auto completion(_Tag, _Args&&...) -> __csig::__sigs<_Tag(_Args...)>&; template -auto completions_of(_Sndr&&, - _Rcvr = {}) -> decltype(__csig::__to_sigs(__declval&>())); +auto completions_of(_Sndr&&, _Rcvr = {}) + -> decltype(__csig::__to_sigs(__declval&>())); template -auto eptr_completion_if() - -> _CUDA_VSTD:: - conditional_t<_PotentiallyThrowing, __csig::__sigs, __csig::__sigs<>>&; +auto eptr_completion_if() -> _CUDA_VSTD:: + conditional_t<_PotentiallyThrowing, __csig::__sigs, __csig::__sigs<>>&; } // namespace meta } // namespace cuda::experimental::__async diff --git a/cudax/include/cuda/experimental/__async/sender/continue_on.cuh b/cudax/include/cuda/experimental/__async/sender/continue_on.cuh index 9a0c142e21c..8da87a443a3 100644 --- a/cudax/include/cuda/experimental/__async/sender/continue_on.cuh +++ b/cudax/include/cuda/experimental/__async/sender/continue_on.cuh @@ -267,8 +267,8 @@ struct continue_on_t::__sndr_t }; template -_CUDAX_API auto -continue_on_t::operator()(_Sndr __sndr, _Sch __sch) const noexcept -> continue_on_t::__sndr_t<_Sndr, _Sch> +_CUDAX_API auto continue_on_t::operator()(_Sndr __sndr, _Sch __sch) const noexcept + -> continue_on_t::__sndr_t<_Sndr, _Sch> { return __sndr_t<_Sndr, _Sch>{{}, __sch, static_cast<_Sndr&&>(__sndr)}; } diff --git a/cudax/include/cuda/experimental/__async/sender/cpos.cuh b/cudax/include/cuda/experimental/__async/sender/cpos.cuh index 7f1fb383a71..dab62e7ac10 100644 --- a/cudax/include/cuda/experimental/__async/sender/cpos.cuh +++ b/cudax/include/cuda/experimental/__async/sender/cpos.cuh @@ -110,8 +110,8 @@ _CCCL_GLOBAL_CONSTANT struct set_error_t _CCCL_GLOBAL_CONSTANT struct set_stopped_t { template - _CUDAX_TRIVIAL_API auto - operator()(_Rcvr&& __rcvr) const noexcept -> decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped()) + _CUDAX_TRIVIAL_API auto operator()(_Rcvr&& __rcvr) const noexcept + -> decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped()) { static_assert(_CUDA_VSTD::is_same_v(__rcvr).set_stopped()), void>); static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_stopped())); @@ -119,8 +119,8 @@ _CCCL_GLOBAL_CONSTANT struct set_stopped_t } template - _CUDAX_TRIVIAL_API auto - operator()(_Rcvr* __rcvr) const noexcept -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_stopped()) + _CUDAX_TRIVIAL_API auto operator()(_Rcvr* __rcvr) const noexcept + -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_stopped()) { static_assert(_CUDA_VSTD::is_same_v(*__rcvr).set_stopped()), void>); static_assert(noexcept(static_cast<_Rcvr&&>(*__rcvr).set_stopped())); 
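The set_stopped_t rewrites just above keep the receiver expression inside a trailing decltype, which both removes the overload from consideration when the receiver has no set_stopped() member and deduces the exact return type from the call. A standalone sketch of that idiom, using hypothetical names (try_stop, my_receiver) that do not appear in this patch:

struct my_receiver
{
  void set_stopped() noexcept {}
};

// Well-formed only if r.set_stopped() compiles; the decltype also
// doubles as the deduced return type of the wrapper.
template <class Rcvr>
auto try_stop(Rcvr&& r) noexcept -> decltype(static_cast<Rcvr&&>(r).set_stopped())
{
  return static_cast<Rcvr&&>(r).set_stopped();
}

int main()
{
  my_receiver r;
  try_stop(r); // OK; try_stop(42) would simply fail overload resolution
}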
diff --git a/cudax/include/cuda/experimental/__async/sender/let_value.cuh b/cudax/include/cuda/experimental/__async/sender/let_value.cuh index 7d06e071fe0..6742a1c1d6c 100644 --- a/cudax/include/cuda/experimental/__async/sender/let_value.cuh +++ b/cudax/include/cuda/experimental/__async/sender/let_value.cuh @@ -243,8 +243,9 @@ private: _Sndr __sndr_; template - _CUDAX_API auto connect(_Rcvr __rcvr) && noexcept( - __nothrow_constructible<__opstate_t<_Rcvr, _Sndr, _Fn>, _Sndr, _Fn, _Rcvr>) -> __opstate_t<_Rcvr, _Sndr, _Fn> + _CUDAX_API auto + connect(_Rcvr __rcvr) && noexcept(__nothrow_constructible<__opstate_t<_Rcvr, _Sndr, _Fn>, _Sndr, _Fn, _Rcvr>) + -> __opstate_t<_Rcvr, _Sndr, _Fn> { return __opstate_t<_Rcvr, _Sndr, _Fn>( static_cast<_Sndr&&>(__sndr_), static_cast<_Fn&&>(__fn_), static_cast<_Rcvr&&>(__rcvr)); diff --git a/cudax/include/cuda/experimental/__async/sender/stop_token.cuh b/cudax/include/cuda/experimental/__async/sender/stop_token.cuh index 35e6d4d164a..693816dbb45 100644 --- a/cudax/include/cuda/experimental/__async/sender/stop_token.cuh +++ b/cudax/include/cuda/experimental/__async/sender/stop_token.cuh @@ -369,8 +369,8 @@ _CUDAX_API inline void inplace_stop_source::__unlock(uint8_t __old_state) const (void) __state_.store(__old_state, _CUDA_VSTD::memory_order_release); } -_CUDAX_API inline auto -inplace_stop_source::__try_lock_unless_stop_requested(bool __set_stop_requested) const noexcept -> bool +_CUDAX_API inline auto inplace_stop_source::__try_lock_unless_stop_requested(bool __set_stop_requested) const noexcept + -> bool { __stok::__spin_wait __spin; auto __old_state = __state_.load(_CUDA_VSTD::memory_order_relaxed); diff --git a/cudax/include/cuda/experimental/__async/sender/tuple.cuh b/cudax/include/cuda/experimental/__async/sender/tuple.cuh index 98a1d0997f1..0229ed8b9c7 100644 --- a/cudax/include/cuda/experimental/__async/sender/tuple.cuh +++ b/cudax/include/cuda/experimental/__async/sender/tuple.cuh @@ -65,8 +65,8 @@ struct __tupl<_CUDA_VSTD::index_sequence<_Idx...>, _Ts...> : __box<_Idx, _Ts>... template _CUDAX_TRIVIAL_API static auto __for_each(_Fn&& __fn, _Self&& __self, _Us&&... 
__us) // - noexcept((__nothrow_callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> - && ...)) -> _CUDA_VSTD::enable_if_t<(__callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)> + noexcept((__nothrow_callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)) + -> _CUDA_VSTD::enable_if_t<(__callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)> { return ( static_cast<_Fn&&>(__fn)(static_cast<_Us&&>(__us)..., static_cast<_Self&&>(__self).__box<_Idx, _Ts>::__value_), diff --git a/cudax/include/cuda/experimental/__detail/config.cuh b/cudax/include/cuda/experimental/__detail/config.cuh index 632e689f564..2ac25eb0a3b 100644 --- a/cudax/include/cuda/experimental/__detail/config.cuh +++ b/cudax/include/cuda/experimental/__detail/config.cuh @@ -35,15 +35,9 @@ // two attributes: // - `_CUDAX_API` declares the function host/device and hides the symbol from the ABI // - `_CUDAX_TRIVIAL_API` does the same while also forcing inlining and hiding the function from debuggers -#if _CCCL_COMPILER(ICC) // ICC has issues with visibility attributes on symbols with internal linkage -# define _CUDAX_API _CCCL_HOST_DEVICE -# define _CUDAX_HOST_API _CCCL_HOST -# define _CUDAX_DEVICE_API _CCCL_DEVICE -#else // ^^^ _CCCL_COMPILER(ICC) ^^^ / vvv !_CCCL_COMPILER(ICC) vvv -# define _CUDAX_API _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION -# define _CUDAX_HOST_API _CCCL_HOST _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION -# define _CUDAX_DEVICE_API _CCCL_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION -#endif // !_CCCL_COMPILER(ICC) +#define _CUDAX_API _CCCL_HOST_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION +#define _CUDAX_HOST_API _CCCL_HOST _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION +#define _CUDAX_DEVICE_API _CCCL_DEVICE _CCCL_VISIBILITY_HIDDEN _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION // _CUDAX_TRIVIAL_API force-inlines a function, marks its visibility as hidden, and causes debuggers to skip it. // This is useful for trivial internal functions that do dispatching or other plumbing work. 
It is particularly diff --git a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh index 8a42bab40ca..0e1dceff19b 100644 --- a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh @@ -80,8 +80,8 @@ struct __with_property template struct __iproperty : interface<__iproperty> { - _CUDAX_HOST_API friend auto - get_property([[maybe_unused]] const __iproperty& __obj, _Property) -> __property_result_t<_Property> + _CUDAX_HOST_API friend auto get_property([[maybe_unused]] const __iproperty& __obj, _Property) + -> __property_result_t<_Property> { if constexpr (!_CUDA_VSTD::is_same_v<__property_result_t<_Property>, void>) { @@ -268,8 +268,8 @@ template struct __with_try_get_property { template - _CUDAX_HOST_API _CCCL_NODISCARD_FRIEND auto - try_get_property(const _Derived& __self, _Property) noexcept -> __try_property_result_t<_Property> + _CUDAX_HOST_API _CCCL_NODISCARD_FRIEND auto try_get_property(const _Derived& __self, _Property) noexcept + -> __try_property_result_t<_Property> { auto __prop = __cudax::dynamic_any_cast*>(&__self); if constexpr (_CUDA_VSTD::is_same_v<__property_result_t<_Property>, void>) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh index aefaa699f21..8a1a8a41168 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index a435c85126e..c64d52437c4 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -31,6 +31,7 @@ #include // graph_task<> has-a backend_ctx_untyped #include #include +#include namespace cuda::experimental::stf { @@ -508,8 +509,12 @@ public: dot.template add_vertex(*this); } + constexpr bool fun_invocable_stream_deps = ::std::is_invocable_v; + constexpr bool fun_invocable_stream_non_void_deps = + reserved::is_invocable_with_filtered::value; + // Default for the first argument is a `cudaStream_t`. 
- if constexpr (::std::is_invocable_v) + if constexpr (fun_invocable_stream_deps || fun_invocable_stream_non_void_deps) { // // CAPTURE the lambda @@ -522,7 +527,16 @@ public: cuda_safe_call(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal)); // Launch the user provided function - ::std::apply(f, tuple_prepend(mv(capture_stream), typed_deps())); + if constexpr (fun_invocable_stream_deps) + { + ::std::apply(f, tuple_prepend(mv(capture_stream), typed_deps())); + } + else if constexpr (fun_invocable_stream_non_void_deps) + { + // Remove void arguments + ::std::apply(::std::forward(f), + tuple_prepend(mv(capture_stream), reserved::remove_void_interface_types(typed_deps()))); + } cuda_safe_call(cudaStreamEndCapture(capture_stream, &childGraph)); @@ -534,7 +548,12 @@ public: } else { - static_assert(::std::is_invocable_v, "Incorrect lambda function signature."); + constexpr bool fun_invocable_graph_deps = ::std::is_invocable_v; + constexpr bool fun_invocable_graph_non_void_deps = + reserved::is_invocable_with_filtered::value; + + static_assert(fun_invocable_graph_deps || fun_invocable_graph_non_void_deps, + "Incorrect lambda function signature."); // // Give the lambda a child graph // diff --git a/cudax/include/cuda/experimental/__stf/graph/interfaces/void_interface.cuh b/cudax/include/cuda/experimental/__stf/graph/interfaces/void_interface.cuh new file mode 100644 index 00000000000..12f36d06c16 --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/graph/interfaces/void_interface.cuh @@ -0,0 +1,114 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief This implements a void data interface over the graph_ctx backend + */ + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +namespace cuda::experimental::stf +{ + +template +struct graphed_interface_of; + +/** + * @brief Data interface to manipulate the void interface in the CUDA graph backend + */ +class void_graph_interface : public graph_data_interface +{ +public: + /// @brief Alias for the base class + using base = graph_data_interface; + /// @brief Alias for the shape type + using base::shape_t; + + void_graph_interface(void_interface s) + : base(mv(s)) + {} + void_graph_interface(shape_of s) + : base(mv(s)) + {} + + void data_allocate( + backend_ctx_untyped&, + block_allocator_untyped&, + const data_place&, + instance_id_t, + ::std::ptrdiff_t& s, + void**, + event_list&) override + { + s = 0; + } + + void data_deallocate( + backend_ctx_untyped&, block_allocator_untyped&, const data_place&, instance_id_t, void*, event_list&) final + {} + + cudaGraphNode_t graph_data_copy( + cudaMemcpyKind, + instance_id_t, + instance_id_t, + cudaGraph_t graph, + const cudaGraphNode_t* input_nodes, + size_t input_cnt) override + { + cudaGraphNode_t dummy; + cuda_safe_call(cudaGraphAddEmptyNode(&dummy, graph, input_nodes, input_cnt)); + return dummy; + } + + bool pin_host_memory(instance_id_t) override + { + // no-op + return false; + } + + void unpin_host_memory(instance_id_t) override {} + + /* This helps detecting when we are manipulating a void data interface, so + * that we can optimize useless stages such as allocations or copies */ + bool is_void_interface() const override final + { + return true; + } +}; + +/** + * @brief Define how the CUDA stream backend must manipulate this void interface + * + * Note that we specialize cuda::experimental::stf::shape_of to avoid ambiguous specialization + * + * @extends graphed_interface_of + */ +template <> +struct graphed_interface_of +{ + using type = void_graph_interface; +}; + +} // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 119c4e52d40..15d05f1f894 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -39,6 +39,7 @@ #include // backend_ctx uses shape_of #include // backend_ctx_untyped::impl has-a ctx_stack #include +#include #include // XXX there is currently a dependency on this header for places.h @@ -195,7 +196,22 @@ public: { delete w; }; - ::std::apply(::std::forward(w->first), mv(w->second)); + + constexpr bool fun_invocable_task_deps = reserved::is_tuple_invocable_v; + constexpr bool fun_invocable_task_non_void_deps = + reserved::is_tuple_invocable_with_filtered::value; + + static_assert(fun_invocable_task_deps || fun_invocable_task_non_void_deps, + "Incorrect lambda function signature in host_launch."); + + if constexpr (fun_invocable_task_deps) + { + ::std::apply(::std::forward(w->first), mv(w->second)); + } + else if constexpr (fun_invocable_task_non_void_deps) + { + ::std::apply(::std::forward(w->first), 
reserved::remove_void_interface_types(mv(w->second))); + } }; if constexpr (::std::is_same_v) @@ -1067,6 +1083,16 @@ public: return logical_data(make_slice(p, n), mv(dplace)); } + auto logical_token() + { + // We do not use a shape because we want the first rw() access to succeed + // without an initial write() + // + // Note that we do not disable write back as the write-back mechanism is + // handling void_interface specifically to ignore it anyway. + return logical_data(void_interface{}); + } + template <typename T> frozen_logical_data<T> freeze(cuda::experimental::stf::logical_data<T> d, access_mode m = access_mode::read, diff --git a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh index 1caf710496b..c2d91313451 100644 --- a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh @@ -296,6 +296,15 @@ public: return tp.find_data_instance_id(d); } + /** + * @brief Indicates whether this is a void data interface, which makes it possible to + * skip some operations such as allocating or moving data + */ + virtual bool is_void_interface() const + { + return false; + } + private: /** * @brief Get the common implementation of the data interface. diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index 2b50d480fff..61b01525093 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -451,21 +451,17 @@ public: } size_t p_rank = 0; - if constexpr (::std::is_same_v) + for (auto p : e_place) { - for (auto p : e_place) + if constexpr (::std::is_same_v) { reserved::launch_impl(interpreted_policy, p, f, args, t.get_stream(p_rank), p_rank); - p_rank++; } - } - else - { - for (auto p : e_place) + else { reserved::graph_launch_impl(t, interpreted_policy, p, f, args, p_rank); - p_rank++; } + p_rank++; } } diff --git a/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh b/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh index 5858a809f46..ee01f53a90b 100644 --- a/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh @@ -467,6 +467,12 @@ public: return dinterface != nullptr; } + bool is_void_interface() const + { + _CCCL_ASSERT(has_interface(), "uninitialized logical data"); + return dinterface->is_void_interface(); + } + bool has_ref() const { assert(refcnt.load() >= 0); @@ -1255,6 +1261,15 @@ public: return pimpl->dinterface != nullptr; } + /** + * @brief Returns true if the data is a void data interface + */ + bool is_void_interface() const + { + assert(pimpl); + return pimpl->is_void_interface(); + } + // This function applies the reduction operator over 2 instances, the one // identified by "in_instance_id" is not modified, the one identified as // "inout_instance_id" is where the result is put.
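The logical_token() helper added above wraps a logical_data over void_interface: a pure dependency object with no backing storage, whose first rw() access succeeds without a prior write() and whose write-back is ignored. A minimal usage sketch in the spirit of the void_data_interface.cu example earlier in this diff (the lambda bodies are placeholders; this assumes an STF context as set up in that example):

using namespace cuda::experimental::stf;

context ctx;
auto token = ctx.logical_token();
// Producer acquires the token in rw mode; no allocation or copy is performed.
ctx.task(token.rw())->*[](cudaStream_t) { /* enqueue producer work */ };
// The read() dependency orders the consumer after the producer.
ctx.task(token.read())->*[](cudaStream_t) { /* enqueue consumer work */ };
ctx.finalize();

Note that both lambdas take only the cudaStream_t: with the argument filtering introduced elsewhere in this diff, void_interface dependencies are elided from the lambda's parameter list.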
@@ -1727,7 +1742,7 @@ inline void reserved::logical_data_untyped_impl::erase() /* If there is a reference instance id, it needs to be updated with a * valid copy if that is not the case yet */ - if (enable_write_back) + if (enable_write_back && !is_void_interface()) { instance_id_t ref_id = reference_instance_id; assert(ref_id != instance_id_t::invalid); @@ -2032,7 +2047,7 @@ inline void fetch_data( { event_list stf_prereq = reserved::enforce_stf_deps_before(ctx, d, instance_id, t, mode, eplace); - if (d.has_interface()) + if (d.has_interface() && !d.is_void_interface()) { // Allocate data if needed (and possibly reclaim memory to do so) reserved::dep_allocate(ctx, d, mode, dplace, eplace, instance_id, stf_prereq); diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index 21e5ac7fb0f..06f1f7f689b 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -916,7 +916,7 @@ public: const size_t n = shape.size(); // Tuple , size_t , fun, shape> - using args_t = ::std::tuple; + using args_t = ::std::tuple; // Create a tuple with all instances (eg. tuple, slice>) deps_tup_t instances = ::std::apply( @@ -940,7 +940,7 @@ public: auto& data = ::std::get<0>(*p); const size_t n = ::std::get<1>(*p); - Fun&& f = mv(::std::get<2>(*p)); + Fun& f = ::std::get<2>(*p); const sub_shape_t& shape = ::std::get<3>(*p); // deps_ops_t are pairs of data instance type, and a reduction operator, diff --git a/cudax/include/cuda/experimental/__stf/internal/void_interface.cuh b/cudax/include/cuda/experimental/__stf/internal/void_interface.cuh index 5a557ad1a29..e238bff7641 100644 --- a/cudax/include/cuda/experimental/__stf/internal/void_interface.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/void_interface.cuh @@ -11,7 +11,7 @@ /** * @file * - * @brief This implements a void data interface useful to implement STF + * @brief This defines a void data interface useful to implement STF * dependencies without actual data (e.g. 
to enforce task dependencies) */ @@ -27,8 +27,6 @@ # pragma system_header #endif // no system header -#include -#include #include namespace cuda::experimental::stf { template <typename T> class shape_of; -template <typename T> -struct streamed_interface_of; - -template <typename T> -struct graphed_interface_of; - class void_interface {}; @@ -71,136 +63,123 @@ public: }; /** - * @brief Data interface to manipulate the void interface in the CUDA stream backend + * @brief A hash of the void_interface */ -class void_stream_interface : public stream_data_interface_simple<void_interface> +template <> +struct hash<void_interface> { -public: - using base = stream_data_interface_simple<void_interface>; - using base::shape_t; + ::std::size_t operator()(void_interface const&) const noexcept + { + return 42; + } +}; - void_stream_interface(void_interface m) - : base(::std::move(m)) - {} - void_stream_interface(typename base::shape_t s) - : base(s) - {} +namespace reserved { - /// Copy the content of an instance to another instance : this is a no-op - void stream_data_copy(const data_place&, instance_id_t, const data_place&, instance_id_t, cudaStream_t) override {} +template <typename... Ts> +struct remove_void_interface +{ + using type = ::std::tuple<>; +}; - /// Pretend we allocate an instance on a specific data place : we do not do any allocation here - void stream_data_allocate( - backend_ctx_untyped&, const data_place&, instance_id_t, ::std::ptrdiff_t& s, void**, cudaStream_t) override - { - // By filling a non negative number, we notify that the allocation was successful - s = 0; - } +template <typename T, typename... Ts> +struct remove_void_interface<T, Ts...> +{ +private: + using tail = typename remove_void_interface<Ts...>::type; - /// Pretend we deallocate an instance (no-op) - void stream_data_deallocate(backend_ctx_untyped&, const data_place&, instance_id_t, void*, cudaStream_t) override {} + // If T is void_interface, skip it, otherwise prepend it to tail + using filtered = + std::conditional_t<::std::is_same_v<T, void_interface>, + tail, + decltype(::std::tuple_cat(::std::declval<::std::tuple<T>>(), ::std::declval<tail>()))>; - bool pin_host_memory(instance_id_t) override - { - // no-op - return false; - } +public: + using type = filtered; +}; - void unpin_host_memory(instance_id_t) override {} +template <typename... Ts> +using remove_void_interface_t = typename remove_void_interface<Ts...>::type; + +template <typename T> +struct remove_void_interface_from_tuple +{ + // By default, if T is not a std::tuple, do nothing special + using type = T; }; -/** - * @brief Define how the CUDA stream backend must manipulate this void interface - * - * Note that we specialize cuda::experimental::stf::shape_of to avoid ambiguous specialization - * - * @extends streamed_interface_of - */ -template <> -struct streamed_interface_of<void_interface> +template <typename... Ts> +struct remove_void_interface_from_tuple<::std::tuple<Ts...>> { - using type = void_stream_interface; + using type = remove_void_interface_t<Ts...>; }; +template <typename T> +using remove_void_interface_from_tuple_t = typename remove_void_interface_from_tuple<T>::type; + /** - * @brief Data interface to manipulate the void interface in the CUDA graph backend + * @brief Check if a function can be invoked while eliding arguments with a void_interface type.
*/ -class void_graph_interface : public graph_data_interface +template +struct is_invocable_with_filtered { -public: - /// @brief Alias for the base class - using base = graph_data_interface; - /// @brief Alias for the shape type - using base::shape_t; - - void_graph_interface(void_interface s) - : base(mv(s)) - {} - void_graph_interface(shape_of s) - : base(mv(s)) - {} - - void data_allocate( - backend_ctx_untyped&, - block_allocator_untyped&, - const data_place&, - instance_id_t, - ::std::ptrdiff_t& s, - void**, - event_list&) override +private: + template + static auto test(int) -> ::std::bool_constant<::std::is_invocable_v> { - s = 0; + return {}; } - void data_deallocate( - backend_ctx_untyped&, block_allocator_untyped&, const data_place&, instance_id_t, void*, event_list&) final - {} - - cudaGraphNode_t graph_data_copy( - cudaMemcpyKind, - instance_id_t, - instance_id_t, - cudaGraph_t graph, - const cudaGraphNode_t* input_nodes, - size_t input_cnt) override + template + static auto test(...) -> ::std::false_type { - cudaGraphNode_t dummy; - cuda_safe_call(cudaGraphAddEmptyNode(&dummy, graph, input_nodes, input_cnt)); - return dummy; + return {}; } - bool pin_host_memory(instance_id_t) override + template <::std::size_t... Idx> + static auto check(::std::index_sequence) { - // no-op - return false; + using filtered = remove_void_interface_t; + return test...>(0); } - void unpin_host_memory(instance_id_t) override {} +public: + static constexpr bool value = + decltype(check(::std::make_index_sequence<::std::tuple_size_v>>{}))::value; }; /** - * @brief Define how the CUDA stream backend must manipulate this void interface - * - * Note that we specialize cuda::experimental::stf::shape_of to avoid ambiguous specialization - * - * @extends graphed_interface_of + * @brief Check if a function can be invoked using std::apply while eliding tuple arguments with a void_interface type. */ -template <> -struct graphed_interface_of -{ - using type = void_graph_interface; -}; +template +struct is_tuple_invocable_with_filtered : is_tuple_invocable> +{}; /** - * @brief A hash of the matrix + * @brief Strip tuple entries with a "void_interface" type */ -template <> -struct hash +template +auto remove_void_interface_types(const ::std::tuple& tpl) { - ::std::size_t operator()(void_interface const&) const noexcept - { - return 42; - } -}; + return ::std::apply( + [](auto&&... args) { + auto filter_one = [](auto&& arg) { + using T = ::std::decay_t; + if constexpr (::std::is_same_v) + { + return ::std::tuple<>{}; + } + else + { + return ::std::tuple(::std::forward(arg)); + } + }; + return ::std::tuple_cat(filter_one(::std::forward(args))...); + }, + tpl); +} + +} // end namespace reserved } // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/void_interface.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/void_interface.cuh new file mode 100644 index 00000000000..78ca63d5aab --- /dev/null +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/void_interface.cuh @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. 
diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/void_interface.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/void_interface.cuh
new file mode 100644
index 00000000000..78ca63d5aab
--- /dev/null
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/void_interface.cuh
@@ -0,0 +1,97 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file
+ *
+ * @brief This implements the void data interface in the stream_ctx backend
+ */
+
+#pragma once
+
+#include <cuda/__cccl_config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__stf/internal/void_interface.cuh>
+#include <cuda/experimental/__stf/stream/stream_data_interface.cuh>
+
+namespace cuda::experimental::stf
+{
+
+template <typename T>
+struct streamed_interface_of;
+
+/**
+ * @brief Data interface to manipulate the void interface in the CUDA stream backend
+ */
+class void_stream_interface : public stream_data_interface_simple<void_interface>
+{
+public:
+  using base = stream_data_interface_simple<void_interface>;
+  using base::shape_t;
+
+  void_stream_interface(void_interface m)
+      : base(::std::move(m))
+  {}
+  void_stream_interface(typename base::shape_t s)
+      : base(s)
+  {}
+
+  /// Copy the content of an instance to another instance : this is a no-op
+  void stream_data_copy(const data_place&, instance_id_t, const data_place&, instance_id_t, cudaStream_t) override {}
+
+  /// Pretend we allocate an instance on a specific data place : we do not do any allocation here
+  void stream_data_allocate(
+    backend_ctx_untyped&, const data_place&, instance_id_t, ::std::ptrdiff_t& s, void**, cudaStream_t) override
+  {
+    // By filling a non negative number, we notify that the allocation was successful
+    s = 0;
+  }
+
+  /// Pretend we deallocate an instance (no-op)
+  void stream_data_deallocate(backend_ctx_untyped&, const data_place&, instance_id_t, void*, cudaStream_t) override {}
+
+  bool pin_host_memory(instance_id_t) override
+  {
+    // no-op
+    return false;
+  }
+
+  void unpin_host_memory(instance_id_t) override {}
+
+  /* This helps detecting when we are manipulating a void data interface, so
+   * that we can optimize useless stages such as allocations or copies */
+  bool is_void_interface() const override final
+  {
+    return true;
+  }
+};
+
+/**
+ * @brief Define how the CUDA stream backend must manipulate this void interface
+ *
+ * Note that we specialize cuda::experimental::stf::shape_of to avoid ambiguous specialization
+ *
+ * @extends streamed_interface_of
+ */
+template <>
+struct streamed_interface_of<void_interface>
+{
+  using type = void_stream_interface;
+};
+
+} // end namespace cuda::experimental::stf
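Taken together with the stream_task.cuh change below, this no-op interface is what a user observes through the public API. A hedged usage sketch (the umbrella header name and the exact public API spelling are assumptions, not shown in this diff):

    #include <cuda/experimental/stf.cuh> // assumed umbrella header

    using namespace cuda::experimental::stf;

    int main()
    {
      stream_ctx ctx;

      // A logical data with a void_interface shape carries no storage; it
      // only expresses ordering between tasks.
      auto token = ctx.logical_data(shape_of<void_interface>());

      // With void_stream_interface, no allocation or copy is issued for
      // `token`, and (per the stream_task.cuh hunk below) the lambda may
      // omit the void_interface argument entirely.
      ctx.task(token.write())->*[](cudaStream_t) {
        // enforce task ordering only
      };

      ctx.finalize();
      return 0;
    }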
diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh
index 7416fb922ab..3a729bdb2c2 100644
--- a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh
@@ -34,6 +34,7 @@
 #include
 #include // for unit test!
 #include // For implicit logical_data_untyped constructors
+#include <cuda/experimental/__stf/stream/interfaces/void_interface.cuh>
 #include
 
 namespace cuda::experimental::stf
diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
index 540d3b0424d..348136778b7 100644
--- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
@@ -29,6 +29,7 @@
 #include
 #include
+#include <cuda/experimental/__stf/stream/interfaces/void_interface.cuh>
 #include
 #include
 
@@ -593,12 +594,31 @@ public:
       auto t = tuple_prepend(get_stream(), typed_deps());
       return ::std::apply(::std::forward<Fun>(fun), t);
     }
+    else if constexpr (reserved::is_invocable_with_filtered<Fun, cudaStream_t, Data...>::value)
+    {
+      // Use the filtered tuple
+      auto t = tuple_prepend(get_stream(), reserved::remove_void_interface_types(typed_deps()));
+      return ::std::apply(::std::forward<Fun>(fun), t);
+    }
     else
     {
+      constexpr bool fun_invocable_task_deps = ::std::is_invocable_v<Fun, stream_task&, Data...>;
+      constexpr bool fun_invocable_task_non_void_deps =
+        reserved::is_invocable_with_filtered<Fun, stream_task&, Data...>::value;
+
       // Invoke passing `*this` as the first argument, followed by the slices
-      static_assert(::std::is_invocable_v<Fun, stream_task&, Data...>, "Incorrect lambda function signature.");
-      auto t = tuple_prepend(*this, typed_deps());
-      return ::std::apply(::std::forward<Fun>(fun), t);
+      static_assert(fun_invocable_task_deps || fun_invocable_task_non_void_deps,
+                    "Incorrect lambda function signature.");
+
+      if constexpr (fun_invocable_task_deps)
+      {
+        return ::std::apply(::std::forward<Fun>(fun), tuple_prepend(*this, typed_deps()));
+      }
+      else if constexpr (fun_invocable_task_non_void_deps)
+      {
+        return ::std::apply(::std::forward<Fun>(fun),
+                            tuple_prepend(*this, reserved::remove_void_interface_types(typed_deps())));
+      }
     }
   }
diff --git a/cudax/include/cuda/experimental/__stf/utility/traits.cuh b/cudax/include/cuda/experimental/__stf/utility/traits.cuh
index 8308e56d702..a30596f7bde 100644
--- a/cudax/include/cuda/experimental/__stf/utility/traits.cuh
+++ b/cudax/include/cuda/experimental/__stf/utility/traits.cuh
@@ -562,6 +562,22 @@ auto shuffled_array_tuple(ArgTypes... args)
 namespace reserved
 {
 
+/**
+ * @brief Trait class to check if a function can be invoked with ::std::apply using a tuple type
+ */
+template <typename Fun, typename Tuple>
+struct is_tuple_invocable : ::std::false_type
+{};
+
+// Partial specialization that unpacks the tuple
+template <typename Fun, typename... Args>
+struct is_tuple_invocable<Fun, ::std::tuple<Args...>> : ::std::is_invocable<Fun, Args...>
+{};
+
+// Convenient alias template
+template <typename Fun, typename Tuple>
+inline constexpr bool is_tuple_invocable_v = is_tuple_invocable<Fun, Tuple>::value;
+
 /**
  * @brief A compile-time boolean that checks if a type supports streaming with std::ostream <<.
  *
diff --git a/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh b/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh
index 5b64dbc531d..bd481b3dea2 100644
--- a/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh
+++ b/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh
@@ -50,8 +50,8 @@ _CCCL_NODISCARD _CUDAX_TRIVIAL_HOST_API auto basic_any_from(_Interface<_Super>&
 }
 
 template
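For completeness, the is_tuple_invocable trait added in traits.cuh above is the building block the filtered-invocation checks compose with. A minimal illustration (not part of the patch; assumes the header is on the include path):

    #include <tuple>

    #include <cuda/experimental/__stf/utility/traits.cuh>

    namespace stf_reserved = cuda::experimental::stf::reserved;

    // A callable taking (int, double) is apply-invocable from a
    // ::std::tuple<int, double>, but not from a ::std::tuple<int>.
    auto fn = [](int, double) {};
    static_assert(stf_reserved::is_tuple_invocable_v<decltype(fn), ::std::tuple<int, double>>);
    static_assert(!stf_reserved::is_tuple_invocable_v<decltype(fn), ::std::tuple<int>>);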