diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json deleted file mode 100644 index 9d711be5f66..00000000000 --- a/.devcontainer/cuda11.1-gcc9/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-gcc9-cuda11.1", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.1-gcc9", - "CCCL_CUDA_VERSION": "11.1", - "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda11.1-gcc9", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda11.1-gcc9" -} diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json deleted file mode 100644 index e39eb910443..00000000000 --- a/.devcontainer/cuda11.1-llvm9/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm9-cuda11.1", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.1-llvm9", - "CCCL_CUDA_VERSION": "11.1", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda11.1-llvm9", - "CCCL_CUDA_EXTENDED": "false" - }, - 
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda11.1-llvm9" -} diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json deleted file mode 100644 index 87098679264..00000000000 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-gcc11-cuda11.8", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.8-gcc11", - "CCCL_CUDA_VERSION": "11.8", - "CCCL_HOST_COMPILER": "gcc", - "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda11.8-gcc11", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda11.8-gcc11" -} diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda12.0-gcc7/devcontainer.json similarity index 90% rename 
from .devcontainer/cuda11.1-gcc7/devcontainer.json rename to .devcontainer/cuda12.0-gcc7/devcontainer.json index e7d2a6572f8..96a32136eb1 100644 --- a/.devcontainer/cuda11.1-gcc7/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc7/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-gcc7-cuda11.1", + "image": "rapidsai/devcontainers:25.02-cpp-gcc7-cuda12.0", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.1-gcc7", - "CCCL_CUDA_VERSION": "11.1", + "DEVCONTAINER_NAME": "cuda12.0-gcc7", + "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "7", - "CCCL_BUILD_INFIX": "cuda11.1-gcc7", + "CCCL_BUILD_INFIX": "cuda12.0-gcc7", "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", @@ -50,5 +50,5 @@ } } }, - "name": "cuda11.1-gcc7" + "name": "cuda12.0-gcc7" } diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda12.0-gcc8/devcontainer.json similarity index 90% rename from .devcontainer/cuda11.1-gcc8/devcontainer.json rename to .devcontainer/cuda12.0-gcc8/devcontainer.json index f590606adef..9cfe4709e07 100644 --- a/.devcontainer/cuda11.1-gcc8/devcontainer.json +++ b/.devcontainer/cuda12.0-gcc8/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-gcc8-cuda11.1", + "image": "rapidsai/devcontainers:25.02-cpp-gcc8-cuda12.0", "hostRequirements": { "gpu": "optional" }, @@ -15,11 +15,11 @@ "SCCACHE_BUCKET": "rapids-sccache-devs", "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda11.1-gcc8", - "CCCL_CUDA_VERSION": "11.1", + "DEVCONTAINER_NAME": "cuda12.0-gcc8", + "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "8", - "CCCL_BUILD_INFIX": "cuda11.1-gcc8", + "CCCL_BUILD_INFIX": "cuda12.0-gcc8", "CCCL_CUDA_EXTENDED": "false" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", @@ -50,5 +50,5 @@ } } }, - "name": "cuda11.1-gcc8" + "name": "cuda12.0-gcc8" } diff --git a/.devcontainer/cuda12.0-llvm10/devcontainer.json b/.devcontainer/cuda12.0-llvm10/devcontainer.json deleted file mode 100644 index 6f75525f808..00000000000 --- a/.devcontainer/cuda12.0-llvm10/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm10-cuda12.0", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.0-llvm10", - "CCCL_CUDA_VERSION": "12.0", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "10", - 
"CCCL_BUILD_INFIX": "cuda12.0-llvm10", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.0-llvm10" -} diff --git a/.devcontainer/cuda12.0-llvm11/devcontainer.json b/.devcontainer/cuda12.0-llvm11/devcontainer.json deleted file mode 100644 index fd21f30fbd3..00000000000 --- a/.devcontainer/cuda12.0-llvm11/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm11-cuda12.0", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.0-llvm11", - "CCCL_CUDA_VERSION": "12.0", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.0-llvm11", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.0-llvm11" -} diff --git 
a/.devcontainer/cuda12.0-llvm12/devcontainer.json b/.devcontainer/cuda12.0-llvm12/devcontainer.json deleted file mode 100644 index b402063c837..00000000000 --- a/.devcontainer/cuda12.0-llvm12/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm12-cuda12.0", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.0-llvm12", - "CCCL_CUDA_VERSION": "12.0", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.0-llvm12", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.0-llvm12" -} diff --git a/.devcontainer/cuda12.0-llvm13/devcontainer.json b/.devcontainer/cuda12.0-llvm13/devcontainer.json deleted file mode 100644 index 40187a60e6c..00000000000 --- a/.devcontainer/cuda12.0-llvm13/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm13-cuda12.0", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.0-llvm13", - "CCCL_CUDA_VERSION": "12.0", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.0-llvm13", - "CCCL_CUDA_EXTENDED": "false" - }, - 
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.0-llvm13" -} diff --git a/.devcontainer/cuda12.0-llvm9/devcontainer.json b/.devcontainer/cuda12.0-llvm9/devcontainer.json deleted file mode 100644 index e72c6da2fdd..00000000000 --- a/.devcontainer/cuda12.0-llvm9/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm9-cuda12.0", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.0-llvm9", - "CCCL_CUDA_VERSION": "12.0", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.0-llvm9", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.0-llvm9" -} diff --git a/.devcontainer/cuda12.6-llvm10/devcontainer.json b/.devcontainer/cuda12.6-llvm10/devcontainer.json deleted file mode 100644 
index ef06f7cf9a2..00000000000 --- a/.devcontainer/cuda12.6-llvm10/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm10-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-llvm10", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "10", - "CCCL_BUILD_INFIX": "cuda12.6-llvm10", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-llvm10" -} diff --git a/.devcontainer/cuda12.6-llvm11/devcontainer.json b/.devcontainer/cuda12.6-llvm11/devcontainer.json deleted file mode 100644 index 38c13841ee6..00000000000 --- a/.devcontainer/cuda12.6-llvm11/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm11-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-llvm11", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "11", - "CCCL_BUILD_INFIX": "cuda12.6-llvm11", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": 
"source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-llvm11" -} diff --git a/.devcontainer/cuda12.6-llvm12/devcontainer.json b/.devcontainer/cuda12.6-llvm12/devcontainer.json deleted file mode 100644 index 8898d216573..00000000000 --- a/.devcontainer/cuda12.6-llvm12/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm12-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-llvm12", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "12", - "CCCL_BUILD_INFIX": "cuda12.6-llvm12", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-llvm12" -} diff --git a/.devcontainer/cuda12.6-llvm13/devcontainer.json b/.devcontainer/cuda12.6-llvm13/devcontainer.json deleted file mode 100644 index 8d713720c51..00000000000 --- 
a/.devcontainer/cuda12.6-llvm13/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm13-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-llvm13", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "13", - "CCCL_BUILD_INFIX": "cuda12.6-llvm13", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-llvm13" -} diff --git a/.devcontainer/cuda12.6-llvm9/devcontainer.json b/.devcontainer/cuda12.6-llvm9/devcontainer.json deleted file mode 100644 index be41e2506c5..00000000000 --- a/.devcontainer/cuda12.6-llvm9/devcontainer.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:25.02-cpp-llvm9-cuda12.6", - "hostRequirements": { - "gpu": "optional" - }, - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;", - "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;" - ], - "containerEnv": { - "SCCACHE_REGION": "us-east-2", - "SCCACHE_BUCKET": "rapids-sccache-devs", - "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs", - "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", - "DEVCONTAINER_NAME": "cuda12.6-llvm9", - "CCCL_CUDA_VERSION": "12.6", - "CCCL_HOST_COMPILER": "llvm", - "CCCL_HOST_COMPILER_VERSION": "9", - "CCCL_BUILD_INFIX": "cuda12.6-llvm9", - "CCCL_CUDA_EXTENDED": "false" - }, - "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", - "workspaceMount": 
"source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", - "mounts": [ - "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=cccl-build,target=/home/coder/cccl/build" - ], - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "xaver.clang-format", - "nvidia.nsight-vscode-edition", - "ms-vscode.cmake-tools" - ], - "settings": { - "editor.defaultFormatter": "xaver.clang-format", - "editor.formatOnSave": true, - "clang-format.executable": "/usr/bin/clang-format", - "clangd.arguments": [ - "--compile-commands-dir=${workspaceFolder}" - ], - "files.eol": "\n", - "files.trimTrailingWhitespace": true - } - } - }, - "name": "cuda12.6-llvm9" -} diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 725009e6256..74de63e7a94 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -37,6 +37,11 @@ body: - Thrust - CUB - libcu++ + - CUDA Experimental (cudax) + - cuda.cooperative (Python) + - cuda.parallel (Python) + - General CCCL + - Infrastructure - Not sure validations: required: true diff --git a/.github/actions/workflow-run-job-windows/action.yml b/.github/actions/workflow-run-job-windows/action.yml index 805beff3446..1b5289a5a7d 100644 --- a/.github/actions/workflow-run-job-windows/action.yml +++ b/.github/actions/workflow-run-job-windows/action.yml @@ -50,6 +50,7 @@ runs: docker run \ --mount type=bind,source="${{steps.paths.outputs.HOST_REPO}}",target="${{steps.paths.outputs.MOUNT_REPO}}" \ --workdir "${{steps.paths.outputs.MOUNT_REPO}}" \ + --isolation=process \ ${{ inputs.image }} \ powershell -c " [System.Environment]::SetEnvironmentVariable('AWS_ACCESS_KEY_ID','${{env.AWS_ACCESS_KEY_ID}}'); diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7dd411ba39b..d317e931e78 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.8 + rev: v19.1.6 hooks: - id: clang-format types_or: [file] @@ -39,7 +39,7 @@ repos: # TODO/REMINDER: add the Ruff vscode extension to the devcontainers # Ruff, the Python auto-correcting linter/formatter written in Rust - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.3 + rev: v0.8.6 hooks: - id: ruff # linter - id: ruff-format # formatter @@ -57,7 +57,7 @@ repos: - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.13.0' + rev: 'v1.14.1' hooks: - id: mypy additional_dependencies: [types-cachetools, numpy] diff --git a/CMakePresets.json b/CMakePresets.json index 089f9b81798..bd10a95200b 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -69,8 +69,6 @@ "CUB_ENABLE_TESTING": true, "CUB_ENABLE_EXAMPLES": true, "CUB_SEPARATE_CATCH2": true, - "CUB_ENABLE_DIALECT_CPP11": true, - "CUB_ENABLE_DIALECT_CPP14": true, "CUB_ENABLE_DIALECT_CPP17": true, "CUB_ENABLE_DIALECT_CPP20": true, "THRUST_ENABLE_MULTICONFIG": true, @@ -232,30 +230,10 @@ "CUB_ENABLE_TESTING": true, "CUB_ENABLE_EXAMPLES": true, "CUB_SEPARATE_CATCH2": true, - "CUB_ENABLE_DIALECT_CPP11": false, - "CUB_ENABLE_DIALECT_CPP14": false, "CUB_ENABLE_DIALECT_CPP17": false, 
"CUB_ENABLE_DIALECT_CPP20": false } }, - { - "name": "cub-cpp11", - "displayName": "CUB: C++11", - "inherits": "cub-base", - "cacheVariables": { - "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, - "CUB_ENABLE_DIALECT_CPP11": true - } - }, - { - "name": "cub-cpp14", - "displayName": "CUB: C++14", - "inherits": "cub-base", - "cacheVariables": { - "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, - "CUB_ENABLE_DIALECT_CPP14": true - } - }, { "name": "cub-cpp17", "displayName": "CUB: C++17", @@ -289,24 +267,6 @@ "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP20": false } }, - { - "name": "thrust-cpp11", - "displayName": "Thrust: C++11", - "inherits": "thrust-base", - "cacheVariables": { - "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11": true - } - }, - { - "name": "thrust-cpp14", - "displayName": "Thrust: C++14", - "inherits": "thrust-base", - "cacheVariables": { - "CCCL_IGNORE_DEPRECATED_CPP_DIALECT": true, - "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP14": true - } - }, { "name": "thrust-cpp17", "displayName": "Thrust: C++17", @@ -520,14 +480,6 @@ "libcudacxx-base" ] }, - { - "name": "cub-cpp11", - "configurePreset": "cub-cpp11" - }, - { - "name": "cub-cpp14", - "configurePreset": "cub-cpp14" - }, { "name": "cub-cpp17", "configurePreset": "cub-cpp17" @@ -536,14 +488,6 @@ "name": "cub-cpp20", "configurePreset": "cub-cpp20" }, - { - "name": "thrust-cpp11", - "configurePreset": "thrust-cpp11" - }, - { - "name": "thrust-cpp14", - "configurePreset": "thrust-cpp14" - }, { "name": "thrust-cpp17", "configurePreset": "thrust-cpp17" @@ -736,16 +680,6 @@ } } }, - { - "name": "cub-nolid-cpp11", - "configurePreset": "cub-cpp11", - "inherits": "cub-nolid-base" - }, - { - "name": "cub-nolid-cpp14", - "configurePreset": "cub-cpp14", - "inherits": "cub-nolid-base" - }, { "name": "cub-nolid-cpp17", "configurePreset": "cub-cpp17", @@ -756,16 +690,6 @@ "configurePreset": "cub-cpp20", "inherits": "cub-nolid-base" }, - { - "name": "cub-lid0-cpp11", - "configurePreset": "cub-cpp11", - "inherits": "cub-lid0-base" - }, - { - "name": "cub-lid0-cpp14", - "configurePreset": "cub-cpp14", - "inherits": "cub-lid0-base" - }, { "name": "cub-lid0-cpp17", "configurePreset": "cub-cpp17", @@ -776,16 +700,6 @@ "configurePreset": "cub-cpp20", "inherits": "cub-lid0-base" }, - { - "name": "cub-lid1-cpp11", - "configurePreset": "cub-cpp11", - "inherits": "cub-lid1-base" - }, - { - "name": "cub-lid1-cpp14", - "configurePreset": "cub-cpp14", - "inherits": "cub-lid1-base" - }, { "name": "cub-lid1-cpp17", "configurePreset": "cub-cpp17", @@ -796,16 +710,6 @@ "configurePreset": "cub-cpp20", "inherits": "cub-lid1-base" }, - { - "name": "cub-lid2-cpp11", - "configurePreset": "cub-cpp11", - "inherits": "cub-lid2-base" - }, - { - "name": "cub-lid2-cpp14", - "configurePreset": "cub-cpp14", - "inherits": "cub-lid2-base" - }, { "name": "cub-lid2-cpp17", "configurePreset": "cub-cpp17", @@ -816,16 +720,6 @@ "configurePreset": "cub-cpp20", "inherits": "cub-lid2-base" }, - { - "name": "cub-cpp11", - "configurePreset": "cub-cpp11", - "inherits": "cub-base" - }, - { - "name": "cub-cpp14", - "configurePreset": "cub-cpp14", - "inherits": "cub-base" - }, { "name": "cub-cpp17", "configurePreset": "cub-cpp17", @@ -866,16 +760,6 @@ } } }, - { - "name": "thrust-gpu-cpp11", - "configurePreset": "thrust-cpp11", - "inherits": "thrust-gpu-base" - }, - { - "name": "thrust-gpu-cpp14", - "configurePreset": "thrust-cpp14", - "inherits": "thrust-gpu-base" - }, { "name": "thrust-gpu-cpp17", "configurePreset": "thrust-cpp17", @@ -886,16 +770,6 @@ 
"configurePreset": "thrust-cpp20", "inherits": "thrust-gpu-base" }, - { - "name": "thrust-cpu-cpp11", - "configurePreset": "thrust-cpp11", - "inherits": "thrust-cpu-base" - }, - { - "name": "thrust-cpu-cpp14", - "configurePreset": "thrust-cpp14", - "inherits": "thrust-cpu-base" - }, { "name": "thrust-cpu-cpp17", "configurePreset": "thrust-cpp17", @@ -906,16 +780,6 @@ "configurePreset": "thrust-cpp20", "inherits": "thrust-cpu-base" }, - { - "name": "thrust-cpp11", - "configurePreset": "thrust-cpp11", - "inherits": "thrust-base" - }, - { - "name": "thrust-cpp14", - "configurePreset": "thrust-cpp14", - "inherits": "thrust-base" - }, { "name": "thrust-cpp17", "configurePreset": "thrust-cpp17", diff --git a/README.md b/README.md index 45766d0bfbc..358adadc87b 100644 --- a/README.md +++ b/README.md @@ -219,18 +219,16 @@ CCCL users are encouraged to capitalize on the latest enhancements and ["live at For a seamless experience, you can upgrade CCCL independently of the entire CUDA Toolkit. This is possible because CCCL maintains backward compatibility with the latest patch release of every minor CTK release from both the current and previous major version series. In some exceptional cases, the minimum supported minor version of the CUDA Toolkit release may need to be newer than the oldest release within its major version series. -For instance, CCCL requires a minimum supported version of 11.1 from the 11.x series due to an unavoidable compiler issue present in CTK 11.0. When a new major CTK is released, we drop support for the oldest supported major version. | CCCL Version | Supports CUDA Toolkit Version | |--------------|------------------------------------------------| | 2.x | 11.1 - 11.8, 12.x (only latest patch releases) | -| 3.x (Future) | 12.x, 13.x (only latest patch releases) | +| 3.x | 12.x, 13.x (only latest patch releases) | [Well-behaved code](#compatibility-guidelines) using the latest CCCL should compile and run successfully with any supported CTK version. Exceptions may occur for new features that depend on new CTK features, so those features would not work on older versions of the CTK. -For example, C++20 support was not added to `nvcc` until CUDA 12.0, so CCCL features that depend on C++20 would not work with CTK 11.x. Users can integrate a newer version of CCCL into an older CTK, but not the other way around. This means an older version of CCCL is not compatible with a newer CTK. @@ -271,8 +269,8 @@ But we will not invest significant time in triaging or fixing issues for older c In the spirit of "You only support what you test", see our [CI Overview](https://github.com/NVIDIA/cccl/blob/main/ci-overview.md) for more information on exactly what we test. ### C++ Dialects -- C++11 (Deprecated in Thrust/CUB, to be removed in next major version) -- C++14 (Deprecated in Thrust/CUB, to be removed in next major version) +- C++11 (only libcu++) +- C++14 (only libcu++) - C++17 - C++20 @@ -287,7 +285,7 @@ Note that some features may only support certain architectures/Compute Capabilit CCCL's testing strategy strikes a balance between testing as many configurations as possible and maintaining reasonable CI times. For CUDA Toolkit versions, testing is done against both the oldest and the newest supported versions. -For instance, if the latest version of the CUDA Toolkit is 12.3, tests are conducted against 11.1 and 12.3. +For instance, if the latest version of the CUDA Toolkit is 12.6, tests are conducted against 11.1 and 12.6. 
For each CUDA version, builds are completed against all supported host compilers with all supported C++ dialects. The testing strategy and matrix are constantly evolving. diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 4452f040e54..881f553f65d 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -5,17 +5,18 @@ workflows: # # Example: # override: - # - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'llvm16']} + # - {jobs: ['test'], project: 'thrust', std: 17, ctk: 'curr', cxx: ['gcc12', 'clang16']} # override: pull_request: - # Old CTK - - {jobs: ['build'], std: 'minmax', ctk: '11.1', cxx: ['gcc7', 'gcc9', 'clang9']} + # Old CTK/compiler + - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc7', 'gcc9', 'clang14', 'msvc2019']} # Current CTK build-only - - {jobs: ['build'], std: [11, 14], cxx: ['gcc7', 'clang9']} + - {jobs: ['build'], std: [11, 14], cxx: ['gcc7', 'clang14'], project: 'libcudacxx'} + - {jobs: ['build'], std: [17], cxx: ['gcc7', 'clang14']} - {jobs: ['build'], std: 'max', cxx: ['gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']} - - {jobs: ['build'], std: 'max', cxx: ['clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']} + - {jobs: ['build'], std: 'max', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], std: 'max', cxx: ['msvc2019']} - {jobs: ['build'], std: [17, 20], cxx: ['gcc', 'clang', 'msvc']} # Current CTK testing: @@ -41,10 +42,8 @@ workflows: # verify-codegen: - {jobs: ['verify_codegen'], project: 'libcudacxx'} # cudax has different CTK reqs: - - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 17, cxx: ['gcc9', 'clang9']} - {jobs: ['build'], project: 'cudax', ctk: ['12.0'], std: 20, cxx: ['msvc14.36']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['gcc10', 'gcc11', 'gcc12']} - - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['clang10', 'clang11', 'clang12', 'clang13']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], project: 'cudax', ctk: ['12.5'], std: [17, 20], cxx: ['nvhpc']} - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 20, cxx: ['msvc2022']} @@ -55,7 +54,6 @@ workflows: # Python and c/parallel jobs: - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6'} # cccl-infra: - - {jobs: ['infra'], project: 'cccl', ctk: '11.1', cxx: ['gcc7', 'clang9']} - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14']} - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} @@ -64,12 +62,12 @@ workflows: - {jobs: ['limited'], project: 'cub', std: 17} - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit'} - # Old CTK - - {jobs: ['build'], std: 'all', ctk: '11.1', cxx: ['gcc7', 'gcc8', 'gcc9', 'clang9']} - - {jobs: ['build'], std: 'all', ctk: '11.8', cxx: ['gcc11'], sm: '60;70;80;90'} + # Old CTK/compiler + - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc7', 'gcc8', 'gcc9', 'clang14', 'msvc2019']} + - {jobs: ['build'], std: 'all', ctk: '12.0', cxx: ['gcc11'], sm: '60;70;80;90'} # Current CTK build-only - {jobs: ['build'], std: 'all', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']} - - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16', 'clang17']} + - {jobs: ['build'], std: 'all', cxx: 
['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], std: 'all', cxx: ['msvc2019']} # Test current CTK - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']} @@ -83,7 +81,6 @@ workflows: - {jobs: ['build'], project: 'libcudacxx', std: 'all', cudacxx: 'clang', cxx: 'clang', sm: '90a'} # cudax - {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc9', 'gcc10', 'gcc11']} - - {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13']} - {jobs: ['build'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']} - {jobs: ['build'], project: 'cudax', ctk: [ '12.5'], std: 'all', cxx: ['nvhpc']} - {jobs: ['build'], project: 'cudax', ctk: ['12.0', ], std: 'all', cxx: ['msvc14.36']} @@ -97,11 +94,11 @@ workflows: # # These are waiting on the NVKS nodes: # - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc7', std: [11]} -# - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang9', std: [17]} +# - {jobs: ['test'], ctk: '11.1', gpu: 't4', sm: 'gpu', cxx: 'clang14', std: [17]} # - {jobs: ['test'], ctk: '11.8', gpu: 'rtx2080', sm: 'gpu', cxx: 'gcc11', std: [17]} # - {jobs: ['test'], ctk: 'curr', gpu: 'rtxa6000', sm: 'gpu', cxx: 'gcc7', std: [14]} # - {jobs: ['test'], ctk: 'curr', gpu: 'l4', sm: 'gpu', cxx: 'gcc13', std: 'all'} -# - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang9', std: [11]} +# - {jobs: ['test'], ctk: 'curr', gpu: 'rtx4090', sm: 'gpu', cxx: 'clang14', std: [11]} # # H100 runners are currently flakey, only build since those use CPU-only runners: # - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'gcc12', std: [11, 20]} # - {jobs: ['build'], ctk: 'curr', gpu: 'h100', sm: 'gpu', cxx: 'clang18', std: [17]} @@ -116,9 +113,6 @@ workflows: exclude: # GPU runners are not available on Windows. - {jobs: ['test', 'test_gpu', 'test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'], cxx: ['msvc2019', 'msvc14.36', 'msvc2022']} - # Ubuntu 18.04 is EOL and we only use it to get access to CTK 11.1 containers for CUDA testing. - # Disable non-CUDA tests on this platform. 
- - {jobs: ['test_cpu'], ctk: '11.1'} ############################################################################################# @@ -131,8 +125,6 @@ devcontainer_version: '25.02' all_stds: [11, 14, 17, 20] ctk_versions: - 11.1: { stds: [11, 14, 17, ] } - 11.8: { stds: [11, 14, 17, ] } 12.0: { stds: [11, 14, 17, 20] } 12.5: { stds: [11, 14, 17, 20] } 12.6: { stds: [11, 14, 17, 20], aka: 'curr' } @@ -163,11 +155,6 @@ host_compilers: container_tag: 'llvm' exe: 'clang++' versions: - 9: { stds: [11, 14, 17, ] } - 10: { stds: [11, 14, 17, ] } - 11: { stds: [11, 14, 17, 20] } - 12: { stds: [11, 14, 17, 20] } - 13: { stds: [11, 14, 17, 20] } 14: { stds: [11, 14, 17, 20] } 15: { stds: [11, 14, 17, 20] } 16: { stds: [11, 14, 17, 20] } @@ -253,11 +240,11 @@ projects: stds: [11, 14, 17, 20] cub: name: 'CUB' - stds: [11, 14, 17, 20] + stds: [17, 20] job_map: { test: ['test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'] } thrust: name: 'Thrust' - stds: [11, 14, 17, 20] + stds: [17, 20] job_map: { test: ['test_cpu', 'test_gpu'] } cudax: stds: [17, 20] diff --git a/cub/cmake/CubBuildTargetList.cmake b/cub/cmake/CubBuildTargetList.cmake index 27dedd68210..7c6f59e8856 100644 --- a/cub/cmake/CubBuildTargetList.cmake +++ b/cub/cmake/CubBuildTargetList.cmake @@ -150,16 +150,6 @@ function(cub_build_target_list) cmake_minimum_required(VERSION 3.18.3) endif() - # Supported versions of MSVC do not distinguish between C++11 and C++14. - # Warn the user that they may be generating a ton of redundant targets. - if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND - CUB_ENABLE_DIALECT_CPP11) - message(WARNING - "Supported versions of MSVC (2017+) do not distinguish between C++11 " - "and C++14. The requested C++11 targets will be built with C++14." - ) - endif() - # Generic config flags: macro(add_flag_option flag docstring default) set(opt "CCCL_${flag}") diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index 21a487828ca..e454dc837b1 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -629,7 +629,7 @@ struct AgentHistogram // Set valid flags MarkValid( - is_valid, valid_samples, Int2Type{}); + is_valid, valid_samples, Int2Type < AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED > {}); // Accumulate samples if (prefer_smem) diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index 2e0d94b219c..d5e3514f369 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -382,8 +382,8 @@ struct AgentReduce even_share.template BlockInit(block_offset, block_end); return (IsAligned(d_in + block_offset, Int2Type())) - ? ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) - : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); + ? ConsumeRange(even_share, Int2Type()) + : ConsumeRange(even_share, Int2Type()); } /** @@ -396,8 +396,8 @@ struct AgentReduce even_share.template BlockInit(); return (IsAligned(d_in, Int2Type())) - ? ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) - : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); + ? 
ConsumeRange(even_share, Int2Type()) + : ConsumeRange(even_share, Int2Type()); } private: diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh index 3a07944d4e2..eec24057163 100644 --- a/cub/cub/agent/agent_three_way_partition.cuh +++ b/cub/cub/agent/agent_three_way_partition.cuh @@ -175,7 +175,8 @@ template + typename OffsetT, + typename StreamingContextT> struct AgentThreeWayPartition { //--------------------------------------------------------------------- @@ -251,6 +252,9 @@ struct AgentThreeWayPartition SelectSecondPartOp select_second_part_op; OffsetT num_items; ///< Total number of input items + // Note: This is a const reference because we have seen double-digit percentage perf regressions otherwise + const StreamingContextT& streaming_context; ///< Context for the current partition + //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- @@ -264,7 +268,8 @@ struct AgentThreeWayPartition UnselectedOutputIteratorT d_unselected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, - OffsetT num_items) + OffsetT num_items, + const StreamingContextT& streaming_context) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_first_part_out(d_first_part_out) @@ -273,6 +278,7 @@ struct AgentThreeWayPartition , select_first_part_op(select_first_part_op) , select_second_part_op(select_second_part_op) , num_items(num_items) + , streaming_context(streaming_context) {} //--------------------------------------------------------------------- @@ -350,6 +356,11 @@ struct AgentThreeWayPartition CTA_SYNC(); // Gather items from shared memory and scatter to global + auto first_base = + d_first_part_out + (streaming_context.num_previously_selected_first() + num_first_selections_prefix); + auto second_base = + d_second_part_out + (streaming_context.num_previously_selected_second() + num_second_selections_prefix); + auto unselected_base = d_unselected_out + (streaming_context.num_previously_rejected() + num_rejected_prefix); for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; @@ -360,16 +371,16 @@ struct AgentThreeWayPartition if (item_idx < first_item_end) { - d_first_part_out[num_first_selections_prefix + item_idx] = item; + first_base[item_idx] = item; } else if (item_idx < second_item_end) { - d_second_part_out[num_second_selections_prefix + item_idx - first_item_end] = item; + second_base[item_idx - first_item_end] = item; } else { - int rejection_idx = item_idx - second_item_end; - d_unselected_out[num_rejected_prefix + rejection_idx] = item; + int rejection_idx = item_idx - second_item_end; + unselected_base[rejection_idx] = item; } } } @@ -400,11 +411,12 @@ struct AgentThreeWayPartition // Load items if (IS_LAST_TILE) { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + BlockLoadT(temp_storage.load_items) + .Load(d_in + streaming_context.input_offset() + tile_offset, items, num_tile_items); } else { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + BlockLoadT(temp_storage.load_items).Load(d_in + streaming_context.input_offset() + tile_offset, items); } // Initialize selection_flags @@ -464,11 +476,12 @@ struct AgentThreeWayPartition // Load items if (IS_LAST_TILE) { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + BlockLoadT(temp_storage.load_items) + 
.Load(d_in + streaming_context.input_offset() + tile_offset, items, num_tile_items); } else { - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + BlockLoadT(temp_storage.load_items).Load(d_in + streaming_context.input_offset() + tile_offset, items); } // Initialize selection_flags @@ -551,7 +564,7 @@ struct AgentThreeWayPartition { // Blocks are launched in increasing order, so just assign one tile per block // Current tile index - const int tile_idx = static_cast((blockIdx.x * gridDim.y) + blockIdx.y); + const int tile_idx = blockIdx.x; // Global offset for the current tile const OffsetT tile_offset = tile_idx * TILE_ITEMS; @@ -572,9 +585,9 @@ struct AgentThreeWayPartition if (threadIdx.x == 0) { - // Output the total number of items selection_flags - d_num_selected_out[0] = AccumPackHelperT::first(accum); - d_num_selected_out[1] = AccumPackHelperT::second(accum); + // Update the number of selected items with this partition's selections + streaming_context.update_num_selected( + d_num_selected_out, AccumPackHelperT::first(accum), AccumPackHelperT::second(accum), num_items); } } } diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 490abb86bda..92605b5168d 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -606,8 +606,7 @@ private: { volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; - } - aliasable; + } aliasable; }; #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/config.cuh b/cub/cub/config.cuh index 16e7edd4905..d05078cdd2d 100644 --- a/cub/cub/config.cuh +++ b/cub/cub/config.cuh @@ -44,7 +44,6 @@ #endif // no system header #include // IWYU pragma: export -#include // IWYU pragma: export #include // IWYU pragma: export #include // IWYU pragma: export #include // IWYU pragma: export diff --git a/cub/cub/detail/detect_cuda_runtime.cuh b/cub/cub/detail/detect_cuda_runtime.cuh index d83b2c1179a..7666f9b2d23 100644 --- a/cub/cub/detail/detect_cuda_runtime.cuh +++ b/cub/cub/detail/detect_cuda_runtime.cuh @@ -73,40 +73,15 @@ */ # define CUB_RUNTIME_FUNCTION -/** - * \def CUB_RUNTIME_ENABLED - * - * Whether or not the active compiler pass is allowed to invoke device kernels - * or methods from the CUDA runtime API. - * - * This macro should not be used in CUB, as it depends on `__CUDA_ARCH__` - * and is not compatible with `NV_IF_TARGET`. It is provided for legacy - * purposes only. - * - * Replace any usages with `CUB_RDC_ENABLED` and `NV_IF_TARGET`. - */ -# define CUB_RUNTIME_ENABLED - #else // Non-doxygen pass: # ifndef CUB_RUNTIME_FUNCTION - # if defined(__CUDACC_RDC__) && !defined(CUB_DISABLE_CDP) - # define CUB_RDC_ENABLED # define CUB_RUNTIME_FUNCTION _CCCL_HOST_DEVICE - # else // RDC disabled: - # define CUB_RUNTIME_FUNCTION _CCCL_HOST - # endif // RDC enabled - -# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) -// Legacy only -- do not use in new code. 
-# define CUB_RUNTIME_ENABLED -# endif - # endif // CUB_RUNTIME_FUNCTION predefined # ifdef CUB_RDC_ENABLED diff --git a/cub/cub/detail/strong_load.cuh b/cub/cub/detail/strong_load.cuh index 61693d808e2..b6ba4bb5fc8 100644 --- a/cub/cub/detail/strong_load.cuh +++ b/cub/cub/detail/strong_load.cuh @@ -59,14 +59,14 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint4 load_relaxed(uint4 const* ptr) uint4 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), + "=r"(retval.y), + "=r"(retval.z), + "=r"(retval.w) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), + "=r"(retval.y), + "=r"(retval.z), + "=r"(retval.w) : "l"(ptr) : "memory");)); return retval; } @@ -75,14 +75,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 load_relaxed(ulonglong2 const* ulonglong2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");)); return retval; } @@ -91,14 +85,14 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ushort4 load_relaxed(ushort4 const* ptr) ushort4 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];" - : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];" - : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), + "=h"(retval.y), + "=h"(retval.z), + "=h"(retval.w) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), + "=h"(retval.y), + "=h"(retval.z), + "=h"(retval.w) : "l"(ptr) : "memory");)); return retval; } @@ -107,46 +101,26 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_relaxed(uint2 const* ptr) uint2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");)); + (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned long long load_relaxed(unsigned long long const* ptr) { unsigned long long retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u64 %0, [%1];" - : "=l"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u64 %0, [%1];" - : "=l"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u64 %0, [%1];" : 
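// Migration sketch for the detect_cuda_runtime.cuh change above (assumption, not part of the diff):
// the removed legacy CUB_RUNTIME_ENABLED macro depended on __CUDA_ARCH__, whereas CUB_RDC_ENABLED is
// derived only from __CUDACC_RDC__ / CUB_DISABLE_CDP and therefore works with NV_IF_TARGET-style code.
// MY_PROJECT_CAN_LAUNCH_FROM_DEVICE is a hypothetical downstream macro used only for illustration.
#ifdef CUB_RDC_ENABLED
#  define MY_PROJECT_CAN_LAUNCH_FROM_DEVICE 1
#else
#  define MY_PROJECT_CAN_LAUNCH_FROM_DEVICE 0
#endif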
"=l"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u64 %0, [%1];" : "=l"(retval) : "l"(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_relaxed(unsigned int const* ptr) { unsigned int retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");)); return retval; } @@ -154,16 +128,9 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_relaxed(unsigned int con static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned short load_relaxed(unsigned short const* ptr) { unsigned short retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.relaxed.gpu.u16 %0, [%1];" - : "=h"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u16 %0, [%1];" - : "=h"(retval) - : "l"(ptr) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.relaxed.gpu.u16 %0, [%1];" : "=h"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u16 %0, [%1];" : "=h"(retval) : "l"(ptr) : "memory");)); return retval; } @@ -172,24 +139,16 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned char load_relaxed(unsigned char c unsigned short retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile( - "{" - " .reg .u8 datum;" - " ld.relaxed.gpu.u8 datum, [%1];" - " cvt.u16.u8 %0, datum;" - "}" - : "=h"(retval) - : "l"(ptr) - : "memory");), - (asm volatile( - "{" - " .reg .u8 datum;" - " ld.cg.u8 datum, [%1];" - " cvt.u16.u8 %0, datum;" - "}" - : "=h"(retval) - : "l"(ptr) - : "memory");)); + (asm volatile("{" + " .reg .u8 datum;" + " ld.relaxed.gpu.u8 datum, [%1];" + " cvt.u16.u8 %0, datum;" + "}" : "=h"(retval) : "l"(ptr) : "memory");), + (asm volatile("{" + " .reg .u8 datum;" + " ld.cg.u8 datum, [%1];" + " cvt.u16.u8 %0, datum;" + "}" : "=h"(retval) : "l"(ptr) : "memory");)); return (unsigned char) retval; } @@ -198,14 +157,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 load_acquire(ulonglong2 const* ulonglong2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" - : "=l"(retval.x), "=l"(retval.y) - : "l"(ptr) - : "memory"); + (asm volatile("ld.acquire.gpu.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr) : "memory"); __threadfence();)); return retval; } @@ -215,14 +168,8 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_acquire(uint2 const* ptr) uint2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" - : "=r"(retval.x), "=r"(retval.y) - : "l"(ptr) - : "memory"); + (asm volatile("ld.acquire.gpu.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr) : "memory"); __threadfence();)); return retval; } @@ -230,17 +177,9 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_acquire(uint2 const* ptr) static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int 
load_acquire(unsigned int const* ptr) { unsigned int retval; - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("ld.acquire.gpu.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory");), - (asm volatile("ld.cg.u32 %0, [%1];" - : "=r"(retval) - : "l"(ptr) - : "memory"); - __threadfence();)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("ld.acquire.gpu.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory");), + (asm volatile("ld.cg.u32 %0, [%1];" : "=r"(retval) : "l"(ptr) : "memory"); __threadfence();)); return retval; } diff --git a/cub/cub/detail/strong_store.cuh b/cub/cub/detail/strong_store.cuh index 9b8091738db..cc0e8f60e71 100644 --- a/cub/cub/detail/strong_store.cuh +++ b/cub/cub/detail/strong_store.cuh @@ -56,98 +56,61 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(uint4* ptr, uint4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");), - (asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");)); + (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "r"(val.x), + "r"(val.y), + "r"(val.z), + "r"(val.w) : "memory");), + (asm volatile( + "st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(ulonglong2* ptr, ulonglong2 val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");), - (asm volatile("st.cg.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");), + (asm volatile("st.cg.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(ushort4* ptr, ushort4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");), - (asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");)); + (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "h"(val.x), + "h"(val.y), + "h"(val.z), + "h"(val.w) : "memory");), + (asm volatile( + "st.cg.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(uint2* ptr, uint2 val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");), - (asm volatile("st.cg.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");), + (asm volatile("st.cg.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned long long* ptr, unsigned long long val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");), - (asm 
volatile("st.cg.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");), + (asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned int* ptr, unsigned int val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");), - (asm volatile("st.cg.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");), + (asm volatile("st.cg.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned short* ptr, unsigned short val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.relaxed.gpu.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");), - (asm volatile("st.cg.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.relaxed.gpu.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");), + (asm volatile("st.cg.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned char* ptr, unsigned char val) @@ -158,123 +121,77 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned char* ptr, uns " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.relaxed.gpu.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");), + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");), (asm volatile("{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.cg.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");)); + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(uint4* ptr, uint4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) - : "memory");)); + (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "r"(val.x), + "r"(val.y), + "r"(val.z), + "r"(val.w) : "memory");), + (__threadfence(); asm volatile( + "st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(ulonglong2* ptr, ulonglong2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v2.u64 [%0], {%1, %2};" - : - : "l"(ptr), "l"(val.x), "l"(val.y) - : "memory");)); + (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u64 [%0], {%1, %2};" : : "l"(ptr), "l"(val.x), "l"(val.y) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(ushort4* ptr, ushort4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" - : - : 
"l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) - : "memory");)); + (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), + "h"(val.x), + "h"(val.y), + "h"(val.z), + "h"(val.w) : "memory");), + (__threadfence(); asm volatile( + "st.cg.v4.u16 [%0], {%1, %2, %3, %4};" : : "l"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(uint2* ptr, uint2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");), - (__threadfence(); - asm volatile("st.cg.v2.u32 [%0], {%1, %2};" - : - : "l"(ptr), "r"(val.x), "r"(val.y) - : "memory");)); + (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u32 [%0], {%1, %2};" : : "l"(ptr), "r"(val.x), "r"(val.y) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned long long* ptr, unsigned long long val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u64 [%0], %1;" - : - : "l"(ptr), "l"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned int* ptr, unsigned int val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u32 [%0], %1;" - : - : "l"(ptr), "r"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u32 [%0], %1;" : : "l"(ptr), "r"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned short* ptr, unsigned short val) { - NV_IF_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("st.release.gpu.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");), - (__threadfence(); - asm volatile("st.cg.u16 [%0], %1;" - : - : "l"(ptr), "h"(val) - : "memory");)); + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");), + (__threadfence(); asm volatile("st.cg.u16 [%0], %1;" : : "l"(ptr), "h"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned char val) @@ -285,19 +202,15 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned c " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.release.gpu.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");), + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");), (__threadfence(); asm volatile( "{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.cg.u8 [%0], datum;" - "}" - : - : "l"(ptr), "h"((unsigned short) val) - : "memory");)); + "}" : : "l"(ptr), + "h"((unsigned short) val) : "memory");)); } } // namespace detail diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index a63ff9111e1..1af5f01f033 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -266,24 +266,6 @@ public: d_temp_storage, temp_storage_bytes, d_input, 
d_output, num_items, difference_op, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - NumItemsT num_items, - DifferenceOpT difference_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SubtractLeftCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. //! @@ -397,23 +379,6 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - RandomAccessIteratorT d_input, - NumItemsT num_items, - DifferenceOpT difference_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SubtractLeft(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. //! @@ -544,24 +509,6 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRightCopy( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - NumItemsT num_items, - DifferenceOpT difference_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SubtractRightCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. //! 
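// Migration sketch for the deprecated overloads being removed throughout these headers (assumption:
// caller-side code; d_in/d_out/num_items are hypothetical device buffers). The trailing
// debug_synchronous flag was already ignored apart from a usage log, so callers simply drop it and
// keep the stream-taking overload; the usual two-phase temp-storage query is unchanged.
#include <cub/device/device_adjacent_difference.cuh>
#include <cuda_runtime.h>

template <typename T>
cudaError_t subtract_left_copy_example(const T* d_in, T* d_out, int num_items, cudaStream_t stream)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  // First call only queries the required temporary storage size.
  cub::DeviceAdjacentDifference::SubtractLeftCopy(
    d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Difference{}, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Second call does the work; note there is no trailing debug_synchronous argument any more.
  cudaError_t error = cub::DeviceAdjacentDifference::SubtractLeftCopy(
    d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Difference{}, stream);
  cudaFree(d_temp_storage);
  return error;
}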
@@ -663,23 +610,6 @@ public: return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRight( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - RandomAccessIteratorT d_input, - NumItemsT num_items, - DifferenceOpT difference_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SubtractRight(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index cd3b922028a..b8a92334047 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -205,35 +205,6 @@ struct DeviceHistogram stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram, - int num_levels, - LevelT lower_level, - LevelT upper_level, - OffsetT num_samples, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return HistogramEven( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - lower_level, - upper_level, - num_samples, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes an intensity histogram from a sequence of data samples using equal-width bins. //! @@ -385,39 +356,6 @@ struct DeviceHistogram stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram, - int num_levels, - LevelT lower_level, - LevelT upper_level, - OffsetT num_row_samples, - OffsetT num_rows, - size_t row_stride_bytes, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return HistogramEven( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - lower_level, - upper_level, - num_row_samples, - num_rows, - row_stride_bytes, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using //! equal-width bins. @@ -587,40 +525,6 @@ struct DeviceHistogram stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - const int num_levels[NUM_ACTIVE_CHANNELS], - const LevelT lower_level[NUM_ACTIVE_CHANNELS], - const LevelT upper_level[NUM_ACTIVE_CHANNELS], - OffsetT num_pixels, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return MultiHistogramEven( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - lower_level, - upper_level, - num_pixels, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! 
Computes per-channel intensity histograms from a sequence of //! multi-channel "pixel" data samples using equal-width bins. @@ -835,44 +739,6 @@ struct DeviceHistogram is_byte_sample); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - const int num_levels[NUM_ACTIVE_CHANNELS], - const LevelT lower_level[NUM_ACTIVE_CHANNELS], - const LevelT upper_level[NUM_ACTIVE_CHANNELS], - OffsetT num_row_pixels, - OffsetT num_rows, - size_t row_stride_bytes, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return MultiHistogramEven( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - lower_level, - upper_level, - num_row_pixels, - num_rows, - row_stride_bytes, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @} end member group //! @name Custom bin ranges //! @{ @@ -998,26 +864,6 @@ struct DeviceHistogram stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram, - int num_levels, - const LevelT* d_levels, - OffsetT num_samples, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return HistogramRange( - d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. //! @@ -1156,37 +1002,6 @@ struct DeviceHistogram stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram, - int num_levels, - const LevelT* d_levels, - OffsetT num_row_samples, - OffsetT num_rows, - size_t row_stride_bytes, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return HistogramRange( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - d_levels, - num_row_samples, - num_rows, - row_stride_bytes, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples //! using the specified bin boundary levels. 
@@ -1345,31 +1160,6 @@ struct DeviceHistogram stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - const int num_levels[NUM_ACTIVE_CHANNELS], - const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], - OffsetT num_pixels, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return MultiHistogramRange( - d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_pixels, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using //! the specified bin boundary levels. @@ -1573,42 +1363,6 @@ struct DeviceHistogram is_byte_sample); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], - const int num_levels[NUM_ACTIVE_CHANNELS], - const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], - OffsetT num_row_pixels, - OffsetT num_rows, - size_t row_stride_bytes, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return MultiHistogramRange( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - d_levels, - num_row_pixels, - num_rows, - row_stride_bytes, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //@} end member group }; diff --git a/cub/cub/device/device_merge_sort.cuh b/cub/cub/device/device_merge_sort.cuh index d42f6033a7e..9fd714746d4 100644 --- a/cub/cub/device/device_merge_sort.cuh +++ b/cub/cub/device/device_merge_sort.cuh @@ -245,25 +245,6 @@ public: return SortPairsNoNVTX(d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - ValueIteratorT d_items, - OffsetT num_items, - CompareOpT compare_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairs( - d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - /** * @brief Sorts items using a merge sorting method. 
* @@ -410,40 +391,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsCopy( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - ValueInputIteratorT d_input_items, - KeyIteratorT d_output_keys, - ValueIteratorT d_output_items, - OffsetT num_items, - CompareOpT compare_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairsCopy( - d_temp_storage, - temp_storage_bytes, - d_input_keys, - d_input_items, - d_output_keys, - d_output_items, - num_items, - compare_op, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: // Internal version without NVTX range template @@ -569,24 +516,6 @@ public: return SortKeysNoNVTX(d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - OffsetT num_items, - CompareOpT compare_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeys( - d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: // Internal version without NVTX range template @@ -728,25 +657,6 @@ public: d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - KeyIteratorT d_output_keys, - OffsetT num_items, - CompareOpT compare_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeysCopy( - d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - /** * @brief Sorts items using a merge sorting method. * @@ -856,25 +766,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - ValueIteratorT d_items, - OffsetT num_items, - CompareOpT compare_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return StableSortPairs( - d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - /** * @brief Sorts items using a merge sorting method. 
* @@ -975,24 +866,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyIteratorT d_keys, - OffsetT num_items, - CompareOpT compare_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return StableSortKeys( - d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - /** * @brief Sorts items using a merge sorting method. * diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index c68f6cf4d61..768d8413e6f 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -222,30 +222,6 @@ struct DevicePartition stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagIterator d_flags, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Flagged( - d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Uses the ``select_op`` functor to split the corresponding items from ``d_in`` into //! a partitioned sequence ``d_out``. The total number of items copied into the first partition is written @@ -404,30 +380,6 @@ struct DevicePartition stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - If(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - NumItemsT num_items, - SelectOp select_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return If( - d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: template + typename SelectSecondPartOp, + typename NumItemsT> CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t IfNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, @@ -454,12 +407,13 @@ private: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + NumItemsT num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) { - using OffsetT = int; + using ChooseOffsetT = detail::choose_signed_offset; + using OffsetT = typename ChooseOffsetT::type; using DispatchThreeWayPartitionIfT = DispatchThreeWayPartitionIf< InputIteratorT, FirstOutputIteratorT, @@ -470,6 +424,14 @@ private: SelectSecondPartOp, OffsetT>; + // Signed integer type for global offsets + // Check if the number of items exceeds the range covered by the selected signed offset type + cudaError_t error = ChooseOffsetT::is_exceeding_offset_type(num_items); + if (error) + { + return error; + } + return DispatchThreeWayPartitionIfT::Dispatch( 
d_temp_storage, temp_storage_bytes, @@ -625,6 +587,9 @@ public: //! @tparam SelectSecondPartOp //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` //! + //! @tparam NumItemsT + //! **[inferred]** Type of num_items + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -670,7 +635,8 @@ public: typename UnselectedOutputIteratorT, typename NumSelectedIteratorT, typename SelectFirstPartOp, - typename SelectSecondPartOp> + typename SelectSecondPartOp, + typename NumItemsT> CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, std::size_t& temp_storage_bytes, @@ -679,7 +645,7 @@ public: SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, - int num_items, + NumItemsT num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) @@ -698,51 +664,6 @@ public: select_second_part_op, stream); } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - If(void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_in, - FirstOutputIteratorT d_first_part_out, - SecondOutputIteratorT d_second_part_out, - UnselectedOutputIteratorT d_unselected_out, - NumSelectedIteratorT d_num_selected_out, - int num_items, - SelectFirstPartOp select_first_part_op, - SelectSecondPartOp select_second_part_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return If( - d_temp_storage, - temp_storage_bytes, - d_in, - d_first_part_out, - d_second_part_out, - d_unselected_out, - d_num_selected_out, - num_items, - select_first_part_op, - select_second_part_op, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_radix_sort.cuh b/cub/cub/device/device_radix_sort.cuh index 32156b75e34..25798297682 100644 --- a/cub/cub/device/device_radix_sort.cuh +++ b/cub/cub/device/device_radix_sort.cuh @@ -362,37 +362,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - NumItemsT num_items, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairs( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - begin_bit, - end_bit, - stream); - } -#endif - //! @rst //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage. //! 
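// Usage sketch for the widened num_items in DevicePartition::If above (assumption: caller-side code;
// the predicate and buffers are hypothetical). NumItemsT is now an inferred template parameter, so a
// 64-bit item count is accepted and validated against the chosen signed offset type before dispatch.
struct less_than_t
{
  int threshold;
  __host__ __device__ bool operator()(int x) const
  {
    return x < threshold;
  }
};

// d_num_selected must point to two counters: items in the first and in the second partition.
cudaError_t three_way_partition_example(
  const int* d_in, int* d_small, int* d_mid, int* d_rest, int* d_num_selected, ::cuda::std::int64_t num_items)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DevicePartition::If(
    d_temp_storage, temp_storage_bytes, d_in, d_small, d_mid, d_rest, d_num_selected,
    num_items, less_than_t{10}, less_than_t{100});
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cudaError_t error = cub::DevicePartition::If(
    d_temp_storage, temp_storage_bytes, d_in, d_small, d_mid, d_rest, d_num_selected,
    num_items, less_than_t{10}, less_than_t{100});
  cudaFree(d_temp_storage);
  return error;
}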
@@ -817,26 +786,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - NumItemsT num_items, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairs( - d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream); - } -#endif - //! @rst //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage. //! @@ -1251,37 +1200,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - NumItemsT num_items, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairsDescending( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - begin_bit, - end_bit, - stream); - } -#endif - //! @rst //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage. //! @@ -1705,26 +1623,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - NumItemsT num_items, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairsDescending( - d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream); - } -#endif - //! @rst //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage. //! @@ -2411,26 +2309,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - NumItemsT num_items, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeys( - d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream); - } -#endif - //! @brief Sorts keys into ascending order. (`~N` auxiliary storage required). //! //! 
@par @@ -2551,24 +2429,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - NumItemsT num_items, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, begin_bit, end_bit, stream); - } -#endif - //! @rst //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage. //! @@ -2944,26 +2804,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - NumItemsT num_items, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeysDescending( - d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream); - } -#endif - //! @rst //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage. //! @@ -3344,25 +3184,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - NumItemsT num_items, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeysDescending( - d_temp_storage, temp_storage_bytes, d_keys, num_items, begin_bit, end_bit, stream); - } -#endif - //! @rst //! Sorts keys into descending order using :math:`\approx N` auxiliary storage. //! diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 0841662261d..a5c3de4a313 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -227,26 +227,6 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), reduction_op, init, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - ReductionOpT reduction_op, - T init, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Reduce( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide sum using the addition (``+``) operator. //! 
@@ -352,23 +332,6 @@ struct DeviceReduce stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t - Sum(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide minimum using the less-than (``<``) operator. //! @@ -478,23 +441,6 @@ struct DeviceReduce stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t - Min(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Finds the first device-wide minimum using the less-than (``<``) operator and also returns the index of that item. //! @@ -754,23 +700,6 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ::cuda::std::int64_t num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide maximum using the greater-than (``>``) operator. //! @@ -878,23 +807,6 @@ struct DeviceReduce stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t - Max(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Finds the first device-wide maximum using the greater-than (``>``) operator and also returns the index of that //! item. @@ -1158,23 +1070,6 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ::cuda::std::int64_t num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Fuses transform and reduce operations //! 
@@ -1498,47 +1393,6 @@ struct DeviceReduce static_cast(num_items), stream); } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t ReduceByKey( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - UniqueOutputIteratorT d_unique_out, - ValuesInputIteratorT d_values_in, - AggregatesOutputIteratorT d_aggregates_out, - NumRunsOutputIteratorT d_num_runs_out, - ReductionOpT reduction_op, - int num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ReduceByKey( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - reduction_op, - num_items, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 73512b2296f..751cdd46424 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -230,29 +230,6 @@ struct DeviceRunLengthEncode stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Encode( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - UniqueOutputIteratorT d_unique_out, - LengthsOutputIteratorT d_counts_out, - NumRunsOutputIteratorT d_num_runs_out, - int num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Encode( - d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Enumerates the starting offsets and lengths of all non-trivial runs //! (of ``length > 1``) of same-valued keys in the sequence ``d_in``. @@ -384,30 +361,6 @@ struct DeviceRunLengthEncode num_items, stream); } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - NonTrivialRuns( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OffsetsOutputIteratorT d_offsets_out, - LengthsOutputIteratorT d_lengths_out, - NumRunsOutputIteratorT d_num_runs_out, - int num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return NonTrivialRuns( - d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 1c06c83af66..0c1638bd955 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -207,24 +207,6 @@ struct DeviceScan stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ExclusiveSum( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! 
Computes a device-wide exclusive prefix sum in-place. //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``. @@ -301,22 +283,6 @@ struct DeviceScan return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( - void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide exclusive prefix scan using the specified //! binary ``scan_op`` functor. The ``init_value`` value is applied as @@ -449,26 +415,6 @@ struct DeviceScan stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitValueT init_value, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ExclusiveScan( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide exclusive prefix scan using the specified //! binary ``scan_op`` functor. The ``init_value`` value is applied as @@ -578,25 +524,6 @@ struct DeviceScan return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( - void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - ScanOpT scan_op, - InitValueT init_value, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ExclusiveScan( - d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide exclusive prefix scan using the specified //! binary ``scan_op`` functor. The ``init_value`` value is provided as a future value. @@ -738,31 +665,6 @@ struct DeviceScan stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - FutureValue init_value, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ExclusiveScan( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide exclusive prefix scan using the specified binary ``scan_op`` functor. //! The ``init_value`` value is provided as a future value. 
@@ -879,29 +781,6 @@ struct DeviceScan return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( - void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - ScanOpT scan_op, - FutureValue init_value, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ExclusiveScan( - d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @} end member group //! @name Inclusive scans //! @{ @@ -1002,24 +881,6 @@ struct DeviceScan d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return InclusiveSum( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide inclusive prefix sum in-place. //! @@ -1095,22 +956,6 @@ struct DeviceScan return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( - void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. //! @@ -1332,25 +1177,6 @@ struct DeviceScan stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return InclusiveScan( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. //! 
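// Usage sketch for the FutureValue-based ExclusiveScan overloads referenced above (assumption:
// caller-side code with hypothetical buffers). The initial value is read from device memory when the
// scan runs rather than being passed by value from the host, so it can be produced by earlier work
// enqueued on the same stream.
cudaError_t exclusive_scan_future_init_example(
  const int* d_in, int* d_out, int* d_init, int num_items, cudaStream_t stream)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceScan::ExclusiveScan(
    d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{},
    cub::FutureValue<int>(d_init), num_items, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cudaError_t error = cub::DeviceScan::ExclusiveScan(
    d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{},
    cub::FutureValue<int>(d_init), num_items, stream);
  cudaFree(d_temp_storage);
  return error;
}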
@@ -1450,23 +1276,6 @@ struct DeviceScan return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( - void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - ScanOpT scan_op, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, scan_op, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide exclusive prefix sum-by-key with key equality //! defined by ``equality_op``. The value of ``0`` is applied as the initial @@ -1607,30 +1416,6 @@ struct DeviceScan stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template , - typename NumItemsT = std::uint32_t> - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - NumItemsT num_items, - EqualityOpT equality_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ExclusiveSumByKey( - d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide exclusive prefix scan-by-key using the //! specified binary ``scan_op`` functor. The key equality is defined by @@ -1813,48 +1598,6 @@ struct DeviceScan stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template , - typename NumItemsT = std::uint32_t> - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - InitValueT init_value, - NumItemsT num_items, - EqualityOpT equality_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ExclusiveScanByKey( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - scan_op, - init_value, - num_items, - equality_op, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``. //! @@ -1989,30 +1732,6 @@ struct DeviceScan stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template , - typename NumItemsT = std::uint32_t> - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - NumItemsT num_items, - EqualityOpT equality_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return InclusiveSumByKey( - d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide inclusive prefix scan-by-key using the //! specified binary ``scan_op`` functor. 
The key equality is defined by ``equality_op``. @@ -2179,32 +1898,6 @@ struct DeviceScan stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template , - typename NumItemsT = std::uint32_t> - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanOpT scan_op, - NumItemsT num_items, - EqualityOpT equality_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return InclusiveScanByKey( - d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, num_items, equality_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @} end member group }; diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index 6bde88ed9da..ae47119bfa3 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -264,43 +264,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairs( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of key-value pairs into ascending order. (``~N`` auxiliary storage required) //! @@ -475,39 +438,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairs( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of key-value pairs into descending order. (``~2N`` auxiliary storage required). //! 
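For DeviceSegmentedRadixSort the surviving overloads still take the begin_bit/end_bit range and the per-segment offset iterators; only the trailing flag is gone. A sketch of the pointer-based SortPairs, assuming CSR-style offsets where segment s spans [d_offsets[s], d_offsets[s + 1]); all names are illustrative:

// Pointer-based variant (the "~2N auxiliary storage" form: separate input and output buffers).
#include <cub/device/device_segmented_radix_sort.cuh>
#include <cuda_runtime.h>

cudaError_t sort_segments(
  const unsigned int* d_keys_in, unsigned int* d_keys_out,
  const int* d_values_in, int* d_values_out,
  int num_items, int num_segments, const int* d_offsets, cudaStream_t stream)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedRadixSort::SortPairs(
    d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out,
    num_items, num_segments, d_offsets, d_offsets + 1,
    0, int(sizeof(unsigned int) * 8), stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cudaError_t error = cub::DeviceSegmentedRadixSort::SortPairs(
    d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out,
    num_items, num_segments, d_offsets, d_offsets + 1,
    0, int(sizeof(unsigned int) * 8), stream);
  cudaFree(d_temp_storage);
  return error;
}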
@@ -682,43 +612,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairsDescending( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of key-value pairs into descending order. (``~N`` auxiliary storage required). //! @@ -897,39 +790,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairsDescending( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @} end member group //! @name Keys-only //! @{ @@ -1091,39 +951,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeys( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of keys into ascending order. (``~N`` auxiliary storage required). //! @@ -1290,37 +1117,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeys( - d_temp_storage, - temp_storage_bytes, - d_keys, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of keys into descending order. (``~2N`` auxiliary storage required). //! 
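The DoubleBuffer overloads referenced above are the "~N auxiliary storage" variants: the sort ping-pongs between the two caller-provided buffers, so the result must afterwards be read through d_keys.Current(), which may be either buffer. A keys-only sketch under assumed names:

// DoubleBuffer sketch; names are illustrative, not taken from the diff.
#include <cub/device/device_segmented_radix_sort.cuh>
#include <cub/util_type.cuh> // cub::DoubleBuffer
#include <cuda_runtime.h>

cudaError_t sort_keys_double_buffer(
  unsigned int* d_key_buf, // holds the unsorted input
  unsigned int* d_key_alt, // alternate buffer of the same size
  int num_items, int num_segments, const int* d_offsets,
  cudaStream_t stream, unsigned int** d_sorted_out)
{
  cub::DoubleBuffer<unsigned int> d_keys(d_key_buf, d_key_alt);
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedRadixSort::SortKeys(
    d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments,
    d_offsets, d_offsets + 1, 0, int(sizeof(unsigned int) * 8), stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cudaError_t error = cub::DeviceSegmentedRadixSort::SortKeys(
    d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments,
    d_offsets, d_offsets + 1, 0, int(sizeof(unsigned int) * 8), stream);
  cudaFree(d_temp_storage);
  *d_sorted_out = d_keys.Current(); // the sorted keys live in whichever buffer is current
  return error;
}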
@@ -1478,39 +1274,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( - void* d_temp_storage, - size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeysDescending( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of keys into descending order. (``~N`` auxiliary storage required). //! @@ -1675,37 +1438,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeysDescending( - d_temp_storage, - temp_storage_bytes, - d_keys, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @} end member group }; diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 591930ad01c..5eac51ee742 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -271,42 +271,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - ReductionOpT reduction_op, - T initial_value, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Reduce( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - reduction_op, - initial_value, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide segmented sum using the addition (``+``) operator. //! @@ -425,26 +389,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t - Sum(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Sum( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide segmented minimum using the less-than (``<``) operator. //! 
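DeviceSegmentedReduce likewise loses only the trailing flag. A sketch of the surviving Sum overload, reusing one offsets array for both the begin and end iterators; buffer names are assumed:

// One output element per segment; d_offsets holds num_segments + 1 CSR-style offsets.
#include <cub/device/device_segmented_reduce.cuh>
#include <cuda_runtime.h>

cudaError_t segment_sums(
  const float* d_in,
  float* d_out, // one result per segment
  int num_segments,
  const int* d_offsets,
  cudaStream_t stream)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedReduce::Sum(
    d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_offsets, d_offsets + 1, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cudaError_t error = cub::DeviceSegmentedReduce::Sum(
    d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_offsets, d_offsets + 1, stream);
  cudaFree(d_temp_storage);
  return error;
}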
@@ -571,26 +515,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t - Min(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Min( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Finds the first device-wide minimum in each segment using the //! less-than (``<``) operator, also returning the in-segment index of that item. @@ -741,26 +665,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ArgMin( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Computes a device-wide segmented maximum using the greater-than (``>``) operator. //! @@ -876,26 +780,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t - Max(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Max( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Finds the first device-wide maximum in each segment using the //! 
greater-than (``>``) operator, also returning the in-segment index of that item @@ -1048,26 +932,6 @@ public: initial_value, stream); } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return ArgMax( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 26b55f9988e..1fb5656b82f 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -305,35 +305,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeys( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: // Internal version without NVTX range template @@ -502,35 +473,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeysDescending( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: // Internal version without NVTX range template @@ -701,26 +643,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeys( - d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: // Internal version without NVTX range template @@ -892,26 +814,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef 
_CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortKeysDescending( - d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of keys into ascending order. Approximately //! ``num_items + 2 * num_segments`` auxiliary storage required. @@ -1048,35 +950,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return StableSortKeys( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of keys into descending order. //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required. @@ -1213,35 +1086,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return StableSortKeysDescending( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of keys into ascending order. //! Approximately ``2 * num_segments`` auxiliary storage required. @@ -1380,26 +1224,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return StableSortKeys( - d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of keys into descending order. //! Approximately ``2 * num_segments`` auxiliary storage required. 
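DeviceSegmentedSort uses the same offset convention but, unlike the radix-sort interface above, takes no bit range. A keys-only sketch of the surviving SortKeys overload, with assumed names:

// Sketch only; argument order matches the forwarded calls in the removed overloads.
#include <cstddef>
#include <cub/device/device_segmented_sort.cuh>
#include <cuda_runtime.h>

cudaError_t sort_segment_keys(
  const float* d_keys_in, float* d_keys_out,
  int num_items, int num_segments, const int* d_offsets, cudaStream_t stream)
{
  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedSort::SortKeys(
    d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments,
    d_offsets, d_offsets + 1, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cudaError_t error = cub::DeviceSegmentedSort::SortKeys(
    d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments,
    d_offsets, d_offsets + 1, stream);
  cudaFree(d_temp_storage);
  return error;
}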
@@ -1537,26 +1361,6 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return StableSortKeysDescending( - d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: // Internal version without NVTX range template @@ -1756,39 +1560,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairs( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: // Internal version without NVTX range template @@ -1984,39 +1755,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairsDescending( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: // Internal version without NVTX range template @@ -2212,35 +1950,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairs( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: // Internal version without NVTX range template @@ -2435,35 +2144,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static 
cudaError_t SortPairsDescending( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return SortPairsDescending( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of key-value pairs into ascending order. //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. @@ -2622,39 +2302,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return StableSortPairs( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of key-value pairs into descending order. //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. @@ -2813,39 +2460,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - const KeyT* d_keys_in, - KeyT* d_keys_out, - const ValueT* d_values_in, - ValueT* d_values_out, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return StableSortPairsDescending( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of key-value pairs into ascending order. //! Approximately ``2 * num_segments`` auxiliary storage required. @@ -3010,35 +2624,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return StableSortPairs( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Sorts segments of key-value pairs into descending order. //! Approximately ``2 * num_segments`` auxiliary storage required. 
@@ -3202,35 +2787,6 @@ public: stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return StableSortPairsDescending( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @} end member group }; diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index 72e47cbebfe..7d5099ca7e1 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -203,26 +203,6 @@ struct DeviceSelect stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagIterator d_flags, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Flagged( - d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Uses the ``d_flags`` sequence to selectively compact the items in `d_data``. //! The total number of items selected is written to ``d_num_selected_out``. @@ -339,25 +319,6 @@ struct DeviceSelect stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( - void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - FlagIterator d_flags, - NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Flagged( - d_temp_storage, temp_storage_bytes, d_data, d_flags, d_num_selected_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Uses the ``select_op`` functor to selectively copy items from ``d_in`` into ``d_out``. //! The total number of items selected is written to ``d_num_selected_out``. @@ -497,26 +458,6 @@ struct DeviceSelect stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - If(void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, - SelectOp select_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return If( - d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Uses the ``select_op`` functor to selectively compact items in ``d_data``. //! The total number of items selected is written to ``d_num_selected_out``. 
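DeviceSelect reports the selection count through a single-element device output. A sketch of the surviving Flagged overload; the buffer names and flag type are assumptions, only the argument order comes from the removed forwarding call:

// d_num_selected_out points to a single int on the device that receives the count.
#include <cub/device/device_select.cuh>
#include <cuda_runtime.h>

cudaError_t compact_flagged(
  const int* d_in,
  const char* d_flags, // nonzero marks an item as selected
  int* d_out,
  int* d_num_selected_out,
  int num_items,
  cudaStream_t stream)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceSelect::Flagged(
    d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cudaError_t error = cub::DeviceSelect::Flagged(
    d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream);
  cudaFree(d_temp_storage);
  return error;
}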
@@ -647,25 +588,6 @@ struct DeviceSelect stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t - If(void* d_temp_storage, - size_t& temp_storage_bytes, - IteratorT d_data, - NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, - SelectOp select_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return If( - d_temp_storage, temp_storage_bytes, d_data, d_num_selected_out, num_items, select_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Uses the ``select_op`` functor applied to ``d_flags`` to selectively copy the //! corresponding items from ``d_in`` into ``d_out``. @@ -1010,25 +932,6 @@ struct DeviceSelect stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - ::cuda::std::int64_t num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Unique( - d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! @rst //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive //! equal-valued keys, only the first key and its value from each run is selectively copied @@ -1328,45 +1231,6 @@ struct DeviceSelect ::cuda::std::equal_to<>{}, stream); } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, - NumItemsT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return UniqueByKey( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_keys_out, - d_values_out, - d_num_selected_out, - num_items, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index 114454a791a..5a751181842 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -207,39 +207,6 @@ struct DeviceSpmv return DispatchSpmv::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t CsrMV( - void* d_temp_storage, - size_t& temp_storage_bytes, - const ValueT* d_values, - const int* d_row_offsets, - const int* d_column_indices, - const ValueT* d_vector_x, - ValueT* d_vector_y, - int num_rows, - int num_cols, - int num_nonzeros, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return CsrMV( - d_temp_storage, - temp_storage_bytes, - d_values, - d_row_offsets, - d_column_indices, - d_vector_x, - d_vector_y, - num_rows, - num_cols, - num_nonzeros, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - //! 
@} end member group }; diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 953f09ca005..7c19fce3f52 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -46,7 +46,9 @@ struct DeviceTransform //! //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The //! iterators' value types must be trivially relocatable. - //! @param output An iterator to the output sequence where num_items results are written to. + //! @param output An iterator to the output sequence where num_items results are written to. May point to the + //! beginning of one of the input sequences, performing the transformation in-place. The output sequence must not + //! overlap with any of the input sequences in any other way. //! @param num_items The number of elements in each input sequence. //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value //! types must be convertible to the parameters of the function object's call operator. The return type of the call @@ -110,7 +112,9 @@ struct DeviceTransform //! //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type //! must be trivially relocatable. - //! @param output An iterator to the output sequence where num_items results are written to. + //! @param output An iterator to the output sequence where num_items results are written to. May point to the + //! beginning of one of the input sequences, performing the transformation in-place. The output sequence must not + //! overlap with any of the input sequences in any other way. //! @param num_items The number of elements in each input sequence. //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value //! types must be convertible to the parameters of the function object's call operator. The return type of the call @@ -180,7 +184,9 @@ struct DeviceTransform //! //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The //! iterators' value types must be trivially relocatable. - //! @param output An iterator to the output sequence where num_items results are written to. + //! @param output An iterator to the output sequence where num_items results are written to. May point to the + //! beginning of one of the input sequences, performing the transformation in-place. The output sequence must not + //! overlap with any of the input sequences in any other way. //! @param num_items The number of elements in each input sequence. //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value //! types must be convertible to the parameters of the function object's call operator. The return type of the call @@ -242,7 +248,9 @@ struct DeviceTransform //! //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type //! must be trivially relocatable. - //! @param output An iterator to the output sequence where num_items results are written to. + //! @param output An iterator to the output sequence where num_items results are written to. May point to the + //! beginning of one of the input sequences, performing the transformation in-place. The output sequence must not + //! overlap with any of the input sequences in any other way. //!
@param num_items The number of elements in each input sequence. //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value //! types must be convertible to the parameters of the function object's call operator. The return type of the call diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 6908b7d0638..a8c733ef309 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -137,29 +137,6 @@ struct DispatchAdjacentDifference , stream(stream) {} -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CCCL_DEPRECATED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchAdjacentDifference( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - OffsetT num_items, - DifferenceOpT difference_op, - cudaStream_t stream, - bool debug_synchronous) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_input(d_input) - , d_output(d_output) - , num_items(num_items) - , difference_op(difference_op) - , stream(stream) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - } -#endif // _CCCL_DOXYGEN_INVOKED - /// Invocation template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() @@ -319,24 +296,6 @@ struct DispatchAdjacentDifference return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION static cudaError_t Dispatch( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_input, - OutputIteratorT d_output, - OffsetT num_items, - DifferenceOpT difference_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index b3b52fc8391..900f758cdfb 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -1031,39 +1031,6 @@ public: return error; } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], - int num_output_levels[NUM_ACTIVE_CHANNELS], - const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], - OffsetT num_row_pixels, - OffsetT num_rows, - OffsetT row_stride_samples, - cudaStream_t stream, - bool debug_synchronous, - Int2Type is_byte_sample) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return DispatchRange( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - d_levels, - num_row_pixels, - num_rows, - row_stride_samples, - stream, - is_byte_sample); - } -#endif // _CCCL_DOXYGEN_INVOKED - /** * Dispatch routine for HistogramRange, specialized for 8-bit sample types * (computes 256-bin privatized histograms and then reduces to user-specified levels) @@ -1197,39 +1164,6 @@ public: return error; } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - 
CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], - const int num_output_levels[NUM_ACTIVE_CHANNELS], - const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], - OffsetT num_row_pixels, - OffsetT num_rows, - OffsetT row_stride_samples, - cudaStream_t stream, - bool debug_synchronous, - Int2Type is_byte_sample) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return DispatchRange( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - d_levels, - num_row_pixels, - num_rows, - row_stride_samples, - stream, - is_byte_sample); - } -#endif // _CCCL_DOXYGEN_INVOKED - /** * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit * @@ -1415,41 +1349,6 @@ public: return error; } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], - const int num_output_levels[NUM_ACTIVE_CHANNELS], - const LevelT lower_level[NUM_ACTIVE_CHANNELS], - const LevelT upper_level[NUM_ACTIVE_CHANNELS], - OffsetT num_row_pixels, - OffsetT num_rows, - OffsetT row_stride_samples, - cudaStream_t stream, - bool debug_synchronous, - Int2Type is_byte_sample) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return DispatchEven( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - lower_level, - upper_level, - num_row_pixels, - num_rows, - row_stride_samples, - stream, - is_byte_sample); - } -#endif // _CCCL_DOXYGEN_INVOKED - /** * Dispatch routine for HistogramEven, specialized for 8-bit sample types * (computes 256-bin privatized histograms and then reduces to user-specified levels) @@ -1586,41 +1485,6 @@ public: return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( - void* d_temp_storage, - size_t& temp_storage_bytes, - SampleIteratorT d_samples, - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], - const int num_output_levels[NUM_ACTIVE_CHANNELS], - const LevelT lower_level[NUM_ACTIVE_CHANNELS], - const LevelT upper_level[NUM_ACTIVE_CHANNELS], - OffsetT num_row_pixels, - OffsetT num_rows, - OffsetT row_stride_samples, - cudaStream_t stream, - bool debug_synchronous, - Int2Type is_byte_sample) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return DispatchEven( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - lower_level, - upper_level, - num_row_pixels, - num_rows, - row_stride_samples, - stream, - is_byte_sample); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 507b7776de6..1d455bdfbf1 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -399,33 +399,6 @@ struct DispatchMergeSort , ptx_version(ptx_version) {} - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchMergeSort( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - ValueInputIteratorT d_input_items, - 
KeyIteratorT d_output_keys, - ValueIteratorT d_output_items, - OffsetT num_items, - CompareOpT compare_op, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_input_keys(d_input_keys) - , d_input_items(d_input_items) - , d_output_keys(d_output_keys) - , d_output_items(d_output_items) - , num_items(num_items) - , compare_op(compare_op) - , stream(stream) - , ptx_version(ptx_version) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - } - // Invocation template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() @@ -682,33 +655,6 @@ struct DispatchMergeSort return error; } - - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - KeyInputIteratorT d_input_keys, - ValueInputIteratorT d_input_items, - KeyIteratorT d_output_keys, - ValueIteratorT d_output_items, - OffsetT num_items, - CompareOpT compare_op, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_input_keys, - d_input_items, - d_output_keys, - d_output_items, - num_items, - compare_op, - stream); - } }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index ed971c1a739..0d4d9bf1ea9 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -934,33 +934,6 @@ struct DispatchRadixSort , decomposer(decomposer) {} - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchRadixSort( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - OffsetT num_items, - int begin_bit, - int end_bit, - bool is_overwrite_okay, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_keys(d_keys) - , d_values(d_values) - , num_items(num_items) - , begin_bit(begin_bit) - , end_bit(end_bit) - , stream(stream) - , ptx_version(ptx_version) - , is_overwrite_okay(is_overwrite_okay) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - } - //------------------------------------------------------------------------------ // Small-problem (single tile) invocation //------------------------------------------------------------------------------ @@ -1872,25 +1845,6 @@ struct DispatchRadixSort return error; } - - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - OffsetT num_items, - int begin_bit, - int end_bit, - bool is_overwrite_okay, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); - } }; /****************************************************************************** @@ -2027,39 +1981,6 @@ struct DispatchSegmentedRadixSort , decomposer(decomposer) {} - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedRadixSort( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - 
OffsetT num_items, - OffsetT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - bool is_overwrite_okay, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_keys(d_keys) - , d_values(d_values) - , num_items(num_items) - , num_segments(num_segments) - , d_begin_offsets(d_begin_offsets) - , d_end_offsets(d_end_offsets) - , begin_bit(begin_bit) - , end_bit(end_bit) - , stream(stream) - , ptx_version(ptx_version) - , is_overwrite_okay(is_overwrite_okay) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - } - //------------------------------------------------------------------------------ // Multi-segment invocation //------------------------------------------------------------------------------ @@ -2428,39 +2349,6 @@ struct DispatchSegmentedRadixSort return error; } - - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - int num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - bool is_overwrite_okay, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - is_overwrite_okay, - stream); - } }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index c9ea3fc1bd2..0cca1e1a982 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -345,33 +345,6 @@ struct DispatchReduce , launcher_factory(launcher_factory) {} -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchReduce( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - InitT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_in(d_in) - , d_out(d_out) - , num_items(num_items) - , reduction_op(reduction_op) - , init(init) - , stream(stream) - , ptx_version(ptx_version) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - } -#endif // _CCCL_DOXYGEN_INVOKED - //--------------------------------------------------------------------------- // Small-problem (single tile) invocation //--------------------------------------------------------------------------- @@ -689,25 +662,6 @@ struct DispatchReduce return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - InitT init, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; /** @@ -884,37 +838,6 @@ 
struct DispatchSegmentedReduce , ptx_version(ptx_version) {} -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedReduce( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - ReductionOpT reduction_op, - InitT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_in(d_in) - , d_out(d_out) - , num_segments(num_segments) - , d_begin_offsets(d_begin_offsets) - , d_end_offsets(d_end_offsets) - , reduction_op(reduction_op) - , init(init) - , stream(stream) - , ptx_version(ptx_version) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - } -#endif // _CCCL_DOXYGEN_INVOKED - //--------------------------------------------------------------------------- // Chained policy invocation //--------------------------------------------------------------------------- @@ -1109,37 +1032,6 @@ struct DispatchSegmentedReduce return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - ReductionOpT reduction_op, - InitT init, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - reduction_op, - init, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 8449a10ea62..804371588f3 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -541,39 +541,6 @@ struct DispatchReduceByKey return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - UniqueOutputIteratorT d_unique_out, - ValuesInputIteratorT d_values_in, - AggregatesOutputIteratorT d_aggregates_out, - NumRunsOutputIteratorT d_num_runs_out, - EqualityOpT equality_op, - ReductionOpT reduction_op, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - equality_op, - reduction_op, - num_items, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 5c8f1e01d0f..b1542462a58 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -539,35 +539,6 @@ struct DeviceRleDispatch return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - 
void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OffsetsOutputIteratorT d_offsets_out, - LengthsOutputIteratorT d_lengths_out, - NumRunsOutputIteratorT d_num_runs_out, - EqualityOpT equality_op, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - equality_op, - num_items, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 676b08df49e..0ba4cc1dcae 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -329,33 +329,6 @@ struct DispatchScan , ptx_version(ptx_version) {} -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ScanOpT scan_op, - InitValueT init_value, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_in(d_in) - , d_out(d_out) - , scan_op(scan_op) - , init_value(init_value) - , num_items(num_items) - , stream(stream) - , ptx_version(ptx_version) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - } -#endif // _CCCL_DOXYGEN_INVOKED - template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { @@ -588,25 +561,6 @@ struct DispatchScan return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 6f2432874b5..c88656dff48 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -341,37 +341,6 @@ struct DispatchScanByKey , ptx_version(ptx_version) {} -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScanByKey( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - EqualityOp equality_op, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_keys_in(d_keys_in) - , d_values_in(d_values_in) - , d_values_out(d_values_out) - , equality_op(equality_op) - , scan_op(scan_op) - , init_value(init_value) - , num_items(num_items) - , stream(stream) - , ptx_version(ptx_version) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - } -#endif // 
_CCCL_DOXYGEN_INVOKED - template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { @@ -616,37 +585,6 @@ struct DispatchScanByKey return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - EqualityOp equality_op, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_values_out, - equality_op, - scan_op, - init_value, - num_items, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 9250ab87f61..9d011d414ba 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -820,35 +820,6 @@ struct DispatchSegmentedSort , stream(stream) {} -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedSort( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - OffsetT num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - bool is_overwrite_okay, - cudaStream_t stream, - bool debug_synchronous) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_keys(d_keys) - , d_values(d_values) - , num_items(num_items) - , num_segments(num_segments) - , d_begin_offsets(d_begin_offsets) - , d_end_offsets(d_end_offsets) - , is_overwrite_okay(is_overwrite_okay) - , stream(stream) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - } -#endif // _CCCL_DOXYGEN_INVOKED - template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { @@ -1128,37 +1099,6 @@ struct DispatchSegmentedSort return error; } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - DoubleBuffer& d_keys, - DoubleBuffer& d_values, - OffsetT num_items, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - bool is_overwrite_okay, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - is_overwrite_okay, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED - private: CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int GetNumPasses(int radix_bits) { diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index be8b09a5c9a..c41dfb389eb 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -840,37 +840,6 @@ struct DispatchSelectIf return CubDebug(PolicyHub::MaxPolicy::Invoke(ptx_version, dispatch)); } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - 
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagsInputIteratorT d_flags, - SelectedOutputIteratorT d_selected_out, - NumSelectedIteratorT d_num_selected_out, - SelectOpT select_op, - EqualityOpT equality_op, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_selected_out, - d_num_selected_out, - select_op, - equality_op, - num_items, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index 2fb435699db..6dc4f44aeca 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -892,44 +892,6 @@ struct DispatchSpmv return error; } -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN - _CCCL_FORCEINLINE static cudaError_t - Dispatch(void* d_temp_storage, - size_t& temp_storage_bytes, - SpmvParamsT& spmv_params, - cudaStream_t stream, - bool debug_synchronous, - Spmv1ColKernelT spmv_1col_kernel, - SpmvSearchKernelT spmv_search_kernel, - SpmvKernelT spmv_kernel, - SegmentFixupKernelT segment_fixup_kernel, - SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel, - KernelConfig spmv_config, - KernelConfig segment_fixup_config) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - spmv_params, - stream, - spmv_1col_kernel, - spmv_search_kernel, - spmv_kernel, - segment_fixup_kernel, - spmv_empty_matrix_kernel, - spmv_config, - segment_fixup_config); - } -#endif // _CCCL_DOXYGEN_INVOKED - /** * @brief Internal dispatch routine for computing a device-wide reduction * @@ -988,21 +950,6 @@ struct DispatchSpmv return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - SpmvParamsT& spmv_params, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 90295f2c06f..2d5566d76a3 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -46,17 +46,97 @@ #include -#include #include #include CUB_NAMESPACE_BEGIN +namespace detail +{ + +namespace three_way_partition +{ +// Offset type used to instantiate the stream three-way-partition-kernel and agent to index the items within one +// partition +using per_partition_offset_t = ::cuda::std::int32_t; + +template +class streaming_context_t +{ +private: + bool first_partition = true; + bool last_partition = false; + TotalNumItemsT total_previous_num_items{}; + + // We use a double-buffer for keeping track of the number of previously selected items + TotalNumItemsT* d_num_selected_in = nullptr; + TotalNumItemsT* d_num_selected_out = nullptr; + +public: + _CCCL_HOST_DEVICE 
_CCCL_FORCEINLINE + streaming_context_t(TotalNumItemsT* d_num_selected_in, TotalNumItemsT* d_num_selected_out, bool is_last_partition) + : last_partition(is_last_partition) + , d_num_selected_in(d_num_selected_in) + , d_num_selected_out(d_num_selected_out) + {} + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void advance(TotalNumItemsT num_items, bool next_partition_is_the_last) + { + ::cuda::std::swap(d_num_selected_in, d_num_selected_out); + first_partition = false; + last_partition = next_partition_is_the_last; + total_previous_num_items += num_items; + }; + + _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TotalNumItemsT input_offset() const + { + return first_partition ? TotalNumItemsT{0} : total_previous_num_items; + }; + + _CCCL_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected_first() const + { + return first_partition ? TotalNumItemsT{0} : d_num_selected_in[0]; + }; + + _CCCL_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_selected_second() const + { + return first_partition ? TotalNumItemsT{0} : d_num_selected_in[1]; + }; + + _CCCL_DEVICE _CCCL_FORCEINLINE TotalNumItemsT num_previously_rejected() const + { + return first_partition ? TotalNumItemsT{0} : d_num_selected_in[2]; + ; + }; + + template + _CCCL_DEVICE _CCCL_FORCEINLINE void update_num_selected( + NumSelectedIteratorT user_num_selected_out_it, + TotalNumItemsT num_selected_first, + TotalNumItemsT num_selected_second, + TotalNumItemsT num_items_in_partition) const + { + if (last_partition) + { + user_num_selected_out_it[0] = num_previously_selected_first() + num_selected_first; + user_num_selected_out_it[1] = num_previously_selected_second() + num_selected_second; + } + else + { + d_num_selected_out[0] = num_previously_selected_first() + num_selected_first; + d_num_selected_out[1] = num_previously_selected_second() + num_selected_second; + d_num_selected_out[2] = + num_previously_rejected() + (num_items_in_partition - num_selected_second - num_selected_first); + } + } +}; +} // namespace three_way_partition +} // namespace detail + /****************************************************************************** * Kernel entry points *****************************************************************************/ - template + typename OffsetT, + typename StreamingContextT> __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionKernel( InputIteratorT d_in, @@ -78,7 +159,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, - int num_tiles) + int num_tiles, + _CCCL_GRID_CONSTANT const StreamingContextT streaming_context) { using AgentThreeWayPartitionPolicyT = typename ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy; @@ -91,7 +173,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO UnselectedOutputIteratorT, SelectFirstPartOp, SelectSecondPartOp, - OffsetT>; + OffsetT, + StreamingContextT>; // Shared memory for AgentThreeWayPartition __shared__ typename AgentThreeWayPartitionT::TempStorage temp_storage; @@ -105,7 +188,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO d_unselected_out, select_first_part_op, select_second_part_op, - num_items) + num_items, + streaming_context) .ConsumeRange(num_tiles, tile_status, d_num_selected_out); } @@ -160,14 +244,23 @@ template , OffsetT>> + typename PolicyHub = detail::three_way_partition:: 
+ policy_hub, detail::three_way_partition::per_partition_offset_t>> struct DispatchThreeWayPartitionIf { /***************************************************************************** * Types and constants ****************************************************************************/ - using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; + // Offset type used to instantiate the three-way partition-kernel and agent to index the items within one partition + using per_partition_offset_t = detail::three_way_partition::per_partition_offset_t; + + // Type used to provide streaming information about each partition's context + static constexpr per_partition_offset_t partition_size = ::cuda::std::numeric_limits::max(); + + using streaming_context_t = detail::three_way_partition::streaming_context_t; + + using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; using AccumPackT = typename AccumPackHelperT::pack_t; using ScanTileStateT = cub::ScanTileState; @@ -222,64 +315,77 @@ struct DispatchThreeWayPartitionIf constexpr int block_threads = ActivePolicyT::ThreeWayPartitionPolicy::BLOCK_THREADS; constexpr int items_per_thread = ActivePolicyT::ThreeWayPartitionPolicy::ITEMS_PER_THREAD; + constexpr int tile_size = block_threads * items_per_thread; - do - { - // Get device ordinal - int device_ordinal; - error = CubDebug(cudaGetDevice(&device_ordinal)); - if (cudaSuccess != error) - { - break; - } + // The maximum number of items for which we will ever invoke the kernel (i.e. largest partition size) + auto const max_partition_size = + static_cast(::cuda::std::min(static_cast(num_items), static_cast(partition_size))); - // Number of input tiles - int tile_size = block_threads * items_per_thread; - int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size)); + // The number of partitions required to "iterate" over the total input + auto const num_partitions = + (max_partition_size == 0) ? OffsetT{1} : ::cuda::ceil_div(num_items, max_partition_size); - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; // bytes needed for tile status descriptors + // The maximum number of tiles for which we will ever invoke the kernel + auto const max_num_tiles_per_invocation = static_cast(::cuda::ceil_div(max_partition_size, tile_size)); - error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); - if (cudaSuccess != error) - { - break; - } + // For streaming invocations, we need two sets (for double-buffering) of three counters each + constexpr ::cuda::std::size_t num_counters_per_pass = 3; + constexpr ::cuda::std::size_t num_streaming_counters = 2 * num_counters_per_pass; + ::cuda::std::size_t streaming_selection_storage_bytes = + (num_partitions > 1) ? 
num_streaming_counters * sizeof(OffsetT) : ::cuda::std::size_t{0}; - // Compute allocation pointers into the single storage blob (or compute - // the necessary size of the blob) - void* allocations[1] = {}; + // Specify temporary storage allocation requirements + size_t allocation_sizes[2] = {0ULL, streaming_selection_storage_bytes}; - error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - if (cudaSuccess != error) - { - break; - } + error = + CubDebug(ScanTileStateT::AllocationSize(static_cast(max_num_tiles_per_invocation), allocation_sizes[0])); + if (cudaSuccess != error) + { + return error; + } - if (d_temp_storage == nullptr) - { - // Return if the caller is simply requesting the size of the storage - // allocation - break; - } + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[2] = {}; - // Return if empty problem - if (num_items == 0) - { - break; - } + error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + return error; + } + + if (d_temp_storage == nullptr) + { + // Return if the caller is simply requesting the size of the storage + // allocation + return cudaSuccess; + } + + // Initialize the streaming context with the temporary storage for double-buffering the previously selected items + // and the total number (across all partitions) of items + OffsetT* tmp_num_selected_out = static_cast(allocations[1]); + streaming_context_t streaming_context{ + tmp_num_selected_out, (tmp_num_selected_out + num_counters_per_pass), (num_partitions <= 1)}; + + // Iterate over the partitions until all input is processed + for (OffsetT partition_idx = 0; partition_idx < num_partitions; partition_idx++) + { + OffsetT current_partition_offset = partition_idx * max_partition_size; + OffsetT current_num_items = + (partition_idx + 1 == num_partitions) ? 
(num_items - current_partition_offset) : max_partition_size; // Construct the tile status interface - ScanTileStateT tile_status; + const auto current_num_tiles = static_cast(::cuda::ceil_div(current_num_items, tile_size)); - error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); + // Construct the tile status interface + ScanTileStateT tile_status; + error = CubDebug(tile_status.Init(current_num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { - break; + return error; } // Log three_way_partition_init_kernel configuration - int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(num_tiles, INIT_KERNEL_THREADS)); + int init_grid_size = CUB_MAX(1, ::cuda::ceil_div(current_num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking three_way_partition_init_kernel<<<%d, %d, 0, %lld>>>()\n", @@ -290,36 +396,29 @@ struct DispatchThreeWayPartitionIf // Invoke three_way_partition_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) - .doit(three_way_partition_init_kernel, tile_status, num_tiles, d_num_selected_out); + .doit(three_way_partition_init_kernel, tile_status, current_num_tiles, d_num_selected_out); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { - break; + return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { - break; + return error; } - // Get max x-dimension of grid - int max_dim_x; - error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); - if (cudaSuccess != error) + // No more items to process (note, we do not want to return early for num_items==0, because we need to make sure + // that `three_way_partition_init_kernel` has written '0' to d_num_selected_out) + if (current_num_items == 0) { - break; + return cudaSuccess; } - // Get grid size for scanning tiles - dim3 scan_grid_size; - scan_grid_size.z = 1; - scan_grid_size.y = ::cuda::ceil_div(num_tiles, max_dim_x); - scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); - // Log select_if_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG { @@ -330,14 +429,12 @@ struct DispatchThreeWayPartitionIf block_threads)); if (cudaSuccess != error) { - break; + return error; } - _CubLog("Invoking three_way_partition_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d " + _CubLog("Invoking three_way_partition_kernel<<<%d, %d, 0, %lld>>>(), %d " "items per thread, %d SM occupancy\n", - scan_grid_size.x, - scan_grid_size.y, - scan_grid_size.z, + current_num_tiles, block_threads, reinterpret_cast(stream), items_per_thread, @@ -346,7 +443,7 @@ struct DispatchThreeWayPartitionIf #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke select_if_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(current_num_tiles, block_threads, 0, stream) .doit(three_way_partition_kernel, d_in, d_first_part_out, @@ -356,23 +453,27 @@ struct DispatchThreeWayPartitionIf tile_status, select_first_part_op, select_second_part_op, - num_items, - num_tiles); + static_cast(current_num_items), + current_num_tiles, + streaming_context); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { - break; + return error; } // Sync the stream if specified to flush runtime errors error = 
CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { - break; + return error; } - } while (0); + + // Prepare streaming context for next partition (swap double buffers, advance number of processed items, etc.) + streaming_context.advance(current_num_items, (partition_idx + OffsetT{2} == num_partitions)); + } return error; } @@ -393,7 +494,8 @@ struct DispatchThreeWayPartitionIf ScanTileStateT, SelectFirstPartOp, SelectSecondPartOp, - OffsetT>); + per_partition_offset_t, + streaming_context_t>); } /** @@ -449,39 +551,6 @@ struct DispatchThreeWayPartitionIf return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - std::size_t& temp_storage_bytes, - InputIteratorT d_in, - FirstOutputIteratorT d_first_part_out, - SecondOutputIteratorT d_second_part_out, - UnselectedOutputIteratorT d_unselected_out, - NumSelectedIteratorT d_num_selected_out, - SelectFirstPartOp select_first_part_op, - SelectSecondPartOp select_second_part_op, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_first_part_out, - d_second_part_out, - d_unselected_out, - d_num_selected_out, - select_first_part_op, - select_second_part_op, - num_items, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 386a6276dfa..fa4fa80d0ef 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -169,11 +169,10 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::st } template -_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) - -> decltype(poor_apply_impl( - ::cuda::std::forward(f), - ::cuda::std::forward(t), - ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::remove_reference_t>::value>{})) +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) -> decltype(poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::remove_reference_t>::value>{})) { return poor_apply_impl( ::cuda::std::forward(f), @@ -473,8 +472,9 @@ using needs_aligned_ptr_t = #ifdef _CUB_HAS_TRANSFORM_UBLKCP template ::value, int> = 0> -_CCCL_DEVICE _CCCL_FORCEINLINE auto select_kernel_arg( - ::cuda::std::integral_constant, kernel_arg&& arg) -> aligned_base_ptr>&& +_CCCL_DEVICE _CCCL_FORCEINLINE auto +select_kernel_arg(::cuda::std::integral_constant, kernel_arg&& arg) + -> aligned_base_ptr>&& { return ::cuda::std::move(arg.aligned_ptr); } @@ -660,10 +660,9 @@ struct dispatch_t - CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() - -> PoorExpected< - ::cuda::std:: - tuple> + CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() -> PoorExpected< + ::cuda::std:: + tuple> { using policy_t = typename ActivePolicy::algo_policy; constexpr int block_dim = policy_t::block_threads; diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 46ac0a44b9b..e07084fe24a 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -322,35 
+322,6 @@ struct DispatchUniqueByKey , stream(stream) {} -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchUniqueByKey( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, - EqualityOpT equality_op, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_keys_in(d_keys_in) - , d_values_in(d_values_in) - , d_keys_out(d_keys_out) - , d_values_out(d_values_out) - , d_num_selected_out(d_num_selected_out) - , equality_op(equality_op) - , num_items(num_items) - , stream(stream) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - } -#endif // _CCCL_DOXYGEN_INVOKED - /****************************************************************************** * Dispatch entrypoints ******************************************************************************/ @@ -626,37 +597,6 @@ struct DispatchUniqueByKey return error; } - -#ifndef _CCCL_DOXYGEN_INVOKED // Do not document - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, - EqualityOpT equality_op, - OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) - { - CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_values_in, - d_keys_out, - d_values_out, - d_num_selected_out, - equality_op, - num_items, - stream); - } -#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 2163c4b7431..2efa551d4c6 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -108,18 +108,6 @@ constexpr accum_size classify_accum_size() : accum_size::unknown; } -template -struct tuning -{ - static constexpr int threads = Threads; - static constexpr int items = Items; - using delay_constructor = fixed_delay_constructor_t; - static constexpr BlockLoadAlgorithm load_algorithm = - (sizeof(AccumT) > 128) ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr BlockStoreAlgorithm store_algorithm = - (sizeof(AccumT) > 128) ? BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; -}; - template (), @@ -209,17 +197,31 @@ template ()> struct sm90_tuning; +template +struct sm90_tuning_vals +{ + static constexpr int threads = Threads; + static constexpr int items = Items; + using delay_constructor = fixed_delay_constructor_t; + // same logic as default policy: + static constexpr bool large_values = sizeof(AccumT) > 128; + static constexpr BlockLoadAlgorithm load_algorithm = + large_values ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; + static constexpr BlockStoreAlgorithm store_algorithm = + large_values ? 
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; +}; + // clang-format off -template struct sm90_tuning : tuning {}; -template struct sm90_tuning : tuning {}; -template struct sm90_tuning : tuning {}; -template struct sm90_tuning : tuning {}; +template struct sm90_tuning : sm90_tuning_vals {}; +template struct sm90_tuning : sm90_tuning_vals {}; +template struct sm90_tuning : sm90_tuning_vals {}; +template struct sm90_tuning : sm90_tuning_vals {}; -template <> struct sm90_tuning : tuning {}; -template <> struct sm90_tuning : tuning {}; +template <> struct sm90_tuning : sm90_tuning_vals {}; +template <> struct sm90_tuning : sm90_tuning_vals {}; #if CUB_IS_INT128_ENABLED -template <> struct sm90_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : tuning<__int128_t, 576, 21, 860, 630> {}; +template <> struct sm90_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : sm90_tuning_vals<__int128_t, 576, 21, 860, 630> {}; template <> struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : sm90_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh index c6894ccbc86..3645e4b9ed7 100644 --- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh @@ -255,9 +255,8 @@ struct policy_hub typename Tuning::delay_constructor>; template - static auto select_agent_policy(long) -> - typename DefaultPolicy< - default_delay_constructor_t::pack_t>>::ThreeWayPartitionPolicy; + static auto select_agent_policy(long) -> typename DefaultPolicy< + default_delay_constructor_t::pack_t>>::ThreeWayPartitionPolicy; struct Policy800 : ChainedPolicy<800, Policy800, Policy350> { diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 7af32df392c..ba189a9ad13 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -380,33 +380,6 @@ struct ReduceByKeyOp } }; -//! Deprecated [Since 2.8] -template -struct CCCL_DEPRECATED BinaryFlip -{ - BinaryOpT binary_op; - - _CCCL_HOST_DEVICE explicit BinaryFlip(BinaryOpT binary_op) - : binary_op(binary_op) - {} - - template - _CCCL_DEVICE auto - operator()(T&& t, U&& u) -> decltype(binary_op(::cuda::std::forward(u), ::cuda::std::forward(t))) - { - return binary_op(::cuda::std::forward(u), ::cuda::std::forward(t)); - } -}; - -_CCCL_SUPPRESS_DEPRECATED_PUSH -//! Deprecated [Since 2.8] -template -CCCL_DEPRECATED _CCCL_HOST_DEVICE BinaryFlip MakeBinaryFlip(BinaryOpT binary_op) -{ - return BinaryFlip(binary_op); -} -_CCCL_SUPPRESS_DEPRECATED_POP - #ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace internal diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 294bc449e31..d3850051ca7 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -543,8 +543,8 @@ ThreadReduceTernaryTree(const Input& input, ReductionOp reduction_op) // never reached. 
Protect instantion of ThreadReduceSimd with arbitrary types and operators _CCCL_TEMPLATE(typename Input, typename ReductionOp) _CCCL_REQUIRES((!cub::internal::enable_generic_simd_reduction())) -_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto -ThreadReduceSimd(const Input& input, ReductionOp) -> ::cuda::std::remove_cvref_t +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp) + -> ::cuda::std::remove_cvref_t { assert(false); return input[0]; @@ -552,8 +552,8 @@ ThreadReduceSimd(const Input& input, ReductionOp) -> ::cuda::std::remove_cvref_t _CCCL_TEMPLATE(typename Input, typename ReductionOp) _CCCL_REQUIRES((cub::internal::enable_generic_simd_reduction())) -_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto -ThreadReduceSimd(const Input& input, ReductionOp reduction_op) -> ::cuda::std::remove_cvref_t +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto ThreadReduceSimd(const Input& input, ReductionOp reduction_op) + -> ::cuda::std::remove_cvref_t { using cub::detail::unsafe_bitcast; using T = ::cuda::std::remove_cvref_t; diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh index 7d9e8622f82..e21d9dd3ecd 100644 --- a/cub/cub/thread/thread_sort.cuh +++ b/cub/cub/thread/thread_sort.cuh @@ -45,6 +45,7 @@ CUB_NAMESPACE_BEGIN template +CCCL_DEPRECATED_BECAUSE("Use cuda::std::swap") _CCCL_DEVICE _CCCL_FORCEINLINE void Swap(T& lhs, T& rhs) { T temp = lhs; @@ -95,10 +96,11 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE { if (compare_op(keys[j + 1], keys[j])) { - Swap(keys[j], keys[j + 1]); + using ::cuda::std::swap; + swap(keys[j], keys[j + 1]); if (!KEYS_ONLY) { - Swap(items[j], items[j + 1]); + swap(items[j], items[j + 1]); } } } // inner loop diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index a895884a60d..7f936258ab6 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -116,28 +116,30 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val); #ifndef _CCCL_DOXYGEN_INVOKED // Do not document +namespace detail +{ /// Helper structure for templated store iteration (inductive case) template -struct IterateThreadStore +struct iterate_thread_store { template static _CCCL_DEVICE _CCCL_FORCEINLINE void Store(T* ptr, T* vals) { ThreadStore(ptr + COUNT, vals[COUNT]); - IterateThreadStore::template Store(ptr, vals); + iterate_thread_store::template Store(ptr, vals); } template static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(OutputIteratorT ptr, T* vals) { ptr[COUNT] = vals[COUNT]; - IterateThreadStore::Dereference(ptr, vals); + iterate_thread_store::Dereference(ptr, vals); } }; /// Helper structure for templated store iteration (termination case) template -struct IterateThreadStore +struct iterate_thread_store { template static _CCCL_DEVICE _CCCL_FORCEINLINE void Store(T* /*ptr*/, T* /*vals*/) @@ -147,6 +149,10 @@ struct IterateThreadStore static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(OutputIteratorT /*ptr*/, T* /*vals*/) {} }; +} // namespace detail + +template +using IterateThreadStore CCCL_DEPRECATED = detail::iterate_thread_store; /** * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier @@ -305,7 +311,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStoreVolatilePtr(T* ptr, T val, Int2Ty reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; } - IterateThreadStore<0, VOLATILE_MULTIPLE>::Dereference(reinterpret_cast(ptr), words); + 
detail::iterate_thread_store<0, VOLATILE_MULTIPLE>::Dereference(reinterpret_cast(ptr), words); } /** @@ -340,7 +346,7 @@ ThreadStore(T* ptr, T val, Int2Type /*modifier*/, Int2Type /*is_ reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; } - IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( + detail::iterate_thread_store<0, DEVICE_MULTIPLE>::template Store( reinterpret_cast(ptr), words); } diff --git a/cub/cub/util_compiler.cuh b/cub/cub/util_compiler.cuh deleted file mode 100644 index b34a889fd21..00000000000 --- a/cub/cub/util_compiler.cuh +++ /dev/null @@ -1,110 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Detect compiler information. - */ - -#pragma once - -// For _CCCL_IMPLICIT_SYSTEM_HEADER -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -// enumerate host compilers we know about -//! deprecated [Since 2.7] -#define CUB_HOST_COMPILER_UNKNOWN 0 -//! deprecated [Since 2.7] -#define CUB_HOST_COMPILER_MSVC 1 -//! deprecated [Since 2.7] -#define CUB_HOST_COMPILER_GCC 2 -//! deprecated [Since 2.7] -#define CUB_HOST_COMPILER_CLANG 3 - -// enumerate device compilers we know about -//! deprecated [Since 2.7] -#define CUB_DEVICE_COMPILER_UNKNOWN 0 -//! deprecated [Since 2.7] -#define CUB_DEVICE_COMPILER_MSVC 1 -//! deprecated [Since 2.7] -#define CUB_DEVICE_COMPILER_GCC 2 -//! deprecated [Since 2.7] -#define CUB_DEVICE_COMPILER_NVCC 3 -//! deprecated [Since 2.7] -#define CUB_DEVICE_COMPILER_CLANG 4 - -// figure out which host compiler we're using -#if _CCCL_COMPILER(MSVC) -//! deprecated [Since 2.7] -# define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC -//! 
deprecated [Since 2.7] -# define CUB_MSVC_VERSION _MSC_VER -//! deprecated [Since 2.7] -# define CUB_MSVC_VERSION_FULL _MSC_FULL_VER -#elif _CCCL_COMPILER(CLANG) -//! deprecated [Since 2.7] -# define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG -//! deprecated [Since 2.7] -# define CUB_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) -#elif _CCCL_COMPILER(GCC) -//! deprecated [Since 2.7] -# define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC -//! deprecated [Since 2.7] -# define CUB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#endif - -// figure out which device compiler we're using -#if _CCCL_CUDA_COMPILER(NVCC) || _CCCL_CUDA_COMPILER(NVHPC) -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC -#elif _CCCL_COMPILER(MSVC) -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC -#elif _CCCL_COMPILER(GCC) -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC -#elif _CCCL_COMPILER(CLANG) -// CUDA-capable clang should behave similar to NVCC. -# if _CCCL_CUDA_COMPILER(NVCC) -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC -# else -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG -# endif -#else -//! deprecated [Since 2.7] -# define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN -#endif diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index 5a4f4a63825..a6eee36539c 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -40,23 +40,13 @@ # pragma system_header #endif // no system header -#include // IWYU pragma: export - #ifndef _CCCL_DOXYGEN_INVOKED // Do not document // Deprecation warnings may be silenced by defining the following macros. These // may be combined. -// - CCCL_IGNORE_DEPRECATED_CPP_DIALECT: -// Ignore all deprecated C++ dialects and outdated compilers. -// - CCCL_IGNORE_DEPRECATED_CPP_11: -// Ignore deprecation warnings when compiling with C++11. C++03 and outdated -// compilers will still issue warnings. -// - CCCL_IGNORE_DEPRECATED_CPP_14: -// Ignore deprecation warnings when compiling with C++14. C++03 and outdated -// compilers will still issue warnings. // - CCCL_IGNORE_DEPRECATED_COMPILER // Ignore deprecation warnings when using deprecated compilers. Compiling -// with C++03, C++11 and C++14 will still issue warnings. +// with deprecated C++ dialects will still issue warnings. # define CUB_CPP_DIALECT _CCCL_STD_VER @@ -67,6 +57,7 @@ # define CUB_COMP_DEPR_IMPL(msg) _CCCL_PRAGMA(GCC warning #msg) # endif +// Compiler checks: // clang-format off # define CUB_COMPILER_DEPRECATION(REQ) \ CUB_COMP_DEPR_IMPL(CUB requires at least REQ. Define CCCL_IGNORE_DEPRECATED_COMPILER to suppress this message.) @@ -74,12 +65,10 @@ # define CUB_COMPILER_DEPRECATION_SOFT(REQ, CUR) \ CUB_COMP_DEPR_IMPL( \ CUB requires at least REQ. CUR is deprecated but still supported. CUR support will be removed in a \ - future release. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) + future release. Define CCCL_IGNORE_DEPRECATED_COMPILER to suppress this message.) // clang-format on # ifndef CCCL_IGNORE_DEPRECATED_COMPILER - -// Compiler checks: # if _CCCL_COMPILER(GCC, <, 7) CUB_COMPILER_DEPRECATION(GCC 7.0); # elif _CCCL_COMPILER(CLANG, <, 7) @@ -91,24 +80,18 @@ CUB_COMPILER_DEPRECATION(MSVC 2019(19.20 / 16.0 / 14.20)); // >=2017, <2019. 
Soft deprecation message: CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); # endif - # endif // CCCL_IGNORE_DEPRECATED_COMPILER -# if _CCCL_STD_VER < 2011 -// = 2017 - # undef CUB_COMPILER_DEPRECATION_SOFT # undef CUB_COMPILER_DEPRECATION + +// C++17 dialect check: +# ifndef CCCL_IGNORE_DEPRECATED_CPP_DIALECT +# if _CCCL_STD_VER < 2017 +CUB_COMP_DEPR_IMPL(CUB requires at least C++ 17. Define CCCL_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) +# endif // _CCCL_STD_VER >= 2017 +# endif + # undef CUB_COMP_DEPR_IMPL -# undef CUB_COMP_DEPR_IMPL0 -# undef CUB_COMP_DEPR_IMPL1 #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index 3971c6a99ca..275c915e8f2 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -309,18 +309,4 @@ inline _CCCL_HOST_DEVICE void va_printf(char const*, Args const&...) # endif #endif -#define CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED \ - CCCL_DEPRECATED_BECAUSE( \ - "CUB no longer accepts `debug_synchronous` parameter. " \ - "Define CUB_DEBUG_SYNC instead, or silence this message with " \ - "CCCL_IGNORE_DEPRECATED_API.") - -#define CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG \ - if (debug_synchronous) \ - { \ - _CubLog("%s\n", \ - "CUB no longer accepts `debug_synchronous` parameter. " \ - "Define CUB_DEBUG_SYNC instead."); \ - } - CUB_NAMESPACE_END diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 0ff0f76df9f..bd96393ae5f 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -130,24 +130,6 @@ CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() return count; } -/** - * \brief Cache for an arbitrary value produced by a nullary function. - * deprecated [Since 2.6.0] - */ -template -struct CCCL_DEPRECATED ValueCache -{ - T const value; - - /** - * \brief Call the nullary function to produce the value and construct the - * cache. - */ - _CCCL_HOST inline ValueCache() - : value(Function()) - {} -}; - // Host code. This is a separate function to avoid defining a local static in a host/device function. 
_CCCL_HOST inline int DeviceCountCachedValue() { diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 17201c4704f..c86d24754de 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -227,10 +227,8 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) endif() # CUB_SEPARATE_CATCH2 if ("${test_target}" MATCHES "nvrtc") - target_compile_definitions(${test_target} PRIVATE NVRTC_CUB_PATH="-I${CMAKE_SOURCE_DIR}/cub") - target_compile_definitions(${test_target} PRIVATE NVRTC_THRUST_PATH="-I${CMAKE_SOURCE_DIR}/thrust") - target_compile_definitions(${test_target} PRIVATE NVRTC_LIBCUDACXX_PATH="-I${CMAKE_SOURCE_DIR}/libcudacxx/include") - target_compile_definitions(${test_target} PRIVATE NVRTC_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}") + configure_file("cmake/nvrtc_args.h.in" ${CMAKE_CURRENT_BINARY_DIR}/nvrtc_args.h) + target_include_directories(${test_target} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) endif() if ("${test_target}" MATCHES "test.iterator") diff --git a/cub/test/catch2_segmented_sort_helper.cuh b/cub/test/catch2_segmented_sort_helper.cuh index 3e93b3f0d62..0852921bebf 100644 --- a/cub/test/catch2_segmented_sort_helper.cuh +++ b/cub/test/catch2_segmented_sort_helper.cuh @@ -1388,11 +1388,11 @@ struct generate_edge_case_offsets_dispatch static constexpr int a_bunch_of = 42; static constexpr int a_lot_of = 420; - int small_segment_max_segment_size; - int items_per_small_segment; - int medium_segment_max_segment_size; - int single_thread_segment_size; - int large_cached_segment_max_segment_size; + int small_segment_max_segment_size{}; + int items_per_small_segment{}; + int medium_segment_max_segment_size{}; + int single_thread_segment_size{}; + int large_cached_segment_max_segment_size{}; template CUB_RUNTIME_FUNCTION cudaError_t Invoke() diff --git a/cub/test/catch2_test_device_for_each_in_extents.cu b/cub/test/catch2_test_device_for_each_in_extents.cu index 8ad75a1d0cb..3e5a6c6689a 100644 --- a/cub/test/catch2_test_device_for_each_in_extents.cu +++ b/cub/test/catch2_test_device_for_each_in_extents.cu @@ -135,8 +135,8 @@ using dimensions = cuda::std::index_sequence<3, 2, 5, 4>>; template -auto build_static_extents(IndexType, - cuda::std::index_sequence) -> cuda::std::extents +auto build_static_extents(IndexType, cuda::std::index_sequence) + -> cuda::std::extents { return {}; } diff --git a/cub/test/catch2_test_device_three_way_partition.cu b/cub/test/catch2_test_device_three_way_partition.cu index 8c6524adf7c..3b5f96c8d60 100644 --- a/cub/test/catch2_test_device_three_way_partition.cu +++ b/cub/test/catch2_test_device_three_way_partition.cu @@ -30,6 +30,10 @@ #include +#include +#include +#include +#include #include #include #include @@ -38,6 +42,8 @@ #include +#include "catch2_large_problem_helper.cuh" +#include "catch2_test_device_select_common.cuh" #include "catch2_test_launch_helper.h" #include "cub/util_type.cuh" #include @@ -48,20 +54,8 @@ DECLARE_LAUNCH_WRAPPER(cub::DevicePartition::If, partition); using types = c2h::type_list; -template -struct less_than_t -{ - T compare; - - explicit __host__ less_than_t(T compare) - : compare(compare) - {} - - __device__ bool operator()(const T& a) const - { - return a < compare; - } -}; +// List of offset types to be used for testing large number of items +using offset_types = c2h::type_list; template struct equal_to_t @@ -103,6 +97,29 @@ struct count_to_pair_t } }; +template +struct mod_equal_to +{ + T mod; + T val; + __host__ __device__ bool operator()(T x) const + { + return x % mod == 
val; + } +}; + +template +struct multiply_and_add +{ + T mul; + T add; + + __host__ __device__ T operator()(T x) const + { + return x * mul + add; + } +}; + C2H_TEST("Device three-way partition can handle empty problems", "[partition][device]", types) { using type = typename c2h::get<0, TestType>; @@ -113,12 +130,15 @@ C2H_TEST("Device three-way partition can handle empty problems", "[partition][de type* d_first_part_out{}; type* d_second_part_out{}; type* d_unselected_out{}; - type* d_num_selected_out{}; + c2h::device_vector num_selected_out{42, 42}; + type* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); less_than_t le(type{0}); greater_or_equal_t ge(type{1}); partition(in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, num_items, le, ge); + REQUIRE(num_selected_out[0] == 0); + REQUIRE(num_selected_out[1] == 0); } template @@ -440,3 +460,60 @@ C2H_TEST("Device three-way partition handles single output", "[partition][device second_part_val); REQUIRE(actual_num_items_in_second_part == num_items_in_second_part); } + +C2H_TEST("Device three-way partition works for very large number of items", "[device][partition]", offset_types) +try +{ + using offset_t = typename c2h::get<0, TestType>; + + auto num_items_max_ull = + std::min(static_cast(::cuda::std::numeric_limits::max()), + ::cuda::std::numeric_limits::max() + static_cast(2000000ULL)); + offset_t num_items_max = static_cast(num_items_max_ull); + offset_t num_items_min = + num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; + offset_t num_items = GENERATE_COPY( + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); + + auto in = thrust::make_counting_iterator(offset_t{0}); + + auto first_selector = mod_equal_to{3, 0}; + auto second_selector = mod_equal_to{3, 1}; + + offset_t expected_first = num_items / offset_t{3} + (num_items % offset_t{3} >= 1); + offset_t expected_second = num_items / offset_t{3} + (num_items % offset_t{3} >= 2); + offset_t expected_third = num_items / offset_t{3}; + + auto expected_first_it = thrust::make_transform_iterator(in, multiply_and_add{3, 0}); + auto expected_second_it = thrust::make_transform_iterator(in, multiply_and_add{3, 1}); + auto expected_third_it = thrust::make_transform_iterator(in, multiply_and_add{3, 2}); + + // Prepare tabulate output iterators to verify results in a memory-efficient way + auto check_first_partition_helper = detail::large_problem_test_helper(expected_first); + auto check_first_it = check_first_partition_helper.get_flagging_output_iterator(expected_first_it); + auto check_second_partition_helper = detail::large_problem_test_helper(expected_second); + auto check_second_it = check_second_partition_helper.get_flagging_output_iterator(expected_second_it); + auto check_third_partition_helper = detail::large_problem_test_helper(expected_third); + auto check_third_it = check_third_partition_helper.get_flagging_output_iterator(expected_third_it); + + // Needs to be device accessible + c2h::device_vector num_selected_out{0, 0}; + offset_t* d_num_selected_out = thrust::raw_pointer_cast(num_selected_out.data()); + + // Run test + partition( + in, check_first_it, check_second_it, check_third_it, d_num_selected_out, num_items, first_selector, second_selector); + + // Ensure that we created the correct output + REQUIRE(num_selected_out[0] == expected_first); + REQUIRE(num_selected_out[1] == expected_second); + 
check_first_partition_helper.check_all_results_correct(); + check_second_partition_helper.check_all_results_correct(); + check_third_partition_helper.check_all_results_correct(); +} +catch (std::bad_alloc&) +{ + // Exceeding memory is not a failure. +} diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 06f2b7c31a7..95c4794b8cf 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -166,8 +166,8 @@ struct alignas(Alignment) overaligned_addable_t return a.value == b.value; } - _CCCL_HOST_DEVICE friend auto - operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) -> overaligned_addable_t + _CCCL_HOST_DEVICE friend auto operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) + -> overaligned_addable_t { check(a); check(b); diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu index 01f39027ce0..71187ecc83a 100644 --- a/cub/test/catch2_test_nvrtc.cu +++ b/cub/test/catch2_test_nvrtc.cu @@ -31,6 +31,7 @@ #include #include +#include TEST_CASE("Test nvrtc", "[test][nvrtc]") { diff --git a/cub/test/cmake/check_source_files.cmake b/cub/test/cmake/check_source_files.cmake index 1554a2256e1..1fba8476f67 100644 --- a/cub/test/cmake/check_source_files.cmake +++ b/cub/test/cmake/check_source_files.cmake @@ -83,24 +83,6 @@ if (NOT valid_count EQUAL 5) "Matched ${valid_count} times, expected 5.") endif() -################################################################################ -# Legacy macro checks. -# Check all files in CUB to make sure that they aren't using the legacy -# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros. -# -# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET. -# They are provided for legacy purposes and should be replaced with -# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code. -# -# -set(legacy_macro_header_exclusions - # This header defines a legacy CUDART macro: - cub/detail/detect_cuda_runtime.cuh -) - -set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED") -set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__") - ################################################################################ # Read source files: foreach(src ${cub_srcs}) @@ -156,21 +138,6 @@ foreach(src ${cub_srcs}) set(found_errors 1) endif() endif() - - if (NOT ${src} IN_LIST legacy_macro_header_exclusions) - count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count) - count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count) - - if (NOT thrust_count EQUAL 0) - message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.") - set(found_errors 1) - endif() - - if (NOT cub_count EQUAL 0) - message("'${src}' uses CUB_RUNTIME_ENABLED. 
Replace with CUB_RDC_ENABLED and NV_IF_TARGET.") - set(found_errors 1) - endif() - endif() endforeach() if (NOT found_errors EQUAL 0) diff --git a/cub/test/cmake/nvrtc_args.h.in b/cub/test/cmake/nvrtc_args.h.in new file mode 100644 index 00000000000..215804ad0f0 --- /dev/null +++ b/cub/test/cmake/nvrtc_args.h.in @@ -0,0 +1,6 @@ +#pragma once + +const char* NVRTC_CUB_PATH = "-I@CMAKE_SOURCE_DIR@/cub"; +const char* NVRTC_THRUST_PATH = "-I@CMAKE_SOURCE_DIR@/thrust"; +const char* NVRTC_LIBCUDACXX_PATH = "-I@CMAKE_SOURCE_DIR@/libcudacxx/include"; +const char* NVRTC_CTK_PATH = "-I@CUDAToolkit_INCLUDE_DIRS@"; diff --git a/cub/test/test_block_radix_rank.cu b/cub/test/test_block_radix_rank.cu index 8c1df1a80c7..c53c6b179e3 100644 --- a/cub/test/test_block_radix_rank.cu +++ b/cub/test/test_block_radix_rank.cu @@ -310,7 +310,7 @@ void Test() Test(); Test(); - Test(cub::Int2Type<(BlockThreads % 32) == 0>{}); + Test(cub::Int2Type < (BlockThreads % 32) == 0 > {}); } int main(int argc, char** argv) diff --git a/cub/test/test_warning_suppression.cuh b/cub/test/test_warning_suppression.cuh index 46c6080fed7..448230343f3 100644 --- a/cub/test/test_warning_suppression.cuh +++ b/cub/test/test_warning_suppression.cuh @@ -27,7 +27,6 @@ #pragma once -#include #include // C4127: conditional expression is constant diff --git a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh index 459beddee22..ae8ad239d46 100644 --- a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh +++ b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh @@ -60,8 +60,8 @@ struct receiver_defaults } template - _CUDAX_TRIVIAL_API static auto - set_stopped(__ignore, _Rcvr& __rcvr) noexcept -> __async::completion_signatures<__async::set_stopped_t()> + _CUDAX_TRIVIAL_API static auto set_stopped(__ignore, _Rcvr& __rcvr) noexcept + -> __async::completion_signatures<__async::set_stopped_t()> { __async::set_stopped(static_cast<_Rcvr&&>(__rcvr)); return {}; @@ -198,15 +198,15 @@ _CUDAX_TRIVIAL_API auto __make_opstate(_Sndr __sndr, _Rcvr __rcvr) } template -_CUDAX_TRIVIAL_API auto -__get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept -> decltype(__data.get_attrs(__sndrs...)) +_CUDAX_TRIVIAL_API auto __get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept + -> decltype(__data.get_attrs(__sndrs...)) { return __data.get_attrs(__sndrs...); } template -_CUDAX_TRIVIAL_API auto -__get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept -> decltype(__async::get_env(__sndrs...)) +_CUDAX_TRIVIAL_API auto __get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept + -> decltype(__async::get_env(__sndrs...)) { return __async::get_env(__sndrs...); } diff --git a/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh b/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh index 25d5ef04d76..868c911b1da 100644 --- a/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh +++ b/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh @@ -76,48 +76,36 @@ template class _Vy, template class _ using __transform_sig_t = decltype(__transform_sig<_Sig, _Vy, _Ey, _Sy>()); template - class _Vy, - template - class _Ey, + template class _Vy, + template class _Ey, class _Sy, - template - class _Variant, + template class _Variant, class... 
diff --git a/cub/test/test_block_radix_rank.cu b/cub/test/test_block_radix_rank.cu
index 8c1df1a80c7..c53c6b179e3 100644
--- a/cub/test/test_block_radix_rank.cu
+++ b/cub/test/test_block_radix_rank.cu
@@ -310,7 +310,7 @@ void Test()
 Test();
 Test();
- Test(cub::Int2Type<(BlockThreads % 32) == 0>{});
+ Test(cub::Int2Type < (BlockThreads % 32) == 0 > {});
 }
 int main(int argc, char** argv)
diff --git a/cub/test/test_warning_suppression.cuh b/cub/test/test_warning_suppression.cuh
index 46c6080fed7..448230343f3 100644
--- a/cub/test/test_warning_suppression.cuh
+++ b/cub/test/test_warning_suppression.cuh
@@ -27,7 +27,6 @@
 #pragma once
-#include
 #include
 // C4127: conditional expression is constant
diff --git a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh
index 459beddee22..ae8ad239d46 100644
--- a/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh
+++ b/cudax/include/cuda/experimental/__async/sender/basic_sender.cuh
@@ -60,8 +60,8 @@ struct receiver_defaults
 }
 template
- _CUDAX_TRIVIAL_API static auto
- set_stopped(__ignore, _Rcvr& __rcvr) noexcept -> __async::completion_signatures<__async::set_stopped_t()>
+ _CUDAX_TRIVIAL_API static auto set_stopped(__ignore, _Rcvr& __rcvr) noexcept
+ -> __async::completion_signatures<__async::set_stopped_t()>
 {
 __async::set_stopped(static_cast<_Rcvr&&>(__rcvr));
 return {};
@@ -198,15 +198,15 @@ _CUDAX_TRIVIAL_API auto __make_opstate(_Sndr __sndr, _Rcvr __rcvr)
 }
 template
-_CUDAX_TRIVIAL_API auto
-__get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept -> decltype(__data.get_attrs(__sndrs...))
+_CUDAX_TRIVIAL_API auto __get_attrs(int, const _Data& __data, const _Sndrs&... __sndrs) noexcept
+ -> decltype(__data.get_attrs(__sndrs...))
 {
 return __data.get_attrs(__sndrs...);
 }
 template
-_CUDAX_TRIVIAL_API auto
-__get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept -> decltype(__async::get_env(__sndrs...))
+_CUDAX_TRIVIAL_API auto __get_attrs(long, const _Data&, const _Sndrs&... __sndrs) noexcept
+ -> decltype(__async::get_env(__sndrs...))
 {
 return __async::get_env(__sndrs...);
 }
diff --git a/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh b/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh
index 25d5ef04d76..868c911b1da 100644
--- a/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh
+++ b/cudax/include/cuda/experimental/__async/sender/completion_signatures.cuh
@@ -76,48 +76,36 @@ template class _Vy, template class _
 using __transform_sig_t = decltype(__transform_sig<_Sig, _Vy, _Ey, _Sy>());
 template
- class _Vy,
- template
- class _Ey,
+ template class _Vy,
+ template class _Ey,
 class _Sy,
- template
- class _Variant,
+ template class _Variant,
 class... _More>
 extern _DIAGNOSTIC<_Sigs> __transform_completion_signatures_v;
 template
- class _Vy,
- template
- class _Ey,
+ template class _Vy,
+ template class _Ey,
 class _Sy,
- template
- class _Variant,
+ template class _Variant,
 class... _More>
 extern __fn_t<_ERROR<_What...>>* __transform_completion_signatures_v<_ERROR<_What...>, _Vy, _Ey, _Sy, _Variant, _More...>;
 template
- class _Vy,
- template
- class _Ey,
+ template class _Vy,
+ template class _Ey,
 class _Sy,
- template
- class _Variant,
+ template class _Variant,
 class... _More>
 extern __fn_t<_Variant<__transform_sig_t<_Sigs, _Vy, _Ey, _Sy>..., _More...>>* __transform_completion_signatures_v, _Vy, _Ey, _Sy, _Variant, _More...>;
 template
- class _Vy,
- template
- class _Ey,
+ template class _Vy,
+ template class _Ey,
 class _Sy,
- template
- class _Variant,
+ template class _Variant,
 class... _More>
 using __transform_completion_signatures = decltype(__transform_completion_signatures_v<_Sigs, _Vy, _Ey, _Sy, _Variant, _More...>());
@@ -129,12 +117,9 @@ template <>
 struct __gather_sigs_fn
 {
 template
- class _Then,
- template
- class _Else,
- template
- class _Variant,
+ template class _Then,
+ template class _Else,
+ template class _Variant,
 class... _More>
 using __call = __transform_completion_signatures<
 _Sigs,
@@ -149,12 +134,9 @@ template <>
 struct __gather_sigs_fn
 {
 template
- class _Then,
- template
- class _Else,
- template
- class _Variant,
+ template class _Then,
+ template class _Else,
+ template class _Variant,
 class... _More>
 using __call = __transform_completion_signatures<
 _Sigs,
@@ -169,12 +151,9 @@ template <>
 struct __gather_sigs_fn
 {
 template
- class _Then,
- template
- class _Else,
- template
- class _Variant,
+ template class _Then,
+ template class _Else,
+ template class _Variant,
 class... _More>
 using __call = __transform_completion_signatures<
 _Sigs,
@@ -187,12 +166,9 @@ struct __gather_sigs_fn
 template
- class _Then,
- template
- class _Else,
- template
- class _Variant,
+ template class _Then,
+ template class _Else,
+ template class _Variant,
 class... _More>
 using __gather_completion_signatures = typename __gather_sigs_fn<_WantedTag>::template __call<_Sigs, _Then, _Else, _Variant, _More...>;
@@ -404,13 +380,12 @@ template
 auto completion(_Tag, _Args&&...) -> __csig::__sigs<_Tag(_Args...)>&;
 template
-auto completions_of(_Sndr&&,
- _Rcvr = {}) -> decltype(__csig::__to_sigs(__declval&>()));
+auto completions_of(_Sndr&&, _Rcvr = {})
+ -> decltype(__csig::__to_sigs(__declval&>()));
 template
-auto eptr_completion_if()
- -> _CUDA_VSTD::
- conditional_t<_PotentiallyThrowing, __csig::__sigs, __csig::__sigs<>>&;
+auto eptr_completion_if() -> _CUDA_VSTD::
+ conditional_t<_PotentiallyThrowing, __csig::__sigs, __csig::__sigs<>>&;
 } // namespace meta
 } // namespace cuda::experimental::__async
diff --git a/cudax/include/cuda/experimental/__async/sender/continue_on.cuh b/cudax/include/cuda/experimental/__async/sender/continue_on.cuh
index 9a0c142e21c..8da87a443a3 100644
--- a/cudax/include/cuda/experimental/__async/sender/continue_on.cuh
+++ b/cudax/include/cuda/experimental/__async/sender/continue_on.cuh
@@ -267,8 +267,8 @@ struct continue_on_t::__sndr_t
 };
 template
-_CUDAX_API auto
-continue_on_t::operator()(_Sndr __sndr, _Sch __sch) const noexcept -> continue_on_t::__sndr_t<_Sndr, _Sch>
+_CUDAX_API auto continue_on_t::operator()(_Sndr __sndr, _Sch __sch) const noexcept
+ -> continue_on_t::__sndr_t<_Sndr, _Sch>
 {
 return __sndr_t<_Sndr, _Sch>{{}, __sch, static_cast<_Sndr&&>(__sndr)};
 }
diff --git a/cudax/include/cuda/experimental/__async/sender/cpos.cuh b/cudax/include/cuda/experimental/__async/sender/cpos.cuh
index 7f1fb383a71..dab62e7ac10 100644
--- a/cudax/include/cuda/experimental/__async/sender/cpos.cuh
+++ b/cudax/include/cuda/experimental/__async/sender/cpos.cuh
@@ -110,8 +110,8 @@ _CCCL_GLOBAL_CONSTANT struct set_error_t
 _CCCL_GLOBAL_CONSTANT struct set_stopped_t
 {
 template
- _CUDAX_TRIVIAL_API auto
- operator()(_Rcvr&& __rcvr) const noexcept -> decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped())
+ _CUDAX_TRIVIAL_API auto operator()(_Rcvr&& __rcvr) const noexcept
+ -> decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped())
 {
 static_assert(_CUDA_VSTD::is_same_v(__rcvr).set_stopped()), void>);
 static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_stopped()));
@@ -119,8 +119,8 @@ _CCCL_GLOBAL_CONSTANT struct set_stopped_t
 }
 template
- _CUDAX_TRIVIAL_API auto
- operator()(_Rcvr* __rcvr) const noexcept -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_stopped())
+ _CUDAX_TRIVIAL_API auto operator()(_Rcvr* __rcvr) const noexcept
+ -> decltype(static_cast<_Rcvr&&>(*__rcvr).set_stopped())
 {
 static_assert(_CUDA_VSTD::is_same_v(*__rcvr).set_stopped()), void>);
 static_assert(noexcept(static_cast<_Rcvr&&>(*__rcvr).set_stopped()));
diff --git a/cudax/include/cuda/experimental/__async/sender/let_value.cuh b/cudax/include/cuda/experimental/__async/sender/let_value.cuh
index 7d06e071fe0..6742a1c1d6c 100644
--- a/cudax/include/cuda/experimental/__async/sender/let_value.cuh
+++ b/cudax/include/cuda/experimental/__async/sender/let_value.cuh
@@ -243,8 +243,9 @@ private:
 _Sndr __sndr_;
 template
- _CUDAX_API auto connect(_Rcvr __rcvr) && noexcept(
- __nothrow_constructible<__opstate_t<_Rcvr, _Sndr, _Fn>, _Sndr, _Fn, _Rcvr>) -> __opstate_t<_Rcvr, _Sndr, _Fn>
+ _CUDAX_API auto
+ connect(_Rcvr __rcvr) && noexcept(__nothrow_constructible<__opstate_t<_Rcvr, _Sndr, _Fn>, _Sndr, _Fn, _Rcvr>)
+ -> __opstate_t<_Rcvr, _Sndr, _Fn>
 {
 return __opstate_t<_Rcvr, _Sndr, _Fn>(
 static_cast<_Sndr&&>(__sndr_), static_cast<_Fn&&>(__fn_), static_cast<_Rcvr&&>(__rcvr));
diff --git a/cudax/include/cuda/experimental/__async/sender/stop_token.cuh b/cudax/include/cuda/experimental/__async/sender/stop_token.cuh
index 35e6d4d164a..693816dbb45 100644
--- a/cudax/include/cuda/experimental/__async/sender/stop_token.cuh
+++ b/cudax/include/cuda/experimental/__async/sender/stop_token.cuh
@@ -369,8 +369,8 @@ _CUDAX_API inline void inplace_stop_source::__unlock(uint8_t __old_state) const
 (void) __state_.store(__old_state, _CUDA_VSTD::memory_order_release);
 }
-_CUDAX_API inline auto
-inplace_stop_source::__try_lock_unless_stop_requested(bool __set_stop_requested) const noexcept -> bool
+_CUDAX_API inline auto inplace_stop_source::__try_lock_unless_stop_requested(bool __set_stop_requested) const noexcept
+ -> bool
 {
 __stok::__spin_wait __spin;
 auto __old_state = __state_.load(_CUDA_VSTD::memory_order_relaxed);
diff --git a/cudax/include/cuda/experimental/__async/sender/tuple.cuh b/cudax/include/cuda/experimental/__async/sender/tuple.cuh
index 98a1d0997f1..0229ed8b9c7 100644
--- a/cudax/include/cuda/experimental/__async/sender/tuple.cuh
+++ b/cudax/include/cuda/experimental/__async/sender/tuple.cuh
@@ -65,8 +65,8 @@ struct __tupl<_CUDA_VSTD::index_sequence<_Idx...>, _Ts...> : __box<_Idx, _Ts>...
 template
 _CUDAX_TRIVIAL_API static auto __for_each(_Fn&& __fn, _Self&& __self, _Us&&... __us) //
- noexcept((__nothrow_callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>>
- && ...)) -> _CUDA_VSTD::enable_if_t<(__callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)>
+ noexcept((__nothrow_callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...))
+ -> _CUDA_VSTD::enable_if_t<(__callable<_Fn, _Us..., __copy_cvref_t<_Self, _Ts>> && ...)>
 {
 return (
 static_cast<_Fn&&>(__fn)(static_cast<_Us&&>(__us)..., static_cast<_Self&&>(__self).__box<_Idx, _Ts>::__value_),
diff --git a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh
index 8a42bab40ca..0e1dceff19b 100644
--- a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh
+++ b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh
@@ -80,8 +80,8 @@ struct __with_property
 template
 struct __iproperty : interface<__iproperty>
 {
- _CUDAX_HOST_API friend auto
- get_property([[maybe_unused]] const __iproperty& __obj, _Property) -> __property_result_t<_Property>
+ _CUDAX_HOST_API friend auto get_property([[maybe_unused]] const __iproperty& __obj, _Property)
+ -> __property_result_t<_Property>
 {
 if constexpr (!_CUDA_VSTD::is_same_v<__property_result_t<_Property>, void>)
 {
@@ -268,8 +268,8 @@ template
 struct __with_try_get_property
 {
 template
- _CUDAX_HOST_API _CCCL_NODISCARD_FRIEND auto
- try_get_property(const _Derived& __self, _Property) noexcept -> __try_property_result_t<_Property>
+ _CUDAX_HOST_API _CCCL_NODISCARD_FRIEND auto try_get_property(const _Derived& __self, _Property) noexcept
+ -> __try_property_result_t<_Property>
 {
 auto __prop = __cudax::dynamic_any_cast*>(&__self);
 if constexpr (_CUDA_VSTD::is_same_v<__property_result_t<_Property>, void>)
diff --git a/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh b/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh
index ee01f53a90b..73491c72e5a 100644
--- a/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh
@@ -2294,13 +2294,7 @@ public:
 auto read(Pack&&... pack) const
 {
 using U = readonly_type_of;
- // The constness of *this implies that access mode is read
- // Note that we do not provide an access mode, because this is how we
- // dispatch statically between read-only and non read-only access modes in
- // task_dep_untyped.
- // TODO : we could make this cleaner if we had a tag type in addition to
- // the access_mode enum class.
- return task_dep(*this, /* access_mode::read, */ ::std::forward(pack)...);
+ return task_dep(*this, access_mode::read, ::std::forward(pack)...);
 }
 template
diff --git a/cudax/include/cuda/experimental/__stf/internal/task_dep.cuh b/cudax/include/cuda/experimental/__stf/internal/task_dep.cuh
index f25c176a603..60764181e4e 100644
--- a/cudax/include/cuda/experimental/__stf/internal/task_dep.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/task_dep.cuh
@@ -70,20 +70,6 @@ public:
 : task_dep_untyped(d, m, data_place::affine, mv(redux_op))
 {}
- // These constructors take no access_mode, which is a way to identify that
- // they are using read-only access mode.
- // TODO : That was the only way to dispatch at compile time which constructor
- // is used. We might use a tag_type or a std::true_type to do this static
- // dispatch in a better way.
- task_dep_untyped(
- const logical_data_untyped& d, data_place dplace, ::std::shared_ptr redux_op = nullptr)
- : task_dep_untyped(const_cast(d), access_mode::read, mv(dplace), mv(redux_op))
- {}
-
- task_dep_untyped(const logical_data_untyped& d, ::std::shared_ptr redux_op = nullptr)
- : task_dep_untyped(const_cast(d), access_mode::read, mv(redux_op))
- {}
-
 logical_data_untyped get_data() const;
 instance_id_t get_instance_id() const
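Editor's note: the constructors deleted above encoded "read-only" as "no access_mode argument", and the removed TODO comments suggest a tag type would express that dispatch more cleanly; the PR instead passes access_mode::read explicitly from the const read() overload. The sketch below only illustrates the tag-dispatch idea the TODOs allude to; none of these names exist in CUDASTF, and the data members are placeholders.

enum class access_mode
{
  read,
  rw,
  write
};

// Tag type that makes the read-only constructor a distinct overload.
struct read_only_tag_t
{};
inline constexpr read_only_tag_t read_only_tag{};

class task_dep_untyped_sketch
{
public:
  // Explicit mode: the caller states the access mode.
  task_dep_untyped_sketch(int data, access_mode m)
      : data_(data)
      , mode_(m)
  {}

  // Tag-dispatched read-only overload: selected at compile time without having
  // to infer intent from a missing argument or from the constness of the data.
  task_dep_untyped_sketch(int data, read_only_tag_t)
      : data_(data)
      , mode_(access_mode::read)
  {}

private:
  int data_;
  access_mode mode_;
};

int main()
{
  task_dep_untyped_sketch a{42, access_mode::rw}; // explicit mode
  task_dep_untyped_sketch b{42, read_only_tag};   // tag selects the read-only constructor
  (void) a;
  (void) b;
  return 0;
}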
diff --git a/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh b/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh
index 5b64dbc531d..bd481b3dea2 100644
--- a/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh
+++ b/cudax/include/cuda/experimental/__utility/basic_any/basic_any_from.cuh
@@ -50,8 +50,8 @@ _CCCL_NODISCARD _CUDAX_TRIVIAL_HOST_API auto basic_any_from(_Interface<_Super>&
 }
 template