From 680d8d0a8a61851b83fbf497c1ae6d7576db1ab6 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Mon, 20 May 2024 17:40:44 -0400 Subject: [PATCH 01/15] DOC v24.08 Updates [skip ci] --- .github/workflows/build.yaml | 12 ++++++------ .github/workflows/pr.yaml | 18 +++++++++--------- .github/workflows/test.yaml | 6 +++--- VERSION | 2 +- ci/build_docs.sh | 2 +- .../environments/all_cuda-118_arch-x86_64.yaml | 4 ++-- .../environments/all_cuda-122_arch-x86_64.yaml | 4 ++-- cpp/CMakeLists.txt | 2 +- cpp/Doxyfile | 2 +- dependencies.yaml | 4 ++-- fetch_rapids.cmake | 2 +- python/pylibwholegraph/CMakeLists.txt | 2 +- 12 files changed, 30 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2d039122b..d09ba5a4d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -49,7 +49,7 @@ jobs: if: github.ref_type == 'branch' needs: [python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -62,7 +62,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -70,7 +70,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibwholegraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -80,7 +80,7 @@ jobs: wheel-publish-pylibwholegraph: needs: wheel-build-pylibwholegraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 339646eca..b48246626 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -21,41 +21,41 @@ jobs: - wheel-build-pylibwholegraph - wheel-test-pylibwholegraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 with: enable_check_generated_files: false conda-cpp-build: needs: checks 
secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request arch: "amd64" @@ -64,14 +64,14 @@ jobs: wheel-build-pylibwholegraph: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request script: ci/build_wheel.sh wheel-test-pylibwholegraph: needs: wheel-build-pylibwholegraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: pull-request script: ci/test_wheel.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 348476641..eb258f5ae 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-pytorch-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibwholegraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/VERSION b/VERSION index 0bff6981a..ec8489fda 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.06.00 +24.08.00 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 61fa8ec22..e37759885 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -22,7 +22,7 @@ rapids-print-env rapids-logger "Downloading artifacts from previous jobs" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) -export RAPIDS_VERSION_NUMBER="24.06" +export RAPIDS_VERSION_NUMBER="24.08" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-mamba-retry install \ diff --git 
a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 45fc02021..cd74ceafd 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,8 +24,8 @@ dependencies: - graphviz - ipykernel - ipython -- libraft-headers==24.6.* -- librmm==24.6.* +- libraft-headers==24.8.* +- librmm==24.8.* - nanobind>=0.2.0 - nbsphinx - nccl diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index dd33e60c1..ece6b9df3 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -25,8 +25,8 @@ dependencies: - graphviz - ipykernel - ipython -- libraft-headers==24.6.* -- librmm==24.6.* +- libraft-headers==24.8.* +- librmm==24.8.* - nanobind>=0.2.0 - nbsphinx - nccl diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b3fdc6d74..cea2c0459 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -14,7 +14,7 @@ # limitations under the License. #============================================================================= -set(RAPIDS_VERSION "24.06") +set(RAPIDS_VERSION "24.08") set(WHOLEGRAPH_VERSION "${RAPIDS_VERSION}.00") cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) diff --git a/cpp/Doxyfile b/cpp/Doxyfile index 3e4e9e53f..58cef9e82 100644 --- a/cpp/Doxyfile +++ b/cpp/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "WholeGraph C API" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 24.06 +PROJECT_NUMBER = 24.08 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/dependencies.yaml b/dependencies.yaml index d20ccf9bc..7a049baef 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -74,8 +74,8 @@ dependencies: - cxx-compiler - cython>=3.0.0 - &doxygen doxygen==1.9.1 - - libraft-headers==24.6.* - - librmm==24.6.* + - libraft-headers==24.8.* + - librmm==24.8.* - nanobind>=0.2.0 - nccl - scikit-build-core>=0.7.0 diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 4226d5b23..bfaae09f8 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.06/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.08/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake ) endif() diff --git a/python/pylibwholegraph/CMakeLists.txt b/python/pylibwholegraph/CMakeLists.txt index d22e3d51c..d6e6df9da 100644 --- a/python/pylibwholegraph/CMakeLists.txt +++ b/python/pylibwholegraph/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(RAPIDS_VERSION "24.06") +set(RAPIDS_VERSION "24.08") set(WHOLEGRAPH_VERSION "${RAPIDS_VERSION}.00") include(FetchContent) From 03c802a9c48471ee874339cc46ebd765250f7323 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 7 Jun 2024 15:01:13 -0500 Subject: [PATCH 02/15] Adopt CI/packaging codeowners (#183) --- .github/CODEOWNERS | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 4eae70144..c336af191 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -22,9 +22,14 @@ python/ @rapidsai/wholegraph-python-codeowners **/CMakeLists.txt @rapidsai/wholegraph-cmake-codeowners **/cmake/ @rapidsai/wholegraph-cmake-codeowners -#build/ops code owners -.github/ @rapidsai/ops-codeowners -ci/ @rapidsai/ops-codeowners -conda/ @rapidsai/ops-codeowners -**/Dockerfile @rapidsai/ops-codeowners -**/.dockerignore @rapidsai/ops-codeowners +#CI code owners +/.github/ @rapidsai/ci-codeowners +/ci/ @rapidsai/ci-codeowners +/.pre-commit-config.yaml @rapidsai/ci-codeowners + +#packaging code owners +/.devcontainer/ @rapidsai/packaging-codeowners +/conda/ @rapidsai/packaging-codeowners +/dependencies.yaml @rapidsai/packaging-codeowners +/build.sh @rapidsai/packaging-codeowners +pyproject.toml @rapidsai/packaging-codeowners From c92bba3b8d46b543d3dc7c59a904788a60d19579 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 10 Jun 2024 11:21:40 -0500 Subject: [PATCH 03/15] use rapids-build-backend (#181) Contributes to https://github.com/rapidsai/build-planning/issues/31 Contributes to https://github.com/rapidsai/dependency-file-generator/issues/89 Proposes introducing `rapids-build-backend` as this project's build backend, to reduce the complexity of various CI/build scripts. 
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/wholegraph/pull/181 --- .pre-commit-config.yaml | 2 +- build.sh | 6 ++++- ci/build_docs.sh | 2 +- ci/build_python.sh | 11 +++----- ci/build_wheel.sh | 24 +---------------- ci/check_style.sh | 2 +- ci/release/update-version.sh | 6 ++--- ci/test_clang_tidy.sh | 2 +- ci/test_cpp.sh | 2 +- ci/test_python.sh | 2 +- .../all_cuda-118_arch-x86_64.yaml | 5 ++-- .../all_cuda-122_arch-x86_64.yaml | 5 ++-- conda/recipes/pylibwholegraph/meta.yaml | 1 + dependencies.yaml | 27 ++++++++++++++++--- .../pylibwholegraph/_version.py | 16 ++++++++--- .../pylibwholegraph/tests/test_version.py | 12 +++++++++ python/pylibwholegraph/pyproject.toml | 15 ++++++++--- 17 files changed, 84 insertions(+), 56 deletions(-) create mode 100644 python/pylibwholegraph/pylibwholegraph/tests/test_version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2cb5d320c..ebd3249c0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,7 +32,7 @@ repos: types_or: [c, c++, cuda] args: ["-fallback-style=none", "-style=file", "-i"] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.11.0 + rev: v1.13.11 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/build.sh b/build.sh index 9d6bfa7fd..1ab1f70ca 100755 --- a/build.sh +++ b/build.sh @@ -271,7 +271,11 @@ if buildAll || hasArg pylibwholegraph; then if ! hasArg --compile-cmd; then cd ${REPODIR}/python/pylibwholegraph env LIBWHOLEGRAPH_DIR=${LIBWHOLEGRAPH_DIR} \ - SKBUILD_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=${BUILD_TYPE};${EXTRA_CMAKE_ARGS/ /;}" ${PYTHON} -m pip install --no-build-isolation --no-deps . + SKBUILD_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=${BUILD_TYPE};${EXTRA_CMAKE_ARGS/ /;}" ${PYTHON} -m pip install \ + --no-build-isolation \ + --no-deps \ + --config-settings rapidsai.disable-cuda=true \ + . else # just invoke cmake without going through scikit-build-core diff --git a/ci/build_docs.sh b/ci/build_docs.sh index e37759885..f4ad95f30 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -8,7 +8,7 @@ rapids-logger "Create test conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key docs \ + --file-key docs \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n docs diff --git a/ci/build_python.sh b/ci/build_python.sh index efb7bfe4a..1a3812b36 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
set -euo pipefail @@ -17,19 +17,14 @@ PACKAGES="libwholegraph" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) -version=$(rapids-generate-version) -git_commit=$(git rev-parse HEAD) -export RAPIDS_PACKAGE_VERSION=${version} -echo "${version}" > VERSION +rapids-generate-version > ./VERSION rapids-logger "Begin py build" # TODO: Remove `--no-test` flags once importing on a CPU # node works correctly rapids-logger "Begin pylibwholegraph build" -version_file_pylibwholegraph="python/pylibwholegraph/pylibwholegraph/_version.py" -sed -i "/^__git_commit__/ s/= .*/= \"${git_commit}\"/g" ${version_file_pylibwholegraph} -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibwholegraph diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 1f7e5580a..3a59ab481 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -9,32 +9,10 @@ package_dir="python/pylibwholegraph" source rapids-configure-sccache source rapids-date-string -version=$(rapids-generate-version) -git_commit=$(git rev-parse HEAD) +rapids-generate-version > ./VERSION RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -# This is the version of the suffix with a preceding hyphen. It's used -# everywhere except in the final wheel name. -PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" - -# Patch project metadata files to include the CUDA version suffix and version override. -pyproject_file="${package_dir}/pyproject.toml" -version_file="${package_dir}/${package_name}/_version.py" - -sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} -echo "${version}" > VERSION -sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" ${version_file} - -# For nightlies we want to ensure that we're pulling in alphas as well. The -# easiest way to do so is to augment the spec with a constraint containing a -# min alpha version that doesn't affect the version bounds but does allow usage -# of alpha versions for that dependency without --pre -alpha_spec='' -if ! rapids-is-release-build; then - alpha_spec=',>=0.0.0a0' -fi - cd "${package_dir}" # Hardcode the output dir diff --git a/ci/check_style.sh b/ci/check_style.sh index d7baa88e8..d7ba4cae2 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -8,7 +8,7 @@ rapids-logger "Create checks conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key checks \ + --file-key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n checks diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index da9a19e9f..1ba99e790 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -68,10 +68,10 @@ DEPENDENCIES=( ) for DEP in "${DEPENDENCIES[@]}"; do for FILE in dependencies.yaml conda/environments/*.yaml; do - sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" ${FILE} + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" ${FILE} done for FILE in python/**/pyproject.toml; do - sed_runner "/\"${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE} + sed_runner "/\"${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE} done done diff --git a/ci/test_clang_tidy.sh b/ci/test_clang_tidy.sh index 90157c59c..469d71f08 100755 --- a/ci/test_clang_tidy.sh +++ b/ci/test_clang_tidy.sh @@ -9,7 +9,7 @@ rapids-logger "Create clang-tidy conda environment" rapids-logger "Generate clang-tidy testing dependencies" rapids-dependency-file-generator \ --output conda \ - --file_key clang_tidy \ + --file-key clang_tidy \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n clang_tidy diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 5e150862d..49f76376a 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -11,7 +11,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ --output conda \ - --file_key test_cpp \ + --file-key test_cpp \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n test diff --git a/ci/test_python.sh b/ci/test_python.sh index dd56e7b92..80bc3513f 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -24,7 +24,7 @@ fi rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ --output conda \ - --file_key test_python \ + --file-key test_python \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=${ARCH};py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n test diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cd74ceafd..2f4f002af 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,8 +24,8 @@ dependencies: - graphviz - ipykernel - ipython -- libraft-headers==24.8.* -- librmm==24.8.* +- libraft-headers==24.8.*,>=0.0.0a0 +- librmm==24.8.*,>=0.0.0a0 - nanobind>=0.2.0 - nbsphinx - nccl @@ -41,6 +41,7 @@ dependencies: - python>=3.9,<3.12 - pytorch-cuda=11.8 - pytorch=2.0.0 +- rapids-build-backend>=0.3.0,<0.4.0.dev0 - recommonmark - scikit-build-core>=0.7.0 - sphinx-copybutton diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index ece6b9df3..e924b5825 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -25,8 +25,8 @@ dependencies: - graphviz - ipykernel - ipython -- libraft-headers==24.8.* -- librmm==24.8.* +- libraft-headers==24.8.*,>=0.0.0a0 +- librmm==24.8.*,>=0.0.0a0 - nanobind>=0.2.0 - nbsphinx - nccl @@ -39,6 +39,7 @@ dependencies: - pytest-forked - pytest-xdist - python>=3.9,<3.12 +- rapids-build-backend>=0.3.0,<0.4.0.dev0 - recommonmark - scikit-build-core>=0.7.0 - sphinx-copybutton diff --git a/conda/recipes/pylibwholegraph/meta.yaml b/conda/recipes/pylibwholegraph/meta.yaml index 
829350851..d3f9a49b7 100644 --- a/conda/recipes/pylibwholegraph/meta.yaml +++ b/conda/recipes/pylibwholegraph/meta.yaml @@ -63,6 +63,7 @@ requirements: - cython - libwholegraph ={{ version }} - python + - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core {{ scikit_build_core_version }} run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/dependencies.yaml b/dependencies.yaml index 7a049baef..1a7ee4e4c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -11,6 +11,7 @@ files: - cuda - cuda_version - py_version + - rapids_build_skbuild - run - test_python - docs @@ -44,6 +45,7 @@ files: - cuda - cuda_version - py_version + - rapids_build_skbuild - run - pytorch_cpu - clang_tools @@ -52,6 +54,14 @@ files: pyproject_dir: python/pylibwholegraph extras: table: build-system + includes: + - rapids_build_skbuild + py_rapids_build_pylibwholegraph: + output: pyproject + pyproject_dir: python/pylibwholegraph + extras: + table: tool.rapids-build-backend + key: requires includes: - python_build_wheel channels: @@ -74,11 +84,10 @@ dependencies: - cxx-compiler - cython>=3.0.0 - &doxygen doxygen==1.9.1 - - libraft-headers==24.8.* - - librmm==24.8.* + - libraft-headers==24.8.*,>=0.0.0a0 + - librmm==24.8.*,>=0.0.0a0 - nanobind>=0.2.0 - nccl - - scikit-build-core>=0.7.0 specific: - output_types: conda matrices: @@ -315,6 +324,17 @@ dependencies: packages: - clangxx==16.0.6 - clang-tools==16.0.6 + rapids_build_skbuild: + common: + - output_types: [conda, requirements, pyproject] + packages: + - rapids-build-backend>=0.3.0,<0.4.0.dev0 + - output_types: conda + packages: + - scikit-build-core>=0.7.0 + - output_types: [requirements, pyproject] + packages: + - scikit-build-core[pyproject]>=0.7.0 python_build_wheel: common: - output_types: [pyproject] @@ -322,4 +342,3 @@ dependencies: - cmake>=3.26.4 - cython>=3.0.0 - ninja - - scikit-build-core[pyproject]>=0.7.0 diff --git a/python/pylibwholegraph/pylibwholegraph/_version.py b/python/pylibwholegraph/pylibwholegraph/_version.py index 394252fd0..e8adcc314 100644 --- a/python/pylibwholegraph/pylibwholegraph/_version.py +++ b/python/pylibwholegraph/pylibwholegraph/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,16 @@ import importlib.resources __version__ = ( - importlib.resources.files("pylibwholegraph").joinpath("VERSION").read_text().strip() + importlib.resources.files(__package__).joinpath("VERSION").read_text().strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/python/pylibwholegraph/pylibwholegraph/tests/test_version.py b/python/pylibwholegraph/pylibwholegraph/tests/test_version.py new file mode 100644 index 000000000..e43e590d0 --- /dev/null +++ b/python/pylibwholegraph/pylibwholegraph/tests/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pylibwholegraph + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(pylibwholegraph.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(pylibwholegraph.__version__, str) + assert len(pylibwholegraph.__version__) > 0 diff --git a/python/pylibwholegraph/pyproject.toml b/python/pylibwholegraph/pyproject.toml index 3f061998d..341f8b252 100644 --- a/python/pylibwholegraph/pyproject.toml +++ b/python/pylibwholegraph/pyproject.toml @@ -13,11 +13,9 @@ # limitations under the License. [build-system] -build-backend = "scikit_build_core.build" +build-backend = "rapids_build_backend.build" requires = [ - "cmake>=3.26.4", - "cython>=3.0.0", - "ninja", + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -42,6 +40,15 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] +[tool.rapids-build-backend] +build-backend = "scikit_build_core.build" +dependencies-file = "../../dependencies.yaml" +requires = [ + "cmake>=3.26.4", + "cython>=3.0.0", + "ninja", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" From 996f8f71bd5a10a271f802e9e2d2faa6ce7140f1 Mon Sep 17 00:00:00 2001 From: zhuofan1123 Date: Fri, 14 Jun 2024 00:24:56 +0800 Subject: [PATCH 04/15] fixed bugs (#180) Authors: - https://github.com/zhuofan1123 Approvers: - https://github.com/linhu-nv - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/180 --- cpp/src/wholememory/embedding_optimizer.cpp | 4 ++-- .../pylibwholegraph/binding/wholememory_binding.pyx | 7 ++++--- python/pylibwholegraph/pylibwholegraph/torch/embedding.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/wholememory/embedding_optimizer.cpp b/cpp/src/wholememory/embedding_optimizer.cpp index e96585289..1fa761014 100644 --- a/cpp/src/wholememory/embedding_optimizer.cpp +++ b/cpp/src/wholememory/embedding_optimizer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -80,7 +80,6 @@ wholememory_tensor_t embedding_optimizer_impl_base::get_optimizer_state( WHOLEMEMORY_CHECK_NOTHROW(optimizer_state != nullptr); WHOLEMEMORY_CHECK_NOTHROW(state_names_.size() == optimizer_state->cachable_states.size() + optimizer_state->uncachable_states.size() + 1); - WHOLEMEMORY_FAIL_NOTHROW("optimizer state name %s not found for %s", state_name, name_); for (size_t i = 0; i < optimizer_state->cachable_states.size(); i++) { if (strcmp(state_name, optimizer_state->cachable_states[i].name.c_str()) == 0) { WHOLEMEMORY_CHECK_NOTHROW(strcmp(state_name, state_names_[i]) == 0); @@ -94,6 +93,7 @@ wholememory_tensor_t embedding_optimizer_impl_base::get_optimizer_state( return optimizer_state->uncachable_states[i].global_raw_sub_tensor; } } + WHOLEMEMORY_FAIL_NOTHROW("optimizer state name %s not found for %s", state_name, name_); return nullptr; } diff --git a/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx b/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx index feffa9162..1d49065e3 100644 --- a/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx +++ b/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx @@ -828,9 +828,10 @@ cdef class PyWholeMemoryEmbedding: result = [] cdef const char * const * state_names state_names = wholememory_embedding_get_optimizer_state_names(self.wm_embedding) - while state_names[i] != NULL: - result.append( PyUnicode_FromString(state_names[i])) - i += 1 + if state_names != NULL: + while state_names[i] != NULL: + result.append( PyUnicode_FromString(state_names[i])) + i += 1 return result def get_optimizer_state(self, diff --git a/python/pylibwholegraph/pylibwholegraph/torch/embedding.py b/python/pylibwholegraph/pylibwholegraph/torch/embedding.py index 8abc92be9..67f02df77 100644 --- a/python/pylibwholegraph/pylibwholegraph/torch/embedding.py +++ b/python/pylibwholegraph/pylibwholegraph/torch/embedding.py @@ -267,7 +267,7 @@ def __init__( super().__init__() self.wmb_embedding = wmb_embedding self.embedding_tensor = None - self.optimizer_states = None + self.optimizer_states = dict() self.wmb_optimizer = wmb_optimizer self.wmb_cache_policy = wmb_cache_policy From 8d4cd9b372d42b5d1c402a6caf088b9826576fcb Mon Sep 17 00:00:00 2001 From: zhuofan1123 Date: Fri, 14 Jun 2024 00:25:37 +0800 Subject: [PATCH 05/15] decouple embedding creation from optimizer (#186) This PR refactors the embedding creation interface, decoupling it from the optimizer dependency. Users now can designate the embeddings for optimization during optimizer initialization. cpp: ``` wholememory_create_embedding(&wm_embedding, ...); wholememory_create_embedding_optimizer(&optimizer, ...); wholememory_embedding_set_optimizer(wm_embedding, optimizer); ``` python: ``` wm_embedding = wgth.create_embedding(...) 
wm_optimizer = wgth.create_wholememory_optimizer(wm_embedding, "adam", {}) ``` Authors: - https://github.com/zhuofan1123 Approvers: - https://github.com/linhu-nv - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/186 --- cpp/include/wholememory/embedding.h | 11 ++- cpp/src/wholememory/embedding.cpp | 67 ++++++++++----- cpp/src/wholememory/embedding.hpp | 6 +- ...lememory_embedding_gradient_apply_tests.cu | 5 +- .../wholememory_embedding_tests.cu | 3 +- .../examples/node_classfication.py | 13 +-- .../binding/wholememory_binding.pyx | 13 +-- .../pylibwholegraph/torch/embedding.py | 85 +++++++++---------- 8 files changed, 114 insertions(+), 89 deletions(-) diff --git a/cpp/include/wholememory/embedding.h b/cpp/include/wholememory/embedding.h index 40c7fd778..08cd73e84 100644 --- a/cpp/include/wholememory/embedding.h +++ b/cpp/include/wholememory/embedding.h @@ -128,7 +128,6 @@ wholememory_error_code_t wholememory_destroy_embedding_cache_policy( * @param comm : WholeMemory Communicator * @param memory_type : Memory Type of the underlying WholeMemory * @param memory_location : Memory Location of the underlying WholeMemory - * @param optimizer : Optimizer to use for training, if don't train embedding, use nullptr * @param cache_policy : Cache policy for this embedding, if don't use cache, use nullptr * @param user_defined_sms : User-defined sms number for raw embedding gather/scatter * @param round_robin_size : continuous embedding size in each rank under round-robin shard mode @@ -140,7 +139,6 @@ wholememory_error_code_t wholememory_create_embedding( wholememory_comm_t comm, wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location, - wholememory_embedding_optimizer_t optimizer, wholememory_embedding_cache_policy_t cache_policy, int user_defined_sms = -1, int round_robin_size = 0); @@ -161,6 +159,15 @@ wholememory_error_code_t wholememory_destroy_embedding( wholememory_tensor_t wholememory_embedding_get_embedding_tensor( wholememory_embedding_t wholememory_embedding); +/** + * Set Optimizer for WholeMemory Embedding + * @param wholememory_embedding : WholeMemory Embedding + * @param optimizer : Optimizer to be set + * @return : wholememory_error_code_t + */ +wholememory_error_code_t wholememory_embedding_set_optimizer( + wholememory_embedding_t wholememory_embedding, wholememory_embedding_optimizer_t optimizer); + /** * Gather from WholeMemory Embedding * @param wholememory_embedding : WholeMemory Embedding diff --git a/cpp/src/wholememory/embedding.cpp b/cpp/src/wholememory/embedding.cpp index 32ea1d3db..f1a868a84 100644 --- a/cpp/src/wholememory/embedding.cpp +++ b/cpp/src/wholememory/embedding.cpp @@ -49,23 +49,53 @@ static int64_t align_embedding_dim(int64_t embedding_dim, size_t element_size) return embedding_stride; } +wholememory_error_code_t embedding_base::set_optimizer(wholememory_embedding_optimizer_t opt) +{ + try { + if (optimizer != nullptr) { + WHOLEMEMORY_ERROR("optimizer can only be set once."); + return WHOLEMEMORY_NOT_SUPPORTED; + } + optimizer = opt; + if (optimizer != nullptr) { + if (embedding_dtype_ != WHOLEMEMORY_DT_FLOAT) { + WHOLEMEMORY_ERROR("Only float embedding supports training."); + return WHOLEMEMORY_NOT_IMPLEMENTED; + } + if (cache_policy != nullptr) { + WHOLEMEMORY_CHECK_NOTHROW(cache_policy->access_type == WHOLEMEMORY_AT_READWRITE); + if (cache_policy->cache_comm != raw_embedding_comm_) { + WHOLEMEMORY_ERROR("optimizer not supported for local cached global readonly embedding."); + return 
WHOLEMEMORY_INVALID_INPUT; + } + } + optimizer_impl_base_ = static_cast(optimizer); + WHOLEMEMORY_RETURN_ON_FAIL(create_optimizer_states()); + WHOLEMEMORY_RETURN_ON_FAIL(init_optimizer_states()); + } + } catch (std::bad_alloc& sba) { + WHOLEMEMORY_ERROR("bad_alloc"); + return WHOLEMEMORY_OUT_OF_MEMORY; + } catch (...) { + WHOLEMEMORY_ERROR("Unknown error"); + return WHOLEMEMORY_UNKNOW_ERROR; + } + + return WHOLEMEMORY_SUCCESS; +} + wholememory_error_code_t embedding_base::allocate( wholememory_matrix_description_t* embedding_description, wholememory_comm_t comm, wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location, - wholememory_embedding_cache_policy_t policy, - wholememory_embedding_optimizer_t opt) noexcept + wholememory_embedding_cache_policy_t policy) noexcept { cache_policy = policy; - optimizer = opt; raw_embedding_comm_ = comm; + embedding_dtype_ = embedding_description->dtype; wholememory_tensor_description_t padded_embedding_tensor_description; try { - if (optimizer != nullptr && embedding_description->dtype != WHOLEMEMORY_DT_FLOAT) { - WHOLEMEMORY_ERROR("Only float embedding supports training."); - return WHOLEMEMORY_NOT_IMPLEMENTED; - } if (cache_policy != nullptr) { WHOLEMEMORY_CHECK_NOTHROW(cache_policy->cache_comm != nullptr); if (cache_policy->cache_comm != comm) { @@ -99,14 +129,6 @@ wholememory_error_code_t embedding_base::allocate( WHOLEMEMORY_RETURN_ON_FAIL( wholememory_tensor_get_subtensor(allocated_embedding, &starts[0], &ends[0], &user_embedding)); if (cache_ptr_ != nullptr) { WHOLEMEMORY_RETURN_ON_FAIL(cache_ptr_->allocate(user_embedding)); } - if (optimizer != nullptr) { - if (cache_policy != nullptr) { - WHOLEMEMORY_CHECK_NOTHROW(cache_policy->access_type == WHOLEMEMORY_AT_READWRITE); - } - optimizer_impl_base_ = static_cast(optimizer); - WHOLEMEMORY_RETURN_ON_FAIL(create_optimizer_states()); - WHOLEMEMORY_RETURN_ON_FAIL(init_optimizer_states()); - } } catch (std::bad_alloc& sba) { WHOLEMEMORY_ERROR("bad_alloc"); return WHOLEMEMORY_OUT_OF_MEMORY; @@ -341,7 +363,6 @@ wholememory_error_code_t embedding_base::create_optimizer_states() noexcept raw_embedding_comm_, memory_type, memory_location, - nullptr, cache_policy)); optimizer_state_->global_cachable_raw_user_tensor = @@ -881,7 +902,6 @@ wholememory_error_code_t wholememory_create_embedding( wholememory_comm_t comm, wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location, - wholememory_embedding_optimizer_t optimizer, wholememory_embedding_cache_policy_t cache_policy, int user_defined_sms, int round_robin_size) @@ -939,10 +959,6 @@ wholememory_error_code_t wholememory_create_embedding( "Only ReadOnly access type supported for local cached global readonly embedding."); return WHOLEMEMORY_INVALID_INPUT; } - if (optimizer != nullptr) { - WHOLEMEMORY_ERROR("optimizer not supported for local cached global readonly embedding."); - return WHOLEMEMORY_INVALID_INPUT; - } embedding_impl_ptr = new wholememory::local_cached_global_readonly_embedding(); } } else { @@ -953,11 +969,18 @@ wholememory_error_code_t wholememory_create_embedding( &embedding_matrix_description, embedding_world_size, round_robin_size); embedding_impl_ptr->set_gather_sms(user_defined_sms); WHOLEMEMORY_RETURN_ON_FAIL(embedding_impl_ptr->allocate( - &embedding_matrix_description, comm, memory_type, memory_location, cache_policy, optimizer)); + &embedding_matrix_description, comm, memory_type, memory_location, cache_policy)); *wholememory_embedding = static_cast(embedding_impl_ptr); return 
WHOLEMEMORY_SUCCESS; } +wholememory_error_code_t wholememory_embedding_set_optimizer( + wholememory_embedding_t wholememory_embedding, wholememory_embedding_optimizer_t optimizer) +{ + auto* embedding_impl_ptr = static_cast(wholememory_embedding); + WHOLEMEMORY_RETURN_ON_FAIL(embedding_impl_ptr->set_optimizer(optimizer)); + return WHOLEMEMORY_SUCCESS; +} wholememory_error_code_t wholememory_destroy_embedding( wholememory_embedding_t wholememory_embedding) { diff --git a/cpp/src/wholememory/embedding.hpp b/cpp/src/wholememory/embedding.hpp index e37c6c444..f593c36ab 100644 --- a/cpp/src/wholememory/embedding.hpp +++ b/cpp/src/wholememory/embedding.hpp @@ -45,8 +45,7 @@ class embedding_base : public wholememory_embedding_ { wholememory_comm_t comm, wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location, - wholememory_embedding_cache_policy_t policy, - wholememory_embedding_optimizer_t opt) noexcept; + wholememory_embedding_cache_policy_t policy) noexcept; void deallocate() noexcept; virtual wholememory_error_code_t gather(wholememory_tensor_t indices, wholememory_tensor_t output, @@ -61,6 +60,8 @@ class embedding_base : public wholememory_embedding_ { wholememory_env_func_t* p_env_fns, cudaStream_t stream); + wholememory_error_code_t set_optimizer(wholememory_embedding_optimizer_t opt); + [[nodiscard]] const char* const* get_optimizer_state_names() const noexcept { if (optimizer_impl_base_ != nullptr) { @@ -104,6 +105,7 @@ class embedding_base : public wholememory_embedding_ { int gather_sms_; int round_robin_size_; + wholememory_dtype_t embedding_dtype_ = WHOLEMEMORY_DT_UNKNOWN; wholememory_comm_t raw_embedding_comm_ = nullptr; wholememory::embedding_cache_base* cache_ptr_ = nullptr; wholememory::embedding_optimizer_impl_base* optimizer_impl_base_ = nullptr; diff --git a/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu b/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu index bb6360fc0..83e57e14b 100644 --- a/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu +++ b/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -592,10 +592,9 @@ TEST_P(WholeMemoryEmbeddingBackwardParameterTests, EmbeddingGatherGradientApplyT wm_comm, params.memory_type, params.memory_location, - optimizer, cache_policy), WHOLEMEMORY_SUCCESS); - + EXPECT_EQ(wholememory_embedding_set_optimizer(wm_embedding, optimizer), WHOLEMEMORY_SUCCESS); wholememory_tensor_t embedding_tensor = wholememory_embedding_get_embedding_tensor(wm_embedding); wholememory_tensor_t local_embed_tensor; diff --git a/cpp/tests/wholememory_ops/wholememory_embedding_tests.cu b/cpp/tests/wholememory_ops/wholememory_embedding_tests.cu index e8eb382a4..03f798775 100644 --- a/cpp/tests/wholememory_ops/wholememory_embedding_tests.cu +++ b/cpp/tests/wholememory_ops/wholememory_embedding_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -244,7 +244,6 @@ TEST_P(WholeMemoryEmbeddingParameterTests, EmbeddingGatherTest) wm_comm, params.memory_type, params.memory_location, - nullptr, cache_policy), WHOLEMEMORY_SUCCESS); diff --git a/python/pylibwholegraph/examples/node_classfication.py b/python/pylibwholegraph/examples/node_classfication.py index e8465fa67..475c3fcd5 100644 --- a/python/pylibwholegraph/examples/node_classfication.py +++ b/python/pylibwholegraph/examples/node_classfication.py @@ -201,15 +201,9 @@ def main_func(): args.cache_ratio, ) - wm_optimizer = ( - None - if args.train_embedding is False - else wgth.create_wholememory_optimizer("adam", {}) - ) - + wm_optimizer = None embedding_dtype = torch.float32 if not args.fp16_mbedding else torch.float16 - - if wm_optimizer is None: + if args.train_embedding is False: node_feat_wm_embedding = wgth.create_embedding_from_filelist( feature_comm, embedding_wholememory_type, @@ -217,7 +211,6 @@ def main_func(): os.path.join(args.root_dir, "node_feat.bin"), embedding_dtype, args.feat_dim, - optimizer=wm_optimizer, cache_policy=cache_policy, round_robin_size=args.round_robin_size, ) @@ -228,11 +221,11 @@ def main_func(): embedding_wholememory_location, embedding_dtype, [graph_structure.node_count, args.feat_dim], - optimizer=wm_optimizer, cache_policy=cache_policy, random_init=True, round_robin_size=args.round_robin_size, ) + wm_optimizer = wgth.create_wholememory_optimizer(node_feat_wm_embedding, "adam", {}) wgth.set_framework(args.framework) model = wgth.HomoGNNModel(graph_structure, node_feat_wm_embedding, args) model.cuda() diff --git a/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx b/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx index 1d49065e3..ddf5de544 100644 --- a/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx +++ b/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx @@ -624,7 +624,6 @@ cdef extern from "wholememory/embedding.h": wholememory_comm_t comm, wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location, - wholememory_embedding_optimizer_t optimizer, wholememory_embedding_cache_policy_t cache_policy, int user_defined_sms, int round_robin_size) @@ -632,6 +631,10 @@ cdef extern from "wholememory/embedding.h": cdef wholememory_error_code_t wholememory_destroy_embedding( wholememory_embedding_t wholememory_embedding) + cdef wholememory_error_code_t wholememory_embedding_set_optimizer( + wholememory_embedding_t wholememory_embedding, + wholememory_embedding_optimizer_t optimizer); + cdef wholememory_error_code_t wholememory_embedding_gather(wholememory_embedding_t wholememory_embedding, wholememory_tensor_t indices, wholememory_tensor_t output, @@ -701,6 +704,10 @@ cdef class WholeMemoryOptimizer: check_wholememory_error_code( wholememory_optimizer_set_parameter(self.wm_optimizer, key_bytes, ¶m_value)) + def add_embedding(self, + PyWholeMemoryEmbedding embedding): + wholememory_embedding_set_optimizer(embedding.wm_embedding, self.wm_optimizer) + def destroy_optimizer(self): if self.wm_optimizer == NULL: return @@ -789,7 +796,6 @@ cdef class PyWholeMemoryEmbedding: PyWholeMemoryComm comm, WholeMemoryMemoryType memory_type, WholeMemoryMemoryLocation memory_location, - WholeMemoryOptimizer optimizer, WholeMemoryCachePolicy cache_policy, int user_defined_sms, int round_robin_size): @@ -800,7 +806,6 @@ cdef class PyWholeMemoryEmbedding: comm.comm_id, self.memory_type, self.memory_location, - optimizer.wm_optimizer, cache_policy.cache_policy, 
user_defined_sms, round_robin_size)) @@ -848,7 +853,6 @@ def create_embedding(PyWholeMemoryTensorDescription tensor_desc, PyWholeMemoryComm comm, WholeMemoryMemoryType memory_type, WholeMemoryMemoryLocation memory_location, - WholeMemoryOptimizer optimizer, WholeMemoryCachePolicy cache_policy, int user_defined_sms, int round_robin_size): @@ -857,7 +861,6 @@ def create_embedding(PyWholeMemoryTensorDescription tensor_desc, comm, memory_type, memory_location, - optimizer, cache_policy, user_defined_sms, round_robin_size) diff --git a/python/pylibwholegraph/pylibwholegraph/torch/embedding.py b/python/pylibwholegraph/pylibwholegraph/torch/embedding.py index 67f02df77..634508408 100644 --- a/python/pylibwholegraph/pylibwholegraph/torch/embedding.py +++ b/python/pylibwholegraph/pylibwholegraph/torch/embedding.py @@ -45,10 +45,16 @@ def __init__(self, global_comm: WholeMemoryCommunicator): def add_embedding(self, wm_embedding): """Add WholeMemory Embedding to this optimizer - NOTE: you don't need to call this method, it is automatic called when WholeMemory Embedding is created. + NOTE: you don't need to call this method, it is automatic called when WholeMemory Optimizer is created. :param wm_embedding: WholeMemory Embedding that use this optimizer :return: None """ + assert isinstance(wm_embedding, WholeMemoryEmbedding) + if wm_embedding.wmb_optimizer is not None: + raise ValueError("optimizer can only be set once.") + wm_embedding.wmb_optimizer = self.wmb_opt + wm_embedding.dummy_input.requires_grad_(True) + self.wmb_opt.add_embedding(wm_embedding.wmb_embedding) self.embeddings.append(wm_embedding) def step(self, lr: float): @@ -61,30 +67,6 @@ def step(self, lr: float): self.global_comm.barrier() -def create_wholememory_optimizer(optimizer_type: str, param_dict: dict): - """ - Create WholeMemoryOptimizer. - :param optimizer_type: Type of the Optimizer - :param param_dict: parameters of the optimizer - :return: WholeMemoryOptimizer - """ - wm_optimizer = WholeMemoryOptimizer(get_global_communicator()) - wm_optimizer.wmb_opt.create_optimizer( - str_to_wmb_wholememory_optimizer_type(optimizer_type), param_dict - ) - return wm_optimizer - - -def destroy_wholememory_optimizer(optimizer: WholeMemoryOptimizer): - """ - Destroy WholeMemoryOptimizer - :param optimizer: WholeMemoryOptimizer to destroy - :return: None - """ - optimizer.wmb_opt.destroy_optimizer() - optimizer.wmb_opt = None - - class WholeMemoryCachePolicy(object): """ Cache policy to create WholeMemoryEmbedding. 
@@ -261,7 +243,6 @@ class WholeMemoryEmbedding(object): def __init__( self, wmb_embedding: wmb.PyWholeMemoryEmbedding, - wmb_optimizer: Union[WholeMemoryOptimizer, None], wmb_cache_policy: Union[WholeMemoryCachePolicy, None], ): super().__init__() @@ -269,16 +250,15 @@ def __init__( self.embedding_tensor = None self.optimizer_states = dict() - self.wmb_optimizer = wmb_optimizer self.wmb_cache_policy = wmb_cache_policy self.adjust_cache = True if self.wmb_cache_policy is not None else False - dummy_input_need_grad = True if self.wmb_optimizer is not None else False + self.wmb_optimizer = None + self.dummy_input = torch.nn.Parameter( - torch.zeros(1), requires_grad=dummy_input_need_grad + torch.zeros(1), requires_grad=False ) - self.need_apply = False self.sparse_indices = [] self.sparse_grads = [] @@ -403,7 +383,6 @@ def create_embedding( dtype: torch.dtype, sizes: List[int], *, - optimizer: Union[WholeMemoryOptimizer, None] = None, cache_policy: Union[WholeMemoryCachePolicy, None] = None, random_init: bool = False, gather_sms: int = -1, @@ -416,16 +395,11 @@ def create_embedding( :param memory_location: WholeMemory location, should be cpu or cuda :param dtype: data type :param sizes: size of the embedding, must be 2D - :param optimizer: optimizer :param cache_policy: cache policy :param gather_sms: the number of SMs used in gather process :param round_robin_size: continuous embedding size of a rank using round robin shard strategy :return: WholeMemoryEmbedding """ - if optimizer is None: - wmb_optimizer = wmb.create_non_optimizer() - else: - wmb_optimizer = optimizer.wmb_opt if cache_policy is None: wmb_cache_policy = wmb.create_non_cache_policy() else: @@ -448,16 +422,12 @@ def create_embedding( comm.wmb_comm, str_to_wmb_wholememory_memory_type(memory_type), str_to_wmb_wholememory_location(memory_location), - wmb_optimizer, wmb_cache_policy, user_defined_sms=gather_sms, round_robin_size=round_robin_size, ), - optimizer, cache_policy, ) - if optimizer is not None: - optimizer.add_embedding(wm_embedding) if random_init is True: ( local_tensor, @@ -476,7 +446,6 @@ def create_embedding_from_filelist( dtype: torch.dtype, last_dim_size: int, *, - optimizer: Union[WholeMemoryOptimizer, None] = None, cache_policy: Union[WholeMemoryCachePolicy, None] = None, gather_sms: int = -1, round_robin_size: int = 0, @@ -489,7 +458,6 @@ def create_embedding_from_filelist( :param filelist: list of files :param dtype: data type :param last_dim_size: size of last dim - :param optimizer: optimizer :param cache_policy: cache policy :param gather_sms: the number of SMs used in gather process :param round_robin_size: continuous embedding size of a rank using round robin shard strategy @@ -516,7 +484,6 @@ def create_embedding_from_filelist( memory_location, dtype, [total_entry_count, last_dim_size], - optimizer=optimizer, cache_policy=cache_policy, gather_sms=gather_sms, round_robin_size=round_robin_size, @@ -554,3 +521,35 @@ def forward( self.training, force_dtype, ) + + +def create_wholememory_optimizer(embeddings: Union[WholeMemoryEmbedding, List[WholeMemoryEmbedding]], + optimizer_type: str, + param_dict: dict): + """ + Create WholeMemoryOptimizer. 
+ :param embeddings: WholememoryEmbeddings to set the Optimizer + :param optimizer_type: Type of the Optimizer + :param param_dict: parameters of the optimizer + :return: WholeMemoryOptimizer + """ + wm_optimizer = WholeMemoryOptimizer(get_global_communicator()) + wm_optimizer.wmb_opt.create_optimizer( + str_to_wmb_wholememory_optimizer_type(optimizer_type), param_dict + ) + if isinstance(embeddings, WholeMemoryEmbedding): + wm_optimizer.add_embedding(embeddings) + else: + for em in embeddings: + wm_optimizer.add_embedding(em) + return wm_optimizer + + +def destroy_wholememory_optimizer(optimizer: WholeMemoryOptimizer): + """ + Destroy WholeMemoryOptimizer + :param optimizer: WholeMemoryOptimizer to destroy + :return: None + """ + optimizer.wmb_opt.destroy_optimizer() + optimizer.wmb_opt = None From ba505af5c4c8620632c302f8278df33414e76d8c Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Tue, 18 Jun 2024 20:55:01 +0800 Subject: [PATCH 06/15] Mnnvl with split comm (#185) support split comm and get_local_mnnvl_comm split_comm ``` def split_communicator(comm: WholeMemoryCommunicator, color: int, key: int = 0): """Split Communicator. Creates a set of new communicators from an existing one. Ranks which pass the same color value will be part of the same group; color must be a non-negative value. The value of key will determine the rank order, and the smaller key means the smaller rank in new communicator. If keys are equal between ranks, then the rank in the original communicator will be used to order ranks. """ ``` Authors: - Chuang Zhu (https://github.com/chuangz0) Approvers: - https://github.com/linhu-nv - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/185 --- cpp/CMakeLists.txt | 5 + cpp/include/wholememory/wholememory.h | 33 +++ cpp/src/wholememory/communicator.cpp | 191 +++++++++++++++++- cpp/src/wholememory/communicator.hpp | 16 +- cpp/src/wholememory/memory_handle.cpp | 2 +- cpp/src/wholememory/nccl_comms.cpp | 4 +- cpp/src/wholememory/nccl_comms.hpp | 3 +- cpp/src/wholememory/system_info.cpp | 70 ++++++- cpp/src/wholememory/system_info.hpp | 15 +- cpp/src/wholememory/wholememory.cpp | 19 ++ .../binding/wholememory_binding.pyx | 30 +++ .../pylibwholegraph/torch/__init__.py | 4 +- .../pylibwholegraph/torch/comm.py | 107 ++++++++-- 13 files changed, 456 insertions(+), 43 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cea2c0459..9c364b0f6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -199,6 +199,11 @@ PRIVATE NCCL::NCCL ) +if (CUDAToolkit_VERSION VERSION_GREATER "12.3") + # Link the NVML library if CUDA version is greater than 12.3 + target_link_libraries(wholegraph PRIVATE CUDA::nvml) +endif() + if(BUILD_WITH_NVSHMEM) file(GLOB_RECURSE NVSHMEM_SOURCE_FILES "src/wholememory_ops/functions/nvshmem*.cu") diff --git a/cpp/include/wholememory/wholememory.h b/cpp/include/wholememory/wholememory.h index 08f16213f..f6bacccb3 100644 --- a/cpp/include/wholememory/wholememory.h +++ b/cpp/include/wholememory/wholememory.h @@ -40,6 +40,7 @@ enum wholememory_error_code_t { WHOLEMEMORY_INVALID_VALUE, /*!< input value is invalid */ WHOLEMEMORY_OUT_OF_MEMORY, /*!< out of memory */ WHOLEMEMORY_NOT_SUPPORTED, /*!< not supported */ + WHOLEMEMORY_SYSTEM_ERROR, /*!< system error>*/ }; #define WHOLEMEMORY_RETURN_ON_FAIL(X) \ @@ -90,6 +91,7 @@ enum LogLevel { LEVEL_TRACE /*!< Trace */ }; +#define WHOLEMEMORY_SPILT_NO_COLOR -1 /** * Initialize WholeMemory library * @param flags : 
reserved should be 0 @@ -111,6 +113,15 @@ wholememory_error_code_t wholememory_finalize(); */ typedef struct wholememory_comm_* wholememory_comm_t; +struct clique_info_t { + int is_in_clique; // is_in_clique >0 means the gpu belongs to a mnnvl domain + int clique_first_rank; + int clique_rank; // the rank of gpu in a mnnvl domain + int clique_rank_num; // the num of gpu in the mnnvl domain + int clique_id; // the id of clique + int clique_num; // the num of clique in the comm domain. +}; + #define WHOLEMEMORY_UNIQUE_ID_BYTES (128) /** * @brief Unique ID for WholeMemory Communicators @@ -142,6 +153,24 @@ wholememory_error_code_t wholememory_create_communicator(wholememory_comm_t* com int rank, int size); +/** + * Split WholeMemory Communicator + * @param new_comm: returned the splited wholeMemory Communicator + * @param comm: WholeMemory Communicator to split + * @param color: color value to split communicator,Ranks which pass the same color value will be + * part of the same group; color must be a non-negative value. If it is passed as + * WHOLEMEMORY_SPLIT_NOCOLOR, it means that the rank will not be part of any group, therefore + * returning NULL as newcomm. + * @param key: key value to split communicator,the value of key will determine the + * rank order, and the smaller key means the smaller rank in new communicator. If keys are equal + * between ranks, then the rank in the original communicator will be used to order ranks. + * @return : wholememory_error_code_t + +*/ +wholememory_error_code_t wholememory_split_communicator(wholememory_comm_t* new_comm, + wholememory_comm_t comm, + int color, + int key); /** * Destroy WholeMemory Communicator * @param comm : WholeMemory Communicator to destroy @@ -177,6 +206,9 @@ wholememory_error_code_t wholememory_communicator_get_rank(int* rank, wholememor */ wholememory_error_code_t wholememory_communicator_get_size(int* size, wholememory_comm_t comm); +wholememory_error_code_t wholememory_communicator_get_clique_info(clique_info_t* clique_info, + wholememory_comm_t comm); + bool wholememory_communicator_is_bind_to_nvshmem(wholememory_comm_t comm); wholememory_error_code_t wholememory_communicator_set_distributed_backend( @@ -393,6 +425,7 @@ wholememory_error_code_t wholememory_store_to_file(wholememory_handle_t wholemem */ bool wholememory_is_intranode_communicator(wholememory_comm_t comm); +bool wholememory_is_intra_mnnvl_communicator(wholememory_comm_t comm); bool wholememory_is_build_with_nvshmem(); #ifdef WITH_NVSHMEM_SUPPORT wholememory_error_code_t wholememory_get_nvshmem_reference( diff --git a/cpp/src/wholememory/communicator.cpp b/cpp/src/wholememory/communicator.cpp index 5e8bf855c..d08fe0804 100644 --- a/cpp/src/wholememory/communicator.cpp +++ b/cpp/src/wholememory/communicator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ #include "communicator.hpp" #include +#include #include #include @@ -352,18 +353,19 @@ void wholememory_comm_::device_multicast_sendrecv(const void* sendbuf, bool wholememory_comm_::is_intranode() const { return intra_node_rank_num == world_size; } +bool wholememory_comm_::is_intra_mnnvl() const { return support_mnnvl; } bool wholememory_comm_::support_type_location(wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location) const { if (memory_location == WHOLEMEMORY_ML_HOST) { if (is_intranode() || memory_type == WHOLEMEMORY_MT_DISTRIBUTED) return true; - return SupportMNNVLForEGM(); + return is_intra_mnnvl() && SupportEGM(); } else if (memory_location == WHOLEMEMORY_ML_DEVICE) { if (memory_type == WHOLEMEMORY_MT_DISTRIBUTED) return true; if (is_intranode()) { return DevicesCanAccessP2P(&local_gpu_ids[0], intra_node_rank_num); } else { - return DevicesCanAccessP2P(&local_gpu_ids[0], intra_node_rank_num) && SupportMNNVL(); + return DevicesCanAccessP2P(&local_gpu_ids[0], intra_node_rank_num) && is_intra_mnnvl(); } } else { return false; @@ -422,6 +424,10 @@ struct rank_info { int rank; int size; int gpu_id; +// MNNVL support +#if CUDA_VERSION >= 12030 + nvmlGpuFabricInfo_t fabric_info; +#endif }; static void get_host_name(char* hostname, int maxlen, const char delim) @@ -487,20 +493,72 @@ void get_host_info(host_info* phi) get_shm_devid(&phi->shm_dev); } +bool comm_support_mnnvl(wholememory_comm_t wm_comm, const std::unique_ptr& p_rank_info) +{ +#if CUDA_VERSION >= 12030 + int flag = 0; + CUdevice currentDev; + WM_CU_CHECK_NO_THROW(cuDeviceGet(¤tDev, wm_comm->dev_id)); + // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported + WM_CU_CHECK_NO_THROW( + cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev)); + if (!flag) return false; + + nvmlGpuFabricInfo_t gpuFabricInfo; + WHOLEMEMORY_CHECK_NOTHROW(wholememory::GetGpuFabricInfo(wm_comm->dev_id, &gpuFabricInfo) == + WHOLEMEMORY_SUCCESS); + + if (gpuFabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) { return false; } + + // Check that all ranks have initialized the fabric fully + for (int i = 0; i < wm_comm->world_rank; i++) { + if (p_rank_info.get()[i].fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED) return 0; + } + + return GetCudaCompCap() >= 90; +#else + + return 0; +#endif +}; + void exchange_rank_info(wholememory_comm_t wm_comm) { rank_info ri; get_host_info(&ri.rank_host_info); - ri.rank = wm_comm->world_rank; - ri.size = wm_comm->world_size; - ri.pid = getpid(); - ri.gpu_id = wm_comm->dev_id; + ri.rank = wm_comm->world_rank; + ri.size = wm_comm->world_size; + ri.pid = getpid(); + ri.gpu_id = wm_comm->dev_id; + wm_comm->clique_info.is_in_clique = 0; + +#if CUDA_VERSION >= 12030 + memset(&ri.fabric_info, 0, sizeof(ri.fabric_info)); + WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) == + WHOLEMEMORY_SUCCESS); + + // // A zero UUID means we don't have MNNVL fabric info + if (((((long*)ri.fabric_info.clusterUuid)[0] | ((long*)ri.fabric_info.clusterUuid)[1]) == 0)) { + wm_comm->clique_info.is_in_clique = 0; + + } else { + wm_comm->clique_info.is_in_clique = 1; + } + +#endif std::unique_ptr p_rank_info(new rank_info[ri.size]); wm_comm->host_allgather(&ri, p_rank_info.get(), sizeof(rank_info), WHOLEMEMORY_DT_INT8); wm_comm->intra_node_first_rank = -1; wm_comm->intra_node_rank_num = 0; wm_comm->intra_node_rank = -1; + + wm_comm->clique_info.clique_first_rank = -1; + wm_comm->clique_info.clique_rank = 
-1; + wm_comm->clique_info.clique_rank_num = 0; + + std::set clique_ids{}; + for (int r = 0; r < wm_comm->world_size; r++) { WHOLEMEMORY_CHECK(r == p_rank_info.get()[r].rank); if (ri.rank_host_info == p_rank_info.get()[r].rank_host_info) { @@ -512,7 +570,36 @@ void exchange_rank_info(wholememory_comm_t wm_comm) wm_comm->local_gpu_ids[wm_comm->intra_node_rank_num] = p_rank_info.get()[r].gpu_id; wm_comm->intra_node_rank_num++; } + +#if CUDA_VERSION >= 12030 + + if ((memcmp(ri.fabric_info.clusterUuid, + p_rank_info.get()[r].fabric_info.clusterUuid, + NVML_GPU_FABRIC_UUID_LEN) == 0) && + (ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) { + if (r == wm_comm->world_rank) { + wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num; + } + if (wm_comm->clique_info.clique_rank_num == 0) { wm_comm->clique_info.clique_first_rank = r; } + wm_comm->clique_info.clique_rank_num++; + } + clique_ids.insert(p_rank_info.get()[r].fabric_info.cliqueId); + +#endif + } + +#if CUDA_VERSION >= 12030 + wm_comm->clique_info.clique_num = clique_ids.size(); + int id = 0; + for (auto clique_id : clique_ids) { + if (clique_id == ri.fabric_info.cliqueId) { wm_comm->clique_info.clique_id = id; } + id++; } + + wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) && + (wm_comm->clique_info.clique_rank_num == wm_comm->world_size); + +#endif } void negotiate_communicator_id_locked(wholememory_comm_t wm_comm) @@ -648,6 +735,70 @@ wholememory_error_code_t create_communicator(wholememory_comm_t* comm, } } +/** + * + * Ranks which pass the same color value will be part of the same group; color must be a + * non-negative value. If it is passed as WHOLEMEMORY_SPLIT_NOCOLOR, it means that the rank will not + * be part of any group, therefore returning NULL as newcomm. The value of key will determine the + * rank order, and the smaller key means the smaller rank in new communicator. If keys are equal + * between ranks, then the rank in the original communicator will be used to order ranks. 
+ * + */ + +wholememory_error_code_t split_communicator(wholememory_comm_t* new_comm, + wholememory_comm_t parent_comm, + int color, + int key) noexcept +{ + try { + std::unique_lock mlock(comm_mu); + + WHOLEMEMORY_EXPECTS(wholememory_communicator_is_bind_to_nvshmem(parent_comm) == false, + "Cannot split a communicator that is already bind to NVSHMEM"); + + ncclComm_t nccl_comm = parent_comm->raft_nccl_comm->raw_nccl_comm(); + WHOLEMEMORY_CHECK(nccl_comm != nullptr); + ncclComm_t new_nccl_comm; + WHOLEMEMORY_CHECK(ncclCommSplit(nccl_comm, color, key, &new_nccl_comm, NULL) == ncclSuccess); + cudaStream_t cuda_stream; + WM_CUDA_CHECK(cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking)); + if (new_nccl_comm == NULL) { + *new_comm = nullptr; + return WHOLEMEMORY_SUCCESS; + } + int new_rank; + int new_size; + WHOLEMEMORY_CHECK(ncclCommUserRank(new_nccl_comm, &new_rank) == ncclSuccess); + WHOLEMEMORY_CHECK(ncclCommCount(new_nccl_comm, &new_size) == ncclSuccess); + + auto* wm_comm = new wholememory_comm_(new_nccl_comm, new_size, new_rank, cuda_stream); + *new_comm = wm_comm; + WM_COMM_CHECK_ALL_SAME(wm_comm, WM_COMM_OP_STARTING); + + exchange_rank_info(wm_comm); + + negotiate_communicator_id_locked(wm_comm); + + maybe_create_temp_dir(wm_comm); + + determine_alloc_granularity(wm_comm); + + return WHOLEMEMORY_SUCCESS; + } catch (const wholememory::cu_error& wce) { + WHOLEMEMORY_FAIL_NOTHROW("%s", wce.what()); + } catch (const wholememory::cuda_error& wce) { + WHOLEMEMORY_FAIL_NOTHROW("%s", wce.what()); + } catch (const wholememory::logic_error& wle) { + WHOLEMEMORY_FAIL_NOTHROW("%s", wle.what()); + } catch (const raft::exception& re) { + WHOLEMEMORY_FAIL_NOTHROW("%s", re.what()); + } catch (const std::bad_alloc& sba) { + WHOLEMEMORY_FAIL_NOTHROW("%s", sba.what()); + } catch (...) { + WHOLEMEMORY_FAIL_NOTHROW("Unknown exception."); + } +} + void destroy_all_wholememory(wholememory_comm_t comm) noexcept { try { @@ -740,6 +891,27 @@ wholememory_error_code_t communicator_get_size(int* size, wholememory_comm_t com return WHOLEMEMORY_SUCCESS; } +// wholememory_error_code_t communicator_get_clique_rank(int* clique_rank, +// wholememory_comm_t comm) noexcept +// { +// *clique_rank = comm->clique_rank; +// return WHOLEMEMORY_SUCCESS; +// } + +// wholememory_error_code_t communicator_get_clique_size(int* clique_size, +// wholememory_comm_t comm) noexcept +// { +// *clique_size = comm->clique_rank_num; +// return WHOLEMEMORY_SUCCESS; +// } + +wholememory_error_code_t communicator_get_clique_info(clique_info_t* clique_info, + wholememory_comm_t comm) noexcept +{ + *clique_info = comm->clique_info; + return WHOLEMEMORY_SUCCESS; +} + bool communicator_is_bind_to_nvshmem(wholememory_comm_t comm) noexcept { #ifdef WITH_NVSHMEM_SUPPORT @@ -772,6 +944,11 @@ void communicator_barrier(wholememory_comm_t comm) bool is_intranode_communicator(wholememory_comm_t comm) noexcept { return comm->is_intranode(); } +bool is_intra_mnnvl_communicator(wholememory_comm_t comm) noexcept +{ + return comm->is_intra_mnnvl(); +} + #ifdef WITH_NVSHMEM_SUPPORT wholememory_error_code_t init_nvshmem_with_comm(wholememory_comm_t comm) noexcept { diff --git a/cpp/src/wholememory/communicator.hpp b/cpp/src/wholememory/communicator.hpp index 5ed68a9df..b48d66b77 100644 --- a/cpp/src/wholememory/communicator.hpp +++ b/cpp/src/wholememory/communicator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -192,6 +192,7 @@ struct wholememory_comm_ { bool is_intranode() const; + bool is_intra_mnnvl() const; bool support_type_location(wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location) const; @@ -212,10 +213,13 @@ struct wholememory_comm_ { int intra_node_rank_num = 0; int intra_node_first_rank_pid = -1; + clique_info_t clique_info; + int comm_id = -1; int dev_id = -1; int local_gpu_ids[16] = {0}; + bool support_mnnvl = false; size_t alloc_granularity = 2 * 1024 * 1024UL; @@ -267,6 +271,11 @@ wholememory_error_code_t create_communicator(wholememory_comm_t* comm, int rank, int size) noexcept; +wholememory_error_code_t split_communicator(wholememory_comm_t* new_comm, + wholememory_comm_t parent_comm, + int color, + int key) noexcept; + wholememory_error_code_t destroy_communicator_locked(wholememory_comm_t comm) noexcept; wholememory_error_code_t destroy_communicator(wholememory_comm_t comm) noexcept; @@ -282,10 +291,15 @@ wholememory_error_code_t communicator_get_rank(int* rank, wholememory_comm_t com wholememory_error_code_t communicator_get_size(int* size, wholememory_comm_t comm) noexcept; +wholememory_error_code_t communicator_get_clique_info(clique_info_t* clique_info, + wholememory_comm_t comm) noexcept; + void communicator_barrier(wholememory_comm_t comm); bool is_intranode_communicator(wholememory_comm_t comm) noexcept; +bool is_intra_mnnvl_communicator(wholememory_comm_t comm) noexcept; + std::string get_temporary_directory_path(wholememory_comm_t comm); std::string get_shm_prefix(wholememory_comm_t comm); diff --git a/cpp/src/wholememory/memory_handle.cpp b/cpp/src/wholememory/memory_handle.cpp index ca8b0ad75..2e55edb62 100644 --- a/cpp/src/wholememory/memory_handle.cpp +++ b/cpp/src/wholememory/memory_handle.cpp @@ -1318,7 +1318,7 @@ class continuous_mnnvl_wholememory_impl : public continuous_device_wholememory_i void check_valid() { if (location_ == WHOLEMEMORY_ML_HOST) { WHOLEMEMORY_CHECK_NOTHROW(SupportEGM()); } - WHOLEMEMORY_CHECK_NOTHROW(SupportMNNVL()); + WHOLEMEMORY_CHECK_NOTHROW(comm_->is_intra_mnnvl()); } void create_memory() override { diff --git a/cpp/src/wholememory/nccl_comms.cpp b/cpp/src/wholememory/nccl_comms.cpp index b06313551..4f6f96806 100644 --- a/cpp/src/wholememory/nccl_comms.cpp +++ b/cpp/src/wholememory/nccl_comms.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -514,4 +514,6 @@ void nccl_comms::group_start() const { RAFT_NCCL_TRY(ncclGroupStart()); } void nccl_comms::group_end() const { RAFT_NCCL_TRY(ncclGroupEnd()); } +ncclComm_t nccl_comms::raw_nccl_comm() const { return nccl_comm_; } + } // namespace wholememory diff --git a/cpp/src/wholememory/nccl_comms.hpp b/cpp/src/wholememory/nccl_comms.hpp index 49babab9d..55eab2437 100644 --- a/cpp/src/wholememory/nccl_comms.hpp +++ b/cpp/src/wholememory/nccl_comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -197,6 +197,7 @@ class nccl_comms { void group_start() const; void group_end() const; + ncclComm_t raw_nccl_comm() const; private: ncclComm_t nccl_comm_; diff --git a/cpp/src/wholememory/system_info.cpp b/cpp/src/wholememory/system_info.cpp index c8a35f400..01c124a6f 100644 --- a/cpp/src/wholememory/system_info.cpp +++ b/cpp/src/wholememory/system_info.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,21 @@ #include "cuda_macros.hpp" +#include "logger.hpp" +#include "system_info.hpp" +#include "wholememory/wholememory.h" +#if CUDA_VERSION >= 12030 +#include + +namespace { + +std::mutex lock; // NVML has had some thread safety bugs +bool nvmlInitialized = false; +thread_local bool threadInitialized = false; +wholememory_error_code_t initResult; +}; // namespace + +#endif bool DevAttrPagebleMemoryAccess() { int current_dev_id = -1; @@ -88,16 +103,55 @@ const char* GetCPUArch() return arch_str; } -bool SupportMNNVL() -{ - // TODO: replace with NVML, nvmlDeviceGetGpuFabricInfo - return GetCudaCompCap() >= 90; -} - bool SupportEGM() { std::string const arch_str = GetCPUArch(); return arch_str == "arm64" && DevAttrPagebleMemoryAccess(); } -bool SupportMNNVLForEGM() { return SupportMNNVL() && SupportEGM(); } +// bool SupportMNNVLForEGM() { return SupportMNNVL() && SupportEGM(); } +#if CUDA_VERSION >= 12030 + +namespace wholememory { + +wholememory_error_code_t NvmlEnsureInitialized() +{ + // Optimization to avoid repeatedly grabbing the lock when we only want to + // read from the global tables. + if (threadInitialized) return initResult; + threadInitialized = true; + + std::lock_guard locked(lock); + + if (nvmlInitialized) return initResult; + nvmlInitialized = true; + nvmlReturn_t nvml_res = nvmlInit(); + if (nvml_res != NVML_SUCCESS) { + WHOLEMEMORY_ERROR("nvmlInit() failed, the error is %s", nvmlErrorString(nvml_res)); + initResult = WHOLEMEMORY_SYSTEM_ERROR; + + return initResult; + } + initResult = WHOLEMEMORY_SUCCESS; + + return initResult; +} + +wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo) +{ + WHOLEMEMORY_CHECK_NOTHROW(NvmlEnsureInitialized() == WHOLEMEMORY_SUCCESS); + std::lock_guard locked(lock); + // gpuFabricInfo->version = nvmlGpuFabricInfo_v2; + nvmlDevice_t nvml_device; + nvmlReturn_t ret = nvmlDeviceGetHandleByIndex(dev, &nvml_device); + WHOLEMEMORY_EXPECTS_NOTHROW( + ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret)); + ret = nvmlDeviceGetGpuFabricInfo(nvml_device, gpuFabricInfo); + WHOLEMEMORY_EXPECTS_NOTHROW( + ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret)); + + return WHOLEMEMORY_SUCCESS; +} + +}; // namespace wholememory +#endif diff --git a/cpp/src/wholememory/system_info.hpp b/cpp/src/wholememory/system_info.hpp index f62364300..a157924eb 100644 --- a/cpp/src/wholememory/system_info.hpp +++ b/cpp/src/wholememory/system_info.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,11 @@ */ #pragma once +#include "wholememory/wholememory.h" + +#if CUDA_VERSION >= 12030 +#include +#endif bool DevAttrPagebleMemoryAccess(); bool DeviceCanAccessPeer(int peer_device); @@ -29,4 +34,10 @@ bool SupportMNNVL(); bool SupportEGM(); -bool SupportMNNVLForEGM(); +// bool SupportMNNVLForEGM(); +#if CUDA_VERSION >= 12030 +namespace wholememory { +wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo); +} + +#endif diff --git a/cpp/src/wholememory/wholememory.cpp b/cpp/src/wholememory/wholememory.cpp index 180da2f01..814e90087 100644 --- a/cpp/src/wholememory/wholememory.cpp +++ b/cpp/src/wholememory/wholememory.cpp @@ -45,6 +45,14 @@ wholememory_error_code_t wholememory_create_communicator(wholememory_comm_t* com return wholememory::create_communicator(comm, unique_id, rank, size); } +wholememory_error_code_t wholememory_split_communicator(wholememory_comm_t* new_comm, + wholememory_comm_t comm, + int color, + int key) +{ + return wholememory::split_communicator(new_comm, comm, color, key); +} + wholememory_error_code_t wholememory_destroy_communicator(wholememory_comm_t comm) { return wholememory::destroy_communicator(comm); @@ -266,6 +274,17 @@ bool wholememory_is_intranode_communicator(wholememory_comm_t comm) return wholememory::is_intranode_communicator(comm); } +bool wholememory_is_intra_mnnvl_communicator(wholememory_comm_t comm) +{ + return wholememory::is_intra_mnnvl_communicator(comm); +} + +wholememory_error_code_t wholememory_communicator_get_clique_info(clique_info_t* clique_info, + wholememory_comm_t comm) +{ + return wholememory::communicator_get_clique_info(clique_info, comm); +} + bool wholememory_is_build_with_nvshmem() { #ifdef WITH_NVSHMEM_SUPPORT diff --git a/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx b/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx index ddf5de544..dc72eb32c 100644 --- a/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx +++ b/python/pylibwholegraph/pylibwholegraph/binding/wholememory_binding.pyx @@ -185,6 +185,24 @@ cdef extern from "wholememory/wholememory.h": cdef wholememory_distributed_backend_t wholememory_communicator_get_distributed_backend( wholememory_comm_t comm) cdef bool wholememory_is_intranode_communicator(wholememory_comm_t comm) + cdef bool wholememory_is_intra_mnnvl_communicator(wholememory_comm_t comm) + + + cdef struct clique_info_t: + int is_in_clique + int clique_first_rank + int clique_rank + int clique_rank_num + int clique_id + int clique_num + + cdef wholememory_error_code_t wholememory_communicator_get_clique_info(clique_info_t* clique_info, wholememory_comm_t comm) + + + cdef wholememory_error_code_t wholememory_split_communicator(wholememory_comm_t* new_comm, + wholememory_comm_t comm, + int color, + int key) cpdef enum WholeMemoryErrorCode: Success = WHOLEMEMORY_SUCCESS @@ -1267,6 +1285,14 @@ cdef class PyWholeMemoryComm: cdef int world_size = -1 check_wholememory_error_code(wholememory_communicator_get_size(&world_size, self.comm_id)) return world_size + def get_clique_info(self): + cdef clique_info_t clique_info + check_wholememory_error_code(wholememory_communicator_get_clique_info(&clique_info,self.comm_id)) + + cdef bint is_in_clique = clique_info.is_in_clique > 0 + + return is_in_clique,clique_info.clique_first_rank,clique_info.clique_rank,clique_info.clique_rank_num,clique_info.clique_id,clique_info.clique_num + def barrier(self): 
check_wholememory_error_code(wholememory_communicator_barrier(self.comm_id)) @@ -1628,6 +1654,10 @@ def create_communicator(PyWholeMemoryUniqueID py_uid, int world_rank, int world_ def destroy_communicator(PyWholeMemoryComm py_comm): check_wholememory_error_code(wholememory_destroy_communicator(py_comm.comm_id)) +def split_communicator(PyWholeMemoryComm comm,int color,int key): + py_comm = PyWholeMemoryComm() + check_wholememory_error_code(wholememory_split_communicator(&py_comm.comm_id,comm.comm_id,color,key)) + return py_comm def communicator_set_distributed_backend(PyWholeMemoryComm py_comm,WholeMemoryDistributedBackend distributed_backend): check_wholememory_error_code(wholememory_communicator_set_distributed_backend(py_comm.comm_id,int(distributed_backend))) diff --git a/python/pylibwholegraph/pylibwholegraph/torch/__init__.py b/python/pylibwholegraph/pylibwholegraph/torch/__init__.py index ced391605..873f0d729 100644 --- a/python/pylibwholegraph/pylibwholegraph/torch/__init__.py +++ b/python/pylibwholegraph/pylibwholegraph/torch/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,6 +20,8 @@ get_global_communicator, get_local_node_communicator, get_local_device_communicator, + split_communicator, + get_local_mnnvl_communicator, ) from .embedding import ( diff --git a/python/pylibwholegraph/pylibwholegraph/torch/comm.py b/python/pylibwholegraph/pylibwholegraph/torch/comm.py index aa15d3a0a..1f8d9f520 100644 --- a/python/pylibwholegraph/pylibwholegraph/torch/comm.py +++ b/python/pylibwholegraph/pylibwholegraph/torch/comm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -19,12 +19,13 @@ str_to_wmb_wholememory_distributed_backend_type, wholememory_distributed_backend_type_to_str, str_to_wmb_wholememory_memory_type, - str_to_wmb_wholememory_location + str_to_wmb_wholememory_location, ) global_communicators = {} local_node_communicator = None local_device_communicator = None +local_mnnvl_communicator = None all_comm_world_rank = 0 all_comm_world_size = 1 @@ -34,10 +35,11 @@ def reset_communicators(): global all_comm_world_rank, all_comm_world_size, all_comm_local_rank, all_comm_local_size - global global_communicators, local_node_communicator, local_device_communicator + global global_communicators, local_node_communicator, local_device_communicator, local_mnnvl_communicator global_communicators = {} local_node_communicator = None local_device_communicator = None + local_mnnvl_communicator = None all_comm_world_rank = 0 all_comm_world_size = 1 @@ -82,6 +84,18 @@ def get_size(self): """Get world size of this communicator""" return self.wmb_comm.get_size() + def get_clique_info(self): + """Get info of clique where current process is located, a clique is made up of GPUs in same mnnvl domain. 
+ return: + is_in_clique: is_in_clique >0 means the gpu belongs to a mnnvl domain + clique_first_rank; // the rank in the comm of first gpu in the clique , + clique_rank; // the rank of gpu in a mnnvl domain + clique_rank_num; // the num of gpu in the mnnvl domain + clique_id; // the id of clique + clique_num; // the num of clique in the comm domain. + """ + return self.wmb_comm.get_clique_info() + def barrier(self): """ Barrier on WholeMemory Communicator. @@ -91,9 +105,7 @@ def barrier(self): """ return self.wmb_comm.barrier() - def support_type_location(self, - memory_type: str, - memory_location: str): + def support_type_location(self, memory_type: str, memory_location: str): """ Return True if Communicator supports combination of memory_type and memory_location. """ @@ -107,11 +119,15 @@ def destroy(self): @property def distributed_backend(self): - return wholememory_distributed_backend_type_to_str(self.wmb_comm.get_distributed_backend()) + return wholememory_distributed_backend_type_to_str( + self.wmb_comm.get_distributed_backend() + ) @distributed_backend.setter def distributed_backend(self, value): - self.wmb_comm.set_distributed_backend(str_to_wmb_wholememory_distributed_backend_type(value)) + self.wmb_comm.set_distributed_backend( + str_to_wmb_wholememory_distributed_backend_type(value) + ) def create_group_communicator(group_size: int = -1, comm_stride: int = 1): @@ -152,6 +168,21 @@ def create_group_communicator(group_size: int = -1, comm_stride: int = 1): return WholeMemoryCommunicator(wm_comm) +def split_communicator(comm: WholeMemoryCommunicator, color: int, key: int = 0): + """Split Communicator. + Creates a set of new communicators from an existing one. Ranks which pass the same color value will be part of the + same group; color must be a non-negative value. + The value of key will determine the rank order, and the smaller key means the smaller rank in new communicator. + If keys are equal between ranks, then the rank in the original communicator will be used to order ranks. + """ + if not isinstance(color, int) or not isinstance(key, int): + raise TypeError("color and key must be int") + if color < 0: + return None + new_wm_comm = wmb.split_communicator(comm.wmb_comm, color, key) + return WholeMemoryCommunicator(new_wm_comm) + + def destroy_communicator(wm_comm: WholeMemoryCommunicator): """ Destroy WholeMemoryCommunicator @@ -163,19 +194,24 @@ def destroy_communicator(wm_comm: WholeMemoryCommunicator): wm_comm.wmb_comm = None -def get_global_communicator(distributed_backend='nccl'): +def get_global_communicator(distributed_backend="nccl"): """ Get the global communicator of this job :return: WholeMemoryCommunicator that has all GPUs in it. 
""" - global global_communicators, local_node_communicator, local_device_communicator + global global_communicators, local_node_communicator, local_device_communicator, local_mnnvl_communicator global all_comm_local_size, all_comm_world_size if distributed_backend not in global_communicators: global_communicator = create_group_communicator() comm_set_distributed_backend(global_communicator, distributed_backend) global_communicators[distributed_backend] = global_communicator - if distributed_backend == 'nccl': # local_node/device_communicator can only be nccl backend for now - if local_node_communicator is None and all_comm_local_size == all_comm_world_size: + if ( + distributed_backend == "nccl" + ): # local_node/device_communicator can only be nccl backend for now + if ( + local_node_communicator is None + and all_comm_local_size == all_comm_world_size + ): local_node_communicator = global_communicator if local_device_communicator is None and all_comm_world_size == 1: local_device_communicator = global_communicator @@ -187,13 +223,13 @@ def get_local_node_communicator(): Get the local node communicator of this job :return: WholeMemoryCommunicator that has GPUs in the same node. """ - global global_communicators, local_node_communicator, local_device_communicator + global global_communicators, local_node_communicator, local_device_communicator, local_mnnvl_communicator global all_comm_local_size, all_comm_world_size if local_node_communicator is None: local_node_communicator = create_group_communicator(all_comm_local_size) if all_comm_local_size == all_comm_world_size: - assert 'nccl' not in global_communicators - global_communicators['nccl'] = local_node_communicator + assert "nccl" not in global_communicators + global_communicators["nccl"] = local_node_communicator if all_comm_local_size == 1: assert local_device_communicator is None local_device_communicator = local_node_communicator @@ -205,7 +241,7 @@ def get_local_device_communicator(): Get the local device communicator of this job :return: WholeMemoryCommunicator that has only the GPU belonging to current process. 
""" - global global_communicators, local_node_communicator, local_device_communicator + global global_communicators, local_node_communicator, local_device_communicator, local_mnnvl_communicator global all_comm_local_size, all_comm_world_size if local_device_communicator is None: local_device_communicator = create_group_communicator(1) @@ -213,13 +249,42 @@ def get_local_device_communicator(): assert local_node_communicator is None local_node_communicator = local_device_communicator if all_comm_world_size == 1: - assert 'nccl' not in global_communicators - global_communicators['nccl'] = local_device_communicator + assert "nccl" not in global_communicators + global_communicators["nccl"] = local_device_communicator return local_device_communicator -def comm_set_distributed_backend(wm_comm: WholeMemoryCommunicator, distributed_backend: str): +def get_local_mnnvl_communicator(): + """ """ + global global_communicators, local_node_communicator, local_device_communicator, local_mnnvl_communicator + global all_comm_local_size, all_comm_world_size - wmb.communicator_set_distributed_backend(wm_comm.wmb_comm, - str_to_wmb_wholememory_distributed_backend_type(distributed_backend)) + if local_mnnvl_communicator is None: + g_communicator = get_global_communicator() + ( + is_in_clique, + _, + _, + _, + clique_id, + _, + ) = g_communicator.get_clique_info() + if not is_in_clique: + raise RuntimeError( + "the gpu does not belong to any mnnvl domain,can not create local_mnnvl_communicator" + ) + + local_mnnvl_communicator = split_communicator(g_communicator, clique_id) + + return local_mnnvl_communicator + + +def comm_set_distributed_backend( + wm_comm: WholeMemoryCommunicator, distributed_backend: str +): + + wmb.communicator_set_distributed_backend( + wm_comm.wmb_comm, + str_to_wmb_wholememory_distributed_backend_type(distributed_backend), + ) return From 4ee62ba7c1d4776957a5bd9558c8ebb297a47889 Mon Sep 17 00:00:00 2001 From: linhu-nv <141609318+linhu-nv@users.noreply.github.com> Date: Tue, 18 Jun 2024 20:55:29 +0800 Subject: [PATCH 07/15] allow users to choose shm allocation method for chunked/continous host memory (#187) 1. The default shm option is still SYSTEMV, but users can choose POSIX API through system env using `export WG_USE_POSIX_SHM=1`. 2. `unlink` shm files immediately after `shm_open` to avoid leftover memory in `/dev/shm` in case of a wholegraph crash. 
Authors: - https://github.com/linhu-nv Approvers: - Chuang Zhu (https://github.com/chuangz0) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/187 --- cpp/src/wholememory/memory_handle.cpp | 154 ++++++++++++++------------ 1 file changed, 83 insertions(+), 71 deletions(-) diff --git a/cpp/src/wholememory/memory_handle.cpp b/cpp/src/wholememory/memory_handle.cpp index 2e55edb62..16ed43760 100644 --- a/cpp/src/wholememory/memory_handle.cpp +++ b/cpp/src/wholememory/memory_handle.cpp @@ -456,76 +456,87 @@ class global_mapped_host_wholememory_impl : public wholememory_impl { host_memory_full_path.append("_").append("wm_host_").append(std::to_string(tensor_id)); return host_memory_full_path; } -#define USE_SYSTEMV_SHM + #define SYSTEMV_SHM_PROJ_ID (0xE601EEEE) void create_and_map_shared_host_memory() { WHOLEMEMORY_CHECK(is_intranode_communicator(comm_)); -#ifdef USE_SYSTEMV_SHM - std::string shm_full_path = "/tmp/"; - shm_full_path.append(get_host_memory_full_path(comm_, handle_->handle_id)); - FILE* shm_fp = fopen(shm_full_path.c_str(), "w"); - WHOLEMEMORY_CHECK(shm_fp != nullptr); - WHOLEMEMORY_CHECK(fclose(shm_fp) == 0); - auto shm_key = ftok(shm_full_path.c_str(), SYSTEMV_SHM_PROJ_ID); - WHOLEMEMORY_CHECK(shm_key != (key_t)-1); + const char* shm_env_var = std::getenv("WG_USE_POSIX_SHM"); + if (shm_env_var == nullptr || shm_env_var[0] == '0') { + use_systemv_shm_ = true; + } else { + use_systemv_shm_ = false; + } + std::string shm_full_path; + if (use_systemv_shm_) { + shm_full_path = "/tmp/"; + shm_full_path.append(get_host_memory_full_path(comm_, handle_->handle_id)); + FILE* shm_fp = fopen(shm_full_path.c_str(), "w"); + WHOLEMEMORY_CHECK(shm_fp != nullptr); + WHOLEMEMORY_CHECK(fclose(shm_fp) == 0); + } else { + shm_full_path = get_host_memory_full_path(comm_, handle_->handle_id); + } int shm_id = -1; -#else - auto shm_full_path = get_host_memory_full_path(comm_, handle_->handle_id); - int shm_fd = -1; -#endif + int shm_fd = -1; if (comm_->world_rank == 0) { -#ifdef USE_SYSTEMV_SHM - shm_id = shmget(shm_key, alloc_strategy_.local_alloc_size, 0644 | IPC_CREAT | IPC_EXCL); - if (shm_id == -1) { - WHOLEMEMORY_FATAL( - "Create host shared memory from IPC key %d failed, Reason=%s", shm_key, strerror(errno)); - } -#else - shm_fd = shm_open(shm_full_path.c_str(), O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); - if (shm_fd < 0) { - WHOLEMEMORY_FATAL("Create host shared memory from file %s failed, Reason=%s.", - shm_full_path.c_str(), - strerror(errno)); + if (use_systemv_shm_) { + auto shm_key = ftok(shm_full_path.c_str(), SYSTEMV_SHM_PROJ_ID); + WHOLEMEMORY_CHECK(shm_key != (key_t)-1); + shm_id = shmget(shm_key, alloc_strategy_.local_alloc_size, 0644 | IPC_CREAT | IPC_EXCL); + if (shm_id == -1) { + WHOLEMEMORY_FATAL("Create host shared memory from IPC key %d failed, Reason=%s", + shm_key, + strerror(errno)); + } + } else { + shm_fd = shm_open(shm_full_path.c_str(), O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + if (shm_fd < 0) { + WHOLEMEMORY_FATAL("Create host shared memory from file %s failed, Reason=%s.", + shm_full_path.c_str(), + strerror(errno)); + } + WHOLEMEMORY_CHECK(ftruncate(shm_fd, alloc_strategy_.local_alloc_size) == 0); } - WHOLEMEMORY_CHECK(ftruncate(shm_fd, alloc_strategy_.local_alloc_size) == 0); -#endif communicator_barrier(comm_); } else { communicator_barrier(comm_); -#ifdef USE_SYSTEMV_SHM - shm_id = shmget(shm_key, alloc_strategy_.local_alloc_size, 0644); - if (shm_id == -1) { - WHOLEMEMORY_FATAL( - "Get host shared memory from IPC key %d failed, 
Reason=%s", shm_key, strerror(errno)); - } -#else - shm_fd = shm_open(shm_full_path.c_str(), O_RDWR, S_IRUSR | S_IWUSR); - if (shm_fd < 0) { - WHOLEMEMORY_FATAL("Rank=%d open host shared memory from file %s failed.", - comm_->world_rank, - shm_full_path.c_str()); + if (use_systemv_shm_) { + auto shm_key = ftok(shm_full_path.c_str(), SYSTEMV_SHM_PROJ_ID); + WHOLEMEMORY_CHECK(shm_key != (key_t)-1); + shm_id = shmget(shm_key, alloc_strategy_.local_alloc_size, 0644); + if (shm_id == -1) { + WHOLEMEMORY_FATAL( + "Get host shared memory from IPC key %d failed, Reason=%s", shm_key, strerror(errno)); + } + } else { + shm_fd = shm_open(shm_full_path.c_str(), O_RDWR, S_IRUSR | S_IWUSR); + if (shm_fd < 0) { + WHOLEMEMORY_FATAL("Rank=%d open host shared memory from file %s failed.", + comm_->world_rank, + shm_full_path.c_str()); + } } -#endif } communicator_barrier(comm_); + if (!use_systemv_shm_ && comm_->world_rank == 0) { + WHOLEMEMORY_CHECK(shm_unlink(shm_full_path.c_str()) == 0); + } void* mmap_ptr = nullptr; -#ifdef USE_SYSTEMV_SHM - mmap_ptr = shmat(shm_id, nullptr, 0); - WHOLEMEMORY_CHECK(mmap_ptr != (void*)-1); -#else - mmap_ptr = mmap( - nullptr, alloc_strategy_.total_alloc_size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); - WHOLEMEMORY_CHECK(mmap_ptr != (void*)-1); -#endif + if (use_systemv_shm_) { + mmap_ptr = shmat(shm_id, nullptr, 0); + WHOLEMEMORY_CHECK(mmap_ptr != (void*)-1); + } else { + mmap_ptr = mmap( + nullptr, alloc_strategy_.total_alloc_size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); + WHOLEMEMORY_CHECK(mmap_ptr != (void*)-1); + } memset(static_cast(mmap_ptr) + rank_partition_strategy_.local_mem_offset, 0, rank_partition_strategy_.local_mem_size); WM_CUDA_CHECK_NO_THROW( cudaHostRegister(mmap_ptr, alloc_strategy_.total_alloc_size, cudaHostRegisterDefault)); -#ifndef USE_SYSTEMV_SHM - WHOLEMEMORY_CHECK(close(shm_fd) == 0); -#endif + if (!use_systemv_shm_) WHOLEMEMORY_CHECK(close(shm_fd) == 0); void* dev_ptr = nullptr; WM_CUDA_CHECK_NO_THROW(cudaHostGetDevicePointer(&dev_ptr, mmap_ptr, 0)); WHOLEMEMORY_CHECK(dev_ptr == mmap_ptr); @@ -540,31 +551,30 @@ class global_mapped_host_wholememory_impl : public wholememory_impl { void* ptr = shared_host_handle_.shared_host_memory_ptr; if (ptr == nullptr) return; WM_CUDA_CHECK(cudaHostUnregister(ptr)); -#ifdef USE_SYSTEMV_SHM - std::string shm_full_path = "/tmp/"; - shm_full_path.append(get_host_memory_full_path(comm_, handle_->handle_id)); - auto shm_key = ftok(shm_full_path.c_str(), SYSTEMV_SHM_PROJ_ID); - WHOLEMEMORY_CHECK(shm_key != (key_t)-1); - int shm_id = shmget(shm_key, alloc_strategy_.local_alloc_size, 0644); - if (shm_id == -1) { - WHOLEMEMORY_FATAL("Get host shared memory from IPC key %d for delete failed, Reason=%s", - shm_key, - strerror(errno)); + std::string shm_full_path; + int shm_id = -1; + if (use_systemv_shm_) { + shm_full_path = "/tmp/"; + shm_full_path.append(get_host_memory_full_path(comm_, handle_->handle_id)); + auto shm_key = ftok(shm_full_path.c_str(), SYSTEMV_SHM_PROJ_ID); + WHOLEMEMORY_CHECK(shm_key != (key_t)-1); + shm_id = shmget(shm_key, alloc_strategy_.local_alloc_size, 0644); + if (shm_id == -1) { + WHOLEMEMORY_FATAL("Get host shared memory from IPC key %d for delete failed, Reason=%s", + shm_key, + strerror(errno)); + } + WHOLEMEMORY_CHECK(shmdt(ptr) == 0); + } else { + shm_full_path = get_host_memory_full_path(comm_, handle_->handle_id); + WHOLEMEMORY_CHECK(munmap(ptr, alloc_strategy_.total_alloc_size) == 0); } - WHOLEMEMORY_CHECK(shmdt(ptr) == 0); -#else - auto shm_full_path = 
get_host_memory_full_path(comm_, handle_->handle_id); - WHOLEMEMORY_CHECK(munmap(ptr, alloc_strategy_.total_alloc_size) == 0); -#endif communicator_barrier(comm_); -#ifdef USE_SYSTEMV_SHM - if (comm_->world_rank == 0) { + if (use_systemv_shm_ && comm_->world_rank == 0) { WHOLEMEMORY_CHECK(shmctl(shm_id, IPC_RMID, nullptr) == 0); WHOLEMEMORY_CHECK(unlink(shm_full_path.c_str()) == 0); } -#else - if (comm_->world_rank == 0) { WHOLEMEMORY_CHECK(shm_unlink(shm_full_path.c_str()) == 0); } -#endif + communicator_barrier(comm_); shared_host_handle_.shared_host_memory_ptr = nullptr; } catch (const wholememory::logic_error& wle) { @@ -579,6 +589,8 @@ class global_mapped_host_wholememory_impl : public wholememory_impl { struct shared_host_handle { void* shared_host_memory_ptr = nullptr; } shared_host_handle_; + + bool use_systemv_shm_; }; // Implementation for continuous device wholememory that need global map. From f413879b984a35bcf2f89606f1b2a82426043ded Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 2 Jul 2024 13:19:47 -0400 Subject: [PATCH 08/15] Use verify-alpha-spec hook (#188) With the deployment of rapids-build-backend, we need to make sure our dependencies have alpha specs. Contributes to https://github.com/rapidsai/build-planning/issues/31 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/wholegraph/pull/188 --- .pre-commit-config.yaml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ebd3249c0..56f2ca814 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,13 +31,8 @@ repos: ) types_or: [c, c++, cuda] args: ["-fallback-style=none", "-style=file", "-i"] - - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 - hooks: - - id: rapids-dependency-file-generator - args: ["--clean"] - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.0.3 + rev: v0.2.0 hooks: - id: verify-copyright files: | @@ -47,3 +42,9 @@ repos: meta[.]yaml$| setup[.]cfg$| [.]flake8[.]cython$ + - id: verify-alpha-spec + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.13.11 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] From ef798c35ffc1cb52ade73636969013b0384b128d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 5 Jul 2024 17:00:13 -0500 Subject: [PATCH 09/15] skip CMake 3.30.0 (#189) Contributes to https://github.com/rapidsai/build-planning/issues/80 Adds constraints to avoid pulling in CMake 3.30.0, for the reasons described in that issue. 
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/wholegraph/pull/189 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-122_arch-x86_64.yaml | 2 +- conda/recipes/libwholegraph/conda_build_config.yaml | 2 +- conda/recipes/pylibwholegraph/conda_build_config.yaml | 2 +- dependencies.yaml | 4 ++-- python/pylibwholegraph/pyproject.toml | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 2f4f002af..9bb060599 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools==16.0.6 - clangxx==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-version=11.8 - cudatoolkit diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index e924b5825..f57462904 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools==16.0.6 - clangxx==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx diff --git a/conda/recipes/libwholegraph/conda_build_config.yaml b/conda/recipes/libwholegraph/conda_build_config.yaml index 52573b012..35b1d6b62 100644 --- a/conda/recipes/libwholegraph/conda_build_config.yaml +++ b/conda/recipes/libwholegraph/conda_build_config.yaml @@ -11,7 +11,7 @@ cuda11_compiler: - nvcc cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" doxygen_version: - ">=1.8.11" diff --git a/conda/recipes/pylibwholegraph/conda_build_config.yaml b/conda/recipes/pylibwholegraph/conda_build_config.yaml index 41050978a..46f3a251b 100644 --- a/conda/recipes/pylibwholegraph/conda_build_config.yaml +++ b/conda/recipes/pylibwholegraph/conda_build_config.yaml @@ -11,7 +11,7 @@ cuda11_compiler: - nvcc cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" scikit_build_core_version: - ">=0.7.0" diff --git a/dependencies.yaml b/dependencies.yaml index 1a7ee4e4c..bc41a9677 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -79,7 +79,7 @@ dependencies: - output_types: conda packages: - c-compiler - - cmake>=3.26.4 + - cmake>=3.26.4,!=3.30.0 - cudnn=8.8 - cxx-compiler - cython>=3.0.0 @@ -339,6 +339,6 @@ dependencies: common: - output_types: [pyproject] packages: - - cmake>=3.26.4 + - cmake>=3.26.4,!=3.30.0 - cython>=3.0.0 - ninja diff --git a/python/pylibwholegraph/pyproject.toml b/python/pylibwholegraph/pyproject.toml index 341f8b252..19b48cb9f 100644 --- a/python/pylibwholegraph/pyproject.toml +++ b/python/pylibwholegraph/pyproject.toml @@ -44,7 +44,7 @@ classifiers = [ build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 0e0284c4e02c75c3356d6b1da1a8f4373825fca2 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 8 Jul 2024 17:01:21 -0400 Subject: [PATCH 10/15] Add CUDA_STATIC_MATH_LIBRARIES (#190) Usage of the CUDA math libraries is independent of the CUDA runtime. Make their static/shared status separately controllable. 
Contributes to https://github.com/rapidsai/build-planning/issues/35 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/wholegraph/pull/190 --- ci/build_wheel.sh | 2 +- cpp/CMakeLists.txt | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 3a59ab481..9b052a84e 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -16,7 +16,7 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" cd "${package_dir}" # Hardcode the output dir -SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DBUILD_SHARED_LIBS=OFF;-DCMAKE_MESSAGE_LOG_LEVEL=VERBOSE;-DCUDA_STATIC_RUNTIME=ON;-DWHOLEGRAPH_BUILD_WHEELS=ON" \ +SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DBUILD_SHARED_LIBS=OFF;-DCMAKE_MESSAGE_LOG_LEVEL=VERBOSE;-DCUDA_STATIC_RUNTIME=ON;-DCUDA_STATIC_MATH_LIBRARIES=ON;-DWHOLEGRAPH_BUILD_WHEELS=ON" \ python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check mkdir -p final_dist diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9c364b0f6..f1ea4fd78 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -51,7 +51,8 @@ rapids_cmake_write_version_file(include/wholegraph/version_config.hpp) option(BUILD_SHARED_LIBS "Build libwholegraph shared libraries" ON) option(CMAKE_CUDA_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF) option(BUILD_TESTS "Configure CMake to build tests" ON) -option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF) +option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(CUDA_STATIC_MATH_LIBRARIES "Statically link the CUDA math libraries" OFF) option(BUILD_WITH_NVSHMEM "Enable nvshmem integration" OFF) option(WHOLEGRAPH_EXCLUDE_NVSHMEM_FROM_ALL "Exclude nvshmem targets from wholeGraph's 'all' target" ON) option(BUILD_BENCHMARKS "Configure CMake to build benchmark" ON) @@ -59,7 +60,7 @@ option(BUILD_BENCHMARKS "Configure CMake to build benchmark" ON) ############################################################################## # - Set options based on user defined one ----------------------------------- set(_ctk_static_suffix "") -if(CUDA_STATIC_RUNTIME) +if(CUDA_STATIC_MATH_LIBRARIES) set(_ctk_static_suffix "_static") endif() From ea6ea26c3533f8c79d56d905ba80f4d4f8517c37 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 15 Jul 2024 15:45:31 -0400 Subject: [PATCH 11/15] Revert "Add CUDA_STATIC_MATH_LIBRARIES" (#192) #190 was supposed to separate static CUDA math libraries from static CUDA runtime library, but accidentally pulled the runtime along with the math libraries. The way we'd normally fix this is by creating a separate variable for the runtime. However, since this project doesn't actually use any math libraries, we can just revert the whole thing. 
Contributes to https://github.com/rapidsai/build-planning/issues/35 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/wholegraph/pull/192 --- ci/build_wheel.sh | 2 +- cpp/CMakeLists.txt | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 9b052a84e..3a59ab481 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -16,7 +16,7 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" cd "${package_dir}" # Hardcode the output dir -SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DBUILD_SHARED_LIBS=OFF;-DCMAKE_MESSAGE_LOG_LEVEL=VERBOSE;-DCUDA_STATIC_RUNTIME=ON;-DCUDA_STATIC_MATH_LIBRARIES=ON;-DWHOLEGRAPH_BUILD_WHEELS=ON" \ +SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DBUILD_SHARED_LIBS=OFF;-DCMAKE_MESSAGE_LOG_LEVEL=VERBOSE;-DCUDA_STATIC_RUNTIME=ON;-DWHOLEGRAPH_BUILD_WHEELS=ON" \ python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check mkdir -p final_dist diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f1ea4fd78..9c364b0f6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -51,8 +51,7 @@ rapids_cmake_write_version_file(include/wholegraph/version_config.hpp) option(BUILD_SHARED_LIBS "Build libwholegraph shared libraries" ON) option(CMAKE_CUDA_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF) option(BUILD_TESTS "Configure CMake to build tests" ON) -option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) -option(CUDA_STATIC_MATH_LIBRARIES "Statically link the CUDA math libraries" OFF) +option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF) option(BUILD_WITH_NVSHMEM "Enable nvshmem integration" OFF) option(WHOLEGRAPH_EXCLUDE_NVSHMEM_FROM_ALL "Exclude nvshmem targets from wholeGraph's 'all' target" ON) option(BUILD_BENCHMARKS "Configure CMake to build benchmark" ON) @@ -60,7 +59,7 @@ option(BUILD_BENCHMARKS "Configure CMake to build benchmark" ON) ############################################################################## # - Set options based on user defined one ----------------------------------- set(_ctk_static_suffix "") -if(CUDA_STATIC_MATH_LIBRARIES) +if(CUDA_STATIC_RUNTIME) set(_ctk_static_suffix "_static") endif() From 5a89834e8e669a040f9c74c5b4bfbd242f5b1662 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 16 Jul 2024 13:55:37 -0400 Subject: [PATCH 12/15] Build and test with CUDA 12.5.1 (#191) This PR updates the latest CUDA build/test version 12.2.2 to 12.5.1. 
Contributes to https://github.com/rapidsai/build-planning/issues/73 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/wholegraph/pull/191 --- .github/workflows/build.yaml | 12 ++++++------ .github/workflows/pr.yaml | 18 +++++++++--------- .github/workflows/test.yaml | 6 +++--- ci/build_wheel.sh | 5 ++++- ...6_64.yaml => all_cuda-125_arch-x86_64.yaml} | 4 ++-- conda/recipes/libwholegraph/meta.yaml | 1 + dependencies.yaml | 6 +++++- 7 files changed, 30 insertions(+), 22 deletions(-) rename conda/environments/{all_cuda-122_arch-x86_64.yaml => all_cuda-125_arch-x86_64.yaml} (94%) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d09ba5a4d..3cbd30cae 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -49,7 +49,7 @@ jobs: if: github.ref_type == 'branch' needs: [python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -62,7 +62,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -70,7 +70,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibwholegraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -80,7 +80,7 @@ jobs: wheel-publish-pylibwholegraph: needs: wheel-build-pylibwholegraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b48246626..eb334f412 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -21,41 +21,41 @@ jobs: - wheel-build-pylibwholegraph - wheel-test-pylibwholegraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1 with: enable_check_generated_files: false conda-cpp-build: needs: checks 
secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request arch: "amd64" @@ -64,14 +64,14 @@ jobs: wheel-build-pylibwholegraph: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: build_type: pull-request script: ci/build_wheel.sh wheel-test-pylibwholegraph: needs: wheel-build-pylibwholegraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: pull-request script: ci/test_wheel.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index eb258f5ae..09ba0a034 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-pytorch-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibwholegraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 3a59ab481..e889c2079 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -20,6 +20,9 @@ SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DBUILD_SHARED_LIBS=OFF;-DCMAKE_MESSA python -m pip wheel . 
-w dist -vvv --no-deps --disable-pip-version-check mkdir -p final_dist -python -m auditwheel repair --exclude libcuda.so.1 -w final_dist dist/* +python -m auditwheel repair \ + --exclude libcuda.so.1 \ + --exclude libnvidia-ml.so.1 \ + -w final_dist dist/* RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml similarity index 94% rename from conda/environments/all_cuda-122_arch-x86_64.yaml rename to conda/environments/all_cuda-125_arch-x86_64.yaml index f57462904..41807e89b 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cuda-cudart-dev - cuda-nvcc - cuda-nvtx -- cuda-version=12.2 +- cuda-version=12.5 - cudnn=8.8 - cxx-compiler - cython>=3.0.0 @@ -47,4 +47,4 @@ dependencies: - sphinx<6 - sphinxcontrib-websupport - sysroot_linux-64=2.17 -name: all_cuda-122_arch-x86_64 +name: all_cuda-125_arch-x86_64 diff --git a/conda/recipes/libwholegraph/meta.yaml b/conda/recipes/libwholegraph/meta.yaml index e4c400e60..2a9468a15 100644 --- a/conda/recipes/libwholegraph/meta.yaml +++ b/conda/recipes/libwholegraph/meta.yaml @@ -55,6 +55,7 @@ requirements: - cudatoolkit {% else %} - cuda-cudart-dev + - cuda-nvml-dev - cuda-driver-dev {% endif %} - cuda-version ={{ cuda_version }} diff --git a/dependencies.yaml b/dependencies.yaml index bc41a9677..6bb940eac 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: [conda] matrix: - cuda: ["11.8", "12.2"] + cuda: ["11.8", "12.5"] arch: [x86_64] includes: - checks @@ -150,6 +150,10 @@ dependencies: cuda: "12.2" packages: - cuda-version=12.2 + - matrix: + cuda: "12.5" + packages: + - cuda-version=12.5 cuda: specific: - output_types: conda From f85ee4356f2e3d42195a2e0a6c7f195154c47091 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 19 Jul 2024 12:15:49 -0400 Subject: [PATCH 13/15] Use workflow branch 24.08 again (#193) After updating everything to CUDA 12.5.1, use `shared-workflows@branch-24.08` again. 
Contributes to https://github.com/rapidsai/build-planning/issues/73 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/wholegraph/pull/193 --- .github/workflows/build.yaml | 12 ++++++------ .github/workflows/pr.yaml | 18 +++++++++--------- .github/workflows/test.yaml | 6 +++--- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3cbd30cae..d09ba5a4d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -49,7 +49,7 @@ jobs: if: github.ref_type == 'branch' needs: [python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -62,7 +62,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -70,7 +70,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibwholegraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -80,7 +80,7 @@ jobs: wheel-publish-pylibwholegraph: needs: wheel-build-pylibwholegraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index eb334f412..b48246626 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -21,41 +21,41 @@ jobs: - wheel-build-pylibwholegraph - wheel-test-pylibwholegraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: 
needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request arch: "amd64" @@ -64,14 +64,14 @@ jobs: wheel-build-pylibwholegraph: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request script: ci/build_wheel.sh wheel-test-pylibwholegraph: needs: wheel-build-pylibwholegraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: pull-request script: ci/test_wheel.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 09ba0a034..eb258f5ae 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-pytorch-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibwholegraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} From 94367fdff5b49a1ceba307fd02d6cc1cc874ff18 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 22 Jul 2024 13:02:07 -0500 Subject: [PATCH 14/15] clarify which dependencies in dependencies.yaml are conda-only (#195) This project has some dependencies in `dependencies.yaml` which are part of groups that have `output_types: requirements`, but which use version specifiers that aren't recognized by `pip`. 
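A minimal sketch of the pattern being applied here (illustrative only — the group name and packages below are taken from this repository's `dependencies.yaml`, but the indentation and surrounding keys are approximate): marking a group's `output_types` as conda-only keeps conda-style specifiers out of the generated pip requirements files.

```yaml
# Hypothetical fragment in the style of dependencies.yaml
# (structure approximate; see the real diff further down).
dependencies:
  pytorch_cpu:
    common:
      - output_types: [conda]   # previously [conda, requirements]
        packages:
          - pytorch=2.0.0       # single `=` pin: valid for conda, rejected by pip
          - cpuonly             # conda metapackage with no pip equivalent
```

The two concrete instances in the real file are linked just below.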
For example, the use of a secondary build component matching a build string (a conda-specific pattern): https://github.com/rapidsai/wholegraph/blob/f85ee4356f2e3d42195a2e0a6c7f195154c47091/dependencies.yaml#L247 And the use of a single `=` pin (not recognized by `pip`) https://github.com/rapidsai/wholegraph/blob/f85ee4356f2e3d42195a2e0a6c7f195154c47091/dependencies.yaml#L288 I believe these were intended to only affect `conda` outputs from `rapids-dependency-file-generator`. This marks them that way. ## Notes for Reviewers I discovered this while running the following `cuda11.8-pip` unified devcontainer from https://github.com/rapidsai/devcontainers. ```shell rapids-make-pip-env --force ``` That resulted in an error like this when `wholegraph` was included. ```text ERROR: Invalid requirement: 'pytorch-cuda=11.8': Expected end or semicolon (after name and no valid version specifier) pytorch-cuda=11.8 ^ (from line 75 of /tmp/rapids.requirements.txt) Hint: = is not a valid operator. Did you mean == ? ``` Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/wholegraph/pull/195 --- dependencies.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 6bb940eac..1ffc3b912 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -231,7 +231,7 @@ dependencies: - pytest-forked - pytest-xdist specific: - - output_types: [conda, requirements] + - output_types: [conda] matrices: - matrix: arch: x86_64 @@ -315,7 +315,7 @@ dependencies: - sphinxcontrib-websupport pytorch_cpu: common: - - output_types: [conda, requirements] + - output_types: [conda] packages: - pytorch=2.0.0 - cpuonly From 563ed8c259ae97dc5c64cb80a2513160bb86e74e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 23 Jul 2024 13:19:44 -0500 Subject: [PATCH 15/15] Add cuda-nvml-dev to dependencies.yaml. (#197) --- conda/environments/all_cuda-125_arch-x86_64.yaml | 1 + dependencies.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 41807e89b..3c8750697 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -14,6 +14,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-cudart-dev - cuda-nvcc +- cuda-nvml-dev - cuda-nvtx - cuda-version=12.5 - cudnn=8.8 diff --git a/dependencies.yaml b/dependencies.yaml index 1ffc3b912..29135fa7f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -182,6 +182,7 @@ dependencies: cuda: "12.*" packages: - cuda-cudart-dev + - cuda-nvml-dev - cuda-nvtx checks: common: