Merge branch 'master' into eltociear-patch-1

quantumlib · Nov 14, 2023 · ba089eb · ba089eb
2 parents 5ba246e + 49e9ba0
commit ba089eb
Show file tree

Hide file tree

Showing 32 changed files with 473 additions and 61 deletions.
diff --git a/.github/workflows/release_wheels.yml b/.github/workflows/release_wheels.yml
@@ -45,6 +45,8 @@ jobs:
 
       # Used to host cibuildwheel
       - uses: actions/setup-python@v2
+        with:
+          python-version: '3.11'
 
       - name: Install cibuildwheel and twine
         run: python -m pip install cibuildwheel==2.12.3

diff --git a/.github/workflows/testing_wheels.yml b/.github/workflows/testing_wheels.yml
@@ -50,6 +50,8 @@ jobs:
 
       # Used to host cibuildwheel
       - uses: actions/setup-python@v2
+        with:
+          python-version: '3.11'
 
       - name: Install cibuildwheel and twine
         run: python -m pip install cibuildwheel==2.12.3

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2,7 +2,13 @@ cmake_minimum_required(VERSION 3.11)
 
 execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc)
 if(has_nvcc STREQUAL "")
-    project(qsim)
+    # nvcc was not found: probe for AMD's hipcc before settling on a CPU-only
+    # build. An empty result is treated as "hipcc is not available".
+    execute_process(COMMAND which hipcc OUTPUT_VARIABLE has_hipcc)
+    # Expand explicitly and quote the value so that an unset or empty result
+    # compares as "" instead of as the literal string "has_hipcc".
+    if("${has_hipcc}" STREQUAL "")
+        project(qsim)
+    else()
+        # Enables the HIP language and adds the HIP Python bindings.
+        # NOTE(review): HIP as a project language needs CMake >= 3.21, which is
+        # newer than the minimum declared at the top of this file - confirm.
+        project(qsim LANGUAGES CXX HIP)
+        ADD_SUBDIRECTORY(pybind_interface/hip)
+    endif()
 else()
     project(qsim LANGUAGES CXX CUDA)
     ADD_SUBDIRECTORY(pybind_interface/cuda)

diff --git a/Makefile b/Makefile
@@ -6,10 +6,12 @@ TESTS = run-cxx-tests
 
 CXX=g++
 NVCC=nvcc
+HIPCC=hipcc
 
 CXXFLAGS = -O3 -fopenmp
 ARCHFLAGS = -march=native
 NVCCFLAGS = -O3
+HIPCCFLAGS = -O3
 
 # CUQUANTUM_ROOT should be set.
 CUSTATEVECFLAGS = -I$(CUQUANTUM_ROOT)/include -L${CUQUANTUM_ROOT}/lib -L$(CUQUANTUM_ROOT)/lib64 -lcustatevec -lcublas
@@ -22,6 +24,8 @@ export ARCHFLAGS
 export NVCC
 export NVCCFLAGS
 export CUSTATEVECFLAGS
+export HIPCC
+export HIPCCFLAGS
 
 ifeq ($(PYBIND11), true)
   TARGETS += pybind
@@ -43,6 +47,10 @@ qsim-cuda:
 qsim-custatevec:
 	$(MAKE) -C apps/ qsim-custatevec
 
+.PHONY: qsim-hip
+qsim-hip:
+	$(MAKE) -C apps/ qsim-hip
+
 .PHONY: pybind
 pybind:
 	$(MAKE) -C pybind_interface/ pybind
@@ -59,6 +67,10 @@ cuda-tests:
 custatevec-tests:
 	$(MAKE) -C tests/ custatevec-tests
 
+.PHONY: hip-tests
+hip-tests:
+	$(MAKE) -C tests/ hip-tests
+
 .PHONY: run-cxx-tests
 run-cxx-tests: cxx-tests
 	$(MAKE) -C tests/ run-cxx-tests
@@ -71,6 +83,10 @@ run-cuda-tests: cuda-tests
 run-custatevec-tests: custatevec-tests
 	$(MAKE) -C tests/ run-custatevec-tests
 
+.PHONY: run-hip-tests
+run-hip-tests: hip-tests
+	$(MAKE) -C tests/ run-hip-tests
+
 PYTESTS = $(shell find qsimcirq_tests/ -name '*_test.py')
 
 .PHONY: run-py-tests

diff --git a/apps/Makefile b/apps/Makefile
@@ -7,6 +7,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x)
 CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu")
 CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x)
 
+# HIP executables: one per *cuda.cu source found directly in this directory,
+# with the "cuda.cu" suffix rewritten to "hip.x".
+HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu')
+HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x)
+
 .PHONY: qsim
 qsim: $(CXX_TARGETS)
 
@@ -16,6 +19,9 @@ qsim-cuda: $(CUDA_TARGETS)
 .PHONY: qsim-custatevec
 qsim-custatevec: $(CUSTATEVEC_TARGETS)
 
+.PHONY: qsim-hip
+qsim-hip: $(HIP_TARGETS)
+
 %.x: %.cc
 	$(CXX) -o ./$@ $< $(CXXFLAGS) $(ARCHFLAGS)
 
@@ -25,6 +31,9 @@ qsim-custatevec: $(CUSTATEVEC_TARGETS)
 %custatevec.x: %custatevec.cu
 	$(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS)
 
+# Build a HIP executable from the matching CUDA source using $(HIPCC) and
+# $(HIPCCFLAGS).
+%hip.x: %cuda.cu
+	$(HIPCC) -o ./$@ $< $(HIPCCFLAGS)
+
 .PHONY: clean
 clean:
 	-rm -f ./*.x ./*.a ./*.so ./*.mod
diff --git a/apps/make.sh b/apps/make.sh
@@ -23,9 +23,15 @@ g++ -O3 -march=native -fopenmp -o qsim_amplitudes.x qsim_amplitudes.cc
 g++ -O3 -march=native -fopenmp -o qsimh_base.x qsimh_base.cc
 g++ -O3 -march=native -fopenmp -o qsimh_amplitudes.x qsimh_amplitudes.cc
 
-nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
-nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
+if command -v nvcc &>/dev/null; then
+    nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
+    nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
 
-# CUQUANTUM_ROOT should be set.
-CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
-nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
+    if [ -n "$CUQUANTUM_ROOT" ]; then
+        CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
+        nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
+    fi
+elif command -v hipcc &>/dev/null; then
+    hipcc -O3 -o qsim_base_hip.x qsim_base_cuda.cu
+    hipcc -O3 -o qsim_qtrajectory_hip.x qsim_qtrajectory_cuda.cu
+fi
diff --git a/docs/_book.yaml b/docs/_book.yaml
@@ -26,6 +26,8 @@ upper_tabs:
          path: /qsim/tutorials/q32d14
        - title: "Simulate noise"
          path: /qsim/tutorials/noisy_qsimcirq
+       - title: "AMD GPU support"
+         path: /qsim/tutorials/amd_gpu
 
    - name: "Guides"
      contents:

diff --git a/docs/choose_hw.md b/docs/choose_hw.md
@@ -258,7 +258,7 @@ are “embarrassingly parallelizable”, there is an automated workflow for
 distributing these trajectories over multiple nodes. A simulation of many
 noiseless circuits can also be distributed over multiple compute nodes.
 
-For mor information about running a mulitnode simulation, see [Multinode quantum
+For more information about running a multinode simulation, see [Multinode quantum
 simulation using HTCondor on Google Cloud](/qsim/tutorials/multinode).
 
 ## Runtime estimates

diff --git a/docs/tutorials/amd_gpu.md b/docs/tutorials/amd_gpu.md
@@ -0,0 +1,86 @@
+# Support for AMD Instinct™ MI Series Accelerators
+
+qsim provides support for AMD Instinct accelerators.
+The implementation covers the native GPU support in qsim
+by utilizing [AMD HIP SDK](https://rocm.docs.amd.com/projects/HIP)
+(Heterogeneous-Compute Interface for Portability).
+The cuQuantum implementation is currently not covered.
+
+## Building
+
+Building qsim with support for AMD Instinct accelerators requires installation of
+[AMD ROCm™ Open Software Platform](https://www.amd.com/en/developer/resources/rocm-hub.html).
+Instructions for installing ROCm are available at https://rocm.docs.amd.com/.
+
+To enable support for AMD GPUs, qsim needs to be built from sources.
+This can be done as follows:
+
+```
+conda env list
+conda create -y -n CirqDevEnv python=3
+conda activate CirqDevEnv
+pip install pybind11
+
+git clone https://github.com/quantumlib/qsim.git
+cd qsim
+
+make -j qsim      # to build CPU qsim
+make -j qsim-hip  # to build HIP qsim
+make -j pybind    # to build Python bindings
+make -j cxx-tests # to build CPU tests
+make -j hip-tests # to build HIP tests
+
+pip install .
+```
+
+Note: To avoid problems when building qsim with support for AMD GPUs,
+make sure to use a recent version of CMake (the HIP language requires CMake 3.21 or newer).
+
+## Testing
+
+### Simulator
+
+To test the qsim simulator:
+
+```
+make run-cxx-tests # to run CPU tests
+make run-hip-tests # to run HIP tests
+```
+
+or
+
+```
+cd tests
+for file in *.x; do ./"$file"; done          # to run all tests
+for file in *_hip_test.x; do ./"$file"; done # to run HIP tests only
+```
+
+### Python Bindings
+
+To test the Python bindings:
+
+```
+make run-py-tests
+```
+
+or
+
+```
+cd qsimcirq_tests
+python3 -m pytest -v qsimcirq_test.py
+```
+
+## Using
+
+Using qsim on AMD Instinct GPUs is identical to using it on NVIDIA GPUs.
+That is, pass `use_gpu=True` and `gpu_mode=0` in `qsimcirq.QSimOptions`:
+
+```
+simulator = qsimcirq.QSimSimulator(qsim_options=qsimcirq.QSimOptions(
+        use_gpu=True,
+        gpu_mode=0,
+        ...
+    ))
+```
+
+Note: `gpu_mode` has to be set to zero for AMD GPUs, as cuStateVec is not supported.
diff --git a/docs/tutorials/gcp_gpu.md b/docs/tutorials/gcp_gpu.md
@@ -34,7 +34,7 @@ instance section, ensure that your VM has the following properties:
 *   In the **Boot disk** section, click the **Change** button:
     1.   In the **Operating System** option, choose **Ubuntu**.
     2.   In the **Version** option, choose **20.04 LTS**.
-    3.   In the **Size** field, enter **30** (minimum).
+    3.   In the **Size** field, enter **100** (minimum known to be sufficient).
 *   The instructions above override steps 3 through 5 in the [Create a Linux VM
     instance](https://cloud.google.com/compute/docs/quickstart-linux)
     Quickstart.

diff --git a/lib/cuda2hip.h b/lib/cuda2hip.h
@@ -0,0 +1,61 @@
+// Copyright 2023 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUDA2HIP_H_
+#define SIMULATOR_CUDA2HIP_H_
+
+// Maps CUDA, cuBLAS and cuComplex identifiers onto their HIP / hipBLAS
+// counterparts by plain macro substitution, so that sources written against
+// the CUDA names can be compiled for HIP once this header is included.
+#define cublasCaxpy              hipblasCaxpy
+#define cublasCdotc              hipblasCdotc
+#define cublasCreate             hipblasCreate
+#define cublasCscal              hipblasCscal
+#define cublasCsscal             hipblasCsscal
+#define cublasDestroy            hipblasDestroy
+#define cublasDznrm2             hipblasDznrm2
+#define cublasHandle_t           hipblasHandle_t
+#define cublasScnrm2             hipblasScnrm2
+#define CUBLAS_STATUS_SUCCESS    HIPBLAS_STATUS_SUCCESS
+#define cublasStatus_t           hipblasStatus_t
+#define cublasZaxpy              hipblasZaxpy
+#define cublasZdotc              hipblasZdotc
+#define cublasZdscal             hipblasZdscal
+#define cublasZscal              hipblasZscal
+#define cuCimagf                 hipCimagf
+#define cuCimag                  hipCimag
+#define cuComplex                hipComplex
+#define cuCrealf                 hipCrealf
+#define cuCreal                  hipCreal
+#define CUDA_C_32F               HIPBLAS_C_32F
+#define CUDA_C_64F               HIPBLAS_C_64F
+#define cudaDeviceSynchronize    hipDeviceSynchronize
+#define cudaError_t              hipError_t
+#define cudaFree                 hipFree
+#define cudaGetErrorString       hipGetErrorString
+#define cudaMalloc               hipMalloc
+#define cudaMemcpyAsync          hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost   hipMemcpyDeviceToHost
+#define cudaMemcpy               hipMemcpy
+#define cudaMemcpyHostToDevice   hipMemcpyHostToDevice
+#define cudaMemset               hipMemset
+#define cudaPeekAtLastError      hipPeekAtLastError
+#define cudaSuccess              hipSuccess
+#define cuDoubleComplex          hipDoubleComplex
+
+// Shim exposing the CUDA-style __shfl_down_sync signature on HIP.
+// The call is forwarded to __shfl_down; `mask` is accepted only so that
+// existing call sites compile unchanged and is otherwise ignored.
+template <typename T>
+__device__ __forceinline__ T __shfl_down_sync(unsigned mask,
+                                              T var,
+                                              unsigned int delta,
+                                              int width = warpSize) {
+  return __shfl_down(var, delta, width);
+}
+
+#endif  // SIMULATOR_CUDA2HIP_H_
diff --git a/lib/fuser_mqubit.h b/lib/fuser_mqubit.h
@@ -561,8 +561,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
   static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
                                 std::vector<GateF*>& orphaned_gates,
                                 std::vector<GateFused>& fused_gates) {
-    unsigned count = 0;
-
     for (std::size_t i = 0; i < orphaned_gates.size(); ++i) {
       auto ogate1 = orphaned_gates[i];
 
@@ -575,8 +573,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
 
         if (ogate2->visited == kFinal) continue;
 
-        ++count;
-
         unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size();
 
         if (cur_size <= max_fused_size) {

diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h
@@ -15,10 +15,15 @@
 #ifndef SIMULATOR_CUDA_KERNELS_H_
 #define SIMULATOR_CUDA_KERNELS_H_
 
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#include "util_cuda.h"
+#ifdef __NVCC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+
+  #include "util_cuda.h"
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 namespace qsim {
 

diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h
@@ -15,7 +15,12 @@
 #ifndef STATESPACE_CUDA_H_
 #define STATESPACE_CUDA_H_
 
-#include <cuda.h>
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 #include <algorithm>
 #include <complex>
@@ -102,7 +107,8 @@ class StateSpaceCUDA :
   }
 
   void SetAllZeros(State& state) const {
-    cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type));
+    ErrorCheck(cudaMemset(state.get(), 0,
+               MinSize(state.num_qubits()) * sizeof(fp_type)));
   }
 
   // Uniform superposition.

diff --git a/lib/statespace_cuda_kernels.h b/lib/statespace_cuda_kernels.h
@@ -15,7 +15,12 @@
 #ifndef STATESPACE_CUDA_KERNELS_H_
 #define STATESPACE_CUDA_KERNELS_H_
 
-#include <cuda.h>
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 #include "util_cuda.h"