diff --git a/.github/workflows/release_wheels.yml b/.github/workflows/release_wheels.yml
index 9775c09f..292f4dc5 100644
--- a/.github/workflows/release_wheels.yml
+++ b/.github/workflows/release_wheels.yml
@@ -45,6 +45,8 @@ jobs:
 
       # Used to host cibuildwheel
       - uses: actions/setup-python@v2
+        with:
+          python-version: '3.11'
 
       - name: Install cibuildwheel and twine
         run: python -m pip install cibuildwheel==2.12.3
diff --git a/.github/workflows/testing_wheels.yml b/.github/workflows/testing_wheels.yml
index 5ef42431..4d3c1ca3 100644
--- a/.github/workflows/testing_wheels.yml
+++ b/.github/workflows/testing_wheels.yml
@@ -50,6 +50,8 @@ jobs:
 
       # Used to host cibuildwheel
       - uses: actions/setup-python@v2
+        with:
+          python-version: '3.11'
 
       - name: Install cibuildwheel and twine
         run: python -m pip install cibuildwheel==2.12.3
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 615a33db..f902a421 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,13 @@ cmake_minimum_required(VERSION 3.11)
 
 execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc)
 if(has_nvcc STREQUAL "")
-    project(qsim)
+    execute_process(COMMAND which hipcc OUTPUT_VARIABLE has_hipcc)
+    if(has_hipcc STREQUAL "")
+        project(qsim)
+    else()
+        project(qsim LANGUAGES CXX HIP)
+        ADD_SUBDIRECTORY(pybind_interface/hip)
+    endif()
 else()
     project(qsim LANGUAGES CXX CUDA)
     ADD_SUBDIRECTORY(pybind_interface/cuda)
diff --git a/Makefile b/Makefile
index 9ef0afdd..227ca603 100644
--- a/Makefile
+++ b/Makefile
@@ -6,10 +6,12 @@ TESTS = run-cxx-tests
 
 CXX=g++
 NVCC=nvcc
+HIPCC=hipcc
 
 CXXFLAGS = -O3 -fopenmp
 ARCHFLAGS = -march=native
 NVCCFLAGS = -O3
+HIPCCFLAGS = -O3
 
 # CUQUANTUM_ROOT should be set.
 CUSTATEVECFLAGS = -I$(CUQUANTUM_ROOT)/include -L${CUQUANTUM_ROOT}/lib -L$(CUQUANTUM_ROOT)/lib64 -lcustatevec -lcublas
@@ -22,6 +24,8 @@ export ARCHFLAGS
 export NVCC
 export NVCCFLAGS
 export CUSTATEVECFLAGS
+export HIPCC
+export HIPCCFLAGS
 
 ifeq ($(PYBIND11), true)
   TARGETS += pybind
@@ -43,6 +47,10 @@ qsim-cuda:
 qsim-custatevec:
 	$(MAKE) -C apps/ qsim-custatevec
 
+.PHONY: qsim-hip
+qsim-hip:
+	$(MAKE) -C apps/ qsim-hip
+
 .PHONY: pybind
 pybind:
 	$(MAKE) -C pybind_interface/ pybind
@@ -59,6 +67,10 @@ cuda-tests:
 custatevec-tests:
 	$(MAKE) -C tests/ custatevec-tests
 
+.PHONY: hip-tests
+hip-tests:
+	$(MAKE) -C tests/ hip-tests
+
 .PHONY: run-cxx-tests
 run-cxx-tests: cxx-tests
 	$(MAKE) -C tests/ run-cxx-tests
@@ -71,6 +83,10 @@ run-cuda-tests: cuda-tests
 run-custatevec-tests: custatevec-tests
 	$(MAKE) -C tests/ run-custatevec-tests
 
+.PHONY: run-hip-tests
+run-hip-tests: hip-tests
+	$(MAKE) -C tests/ run-hip-tests
+
 PYTESTS = $(shell find qsimcirq_tests/ -name '*_test.py')
 
 .PHONY: run-py-tests
diff --git a/apps/Makefile b/apps/Makefile
index 41fb81e5..48b25cab 100644
--- a/apps/Makefile
+++ b/apps/Makefile
@@ -7,6 +7,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x)
 CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu")
 CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x)
 
+HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu')
+HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x)
+
 .PHONY: qsim
 qsim: $(CXX_TARGETS)
 
@@ -16,6 +19,9 @@ qsim-cuda: $(CUDA_TARGETS)
 .PHONY: qsim-custatevec
 qsim-custatevec: $(CUSTATEVEC_TARGETS)
 
+.PHONY: qsim-hip
+qsim-hip: $(HIP_TARGETS)
+
 %.x: %.cc
 	$(CXX) -o ./$@ $< $(CXXFLAGS) $(ARCHFLAGS)
 
@@ -25,6 +31,9 @@ qsim-custatevec: $(CUSTATEVEC_TARGETS)
 %custatevec.x: %custatevec.cu
 	$(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS)
 
+%hip.x: %cuda.cu
+	$(HIPCC) -o ./$@ $< $(HIPCCFLAGS)
+
 .PHONY: clean
 clean:
 	-rm -f ./*.x ./*.a ./*.so ./*.mod
diff --git a/apps/make.sh b/apps/make.sh
index f2e777e5..5ddd4d6b 100755
--- a/apps/make.sh
+++ b/apps/make.sh
@@ -23,9 +23,15 @@ g++ -O3 -march=native -fopenmp -o qsim_amplitudes.x qsim_amplitudes.cc
 g++ -O3 -march=native -fopenmp -o qsimh_base.x qsimh_base.cc
 g++ -O3 -march=native -fopenmp -o qsimh_amplitudes.x qsimh_amplitudes.cc
 
-nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
-nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
+if command -v nvcc &>/dev/null; then
+    nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
+    nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
 
-# CUQUANTUM_ROOT should be set.
-CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
-nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
+    if [ -n "$CUQUANTUM_ROOT" ]; then
+        CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
+        nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
+    fi
+elif command -v hipcc &>/dev/null; then
+    hipcc -O3 -o qsim_base_hip.x qsim_base_cuda.cu
+    hipcc -O3 -o qsim_qtrajectory_hip.x qsim_qtrajectory_cuda.cu
+fi
diff --git a/docs/_book.yaml b/docs/_book.yaml
index 2fe9a2c8..05c862b3 100644
--- a/docs/_book.yaml
+++ b/docs/_book.yaml
@@ -26,6 +26,8 @@ upper_tabs:
          path: /qsim/tutorials/q32d14
        - title: "Simulate noise"
          path: /qsim/tutorials/noisy_qsimcirq
+       - title: "AMD GPU support"
+         path: /qsim/tutorials/amd_gpu
 
    - name: "Guides"
      contents:
diff --git a/docs/choose_hw.md b/docs/choose_hw.md
index ff692530..96721ed7 100644
--- a/docs/choose_hw.md
+++ b/docs/choose_hw.md
@@ -258,7 +258,7 @@ are “embarrassingly parallelizable”, there is an automated workflow for
 distributing these trajectories over multiple nodes. A simulation of many
 noiseless circuits can also be distributed over multiple compute nodes.
 
-For mor information about running a mulitnode simulation, see [Multinode quantum
+For more information about running a mulitnode simulation, see [Multinode quantum
 simulation using HTCondor on Google Cloud](/qsim/tutorials/multinode).
 
 ## Runtime estimates
diff --git a/docs/tutorials/amd_gpu.md b/docs/tutorials/amd_gpu.md
new file mode 100644
index 00000000..e7c5f613
--- /dev/null
+++ b/docs/tutorials/amd_gpu.md
@@ -0,0 +1,86 @@
+# Support for AMD Instinct™ MI Series Accelerators
+
+qsim provides support for AMD Instinct accelerators.
+The implementation covers the native GPU support in qsim
+by utilizing [AMD HIP SDK](https://rocm.docs.amd.com/projects/HIP)
+(Heterogeneous-Compute Interface for Portability).
+The cuQuantum implementation is currently not covered.
+
+## Building
+
+Building qsim with support for AMD Instinct accelerators requires installation of
+[AMD ROCm™ Open Software Platform](https://www.amd.com/en/developer/resources/rocm-hub.html).
+Instructions for installing ROCm are available at https://rocm.docs.amd.com/.
+
+To enable support for AMD GPUs, qsim needs to be built from sources.
+This can be done as follows:
+
+```
+conda env list
+conda create -y -n CirqDevEnv python=3
+conda activate CirqDevEnv
+pip install pybind11
+
+git clone https://github.com/quantumlib/qsim.git
+cd qsim
+
+make -j qsim      # to build CPU qsim
+make -j qsim-hip  # to build HIP qsim
+make -j pybind    # to build Python bindings
+make -j cxx-tests # to build CPU tests
+make -j hip-tests # to build HIP tests
+
+pip install .
+```
+
+Note: To avoid problems when building qsim with support for AMD GPUs,
+make sure to use the latest version of CMake.
+
+## Testing
+
+### Simulator
+
+To test the qsim simulator:
+
+```
+make run-cxx-tests # to run CPU tests
+make run-hip-tests # to run HIP tests
+```
+
+or
+
+```
+cd tests
+for file in *.x; do ./"$file"; done          # to run all tests
+for file in *_hip_test.x; do ./"$file"; done # to run HIP tests only
+```
+
+### Python Bindings
+
+To test the Python bindings:
+
+```
+make run-py-tests
+```
+
+or
+
+```
+cd qsimcirq_tests
+python3 -m pytest -v qsimcirq_test.py
+```
+
+## Using
+
+Using qsim on AMD Instinct GPUs is identical to using it on NVIDIA GPUs.
+I.e., it is done by passing `use_gpu=True` and `gpu_mode=0` as `qsimcirq.QSimOptions`:
+
+```
+simulator = qsimcirq.QSimSimulator(qsim_options=qsimcirq.QSimOptions(
+        use_gpu=True,
+        gpu_mode=0,
+        ...
+    ))
+```
+
+Note: `gpu_mode` has to be set to zero for AMD GPUs, as cuStateVec is not supported.
diff --git a/docs/tutorials/gcp_gpu.md b/docs/tutorials/gcp_gpu.md
index 09a642e7..f79a5be8 100644
--- a/docs/tutorials/gcp_gpu.md
+++ b/docs/tutorials/gcp_gpu.md
@@ -34,7 +34,7 @@ instance section, ensure that your VM has the following properties:
 *   In the **Boot disk** section, click the **Change** button:
     1.   In the **Operating System** option, choose **Ubuntu**.
     2.   In the **Version** option, choose **20.04 LTS**.
-    3.   In the **Size** field, enter **30** (minimum).
+    3.   In the **Size** field, enter **100** (minimum known to be sufficient).
 *   The instructions above override steps 3 through 5 in the [Create a Linux VM
     instance](https://cloud.google.com/compute/docs/quickstart-linux)
     Quickstart.
diff --git a/lib/cuda2hip.h b/lib/cuda2hip.h
new file mode 100644
index 00000000..da2d074c
--- /dev/null
+++ b/lib/cuda2hip.h
@@ -0,0 +1,61 @@
+// Copyright 2023 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUDA2HIP_H_
+#define SIMULATOR_CUDA2HIP_H_
+
+#define cublasCaxpy              hipblasCaxpy
+#define cublasCdotc              hipblasCdotc
+#define cublasCreate             hipblasCreate
+#define cublasCscal              hipblasCscal
+#define cublasCsscal             hipblasCsscal
+#define cublasDestroy            hipblasDestroy
+#define cublasDznrm2             hipblasDznrm2
+#define cublasHandle_t           hipblasHandle_t
+#define cublasScnrm2             hipblasScnrm2
+#define CUBLAS_STATUS_SUCCESS    HIPBLAS_STATUS_SUCCESS
+#define cublasStatus_t           hipblasStatus_t
+#define cublasZaxpy              hipblasZaxpy
+#define cublasZdotc              hipblasZdotc
+#define cublasZdscal             hipblasZdscal
+#define cublasZscal              hipblasZscal
+#define cuCimagf                 hipCimagf
+#define cuCimag                  hipCimag
+#define cuComplex                hipComplex
+#define cuCrealf                 hipCrealf
+#define cuCreal                  hipCreal
+#define CUDA_C_32F               HIPBLAS_C_32F
+#define CUDA_C_64F               HIPBLAS_C_64F
+#define cudaDeviceSynchronize    hipDeviceSynchronize
+#define cudaError_t              hipError_t
+#define cudaFree                 hipFree
+#define cudaGetErrorString       hipGetErrorString
+#define cudaMalloc               hipMalloc
+#define cudaMemcpyAsync          hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost   hipMemcpyDeviceToHost
+#define cudaMemcpy               hipMemcpy
+#define cudaMemcpyHostToDevice   hipMemcpyHostToDevice
+#define cudaMemset               hipMemset
+#define cudaPeekAtLastError      hipPeekAtLastError
+#define cudaSuccess              hipSuccess
+#define cuDoubleComplex          hipDoubleComplex
+
+template <typename T>
+__device__ __forceinline__ T __shfl_down_sync(
+    unsigned mask, T var, unsigned int delta, int width = warpSize) {
+  return __shfl_down(var, delta, width);
+}
+
+#endif  // SIMULATOR_CUDA2HIP_H_
diff --git a/lib/fuser_mqubit.h b/lib/fuser_mqubit.h
index aa4e6305..fdbfddb0 100644
--- a/lib/fuser_mqubit.h
+++ b/lib/fuser_mqubit.h
@@ -561,8 +561,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
   static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
                                 std::vector<GateF*>& orphaned_gates,
                                 std::vector<GateFused>& fused_gates) {
-    unsigned count = 0;
-
     for (std::size_t i = 0; i < orphaned_gates.size(); ++i) {
       auto ogate1 = orphaned_gates[i];
 
@@ -575,8 +573,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
 
         if (ogate2->visited == kFinal) continue;
 
-        ++count;
-
         unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size();
 
         if (cur_size <= max_fused_size) {
diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h
index 6510fadf..e21a9d62 100644
--- a/lib/simulator_cuda_kernels.h
+++ b/lib/simulator_cuda_kernels.h
@@ -15,10 +15,15 @@
 #ifndef SIMULATOR_CUDA_KERNELS_H_
 #define SIMULATOR_CUDA_KERNELS_H_
 
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#include "util_cuda.h"
+#ifdef __NVCC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+
+  #include "util_cuda.h"
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 namespace qsim {
 
diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h
index 82f5128b..660db074 100644
--- a/lib/statespace_cuda.h
+++ b/lib/statespace_cuda.h
@@ -15,7 +15,12 @@
 #ifndef STATESPACE_CUDA_H_
 #define STATESPACE_CUDA_H_
 
-#include <cuda.h>
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 #include <algorithm>
 #include <complex>
@@ -102,7 +107,8 @@ class StateSpaceCUDA :
   }
 
   void SetAllZeros(State& state) const {
-    cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type));
+    ErrorCheck(cudaMemset(state.get(), 0,
+               MinSize(state.num_qubits()) * sizeof(fp_type)));
   }
 
   // Uniform superposition.
diff --git a/lib/statespace_cuda_kernels.h b/lib/statespace_cuda_kernels.h
index bcb7fd25..b54ebca9 100644
--- a/lib/statespace_cuda_kernels.h
+++ b/lib/statespace_cuda_kernels.h
@@ -15,7 +15,12 @@
 #ifndef STATESPACE_CUDA_KERNELS_H_
 #define STATESPACE_CUDA_KERNELS_H_
 
-#include <cuda.h>
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 #include "util_cuda.h"
 
diff --git a/lib/util_cuda.h b/lib/util_cuda.h
index 591d852d..5d8cb5df 100644
--- a/lib/util_cuda.h
+++ b/lib/util_cuda.h
@@ -15,7 +15,11 @@
 #ifndef UTIL_CUDA_H_
 #define UTIL_CUDA_H_
 
-#include <cuda.h>
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+#endif
 
 #include <cstdlib>
 
diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h
index 0c1d6167..fd91553d 100644
--- a/lib/vectorspace_cuda.h
+++ b/lib/vectorspace_cuda.h
@@ -15,8 +15,13 @@
 #ifndef VECTORSPACE_CUDA_H_
 #define VECTORSPACE_CUDA_H_
 
-#include <cuda.h>
-#include <cuda_runtime.h>
+#ifdef __NVCC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 #include <memory>
 #include <utility>
@@ -28,7 +33,7 @@ namespace detail {
 inline void do_not_free(void*) {}
 
 inline void free(void* ptr) {
-  cudaFree(ptr);
+  ErrorCheck(cudaFree(ptr));
 }
 
 }  // namespace detail
@@ -114,9 +119,10 @@ class VectorSpaceCUDA {
       return false;
     }
 
-    cudaMemcpy(dest.get(), src.get(),
-               sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-               cudaMemcpyDeviceToDevice);
+    ErrorCheck(
+        cudaMemcpy(dest.get(), src.get(),
+                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                   cudaMemcpyDeviceToDevice));
 
     return true;
   }
@@ -124,9 +130,10 @@ class VectorSpaceCUDA {
   // It is the client's responsibility to make sure that dest has at least
   // Impl::MinSize(src.num_qubits()) elements.
   bool Copy(const Vector& src, fp_type* dest) const {
-    cudaMemcpy(dest, src.get(),
-               sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-               cudaMemcpyDeviceToHost);
+    ErrorCheck(
+        cudaMemcpy(dest, src.get(),
+                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                   cudaMemcpyDeviceToHost));
 
     return true;
   }
@@ -134,9 +141,10 @@ class VectorSpaceCUDA {
   // It is the client's responsibility to make sure that src has at least
   // Impl::MinSize(dest.num_qubits()) elements.
   bool Copy(const fp_type* src, Vector& dest) const {
-    cudaMemcpy(dest.get(), src,
-               sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
-               cudaMemcpyHostToDevice);
+    ErrorCheck(
+        cudaMemcpy(dest.get(), src,
+                   sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
+                   cudaMemcpyHostToDevice));
 
     return true;
   }
@@ -145,12 +153,15 @@ class VectorSpaceCUDA {
   // min(size, Impl::MinSize(dest.num_qubits())) elements.
   bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
     size = std::min(size, Impl::MinSize(dest.num_qubits()));
-    cudaMemcpy(dest.get(), src, sizeof(fp_type) * size, cudaMemcpyHostToDevice);
+    ErrorCheck(
+        cudaMemcpy(dest.get(), src,
+                   sizeof(fp_type) * size,
+                   cudaMemcpyHostToDevice));
     return true;
   }
 
   void DeviceSync() {
-    cudaDeviceSynchronize();
+    ErrorCheck(cudaDeviceSynchronize());
   }
 
  protected:
diff --git a/pybind_interface/Makefile b/pybind_interface/Makefile
index c7c70860..95f64b32 100644
--- a/pybind_interface/Makefile
+++ b/pybind_interface/Makefile
@@ -5,6 +5,7 @@ QSIMLIB_AVX2 = ../qsimcirq/qsim_avx2`python3-config --extension-suffix`
 QSIMLIB_AVX512 = ../qsimcirq/qsim_avx512`python3-config --extension-suffix`
 QSIMLIB_CUDA = ../qsimcirq/qsim_cuda`python3-config --extension-suffix`
 QSIMLIB_CUSTATEVEC = ../qsimcirq/qsim_custatevec`python3-config --extension-suffix`
+QSIMLIB_HIP = ../qsimcirq/qsim_hip`python3-config --extension-suffix`
 QSIMLIB_DECIDE = ../qsimcirq/qsim_decide`python3-config --extension-suffix`
 
 # The flags for the compilation of the simd-specific Pybind11 interfaces
@@ -13,21 +14,29 @@ PYBINDFLAGS_SSE = -msse4.1 -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 -
 PYBINDFLAGS_AVX2 = -mavx2 -mfma -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 --includes`
 PYBINDFLAGS_AVX512 = -mavx512f -mbmi2 -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 --includes`
 
-# The flags for the compilation of GPU-specific Pybind11 interfaces
+# The flags for the compilation of CUDA-specific Pybind11 interfaces
 PYBINDFLAGS_CUDA = -std=c++17 -x cu -Xcompiler "-Wall -shared -fPIC `python3 -m pybind11 --includes`"
 
 # The flags for the compilation of cuStateVec-specific Pybind11 interfaces
 PYBINDFLAGS_CUSTATEVEC = $(CUSTATEVECFLAGS) $(PYBINDFLAGS_CUDA)
 
+# The flags for the compilation of HIP-specific Pybind11 interfaces
+PYBINDFLAGS_HIP = -std=c++17 -Wall -shared -fPIC `python3 -m pybind11 --includes`
+
 # Check for nvcc to decide compilation mode.
 ifeq ($(shell which $(NVCC)),)
+# Check for hipcc to decide compilation mode.
+ifeq ($(shell which $(HIPCC)),)
 pybind: pybind-cpu decide-cpu
 else
+pybind: pybind-hip decide-hip
+endif
+else
 # Check for the cuStateVec library.
 ifeq ($(CUQUANTUM_ROOT),)
-pybind: pybind-cpu pybind-gpu decide-gpu
+pybind: pybind-cpu pybind-cuda decide-cuda
 else
-pybind: pybind-cpu pybind-gpu pybind-custatevec decide-custatevec
+pybind: pybind-cpu pybind-cuda pybind-custatevec decide-custatevec
 endif
 endif
 
@@ -40,14 +49,15 @@ pybind-cpu:
 
 .PHONY: decide-cpu
 decide-cpu:
+	echo "building decide-cpu"
 	$(CXX) decide/decide.cpp -o $(QSIMLIB_DECIDE) $(CXXFLAGS) $(PYBINDFLAGS_BASIC)
 
-.PHONY: pybind-gpu
-pybind-gpu:
+.PHONY: pybind-cuda
+pybind-cuda:
 	$(NVCC) cuda/pybind_main_cuda.cpp -o $(QSIMLIB_CUDA) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA)
 
-.PHONY: decide-gpu
-decide-gpu:
+.PHONY: decide-cuda
+decide-cuda:
 	$(NVCC) decide/decide.cpp -o $(QSIMLIB_DECIDE) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA)
 
 .PHONY: pybind-custatevec
@@ -58,6 +68,18 @@ pybind-custatevec:
 decide-custatevec:
 	$(NVCC) decide/decide.cpp -D__CUSTATEVEC__ -o $(QSIMLIB_DECIDE) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA)
 
+.PHONY: pybind-hip
+pybind-hip:
+	$(HIPCC) basic/pybind_main_basic.cpp -o $(QSIMLIB_BASIC) $(CXXFLAGS) $(PYBINDFLAGS_BASIC)
+	$(HIPCC) sse/pybind_main_sse.cpp -o $(QSIMLIB_SSE) $(CXXFLAGS) $(PYBINDFLAGS_SSE)
+	$(HIPCC) avx2/pybind_main_avx2.cpp -o $(QSIMLIB_AVX2) $(CXXFLAGS) $(PYBINDFLAGS_AVX2)
+	$(HIPCC) avx512/pybind_main_avx512.cpp -o $(QSIMLIB_AVX512) $(CXXFLAGS) $(PYBINDFLAGS_AVX512)
+	$(HIPCC) hip/pybind_main_hip.cpp -o $(QSIMLIB_HIP) $(HIPCCFLAGS) $(PYBINDFLAGS_HIP)
+
+.PHONY: decide-hip
+decide-hip:
+	$(HIPCC) decide/decide.cpp -o $(QSIMLIB_DECIDE) $(HIPCCFLAGS) $(PYBINDFLAGS_HIP)
+
 .PHONY: clean
 clean:
 	-rm -f ./basic/*.x ./basic/*.a ./basic/*.so ./basic/*.mod $(QSIMLIB_BASIC)
@@ -65,5 +87,6 @@ clean:
 	-rm -f ./avx2/*.x ./avx2/*.a ./avx2/*.so ./avx2/*.mod $(QSIMLIB_AVX2)
 	-rm -f ./avx512/*.x ./avx512/*.a ./avx512/*.so ./avx512/*.mod $(QSIMLIB_AVX512)
 	-rm -f ./cuda/*.x ./cuda/*.a ./cuda/*.so ./cuda/*.mod $(QSIMLIB_CUDA)
+	-rm -f ./hip/*.x ./hip/*.a ./hip/*.so ./hip/*.mod $(QSIMLIB_HIP)
 	-rm -f ./custatevec/*.x ./custatevec/*.a ./custatevec/*.so ./custatevec/*.mod $(QSIMLIB_CUSTATEVEC)
 	-rm -f ./decide/*.x ./decide/*.a ./decide/*.so ./decide/*.mod $(QSIMLIB_DECIDE)
diff --git a/pybind_interface/decide/CMakeLists.txt b/pybind_interface/decide/CMakeLists.txt
index 441f3d66..b94b0ade 100644
--- a/pybind_interface/decide/CMakeLists.txt
+++ b/pybind_interface/decide/CMakeLists.txt
@@ -2,7 +2,12 @@ cmake_minimum_required(VERSION 3.11)
 
 execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc)
 if(has_nvcc STREQUAL "")
-  project(qsim)
+  execute_process(COMMAND which hipcc OUTPUT_VARIABLE has_hipcc)
+  if(has_hipcc STREQUAL "")
+    project(qsim)
+  else()
+    project(qsim LANGUAGES CXX HIP)
+  endif()
 else()
   project(qsim LANGUAGES CXX CUDA)
 endif()
@@ -22,7 +27,23 @@ endif()
 INCLUDE(../GetPybind11.cmake)
 
 if(has_nvcc STREQUAL "")
-  pybind11_add_module(qsim_decide decide.cpp)
+  if(has_hipcc STREQUAL "")
+    pybind11_add_module(qsim_decide decide.cpp)
+  else()
+    list(APPEND CMAKE_MODULE_PATH "/opt/rocm/lib/cmake/hip")
+    find_package(HIP REQUIRED)
+    find_package(PythonLibs 3.7 REQUIRED)
+
+    include_directories(${PYTHON_INCLUDE_DIRS} ${pybind11_SOURCE_DIR}/include)
+
+    hip_add_library(qsim_decide MODULE decide.cpp)
+
+    set_target_properties(qsim_decide PROPERTIES
+        PREFIX "${PYTHON_MODULE_PREFIX}"
+        SUFFIX "${PYTHON_MODULE_EXTENSION}"
+    )
+    set_source_files_properties(decide.cpp PROPERTIES LANGUAGE HIP)
+  endif()
 else()
   find_package(PythonLibs 3.7 REQUIRED)
   find_package(CUDA REQUIRED)
diff --git a/pybind_interface/decide/decide.cpp b/pybind_interface/decide/decide.cpp
index 6355ad3b..5e9a63f1 100644
--- a/pybind_interface/decide/decide.cpp
+++ b/pybind_interface/decide/decide.cpp
@@ -60,7 +60,7 @@ int detect_instructions() {
 }
 
 enum GPUCapabilities {
-    CUDA = 0, CUSTATEVEC = 1, NO_GPU = 10, NO_CUSTATEVEC = 11 };
+    CUDA = 0, CUSTATEVEC = 1, HIP = 2, NO_GPU = 10, NO_CUSTATEVEC = 11 };
 
 // For now, GPU detection is performed at compile time, as our wheels are
 // generated on Github Actions runners which do not have GPU support.
@@ -70,6 +70,8 @@ enum GPUCapabilities {
 int detect_gpu() {
   #ifdef __NVCC__
   GPUCapabilities gpu = CUDA;
+  #elif __HIP__
+  GPUCapabilities gpu = HIP;
   #else
   GPUCapabilities gpu = NO_GPU;
   #endif
diff --git a/pybind_interface/hip/CMakeLists.txt b/pybind_interface/hip/CMakeLists.txt
new file mode 100644
index 00000000..fe4b1c54
--- /dev/null
+++ b/pybind_interface/hip/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.18)
+project(qsim LANGUAGES CXX HIP)
+
+IF (WIN32)
+    set(CMAKE_CXX_FLAGS "/O2 /openmp")
+ELSE()
+    set(CMAKE_CXX_FLAGS "-O3 -fopenmp")
+ENDIF()
+
+INCLUDE(../GetPybind11.cmake)
+find_package(PythonLibs 3.7 REQUIRED)
+
+list(APPEND CMAKE_MODULE_PATH "/opt/rocm/lib/cmake/hip")
+find_package(HIP REQUIRED)
+
+include_directories(${PYTHON_INCLUDE_DIRS} ${pybind11_SOURCE_DIR}/include)
+
+hip_add_library(qsim_hip MODULE pybind_main_hip.cpp)
+
+set_target_properties(qsim_hip PROPERTIES
+       PREFIX "${PYTHON_MODULE_PREFIX}"
+       SUFFIX "${PYTHON_MODULE_EXTENSION}"
+)
+set_source_files_properties(pybind_main_hip.cpp PROPERTIES LANGUAGE HIP)
diff --git a/pybind_interface/hip/pybind_main_hip.cpp b/pybind_interface/hip/pybind_main_hip.cpp
new file mode 100644
index 00000000..d6ee181f
--- /dev/null
+++ b/pybind_interface/hip/pybind_main_hip.cpp
@@ -0,0 +1,47 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "pybind_main_hip.h"
+
+#include "../../lib/simulator_cuda.h"
+
+namespace qsim {
+  using Simulator = SimulatorCUDA<float>;
+
+  struct Factory {
+    using Simulator = qsim::Simulator;
+    using StateSpace = Simulator::StateSpace;
+
+    Factory(
+      unsigned num_sim_threads,
+      unsigned num_state_threads,
+      unsigned num_dblocks
+    ) : ss_params{num_state_threads, num_dblocks} {}
+
+    StateSpace CreateStateSpace() const {
+      return StateSpace(ss_params);
+    }
+
+    Simulator CreateSimulator() const {
+      return Simulator();
+    }
+
+    StateSpace::Parameter ss_params;
+  };
+
+  inline void SetFlushToZeroAndDenormalsAreZeros() {}
+  inline void ClearFlushToZeroAndDenormalsAreZeros() {}
+}
+
+#include "../pybind_main.cpp"
diff --git a/pybind_interface/hip/pybind_main_hip.h b/pybind_interface/hip/pybind_main_hip.h
new file mode 100644
index 00000000..55a672d5
--- /dev/null
+++ b/pybind_interface/hip/pybind_main_hip.h
@@ -0,0 +1,17 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "../pybind_main.h"
+
+PYBIND11_MODULE(qsim_hip, m) { GPU_MODULE_BINDINGS }
diff --git a/pybind_interface/pybind_main.cpp b/pybind_interface/pybind_main.cpp
index c5615ebc..63a03e4a 100644
--- a/pybind_interface/pybind_main.cpp
+++ b/pybind_interface/pybind_main.cpp
@@ -377,7 +377,7 @@ std::vector<std::complex<float>> qsim_simulate(const py::dict &options) {
   std::vector<std::complex<float>> amplitudes;
   amplitudes.reserve(bitstrings.size());
 
-  auto measure = [&bitstrings, &circuit, &amplitudes](
+  auto measure = [&bitstrings, &amplitudes](
                      unsigned k, const StateSpace &state_space,
                      const State &state) {
     for (const auto &b : bitstrings) {
@@ -481,7 +481,7 @@ std::vector<std::complex<float>> qtrajectory_simulate(const py::dict &options) {
   Simulator simulator = factory.CreateSimulator();
   StateSpace state_space = factory.CreateStateSpace();
 
-  auto measure = [&bitstrings, &ncircuit, &amplitudes, &state_space](
+  auto measure = [&bitstrings, &amplitudes, &state_space](
                   unsigned k, const State &state, Runner::Stat& stat) {
     for (const auto &b : bitstrings) {
       amplitudes.push_back(state_space.GetAmpl(state, b));
@@ -1082,7 +1082,7 @@ std::vector<unsigned> qtrajectory_sample(const py::dict &options) {
 
   std::vector<std::vector<unsigned>> results;
 
-  auto measure = [&results, &ncircuit, &state_space](
+  auto measure = [&results, &ncircuit](
                   unsigned k, const State& state, Runner::Stat& stat) {
     // Converts stat (which matches the MeasurementResult 'bits' field) into
     // bitstrings matching the MeasurementResult 'bitstring' field.
diff --git a/qsimcirq/__init__.py b/qsimcirq/__init__.py
index 0f022a1a..d9c5504d 100644
--- a/qsimcirq/__init__.py
+++ b/qsimcirq/__init__.py
@@ -34,6 +34,8 @@ def _load_qsim_gpu():
     instr = qsim_decide.detect_gpu()
     if instr == 0:
         qsim_gpu = importlib.import_module("qsimcirq.qsim_cuda")
+    elif instr == 2:
+        qsim_gpu = importlib.import_module("qsimcirq.qsim_hip")
     else:
         qsim_gpu = None
     return qsim_gpu
diff --git a/qsimcirq/_version.py b/qsimcirq/_version.py
index 95385f6b..802744d1 100644
--- a/qsimcirq/_version.py
+++ b/qsimcirq/_version.py
@@ -1,3 +1,3 @@
 """The version number defined here is read automatically in setup.py."""
 
-__version__ = "0.16.3"
+__version__ = "0.18.0"
diff --git a/qsimcirq/qsim_circuit.py b/qsimcirq/qsim_circuit.py
index 0da46be0..9d33881e 100644
--- a/qsimcirq/qsim_circuit.py
+++ b/qsimcirq/qsim_circuit.py
@@ -343,7 +343,6 @@ def __init__(
         cirq_circuit: cirq.Circuit,
         allow_decomposition: bool = False,
     ):
-
         if allow_decomposition:
             super().__init__()
             for moment in cirq_circuit:
@@ -352,6 +351,7 @@ def __init__(
                     self.append(op)
         else:
             super().__init__(cirq_circuit)
+        self._check_for_confusion_matrix()
 
     def __eq__(self, other):
         if not isinstance(other, QSimCircuit):
@@ -364,6 +364,24 @@ def _resolve_parameters_(
     ):
         return QSimCircuit(cirq.resolve_parameters(super(), param_resolver, recursive))
 
+    def _check_for_confusion_matrix(self):
+        """Checks cirq Circuit for Measurement Gates with confusion matrices.
+        Returns:
+            Throws a runtime exception if a MeasurementGate with a confusion matrix is included in the circuit
+        """
+        confusion_maps_on_measurement_gates = [
+            op.gate.confusion_map
+            for _, op, _ in self.findall_operations_with_gate_type(cirq.MeasurementGate)
+            if op.gate.confusion_map
+        ]
+        for confusion_map in confusion_maps_on_measurement_gates:
+            for map_values in confusion_map.values():
+                if map_values:
+                    raise ValueError(
+                        "Confusion Matrices are not currently supported in Qsim. "
+                        "See https://github.com/quantumlib/Cirq/issues/6305 for latest status"
+                    )
+
     def translate_cirq_to_qsim(
         self, qubit_order: cirq.QubitOrderOrList = cirq.QubitOrder.DEFAULT
     ) -> qsim.Circuit:
diff --git a/qsimcirq/qsim_simulator.py b/qsimcirq/qsim_simulator.py
index 1bf0a4d0..2513ba50 100644
--- a/qsimcirq/qsim_simulator.py
+++ b/qsimcirq/qsim_simulator.py
@@ -213,7 +213,7 @@ def _run(
         """Run a simulation, mimicking quantum hardware.
 
         Args:
-            program: The circuit to simulate.
+            circuit: The circuit to simulate.
             param_resolver: Parameters to run with the program.
             repetitions: Number of times to repeat the run.
 
diff --git a/qsimcirq_tests/qsimcirq_test.py b/qsimcirq_tests/qsimcirq_test.py
index e0d19fae..40d0c261 100644
--- a/qsimcirq_tests/qsimcirq_test.py
+++ b/qsimcirq_tests/qsimcirq_test.py
@@ -336,6 +336,19 @@ def test_numpy_params():
     qsim_result = qsim_simulator.simulate_sweep(circuit, params=prs)
 
 
+def test_confusion_matrix_exception():
+    qubit = cirq.LineQubit(0)
+    cmap = {(0,): np.array([[0.8, 0.2], [0.2, 0.8]])}
+    circuit = cirq.Circuit()
+    circuit += cirq.X(qubit)
+    circuit += cirq.MeasurementGate(1, confusion_map=cmap)(qubit)
+    x, y = sympy.Symbol("x"), sympy.Symbol("y")
+    prs = [{x: np.int64(0), y: np.int64(1)}]
+    qsim_simulator = qsimcirq.QSimSimulator()
+    with pytest.raises(ValueError):
+        _ = qsim_simulator.simulate_sweep(circuit, params=prs)
+
+
 def test_invalid_params():
     # Parameters must have numeric values.
     q0 = cirq.LineQubit(0)
@@ -1219,7 +1232,6 @@ def test_multi_qubit_fusion():
 
 @pytest.mark.parametrize("mode", ["noiseless", "noisy"])
 def test_cirq_qsim_simulate_random_unitary(mode: str):
-
     q0, q1 = cirq.LineQubit.range(2)
     options = qsimcirq.QSimOptions(cpu_threads=16, verbosity=0)
     qsimSim = qsimcirq.QSimSimulator(qsim_options=options)
diff --git a/setup.py b/setup.py
index c57a37a9..593620a6 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,7 @@
 import os
 import re
 import sys
+import shutil
 import platform
 import subprocess
 
@@ -63,6 +64,12 @@ def build_extension(self, ext):
                 "-DCMAKE_CXX_COMPILER=/usr/local/opt/llvm/bin/clang++",
             ]
 
+        if shutil.which("hipcc") is not None:
+            cmake_args += [
+                "-DCMAKE_C_COMPILER=hipcc",
+                "-DCMAKE_CXX_COMPILER=hipcc",
+            ]
+
         env = os.environ.copy()
         env["CXXFLAGS"] = '{} -DVERSION_INFO=\\"{}\\"'.format(
             env.get("CXXFLAGS", ""), self.distribution.get_version()
@@ -111,6 +118,7 @@ def build_extension(self, ext):
         CMakeExtension("qsimcirq/qsim_cuda"),
         CMakeExtension("qsimcirq/qsim_custatevec"),
         CMakeExtension("qsimcirq/qsim_decide"),
+        CMakeExtension("qsimcirq/qsim_hip"),
     ],
     cmdclass=dict(build_ext=CMakeBuild),
     zip_safe=False,
diff --git a/tests/Makefile b/tests/Makefile
index f4a37278..6a81e70b 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -14,6 +14,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda_test.cu=%cuda_test.x)
 CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec_test.cu")
 CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec_test.cu=%custatevec_test.x)
 
+HIP_TARGETS = $(shell find . -maxdepth 1 -name "*cuda_test.cu")
+HIP_TARGETS := $(HIP_TARGETS:%cuda_test.cu=%hip_test.x)
+
 GTEST_DIR = $(CURDIR)/googletest/googletest
 GMOCK_DIR = $(CURDIR)/googletest/googlemock
 
@@ -30,6 +33,9 @@ cuda-tests: $(CUDA_TARGETS)
 .PHONY: custatevec-tests
 custatevec-tests: $(CUSTATEVEC_TARGETS)
 
+.PHONY: hip-tests
+hip-tests: $(HIP_TARGETS)
+
 .PHONY: run-cxx-tests
 run-cxx-tests: cxx-tests
 	for exe in $(CXX_TARGETS); do if ! ./$$exe; then exit 1; fi; done
@@ -42,6 +48,10 @@ run-cuda-tests: cuda-tests
 run-custatevec-tests: custatevec-tests
 	for exe in $(CUSTATEVEC_TARGETS); do if ! ./$$exe; then exit 1; fi; done
 
+.PHONY: run-hip-tests
+run-hip-tests: hip-tests
+	for exe in $(HIP_TARGETS); do if ! ./$$exe; then exit 1; fi; done
+
 $(GTEST_DIR)/make:
 	-git submodule update --init --recursive googletest
 	mkdir -p $(GTEST_DIR)/make
@@ -56,6 +66,9 @@ $(GTEST_DIR)/make:
 %custatevec_test.x: %custatevec_test.cu $(GTEST_DIR)/make
 	$(NVCC) -o ./$@ $< $(TESTFLAGS) $(NVCCFLAGS) $(CUSTATEVECFLAGS)
 
+%hip_test.x: %cuda_test.cu $(GTEST_DIR)/make
+	$(HIPCC) -o ./$@ $< $(TESTFLAGS) $(HIPCCFLAGS)
+
 .PHONY: clean
 clean:
 	-rm -f ./*.x ./*.a ./*.so ./*.mod
diff --git a/tests/make.sh b/tests/make.sh
index 890663da..b38742df 100755
--- a/tests/make.sh
+++ b/tests/make.sh
@@ -55,14 +55,22 @@ g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o unitaryspace_basic_test.x
 g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o unitaryspace_sse_test.x unitaryspace_sse_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -o vectorspace_test.x vectorspace_test.cc -lgtest -lpthread
 
-nvcc -O3 -I$path_to_include -L$path_to_lib -o hybrid_cuda_test.x hybrid_cuda_test.cu -lgtest -lpthread
-nvcc -O3 -I$path_to_include -L$path_to_lib -o qtrajectory_cuda_test.x qtrajectory_cuda_test.cu -lgtest -lpthread
-nvcc -O3 -I$path_to_include -L$path_to_lib -o simulator_cuda_test.x simulator_cuda_test.cu -lgtest -lpthread
-nvcc -O3 -I$path_to_include -L$path_to_lib -o statespace_cuda_test.x statespace_cuda_test.cu -lgtest -lpthread
+if command -v nvcc &>/dev/null; then
+    nvcc -O3 -I$path_to_include -L$path_to_lib -o hybrid_cuda_test.x hybrid_cuda_test.cu -lgtest -lpthread
+    nvcc -O3 -I$path_to_include -L$path_to_lib -o qtrajectory_cuda_test.x qtrajectory_cuda_test.cu -lgtest -lpthread
+    nvcc -O3 -I$path_to_include -L$path_to_lib -o simulator_cuda_test.x simulator_cuda_test.cu -lgtest -lpthread
+    nvcc -O3 -I$path_to_include -L$path_to_lib -o statespace_cuda_test.x statespace_cuda_test.cu -lgtest -lpthread
 
-# CUQUANTUM_ROOT should be set.
-CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
-nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o hybrid_custatevec_test.x hybrid_custatevec_test.cu -lgtest -lpthread
-nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o qtrajectory_custatevec_test.x qtrajectory_custatevec_test.cu -lgtest -lpthread
-nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o simulator_custatevec_test.x simulator_custatevec_test.cu -lgtest -lpthread
-nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o statespace_custatevec_test.x statespace_custatevec_test.cu -lgtest -lpthread
+    if [ -n "$CUQUANTUM_ROOT" ]; then
+        CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
+        nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o hybrid_custatevec_test.x hybrid_custatevec_test.cu -lgtest -lpthread
+        nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o qtrajectory_custatevec_test.x qtrajectory_custatevec_test.cu -lgtest -lpthread
+        nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o simulator_custatevec_test.x simulator_custatevec_test.cu -lgtest -lpthread
+        nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o statespace_custatevec_test.x statespace_custatevec_test.cu -lgtest -lpthread
+    fi
+elif command -v hipcc &>/dev/null; then
+    hipcc -O3 -I$path_to_include -L$path_to_lib -o hybrid_hip_test.x hybrid_cuda_test.cu -lgtest -lpthread
+    hipcc -O3 -I$path_to_include -L$path_to_lib -o qtrajectory_hip_test.x qtrajectory_cuda_test.cu -lgtest -lpthread
+    hipcc -O3 -I$path_to_include -L$path_to_lib -o simulator_hip_test.x simulator_cuda_test.cu -lgtest -lpthread
+    hipcc -O3 -I$path_to_include -L$path_to_lib -o statespace_hip_test.x statespace_cuda_test.cu -lgtest -lpthread
+fi