From c3c50aa2e6936d943b9b86bbfad632ce502abddf Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 6 Jul 2023 18:34:55 -0700 Subject: [PATCH 01/43] add cpp layer --- .../src/simulator/StateVectorCudaMPI.hpp | 351 ++++++++++++++++++ .../mpi/Test_StateVectorCudaMPI_NonParam.cpp | 56 +++ .../src/util/DataBuffer.hpp | 4 +- .../src/util/MPIManager.hpp | 44 +++ 4 files changed, 453 insertions(+), 2 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 53f8e9db..97986028 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -64,6 +64,20 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, const size_t index, bool async, cudaStream_t stream_id); +template struct CSRMatrix { + std::vector> values; + std::vector columns; + std::vector csrOffsets; + + CSRMatrix(size_t num_rows, size_t nnz) { + values = std::vector>(nnz); + columns = std::vector(nnz, 0); + csrOffsets = std::vector(num_rows + 1, 0); + } + + CSRMatrix(){}; +}; + /** * @brief Managed memory CUDA state-vector class using custateVec backed * gate-calls. @@ -941,6 +955,343 @@ class StateVectorCudaMPI return expect_val; } + template + auto scatterCSRMatrix(std::vector> &matrix, + size_t root) -> CSRMatrix { + // Bcast num_rows and num_cols + size_t local_num_rows; + size_t num_col_blocks; + std::vector nnzs; + + local_num_rows = size_t{1} << this->getNumLocalQubits(); + num_col_blocks = mpi_manager_.getSize(); + + if (mpi_manager_.getRank() == root) { + for (size_t j = 0; j < matrix.size(); j++) { + nnzs.push_back(matrix[j].values.size()); + } + } + + size_t local_nnz = mpi_manager_.scatter(nnzs, 0)[0]; + + CSRMatrix localCSRMatrix(local_num_rows, + local_nnz); + + if (mpi_manager_.getRank() == root) { + localCSRMatrix.values = matrix[0].values; + localCSRMatrix.csrOffsets = matrix[0].csrOffsets; + localCSRMatrix.columns = matrix[0].columns; + } + + for (size_t k = 1; k < num_col_blocks; k++) { + size_t dest = k; + size_t source = 0; + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + mpi_manager_.Send>(matrix[k].values, + dest); + } else if (mpi_manager_.getRank() == k && local_nnz) { + mpi_manager_.Recv>( + localCSRMatrix.values, source); + } + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + mpi_manager_.Send(matrix[k].csrOffsets, dest); + } else if (mpi_manager_.getRank() == k && local_nnz) { + mpi_manager_.Recv(localCSRMatrix.csrOffsets, + source); + } + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + mpi_manager_.Send(matrix[k].columns, dest); + } else if (mpi_manager_.getRank() == k && local_nnz) { + mpi_manager_.Recv(localCSRMatrix.columns, source); + } + + /* + int sendtag = 0; + int recvtag = 0; + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + MPI_Send(matrix[k].values.data(), matrix[k].values.size(), + MPI_C_DOUBLE_COMPLEX, dest, sendtag, + mpi_manager_.getComm()); + } else if (mpi_manager_.getRank() == k && local_nnz) { + MPI_Recv(localCSRMatrix.values.data(), + localCSRMatrix.values.size(), MPI_C_DOUBLE_COMPLEX, + source, recvtag, mpi_manager_.getComm(), + MPI_STATUS_IGNORE); + } + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + MPI_Send(matrix[k].csrOffsets.data(), + matrix[k].csrOffsets.size(), MPI_INT64_T, dest, + sendtag, mpi_manager_.getComm()); + } else if (mpi_manager_.getRank() == k && local_nnz) { + 
MPI_Recv(localCSRMatrix.csrOffsets.data(), + localCSRMatrix.csrOffsets.size(), MPI_INT64_T, source, + recvtag, mpi_manager_.getComm(), MPI_STATUS_IGNORE); + } + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + MPI_Send(matrix[k].columns.data(), matrix[k].columns.size(), + MPI_INT64_T, dest, sendtag, mpi_manager_.getComm()); + } else if (mpi_manager_.getRank() == k && local_nnz) { + MPI_Recv(localCSRMatrix.columns.data(), + localCSRMatrix.columns.size(), MPI_INT64_T, source, + recvtag, mpi_manager_.getComm(), MPI_STATUS_IGNORE); + } + */ + } + return localCSRMatrix; + } + + template + auto splitCSRMatrix(const size_t &num_row_blocks, + const size_t &num_col_blocks, + const index_type *csrOffsets_ptr, + const index_type *columns_ptr, + const std::complex *values_ptr) + -> std::vector>> { + + std::vector>> + splitSparseMatrix( + num_row_blocks, + std::vector>(num_col_blocks)); + + size_t num_rows = size_t{1} << this->getTotalNumQubits(); + size_t row_block_size = size_t{1} << this->getNumLocalQubits(); + size_t col_block_size = row_block_size; + + for (size_t row = 0; row < num_rows; row++) { + for (size_t col_idx = static_cast(csrOffsets_ptr[row]); + col_idx < static_cast(csrOffsets_ptr[row + 1]); + col_idx++) { + + size_t current_global_row = row; + size_t current_global_col = columns_ptr[col_idx]; + std::complex current_val = values_ptr[col_idx]; + + size_t block_row_id = current_global_row / row_block_size; + size_t block_col_id = current_global_col / col_block_size; + + size_t local_row_id = current_global_row % row_block_size; + size_t local_col_id = current_global_col % col_block_size; + + if (splitSparseMatrix[block_row_id][block_col_id] + .csrOffsets.size() == 0) { + splitSparseMatrix[block_row_id][block_col_id].csrOffsets = + std::vector(row_block_size + 1, 0); + } + + splitSparseMatrix[block_row_id][block_col_id] + .csrOffsets[local_row_id + 1]++; + splitSparseMatrix[block_row_id][block_col_id].columns.push_back( + local_col_id); + splitSparseMatrix[block_row_id][block_col_id].values.push_back( + current_val); + } + } + + for (size_t block_row_id = 0; block_row_id < num_row_blocks; + block_row_id++) { + for (size_t block_col_id = 0; block_col_id < num_col_blocks; + block_col_id++) { + for (size_t i0 = 1; + i0 < splitSparseMatrix[block_row_id][block_col_id] + .csrOffsets.size(); + i0++) { + splitSparseMatrix[block_row_id][block_col_id] + .csrOffsets[i0] += + splitSparseMatrix[block_row_id][block_col_id] + .csrOffsets[i0 - 1]; + } + } + } + + return splitSparseMatrix; + } + + /** + * @brief expval(H) calculation with cuSparseSpMV. + * + * @tparam index_type Integer type used as indices of the sparse matrix. + * @param csr_Offsets_ptr Pointer to the array of row offsets of the sparse + * matrix. Array of size csrOffsets_size. + * @param csrOffsets_size Number of Row offsets of the sparse matrix. + * @param columns_ptr Pointer to the array of column indices of the sparse + * matrix. Array of size numNNZ + * @param values_ptr Pointer to the array of the non-zero elements + * @param numNNZ Number of non-zero elements. + * @return auto Expectation value. 
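+ * @note The per-rank partial results are combined with an MPI allreduce (sum), so every rank returns the same expectation value.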
+ */ + template + auto + getExpectationValueOnSparseSpMV(const index_type *csrOffsets_ptr, + const index_type *columns_ptr, + const std::complex *values_ptr) { + const CFP_t alpha = {1.0, 0.0}; + const CFP_t beta = {0.0, 0.0}; + + Precision local_expect = 0; + + auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID(); + auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID(); + + cudaDataType_t data_type; + cusparseIndexType_t compute_type; + + if constexpr (std::is_same_v || + std::is_same_v) { + data_type = CUDA_C_64F; + compute_type = CUSPARSE_INDEX_64I; + } else { + data_type = CUDA_C_32F; + compute_type = CUSPARSE_INDEX_32I; + } + + std::vector>> + csrmatrix_blocks; + + size_t num_col_blocks = mpi_manager_.getSize(); + size_t num_row_blocks = mpi_manager_.getSize(); + + if (mpi_manager_.getRank() == 0) { + csrmatrix_blocks = + splitCSRMatrix(num_col_blocks, num_row_blocks, csrOffsets_ptr, + columns_ptr, values_ptr); + } + + const size_t length_local = size_t{1} << this->getNumLocalQubits(); + + DataBuffer d_tmp{length_local, device_id, stream_id, true}; + DataBuffer d_tmp_res{length_local, device_id, stream_id, + true}; + d_tmp_res.zeroInit(); + + for (size_t i = 0; i < num_row_blocks; i++) { + std::vector> sparsematices_row; + + if (mpi_manager_.getRank() == 0) { + sparsematices_row = csrmatrix_blocks[i]; + } + mpi_manager_.Barrier(); + + auto localCSRMatrix = scatterCSRMatrix(sparsematices_row, 0); + mpi_manager_.Barrier(); + + int64_t num_rows_local = + static_cast(localCSRMatrix.csrOffsets.size() - 1); + int64_t num_cols_local = + static_cast(localCSRMatrix.columns.size()); + int64_t nnz_local = + static_cast(localCSRMatrix.values.size()); + + size_t color = 0; + + if (localCSRMatrix.values.size() != 0) { + d_tmp.zeroInit(); + + DataBuffer d_csrOffsets{ + localCSRMatrix.csrOffsets.size(), device_id, stream_id, + true}; + DataBuffer d_columns{ + localCSRMatrix.columns.size(), device_id, stream_id, true}; + DataBuffer d_values{localCSRMatrix.values.size(), + device_id, stream_id, true}; + + d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.csrOffsets.data(), + localCSRMatrix.csrOffsets.size(), + false); + d_columns.CopyHostDataToGpu(localCSRMatrix.columns.data(), + localCSRMatrix.columns.size(), + false); + d_values.CopyHostDataToGpu(localCSRMatrix.values.data(), + localCSRMatrix.values.size(), false); + + // CUSPARSE APIs + cusparseSpMatDescr_t mat; + cusparseDnVecDescr_t vecX, vecY; + + size_t bufferSize = 0; + cusparseHandle_t handle = getCusparseHandle(); + + // Create sparse matrix A in CSR format + PL_CUSPARSE_IS_SUCCESS(cusparseCreateCsr( + &mat, num_rows_local, num_cols_local, nnz_local, + d_csrOffsets.getData(), d_columns.getData(), + d_values.getData(), compute_type, compute_type, + CUSPARSE_INDEX_BASE_ZERO, data_type)); + + // Create dense vector X + PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( + &vecX, num_cols_local, BaseType::getData(), data_type)); + + // Create dense vector y + PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( + &vecY, num_rows_local, d_tmp.getData(), data_type)); + + // allocate an external buffer if needed + PL_CUSPARSE_IS_SUCCESS(cusparseSpMV_bufferSize( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat, vecX, + &beta, vecY, data_type, CUSPARSE_SPMV_ALG_DEFAULT, + &bufferSize)); + + DataBuffer dBuffer{bufferSize, device_id, + stream_id, true}; + + // execute SpMV + PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat, vecX, + &beta, vecY, data_type, CUSPARSE_SPMV_ALG_DEFAULT, + 
reinterpret_cast(dBuffer.getData()))); + + // destroy matrix/vector descriptors + PL_CUSPARSE_IS_SUCCESS(cusparseDestroySpMat(mat)); + PL_CUSPARSE_IS_SUCCESS(cusparseDestroyDnVec(vecX)); + PL_CUSPARSE_IS_SUCCESS(cusparseDestroyDnVec(vecY)); + + color = 1; + } + + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); + + if (mpi_manager_.getRank() == i) { + color = 1; + } + + auto new_mpi_manager = + mpi_manager_.split(color, mpi_manager_.getRank()); + int reduce_root_rank = -1; + + if (mpi_manager_.getRank() == i) { + reduce_root_rank = new_mpi_manager.getRank(); + } + + mpi_manager_.Bcast(reduce_root_rank, i); + + if (new_mpi_manager.getComm() != MPI_COMM_NULL) { + new_mpi_manager.Reduce(d_tmp.getData(), + d_tmp_res.getData(), length_local, + reduce_root_rank, "sum"); + } + } + + mpi_manager_.Barrier(); + + local_expect = innerProdC_CUDA(d_tmp_res.getData(), BaseType::getData(), + BaseType::getLength(), device_id, + stream_id, getCublasCaller()) + .x; + + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); + + auto expect = mpi_manager_.allreduce(local_expect, "sum"); + return expect; + } + /** * @brief Utility method for probability calculation using given wires. * diff --git a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp index f4664d6a..fb63aa96 100644 --- a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp +++ b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp @@ -926,3 +926,59 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::getExpectationValuePauliWords", CHECK(expval_mpi == Approx(-0.0105768395).margin(1e-7)); } } + +TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_cuSparse", + "[StateVectorCudaMPI_Nonparam]", float, double) { + using PrecisionT = TestType; + using cp_t = std::complex; + MPIManager mpi_manager(MPI_COMM_WORLD); + size_t numqubits = 3; + size_t mpi_buffersize = 1; + + size_t nGlobalIndexBits = + std::bit_width(static_cast(mpi_manager.getSize())) - 1; + size_t nLocalIndexBits = numqubits - nGlobalIndexBits; + mpi_manager.Barrier(); + + std::vector init_sv{{0.0, 0.0}, {0.0, 0.1}, {0.1, 0.1}, {0.1, 0.2}, + {0.2, 0.2}, {0.3, 0.3}, {0.3, 0.4}, {0.4, 0.5}}; + + using index_type = + typename std::conditional::value, int32_t, + int64_t>::type; + + auto local_state = mpi_manager.scatter(init_sv, 0); + + int nDevices = 0; // Number of GPU devices per node + cudaGetDeviceCount(&nDevices); + int deviceId = mpi_manager.getRank() % nDevices; + cudaSetDevice(deviceId); + DevTag dt_local(deviceId, 0); + mpi_manager.Barrier(); + + SECTION("GetExpectionCuSparse") { + index_type csrOffsets[9] = {0, 2, 4, 6, 8, 10, 12, 14, 16}; + index_type columns[16] = {0, 3, 1, 2, 1, 2, 0, 3, + 4, 7, 5, 6, 5, 6, 4, 7}; + + std::complex values[16] = { + {1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, + {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0}, + {1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, + {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0}}; + + // index_type num_csrOffsets = 9; + // index_type nnz = 16; + + StateVectorCudaMPI sv(mpi_manager, dt_local, mpi_buffersize, + nGlobalIndexBits, nLocalIndexBits); + sv.CopyHostDataToGpu(local_state, false); + + auto results = sv.template getExpectationValueOnSparseSpMV( + csrOffsets, columns, values); + + TestType expected = 1; + + CHECK(expected == Approx(results).epsilon(1e-7)); + } +} \ No newline at end of file diff --git 
a/pennylane_lightning_gpu/src/util/DataBuffer.hpp b/pennylane_lightning_gpu/src/util/DataBuffer.hpp index 41afa6e4..3b6d483f 100644 --- a/pennylane_lightning_gpu/src/util/DataBuffer.hpp +++ b/pennylane_lightning_gpu/src/util/DataBuffer.hpp @@ -30,8 +30,8 @@ template class DataBuffer { DataBuffer(std::size_t length, int device_id = 0, cudaStream_t stream_id = 0, bool alloc_memory = true) - : length_{length}, dev_tag_{device_id, stream_id}, gpu_buffer_{ - nullptr} { + : length_{length}, dev_tag_{device_id, stream_id}, + gpu_buffer_{nullptr} { if (alloc_memory && (length > 0)) { dev_tag_.refresh(); PL_CUDA_IS_SUCCESS( diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp index 5fb3f9f5..4e9a7c17 100644 --- a/pennylane_lightning_gpu/src/util/MPIManager.hpp +++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp @@ -424,6 +424,15 @@ class MPIManager final { this->getComm())); } + template + void Reduce(T *sendBuf, T *recvBuf, size_t length, size_t root, + const std::string &op_str) { + MPI_Datatype datatype = getMPIDatatype(); + MPI_Op op = getMPIOpType(op_str); + PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf, recvBuf, length, datatype, op, + root, this->getComm())); + } + template void Gather(T &sendBuf, std::vector &recvBuf, size_t root) { MPI_Datatype datatype = getMPIDatatype(); @@ -533,6 +542,41 @@ class MPIManager final { return recvBuf; } + /** + * @brief MPI_Send wrapper. + * + * @tparam T C++ data type. + * @param sendBuf Send buffer vector. + * @param dest Rank of send dest. + */ + + template void Send(std::vector &sendBuf, size_t dest) { + MPI_Datatype datatype = getMPIDatatype(); + const int tag = 0; + + PL_MPI_IS_SUCCESS(MPI_Send(sendBuf.data(), sendBuf.size(), datatype, + static_cast(dest), tag, + this->getComm())); + } + + /** + * @brief MPI_Recv wrapper. + * + * @tparam T C++ data type. + * @param recvBuf Recv buffer vector. + * @param source Rank of data source. + */ + + template void Recv(std::vector &recvBuf, size_t source) { + MPI_Datatype datatype = getMPIDatatype(); + MPI_Status status; + const int tag = 0; + + PL_MPI_IS_SUCCESS(MPI_Recv(recvBuf.data(), recvBuf.size(), datatype, + static_cast(source), tag, + this->getComm(), &status)); + } + /** * @brief MPI_Sendrecv wrapper. * From d39816191ed797745ef7ce57d293233daf18741d Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 6 Jul 2023 19:11:29 -0700 Subject: [PATCH 02/43] add docstring --- .../src/simulator/StateVectorCudaMPI.hpp | 121 ++++++++++-------- .../mpi/Test_StateVectorCudaMPI_NonParam.cpp | 6 +- 2 files changed, 73 insertions(+), 54 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 97986028..f23d5355 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -955,6 +955,13 @@ class StateVectorCudaMPI return expect_val; } + /** + * @brief Scatter a CSR (Compress Sparse Row) format matrix. + * + * @tparam index_type Integer type used as indices of the sparse matrix. + * @param matrix CSR (Compress Sparse Row) format matrix. + * @param root Root rank of the scatter operation. 
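+ * @return CSRMatrix<Precision, index_type> The local matrix block owned by the calling rank.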
+ */ template auto scatterCSRMatrix(std::vector> &matrix, size_t root) -> CSRMatrix { @@ -1007,45 +1014,25 @@ class StateVectorCudaMPI } else if (mpi_manager_.getRank() == k && local_nnz) { mpi_manager_.Recv(localCSRMatrix.columns, source); } - - /* - int sendtag = 0; - int recvtag = 0; - - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - MPI_Send(matrix[k].values.data(), matrix[k].values.size(), - MPI_C_DOUBLE_COMPLEX, dest, sendtag, - mpi_manager_.getComm()); - } else if (mpi_manager_.getRank() == k && local_nnz) { - MPI_Recv(localCSRMatrix.values.data(), - localCSRMatrix.values.size(), MPI_C_DOUBLE_COMPLEX, - source, recvtag, mpi_manager_.getComm(), - MPI_STATUS_IGNORE); - } - - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - MPI_Send(matrix[k].csrOffsets.data(), - matrix[k].csrOffsets.size(), MPI_INT64_T, dest, - sendtag, mpi_manager_.getComm()); - } else if (mpi_manager_.getRank() == k && local_nnz) { - MPI_Recv(localCSRMatrix.csrOffsets.data(), - localCSRMatrix.csrOffsets.size(), MPI_INT64_T, source, - recvtag, mpi_manager_.getComm(), MPI_STATUS_IGNORE); - } - - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - MPI_Send(matrix[k].columns.data(), matrix[k].columns.size(), - MPI_INT64_T, dest, sendtag, mpi_manager_.getComm()); - } else if (mpi_manager_.getRank() == k && local_nnz) { - MPI_Recv(localCSRMatrix.columns.data(), - localCSRMatrix.columns.size(), MPI_INT64_T, source, - recvtag, mpi_manager_.getComm(), MPI_STATUS_IGNORE); - } - */ } return localCSRMatrix; } + /** + * @brief Convert a global CSR (Compress Sparse Row) format matrix into + * local blocks. This operation should be conducted on the rank 0. + * + * @tparam index_type Integer type used as indices of the sparse matrix. + * @param num_row_blocks Number of local blocks per global row. + * @param num_col_blocks Number of local blocks per global column. + * @param csrOffsets_ptr Pointer to the array of row offsets of the sparse + * matrix. Array of size csrOffsets_size. + * @param columns_ptr Pointer to the array of column indices of the sparse + * matrix. Array of size numNNZ + * @param values_ptr Pointer to the array of the non-zero elements + * + * @return auto A vector of vector of CSRMatrix. + */ template auto splitCSRMatrix(const size_t &num_row_blocks, const size_t &num_col_blocks, @@ -1126,10 +1113,15 @@ class StateVectorCudaMPI * @return auto Expectation value. 
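 * @note The matrix data is only read on rank 0; it is split into local blocks and scattered to the other ranks internally.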
*/ template - auto - getExpectationValueOnSparseSpMV(const index_type *csrOffsets_ptr, - const index_type *columns_ptr, - const std::complex *values_ptr) { + auto getExpectationValueOnSparseSpMV( + const index_type *csrOffsets_ptr, const index_type csrOffsets_size, + const index_type *columns_ptr, + const std::complex *values_ptr, const index_type numNNZ) { + PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == + (size_t{1} << this->getTotalNumQubits()), + "Incorrect size of CSR Offsets."); + PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix."); + const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -1218,33 +1210,60 @@ class StateVectorCudaMPI // Create sparse matrix A in CSR format PL_CUSPARSE_IS_SUCCESS(cusparseCreateCsr( - &mat, num_rows_local, num_cols_local, nnz_local, - d_csrOffsets.getData(), d_columns.getData(), - d_values.getData(), compute_type, compute_type, - CUSPARSE_INDEX_BASE_ZERO, data_type)); + /* cusparseSpMatDescr_t* */ &mat, + /* int64_t */ num_rows_local, + /* int64_t */ num_cols_local, + /* int64_t */ nnz_local, + /* void* */ d_csrOffsets.getData(), + /* void* */ d_columns.getData(), + /* void* */ d_values.getData(), + /* cusparseIndexType_t */ compute_type, + /* cusparseIndexType_t */ compute_type, + /* cusparseIndexBase_t */ CUSPARSE_INDEX_BASE_ZERO, + /* cudaDataType */ data_type)); // Create dense vector X PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( - &vecX, num_cols_local, BaseType::getData(), data_type)); + /* cusparseDnVecDescr_t* */ &vecX, + /* int64_t */ num_cols_local, + /* void* */ BaseType::getData(), + /* cudaDataType */ data_type)); // Create dense vector y PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( - &vecY, num_rows_local, d_tmp.getData(), data_type)); + /* cusparseDnVecDescr_t* */ &vecY, + /* int64_t */ num_rows_local, + /* void* */ d_tmp.getData(), + /* cudaDataType */ data_type)); // allocate an external buffer if needed PL_CUSPARSE_IS_SUCCESS(cusparseSpMV_bufferSize( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat, vecX, - &beta, vecY, data_type, CUSPARSE_SPMV_ALG_DEFAULT, - &bufferSize)); + /* cusparseHandle_t */ handle, + /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* const void* */ &alpha, + /* cusparseSpMatDescr_t */ mat, + /* cusparseDnVecDescr_t */ vecX, + /* const void* */ &beta, + /* cusparseDnVecDescr_t */ vecY, + /* cudaDataType */ data_type, + /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* size_t* */ &bufferSize)); DataBuffer dBuffer{bufferSize, device_id, stream_id, true}; // execute SpMV PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat, vecX, - &beta, vecY, data_type, CUSPARSE_SPMV_ALG_DEFAULT, - reinterpret_cast(dBuffer.getData()))); + /* cusparseHandle_t */ handle, + /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* const void* */ &alpha, + /* cusparseSpMatDescr_t */ mat, + /* cusparseDnVecDescr_t */ vecX, + /* const void* */ &beta, + /* cusparseDnVecDescr_t */ vecY, + /* cudaDataType */ data_type, + /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* void* */ reinterpret_cast(dBuffer.getData()))); // destroy matrix/vector descriptors PL_CUSPARSE_IS_SUCCESS(cusparseDestroySpMat(mat)); diff --git a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp index fb63aa96..dbd735c4 100644 --- a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp +++ 
b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp @@ -967,15 +967,15 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_cuSparse", {1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0}}; - // index_type num_csrOffsets = 9; - // index_type nnz = 16; + index_type num_csrOffsets = 9; + index_type nnz = 16; StateVectorCudaMPI sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits, nLocalIndexBits); sv.CopyHostDataToGpu(local_state, false); auto results = sv.template getExpectationValueOnSparseSpMV( - csrOffsets, columns, values); + csrOffsets, num_csrOffsets, columns, values, nnz); TestType expected = 1; From 087a85ea98d7b692b7af42c9448b3440aa5832d2 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Fri, 7 Jul 2023 19:53:59 -0700 Subject: [PATCH 03/43] add python layer --- mpitests/test_adjoint_jacobian.py | 3 +- mpitests/test_apply.py | 42 +++++++++++++++++++ pennylane_lightning_gpu/lightning_gpu.py | 15 ++++++- .../src/bindings/Bindings.cpp | 20 +++++++++ .../src/simulator/StateVectorCudaMPI.hpp | 10 +++-- 5 files changed, 83 insertions(+), 7 deletions(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 638e9758..20d3c01c 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1043,7 +1043,7 @@ def circuit(params): with pytest.raises((TypeError, ValueError)): j_gpu = qml.jacobian(qnode_gpu)(params) - +''' @pytest.mark.parametrize( "returns", [ @@ -1088,6 +1088,7 @@ def circuit(params): ): j_gpu = qml.jacobian(qnode_gpu)(params) +''' @pytest.mark.parametrize( "obs,obs_type_c64,obs_type_c128", diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py index a08cc2f5..914eeab0 100644 --- a/mpitests/test_apply.py +++ b/mpitests/test_apply.py @@ -553,6 +553,48 @@ def circuit(): assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) +class TestSparseHamiltonianExpval: + def test_sparse_hamiltonian_expectation(self, tol): + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = 3 - num_global_wires + + obs = qml.Identity(0) @ qml.PauliX(1) @ qml.PauliY(2) + obs1 = qml.Identity(1) + Hmat = qml.Hamiltonian([1.0, 1.0], [obs1, obs]).sparse_matrix() + + state_vector = np.array( + [ + 0.0 + 0.0j, + 0.0 + 0.1j, + 0.1 + 0.1j, + 0.1 + 0.2j, + 0.2 + 0.2j, + 0.3 + 0.3j, + 0.3 + 0.4j, + 0.4 + 0.5j, + ], + dtype=np.complex128, + ) + + local_state_vector = np.zeros(1 << num_local_wires).astype(np.complex128) + comm.Scatter(state_vector, local_state_vector, root=0) + + dev_gpumpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=np.complex128) + + dev_gpumpi.syncH2D(local_state_vector) + + H_sparse = qml.SparseHamiltonian(Hmat, wires=range(3)) + + comm.Barrier() + + res = dev_gpumpi.expval(H_sparse) + expected = 1 + + assert np.allclose(res, expected) + + class TestExpval: """Tests that expectation values are properly calculated or that the proper errors are raised.""" diff --git a/pennylane_lightning_gpu/lightning_gpu.py b/pennylane_lightning_gpu/lightning_gpu.py index c931703f..014a6dbb 100644 --- a/pennylane_lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning_gpu/lightning_gpu.py @@ -832,8 +832,19 @@ def expval(self, observable, shot_range=None, bin_size=None): CSR_SparseHamiltonian.data, ) else: - raise RuntimeError( - "LightningGPU-MPI does not currently support SparseHamiltonian." 
+ # Identity for CSR_SparseHamiltonian to pass to processes with rank != 0 to reduce + # host(cpu) memory requirements + obs = qml.Identity(0) + Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix() + H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1)) + CSR_SparseHamiltonian = H_sparse.sparse_matrix().tocsr() + # CSR_SparseHamiltonian for rank == 0 + if self._mpi_manager.getRank() == 0: + CSR_SparseHamiltonian = observable.sparse_matrix().tocsr() + return self._gpu_state.ExpectationValue( + CSR_SparseHamiltonian.indptr, + CSR_SparseHamiltonian.indices, + CSR_SparseHamiltonian.data, ) if observable.name in ["Hamiltonian"]: diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index dc91744c..2b0d677d 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -1352,6 +1352,26 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { }, "Calculate the expectation value of the Hamiltonian observable " "with custatevecComputeExpectation.") + + .def( + "ExpectationValue", + [](StateVectorCudaMPI &sv, + const np_arr_sparse_ind &csrOffsets, + const np_arr_sparse_ind &columns, const np_arr_c values) { + using index_type = typename std::conditional< + std::is_same::value, int32_t, int64_t>::type; + return sv.template getExpectationValueOnSparseSpMV( + static_cast(csrOffsets.request().ptr), + static_cast( + csrOffsets.request() + .size), // num_rows + 1 or csrOffsets + static_cast(columns.request().ptr), // columns + static_cast *>( + values.request().ptr), + static_cast(values.request().size)); // nnz + }, + "Calculate the expectation value of a sparse Hamiltonian.") + .def( "ExpectationValue", [](StateVectorCudaMPI &sv, diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index f23d5355..5cd3d162 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -1117,10 +1117,12 @@ class StateVectorCudaMPI const index_type *csrOffsets_ptr, const index_type csrOffsets_size, const index_type *columns_ptr, const std::complex *values_ptr, const index_type numNNZ) { - PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == - (size_t{1} << this->getTotalNumQubits()), - "Incorrect size of CSR Offsets."); - PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix."); + if (mpi_manager_.getRank() == 0) { + PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == + (size_t{1} << this->getTotalNumQubits()), + "Incorrect size of CSR Offsets."); + PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix."); + } const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; From 2e29ce9f05f31111ab6f1e71cb8092d3eb3e3597 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sat, 8 Jul 2023 15:06:11 -0700 Subject: [PATCH 04/43] add mpi support for spmv adjoint --- mpitests/test_adjoint_jacobian.py | 147 +++++++++ pennylane_lightning_gpu/_serialize.py | 31 +- pennylane_lightning_gpu/lightning_gpu.py | 2 + .../src/algorithms/ObservablesGPUMPI.hpp | 286 +++++++++++++++++- .../src/bindings/Bindings.cpp | 43 +++ 5 files changed, 495 insertions(+), 14 deletions(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 20d3c01c..a67480f9 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -31,6 +31,8 @@ TensorProdObsGPUMPI_C128, HamiltonianGPUMPI_C64, HamiltonianGPUMPI_C128, + 
SparseHamiltonianGPUMPI_C64, + SparseHamiltonianGPUMPI_C128, ) from pennylane_lightning_gpu._serialize import _serialize_ob @@ -1043,6 +1045,7 @@ def circuit(params): with pytest.raises((TypeError, ValueError)): j_gpu = qml.jacobian(qnode_gpu)(params) + ''' @pytest.mark.parametrize( "returns", @@ -1090,6 +1093,145 @@ def circuit(params): ''' + +custom_wires0 = ["alice", 3.14, -1, 0, "bob", "luc"] +""" Important Note: Ensure wires index arranged from high to low + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], [qml.PauliX(wires=custom_wires0[1]) @ qml.PauliY(wires=custom_wires0[0])] + ).sparse_matrix(custom_wires0.[::-1]), + ) +""" + + +@pytest.mark.parametrize( + "returns", + [ + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliZ(wires=custom_wires0[1])] + ).sparse_matrix(custom_wires0[::-1]), + wires=custom_wires0, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [2.0], [qml.PauliX(wires=custom_wires0[2]) @ qml.PauliZ(wires=custom_wires0[0])] + ).sparse_matrix(custom_wires0[::-1]), + wires=custom_wires0, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [1.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliZ(wires=custom_wires0[2])] + ).sparse_matrix(custom_wires0[::-1]), + wires=custom_wires0, + ), + ], +) +def test_adjoint_SparseHamiltonian_custom_wires(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire labels""" + + comm = MPI.COMM_WORLD + dev_gpu = qml.device("lightning.gpu", wires=custom_wires0, mpi=True) + dev_cpu = qml.device("default.qubit", wires=custom_wires0) + + def circuit(params): + circuit_ansatz(params, wires=custom_wires0) + return qml.expval(returns) + + if comm.Get_rank() == 0: + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + else: + params = None + + params = comm.bcast(params, root=0) + + qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="adjoint") + qnode_cpu = qml.QNode(circuit, dev_cpu, diff_method="parameter-shift") + + j_gpu = qml.jacobian(qnode_gpu)(params) + j_cpu = qml.jacobian(qnode_cpu)(params) + + assert np.allclose(j_cpu, j_gpu) + + +""" Important Note: Ensure wires index arranged from high to low + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(1) @ qml.PauliZ(0)], + ).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ) +""" + + +@pytest.mark.parametrize( + "returns", + [ + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliZ(1) @ qml.PauliX(0) @ qml.Identity(2) @ qml.PauliX(4) @ qml.Identity(5)], + ).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(1) @ qml.PauliZ(0)], + ).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(0) @ qml.PauliZ(1)], + ).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian([2.0], [qml.PauliX(1) @ qml.PauliZ(2)]).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian([1.1], [qml.PauliX(2) @ qml.PauliZ(0)]).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ), + ], +) +def test_adjoint_SparseHamiltonian(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire labels""" + + comm = MPI.COMM_WORLD + dev_gpu = qml.device("lightning.gpu", wires=6, mpi=True) + dev_cpu = qml.device("default.qubit", 
wires=6) + + def circuit(params): + circuit_ansatz(params, wires=range(6)) + return qml.expval(returns) + + if comm.Get_rank() == 0: + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + else: + params = None + + params = comm.bcast(params, root=0) + + qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="adjoint") + qnode_cpu = qml.QNode(circuit, dev_cpu, diff_method="parameter-shift") + + j_gpu = qml.jacobian(qnode_gpu)(params) + j_cpu = qml.jacobian(qnode_cpu)(params) + + assert np.allclose(j_cpu, j_gpu) + + @pytest.mark.parametrize( "obs,obs_type_c64,obs_type_c128", [ @@ -1102,6 +1244,11 @@ def circuit(params): HamiltonianGPUMPI_C64, HamiltonianGPUMPI_C128, ), + ( + qml.SparseHamiltonian(qml.Hamiltonian([1], [qml.PauliZ(0)]).sparse_matrix(), wires=[0]), + SparseHamiltonianGPUMPI_C64, + SparseHamiltonianGPUMPI_C128, + ), ], ) @pytest.mark.parametrize("use_csingle", [True, False]) diff --git a/pennylane_lightning_gpu/_serialize.py b/pennylane_lightning_gpu/_serialize.py index e989ed49..7df08516 100644 --- a/pennylane_lightning_gpu/_serialize.py +++ b/pennylane_lightning_gpu/_serialize.py @@ -64,6 +64,8 @@ TensorProdObsGPUMPI_C128, HamiltonianGPUMPI_C64, HamiltonianGPUMPI_C128, + SparseHamiltonianGPUMPI_C64, + SparseHamiltonianGPUMPI_C128, HermitianObsGPUMPI_C64, HermitianObsGPUMPI_C128, ) @@ -117,6 +119,20 @@ def _hamiltonian_ob_dtype(use_csingle, use_mpi: bool): ) +def _sparsehamiltonian_ob_dtype(use_csingle, use_mpi: bool): + if not use_mpi: + return ( + [SparseHamiltonianGPU_C64, np.float32, np.int32] + if use_csingle + else [SparseHamiltonianGPU_C128, np.float64, np.int64] + ) + return ( + [SparseHamiltonianGPUMPI_C64, np.float32, np.int32] + if use_csingle + else [SparseHamiltonianGPUMPI_C128, np.float64, np.int64] + ) + + def _sv_py_dtype(use_csingle, use_mpi: bool): if not use_mpi: return LightningGPU_C64 if use_csingle else LightningGPU_C128 @@ -151,15 +167,8 @@ def _serialize_hamiltonian( return hamiltonian_obs(coeffs, terms) -def _serialize_sparsehamiltonian(ob, wires_map: dict, use_csingle: bool): - if use_csingle: - ctype = np.complex64 - rtype = np.int32 - sparsehamiltonian_obs = SparseHamiltonianGPU_C64 - else: - ctype = np.complex128 - rtype = np.int64 - sparsehamiltonian_obs = SparseHamiltonianGPU_C128 +def _serialize_sparsehamiltonian(ob, wires_map: dict, use_csingle: bool, use_mpi: bool): + sparsehamiltonian_obs, ctype, rtype = _sparsehamiltonian_ob_dtype(use_csingle, use_mpi) spm = ob.sparse_matrix() data = np.array(spm.data).astype(ctype) @@ -214,9 +223,7 @@ def _serialize_ob(ob, wires_map, use_csingle, use_mpi: bool = False, use_splitti elif ob.name == "Hamiltonian": return _serialize_hamiltonian(ob, wires_map, use_csingle, use_mpi, use_splitting) elif ob.name == "SparseHamiltonian": - if use_mpi: - raise TypeError("SparseHamiltonian is not supported for MPI backend.") - return _serialize_sparsehamiltonian(ob, wires_map, use_csingle) + return _serialize_sparsehamiltonian(ob, wires_map, use_csingle, use_mpi) elif isinstance(ob, (PauliX, PauliY, PauliZ, Identity, Hadamard)): return _serialize_named_ob(ob, wires_map, use_csingle, use_mpi) elif ob._pauli_rep is not None: diff --git a/pennylane_lightning_gpu/lightning_gpu.py b/pennylane_lightning_gpu/lightning_gpu.py index 014a6dbb..38c06e10 100644 --- a/pennylane_lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning_gpu/lightning_gpu.py @@ -85,6 +85,8 @@ TensorProdObsGPUMPI_C128, HamiltonianGPUMPI_C64, HamiltonianGPUMPI_C128, + SparseHamiltonianGPUMPI_C64, + SparseHamiltonianGPUMPI_C128, ) 
MPI_SUPPORT = True diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 27c40a8c..d7ad2d56 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -17,6 +17,7 @@ #include #include +#include "MPIManager.hpp" #include "StateVectorCudaMPI.hpp" namespace Pennylane::Algorithms { @@ -362,8 +363,9 @@ class HamiltonianGPUMPI final : public ObservableGPUMPI { for (size_t term_idx = 0; term_idx < coeffs_.size(); term_idx++) { DevTag dt_local(sv.getDataBuffer().getDevTag()); dt_local.refresh(); - StateVectorCudaMPI tmp(dt_local, sv.getNumGlobalQubits(), - sv.getNumLocalQubits(), sv.getData()); + StateVectorCudaMPI tmp( + dt_local, sv.getNumGlobalQubits(), sv.getNumLocalQubits(), + sv.getData()); obs_[term_idx]->applyInPlace(tmp); scaleAndAddC_CUDA(std::complex{coeffs_[term_idx], 0.0}, tmp.getData(), buffer.getData(), tmp.getLength(), @@ -404,4 +406,284 @@ class HamiltonianGPUMPI final : public ObservableGPUMPI { } }; +/** + * @brief Sparse representation of HamiltonianGPUMPI + * + * @tparam T Floating-point precision. + */ +template +class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { + public: + using PrecisionT = T; + // cuSparse required index type + using IdxT = typename std::conditional::value, + int32_t, int64_t>::type; + + private: + std::vector> data_; + std::vector indices_; + std::vector offsets_; + std::vector wires_; + + [[nodiscard]] bool + isEqual(const ObservableGPUMPI &other) const override { + const auto &other_cast = + static_cast &>(other); + + if (data_ != other_cast.data_ || indices_ != other_cast.indices_ || + offsets_ != other_cast.offsets_) { + return false; + } + + return true; + } + + public: + /** + * @brief Create a SparseHamiltonianMPI from data, indices and offsets in + * CSR format. + * + * @param arg1 Arguments to construct data + * @param arg2 Arguments to construct indices + * @param arg3 Arguments to construct offsets + * @param arg4 Arguments to construct wires + */ + template > + SparseHamiltonianGPUMPI(T1 &&arg1, T2 &&arg2, T3 &&arg3, T4 &&arg4) + : data_{std::forward(arg1)}, indices_{std::forward(arg2)}, + offsets_{std::forward(arg3)}, wires_{std::forward(arg4)} { + PL_ASSERT(data_.size() == indices_.size()); + } + + /** + * @brief Convenient wrapper for the constructor as the constructor does not + * convert the std::shared_ptr with a derived class correctly. + * + * This function is useful as std::make_shared does not handle + * brace-enclosed initializer list correctly. + * + * @param arg1 Argument to construct data + * @param arg2 Argument to construct indices + * @param arg3 Argument to construct ofsets + * @param arg4 Argument to construct wires + */ + static auto create(std::initializer_list arg1, + std::initializer_list arg2, + std::initializer_list arg3, + std::initializer_list arg4) + -> std::shared_ptr> { + return std::shared_ptr>( + new SparseHamiltonianGPUMPI{std::move(arg1), std::move(arg2), + std::move(arg3), std::move(arg4)}); + } + + /** + * @brief Updates the statevector SV:->SV', where SV' = a*H*SV, and where H + * is a sparse Hamiltonian. 
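+ * Internally each local block row of H is applied with cusparseSpMV, which computes y = alpha * op(A) * x + beta * y; here alpha = {1.0, 0.0} and beta = {0.0, 0.0}, and the partial products are reduced onto the rank owning that block row.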
+ * + */ + inline void applyInPlace(StateVectorCudaMPI &sv) const override { + PL_ABORT_IF_NOT(wires_.size() == sv.getTotalNumQubits(), + "SparseH wire count does not match state-vector size"); + using CFP_t = typename StateVectorCudaMPI::CFP_t; + const CFP_t alpha = {1.0, 0.0}; + const CFP_t beta = {0.0, 0.0}; + + auto device_id = sv.getDataBuffer().getDevTag().getDeviceID(); + auto stream_id = sv.getDataBuffer().getDevTag().getStreamID(); + + auto mpi_manager = sv.getMPIManager(); + + // clang-format on + cudaDataType_t data_type; + cusparseIndexType_t compute_type; + + if constexpr (std::is_same_v || + std::is_same_v) { + data_type = CUDA_C_64F; + compute_type = CUSPARSE_INDEX_64I; + } else { + data_type = CUDA_C_32F; + compute_type = CUSPARSE_INDEX_32I; + } + + std::vector>> csrmatrix_blocks; + + size_t num_col_blocks = mpi_manager.getSize(); + size_t num_row_blocks = mpi_manager.getSize(); + + if (mpi_manager.getRank() == 0) { + csrmatrix_blocks = sv.template splitCSRMatrix( + num_col_blocks, num_row_blocks, offsets_.data(), + indices_.data(), data_.data()); + } + + const size_t length_local = size_t{1} << sv.getNumLocalQubits(); + + DevTag dt_local(sv.getDataBuffer().getDevTag()); + dt_local.refresh(); + StateVectorCudaMPI d_sv_prime( + dt_local, sv.getNumGlobalQubits(), sv.getNumLocalQubits(), + sv.getData()); + + StateVectorCudaMPI d_tmp(dt_local, sv.getNumGlobalQubits(), + sv.getNumLocalQubits(), + sv.getData()); + + for (size_t i = 0; i < num_row_blocks; i++) { + std::vector> sparsematices_row; + + if (mpi_manager.getRank() == 0) { + sparsematices_row = csrmatrix_blocks[i]; + } + + auto localCSRMatrix = + sv.template scatterCSRMatrix(sparsematices_row, 0); + + int64_t num_rows_local = + static_cast(localCSRMatrix.csrOffsets.size() - 1); + int64_t num_cols_local = + static_cast(localCSRMatrix.columns.size()); + int64_t nnz_local = + static_cast(localCSRMatrix.values.size()); + + size_t color = 0; + + if (localCSRMatrix.values.size() != 0) { + DataBuffer d_csrOffsets{ + localCSRMatrix.csrOffsets.size(), device_id, stream_id, + true}; + DataBuffer d_columns{localCSRMatrix.columns.size(), + device_id, stream_id, true}; + DataBuffer d_values{localCSRMatrix.values.size(), + device_id, stream_id, true}; + + d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.csrOffsets.data(), + localCSRMatrix.csrOffsets.size(), + false); + d_columns.CopyHostDataToGpu(localCSRMatrix.columns.data(), + localCSRMatrix.columns.size(), + false); + d_values.CopyHostDataToGpu(localCSRMatrix.values.data(), + localCSRMatrix.values.size(), false); + + // CUSPARSE APIs + cusparseSpMatDescr_t mat; + cusparseDnVecDescr_t vecX, vecY; + + size_t bufferSize = 0; + cusparseHandle_t handle = sv.getCusparseHandle(); + + // Create sparse matrix A in CSR format + PL_CUSPARSE_IS_SUCCESS(cusparseCreateCsr( + /* cusparseSpMatDescr_t* */ &mat, + /* int64_t */ num_rows_local, + /* int64_t */ num_cols_local, + /* int64_t */ nnz_local, + /* void* */ d_csrOffsets.getData(), + /* void* */ d_columns.getData(), + /* void* */ d_values.getData(), + /* cusparseIndexType_t */ compute_type, + /* cusparseIndexType_t */ compute_type, + /* cusparseIndexBase_t */ CUSPARSE_INDEX_BASE_ZERO, + /* cudaDataType */ data_type)); + + // Create dense vector X + PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( + /* cusparseDnVecDescr_t* */ &vecX, + /* int64_t */ num_cols_local, + /* void* */ sv.getData(), + /* cudaDataType */ data_type)); + + // Create dense vector y + PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( + /* cusparseDnVecDescr_t* */ &vecY, + /* int64_t */ 
num_rows_local, + /* void* */ d_tmp.getData(), + /* cudaDataType */ data_type)); + + // allocate an external buffer if needed + PL_CUSPARSE_IS_SUCCESS(cusparseSpMV_bufferSize( + /* cusparseHandle_t */ handle, + /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* const void* */ &alpha, + /* cusparseSpMatDescr_t */ mat, + /* cusparseDnVecDescr_t */ vecX, + /* const void* */ &beta, + /* cusparseDnVecDescr_t */ vecY, + /* cudaDataType */ data_type, + /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* size_t* */ &bufferSize)); + + DataBuffer dBuffer{bufferSize, device_id, + stream_id, true}; + + // execute SpMV + PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( + /* cusparseHandle_t */ handle, + /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* const void* */ &alpha, + /* cusparseSpMatDescr_t */ mat, + /* cusparseDnVecDescr_t */ vecX, + /* const void* */ &beta, + /* cusparseDnVecDescr_t */ vecY, + /* cudaDataType */ data_type, + /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* void* */ reinterpret_cast(dBuffer.getData()))); + + // destroy matrix/vector descriptors + PL_CUSPARSE_IS_SUCCESS(cusparseDestroySpMat(mat)); + PL_CUSPARSE_IS_SUCCESS(cusparseDestroyDnVec(vecX)); + PL_CUSPARSE_IS_SUCCESS(cusparseDestroyDnVec(vecY)); + + color = 1; + } + + if (mpi_manager.getRank() == i) { + color = 1; + } + + auto new_mpi_manager = + mpi_manager.split(color, mpi_manager.getRank()); + int reduce_root_rank = -1; + + if (mpi_manager.getRank() == i) { + reduce_root_rank = new_mpi_manager.getRank(); + } + + mpi_manager.template Bcast(reduce_root_rank, i); + + if (new_mpi_manager.getComm() != MPI_COMM_NULL) { + new_mpi_manager.template Reduce( + d_tmp.getData(), d_sv_prime.getData(), length_local, + reduce_root_rank, "sum"); + } + } + sv.CopyGpuDataToGpuIn(d_sv_prime.getData(), d_sv_prime.getLength()); + } + + [[nodiscard]] auto getObsName() const -> std::string override { + using Pennylane::Util::operator<<; + std::ostringstream ss; + ss << "SparseHamiltonian: {\n'data' : "; + for (const auto &d : data_) + ss << d; + ss << ",\n'indices' : "; + for (const auto &i : indices_) + ss << i; + ss << ",\n'offsets' : "; + for (const auto &o : offsets_) + ss << o; + ss << "\n}"; + return ss.str(); + } + /** + * @brief Get the wires the observable applies to. 
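+ * @return std::vector<size_t> The wire indices the observable acts on.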
+ */ + [[nodiscard]] auto getWires() const -> std::vector { + return wires_; + }; +}; + } // namespace Pennylane::Algorithms diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index 2b0d677d..bf55653f 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -1590,6 +1590,49 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { }, "Compare two observables"); + class_name = "SparseHamiltonianGPUMPI_C" + bitsize; + using SpIDX = typename SparseHamiltonianGPUMPI::IdxT; + py::class_, + std::shared_ptr>, + ObservableGPUMPI>(m, class_name.c_str(), + py::module_local()) + .def(py::init([](const np_arr_c &data, const np_arr_sparse_ind &indices, + const np_arr_sparse_ind &offsets, + const std::vector &wires) { + const py::buffer_info buffer_data = data.request(); + const auto *data_ptr = + static_cast *>(buffer_data.ptr); + + const py::buffer_info buffer_indices = indices.request(); + const auto *indices_ptr = static_cast(buffer_indices.ptr); + + const py::buffer_info buffer_offsets = offsets.request(); + const auto *offsets_ptr = static_cast(buffer_offsets.ptr); + + return SparseHamiltonianGPUMPI{ + std::vector>( + {data_ptr, data_ptr + data.size()}), + std::vector({indices_ptr, indices_ptr + indices.size()}), + std::vector({offsets_ptr, offsets_ptr + offsets.size()}), + wires}; + })) + .def("__repr__", &SparseHamiltonianGPUMPI::getObsName) + .def("get_wires", &SparseHamiltonianGPUMPI::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const SparseHamiltonianGPUMPI &self, + py::handle other) -> bool { + if (!py::isinstance>( + other)) { + return false; + } + auto other_cast = + other.cast>(); + return self == other_cast; + }, + "Compare two observables"); + //***********************************************************************// // MPI Adj Jac //***********************************************************************// From 608c8e109f50fb314bca46bcd7db62a0a74441e8 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sat, 8 Jul 2023 19:23:06 -0700 Subject: [PATCH 05/43] tidy up code --- mpitests/test_adjoint_jacobian.py | 50 +------------------------------ 1 file changed, 1 insertion(+), 49 deletions(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index a67480f9..a7639dfe 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1046,54 +1046,6 @@ def circuit(params): j_gpu = qml.jacobian(qnode_gpu)(params) -''' -@pytest.mark.parametrize( - "returns", - [ - qml.SparseHamiltonian( - qml.Hamiltonian( - [0.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[1])] - ).sparse_matrix(custom_wires), - wires=custom_wires, - ), - qml.SparseHamiltonian( - qml.Hamiltonian( - [2.0], [qml.PauliX(wires=custom_wires[2]) @ qml.PauliZ(wires=custom_wires[0])] - ).sparse_matrix(custom_wires), - wires=custom_wires, - ), - qml.SparseHamiltonian( - qml.Hamiltonian( - [1.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[2])] - ).sparse_matrix(custom_wires), - wires=custom_wires, - ), - ], -) -def test_failed_adjoint_SparseHamiltonian(returns): - """Integration tests that compare to default.qubit for a large circuit containing parametrized - operations and when using custom wire labels""" - - dev_gpu = qml.device("lightning.gpu", wires=custom_wires, mpi=True) - - def circuit(params): - circuit_ansatz(params, wires=custom_wires) - return 
qml.expval(returns) - - n_params = 30 - np.random.seed(1337) - params = np.random.rand(n_params) - - qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="adjoint") - - with pytest.raises( - RuntimeError, match="LightningGPU-MPI does not currently support SparseHamiltonian." - ): - j_gpu = qml.jacobian(qnode_gpu)(params) - -''' - - custom_wires0 = ["alice", 3.14, -1, 0, "bob", "luc"] """ Important Note: Ensure wires index arranged from high to low qml.SparseHamiltonian( qml.Hamiltonian( [0.1], [qml.PauliX(wires=custom_wires0[1]) @ qml.PauliY(wires=custom_wires0[0])] ).sparse_matrix(custom_wires0.[::-1]), ) """ @pytest.mark.parametrize( "returns", [ qml.SparseHamiltonian( qml.Hamiltonian( - [0.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliZ(wires=custom_wires0[1])] + [0.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliY(wires=custom_wires0[1])] ).sparse_matrix(custom_wires0[::-1]), wires=custom_wires0, ), From 51a71394039e37ea9c1f78f9637238abbb65caff Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sun, 9 Jul 2023 05:12:37 -0700 Subject: [PATCH 06/43] memory optimization --- pennylane_lightning_gpu/_serialize.py | 41 +++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/pennylane_lightning_gpu/_serialize.py b/pennylane_lightning_gpu/_serialize.py index 7df08516..41fab52f 100644 --- a/pennylane_lightning_gpu/_serialize.py +++ b/pennylane_lightning_gpu/_serialize.py @@ -170,14 +170,39 @@ def _serialize_hamiltonian( def _serialize_sparsehamiltonian(ob, wires_map: dict, use_csingle: bool, use_mpi: bool): sparsehamiltonian_obs, ctype, rtype = _sparsehamiltonian_ob_dtype(use_csingle, use_mpi) - spm = ob.sparse_matrix() - data = np.array(spm.data).astype(ctype) - indices = np.array(spm.indices).astype(rtype) - offsets = np.array(spm.indptr).astype(rtype) - - wires = [] - wires_list = ob.wires.tolist() - wires.extend([wires_map[w] for w in wires_list]) + if use_mpi: + mpi_manager_local = MPIManager() + # Only rank 0 needs the full sparse-matrix data + if mpi_manager_local.getRank() == 0: + spm = ob.sparse_matrix() + data = np.array(spm.data).astype(ctype) + indices = np.array(spm.indices).astype(rtype) + offsets = np.array(spm.indptr).astype(rtype) + wires = [] + wires_list = ob.wires.tolist() + wires.extend([wires_map[w] for w in wires_list]) + else: + # Other ranks only need a small non-null placeholder sparse matrix to pass + obs = qml.Identity(0) + Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix() + H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1)) + spm = H_sparse.sparse_matrix() + + data = np.array(spm.data).astype(ctype) + indices = np.array(spm.indices).astype(rtype) + offsets = np.array(spm.indptr).astype(rtype) + + wires = [] + wires_list = ob.wires.tolist() + wires.extend([wires_map[w] for w in wires_list]) + else: + spm = ob.sparse_matrix() + data = np.array(spm.data).astype(ctype) + indices = np.array(spm.indices).astype(rtype) + offsets = np.array(spm.indptr).astype(rtype) + wires = [] + wires_list = ob.wires.tolist() + wires.extend([wires_map[w] for w in wires_list]) return sparsehamiltonian_obs(data, indices, offsets, wires) From d7604bbd8255066192de4815d50993d5e9954593 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sun, 9 Jul 2023 05:17:36 -0700 Subject: [PATCH 07/43] tidy up the code --- .../src/algorithms/ObservablesGPUMPI.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index d7ad2d56..61f6fc9a 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++
b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -484,8 +484,13 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { * */ inline void applyInPlace(StateVectorCudaMPI &sv) const override { - PL_ABORT_IF_NOT(wires_.size() == sv.getTotalNumQubits(), + auto mpi_manager = sv.getMPIManager(); + + if(mpi_manager.getRank() == 0){ + PL_ABORT_IF_NOT(wires_.size() == sv.getTotalNumQubits(), "SparseH wire count does not match state-vector size"); + } + using CFP_t = typename StateVectorCudaMPI::CFP_t; const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -493,8 +498,6 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { auto device_id = sv.getDataBuffer().getDevTag().getDeviceID(); auto stream_id = sv.getDataBuffer().getDevTag().getStreamID(); - auto mpi_manager = sv.getMPIManager(); - // clang-format on cudaDataType_t data_type; cusparseIndexType_t compute_type; From d7604bbd8255066192de4815d50993d5e9954593 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sun, 9 Jul 2023 10:57:14 -0700 Subject: [PATCH 08/43] quick update --- mpitests/test_apply.py | 16 +++++++++------- .../src/simulator/StateVectorCudaMPI.hpp | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py index 914eeab0..58ca1ede 100644 --- a/mpitests/test_apply.py +++ b/mpitests/test_apply.py @@ -554,7 +554,8 @@ def circuit(): class TestSparseHamiltonianExpval: - def test_sparse_hamiltonian_expectation(self, tol): + @pytest.fixture(params=[np.complex64, np.complex128]) + def test_sparse_hamiltonian_expectation(self, tol, request): comm = MPI.COMM_WORLD commSize = comm.Get_size() num_global_wires = commSize.bit_length() - 1 @@ -575,13 +576,13 @@ def test_sparse_hamiltonian_expectation(self, tol): 0.3 + 0.4j, 0.4 + 0.5j, ], - dtype=np.complex128, + dtype=request.params, ) - local_state_vector = np.zeros(1 << num_local_wires).astype(np.complex128) + local_state_vector = np.zeros(1 << num_local_wires).astype(request.params) comm.Scatter(state_vector, local_state_vector, root=0) - dev_gpumpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=np.complex128) + dev_gpumpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=request.params) dev_gpumpi.syncH2D(local_state_vector) @@ -615,6 +616,7 @@ def test_expval_single_wire_no_parameters(self, tol, operation, wires): obs = operation(wires) expval_single_wire_no_param(tol, obs) + @pytest.fixture(params=[np.complex64, np.complex128]) @pytest.mark.parametrize( "obs", [ @@ -626,13 +628,13 @@ def test_expval_single_wire_no_parameters(self, tol, operation, wires): qml.PauliZ(numQubits - 2) @ qml.PauliZ(numQubits - 1), ], ) - def test_expval_multiple_obs(self, obs, tol): + def test_expval_multiple_obs(self, obs, request, tol): """Test expval with Hamiltonian""" num_wires = numQubits comm = MPI.COMM_WORLD - dev_cpu = qml.device("default.qubit", wires=num_wires, c_dtype=np.complex128) - dev_gpumpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=np.complex128) + dev_cpu = qml.device("default.qubit", wires=num_wires, c_dtype=request.params) + dev_gpumpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=request.params) def circuit(): qml.RX(0.4, wires=[0]) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 5cd3d162..1725fcfc 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ 
b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -75,7 +75,7 @@ template <class Precision, class index_type> struct CSRMatrix { csrOffsets = std::vector<index_type>(num_rows + 1, 0); } - CSRMatrix(){}; + CSRMatrix() = default;; }; From 55e505f1e088e3191956e45e388aff86639e46e9 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sun, 9 Jul 2023 11:03:57 -0700 Subject: [PATCH 09/43] make format --- mpitests/test_adjoint_jacobian.py | 2 ++ pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 20d3c01c..b42f9e71 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1043,6 +1043,7 @@ def circuit(params): with pytest.raises((TypeError, ValueError)): j_gpu = qml.jacobian(qnode_gpu)(params) + ''' @pytest.mark.parametrize( "returns", [ qml.SparseHamiltonian( @@ -1090,6 +1091,7 @@ def circuit(params): ''' + @pytest.mark.parametrize( "obs,obs_type_c64,obs_type_c128", [ diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 1725fcfc..0ffb85b5 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -75,7 +75,7 @@ template <class Precision, class index_type> struct CSRMatrix { csrOffsets = std::vector<index_type>(num_rows + 1, 0); } - CSRMatrix() = default;; + CSRMatrix() = default; }; From 2f21e759b580489d14982368ba8241d222d6fd95 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Mon, 10 Jul 2023 07:56:12 -0700 Subject: [PATCH 10/43] Add changelog --- .github/CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index c968f270..38abb29e 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,6 +2,10 @@ ### New features since last release + * Add Sparse Hamiltonian support for expectation value calculation.
+ [(#127)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/127) + + ### Breaking changes ### Improvements @@ -14,6 +18,8 @@ This release contains contributions from (in alphabetical order): +Shuli Shu + --- # Release 0.31.0 From 1a451d2f650ce8bb5286df383f77ae05f7deb689 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Mon, 10 Jul 2023 08:09:50 -0700 Subject: [PATCH 11/43] make format --- pennylane_lightning_gpu/src/util/DataBuffer.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pennylane_lightning_gpu/src/util/DataBuffer.hpp b/pennylane_lightning_gpu/src/util/DataBuffer.hpp index 3b6d483f..41afa6e4 100644 --- a/pennylane_lightning_gpu/src/util/DataBuffer.hpp +++ b/pennylane_lightning_gpu/src/util/DataBuffer.hpp @@ -30,8 +30,8 @@ template class DataBuffer { DataBuffer(std::size_t length, int device_id = 0, cudaStream_t stream_id = 0, bool alloc_memory = true) - : length_{length}, dev_tag_{device_id, stream_id}, - gpu_buffer_{nullptr} { + : length_{length}, dev_tag_{device_id, stream_id}, gpu_buffer_{ + nullptr} { if (alloc_memory && (length > 0)) { dev_tag_.refresh(); PL_CUDA_IS_SUCCESS( From 2fb4c552a3e9c11cbb70fff877234e57665c9bae Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Mon, 10 Jul 2023 08:22:20 -0700 Subject: [PATCH 12/43] Trigger MPI CI From 93011e9ffb53d27849380da481234464e5266feb Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 11 Jul 2023 04:16:35 -0700 Subject: [PATCH 13/43] update base on comments --- .../src/simulator/StateVectorCudaMPI.hpp | 14 +++++++------- .../tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 0ffb85b5..fef5a637 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -65,13 +65,13 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, cudaStream_t stream_id); template struct CSRMatrix { - std::vector> values; std::vector columns; + std::vector> values; std::vector csrOffsets; CSRMatrix(size_t num_rows, size_t nnz) { - values = std::vector>(nnz); columns = std::vector(nnz, 0); + values = std::vector>(nnz); csrOffsets = std::vector(num_rows + 1, 0); } @@ -956,10 +956,10 @@ class StateVectorCudaMPI } /** - * @brief Scatter a CSR (Compress Sparse Row) format matrix. + * @brief Scatter a CSR (Compressed Sparse Row) format matrix. * * @tparam index_type Integer type used as indices of the sparse matrix. - * @param matrix CSR (Compress Sparse Row) format matrix. + * @param matrix CSR (Compressed Sparse Row) format matrix. * @param root Root rank of the scatter operation. 
*/ template @@ -1164,14 +1164,14 @@ class StateVectorCudaMPI d_tmp_res.zeroInit(); for (size_t i = 0; i < num_row_blocks; i++) { - std::vector> sparsematices_row; + std::vector> sparsematrices_row; if (mpi_manager_.getRank() == 0) { - sparsematices_row = csrmatrix_blocks[i]; + sparsematrices_row = csrmatrix_blocks[i]; } mpi_manager_.Barrier(); - auto localCSRMatrix = scatterCSRMatrix(sparsematices_row, 0); + auto localCSRMatrix = scatterCSRMatrix(sparsematrices_row, 0); mpi_manager_.Barrier(); int64_t num_rows_local = diff --git a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp index dbd735c4..26e81aff 100644 --- a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp +++ b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp @@ -977,7 +977,7 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_cuSparse", auto results = sv.template getExpectationValueOnSparseSpMV( csrOffsets, num_csrOffsets, columns, values, nnz); - TestType expected = 1; + TestType expected = 1.0; CHECK(expected == Approx(results).epsilon(1e-7)); } From 50a3d81aea3bb5452aa07d9aa58239f031b2d914 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 11 Jul 2023 11:24:06 -0700 Subject: [PATCH 14/43] add changelog --- .github/CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index c968f270..a7657699 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,6 +2,18 @@ ### New features since last release + * Add sparse Hamiltonian support to multi-node/multi-GPU adjoint methods. + [(#128)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/128) + + Note each MPI process will return the overall result of the adjoint method. 
To ensure wires index arranged in a way aligned with cuQuantum backend, `wires` should be reordered in a descending manner in a python script as follows:
+  ```python
+  qml.SparseHamiltonian(
+      qml.Hamiltonian(
+          [0.1], [qml.PauliX(0) @ qml.PauliY(1)]
+      ).sparse_matrix(wires[::-1]),
+  )
+  ```
+
 ### Breaking changes

 ### Improvements
@@ -14,6 +26,7 @@

 This release contains contributions from (in alphabetical order):

+Shuli Shu

 ---

 # Release 0.31.0

From bebbe2b7dde3fc962203cbe1bcab673cfe61b26e Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Tue, 11 Jul 2023 11:27:56 -0700
Subject: [PATCH 15/43] make format

---
 pennylane_lightning_gpu/_serialize.py          | 4 ++--
 .../src/algorithms/ObservablesGPUMPI.hpp       | 9 +++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/pennylane_lightning_gpu/_serialize.py b/pennylane_lightning_gpu/_serialize.py
index 41fab52f..de108949 100644
--- a/pennylane_lightning_gpu/_serialize.py
+++ b/pennylane_lightning_gpu/_serialize.py
@@ -172,7 +172,7 @@ def _serialize_sparsehamiltonian(ob, wires_map: dict, use_csingle: bool, use_mpi

     if use_mpi:
         mpi_manager_local = MPIManager()
-        #Only root 0 needs the overall sparsematrix data
+        # Only root 0 needs the overall sparsematrix data
         if mpi_manager_local.getRank() == 0:
             spm = ob.sparse_matrix()
             data = np.array(spm.data).astype(ctype)
@@ -182,7 +182,7 @@
             wires_list = ob.wires.tolist()
             wires.extend([wires_map[w] for w in wires_list])
         else:
-            #Other root only needs non-null some sparsematrix data to pass
+            # Other root only needs non-null some sparsematrix data to pass
             obs = qml.Identity(0)
             Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix()
             H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1))
diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
index 61f6fc9a..72513456 100644
--- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
+++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
@@ -486,11 +486,12 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI {
     inline void applyInPlace(StateVectorCudaMPI &sv) const override {
         auto mpi_manager = sv.getMPIManager();

-        if(mpi_manager.getRank() == 0){
-            PL_ABORT_IF_NOT(wires_.size() == sv.getTotalNumQubits(),
-            "SparseH wire count does not match state-vector size");
+        if (mpi_manager.getRank() == 0) {
+            PL_ABORT_IF_NOT(
+                wires_.size() == sv.getTotalNumQubits(),
+                "SparseH wire count does not match state-vector size");
         }
-
+
         using CFP_t = typename StateVectorCudaMPI::CFP_t;
         const CFP_t alpha = {1.0, 0.0};
         const CFP_t beta = {0.0, 0.0};

From 147afbe54e6a87efa32b722bab6a2594f4ff3435 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Tue, 18 Jul 2023 17:45:58 -0700
Subject: [PATCH 16/43] update based on comments

---
 .../src/simulator/StateVectorCudaMPI.hpp       | 32 ++++++------------
 .../src/util/MPIManager.hpp                    | 11 +++++--
 2 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
index fef5a637..4940293b 100644
--- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
@@ -69,11 +69,8 @@ template struct CSRMatrix {
     std::vector> values;
     std::vector csrOffsets;

-    CSRMatrix(size_t num_rows, size_t nnz) {
-        columns = std::vector(nnz, 0);
-        values = 
std::vector>(nnz); - csrOffsets = std::vector(num_rows + 1, 0); - } + CSRMatrix(size_t num_rows, size_t nnz) + : columns(nnz, 0), values(nnz), csrOffsets(num_rows + 1, 0) {} CSRMatrix() = default; }; @@ -966,14 +963,12 @@ class StateVectorCudaMPI auto scatterCSRMatrix(std::vector> &matrix, size_t root) -> CSRMatrix { // Bcast num_rows and num_cols - size_t local_num_rows; - size_t num_col_blocks; + size_t local_num_rows = size_t{1} << this->getNumLocalQubits(); + size_t num_col_blocks = mpi_manager_.getSize(); std::vector nnzs; - local_num_rows = size_t{1} << this->getNumLocalQubits(); - num_col_blocks = mpi_manager_.getSize(); - if (mpi_manager_.getRank() == root) { + nnzs.reserve(matrix.size()); for (size_t j = 0; j < matrix.size(); j++) { nnzs.push_back(matrix[j].values.size()); } @@ -997,21 +992,13 @@ class StateVectorCudaMPI if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { mpi_manager_.Send>(matrix[k].values, dest); + mpi_manager_.Send(matrix[k].csrOffsets, dest); + mpi_manager_.Send(matrix[k].columns, dest); } else if (mpi_manager_.getRank() == k && local_nnz) { mpi_manager_.Recv>( localCSRMatrix.values, source); - } - - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - mpi_manager_.Send(matrix[k].csrOffsets, dest); - } else if (mpi_manager_.getRank() == k && local_nnz) { mpi_manager_.Recv(localCSRMatrix.csrOffsets, source); - } - - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - mpi_manager_.Send(matrix[k].columns, dest); - } else if (mpi_manager_.getRank() == k && local_nnz) { mpi_manager_.Recv(localCSRMatrix.columns, source); } } @@ -1019,7 +1006,7 @@ class StateVectorCudaMPI } /** - * @brief Convert a global CSR (Compress Sparse Row) format matrix into + * @brief Convert a global CSR (Compressed Sparse Row) format matrix into * local blocks. This operation should be conducted on the rank 0. * * @tparam index_type Integer type used as indices of the sparse matrix. 
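For orientation, the block decomposition that `splitCSRMatrix` implements can be sketched in isolation. The following is a minimal, single-process illustration under simplifying assumptions (a square matrix of dimension `nb * bs`, `size_t` indices, `double` precision); `CsrBlock` and `splitCsr` are illustrative names, not part of this library:

```cpp
#include <complex>
#include <cstddef>
#include <vector>

// One CSR block of the nb x nb grid (illustrative stand-in).
struct CsrBlock {
    std::vector<std::size_t> offsets; // bs + 1 row offsets
    std::vector<std::size_t> cols;    // block-local column indices
    std::vector<std::complex<double>> vals;
};

// Split a global CSR matrix (offsets/cols/vals) into an nb x nb grid of
// bs x bs CSR blocks, mirroring the two-pass scheme used above: first route
// every nonzero to its block and count entries per local row, then
// prefix-sum the counts into CSR row offsets.
std::vector<std::vector<CsrBlock>>
splitCsr(const std::vector<std::size_t> &offsets,
         const std::vector<std::size_t> &cols,
         const std::vector<std::complex<double>> &vals, std::size_t nb,
         std::size_t bs) {
    std::vector<std::vector<CsrBlock>> blocks(nb, std::vector<CsrBlock>(nb));
    for (auto &row : blocks)
        for (auto &b : row)
            b.offsets.assign(bs + 1, 0);

    // Pass 1: distribute nonzeros; offsets[r % bs + 1] counts row entries.
    for (std::size_t r = 0; r < nb * bs; r++) {
        for (std::size_t k = offsets[r]; k < offsets[r + 1]; k++) {
            CsrBlock &b = blocks[r / bs][cols[k] / bs];
            b.offsets[r % bs + 1]++;
            b.cols.push_back(cols[k] % bs);
            b.vals.push_back(vals[k]);
        }
    }
    // Pass 2: turn per-row counts into cumulative row offsets.
    for (auto &row : blocks)
        for (auto &b : row)
            for (std::size_t i = 1; i <= bs; i++)
                b.offsets[i] += b.offsets[i - 1];
    return blocks;
}
```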
@@ -1293,8 +1280,7 @@ class StateVectorCudaMPI mpi_manager_.Bcast(reduce_root_rank, i); if (new_mpi_manager.getComm() != MPI_COMM_NULL) { - new_mpi_manager.Reduce(d_tmp.getData(), - d_tmp_res.getData(), length_local, + new_mpi_manager.Reduce(d_tmp, d_tmp_res, length_local, reduce_root_rank, "sum"); } } diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp index 4e9a7c17..025cf6cb 100644 --- a/pennylane_lightning_gpu/src/util/MPIManager.hpp +++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp @@ -31,6 +31,13 @@ #include #include "Error.hpp" +#include "DataBuffer.hpp" + +/// @cond DEV +namespace { +using namespace Pennylane::CUDA; +} // namespace +/// @endcond namespace Pennylane::MPI { inline void errhandler(int errcode, const char *str) { @@ -425,11 +432,11 @@ class MPIManager final { } template - void Reduce(T *sendBuf, T *recvBuf, size_t length, size_t root, + void Reduce(DataBuffer &sendBuf, DataBuffer &recvBuf, size_t length, size_t root, const std::string &op_str) { MPI_Datatype datatype = getMPIDatatype(); MPI_Op op = getMPIOpType(op_str); - PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf, recvBuf, length, datatype, op, + PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf.getData(), recvBuf.getData(), length, datatype, op, root, this->getComm())); } From 5a1705a6f466d14f9bb59828512d1e83e91e618b Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 18 Jul 2023 19:01:07 -0700 Subject: [PATCH 17/43] make format & add to-do work --- .../src/simulator/StateVectorCudaMPI.hpp | 4 +++- pennylane_lightning_gpu/src/util/MPIManager.hpp | 11 ++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 4940293b..985416d7 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -1037,6 +1037,7 @@ class StateVectorCudaMPI size_t row_block_size = size_t{1} << this->getNumLocalQubits(); size_t col_block_size = row_block_size; + // Add OpenMP support here later. for (size_t row = 0; row < num_rows; row++) { for (size_t col_idx = static_cast(csrOffsets_ptr[row]); col_idx < static_cast(csrOffsets_ptr[row + 1]); @@ -1067,6 +1068,7 @@ class StateVectorCudaMPI } } + // Add OpenMP support here later. 
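// Illustrative sketch only (not part of this patch): one way the OpenMP
// note above could be realized later. Each block's prefix sum touches only
// that block's csrOffsets, so the two block loops can be collapsed across
// threads with no shared state:
#pragma omp parallel for collapse(2)
for (size_t block_row_id = 0; block_row_id < num_row_blocks; block_row_id++) {
    for (size_t block_col_id = 0; block_col_id < num_col_blocks;
         block_col_id++) {
        auto &offs = splitSparseMatrix[block_row_id][block_col_id].csrOffsets;
        for (size_t i0 = 1; i0 < offs.size(); i0++) {
            offs[i0] += offs[i0 - 1];
        }
    }
}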
for (size_t block_row_id = 0; block_row_id < num_row_blocks; block_row_id++) { for (size_t block_col_id = 0; block_col_id < num_col_blocks; @@ -2709,4 +2711,4 @@ class StateVectorCudaMPI } }; -}; // namespace Pennylane \ No newline at end of file +}; // namespace Pennylane diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp index 025cf6cb..fd09f122 100644 --- a/pennylane_lightning_gpu/src/util/MPIManager.hpp +++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp @@ -30,8 +30,8 @@ #include #include -#include "Error.hpp" #include "DataBuffer.hpp" +#include "Error.hpp" /// @cond DEV namespace { @@ -432,12 +432,13 @@ class MPIManager final { } template - void Reduce(DataBuffer &sendBuf, DataBuffer &recvBuf, size_t length, size_t root, - const std::string &op_str) { + void Reduce(DataBuffer &sendBuf, DataBuffer &recvBuf, size_t length, + size_t root, const std::string &op_str) { MPI_Datatype datatype = getMPIDatatype(); MPI_Op op = getMPIOpType(op_str); - PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf.getData(), recvBuf.getData(), length, datatype, op, - root, this->getComm())); + PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf.getData(), recvBuf.getData(), + length, datatype, op, root, + this->getComm())); } template From d68281c7d4b1523165880c3500862778415b2e7b Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Fri, 21 Jul 2023 05:52:59 -0700 Subject: [PATCH 18/43] make format --- .../src/simulator/StateVectorCudaMPI.hpp | 10 ++-------- pennylane_lightning_gpu/src/util/MPIManager.hpp | 6 +++--- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 985416d7..a0cb9a72 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -70,7 +70,7 @@ template struct CSRMatrix { std::vector csrOffsets; CSRMatrix(size_t num_rows, size_t nnz) - : columns(nnz, 0), values(nnz), csrOffsets(num_rows + 1, 0) {} + : columns(nnz, 0), values(nnz), csrOffsets(num_rows + 1, 0){}; CSRMatrix() = default; }; @@ -1153,14 +1153,8 @@ class StateVectorCudaMPI d_tmp_res.zeroInit(); for (size_t i = 0; i < num_row_blocks; i++) { - std::vector> sparsematrices_row; - if (mpi_manager_.getRank() == 0) { - sparsematrices_row = csrmatrix_blocks[i]; - } - mpi_manager_.Barrier(); - - auto localCSRMatrix = scatterCSRMatrix(sparsematrices_row, 0); + auto localCSRMatrix = scatterCSRMatrix(csrmatrix_blocks[i], 0); mpi_manager_.Barrier(); int64_t num_rows_local = diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp index fd09f122..7d3135c7 100644 --- a/pennylane_lightning_gpu/src/util/MPIManager.hpp +++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp @@ -560,7 +560,7 @@ class MPIManager final { template void Send(std::vector &sendBuf, size_t dest) { MPI_Datatype datatype = getMPIDatatype(); - const int tag = 0; + const int tag = 6789; PL_MPI_IS_SUCCESS(MPI_Send(sendBuf.data(), sendBuf.size(), datatype, static_cast(dest), tag, @@ -578,7 +578,7 @@ class MPIManager final { template void Recv(std::vector &recvBuf, size_t source) { MPI_Datatype datatype = getMPIDatatype(); MPI_Status status; - const int tag = 0; + const int tag = MPI_ANY_TAG; PL_MPI_IS_SUCCESS(MPI_Recv(recvBuf.data(), recvBuf.size(), datatype, static_cast(source), tag, @@ -740,4 +740,4 @@ class MPIManager final { {cppTypeToString(), MPI_UINT8_T}, 
{cppTypeToString(), MPI_UINT8_T}}; }; -} // namespace Pennylane::MPI \ No newline at end of file +} // namespace Pennylane::MPI From 521f84a74e9ac41822b5875fe04451e049660965 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sun, 23 Jul 2023 23:29:27 -0700 Subject: [PATCH 19/43] sparse matrix struct to class --- .../src/simulator/StateVectorCudaMPI.hpp | 118 ++++++++++-------- 1 file changed, 69 insertions(+), 49 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index a0cb9a72..9433adfb 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -65,14 +65,22 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, cudaStream_t stream_id); template struct CSRMatrix { + private: std::vector columns; std::vector> values; std::vector csrOffsets; + public: CSRMatrix(size_t num_rows, size_t nnz) : columns(nnz, 0), values(nnz), csrOffsets(num_rows + 1, 0){}; CSRMatrix() = default; + + auto getColumns() -> std::vector & { return columns; } + auto getValues() -> std::vector> & { + return values; + } + auto getCsrOffsets() -> std::vector & { return csrOffsets; } }; /** @@ -970,7 +978,7 @@ class StateVectorCudaMPI if (mpi_manager_.getRank() == root) { nnzs.reserve(matrix.size()); for (size_t j = 0; j < matrix.size(); j++) { - nnzs.push_back(matrix[j].values.size()); + nnzs.push_back(matrix[j].getValues().size()); } } @@ -980,26 +988,27 @@ class StateVectorCudaMPI local_nnz); if (mpi_manager_.getRank() == root) { - localCSRMatrix.values = matrix[0].values; - localCSRMatrix.csrOffsets = matrix[0].csrOffsets; - localCSRMatrix.columns = matrix[0].columns; + localCSRMatrix.getValues() = matrix[0].getValues(); + localCSRMatrix.getCsrOffsets() = matrix[0].getCsrOffsets(); + localCSRMatrix.getColumns() = matrix[0].getColumns(); } for (size_t k = 1; k < num_col_blocks; k++) { size_t dest = k; size_t source = 0; - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - mpi_manager_.Send>(matrix[k].values, - dest); - mpi_manager_.Send(matrix[k].csrOffsets, dest); - mpi_manager_.Send(matrix[k].columns, dest); + if (mpi_manager_.getRank() == 0 && matrix[k].getValues().size()) { + mpi_manager_.Send>( + matrix[k].getValues(), dest); + mpi_manager_.Send(matrix[k].getCsrOffsets(), dest); + mpi_manager_.Send(matrix[k].getColumns(), dest); } else if (mpi_manager_.getRank() == k && local_nnz) { mpi_manager_.Recv>( - localCSRMatrix.values, source); - mpi_manager_.Recv(localCSRMatrix.csrOffsets, + localCSRMatrix.getValues(), source); + mpi_manager_.Recv(localCSRMatrix.getCsrOffsets(), + source); + mpi_manager_.Recv(localCSRMatrix.getColumns(), source); - mpi_manager_.Recv(localCSRMatrix.columns, source); } } return localCSRMatrix; @@ -1054,17 +1063,21 @@ class StateVectorCudaMPI size_t local_col_id = current_global_col % col_block_size; if (splitSparseMatrix[block_row_id][block_col_id] - .csrOffsets.size() == 0) { - splitSparseMatrix[block_row_id][block_col_id].csrOffsets = + .getCsrOffsets() + .size() == 0) { + splitSparseMatrix[block_row_id][block_col_id] + .getCsrOffsets() = std::vector(row_block_size + 1, 0); } splitSparseMatrix[block_row_id][block_col_id] - .csrOffsets[local_row_id + 1]++; - splitSparseMatrix[block_row_id][block_col_id].columns.push_back( - local_col_id); - splitSparseMatrix[block_row_id][block_col_id].values.push_back( - current_val); + .getCsrOffsets()[local_row_id + 1]++; 
+ splitSparseMatrix[block_row_id][block_col_id] + .getColumns() + .push_back(local_col_id); + splitSparseMatrix[block_row_id][block_col_id] + .getValues() + .push_back(current_val); } } @@ -1075,12 +1088,13 @@ class StateVectorCudaMPI block_col_id++) { for (size_t i0 = 1; i0 < splitSparseMatrix[block_row_id][block_col_id] - .csrOffsets.size(); + .getCsrOffsets() + .size(); i0++) { splitSparseMatrix[block_row_id][block_col_id] - .csrOffsets[i0] += + .getCsrOffsets()[i0] += splitSparseMatrix[block_row_id][block_col_id] - .csrOffsets[i0 - 1]; + .getCsrOffsets()[i0 - 1]; } } } @@ -1147,10 +1161,11 @@ class StateVectorCudaMPI const size_t length_local = size_t{1} << this->getNumLocalQubits(); - DataBuffer d_tmp{length_local, device_id, stream_id, true}; - DataBuffer d_tmp_res{length_local, device_id, stream_id, - true}; - d_tmp_res.zeroInit(); + DataBuffer d_res_per_block{length_local, device_id, + stream_id, true}; + DataBuffer d_res_per_rowblock{length_local, device_id, + stream_id, true}; + d_res_per_rowblock.zeroInit(); for (size_t i = 0; i < num_row_blocks; i++) { @@ -1158,33 +1173,36 @@ class StateVectorCudaMPI mpi_manager_.Barrier(); int64_t num_rows_local = - static_cast(localCSRMatrix.csrOffsets.size() - 1); + static_cast(localCSRMatrix.getCsrOffsets().size() - 1); int64_t num_cols_local = - static_cast(localCSRMatrix.columns.size()); + static_cast(localCSRMatrix.getColumns().size()); int64_t nnz_local = - static_cast(localCSRMatrix.values.size()); + static_cast(localCSRMatrix.getValues().size()); size_t color = 0; - if (localCSRMatrix.values.size() != 0) { - d_tmp.zeroInit(); + if (localCSRMatrix.getValues().size() != 0) { + d_res_per_block.zeroInit(); DataBuffer d_csrOffsets{ - localCSRMatrix.csrOffsets.size(), device_id, stream_id, + localCSRMatrix.getCsrOffsets().size(), device_id, stream_id, true}; DataBuffer d_columns{ - localCSRMatrix.columns.size(), device_id, stream_id, true}; - DataBuffer d_values{localCSRMatrix.values.size(), - device_id, stream_id, true}; - - d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.csrOffsets.data(), - localCSRMatrix.csrOffsets.size(), - false); - d_columns.CopyHostDataToGpu(localCSRMatrix.columns.data(), - localCSRMatrix.columns.size(), + localCSRMatrix.getColumns().size(), device_id, stream_id, + true}; + DataBuffer d_values{ + localCSRMatrix.getValues().size(), device_id, stream_id, + true}; + + d_csrOffsets.CopyHostDataToGpu( + localCSRMatrix.getCsrOffsets().data(), + localCSRMatrix.getCsrOffsets().size(), false); + d_columns.CopyHostDataToGpu(localCSRMatrix.getColumns().data(), + localCSRMatrix.getColumns().size(), false); - d_values.CopyHostDataToGpu(localCSRMatrix.values.data(), - localCSRMatrix.values.size(), false); + d_values.CopyHostDataToGpu(localCSRMatrix.getValues().data(), + localCSRMatrix.getValues().size(), + false); // CUSPARSE APIs cusparseSpMatDescr_t mat; @@ -1218,7 +1236,7 @@ class StateVectorCudaMPI PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( /* cusparseDnVecDescr_t* */ &vecY, /* int64_t */ num_rows_local, - /* void* */ d_tmp.getData(), + /* void* */ d_res_per_block.getData(), /* cudaDataType */ data_type)); // allocate an external buffer if needed @@ -1276,17 +1294,19 @@ class StateVectorCudaMPI mpi_manager_.Bcast(reduce_root_rank, i); if (new_mpi_manager.getComm() != MPI_COMM_NULL) { - new_mpi_manager.Reduce(d_tmp, d_tmp_res, length_local, + new_mpi_manager.Reduce(d_res_per_block, + d_res_per_rowblock, length_local, reduce_root_rank, "sum"); } } mpi_manager_.Barrier(); - local_expect = innerProdC_CUDA(d_tmp_res.getData(), 
BaseType::getData(), - BaseType::getLength(), device_id, - stream_id, getCublasCaller()) - .x; + local_expect = + innerProdC_CUDA(d_res_per_block.getData(), BaseType::getData(), + BaseType::getLength(), device_id, stream_id, + getCublasCaller()) + .x; PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); mpi_manager_.Barrier(); From 9f9777cbb5a5b6857e479ea7bd09135b51b86ad7 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Mon, 24 Jul 2023 01:30:11 -0700 Subject: [PATCH 20/43] update unit test cases --- mpitests/test_apply.py | 21 +++++++++++-------- .../mpi/Test_StateVectorCudaMPI_NonParam.cpp | 6 +++--- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py index 58ca1ede..211469b1 100644 --- a/mpitests/test_apply.py +++ b/mpitests/test_apply.py @@ -553,9 +553,10 @@ def circuit(): assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) -class TestSparseHamiltonianExpval: - @pytest.fixture(params=[np.complex64, np.complex128]) - def test_sparse_hamiltonian_expectation(self, tol, request): +class TestSparseHamExpval: + """Tests sparse hamiltonian expectation values.""" + + def test_sparse_hamiltonian_expectation(self, tol): comm = MPI.COMM_WORLD commSize = comm.Get_size() num_global_wires = commSize.bit_length() - 1 @@ -572,26 +573,28 @@ def test_sparse_hamiltonian_expectation(self, tol, request): 0.1 + 0.1j, 0.1 + 0.2j, 0.2 + 0.2j, + 0.2 + 0.3j, 0.3 + 0.3j, - 0.3 + 0.4j, - 0.4 + 0.5j, + 0.3 + 0.5j, ], - dtype=request.params, + dtype=np.complex128, ) - local_state_vector = np.zeros(1 << num_local_wires).astype(request.params) + local_state_vector = np.zeros(1 << num_local_wires).astype(np.complex128) comm.Scatter(state_vector, local_state_vector, root=0) - dev_gpumpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=request.params) + dev_gpumpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=np.complex128) + dev_gpu = qml.device("lightning.gpu", wires=3, mpi=False, c_dtype=np.complex128) dev_gpumpi.syncH2D(local_state_vector) + dev_gpu.syncH2D(state_vector) H_sparse = qml.SparseHamiltonian(Hmat, wires=range(3)) comm.Barrier() res = dev_gpumpi.expval(H_sparse) - expected = 1 + expected = dev_gpu.expval(H_sparse) assert np.allclose(res, expected) diff --git a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp index 26e81aff..3a426c7b 100644 --- a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp +++ b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp @@ -941,7 +941,7 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_cuSparse", mpi_manager.Barrier(); std::vector init_sv{{0.0, 0.0}, {0.0, 0.1}, {0.1, 0.1}, {0.1, 0.2}, - {0.2, 0.2}, {0.3, 0.3}, {0.3, 0.4}, {0.4, 0.5}}; + {0.2, 0.2}, {0.2, 0.3}, {0.3, 0.3}, {0.3, 0.5}}; using index_type = typename std::conditional::value, int32_t, @@ -977,8 +977,8 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_cuSparse", auto results = sv.template getExpectationValueOnSparseSpMV( csrOffsets, num_csrOffsets, columns, values, nnz); - TestType expected = 1.0; + TestType expected = 0.97; CHECK(expected == Approx(results).epsilon(1e-7)); } -} \ No newline at end of file +} From 914aac679660926885204df185d7379ad0065f4c Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 26 Jul 2023 19:59:54 -0700 Subject: [PATCH 21/43] update docstring --- 
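A compact restatement of the four documented steps in block notation (the notation here is mine, summarizing the docstring added below): with communicator size $n$, the local state slice $\psi_j$ on rank $j$, and the sparse Hamiltonian split into blocks $H_{ij}$,

$$\langle\psi|H|\psi\rangle \;=\; \sum_{i=0}^{n-1} \psi_i^{\dagger}\Big(\sum_{j=0}^{n-1} H_{ij}\,\psi_j\Big),$$

so rank $j$ forms the partial products $H_{ij}\psi_j$ with cuSPARSE (step 2), the inner sum over $j$ is an MPI reduction onto the rank that owns row block $i$ (step 3), and the outer sum is the local inner product followed by a final sum-reduction over ranks (step 4).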
.../src/simulator/StateVectorCudaMPI.hpp | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 9433adfb..58922b04 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -1103,8 +1103,19 @@ class StateVectorCudaMPI } /** - * @brief expval(H) calculation with cuSparseSpMV. - * + * @brief expval(H) calculates the expected value using cuSparseSpMV and MPI + * to implement distributed Sparse Matrix-Vector multiplication. The dense + * vector is distributed across multiple GPU devices and only the MPI rank 0 + * holds the complete sparse matrix data. The process involves the following + * steps: 1. The rank 0 splits the full sparse matrix into n by n blocks, + * where n is the size of MPI communicator. Each row of blocks is then + * distributed across multiple GPUs. 2. For each GPU, cuSparseSpMV is + * invoked to perform the local sparse matrix block and local state vector + * multiplication. 3. Each GPU will collect computation results for its + * respective row block of the sparse matrix. 4. After all sparse matrix + * operations are completed on each GPU, an inner product is performed and + * MPI reduce operation is ultilized to obtain the final result for the + * expectation value. * @tparam index_type Integer type used as indices of the sparse matrix. * @param csr_Offsets_ptr Pointer to the array of row offsets of the sparse * matrix. Array of size csrOffsets_size. @@ -1137,13 +1148,20 @@ class StateVectorCudaMPI cudaDataType_t data_type; cusparseIndexType_t compute_type; + cusparseOperation_t operation_type = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT; + cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO; if constexpr (std::is_same_v || std::is_same_v) { data_type = CUDA_C_64F; - compute_type = CUSPARSE_INDEX_64I; } else { data_type = CUDA_C_32F; + } + + if constexpr (std::is_same_v) { + compute_type = CUSPARSE_INDEX_64I; + } else { compute_type = CUSPARSE_INDEX_32I; } @@ -1168,9 +1186,7 @@ class StateVectorCudaMPI d_res_per_rowblock.zeroInit(); for (size_t i = 0; i < num_row_blocks; i++) { - auto localCSRMatrix = scatterCSRMatrix(csrmatrix_blocks[i], 0); - mpi_manager_.Barrier(); int64_t num_rows_local = static_cast(localCSRMatrix.getCsrOffsets().size() - 1); @@ -1222,7 +1238,7 @@ class StateVectorCudaMPI /* void* */ d_values.getData(), /* cusparseIndexType_t */ compute_type, /* cusparseIndexType_t */ compute_type, - /* cusparseIndexBase_t */ CUSPARSE_INDEX_BASE_ZERO, + /* cusparseIndexBase_t */ index_base_type, /* cudaDataType */ data_type)); // Create dense vector X @@ -1242,14 +1258,14 @@ class StateVectorCudaMPI // allocate an external buffer if needed PL_CUSPARSE_IS_SUCCESS(cusparseSpMV_bufferSize( /* cusparseHandle_t */ handle, - /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* cusparseOperation_t */ operation_type, /* const void* */ &alpha, /* cusparseSpMatDescr_t */ mat, /* cusparseDnVecDescr_t */ vecX, /* const void* */ &beta, /* cusparseDnVecDescr_t */ vecY, /* cudaDataType */ data_type, - /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* cusparseSpMVAlg_t */ spmvalg_type, /* size_t* */ &bufferSize)); DataBuffer dBuffer{bufferSize, device_id, @@ -1258,14 +1274,14 @@ class StateVectorCudaMPI // execute SpMV PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( /* 
cusparseHandle_t */ handle,
-                    /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE,
+                    /* cusparseOperation_t */ operation_type,
                     /* const void* */ &alpha,
                     /* cusparseSpMatDescr_t */ mat,
                     /* cusparseDnVecDescr_t */ vecX,
                     /* const void* */ &beta,
                     /* cusparseDnVecDescr_t */ vecY,
                     /* cudaDataType */ data_type,
-                    /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT,
+                    /* cusparseSpMVAlg_t */ spmvalg_type,
                     /* void* */ reinterpret_cast(dBuffer.getData())));

                 // destroy matrix/vector descriptors
@@ -1303,7 +1319,7 @@ class StateVectorCudaMPI
         mpi_manager_.Barrier();

         local_expect =
-            innerProdC_CUDA(d_res_per_block.getData(), BaseType::getData(),
+            innerProdC_CUDA(d_res_per_rowblock.getData(), BaseType::getData(),
                             BaseType::getLength(), device_id, stream_id,
                             getCublasCaller())
                 .x;

From fb78ed806014bfcf69777ae13dd2a0b5d84c609e Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 26 Jul 2023 20:04:45 -0700
Subject: [PATCH 22/43] add more docstring

---
 pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
index 58922b04..093fe071 100644
--- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
@@ -1186,6 +1186,8 @@ class StateVectorCudaMPI
         d_res_per_rowblock.zeroInit();

         for (size_t i = 0; i < num_row_blocks; i++) {
+            //Need to investigate if non-blocking MPI operation can improve
+            //performance here.
             auto localCSRMatrix = scatterCSRMatrix(csrmatrix_blocks[i], 0);

             int64_t num_rows_local =

From 77abc0e7d7e6e9d47f3ff058cc568a9c1251950d Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 26 Jul 2023 20:06:21 -0700
Subject: [PATCH 23/43] make format

---
 pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
index 093fe071..2cfd111a 100644
--- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
@@ -1186,8 +1186,8 @@ class StateVectorCudaMPI
         d_res_per_rowblock.zeroInit();

         for (size_t i = 0; i < num_row_blocks; i++) {
-            //Need to investigate if non-blocking MPI operation can improve
-            //performance here.
+            // Need to investigate if non-blocking MPI operation can improve
+            // performance here.
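// Sketch of the non-blocking idea flagged in the comment above. This is an
// assumption-labelled illustration, not this library's API: `LocalCsr`,
// `spmvWithPrefetch`, and `runLocalSpMV` are hypothetical names, buffers are
// assumed pre-sized, and rank 0 is assumed to post the matching MPI_Isend
// calls in the same order. Receives for row block i+1 are posted before the
// SpMV of block i runs, so transfers can overlap computation:
#include <mpi.h>

#include <complex>
#include <cstdint>
#include <vector>

struct LocalCsr {
    std::vector<std::complex<double>> vals;
    std::vector<int64_t> offsets, cols;
};

void spmvWithPrefetch(LocalCsr &current, LocalCsr &next, int tag,
                      MPI_Comm comm, void (*runLocalSpMV)(const LocalCsr &)) {
    MPI_Request reqs[3];
    MPI_Irecv(next.vals.data(), static_cast<int>(next.vals.size()),
              MPI_C_DOUBLE_COMPLEX, 0, tag, comm, &reqs[0]);
    MPI_Irecv(next.offsets.data(), static_cast<int>(next.offsets.size()),
              MPI_INT64_T, 0, tag, comm, &reqs[1]);
    MPI_Irecv(next.cols.data(), static_cast<int>(next.cols.size()),
              MPI_INT64_T, 0, tag, comm, &reqs[2]);
    runLocalSpMV(current); // overlaps with the three pending receives
    MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
}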
auto localCSRMatrix = scatterCSRMatrix(csrmatrix_blocks[i], 0); int64_t num_rows_local = From 6c949818125f01a7468acf89f66bfacddedd9085 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 26 Jul 2023 21:40:02 -0700 Subject: [PATCH 24/43] update based on new csrmatrix --- .../src/algorithms/ObservablesGPUMPI.hpp | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 72513456..b5c08e83 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -502,13 +502,20 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { // clang-format on cudaDataType_t data_type; cusparseIndexType_t compute_type; + cusparseOperation_t operation_type = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT; + cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO; if constexpr (std::is_same_v || std::is_same_v) { data_type = CUDA_C_64F; - compute_type = CUSPARSE_INDEX_64I; } else { data_type = CUDA_C_32F; + } + + if constexpr (std::is_same_v) { + compute_type = CUSPARSE_INDEX_64I; + } else { compute_type = CUSPARSE_INDEX_32I; } @@ -536,41 +543,35 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { sv.getData()); for (size_t i = 0; i < num_row_blocks; i++) { - std::vector> sparsematices_row; - - if (mpi_manager.getRank() == 0) { - sparsematices_row = csrmatrix_blocks[i]; - } - auto localCSRMatrix = - sv.template scatterCSRMatrix(sparsematices_row, 0); + sv.template scatterCSRMatrix(csrmatrix_blocks[i], 0); int64_t num_rows_local = - static_cast(localCSRMatrix.csrOffsets.size() - 1); + static_cast(localCSRMatrix.getCsrOffsets().size() - 1); int64_t num_cols_local = - static_cast(localCSRMatrix.columns.size()); + static_cast(localCSRMatrix.getColumns().size()); int64_t nnz_local = - static_cast(localCSRMatrix.values.size()); + static_cast(localCSRMatrix.getValues().size()); size_t color = 0; if (localCSRMatrix.values.size() != 0) { DataBuffer d_csrOffsets{ - localCSRMatrix.csrOffsets.size(), device_id, stream_id, + localCSRMatrix.getCsrOffsets().size(), device_id, stream_id, true}; - DataBuffer d_columns{localCSRMatrix.columns.size(), + DataBuffer d_columns{localCSRMatrix.getColumns().size(), device_id, stream_id, true}; - DataBuffer d_values{localCSRMatrix.values.size(), + DataBuffer d_values{localCSRMatrix.getValues().size(), device_id, stream_id, true}; - d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.csrOffsets.data(), - localCSRMatrix.csrOffsets.size(), + d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.getCsrOffsets().data(), + localCSRMatrix.getCsrOffsets().size(), false); - d_columns.CopyHostDataToGpu(localCSRMatrix.columns.data(), - localCSRMatrix.columns.size(), + d_columns.CopyHostDataToGpu(localCSRMatrix.getColumns().data(), + localCSRMatrix.getColumns().size(), false); - d_values.CopyHostDataToGpu(localCSRMatrix.values.data(), - localCSRMatrix.values.size(), false); + d_values.CopyHostDataToGpu(localCSRMatrix.getValues().data(), + localCSRMatrix.getValues().size(), false); // CUSPARSE APIs cusparseSpMatDescr_t mat; @@ -590,7 +591,7 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { /* void* */ d_values.getData(), /* cusparseIndexType_t */ compute_type, /* cusparseIndexType_t */ compute_type, - /* cusparseIndexBase_t */ CUSPARSE_INDEX_BASE_ZERO, + /* 
cusparseIndexBase_t */ index_base_type, /* cudaDataType */ data_type)); // Create dense vector X @@ -610,14 +611,14 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { // allocate an external buffer if needed PL_CUSPARSE_IS_SUCCESS(cusparseSpMV_bufferSize( /* cusparseHandle_t */ handle, - /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* cusparseOperation_t */ operation_type, /* const void* */ &alpha, /* cusparseSpMatDescr_t */ mat, /* cusparseDnVecDescr_t */ vecX, /* const void* */ &beta, /* cusparseDnVecDescr_t */ vecY, /* cudaDataType */ data_type, - /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* cusparseSpMVAlg_t */ spmvalg_type, /* size_t* */ &bufferSize)); DataBuffer dBuffer{bufferSize, device_id, @@ -626,14 +627,14 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { // execute SpMV PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( /* cusparseHandle_t */ handle, - /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* cusparseOperation_t */ operation_type, /* const void* */ &alpha, /* cusparseSpMatDescr_t */ mat, /* cusparseDnVecDescr_t */ vecX, /* const void* */ &beta, /* cusparseDnVecDescr_t */ vecY, /* cudaDataType */ data_type, - /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* cusparseSpMVAlg_t */ spmvalg_type, /* void* */ reinterpret_cast(dBuffer.getData()))); // destroy matrix/vector descriptors From e3b6a8a5c46fd6cd6f36a356d6bf880dcc501c60 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 26 Jul 2023 21:41:18 -0700 Subject: [PATCH 25/43] bug fix --- pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 2cfd111a..688fd4e7 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -1159,7 +1159,7 @@ class StateVectorCudaMPI data_type = CUDA_C_32F; } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { compute_type = CUSPARSE_INDEX_64I; } else { compute_type = CUSPARSE_INDEX_32I; From 97b89268e6cc0335a98b986291513cd250e6c6bf Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 27 Jul 2023 13:07:48 +0800 Subject: [PATCH 26/43] make format --- mpitests/test_adjoint_jacobian.py | 1 - .../src/algorithms/ObservablesGPUMPI.hpp | 21 +++++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 7b8af953..a7639dfe 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1184,7 +1184,6 @@ def circuit(params): assert np.allclose(j_cpu, j_gpu) - @pytest.mark.parametrize( "obs,obs_type_c64,obs_type_c128", [ diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index b5c08e83..dd4e035a 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -559,19 +559,22 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { DataBuffer d_csrOffsets{ localCSRMatrix.getCsrOffsets().size(), device_id, stream_id, true}; - DataBuffer d_columns{localCSRMatrix.getColumns().size(), - device_id, stream_id, true}; - DataBuffer d_values{localCSRMatrix.getValues().size(), - device_id, stream_id, true}; - - 
d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.getCsrOffsets().data(), - localCSRMatrix.getCsrOffsets().size(), - false); + DataBuffer d_columns{ + localCSRMatrix.getColumns().size(), device_id, stream_id, + true}; + DataBuffer d_values{ + localCSRMatrix.getValues().size(), device_id, stream_id, + true}; + + d_csrOffsets.CopyHostDataToGpu( + localCSRMatrix.getCsrOffsets().data(), + localCSRMatrix.getCsrOffsets().size(), false); d_columns.CopyHostDataToGpu(localCSRMatrix.getColumns().data(), localCSRMatrix.getColumns().size(), false); d_values.CopyHostDataToGpu(localCSRMatrix.getValues().data(), - localCSRMatrix.getValues().size(), false); + localCSRMatrix.getValues().size(), + false); // CUSPARSE APIs cusparseSpMatDescr_t mat; From 6c94020ba5d819a4dde04f66a71312f59e8a89c5 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 27 Jul 2023 13:11:38 +0800 Subject: [PATCH 27/43] quick fix --- pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index dd4e035a..59e3f5da 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -513,7 +513,7 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { data_type = CUDA_C_32F; } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { compute_type = CUSPARSE_INDEX_64I; } else { compute_type = CUSPARSE_INDEX_32I; From ebac2a002f6353dc1b206600d246e1747a65859d Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 27 Jul 2023 13:32:45 +0800 Subject: [PATCH 28/43] update mpimanager --- .../src/algorithms/ObservablesGPUMPI.hpp | 3 ++- pennylane_lightning_gpu/src/util/MPIManager.hpp | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 59e3f5da..990a22ec 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -534,6 +534,7 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { DevTag dt_local(sv.getDataBuffer().getDevTag()); dt_local.refresh(); + StateVectorCudaMPI d_sv_prime( dt_local, sv.getNumGlobalQubits(), sv.getNumLocalQubits(), sv.getData()); @@ -555,7 +556,7 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { size_t color = 0; - if (localCSRMatrix.values.size() != 0) { + if (localCSRMatrix.getValues().size() != 0) { DataBuffer d_csrOffsets{ localCSRMatrix.getCsrOffsets().size(), device_id, stream_id, true}; diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp index 7d3135c7..4fcdf328 100644 --- a/pennylane_lightning_gpu/src/util/MPIManager.hpp +++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp @@ -441,6 +441,15 @@ class MPIManager final { this->getComm())); } + template + void Reduce(T *sendBuf, T *recvBuf, size_t length, size_t root, + const std::string &op_str) { + MPI_Datatype datatype = getMPIDatatype(); + MPI_Op op = getMPIOpType(op_str); + PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf, recvBuf, length, datatype, op, + root, this->getComm())); + } + template void Gather(T &sendBuf, std::vector &recvBuf, size_t root) { MPI_Datatype datatype = getMPIDatatype(); From 
13d6da5f8cd6c254e102719550d7e08f5699b896 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 27 Jul 2023 16:36:16 -0700 Subject: [PATCH 29/43] quick update --- .../src/simulator/StateVectorCudaMPI.hpp | 51 ++++++++++--------- .../src/util/MPIManager.hpp | 45 +++++++++++++++- 2 files changed, 70 insertions(+), 26 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 688fd4e7..d0fd5d95 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -64,15 +64,15 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, const size_t index, bool async, cudaStream_t stream_id); -template struct CSRMatrix { +template class CSRMatrix { private: std::vector columns; - std::vector> values; std::vector csrOffsets; + std::vector> values; public: CSRMatrix(size_t num_rows, size_t nnz) - : columns(nnz, 0), values(nnz), csrOffsets(num_rows + 1, 0){}; + : columns(nnz, 0), csrOffsets(num_rows + 1, 0), values(nnz){}; CSRMatrix() = default; @@ -1046,21 +1046,25 @@ class StateVectorCudaMPI size_t row_block_size = size_t{1} << this->getNumLocalQubits(); size_t col_block_size = row_block_size; - // Add OpenMP support here later. + // Add OpenMP support here later. Need to pay attention to + // race condition. + size_t current_global_row, current_global_col; + size_t block_row_id, block_col_id; + size_t local_row_id, local_col_id; for (size_t row = 0; row < num_rows; row++) { for (size_t col_idx = static_cast(csrOffsets_ptr[row]); col_idx < static_cast(csrOffsets_ptr[row + 1]); col_idx++) { - size_t current_global_row = row; - size_t current_global_col = columns_ptr[col_idx]; + current_global_row = row; + current_global_col = columns_ptr[col_idx]; std::complex current_val = values_ptr[col_idx]; - size_t block_row_id = current_global_row / row_block_size; - size_t block_col_id = current_global_col / col_block_size; + block_row_id = current_global_row / row_block_size; + block_col_id = current_global_col / col_block_size; - size_t local_row_id = current_global_row % row_block_size; - size_t local_col_id = current_global_col % col_block_size; + local_row_id = current_global_row % row_block_size; + local_col_id = current_global_col % col_block_size; if (splitSparseMatrix[block_row_id][block_col_id] .getCsrOffsets() @@ -1086,15 +1090,13 @@ class StateVectorCudaMPI block_row_id++) { for (size_t block_col_id = 0; block_col_id < num_col_blocks; block_col_id++) { - for (size_t i0 = 1; - i0 < splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets() - .size(); - i0++) { - splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets()[i0] += - splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets()[i0 - 1]; + auto &localSpMat = + splitSparseMatrix[block_row_id][block_col_id]; + size_t local_csr_offset_size = + localSpMat.getCsrOffsets().size(); + for (size_t i0 = 1; i0 < local_csr_offset_size; i0++) { + localSpMat.getCsrOffsets()[i0] += + localSpMat.getCsrOffsets()[i0 - 1]; } } } @@ -1148,9 +1150,10 @@ class StateVectorCudaMPI cudaDataType_t data_type; cusparseIndexType_t compute_type; - cusparseOperation_t operation_type = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT; - cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO; + const cusparseOperation_t operation_type = + CUSPARSE_OPERATION_NON_TRANSPOSE; + const 
cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT;
+        const cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO;

         if constexpr (std::is_same_v ||
                       std::is_same_v) {
@@ -1168,8 +1171,8 @@ class StateVectorCudaMPI
         std::vector>> csrmatrix_blocks;

-        size_t num_col_blocks = mpi_manager_.getSize();
-        size_t num_row_blocks = mpi_manager_.getSize();
+        const size_t num_col_blocks = mpi_manager_.getSize();
+        const size_t num_row_blocks = mpi_manager_.getSize();

         if (mpi_manager_.getRank() == 0) {
             csrmatrix_blocks =
diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp
index 7d3135c7..d46f5ab9 100644
--- a/pennylane_lightning_gpu/src/util/MPIManager.hpp
+++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp
@@ -412,6 +412,15 @@ class MPIManager final {
         return recvBuf;
     }

+    /**
+     * @brief MPI_Reduce wrapper.
+     *
+     * @tparam T C++ data type.
+     * @param sendBuf Send buffer.
+     * @param recvBuf Receive buffer.
+     * @param root Rank of root process.
+     * @param op_str String of MPI_Op.
+     */
     template
     void Reduce(T &sendBuf, T &recvBuf, size_t root,
                 const std::string &op_str) {
@@ -421,6 +430,15 @@

+    /**
+     * @brief MPI_Reduce wrapper.
+     *
+     * @tparam T C++ data type.
+     * @param sendBuf Send buffer vector.
+     * @param recvBuf Receive buffer vector.
+     * @param root Rank of root process.
+     * @param op_str String of MPI_Op.
+     */
     template
     void Reduce(std::vector &sendBuf, std::vector &recvBuf, size_t root,
                 const std::string &op_str) {
@@ -431,6 +449,15 @@

+    /**
+     * @brief MPI_Reduce wrapper.
+     *
+     * @tparam T C++ data type.
+     * @param sendBuf Send buffer (DataBuffer type).
+     * @param recvBuf Receive buffer (DataBuffer type).
+     * @param root Rank of root process.
+     * @param op_str String of MPI_Op.
+     */
     template
     void Reduce(DataBuffer &sendBuf, DataBuffer &recvBuf, size_t length,
                 size_t root, const std::string &op_str) {
@@ -441,6 +468,14 @@

+    /**
+     * @brief MPI_Gather wrapper.
+     *
+     * @tparam T C++ data type.
+     * @param sendBuf Send buffer.
+     * @param recvBuf Receive buffer vector.
+     * @param root Rank of root process.
+     */
     template
     void Gather(T &sendBuf, std::vector &recvBuf, size_t root) {
         MPI_Datatype datatype = getMPIDatatype();
         PL_MPI_IS_SUCCESS(MPI_Gather(&sendBuf, 1, datatype, recvBuf.data(), 1,
                                      datatype, root, this->getComm()));
     }

+    /**
+     * @brief MPI_Gather wrapper.
+     *
+     * @tparam T C++ data type.
+     * @param sendBuf Send buffer vector.
+     * @param recvBuf Receive buffer vector.
+     * @param root Rank of root process.
+     */
     template
     void Gather(std::vector &sendBuf, std::vector &recvBuf, size_t root) {
         MPI_Datatype datatype = getMPIDatatype();
@@ -557,7 +600,6 @@
      * @param sendBuf Send buffer vector.
      * @param dest Rank of send dest.
      */
-
     template
     void Send(std::vector &sendBuf, size_t dest) {
         MPI_Datatype datatype = getMPIDatatype();
         const int tag = 6789;
         PL_MPI_IS_SUCCESS(MPI_Send(sendBuf.data(), sendBuf.size(), datatype,
                                    static_cast(dest), tag,
                                    this->getComm()));
     }
@@ -574,7 +616,6 @@
      * @param recvBuf Recv buffer vector.
      * @param source Rank of data source. 
*/ - template void Recv(std::vector &recvBuf, size_t source) { MPI_Datatype datatype = getMPIDatatype(); MPI_Status status; From 5b0712c2cb7c5f3211ffa7261b297d76e812d313 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 27 Jul 2023 19:17:40 -0700 Subject: [PATCH 30/43] merge sparse_ham --- .../src/algorithms/ObservablesGPUMPI.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 990a22ec..0fc1f3a0 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -502,9 +502,10 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { // clang-format on cudaDataType_t data_type; cusparseIndexType_t compute_type; - cusparseOperation_t operation_type = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT; - cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO; + const cusparseOperation_t operation_type = + CUSPARSE_OPERATION_NON_TRANSPOSE; + const cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT; + const cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO; if constexpr (std::is_same_v || std::is_same_v) { @@ -521,8 +522,8 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { std::vector>> csrmatrix_blocks; - size_t num_col_blocks = mpi_manager.getSize(); - size_t num_row_blocks = mpi_manager.getSize(); + const size_t num_col_blocks = mpi_manager.getSize(); + const size_t num_row_blocks = mpi_manager.getSize(); if (mpi_manager.getRank() == 0) { csrmatrix_blocks = sv.template splitCSRMatrix( From 500d8c571d8e1400ee124673a9f9895c9df3da41 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sat, 29 Jul 2023 18:10:54 +0800 Subject: [PATCH 31/43] add more docstring --- .../src/simulator/StateVectorCudaMPI.hpp | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index d0fd5d95..20f58dd9 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -64,6 +64,14 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, const size_t index, bool async, cudaStream_t stream_id); +/** + * @brief Manage memory of Compressed Sparse Row (CSR) sparse matrix. CSR format + * represents a matrix M by three (one-dimensional) arrays, that respectively + * contain nonzero values, row offsets, and column indices. + * + * @tparam Precision Floating-point precision type. + * @tparam index_type Integer type. + */ template class CSRMatrix { private: std::vector columns; @@ -76,11 +84,22 @@ template class CSRMatrix { CSRMatrix() = default; + /** + * @brief Get the CSR format index vector of the matrix. + */ auto getColumns() -> std::vector & { return columns; } + + /** + * @brief Get CSR format offset vector of the matrix. + */ + auto getCsrOffsets() -> std::vector & { return csrOffsets; } + + /** + * @brief Get CSR format data vector of the matrix. + */ auto getValues() -> std::vector> & { return values; } - auto getCsrOffsets() -> std::vector & { return csrOffsets; } }; /** @@ -1116,7 +1135,7 @@ class StateVectorCudaMPI * multiplication. 3. 
Each GPU will collect computation results for its * respective row block of the sparse matrix. 4. After all sparse matrix * operations are completed on each GPU, an inner product is performed and - * MPI reduce operation is ultilized to obtain the final result for the + * MPI reduce operation is utilized to obtain the final result for the * expectation value. * @tparam index_type Integer type used as indices of the sparse matrix. * @param csr_Offsets_ptr Pointer to the array of row offsets of the sparse From a09d10b6cc4de8ebf415f26ca8b5f485160a2513 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 8 Aug 2023 12:15:45 -0700 Subject: [PATCH 32/43] update typo --- mpitests/test_adjoint_jacobian.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 7b8af953..ba4b6f6c 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1046,12 +1046,11 @@ def circuit(params): j_gpu = qml.jacobian(qnode_gpu)(params) -custom_wires0 = ["alice", 3.14, -1, 0, "bob", "luc"] """ Important Note: Ensure wires index arranged from high to low qml.SparseHamiltonian( qml.Hamiltonian( - [0.1], [qml.PauliX(wires=custom_wires0[1]) @ qml.PauliY(wires=custom_wires0[0])] - ).sparse_matrix(custom_wires0.[::-1]), + [0.1], [qml.PauliX(wires=custom_wires[1]) @ qml.PauliY(wires=custom_wires[0])] + ).sparse_matrix(custom_wires.[::-1]), ) """ @@ -1061,21 +1060,21 @@ def circuit(params): [ qml.SparseHamiltonian( qml.Hamiltonian( - [0.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliY(wires=custom_wires0[1])] - ).sparse_matrix(custom_wires0[::-1]), - wires=custom_wires0, + [0.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliY(wires=custom_wires[1])] + ).sparse_matrix(custom_wires[::-1]), + wires=custom_wires, ), qml.SparseHamiltonian( qml.Hamiltonian( - [2.0], [qml.PauliX(wires=custom_wires0[2]) @ qml.PauliZ(wires=custom_wires0[0])] - ).sparse_matrix(custom_wires0[::-1]), - wires=custom_wires0, + [2.0], [qml.PauliX(wires=custom_wires[2]) @ qml.PauliZ(wires=custom_wires[0])] + ).sparse_matrix(custom_wires[::-1]), + wires=custom_wires, ), qml.SparseHamiltonian( qml.Hamiltonian( - [1.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliZ(wires=custom_wires0[2])] - ).sparse_matrix(custom_wires0[::-1]), - wires=custom_wires0, + [1.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[2])] + ).sparse_matrix(custom_wires[::-1]), + wires=custom_wires, ), ], ) @@ -1084,11 +1083,11 @@ def test_adjoint_SparseHamiltonian_custom_wires(returns): operations and when using custom wire labels""" comm = MPI.COMM_WORLD - dev_gpu = qml.device("lightning.gpu", wires=custom_wires0, mpi=True) - dev_cpu = qml.device("default.qubit", wires=custom_wires0) + dev_gpu = qml.device("lightning.gpu", wires=custom_wires, mpi=True) + dev_cpu = qml.device("default.qubit", wires=custom_wires) def circuit(params): - circuit_ansatz(params, wires=custom_wires0) + circuit_ansatz(params, wires=custom_wires) return qml.expval(returns) if comm.Get_rank() == 0: @@ -1184,7 +1183,6 @@ def circuit(params): assert np.allclose(j_cpu, j_gpu) - @pytest.mark.parametrize( "obs,obs_type_c64,obs_type_c128", [ From c9131f0f2cf5cea7c7e93b7d4a7a01b6eaa517a9 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 8 Aug 2023 18:32:22 -0700 Subject: [PATCH 33/43] update indices order --- .github/CHANGELOG.md | 2 +- mpitests/test_adjoint_jacobian.py | 38 ++++++++++++++++++++++++++----- 
2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index db1ce874..5d037e67 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -5,7 +5,7 @@ * Add sparse Hamiltonian support to multi-node/multi-GPU adjoint methods. [(#128)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/128) - Note each MPI process will return the overall result of the adjoint method. To ensure wires index arranged in a way aligned with cuQuantum backend, `wires` should be reordered in a descending manner in a python script as follows: + Note each MPI process will return the overall result of the adjoint method. If there is a global target wire when constructing the sparse Hamiltonian, `wires` should be reordered in a descending manner in a python script as follows: ```python qml.SparseHamiltonian( qml.Hamiltonian( diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index ba4b6f6c..97ee4ec2 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1046,7 +1046,8 @@ def circuit(params): j_gpu = qml.jacobian(qnode_gpu)(params) -""" Important Note: Ensure wires index arranged from high to low +""" Important Note: Ensure wires index arranged from high to low if + there is a global target wire. qml.SparseHamiltonian( qml.Hamiltonian( [0.1], [qml.PauliX(wires=custom_wires[1]) @ qml.PauliY(wires=custom_wires[0])] @@ -1067,7 +1068,13 @@ def circuit(params): qml.SparseHamiltonian( qml.Hamiltonian( [2.0], [qml.PauliX(wires=custom_wires[2]) @ qml.PauliZ(wires=custom_wires[0])] - ).sparse_matrix(custom_wires[::-1]), + ).sparse_matrix(custom_wires), + wires=custom_wires, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [2.0], [qml.PauliX(wires=custom_wires[1]) @ qml.PauliZ(wires=custom_wires[2])] + ).sparse_matrix(custom_wires), wires=custom_wires, ), qml.SparseHamiltonian( @@ -1108,11 +1115,12 @@ def circuit(params): assert np.allclose(j_cpu, j_gpu) -""" Important Note: Ensure wires index arranged from high to low +""" Important Note: Ensure wires index arranged from high to low if + there is a global target wire. 
qml.SparseHamiltonian( qml.Hamiltonian( [0.1], - [qml.PauliX(1) @ qml.PauliZ(0)], + [qml.PauliX(0) @ qml.PauliZ(1)], ).sparse_matrix(range(5, -1, -1)), wires=range(6), ) @@ -1133,9 +1141,23 @@ def circuit(params): qml.Hamiltonian( [0.1], [qml.PauliX(1) @ qml.PauliZ(0)], + ).sparse_matrix(range(6)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(0)], ).sparse_matrix(range(5, -1, -1)), wires=range(6), ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(5)], + ).sparse_matrix(range(6)), + wires=range(6), + ), qml.SparseHamiltonian( qml.Hamiltonian( [0.1], @@ -1144,11 +1166,15 @@ def circuit(params): wires=range(6), ), qml.SparseHamiltonian( - qml.Hamiltonian([2.0], [qml.PauliX(1) @ qml.PauliZ(2)]).sparse_matrix(range(5, -1, -1)), + qml.Hamiltonian([2.0], [qml.PauliX(1) @ qml.PauliZ(2)]).sparse_matrix(range(6)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian([2.0], [qml.PauliX(2) @ qml.PauliZ(4)]).sparse_matrix(range(6)), wires=range(6), ), qml.SparseHamiltonian( - qml.Hamiltonian([1.1], [qml.PauliX(2) @ qml.PauliZ(0)]).sparse_matrix(range(5, -1, -1)), + qml.Hamiltonian([1.1], [qml.PauliX(2) @ qml.PauliZ(0)]).sparse_matrix(range(6)), wires=range(6), ), ], From 618bdf2414b81053b1124094c81f3cf02fd7929b Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sat, 19 Aug 2023 14:35:28 -0700 Subject: [PATCH 34/43] add matrix sort v0 --- .../src/algorithms/ObservablesGPUMPI.hpp | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 0fc1f3a0..150eb841 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -22,6 +22,16 @@ namespace Pennylane::Algorithms { +inline size_t reverseBits(size_t num, size_t nbits){ + size_t reversed = 0; + for(size_t i = 0; i < nbits; ++i){ + //ith bit value + size_t bit = num & (1 << i); + reversed += bit << (nbits - i); + } + return reversed +} + /** * @brief A base class for all observable classes. 
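A note on the v0 helper added above: as written it does not compile (the `return reversed` statement lacks a semicolon) and mis-weights the reversed bits — the extracted bit keeps its `2^i` weight, and the shift amount should be `nbits - i - 1`. The corrected form lands with the CSRMatrix refactor later in this series; for reference, a minimal self-contained sketch of the intended bit-reversal, the index permutation that the later `matrixReorder` applies to rows and columns:

```cpp
#include <cassert>
#include <cstddef>

// Reverse the lowest `nbits` bits of `num`, e.g. 0b001 -> 0b100 for nbits = 3.
inline std::size_t reverseBits(std::size_t num, std::size_t nbits) {
    std::size_t reversed = 0;
    for (std::size_t i = 0; i < nbits; ++i) {
        std::size_t bit = (num >> i) & 1U;  // i-th bit, normalized to 0 or 1
        reversed |= bit << (nbits - 1 - i); // mirror it to the other end
    }
    return reversed;
}

int main() {
    assert(reverseBits(0b001, 3) == 0b100);
    assert(reverseBits(0b110, 3) == 0b011);
    assert(reverseBits(0b1011, 4) == 0b1101);
    return 0;
}
```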
* @@ -420,10 +430,12 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { int32_t, int64_t>::type; private: + MPIManager mpi_manager_; std::vector> data_; std::vector indices_; std::vector offsets_; std::vector wires_; + [[nodiscard]] bool isEqual(const ObservableGPUMPI &other) const override { @@ -450,10 +462,32 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { */ template > - SparseHamiltonianGPUMPI(T1 &&arg1, T2 &&arg2, T3 &&arg3, T4 &&arg4) - : data_{std::forward(arg1)}, indices_{std::forward(arg2)}, - offsets_{std::forward(arg3)}, wires_{std::forward(arg4)} { - PL_ASSERT(data_.size() == indices_.size()); + SparseHamiltonianGPUMPI(T1 &&arg1, T2 &&arg2, T3 &&arg3, T4 &&arg4):mpi_manager_(MPI_COMM_WORLD){ + if(mpi_manager_.getRank() == 0){ + std::> data_tmp = std::move(arg1); + std::vector indices_tmp = std::move(arg2); + std::vector offsets_tmp = std::move(arg3); + std::vector wires_tmp = std::move(arg4); + PL_ASSERT(data_tmp.size() == indices_tmp.size()); + + offsets_.push_back(0); + //sort sparseMV + for(size_t i = 0; i < offsets_tmp.size() - 1; i++){ + size_t target_row = reverseBits(i,wire_tmp.size()); + size_t offset = offsets_tmp[target_row]; + size_t local_col_size = offsets_tmp[target_row+1] - offset; + offset_.push_back(offsets_.back()+local_col_size); + //get unsorted column indices + std::vector local_col_indices(indices_.begin()+offset,indices_tmp.begin()+offset+local_col_size+1); + for(size_t j = 0; j < local_col_indices.size(); j++){ + size_t target_col = reverseBits(local_col_indices[i],wire_tmp.size()); + local_col_indices[i] = target_col; + } + //sort indices and get the rank of column index + + //sort data + } + } } /** From de7db96e36b9390f37909986da9c770fa1834391 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 22 Aug 2023 06:16:22 -0700 Subject: [PATCH 35/43] refactor csrmatrix --- .../src/bindings/Bindings.cpp | 41 +++- .../src/simulator/StateVectorCudaMPI.hpp | 209 +----------------- .../src/util/CSRMatrix.hpp | 185 ++++++++++++++++ 3 files changed, 222 insertions(+), 213 deletions(-) create mode 100644 pennylane_lightning_gpu/src/util/CSRMatrix.hpp diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index 2b0d677d..33a6ece5 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -39,6 +39,7 @@ #ifdef ENABLE_MPI #include "AdjointDiffGPUMPI.hpp" +#include "CSRMatrix.hpp" #include "MPIManager.hpp" #include "StateVectorCudaMPI.hpp" #endif @@ -1360,15 +1361,39 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { const np_arr_sparse_ind &columns, const np_arr_c values) { using index_type = typename std::conditional< std::is_same::value, int32_t, int64_t>::type; + //Distribute sparse matrix across multi-nodes/multi-gpus + size_t num_rows = + static_cast(csrOffsets.request().size) - 1; + size_t local_num_rows = size_t{1} << sv.getNumLocalQubits(); + size_t root = 0; + + MPIManager mpi_manager = sv.getMPIManager(); + + std::vector>> + csrmatrix_blocks; + + if (mpi_manager.getRank() == root) { + csrmatrix_blocks = splitCSRMatrix( + mpi_manager, num_rows, + static_cast(csrOffsets.request().ptr), + static_cast(columns.request().ptr), + static_cast *>( + values.request().ptr)); + } + mpi_manager.Barrier(); + + std::vector> + localCSRMatVector; + for (size_t i = 0; i < mpi_manager.getSize(); i++) { + auto localCSRMat = scatterCSRMatrix( + mpi_manager, csrmatrix_blocks[i], local_num_rows, 
root); + localCSRMatVector.push_back(localCSRMat); + } + + mpi_manager.Barrier(); + return sv.template getExpectationValueOnSparseSpMV( - static_cast(csrOffsets.request().ptr), - static_cast( - csrOffsets.request() - .size), // num_rows + 1 or csrOffsets - static_cast(columns.request().ptr), // columns - static_cast *>( - values.request().ptr), - static_cast(values.request().size)); // nnz + localCSRMatVector); }, "Calculate the expectation value of a sparse Hamiltonian.") diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 20f58dd9..b1c52568 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -27,6 +27,7 @@ #include #include // custatevecApplyMatrix +#include "CSRMatrix.hpp" #include "Constant.hpp" #include "Error.hpp" #include "MPIManager.hpp" @@ -64,44 +65,6 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, const size_t index, bool async, cudaStream_t stream_id); -/** - * @brief Manage memory of Compressed Sparse Row (CSR) sparse matrix. CSR format - * represents a matrix M by three (one-dimensional) arrays, that respectively - * contain nonzero values, row offsets, and column indices. - * - * @tparam Precision Floating-point precision type. - * @tparam index_type Integer type. - */ -template class CSRMatrix { - private: - std::vector columns; - std::vector csrOffsets; - std::vector> values; - - public: - CSRMatrix(size_t num_rows, size_t nnz) - : columns(nnz, 0), csrOffsets(num_rows + 1, 0), values(nnz){}; - - CSRMatrix() = default; - - /** - * @brief Get the CSR format index vector of the matrix. - */ - auto getColumns() -> std::vector & { return columns; } - - /** - * @brief Get CSR format offset vector of the matrix. - */ - auto getCsrOffsets() -> std::vector & { return csrOffsets; } - - /** - * @brief Get CSR format data vector of the matrix. - */ - auto getValues() -> std::vector> & { - return values; - } -}; - /** * @brief Managed memory CUDA state-vector class using custateVec backed * gate-calls. @@ -979,150 +942,6 @@ class StateVectorCudaMPI return expect_val; } - /** - * @brief Scatter a CSR (Compressed Sparse Row) format matrix. - * - * @tparam index_type Integer type used as indices of the sparse matrix. - * @param matrix CSR (Compressed Sparse Row) format matrix. - * @param root Root rank of the scatter operation. 
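In block form, the computation these scatter-and-SpMV steps implement is the following (a sketch of the underlying algebra, with P = mpi_manager.getSize() ranks; rank k stores the state slice psi_k and, after the scatter, the sparse blocks H_{0k}, ..., H_{(P-1)k}):

```latex
\langle H \rangle \;=\; \sum_{i=0}^{P-1} \psi_i^{\dagger}\, y_i ,
\qquad
y_i \;=\; \sum_{k=0}^{P-1} H_{ik}\, \psi_k .
```

Each cusparseSpMV call evaluates one term H_{ik} psi_k; an MPI reduction sums those partial products into y_i on the rank that owns slice i, and the local inner products psi_i^dagger y_i are reduced once more so every rank ends up with the expectation value.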
- */ - template - auto scatterCSRMatrix(std::vector> &matrix, - size_t root) -> CSRMatrix { - // Bcast num_rows and num_cols - size_t local_num_rows = size_t{1} << this->getNumLocalQubits(); - size_t num_col_blocks = mpi_manager_.getSize(); - std::vector nnzs; - - if (mpi_manager_.getRank() == root) { - nnzs.reserve(matrix.size()); - for (size_t j = 0; j < matrix.size(); j++) { - nnzs.push_back(matrix[j].getValues().size()); - } - } - - size_t local_nnz = mpi_manager_.scatter(nnzs, 0)[0]; - - CSRMatrix localCSRMatrix(local_num_rows, - local_nnz); - - if (mpi_manager_.getRank() == root) { - localCSRMatrix.getValues() = matrix[0].getValues(); - localCSRMatrix.getCsrOffsets() = matrix[0].getCsrOffsets(); - localCSRMatrix.getColumns() = matrix[0].getColumns(); - } - - for (size_t k = 1; k < num_col_blocks; k++) { - size_t dest = k; - size_t source = 0; - - if (mpi_manager_.getRank() == 0 && matrix[k].getValues().size()) { - mpi_manager_.Send>( - matrix[k].getValues(), dest); - mpi_manager_.Send(matrix[k].getCsrOffsets(), dest); - mpi_manager_.Send(matrix[k].getColumns(), dest); - } else if (mpi_manager_.getRank() == k && local_nnz) { - mpi_manager_.Recv>( - localCSRMatrix.getValues(), source); - mpi_manager_.Recv(localCSRMatrix.getCsrOffsets(), - source); - mpi_manager_.Recv(localCSRMatrix.getColumns(), - source); - } - } - return localCSRMatrix; - } - - /** - * @brief Convert a global CSR (Compressed Sparse Row) format matrix into - * local blocks. This operation should be conducted on the rank 0. - * - * @tparam index_type Integer type used as indices of the sparse matrix. - * @param num_row_blocks Number of local blocks per global row. - * @param num_col_blocks Number of local blocks per global column. - * @param csrOffsets_ptr Pointer to the array of row offsets of the sparse - * matrix. Array of size csrOffsets_size. - * @param columns_ptr Pointer to the array of column indices of the sparse - * matrix. Array of size numNNZ - * @param values_ptr Pointer to the array of the non-zero elements - * - * @return auto A vector of vector of CSRMatrix. - */ - template - auto splitCSRMatrix(const size_t &num_row_blocks, - const size_t &num_col_blocks, - const index_type *csrOffsets_ptr, - const index_type *columns_ptr, - const std::complex *values_ptr) - -> std::vector>> { - - std::vector>> - splitSparseMatrix( - num_row_blocks, - std::vector>(num_col_blocks)); - - size_t num_rows = size_t{1} << this->getTotalNumQubits(); - size_t row_block_size = size_t{1} << this->getNumLocalQubits(); - size_t col_block_size = row_block_size; - - // Add OpenMP support here later. Need to pay attention to - // race condition. 
- size_t current_global_row, current_global_col; - size_t block_row_id, block_col_id; - size_t local_row_id, local_col_id; - for (size_t row = 0; row < num_rows; row++) { - for (size_t col_idx = static_cast(csrOffsets_ptr[row]); - col_idx < static_cast(csrOffsets_ptr[row + 1]); - col_idx++) { - - current_global_row = row; - current_global_col = columns_ptr[col_idx]; - std::complex current_val = values_ptr[col_idx]; - - block_row_id = current_global_row / row_block_size; - block_col_id = current_global_col / col_block_size; - - local_row_id = current_global_row % row_block_size; - local_col_id = current_global_col % col_block_size; - - if (splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets() - .size() == 0) { - splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets() = - std::vector(row_block_size + 1, 0); - } - - splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets()[local_row_id + 1]++; - splitSparseMatrix[block_row_id][block_col_id] - .getColumns() - .push_back(local_col_id); - splitSparseMatrix[block_row_id][block_col_id] - .getValues() - .push_back(current_val); - } - } - - // Add OpenMP support here later. - for (size_t block_row_id = 0; block_row_id < num_row_blocks; - block_row_id++) { - for (size_t block_col_id = 0; block_col_id < num_col_blocks; - block_col_id++) { - auto &localSpMat = - splitSparseMatrix[block_row_id][block_col_id]; - size_t local_csr_offset_size = - localSpMat.getCsrOffsets().size(); - for (size_t i0 = 1; i0 < local_csr_offset_size; i0++) { - localSpMat.getCsrOffsets()[i0] += - localSpMat.getCsrOffsets()[i0 - 1]; - } - } - } - - return splitSparseMatrix; - } - /** * @brief expval(H) calculates the expected value using cuSparseSpMV and MPI * to implement distributed Sparse Matrix-Vector multiplication. The dense @@ -1149,15 +968,7 @@ class StateVectorCudaMPI */ template auto getExpectationValueOnSparseSpMV( - const index_type *csrOffsets_ptr, const index_type csrOffsets_size, - const index_type *columns_ptr, - const std::complex *values_ptr, const index_type numNNZ) { - if (mpi_manager_.getRank() == 0) { - PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == - (size_t{1} << this->getTotalNumQubits()), - "Incorrect size of CSR Offsets."); - PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix."); - } + std::vector> &CSRMatVector) { const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -1187,18 +998,6 @@ class StateVectorCudaMPI compute_type = CUSPARSE_INDEX_32I; } - std::vector>> - csrmatrix_blocks; - - const size_t num_col_blocks = mpi_manager_.getSize(); - const size_t num_row_blocks = mpi_manager_.getSize(); - - if (mpi_manager_.getRank() == 0) { - csrmatrix_blocks = - splitCSRMatrix(num_col_blocks, num_row_blocks, csrOffsets_ptr, - columns_ptr, values_ptr); - } - const size_t length_local = size_t{1} << this->getNumLocalQubits(); DataBuffer d_res_per_block{length_local, device_id, @@ -1207,10 +1006,10 @@ class StateVectorCudaMPI stream_id, true}; d_res_per_rowblock.zeroInit(); - for (size_t i = 0; i < num_row_blocks; i++) { + for (size_t i = 0; i < mpi_manager_.getSize(); i++) { // Need to investigate if non-blocking MPI operation can improve // performace here. 
- auto localCSRMatrix = scatterCSRMatrix(csrmatrix_blocks[i], 0); + auto &localCSRMatrix = CSRMatVector[i]; int64_t num_rows_local = static_cast(localCSRMatrix.getCsrOffsets().size() - 1); diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp new file mode 100644 index 00000000..7a87673a --- /dev/null +++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp @@ -0,0 +1,185 @@ +#pragma once + +#include +#include +#include +#include + +#include "MPIManager.hpp" + +/// @cond DEV +namespace { +using namespace Pennylane::CUDA; +} // namespace +/// @endcond +namespace Pennylane::MPI { +/** + * @brief Manage memory of Compressed Sparse Row (CSR) sparse matrix. CSR format + * represents a matrix M by three (one-dimensional) arrays, that respectively + * contain nonzero values, row offsets, and column indices. + * + * @tparam Precision Floating-point precision type. + * @tparam index_type Integer type. + */ +template class CSRMatrix { + private: + std::vector columns_; + std::vector csrOffsets_; + std::vector> values_; + + public: + CSRMatrix(size_t num_rows, size_t nnz) + : columns_(nnz, 0), csrOffsets_(num_rows + 1, 0), values_(nnz){}; + + CSRMatrix(size_t num_rows, size_t nnz, index_type *column_ptr, + index_type *csrOffsets_ptr, std::complex *value_ptr) + : columns_(column_ptr, column_ptr + num_rows), + csrOffsets_(csrOffsets_ptr, csrOffsets_ptr + num_rows + 1), + values_(value_ptr, value_ptr + nnz){}; + + CSRMatrix() = default; + + /** + * @brief Get the CSR format index vector of the matrix. + */ + auto getColumns() -> std::vector & { return columns_; } + + /** + * @brief Get CSR format offset vector of the matrix. + */ + auto getCsrOffsets() -> std::vector & { return csrOffsets_; } + + /** + * @brief Get CSR format data vector of the matrix. + */ + auto getValues() -> std::vector> & { + return values_; + } +}; + +template +auto splitCSRMatrix(MPIManager &mpi_manager, const size_t &num_rows, + const index_type *csrOffsets_ptr, + const index_type *columns_ptr, + const std::complex *values_ptr) + -> std::vector>> { + + size_t num_row_blocks = mpi_manager.getSize(); + size_t num_col_blocks = num_row_blocks; + + std::vector>> + splitSparseMatrix( + num_row_blocks, + std::vector>(num_col_blocks)); + + size_t row_block_size = num_rows / num_row_blocks; + size_t col_block_size = row_block_size; + + // Add OpenMP support here later. Need to pay attention to + // race condition. 
+ size_t current_global_row, current_global_col; + size_t block_row_id, block_col_id; + size_t local_row_id, local_col_id; + for (size_t row = 0; row < num_rows; row++) { + for (size_t col_idx = static_cast(csrOffsets_ptr[row]); + col_idx < static_cast(csrOffsets_ptr[row + 1]); + col_idx++) { + + current_global_row = row; + current_global_col = columns_ptr[col_idx]; + std::complex current_val = values_ptr[col_idx]; + + block_row_id = current_global_row / row_block_size; + block_col_id = current_global_col / col_block_size; + + local_row_id = current_global_row % row_block_size; + local_col_id = current_global_col % col_block_size; + + if (splitSparseMatrix[block_row_id][block_col_id] + .getCsrOffsets() + .size() == 0) { + splitSparseMatrix[block_row_id][block_col_id].getCsrOffsets() = + std::vector(row_block_size + 1, 0); + } + + splitSparseMatrix[block_row_id][block_col_id] + .getCsrOffsets()[local_row_id + 1]++; + splitSparseMatrix[block_row_id][block_col_id] + .getColumns() + .push_back(local_col_id); + splitSparseMatrix[block_row_id][block_col_id].getValues().push_back( + current_val); + } + } + + // Add OpenMP support here later. + for (size_t block_row_id = 0; block_row_id < num_row_blocks; + block_row_id++) { + for (size_t block_col_id = 0; block_col_id < num_col_blocks; + block_col_id++) { + auto &localSpMat = splitSparseMatrix[block_row_id][block_col_id]; + size_t local_csr_offset_size = localSpMat.getCsrOffsets().size(); + for (size_t i0 = 1; i0 < local_csr_offset_size; i0++) { + localSpMat.getCsrOffsets()[i0] += + localSpMat.getCsrOffsets()[i0 - 1]; + } + } + } + + return splitSparseMatrix; +} + +/** + * @brief Scatter a CSR (Compressed Sparse Row) format matrix. + * + * @tparam index_type Integer type used as indices of the sparse matrix. + * @param matrix CSR (Compressed Sparse Row) format matrix. + * @param root Root rank of the scatter operation. 
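To make the index arithmetic above concrete: a global nonzero at (row, col) lands in block (row / block_size, col / block_size) at local position (row % block_size, col % block_size), and the per-row counts accumulated in csrOffsets[local_row + 1] become offsets through the final prefix-sum pass. A small self-contained sketch, using the same 8x8-operator-over-2-ranks shape as the unit test added later in this series:

```cpp
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    const std::size_t block_size = 4; // 8x8 operator on 2 ranks -> 4x4 blocks

    // A global nonzero at (row, col) = (5, 2):
    std::size_t row = 5, col = 2;
    std::cout << "block (" << row / block_size << "," << col / block_size
              << "), local (" << row % block_size << "," << col % block_size
              << ")\n"; // -> block (1,0), local position (1,2)

    // Offsets pass: per-local-row nonzero counts sit in entries [1..n] and
    // are turned into CSR offsets by a running prefix sum, exactly what the
    // csrOffsets[i0] += csrOffsets[i0 - 1] loop above does.
    std::vector<std::size_t> csrOffsets = {0, 2, 2, 1, 3};
    std::partial_sum(csrOffsets.begin(), csrOffsets.end(), csrOffsets.begin());
    for (auto v : csrOffsets)
        std::cout << v << ' '; // prints: 0 2 4 5 8
    std::cout << '\n';
}
```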
+ */ +template +auto scatterCSRMatrix(MPIManager &mpi_manager, + std::vector> &matrix, + size_t local_num_rows, size_t root) + -> CSRMatrix { + // Bcast num_rows and num_cols + size_t num_col_blocks = mpi_manager.getSize(); + + std::vector nnzs; + + if (mpi_manager.getRank() == root) { + nnzs.reserve(matrix.size()); + for (size_t j = 0; j < matrix.size(); j++) { + nnzs.push_back(matrix[j].getValues().size()); + } + } + + size_t local_nnz = mpi_manager.scatter(nnzs, 0)[0]; + + CSRMatrix localCSRMatrix(local_num_rows, local_nnz); + + if (mpi_manager.getRank() == root) { + localCSRMatrix.getValues() = matrix[0].getValues(); + localCSRMatrix.getCsrOffsets() = matrix[0].getCsrOffsets(); + localCSRMatrix.getColumns() = matrix[0].getColumns(); + } + + for (size_t k = 1; k < num_col_blocks; k++) { + size_t dest = k; + size_t source = root; + + if (mpi_manager.getRank() == 0 && matrix[k].getValues().size()) { + mpi_manager.Send>(matrix[k].getValues(), + dest); + mpi_manager.Send(matrix[k].getCsrOffsets(), dest); + mpi_manager.Send(matrix[k].getColumns(), dest); + } else if (mpi_manager.getRank() == k && local_nnz) { + mpi_manager.Recv>( + localCSRMatrix.getValues(), source); + mpi_manager.Recv(localCSRMatrix.getCsrOffsets(), + source); + mpi_manager.Recv(localCSRMatrix.getColumns(), source); + } + } + return localCSRMatrix; +} +} // namespace Pennylane::MPI \ No newline at end of file From 134cfc312ae7bed1f7cab228631549341b8053d7 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 22 Aug 2023 13:50:20 -0700 Subject: [PATCH 36/43] add crsmatrix --- .../src/algorithms/ObservablesGPUMPI.hpp | 43 +----- .../src/bindings/Bindings.cpp | 12 ++ .../src/simulator/StateVectorCudaMPI.hpp | 39 +----- .../src/util/CSRMatrix.hpp | 131 ++++++++++++++++++ 4 files changed, 149 insertions(+), 76 deletions(-) create mode 100644 pennylane_lightning_gpu/src/util/CSRMatrix.hpp diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 150eb841..16a82f5f 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -22,16 +22,6 @@ namespace Pennylane::Algorithms { -inline size_t reverseBits(size_t num, size_t nbits){ - size_t reversed = 0; - for(size_t i = 0; i < nbits; ++i){ - //ith bit value - size_t bit = num & (1 << i); - reversed += bit << (nbits - i); - } - return reversed -} - /** * @brief A base class for all observable classes. 
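The send/receive pattern in scatterCSRMatrix above generalizes as follows — a minimal raw-MPI sketch with a single double buffer standing in for the three CSR arrays. The project's MPIManager wraps these calls; the guards mirror the code's convention of only sending non-empty blocks and only posting a receive when the local nnz count (scattered beforehand, which is also how the receiver knows what size to allocate) is nonzero:

```cpp
#include <mpi.h>
#include <vector>

// `blocks` is only populated on `root`; every receiver must have resized
// `local` to its expected length before calling this.
void scatter_blocks(const std::vector<std::vector<double>> &blocks,
                    std::vector<double> &local, int root, MPI_Comm comm) {
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if (rank == root && !blocks.empty())
        local = blocks[root]; // the root keeps its own block locally

    for (int k = 0; k < size; ++k) {
        if (k == root)
            continue;
        if (rank == root && !blocks[k].empty()) {
            MPI_Send(blocks[k].data(), static_cast<int>(blocks[k].size()),
                     MPI_DOUBLE, /*dest=*/k, /*tag=*/0, comm);
        } else if (rank == k && !local.empty()) {
            MPI_Recv(local.data(), static_cast<int>(local.size()), MPI_DOUBLE,
                     /*source=*/root, /*tag=*/0, comm, MPI_STATUS_IGNORE);
        }
    }
}
```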
* @@ -430,12 +420,11 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { int32_t, int64_t>::type; private: - MPIManager mpi_manager_; std::vector> data_; std::vector indices_; std::vector offsets_; + std::vector wires_; - [[nodiscard]] bool isEqual(const ObservableGPUMPI &other) const override { @@ -462,32 +451,10 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { */ template > - SparseHamiltonianGPUMPI(T1 &&arg1, T2 &&arg2, T3 &&arg3, T4 &&arg4):mpi_manager_(MPI_COMM_WORLD){ - if(mpi_manager_.getRank() == 0){ - std::> data_tmp = std::move(arg1); - std::vector indices_tmp = std::move(arg2); - std::vector offsets_tmp = std::move(arg3); - std::vector wires_tmp = std::move(arg4); - PL_ASSERT(data_tmp.size() == indices_tmp.size()); - - offsets_.push_back(0); - //sort sparseMV - for(size_t i = 0; i < offsets_tmp.size() - 1; i++){ - size_t target_row = reverseBits(i,wire_tmp.size()); - size_t offset = offsets_tmp[target_row]; - size_t local_col_size = offsets_tmp[target_row+1] - offset; - offset_.push_back(offsets_.back()+local_col_size); - //get unsorted column indices - std::vector local_col_indices(indices_.begin()+offset,indices_tmp.begin()+offset+local_col_size+1); - for(size_t j = 0; j < local_col_indices.size(); j++){ - size_t target_col = reverseBits(local_col_indices[i],wire_tmp.size()); - local_col_indices[i] = target_col; - } - //sort indices and get the rank of column index - - //sort data - } - } + SparseHamiltonianGPUMPI(T1 &&arg1, T2 &&arg2, T3 &&arg3, T4 &&arg4) + : data_{std::forward(arg1)}, indices_{std::forward(arg2)}, + offsets_{std::forward(arg3)}, wires_{std::forward(arg4)} { + PL_ASSERT(data_.size() == indices_.size()); } /** diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index bf55653f..d6674830 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -39,6 +39,7 @@ #ifdef ENABLE_MPI #include "AdjointDiffGPUMPI.hpp" +#include "CSRMatrix.hpp" #include "MPIManager.hpp" #include "StateVectorCudaMPI.hpp" #endif @@ -1608,13 +1609,24 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { const py::buffer_info buffer_offsets = offsets.request(); const auto *offsets_ptr = static_cast(buffer_offsets.ptr); + + MPIManager mpi_manager(MPI_COMM_WORLD); + CSRMatrix sparseMat(offsets.size() - 1, data.size(), indices_ptr, offsets_ptr, data_ptr); + auto sparseMatReorder = sparseMat.matrixReorder(); + + mpi_manager.Barrier(); + + return SparseHamiltonianGPUMPI{sparseMatReorder.getValues(), sparseMatReorder.getColumns(), sparseMatReorder.getCsrOffsets(), wires}; + + /* return SparseHamiltonianGPUMPI{ std::vector>( {data_ptr, data_ptr + data.size()}), std::vector({indices_ptr, indices_ptr + indices.size()}), std::vector({offsets_ptr, offsets_ptr + offsets.size()}), wires}; + */ })) .def("__repr__", &SparseHamiltonianGPUMPI::getObsName) .def("get_wires", &SparseHamiltonianGPUMPI::getWires, diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 20f58dd9..d3c8587f 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -27,6 +27,7 @@ #include #include // custatevecApplyMatrix +#include "CSRMatrix.hpp" #include "Constant.hpp" #include "Error.hpp" #include "MPIManager.hpp" @@ -64,44 +65,6 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex 
&value, const size_t index, bool async, cudaStream_t stream_id); -/** - * @brief Manage memory of Compressed Sparse Row (CSR) sparse matrix. CSR format - * represents a matrix M by three (one-dimensional) arrays, that respectively - * contain nonzero values, row offsets, and column indices. - * - * @tparam Precision Floating-point precision type. - * @tparam index_type Integer type. - */ -template class CSRMatrix { - private: - std::vector columns; - std::vector csrOffsets; - std::vector> values; - - public: - CSRMatrix(size_t num_rows, size_t nnz) - : columns(nnz, 0), csrOffsets(num_rows + 1, 0), values(nnz){}; - - CSRMatrix() = default; - - /** - * @brief Get the CSR format index vector of the matrix. - */ - auto getColumns() -> std::vector & { return columns; } - - /** - * @brief Get CSR format offset vector of the matrix. - */ - auto getCsrOffsets() -> std::vector & { return csrOffsets; } - - /** - * @brief Get CSR format data vector of the matrix. - */ - auto getValues() -> std::vector> & { - return values; - } -}; - /** * @brief Managed memory CUDA state-vector class using custateVec backed * gate-calls. diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp new file mode 100644 index 00000000..81b482cd --- /dev/null +++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp @@ -0,0 +1,131 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "MPIManager.hpp" + +/// @cond DEV +namespace { +using namespace Pennylane::CUDA; +} // namespace +/// @endcond +namespace Pennylane::MPI { + +inline size_t reverseBits(size_t num, size_t nbits) { + size_t reversed = 0; + for (size_t i = 0; i < nbits; ++i) { + // ith bit value + size_t bit = (num & (1 << i)) >> i; + reversed += bit << (nbits - i - 1); + } + return reversed; +} + +template +inline std::tuple, std::vector> +rankVector(const std::vector &input) { + // Create a copy of the input vector for sorting + std::vector sortedInput = input; + + // Sort the copy in ascending order + std::sort(sortedInput.begin(), sortedInput.end()); + + std::vector ranks(input.size()); + for (size_t i = 0; i < input.size(); ++i) { + auto it = + std::lower_bound(sortedInput.begin(), sortedInput.end(), input[i]); + ranks[i] = std::distance(sortedInput.begin(), it); + } + + return {sortedInput, ranks}; +} + +/** + * @brief Manage memory of Compressed Sparse Row (CSR) sparse matrix. CSR format + * represents a matrix M by three (one-dimensional) arrays, that respectively + * contain nonzero values, row offsets, and column indices. + * + * @tparam Precision Floating-point precision type. + * @tparam index_type Integer type. + */ +template class CSRMatrix { + private: + std::vector columns_; + std::vector csrOffsets_; + std::vector> values_; + + public: + CSRMatrix(size_t num_rows, size_t nnz) + : columns_(nnz, 0), csrOffsets_(num_rows + 1, 0), values_(nnz){}; + + CSRMatrix(size_t num_rows, size_t nnz, const index_type *column_ptr, + const index_type *csrOffsets_ptr, const std::complex *value_ptr) + : columns_(column_ptr, column_ptr + nnz), + csrOffsets_(csrOffsets_ptr, csrOffsets_ptr + num_rows + 1), + values_(value_ptr, value_ptr + nnz){}; + + CSRMatrix() = default; + + /** + * @brief Get the CSR format index vector of the matrix. + */ + auto getColumns() -> std::vector & { return columns_; } + + /** + * @brief Get CSR format offset vector of the matrix. + */ + auto getCsrOffsets() -> std::vector & { return csrOffsets_; } + + /** + * @brief Get CSR format data vector of the matrix. 
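The rankVector helper above returns the ascending-sorted copy of its input together with each input element's position in that order; matrixReorder uses those ranks to keep the column indices of each row sorted after the bit-reversal permutation while carrying the matrix values along. A self-contained usage sketch (the helper is reproduced equivalently so the example compiles on its own):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <tuple>
#include <vector>

// Same contract as rankVector in CSRMatrix.hpp: the sorted copy of `input`
// plus, for each input element, its position in that sorted order.
template <class T>
std::tuple<std::vector<T>, std::vector<std::size_t>>
rankVector(const std::vector<T> &input) {
    std::vector<T> sorted = input;
    std::sort(sorted.begin(), sorted.end());
    std::vector<std::size_t> ranks(input.size());
    for (std::size_t i = 0; i < input.size(); ++i)
        ranks[i] = std::distance(
            sorted.begin(),
            std::lower_bound(sorted.begin(), sorted.end(), input[i]));
    return {sorted, ranks};
}

int main() {
    // Bit-reversed column indices of one CSR row, before re-sorting:
    std::vector<std::size_t> cols = {6, 1, 4};
    auto [sorted, ranks] = rankVector(cols);
    assert((sorted == std::vector<std::size_t>{1, 4, 6}));
    assert((ranks == std::vector<std::size_t>{2, 0, 1})); // 6->2, 1->0, 4->1
    return 0;
}
```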
+ */ + auto getValues() -> std::vector> & { + return values_; + } + + auto matrixReorder() -> CSRMatrix { + size_t num_rows = this->getCsrOffsets().size() - 1; + size_t nnz = this->getColumns().size(); + size_t nbits = std::bit_width(num_rows) - 1; + CSRMatrix reorderedMatrix(num_rows, nnz); + + for (size_t row_idx = 0; row_idx < num_rows; row_idx++) { + size_t org_row_idx = reverseBits(row_idx, nbits); + + size_t org_offset = this->getCsrOffsets()[org_row_idx]; + size_t local_col_size = + this->getCsrOffsets()[org_row_idx + 1] - org_offset; + + reorderedMatrix.getCsrOffsets()[row_idx + 1] = + local_col_size + reorderedMatrix.getCsrOffsets()[row_idx]; + + // get unsorted column indices + std::vector local_col_indices( + this->getColumns().begin() + org_offset, + this->getColumns().begin() + org_offset + local_col_size); + for (size_t i = 0; i < local_col_indices.size(); i++) { + size_t col_idx = reverseBits(local_col_indices[i], nbits); + local_col_indices[i] = col_idx; + } + + auto [sorted_col_indices, ranks] = + rankVector(local_col_indices); + + for (size_t i = 0; i < sorted_col_indices.size(); i++) { + reorderedMatrix + .getColumns()[reorderedMatrix.getCsrOffsets()[row_idx] + + i] = sorted_col_indices[i]; + reorderedMatrix + .getValues()[reorderedMatrix.getCsrOffsets()[row_idx] + i] = + this->getValues()[org_offset + ranks[i]]; + + } + } + return reorderedMatrix; + } +}; +} // namespace Pennylane::MPI \ No newline at end of file From 45633b8b10b71f27e775e7e319927f869ae9cda0 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 23 Aug 2023 07:22:36 -0700 Subject: [PATCH 37/43] refactor binding --- .../src/bindings/Bindings.cpp | 39 ++++------------- .../src/simulator/StateVectorCudaMPI.hpp | 43 ++++++++++++++++--- .../src/util/CSRMatrix.hpp | 2 +- 3 files changed, 46 insertions(+), 38 deletions(-) diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index 33a6ece5..04a36034 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -1361,39 +1361,16 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { const np_arr_sparse_ind &columns, const np_arr_c values) { using index_type = typename std::conditional< std::is_same::value, int32_t, int64_t>::type; - //Distribute sparse matrix across multi-nodes/multi-gpus - size_t num_rows = - static_cast(csrOffsets.request().size) - 1; - size_t local_num_rows = size_t{1} << sv.getNumLocalQubits(); - size_t root = 0; - - MPIManager mpi_manager = sv.getMPIManager(); - - std::vector>> - csrmatrix_blocks; - - if (mpi_manager.getRank() == root) { - csrmatrix_blocks = splitCSRMatrix( - mpi_manager, num_rows, - static_cast(csrOffsets.request().ptr), - static_cast(columns.request().ptr), - static_cast *>( - values.request().ptr)); - } - mpi_manager.Barrier(); - - std::vector> - localCSRMatVector; - for (size_t i = 0; i < mpi_manager.getSize(); i++) { - auto localCSRMat = scatterCSRMatrix( - mpi_manager, csrmatrix_blocks[i], local_num_rows, root); - localCSRMatVector.push_back(localCSRMat); - } - - mpi_manager.Barrier(); return sv.template getExpectationValueOnSparseSpMV( - localCSRMatVector); + static_cast(csrOffsets.request().ptr), + static_cast( + csrOffsets.request() + .size), // num_rows + 1 or csrOffsets + static_cast(columns.request().ptr), // columns + static_cast *>( + values.request().ptr), + static_cast(values.request().size)); // nnz }, "Calculate the expectation value of a sparse Hamiltonian.") diff 
--git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index b1c52568..93971616 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -968,7 +968,16 @@ class StateVectorCudaMPI */ template auto getExpectationValueOnSparseSpMV( - std::vector> &CSRMatVector) { + const index_type *csrOffsets_ptr, const index_type csrOffsets_size, + const index_type *columns_ptr, + const std::complex *values_ptr, const index_type numNNZ) { + + if (mpi_manager_.getRank() == 0) { + PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == + (size_t{1} << this->getTotalNumQubits()), + "Incorrect size of CSR Offsets."); + PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix."); + } const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -998,6 +1007,30 @@ class StateVectorCudaMPI compute_type = CUSPARSE_INDEX_32I; } + // Distribute sparse matrix across multi-nodes/multi-gpus + size_t num_rows = size_t{1} << this->getTotalNumQubits(); + size_t local_num_rows = size_t{1} << this->getNumLocalQubits(); + size_t root = 0; + + std::vector>> + csrmatrix_blocks; + + if (mpi_manager_.getRank() == root) { + csrmatrix_blocks = splitCSRMatrix( + mpi_manager_, num_rows, csrOffsets_ptr, columns_ptr, + values_ptr); + } + mpi_manager_.Barrier(); + + std::vector> localCSRMatVector; + for (size_t i = 0; i < mpi_manager_.getSize(); i++) { + auto localCSRMat = scatterCSRMatrix( + mpi_manager_, csrmatrix_blocks[i], local_num_rows, root); + localCSRMatVector.push_back(localCSRMat); + } + + mpi_manager_.Barrier(); + const size_t length_local = size_t{1} << this->getNumLocalQubits(); DataBuffer d_res_per_block{length_local, device_id, @@ -1009,12 +1042,10 @@ class StateVectorCudaMPI for (size_t i = 0; i < mpi_manager_.getSize(); i++) { // Need to investigate if non-blocking MPI operation can improve // performace here. 
- auto &localCSRMatrix = CSRMatVector[i]; + auto &localCSRMatrix = localCSRMatVector[i]; - int64_t num_rows_local = - static_cast(localCSRMatrix.getCsrOffsets().size() - 1); - int64_t num_cols_local = - static_cast(localCSRMatrix.getColumns().size()); + int64_t num_rows_local = local_num_rows; + int64_t num_cols_local = num_rows_local; int64_t nnz_local = static_cast(localCSRMatrix.getValues().size()); diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp index 7a87673a..31a64be5 100644 --- a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp +++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp @@ -33,7 +33,7 @@ template class CSRMatrix { CSRMatrix(size_t num_rows, size_t nnz, index_type *column_ptr, index_type *csrOffsets_ptr, std::complex *value_ptr) - : columns_(column_ptr, column_ptr + num_rows), + : columns_(column_ptr, column_ptr + nnz), csrOffsets_(csrOffsets_ptr, csrOffsets_ptr + num_rows + 1), values_(value_ptr, value_ptr + nnz){}; From 77ecd93b3b3a7a452c7646c4e4e041d799063f1c Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 23 Aug 2023 11:14:34 -0700 Subject: [PATCH 38/43] init zeros for tmp for empty spmat --- .github/CHANGELOG.md | 11 +-- mpitests/test_adjoint_jacobian.py | 32 ++------ pennylane_lightning_gpu/_serialize.py | 55 +++++-------- pennylane_lightning_gpu/lightning_gpu.py | 2 +- .../src/algorithms/ObservablesGPUMPI.hpp | 77 ++++++++++++------- .../src/bindings/Bindings.cpp | 12 --- .../src/simulator/StateVectorCudaMPI.hpp | 11 ++- .../src/util/CSRMatrix.hpp | 15 ++++ 8 files changed, 100 insertions(+), 115 deletions(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index ebb60298..dee589e5 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -4,16 +4,9 @@ * Add sparse Hamiltonian support to multi-node/multi-GPU adjoint methods. [(#128)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/128) - - Note each MPI process will return the overall result of the adjoint method. If there is a global target wire when constructing the sparse Hamiltonian, `wires` should be reordered in a descending manner in a python script as follows: - ```python - qml.SparseHamiltonian( - qml.Hamiltonian( - [0.1], [qml.PauliX(0) @ qml.PauliY(1)] - ).sparse_matrix(wires.[::-1]), - ) + Note each MPI process will return the overall result of the adjoint method. ``` - + * Add Sparse Hamiltonian support for expectation value calculation. [(#127)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/127) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 97ee4ec2..a4dcb197 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1046,23 +1046,13 @@ def circuit(params): j_gpu = qml.jacobian(qnode_gpu)(params) -""" Important Note: Ensure wires index arranged from high to low if - there is a global target wire. 
- qml.SparseHamiltonian( - qml.Hamiltonian( - [0.1], [qml.PauliX(wires=custom_wires[1]) @ qml.PauliY(wires=custom_wires[0])] - ).sparse_matrix(custom_wires.[::-1]), - ) -""" - - @pytest.mark.parametrize( "returns", [ qml.SparseHamiltonian( qml.Hamiltonian( [0.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliY(wires=custom_wires[1])] - ).sparse_matrix(custom_wires[::-1]), + ).sparse_matrix(custom_wires), wires=custom_wires, ), qml.SparseHamiltonian( @@ -1080,7 +1070,7 @@ def circuit(params): qml.SparseHamiltonian( qml.Hamiltonian( [1.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[2])] - ).sparse_matrix(custom_wires[::-1]), + ).sparse_matrix(custom_wires), wires=custom_wires, ), ], @@ -1115,18 +1105,6 @@ def circuit(params): assert np.allclose(j_cpu, j_gpu) -""" Important Note: Ensure wires index arranged from high to low if - there is a global target wire. - qml.SparseHamiltonian( - qml.Hamiltonian( - [0.1], - [qml.PauliX(0) @ qml.PauliZ(1)], - ).sparse_matrix(range(5, -1, -1)), - wires=range(6), - ) -""" - - @pytest.mark.parametrize( "returns", [ @@ -1134,7 +1112,7 @@ def circuit(params): qml.Hamiltonian( [0.1], [qml.PauliZ(1) @ qml.PauliX(0) @ qml.Identity(2) @ qml.PauliX(4) @ qml.Identity(5)], - ).sparse_matrix(range(5, -1, -1)), + ).sparse_matrix(range(6)), wires=range(6), ), qml.SparseHamiltonian( @@ -1148,7 +1126,7 @@ def circuit(params): qml.Hamiltonian( [0.1], [qml.PauliX(0)], - ).sparse_matrix(range(5, -1, -1)), + ).sparse_matrix(range(6)), wires=range(6), ), qml.SparseHamiltonian( @@ -1162,7 +1140,7 @@ def circuit(params): qml.Hamiltonian( [0.1], [qml.PauliX(0) @ qml.PauliZ(1)], - ).sparse_matrix(range(5, -1, -1)), + ).sparse_matrix(range(6)), wires=range(6), ), qml.SparseHamiltonian( diff --git a/pennylane_lightning_gpu/_serialize.py b/pennylane_lightning_gpu/_serialize.py index de108949..af96e082 100644 --- a/pennylane_lightning_gpu/_serialize.py +++ b/pennylane_lightning_gpu/_serialize.py @@ -122,14 +122,14 @@ def _hamiltonian_ob_dtype(use_csingle, use_mpi: bool): def _sparsehamiltonian_ob_dtype(use_csingle, use_mpi: bool): if not use_mpi: return ( - [SparseHamiltonianGPU_C64, np.float32, np.int32] + [SparseHamiltonianGPU_C64, np.complex64, np.int32] if use_csingle - else [SparseHamiltonianGPU_C128, np.float64, np.int64] + else [SparseHamiltonianGPU_C128, np.complex128, np.int64] ) return ( - [SparseHamiltonianGPUMPI_C64, np.float32, np.int32] + [SparseHamiltonianGPUMPI_C64, np.complex64, np.int32] if use_csingle - else [SparseHamiltonianGPUMPI_C128, np.float64, np.int64] + else [SparseHamiltonianGPUMPI_C128, np.complex128, np.int64] ) @@ -169,40 +169,25 @@ def _serialize_hamiltonian( def _serialize_sparsehamiltonian(ob, wires_map: dict, use_csingle: bool, use_mpi: bool): sparsehamiltonian_obs, ctype, rtype = _sparsehamiltonian_ob_dtype(use_csingle, use_mpi) - + wires = [] + wires_list = ob.wires.tolist() + wires.extend([wires_map[w] for w in wires_list]) if use_mpi: mpi_manager_local = MPIManager() + # Other root only needs non-null some sparsematrix data to pass + obs = qml.Identity(0) + Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix() + H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1)) + spm = H_sparse.sparse_matrix() # Only root 0 needs the overall sparsematrix data if mpi_manager_local.getRank() == 0: spm = ob.sparse_matrix() - data = np.array(spm.data).astype(ctype) - indices = np.array(spm.indices).astype(rtype) - offsets = np.array(spm.indptr).astype(rtype) - wires = [] - wires_list = ob.wires.tolist() - wires.extend([wires_map[w] for w 
in wires_list]) - else: - # Other root only needs non-null some sparsematrix data to pass - obs = qml.Identity(0) - Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix() - H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1)) - spm = H_sparse.sparse_matrix() - - data = np.array(spm.data).astype(ctype) - indices = np.array(spm.indices).astype(rtype) - offsets = np.array(spm.indptr).astype(rtype) - - wires = [] - wires_list = ob.wires.tolist() - wires.extend([wires_map[w] for w in wires_list]) + mpi_manager_local.Barrier() else: spm = ob.sparse_matrix() - data = np.array(spm.data).astype(ctype) - indices = np.array(spm.indices).astype(rtype) - offsets = np.array(spm.indptr).astype(rtype) - wires = [] - wires_list = ob.wires.tolist() - wires.extend([wires_map[w] for w in wires_list]) + data = np.array(spm.data).astype(ctype) + indices = np.array(spm.indices).astype(rtype) + offsets = np.array(spm.indptr).astype(rtype) return sparsehamiltonian_obs(data, indices, offsets, wires) @@ -272,9 +257,10 @@ def _serialize_observables( tape (QuantumTape): the input quantum tape wires_map (dict): a dictionary mapping input wires to the device's backend wires use_csingle (bool): whether to use np.complex64 instead of np.complex128 + use_mpi (bool): whether MPI is used or not Returns: - list(ObservableGPU_C64 or ObservableGPU_C128): A list of observable objects compatible with the C++ backend + list(ObservableGPU_C64/128 or ObservableGPUMPI_C64/128): A list of observable objects compatible with the C++ backend """ output = [] offsets = [0] @@ -291,7 +277,7 @@ def _serialize_observables( def _serialize_ops( - tape: QuantumTape, wires_map: dict, use_csingle: bool = False + tape: QuantumTape, wires_map: dict, use_csingle: bool = False, use_mpi: bool = False ) -> Tuple[List[List[str]], List[np.ndarray], List[List[int]], List[bool], List[np.ndarray]]: """Serializes the operations of an input tape. 
@@ -301,6 +287,7 @@ def _serialize_ops( tape (QuantumTape): the input quantum tape wires_map (dict): a dictionary mapping input wires to the device's backend wires use_csingle (bool): whether to use np.complex64 instead of np.complex128 + use_mpi (bool): whether MPI is used or not Returns: Tuple[list, list, list, list, list]: A serialization of the operations, containing a list @@ -315,7 +302,7 @@ def _serialize_ops( uses_stateprep = False - sv_py = LightningGPU_C64 if use_csingle else LightningGPU_C128 + sv_py = _sv_py_dtype(use_csingle, use_mpi) for o in tape.operations: if isinstance(o, (BasisState, QubitStateVector)): diff --git a/pennylane_lightning_gpu/lightning_gpu.py b/pennylane_lightning_gpu/lightning_gpu.py index f9b2aaad..86825c8d 100644 --- a/pennylane_lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning_gpu/lightning_gpu.py @@ -666,7 +666,7 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False, ** ) ops_serialized, use_sp = _serialize_ops( - tape, self.wire_map, use_csingle=self.use_csingle + tape, self.wire_map, use_csingle=self.use_csingle, use_mpi=self._mpi ) ops_serialized = adj.create_ops_list(*ops_serialized) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 16a82f5f..d99431ba 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -17,6 +17,7 @@ #include #include +#include "CSRMatrix.hpp" #include "MPIManager.hpp" #include "StateVectorCudaMPI.hpp" @@ -493,6 +494,28 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { "SparseH wire count does not match state-vector size"); } + // Distribute sparse matrix across multi-nodes/multi-gpus + size_t num_rows = size_t{1} << sv.getTotalNumQubits(); + size_t local_num_rows = size_t{1} << sv.getNumLocalQubits(); + + std::vector>> csrmatrix_blocks; + + if (mpi_manager.getRank() == 0) { + csrmatrix_blocks = splitCSRMatrix( + mpi_manager, num_rows, offsets_.data(), indices_.data(), + data_.data()); + } + mpi_manager.Barrier(); + + std::vector> localCSRMatVector; + for (size_t i = 0; i < mpi_manager.getSize(); i++) { + auto localCSRMat = scatterCSRMatrix( + mpi_manager, csrmatrix_blocks[i], local_num_rows, 0); + localCSRMatVector.push_back(localCSRMat); + } + + mpi_manager.Barrier(); + using CFP_t = typename StateVectorCudaMPI::CFP_t; const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -521,38 +544,24 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { compute_type = CUSPARSE_INDEX_32I; } - std::vector>> csrmatrix_blocks; - - const size_t num_col_blocks = mpi_manager.getSize(); - const size_t num_row_blocks = mpi_manager.getSize(); - - if (mpi_manager.getRank() == 0) { - csrmatrix_blocks = sv.template splitCSRMatrix( - num_col_blocks, num_row_blocks, offsets_.data(), - indices_.data(), data_.data()); - } - const size_t length_local = size_t{1} << sv.getNumLocalQubits(); - DevTag dt_local(sv.getDataBuffer().getDevTag()); - dt_local.refresh(); - - StateVectorCudaMPI d_sv_prime( - dt_local, sv.getNumGlobalQubits(), sv.getNumLocalQubits(), - sv.getData()); + std::unique_ptr> d_sv_prime = + std::make_unique>(length_local, device_id, + stream_id, true); + std::unique_ptr> d_tmp = + std::make_unique>(length_local, device_id, + stream_id, true); + d_sv_prime->zeroInit(); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager.Barrier(); - StateVectorCudaMPI d_tmp(dt_local, 
sv.getNumGlobalQubits(), - sv.getNumLocalQubits(), - sv.getData()); - - for (size_t i = 0; i < num_row_blocks; i++) { - auto localCSRMatrix = - sv.template scatterCSRMatrix(csrmatrix_blocks[i], 0); + for (size_t i = 0; i < mpi_manager.getSize(); i++) { + auto &localCSRMatrix = localCSRMatVector[i]; int64_t num_rows_local = static_cast(localCSRMatrix.getCsrOffsets().size() - 1); - int64_t num_cols_local = - static_cast(localCSRMatrix.getColumns().size()); + int64_t num_cols_local = num_rows_local; int64_t nnz_local = static_cast(localCSRMatrix.getValues().size()); @@ -611,7 +620,7 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( /* cusparseDnVecDescr_t* */ &vecY, /* int64_t */ num_rows_local, - /* void* */ d_tmp.getData(), + /* void* */ d_tmp->getData(), /* cudaDataType */ data_type)); // allocate an external buffer if needed @@ -650,10 +659,17 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { color = 1; } + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager.Barrier(); if (mpi_manager.getRank() == i) { color = 1; + if (localCSRMatrix.getValues().size() == 0) { + d_tmp->zeroInit(); + } } + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager.Barrier(); auto new_mpi_manager = mpi_manager.split(color, mpi_manager.getRank()); @@ -667,11 +683,14 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { if (new_mpi_manager.getComm() != MPI_COMM_NULL) { new_mpi_manager.template Reduce( - d_tmp.getData(), d_sv_prime.getData(), length_local, + d_tmp->getData(), d_sv_prime->getData(), length_local, reduce_root_rank, "sum"); } + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager.Barrier(); } - sv.CopyGpuDataToGpuIn(d_sv_prime.getData(), d_sv_prime.getLength()); + sv.CopyGpuDataToGpuIn(d_sv_prime->getData(), d_sv_prime->getLength()); + mpi_manager.Barrier(); } [[nodiscard]] auto getObsName() const -> std::string override { diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index e12ff8cd..907cf0c6 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -1361,7 +1361,6 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { const np_arr_sparse_ind &columns, const np_arr_c values) { using index_type = typename std::conditional< std::is_same::value, int32_t, int64_t>::type; - return sv.template getExpectationValueOnSparseSpMV( static_cast(csrOffsets.request().ptr), static_cast( @@ -1610,24 +1609,13 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { const py::buffer_info buffer_offsets = offsets.request(); const auto *offsets_ptr = static_cast(buffer_offsets.ptr); - - MPIManager mpi_manager(MPI_COMM_WORLD); - - CSRMatrix sparseMat(offsets.size() - 1, data.size(), indices_ptr, offsets_ptr, data_ptr); - auto sparseMatReorder = sparseMat.matrixReorder(); - - mpi_manager.Barrier(); - - return SparseHamiltonianGPUMPI{sparseMatReorder.getValues(), sparseMatReorder.getColumns(), sparseMatReorder.getCsrOffsets(), wires}; - /* return SparseHamiltonianGPUMPI{ std::vector>( {data_ptr, data_ptr + data.size()}), std::vector({indices_ptr, indices_ptr + indices.size()}), std::vector({offsets_ptr, offsets_ptr + offsets.size()}), wires}; - */ })) .def("__repr__", &SparseHamiltonianGPUMPI::getObsName) .def("get_wires", &SparseHamiltonianGPUMPI::getWires, diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp 
b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 93971616..2143557e 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -1010,12 +1010,11 @@ class StateVectorCudaMPI // Distribute sparse matrix across multi-nodes/multi-gpus size_t num_rows = size_t{1} << this->getTotalNumQubits(); size_t local_num_rows = size_t{1} << this->getNumLocalQubits(); - size_t root = 0; std::vector>> csrmatrix_blocks; - if (mpi_manager_.getRank() == root) { + if (mpi_manager_.getRank() == 0) { csrmatrix_blocks = splitCSRMatrix( mpi_manager_, num_rows, csrOffsets_ptr, columns_ptr, values_ptr); @@ -1025,7 +1024,7 @@ class StateVectorCudaMPI std::vector> localCSRMatVector; for (size_t i = 0; i < mpi_manager_.getSize(); i++) { auto localCSRMat = scatterCSRMatrix( - mpi_manager_, csrmatrix_blocks[i], local_num_rows, root); + mpi_manager_, csrmatrix_blocks[i], local_num_rows, 0); localCSRMatVector.push_back(localCSRMat); } @@ -1151,8 +1150,14 @@ class StateVectorCudaMPI if (mpi_manager_.getRank() == i) { color = 1; + if (localCSRMatrix.getValues().size() == 0) { + d_res_per_block.zeroInit(); + } } + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); + auto new_mpi_manager = mpi_manager_.split(color, mpi_manager_.getRank()); int reduce_root_rank = -1; diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp index 31a64be5..59856d46 100644 --- a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp +++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp @@ -57,6 +57,21 @@ template class CSRMatrix { } }; +/** + * @brief Convert a global CSR (Compressed Sparse Row) format matrix into + * local blocks. This operation should be conducted on the rank 0. + * + * @tparam index_type Integer type used as indices of the sparse matrix. + * @param num_row_blocks Number of local blocks per global row. + * @param num_col_blocks Number of local blocks per global column. + * @param csrOffsets_ptr Pointer to the array of row offsets of the sparse + * matrix. Array of size csrOffsets_size. + * @param columns_ptr Pointer to the array of column indices of the sparse + * matrix. Array of size numNNZ + * @param values_ptr Pointer to the array of the non-zero elements + * + * @return auto A vector of vector of CSRMatrix. + */ template auto splitCSRMatrix(MPIManager &mpi_manager, const size_t &num_rows, const index_type *csrOffsets_ptr, From c2c8e5d348c9e5d6a1db31c18d3c0f7ccdba0767 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 23 Aug 2023 11:19:45 -0700 Subject: [PATCH 39/43] update changelog and merge main --- .github/CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index d685a518..90cfa934 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -5,7 +5,6 @@ * Add sparse Hamiltonian support to multi-node/multi-GPU adjoint methods. [(#128)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/128) Note each MPI process will return the overall result of the adjoint method. - ``` * Add Sparse Hamiltonian support for expectation value calculation. 
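The zero-initialisation added above matters because the rank that owns row block i always joins the reduction, even when its own local block is empty and no SpMV ran to overwrite its buffer. A raw-MPI sketch of the per-row-block reduction pattern, simplified in that the split key pins the owner to sub-rank 0 — the actual code computes reduce_root_rank explicitly, and MPIManager wraps the MPI calls:

```cpp
#include <mpi.h>

// Every rank holding a partial product for row block `owner` joins a
// sub-communicator; the split key makes the owner sub-rank 0, and the
// partial vectors are summed onto it.
void reduce_row_block(const double *partial, double *result, int len,
                      int owner, int have_data, MPI_Comm comm) {
    int rank;
    MPI_Comm_rank(comm, &rank);
    int participate = (have_data || rank == owner);
    int color = participate ? 1 : MPI_UNDEFINED; // others get MPI_COMM_NULL
    int key = (rank == owner) ? 0 : rank + 1;    // owner becomes sub-rank 0
    MPI_Comm sub;
    MPI_Comm_split(comm, color, key, &sub);
    if (sub != MPI_COMM_NULL) {
        MPI_Reduce(partial, result, len, MPI_DOUBLE, MPI_SUM, /*root=*/0, sub);
        MPI_Comm_free(&sub);
    }
}
```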
[(#127)]
(https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/127)

From 877760ac0d6dcf8df2ac02c372f579bd211fa1dc Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 23 Aug 2023 12:14:31 -0700
Subject: [PATCH 40/43] add unit tests for CSRMatrix related methods

---
 .../src/tests/mpi/Test_MPIManager.cpp         | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/pennylane_lightning_gpu/src/tests/mpi/Test_MPIManager.cpp b/pennylane_lightning_gpu/src/tests/mpi/Test_MPIManager.cpp
index 4c75e1dd..613487f1 100644
--- a/pennylane_lightning_gpu/src/tests/mpi/Test_MPIManager.cpp
+++ b/pennylane_lightning_gpu/src/tests/mpi/Test_MPIManager.cpp
@@ -8,6 +8,7 @@

 #include 

+#include "CSRMatrix.hpp"
 #include "MPIManager.hpp"

 #include "../TestHelpersLGPU.hpp"
@@ -217,4 +218,112 @@ TEST_CASE("MPIManager::split") {
     int key = rank;
     auto newComm = mpi_manager.split(color, key);
     CHECK(newComm.getSize() * 2 == mpi_manager.getSize());
+}
+
+TEMPLATE_TEST_CASE("CSRMatrix::Split", "[CSRMatrix]", float, double) {
+    using PrecisionT = TestType;
+    using cp_t = std::complex;
+    using index_type =
+        typename std::conditional::value, int32_t,
+                                  int64_t>::type;
+
+    MPIManager mpi_manager(MPI_COMM_WORLD);
+
+    int rank = mpi_manager.getRank();
+    int size = mpi_manager.getSize();
+
+    index_type csrOffsets[9] = {0, 2, 4, 6, 8, 10, 12, 14, 16};
+    index_type columns[16] = {0, 3, 1, 2, 1, 2, 0, 3, 4, 7, 5, 6, 5, 6, 4, 7};
+
+    cp_t values[16] = {{1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0},
+                       {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0},
+                       {1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0},
+                       {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0}};
+
+    index_type num_csrOffsets = 9;
+    index_type num_rows = num_csrOffsets - 1;
+
+    SECTION("Apply split") {
+        if (rank == 0) {
+            auto CSRMatVector = splitCSRMatrix(
+                mpi_manager, num_rows, csrOffsets, columns, values);
+
+            std::vector localcsrOffsets = {0, 2, 4, 6, 8};
+            std::vector local_indices = {0, 3, 1, 2, 1, 2, 0, 3};
+
+            for (size_t i = 0; i < localcsrOffsets.size(); i++) {
+                CHECK(CSRMatVector[0][0].getCsrOffsets()[i] ==
+                      localcsrOffsets[i]);
+                CHECK(CSRMatVector[1][1].getCsrOffsets()[i] ==
+                      localcsrOffsets[i]);
+            }
+
+            for (size_t i = 0; i < local_indices.size(); i++) {
+                CHECK(CSRMatVector[0][0].getColumns()[i] == local_indices[i]);
+                CHECK(CSRMatVector[1][1].getColumns()[i] == local_indices[i]);
+            }
+
+            for (size_t i = 0; i < 8; i++) {
+                CHECK(CSRMatVector[0][0].getValues()[i] == values[i]);
+                CHECK(CSRMatVector[1][1].getValues()[i] == values[i + 8]);
+            }
+
+            CHECK(CSRMatVector[0][1].getValues().size() == 0);
+            CHECK(CSRMatVector[1][0].getValues().size() == 0);
+        }
+    }
+
+    SECTION("Apply SparseMatrix scatter") {
+        std::vector>>
+            csrmatrix_blocks;
+
+        if (rank == 0) {
+            csrmatrix_blocks = splitCSRMatrix(
+                mpi_manager, num_rows, csrOffsets, columns, values);
+        }
+
+        size_t local_num_rows = num_rows / size;
+
+        std::vector> localCSRMatVector;
+        for (size_t i = 0; i < mpi_manager.getSize(); i++) {
+            auto localCSRMat = scatterCSRMatrix(
+                mpi_manager, csrmatrix_blocks[i], local_num_rows, 0);
+            localCSRMatVector.push_back(localCSRMat);
+        }
+
+        std::vector localcsrOffsets = {0, 2, 4, 6, 8};
+        std::vector local_indices = {0, 3, 1, 2, 1, 2, 0, 3};
+
+        if (rank == 0) {
+            for (size_t i = 0; i < localcsrOffsets.size(); i++) {
+                CHECK(localCSRMatVector[0].getCsrOffsets()[i] ==
+                      localcsrOffsets[i]);
+            }
+
+            for (size_t i = 0; i < local_indices.size(); i++) {
+                CHECK(localCSRMatVector[0].getColumns()[i] == local_indices[i]);
+            }
+
+            for (size_t i = 0;
i < 8; i++) { + CHECK(localCSRMatVector[0].getValues()[i] == values[i]); + } + + CHECK(localCSRMatVector[1].getValues().size() == 0); + } else { + for (size_t i = 0; i < localcsrOffsets.size(); i++) { + CHECK(localCSRMatVector[1].getCsrOffsets()[i] == + localcsrOffsets[i]); + } + + for (size_t i = 0; i < local_indices.size(); i++) { + CHECK(localCSRMatVector[1].getColumns()[i] == local_indices[i]); + } + + for (size_t i = 0; i < 8; i++) { + CHECK(localCSRMatVector[1].getValues()[i] == values[i + 8]); + } + + CHECK(localCSRMatVector[0].getValues().size() == 0); + } + } } \ No newline at end of file From 6a8cce0a3bc7dd5a2a7cd22f5aa7377afb6761d5 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 23 Aug 2023 19:11:08 -0700 Subject: [PATCH 41/43] add docstring --- .../src/algorithms/ObservablesGPUMPI.hpp | 5 ++--- .../src/simulator/StateVectorCudaMPI.hpp | 1 - pennylane_lightning_gpu/src/util/CSRMatrix.hpp | 11 +++++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index d99431ba..fdaefc15 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -364,9 +364,8 @@ class HamiltonianGPUMPI final : public ObservableGPUMPI { for (size_t term_idx = 0; term_idx < coeffs_.size(); term_idx++) { DevTag dt_local(sv.getDataBuffer().getDevTag()); dt_local.refresh(); - StateVectorCudaMPI tmp( - dt_local, sv.getNumGlobalQubits(), sv.getNumLocalQubits(), - sv.getData()); + StateVectorCudaMPI tmp(dt_local, sv.getNumGlobalQubits(), + sv.getNumLocalQubits(), sv.getData()); obs_[term_idx]->applyInPlace(tmp); scaleAndAddC_CUDA(std::complex{coeffs_[term_idx], 0.0}, tmp.getData(), buffer.getData(), tmp.getLength(), diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 4de7efe1..d8efa764 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -959,7 +959,6 @@ class StateVectorCudaMPI const index_type *csrOffsets_ptr, const index_type csrOffsets_size, const index_type *columns_ptr, const std::complex *values_ptr, const index_type numNNZ) { - if (mpi_manager_.getRank() == 0) { PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == (size_t{1} << this->getTotalNumQubits()), diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp index 59856d46..78a60ddf 100644 --- a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp +++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp @@ -61,9 +61,10 @@ template class CSRMatrix { * @brief Convert a global CSR (Compressed Sparse Row) format matrix into * local blocks. This operation should be conducted on the rank 0. * + * @tparam Precision Floating-point precision type. * @tparam index_type Integer type used as indices of the sparse matrix. - * @param num_row_blocks Number of local blocks per global row. - * @param num_col_blocks Number of local blocks per global column. + * @param mpi_manager MPIManager object. + * @param num_rows Number of rows of the CSR matrix. * @param csrOffsets_ptr Pointer to the array of row offsets of the sparse * matrix. Array of size csrOffsets_size. 
 * @param columns_ptr Pointer to the array of column indices of the sparse
@@ -147,8 +148,11 @@ auto splitCSRMatrix(MPIManager &mpi_manager, const size_t &num_rows,
 /**
  * @brief Scatter a CSR (Compressed Sparse Row) format matrix.
  *
+ * @tparam Precision Floating-point precision type.
  * @tparam index_type Integer type used as indices of the sparse matrix.
- * @param matrix CSR (Compressed Sparse Row) format matrix.
+ * @param mpi_manager MPIManager object.
+ * @param matrix CSR (Compressed Sparse Row) format matrix vector.
+ * @param local_num_rows Number of rows of local CSR matrix.
  * @param root Root rank of the scatter operation.
  */
 template <class Precision, class index_type>
@@ -156,7 +160,6 @@ auto scatterCSRMatrix(MPIManager &mpi_manager,
                       std::vector<CSRMatrix<Precision, index_type>> &matrix,
                       size_t local_num_rows, size_t root)
     -> CSRMatrix<Precision, index_type> {
-    // Bcast num_rows and num_cols
     size_t num_col_blocks = mpi_manager.getSize();
     std::vector<size_t> nnzs;

From d1f14558afaf3b91ba9cf0f5f38f6604d96d8cac Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 24 Aug 2023 06:20:56 -0700
Subject: [PATCH 42/43] add license header

---
 pennylane_lightning_gpu/src/util/CSRMatrix.hpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp
index 78a60ddf..5da81235 100644
--- a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp
+++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp
@@ -1,3 +1,17 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once
 
 #include

From d75c26800cd9ac80bcf53e2b691bcc3caf33c8e5 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 24 Aug 2023 06:47:36 -0700
Subject: [PATCH 43/43] tidy up code

---
 pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
index fdaefc15..67df085f 100644
--- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
+++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
@@ -522,7 +522,6 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI<T> {
         auto device_id = sv.getDataBuffer().getDevTag().getDeviceID();
         auto stream_id = sv.getDataBuffer().getDevTag().getStreamID();
 
-        // clang-format on
         cudaDataType_t data_type;
         cusparseIndexType_t compute_type;
        const cusparseOperation_t operation_type =
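
A note on the block decomposition exercised by the CSRMatrix::Split test above: the splitting pass walks every non-zero of the global CSR matrix once, assigns it to a (block row, block column) tile by integer division of its global indices, counts non-zeros per local row, and finally prefix-sums those counts into per-tile CSR offsets. The standalone sketch below illustrates that logic on the same 8x8 sparsity pattern used by the test (values simplified to 1). It is an editorial illustration only; the names Block and splitIntoBlocks are hypothetical and are not the library API.

#include <complex>
#include <cstddef>
#include <iostream>
#include <vector>

// One CSR tile of the decomposed matrix.
struct Block {
    std::vector<long> csrOffsets, columns;
    std::vector<std::complex<double>> values;
};

// Split a (num_rows x num_rows) CSR matrix into num_blocks^2 square tiles.
std::vector<std::vector<Block>>
splitIntoBlocks(std::size_t num_rows, std::size_t num_blocks,
                const std::vector<long> &offsets, const std::vector<long> &cols,
                const std::vector<std::complex<double>> &vals) {
    const std::size_t block_size = num_rows / num_blocks;
    std::vector<std::vector<Block>> blocks(num_blocks,
                                           std::vector<Block>(num_blocks));
    for (std::size_t row = 0; row < num_rows; row++) {
        for (long idx = offsets[row]; idx < offsets[row + 1]; idx++) {
            Block &b = blocks[row / block_size][cols[idx] / block_size];
            if (b.csrOffsets.empty()) // allocate offsets lazily, so tiles
                                      // with no non-zeros stay empty
                b.csrOffsets.assign(block_size + 1, 0);
            b.csrOffsets[row % block_size + 1]++;        // per-row nnz count
            b.columns.push_back(cols[idx] % block_size); // local column index
            b.values.push_back(vals[idx]);
        }
    }
    // Prefix-sum each tile's row counts into proper CSR offsets.
    for (auto &block_row : blocks)
        for (auto &b : block_row)
            for (std::size_t i = 1; i < b.csrOffsets.size(); i++)
                b.csrOffsets[i] += b.csrOffsets[i - 1];
    return blocks;
}

int main() {
    // Sparsity pattern of the 8x8 block-diagonal matrix from the test.
    std::vector<long> offsets{0, 2, 4, 6, 8, 10, 12, 14, 16};
    std::vector<long> cols{0, 3, 1, 2, 1, 2, 0, 3, 4, 7, 5, 6, 5, 6, 4, 7};
    std::vector<std::complex<double>> vals(16, {1.0, 0.0});

    auto blocks = splitIntoBlocks(8, 2, offsets, cols, vals);

    // Diagonal tiles hold all non-zeros; off-diagonal tiles are empty,
    // matching the CHECKs on CSRMatVector[0][1] and CSRMatVector[1][0].
    std::cout << blocks[0][0].values.size() << " "   // 8
              << blocks[0][1].values.size() << " "   // 0
              << blocks[1][0].values.size() << " "   // 0
              << blocks[1][1].values.size() << "\n"; // 8
    return 0;
}

Under this decomposition, scattering block row i gives rank k the (i, k) tile, so with two ranks the root ends up with the populated (0, 0) tile and an empty (1, 0), while rank 1 holds an empty (0, 1) and the populated (1, 1) — exactly the rank-dependent expectations encoded in the scatter section of the test.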