From c3c50aa2e6936d943b9b86bbfad632ce502abddf Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 6 Jul 2023 18:34:55 -0700 Subject: [PATCH 01/43] add cpp layer --- .../src/simulator/StateVectorCudaMPI.hpp | 351 ++++++++++++++++++ .../mpi/Test_StateVectorCudaMPI_NonParam.cpp | 56 +++ .../src/util/DataBuffer.hpp | 4 +- .../src/util/MPIManager.hpp | 44 +++ 4 files changed, 453 insertions(+), 2 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 53f8e9db..97986028 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -64,6 +64,20 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, const size_t index, bool async, cudaStream_t stream_id); +template struct CSRMatrix { + std::vector> values; + std::vector columns; + std::vector csrOffsets; + + CSRMatrix(size_t num_rows, size_t nnz) { + values = std::vector>(nnz); + columns = std::vector(nnz, 0); + csrOffsets = std::vector(num_rows + 1, 0); + } + + CSRMatrix(){}; +}; + /** * @brief Managed memory CUDA state-vector class using custateVec backed * gate-calls. @@ -941,6 +955,343 @@ class StateVectorCudaMPI return expect_val; } + template + auto scatterCSRMatrix(std::vector> &matrix, + size_t root) -> CSRMatrix { + // Bcast num_rows and num_cols + size_t local_num_rows; + size_t num_col_blocks; + std::vector nnzs; + + local_num_rows = size_t{1} << this->getNumLocalQubits(); + num_col_blocks = mpi_manager_.getSize(); + + if (mpi_manager_.getRank() == root) { + for (size_t j = 0; j < matrix.size(); j++) { + nnzs.push_back(matrix[j].values.size()); + } + } + + size_t local_nnz = mpi_manager_.scatter(nnzs, 0)[0]; + + CSRMatrix localCSRMatrix(local_num_rows, + local_nnz); + + if (mpi_manager_.getRank() == root) { + localCSRMatrix.values = matrix[0].values; + localCSRMatrix.csrOffsets = matrix[0].csrOffsets; + localCSRMatrix.columns = matrix[0].columns; + } + + for (size_t k = 1; k < num_col_blocks; k++) { + size_t dest = k; + size_t source = 0; + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + mpi_manager_.Send>(matrix[k].values, + dest); + } else if (mpi_manager_.getRank() == k && local_nnz) { + mpi_manager_.Recv>( + localCSRMatrix.values, source); + } + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + mpi_manager_.Send(matrix[k].csrOffsets, dest); + } else if (mpi_manager_.getRank() == k && local_nnz) { + mpi_manager_.Recv(localCSRMatrix.csrOffsets, + source); + } + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + mpi_manager_.Send(matrix[k].columns, dest); + } else if (mpi_manager_.getRank() == k && local_nnz) { + mpi_manager_.Recv(localCSRMatrix.columns, source); + } + + /* + int sendtag = 0; + int recvtag = 0; + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + MPI_Send(matrix[k].values.data(), matrix[k].values.size(), + MPI_C_DOUBLE_COMPLEX, dest, sendtag, + mpi_manager_.getComm()); + } else if (mpi_manager_.getRank() == k && local_nnz) { + MPI_Recv(localCSRMatrix.values.data(), + localCSRMatrix.values.size(), MPI_C_DOUBLE_COMPLEX, + source, recvtag, mpi_manager_.getComm(), + MPI_STATUS_IGNORE); + } + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + MPI_Send(matrix[k].csrOffsets.data(), + matrix[k].csrOffsets.size(), MPI_INT64_T, dest, + sendtag, mpi_manager_.getComm()); + } else if (mpi_manager_.getRank() == k && local_nnz) { + 
MPI_Recv(localCSRMatrix.csrOffsets.data(), + localCSRMatrix.csrOffsets.size(), MPI_INT64_T, source, + recvtag, mpi_manager_.getComm(), MPI_STATUS_IGNORE); + } + + if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { + MPI_Send(matrix[k].columns.data(), matrix[k].columns.size(), + MPI_INT64_T, dest, sendtag, mpi_manager_.getComm()); + } else if (mpi_manager_.getRank() == k && local_nnz) { + MPI_Recv(localCSRMatrix.columns.data(), + localCSRMatrix.columns.size(), MPI_INT64_T, source, + recvtag, mpi_manager_.getComm(), MPI_STATUS_IGNORE); + } + */ + } + return localCSRMatrix; + } + + template + auto splitCSRMatrix(const size_t &num_row_blocks, + const size_t &num_col_blocks, + const index_type *csrOffsets_ptr, + const index_type *columns_ptr, + const std::complex *values_ptr) + -> std::vector>> { + + std::vector>> + splitSparseMatrix( + num_row_blocks, + std::vector>(num_col_blocks)); + + size_t num_rows = size_t{1} << this->getTotalNumQubits(); + size_t row_block_size = size_t{1} << this->getNumLocalQubits(); + size_t col_block_size = row_block_size; + + for (size_t row = 0; row < num_rows; row++) { + for (size_t col_idx = static_cast(csrOffsets_ptr[row]); + col_idx < static_cast(csrOffsets_ptr[row + 1]); + col_idx++) { + + size_t current_global_row = row; + size_t current_global_col = columns_ptr[col_idx]; + std::complex current_val = values_ptr[col_idx]; + + size_t block_row_id = current_global_row / row_block_size; + size_t block_col_id = current_global_col / col_block_size; + + size_t local_row_id = current_global_row % row_block_size; + size_t local_col_id = current_global_col % col_block_size; + + if (splitSparseMatrix[block_row_id][block_col_id] + .csrOffsets.size() == 0) { + splitSparseMatrix[block_row_id][block_col_id].csrOffsets = + std::vector(row_block_size + 1, 0); + } + + splitSparseMatrix[block_row_id][block_col_id] + .csrOffsets[local_row_id + 1]++; + splitSparseMatrix[block_row_id][block_col_id].columns.push_back( + local_col_id); + splitSparseMatrix[block_row_id][block_col_id].values.push_back( + current_val); + } + } + + for (size_t block_row_id = 0; block_row_id < num_row_blocks; + block_row_id++) { + for (size_t block_col_id = 0; block_col_id < num_col_blocks; + block_col_id++) { + for (size_t i0 = 1; + i0 < splitSparseMatrix[block_row_id][block_col_id] + .csrOffsets.size(); + i0++) { + splitSparseMatrix[block_row_id][block_col_id] + .csrOffsets[i0] += + splitSparseMatrix[block_row_id][block_col_id] + .csrOffsets[i0 - 1]; + } + } + } + + return splitSparseMatrix; + } + + /** + * @brief expval(H) calculation with cuSparseSpMV. + * + * @tparam index_type Integer type used as indices of the sparse matrix. + * @param csr_Offsets_ptr Pointer to the array of row offsets of the sparse + * matrix. Array of size csrOffsets_size. + * @param csrOffsets_size Number of Row offsets of the sparse matrix. + * @param columns_ptr Pointer to the array of column indices of the sparse + * matrix. Array of size numNNZ + * @param values_ptr Pointer to the array of the non-zero elements + * @param numNNZ Number of non-zero elements. + * @return auto Expectation value. 
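+ * @note The per-rank partial results are combined with an MPI allreduce (sum), so every rank returns the same expectation value.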
+ */ + template + auto + getExpectationValueOnSparseSpMV(const index_type *csrOffsets_ptr, + const index_type *columns_ptr, + const std::complex *values_ptr) { + const CFP_t alpha = {1.0, 0.0}; + const CFP_t beta = {0.0, 0.0}; + + Precision local_expect = 0; + + auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID(); + auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID(); + + cudaDataType_t data_type; + cusparseIndexType_t compute_type; + + if constexpr (std::is_same_v || + std::is_same_v) { + data_type = CUDA_C_64F; + compute_type = CUSPARSE_INDEX_64I; + } else { + data_type = CUDA_C_32F; + compute_type = CUSPARSE_INDEX_32I; + } + + std::vector>> + csrmatrix_blocks; + + size_t num_col_blocks = mpi_manager_.getSize(); + size_t num_row_blocks = mpi_manager_.getSize(); + + if (mpi_manager_.getRank() == 0) { + csrmatrix_blocks = + splitCSRMatrix(num_col_blocks, num_row_blocks, csrOffsets_ptr, + columns_ptr, values_ptr); + } + + const size_t length_local = size_t{1} << this->getNumLocalQubits(); + + DataBuffer d_tmp{length_local, device_id, stream_id, true}; + DataBuffer d_tmp_res{length_local, device_id, stream_id, + true}; + d_tmp_res.zeroInit(); + + for (size_t i = 0; i < num_row_blocks; i++) { + std::vector> sparsematices_row; + + if (mpi_manager_.getRank() == 0) { + sparsematices_row = csrmatrix_blocks[i]; + } + mpi_manager_.Barrier(); + + auto localCSRMatrix = scatterCSRMatrix(sparsematices_row, 0); + mpi_manager_.Barrier(); + + int64_t num_rows_local = + static_cast(localCSRMatrix.csrOffsets.size() - 1); + int64_t num_cols_local = + static_cast(localCSRMatrix.columns.size()); + int64_t nnz_local = + static_cast(localCSRMatrix.values.size()); + + size_t color = 0; + + if (localCSRMatrix.values.size() != 0) { + d_tmp.zeroInit(); + + DataBuffer d_csrOffsets{ + localCSRMatrix.csrOffsets.size(), device_id, stream_id, + true}; + DataBuffer d_columns{ + localCSRMatrix.columns.size(), device_id, stream_id, true}; + DataBuffer d_values{localCSRMatrix.values.size(), + device_id, stream_id, true}; + + d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.csrOffsets.data(), + localCSRMatrix.csrOffsets.size(), + false); + d_columns.CopyHostDataToGpu(localCSRMatrix.columns.data(), + localCSRMatrix.columns.size(), + false); + d_values.CopyHostDataToGpu(localCSRMatrix.values.data(), + localCSRMatrix.values.size(), false); + + // CUSPARSE APIs + cusparseSpMatDescr_t mat; + cusparseDnVecDescr_t vecX, vecY; + + size_t bufferSize = 0; + cusparseHandle_t handle = getCusparseHandle(); + + // Create sparse matrix A in CSR format + PL_CUSPARSE_IS_SUCCESS(cusparseCreateCsr( + &mat, num_rows_local, num_cols_local, nnz_local, + d_csrOffsets.getData(), d_columns.getData(), + d_values.getData(), compute_type, compute_type, + CUSPARSE_INDEX_BASE_ZERO, data_type)); + + // Create dense vector X + PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( + &vecX, num_cols_local, BaseType::getData(), data_type)); + + // Create dense vector y + PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( + &vecY, num_rows_local, d_tmp.getData(), data_type)); + + // allocate an external buffer if needed + PL_CUSPARSE_IS_SUCCESS(cusparseSpMV_bufferSize( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat, vecX, + &beta, vecY, data_type, CUSPARSE_SPMV_ALG_DEFAULT, + &bufferSize)); + + DataBuffer dBuffer{bufferSize, device_id, + stream_id, true}; + + // execute SpMV + PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat, vecX, + &beta, vecY, data_type, CUSPARSE_SPMV_ALG_DEFAULT, + 
reinterpret_cast(dBuffer.getData()))); + + // destroy matrix/vector descriptors + PL_CUSPARSE_IS_SUCCESS(cusparseDestroySpMat(mat)); + PL_CUSPARSE_IS_SUCCESS(cusparseDestroyDnVec(vecX)); + PL_CUSPARSE_IS_SUCCESS(cusparseDestroyDnVec(vecY)); + + color = 1; + } + + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); + + if (mpi_manager_.getRank() == i) { + color = 1; + } + + auto new_mpi_manager = + mpi_manager_.split(color, mpi_manager_.getRank()); + int reduce_root_rank = -1; + + if (mpi_manager_.getRank() == i) { + reduce_root_rank = new_mpi_manager.getRank(); + } + + mpi_manager_.Bcast(reduce_root_rank, i); + + if (new_mpi_manager.getComm() != MPI_COMM_NULL) { + new_mpi_manager.Reduce(d_tmp.getData(), + d_tmp_res.getData(), length_local, + reduce_root_rank, "sum"); + } + } + + mpi_manager_.Barrier(); + + local_expect = innerProdC_CUDA(d_tmp_res.getData(), BaseType::getData(), + BaseType::getLength(), device_id, + stream_id, getCublasCaller()) + .x; + + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); + + auto expect = mpi_manager_.allreduce(local_expect, "sum"); + return expect; + } + /** * @brief Utility method for probability calculation using given wires. * diff --git a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp index f4664d6a..fb63aa96 100644 --- a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp +++ b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp @@ -926,3 +926,59 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::getExpectationValuePauliWords", CHECK(expval_mpi == Approx(-0.0105768395).margin(1e-7)); } } + +TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_cuSparse", + "[StateVectorCudaMPI_Nonparam]", float, double) { + using PrecisionT = TestType; + using cp_t = std::complex; + MPIManager mpi_manager(MPI_COMM_WORLD); + size_t numqubits = 3; + size_t mpi_buffersize = 1; + + size_t nGlobalIndexBits = + std::bit_width(static_cast(mpi_manager.getSize())) - 1; + size_t nLocalIndexBits = numqubits - nGlobalIndexBits; + mpi_manager.Barrier(); + + std::vector init_sv{{0.0, 0.0}, {0.0, 0.1}, {0.1, 0.1}, {0.1, 0.2}, + {0.2, 0.2}, {0.3, 0.3}, {0.3, 0.4}, {0.4, 0.5}}; + + using index_type = + typename std::conditional::value, int32_t, + int64_t>::type; + + auto local_state = mpi_manager.scatter(init_sv, 0); + + int nDevices = 0; // Number of GPU devices per node + cudaGetDeviceCount(&nDevices); + int deviceId = mpi_manager.getRank() % nDevices; + cudaSetDevice(deviceId); + DevTag dt_local(deviceId, 0); + mpi_manager.Barrier(); + + SECTION("GetExpectionCuSparse") { + index_type csrOffsets[9] = {0, 2, 4, 6, 8, 10, 12, 14, 16}; + index_type columns[16] = {0, 3, 1, 2, 1, 2, 0, 3, + 4, 7, 5, 6, 5, 6, 4, 7}; + + std::complex values[16] = { + {1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, + {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0}, + {1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, + {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0}}; + + // index_type num_csrOffsets = 9; + // index_type nnz = 16; + + StateVectorCudaMPI sv(mpi_manager, dt_local, mpi_buffersize, + nGlobalIndexBits, nLocalIndexBits); + sv.CopyHostDataToGpu(local_state, false); + + auto results = sv.template getExpectationValueOnSparseSpMV( + csrOffsets, columns, values); + + TestType expected = 1; + + CHECK(expected == Approx(results).epsilon(1e-7)); + } +} \ No newline at end of file diff --git 
a/pennylane_lightning_gpu/src/util/DataBuffer.hpp b/pennylane_lightning_gpu/src/util/DataBuffer.hpp index 41afa6e4..3b6d483f 100644 --- a/pennylane_lightning_gpu/src/util/DataBuffer.hpp +++ b/pennylane_lightning_gpu/src/util/DataBuffer.hpp @@ -30,8 +30,8 @@ template class DataBuffer { DataBuffer(std::size_t length, int device_id = 0, cudaStream_t stream_id = 0, bool alloc_memory = true) - : length_{length}, dev_tag_{device_id, stream_id}, gpu_buffer_{ - nullptr} { + : length_{length}, dev_tag_{device_id, stream_id}, + gpu_buffer_{nullptr} { if (alloc_memory && (length > 0)) { dev_tag_.refresh(); PL_CUDA_IS_SUCCESS( diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp index 5fb3f9f5..4e9a7c17 100644 --- a/pennylane_lightning_gpu/src/util/MPIManager.hpp +++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp @@ -424,6 +424,15 @@ class MPIManager final { this->getComm())); } + template + void Reduce(T *sendBuf, T *recvBuf, size_t length, size_t root, + const std::string &op_str) { + MPI_Datatype datatype = getMPIDatatype(); + MPI_Op op = getMPIOpType(op_str); + PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf, recvBuf, length, datatype, op, + root, this->getComm())); + } + template void Gather(T &sendBuf, std::vector &recvBuf, size_t root) { MPI_Datatype datatype = getMPIDatatype(); @@ -533,6 +542,41 @@ class MPIManager final { return recvBuf; } + /** + * @brief MPI_Send wrapper. + * + * @tparam T C++ data type. + * @param sendBuf Send buffer vector. + * @param dest Rank of send dest. + */ + + template void Send(std::vector &sendBuf, size_t dest) { + MPI_Datatype datatype = getMPIDatatype(); + const int tag = 0; + + PL_MPI_IS_SUCCESS(MPI_Send(sendBuf.data(), sendBuf.size(), datatype, + static_cast(dest), tag, + this->getComm())); + } + + /** + * @brief MPI_Recv wrapper. + * + * @tparam T C++ data type. + * @param recvBuf Recv buffer vector. + * @param source Rank of data source. + */ + + template void Recv(std::vector &recvBuf, size_t source) { + MPI_Datatype datatype = getMPIDatatype(); + MPI_Status status; + const int tag = 0; + + PL_MPI_IS_SUCCESS(MPI_Recv(recvBuf.data(), recvBuf.size(), datatype, + static_cast(source), tag, + this->getComm(), &status)); + } + /** * @brief MPI_Sendrecv wrapper. * From d39816191ed797745ef7ce57d293233daf18741d Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 6 Jul 2023 19:11:29 -0700 Subject: [PATCH 02/43] add docstring --- .../src/simulator/StateVectorCudaMPI.hpp | 121 ++++++++++-------- .../mpi/Test_StateVectorCudaMPI_NonParam.cpp | 6 +- 2 files changed, 73 insertions(+), 54 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 97986028..f23d5355 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -955,6 +955,13 @@ class StateVectorCudaMPI return expect_val; } + /** + * @brief Scatter a CSR (Compress Sparse Row) format matrix. + * + * @tparam index_type Integer type used as indices of the sparse matrix. + * @param matrix CSR (Compress Sparse Row) format matrix. + * @param root Root rank of the scatter operation. 
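+ * @return CSRMatrix<Precision, index_type> The local matrix block owned by the calling rank.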
+ */ template auto scatterCSRMatrix(std::vector> &matrix, size_t root) -> CSRMatrix { @@ -1007,45 +1014,25 @@ class StateVectorCudaMPI } else if (mpi_manager_.getRank() == k && local_nnz) { mpi_manager_.Recv(localCSRMatrix.columns, source); } - - /* - int sendtag = 0; - int recvtag = 0; - - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - MPI_Send(matrix[k].values.data(), matrix[k].values.size(), - MPI_C_DOUBLE_COMPLEX, dest, sendtag, - mpi_manager_.getComm()); - } else if (mpi_manager_.getRank() == k && local_nnz) { - MPI_Recv(localCSRMatrix.values.data(), - localCSRMatrix.values.size(), MPI_C_DOUBLE_COMPLEX, - source, recvtag, mpi_manager_.getComm(), - MPI_STATUS_IGNORE); - } - - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - MPI_Send(matrix[k].csrOffsets.data(), - matrix[k].csrOffsets.size(), MPI_INT64_T, dest, - sendtag, mpi_manager_.getComm()); - } else if (mpi_manager_.getRank() == k && local_nnz) { - MPI_Recv(localCSRMatrix.csrOffsets.data(), - localCSRMatrix.csrOffsets.size(), MPI_INT64_T, source, - recvtag, mpi_manager_.getComm(), MPI_STATUS_IGNORE); - } - - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - MPI_Send(matrix[k].columns.data(), matrix[k].columns.size(), - MPI_INT64_T, dest, sendtag, mpi_manager_.getComm()); - } else if (mpi_manager_.getRank() == k && local_nnz) { - MPI_Recv(localCSRMatrix.columns.data(), - localCSRMatrix.columns.size(), MPI_INT64_T, source, - recvtag, mpi_manager_.getComm(), MPI_STATUS_IGNORE); - } - */ } return localCSRMatrix; } + /** + * @brief Convert a global CSR (Compress Sparse Row) format matrix into + * local blocks. This operation should be conducted on the rank 0. + * + * @tparam index_type Integer type used as indices of the sparse matrix. + * @param num_row_blocks Number of local blocks per global row. + * @param num_col_blocks Number of local blocks per global column. + * @param csrOffsets_ptr Pointer to the array of row offsets of the sparse + * matrix. Array of size csrOffsets_size. + * @param columns_ptr Pointer to the array of column indices of the sparse + * matrix. Array of size numNNZ + * @param values_ptr Pointer to the array of the non-zero elements + * + * @return auto A vector of vector of CSRMatrix. + */ template auto splitCSRMatrix(const size_t &num_row_blocks, const size_t &num_col_blocks, @@ -1126,10 +1113,15 @@ class StateVectorCudaMPI * @return auto Expectation value. 
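 * @note The matrix data is only read on rank 0; it is split into local blocks and scattered to the other ranks internally.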
*/ template - auto - getExpectationValueOnSparseSpMV(const index_type *csrOffsets_ptr, - const index_type *columns_ptr, - const std::complex *values_ptr) { + auto getExpectationValueOnSparseSpMV( + const index_type *csrOffsets_ptr, const index_type csrOffsets_size, + const index_type *columns_ptr, + const std::complex *values_ptr, const index_type numNNZ) { + PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == + (size_t{1} << this->getTotalNumQubits()), + "Incorrect size of CSR Offsets."); + PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix."); + const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -1218,33 +1210,60 @@ class StateVectorCudaMPI // Create sparse matrix A in CSR format PL_CUSPARSE_IS_SUCCESS(cusparseCreateCsr( - &mat, num_rows_local, num_cols_local, nnz_local, - d_csrOffsets.getData(), d_columns.getData(), - d_values.getData(), compute_type, compute_type, - CUSPARSE_INDEX_BASE_ZERO, data_type)); + /* cusparseSpMatDescr_t* */ &mat, + /* int64_t */ num_rows_local, + /* int64_t */ num_cols_local, + /* int64_t */ nnz_local, + /* void* */ d_csrOffsets.getData(), + /* void* */ d_columns.getData(), + /* void* */ d_values.getData(), + /* cusparseIndexType_t */ compute_type, + /* cusparseIndexType_t */ compute_type, + /* cusparseIndexBase_t */ CUSPARSE_INDEX_BASE_ZERO, + /* cudaDataType */ data_type)); // Create dense vector X PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( - &vecX, num_cols_local, BaseType::getData(), data_type)); + /* cusparseDnVecDescr_t* */ &vecX, + /* int64_t */ num_cols_local, + /* void* */ BaseType::getData(), + /* cudaDataType */ data_type)); // Create dense vector y PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( - &vecY, num_rows_local, d_tmp.getData(), data_type)); + /* cusparseDnVecDescr_t* */ &vecY, + /* int64_t */ num_rows_local, + /* void* */ d_tmp.getData(), + /* cudaDataType */ data_type)); // allocate an external buffer if needed PL_CUSPARSE_IS_SUCCESS(cusparseSpMV_bufferSize( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat, vecX, - &beta, vecY, data_type, CUSPARSE_SPMV_ALG_DEFAULT, - &bufferSize)); + /* cusparseHandle_t */ handle, + /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* const void* */ &alpha, + /* cusparseSpMatDescr_t */ mat, + /* cusparseDnVecDescr_t */ vecX, + /* const void* */ &beta, + /* cusparseDnVecDescr_t */ vecY, + /* cudaDataType */ data_type, + /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* size_t* */ &bufferSize)); DataBuffer dBuffer{bufferSize, device_id, stream_id, true}; // execute SpMV PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat, vecX, - &beta, vecY, data_type, CUSPARSE_SPMV_ALG_DEFAULT, - reinterpret_cast(dBuffer.getData()))); + /* cusparseHandle_t */ handle, + /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* const void* */ &alpha, + /* cusparseSpMatDescr_t */ mat, + /* cusparseDnVecDescr_t */ vecX, + /* const void* */ &beta, + /* cusparseDnVecDescr_t */ vecY, + /* cudaDataType */ data_type, + /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* void* */ reinterpret_cast(dBuffer.getData()))); // destroy matrix/vector descriptors PL_CUSPARSE_IS_SUCCESS(cusparseDestroySpMat(mat)); diff --git a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp index fb63aa96..dbd735c4 100644 --- a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp +++ 
b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp @@ -967,15 +967,15 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_cuSparse", {1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0}}; - // index_type num_csrOffsets = 9; - // index_type nnz = 16; + index_type num_csrOffsets = 9; + index_type nnz = 16; StateVectorCudaMPI sv(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits, nLocalIndexBits); sv.CopyHostDataToGpu(local_state, false); auto results = sv.template getExpectationValueOnSparseSpMV( - csrOffsets, columns, values); + csrOffsets, num_csrOffsets, columns, values, nnz); TestType expected = 1; From 087a85ea98d7b692b7af42c9448b3440aa5832d2 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Fri, 7 Jul 2023 19:53:59 -0700 Subject: [PATCH 03/43] add python layer --- mpitests/test_adjoint_jacobian.py | 3 +- mpitests/test_apply.py | 42 +++++++++++++++++++ pennylane_lightning_gpu/lightning_gpu.py | 15 ++++++- .../src/bindings/Bindings.cpp | 20 +++++++++ .../src/simulator/StateVectorCudaMPI.hpp | 10 +++-- 5 files changed, 83 insertions(+), 7 deletions(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 638e9758..20d3c01c 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1043,7 +1043,7 @@ def circuit(params): with pytest.raises((TypeError, ValueError)): j_gpu = qml.jacobian(qnode_gpu)(params) - +''' @pytest.mark.parametrize( "returns", [ @@ -1088,6 +1088,7 @@ def circuit(params): ): j_gpu = qml.jacobian(qnode_gpu)(params) +''' @pytest.mark.parametrize( "obs,obs_type_c64,obs_type_c128", diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py index a08cc2f5..914eeab0 100644 --- a/mpitests/test_apply.py +++ b/mpitests/test_apply.py @@ -553,6 +553,48 @@ def circuit(): assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) +class TestSparseHamiltonianExpval: + def test_sparse_hamiltonian_expectation(self, tol): + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = 3 - num_global_wires + + obs = qml.Identity(0) @ qml.PauliX(1) @ qml.PauliY(2) + obs1 = qml.Identity(1) + Hmat = qml.Hamiltonian([1.0, 1.0], [obs1, obs]).sparse_matrix() + + state_vector = np.array( + [ + 0.0 + 0.0j, + 0.0 + 0.1j, + 0.1 + 0.1j, + 0.1 + 0.2j, + 0.2 + 0.2j, + 0.3 + 0.3j, + 0.3 + 0.4j, + 0.4 + 0.5j, + ], + dtype=np.complex128, + ) + + local_state_vector = np.zeros(1 << num_local_wires).astype(np.complex128) + comm.Scatter(state_vector, local_state_vector, root=0) + + dev_gpumpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=np.complex128) + + dev_gpumpi.syncH2D(local_state_vector) + + H_sparse = qml.SparseHamiltonian(Hmat, wires=range(3)) + + comm.Barrier() + + res = dev_gpumpi.expval(H_sparse) + expected = 1 + + assert np.allclose(res, expected) + + class TestExpval: """Tests that expectation values are properly calculated or that the proper errors are raised.""" diff --git a/pennylane_lightning_gpu/lightning_gpu.py b/pennylane_lightning_gpu/lightning_gpu.py index c931703f..014a6dbb 100644 --- a/pennylane_lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning_gpu/lightning_gpu.py @@ -832,8 +832,19 @@ def expval(self, observable, shot_range=None, bin_size=None): CSR_SparseHamiltonian.data, ) else: - raise RuntimeError( - "LightningGPU-MPI does not currently support SparseHamiltonian." 
+ # Identity for CSR_SparseHamiltonian to pass to processes with rank != 0 to reduce + # host(cpu) memory requirements + obs = qml.Identity(0) + Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix() + H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1)) + CSR_SparseHamiltonian = H_sparse.sparse_matrix().tocsr() + # CSR_SparseHamiltonian for rank == 0 + if self._mpi_manager.getRank() == 0: + CSR_SparseHamiltonian = observable.sparse_matrix().tocsr() + return self._gpu_state.ExpectationValue( + CSR_SparseHamiltonian.indptr, + CSR_SparseHamiltonian.indices, + CSR_SparseHamiltonian.data, ) if observable.name in ["Hamiltonian"]: diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index dc91744c..2b0d677d 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -1352,6 +1352,26 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { }, "Calculate the expectation value of the Hamiltonian observable " "with custatevecComputeExpectation.") + + .def( + "ExpectationValue", + [](StateVectorCudaMPI &sv, + const np_arr_sparse_ind &csrOffsets, + const np_arr_sparse_ind &columns, const np_arr_c values) { + using index_type = typename std::conditional< + std::is_same::value, int32_t, int64_t>::type; + return sv.template getExpectationValueOnSparseSpMV( + static_cast(csrOffsets.request().ptr), + static_cast( + csrOffsets.request() + .size), // num_rows + 1 or csrOffsets + static_cast(columns.request().ptr), // columns + static_cast *>( + values.request().ptr), + static_cast(values.request().size)); // nnz + }, + "Calculate the expectation value of a sparse Hamiltonian.") + .def( "ExpectationValue", [](StateVectorCudaMPI &sv, diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index f23d5355..5cd3d162 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -1117,10 +1117,12 @@ class StateVectorCudaMPI const index_type *csrOffsets_ptr, const index_type csrOffsets_size, const index_type *columns_ptr, const std::complex *values_ptr, const index_type numNNZ) { - PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == - (size_t{1} << this->getTotalNumQubits()), - "Incorrect size of CSR Offsets."); - PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix."); + if (mpi_manager_.getRank() == 0) { + PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == + (size_t{1} << this->getTotalNumQubits()), + "Incorrect size of CSR Offsets."); + PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix."); + } const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; From 2e29ce9f05f31111ab6f1e71cb8092d3eb3e3597 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sat, 8 Jul 2023 15:06:11 -0700 Subject: [PATCH 04/43] add mpi support for spmv adjoint --- mpitests/test_adjoint_jacobian.py | 147 +++++++++ pennylane_lightning_gpu/_serialize.py | 31 +- pennylane_lightning_gpu/lightning_gpu.py | 2 + .../src/algorithms/ObservablesGPUMPI.hpp | 286 +++++++++++++++++- .../src/bindings/Bindings.cpp | 43 +++ 5 files changed, 495 insertions(+), 14 deletions(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 20d3c01c..a67480f9 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -31,6 +31,8 @@ TensorProdObsGPUMPI_C128, HamiltonianGPUMPI_C64, HamiltonianGPUMPI_C128, + 
SparseHamiltonianGPUMPI_C64, + SparseHamiltonianGPUMPI_C128, ) from pennylane_lightning_gpu._serialize import _serialize_ob @@ -1043,6 +1045,7 @@ def circuit(params): with pytest.raises((TypeError, ValueError)): j_gpu = qml.jacobian(qnode_gpu)(params) + ''' @pytest.mark.parametrize( "returns", @@ -1090,6 +1093,145 @@ def circuit(params): ''' + +custom_wires0 = ["alice", 3.14, -1, 0, "bob", "luc"] +""" Important Note: Ensure wires index arranged from high to low + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], [qml.PauliX(wires=custom_wires0[1]) @ qml.PauliY(wires=custom_wires0[0])] + ).sparse_matrix(custom_wires0.[::-1]), + ) +""" + + +@pytest.mark.parametrize( + "returns", + [ + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliZ(wires=custom_wires0[1])] + ).sparse_matrix(custom_wires0[::-1]), + wires=custom_wires0, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [2.0], [qml.PauliX(wires=custom_wires0[2]) @ qml.PauliZ(wires=custom_wires0[0])] + ).sparse_matrix(custom_wires0[::-1]), + wires=custom_wires0, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [1.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliZ(wires=custom_wires0[2])] + ).sparse_matrix(custom_wires0[::-1]), + wires=custom_wires0, + ), + ], +) +def test_adjoint_SparseHamiltonian_custom_wires(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire labels""" + + comm = MPI.COMM_WORLD + dev_gpu = qml.device("lightning.gpu", wires=custom_wires0, mpi=True) + dev_cpu = qml.device("default.qubit", wires=custom_wires0) + + def circuit(params): + circuit_ansatz(params, wires=custom_wires0) + return qml.expval(returns) + + if comm.Get_rank() == 0: + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + else: + params = None + + params = comm.bcast(params, root=0) + + qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="adjoint") + qnode_cpu = qml.QNode(circuit, dev_cpu, diff_method="parameter-shift") + + j_gpu = qml.jacobian(qnode_gpu)(params) + j_cpu = qml.jacobian(qnode_cpu)(params) + + assert np.allclose(j_cpu, j_gpu) + + +""" Important Note: Ensure wires index arranged from high to low + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(1) @ qml.PauliZ(0)], + ).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ) +""" + + +@pytest.mark.parametrize( + "returns", + [ + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliZ(1) @ qml.PauliX(0) @ qml.Identity(2) @ qml.PauliX(4) @ qml.Identity(5)], + ).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(1) @ qml.PauliZ(0)], + ).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(0) @ qml.PauliZ(1)], + ).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian([2.0], [qml.PauliX(1) @ qml.PauliZ(2)]).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian([1.1], [qml.PauliX(2) @ qml.PauliZ(0)]).sparse_matrix(range(5, -1, -1)), + wires=range(6), + ), + ], +) +def test_adjoint_SparseHamiltonian(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire labels""" + + comm = MPI.COMM_WORLD + dev_gpu = qml.device("lightning.gpu", wires=6, mpi=True) + dev_cpu = qml.device("default.qubit", 
wires=6) + + def circuit(params): + circuit_ansatz(params, wires=range(6)) + return qml.expval(returns) + + if comm.Get_rank() == 0: + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + else: + params = None + + params = comm.bcast(params, root=0) + + qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="adjoint") + qnode_cpu = qml.QNode(circuit, dev_cpu, diff_method="parameter-shift") + + j_gpu = qml.jacobian(qnode_gpu)(params) + j_cpu = qml.jacobian(qnode_cpu)(params) + + assert np.allclose(j_cpu, j_gpu) + + @pytest.mark.parametrize( "obs,obs_type_c64,obs_type_c128", [ @@ -1102,6 +1244,11 @@ def circuit(params): HamiltonianGPUMPI_C64, HamiltonianGPUMPI_C128, ), + ( + qml.SparseHamiltonian(qml.Hamiltonian([1], [qml.PauliZ(0)]).sparse_matrix(), wires=[0]), + SparseHamiltonianGPUMPI_C64, + SparseHamiltonianGPUMPI_C128, + ), ], ) @pytest.mark.parametrize("use_csingle", [True, False]) diff --git a/pennylane_lightning_gpu/_serialize.py b/pennylane_lightning_gpu/_serialize.py index e989ed49..7df08516 100644 --- a/pennylane_lightning_gpu/_serialize.py +++ b/pennylane_lightning_gpu/_serialize.py @@ -64,6 +64,8 @@ TensorProdObsGPUMPI_C128, HamiltonianGPUMPI_C64, HamiltonianGPUMPI_C128, + SparseHamiltonianGPUMPI_C64, + SparseHamiltonianGPUMPI_C128, HermitianObsGPUMPI_C64, HermitianObsGPUMPI_C128, ) @@ -117,6 +119,20 @@ def _hamiltonian_ob_dtype(use_csingle, use_mpi: bool): ) +def _sparsehamiltonian_ob_dtype(use_csingle, use_mpi: bool): + if not use_mpi: + return ( + [SparseHamiltonianGPU_C64, np.float32, np.int32] + if use_csingle + else [SparseHamiltonianGPU_C128, np.float64, np.int64] + ) + return ( + [SparseHamiltonianGPUMPI_C64, np.float32, np.int32] + if use_csingle + else [SparseHamiltonianGPUMPI_C128, np.float64, np.int64] + ) + + def _sv_py_dtype(use_csingle, use_mpi: bool): if not use_mpi: return LightningGPU_C64 if use_csingle else LightningGPU_C128 @@ -151,15 +167,8 @@ def _serialize_hamiltonian( return hamiltonian_obs(coeffs, terms) -def _serialize_sparsehamiltonian(ob, wires_map: dict, use_csingle: bool): - if use_csingle: - ctype = np.complex64 - rtype = np.int32 - sparsehamiltonian_obs = SparseHamiltonianGPU_C64 - else: - ctype = np.complex128 - rtype = np.int64 - sparsehamiltonian_obs = SparseHamiltonianGPU_C128 +def _serialize_sparsehamiltonian(ob, wires_map: dict, use_csingle: bool, use_mpi: bool): + sparsehamiltonian_obs, ctype, rtype = _sparsehamiltonian_ob_dtype(use_csingle, use_mpi) spm = ob.sparse_matrix() data = np.array(spm.data).astype(ctype) @@ -214,9 +223,7 @@ def _serialize_ob(ob, wires_map, use_csingle, use_mpi: bool = False, use_splitti elif ob.name == "Hamiltonian": return _serialize_hamiltonian(ob, wires_map, use_csingle, use_mpi, use_splitting) elif ob.name == "SparseHamiltonian": - if use_mpi: - raise TypeError("SparseHamiltonian is not supported for MPI backend.") - return _serialize_sparsehamiltonian(ob, wires_map, use_csingle) + return _serialize_sparsehamiltonian(ob, wires_map, use_csingle, use_mpi) elif isinstance(ob, (PauliX, PauliY, PauliZ, Identity, Hadamard)): return _serialize_named_ob(ob, wires_map, use_csingle, use_mpi) elif ob._pauli_rep is not None: diff --git a/pennylane_lightning_gpu/lightning_gpu.py b/pennylane_lightning_gpu/lightning_gpu.py index 014a6dbb..38c06e10 100644 --- a/pennylane_lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning_gpu/lightning_gpu.py @@ -85,6 +85,8 @@ TensorProdObsGPUMPI_C128, HamiltonianGPUMPI_C64, HamiltonianGPUMPI_C128, + SparseHamiltonianGPUMPI_C64, + SparseHamiltonianGPUMPI_C128, ) 
MPI_SUPPORT = True diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 27c40a8c..d7ad2d56 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -17,6 +17,7 @@ #include #include +#include "MPIManager.hpp" #include "StateVectorCudaMPI.hpp" namespace Pennylane::Algorithms { @@ -362,8 +363,9 @@ class HamiltonianGPUMPI final : public ObservableGPUMPI { for (size_t term_idx = 0; term_idx < coeffs_.size(); term_idx++) { DevTag dt_local(sv.getDataBuffer().getDevTag()); dt_local.refresh(); - StateVectorCudaMPI tmp(dt_local, sv.getNumGlobalQubits(), - sv.getNumLocalQubits(), sv.getData()); + StateVectorCudaMPI tmp( + dt_local, sv.getNumGlobalQubits(), sv.getNumLocalQubits(), + sv.getData()); obs_[term_idx]->applyInPlace(tmp); scaleAndAddC_CUDA(std::complex{coeffs_[term_idx], 0.0}, tmp.getData(), buffer.getData(), tmp.getLength(), @@ -404,4 +406,284 @@ class HamiltonianGPUMPI final : public ObservableGPUMPI { } }; +/** + * @brief Sparse representation of HamiltonianGPUMPI + * + * @tparam T Floating-point precision. + */ +template +class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { + public: + using PrecisionT = T; + // cuSparse required index type + using IdxT = typename std::conditional::value, + int32_t, int64_t>::type; + + private: + std::vector> data_; + std::vector indices_; + std::vector offsets_; + std::vector wires_; + + [[nodiscard]] bool + isEqual(const ObservableGPUMPI &other) const override { + const auto &other_cast = + static_cast &>(other); + + if (data_ != other_cast.data_ || indices_ != other_cast.indices_ || + offsets_ != other_cast.offsets_) { + return false; + } + + return true; + } + + public: + /** + * @brief Create a SparseHamiltonianMPI from data, indices and offsets in + * CSR format. + * + * @param arg1 Arguments to construct data + * @param arg2 Arguments to construct indices + * @param arg3 Arguments to construct offsets + * @param arg4 Arguments to construct wires + */ + template > + SparseHamiltonianGPUMPI(T1 &&arg1, T2 &&arg2, T3 &&arg3, T4 &&arg4) + : data_{std::forward(arg1)}, indices_{std::forward(arg2)}, + offsets_{std::forward(arg3)}, wires_{std::forward(arg4)} { + PL_ASSERT(data_.size() == indices_.size()); + } + + /** + * @brief Convenient wrapper for the constructor as the constructor does not + * convert the std::shared_ptr with a derived class correctly. + * + * This function is useful as std::make_shared does not handle + * brace-enclosed initializer list correctly. + * + * @param arg1 Argument to construct data + * @param arg2 Argument to construct indices + * @param arg3 Argument to construct ofsets + * @param arg4 Argument to construct wires + */ + static auto create(std::initializer_list arg1, + std::initializer_list arg2, + std::initializer_list arg3, + std::initializer_list arg4) + -> std::shared_ptr> { + return std::shared_ptr>( + new SparseHamiltonianGPUMPI{std::move(arg1), std::move(arg2), + std::move(arg3), std::move(arg4)}); + } + + /** + * @brief Updates the statevector SV:->SV', where SV' = a*H*SV, and where H + * is a sparse Hamiltonian. 
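+ * Internally each local block row of H is applied with cusparseSpMV, which computes y = alpha * op(A) * x + beta * y; here alpha = {1.0, 0.0} and beta = {0.0, 0.0}, and the partial products are reduced onto the rank owning that block row.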
+ * + */ + inline void applyInPlace(StateVectorCudaMPI &sv) const override { + PL_ABORT_IF_NOT(wires_.size() == sv.getTotalNumQubits(), + "SparseH wire count does not match state-vector size"); + using CFP_t = typename StateVectorCudaMPI::CFP_t; + const CFP_t alpha = {1.0, 0.0}; + const CFP_t beta = {0.0, 0.0}; + + auto device_id = sv.getDataBuffer().getDevTag().getDeviceID(); + auto stream_id = sv.getDataBuffer().getDevTag().getStreamID(); + + auto mpi_manager = sv.getMPIManager(); + + // clang-format on + cudaDataType_t data_type; + cusparseIndexType_t compute_type; + + if constexpr (std::is_same_v || + std::is_same_v) { + data_type = CUDA_C_64F; + compute_type = CUSPARSE_INDEX_64I; + } else { + data_type = CUDA_C_32F; + compute_type = CUSPARSE_INDEX_32I; + } + + std::vector>> csrmatrix_blocks; + + size_t num_col_blocks = mpi_manager.getSize(); + size_t num_row_blocks = mpi_manager.getSize(); + + if (mpi_manager.getRank() == 0) { + csrmatrix_blocks = sv.template splitCSRMatrix( + num_col_blocks, num_row_blocks, offsets_.data(), + indices_.data(), data_.data()); + } + + const size_t length_local = size_t{1} << sv.getNumLocalQubits(); + + DevTag dt_local(sv.getDataBuffer().getDevTag()); + dt_local.refresh(); + StateVectorCudaMPI d_sv_prime( + dt_local, sv.getNumGlobalQubits(), sv.getNumLocalQubits(), + sv.getData()); + + StateVectorCudaMPI d_tmp(dt_local, sv.getNumGlobalQubits(), + sv.getNumLocalQubits(), + sv.getData()); + + for (size_t i = 0; i < num_row_blocks; i++) { + std::vector> sparsematices_row; + + if (mpi_manager.getRank() == 0) { + sparsematices_row = csrmatrix_blocks[i]; + } + + auto localCSRMatrix = + sv.template scatterCSRMatrix(sparsematices_row, 0); + + int64_t num_rows_local = + static_cast(localCSRMatrix.csrOffsets.size() - 1); + int64_t num_cols_local = + static_cast(localCSRMatrix.columns.size()); + int64_t nnz_local = + static_cast(localCSRMatrix.values.size()); + + size_t color = 0; + + if (localCSRMatrix.values.size() != 0) { + DataBuffer d_csrOffsets{ + localCSRMatrix.csrOffsets.size(), device_id, stream_id, + true}; + DataBuffer d_columns{localCSRMatrix.columns.size(), + device_id, stream_id, true}; + DataBuffer d_values{localCSRMatrix.values.size(), + device_id, stream_id, true}; + + d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.csrOffsets.data(), + localCSRMatrix.csrOffsets.size(), + false); + d_columns.CopyHostDataToGpu(localCSRMatrix.columns.data(), + localCSRMatrix.columns.size(), + false); + d_values.CopyHostDataToGpu(localCSRMatrix.values.data(), + localCSRMatrix.values.size(), false); + + // CUSPARSE APIs + cusparseSpMatDescr_t mat; + cusparseDnVecDescr_t vecX, vecY; + + size_t bufferSize = 0; + cusparseHandle_t handle = sv.getCusparseHandle(); + + // Create sparse matrix A in CSR format + PL_CUSPARSE_IS_SUCCESS(cusparseCreateCsr( + /* cusparseSpMatDescr_t* */ &mat, + /* int64_t */ num_rows_local, + /* int64_t */ num_cols_local, + /* int64_t */ nnz_local, + /* void* */ d_csrOffsets.getData(), + /* void* */ d_columns.getData(), + /* void* */ d_values.getData(), + /* cusparseIndexType_t */ compute_type, + /* cusparseIndexType_t */ compute_type, + /* cusparseIndexBase_t */ CUSPARSE_INDEX_BASE_ZERO, + /* cudaDataType */ data_type)); + + // Create dense vector X + PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( + /* cusparseDnVecDescr_t* */ &vecX, + /* int64_t */ num_cols_local, + /* void* */ sv.getData(), + /* cudaDataType */ data_type)); + + // Create dense vector y + PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( + /* cusparseDnVecDescr_t* */ &vecY, + /* int64_t */ 
num_rows_local, + /* void* */ d_tmp.getData(), + /* cudaDataType */ data_type)); + + // allocate an external buffer if needed + PL_CUSPARSE_IS_SUCCESS(cusparseSpMV_bufferSize( + /* cusparseHandle_t */ handle, + /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* const void* */ &alpha, + /* cusparseSpMatDescr_t */ mat, + /* cusparseDnVecDescr_t */ vecX, + /* const void* */ &beta, + /* cusparseDnVecDescr_t */ vecY, + /* cudaDataType */ data_type, + /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* size_t* */ &bufferSize)); + + DataBuffer dBuffer{bufferSize, device_id, + stream_id, true}; + + // execute SpMV + PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( + /* cusparseHandle_t */ handle, + /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* const void* */ &alpha, + /* cusparseSpMatDescr_t */ mat, + /* cusparseDnVecDescr_t */ vecX, + /* const void* */ &beta, + /* cusparseDnVecDescr_t */ vecY, + /* cudaDataType */ data_type, + /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* void* */ reinterpret_cast(dBuffer.getData()))); + + // destroy matrix/vector descriptors + PL_CUSPARSE_IS_SUCCESS(cusparseDestroySpMat(mat)); + PL_CUSPARSE_IS_SUCCESS(cusparseDestroyDnVec(vecX)); + PL_CUSPARSE_IS_SUCCESS(cusparseDestroyDnVec(vecY)); + + color = 1; + } + + if (mpi_manager.getRank() == i) { + color = 1; + } + + auto new_mpi_manager = + mpi_manager.split(color, mpi_manager.getRank()); + int reduce_root_rank = -1; + + if (mpi_manager.getRank() == i) { + reduce_root_rank = new_mpi_manager.getRank(); + } + + mpi_manager.template Bcast(reduce_root_rank, i); + + if (new_mpi_manager.getComm() != MPI_COMM_NULL) { + new_mpi_manager.template Reduce( + d_tmp.getData(), d_sv_prime.getData(), length_local, + reduce_root_rank, "sum"); + } + } + sv.CopyGpuDataToGpuIn(d_sv_prime.getData(), d_sv_prime.getLength()); + } + + [[nodiscard]] auto getObsName() const -> std::string override { + using Pennylane::Util::operator<<; + std::ostringstream ss; + ss << "SparseHamiltonian: {\n'data' : "; + for (const auto &d : data_) + ss << d; + ss << ",\n'indices' : "; + for (const auto &i : indices_) + ss << i; + ss << ",\n'offsets' : "; + for (const auto &o : offsets_) + ss << o; + ss << "\n}"; + return ss.str(); + } + /** + * @brief Get the wires the observable applies to. 
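+ * @return std::vector<size_t> The wire indices the observable acts on.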
+ */ + [[nodiscard]] auto getWires() const -> std::vector { + return wires_; + }; +}; + } // namespace Pennylane::Algorithms diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index 2b0d677d..bf55653f 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -1590,6 +1590,49 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { }, "Compare two observables"); + class_name = "SparseHamiltonianGPUMPI_C" + bitsize; + using SpIDX = typename SparseHamiltonianGPUMPI::IdxT; + py::class_, + std::shared_ptr>, + ObservableGPUMPI>(m, class_name.c_str(), + py::module_local()) + .def(py::init([](const np_arr_c &data, const np_arr_sparse_ind &indices, + const np_arr_sparse_ind &offsets, + const std::vector &wires) { + const py::buffer_info buffer_data = data.request(); + const auto *data_ptr = + static_cast *>(buffer_data.ptr); + + const py::buffer_info buffer_indices = indices.request(); + const auto *indices_ptr = static_cast(buffer_indices.ptr); + + const py::buffer_info buffer_offsets = offsets.request(); + const auto *offsets_ptr = static_cast(buffer_offsets.ptr); + + return SparseHamiltonianGPUMPI{ + std::vector>( + {data_ptr, data_ptr + data.size()}), + std::vector({indices_ptr, indices_ptr + indices.size()}), + std::vector({offsets_ptr, offsets_ptr + offsets.size()}), + wires}; + })) + .def("__repr__", &SparseHamiltonianGPUMPI::getObsName) + .def("get_wires", &SparseHamiltonianGPUMPI::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const SparseHamiltonianGPUMPI &self, + py::handle other) -> bool { + if (!py::isinstance>( + other)) { + return false; + } + auto other_cast = + other.cast>(); + return self == other_cast; + }, + "Compare two observables"); + //***********************************************************************// // MPI Adj Jac //***********************************************************************// From 608c8e109f50fb314bca46bcd7db62a0a74441e8 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sat, 8 Jul 2023 19:23:06 -0700 Subject: [PATCH 05/43] tidy up code --- mpitests/test_adjoint_jacobian.py | 50 +------------------------------ 1 file changed, 1 insertion(+), 49 deletions(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index a67480f9..a7639dfe 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1046,54 +1046,6 @@ def circuit(params): j_gpu = qml.jacobian(qnode_gpu)(params) -''' -@pytest.mark.parametrize( - "returns", - [ - qml.SparseHamiltonian( - qml.Hamiltonian( - [0.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[1])] - ).sparse_matrix(custom_wires), - wires=custom_wires, - ), - qml.SparseHamiltonian( - qml.Hamiltonian( - [2.0], [qml.PauliX(wires=custom_wires[2]) @ qml.PauliZ(wires=custom_wires[0])] - ).sparse_matrix(custom_wires), - wires=custom_wires, - ), - qml.SparseHamiltonian( - qml.Hamiltonian( - [1.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[2])] - ).sparse_matrix(custom_wires), - wires=custom_wires, - ), - ], -) -def test_failed_adjoint_SparseHamiltonian(returns): - """Integration tests that compare to default.qubit for a large circuit containing parametrized - operations and when using custom wire labels""" - - dev_gpu = qml.device("lightning.gpu", wires=custom_wires, mpi=True) - - def circuit(params): - circuit_ansatz(params, wires=custom_wires) - return 
qml.expval(returns) - - n_params = 30 - np.random.seed(1337) - params = np.random.rand(n_params) - - qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="adjoint") - - with pytest.raises( - RuntimeError, match="LightningGPU-MPI does not currently support SparseHamiltonian." - ): - j_gpu = qml.jacobian(qnode_gpu)(params) - -''' - - custom_wires0 = ["alice", 3.14, -1, 0, "bob", "luc"] """ Important Note: Ensure wires index arranged from high to low qml.SparseHamiltonian( qml.Hamiltonian( [0.1], [qml.PauliX(wires=custom_wires0[1]) @ qml.PauliY(wires=custom_wires0[0])] ).sparse_matrix(custom_wires0.[::-1]), ) """ @pytest.mark.parametrize( "returns", [ qml.SparseHamiltonian( qml.Hamiltonian( - [0.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliZ(wires=custom_wires0[1])] + [0.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliY(wires=custom_wires0[1])] ).sparse_matrix(custom_wires0[::-1]), wires=custom_wires0, ), From 51a71394039e37ea9c1f78f9637238abbb65caff Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sun, 9 Jul 2023 05:12:37 -0700 Subject: [PATCH 06/43] memory optimization --- pennylane_lightning_gpu/_serialize.py | 41 +++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/pennylane_lightning_gpu/_serialize.py b/pennylane_lightning_gpu/_serialize.py index 7df08516..41fab52f 100644 --- a/pennylane_lightning_gpu/_serialize.py +++ b/pennylane_lightning_gpu/_serialize.py @@ -170,14 +170,39 @@ def _serialize_hamiltonian( def _serialize_sparsehamiltonian(ob, wires_map: dict, use_csingle: bool, use_mpi: bool): sparsehamiltonian_obs, ctype, rtype = _sparsehamiltonian_ob_dtype(use_csingle, use_mpi) - spm = ob.sparse_matrix() - data = np.array(spm.data).astype(ctype) - indices = np.array(spm.indices).astype(rtype) - offsets = np.array(spm.indptr).astype(rtype) - - wires = [] - wires_list = ob.wires.tolist() - wires.extend([wires_map[w] for w in wires_list]) + if use_mpi: + mpi_manager_local = MPIManager() + # Only rank 0 needs the full sparse-matrix data + if mpi_manager_local.getRank() == 0: + spm = ob.sparse_matrix() + data = np.array(spm.data).astype(ctype) + indices = np.array(spm.indices).astype(rtype) + offsets = np.array(spm.indptr).astype(rtype) + wires = [] + wires_list = ob.wires.tolist() + wires.extend([wires_map[w] for w in wires_list]) + else: + # Other ranks only need a small non-null placeholder sparse matrix to pass + obs = qml.Identity(0) + Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix() + H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1)) + spm = H_sparse.sparse_matrix() + + data = np.array(spm.data).astype(ctype) + indices = np.array(spm.indices).astype(rtype) + offsets = np.array(spm.indptr).astype(rtype) + + wires = [] + wires_list = ob.wires.tolist() + wires.extend([wires_map[w] for w in wires_list]) + else: + spm = ob.sparse_matrix() + data = np.array(spm.data).astype(ctype) + indices = np.array(spm.indices).astype(rtype) + offsets = np.array(spm.indptr).astype(rtype) + wires = [] + wires_list = ob.wires.tolist() + wires.extend([wires_map[w] for w in wires_list]) return sparsehamiltonian_obs(data, indices, offsets, wires) From d7604bbd8255066192de4815d50993d5e9954593 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sun, 9 Jul 2023 05:17:36 -0700 Subject: [PATCH 07/43] tidy up the code --- .../src/algorithms/ObservablesGPUMPI.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index d7ad2d56..61f6fc9a 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++
b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -484,8 +484,13 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { * */ inline void applyInPlace(StateVectorCudaMPI &sv) const override { - PL_ABORT_IF_NOT(wires_.size() == sv.getTotalNumQubits(), + auto mpi_manager = sv.getMPIManager(); + + if(mpi_manager.getRank() == 0){ + PL_ABORT_IF_NOT(wires_.size() == sv.getTotalNumQubits(), "SparseH wire count does not match state-vector size"); + } + using CFP_t = typename StateVectorCudaMPI::CFP_t; const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -493,8 +498,6 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { auto device_id = sv.getDataBuffer().getDevTag().getDeviceID(); auto stream_id = sv.getDataBuffer().getDevTag().getStreamID(); - auto mpi_manager = sv.getMPIManager(); - // clang-format on cudaDataType_t data_type; cusparseIndexType_t compute_type; From d7604bbd8255066192de4815d50993d5e9954593 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sun, 9 Jul 2023 10:57:14 -0700 Subject: [PATCH 08/43] quick update --- mpitests/test_apply.py | 16 +++++++++------- .../src/simulator/StateVectorCudaMPI.hpp | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py index 914eeab0..58ca1ede 100644 --- a/mpitests/test_apply.py +++ b/mpitests/test_apply.py @@ -554,7 +554,8 @@ def circuit(): class TestSparseHamiltonianExpval: - def test_sparse_hamiltonian_expectation(self, tol): + @pytest.fixture(params=[np.complex64, np.complex128]) + def test_sparse_hamiltonian_expectation(self, tol, request): comm = MPI.COMM_WORLD commSize = comm.Get_size() num_global_wires = commSize.bit_length() - 1 @@ -575,13 +576,13 @@ def test_sparse_hamiltonian_expectation(self, tol): 0.3 + 0.4j, 0.4 + 0.5j, ], - dtype=np.complex128, + dtype=request.params, ) - local_state_vector = np.zeros(1 << num_local_wires).astype(np.complex128) + local_state_vector = np.zeros(1 << num_local_wires).astype(request.params) comm.Scatter(state_vector, local_state_vector, root=0) - dev_gpumpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=np.complex128) + dev_gpumpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=request.params) dev_gpumpi.syncH2D(local_state_vector) @@ -615,6 +616,7 @@ def test_expval_single_wire_no_parameters(self, tol, operation, wires): obs = operation(wires) expval_single_wire_no_param(tol, obs) + @pytest.fixture(params=[np.complex64, np.complex128]) @pytest.mark.parametrize( "obs", [ @@ -626,13 +628,13 @@ def test_expval_single_wire_no_parameters(self, tol, operation, wires): qml.PauliZ(numQubits - 2) @ qml.PauliZ(numQubits - 1), ], ) - def test_expval_multiple_obs(self, obs, tol): + def test_expval_multiple_obs(self, obs, request, tol): """Test expval with Hamiltonian""" num_wires = numQubits comm = MPI.COMM_WORLD - dev_cpu = qml.device("default.qubit", wires=num_wires, c_dtype=np.complex128) - dev_gpumpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=np.complex128) + dev_cpu = qml.device("default.qubit", wires=num_wires, c_dtype=request.params) + dev_gpumpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=request.params) def circuit(): qml.RX(0.4, wires=[0]) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 5cd3d162..1725fcfc 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ 
b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -75,7 +75,7 @@ template <class Precision, class index_type> struct CSRMatrix { csrOffsets = std::vector<index_type>(num_rows + 1, 0); } - CSRMatrix(){}; + CSRMatrix() = default;; }; From 55e505f1e088e3191956e45e388aff86639e46e9 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sun, 9 Jul 2023 11:03:57 -0700 Subject: [PATCH 09/43] make format --- mpitests/test_adjoint_jacobian.py | 2 ++ pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 20d3c01c..b42f9e71 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1043,6 +1043,7 @@ def circuit(params): with pytest.raises((TypeError, ValueError)): j_gpu = qml.jacobian(qnode_gpu)(params) + ''' @pytest.mark.parametrize( "returns", [ qml.SparseHamiltonian( @@ -1090,6 +1091,7 @@ def circuit(params): ''' + @pytest.mark.parametrize( "obs,obs_type_c64,obs_type_c128", [ diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 1725fcfc..0ffb85b5 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -75,7 +75,7 @@ template <class Precision, class index_type> struct CSRMatrix { csrOffsets = std::vector<index_type>(num_rows + 1, 0); } - CSRMatrix() = default;; + CSRMatrix() = default; }; From 2f21e759b580489d14982368ba8241d222d6fd95 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Mon, 10 Jul 2023 07:56:12 -0700 Subject: [PATCH 10/43] Add changelog --- .github/CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index c968f270..38abb29e 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,6 +2,10 @@ ### New features since last release + * Add Sparse Hamiltonian support for expectation value calculation.
+ [(#127)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/127) + + ### Breaking changes ### Improvements @@ -14,6 +18,8 @@ This release contains contributions from (in alphabetical order): +Shuli Shu + --- # Release 0.31.0 From 1a451d2f650ce8bb5286df383f77ae05f7deb689 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Mon, 10 Jul 2023 08:09:50 -0700 Subject: [PATCH 11/43] make format --- pennylane_lightning_gpu/src/util/DataBuffer.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pennylane_lightning_gpu/src/util/DataBuffer.hpp b/pennylane_lightning_gpu/src/util/DataBuffer.hpp index 3b6d483f..41afa6e4 100644 --- a/pennylane_lightning_gpu/src/util/DataBuffer.hpp +++ b/pennylane_lightning_gpu/src/util/DataBuffer.hpp @@ -30,8 +30,8 @@ template class DataBuffer { DataBuffer(std::size_t length, int device_id = 0, cudaStream_t stream_id = 0, bool alloc_memory = true) - : length_{length}, dev_tag_{device_id, stream_id}, - gpu_buffer_{nullptr} { + : length_{length}, dev_tag_{device_id, stream_id}, gpu_buffer_{ + nullptr} { if (alloc_memory && (length > 0)) { dev_tag_.refresh(); PL_CUDA_IS_SUCCESS( From 2fb4c552a3e9c11cbb70fff877234e57665c9bae Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Mon, 10 Jul 2023 08:22:20 -0700 Subject: [PATCH 12/43] Trigger MPI CI From 93011e9ffb53d27849380da481234464e5266feb Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 11 Jul 2023 04:16:35 -0700 Subject: [PATCH 13/43] update base on comments --- .../src/simulator/StateVectorCudaMPI.hpp | 14 +++++++------- .../tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 0ffb85b5..fef5a637 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -65,13 +65,13 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, cudaStream_t stream_id); template struct CSRMatrix { - std::vector> values; std::vector columns; + std::vector> values; std::vector csrOffsets; CSRMatrix(size_t num_rows, size_t nnz) { - values = std::vector>(nnz); columns = std::vector(nnz, 0); + values = std::vector>(nnz); csrOffsets = std::vector(num_rows + 1, 0); } @@ -956,10 +956,10 @@ class StateVectorCudaMPI } /** - * @brief Scatter a CSR (Compress Sparse Row) format matrix. + * @brief Scatter a CSR (Compressed Sparse Row) format matrix. * * @tparam index_type Integer type used as indices of the sparse matrix. - * @param matrix CSR (Compress Sparse Row) format matrix. + * @param matrix CSR (Compressed Sparse Row) format matrix. * @param root Root rank of the scatter operation. 
*/ template @@ -1164,14 +1164,14 @@ class StateVectorCudaMPI d_tmp_res.zeroInit(); for (size_t i = 0; i < num_row_blocks; i++) { - std::vector> sparsematices_row; + std::vector> sparsematrices_row; if (mpi_manager_.getRank() == 0) { - sparsematices_row = csrmatrix_blocks[i]; + sparsematrices_row = csrmatrix_blocks[i]; } mpi_manager_.Barrier(); - auto localCSRMatrix = scatterCSRMatrix(sparsematices_row, 0); + auto localCSRMatrix = scatterCSRMatrix(sparsematrices_row, 0); mpi_manager_.Barrier(); int64_t num_rows_local = diff --git a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp index dbd735c4..26e81aff 100644 --- a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp +++ b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp @@ -977,7 +977,7 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_cuSparse", auto results = sv.template getExpectationValueOnSparseSpMV( csrOffsets, num_csrOffsets, columns, values, nnz); - TestType expected = 1; + TestType expected = 1.0; CHECK(expected == Approx(results).epsilon(1e-7)); } From 50a3d81aea3bb5452aa07d9aa58239f031b2d914 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 11 Jul 2023 11:24:06 -0700 Subject: [PATCH 14/43] add changelog --- .github/CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index c968f270..a7657699 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,6 +2,18 @@ ### New features since last release + * Add sparse Hamiltonian support to multi-node/multi-GPU adjoint methods. + [(#128)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/128) + + Note each MPI process will return the overall result of the adjoint method. 
To ensure wires index arranged in a way aligned with cuQuantum backend, `wires` should be reordered in a descending manner in a python script as follows:
+  ```python
+  qml.SparseHamiltonian(
+      qml.Hamiltonian(
+          [0.1], [qml.PauliX(0) @ qml.PauliY(1)]
+      ).sparse_matrix(wires[::-1]),
+  )
+  ```
+
 ### Breaking changes

 ### Improvements
@@ -14,6 +26,7 @@

 This release contains contributions from (in alphabetical order):

+Shuli Shu

 ---

 # Release 0.31.0

From bebbe2b7dde3fc962203cbe1bcab673cfe61b26e Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Tue, 11 Jul 2023 11:27:56 -0700
Subject: [PATCH 15/43] make format

---
 pennylane_lightning_gpu/_serialize.py          | 4 ++--
 .../src/algorithms/ObservablesGPUMPI.hpp       | 9 +++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/pennylane_lightning_gpu/_serialize.py b/pennylane_lightning_gpu/_serialize.py
index 41fab52f..de108949 100644
--- a/pennylane_lightning_gpu/_serialize.py
+++ b/pennylane_lightning_gpu/_serialize.py
@@ -172,7 +172,7 @@ def _serialize_sparsehamiltonian(ob, wires_map: dict, use_csingle: bool, use_mpi

     if use_mpi:
         mpi_manager_local = MPIManager()
-        #Only root 0 needs the overall sparsematrix data
+        # Only root 0 needs the overall sparsematrix data
         if mpi_manager_local.getRank() == 0:
             spm = ob.sparse_matrix()
             data = np.array(spm.data).astype(ctype)
@@ -182,7 +182,7 @@
             wires_list = ob.wires.tolist()
             wires.extend([wires_map[w] for w in wires_list])
         else:
-            #Other root only needs non-null some sparsematrix data to pass
+            # Other root only needs non-null some sparsematrix data to pass
             obs = qml.Identity(0)
             Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix()
             H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1))
diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
index 61f6fc9a..72513456 100644
--- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
+++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
@@ -486,11 +486,12 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI {
     inline void applyInPlace(StateVectorCudaMPI &sv) const override {
         auto mpi_manager = sv.getMPIManager();

-        if(mpi_manager.getRank() == 0){
-            PL_ABORT_IF_NOT(wires_.size() == sv.getTotalNumQubits(),
-            "SparseH wire count does not match state-vector size");
+        if (mpi_manager.getRank() == 0) {
+            PL_ABORT_IF_NOT(
+                wires_.size() == sv.getTotalNumQubits(),
+                "SparseH wire count does not match state-vector size");
         }
-
+
         using CFP_t = typename StateVectorCudaMPI::CFP_t;
         const CFP_t alpha = {1.0, 0.0};
         const CFP_t beta = {0.0, 0.0};

From 147afbe54e6a87efa32b722bab6a2594f4ff3435 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Tue, 18 Jul 2023 17:45:58 -0700
Subject: [PATCH 16/43] update based on comments

---
 .../src/simulator/StateVectorCudaMPI.hpp       | 32 ++++++------------
 .../src/util/MPIManager.hpp                    | 11 +++++--
 2 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
index fef5a637..4940293b 100644
--- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
@@ -69,11 +69,8 @@ template struct CSRMatrix {
     std::vector> values;
     std::vector csrOffsets;

-    CSRMatrix(size_t num_rows, size_t nnz) {
-        columns = std::vector(nnz, 0);
-        values = 
std::vector>(nnz); - csrOffsets = std::vector(num_rows + 1, 0); - } + CSRMatrix(size_t num_rows, size_t nnz) + : columns(nnz, 0), values(nnz), csrOffsets(num_rows + 1, 0) {} CSRMatrix() = default; }; @@ -966,14 +963,12 @@ class StateVectorCudaMPI auto scatterCSRMatrix(std::vector> &matrix, size_t root) -> CSRMatrix { // Bcast num_rows and num_cols - size_t local_num_rows; - size_t num_col_blocks; + size_t local_num_rows = size_t{1} << this->getNumLocalQubits(); + size_t num_col_blocks = mpi_manager_.getSize(); std::vector nnzs; - local_num_rows = size_t{1} << this->getNumLocalQubits(); - num_col_blocks = mpi_manager_.getSize(); - if (mpi_manager_.getRank() == root) { + nnzs.reserve(matrix.size()); for (size_t j = 0; j < matrix.size(); j++) { nnzs.push_back(matrix[j].values.size()); } @@ -997,21 +992,13 @@ class StateVectorCudaMPI if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { mpi_manager_.Send>(matrix[k].values, dest); + mpi_manager_.Send(matrix[k].csrOffsets, dest); + mpi_manager_.Send(matrix[k].columns, dest); } else if (mpi_manager_.getRank() == k && local_nnz) { mpi_manager_.Recv>( localCSRMatrix.values, source); - } - - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - mpi_manager_.Send(matrix[k].csrOffsets, dest); - } else if (mpi_manager_.getRank() == k && local_nnz) { mpi_manager_.Recv(localCSRMatrix.csrOffsets, source); - } - - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - mpi_manager_.Send(matrix[k].columns, dest); - } else if (mpi_manager_.getRank() == k && local_nnz) { mpi_manager_.Recv(localCSRMatrix.columns, source); } } @@ -1019,7 +1006,7 @@ class StateVectorCudaMPI } /** - * @brief Convert a global CSR (Compress Sparse Row) format matrix into + * @brief Convert a global CSR (Compressed Sparse Row) format matrix into * local blocks. This operation should be conducted on the rank 0. * * @tparam index_type Integer type used as indices of the sparse matrix. 
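For orientation, the block decomposition that `splitCSRMatrix` implements can be sketched in isolation. The following is a minimal, single-process illustration under simplifying assumptions (a square matrix of dimension `nb * bs`, `size_t` indices, `double` precision); `CsrBlock` and `splitCsr` are illustrative names, not part of this library:

```cpp
#include <complex>
#include <cstddef>
#include <vector>

// One CSR block of the nb x nb grid (illustrative stand-in).
struct CsrBlock {
    std::vector<std::size_t> offsets; // bs + 1 row offsets
    std::vector<std::size_t> cols;    // block-local column indices
    std::vector<std::complex<double>> vals;
};

// Split a global CSR matrix (offsets/cols/vals) into an nb x nb grid of
// bs x bs CSR blocks, mirroring the two-pass scheme used above: first route
// every nonzero to its block and count entries per local row, then
// prefix-sum the counts into CSR row offsets.
std::vector<std::vector<CsrBlock>>
splitCsr(const std::vector<std::size_t> &offsets,
         const std::vector<std::size_t> &cols,
         const std::vector<std::complex<double>> &vals, std::size_t nb,
         std::size_t bs) {
    std::vector<std::vector<CsrBlock>> blocks(nb, std::vector<CsrBlock>(nb));
    for (auto &row : blocks)
        for (auto &b : row)
            b.offsets.assign(bs + 1, 0);

    // Pass 1: distribute nonzeros; offsets[r % bs + 1] counts row entries.
    for (std::size_t r = 0; r < nb * bs; r++) {
        for (std::size_t k = offsets[r]; k < offsets[r + 1]; k++) {
            CsrBlock &b = blocks[r / bs][cols[k] / bs];
            b.offsets[r % bs + 1]++;
            b.cols.push_back(cols[k] % bs);
            b.vals.push_back(vals[k]);
        }
    }
    // Pass 2: turn per-row counts into cumulative row offsets.
    for (auto &row : blocks)
        for (auto &b : row)
            for (std::size_t i = 1; i <= bs; i++)
                b.offsets[i] += b.offsets[i - 1];
    return blocks;
}
```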
@@ -1293,8 +1280,7 @@ class StateVectorCudaMPI mpi_manager_.Bcast(reduce_root_rank, i); if (new_mpi_manager.getComm() != MPI_COMM_NULL) { - new_mpi_manager.Reduce(d_tmp.getData(), - d_tmp_res.getData(), length_local, + new_mpi_manager.Reduce(d_tmp, d_tmp_res, length_local, reduce_root_rank, "sum"); } } diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp index 4e9a7c17..025cf6cb 100644 --- a/pennylane_lightning_gpu/src/util/MPIManager.hpp +++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp @@ -31,6 +31,13 @@ #include #include "Error.hpp" +#include "DataBuffer.hpp" + +/// @cond DEV +namespace { +using namespace Pennylane::CUDA; +} // namespace +/// @endcond namespace Pennylane::MPI { inline void errhandler(int errcode, const char *str) { @@ -425,11 +432,11 @@ class MPIManager final { } template - void Reduce(T *sendBuf, T *recvBuf, size_t length, size_t root, + void Reduce(DataBuffer &sendBuf, DataBuffer &recvBuf, size_t length, size_t root, const std::string &op_str) { MPI_Datatype datatype = getMPIDatatype(); MPI_Op op = getMPIOpType(op_str); - PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf, recvBuf, length, datatype, op, + PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf.getData(), recvBuf.getData(), length, datatype, op, root, this->getComm())); } From 5a1705a6f466d14f9bb59828512d1e83e91e618b Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 18 Jul 2023 19:01:07 -0700 Subject: [PATCH 17/43] make format & add to-do work --- .../src/simulator/StateVectorCudaMPI.hpp | 4 +++- pennylane_lightning_gpu/src/util/MPIManager.hpp | 11 ++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 4940293b..985416d7 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -1037,6 +1037,7 @@ class StateVectorCudaMPI size_t row_block_size = size_t{1} << this->getNumLocalQubits(); size_t col_block_size = row_block_size; + // Add OpenMP support here later. for (size_t row = 0; row < num_rows; row++) { for (size_t col_idx = static_cast(csrOffsets_ptr[row]); col_idx < static_cast(csrOffsets_ptr[row + 1]); @@ -1067,6 +1068,7 @@ class StateVectorCudaMPI } } + // Add OpenMP support here later. 
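// Illustrative sketch only (not part of this patch): one way the OpenMP
// note above could be realized later. Each block's prefix sum touches only
// that block's csrOffsets, so the two block loops can be collapsed across
// threads with no shared state:
#pragma omp parallel for collapse(2)
for (size_t block_row_id = 0; block_row_id < num_row_blocks; block_row_id++) {
    for (size_t block_col_id = 0; block_col_id < num_col_blocks;
         block_col_id++) {
        auto &offs = splitSparseMatrix[block_row_id][block_col_id].csrOffsets;
        for (size_t i0 = 1; i0 < offs.size(); i0++) {
            offs[i0] += offs[i0 - 1];
        }
    }
}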
for (size_t block_row_id = 0; block_row_id < num_row_blocks; block_row_id++) { for (size_t block_col_id = 0; block_col_id < num_col_blocks; @@ -2709,4 +2711,4 @@ class StateVectorCudaMPI } }; -}; // namespace Pennylane \ No newline at end of file +}; // namespace Pennylane diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp index 025cf6cb..fd09f122 100644 --- a/pennylane_lightning_gpu/src/util/MPIManager.hpp +++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp @@ -30,8 +30,8 @@ #include #include -#include "Error.hpp" #include "DataBuffer.hpp" +#include "Error.hpp" /// @cond DEV namespace { @@ -432,12 +432,13 @@ class MPIManager final { } template - void Reduce(DataBuffer &sendBuf, DataBuffer &recvBuf, size_t length, size_t root, - const std::string &op_str) { + void Reduce(DataBuffer &sendBuf, DataBuffer &recvBuf, size_t length, + size_t root, const std::string &op_str) { MPI_Datatype datatype = getMPIDatatype(); MPI_Op op = getMPIOpType(op_str); - PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf.getData(), recvBuf.getData(), length, datatype, op, - root, this->getComm())); + PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf.getData(), recvBuf.getData(), + length, datatype, op, root, + this->getComm())); } template From d68281c7d4b1523165880c3500862778415b2e7b Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Fri, 21 Jul 2023 05:52:59 -0700 Subject: [PATCH 18/43] make format --- .../src/simulator/StateVectorCudaMPI.hpp | 10 ++-------- pennylane_lightning_gpu/src/util/MPIManager.hpp | 6 +++--- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 985416d7..a0cb9a72 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -70,7 +70,7 @@ template struct CSRMatrix { std::vector csrOffsets; CSRMatrix(size_t num_rows, size_t nnz) - : columns(nnz, 0), values(nnz), csrOffsets(num_rows + 1, 0) {} + : columns(nnz, 0), values(nnz), csrOffsets(num_rows + 1, 0){}; CSRMatrix() = default; }; @@ -1153,14 +1153,8 @@ class StateVectorCudaMPI d_tmp_res.zeroInit(); for (size_t i = 0; i < num_row_blocks; i++) { - std::vector> sparsematrices_row; - if (mpi_manager_.getRank() == 0) { - sparsematrices_row = csrmatrix_blocks[i]; - } - mpi_manager_.Barrier(); - - auto localCSRMatrix = scatterCSRMatrix(sparsematrices_row, 0); + auto localCSRMatrix = scatterCSRMatrix(csrmatrix_blocks[i], 0); mpi_manager_.Barrier(); int64_t num_rows_local = diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp index fd09f122..7d3135c7 100644 --- a/pennylane_lightning_gpu/src/util/MPIManager.hpp +++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp @@ -560,7 +560,7 @@ class MPIManager final { template void Send(std::vector &sendBuf, size_t dest) { MPI_Datatype datatype = getMPIDatatype(); - const int tag = 0; + const int tag = 6789; PL_MPI_IS_SUCCESS(MPI_Send(sendBuf.data(), sendBuf.size(), datatype, static_cast(dest), tag, @@ -578,7 +578,7 @@ class MPIManager final { template void Recv(std::vector &recvBuf, size_t source) { MPI_Datatype datatype = getMPIDatatype(); MPI_Status status; - const int tag = 0; + const int tag = MPI_ANY_TAG; PL_MPI_IS_SUCCESS(MPI_Recv(recvBuf.data(), recvBuf.size(), datatype, static_cast(source), tag, @@ -740,4 +740,4 @@ class MPIManager final { {cppTypeToString(), MPI_UINT8_T}, 
{cppTypeToString(), MPI_UINT8_T}}; }; -} // namespace Pennylane::MPI \ No newline at end of file +} // namespace Pennylane::MPI From 521f84a74e9ac41822b5875fe04451e049660965 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sun, 23 Jul 2023 23:29:27 -0700 Subject: [PATCH 19/43] sparse matrix struct to class --- .../src/simulator/StateVectorCudaMPI.hpp | 118 ++++++++++-------- 1 file changed, 69 insertions(+), 49 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index a0cb9a72..9433adfb 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -65,14 +65,22 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, cudaStream_t stream_id); template struct CSRMatrix { + private: std::vector columns; std::vector> values; std::vector csrOffsets; + public: CSRMatrix(size_t num_rows, size_t nnz) : columns(nnz, 0), values(nnz), csrOffsets(num_rows + 1, 0){}; CSRMatrix() = default; + + auto getColumns() -> std::vector & { return columns; } + auto getValues() -> std::vector> & { + return values; + } + auto getCsrOffsets() -> std::vector & { return csrOffsets; } }; /** @@ -970,7 +978,7 @@ class StateVectorCudaMPI if (mpi_manager_.getRank() == root) { nnzs.reserve(matrix.size()); for (size_t j = 0; j < matrix.size(); j++) { - nnzs.push_back(matrix[j].values.size()); + nnzs.push_back(matrix[j].getValues().size()); } } @@ -980,26 +988,27 @@ class StateVectorCudaMPI local_nnz); if (mpi_manager_.getRank() == root) { - localCSRMatrix.values = matrix[0].values; - localCSRMatrix.csrOffsets = matrix[0].csrOffsets; - localCSRMatrix.columns = matrix[0].columns; + localCSRMatrix.getValues() = matrix[0].getValues(); + localCSRMatrix.getCsrOffsets() = matrix[0].getCsrOffsets(); + localCSRMatrix.getColumns() = matrix[0].getColumns(); } for (size_t k = 1; k < num_col_blocks; k++) { size_t dest = k; size_t source = 0; - if (mpi_manager_.getRank() == 0 && matrix[k].values.size()) { - mpi_manager_.Send>(matrix[k].values, - dest); - mpi_manager_.Send(matrix[k].csrOffsets, dest); - mpi_manager_.Send(matrix[k].columns, dest); + if (mpi_manager_.getRank() == 0 && matrix[k].getValues().size()) { + mpi_manager_.Send>( + matrix[k].getValues(), dest); + mpi_manager_.Send(matrix[k].getCsrOffsets(), dest); + mpi_manager_.Send(matrix[k].getColumns(), dest); } else if (mpi_manager_.getRank() == k && local_nnz) { mpi_manager_.Recv>( - localCSRMatrix.values, source); - mpi_manager_.Recv(localCSRMatrix.csrOffsets, + localCSRMatrix.getValues(), source); + mpi_manager_.Recv(localCSRMatrix.getCsrOffsets(), + source); + mpi_manager_.Recv(localCSRMatrix.getColumns(), source); - mpi_manager_.Recv(localCSRMatrix.columns, source); } } return localCSRMatrix; @@ -1054,17 +1063,21 @@ class StateVectorCudaMPI size_t local_col_id = current_global_col % col_block_size; if (splitSparseMatrix[block_row_id][block_col_id] - .csrOffsets.size() == 0) { - splitSparseMatrix[block_row_id][block_col_id].csrOffsets = + .getCsrOffsets() + .size() == 0) { + splitSparseMatrix[block_row_id][block_col_id] + .getCsrOffsets() = std::vector(row_block_size + 1, 0); } splitSparseMatrix[block_row_id][block_col_id] - .csrOffsets[local_row_id + 1]++; - splitSparseMatrix[block_row_id][block_col_id].columns.push_back( - local_col_id); - splitSparseMatrix[block_row_id][block_col_id].values.push_back( - current_val); + .getCsrOffsets()[local_row_id + 1]++; 
+ splitSparseMatrix[block_row_id][block_col_id] + .getColumns() + .push_back(local_col_id); + splitSparseMatrix[block_row_id][block_col_id] + .getValues() + .push_back(current_val); } } @@ -1075,12 +1088,13 @@ class StateVectorCudaMPI block_col_id++) { for (size_t i0 = 1; i0 < splitSparseMatrix[block_row_id][block_col_id] - .csrOffsets.size(); + .getCsrOffsets() + .size(); i0++) { splitSparseMatrix[block_row_id][block_col_id] - .csrOffsets[i0] += + .getCsrOffsets()[i0] += splitSparseMatrix[block_row_id][block_col_id] - .csrOffsets[i0 - 1]; + .getCsrOffsets()[i0 - 1]; } } } @@ -1147,10 +1161,11 @@ class StateVectorCudaMPI const size_t length_local = size_t{1} << this->getNumLocalQubits(); - DataBuffer d_tmp{length_local, device_id, stream_id, true}; - DataBuffer d_tmp_res{length_local, device_id, stream_id, - true}; - d_tmp_res.zeroInit(); + DataBuffer d_res_per_block{length_local, device_id, + stream_id, true}; + DataBuffer d_res_per_rowblock{length_local, device_id, + stream_id, true}; + d_res_per_rowblock.zeroInit(); for (size_t i = 0; i < num_row_blocks; i++) { @@ -1158,33 +1173,36 @@ class StateVectorCudaMPI mpi_manager_.Barrier(); int64_t num_rows_local = - static_cast(localCSRMatrix.csrOffsets.size() - 1); + static_cast(localCSRMatrix.getCsrOffsets().size() - 1); int64_t num_cols_local = - static_cast(localCSRMatrix.columns.size()); + static_cast(localCSRMatrix.getColumns().size()); int64_t nnz_local = - static_cast(localCSRMatrix.values.size()); + static_cast(localCSRMatrix.getValues().size()); size_t color = 0; - if (localCSRMatrix.values.size() != 0) { - d_tmp.zeroInit(); + if (localCSRMatrix.getValues().size() != 0) { + d_res_per_block.zeroInit(); DataBuffer d_csrOffsets{ - localCSRMatrix.csrOffsets.size(), device_id, stream_id, + localCSRMatrix.getCsrOffsets().size(), device_id, stream_id, true}; DataBuffer d_columns{ - localCSRMatrix.columns.size(), device_id, stream_id, true}; - DataBuffer d_values{localCSRMatrix.values.size(), - device_id, stream_id, true}; - - d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.csrOffsets.data(), - localCSRMatrix.csrOffsets.size(), - false); - d_columns.CopyHostDataToGpu(localCSRMatrix.columns.data(), - localCSRMatrix.columns.size(), + localCSRMatrix.getColumns().size(), device_id, stream_id, + true}; + DataBuffer d_values{ + localCSRMatrix.getValues().size(), device_id, stream_id, + true}; + + d_csrOffsets.CopyHostDataToGpu( + localCSRMatrix.getCsrOffsets().data(), + localCSRMatrix.getCsrOffsets().size(), false); + d_columns.CopyHostDataToGpu(localCSRMatrix.getColumns().data(), + localCSRMatrix.getColumns().size(), false); - d_values.CopyHostDataToGpu(localCSRMatrix.values.data(), - localCSRMatrix.values.size(), false); + d_values.CopyHostDataToGpu(localCSRMatrix.getValues().data(), + localCSRMatrix.getValues().size(), + false); // CUSPARSE APIs cusparseSpMatDescr_t mat; @@ -1218,7 +1236,7 @@ class StateVectorCudaMPI PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( /* cusparseDnVecDescr_t* */ &vecY, /* int64_t */ num_rows_local, - /* void* */ d_tmp.getData(), + /* void* */ d_res_per_block.getData(), /* cudaDataType */ data_type)); // allocate an external buffer if needed @@ -1276,17 +1294,19 @@ class StateVectorCudaMPI mpi_manager_.Bcast(reduce_root_rank, i); if (new_mpi_manager.getComm() != MPI_COMM_NULL) { - new_mpi_manager.Reduce(d_tmp, d_tmp_res, length_local, + new_mpi_manager.Reduce(d_res_per_block, + d_res_per_rowblock, length_local, reduce_root_rank, "sum"); } } mpi_manager_.Barrier(); - local_expect = innerProdC_CUDA(d_tmp_res.getData(), 
BaseType::getData(), - BaseType::getLength(), device_id, - stream_id, getCublasCaller()) - .x; + local_expect = + innerProdC_CUDA(d_res_per_block.getData(), BaseType::getData(), + BaseType::getLength(), device_id, stream_id, + getCublasCaller()) + .x; PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); mpi_manager_.Barrier(); From 9f9777cbb5a5b6857e479ea7bd09135b51b86ad7 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Mon, 24 Jul 2023 01:30:11 -0700 Subject: [PATCH 20/43] update unit test cases --- mpitests/test_apply.py | 21 +++++++++++-------- .../mpi/Test_StateVectorCudaMPI_NonParam.cpp | 6 +++--- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py index 58ca1ede..211469b1 100644 --- a/mpitests/test_apply.py +++ b/mpitests/test_apply.py @@ -553,9 +553,10 @@ def circuit(): assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) -class TestSparseHamiltonianExpval: - @pytest.fixture(params=[np.complex64, np.complex128]) - def test_sparse_hamiltonian_expectation(self, tol, request): +class TestSparseHamExpval: + """Tests sparse hamiltonian expectation values.""" + + def test_sparse_hamiltonian_expectation(self, tol): comm = MPI.COMM_WORLD commSize = comm.Get_size() num_global_wires = commSize.bit_length() - 1 @@ -572,26 +573,28 @@ def test_sparse_hamiltonian_expectation(self, tol, request): 0.1 + 0.1j, 0.1 + 0.2j, 0.2 + 0.2j, + 0.2 + 0.3j, 0.3 + 0.3j, - 0.3 + 0.4j, - 0.4 + 0.5j, + 0.3 + 0.5j, ], - dtype=request.params, + dtype=np.complex128, ) - local_state_vector = np.zeros(1 << num_local_wires).astype(request.params) + local_state_vector = np.zeros(1 << num_local_wires).astype(np.complex128) comm.Scatter(state_vector, local_state_vector, root=0) - dev_gpumpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=request.params) + dev_gpumpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=np.complex128) + dev_gpu = qml.device("lightning.gpu", wires=3, mpi=False, c_dtype=np.complex128) dev_gpumpi.syncH2D(local_state_vector) + dev_gpu.syncH2D(state_vector) H_sparse = qml.SparseHamiltonian(Hmat, wires=range(3)) comm.Barrier() res = dev_gpumpi.expval(H_sparse) - expected = 1 + expected = dev_gpu.expval(H_sparse) assert np.allclose(res, expected) diff --git a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp index 26e81aff..3a426c7b 100644 --- a/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp +++ b/pennylane_lightning_gpu/src/tests/mpi/Test_StateVectorCudaMPI_NonParam.cpp @@ -941,7 +941,7 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_cuSparse", mpi_manager.Barrier(); std::vector init_sv{{0.0, 0.0}, {0.0, 0.1}, {0.1, 0.1}, {0.1, 0.2}, - {0.2, 0.2}, {0.3, 0.3}, {0.3, 0.4}, {0.4, 0.5}}; + {0.2, 0.2}, {0.2, 0.3}, {0.3, 0.3}, {0.3, 0.5}}; using index_type = typename std::conditional::value, int32_t, @@ -977,8 +977,8 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_cuSparse", auto results = sv.template getExpectationValueOnSparseSpMV( csrOffsets, num_csrOffsets, columns, values, nnz); - TestType expected = 1.0; + TestType expected = 0.97; CHECK(expected == Approx(results).epsilon(1e-7)); } -} \ No newline at end of file +} From 914aac679660926885204df185d7379ad0065f4c Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 26 Jul 2023 19:59:54 -0700 Subject: [PATCH 21/43] update docstring --- 
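A compact restatement of the four documented steps in block notation (the notation here is mine, summarizing the docstring added below): with communicator size $n$, the local state slice $\psi_j$ on rank $j$, and the sparse Hamiltonian split into blocks $H_{ij}$,

$$\langle\psi|H|\psi\rangle \;=\; \sum_{i=0}^{n-1} \psi_i^{\dagger}\Big(\sum_{j=0}^{n-1} H_{ij}\,\psi_j\Big),$$

so rank $j$ forms the partial products $H_{ij}\psi_j$ with cuSPARSE (step 2), the inner sum over $j$ is an MPI reduction onto the rank that owns row block $i$ (step 3), and the outer sum is the local inner product followed by a final sum-reduction over ranks (step 4).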
.../src/simulator/StateVectorCudaMPI.hpp | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 9433adfb..58922b04 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -1103,8 +1103,19 @@ class StateVectorCudaMPI } /** - * @brief expval(H) calculation with cuSparseSpMV. - * + * @brief expval(H) calculates the expected value using cuSparseSpMV and MPI + * to implement distributed Sparse Matrix-Vector multiplication. The dense + * vector is distributed across multiple GPU devices and only the MPI rank 0 + * holds the complete sparse matrix data. The process involves the following + * steps: 1. The rank 0 splits the full sparse matrix into n by n blocks, + * where n is the size of MPI communicator. Each row of blocks is then + * distributed across multiple GPUs. 2. For each GPU, cuSparseSpMV is + * invoked to perform the local sparse matrix block and local state vector + * multiplication. 3. Each GPU will collect computation results for its + * respective row block of the sparse matrix. 4. After all sparse matrix + * operations are completed on each GPU, an inner product is performed and + * MPI reduce operation is ultilized to obtain the final result for the + * expectation value. * @tparam index_type Integer type used as indices of the sparse matrix. * @param csr_Offsets_ptr Pointer to the array of row offsets of the sparse * matrix. Array of size csrOffsets_size. @@ -1137,13 +1148,20 @@ class StateVectorCudaMPI cudaDataType_t data_type; cusparseIndexType_t compute_type; + cusparseOperation_t operation_type = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT; + cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO; if constexpr (std::is_same_v || std::is_same_v) { data_type = CUDA_C_64F; - compute_type = CUSPARSE_INDEX_64I; } else { data_type = CUDA_C_32F; + } + + if constexpr (std::is_same_v) { + compute_type = CUSPARSE_INDEX_64I; + } else { compute_type = CUSPARSE_INDEX_32I; } @@ -1168,9 +1186,7 @@ class StateVectorCudaMPI d_res_per_rowblock.zeroInit(); for (size_t i = 0; i < num_row_blocks; i++) { - auto localCSRMatrix = scatterCSRMatrix(csrmatrix_blocks[i], 0); - mpi_manager_.Barrier(); int64_t num_rows_local = static_cast(localCSRMatrix.getCsrOffsets().size() - 1); @@ -1222,7 +1238,7 @@ class StateVectorCudaMPI /* void* */ d_values.getData(), /* cusparseIndexType_t */ compute_type, /* cusparseIndexType_t */ compute_type, - /* cusparseIndexBase_t */ CUSPARSE_INDEX_BASE_ZERO, + /* cusparseIndexBase_t */ index_base_type, /* cudaDataType */ data_type)); // Create dense vector X @@ -1242,14 +1258,14 @@ class StateVectorCudaMPI // allocate an external buffer if needed PL_CUSPARSE_IS_SUCCESS(cusparseSpMV_bufferSize( /* cusparseHandle_t */ handle, - /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* cusparseOperation_t */ operation_type, /* const void* */ &alpha, /* cusparseSpMatDescr_t */ mat, /* cusparseDnVecDescr_t */ vecX, /* const void* */ &beta, /* cusparseDnVecDescr_t */ vecY, /* cudaDataType */ data_type, - /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* cusparseSpMVAlg_t */ spmvalg_type, /* size_t* */ &bufferSize)); DataBuffer dBuffer{bufferSize, device_id, @@ -1258,14 +1274,14 @@ class StateVectorCudaMPI // execute SpMV PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( /* 
cusparseHandle_t */ handle,
-                    /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE,
+                    /* cusparseOperation_t */ operation_type,
                     /* const void* */ &alpha,
                     /* cusparseSpMatDescr_t */ mat,
                     /* cusparseDnVecDescr_t */ vecX,
                     /* const void* */ &beta,
                     /* cusparseDnVecDescr_t */ vecY,
                     /* cudaDataType */ data_type,
-                    /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT,
+                    /* cusparseSpMVAlg_t */ spmvalg_type,
                     /* void* */ reinterpret_cast(dBuffer.getData())));

                 // destroy matrix/vector descriptors
@@ -1303,7 +1319,7 @@ class StateVectorCudaMPI
         mpi_manager_.Barrier();

         local_expect =
-            innerProdC_CUDA(d_res_per_block.getData(), BaseType::getData(),
+            innerProdC_CUDA(d_res_per_rowblock.getData(), BaseType::getData(),
                             BaseType::getLength(), device_id, stream_id,
                             getCublasCaller())
                 .x;

From fb78ed806014bfcf69777ae13dd2a0b5d84c609e Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 26 Jul 2023 20:04:45 -0700
Subject: [PATCH 22/43] add more docstring

---
 pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
index 58922b04..093fe071 100644
--- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
@@ -1186,6 +1186,8 @@ class StateVectorCudaMPI
         d_res_per_rowblock.zeroInit();

         for (size_t i = 0; i < num_row_blocks; i++) {
+            //Need to investigate if non-blocking MPI operation can improve
+            //performance here.
             auto localCSRMatrix = scatterCSRMatrix(csrmatrix_blocks[i], 0);

             int64_t num_rows_local =

From 77abc0e7d7e6e9d47f3ff058cc568a9c1251950d Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 26 Jul 2023 20:06:21 -0700
Subject: [PATCH 23/43] make format

---
 pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
index 093fe071..2cfd111a 100644
--- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp
@@ -1186,8 +1186,8 @@ class StateVectorCudaMPI
         d_res_per_rowblock.zeroInit();

         for (size_t i = 0; i < num_row_blocks; i++) {
-            //Need to investigate if non-blocking MPI operation can improve
-            //performance here.
+            // Need to investigate if non-blocking MPI operation can improve
+            // performance here.
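// Sketch of the non-blocking idea flagged in the comment above. This is an
// assumption-labelled illustration, not this library's API: `LocalCsr`,
// `spmvWithPrefetch`, and `runLocalSpMV` are hypothetical names, buffers are
// assumed pre-sized, and rank 0 is assumed to post the matching MPI_Isend
// calls in the same order. Receives for row block i+1 are posted before the
// SpMV of block i runs, so transfers can overlap computation:
#include <mpi.h>

#include <complex>
#include <cstdint>
#include <vector>

struct LocalCsr {
    std::vector<std::complex<double>> vals;
    std::vector<int64_t> offsets, cols;
};

void spmvWithPrefetch(LocalCsr &current, LocalCsr &next, int tag,
                      MPI_Comm comm, void (*runLocalSpMV)(const LocalCsr &)) {
    MPI_Request reqs[3];
    MPI_Irecv(next.vals.data(), static_cast<int>(next.vals.size()),
              MPI_C_DOUBLE_COMPLEX, 0, tag, comm, &reqs[0]);
    MPI_Irecv(next.offsets.data(), static_cast<int>(next.offsets.size()),
              MPI_INT64_T, 0, tag, comm, &reqs[1]);
    MPI_Irecv(next.cols.data(), static_cast<int>(next.cols.size()),
              MPI_INT64_T, 0, tag, comm, &reqs[2]);
    runLocalSpMV(current); // overlaps with the three pending receives
    MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
}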
auto localCSRMatrix = scatterCSRMatrix(csrmatrix_blocks[i], 0); int64_t num_rows_local = From 6c949818125f01a7468acf89f66bfacddedd9085 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 26 Jul 2023 21:40:02 -0700 Subject: [PATCH 24/43] update based on new csrmatrix --- .../src/algorithms/ObservablesGPUMPI.hpp | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 72513456..b5c08e83 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -502,13 +502,20 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { // clang-format on cudaDataType_t data_type; cusparseIndexType_t compute_type; + cusparseOperation_t operation_type = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT; + cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO; if constexpr (std::is_same_v || std::is_same_v) { data_type = CUDA_C_64F; - compute_type = CUSPARSE_INDEX_64I; } else { data_type = CUDA_C_32F; + } + + if constexpr (std::is_same_v) { + compute_type = CUSPARSE_INDEX_64I; + } else { compute_type = CUSPARSE_INDEX_32I; } @@ -536,41 +543,35 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { sv.getData()); for (size_t i = 0; i < num_row_blocks; i++) { - std::vector> sparsematices_row; - - if (mpi_manager.getRank() == 0) { - sparsematices_row = csrmatrix_blocks[i]; - } - auto localCSRMatrix = - sv.template scatterCSRMatrix(sparsematices_row, 0); + sv.template scatterCSRMatrix(csrmatrix_blocks[i], 0); int64_t num_rows_local = - static_cast(localCSRMatrix.csrOffsets.size() - 1); + static_cast(localCSRMatrix.getCsrOffsets().size() - 1); int64_t num_cols_local = - static_cast(localCSRMatrix.columns.size()); + static_cast(localCSRMatrix.getColumns().size()); int64_t nnz_local = - static_cast(localCSRMatrix.values.size()); + static_cast(localCSRMatrix.getValues().size()); size_t color = 0; if (localCSRMatrix.values.size() != 0) { DataBuffer d_csrOffsets{ - localCSRMatrix.csrOffsets.size(), device_id, stream_id, + localCSRMatrix.getCsrOffsets().size(), device_id, stream_id, true}; - DataBuffer d_columns{localCSRMatrix.columns.size(), + DataBuffer d_columns{localCSRMatrix.getColumns().size(), device_id, stream_id, true}; - DataBuffer d_values{localCSRMatrix.values.size(), + DataBuffer d_values{localCSRMatrix.getValues().size(), device_id, stream_id, true}; - d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.csrOffsets.data(), - localCSRMatrix.csrOffsets.size(), + d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.getCsrOffsets().data(), + localCSRMatrix.getCsrOffsets().size(), false); - d_columns.CopyHostDataToGpu(localCSRMatrix.columns.data(), - localCSRMatrix.columns.size(), + d_columns.CopyHostDataToGpu(localCSRMatrix.getColumns().data(), + localCSRMatrix.getColumns().size(), false); - d_values.CopyHostDataToGpu(localCSRMatrix.values.data(), - localCSRMatrix.values.size(), false); + d_values.CopyHostDataToGpu(localCSRMatrix.getValues().data(), + localCSRMatrix.getValues().size(), false); // CUSPARSE APIs cusparseSpMatDescr_t mat; @@ -590,7 +591,7 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { /* void* */ d_values.getData(), /* cusparseIndexType_t */ compute_type, /* cusparseIndexType_t */ compute_type, - /* cusparseIndexBase_t */ CUSPARSE_INDEX_BASE_ZERO, + /* 
cusparseIndexBase_t */ index_base_type, /* cudaDataType */ data_type)); // Create dense vector X @@ -610,14 +611,14 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { // allocate an external buffer if needed PL_CUSPARSE_IS_SUCCESS(cusparseSpMV_bufferSize( /* cusparseHandle_t */ handle, - /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* cusparseOperation_t */ operation_type, /* const void* */ &alpha, /* cusparseSpMatDescr_t */ mat, /* cusparseDnVecDescr_t */ vecX, /* const void* */ &beta, /* cusparseDnVecDescr_t */ vecY, /* cudaDataType */ data_type, - /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* cusparseSpMVAlg_t */ spmvalg_type, /* size_t* */ &bufferSize)); DataBuffer dBuffer{bufferSize, device_id, @@ -626,14 +627,14 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { // execute SpMV PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( /* cusparseHandle_t */ handle, - /* cusparseOperation_t */ CUSPARSE_OPERATION_NON_TRANSPOSE, + /* cusparseOperation_t */ operation_type, /* const void* */ &alpha, /* cusparseSpMatDescr_t */ mat, /* cusparseDnVecDescr_t */ vecX, /* const void* */ &beta, /* cusparseDnVecDescr_t */ vecY, /* cudaDataType */ data_type, - /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, + /* cusparseSpMVAlg_t */ spmvalg_type, /* void* */ reinterpret_cast(dBuffer.getData()))); // destroy matrix/vector descriptors From e3b6a8a5c46fd6cd6f36a356d6bf880dcc501c60 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 26 Jul 2023 21:41:18 -0700 Subject: [PATCH 25/43] bug fix --- pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 2cfd111a..688fd4e7 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -1159,7 +1159,7 @@ class StateVectorCudaMPI data_type = CUDA_C_32F; } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { compute_type = CUSPARSE_INDEX_64I; } else { compute_type = CUSPARSE_INDEX_32I; From 97b89268e6cc0335a98b986291513cd250e6c6bf Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 27 Jul 2023 13:07:48 +0800 Subject: [PATCH 26/43] make format --- mpitests/test_adjoint_jacobian.py | 1 - .../src/algorithms/ObservablesGPUMPI.hpp | 21 +++++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 7b8af953..a7639dfe 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1184,7 +1184,6 @@ def circuit(params): assert np.allclose(j_cpu, j_gpu) - @pytest.mark.parametrize( "obs,obs_type_c64,obs_type_c128", [ diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index b5c08e83..dd4e035a 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -559,19 +559,22 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { DataBuffer d_csrOffsets{ localCSRMatrix.getCsrOffsets().size(), device_id, stream_id, true}; - DataBuffer d_columns{localCSRMatrix.getColumns().size(), - device_id, stream_id, true}; - DataBuffer d_values{localCSRMatrix.getValues().size(), - device_id, stream_id, true}; - - 
d_csrOffsets.CopyHostDataToGpu(localCSRMatrix.getCsrOffsets().data(), - localCSRMatrix.getCsrOffsets().size(), - false); + DataBuffer d_columns{ + localCSRMatrix.getColumns().size(), device_id, stream_id, + true}; + DataBuffer d_values{ + localCSRMatrix.getValues().size(), device_id, stream_id, + true}; + + d_csrOffsets.CopyHostDataToGpu( + localCSRMatrix.getCsrOffsets().data(), + localCSRMatrix.getCsrOffsets().size(), false); d_columns.CopyHostDataToGpu(localCSRMatrix.getColumns().data(), localCSRMatrix.getColumns().size(), false); d_values.CopyHostDataToGpu(localCSRMatrix.getValues().data(), - localCSRMatrix.getValues().size(), false); + localCSRMatrix.getValues().size(), + false); // CUSPARSE APIs cusparseSpMatDescr_t mat; From 6c94020ba5d819a4dde04f66a71312f59e8a89c5 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 27 Jul 2023 13:11:38 +0800 Subject: [PATCH 27/43] quick fix --- pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index dd4e035a..59e3f5da 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -513,7 +513,7 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { data_type = CUDA_C_32F; } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { compute_type = CUSPARSE_INDEX_64I; } else { compute_type = CUSPARSE_INDEX_32I; From ebac2a002f6353dc1b206600d246e1747a65859d Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 27 Jul 2023 13:32:45 +0800 Subject: [PATCH 28/43] update mpimanager --- .../src/algorithms/ObservablesGPUMPI.hpp | 3 ++- pennylane_lightning_gpu/src/util/MPIManager.hpp | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 59e3f5da..990a22ec 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -534,6 +534,7 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { DevTag dt_local(sv.getDataBuffer().getDevTag()); dt_local.refresh(); + StateVectorCudaMPI d_sv_prime( dt_local, sv.getNumGlobalQubits(), sv.getNumLocalQubits(), sv.getData()); @@ -555,7 +556,7 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { size_t color = 0; - if (localCSRMatrix.values.size() != 0) { + if (localCSRMatrix.getValues().size() != 0) { DataBuffer d_csrOffsets{ localCSRMatrix.getCsrOffsets().size(), device_id, stream_id, true}; diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp index 7d3135c7..4fcdf328 100644 --- a/pennylane_lightning_gpu/src/util/MPIManager.hpp +++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp @@ -441,6 +441,15 @@ class MPIManager final { this->getComm())); } + template + void Reduce(T *sendBuf, T *recvBuf, size_t length, size_t root, + const std::string &op_str) { + MPI_Datatype datatype = getMPIDatatype(); + MPI_Op op = getMPIOpType(op_str); + PL_MPI_IS_SUCCESS(MPI_Reduce(sendBuf, recvBuf, length, datatype, op, + root, this->getComm())); + } + template void Gather(T &sendBuf, std::vector &recvBuf, size_t root) { MPI_Datatype datatype = getMPIDatatype(); From 
13d6da5f8cd6c254e102719550d7e08f5699b896 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 27 Jul 2023 16:36:16 -0700 Subject: [PATCH 29/43] quick update --- .../src/simulator/StateVectorCudaMPI.hpp | 51 ++++++++++--------- .../src/util/MPIManager.hpp | 45 +++++++++++++++- 2 files changed, 70 insertions(+), 26 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 688fd4e7..d0fd5d95 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -64,15 +64,15 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, const size_t index, bool async, cudaStream_t stream_id); -template struct CSRMatrix { +template class CSRMatrix { private: std::vector columns; - std::vector> values; std::vector csrOffsets; + std::vector> values; public: CSRMatrix(size_t num_rows, size_t nnz) - : columns(nnz, 0), values(nnz), csrOffsets(num_rows + 1, 0){}; + : columns(nnz, 0), csrOffsets(num_rows + 1, 0), values(nnz){}; CSRMatrix() = default; @@ -1046,21 +1046,25 @@ class StateVectorCudaMPI size_t row_block_size = size_t{1} << this->getNumLocalQubits(); size_t col_block_size = row_block_size; - // Add OpenMP support here later. + // Add OpenMP support here later. Need to pay attention to + // race condition. + size_t current_global_row, current_global_col; + size_t block_row_id, block_col_id; + size_t local_row_id, local_col_id; for (size_t row = 0; row < num_rows; row++) { for (size_t col_idx = static_cast(csrOffsets_ptr[row]); col_idx < static_cast(csrOffsets_ptr[row + 1]); col_idx++) { - size_t current_global_row = row; - size_t current_global_col = columns_ptr[col_idx]; + current_global_row = row; + current_global_col = columns_ptr[col_idx]; std::complex current_val = values_ptr[col_idx]; - size_t block_row_id = current_global_row / row_block_size; - size_t block_col_id = current_global_col / col_block_size; + block_row_id = current_global_row / row_block_size; + block_col_id = current_global_col / col_block_size; - size_t local_row_id = current_global_row % row_block_size; - size_t local_col_id = current_global_col % col_block_size; + local_row_id = current_global_row % row_block_size; + local_col_id = current_global_col % col_block_size; if (splitSparseMatrix[block_row_id][block_col_id] .getCsrOffsets() @@ -1086,15 +1090,13 @@ class StateVectorCudaMPI block_row_id++) { for (size_t block_col_id = 0; block_col_id < num_col_blocks; block_col_id++) { - for (size_t i0 = 1; - i0 < splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets() - .size(); - i0++) { - splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets()[i0] += - splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets()[i0 - 1]; + auto &localSpMat = + splitSparseMatrix[block_row_id][block_col_id]; + size_t local_csr_offset_size = + localSpMat.getCsrOffsets().size(); + for (size_t i0 = 1; i0 < local_csr_offset_size; i0++) { + localSpMat.getCsrOffsets()[i0] += + localSpMat.getCsrOffsets()[i0 - 1]; } } } @@ -1148,9 +1150,10 @@ class StateVectorCudaMPI cudaDataType_t data_type; cusparseIndexType_t compute_type; - cusparseOperation_t operation_type = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT; - cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO; + const cusparseOperation_t operation_type = + CUSPARSE_OPERATION_NON_TRANSPOSE; + const 
cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT;
+        const cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO;

         if constexpr (std::is_same_v ||
                       std::is_same_v) {
@@ -1168,8 +1171,8 @@ class StateVectorCudaMPI
         std::vector>> csrmatrix_blocks;

-        size_t num_col_blocks = mpi_manager_.getSize();
-        size_t num_row_blocks = mpi_manager_.getSize();
+        const size_t num_col_blocks = mpi_manager_.getSize();
+        const size_t num_row_blocks = mpi_manager_.getSize();

         if (mpi_manager_.getRank() == 0) {
             csrmatrix_blocks =
diff --git a/pennylane_lightning_gpu/src/util/MPIManager.hpp b/pennylane_lightning_gpu/src/util/MPIManager.hpp
index 7d3135c7..d46f5ab9 100644
--- a/pennylane_lightning_gpu/src/util/MPIManager.hpp
+++ b/pennylane_lightning_gpu/src/util/MPIManager.hpp
@@ -412,6 +412,15 @@ class MPIManager final {
         return recvBuf;
     }

+    /**
+     * @brief MPI_Reduce wrapper.
+     *
+     * @tparam T C++ data type.
+     * @param sendBuf Send buffer.
+     * @param recvBuf Receive buffer.
+     * @param root Rank of root process.
+     * @param op_str String of MPI_Op.
+     */
     template
     void Reduce(T &sendBuf, T &recvBuf, size_t root,
                 const std::string &op_str) {
@@ -421,6 +430,15 @@

+    /**
+     * @brief MPI_Reduce wrapper.
+     *
+     * @tparam T C++ data type.
+     * @param sendBuf Send buffer vector.
+     * @param recvBuf Receive buffer vector.
+     * @param root Rank of root process.
+     * @param op_str String of MPI_Op.
+     */
     template
     void Reduce(std::vector &sendBuf, std::vector &recvBuf, size_t root,
                 const std::string &op_str) {
@@ -431,6 +449,15 @@

+    /**
+     * @brief MPI_Reduce wrapper.
+     *
+     * @tparam T C++ data type.
+     * @param sendBuf Send buffer (DataBuffer type).
+     * @param recvBuf Receive buffer (DataBuffer type).
+     * @param root Rank of root process.
+     * @param op_str String of MPI_Op.
+     */
     template
     void Reduce(DataBuffer &sendBuf, DataBuffer &recvBuf, size_t length,
                 size_t root, const std::string &op_str) {
@@ -441,6 +468,14 @@

+    /**
+     * @brief MPI_Gather wrapper.
+     *
+     * @tparam T C++ data type.
+     * @param sendBuf Send buffer.
+     * @param recvBuf Receive buffer vector.
+     * @param root Rank of root process.
+     */
     template
     void Gather(T &sendBuf, std::vector &recvBuf, size_t root) {
         MPI_Datatype datatype = getMPIDatatype();
         PL_MPI_IS_SUCCESS(MPI_Gather(&sendBuf, 1, datatype, recvBuf.data(), 1,
                                      datatype, root, this->getComm()));
     }

+    /**
+     * @brief MPI_Gather wrapper.
+     *
+     * @tparam T C++ data type.
+     * @param sendBuf Send buffer vector.
+     * @param recvBuf Receive buffer vector.
+     * @param root Rank of root process.
+     */
     template
     void Gather(std::vector &sendBuf, std::vector &recvBuf, size_t root) {
         MPI_Datatype datatype = getMPIDatatype();
@@ -557,7 +600,6 @@
      * @param sendBuf Send buffer vector.
      * @param dest Rank of send dest.
      */
-
     template
     void Send(std::vector &sendBuf, size_t dest) {
         MPI_Datatype datatype = getMPIDatatype();
         const int tag = 6789;
         PL_MPI_IS_SUCCESS(MPI_Send(sendBuf.data(), sendBuf.size(), datatype,
                                    static_cast(dest), tag,
                                    this->getComm()));
     }
@@ -574,7 +616,6 @@
      * @param recvBuf Recv buffer vector.
      * @param source Rank of data source. 
*/ - template void Recv(std::vector &recvBuf, size_t source) { MPI_Datatype datatype = getMPIDatatype(); MPI_Status status; From 5b0712c2cb7c5f3211ffa7261b297d76e812d313 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Thu, 27 Jul 2023 19:17:40 -0700 Subject: [PATCH 30/43] merge sparse_ham --- .../src/algorithms/ObservablesGPUMPI.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 990a22ec..0fc1f3a0 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -502,9 +502,10 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { // clang-format on cudaDataType_t data_type; cusparseIndexType_t compute_type; - cusparseOperation_t operation_type = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT; - cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO; + const cusparseOperation_t operation_type = + CUSPARSE_OPERATION_NON_TRANSPOSE; + const cusparseSpMVAlg_t spmvalg_type = CUSPARSE_SPMV_ALG_DEFAULT; + const cusparseIndexBase_t index_base_type = CUSPARSE_INDEX_BASE_ZERO; if constexpr (std::is_same_v || std::is_same_v) { @@ -521,8 +522,8 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { std::vector>> csrmatrix_blocks; - size_t num_col_blocks = mpi_manager.getSize(); - size_t num_row_blocks = mpi_manager.getSize(); + const size_t num_col_blocks = mpi_manager.getSize(); + const size_t num_row_blocks = mpi_manager.getSize(); if (mpi_manager.getRank() == 0) { csrmatrix_blocks = sv.template splitCSRMatrix( From 500d8c571d8e1400ee124673a9f9895c9df3da41 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sat, 29 Jul 2023 18:10:54 +0800 Subject: [PATCH 31/43] add more docstring --- .../src/simulator/StateVectorCudaMPI.hpp | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index d0fd5d95..20f58dd9 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -64,6 +64,14 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, const size_t index, bool async, cudaStream_t stream_id); +/** + * @brief Manage memory of Compressed Sparse Row (CSR) sparse matrix. CSR format + * represents a matrix M by three (one-dimensional) arrays, that respectively + * contain nonzero values, row offsets, and column indices. + * + * @tparam Precision Floating-point precision type. + * @tparam index_type Integer type. + */ template class CSRMatrix { private: std::vector columns; @@ -76,11 +84,22 @@ template class CSRMatrix { CSRMatrix() = default; + /** + * @brief Get the CSR format index vector of the matrix. + */ auto getColumns() -> std::vector & { return columns; } + + /** + * @brief Get CSR format offset vector of the matrix. + */ + auto getCsrOffsets() -> std::vector & { return csrOffsets; } + + /** + * @brief Get CSR format data vector of the matrix. + */ auto getValues() -> std::vector> & { return values; } - auto getCsrOffsets() -> std::vector & { return csrOffsets; } }; /** @@ -1116,7 +1135,7 @@ class StateVectorCudaMPI * multiplication. 3. 
Each GPU will collect computation results for its * respective row block of the sparse matrix. 4. After all sparse matrix * operations are completed on each GPU, an inner product is performed and - * MPI reduce operation is ultilized to obtain the final result for the + * MPI reduce operation is utilized to obtain the final result for the * expectation value. * @tparam index_type Integer type used as indices of the sparse matrix. * @param csr_Offsets_ptr Pointer to the array of row offsets of the sparse From a09d10b6cc4de8ebf415f26ca8b5f485160a2513 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 8 Aug 2023 12:15:45 -0700 Subject: [PATCH 32/43] update typo --- mpitests/test_adjoint_jacobian.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 7b8af953..ba4b6f6c 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1046,12 +1046,11 @@ def circuit(params): j_gpu = qml.jacobian(qnode_gpu)(params) -custom_wires0 = ["alice", 3.14, -1, 0, "bob", "luc"] """ Important Note: Ensure wires index arranged from high to low qml.SparseHamiltonian( qml.Hamiltonian( - [0.1], [qml.PauliX(wires=custom_wires0[1]) @ qml.PauliY(wires=custom_wires0[0])] - ).sparse_matrix(custom_wires0.[::-1]), + [0.1], [qml.PauliX(wires=custom_wires[1]) @ qml.PauliY(wires=custom_wires[0])] + ).sparse_matrix(custom_wires.[::-1]), ) """ @@ -1061,21 +1060,21 @@ def circuit(params): [ qml.SparseHamiltonian( qml.Hamiltonian( - [0.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliY(wires=custom_wires0[1])] - ).sparse_matrix(custom_wires0[::-1]), - wires=custom_wires0, + [0.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliY(wires=custom_wires[1])] + ).sparse_matrix(custom_wires[::-1]), + wires=custom_wires, ), qml.SparseHamiltonian( qml.Hamiltonian( - [2.0], [qml.PauliX(wires=custom_wires0[2]) @ qml.PauliZ(wires=custom_wires0[0])] - ).sparse_matrix(custom_wires0[::-1]), - wires=custom_wires0, + [2.0], [qml.PauliX(wires=custom_wires[2]) @ qml.PauliZ(wires=custom_wires[0])] + ).sparse_matrix(custom_wires[::-1]), + wires=custom_wires, ), qml.SparseHamiltonian( qml.Hamiltonian( - [1.1], [qml.PauliX(wires=custom_wires0[0]) @ qml.PauliZ(wires=custom_wires0[2])] - ).sparse_matrix(custom_wires0[::-1]), - wires=custom_wires0, + [1.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[2])] + ).sparse_matrix(custom_wires[::-1]), + wires=custom_wires, ), ], ) @@ -1084,11 +1083,11 @@ def test_adjoint_SparseHamiltonian_custom_wires(returns): operations and when using custom wire labels""" comm = MPI.COMM_WORLD - dev_gpu = qml.device("lightning.gpu", wires=custom_wires0, mpi=True) - dev_cpu = qml.device("default.qubit", wires=custom_wires0) + dev_gpu = qml.device("lightning.gpu", wires=custom_wires, mpi=True) + dev_cpu = qml.device("default.qubit", wires=custom_wires) def circuit(params): - circuit_ansatz(params, wires=custom_wires0) + circuit_ansatz(params, wires=custom_wires) return qml.expval(returns) if comm.Get_rank() == 0: @@ -1184,7 +1183,6 @@ def circuit(params): assert np.allclose(j_cpu, j_gpu) - @pytest.mark.parametrize( "obs,obs_type_c64,obs_type_c128", [ From c9131f0f2cf5cea7c7e93b7d4a7a01b6eaa517a9 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 8 Aug 2023 18:32:22 -0700 Subject: [PATCH 33/43] update indices order --- .github/CHANGELOG.md | 2 +- mpitests/test_adjoint_jacobian.py | 38 ++++++++++++++++++++++++++----- 
2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index db1ce874..5d037e67 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -5,7 +5,7 @@ * Add sparse Hamiltonian support to multi-node/multi-GPU adjoint methods. [(#128)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/128) - Note each MPI process will return the overall result of the adjoint method. To ensure wires index arranged in a way aligned with cuQuantum backend, `wires` should be reordered in a descending manner in a python script as follows: + Note each MPI process will return the overall result of the adjoint method. If there is a global target wire when constructing the sparse Hamiltonian, `wires` should be reordered in a descending manner in a python script as follows: ```python qml.SparseHamiltonian( qml.Hamiltonian( diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index ba4b6f6c..97ee4ec2 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1046,7 +1046,8 @@ def circuit(params): j_gpu = qml.jacobian(qnode_gpu)(params) -""" Important Note: Ensure wires index arranged from high to low +""" Important Note: Ensure wires index arranged from high to low if + there is a global target wire. qml.SparseHamiltonian( qml.Hamiltonian( [0.1], [qml.PauliX(wires=custom_wires[1]) @ qml.PauliY(wires=custom_wires[0])] @@ -1067,7 +1068,13 @@ def circuit(params): qml.SparseHamiltonian( qml.Hamiltonian( [2.0], [qml.PauliX(wires=custom_wires[2]) @ qml.PauliZ(wires=custom_wires[0])] - ).sparse_matrix(custom_wires[::-1]), + ).sparse_matrix(custom_wires), + wires=custom_wires, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [2.0], [qml.PauliX(wires=custom_wires[1]) @ qml.PauliZ(wires=custom_wires[2])] + ).sparse_matrix(custom_wires), wires=custom_wires, ), qml.SparseHamiltonian( @@ -1108,11 +1115,12 @@ def circuit(params): assert np.allclose(j_cpu, j_gpu) -""" Important Note: Ensure wires index arranged from high to low +""" Important Note: Ensure wires index arranged from high to low if + there is a global target wire. 
qml.SparseHamiltonian( qml.Hamiltonian( [0.1], - [qml.PauliX(1) @ qml.PauliZ(0)], + [qml.PauliX(0) @ qml.PauliZ(1)], ).sparse_matrix(range(5, -1, -1)), wires=range(6), ) @@ -1133,9 +1141,23 @@ def circuit(params): qml.Hamiltonian( [0.1], [qml.PauliX(1) @ qml.PauliZ(0)], + ).sparse_matrix(range(6)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(0)], ).sparse_matrix(range(5, -1, -1)), wires=range(6), ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(5)], + ).sparse_matrix(range(6)), + wires=range(6), + ), qml.SparseHamiltonian( qml.Hamiltonian( [0.1], @@ -1144,11 +1166,15 @@ def circuit(params): wires=range(6), ), qml.SparseHamiltonian( - qml.Hamiltonian([2.0], [qml.PauliX(1) @ qml.PauliZ(2)]).sparse_matrix(range(5, -1, -1)), + qml.Hamiltonian([2.0], [qml.PauliX(1) @ qml.PauliZ(2)]).sparse_matrix(range(6)), + wires=range(6), + ), + qml.SparseHamiltonian( + qml.Hamiltonian([2.0], [qml.PauliX(2) @ qml.PauliZ(4)]).sparse_matrix(range(6)), wires=range(6), ), qml.SparseHamiltonian( - qml.Hamiltonian([1.1], [qml.PauliX(2) @ qml.PauliZ(0)]).sparse_matrix(range(5, -1, -1)), + qml.Hamiltonian([1.1], [qml.PauliX(2) @ qml.PauliZ(0)]).sparse_matrix(range(6)), wires=range(6), ), ], From 618bdf2414b81053b1124094c81f3cf02fd7929b Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Sat, 19 Aug 2023 14:35:28 -0700 Subject: [PATCH 34/43] add matrix sort v0 --- .../src/algorithms/ObservablesGPUMPI.hpp | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 0fc1f3a0..150eb841 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -22,6 +22,16 @@ namespace Pennylane::Algorithms { +inline size_t reverseBits(size_t num, size_t nbits){ + size_t reversed = 0; + for(size_t i = 0; i < nbits; ++i){ + //ith bit value + size_t bit = num & (1 << i); + reversed += bit << (nbits - i); + } + return reversed +} + /** * @brief A base class for all observable classes. 
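A note on the v0 helper added above: as written it does not compile (the `return reversed` statement lacks a semicolon) and mis-weights the reversed bits — the extracted bit keeps its `2^i` weight, and the shift amount should be `nbits - i - 1`. The corrected form lands with the CSRMatrix refactor later in this series; for reference, a minimal self-contained sketch of the intended bit-reversal, the index permutation that the later `matrixReorder` applies to rows and columns:

```cpp
#include <cassert>
#include <cstddef>

// Reverse the lowest `nbits` bits of `num`, e.g. 0b001 -> 0b100 for nbits = 3.
inline std::size_t reverseBits(std::size_t num, std::size_t nbits) {
    std::size_t reversed = 0;
    for (std::size_t i = 0; i < nbits; ++i) {
        std::size_t bit = (num >> i) & 1U;  // i-th bit, normalized to 0 or 1
        reversed |= bit << (nbits - 1 - i); // mirror it to the other end
    }
    return reversed;
}

int main() {
    assert(reverseBits(0b001, 3) == 0b100);
    assert(reverseBits(0b110, 3) == 0b011);
    assert(reverseBits(0b1011, 4) == 0b1101);
    return 0;
}
```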
* @@ -420,10 +430,12 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { int32_t, int64_t>::type; private: + MPIManager mpi_manager_; std::vector> data_; std::vector indices_; std::vector offsets_; std::vector wires_; + [[nodiscard]] bool isEqual(const ObservableGPUMPI &other) const override { @@ -450,10 +462,32 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { */ template > - SparseHamiltonianGPUMPI(T1 &&arg1, T2 &&arg2, T3 &&arg3, T4 &&arg4) - : data_{std::forward(arg1)}, indices_{std::forward(arg2)}, - offsets_{std::forward(arg3)}, wires_{std::forward(arg4)} { - PL_ASSERT(data_.size() == indices_.size()); + SparseHamiltonianGPUMPI(T1 &&arg1, T2 &&arg2, T3 &&arg3, T4 &&arg4):mpi_manager_(MPI_COMM_WORLD){ + if(mpi_manager_.getRank() == 0){ + std::> data_tmp = std::move(arg1); + std::vector indices_tmp = std::move(arg2); + std::vector offsets_tmp = std::move(arg3); + std::vector wires_tmp = std::move(arg4); + PL_ASSERT(data_tmp.size() == indices_tmp.size()); + + offsets_.push_back(0); + //sort sparseMV + for(size_t i = 0; i < offsets_tmp.size() - 1; i++){ + size_t target_row = reverseBits(i,wire_tmp.size()); + size_t offset = offsets_tmp[target_row]; + size_t local_col_size = offsets_tmp[target_row+1] - offset; + offset_.push_back(offsets_.back()+local_col_size); + //get unsorted column indices + std::vector local_col_indices(indices_.begin()+offset,indices_tmp.begin()+offset+local_col_size+1); + for(size_t j = 0; j < local_col_indices.size(); j++){ + size_t target_col = reverseBits(local_col_indices[i],wire_tmp.size()); + local_col_indices[i] = target_col; + } + //sort indices and get the rank of column index + + //sort data + } + } } /** From de7db96e36b9390f37909986da9c770fa1834391 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 22 Aug 2023 06:16:22 -0700 Subject: [PATCH 35/43] refactor csrmatrix --- .../src/bindings/Bindings.cpp | 41 +++- .../src/simulator/StateVectorCudaMPI.hpp | 209 +----------------- .../src/util/CSRMatrix.hpp | 185 ++++++++++++++++ 3 files changed, 222 insertions(+), 213 deletions(-) create mode 100644 pennylane_lightning_gpu/src/util/CSRMatrix.hpp diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index 2b0d677d..33a6ece5 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -39,6 +39,7 @@ #ifdef ENABLE_MPI #include "AdjointDiffGPUMPI.hpp" +#include "CSRMatrix.hpp" #include "MPIManager.hpp" #include "StateVectorCudaMPI.hpp" #endif @@ -1360,15 +1361,39 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { const np_arr_sparse_ind &columns, const np_arr_c values) { using index_type = typename std::conditional< std::is_same::value, int32_t, int64_t>::type; + //Distribute sparse matrix across multi-nodes/multi-gpus + size_t num_rows = + static_cast(csrOffsets.request().size) - 1; + size_t local_num_rows = size_t{1} << sv.getNumLocalQubits(); + size_t root = 0; + + MPIManager mpi_manager = sv.getMPIManager(); + + std::vector>> + csrmatrix_blocks; + + if (mpi_manager.getRank() == root) { + csrmatrix_blocks = splitCSRMatrix( + mpi_manager, num_rows, + static_cast(csrOffsets.request().ptr), + static_cast(columns.request().ptr), + static_cast *>( + values.request().ptr)); + } + mpi_manager.Barrier(); + + std::vector> + localCSRMatVector; + for (size_t i = 0; i < mpi_manager.getSize(); i++) { + auto localCSRMat = scatterCSRMatrix( + mpi_manager, csrmatrix_blocks[i], local_num_rows, 
root); + localCSRMatVector.push_back(localCSRMat); + } + + mpi_manager.Barrier(); + return sv.template getExpectationValueOnSparseSpMV( - static_cast(csrOffsets.request().ptr), - static_cast( - csrOffsets.request() - .size), // num_rows + 1 or csrOffsets - static_cast(columns.request().ptr), // columns - static_cast *>( - values.request().ptr), - static_cast(values.request().size)); // nnz + localCSRMatVector); }, "Calculate the expectation value of a sparse Hamiltonian.") diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 20f58dd9..b1c52568 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -27,6 +27,7 @@ #include #include // custatevecApplyMatrix +#include "CSRMatrix.hpp" #include "Constant.hpp" #include "Error.hpp" #include "MPIManager.hpp" @@ -64,44 +65,6 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex &value, const size_t index, bool async, cudaStream_t stream_id); -/** - * @brief Manage memory of Compressed Sparse Row (CSR) sparse matrix. CSR format - * represents a matrix M by three (one-dimensional) arrays, that respectively - * contain nonzero values, row offsets, and column indices. - * - * @tparam Precision Floating-point precision type. - * @tparam index_type Integer type. - */ -template class CSRMatrix { - private: - std::vector columns; - std::vector csrOffsets; - std::vector> values; - - public: - CSRMatrix(size_t num_rows, size_t nnz) - : columns(nnz, 0), csrOffsets(num_rows + 1, 0), values(nnz){}; - - CSRMatrix() = default; - - /** - * @brief Get the CSR format index vector of the matrix. - */ - auto getColumns() -> std::vector & { return columns; } - - /** - * @brief Get CSR format offset vector of the matrix. - */ - auto getCsrOffsets() -> std::vector & { return csrOffsets; } - - /** - * @brief Get CSR format data vector of the matrix. - */ - auto getValues() -> std::vector> & { - return values; - } -}; - /** * @brief Managed memory CUDA state-vector class using custateVec backed * gate-calls. @@ -979,150 +942,6 @@ class StateVectorCudaMPI return expect_val; } - /** - * @brief Scatter a CSR (Compressed Sparse Row) format matrix. - * - * @tparam index_type Integer type used as indices of the sparse matrix. - * @param matrix CSR (Compressed Sparse Row) format matrix. - * @param root Root rank of the scatter operation. 
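In block form, the computation these scatter-and-SpMV steps implement is the following (a sketch of the underlying algebra, with P = mpi_manager.getSize() ranks; rank k stores the state slice psi_k and, after the scatter, the sparse blocks H_{0k}, ..., H_{(P-1)k}):

```latex
\langle H \rangle \;=\; \sum_{i=0}^{P-1} \psi_i^{\dagger}\, y_i ,
\qquad
y_i \;=\; \sum_{k=0}^{P-1} H_{ik}\, \psi_k .
```

Each cusparseSpMV call evaluates one term H_{ik} psi_k; an MPI reduction sums those partial products into y_i on the rank that owns slice i, and the local inner products psi_i^dagger y_i are reduced once more so every rank ends up with the expectation value.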
- */ - template - auto scatterCSRMatrix(std::vector> &matrix, - size_t root) -> CSRMatrix { - // Bcast num_rows and num_cols - size_t local_num_rows = size_t{1} << this->getNumLocalQubits(); - size_t num_col_blocks = mpi_manager_.getSize(); - std::vector nnzs; - - if (mpi_manager_.getRank() == root) { - nnzs.reserve(matrix.size()); - for (size_t j = 0; j < matrix.size(); j++) { - nnzs.push_back(matrix[j].getValues().size()); - } - } - - size_t local_nnz = mpi_manager_.scatter(nnzs, 0)[0]; - - CSRMatrix localCSRMatrix(local_num_rows, - local_nnz); - - if (mpi_manager_.getRank() == root) { - localCSRMatrix.getValues() = matrix[0].getValues(); - localCSRMatrix.getCsrOffsets() = matrix[0].getCsrOffsets(); - localCSRMatrix.getColumns() = matrix[0].getColumns(); - } - - for (size_t k = 1; k < num_col_blocks; k++) { - size_t dest = k; - size_t source = 0; - - if (mpi_manager_.getRank() == 0 && matrix[k].getValues().size()) { - mpi_manager_.Send>( - matrix[k].getValues(), dest); - mpi_manager_.Send(matrix[k].getCsrOffsets(), dest); - mpi_manager_.Send(matrix[k].getColumns(), dest); - } else if (mpi_manager_.getRank() == k && local_nnz) { - mpi_manager_.Recv>( - localCSRMatrix.getValues(), source); - mpi_manager_.Recv(localCSRMatrix.getCsrOffsets(), - source); - mpi_manager_.Recv(localCSRMatrix.getColumns(), - source); - } - } - return localCSRMatrix; - } - - /** - * @brief Convert a global CSR (Compressed Sparse Row) format matrix into - * local blocks. This operation should be conducted on the rank 0. - * - * @tparam index_type Integer type used as indices of the sparse matrix. - * @param num_row_blocks Number of local blocks per global row. - * @param num_col_blocks Number of local blocks per global column. - * @param csrOffsets_ptr Pointer to the array of row offsets of the sparse - * matrix. Array of size csrOffsets_size. - * @param columns_ptr Pointer to the array of column indices of the sparse - * matrix. Array of size numNNZ - * @param values_ptr Pointer to the array of the non-zero elements - * - * @return auto A vector of vector of CSRMatrix. - */ - template - auto splitCSRMatrix(const size_t &num_row_blocks, - const size_t &num_col_blocks, - const index_type *csrOffsets_ptr, - const index_type *columns_ptr, - const std::complex *values_ptr) - -> std::vector>> { - - std::vector>> - splitSparseMatrix( - num_row_blocks, - std::vector>(num_col_blocks)); - - size_t num_rows = size_t{1} << this->getTotalNumQubits(); - size_t row_block_size = size_t{1} << this->getNumLocalQubits(); - size_t col_block_size = row_block_size; - - // Add OpenMP support here later. Need to pay attention to - // race condition. 
- size_t current_global_row, current_global_col; - size_t block_row_id, block_col_id; - size_t local_row_id, local_col_id; - for (size_t row = 0; row < num_rows; row++) { - for (size_t col_idx = static_cast(csrOffsets_ptr[row]); - col_idx < static_cast(csrOffsets_ptr[row + 1]); - col_idx++) { - - current_global_row = row; - current_global_col = columns_ptr[col_idx]; - std::complex current_val = values_ptr[col_idx]; - - block_row_id = current_global_row / row_block_size; - block_col_id = current_global_col / col_block_size; - - local_row_id = current_global_row % row_block_size; - local_col_id = current_global_col % col_block_size; - - if (splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets() - .size() == 0) { - splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets() = - std::vector(row_block_size + 1, 0); - } - - splitSparseMatrix[block_row_id][block_col_id] - .getCsrOffsets()[local_row_id + 1]++; - splitSparseMatrix[block_row_id][block_col_id] - .getColumns() - .push_back(local_col_id); - splitSparseMatrix[block_row_id][block_col_id] - .getValues() - .push_back(current_val); - } - } - - // Add OpenMP support here later. - for (size_t block_row_id = 0; block_row_id < num_row_blocks; - block_row_id++) { - for (size_t block_col_id = 0; block_col_id < num_col_blocks; - block_col_id++) { - auto &localSpMat = - splitSparseMatrix[block_row_id][block_col_id]; - size_t local_csr_offset_size = - localSpMat.getCsrOffsets().size(); - for (size_t i0 = 1; i0 < local_csr_offset_size; i0++) { - localSpMat.getCsrOffsets()[i0] += - localSpMat.getCsrOffsets()[i0 - 1]; - } - } - } - - return splitSparseMatrix; - } - /** * @brief expval(H) calculates the expected value using cuSparseSpMV and MPI * to implement distributed Sparse Matrix-Vector multiplication. The dense @@ -1149,15 +968,7 @@ class StateVectorCudaMPI */ template auto getExpectationValueOnSparseSpMV( - const index_type *csrOffsets_ptr, const index_type csrOffsets_size, - const index_type *columns_ptr, - const std::complex *values_ptr, const index_type numNNZ) { - if (mpi_manager_.getRank() == 0) { - PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == - (size_t{1} << this->getTotalNumQubits()), - "Incorrect size of CSR Offsets."); - PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix."); - } + std::vector> &CSRMatVector) { const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -1187,18 +998,6 @@ class StateVectorCudaMPI compute_type = CUSPARSE_INDEX_32I; } - std::vector>> - csrmatrix_blocks; - - const size_t num_col_blocks = mpi_manager_.getSize(); - const size_t num_row_blocks = mpi_manager_.getSize(); - - if (mpi_manager_.getRank() == 0) { - csrmatrix_blocks = - splitCSRMatrix(num_col_blocks, num_row_blocks, csrOffsets_ptr, - columns_ptr, values_ptr); - } - const size_t length_local = size_t{1} << this->getNumLocalQubits(); DataBuffer d_res_per_block{length_local, device_id, @@ -1207,10 +1006,10 @@ class StateVectorCudaMPI stream_id, true}; d_res_per_rowblock.zeroInit(); - for (size_t i = 0; i < num_row_blocks; i++) { + for (size_t i = 0; i < mpi_manager_.getSize(); i++) { // Need to investigate if non-blocking MPI operation can improve // performace here. 
- auto localCSRMatrix = scatterCSRMatrix(csrmatrix_blocks[i], 0); + auto &localCSRMatrix = CSRMatVector[i]; int64_t num_rows_local = static_cast(localCSRMatrix.getCsrOffsets().size() - 1); diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp new file mode 100644 index 00000000..7a87673a --- /dev/null +++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp @@ -0,0 +1,185 @@ +#pragma once + +#include +#include +#include +#include + +#include "MPIManager.hpp" + +/// @cond DEV +namespace { +using namespace Pennylane::CUDA; +} // namespace +/// @endcond +namespace Pennylane::MPI { +/** + * @brief Manage memory of Compressed Sparse Row (CSR) sparse matrix. CSR format + * represents a matrix M by three (one-dimensional) arrays, that respectively + * contain nonzero values, row offsets, and column indices. + * + * @tparam Precision Floating-point precision type. + * @tparam index_type Integer type. + */ +template class CSRMatrix { + private: + std::vector columns_; + std::vector csrOffsets_; + std::vector> values_; + + public: + CSRMatrix(size_t num_rows, size_t nnz) + : columns_(nnz, 0), csrOffsets_(num_rows + 1, 0), values_(nnz){}; + + CSRMatrix(size_t num_rows, size_t nnz, index_type *column_ptr, + index_type *csrOffsets_ptr, std::complex *value_ptr) + : columns_(column_ptr, column_ptr + num_rows), + csrOffsets_(csrOffsets_ptr, csrOffsets_ptr + num_rows + 1), + values_(value_ptr, value_ptr + nnz){}; + + CSRMatrix() = default; + + /** + * @brief Get the CSR format index vector of the matrix. + */ + auto getColumns() -> std::vector & { return columns_; } + + /** + * @brief Get CSR format offset vector of the matrix. + */ + auto getCsrOffsets() -> std::vector & { return csrOffsets_; } + + /** + * @brief Get CSR format data vector of the matrix. + */ + auto getValues() -> std::vector> & { + return values_; + } +}; + +template +auto splitCSRMatrix(MPIManager &mpi_manager, const size_t &num_rows, + const index_type *csrOffsets_ptr, + const index_type *columns_ptr, + const std::complex *values_ptr) + -> std::vector>> { + + size_t num_row_blocks = mpi_manager.getSize(); + size_t num_col_blocks = num_row_blocks; + + std::vector>> + splitSparseMatrix( + num_row_blocks, + std::vector>(num_col_blocks)); + + size_t row_block_size = num_rows / num_row_blocks; + size_t col_block_size = row_block_size; + + // Add OpenMP support here later. Need to pay attention to + // race condition. 
+ size_t current_global_row, current_global_col; + size_t block_row_id, block_col_id; + size_t local_row_id, local_col_id; + for (size_t row = 0; row < num_rows; row++) { + for (size_t col_idx = static_cast(csrOffsets_ptr[row]); + col_idx < static_cast(csrOffsets_ptr[row + 1]); + col_idx++) { + + current_global_row = row; + current_global_col = columns_ptr[col_idx]; + std::complex current_val = values_ptr[col_idx]; + + block_row_id = current_global_row / row_block_size; + block_col_id = current_global_col / col_block_size; + + local_row_id = current_global_row % row_block_size; + local_col_id = current_global_col % col_block_size; + + if (splitSparseMatrix[block_row_id][block_col_id] + .getCsrOffsets() + .size() == 0) { + splitSparseMatrix[block_row_id][block_col_id].getCsrOffsets() = + std::vector(row_block_size + 1, 0); + } + + splitSparseMatrix[block_row_id][block_col_id] + .getCsrOffsets()[local_row_id + 1]++; + splitSparseMatrix[block_row_id][block_col_id] + .getColumns() + .push_back(local_col_id); + splitSparseMatrix[block_row_id][block_col_id].getValues().push_back( + current_val); + } + } + + // Add OpenMP support here later. + for (size_t block_row_id = 0; block_row_id < num_row_blocks; + block_row_id++) { + for (size_t block_col_id = 0; block_col_id < num_col_blocks; + block_col_id++) { + auto &localSpMat = splitSparseMatrix[block_row_id][block_col_id]; + size_t local_csr_offset_size = localSpMat.getCsrOffsets().size(); + for (size_t i0 = 1; i0 < local_csr_offset_size; i0++) { + localSpMat.getCsrOffsets()[i0] += + localSpMat.getCsrOffsets()[i0 - 1]; + } + } + } + + return splitSparseMatrix; +} + +/** + * @brief Scatter a CSR (Compressed Sparse Row) format matrix. + * + * @tparam index_type Integer type used as indices of the sparse matrix. + * @param matrix CSR (Compressed Sparse Row) format matrix. + * @param root Root rank of the scatter operation. 
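To make the index arithmetic above concrete: a global nonzero at (row, col) lands in block (row / block_size, col / block_size) at local position (row % block_size, col % block_size), and the per-row counts accumulated in csrOffsets[local_row + 1] become offsets through the final prefix-sum pass. A small self-contained sketch, using the same 8x8-operator-over-2-ranks shape as the unit test added later in this series:

```cpp
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    const std::size_t block_size = 4; // 8x8 operator on 2 ranks -> 4x4 blocks

    // A global nonzero at (row, col) = (5, 2):
    std::size_t row = 5, col = 2;
    std::cout << "block (" << row / block_size << "," << col / block_size
              << "), local (" << row % block_size << "," << col % block_size
              << ")\n"; // -> block (1,0), local position (1,2)

    // Offsets pass: per-local-row nonzero counts sit in entries [1..n] and
    // are turned into CSR offsets by a running prefix sum, exactly what the
    // csrOffsets[i0] += csrOffsets[i0 - 1] loop above does.
    std::vector<std::size_t> csrOffsets = {0, 2, 2, 1, 3};
    std::partial_sum(csrOffsets.begin(), csrOffsets.end(), csrOffsets.begin());
    for (auto v : csrOffsets)
        std::cout << v << ' '; // prints: 0 2 4 5 8
    std::cout << '\n';
}
```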
+ */ +template +auto scatterCSRMatrix(MPIManager &mpi_manager, + std::vector> &matrix, + size_t local_num_rows, size_t root) + -> CSRMatrix { + // Bcast num_rows and num_cols + size_t num_col_blocks = mpi_manager.getSize(); + + std::vector nnzs; + + if (mpi_manager.getRank() == root) { + nnzs.reserve(matrix.size()); + for (size_t j = 0; j < matrix.size(); j++) { + nnzs.push_back(matrix[j].getValues().size()); + } + } + + size_t local_nnz = mpi_manager.scatter(nnzs, 0)[0]; + + CSRMatrix localCSRMatrix(local_num_rows, local_nnz); + + if (mpi_manager.getRank() == root) { + localCSRMatrix.getValues() = matrix[0].getValues(); + localCSRMatrix.getCsrOffsets() = matrix[0].getCsrOffsets(); + localCSRMatrix.getColumns() = matrix[0].getColumns(); + } + + for (size_t k = 1; k < num_col_blocks; k++) { + size_t dest = k; + size_t source = root; + + if (mpi_manager.getRank() == 0 && matrix[k].getValues().size()) { + mpi_manager.Send>(matrix[k].getValues(), + dest); + mpi_manager.Send(matrix[k].getCsrOffsets(), dest); + mpi_manager.Send(matrix[k].getColumns(), dest); + } else if (mpi_manager.getRank() == k && local_nnz) { + mpi_manager.Recv>( + localCSRMatrix.getValues(), source); + mpi_manager.Recv(localCSRMatrix.getCsrOffsets(), + source); + mpi_manager.Recv(localCSRMatrix.getColumns(), source); + } + } + return localCSRMatrix; +} +} // namespace Pennylane::MPI \ No newline at end of file From 134cfc312ae7bed1f7cab228631549341b8053d7 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Tue, 22 Aug 2023 13:50:20 -0700 Subject: [PATCH 36/43] add crsmatrix --- .../src/algorithms/ObservablesGPUMPI.hpp | 43 +----- .../src/bindings/Bindings.cpp | 12 ++ .../src/simulator/StateVectorCudaMPI.hpp | 39 +----- .../src/util/CSRMatrix.hpp | 131 ++++++++++++++++++ 4 files changed, 149 insertions(+), 76 deletions(-) create mode 100644 pennylane_lightning_gpu/src/util/CSRMatrix.hpp diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 150eb841..16a82f5f 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -22,16 +22,6 @@ namespace Pennylane::Algorithms { -inline size_t reverseBits(size_t num, size_t nbits){ - size_t reversed = 0; - for(size_t i = 0; i < nbits; ++i){ - //ith bit value - size_t bit = num & (1 << i); - reversed += bit << (nbits - i); - } - return reversed -} - /** * @brief A base class for all observable classes. 
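The send/receive pattern in scatterCSRMatrix above generalizes as follows — a minimal raw-MPI sketch with a single double buffer standing in for the three CSR arrays. The project's MPIManager wraps these calls; the guards mirror the code's convention of only sending non-empty blocks and only posting a receive when the local nnz count (scattered beforehand, which is also how the receiver knows what size to allocate) is nonzero:

```cpp
#include <mpi.h>
#include <vector>

// `blocks` is only populated on `root`; every receiver must have resized
// `local` to its expected length before calling this.
void scatter_blocks(const std::vector<std::vector<double>> &blocks,
                    std::vector<double> &local, int root, MPI_Comm comm) {
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if (rank == root && !blocks.empty())
        local = blocks[root]; // the root keeps its own block locally

    for (int k = 0; k < size; ++k) {
        if (k == root)
            continue;
        if (rank == root && !blocks[k].empty()) {
            MPI_Send(blocks[k].data(), static_cast<int>(blocks[k].size()),
                     MPI_DOUBLE, /*dest=*/k, /*tag=*/0, comm);
        } else if (rank == k && !local.empty()) {
            MPI_Recv(local.data(), static_cast<int>(local.size()), MPI_DOUBLE,
                     /*source=*/root, /*tag=*/0, comm, MPI_STATUS_IGNORE);
        }
    }
}
```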
* @@ -430,12 +420,11 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { int32_t, int64_t>::type; private: - MPIManager mpi_manager_; std::vector> data_; std::vector indices_; std::vector offsets_; + std::vector wires_; - [[nodiscard]] bool isEqual(const ObservableGPUMPI &other) const override { @@ -462,32 +451,10 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { */ template > - SparseHamiltonianGPUMPI(T1 &&arg1, T2 &&arg2, T3 &&arg3, T4 &&arg4):mpi_manager_(MPI_COMM_WORLD){ - if(mpi_manager_.getRank() == 0){ - std::> data_tmp = std::move(arg1); - std::vector indices_tmp = std::move(arg2); - std::vector offsets_tmp = std::move(arg3); - std::vector wires_tmp = std::move(arg4); - PL_ASSERT(data_tmp.size() == indices_tmp.size()); - - offsets_.push_back(0); - //sort sparseMV - for(size_t i = 0; i < offsets_tmp.size() - 1; i++){ - size_t target_row = reverseBits(i,wire_tmp.size()); - size_t offset = offsets_tmp[target_row]; - size_t local_col_size = offsets_tmp[target_row+1] - offset; - offset_.push_back(offsets_.back()+local_col_size); - //get unsorted column indices - std::vector local_col_indices(indices_.begin()+offset,indices_tmp.begin()+offset+local_col_size+1); - for(size_t j = 0; j < local_col_indices.size(); j++){ - size_t target_col = reverseBits(local_col_indices[i],wire_tmp.size()); - local_col_indices[i] = target_col; - } - //sort indices and get the rank of column index - - //sort data - } - } + SparseHamiltonianGPUMPI(T1 &&arg1, T2 &&arg2, T3 &&arg3, T4 &&arg4) + : data_{std::forward(arg1)}, indices_{std::forward(arg2)}, + offsets_{std::forward(arg3)}, wires_{std::forward(arg4)} { + PL_ASSERT(data_.size() == indices_.size()); } /** diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index bf55653f..d6674830 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -39,6 +39,7 @@ #ifdef ENABLE_MPI #include "AdjointDiffGPUMPI.hpp" +#include "CSRMatrix.hpp" #include "MPIManager.hpp" #include "StateVectorCudaMPI.hpp" #endif @@ -1608,13 +1609,24 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { const py::buffer_info buffer_offsets = offsets.request(); const auto *offsets_ptr = static_cast(buffer_offsets.ptr); + + MPIManager mpi_manager(MPI_COMM_WORLD); + CSRMatrix sparseMat(offsets.size() - 1, data.size(), indices_ptr, offsets_ptr, data_ptr); + auto sparseMatReorder = sparseMat.matrixReorder(); + + mpi_manager.Barrier(); + + return SparseHamiltonianGPUMPI{sparseMatReorder.getValues(), sparseMatReorder.getColumns(), sparseMatReorder.getCsrOffsets(), wires}; + + /* return SparseHamiltonianGPUMPI{ std::vector>( {data_ptr, data_ptr + data.size()}), std::vector({indices_ptr, indices_ptr + indices.size()}), std::vector({offsets_ptr, offsets_ptr + offsets.size()}), wires}; + */ })) .def("__repr__", &SparseHamiltonianGPUMPI::getObsName) .def("get_wires", &SparseHamiltonianGPUMPI::getWires, diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 20f58dd9..d3c8587f 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -27,6 +27,7 @@ #include #include // custatevecApplyMatrix +#include "CSRMatrix.hpp" #include "Constant.hpp" #include "Error.hpp" #include "MPIManager.hpp" @@ -64,44 +65,6 @@ extern void setBasisState_CUDA(cuDoubleComplex *sv, cuDoubleComplex 
&value, const size_t index, bool async, cudaStream_t stream_id); -/** - * @brief Manage memory of Compressed Sparse Row (CSR) sparse matrix. CSR format - * represents a matrix M by three (one-dimensional) arrays, that respectively - * contain nonzero values, row offsets, and column indices. - * - * @tparam Precision Floating-point precision type. - * @tparam index_type Integer type. - */ -template class CSRMatrix { - private: - std::vector columns; - std::vector csrOffsets; - std::vector> values; - - public: - CSRMatrix(size_t num_rows, size_t nnz) - : columns(nnz, 0), csrOffsets(num_rows + 1, 0), values(nnz){}; - - CSRMatrix() = default; - - /** - * @brief Get the CSR format index vector of the matrix. - */ - auto getColumns() -> std::vector & { return columns; } - - /** - * @brief Get CSR format offset vector of the matrix. - */ - auto getCsrOffsets() -> std::vector & { return csrOffsets; } - - /** - * @brief Get CSR format data vector of the matrix. - */ - auto getValues() -> std::vector> & { - return values; - } -}; - /** * @brief Managed memory CUDA state-vector class using custateVec backed * gate-calls. diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp new file mode 100644 index 00000000..81b482cd --- /dev/null +++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp @@ -0,0 +1,131 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "MPIManager.hpp" + +/// @cond DEV +namespace { +using namespace Pennylane::CUDA; +} // namespace +/// @endcond +namespace Pennylane::MPI { + +inline size_t reverseBits(size_t num, size_t nbits) { + size_t reversed = 0; + for (size_t i = 0; i < nbits; ++i) { + // ith bit value + size_t bit = (num & (1 << i)) >> i; + reversed += bit << (nbits - i - 1); + } + return reversed; +} + +template +inline std::tuple, std::vector> +rankVector(const std::vector &input) { + // Create a copy of the input vector for sorting + std::vector sortedInput = input; + + // Sort the copy in ascending order + std::sort(sortedInput.begin(), sortedInput.end()); + + std::vector ranks(input.size()); + for (size_t i = 0; i < input.size(); ++i) { + auto it = + std::lower_bound(sortedInput.begin(), sortedInput.end(), input[i]); + ranks[i] = std::distance(sortedInput.begin(), it); + } + + return {sortedInput, ranks}; +} + +/** + * @brief Manage memory of Compressed Sparse Row (CSR) sparse matrix. CSR format + * represents a matrix M by three (one-dimensional) arrays, that respectively + * contain nonzero values, row offsets, and column indices. + * + * @tparam Precision Floating-point precision type. + * @tparam index_type Integer type. + */ +template class CSRMatrix { + private: + std::vector columns_; + std::vector csrOffsets_; + std::vector> values_; + + public: + CSRMatrix(size_t num_rows, size_t nnz) + : columns_(nnz, 0), csrOffsets_(num_rows + 1, 0), values_(nnz){}; + + CSRMatrix(size_t num_rows, size_t nnz, const index_type *column_ptr, + const index_type *csrOffsets_ptr, const std::complex *value_ptr) + : columns_(column_ptr, column_ptr + nnz), + csrOffsets_(csrOffsets_ptr, csrOffsets_ptr + num_rows + 1), + values_(value_ptr, value_ptr + nnz){}; + + CSRMatrix() = default; + + /** + * @brief Get the CSR format index vector of the matrix. + */ + auto getColumns() -> std::vector & { return columns_; } + + /** + * @brief Get CSR format offset vector of the matrix. + */ + auto getCsrOffsets() -> std::vector & { return csrOffsets_; } + + /** + * @brief Get CSR format data vector of the matrix. 
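The rankVector helper above returns the ascending-sorted copy of its input together with each input element's position in that order; matrixReorder uses those ranks to keep the column indices of each row sorted after the bit-reversal permutation while carrying the matrix values along. A self-contained usage sketch (the helper is reproduced equivalently so the example compiles on its own):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <tuple>
#include <vector>

// Same contract as rankVector in CSRMatrix.hpp: the sorted copy of `input`
// plus, for each input element, its position in that sorted order.
template <class T>
std::tuple<std::vector<T>, std::vector<std::size_t>>
rankVector(const std::vector<T> &input) {
    std::vector<T> sorted = input;
    std::sort(sorted.begin(), sorted.end());
    std::vector<std::size_t> ranks(input.size());
    for (std::size_t i = 0; i < input.size(); ++i)
        ranks[i] = std::distance(
            sorted.begin(),
            std::lower_bound(sorted.begin(), sorted.end(), input[i]));
    return {sorted, ranks};
}

int main() {
    // Bit-reversed column indices of one CSR row, before re-sorting:
    std::vector<std::size_t> cols = {6, 1, 4};
    auto [sorted, ranks] = rankVector(cols);
    assert((sorted == std::vector<std::size_t>{1, 4, 6}));
    assert((ranks == std::vector<std::size_t>{2, 0, 1})); // 6->2, 1->0, 4->1
    return 0;
}
```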
+ */ + auto getValues() -> std::vector> & { + return values_; + } + + auto matrixReorder() -> CSRMatrix { + size_t num_rows = this->getCsrOffsets().size() - 1; + size_t nnz = this->getColumns().size(); + size_t nbits = std::bit_width(num_rows) - 1; + CSRMatrix reorderedMatrix(num_rows, nnz); + + for (size_t row_idx = 0; row_idx < num_rows; row_idx++) { + size_t org_row_idx = reverseBits(row_idx, nbits); + + size_t org_offset = this->getCsrOffsets()[org_row_idx]; + size_t local_col_size = + this->getCsrOffsets()[org_row_idx + 1] - org_offset; + + reorderedMatrix.getCsrOffsets()[row_idx + 1] = + local_col_size + reorderedMatrix.getCsrOffsets()[row_idx]; + + // get unsorted column indices + std::vector local_col_indices( + this->getColumns().begin() + org_offset, + this->getColumns().begin() + org_offset + local_col_size); + for (size_t i = 0; i < local_col_indices.size(); i++) { + size_t col_idx = reverseBits(local_col_indices[i], nbits); + local_col_indices[i] = col_idx; + } + + auto [sorted_col_indices, ranks] = + rankVector(local_col_indices); + + for (size_t i = 0; i < sorted_col_indices.size(); i++) { + reorderedMatrix + .getColumns()[reorderedMatrix.getCsrOffsets()[row_idx] + + i] = sorted_col_indices[i]; + reorderedMatrix + .getValues()[reorderedMatrix.getCsrOffsets()[row_idx] + i] = + this->getValues()[org_offset + ranks[i]]; + + } + } + return reorderedMatrix; + } +}; +} // namespace Pennylane::MPI \ No newline at end of file From 45633b8b10b71f27e775e7e319927f869ae9cda0 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 23 Aug 2023 07:22:36 -0700 Subject: [PATCH 37/43] refactor binding --- .../src/bindings/Bindings.cpp | 39 ++++------------- .../src/simulator/StateVectorCudaMPI.hpp | 43 ++++++++++++++++--- .../src/util/CSRMatrix.hpp | 2 +- 3 files changed, 46 insertions(+), 38 deletions(-) diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index 33a6ece5..04a36034 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -1361,39 +1361,16 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { const np_arr_sparse_ind &columns, const np_arr_c values) { using index_type = typename std::conditional< std::is_same::value, int32_t, int64_t>::type; - //Distribute sparse matrix across multi-nodes/multi-gpus - size_t num_rows = - static_cast(csrOffsets.request().size) - 1; - size_t local_num_rows = size_t{1} << sv.getNumLocalQubits(); - size_t root = 0; - - MPIManager mpi_manager = sv.getMPIManager(); - - std::vector>> - csrmatrix_blocks; - - if (mpi_manager.getRank() == root) { - csrmatrix_blocks = splitCSRMatrix( - mpi_manager, num_rows, - static_cast(csrOffsets.request().ptr), - static_cast(columns.request().ptr), - static_cast *>( - values.request().ptr)); - } - mpi_manager.Barrier(); - - std::vector> - localCSRMatVector; - for (size_t i = 0; i < mpi_manager.getSize(); i++) { - auto localCSRMat = scatterCSRMatrix( - mpi_manager, csrmatrix_blocks[i], local_num_rows, root); - localCSRMatVector.push_back(localCSRMat); - } - - mpi_manager.Barrier(); return sv.template getExpectationValueOnSparseSpMV( - localCSRMatVector); + static_cast(csrOffsets.request().ptr), + static_cast( + csrOffsets.request() + .size), // num_rows + 1 or csrOffsets + static_cast(columns.request().ptr), // columns + static_cast *>( + values.request().ptr), + static_cast(values.request().size)); // nnz }, "Calculate the expectation value of a sparse Hamiltonian.") diff 
--git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index b1c52568..93971616 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -968,7 +968,16 @@ class StateVectorCudaMPI */ template auto getExpectationValueOnSparseSpMV( - std::vector> &CSRMatVector) { + const index_type *csrOffsets_ptr, const index_type csrOffsets_size, + const index_type *columns_ptr, + const std::complex *values_ptr, const index_type numNNZ) { + + if (mpi_manager_.getRank() == 0) { + PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == + (size_t{1} << this->getTotalNumQubits()), + "Incorrect size of CSR Offsets."); + PL_ABORT_IF_NOT(numNNZ > 0, "Empty CSR matrix."); + } const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -998,6 +1007,30 @@ class StateVectorCudaMPI compute_type = CUSPARSE_INDEX_32I; } + // Distribute sparse matrix across multi-nodes/multi-gpus + size_t num_rows = size_t{1} << this->getTotalNumQubits(); + size_t local_num_rows = size_t{1} << this->getNumLocalQubits(); + size_t root = 0; + + std::vector>> + csrmatrix_blocks; + + if (mpi_manager_.getRank() == root) { + csrmatrix_blocks = splitCSRMatrix( + mpi_manager_, num_rows, csrOffsets_ptr, columns_ptr, + values_ptr); + } + mpi_manager_.Barrier(); + + std::vector> localCSRMatVector; + for (size_t i = 0; i < mpi_manager_.getSize(); i++) { + auto localCSRMat = scatterCSRMatrix( + mpi_manager_, csrmatrix_blocks[i], local_num_rows, root); + localCSRMatVector.push_back(localCSRMat); + } + + mpi_manager_.Barrier(); + const size_t length_local = size_t{1} << this->getNumLocalQubits(); DataBuffer d_res_per_block{length_local, device_id, @@ -1009,12 +1042,10 @@ class StateVectorCudaMPI for (size_t i = 0; i < mpi_manager_.getSize(); i++) { // Need to investigate if non-blocking MPI operation can improve // performace here. 
- auto &localCSRMatrix = CSRMatVector[i]; + auto &localCSRMatrix = localCSRMatVector[i]; - int64_t num_rows_local = - static_cast(localCSRMatrix.getCsrOffsets().size() - 1); - int64_t num_cols_local = - static_cast(localCSRMatrix.getColumns().size()); + int64_t num_rows_local = local_num_rows; + int64_t num_cols_local = num_rows_local; int64_t nnz_local = static_cast(localCSRMatrix.getValues().size()); diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp index 7a87673a..31a64be5 100644 --- a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp +++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp @@ -33,7 +33,7 @@ template class CSRMatrix { CSRMatrix(size_t num_rows, size_t nnz, index_type *column_ptr, index_type *csrOffsets_ptr, std::complex *value_ptr) - : columns_(column_ptr, column_ptr + num_rows), + : columns_(column_ptr, column_ptr + nnz), csrOffsets_(csrOffsets_ptr, csrOffsets_ptr + num_rows + 1), values_(value_ptr, value_ptr + nnz){}; From 77ecd93b3b3a7a452c7646c4e4e041d799063f1c Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 23 Aug 2023 11:14:34 -0700 Subject: [PATCH 38/43] init zeros for tmp for empty spmat --- .github/CHANGELOG.md | 11 +-- mpitests/test_adjoint_jacobian.py | 32 ++------ pennylane_lightning_gpu/_serialize.py | 55 +++++-------- pennylane_lightning_gpu/lightning_gpu.py | 2 +- .../src/algorithms/ObservablesGPUMPI.hpp | 77 ++++++++++++------- .../src/bindings/Bindings.cpp | 12 --- .../src/simulator/StateVectorCudaMPI.hpp | 11 ++- .../src/util/CSRMatrix.hpp | 15 ++++ 8 files changed, 100 insertions(+), 115 deletions(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index ebb60298..dee589e5 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -4,16 +4,9 @@ * Add sparse Hamiltonian support to multi-node/multi-GPU adjoint methods. [(#128)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/128) - - Note each MPI process will return the overall result of the adjoint method. If there is a global target wire when constructing the sparse Hamiltonian, `wires` should be reordered in a descending manner in a python script as follows: - ```python - qml.SparseHamiltonian( - qml.Hamiltonian( - [0.1], [qml.PauliX(0) @ qml.PauliY(1)] - ).sparse_matrix(wires.[::-1]), - ) + Note each MPI process will return the overall result of the adjoint method. ``` - + * Add Sparse Hamiltonian support for expectation value calculation. [(#127)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/127) diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py index 97ee4ec2..a4dcb197 100644 --- a/mpitests/test_adjoint_jacobian.py +++ b/mpitests/test_adjoint_jacobian.py @@ -1046,23 +1046,13 @@ def circuit(params): j_gpu = qml.jacobian(qnode_gpu)(params) -""" Important Note: Ensure wires index arranged from high to low if - there is a global target wire. 
- qml.SparseHamiltonian( - qml.Hamiltonian( - [0.1], [qml.PauliX(wires=custom_wires[1]) @ qml.PauliY(wires=custom_wires[0])] - ).sparse_matrix(custom_wires.[::-1]), - ) -""" - - @pytest.mark.parametrize( "returns", [ qml.SparseHamiltonian( qml.Hamiltonian( [0.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliY(wires=custom_wires[1])] - ).sparse_matrix(custom_wires[::-1]), + ).sparse_matrix(custom_wires), wires=custom_wires, ), qml.SparseHamiltonian( @@ -1080,7 +1070,7 @@ def circuit(params): qml.SparseHamiltonian( qml.Hamiltonian( [1.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[2])] - ).sparse_matrix(custom_wires[::-1]), + ).sparse_matrix(custom_wires), wires=custom_wires, ), ], @@ -1115,18 +1105,6 @@ def circuit(params): assert np.allclose(j_cpu, j_gpu) -""" Important Note: Ensure wires index arranged from high to low if - there is a global target wire. - qml.SparseHamiltonian( - qml.Hamiltonian( - [0.1], - [qml.PauliX(0) @ qml.PauliZ(1)], - ).sparse_matrix(range(5, -1, -1)), - wires=range(6), - ) -""" - - @pytest.mark.parametrize( "returns", [ @@ -1134,7 +1112,7 @@ def circuit(params): qml.Hamiltonian( [0.1], [qml.PauliZ(1) @ qml.PauliX(0) @ qml.Identity(2) @ qml.PauliX(4) @ qml.Identity(5)], - ).sparse_matrix(range(5, -1, -1)), + ).sparse_matrix(range(6)), wires=range(6), ), qml.SparseHamiltonian( @@ -1148,7 +1126,7 @@ def circuit(params): qml.Hamiltonian( [0.1], [qml.PauliX(0)], - ).sparse_matrix(range(5, -1, -1)), + ).sparse_matrix(range(6)), wires=range(6), ), qml.SparseHamiltonian( @@ -1162,7 +1140,7 @@ def circuit(params): qml.Hamiltonian( [0.1], [qml.PauliX(0) @ qml.PauliZ(1)], - ).sparse_matrix(range(5, -1, -1)), + ).sparse_matrix(range(6)), wires=range(6), ), qml.SparseHamiltonian( diff --git a/pennylane_lightning_gpu/_serialize.py b/pennylane_lightning_gpu/_serialize.py index de108949..af96e082 100644 --- a/pennylane_lightning_gpu/_serialize.py +++ b/pennylane_lightning_gpu/_serialize.py @@ -122,14 +122,14 @@ def _hamiltonian_ob_dtype(use_csingle, use_mpi: bool): def _sparsehamiltonian_ob_dtype(use_csingle, use_mpi: bool): if not use_mpi: return ( - [SparseHamiltonianGPU_C64, np.float32, np.int32] + [SparseHamiltonianGPU_C64, np.complex64, np.int32] if use_csingle - else [SparseHamiltonianGPU_C128, np.float64, np.int64] + else [SparseHamiltonianGPU_C128, np.complex128, np.int64] ) return ( - [SparseHamiltonianGPUMPI_C64, np.float32, np.int32] + [SparseHamiltonianGPUMPI_C64, np.complex64, np.int32] if use_csingle - else [SparseHamiltonianGPUMPI_C128, np.float64, np.int64] + else [SparseHamiltonianGPUMPI_C128, np.complex128, np.int64] ) @@ -169,40 +169,25 @@ def _serialize_hamiltonian( def _serialize_sparsehamiltonian(ob, wires_map: dict, use_csingle: bool, use_mpi: bool): sparsehamiltonian_obs, ctype, rtype = _sparsehamiltonian_ob_dtype(use_csingle, use_mpi) - + wires = [] + wires_list = ob.wires.tolist() + wires.extend([wires_map[w] for w in wires_list]) if use_mpi: mpi_manager_local = MPIManager() + # Other root only needs non-null some sparsematrix data to pass + obs = qml.Identity(0) + Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix() + H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1)) + spm = H_sparse.sparse_matrix() # Only root 0 needs the overall sparsematrix data if mpi_manager_local.getRank() == 0: spm = ob.sparse_matrix() - data = np.array(spm.data).astype(ctype) - indices = np.array(spm.indices).astype(rtype) - offsets = np.array(spm.indptr).astype(rtype) - wires = [] - wires_list = ob.wires.tolist() - wires.extend([wires_map[w] for w 
in wires_list]) - else: - # Other root only needs non-null some sparsematrix data to pass - obs = qml.Identity(0) - Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix() - H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1)) - spm = H_sparse.sparse_matrix() - - data = np.array(spm.data).astype(ctype) - indices = np.array(spm.indices).astype(rtype) - offsets = np.array(spm.indptr).astype(rtype) - - wires = [] - wires_list = ob.wires.tolist() - wires.extend([wires_map[w] for w in wires_list]) + mpi_manager_local.Barrier() else: spm = ob.sparse_matrix() - data = np.array(spm.data).astype(ctype) - indices = np.array(spm.indices).astype(rtype) - offsets = np.array(spm.indptr).astype(rtype) - wires = [] - wires_list = ob.wires.tolist() - wires.extend([wires_map[w] for w in wires_list]) + data = np.array(spm.data).astype(ctype) + indices = np.array(spm.indices).astype(rtype) + offsets = np.array(spm.indptr).astype(rtype) return sparsehamiltonian_obs(data, indices, offsets, wires) @@ -272,9 +257,10 @@ def _serialize_observables( tape (QuantumTape): the input quantum tape wires_map (dict): a dictionary mapping input wires to the device's backend wires use_csingle (bool): whether to use np.complex64 instead of np.complex128 + use_mpi (bool): whether MPI is used or not Returns: - list(ObservableGPU_C64 or ObservableGPU_C128): A list of observable objects compatible with the C++ backend + list(ObservableGPU_C64/128 or ObservableGPUMPI_C64/128): A list of observable objects compatible with the C++ backend """ output = [] offsets = [0] @@ -291,7 +277,7 @@ def _serialize_observables( def _serialize_ops( - tape: QuantumTape, wires_map: dict, use_csingle: bool = False + tape: QuantumTape, wires_map: dict, use_csingle: bool = False, use_mpi: bool = False ) -> Tuple[List[List[str]], List[np.ndarray], List[List[int]], List[bool], List[np.ndarray]]: """Serializes the operations of an input tape. 
@@ -301,6 +287,7 @@ def _serialize_ops( tape (QuantumTape): the input quantum tape wires_map (dict): a dictionary mapping input wires to the device's backend wires use_csingle (bool): whether to use np.complex64 instead of np.complex128 + use_mpi (bool): whether MPI is used or not Returns: Tuple[list, list, list, list, list]: A serialization of the operations, containing a list @@ -315,7 +302,7 @@ def _serialize_ops( uses_stateprep = False - sv_py = LightningGPU_C64 if use_csingle else LightningGPU_C128 + sv_py = _sv_py_dtype(use_csingle, use_mpi) for o in tape.operations: if isinstance(o, (BasisState, QubitStateVector)): diff --git a/pennylane_lightning_gpu/lightning_gpu.py b/pennylane_lightning_gpu/lightning_gpu.py index f9b2aaad..86825c8d 100644 --- a/pennylane_lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning_gpu/lightning_gpu.py @@ -666,7 +666,7 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False, ** ) ops_serialized, use_sp = _serialize_ops( - tape, self.wire_map, use_csingle=self.use_csingle + tape, self.wire_map, use_csingle=self.use_csingle, use_mpi=self._mpi ) ops_serialized = adj.create_ops_list(*ops_serialized) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index 16a82f5f..d99431ba 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -17,6 +17,7 @@ #include #include +#include "CSRMatrix.hpp" #include "MPIManager.hpp" #include "StateVectorCudaMPI.hpp" @@ -493,6 +494,28 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { "SparseH wire count does not match state-vector size"); } + // Distribute sparse matrix across multi-nodes/multi-gpus + size_t num_rows = size_t{1} << sv.getTotalNumQubits(); + size_t local_num_rows = size_t{1} << sv.getNumLocalQubits(); + + std::vector>> csrmatrix_blocks; + + if (mpi_manager.getRank() == 0) { + csrmatrix_blocks = splitCSRMatrix( + mpi_manager, num_rows, offsets_.data(), indices_.data(), + data_.data()); + } + mpi_manager.Barrier(); + + std::vector> localCSRMatVector; + for (size_t i = 0; i < mpi_manager.getSize(); i++) { + auto localCSRMat = scatterCSRMatrix( + mpi_manager, csrmatrix_blocks[i], local_num_rows, 0); + localCSRMatVector.push_back(localCSRMat); + } + + mpi_manager.Barrier(); + using CFP_t = typename StateVectorCudaMPI::CFP_t; const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -521,38 +544,24 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { compute_type = CUSPARSE_INDEX_32I; } - std::vector>> csrmatrix_blocks; - - const size_t num_col_blocks = mpi_manager.getSize(); - const size_t num_row_blocks = mpi_manager.getSize(); - - if (mpi_manager.getRank() == 0) { - csrmatrix_blocks = sv.template splitCSRMatrix( - num_col_blocks, num_row_blocks, offsets_.data(), - indices_.data(), data_.data()); - } - const size_t length_local = size_t{1} << sv.getNumLocalQubits(); - DevTag dt_local(sv.getDataBuffer().getDevTag()); - dt_local.refresh(); - - StateVectorCudaMPI d_sv_prime( - dt_local, sv.getNumGlobalQubits(), sv.getNumLocalQubits(), - sv.getData()); + std::unique_ptr> d_sv_prime = + std::make_unique>(length_local, device_id, + stream_id, true); + std::unique_ptr> d_tmp = + std::make_unique>(length_local, device_id, + stream_id, true); + d_sv_prime->zeroInit(); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager.Barrier(); - StateVectorCudaMPI d_tmp(dt_local, 
sv.getNumGlobalQubits(), - sv.getNumLocalQubits(), - sv.getData()); - - for (size_t i = 0; i < num_row_blocks; i++) { - auto localCSRMatrix = - sv.template scatterCSRMatrix(csrmatrix_blocks[i], 0); + for (size_t i = 0; i < mpi_manager.getSize(); i++) { + auto &localCSRMatrix = localCSRMatVector[i]; int64_t num_rows_local = static_cast(localCSRMatrix.getCsrOffsets().size() - 1); - int64_t num_cols_local = - static_cast(localCSRMatrix.getColumns().size()); + int64_t num_cols_local = num_rows_local; int64_t nnz_local = static_cast(localCSRMatrix.getValues().size()); @@ -611,7 +620,7 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { PL_CUSPARSE_IS_SUCCESS(cusparseCreateDnVec( /* cusparseDnVecDescr_t* */ &vecY, /* int64_t */ num_rows_local, - /* void* */ d_tmp.getData(), + /* void* */ d_tmp->getData(), /* cudaDataType */ data_type)); // allocate an external buffer if needed @@ -650,10 +659,17 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { color = 1; } + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager.Barrier(); if (mpi_manager.getRank() == i) { color = 1; + if (localCSRMatrix.getValues().size() == 0) { + d_tmp->zeroInit(); + } } + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager.Barrier(); auto new_mpi_manager = mpi_manager.split(color, mpi_manager.getRank()); @@ -667,11 +683,14 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI { if (new_mpi_manager.getComm() != MPI_COMM_NULL) { new_mpi_manager.template Reduce( - d_tmp.getData(), d_sv_prime.getData(), length_local, + d_tmp->getData(), d_sv_prime->getData(), length_local, reduce_root_rank, "sum"); } + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager.Barrier(); } - sv.CopyGpuDataToGpuIn(d_sv_prime.getData(), d_sv_prime.getLength()); + sv.CopyGpuDataToGpuIn(d_sv_prime->getData(), d_sv_prime->getLength()); + mpi_manager.Barrier(); } [[nodiscard]] auto getObsName() const -> std::string override { diff --git a/pennylane_lightning_gpu/src/bindings/Bindings.cpp b/pennylane_lightning_gpu/src/bindings/Bindings.cpp index e12ff8cd..907cf0c6 100644 --- a/pennylane_lightning_gpu/src/bindings/Bindings.cpp +++ b/pennylane_lightning_gpu/src/bindings/Bindings.cpp @@ -1361,7 +1361,6 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { const np_arr_sparse_ind &columns, const np_arr_c values) { using index_type = typename std::conditional< std::is_same::value, int32_t, int64_t>::type; - return sv.template getExpectationValueOnSparseSpMV( static_cast(csrOffsets.request().ptr), static_cast( @@ -1610,24 +1609,13 @@ void StateVectorCudaMPI_class_bindings(py::module &m) { const py::buffer_info buffer_offsets = offsets.request(); const auto *offsets_ptr = static_cast(buffer_offsets.ptr); - - MPIManager mpi_manager(MPI_COMM_WORLD); - - CSRMatrix sparseMat(offsets.size() - 1, data.size(), indices_ptr, offsets_ptr, data_ptr); - auto sparseMatReorder = sparseMat.matrixReorder(); - - mpi_manager.Barrier(); - - return SparseHamiltonianGPUMPI{sparseMatReorder.getValues(), sparseMatReorder.getColumns(), sparseMatReorder.getCsrOffsets(), wires}; - /* return SparseHamiltonianGPUMPI{ std::vector>( {data_ptr, data_ptr + data.size()}), std::vector({indices_ptr, indices_ptr + indices.size()}), std::vector({offsets_ptr, offsets_ptr + offsets.size()}), wires}; - */ })) .def("__repr__", &SparseHamiltonianGPUMPI::getObsName) .def("get_wires", &SparseHamiltonianGPUMPI::getWires, diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp 
b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 93971616..2143557e 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -1010,12 +1010,11 @@ class StateVectorCudaMPI // Distribute sparse matrix across multi-nodes/multi-gpus size_t num_rows = size_t{1} << this->getTotalNumQubits(); size_t local_num_rows = size_t{1} << this->getNumLocalQubits(); - size_t root = 0; std::vector>> csrmatrix_blocks; - if (mpi_manager_.getRank() == root) { + if (mpi_manager_.getRank() == 0) { csrmatrix_blocks = splitCSRMatrix( mpi_manager_, num_rows, csrOffsets_ptr, columns_ptr, values_ptr); @@ -1025,7 +1024,7 @@ class StateVectorCudaMPI std::vector> localCSRMatVector; for (size_t i = 0; i < mpi_manager_.getSize(); i++) { auto localCSRMat = scatterCSRMatrix( - mpi_manager_, csrmatrix_blocks[i], local_num_rows, root); + mpi_manager_, csrmatrix_blocks[i], local_num_rows, 0); localCSRMatVector.push_back(localCSRMat); } @@ -1151,8 +1150,14 @@ class StateVectorCudaMPI if (mpi_manager_.getRank() == i) { color = 1; + if (localCSRMatrix.getValues().size() == 0) { + d_res_per_block.zeroInit(); + } } + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager_.Barrier(); + auto new_mpi_manager = mpi_manager_.split(color, mpi_manager_.getRank()); int reduce_root_rank = -1; diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp index 31a64be5..59856d46 100644 --- a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp +++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp @@ -57,6 +57,21 @@ template class CSRMatrix { } }; +/** + * @brief Convert a global CSR (Compressed Sparse Row) format matrix into + * local blocks. This operation should be conducted on the rank 0. + * + * @tparam index_type Integer type used as indices of the sparse matrix. + * @param num_row_blocks Number of local blocks per global row. + * @param num_col_blocks Number of local blocks per global column. + * @param csrOffsets_ptr Pointer to the array of row offsets of the sparse + * matrix. Array of size csrOffsets_size. + * @param columns_ptr Pointer to the array of column indices of the sparse + * matrix. Array of size numNNZ + * @param values_ptr Pointer to the array of the non-zero elements + * + * @return auto A vector of vector of CSRMatrix. + */ template auto splitCSRMatrix(MPIManager &mpi_manager, const size_t &num_rows, const index_type *csrOffsets_ptr, From c2c8e5d348c9e5d6a1db31c18d3c0f7ccdba0767 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 23 Aug 2023 11:19:45 -0700 Subject: [PATCH 39/43] update changelog and merge main --- .github/CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index d685a518..90cfa934 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -5,7 +5,6 @@ * Add sparse Hamiltonian support to multi-node/multi-GPU adjoint methods. [(#128)] (https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/128) Note each MPI process will return the overall result of the adjoint method. - ``` * Add Sparse Hamiltonian support for expectation value calculation. 
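The zero-initialisation added above matters because the rank that owns row block i always joins the reduction, even when its own local block is empty and no SpMV ran to overwrite its buffer. A raw-MPI sketch of the per-row-block reduction pattern, simplified in that the split key pins the owner to sub-rank 0 — the actual code computes reduce_root_rank explicitly, and MPIManager wraps the MPI calls:

```cpp
#include <mpi.h>

// Every rank holding a partial product for row block `owner` joins a
// sub-communicator; the split key makes the owner sub-rank 0, and the
// partial vectors are summed onto it.
void reduce_row_block(const double *partial, double *result, int len,
                      int owner, int have_data, MPI_Comm comm) {
    int rank;
    MPI_Comm_rank(comm, &rank);
    int participate = (have_data || rank == owner);
    int color = participate ? 1 : MPI_UNDEFINED; // others get MPI_COMM_NULL
    int key = (rank == owner) ? 0 : rank + 1;    // owner becomes sub-rank 0
    MPI_Comm sub;
    MPI_Comm_split(comm, color, key, &sub);
    if (sub != MPI_COMM_NULL) {
        MPI_Reduce(partial, result, len, MPI_DOUBLE, MPI_SUM, /*root=*/0, sub);
        MPI_Comm_free(&sub);
    }
}
```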
[(#127)]
(https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/127)

From 877760ac0d6dcf8df2ac02c372f579bd211fa1dc Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 23 Aug 2023 12:14:31 -0700
Subject: [PATCH 40/43] add unit tests for CSRMatrix related methods

---
 .../src/tests/mpi/Test_MPIManager.cpp         | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/pennylane_lightning_gpu/src/tests/mpi/Test_MPIManager.cpp b/pennylane_lightning_gpu/src/tests/mpi/Test_MPIManager.cpp
index 4c75e1dd..613487f1 100644
--- a/pennylane_lightning_gpu/src/tests/mpi/Test_MPIManager.cpp
+++ b/pennylane_lightning_gpu/src/tests/mpi/Test_MPIManager.cpp
@@ -8,6 +8,7 @@

 #include 

+#include "CSRMatrix.hpp"
 #include "MPIManager.hpp"

 #include "../TestHelpersLGPU.hpp"
@@ -217,4 +218,112 @@ TEST_CASE("MPIManager::split") {
     int key = rank;
     auto newComm = mpi_manager.split(color, key);
     CHECK(newComm.getSize() * 2 == mpi_manager.getSize());
+}
+
+TEMPLATE_TEST_CASE("CSRMatrix::Split", "[CSRMatrix]", float, double) {
+    using PrecisionT = TestType;
+    using cp_t = std::complex;
+    using index_type =
+        typename std::conditional::value, int32_t,
+                                  int64_t>::type;
+
+    MPIManager mpi_manager(MPI_COMM_WORLD);
+
+    int rank = mpi_manager.getRank();
+    int size = mpi_manager.getSize();
+
+    index_type csrOffsets[9] = {0, 2, 4, 6, 8, 10, 12, 14, 16};
+    index_type columns[16] = {0, 3, 1, 2, 1, 2, 0, 3, 4, 7, 5, 6, 5, 6, 4, 7};
+
+    cp_t values[16] = {{1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0},
+                       {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0},
+                       {1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0},
+                       {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0}};
+
+    index_type num_csrOffsets = 9;
+    index_type num_rows = num_csrOffsets - 1;
+
+    SECTION("Apply split") {
+        if (rank == 0) {
+            auto CSRMatVector = splitCSRMatrix(
+                mpi_manager, num_rows, csrOffsets, columns, values);
+
+            std::vector localcsrOffsets = {0, 2, 4, 6, 8};
+            std::vector local_indices = {0, 3, 1, 2, 1, 2, 0, 3};
+
+            for (size_t i = 0; i < localcsrOffsets.size(); i++) {
+                CHECK(CSRMatVector[0][0].getCsrOffsets()[i] ==
+                      localcsrOffsets[i]);
+                CHECK(CSRMatVector[1][1].getCsrOffsets()[i] ==
+                      localcsrOffsets[i]);
+            }
+
+            for (size_t i = 0; i < local_indices.size(); i++) {
+                CHECK(CSRMatVector[0][0].getColumns()[i] == local_indices[i]);
+                CHECK(CSRMatVector[1][1].getColumns()[i] == local_indices[i]);
+            }
+
+            for (size_t i = 0; i < 8; i++) {
+                CHECK(CSRMatVector[0][0].getValues()[i] == values[i]);
+                CHECK(CSRMatVector[1][1].getValues()[i] == values[i + 8]);
+            }
+
+            CHECK(CSRMatVector[0][1].getValues().size() == 0);
+            CHECK(CSRMatVector[1][0].getValues().size() == 0);
+        }
+    }
+
+    SECTION("Apply SparseMatrix scatter") {
+        std::vector>>
+            csrmatrix_blocks;
+
+        if (rank == 0) {
+            csrmatrix_blocks = splitCSRMatrix(
+                mpi_manager, num_rows, csrOffsets, columns, values);
+        }
+
+        size_t local_num_rows = num_rows / size;
+
+        std::vector> localCSRMatVector;
+        for (size_t i = 0; i < mpi_manager.getSize(); i++) {
+            auto localCSRMat = scatterCSRMatrix(
+                mpi_manager, csrmatrix_blocks[i], local_num_rows, 0);
+            localCSRMatVector.push_back(localCSRMat);
+        }
+
+        std::vector localcsrOffsets = {0, 2, 4, 6, 8};
+        std::vector local_indices = {0, 3, 1, 2, 1, 2, 0, 3};
+
+        if (rank == 0) {
+            for (size_t i = 0; i < localcsrOffsets.size(); i++) {
+                CHECK(localCSRMatVector[0].getCsrOffsets()[i] ==
+                      localcsrOffsets[i]);
+            }
+
+            for (size_t i = 0; i < local_indices.size(); i++) {
+                CHECK(localCSRMatVector[0].getColumns()[i] == local_indices[i]);
+            }
+
+            for (size_t i = 0;
i < 8; i++) { + CHECK(localCSRMatVector[0].getValues()[i] == values[i]); + } + + CHECK(localCSRMatVector[1].getValues().size() == 0); + } else { + for (size_t i = 0; i < localcsrOffsets.size(); i++) { + CHECK(localCSRMatVector[1].getCsrOffsets()[i] == + localcsrOffsets[i]); + } + + for (size_t i = 0; i < local_indices.size(); i++) { + CHECK(localCSRMatVector[1].getColumns()[i] == local_indices[i]); + } + + for (size_t i = 0; i < 8; i++) { + CHECK(localCSRMatVector[1].getValues()[i] == values[i + 8]); + } + + CHECK(localCSRMatVector[0].getValues().size() == 0); + } + } } \ No newline at end of file From 6a8cce0a3bc7dd5a2a7cd22f5aa7377afb6761d5 Mon Sep 17 00:00:00 2001 From: Shuli Shu <08cnbj@gmail.com> Date: Wed, 23 Aug 2023 19:11:08 -0700 Subject: [PATCH 41/43] add docstring --- .../src/algorithms/ObservablesGPUMPI.hpp | 5 ++--- .../src/simulator/StateVectorCudaMPI.hpp | 1 - pennylane_lightning_gpu/src/util/CSRMatrix.hpp | 11 +++++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp index d99431ba..fdaefc15 100644 --- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp +++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp @@ -364,9 +364,8 @@ class HamiltonianGPUMPI final : public ObservableGPUMPI { for (size_t term_idx = 0; term_idx < coeffs_.size(); term_idx++) { DevTag dt_local(sv.getDataBuffer().getDevTag()); dt_local.refresh(); - StateVectorCudaMPI tmp( - dt_local, sv.getNumGlobalQubits(), sv.getNumLocalQubits(), - sv.getData()); + StateVectorCudaMPI tmp(dt_local, sv.getNumGlobalQubits(), + sv.getNumLocalQubits(), sv.getData()); obs_[term_idx]->applyInPlace(tmp); scaleAndAddC_CUDA(std::complex{coeffs_[term_idx], 0.0}, tmp.getData(), buffer.getData(), tmp.getLength(), diff --git a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp index 4de7efe1..d8efa764 100644 --- a/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp +++ b/pennylane_lightning_gpu/src/simulator/StateVectorCudaMPI.hpp @@ -959,7 +959,6 @@ class StateVectorCudaMPI const index_type *csrOffsets_ptr, const index_type csrOffsets_size, const index_type *columns_ptr, const std::complex *values_ptr, const index_type numNNZ) { - if (mpi_manager_.getRank() == 0) { PL_ABORT_IF_NOT(static_cast(csrOffsets_size - 1) == (size_t{1} << this->getTotalNumQubits()), diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp index 59856d46..78a60ddf 100644 --- a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp +++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp @@ -61,9 +61,10 @@ template class CSRMatrix { * @brief Convert a global CSR (Compressed Sparse Row) format matrix into * local blocks. This operation should be conducted on the rank 0. * + * @tparam Precision Floating-point precision type. * @tparam index_type Integer type used as indices of the sparse matrix. - * @param num_row_blocks Number of local blocks per global row. - * @param num_col_blocks Number of local blocks per global column. + * @param mpi_manager MPIManager object. + * @param num_rows Number of rows of the CSR matrix. * @param csrOffsets_ptr Pointer to the array of row offsets of the sparse * matrix. Array of size csrOffsets_size. 
 * @param columns_ptr Pointer to the array of column indices of the sparse
@@ -147,8 +148,11 @@ auto splitCSRMatrix(MPIManager &mpi_manager, const size_t &num_rows,
 /**
  * @brief Scatter a CSR (Compressed Sparse Row) format matrix.
  *
+ * @tparam Precision Floating-point precision type.
  * @tparam index_type Integer type used as indices of the sparse matrix.
- * @param matrix CSR (Compressed Sparse Row) format matrix.
+ * @param mpi_manager MPIManager object.
+ * @param matrix CSR (Compressed Sparse Row) format matrix vector.
+ * @param local_num_rows Number of rows of local CSR matrix.
  * @param root Root rank of the scatter operation.
  */
 template <class Precision, class index_type>
@@ -156,7 +160,6 @@ auto scatterCSRMatrix(MPIManager &mpi_manager,
                       std::vector<CSRMatrix<Precision, index_type>> &matrix,
                       size_t local_num_rows, size_t root)
     -> CSRMatrix<Precision, index_type> {
-    // Bcast num_rows and num_cols
     size_t num_col_blocks = mpi_manager.getSize();
     std::vector<size_t> nnzs;

From d1f14558afaf3b91ba9cf0f5f38f6604d96d8cac Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 24 Aug 2023 06:20:56 -0700
Subject: [PATCH 42/43] add license header

---
 pennylane_lightning_gpu/src/util/CSRMatrix.hpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp
index 78a60ddf..5da81235 100644
--- a/pennylane_lightning_gpu/src/util/CSRMatrix.hpp
+++ b/pennylane_lightning_gpu/src/util/CSRMatrix.hpp
@@ -1,3 +1,17 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once
 
 #include

From d75c26800cd9ac80bcf53e2b691bcc3caf33c8e5 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 24 Aug 2023 06:47:36 -0700
Subject: [PATCH 43/43] tidy up code

---
 pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
index fdaefc15..67df085f 100644
--- a/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
+++ b/pennylane_lightning_gpu/src/algorithms/ObservablesGPUMPI.hpp
@@ -522,7 +522,6 @@ class SparseHamiltonianGPUMPI final : public ObservableGPUMPI<T> {
         auto device_id = sv.getDataBuffer().getDevTag().getDeviceID();
         auto stream_id = sv.getDataBuffer().getDevTag().getStreamID();
 
-        // clang-format on
         cudaDataType_t data_type;
         cusparseIndexType_t compute_type;
        const cusparseOperation_t operation_type =
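
A note on the block decomposition exercised by the CSRMatrix::Split test above: the splitting pass walks every non-zero of the global CSR matrix once, assigns it to a (block row, block column) tile by integer division of its global indices, counts non-zeros per local row, and finally prefix-sums those counts into per-tile CSR offsets. The standalone sketch below illustrates that logic on the same 8x8 sparsity pattern used by the test (values simplified to 1). It is an editorial illustration only; the names Block and splitIntoBlocks are hypothetical and are not the library API.

#include <complex>
#include <cstddef>
#include <iostream>
#include <vector>

// One CSR tile of the decomposed matrix.
struct Block {
    std::vector<long> csrOffsets, columns;
    std::vector<std::complex<double>> values;
};

// Split a (num_rows x num_rows) CSR matrix into num_blocks^2 square tiles.
std::vector<std::vector<Block>>
splitIntoBlocks(std::size_t num_rows, std::size_t num_blocks,
                const std::vector<long> &offsets, const std::vector<long> &cols,
                const std::vector<std::complex<double>> &vals) {
    const std::size_t block_size = num_rows / num_blocks;
    std::vector<std::vector<Block>> blocks(num_blocks,
                                           std::vector<Block>(num_blocks));
    for (std::size_t row = 0; row < num_rows; row++) {
        for (long idx = offsets[row]; idx < offsets[row + 1]; idx++) {
            Block &b = blocks[row / block_size][cols[idx] / block_size];
            if (b.csrOffsets.empty()) // allocate offsets lazily, so tiles
                                      // with no non-zeros stay empty
                b.csrOffsets.assign(block_size + 1, 0);
            b.csrOffsets[row % block_size + 1]++;        // per-row nnz count
            b.columns.push_back(cols[idx] % block_size); // local column index
            b.values.push_back(vals[idx]);
        }
    }
    // Prefix-sum each tile's row counts into proper CSR offsets.
    for (auto &block_row : blocks)
        for (auto &b : block_row)
            for (std::size_t i = 1; i < b.csrOffsets.size(); i++)
                b.csrOffsets[i] += b.csrOffsets[i - 1];
    return blocks;
}

int main() {
    // Sparsity pattern of the 8x8 block-diagonal matrix from the test.
    std::vector<long> offsets{0, 2, 4, 6, 8, 10, 12, 14, 16};
    std::vector<long> cols{0, 3, 1, 2, 1, 2, 0, 3, 4, 7, 5, 6, 5, 6, 4, 7};
    std::vector<std::complex<double>> vals(16, {1.0, 0.0});

    auto blocks = splitIntoBlocks(8, 2, offsets, cols, vals);

    // Diagonal tiles hold all non-zeros; off-diagonal tiles are empty,
    // matching the CHECKs on CSRMatVector[0][1] and CSRMatVector[1][0].
    std::cout << blocks[0][0].values.size() << " "   // 8
              << blocks[0][1].values.size() << " "   // 0
              << blocks[1][0].values.size() << " "   // 0
              << blocks[1][1].values.size() << "\n"; // 8
    return 0;
}

Under this decomposition, scattering block row i gives rank k the (i, k) tile, so with two ranks the root ends up with the populated (0, 0) tile and an empty (1, 0), while rank 1 holds an empty (0, 1) and the populated (1, 1) — exactly the rank-dependent expectations encoded in the scatter section of the test.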