From a23f5186ae800ce51419bb50811f29d0f8614251 Mon Sep 17 00:00:00 2001
From: Tyson Jones
Date: Thu, 17 Aug 2023 18:43:12 +0100
Subject: [PATCH 01/15] scaffolded cuQuantum build

---
 QuEST/CMakeLists.txt              |  42 ++--
 QuEST/src/GPU/CMakeLists.txt      |  11 +-
 QuEST/src/GPU/QuEST_cuQuantum.cu  | 351 ++++++++++++++++++++++++++++++
 QuEST/src/GPU/QuEST_gpu.cu        | 212 +----------------
 QuEST/src/GPU/QuEST_gpu_common.cu | 261 ++++++++++++++++++++++
 5 files changed, 648 insertions(+), 229 deletions(-)
 create mode 100644 QuEST/src/GPU/QuEST_cuQuantum.cu
 create mode 100644 QuEST/src/GPU/QuEST_gpu_common.cu

diff --git a/QuEST/CMakeLists.txt b/QuEST/CMakeLists.txt
index 899f3fb98..b03789ac7 100644
--- a/QuEST/CMakeLists.txt
+++ b/QuEST/CMakeLists.txt
@@ -37,6 +37,8 @@ option(USE_HIP "Whether to use HIP for GPU code compilation for AMD GPUs. Set to
 set(GPU_ARCH gfx90 CACHE STRING "GPU hardware dependent, used for AMD GPUs when USE_HIP=1. Lookup at https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table. Write without fullstop")
 
+option(USE_CUQUANTUM "Whether to use NVIDIA's cuQuantum library (requires prior installation) in lieu of QuEST's bespoke GPU kernels. Set to 1 to enable." 0)
+
 # *****************************************************************************
 # ***** NO CHANGES SHOULD BE REQUIRED FROM THE USER BEYOND THIS POINT *********
 
@@ -49,6 +51,7 @@ message(STATUS "OMP acceleration is ${MULTITHREADED}")
 message(STATUS "MPI distribution is ${DISTRIBUTED}")
 if (${GPUACCELERATED})
     message(STATUS "HIP compilation is ${USE_HIP}")
+    message(STATUS "cuQuantum compilation is ${USE_CUQUANTUM}")
 endif()
 
 
@@ -119,25 +122,28 @@ endif()
 
 if (GPUACCELERATED)
     if (USE_HIP)
-       if(NOT DEFINED HIP_PATH)
-           if(NOT DEFINED ENV{HIP_PATH})
-               message(WARNING "WARNING: HIP_PATH is not defiend. Using default HIP_PATH=/opt/rocm/hip " ${HIP_VERSION})
-               set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
-           else()
-               set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
+        if(NOT DEFINED HIP_PATH)
+            if(NOT DEFINED ENV{HIP_PATH})
+                message(WARNING "WARNING: HIP_PATH is not defined. 
Using default HIP_PATH=/opt/rocm/hip " ${HIP_VERSION}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() endif() - endif() - if(EXISTS "${HIP_PATH}") - set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) - find_package(HIP REQUIRED) - message(STATUS "Found HIP: " ${HIP_VERSION}) - message(STATUS "HIP PATH: " ${HIP_PATH}) - endif() - - ADD_DEFINITIONS( -DUSE_HIP ) - ADD_DEFINITIONS( -D__HIP_PLATFORM_AMD__ ) + if(EXISTS "${HIP_PATH}") + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + find_package(HIP REQUIRED) + message(STATUS "Found HIP: " ${HIP_VERSION}) + message(STATUS "HIP PATH: " ${HIP_PATH}) + endif() + + ADD_DEFINITIONS( -DUSE_HIP ) + ADD_DEFINITIONS( -D__HIP_PLATFORM_AMD__ ) + elseif (USE_CUQUANTUM) + find_package(CUDA REQUIRED) + ADD_DEFINITIONS( -DUSE_CUQUANTUM ) else() find_package(CUDA REQUIRED) endif() @@ -412,6 +418,10 @@ target_link_libraries(QuEST PUBLIC ${MPI_C_LIBRARIES}) # ----- GPU ------------------------------------------------------------------- if (USE_HIP) target_link_libraries(QuEST PUBLIC ${HIP_PATH}/lib/libamdhip64.so ) +elseif (USE_CUQUANTUM) + find_library(CUQUANTUM_LIBRARIES custatevec) + target_link_libraries(QuEST ${CUDA_LIBRARIES} ${CUQUANTUM_LIBRARIES}) + target_include_directories(QuEST PUBLIC "/usr/local/cuda/include") else() target_link_libraries(QuEST ${CUDA_LIBRARIES}) endif() diff --git a/QuEST/src/GPU/CMakeLists.txt b/QuEST/src/GPU/CMakeLists.txt index 037382808..14054a62e 100644 --- a/QuEST/src/GPU/CMakeLists.txt +++ b/QuEST/src/GPU/CMakeLists.txt @@ -1,6 +1,11 @@ +if (USE_CUQUANTUM) + set(GPU_CORE QuEST_cuQuantum.cu) +else () + set(GPU_CORE QuEST_gpu.cu) +endif() + set(QuEST_SRC_ARCHITECTURE_DEPENDENT - ${CMAKE_CURRENT_SOURCE_DIR}/QuEST_gpu.cu + ${CMAKE_CURRENT_SOURCE_DIR}/${GPU_CORE} + ${CMAKE_CURRENT_SOURCE_DIR}/QuEST_gpu_common.cu PARENT_SCOPE ) - - diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu new file mode 100644 index 000000000..45cab781f --- /dev/null +++ b/QuEST/src/GPU/QuEST_cuQuantum.cu @@ -0,0 +1,351 @@ +// Distributed under MIT licence. 
See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details + +/** @file + * An implementation of QuEST's backend (../QuEST_internal.h) using NVIDIA's cuQuantum library + * + * @author Tyson Jones + */ + +# include "QuEST.h" +# include "QuEST_precision.h" +# include + + + +#ifdef __cplusplus +extern "C" { +#endif + + + +void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* imags, long long int numAmps) +{ +} + + +void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) +{ +} + +void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg) +{ +} + +void densmatr_initPlusState(Qureg qureg) +{ +} + +void densmatr_initClassicalState(Qureg qureg, long long int stateInd) +{ +} + +void statevec_createQureg(Qureg *qureg, int numQubits, QuESTEnv env) +{ +} + +void statevec_destroyQureg(Qureg qureg, QuESTEnv env) +{ +} + +void copyStateToGPU(Qureg qureg) +{ +} + +void copyStateFromGPU(Qureg qureg) +{ +} + +void statevec_copySubstateToGPU(Qureg qureg, long long int startInd, long long int numAmps) +{ +} + +void statevec_copySubstateFromGPU(Qureg qureg, long long int startInd, long long int numAmps) +{ +} + +void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank) +{ +} + +qreal statevec_getRealAmp(Qureg qureg, long long int index) +{ + return -1; +} + +qreal statevec_getImagAmp(Qureg qureg, long long int index) +{ + return -1; +} + +void statevec_initBlankState(Qureg qureg) +{ +} + +void statevec_initZeroState(Qureg qureg) +{ +} + +void statevec_initPlusState(Qureg qureg) +{ +} + +void statevec_initClassicalState(Qureg qureg, long long int stateInd) +{ +} + +void statevec_initDebugState(Qureg qureg) +{ +} + +void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome) +{ +} + +int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env) +{ + return -1; +} + +int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision) +{ + return -1; +} + +void statevec_compactUnitary(Qureg qureg, int targetQubit, Complex alpha, Complex beta) +{ +} + +void statevec_controlledCompactUnitary(Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta) +{ +} + +void statevec_unitary(Qureg qureg, int targetQubit, ComplexMatrix2 u) +{ +} + +void statevec_multiControlledMultiQubitUnitary(Qureg qureg, long long int ctrlMask, int* targs, int numTargs, ComplexMatrixN u) +{ +} + +void statevec_multiControlledTwoQubitUnitary(Qureg qureg, long long int ctrlMask, int q1, int q2, ComplexMatrix4 u) +{ +} + +void statevec_controlledUnitary(Qureg qureg, int controlQubit, int targetQubit, ComplexMatrix2 u) +{ +} + +void statevec_multiControlledUnitary( + Qureg qureg, + long long int ctrlQubitsMask, long long int ctrlFlipMask, + int targetQubit, ComplexMatrix2 u) +{ +} + +void statevec_pauliX(Qureg qureg, int targetQubit) +{ +} + +void statevec_pauliY(Qureg qureg, int targetQubit) +{ +} + +void statevec_pauliYConj(Qureg qureg, int targetQubit) +{ +} + +void statevec_controlledPauliY(Qureg qureg, int controlQubit, int targetQubit) +{ +} + +void statevec_controlledPauliYConj(Qureg qureg, int controlQubit, int targetQubit) +{ +} + +void statevec_phaseShiftByTerm(Qureg qureg, int targetQubit, Complex term) +{ +} + +void statevec_controlledPhaseShift(Qureg qureg, int idQubit1, int idQubit2, qreal angle) +{ +} + +void statevec_multiControlledPhaseShift(Qureg qureg, int *controlQubits, int numControlQubits, qreal angle) +{ +} + +void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle) +{ +} + +void 
statevec_multiControlledMultiRotateZ(Qureg qureg, long long int ctrlMask, long long int targMask, qreal angle) +{ +} + +qreal densmatr_calcTotalProb(Qureg qureg) +{ + return -1; +} + +qreal statevec_calcTotalProb(Qureg qureg) +{ + return -1; +} + +void statevec_controlledPhaseFlip(Qureg qureg, int idQubit1, int idQubit2) +{ +} + +void statevec_multiControlledPhaseFlip(Qureg qureg, int *controlQubits, int numControlQubits) +{ +} + +void statevec_swapQubitAmps(Qureg qureg, int qb1, int qb2) +{ +} + +void statevec_hadamard(Qureg qureg, int targetQubit) +{ +} + +void statevec_controlledNot(Qureg qureg, int controlQubit, int targetQubit) +{ +} + +void statevec_multiControlledMultiQubitNot(Qureg qureg, int ctrlMask, int targMask) +{ +} + +qreal statevec_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) +{ + return -1; +} + +qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) +{ + return -1; +} + +void statevec_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) +{ +} + +void densmatr_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) +{ +} + +qreal densmatr_calcInnerProduct(Qureg a, Qureg b) +{ + return -1; +} + +Complex statevec_calcInnerProduct(Qureg bra, Qureg ket) +{ + return (Complex) {.real=-1, .imag=-1}; +} + +qreal densmatr_calcFidelity(Qureg qureg, Qureg pureState) +{ + return -1; +} + +qreal densmatr_calcHilbertSchmidtDistance(Qureg a, Qureg b) +{ + return -1; +} + +qreal densmatr_calcPurity(Qureg qureg) +{ + return -1; +} + +void statevec_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb) +{ +} + +void densmatr_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb) +{ +} + +void densmatr_mixDensityMatrix(Qureg combineQureg, qreal otherProb, Qureg otherQureg) +{ +} + +void densmatr_mixDephasing(Qureg qureg, int targetQubit, qreal dephase) +{ +} + +void densmatr_mixTwoQubitDephasing(Qureg qureg, int qubit1, int qubit2, qreal dephase) +{ +} + +void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depolLevel) +{ +} + +void densmatr_mixDamping(Qureg qureg, int targetQubit, qreal damping) +{ +} + +void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) +{ +} + +void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) +{ +} + +void statevec_applyDiagonalOp(Qureg qureg, DiagonalOp op) +{ +} + +void densmatr_applyDiagonalOp(Qureg qureg, DiagonalOp op) { +} + +void statevec_applySubDiagonalOp(Qureg qureg, int* targets, SubDiagonalOp op, int conj) +{ +} + +Complex statevec_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) +{ + return (Complex) {.real=-1, .imag=-1}; +} + +Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) +{ + return (Complex) {.real=-1, .imag=-1}; +} + +void statevec_applyPhaseFuncOverrides( + Qureg qureg, int* qubits, int numQubits, enum bitEncoding encoding, + qreal* coeffs, qreal* exponents, int numTerms, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + int conj) +{ +} + +void statevec_applyMultiVarPhaseFuncOverrides( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + qreal* coeffs, qreal* exponents, int* numTermsPerReg, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + int conj) +{ +} + +void statevec_applyParamNamedPhaseFuncOverrides( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, 
enum bitEncoding encoding, + enum phaseFunc phaseFuncName, qreal* params, int numParams, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + int conj) +{ +} + +void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) +{ +} + + + +#ifdef __cplusplus +} +#endif diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu index 3b2138a4c..0b924454a 100755 --- a/QuEST/src/GPU/QuEST_gpu.cu +++ b/QuEST/src/GPU/QuEST_gpu.cu @@ -1,7 +1,7 @@ // Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details /** @file - * An implementation of the backend in ../QuEST_internal.h for a GPU environment. + * A custom-kernel implementation of the backend in ../QuEST_internal.h for a GPU environment. * * @author Ania Brown * @author Tyson Jones @@ -10,8 +10,7 @@ # include "QuEST.h" # include "QuEST_precision.h" # include "QuEST_validation.h" -# include "QuEST_internal.h" // purely to resolve getQuESTDefaultSeedKey -# include "mt19937ar.h" +# include "QuEST_internal.h" // for getQubitBitmask # include # include @@ -328,178 +327,6 @@ void statevec_destroyQureg(Qureg qureg, QuESTEnv env) cudaFree(qureg.secondLevelReduction); } -DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) { - - DiagonalOp op; - op.numQubits = numQubits; - op.numElemsPerChunk = (1LL << numQubits) / env.numRanks; - op.chunkId = env.rank; - op.numChunks = env.numRanks; - - // allocate CPU memory (initialised to zero) - op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal)); - op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal)); - // @TODO no handling of rank>1 allocation (no distributed GPU) - - // check cpu memory allocation was successful - validateDiagonalOpAllocation(&op, env, __func__); - - // allocate GPU memory - size_t arrSize = op.numElemsPerChunk * sizeof(qreal); - cudaMalloc(&(op.deviceOperator.real), arrSize); - cudaMalloc(&(op.deviceOperator.imag), arrSize); - - // check gpu memory allocation was successful - validateDiagonalOpGPUAllocation(&op, env, __func__); - - // initialise GPU memory to zero - cudaMemset(op.deviceOperator.real, 0, arrSize); - cudaMemset(op.deviceOperator.imag, 0, arrSize); - - return op; -} - -void agnostic_destroyDiagonalOp(DiagonalOp op) { - free(op.real); - free(op.imag); - cudaFree(op.deviceOperator.real); - cudaFree(op.deviceOperator.imag); -} - -void agnostic_syncDiagonalOp(DiagonalOp op) { - cudaDeviceSynchronize(); - size_t mem_elems = op.numElemsPerChunk * sizeof *op.real; - cudaMemcpy(op.deviceOperator.real, op.real, mem_elems, cudaMemcpyHostToDevice); - cudaMemcpy(op.deviceOperator.imag, op.imag, mem_elems, cudaMemcpyHostToDevice); -} - -__global__ void agnostic_initDiagonalOpFromPauliHamilKernel( - DiagonalOp op, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms -) { - // each thread processes one diagonal element - long long int elemInd = blockIdx.x*blockDim.x + threadIdx.x; - if (elemInd >= op.numElemsPerChunk) - return; - - qreal elem = 0; - - // elem is (+-) every coefficient, with sign determined by parity - for (int t=0; t>>( - op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms); - - // copy populated operator into to RAM - cudaDeviceSynchronize(); - size_t mem_elems = op.numElemsPerChunk * sizeof *op.real; - cudaMemcpy(op.real, op.deviceOperator.real, mem_elems, cudaMemcpyDeviceToHost); - cudaMemcpy(op.imag, op.deviceOperator.imag, mem_elems, cudaMemcpyDeviceToHost); - - cudaFree(d_pauliCodes); - cudaFree(d_termCoeffs); -} - -int GPUExists(void){ - int 
deviceCount, device; - int gpuDeviceCount = 0; - struct cudaDeviceProp properties; - cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount); - if (cudaResultCode != cudaSuccess) deviceCount = 0; - /* machines with no GPUs can still report one emulation device */ - for (device = 0; device < deviceCount; ++device) { - cudaGetDeviceProperties(&properties, device); - if (properties.major != 9999) { /* 9999 means emulation only */ - ++gpuDeviceCount; - } - } - if (gpuDeviceCount) return 1; - else return 0; -} - -QuESTEnv createQuESTEnv(void) { - - validateGPUExists(GPUExists(), __func__); - - QuESTEnv env; - env.rank=0; - env.numRanks=1; - - env.seeds = NULL; - env.numSeeds = 0; - seedQuESTDefault(&env); - - return env; -} - -void syncQuESTEnv(QuESTEnv env){ - cudaDeviceSynchronize(); -} - -int syncQuESTSuccess(int successCode){ - return successCode; -} - -void destroyQuESTEnv(QuESTEnv env){ - free(env.seeds); -} - -void reportQuESTEnv(QuESTEnv env){ - printf("EXECUTION ENVIRONMENT:\n"); - printf("Running locally on one node with GPU\n"); - printf("Number of ranks is %d\n", env.numRanks); -# ifdef _OPENMP - printf("OpenMP enabled\n"); - printf("Number of threads available is %d\n", omp_get_max_threads()); -# else - printf("OpenMP disabled\n"); -# endif -} - -void getEnvironmentString(QuESTEnv env, char str[200]){ - - // OpenMP can be hybridised with GPU in future, so this check is safe and worthwhile - int ompStatus=0; - int numThreads=1; -# ifdef _OPENMP - ompStatus=1; - numThreads=omp_get_max_threads(); -# endif - - // there is no reporting of CUDA cores/threads/blocks currently (since non-trivial) - sprintf(str, "CUDA=1 OpenMP=%d MPI=0 threads=%d ranks=1", ompStatus, numThreads); -} - void copyStateToGPU(Qureg qureg) { if (DEBUG) printf("Copying data to GPU\n"); @@ -3566,25 +3393,6 @@ Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) { return expecVal; } -void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* real, qreal* imag, long long int numElems) { - - // update both RAM and VRAM, for consistency - memcpy(&op.real[startInd], real, numElems * sizeof(qreal)); - memcpy(&op.imag[startInd], imag, numElems * sizeof(qreal)); - - cudaDeviceSynchronize(); - cudaMemcpy( - op.deviceOperator.real + startInd, - real, - numElems * sizeof(*(op.deviceOperator.real)), - cudaMemcpyHostToDevice); - cudaMemcpy( - op.deviceOperator.imag + startInd, - imag, - numElems * sizeof(*(op.deviceOperator.imag)), - cudaMemcpyHostToDevice); -} - __global__ void statevec_applyPhaseFuncOverridesKernel( Qureg qureg, int* qubits, int numQubits, enum bitEncoding encoding, qreal* coeffs, qreal* exponents, int numTerms, @@ -4113,22 +3921,6 @@ void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) { cudaFree(d_termCoeffs); } -void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) { - - // free existing seed array, if exists - if (env->seeds != NULL) - free(env->seeds); - - // record keys in permanent heap - env->seeds = (unsigned long int*) malloc(numSeeds * sizeof *(env->seeds)); - for (int i=0; iseeds)[i] = seedArray[i]; - env->numSeeds = numSeeds; - - // pass keys to Mersenne Twister seeder - init_by_array(seedArray, numSeeds); -} - #ifdef __cplusplus diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu new file mode 100644 index 000000000..ac46ba1f8 --- /dev/null +++ b/QuEST/src/GPU/QuEST_gpu_common.cu @@ -0,0 +1,261 @@ +// Distributed under MIT licence. 
See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details
+
+/** @file
+ * GPU routines which are agnostic to the cuQuantum backend or the custom kernels
+ *
+ * @author Tyson Jones
+ * @author Ania Brown (defined some original functions moved here)
+ */
+
+
+# include "QuEST.h"
+# include "QuEST_precision.h"
+# include "QuEST_validation.h"
+# include "mt19937ar.h"
+
+# include <stdlib.h>
+# include <stdio.h>
+# include <math.h>
+
+#ifdef USE_HIP
+// Translate CUDA calls into HIP calls
+#include "cuda_to_hip.h"
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+
+int GPUExists(void){
+    int deviceCount, device;
+    int gpuDeviceCount = 0;
+    struct cudaDeviceProp properties;
+    cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount);
+    if (cudaResultCode != cudaSuccess) deviceCount = 0;
+    /* machines with no GPUs can still report one emulation device */
+    for (device = 0; device < deviceCount; ++device) {
+        cudaGetDeviceProperties(&properties, device);
+        if (properties.major != 9999) { /* 9999 means emulation only */
+            ++gpuDeviceCount;
+        }
+    }
+    if (gpuDeviceCount) return 1;
+    else return 0;
+}
+
+QuESTEnv createQuESTEnv(void) {
+
+    validateGPUExists(GPUExists(), __func__);
+
+    QuESTEnv env;
+    env.rank=0;
+    env.numRanks=1;
+
+    env.seeds = NULL;
+    env.numSeeds = 0;
+    seedQuESTDefault(&env);
+
+    return env;
+}
+
+void syncQuESTEnv(QuESTEnv env){
+    cudaDeviceSynchronize();
+}
+
+int syncQuESTSuccess(int successCode){
+    return successCode;
+}
+
+void destroyQuESTEnv(QuESTEnv env){
+    free(env.seeds);
+}
+
+void reportQuESTEnv(QuESTEnv env){
+    printf("EXECUTION ENVIRONMENT:\n");
+    printf("Running locally on one node with GPU ");
+# ifdef USE_CUQUANTUM
+    printf("via cuQuantum\n");
+# else
+    printf("via custom kernels\n");
+# endif
+    printf("Number of ranks is %d\n", env.numRanks);
+# ifdef _OPENMP
+    printf("OpenMP enabled\n");
+    printf("Number of threads available is %d\n", omp_get_max_threads());
+# else
+    printf("OpenMP disabled\n");
+# endif
+}
+
+void getEnvironmentString(QuESTEnv env, char str[200]){
+
+    // OpenMP can be hybridised with GPU in future, so this check is safe and worthwhile
+    int ompStatus=0;
+    int numThreads=1;
+# ifdef _OPENMP
+    ompStatus=1;
+    numThreads=omp_get_max_threads();
+# endif
+
+    // ascertain whether we're using cuQuantum or bespoke kernels
+    int cuQuantumStatus=0;
+# ifdef USE_CUQUANTUM
+    cuQuantumStatus=1;
+# endif
+
+    // there is no reporting of CUDA cores/threads/blocks currently (since non-trivial)
+    sprintf(str, "CUDA=1 cuQuantum=%d OpenMP=%d MPI=0 threads=%d ranks=1", cuQuantumStatus, ompStatus, numThreads);
+}
+
+void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) {
+
+    // free existing seed array, if exists
+    if (env->seeds != NULL)
+        free(env->seeds);
+
+    // record keys in permanent heap
+    env->seeds = (unsigned long int*) malloc(numSeeds * sizeof *(env->seeds));
+    for (int i=0; i<numSeeds; i++)
+        (env->seeds)[i] = seedArray[i];
+    env->numSeeds = numSeeds;
+
+    // pass keys to Mersenne Twister seeder
+    init_by_array(seedArray, numSeeds);
+}
+
+
+
+
+
+
+
+DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) {
+
+    DiagonalOp op;
+    op.numQubits = numQubits;
+    op.numElemsPerChunk = (1LL << numQubits) / env.numRanks;
+    op.chunkId = env.rank;
+    op.numChunks = env.numRanks;
+
+    // allocate CPU memory (initialised to zero)
+    op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
+    op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
+    // @TODO no handling of rank>1 allocation (no distributed GPU)
+
+    // check cpu memory allocation was successful
+    validateDiagonalOpAllocation(&op, env, __func__);
+
+    // allocate GPU memory
+    size_t arrSize = op.numElemsPerChunk * sizeof(qreal);
+    cudaMalloc(&(op.deviceOperator.real), arrSize);
+    cudaMalloc(&(op.deviceOperator.imag), arrSize);
+
+    // check gpu memory allocation was successful
+    validateDiagonalOpGPUAllocation(&op, env, __func__);
+
+    // initialise GPU memory to zero
+    cudaMemset(op.deviceOperator.real, 0, arrSize);
+    cudaMemset(op.deviceOperator.imag, 0, arrSize);
+
+    return op;
+}
+
+void agnostic_destroyDiagonalOp(DiagonalOp op) {
+    free(op.real);
+    free(op.imag);
+    cudaFree(op.deviceOperator.real);
+    cudaFree(op.deviceOperator.imag);
+}
+
+void agnostic_syncDiagonalOp(DiagonalOp op) {
+    cudaDeviceSynchronize();
+    size_t mem_elems = op.numElemsPerChunk * sizeof *op.real;
+    cudaMemcpy(op.deviceOperator.real, op.real, mem_elems, cudaMemcpyHostToDevice);
+    cudaMemcpy(op.deviceOperator.imag, op.imag, mem_elems, cudaMemcpyHostToDevice);
+}
+
+__global__ void agnostic_initDiagonalOpFromPauliHamilKernel(
+    DiagonalOp op, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms
+) {
+    // each thread processes one diagonal element
+    long long int elemInd = blockIdx.x*blockDim.x + threadIdx.x;
+    if (elemInd >= op.numElemsPerChunk)
+        return;
+
+    qreal elem = 0;
+
+    // elem is (+-) every coefficient, with sign determined by parity
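+    // e.g. for hamil = 0.5 Z0 + 0.3 Z0 Z1, the elemInd = 0b01 element gets
+    // elem = -0.5 - 0.3 = -0.8, since each term targets exactly one 1-bit with a Z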
+    for (int t=0; t<numSumTerms; t++) {
+
+        // determine the parity of the Z-targeted bits of elemInd
+        int isOddNumOnes = 0;
+        for (int q=0; q<op.numQubits; q++)
+            if (pauliCodes[q + t*op.numQubits] == PAULI_Z)
+                if ((elemInd >> q) & 1)
+                    isOddNumOnes = !isOddNumOnes;
+
+        // avoid warp divergence
+        int sign = 1 - 2*isOddNumOnes; // (-1 if isOddNumOnes, else +1)
+        elem += termCoeffs[t] * sign;
+    }
+
+    op.deviceOperator.real[elemInd] = elem;
+    op.deviceOperator.imag[elemInd] = 0;
+}
+
+void agnostic_initDiagonalOpFromPauliHamil(DiagonalOp op, PauliHamil hamil) {
+
+    // copy args into GPU memory
+    enum pauliOpType* d_pauliCodes;
+    size_t mem_pauliCodes = hamil.numSumTerms * op.numQubits * sizeof *d_pauliCodes;
+    cudaMalloc(&d_pauliCodes, mem_pauliCodes);
+    cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice);
+
+    qreal* d_termCoeffs;
+    size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs;
+    cudaMalloc(&d_termCoeffs, mem_termCoeffs);
+    cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);
+
+    int numThreadsPerBlock = 128;
+    int numBlocks = ceil(op.numElemsPerChunk / (qreal) numThreadsPerBlock);
+    agnostic_initDiagonalOpFromPauliHamilKernel<<<numBlocks, numThreadsPerBlock>>>(
+        op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);
+
+    // copy populated operator back into RAM
+    cudaDeviceSynchronize();
+    size_t mem_elems = op.numElemsPerChunk * sizeof *op.real;
+    cudaMemcpy(op.real, op.deviceOperator.real, mem_elems, cudaMemcpyDeviceToHost);
+    cudaMemcpy(op.imag, op.deviceOperator.imag, mem_elems, cudaMemcpyDeviceToHost);
+
+    cudaFree(d_pauliCodes);
+    cudaFree(d_termCoeffs);
+}
+
+void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* real, qreal* imag, long long int numElems) {
+
+    // update both RAM and VRAM, for consistency
+    memcpy(&op.real[startInd], real, numElems * sizeof(qreal));
+    memcpy(&op.imag[startInd], imag, numElems * sizeof(qreal));
+
+    cudaDeviceSynchronize();
+    cudaMemcpy(
+        op.deviceOperator.real + startInd,
+        real,
+        numElems * sizeof(*(op.deviceOperator.real)),
+        cudaMemcpyHostToDevice);
+    cudaMemcpy(
+        op.deviceOperator.imag + startInd,
+        imag,
+        numElems * sizeof(*(op.deviceOperator.imag)),
+        cudaMemcpyHostToDevice);
+}
+
+
+
+#ifdef __cplusplus
+}
+#endif

From 40f936fd8db243312c4d557f3dbadfbe82ef6684 Mon Sep 17 00:00:00 2001
From: Tyson Jones
Date: Sat, 19 Aug 2023 13:46:11 +0100
Subject: [PATCH 02/15] proposing cuQuantum memory design
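
The design pairs each Qureg with a private cuAmp staging buffer; a
sketch of the intended flow between the three memory spaces (purely
illustrative, not code added by this patch):

    // pack the user-facing qreal arrays into the cuAmp CPU staging buffer
    qureg.cuStateVec[i] = toCuAmp(qureg.stateVec.real[i], qureg.stateVec.imag[i]);

    // then mirror the staging buffer into GPU memory
    cudaMemcpy(qureg.deviceCuStateVec, qureg.cuStateVec,
        qureg.numAmpsTotal * sizeof(cuAmp), cudaMemcpyHostToDevice);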
---
 QuEST/CMakeLists.txt              |   4 +
 QuEST/include/QuEST.h             |  20 ++
 QuEST/include/QuEST_precision.h   |  18 +-
 QuEST/src/GPU/QuEST_cuQuantum.cu  | 294 ++++++++++++++++++++++--------
 QuEST/src/GPU/QuEST_gpu_common.cu |   8 +
 5 files changed, 259 insertions(+), 85 deletions(-)

diff --git a/QuEST/CMakeLists.txt b/QuEST/CMakeLists.txt
index b03789ac7..e26aeb190 100644
--- a/QuEST/CMakeLists.txt
+++ b/QuEST/CMakeLists.txt
@@ -420,6 +420,10 @@ if (USE_HIP)
     target_link_libraries(QuEST PUBLIC ${HIP_PATH}/lib/libamdhip64.so )
 elseif (USE_CUQUANTUM)
     find_library(CUQUANTUM_LIBRARIES custatevec)
+    if (NOT CUQUANTUM_LIBRARIES)
+        message(FATAL_ERROR "cuQuantum library (specifically custatevec) not found")
+    endif ()
+
     target_link_libraries(QuEST ${CUDA_LIBRARIES} ${CUQUANTUM_LIBRARIES})
     target_include_directories(QuEST PUBLIC "/usr/local/cuda/include")
 else()

diff --git a/QuEST/include/QuEST.h b/QuEST/include/QuEST.h
index dd7709780..1adb78dd3 100644
--- a/QuEST/include/QuEST.h
+++ b/QuEST/include/QuEST.h
@@ -34,6 +34,17 @@
 # include "QuEST_precision.h"
 
+
+
+// ensure custatevecHandle_t is defined, even if no GPU
+# ifdef USE_CUQUANTUM
+    # include <custatevec.h>
+# else
+    # define custatevecHandle_t void*
+# endif
+
+
+
 // prevent C++ name mangling
 #ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -368,6 +379,11 @@ typedef struct Qureg
     //! Storage for reduction of probabilities on GPU
     qreal *firstLevelReduction, *secondLevelReduction;
 
+    //! Storage for wavefunction amplitudes and config (copy of QuESTEnv's handle) in the cuQuantum version
+    cuAmp* cuStateVec;
+    cuAmp* deviceCuStateVec;
+    custatevecHandle_t cuQuantumHandle;
+
     //! Storage for generated QASM output
     QASMLogger* qasmLog;
 
@@ -386,6 +402,10 @@ typedef struct QuESTEnv
     int numRanks;
     unsigned long int* seeds;
     int numSeeds;
+
+    // handle to cuQuantum (specifically cuStateVec), used only in cuQuantum deployment mode (otherwise is void*)
+    custatevecHandle_t cuQuantumHandle;
+
 } QuESTEnv;

diff --git a/QuEST/include/QuEST_precision.h b/QuEST/include/QuEST_precision.h
index 48c9a1ccf..b5cc76dc0 100644
--- a/QuEST/include/QuEST_precision.h
+++ b/QuEST/include/QuEST_precision.h
@@ -10,15 +10,22 @@
  * @author Tyson Jones (doc)
  */
 
-# include <math.h>
-
 # ifndef QUEST_PRECISION_H
 # define QUEST_PRECISION_H
 
+# include <math.h>
+
+// define CUDA complex types as void if not using cuQuantum
+# ifdef USE_CUQUANTUM
+    # include <cuComplex.h>
+# else
+    # define cuFloatComplex void
+    # define cuDoubleComplex void
+# endif
 
 // set default double precision if not set during compilation
 # ifndef QuEST_PREC
-# define QuEST_PREC 2
+    # define QuEST_PREC 2
 # endif
 
 
@@ -28,6 +35,7 @@
 # if QuEST_PREC==1
     # define qreal float
     // \cond HIDDEN_SYMBOLS
+    # define cuAmp cuFloatComplex
     # define MPI_QuEST_REAL MPI_FLOAT
     # define MPI_MAX_AMPS_IN_MSG (1LL<<29) // must be 2^int
     # define REAL_STRING_FORMAT "%.8f"
@@ -41,7 +49,8 @@
  */
 # elif QuEST_PREC==2
     # define qreal double
-    // \cond HIDDEN_SYMBOLS
+    // \cond HIDDEN_SYMBOLS
+    # define cuAmp cuDoubleComplex
     # define MPI_QuEST_REAL MPI_DOUBLE
     # define MPI_MAX_AMPS_IN_MSG (1LL<<28) // must be 2^int
     # define REAL_STRING_FORMAT "%.14f"
@@ -57,6 +66,7 @@
 # elif QuEST_PREC==4
     # define qreal long double
     // \cond HIDDEN_SYMBOLS
+    # define cuAmp void // invalid
    # define MPI_QuEST_REAL MPI_LONG_DOUBLE
    # define MPI_MAX_AMPS_IN_MSG (1LL<<27) // must be 2^int
    # define REAL_STRING_FORMAT "%.17Lf"

diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu
index 45cab781f..4852cea61 100644
--- a/QuEST/src/GPU/QuEST_cuQuantum.cu
+++ b/QuEST/src/GPU/QuEST_cuQuantum.cu
@@ -1,80 +1,177 @@
 // Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details
 
 /** @file
- * An implementation of QuEST's backend (../QuEST_internal.h) using NVIDIA's cuQuantum library
+ * An implementation of QuEST's backend (../QuEST_internal.h) using NVIDIA's cuQuantum library.
+ * This makes no use of the ComplexArray qureg.deviceStateVec, used by the bespoke GPU kernels,
+ * which is not malloc'd in this deployment. Instead, this cuQuantum backend mallocs and uses
+ * two dedicated arrays of 'cuAmp' complex primitives; qureg.cuStateVec (CPU memory) and
+ * qureg.deviceCuStateVec (GPU memory)
  *
  * @author Tyson Jones
  */
 
 # include "QuEST.h"
 # include "QuEST_precision.h"
+# include "QuEST_validation.h"
 # include <custatevec.h>
 
 
 
+// precision-overloaded macros for creating a cuAmp
+# if QuEST_PREC==1
+    # define toCuAmp(re, im) make_cuFloatComplex(re, im)
+    # define cuAmpReal(amp) cuCrealf(amp)
+    # define cuAmpImag(amp) cuCimagf(amp)
+# elif QuEST_PREC==2
+    # define toCuAmp(re, im) make_cuDoubleComplex(re, im)
+    # define cuAmpReal(amp) cuCreal(amp)
+    # define cuAmpImag(amp) cuCimag(amp)
+# elif QuEST_PREC==4
+    # define toCuAmp(re, im) -1 // invalid precision config
+    # define cuAmpReal(amp) -1
+    # define cuAmpImag(amp) -1
+#endif
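+// e.g. toCuAmp(0.5, -1.5) constructs the precision-matched CUDA complex (0.5 - 1.5i);
+// at QuEST_PREC==4 (quad) no CUDA complex type exists, hence the -1 placeholders above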
 
 
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 
 
-void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* imags, long long int numAmps)
-{
-}
+/*
+ * QUREG CREATION AND AMP SET/GET
+ */
 
+void statevec_createQureg(Qureg *qureg, int numQubits, QuESTEnv env)
+{
+    // set standard fields
+    long long int numAmps = 1LL << numQubits;
+    qureg->numQubitsInStateVec = numQubits;
+    qureg->numAmpsPerChunk = numAmps;
+    qureg->numAmpsTotal = numAmps;
+    qureg->chunkId = 0;
+    qureg->numChunks = 1;
+    qureg->isDensityMatrix = 0;
 
-void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg)
-{
-}
+    // copy env's cuQuantum handle to qureg
+    qureg->cuQuantumHandle = env.cuQuantumHandle;
 
-void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
-{
+    // allocate user-facing CPU memory
+    qureg->stateVec.real = (qreal*) malloc(numAmps * sizeof(*(qureg->stateVec.real)));
+    qureg->stateVec.imag = (qreal*) malloc(numAmps * sizeof(*(qureg->stateVec.imag)));
+    validateQuregAllocation(qureg, env, __func__);
+
+    // allocate cuQuantum GPU memory (unvalidated)
+    cudaMalloc( &(qureg->deviceCuStateVec), numAmps * sizeof(*(qureg->deviceCuStateVec)) );
+
+    // allocate private cuQuantum CPU memory (for exchanging with GPU memory)
+    qureg->cuStateVec = (cuAmp*) malloc(numAmps * sizeof(*(qureg->cuStateVec)));
 }
 
-void densmatr_initPlusState(Qureg qureg)
+void statevec_destroyQureg(Qureg qureg, QuESTEnv env)
 {
+    // free user-facing CPU memory
+    free(qureg.stateVec.real);
+    free(qureg.stateVec.imag);
+
+    // free private cuQuantum CPU memory
+    free(qureg.cuStateVec);
+
+    // free cuQuantum GPU memory
+    cudaFree(qureg.deviceCuStateVec);
 }
 
-void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
+void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* imags, long long int numAmps)
 {
+    // slowly manually overwrite subset of private cuQuantum CPU memory
+    for (long long int i=0; i<numAmps; i++)
+        qureg.cuStateVec[startInd+i] = toCuAmp(reals[i], imags[i]);

From: Tyson Jones
Date: Sat, 19 Aug 2023 15:12:15 +0100
Subject: [PATCH 03/15] setup automatic workspace memory

Note that this forces our cuQuantum backend to require its users to
have a stream-ordered memory pool compatible GPU (which seems fair
enough)
---
 QuEST/src/GPU/QuEST_cuQuantum.cu  | 77 +++++++++++++++++++++++++++++++
 QuEST/src/GPU/QuEST_gpu.cu        | 21 ++++++++-
 QuEST/src/GPU/QuEST_gpu_common.cu | 31 +------------
 QuEST/src/GPU/QuEST_gpu_common.h  | 19 ++++++++
 QuEST/src/QuEST_validation.c      | 16 +++++--
 QuEST/src/QuEST_validation.h      |  4 +-
 6 files changed, 132 insertions(+), 36 deletions(-)
 create mode 100644 QuEST/src/GPU/QuEST_gpu_common.h

diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu
index 4852cea61..c72dcdfd0 100644
--- a/QuEST/src/GPU/QuEST_cuQuantum.cu
+++ b/QuEST/src/GPU/QuEST_cuQuantum.cu
@@ -11,6 +11,7 @@
  */
 
 # include "QuEST.h"
+# include "QuEST_gpu_common.h"
 # include "QuEST_precision.h"
 # include "QuEST_validation.h"
 # include <custatevec.h>
 
@@ -40,6 +41,82 @@ extern "C" {
 
 
+/*
+ * ENVIRONMENT MANAGEMENT
+ */
+
+int GPUSupportsMemPools() {
+
+    // consult only the first device (guaranteed already to exist)
+    int device = 0;
+
+    int supports;
+    cudaDeviceGetAttribute(&supports, cudaDevAttrMemoryPoolsSupported, device);
+    return supports;
+}
+
+int memPoolAlloc(void* ctx, void** ptr, size_t size, cudaStream_t stream) {
+    cudaMemPool_t pool = * reinterpret_cast<cudaMemPool_t*>(ctx);
+    return cudaMallocFromPoolAsync(ptr, size, pool, stream);
+}
+int memPoolFree(void* ctx, void* ptr, size_t size, cudaStream_t stream) {
+    return cudaFreeAsync(ptr, stream);
+}
+
+void setupAutoWorkspaces(custatevecHandle_t cuQuantumHandle) {
+
+    // get the current (device's default) stream-ordered memory pool (assuming single GPU)
+    int device = 0;
+    cudaMemPool_t memPool;
+    cudaDeviceGetMemPool(&memPool, device);
+
+    // get its current memory threshold, above which memory gets freed at every stream synch
+    size_t currMaxMem;
+    cudaMemPoolGetAttribute(memPool, cudaMemPoolAttrReleaseThreshold, &currMaxMem);
+
+    // if it's smaller than 1 MiB = 16 qubits, extend it
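+    // (one double-precision cuAmp is 16 bytes, so 16 * 2^16 B = 1 MiB holds a 16-qubit statevector)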
+    size_t desiredMaxMem = 16*(1LL<<16);
+    if (currMaxMem < desiredMaxMem)
+        cudaMemPoolSetAttribute(memPool, cudaMemPoolAttrReleaseThreshold, &desiredMaxMem);
+
+    // create a mem handler around the mem pool
+    custatevecDeviceMemHandler_t memHandler;
+    memHandler.ctx = reinterpret_cast<void*>(&memPool);
+    memHandler.device_alloc = memPoolAlloc;
+    memHandler.device_free = memPoolFree;
+
+    // set cuQuantum to use this handler and pool, to automate workspace memory management
+    custatevecSetDeviceMemHandler(cuQuantumHandle, &memHandler);
+}
+
+QuESTEnv createQuESTEnv(void) {
+
+    validateGPUExists(GPUExists(), __func__);
+    validateGPUIsCuQuantumCompatible(GPUSupportsMemPools(), __func__);
+
+    QuESTEnv env;
+    env.rank=0;
+    env.numRanks=1;
+
+    env.seeds = NULL;
+    env.numSeeds = 0;
+    seedQuESTDefault(&env);
+
+    // prepare cuQuantum
+    custatevecCreate(&env.cuQuantumHandle);
+    setupAutoWorkspaces(env.cuQuantumHandle);
+
+    return env;
+}
+
+void destroyQuESTEnv(QuESTEnv env){
+    free(env.seeds);
+
+    // finalise cuQuantum
+    custatevecDestroy(env.cuQuantumHandle);
+}
+
+
 
diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu
index 0b924454a..b253336f4 100755
--- a/QuEST/src/GPU/QuEST_gpu.cu
+++ b/QuEST/src/GPU/QuEST_gpu.cu
@@ -8,6 +8,7 @@
  */
 
 # include "QuEST.h"
+# include "QuEST_gpu_common.h"
 # include "QuEST_precision.h"
 # include "QuEST_validation.h"
 # include "QuEST_internal.h" // for getQubitBitmask
 
@@ -155,6 +156,25 @@ extern "C" {
 
 
+
+QuESTEnv createQuESTEnv(void) {
+
+    validateGPUExists(GPUExists(), __func__);
+
+    QuESTEnv env;
+    env.rank=0;
+    env.numRanks=1;
+
+    env.seeds = NULL;
+    env.numSeeds = 0;
+    seedQuESTDefault(&env);
+
+    return env;
+}
+
+void destroyQuESTEnv(QuESTEnv env){
+    free(env.seeds);
+}
+
 void 
statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* imags, long long int numAmps) { cudaDeviceSynchronize(); @@ -170,7 +190,6 @@ void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* cudaMemcpyHostToDevice); } - /** works for both statevectors and density matrices */ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) { diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu index 945619b40..f07b105aa 100644 --- a/QuEST/src/GPU/QuEST_gpu_common.cu +++ b/QuEST/src/GPU/QuEST_gpu_common.cu @@ -9,6 +9,7 @@ # include "QuEST.h" +# include "QuEST_gpu_common.h" # include "QuEST_precision.h" # include "QuEST_validation.h" # include "mt19937ar.h" @@ -29,8 +30,7 @@ extern "C" { - -int GPUExists(void){ +int GPUExists(void) { int deviceCount, device; int gpuDeviceCount = 0; struct cudaDeviceProp properties; @@ -47,25 +47,6 @@ int GPUExists(void){ else return 0; } -QuESTEnv createQuESTEnv(void) { - - validateGPUExists(GPUExists(), __func__); - - QuESTEnv env; - env.rank=0; - env.numRanks=1; - - env.seeds = NULL; - env.numSeeds = 0; - seedQuESTDefault(&env); - -#ifdef USE_CUQUANTUM - custatevecCreate(&env.cuQuantumHandle); -#endif - - return env; -} - void syncQuESTEnv(QuESTEnv env){ cudaDeviceSynchronize(); } @@ -74,14 +55,6 @@ int syncQuESTSuccess(int successCode){ return successCode; } -void destroyQuESTEnv(QuESTEnv env){ - free(env.seeds); - -#ifdef USE_CUQUANTUM - custatevecDestroy(env.cuQuantumHandle); -#endif -} - void reportQuESTEnv(QuESTEnv env){ printf("EXECUTION ENVIRONMENT:\n"); printf("Running locally on one node with GPU "); diff --git a/QuEST/src/GPU/QuEST_gpu_common.h b/QuEST/src/GPU/QuEST_gpu_common.h new file mode 100644 index 000000000..d2b58df93 --- /dev/null +++ b/QuEST/src/GPU/QuEST_gpu_common.h @@ -0,0 +1,19 @@ + +# ifndef QUEST_GPU_COMMON_H +# define QUEST_GPU_COMMON_H + +# ifdef __cplusplus +extern "C" { +# endif + + + +int GPUExists(void); + + + +#ifdef __cplusplus +} +#endif + +# endif // QUEST_GPU_COMMON_H \ No newline at end of file diff --git a/QuEST/src/QuEST_validation.c b/QuEST/src/QuEST_validation.c index 166b8ba6b..a8bdcd703 100644 --- a/QuEST/src/QuEST_validation.c +++ b/QuEST/src/QuEST_validation.c @@ -120,6 +120,7 @@ typedef enum { E_DIAGONAL_OP_NOT_ALLOCATED, E_DIAGONAL_OP_NOT_ALLOCATED_ON_GPU, E_NO_GPU, + E_GPU_DOES_NOT_SUPPORT_MEM_POOLS, E_QASM_BUFFER_OVERFLOW } ErrorCode; @@ -213,6 +214,7 @@ static const char* errorMessages[] = { [E_DIAGONAL_OP_NOT_ALLOCATED] = "Could not allocate memory for DiagonalOp. Possibly insufficient memory.", [E_DIAGONAL_OP_NOT_ALLOCATED_ON_GPU] = "Could not allocate memory for DiagonalOp on GPU. Possibly insufficient memory.", [E_NO_GPU] = "Trying to run GPU code with no GPU available.", + [E_GPU_DOES_NOT_SUPPORT_MEM_POOLS] = "The GPU does not support stream-ordered memory pools, required by the cuQuantum backend.", [E_QASM_BUFFER_OVERFLOW] = "QASM line buffer filled." }; @@ -1109,14 +1111,20 @@ void validateDiagonalOpGPUAllocation(DiagonalOp* op, QuESTEnv env, const char* c QuESTAssert(allocationSuccessful, E_DIAGONAL_OP_NOT_ALLOCATED_ON_GPU, caller); } -// This is really just a dummy shim, because the scope of GPUExists() -// is limited to the QuEST_gpu.cu file. 
+void raiseQASMBufferOverflow(const char* caller) {
+    invalidQuESTInputError(errorMessages[E_QASM_BUFFER_OVERFLOW], caller);
+}
+
+
+/* The below functions are dummy shims, whereby the GPU has already determined the outcome of
+ * validation, though sends the boolean results here for error handling.
+ */
 
 void validateGPUExists(int GPUPresent, const char* caller) {
     QuESTAssert(GPUPresent, E_NO_GPU, caller);
 }
 
-void raiseQASMBufferOverflow(const char* caller) {
-    invalidQuESTInputError(errorMessages[E_QASM_BUFFER_OVERFLOW], caller);
+void validateGPUIsCuQuantumCompatible(int supportsMemPools, const char* caller) {
+    QuESTAssert(supportsMemPools, E_GPU_DOES_NOT_SUPPORT_MEM_POOLS, caller);
 }
 
diff --git a/QuEST/src/QuEST_validation.h b/QuEST/src/QuEST_validation.h
index eb26a48eb..6df09f93b 100644
--- a/QuEST/src/QuEST_validation.h
+++ b/QuEST/src/QuEST_validation.h
@@ -174,10 +174,10 @@ void validateDiagonalOpAllocation(DiagonalOp* op, QuESTEnv env, const char* call
 
 void validateDiagonalOpGPUAllocation(DiagonalOp* op, QuESTEnv env, const char* caller);
 
-// This is really just a dummy shim, because the scope of GPUExists()
-// is limited to the QuEST_gpu.cu file.
 void validateGPUExists(int GPUPresent, const char* caller);
 
+void validateGPUIsCuQuantumCompatible(int supportsMemPools, const char* caller);
+
 void raiseQASMBufferOverflow(const char* caller);
 
 # ifdef __cplusplus

From ec08912fa5b7cfbfb3cef1cc3d834c597e6b5ae5 Mon Sep 17 00:00:00 2001
From: Tyson Jones
Date: Sat, 19 Aug 2023 20:45:51 +0100
Subject: [PATCH 04/15] added type adapters for converting between QuEST's
 interface and backend types (like Complex, ComplexMatrixN, and bitmasks) and
 cuQuantum's

---
 QuEST/src/GPU/QuEST_cuQuantum.cu | 68 +++++++++++++++++++++++++++++---
 1 file changed, 63 insertions(+), 5 deletions(-)

diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu
index c72dcdfd0..0d7543412 100644
--- a/QuEST/src/GPU/QuEST_cuQuantum.cu
+++ b/QuEST/src/GPU/QuEST_cuQuantum.cu
@@ -15,24 +15,82 @@
 # include "QuEST_precision.h"
 # include "QuEST_validation.h"
 # include <custatevec.h>
+# include <vector>
 
 
 
-// precision-overloaded macros for creating a cuAmp
+/*
+ * TYPES AND ADAPTERS
+ */
+
+// precision-agnostic conversions between cuAmp, qreal and Complex
 # if QuEST_PREC==1
-    # define toCuAmp(re, im) make_cuFloatComplex(re, im)
+    # define TO_CU_AMP(re, im) make_cuFloatComplex(re, im)
     # define cuAmpReal(amp) cuCrealf(amp)
     # define cuAmpImag(amp) cuCimagf(amp)
+    # define cuAmpConj(amp) cuConjf(amp)
+    # define CU_AMP_IN_STATE_PREC CUDA_C_32F
+    # define CU_AMP_IN_MATRIX_PREC CUDA_C_64F
 # elif QuEST_PREC==2
-    # define toCuAmp(re, im) make_cuDoubleComplex(re, im)
+    # define TO_CU_AMP(re, im) make_cuDoubleComplex(re, im)
     # define cuAmpReal(amp) cuCreal(amp)
     # define cuAmpImag(amp) cuCimag(amp)
+    # define cuAmpConj(amp) cuConj(amp)
+    # define CU_AMP_IN_STATE_PREC CUDA_C_64F
+    # define CU_AMP_IN_MATRIX_PREC CUDA_C_64F
 # elif QuEST_PREC==4
-    # define toCuAmp(re, im) -1 // invalid precision config
+    # define TO_CU_AMP(re, im) -1 // invalid precision config
     # define cuAmpReal(amp) -1
     # define cuAmpImag(amp) -1
+    # define CU_AMP_IN_STATE_PREC void // invalid
+    # define CU_AMP_IN_MATRIX_PREC void // invalid
 #endif
 
+// convenient operator overloads for cuAmp, for doing complex arithmetic
+cuAmp operator - (const cuAmp& a) {
+    return TO_CU_AMP(-cuAmpReal(a), -cuAmpImag(a));
+}
+
+// convert user-facing Complex to cuQuantum-facing cuAmp
+cuAmp toCuAmp(Complex c) {
+    return TO_CU_AMP(c.real, c.imag);
+}
+
+// concise alias for row-wise flattened complex matrix
+typedef std::vector<cuAmp> cuMatr;
+
+// flatten ComplexMatrixN mIn to a cuMatr mOut
+#define GET_cuMatr_FROM_ComplexMatrix( mOut, mIn, nQubits ) \
+    long long int dim = (1LL << nQubits); \
+    cuMatr mOut(dim*dim); \
+    long long int i=0; \
+    for (long long int r=0; r<(dim); r++) \
+        for (long long int c=0; c<(dim); c++) \
+            mOut[i++] = TO_CU_AMP(mIn.real[r][c], mIn.imag[r][c]);
+
+// convert user-facing ComplexMatrixN to cuQuantum-facing cuMatr
+cuMatr toCuMatr(ComplexMatrix2 mIn) {
+    GET_cuMatr_FROM_ComplexMatrix(mOut, mIn, 1);
+    return mOut;
+}
+cuMatr toCuMatr(ComplexMatrix4 mIn) {
+    GET_cuMatr_FROM_ComplexMatrix(mOut, mIn, 2);
+    return mOut;
+}
+cuMatr toCuMatr(ComplexMatrixN mIn) {
+    GET_cuMatr_FROM_ComplexMatrix(mOut, mIn, mIn.numQubits);
+    return mOut;
+}
+
+// convert QuEST backend masks back into user-input qubit lists (needed by cuQuantum)
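+// e.g. mask = 0b100110 over numBits = 6 yields the list {1, 2, 5}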
+std::vector<int> getIndsFromMask(long long int mask, int numBits) {
+    std::vector<int> inds;
+    for (int i=0; i<numBits; i++)
+        if (mask & (1LL<<i))
+            inds.push_back(i);
+    return inds;
+}

From: Tyson Jones
Date: Sat, 19 Aug 2023 20:53:15 +0100
Subject: [PATCH 05/15] added wrapped operators

Added all operators (like unitaries, sub-diagonal gates) which can be
directly mapped to cuQuantum calls. The cuQuantum calls are:
- custatevecApplyMatrix
- custatevecApplyPauliRotation
- custatevecSwapIndexBits
- custatevecApplyGeneralizedPermutationMatrix

It appears that the remainder of QuEST's operators (decoherence
channels, full-state diagonals, and phase functions) will need bespoke
kernels
---
 QuEST/src/GPU/QuEST_cuQuantum.cu | 240 ++++++++++++++++++++++++++++++-
 1 file changed, 233 insertions(+), 7 deletions(-)

diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu
index 0d7543412..a4ac289c4 100644
--- a/QuEST/src/GPU/QuEST_cuQuantum.cu
+++ b/QuEST/src/GPU/QuEST_cuQuantum.cu
@@ -99,6 +99,34 @@ extern "C" {
 
 
+/*
+ * CUQUANTUM WRAPPERS (to reduce boilerplate)
+ */
+
+void custatevec_applyMatrix(Qureg qureg, std::vector<int> ctrls, std::vector<int> targs, cuMatr matr) {
+
+    // do not adjoint matrix
+    int adj = 0;
+
+    // condition all ctrls on =1 state
+    int* ctrlBits = nullptr;
+
+    // use automatic workspace management
+    void* work = nullptr;
+    size_t workSize = 0;
+
+    custatevecApplyMatrix(
+        qureg.cuQuantumHandle,
+        qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec,
+        matr.data(), CU_AMP_IN_MATRIX_PREC, CUSTATEVEC_MATRIX_LAYOUT_ROW, adj,
+        targs.data(), targs.size(),
+        ctrls.data(), ctrlBits, ctrls.size(),
+        CUSTATEVEC_COMPUTE_DEFAULT,
+        work, workSize);
+}
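+// e.g. most wrapped gates below reduce to one such call; statevec_hadamard becomes
+// custatevec_applyMatrix(qureg, {}, {targetQubit}, {a, a, a, -a}) with a = 1/sqrt(2)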
+
+
 
 /*
  * ENVIRONMENT MANAGEMENT
  */
 
@@ -369,108 +397,306 @@ int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision)
 
 void statevec_compactUnitary(Qureg qureg, int targetQubit, Complex alpha, Complex beta)
 {
+    cuAmp a = toCuAmp(alpha);
+    cuAmp b = toCuAmp(beta);
+    cuMatr matrix{
+        a, -cuAmpConj(b),
+        b,  cuAmpConj(a)
+    };
+    custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix);
 }
 
 void statevec_controlledCompactUnitary(Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta)
 {
+    cuAmp a = toCuAmp(alpha);
+    cuAmp b = toCuAmp(beta);
+    cuMatr matrix{
+        a, -cuAmpConj(b),
+        b,  cuAmpConj(a)
+    };
+    custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, matrix);
 }
 
 void statevec_unitary(Qureg qureg, int targetQubit, ComplexMatrix2 u)
 {
+    custatevec_applyMatrix(qureg, {}, {targetQubit}, toCuMatr(u));
 }
 
 void statevec_multiControlledMultiQubitUnitary(Qureg qureg, long long int ctrlMask, int* targs, int numTargs, ComplexMatrixN u)
 {
+    std::vector<int> c = getIndsFromMask(ctrlMask,qureg.numQubitsInStateVec);
+    std::vector<int> t(targs,targs+numTargs);
+    custatevec_applyMatrix(qureg, c, t, toCuMatr(u));
 }
 
 void statevec_multiControlledTwoQubitUnitary(Qureg qureg, long long int ctrlMask, int q1, int q2, ComplexMatrix4 u)
 {
+    std::vector<int> c = getIndsFromMask(ctrlMask,qureg.numQubitsInStateVec);
+    custatevec_applyMatrix(qureg, c, {q1,q2}, toCuMatr(u));
 }
 
 void statevec_controlledUnitary(Qureg qureg, int controlQubit, int targetQubit, ComplexMatrix2 u)
 {
+    custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, toCuMatr(u));
 }
 
-void statevec_multiControlledUnitary(
-    Qureg qureg,
-    long long int ctrlQubitsMask, long long int ctrlFlipMask,
-    int targetQubit, ComplexMatrix2 u)
+void statevec_multiControlledUnitary(Qureg qureg, long long int ctrlQubitsMask, long long int ctrlFlipMask, int targetQubit, ComplexMatrix2 u)
 {
+    int targs[] = {targetQubit};
+    std::vector<int> ctrlInds = getIndsFromMask(ctrlQubitsMask,qureg.numQubitsInStateVec);
+    std::vector<int> ctrlVals(ctrlInds.size());
+    for (size_t i=0; i<ctrlInds.size(); i++)
+        ctrlVals[i] = !(ctrlFlipMask >> ctrlInds[i] & 1);
+
+    custatevecApplyMatrix(
+        qureg.cuQuantumHandle,
+        qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec,
+        toCuMatr(u).data(), CU_AMP_IN_MATRIX_PREC, CUSTATEVEC_MATRIX_LAYOUT_ROW, 0,
+        targs, 1,
+        ctrlInds.data(), ctrlVals.data(), ctrlInds.size(),
+        CUSTATEVEC_COMPUTE_DEFAULT,
+        nullptr, 0);
 }
 
 void statevec_multiControlledPhaseShift(Qureg qureg, int *controlQubits, int numControlQubits, qreal angle)
 {
+    cuAmp a0 = TO_CU_AMP(0, 0);
+    cuAmp a1 = TO_CU_AMP(1, 0);
+    cuAmp aE = TO_CU_AMP(cos(angle), sin(angle));
+    cuMatr matrix{
+        a1, a0,
+        a0, aE
+    };
+    std::vector<int> targs{controlQubits[0]};
+    std::vector<int> ctrls(controlQubits + 1, controlQubits + numControlQubits);
+    custatevec_applyMatrix(qureg, ctrls, targs, matrix);
 }
 
 void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle)
 {
+    qreal theta = - angle/2.;
+    std::vector<int> targs = getIndsFromMask(mask, qureg.numQubitsInStateVec);
+    std::vector<custatevecPauli_t> paulis(targs.size(), CUSTATEVEC_PAULI_Z);
+
+    custatevecApplyPauliRotation(
+        qureg.cuQuantumHandle, qureg.deviceCuStateVec,
+        CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec,
+        theta, paulis.data(), targs.data(), targs.size(),
+        nullptr, nullptr, 0);
 }
 
 void statevec_multiControlledMultiRotateZ(Qureg qureg, long long int ctrlMask, long long int targMask, qreal angle)
 {
+    qreal theta = - angle/2.;
+    std::vector<int> ctrls = getIndsFromMask(ctrlMask, qureg.numQubitsInStateVec);
+    std::vector<int> targs = getIndsFromMask(targMask, qureg.numQubitsInStateVec);
+    std::vector<custatevecPauli_t> paulis(targs.size(), CUSTATEVEC_PAULI_Z);
+
+    custatevecApplyPauliRotation(
+        qureg.cuQuantumHandle, qureg.deviceCuStateVec,
+        CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec,
+        theta, paulis.data(), targs.data(), targs.size(),
+        ctrls.data(), nullptr, ctrls.size());
 }
 
 void statevec_controlledPhaseFlip(Qureg qureg, int idQubit1, int idQubit2)
 {
+    // this diagonal operator, otherwise embarrassingly parallel with unit stride,
+    // is here treated as a generic one-target unitary, wastefully inducing non-unit
+    // stride and unnecessary memory reads, and potentially unnecessary communication
+    // in multi-GPU mode.
+
+    cuAmp a0 = TO_CU_AMP(0, 0);
+    cuAmp a1 = TO_CU_AMP(1, 0);
+    cuMatr matrix{
+        a1,  a0,
+        a0, -a1
+    };
+    custatevec_applyMatrix(qureg, {idQubit1}, {idQubit2}, matrix);
 }
 
 void statevec_multiControlledPhaseFlip(Qureg qureg, int *controlQubits, int numControlQubits)
 {
+    // this diagonal operator, otherwise embarrassingly parallel with unit stride,
+    // is here treated as a generic one-target unitary, wastefully inducing non-unit
+    // stride and unnecessary memory reads, and potentially unnecessary communication
+    // in multi-GPU mode.
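+    // (a phase flip is just diag(1, ..., 1, -1): only the all-ones basis amplitude is negated)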
+
+    cuAmp a0 = TO_CU_AMP(0, 0);
+    cuAmp a1 = TO_CU_AMP(1, 0);
+    cuMatr matrix{
+        a1,  a0,
+        a0, -a1
+    };
+    std::vector<int> targs{controlQubits[0]};
+    std::vector<int> ctrls(controlQubits + 1, controlQubits + numControlQubits);
+    custatevec_applyMatrix(qureg, ctrls, targs, matrix);
 }
 
 void statevec_swapQubitAmps(Qureg qureg, int qb1, int qb2)
 {
+    int2 targPairs[] = {{qb1, qb2}};
+    int numPairs = 1;
+
+    custatevecSwapIndexBits(
+        qureg.cuQuantumHandle, qureg.deviceCuStateVec,
+        CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec,
+        targPairs, numPairs,
+        nullptr, nullptr, 0);
 }
 
 void statevec_hadamard(Qureg qureg, int targetQubit)
 {
+    cuAmp a = TO_CU_AMP(1/sqrt(2.), 0);
+    cuMatr matrix{
+        a,  a,
+        a, -a
+    };
+    custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix);
 }
 
 void statevec_controlledNot(Qureg qureg, int controlQubit, int targetQubit)
 {
+    cuAmp a0 = TO_CU_AMP(0, 0);
+    cuAmp a1 = TO_CU_AMP(1, 0);
+    cuMatr matrix{
+        a0, a1,
+        a1, a0
+    };
+    custatevec_applyMatrix(qureg, {controlQubit}, {targetQubit}, matrix);
 }
 
 void statevec_multiControlledMultiQubitNot(Qureg qureg, int ctrlMask, int targMask)
 {
+    // this operator can be effected in one-shot using a custom kernel, but we here
+    // instead resort to slowly (by at most a factor #targs) effecting it as a sequence
+    // of one-target multi-ctrl NOT gates.
+
+    cuAmp a0 = TO_CU_AMP(0, 0);
+    cuAmp a1 = TO_CU_AMP(1, 0);
+    cuMatr matrix{
+        a0, a1,
+        a1, a0
+    };
+    std::vector<int> ctrls = getIndsFromMask(ctrlMask, qureg.numQubitsInStateVec);
+    std::vector<int> targs = getIndsFromMask(targMask, qureg.numQubitsInStateVec);
+    for (int targ : targs)
+        custatevec_applyMatrix(qureg, ctrls, {targ}, matrix);
 }
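+// e.g. with ctrlMask = 0 and targMask = 0b101, the above issues two separate
+// custatevec_applyMatrix calls: an X on qubit 0, then an X on qubit 2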
 
 void statevec_applySubDiagonalOp(Qureg qureg, int* targets, SubDiagonalOp op, int conj)
 {
+    // sneakily leverage the CPU cuQuantum memory in order to convert op
+    // (as separate arrays op.real and op.imag) into cuAmp*
+    cuAmp* diagonals = qureg.cuStateVec;
+    for (long long int i=0; i<op.numElems; i++)
+        diagonals[i] = TO_CU_AMP(op.real[i], (conj)? -op.imag[i] : op.imag[i]);
+
+    custatevecApplyGeneralizedPermutationMatrix(
+        qureg.cuQuantumHandle, qureg.deviceCuStateVec,
+        CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec,
+        nullptr, diagonals, CU_AMP_IN_MATRIX_PREC, 0,
+        targets, op.numQubits, nullptr, nullptr, 0,
+        nullptr, 0);
 }

From: Tyson Jones
Date: Sun, 20 Aug 2023 21:46:03 +0100
Subject: [PATCH 06/15] added Thrust

---
 QuEST/CMakeLists.txt             | 7 ++++++-
 QuEST/src/GPU/QuEST_cuQuantum.cu | 5 ++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/QuEST/CMakeLists.txt b/QuEST/CMakeLists.txt
index e26aeb190..0ed65f974 100644
--- a/QuEST/CMakeLists.txt
+++ b/QuEST/CMakeLists.txt
@@ -286,7 +286,12 @@ endif()
 # ----- C++ COMPILER FLAGS --------------------------------------------------
 
 # set C++ flags that are common between compilers and build types
-set (CMAKE_CXX_STANDARD 98)
+if (USE_CUQUANTUM)
+    set(CMAKE_CXX_STANDARD 14)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+else ()
+    set (CMAKE_CXX_STANDARD 98)
+endif ()
 
 # Use -O2 for all but debug mode by default
 if (NOT("${CMAKE_BUILD_TYPE}" STREQUAL "Debug"))

diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu
index a4ac289c4..3a2508d1e 100644
--- a/QuEST/src/GPU/QuEST_cuQuantum.cu
+++ b/QuEST/src/GPU/QuEST_cuQuantum.cu
@@ -14,8 +14,11 @@
 # include "QuEST_gpu_common.h"
 # include "QuEST_precision.h"
 # include "QuEST_validation.h"
-# include <custatevec.h>
+
+# include <custatevec.h>
 # include <vector>
+# include <thrust/device_ptr.h>
+# include <thrust/for_each.h>

From 564851e0022b1318a3ada08b33b8750675143361 Mon Sep 17 00:00:00 2001
From: Tyson Jones
Date: Thu, 24 Aug 2023 22:35:04 +0100
Subject: [PATCH 07/15] added state initialisers

although we are missing imports to avoid git conflict:
  # include <thrust/fill.h>
  # include <thrust/sequence.h>
  # include <thrust/iterator/zip_iterator.h>
---
 QuEST/src/GPU/QuEST_cuQuantum.cu  | 142 +++++++++++++++++++------
 QuEST/src/GPU/QuEST_gpu.cu        | 106 --------------------
 QuEST/src/GPU/QuEST_gpu_common.cu | 156 +++++++++++++++++++++++++++++
 QuEST/src/GPU/QuEST_gpu_common.h  |   8 ++
 4 files changed, 271 insertions(+), 141 deletions(-)

diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu
index 3a2508d1e..174146d0e 100644
--- a/QuEST/src/GPU/QuEST_cuQuantum.cu
+++ b/QuEST/src/GPU/QuEST_cuQuantum.cu
@@ -5,7 +5,8 @@
  * This makes no use of the ComplexArray qureg.deviceStateVec, used by the bespoke GPU kernels,
  * which is not malloc'd in this deployment. Instead, this cuQuantum backend mallocs and uses
  * two dedicated arrays of 'cuAmp' complex primitives; qureg.cuStateVec (CPU memory) and
- * qureg.deviceCuStateVec (GPU memory)
+ * qureg.deviceCuStateVec (GPU memory). Note that some API functions are implemented directly
+ * in QuEST_gpu_common.cu, in a way agnostic to cuQuantum vs bespoke kernels.
  *
  * @author Tyson Jones
  */
 
@@ -49,10 +50,29 @@
     # define CU_AMP_IN_MATRIX_PREC void // invalid
 #endif
 
-// convenient operator overloads for cuAmp, for doing complex arithmetic
-cuAmp operator - (const cuAmp& a) {
+// convenient operator overloads for cuAmp, for doing complex arithmetic.
+// some of these are defined to be used by Thrust's backend, because we
+// avoided Thrust's complex<qreal> (see QuEST_precision.h for explanation).
+// notice we are manually performing the arithmetic using qreals and
+// re-packing the result into TO_CU_AMP, rather than using cuComplex.h's
+// functions like cuCadd(). This is because such functions are precision
+// specific (grr) and do exactly the same thing themselves!
+__host__ __device__ inline cuAmp operator - (const cuAmp& a) {
     return TO_CU_AMP(-cuAmpReal(a), -cuAmpImag(a));
 }
+__host__ __device__ inline cuAmp operator * (const cuAmp& a, const std::size_t n) {
+    return TO_CU_AMP(n*cuAmpReal(a), n*cuAmpImag(a));
+}
+__host__ __device__ inline cuAmp operator + (const cuAmp& a, const cuAmp& b) {
+    return TO_CU_AMP(cuAmpReal(a) + cuAmpReal(b), cuAmpImag(a) + cuAmpImag(b));
+}
+__host__ __device__ inline cuAmp operator * (const cuAmp& a, const cuAmp& b) {
+    qreal aRe = cuAmpReal(a);
+    qreal aIm = cuAmpImag(a);
+    qreal bRe = cuAmpReal(b);
+    qreal bIm = cuAmpImag(b);
+    return TO_CU_AMP(aRe*bRe - aIm*bIm, aRe*bIm + aIm*bRe);
+}
 
 // convert user-facing Complex to cuQuantum-facing cuAmp
 cuAmp toCuAmp(Complex c) {
 
@@ -96,12 +116,6 @@ std::vector<int> getIndsFromMask(long long int mask, int numBits) {
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-
 /*
  * CUQUANTUM WRAPPERS (to reduce boilerplate)
  */
 
@@ -130,6 +144,46 @@ void custatevec_applyMatrix(Qureg qureg, std::vector<int> ctrls, std::vector<int> targs, cuMatr matr) {
 
 
 
+/*
+ * THRUST TEMPLATES
+ */
+
+// sets amps of 'out' to (facOut out + fac1 qureg1 + fac2 qureg2), used by thrust::for_each
+struct functor_setWeightedQureg
+{
+    const cuAmp fac1;
+    const cuAmp fac2;
+    const cuAmp facOut;
+    functor_setWeightedQureg(cuAmp fac1, cuAmp fac2, cuAmp facOut) :
+        fac1(fac1), fac2(fac2), facOut(facOut) {}
+
+    template <typename Tuple> __host__ __device__ void operator()(Tuple t) {
+        thrust::get<2>(t) = facOut*thrust::get<2>(t) + fac1*thrust::get<0>(t) + fac2*thrust::get<1>(t);
+    }
+};
+
+thrust::device_ptr<cuAmp> getStartPtr(Qureg qureg) {
+
+    return thrust::device_pointer_cast(qureg.deviceCuStateVec);
+}
+
+thrust::device_ptr<cuAmp> getEndPtr(Qureg qureg) {
+
+    return getStartPtr(qureg) + qureg.numAmpsTotal;
+}
+
+
+
+/*
+ * Start C-linkage, so that below functions may only use C signatures (not C++)
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 
 
@@ -328,53 +382,84 @@ qreal statevec_getImagAmp(Qureg qureg, long long int index)
 
 /*
  * STATE INITIALISATION
  */
 
-void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
-{
-}
-
 void densmatr_initPlusState(Qureg qureg)
 {
+    qreal val = 1./(1LL << qureg.numQubitsRepresented);
+    thrust::fill(getStartPtr(qureg), getEndPtr(qureg), TO_CU_AMP(val, 0));
 }
 
-void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
+void 
statevec_initZeroState(Qureg qureg) { + custatevecInitializeStateVector( + qureg.cuQuantumHandle, + qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, + qureg.numQubitsInStateVec, + CUSTATEVEC_STATE_VECTOR_TYPE_ZERO); } void statevec_initBlankState(Qureg qureg) { -} + // init to |0> = {1, 0, 0, ...} + statevec_initZeroState(qureg); -void statevec_initZeroState(Qureg qureg) -{ + // overwrite amps[0] to 0 + qreal re[] = {0}; + qreal im[] = {0}; + statevec_setAmps(qureg, 0, re, im, 1); } void statevec_initPlusState(Qureg qureg) { + custatevecInitializeStateVector( + qureg.cuQuantumHandle, + qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, + qureg.numQubitsInStateVec, + CUSTATEVEC_STATE_VECTOR_TYPE_UNIFORM); } void statevec_initClassicalState(Qureg qureg, long long int stateInd) { -} + // init to |null> = {0, 0, 0, ...} + statevec_initBlankState(qureg); -void statevec_initDebugState(Qureg qureg) -{ + // overwrite amps[stateInd] to 1 + qreal re[] = {1}; + qreal im[] = {0}; + statevec_setAmps(qureg, stateInd, re, im, 1); } -void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome) +void densmatr_initClassicalState(Qureg qureg, long long int stateInd) { -} + // init to |null> = {0, 0, 0, ...} + statevec_initBlankState(qureg); -int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env) -{ - return -1; + // index of the desired state in the flat density matrix + long long int densityDim = 1LL << qureg.numQubitsRepresented; + long long int densityInd = (densityDim + 1)*stateInd; + + // overwrite amps[densityInd] to 1 + qreal re[] = {1}; + qreal im[] = {0}; + statevec_setAmps(qureg, densityInd, re, im, 1); } -void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) +void statevec_initDebugState(Qureg qureg) { + // |n> -> (.2n + (.2n+.1)i) |n> + cuAmp init = TO_CU_AMP(0, .1); + cuAmp step = TO_CU_AMP(.2, .2); + thrust::sequence(getStartPtr(qureg), getEndPtr(qureg), init, step); } void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) { + // out -> facOut + fac1 qureg1 + fac2 qureg2 + thrust::for_each( + thrust::make_zip_iterator(thrust::make_tuple(getStartPtr(qureg1), getStartPtr(qureg2), getStartPtr(out))), + thrust::make_zip_iterator(thrust::make_tuple(getEndPtr(qureg1), getEndPtr(qureg2), getEndPtr(out))), + functor_setWeightedQureg(toCuAmp(fac1), toCuAmp(fac2), toCuAmp(facOut))); } @@ -387,11 +472,6 @@ void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank) { } -int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision) -{ - return -1; -} - /* diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu index b253336f4..a44219aee 100755 --- a/QuEST/src/GPU/QuEST_gpu.cu +++ b/QuEST/src/GPU/QuEST_gpu.cu @@ -207,36 +207,6 @@ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) { cudaMemcpyDeviceToDevice); } -__global__ void densmatr_initPureStateKernel( - long long int numPureAmps, - qreal *targetVecReal, qreal *targetVecImag, - qreal *copyVecReal, qreal *copyVecImag) -{ - // this is a particular index of the pure copyQureg - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=numPureAmps) return; - - qreal realRow = copyVecReal[index]; - qreal imagRow = copyVecImag[index]; - for (long long int col=0; col < numPureAmps; col++) { - qreal realCol = copyVecReal[col]; - qreal imagCol = - copyVecImag[col]; // minus for conjugation - targetVecReal[col*numPureAmps + index] = realRow*realCol - imagRow*imagCol; - 
targetVecImag[col*numPureAmps + index] = realRow*imagCol + imagRow*realCol; - } -} - -void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg) -{ - int threadsPerCUDABlock, CUDABlocks; - threadsPerCUDABlock = 128; - CUDABlocks = ceil((qreal)(copyQureg.numAmpsPerChunk)/threadsPerCUDABlock); - densmatr_initPureStateKernel<<>>( - copyQureg.numAmpsPerChunk, - targetQureg.deviceStateVec.real, targetQureg.deviceStateVec.imag, - copyQureg.deviceStateVec.real, copyQureg.deviceStateVec.imag); -} - __global__ void densmatr_initPlusStateKernel(long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag){ long long int index; @@ -3793,7 +3763,6 @@ __global__ void statevec_applyParamNamedPhaseFuncOverridesKernel( } } - // negate phase to conjugate operator if (conj) phase *= -1; @@ -3865,81 +3834,6 @@ void statevec_applyParamNamedPhaseFuncOverrides( cudaFree(d_params); } -__global__ void densmatr_setQuregToPauliHamilKernel( - Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms -) { - long long int n = blockIdx.x*blockDim.x + threadIdx.x; - if (n>=qureg.numAmpsPerChunk) return; - - // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j] - const int pauliRealElems[] = { 1,0, 0,1, 0,1, 1,0, 0,0, 0,0, 1,0, 0,-1 }; - const int pauliImagElems[] = { 0,0, 0,0, 0,0, 0,0, 0,-1,1,0, 0,0, 0,0 }; - - // |n> = |c>|r> - const int numQubits = qureg.numQubitsRepresented; - const long long int r = n & ((1LL << numQubits) - 1); - const long long int c = n >> numQubits; - - // new amplitude of |n> - qreal elemRe = 0; - qreal elemIm = 0; - - for (long long int t=0; t> q) & 1; - int j = (c >> q) & 1; - int p = (int) pauliCodes[pInd++]; - int k = (p<<2) + (i<<1) + j; - int pauliRe = pauliRealElems[k]; - int pauliIm = pauliImagElems[k]; - - // kron *= pauli - int tmp = (pauliRe*kronRe) - (pauliIm*kronIm); - kronIm = (pauliRe*kronIm) + (pauliIm*kronRe); - kronRe = tmp; - } - - // elem = sum_t coeffs[t] pauliKronecker[r][c] - elemRe += termCoeffs[t] * kronRe; - elemIm += termCoeffs[t] * kronIm; - } - - // overwrite the density matrix entry - qureg.deviceStateVec.real[n] = elemRe; - qureg.deviceStateVec.imag[n] = elemIm; -} - -void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) { - - // copy hamil into GPU memory - enum pauliOpType* d_pauliCodes; - size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes; - cudaMalloc(&d_pauliCodes, mem_pauliCodes); - cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice); - - qreal* d_termCoeffs; - size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs; - cudaMalloc(&d_termCoeffs, mem_termCoeffs); - cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice); - - int numThreadsPerBlock = 128; - int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock); - densmatr_setQuregToPauliHamilKernel<<>>( - qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms); - - // free tmp GPU memory - cudaFree(d_pauliCodes); - cudaFree(d_termCoeffs); -} - #ifdef __cplusplus diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu index f07b105aa..0426a1c41 100644 --- a/QuEST/src/GPU/QuEST_gpu_common.cu +++ b/QuEST/src/GPU/QuEST_gpu_common.cu @@ -30,6 +30,49 @@ extern "C" { +/* + * GPU CONFIG + */ + +const int NUM_THREADS_PER_BLOCK = 128; + + + +/* + * BACKEND AGNOSTIC COMPLEX ARITHMETIC + */ + +#define PROD_REAL(aRe, aIm, bRe, bIm) \ + ((aRe)*(bRe) - (aIm)*(bIm)) +#define PROD_IMAG(aRe, aIm, bRe, bIm) \ + 
((aRe)*(bIm) + (aIm)*(bRe))
+
+#ifdef USE_CUQUANTUM
+    #define GET_AMP(aRe, aIm, qureg, i) \
+        aRe = qureg.deviceCuStateVec[i].x; \
+        aIm = qureg.deviceCuStateVec[i].y;
+#else
+    #define GET_AMP(aRe, aIm, qureg, i) \
+        aRe = qureg.deviceStateVec.real[i]; \
+        aIm = qureg.deviceStateVec.imag[i];
+#endif
+
+#ifdef USE_CUQUANTUM
+    #define SET_AMP(qureg, i, aRe, aIm) \
+        qureg.deviceCuStateVec[i].x = aRe; \
+        qureg.deviceCuStateVec[i].y = aIm;
+#else
+    #define SET_AMP(qureg, i, aRe, aIm) \
+        qureg.deviceStateVec.real[i] = aRe; \
+        qureg.deviceStateVec.imag[i] = aIm;
+#endif
+
+
+
+/*
+ * ENVIRONMENT MANAGEMENT
+ */
+
 int GPUExists(void) {
     int deviceCount, device;
     int gpuDeviceCount = 0;
@@ -110,9 +153,116 @@ void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) {

+/*
+ * STATE INITIALISATION
+ */
+
+__global__ void densmatr_initPureStateKernel(Qureg dens, Qureg pure)
+{
+    long long int pureInd = blockIdx.x*blockDim.x + threadIdx.x;
+    if (pureInd >= pure.numAmpsPerChunk) return;
+
+    qreal aRe, aIm;
+    GET_AMP( aRe, aIm, pure, pureInd );
+
+    for (long long int col=0; col<pure.numAmpsPerChunk; col++) {
+
+        qreal bRe, bIm;
+        GET_AMP( bRe, bIm, pure, col );
+        bIm *= -1; // minus for conjugation
+
+        // dens[row=pureInd][col] = pure[pureInd] * conj(pure[col])
+        SET_AMP( dens, col*pure.numAmpsPerChunk + pureInd,
+            PROD_REAL( aRe, aIm, bRe, bIm ),
+            PROD_IMAG( aRe, aIm, bRe, bIm ) );
+    }
+}
+
+void densmatr_initPureState(Qureg dens, Qureg pure)
+{
+    int numBlocks = ceil(pure.numAmpsPerChunk / (qreal) NUM_THREADS_PER_BLOCK);
+    densmatr_initPureStateKernel<<<numBlocks, NUM_THREADS_PER_BLOCK>>>(dens, pure);
+}
+
+__global__ void densmatr_setQuregToPauliHamilKernel(
+    Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms
+) {
+    long long int n = blockIdx.x*blockDim.x + threadIdx.x;
+    if (n>=qureg.numAmpsPerChunk) return;
+
+    // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j]
+    const int pauliRealElems[] = { 1,0, 0,1,  0,1, 1,0,  0,0, 0,0,  1,0, 0,-1 };
+    const int pauliImagElems[] = { 0,0, 0,0,  0,0, 0,0,  0,-1,1,0,  0,0, 0,0 };
+
+    // |n> = |c>|r>
+    const int numQubits = qureg.numQubitsRepresented;
+    const long long int r = n & ((1LL << numQubits) - 1);
+    const long long int c = n >> numQubits;
+
+    // new amplitude of |n>
+    qreal elemRe = 0;
+    qreal elemIm = 0;
+
+    for (long long int t=0; t<numSumTerms; t++) {
+
+        // pauliKronecker[r][c] = prod_q Pauli[q][q-th bit of r and c]
+        int kronRe = 1;
+        int kronIm = 0;
+        long long int pInd = t * numQubits;
+
+        for (int q=0; q<numQubits; q++) {
+
+            // get element of Pauli matrix
+            int i = (r >> q) & 1;
+            int j = (c >> q) & 1;
+            int p = (int) pauliCodes[pInd++];
+            int k = (p<<2) + (i<<1) + j;
+            int pauliRe = pauliRealElems[k];
+            int pauliIm = pauliImagElems[k];
+
+            // kron *= pauli
+            int tmp = (pauliRe*kronRe) - (pauliIm*kronIm);
+            kronIm = (pauliRe*kronIm) + (pauliIm*kronRe);
+            kronRe = tmp;
+        }
+
+        // elem = sum_t coeffs[t] pauliKronecker[r][c]
+        elemRe += termCoeffs[t] * kronRe;
+        elemIm += termCoeffs[t] * kronIm;
+    }
+
+    // overwrite the density matrix entry
+    SET_AMP( qureg, n, elemRe, elemIm );
+}
+
+void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) {
+
+    // copy hamil into GPU memory
+    enum pauliOpType* d_pauliCodes;
+    size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes;
+    cudaMalloc(&d_pauliCodes, mem_pauliCodes);
+    cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice);
+
+    qreal* d_termCoeffs;
+    size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs;
+    cudaMalloc(&d_termCoeffs, mem_termCoeffs);
+    cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);
+
+    int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) NUM_THREADS_PER_BLOCK);
+    densmatr_setQuregToPauliHamilKernel<<<numBlocks, NUM_THREADS_PER_BLOCK>>>(
+        qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);
+
+    // free tmp GPU memory
+    cudaFree(d_pauliCodes);
+    cudaFree(d_termCoeffs);
+}

+
+
+/*
+ * MANAGING NON-QUREG TYPES
+ */

 DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) {
@@ -125,7 +275,6 @@ DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) {

     // allocate CPU memory (initialised to zero)
     op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
     op.imag = (qreal*) calloc(op.numElemsPerChunk,
sizeof(qreal));
-    // @TODO no handling of rank>1 allocation (no distributed GPU)

     // check cpu memory allocation was successful
     validateDiagonalOpAllocation(&op, env, __func__);
@@ -201,9 +350,8 @@ void agnostic_initDiagonalOpFromPauliHamil(DiagonalOp op, PauliHamil hamil) {
     cudaMalloc(&d_termCoeffs, mem_termCoeffs);
     cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);

-    int numThreadsPerBlock = 128;
-    int numBlocks = ceil(op.numElemsPerChunk / (qreal) numThreadsPerBlock);
-    agnostic_initDiagonalOpFromPauliHamilKernel<<<numBlocks, numThreadsPerBlock>>>(
+    int numBlocks = ceil(op.numElemsPerChunk / (qreal) NUM_THREADS_PER_BLOCK);
+    agnostic_initDiagonalOpFromPauliHamilKernel<<<numBlocks, NUM_THREADS_PER_BLOCK>>>(
         op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);

     // copy populated operator into RAM
diff --git a/QuEST/src/GPU/QuEST_gpu_common.h b/QuEST/src/GPU/QuEST_gpu_common.h
index d2b58df93..cf6998cb4 100644
--- a/QuEST/src/GPU/QuEST_gpu_common.h
+++ b/QuEST/src/GPU/QuEST_gpu_common.h
@@ -1,3 +1,11 @@
+// Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details
+
+/** @file
+ * GPU backend routines which are invoked by both the cuQuantum and bespoke-kernel backends.
+ * This excludes backend-agnostic API implementations which do not need a header declaration.
+ *
+ * @author Tyson Jones
+ */

 # ifndef QUEST_GPU_COMMON_H
 # define QUEST_GPU_COMMON_H

From 67153b99fd34244643d28cd764b336ec58e8d5d0 Mon Sep 17 00:00:00 2001
From: Tyson Jones
Date: Sun, 20 Aug 2023 20:04:59 +0100
Subject: [PATCH 08/15] added wrapped decoherence

Added all decoherence channels which can be directly mapped
(without unacceptable performance damage) to a cuQuantum call.
The cuQuantum calls are:
- custatevecApplyMatrix
- custatevecApplyGeneralizedPermutationMatrix
and are called with matrices (some, diagonal) describing the channel
superoperators. The remaining decoherence channels require linearly
combining device vectors (may use Thrust), bespoke GPU kernels, or a
clever decomposition of the channel (e.g.
2 qubit depolarising) into a sequence of cuStateVec calls --- QuEST/src/GPU/QuEST_cuQuantum.cu | 97 +++++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 15 deletions(-) diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu index 174146d0e..dff30594f 100644 --- a/QuEST/src/GPU/QuEST_cuQuantum.cu +++ b/QuEST/src/GPU/QuEST_cuQuantum.cu @@ -126,7 +126,7 @@ void custatevec_applyMatrix(Qureg qureg, std::vector ctrls, std::vector ctrls, std::vector targs, cuAmp* elems) { + + // apply no permutation matrix + custatevecIndex_t *perm = nullptr; + + // do not adjoint elems + int adj = 0; + + // no control qubits + int* ctrls = nullptr; + int* ctrlVals = nullptr; + int nCtrls = 0; + + // use automatic workspace management + void* work = nullptr; + size_t workSize = 0; + + custatevecApplyGeneralizedPermutationMatrix( + qureg.cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + perm, elems, CU_AMP_IN_MATRIX_PREC, adj, + targs.data(), targs.size(), + ctrls, ctrlVals, nCtrls, + work, workSize); +} + /* @@ -759,20 +785,16 @@ void statevec_multiControlledMultiQubitNot(Qureg qureg, int ctrlMask, int targMa custatevec_applyMatrix(qureg, ctrls, {targ}, matrix); } -void statevec_applySubDiagonalOp(Qureg qureg, int* targets, SubDiagonalOp op, int conj) +void statevec_applySubDiagonalOp(Qureg qureg, int* targs, SubDiagonalOp op, int conj) { // sneakily leverage the CPU cuQuantum memory in order to convert op // (as separate arrays op.real and op.imag) into cuAmp* - cuAmp* diagonals = qureg.cuStateVec; + cuAmp* elems = qureg.cuStateVec; for (long long int i=0; i t(targs, targs + op.numQubits); + custatevec_applyDiagonal(qureg, t, elems); } void statevec_applyDiagonalOp(Qureg qureg, DiagonalOp op) @@ -819,22 +841,67 @@ void densmatr_mixDensityMatrix(Qureg combineQureg, qreal otherProb, Qureg otherQ void densmatr_mixDephasing(Qureg qureg, int targetQubit, qreal dephase) { + cuAmp a = TO_CU_AMP(1, 0); + cuAmp b = TO_CU_AMP(1-dephase, 0); // dephase = 2*prob + cuAmp elems[] = {a, b, b, a}; + + std::vector targs{targetQubit, targetQubit + qureg.numQubitsRepresented}; + custatevec_applyDiagonal(qureg, targs, elems); } -void densmatr_mixTwoQubitDephasing(Qureg qureg, int qubit1, int qubit2, qreal dephase) +void densmatr_mixTwoQubitDephasing(Qureg qureg, int qb1, int qb2, qreal dephase) { + // this function effects the two-qubit dephasing on a density matrix, via the + // four-qubit diagonal superoperator on a Choi vector. The 16 elements of the + // diagonal have only two unique entries; 1 and 1-2*dephase/3. It's conceivable + // that a bespoke kernel could be faster, but likely by little. + + cuAmp a = TO_CU_AMP(1, 0); + cuAmp b = TO_CU_AMP(1 - dephase, 0); // dephase = 4*prob/3 + cuAmp elems[] = {a, b, b, b, b, a, b, b, b, b, a, b, b, b, b, a}; + + int shift = qureg.numQubitsRepresented; + std::vector targs{qb1, qb2, qb1 + shift, qb2 + shift}; + custatevec_applyDiagonal(qureg, targs, elems); } -void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depolLevel) +void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depol) { + // this function effects depolarising as a dense two-qubit superoperator + // on a Choi vector, where only 6 of the 16 elements are non-zero. This is + // potentially wasteful, and a bespoke kernel could be faster, leveraging + // QuEST's existing GPU code (or the optimised algorithm in the "distributed" + // manuscript). 
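
(As a sketch of the algebra this superoperator encodes, assuming QuEST's Choi
vectors use the column-stacking convention vec(A rho B) = (B^T (x) A) vec(rho):
a channel rho -> sum_k K_k rho K_k^dagger becomes vec(rho) -> (sum_k conj(K_k)
(x) K_k) vec(rho). For one-qubit depolarising, with Kraus operators
{sqrt(1-p) I, sqrt(p/3) X, sqrt(p/3) Y, sqrt(p/3) Z}, that matrix is

    [ 1-2p/3     0        0      2p/3  ]
    [    0    1-4p/3      0       0    ]      (p = prob)
    [    0       0     1-4p/3     0    ]
    [  2p/3      0        0    1-2p/3  ]

which, with depol = 4p/3, is exactly the {b, z, z, a; z, c, z, z; z, z, c, z;
a, z, z, b} layout constructed in the code below. Only the vectorisation
convention is assumed here; the matrix itself matches the patch.)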
+ + // depol = (4*prob)/3.0 + cuAmp a = TO_CU_AMP(depol/2., 0); // 2*prob/3 + cuAmp b = TO_CU_AMP(1 - depol/2., 0); // 1-2*prob/3 + cuAmp c = TO_CU_AMP(1 - depol, 0); // 1-4*prob/3 + cuAmp z = TO_CU_AMP(0, 0); // 0 + + cuMatr matr{ + b, z, z, a, + z, c, z, z, + z, z, c, z, + a, z, z, b + }; + std::vector targs{ targetQubit, targetQubit + qureg.numQubitsRepresented }; + custatevec_applyMatrix(qureg, {}, targs, matr); } -void densmatr_mixDamping(Qureg qureg, int targetQubit, qreal damping) +void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) { } -void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) +void densmatr_mixDamping(Qureg qureg, int qb, qreal prob) { + cuAmp a = TO_CU_AMP(1, 0); + cuAmp c = TO_CU_AMP(1 - prob, 0); + cuAmp b = TO_CU_AMP(sqrt(1 - prob), 0); + cuAmp elems[] = {a, b, b, c}; + + std::vector targs{qb, qb + qureg.numQubitsRepresented}; + custatevec_applyDiagonal(qureg, targs, elems); } From d6164812b61b43fb8ac033a3f55d971ab6697cf0 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Thu, 24 Aug 2023 23:54:12 +0100 Subject: [PATCH 09/15] added bespoke decoherence --- QuEST/src/GPU/QuEST_cuQuantum.cu | 12 +++-- QuEST/src/GPU/QuEST_gpu.cu | 70 +---------------------------- QuEST/src/GPU/QuEST_gpu_common.cu | 75 ++++++++++++++++++++++++++++++- 3 files changed, 83 insertions(+), 74 deletions(-) diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu index dff30594f..5aa586ad9 100644 --- a/QuEST/src/GPU/QuEST_cuQuantum.cu +++ b/QuEST/src/GPU/QuEST_cuQuantum.cu @@ -20,6 +20,9 @@ # include # include # include +# include +# include +# include @@ -837,6 +840,11 @@ void statevec_applyParamNamedPhaseFuncOverrides( void densmatr_mixDensityMatrix(Qureg combineQureg, qreal otherProb, Qureg otherQureg) { + Complex facOut = {.real=(1-otherProb), .imag=0}; + Complex facOther = {.real=otherProb, .imag=0}; + Complex zero = {.real=0, .imag=0}; + + statevec_setWeightedQureg(facOther, otherQureg, zero, otherQureg, facOut, combineQureg); } void densmatr_mixDephasing(Qureg qureg, int targetQubit, qreal dephase) @@ -889,10 +897,6 @@ void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depol) custatevec_applyMatrix(qureg, {}, targs, matr); } -void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) -{ -} - void densmatr_mixDamping(Qureg qureg, int qb, qreal prob) { cuAmp a = TO_CU_AMP(1, 0); diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu index a44219aee..25f34ecc5 100755 --- a/QuEST/src/GPU/QuEST_gpu.cu +++ b/QuEST/src/GPU/QuEST_gpu.cu @@ -11,7 +11,7 @@ # include "QuEST_gpu_common.h" # include "QuEST_precision.h" # include "QuEST_validation.h" -# include "QuEST_internal.h" // for getQubitBitmask +# include "QuEST_internal.h" # include # include @@ -2896,74 +2896,6 @@ void densmatr_mixDamping(Qureg qureg, int targetQubit, qreal damping) { part1, part2, part3, bothBits); } -/** Called once for every 16 amplitudes */ -__global__ void densmatr_mixTwoQubitDepolarisingKernel( - qreal depolLevel, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit, - long long int part1, long long int part2, long long int part3, - long long int part4, long long int part5, - long long int rowCol1, long long int rowCol2) -{ - long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x; - if (scanInd >= numAmpsToVisit) return; - - // index of |..0..0..><..0..0| - long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + 
((scanInd&part4)<<3) + ((scanInd&part5)<<4); - long long int ind01 = ind00 + rowCol1; - long long int ind10 = ind00 + rowCol2; - long long int ind11 = ind00 + rowCol1 + rowCol2; - - qreal realAvDepol = depolLevel * 0.25 * ( - vecReal[ind00] + vecReal[ind01] + vecReal[ind10] + vecReal[ind11]); - qreal imagAvDepol = depolLevel * 0.25 * ( - vecImag[ind00] + vecImag[ind01] + vecImag[ind10] + vecImag[ind11]); - - qreal retain = 1 - depolLevel; - vecReal[ind00] *= retain; vecImag[ind00] *= retain; - vecReal[ind01] *= retain; vecImag[ind01] *= retain; - vecReal[ind10] *= retain; vecImag[ind10] *= retain; - vecReal[ind11] *= retain; vecImag[ind11] *= retain; - - vecReal[ind00] += realAvDepol; vecImag[ind00] += imagAvDepol; - vecReal[ind01] += realAvDepol; vecImag[ind01] += imagAvDepol; - vecReal[ind10] += realAvDepol; vecImag[ind10] += imagAvDepol; - vecReal[ind11] += realAvDepol; vecImag[ind11] += imagAvDepol; -} - -void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) { - - if (depolLevel == 0) - return; - - // assumes qubit2 > qubit1 - - densmatr_mixTwoQubitDephasing(qureg, qubit1, qubit2, depolLevel); - - int rowQubit1 = qubit1 + qureg.numQubitsRepresented; - int rowQubit2 = qubit2 + qureg.numQubitsRepresented; - - long long int colBit1 = 1LL << qubit1; - long long int rowBit1 = 1LL << rowQubit1; - long long int colBit2 = 1LL << qubit2; - long long int rowBit2 = 1LL << rowQubit2; - - long long int rowCol1 = colBit1 | rowBit1; - long long int rowCol2 = colBit2 | rowBit2; - - long long int numAmpsToVisit = qureg.numAmpsPerChunk/16; - long long int part1 = colBit1 - 1; - long long int part2 = (colBit2 >> 1) - colBit1; - long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1); - long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2); - long long int part5 = numAmpsToVisit - (rowBit2 >> 3); - - int threadsPerCUDABlock, CUDABlocks; - threadsPerCUDABlock = 128; - CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock); - densmatr_mixTwoQubitDepolarisingKernel<<>>( - depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit, - part1, part2, part3, part4, part5, rowCol1, rowCol2); -} - __global__ void statevec_setWeightedQuregKernel(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) { long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x; diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu index 0426a1c41..34caa30b0 100644 --- a/QuEST/src/GPU/QuEST_gpu_common.cu +++ b/QuEST/src/GPU/QuEST_gpu_common.cu @@ -9,9 +9,9 @@ # include "QuEST.h" -# include "QuEST_gpu_common.h" # include "QuEST_precision.h" # include "QuEST_validation.h" +# include "QuEST_internal.h" # include "mt19937ar.h" # include @@ -260,6 +260,79 @@ void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) { +/* + * DECOHERENCE + */ + +__global__ void densmatr_mixTwoQubitDepolarisingKernel( + Qureg qureg, qreal depolLevel, long long int part1, long long int part2, + long long int part3, long long int part4, long long int part5, + long long int rowCol1, long long int rowCol2) +{ + long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x; + if (scanInd >= qureg.numAmpsPerChunk) return; + + long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4); + long long int ind01 = ind00 + rowCol1; + long long int ind10 = ind00 + rowCol2; + long long int ind11 = ind00 + rowCol1 + rowCol2; + + qreal re00, re01, re10, re11; + qreal im00, 
im01, im10, im11; + GET_AMP( re00, im00, qureg, ind00 ); + GET_AMP( re01, im01, qureg, ind01 ); + GET_AMP( re10, im10, qureg, ind10 ); + GET_AMP( re11, im11, qureg, ind11 ); + + qreal realAvDepol = depolLevel * 0.25 * (re00 + re01 + re10 + re11); + qreal imagAvDepol = depolLevel * 0.25 * (im00 + im01 + im10 + im11); + qreal retain = 1 - depolLevel; + + SET_AMP( qureg, ind00, retain*re00 + realAvDepol, retain*im00 + imagAvDepol ); + SET_AMP( qureg, ind01, retain*re01 + realAvDepol, retain*im01 + imagAvDepol ); + SET_AMP( qureg, ind10, retain*re10 + realAvDepol, retain*im10 + imagAvDepol ); + SET_AMP( qureg, ind11, retain*re11 + realAvDepol, retain*im11 + imagAvDepol ); +} + +void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) { + + if (depolLevel == 0) + return; + + densmatr_mixTwoQubitDephasing(qureg, qubit1, qubit2, depolLevel); + + // this code is painfully and unnecessarily verbose; it computes index offsets + // using bitwise logic, only to (within the kernel) effect those offsets with + // integer arithmetic. Instead, one might as well obtain the ultimate indices + // directly using bitwise logic within the kernel, passing no offsets at all. + // We defer this to when the insertBits (and other bit twiddles) have been moved + // into QuEST_gpu_common.cu. + + int rowQubit1 = qubit1 + qureg.numQubitsRepresented; + int rowQubit2 = qubit2 + qureg.numQubitsRepresented; + + long long int colBit1 = 1LL << qubit1; + long long int rowBit1 = 1LL << rowQubit1; + long long int colBit2 = 1LL << qubit2; + long long int rowBit2 = 1LL << rowQubit2; + + long long int rowCol1 = colBit1 | rowBit1; + long long int rowCol2 = colBit2 | rowBit2; + + long long int numAmpsToVisit = qureg.numAmpsPerChunk/16; + long long int part1 = colBit1 - 1; + long long int part2 = (colBit2 >> 1) - colBit1; + long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1); + long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2); + long long int part5 = numAmpsToVisit - (rowBit2 >> 3); + + int CUDABlocks = ceil(numAmpsToVisit / (qreal) NUM_THREADS_PER_BLOCK); + densmatr_mixTwoQubitDepolarisingKernel<<>>( + qureg, depolLevel, part1, part2, part3, part4, part5, rowCol1, rowCol2); +} + + + /* * MANAGING NON-QUREG TYPES */ From b04797235bef099cd777f6f9a7a5138c7adcc56b Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Fri, 25 Aug 2023 00:10:33 +0100 Subject: [PATCH 10/15] optimised diagonal operators Changed several operators represented by diagonal matrices but previously effected as one-qubit general matrices, to instead be effected as diagonals (duh) --- QuEST/src/GPU/QuEST_cuQuantum.cu | 97 ++++++++++---------------------- 1 file changed, 29 insertions(+), 68 deletions(-) diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu index 5aa586ad9..e25ea82ef 100644 --- a/QuEST/src/GPU/QuEST_cuQuantum.cu +++ b/QuEST/src/GPU/QuEST_cuQuantum.cu @@ -145,7 +145,7 @@ void custatevec_applyMatrix(Qureg qureg, std::vector ctrls, std::vector targs, cuAmp* elems) { +void custatevec_applyDiagonal(Qureg qureg, std::vector ctrls, std::vector targs, cuAmp* elems) { // apply no permutation matrix custatevecIndex_t *perm = nullptr; @@ -153,10 +153,8 @@ void custatevec_applyDiagonal(Qureg qureg, std::vector targs, cuAmp* elems) // do not adjoint elems int adj = 0; - // no control qubits - int* ctrls = nullptr; + // condition all ctrls on =1 state int* ctrlVals = nullptr; - int nCtrls = 0; // use automatic workspace management void* work = nullptr; @@ -167,7 +165,7 @@ void 
custatevec_applyDiagonal(Qureg qureg, std::vector targs, cuAmp* elems) CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, perm, elems, CU_AMP_IN_MATRIX_PREC, adj, targs.data(), targs.size(), - ctrls, ctrlVals, nCtrls, + ctrls.data(), ctrlVals, ctrls.size(), work, workSize); } @@ -625,55 +623,32 @@ void statevec_controlledPauliYConj(Qureg qureg, int controlQubit, int targetQubi void statevec_phaseShiftByTerm(Qureg qureg, int targetQubit, Complex term) { - // this diagonal operator, otherwise embarrasingly parallel with unit stride, - // is here treated as a generic one-target unitary, wastefully inducing non-unit - // stride and unnecessary memory reads, and potentially unnecessary communication - // in multi-GPU mode. - - cuAmp a0 = TO_CU_AMP(0, 0); cuAmp a1 = TO_CU_AMP(1, 0); - cuAmp aE = toCuAmp(term); - cuMatr matrix{ - a1, a0, - a0, aE - }; - custatevec_applyMatrix(qureg, {}, {targetQubit}, matrix); + cuAmp a2 = toCuAmp(term); + cuAmp elems[] = {a1, a2}; + + custatevec_applyDiagonal(qureg, {}, {targetQubit}, elems); } void statevec_controlledPhaseShift(Qureg qureg, int idQubit1, int idQubit2, qreal angle) { - // this diagonal operator, otherwise embarrasingly parallel with unit stride, - // is here treated as a generic one-target unitary, wastefully inducing non-unit - // stride and unnecessary memory reads, and potentially unnecessary communication - // in multi-GPU mode. - - cuAmp a0 = TO_CU_AMP(0, 0); cuAmp a1 = TO_CU_AMP(1, 0); - cuAmp aE = TO_CU_AMP(cos(angle), sin(angle)); - cuMatr matrix{ - a1, a0, - a0, aE - }; - custatevec_applyMatrix(qureg, {idQubit1}, {idQubit2}, matrix); + cuAmp a2 = TO_CU_AMP(cos(angle), sin(angle)); + cuAmp elems[] = {a1, a2}; + + custatevec_applyDiagonal(qureg, {idQubit1}, {idQubit2}, elems); } void statevec_multiControlledPhaseShift(Qureg qureg, int *controlQubits, int numControlQubits, qreal angle) { - // this diagonal operator, otherwise embarrasingly parallel with unit stride, - // is here treated as a generic one-target unitary, wastefully inducing non-unit - // stride and unnecessary memory reads, and potentially unnecessary communication - // in multi-GPU mode. - - cuAmp a0 = TO_CU_AMP(0, 0); cuAmp a1 = TO_CU_AMP(1, 0); - cuAmp aE = TO_CU_AMP(cos(angle), sin(angle)); - cuMatr matrix{ - a1, a0, - a0, aE - }; + cuAmp a2 = TO_CU_AMP(cos(angle), sin(angle)); + cuAmp elems[] = {a1, a2}; + std::vector targs{controlQubits[0]}; std::vector ctrls(controlQubits + 1, controlQubits + numControlQubits); - custatevec_applyMatrix(qureg, ctrls, targs, matrix); + custatevec_applyDiagonal(qureg, ctrls, targs, elems); + } void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle) @@ -705,36 +680,22 @@ void statevec_multiControlledMultiRotateZ(Qureg qureg, long long int ctrlMask, l void statevec_controlledPhaseFlip(Qureg qureg, int idQubit1, int idQubit2) { - // this diagonal operator, otherwise embarrasingly parallel with unit stride, - // is here treated as a generic one-target unitary, wastefully inducing non-unit - // stride and unnecessary memory reads, and potentially unnecessary communication - // in multi-GPU mode. 
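
(For intuition, a minimal sketch of why the diagonal treatment is preferable;
the names sv, elems, N and t below are illustrative placeholders, not the
patch's identifiers. Applying D = diag(d0, d1) to target qubit t of an N-qubit
statevector is a unit-stride, embarrassingly parallel pass,

    for (long long int i = 0; i < (1LL << N); i++)
        sv[i] *= elems[(i >> t) & 1];    // one coalesced read + write per amp

whereas a dense 2x2 unitary must combine amplitude pairs separated by stride
2^t. custatevecApplyGeneralizedPermutationMatrix with a null permutation, as
invoked by custatevec_applyDiagonal above, exploits exactly this structure.)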
+ cuAmp a1 = TO_CU_AMP( 1, 0); + cuAmp a2 = TO_CU_AMP(-1, 0); + cuAmp elems[] = {a1, a2}; - cuAmp a0 = TO_CU_AMP(0, 0); - cuAmp a1 = TO_CU_AMP(1, 0); - cuMatr matrix{ - a1, a0, - a0, -a1 - }; - custatevec_applyMatrix(qureg, {idQubit1}, {idQubit2}, matrix); + custatevec_applyDiagonal(qureg, {idQubit1}, {idQubit2}, elems); } void statevec_multiControlledPhaseFlip(Qureg qureg, int *controlQubits, int numControlQubits) { - // this diagonal operator, otherwise embarrasingly parallel with unit stride, - // is here treated as a generic one-target unitary, wastefully inducing non-unit - // stride and unnecessary memory reads, and potentially unnecessary communication - // in multi-GPU mode. + cuAmp a1 = TO_CU_AMP( 1, 0); + cuAmp a2 = TO_CU_AMP(-1, 0); + cuAmp elems[] = {a1, a2}; - cuAmp a0 = TO_CU_AMP(0, 0); - cuAmp a1 = TO_CU_AMP(1, 0); - cuMatr matrix{ - a1, a0, - a0, -a1 - }; std::vector targs{controlQubits[0]}; std::vector ctrls(controlQubits + 1, controlQubits + numControlQubits); - custatevec_applyMatrix(qureg, ctrls, targs, matrix); + custatevec_applyDiagonal(qureg, ctrls, targs, elems); } void statevec_swapQubitAmps(Qureg qureg, int qb1, int qb2) @@ -773,7 +734,7 @@ void statevec_controlledNot(Qureg qureg, int controlQubit, int targetQubit) void statevec_multiControlledMultiQubitNot(Qureg qureg, int ctrlMask, int targMask) { // this operator can be effected in one-shot using a custom kernel, but we here - // isntead resort to slowly (by at most a factor #targs) effect it as a sequence + // instead resort to slowly (by at most a factor #targs) effect it as a sequence // of one-target multi-ctrl NOT gates. cuAmp a0 = TO_CU_AMP(0, 0); @@ -797,7 +758,7 @@ void statevec_applySubDiagonalOp(Qureg qureg, int* targs, SubDiagonalOp op, int elems[i] = TO_CU_AMP(op.real[i], op.imag[i]); std::vector t(targs, targs + op.numQubits); - custatevec_applyDiagonal(qureg, t, elems); + custatevec_applyDiagonal(qureg, {}, t, elems); } void statevec_applyDiagonalOp(Qureg qureg, DiagonalOp op) @@ -854,7 +815,7 @@ void densmatr_mixDephasing(Qureg qureg, int targetQubit, qreal dephase) cuAmp elems[] = {a, b, b, a}; std::vector targs{targetQubit, targetQubit + qureg.numQubitsRepresented}; - custatevec_applyDiagonal(qureg, targs, elems); + custatevec_applyDiagonal(qureg, {}, targs, elems); } void densmatr_mixTwoQubitDephasing(Qureg qureg, int qb1, int qb2, qreal dephase) @@ -870,7 +831,7 @@ void densmatr_mixTwoQubitDephasing(Qureg qureg, int qb1, int qb2, qreal dephase) int shift = qureg.numQubitsRepresented; std::vector targs{qb1, qb2, qb1 + shift, qb2 + shift}; - custatevec_applyDiagonal(qureg, targs, elems); + custatevec_applyDiagonal(qureg, {}, targs, elems); } void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depol) @@ -905,7 +866,7 @@ void densmatr_mixDamping(Qureg qureg, int qb, qreal prob) cuAmp elems[] = {a, b, b, c}; std::vector targs{qb, qb + qureg.numQubitsRepresented}; - custatevec_applyDiagonal(qureg, targs, elems); + custatevec_applyDiagonal(qureg, {}, targs, elems); } From 14b0a59fdb144f978f9e624e24d6cee5915faedf Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Tue, 19 Sep 2023 16:11:30 +0900 Subject: [PATCH 11/15] added calculations --- QuEST/include/QuEST.h | 4 + QuEST/src/GPU/QuEST_cuQuantum.cu | 365 ++++++++++++++++++++++++++++-- QuEST/src/GPU/QuEST_gpu.cu | 125 ---------- QuEST/src/GPU/QuEST_gpu_common.cu | 134 +++++++++++ 4 files changed, 480 insertions(+), 148 deletions(-) diff --git a/QuEST/include/QuEST.h b/QuEST/include/QuEST.h index 1adb78dd3..26be2b94a 100644 --- 
a/QuEST/include/QuEST.h +++ b/QuEST/include/QuEST.h @@ -4256,6 +4256,10 @@ qreal calcPurity(Qureg qureg); * linear algebra calculation. * * The number of qubits represented in \p qureg and \p pureState must match. + * + * > In the GPU-accelerated cuQuantum backend, this function further assumes that + * > the density matrix \p qureg is correctly normalised, and otherwise returns the + * > fidelity of the conjugate-transpose of \p qureg. * * @see * - calcHilbertSchmidtDistance() diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu index e25ea82ef..0557b5b16 100644 --- a/QuEST/src/GPU/QuEST_cuQuantum.cu +++ b/QuEST/src/GPU/QuEST_cuQuantum.cu @@ -22,7 +22,10 @@ # include # include # include +# include +# include # include +# include @@ -53,6 +56,8 @@ # define CU_AMP_IN_MATRIX_PREC void // invalid #endif + + // convenient operator overloads for cuAmp, for doing complex artihmetic. // some of these are defined to be used by Thrust's backend, because we // avoided Thrust's complex (see QuEST_precision.h for explanation). @@ -175,6 +180,77 @@ void custatevec_applyDiagonal(Qureg qureg, std::vector ctrls, std::vector +{ + __host__ __device__ cuAmp operator()(cuAmp braAmp, cuAmp ketAmp) { + return ketAmp * cuAmpConj(braAmp); + } +}; + +struct functor_absOfDif : public thrust::binary_function +{ + __host__ __device__ cuAmp operator()(cuAmp a, cuAmp b) { + qreal difRe = cuAmpReal(a) - cuAmpReal(b); + qreal difIm = cuAmpImag(a) - cuAmpImag(b); + qreal difAbs = difRe*difRe + difIm*difIm; + return TO_CU_AMP(difAbs, 0); + } +}; + +struct functor_absSquared : public thrust::unary_function +{ + __host__ __device__ qreal operator()(cuAmp amp) { + qreal re = cuAmpReal(amp); + qreal im = cuAmpImag(amp); + qreal absSq = re*re + im*im; + return absSq; + } +}; + +struct functor_prodOfAbsSquared : public thrust::binary_function +{ + __host__ __device__ cuAmp operator()(cuAmp probAmp, cuAmp rawAmp) { + qreal re = cuAmpReal(probAmp); + qreal im = cuAmpImag(probAmp); + qreal absSq = re*re + im*im; + cuAmp prod = rawAmp * TO_CU_AMP(absSq, 0); + return prod; + } +}; + +struct functor_scaleInd : public thrust::unary_function +{ + const long long int factor; + functor_scaleInd(long long int _factor) : factor(_factor) {} + + __host__ __device__ long long int operator()(long long int ind) { + return factor * ind; + } +}; + +struct functor_mapIndToAlternatingBlockScaledInd : public thrust::unary_function +{ + long long int blockSize; + long long int factor; + int useOffsetBlocks; + + functor_mapIndToAlternatingBlockScaledInd(long long int _blockSize, long long int _factor, int _useOffsetBlocks) : + blockSize(_blockSize), factor(_factor), useOffsetBlocks(_useOffsetBlocks) {} + + __host__ __device__ long long int operator()(long long int rawInd) { + + long long int blockNum = rawInd / blockSize; + long long int subBlockInd = rawInd % blockSize; + + long long int blockStartInd = (blockNum * 2 * blockSize) + (useOffsetBlocks * blockSize); + long long int blockifiedInd = blockStartInd + subBlockInd; + + long long int finalInd = factor * blockifiedInd; + + return finalInd; + } +}; + struct functor_setWeightedQureg { // functor requires 3 complex scalars @@ -190,6 +266,80 @@ struct functor_setWeightedQureg } }; +struct functor_matrixColumnDotVector : public thrust::unary_function +{ + cuAmp* matrix; // flattened column-wise + cuAmp* vector; + long long int dim; + functor_matrixColumnDotVector(cuAmp* _matrix, cuAmp* _vector, long long int _dim) : + matrix(_matrix), vector(_vector), dim(_dim) {} + + 
__host__ __device__ cuAmp operator()(long long int columnInd) { + + // safe to iterate serially, since #columnInd is exponentially growing + cuAmp sum = TO_CU_AMP(0, 0); + for (long long int rowInd=0; rowInd +{ + DiagonalOp op; + functor_getDiagonalOpAmp(DiagonalOp _op) : op(_op) {} + + __host__ __device__ cuAmp operator()(long long int columnInd) { + + return TO_CU_AMP( + op.deviceOperator.real[columnInd], + op.deviceOperator.imag[columnInd]); + } +}; + +struct functor_multDiagOntoDensMatr +{ + cuAmp* matrix; + DiagonalOp op; + functor_multDiagOntoDensMatr(cuAmp* _matrix, DiagonalOp _op) : matrix(_matrix), op(_op) {} + + __host__ __device__ void operator()(long long int matrixInd) { + + long long int opDim = 1LL << op.numQubits; + long long int opInd = matrixInd % opDim; + cuAmp opAmp = TO_CU_AMP( + op.deviceOperator.real[opInd], + op.deviceOperator.imag[opInd]); + + matrix[matrixInd] = opAmp * matrix[matrixInd]; + } +}; + +struct functor_collapseDensMatrToOutcome +{ + cuAmp* matrix; + int numQubits; + int outcome; + qreal outcomeProb; + int target; + functor_collapseDensMatrToOutcome(cuAmp* _matrix, int _numQubits, int _outcome, qreal _outcomeProb, int _target) : + matrix(_matrix), numQubits(_numQubits), outcome(_outcome), outcomeProb(_outcomeProb), target(_target) {} + + __host__ __device__ void operator()(long long int ind) { + + // obtain bits |...b2...><...b1...| + int b1 = (ind >> target) & 1; + int b2 = (ind >> target >> numQubits) & 1; + + int f1 = !(b1 ^ b2); // true if b1 == b2 + int f2 = !(b1 ^ outcome); // true if b1 == outcome + qreal fac = f1 * f2 / outcomeProb; // 0 or 1/prob + + matrix[ind] = TO_CU_AMP(fac, 0) * matrix[ind]; + } +}; + thrust::device_ptr getStartPtr(Qureg qureg) { return thrust::device_pointer_cast(qureg.deviceCuStateVec); @@ -200,6 +350,65 @@ thrust::device_ptr getEndPtr(Qureg qureg) { return getStartPtr(qureg) + qureg.numAmpsTotal; } +auto iter_strided(Qureg qureg, long long int stride, long long int strideIndInit) { + + // iterates qureg[i * stride] over i, from i = strideIndInit, until exceeding qureg + auto rawIndIter = thrust::make_counting_iterator(strideIndInit); + auto stridedIndIter = thrust::make_transform_iterator(rawIndIter, functor_scaleInd(stride)); + auto stridedAmpIter = thrust::make_permutation_iterator(getStartPtr(qureg), stridedIndIter); + return stridedAmpIter; +} + +auto getStartStridedAmpIter(Qureg qureg, long long int stride) { + + return iter_strided(qureg, stride, 0); +} + +auto getEndStridedAmpIter(Qureg qureg, long long int stride) { + + long long int numIters = ceil(qureg.numAmpsTotal / (qreal) stride); + return iter_strided(qureg, stride, numIters); +} + +auto iter_blockStrided(Qureg qureg, long long int blockSize, int useOffsetBlocks, long long int stride, long long int rawIndInit) { + + // iterates qureg[(i mapped to alternating blocks) * stride] over i, from i = rawIndInit, until exceeding qureg + auto functor = functor_mapIndToAlternatingBlockScaledInd(blockSize, stride, useOffsetBlocks); + auto rawIndIter = thrust::make_counting_iterator(rawIndInit); + auto stridedBlockIndIter = thrust::make_transform_iterator(rawIndIter, functor); + auto stridedBlockAmpIter = thrust::make_permutation_iterator(getStartPtr(qureg), stridedBlockIndIter); + return stridedBlockAmpIter; +} + +auto getStartBlockStridedAmpIter(Qureg qureg, long long int blockSize, int useOffsetBlocks, long long int stride) { + + return iter_blockStrided(qureg, blockSize, useOffsetBlocks, stride, 0); +} + +auto getEndBlockStridedAmpIter(Qureg qureg, long long int 
blockSize, int useOffsetBlocks, long long int stride) { + + long long int numAltBlockAmps = qureg.numAmpsTotal / 2; + long long int numIters = ceil(numAltBlockAmps / (qreal) stride); + return iter_blockStrided(qureg, blockSize, useOffsetBlocks, stride, numIters); +} + +auto iter_diagonalOp(DiagonalOp op, long long int initInd) { + + auto rawIndIter = thrust::make_counting_iterator(initInd); + auto diagAmpIter = thrust::make_transform_iterator(rawIndIter, functor_getDiagonalOpAmp(op)); + return diagAmpIter; +} + +auto getStartDiagonalOpAmpIter(DiagonalOp op) { + + return iter_diagonalOp(op, 0); +} + +auto getEndDiagonalOpAmpIter(DiagonalOp op) { + + return iter_diagonalOp(op, op.numElemsPerChunk); +} + /* @@ -763,10 +972,18 @@ void statevec_applySubDiagonalOp(Qureg qureg, int* targs, SubDiagonalOp op, int void statevec_applyDiagonalOp(Qureg qureg, DiagonalOp op) { + thrust::transform( + getStartDiagonalOpAmpIter(op), getEndDiagonalOpAmpIter(op), + getStartPtr(qureg), getStartPtr(qureg), // both are begin iters + thrust::multiplies()); } void densmatr_applyDiagonalOp(Qureg qureg, DiagonalOp op) { + auto startIndIter = thrust::make_counting_iterator(0); + auto endIndIter = startIndIter + qureg.numAmpsTotal; + auto functor = functor_multDiagOntoDensMatr(qureg.deviceCuStateVec, op); + thrust::for_each(startIndIter, endIndIter, functor); } void statevec_applyPhaseFuncOverrides( @@ -860,13 +1077,26 @@ void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depol) void densmatr_mixDamping(Qureg qureg, int qb, qreal prob) { - cuAmp a = TO_CU_AMP(1, 0); - cuAmp c = TO_CU_AMP(1 - prob, 0); - cuAmp b = TO_CU_AMP(sqrt(1 - prob), 0); - cuAmp elems[] = {a, b, b, c}; + // this function effects damping as a dense two-qubit superoperator + // on a Choi vector, where only 5 of the 16 elements are non-zero. This is + // potentially wasteful, and a bespoke kernel could be faster, leveraging + // QuEST's existing GPU code (or the optimised algorithm in the "distributed" + // manuscript). 
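
(A sketch of where the five non-zero elements come from, under the same
column-stacking convention assumed earlier: amplitude damping has Kraus
operators K0 = [[1, 0], [0, sqrt(1-p)]] and K1 = [[0, sqrt(p)], [0, 0]], so

    sum_k conj(K_k) (x) K_k  =  [ 1      0          0       p  ]
                                [ 0  sqrt(1-p)      0       0  ]
                                [ 0      0      sqrt(1-p)   0  ]
                                [ 0      0          0      1-p ]

matching the matrix {w, z, z, p; z, a, z, z; z, z, a, z; z, z, z, b} built in
the code below.)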
+ + cuAmp w = TO_CU_AMP(1, 0); + cuAmp z = TO_CU_AMP(0, 0); + cuAmp p = TO_CU_AMP(prob, 0); + cuAmp a = TO_CU_AMP(sqrt(1 - prob), 0); + cuAmp b = TO_CU_AMP(1-prob, 0); + cuMatr matr{ + w, z, z, p, + z, a, z, z, + z, z, a, z, + z, z, z, b + }; std::vector targs{qb, qb + qureg.numQubitsRepresented}; - custatevec_applyDiagonal(qureg, {}, targs, elems); + custatevec_applyMatrix(qureg, {}, targs, matr); } @@ -877,65 +1107,141 @@ void densmatr_mixDamping(Qureg qureg, int qb, qreal prob) qreal densmatr_calcTotalProb(Qureg qureg) { - return -1; + long long int diagStride = 1 + (1LL << qureg.numQubitsRepresented); + + cuAmp sum = thrust::reduce( + getStartStridedAmpIter(qureg, diagStride), + getEndStridedAmpIter(qureg, diagStride), + TO_CU_AMP(0, 0)); + + return cuAmpReal(sum); } qreal statevec_calcTotalProb(Qureg qureg) { - return -1; + qreal abs2sum0; + qreal abs2sum1; + int basisBits[] = {0}; + int numBasisBits = 1; + + custatevecAbs2SumOnZBasis( + qureg.cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + &abs2sum0, &abs2sum1, basisBits, numBasisBits); + + return abs2sum0 + abs2sum1; } qreal statevec_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) { - return -1; + // we only ever compute prob(outcome=0) as per the QuEST API's limitation + qreal prob0; + qreal* prob1 = nullptr; + + int basisBits[] = {measureQubit}; + int numBasisBits = 1; + + custatevecAbs2SumOnZBasis( + qureg.cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + &prob0, prob1, basisBits, numBasisBits); + + return (outcome == 0)? prob0 : (1-prob0); } qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) { - return -1; -} + long long int blockSize = (1LL << measureQubit); + long long int diagStride = (1LL << qureg.numQubitsRepresented) + 1; -void statevec_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) -{ -} + // we could set this to outcome to sum the outcome=1 amps directly, but the + // QuEST API specifies that the outcome=0 amps are always summed instead. + int useOffsetBlocks = 0; -void densmatr_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) -{ + cuAmp sum = thrust::reduce( + getStartBlockStridedAmpIter(qureg, blockSize, useOffsetBlocks, diagStride), + getEndBlockStridedAmpIter(qureg, blockSize, useOffsetBlocks, diagStride), + TO_CU_AMP(0, 0)); + + qreal prob0 = cuAmpReal(sum); + return (outcome == 0)? 
prob0 : (1-prob0); } qreal densmatr_calcInnerProduct(Qureg a, Qureg b) { - return -1; + cuAmp prod = thrust::inner_product( + getStartPtr(a), getEndPtr(a), getStartPtr(b), + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfConj()); + + return cuAmpReal(prod); } Complex statevec_calcInnerProduct(Qureg bra, Qureg ket) { - return (Complex) {.real=-1, .imag=-1}; + cuAmp prod = thrust::inner_product( + getStartPtr(bra), getEndPtr(bra), getStartPtr(ket), + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfConj()); + + return (Complex) {.real=cuAmpReal(prod), .imag=cuAmpImag(prod)}; } qreal densmatr_calcFidelity(Qureg qureg, Qureg pureState) { - return -1; + // create iterator f(i) = sum_j qureg_ij pureState_j + auto functor = functor_matrixColumnDotVector( + qureg.deviceCuStateVec, pureState.deviceCuStateVec, // both init iters + pureState.numAmpsTotal); + auto rawIndIter = thrust::make_counting_iterator(0); + auto prodAmpIter = thrust::make_transform_iterator(rawIndIter, functor); + + // compute sum_i conj(pureState_i) * f(i) + cuAmp prod = thrust::inner_product( + getStartPtr(pureState), getEndPtr(pureState), prodAmpIter, + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfConj()); + + qreal prodRe = cuAmpReal(prod); + return prodRe; } qreal densmatr_calcHilbertSchmidtDistance(Qureg a, Qureg b) { - return -1; + cuAmp trace = thrust::inner_product( + getStartPtr(a), getEndPtr(a), getStartPtr(b), + TO_CU_AMP(0,0), thrust::plus(), functor_absOfDif()); + + qreal dist = sqrt(cuAmpReal(trace)); + return dist; } qreal densmatr_calcPurity(Qureg qureg) { - return -1; + qreal sumOfAbsSquared = thrust::transform_reduce( + getStartPtr(qureg), getEndPtr(qureg), functor_absSquared(), + 0., thrust::plus()); + + return sumOfAbsSquared; } Complex statevec_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) { - return (Complex) {.real=-1, .imag=-1}; + cuAmp prod = thrust::inner_product( + getStartPtr(qureg), getEndPtr(qureg), getStartDiagonalOpAmpIter(op), + TO_CU_AMP(0,0), thrust::plus(), functor_prodOfAbsSquared()); + + return (Complex) {.real=cuAmpReal(prod), .imag=cuAmpImag(prod)}; } Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) { - return (Complex) {.real=-1, .imag=-1}; + long long int diagStride = 1 + (1LL << qureg.numQubitsRepresented); + + cuAmp prod = thrust::inner_product( + getStartStridedAmpIter(qureg, diagStride), + getEndStridedAmpIter(qureg, diagStride), + getStartDiagonalOpAmpIter(op), + TO_CU_AMP(0,0), thrust::plus(), thrust::multiplies()); + + return (Complex) {.real=cuAmpReal(prod), .imag=cuAmpImag(prod)}; } @@ -945,11 +1251,24 @@ Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) */ void statevec_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb) -{ +{ + int basisBits[] = {measureQubit}; + + custatevecCollapseOnZBasis( + qureg.cuQuantumHandle, qureg.deviceCuStateVec, + CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, + outcome, basisBits, 1, outcomeProb); } void densmatr_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb) { + auto functor = functor_collapseDensMatrToOutcome( + qureg.deviceCuStateVec, qureg.numQubitsRepresented, + outcome, outcomeProb, measureQubit); + + auto startIndIter = thrust::make_counting_iterator(0); + auto endIndIter = startIndIter + qureg.numAmpsTotal; + thrust::for_each(startIndIter, endIndIter, functor); } diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu index 25f34ecc5..0171d157a 100755 --- a/QuEST/src/GPU/QuEST_gpu.cu +++ 
b/QuEST/src/GPU/QuEST_gpu.cu @@ -1987,131 +1987,6 @@ qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) return outcomeProb; } -// atomicAdd on floats/doubles isn't available on <6 CC devices, so we add it ourselves -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 -#else -static __inline__ __device__ double atomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*) address; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif - -__global__ void statevec_calcProbOfAllOutcomesKernel( - qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits -) { - // each thread handles one amplitude (all amplitudes are involved) - long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x; - if (ampInd >= qureg.numAmpsTotal) return; - - qreal prob = ( - qureg.deviceStateVec.real[ampInd]*qureg.deviceStateVec.real[ampInd] + - qureg.deviceStateVec.imag[ampInd]*qureg.deviceStateVec.imag[ampInd]); - - // each amplitude contributes to one outcome - long long int outcomeInd = 0; - for (int q=0; q>>( - d_outcomeProbs, qureg, d_qubits, numQubits); - - // copy outcomeProbs from GPU memory - cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); - - // free GPU memory - cudaFree(d_qubits); - cudaFree(d_outcomeProbs); -} - -__global__ void densmatr_calcProbOfAllOutcomesKernel( - qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits -) { - // each thread handles one diagonal amplitude - long long int diagInd = blockIdx.x*blockDim.x + threadIdx.x; - long long int numDiags = (1LL << qureg.numQubitsRepresented); - if (diagInd >= numDiags) return; - - long long int flatInd = (1 + numDiags)*diagInd; - qreal prob = qureg.deviceStateVec.real[flatInd]; // im[flatInd] assumed ~ 0 - - // each diagonal amplitude contributes to one outcome - long long int outcomeInd = 0; - for (int q=0; q>>( - d_outcomeProbs, qureg, d_qubits, numQubits); - - // copy outcomeProbs from GPU memory - cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); - - // free GPU memory - cudaFree(d_qubits); - cudaFree(d_outcomeProbs); -} - /** computes Tr(conjTrans(a) b) = sum of (a_ij^* b_ij), which is a real number */ __global__ void densmatr_calcInnerProductKernel( Qureg a, Qureg b, long long int numTermsToSum, qreal* reducedArray diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu index 34caa30b0..c9bfea65f 100644 --- a/QuEST/src/GPU/QuEST_gpu_common.cu +++ b/QuEST/src/GPU/QuEST_gpu_common.cu @@ -260,6 +260,140 @@ void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) { +/* + * CALCULATIONS + */ + +// atomicAdd on floats/doubles isn't available on <6 CC devices, so we add it ourselves +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#else +static __inline__ __device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*) address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while 
(assumed != old);
+
+    return __longlong_as_double(old);
+}
+#endif
+
+__global__ void statevec_calcProbOfAllOutcomesKernel(
+    qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits
+) {
+    // each thread handles one amplitude (all amplitudes are involved)
+    long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
+    if (ampInd >= qureg.numAmpsTotal) return;
+
+    qreal ampRe, ampIm;
+    GET_AMP( ampRe, ampIm, qureg, ampInd );
+    qreal prob = ampRe*ampRe + ampIm*ampIm;
+
+    // each amplitude contributes to one outcome
+    long long int outcomeInd = 0;
+    for (int q=0; q<numQubits; q++)
+        outcomeInd += ((ampInd >> qubits[q]) & 1) << q;
+
+    // each thread atomically writes directly to the global output.
+    // this beat block-hierarchical atomic reductions in both global and shared memory!
+    atomicAdd(&outcomeProbs[outcomeInd], prob);
+}
+
+void statevec_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) {
+
+    // copy qubits to GPU memory
+    int* d_qubits;
+    size_t mem_qubits = numQubits * sizeof *d_qubits;
+    cudaMalloc(&d_qubits, mem_qubits);
+    cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
+
+    // create one thread for every amplitude
+    int numThreadsPerBlock = 128;
+    int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock);
+
+    // create global GPU array for outcomeProbs
+    qreal* d_outcomeProbs;
+    long long int numOutcomes = (1LL << numQubits);
+    size_t mem_outcomeProbs = numOutcomes * sizeof *d_outcomeProbs;
+    cudaMalloc(&d_outcomeProbs, mem_outcomeProbs);
+    cudaMemset(d_outcomeProbs, 0, mem_outcomeProbs);
+
+    // populate per-block subarrays
+    statevec_calcProbOfAllOutcomesKernel<<<numBlocks, numThreadsPerBlock>>>(
+        d_outcomeProbs, qureg, d_qubits, numQubits);
+
+    // copy outcomeProbs from GPU memory
+    cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost);
+
+    // free GPU memory
+    cudaFree(d_qubits);
+    cudaFree(d_outcomeProbs);
+}
+
+__global__ void densmatr_calcProbOfAllOutcomesKernel(
+    qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits
+) {
+    // each thread handles one diagonal amplitude
+    long long int diagInd = blockIdx.x*blockDim.x + threadIdx.x;
+    long long int numDiags = (1LL << qureg.numQubitsRepresented);
+    if (diagInd >= numDiags) return;
+
+    long long int flatInd = (1 + numDiags)*diagInd;
+
+    qreal ampRe, ampIm;
+    GET_AMP( ampRe, ampIm, qureg, flatInd );
+    qreal prob = ampRe + 0*ampIm; // ampIm assumed ~ 0, silence unused warning
+
+    // each diagonal amplitude contributes to one outcome
+    long long int outcomeInd = 0;
+    for (int q=0; q<numQubits; q++)
+        outcomeInd += ((diagInd >> qubits[q]) & 1) << q;
+
+    // each thread atomically writes directly to the global output.
+    // this beat block-hierarchical atomic reductions in both global and shared memory!
+ atomicAdd(&outcomeProbs[outcomeInd], prob); +} + +void densmatr_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) { + + // copy qubits to GPU memory + int* d_qubits; + size_t mem_qubits = numQubits * sizeof *d_qubits; + cudaMalloc(&d_qubits, mem_qubits); + cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); + + // create global array, with per-block subarrays + int numThreadsPerBlock = 128; + int numDiags = (1LL << qureg.numQubitsRepresented); + int numBlocks = ceil(numDiags / (qreal) numThreadsPerBlock); + + // create global GPU array for outcomeProbs + qreal* d_outcomeProbs; + long long int numOutcomes = (1LL << numQubits); + size_t mem_outcomeProbs = numOutcomes * sizeof *d_outcomeProbs; + cudaMalloc(&d_outcomeProbs, mem_outcomeProbs); + cudaMemset(d_outcomeProbs, 0, mem_outcomeProbs); + + // populate per-block subarrays + densmatr_calcProbOfAllOutcomesKernel<<>>( + d_outcomeProbs, qureg, d_qubits, numQubits); + + // copy outcomeProbs from GPU memory + cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost); + + // free GPU memory + cudaFree(d_qubits); + cudaFree(d_outcomeProbs); +} + + + /* * DECOHERENCE */ From 7e2362a5964a2b3c554e00ca95305779be822a79 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Wed, 20 Sep 2023 13:48:22 +0900 Subject: [PATCH 12/15] added phase functions --- QuEST/src/GPU/QuEST_cuQuantum.cu | 25 -- QuEST/src/GPU/QuEST_gpu.cu | 452 ------------------------- QuEST/src/GPU/QuEST_gpu_common.cu | 543 +++++++++++++++++++++++++++++- 3 files changed, 537 insertions(+), 483 deletions(-) diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu index 0557b5b16..7422345ca 100644 --- a/QuEST/src/GPU/QuEST_cuQuantum.cu +++ b/QuEST/src/GPU/QuEST_cuQuantum.cu @@ -857,7 +857,6 @@ void statevec_multiControlledPhaseShift(Qureg qureg, int *controlQubits, int num std::vector targs{controlQubits[0]}; std::vector ctrls(controlQubits + 1, controlQubits + numControlQubits); custatevec_applyDiagonal(qureg, ctrls, targs, elems); - } void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle) @@ -986,30 +985,6 @@ void densmatr_applyDiagonalOp(Qureg qureg, DiagonalOp op) thrust::for_each(startIndIter, endIndIter, functor); } -void statevec_applyPhaseFuncOverrides( - Qureg qureg, int* qubits, int numQubits, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int numTerms, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - int conj) -{ -} - -void statevec_applyMultiVarPhaseFuncOverrides( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int* numTermsPerReg, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - int conj) -{ -} - -void statevec_applyParamNamedPhaseFuncOverrides( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - enum phaseFunc phaseFuncName, qreal* params, int numParams, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - int conj) -{ -} - /* diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu index 0171d157a..87990a312 100755 --- a/QuEST/src/GPU/QuEST_gpu.cu +++ b/QuEST/src/GPU/QuEST_gpu.cu @@ -3189,458 +3189,6 @@ Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) { return expecVal; } -__global__ void statevec_applyPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int numQubits, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int 
numTerms, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - // determine phase index of {qubits} - long long int phaseInd = 0LL; - if (encoding == UNSIGNED) { - for (int q=0; q>>( - qureg, d_qubits, numQubits, encoding, - d_coeffs, d_exponents, numTerms, - d_overrideInds, d_overridePhases, numOverrides, - conj); - - // cleanup device memory - cudaFree(d_qubits); - cudaFree(d_coeffs); - cudaFree(d_exponents); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); -} - -__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int* numTermsPerReg, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - long long int *phaseInds, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - /* - * each thread needs to write to a local: - * long long int phaseInds[numRegs]; - * but instead has access to shared array phaseInds, with below stride and offset - */ - size_t stride = gridDim.x*blockDim.x; - size_t offset = blockIdx.x*blockDim.x + threadIdx.x; - - // determine phase indices - int flatInd = 0; - if (encoding == UNSIGNED) { - for (int r=0; r>>( - qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, - d_coeffs, d_exponents, d_numTermsPerReg, - d_overrideInds, d_overridePhases, numOverrides, - d_phaseInds, - conj); - - // free device memory - cudaFree(d_qubits); - cudaFree(d_coeffs); - cudaFree(d_exponents); - cudaFree(d_numQubitsPerReg); - cudaFree(d_numTermsPerReg); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); - cudaFree(d_phaseInds); -} - -__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - enum phaseFunc phaseFuncName, qreal* params, int numParams, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - long long int* phaseInds, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - /* - * each thread needs to write to a local: - * long long int phaseInds[numRegs]; - * but instead has access to shared array phaseInds, with below stride and offset - */ - size_t stride = gridDim.x*blockDim.x; - size_t offset = blockIdx.x*blockDim.x + threadIdx.x; - - // determine phase indices - if (encoding == UNSIGNED) { - int flatInd = 0; - for (int r=0; r 0) cudaMalloc(&d_params, mem_params); - - // copy function args into GPU memory - cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); - cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice); - cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice); - cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice); - if (numParams > 0) - cudaMemcpy(d_params, params, mem_params, cudaMemcpyHostToDevice); - - int 
threadsPerCUDABlock = 128; - int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock); - - // allocate thread-local working space {phaseInds} - long long int *d_phaseInds; - size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks; - cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds); - - // call kernel - statevec_applyParamNamedPhaseFuncOverridesKernel<<>>( - qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, - phaseFuncName, d_params, numParams, - d_overrideInds, d_overridePhases, numOverrides, - d_phaseInds, - conj); - - // free device memory - cudaFree(d_qubits); - cudaFree(d_numQubitsPerReg); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); - cudaFree(d_phaseInds); - if (numParams > 0) - cudaFree(d_params); -} - #ifdef __cplusplus diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu index c9bfea65f..0e984d800 100644 --- a/QuEST/src/GPU/QuEST_gpu_common.cu +++ b/QuEST/src/GPU/QuEST_gpu_common.cu @@ -36,6 +36,10 @@ extern "C" { const int NUM_THREADS_PER_BLOCK = 128; +__forceinline__ __device__ long long int getThreadInd() { + return blockIdx.x*blockDim.x + threadIdx.x; +} + /* @@ -159,7 +163,7 @@ void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) { __global__ void densmatr_initPureStateKernel(Qureg dens, Qureg pure) { - long long int pureInd = blockIdx.x*blockDim.x + threadIdx.x; + long long int pureInd = getThreadInd(); if (pureInd >= pure.numAmpsPerChunk) return; qreal aRe, aIm; @@ -188,7 +192,7 @@ void densmatr_initPureState(Qureg dens, Qureg pure) __global__ void densmatr_setQuregToPauliHamilKernel( Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms ) { - long long int n = blockIdx.x*blockDim.x + threadIdx.x; + long long int n = getThreadInd(); if (n>=qureg.numAmpsPerChunk) return; // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j] @@ -288,7 +292,7 @@ __global__ void statevec_calcProbOfAllOutcomesKernel( qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits ) { // each thread handles one amplitude (all amplitudes are involved) - long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x; + long long int ampInd = getThreadInd(); if (ampInd >= qureg.numAmpsTotal) return; qreal ampRe, ampIm; @@ -340,7 +344,7 @@ __global__ void densmatr_calcProbOfAllOutcomesKernel( qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits ) { // each thread handles one diagonal amplitude - long long int diagInd = blockIdx.x*blockDim.x + threadIdx.x; + long long int diagInd = getThreadInd(); long long int numDiags = (1LL << qureg.numQubitsRepresented); if (diagInd >= numDiags) return; @@ -403,7 +407,7 @@ __global__ void densmatr_mixTwoQubitDepolarisingKernel( long long int part3, long long int part4, long long int part5, long long int rowCol1, long long int rowCol2) { - long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x; + long long int scanInd = getThreadInd(); if (scanInd >= qureg.numAmpsPerChunk) return; long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4); @@ -519,7 +523,7 @@ __global__ void agnostic_initDiagonalOpFromPauliHamilKernel( DiagonalOp op, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms ) { // each thread processes one diagonal element - long long int elemInd = blockIdx.x*blockDim.x + threadIdx.x; + long long int elemInd = getThreadInd(); if (elemInd >= op.numElemsPerChunk) return; @@ -592,6 +596,533 @@ void 
agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* r +/* + * PHASE FUNCTIONS + */ + +__forceinline__ __device__ int getBit(long long int num, int ind) { + return (num >> ind) & 1; +} + +__forceinline__ __device__ void applyPhaseToAmp(Qureg qureg, long long int index, qreal phase, int conj) +{ + // negate phase to conjugate operator + phase *= (1 - 2*conj); + qreal c = cos(phase); + qreal s = sin(phase); + + // modify amp to amp * exp(i phase) + qreal re, im; + GET_AMP( re, im, qureg, index ); + SET_AMP( qureg, index, re*c - im*s, re*s + im*c ); +} + +__forceinline__ __device__ void getMultiRegPhaseIndOffsets(size_t* stride, size_t* offset) +{ + // determine the phaseInds tuple relevant for this thread + *stride = gridDim.x*blockDim.x; + *offset = blockIdx.x*blockDim.x + threadIdx.x; +} + +__forceinline__ __device__ long long int getPhaseInd(long long int globalAmpInd, int* qubits, int numQubits, enum bitEncoding encoding) +{ + long long int phaseInd = 0LL; + + if (encoding == UNSIGNED) { + for (int q=0; q=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, so it's just local index) + long long int globalAmpInd = index; + + // determine phaseInd from the qubit encoding + long long int phaseInd = getPhaseInd(globalAmpInd, qubits, numQubits, encoding); + + // determine if this phase index has an overriden value (i < numOverrides) + int overInd = getIndOfPhaseOverride(phaseInd, overrideInds, numOverrides); + + // determine phase from {coeffs}, {exponents} (unless overriden) + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + else + for (int t=0; t>>( + qureg, d_qubits, numQubits, encoding, + d_coeffs, d_exponents, numTerms, + d_overrideInds, d_overridePhases, numOverrides, + conj); + + // cleanup device memory + cudaFree(d_qubits); + cudaFree(d_coeffs); + cudaFree(d_exponents); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); +} + +__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + qreal* coeffs, qreal* exponents, int* numTermsPerReg, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + long long int *phaseInds, + int conj +) { + long long int index = getThreadInd(); + if (index>=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, so it's just local index) + long long int globalAmpInd = index; + + // determine phase indices (each thread has phaseInds[numRegs] sub-array) + setMultiRegPhaseInds(phaseInds, globalAmpInd, qubits, numQubitsPerReg, numRegs, encoding); + + // determine if this phase index has an overriden value + long long int overInd = getIndOfMultiRegPhaseOverride(phaseInds, numRegs, overrideInds, numOverrides); + + // either use the overriden phase... 
+ qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + + else { + // else determine the phaseInds tuple relevant for this thread + size_t stride, offset; + getMultiRegPhaseIndOffsets(&stride, &offset); + + // and compute the phase from coeffs and exponents + long long int flatInd = 0; + for (int r=0; r>>( + qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, + d_coeffs, d_exponents, d_numTermsPerReg, + d_overrideInds, d_overridePhases, numOverrides, + d_phaseInds, + conj); + + // free device memory + cudaFree(d_qubits); + cudaFree(d_coeffs); + cudaFree(d_exponents); + cudaFree(d_numQubitsPerReg); + cudaFree(d_numTermsPerReg); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); + cudaFree(d_phaseInds); +} + +__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + enum phaseFunc phaseFuncName, qreal* params, int numParams, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + long long int* phaseInds, + int conj +) { + long long int index = getThreadInd(); + if (index>=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, so it's just local index) + long long int globalAmpInd = index; + + // determine phase indices (each thread has phaseInds[numRegs] sub-array) + setMultiRegPhaseInds(phaseInds, globalAmpInd, qubits, numQubitsPerReg, numRegs, encoding); + + // determine if this phase index has an overriden value + long long int overInd = getIndOfMultiRegPhaseOverride(phaseInds, numRegs, overrideInds, numOverrides); + + // determine the phase, or the overriden one + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + else + phase = getPhaseFromParamNamedFunc(phaseInds, numRegs, phaseFuncName, params, numParams); + + // modify amp to amp * exp(i phase) + applyPhaseToAmp(qureg, index, phase, conj); +} + +void statevec_applyParamNamedPhaseFuncOverrides( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + enum phaseFunc phaseFuncName, qreal* params, int numParams, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + int conj +) { + // determine size of arrays, for cloning into GPU memory + size_t mem_numQubitsPerReg = numRegs * sizeof *numQubitsPerReg; + size_t mem_overridePhases = numOverrides * sizeof *overridePhases; + size_t mem_overrideInds = numOverrides * numRegs * sizeof *overrideInds; + size_t mem_params = numParams * sizeof *params; + size_t mem_qubits = 0; + for (int r=0; r 0) cudaMalloc(&d_params, mem_params); + + // copy function args into GPU memory + cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); + cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice); + cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice); + cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice); + if (numParams > 0) + cudaMemcpy(d_params, params, mem_params, cudaMemcpyHostToDevice); + + int threadsPerCUDABlock = 128; + int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock); + + // allocate thread-local working space {phaseInds} + long long int *d_phaseInds; + size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks; + cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds); + + // call kernel + statevec_applyParamNamedPhaseFuncOverridesKernel<<>>( + qureg, d_qubits, 
d_numQubitsPerReg, numRegs, encoding, + phaseFuncName, d_params, numParams, + d_overrideInds, d_overridePhases, numOverrides, + d_phaseInds, + conj); + + // free device memory + cudaFree(d_qubits); + cudaFree(d_numQubitsPerReg); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); + cudaFree(d_phaseInds); + if (numParams > 0) + cudaFree(d_params); +} + + + #ifdef __cplusplus } #endif From f7c6ee6d8b07bc1fb9b6a95172e424761dc001fb Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Wed, 20 Sep 2023 16:57:57 +0900 Subject: [PATCH 13/15] fixed automatic workspace memory --- QuEST/include/QuEST.h | 18 ++-- QuEST/include/QuEST_precision.h | 7 +- QuEST/src/GPU/QuEST_cuQuantum.cu | 165 +++++++++++++++++++------------ 3 files changed, 119 insertions(+), 71 deletions(-) diff --git a/QuEST/include/QuEST.h b/QuEST/include/QuEST.h index 26be2b94a..ba89ad38d 100644 --- a/QuEST/include/QuEST.h +++ b/QuEST/include/QuEST.h @@ -38,9 +38,15 @@ // ensure custatevecHandle_t is defined, even if no GPU # ifdef USE_CUQUANTUM - # include +# include +typedef struct CuQuantumConfig { + cudaMemPool_t cuMemPool; + cudaStream_t cuStream; + custatevecHandle_t cuQuantumHandle; + custatevecDeviceMemHandler_t cuMemHandler; +} CuQuantumConfig; # else - # define custatevecHandle_t void* +# define CuQuantumConfig void* # endif @@ -379,10 +385,10 @@ typedef struct Qureg //! Storage for reduction of probabilities on GPU qreal *firstLevelReduction, *secondLevelReduction; - //! Storage for wavefunction amplitues and config (copy of QuESTEnv's handle) in the cuQuantum version + //! Storage for wavefunction amplitues and config (copy of QuESTEnv's handle) in cuQuantum deployment cuAmp* cuStateVec; cuAmp* deviceCuStateVec; - custatevecHandle_t cuQuantumHandle; + CuQuantumConfig* cuConfig; //! Storage for generated QASM output QASMLogger* qasmLog; @@ -403,8 +409,8 @@ typedef struct QuESTEnv unsigned long int* seeds; int numSeeds; - // handle to cuQuantum (specifically cuStateVec) used only cuQuantum deployment mode (otherwise is void*) - custatevecHandle_t cuQuantumHandle; + // a copy of the QuESTEnv's config, used only in cuQuantum deployment + CuQuantumConfig* cuConfig; } QuESTEnv; diff --git a/QuEST/include/QuEST_precision.h b/QuEST/include/QuEST_precision.h index b5cc76dc0..244460b78 100644 --- a/QuEST/include/QuEST_precision.h +++ b/QuEST/include/QuEST_precision.h @@ -15,7 +15,11 @@ # include -// define CUDA complex types as void if not using cuQuantum + +// define CUDA complex types as void if not using cuQuantum. +// note we used cuComplex.h for complex numbers, in lieu of +// Thrust's complex, so that the QuEST.h header can +// always be compiled with C99, rather than C++14. # ifdef USE_CUQUANTUM # include # else @@ -23,6 +27,7 @@ # define cuDoubleComplex void # endif + // set default double precision if not set during compilation # ifndef QuEST_PREC # define QuEST_PREC 2 diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu index 7422345ca..4f81f4659 100644 --- a/QuEST/src/GPU/QuEST_cuQuantum.cu +++ b/QuEST/src/GPU/QuEST_cuQuantum.cu @@ -56,8 +56,6 @@ # define CU_AMP_IN_MATRIX_PREC void // invalid #endif - - // convenient operator overloads for cuAmp, for doing complex artihmetic. // some of these are defined to be used by Thrust's backend, because we // avoided Thrust's complex (see QuEST_precision.h for explanation). 
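For context on the comment above: overloads like these are what let Thrust reduce directly over raw `cuAmp` values without adopting `thrust::complex`. Below is a minimal, hypothetical sketch (not the patch's actual definitions), assuming double precision (`QuEST_PREC=2`) so that `cuAmp` aliases `cuDoubleComplex`; `cuCadd`/`cuCmul` are standard host-and-device helpers from `cuComplex.h`.

```cpp
// hypothetical sketch only, not the patch's definitions
#include <cuComplex.h>

typedef cuDoubleComplex cuAmp;   // assumes QuEST_PREC=2

__host__ __device__ inline cuAmp operator+(cuAmp a, cuAmp b) {
    return cuCadd(a, b);         // element-wise (re, im) sum
}

__host__ __device__ inline cuAmp operator*(cuAmp a, cuAmp b) {
    return cuCmul(a, b);         // full complex product
}
```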
@@ -124,6 +122,89 @@ std::vector<int> getIndsFromMask(long long int mask, int numBits) {
 
 
 
+/*
+ * CUQUANTUM MEMORY MANAGEMENT
+ */
+
+int GPUSupportsMemPools() {
+
+    // consult only the first device (guaranteed already to exist)
+    int deviceId;
+    cudaGetDevice(&deviceId);
+
+    int supports;
+    cudaDeviceGetAttribute(&supports, cudaDevAttrMemoryPoolsSupported, deviceId);
+    return supports;
+}
+
+int memPoolAlloc(void* ctx, void** ptr, size_t size, cudaStream_t stream) {
+    cudaMemPool_t& pool = *static_cast<cudaMemPool_t*>(ctx);
+    return cudaMallocFromPoolAsync(ptr, size, pool, stream);
+}
+int memPoolFree(void* ctx, void* ptr, size_t size, cudaStream_t stream) {
+    return cudaFreeAsync(ptr, stream);
+}
+
+CuQuantumConfig* createCuConfig() {
+
+    // create cuQuantumConfig in heap memory
+    CuQuantumConfig* config = (CuQuantumConfig*) malloc(sizeof(CuQuantumConfig));
+
+    // bind the device's existing memory pool (needs no later manual freeing)
+    int deviceId;
+    cudaGetDevice(&deviceId);
+    cudaDeviceGetMemPool(&(config->cuMemPool), deviceId);
+
+    // create new custatevecHandle_t
+    custatevecCreate(&(config->cuQuantumHandle));
+
+    // create new cudaStream_t
+    cudaStreamCreate(&(config->cuStream));
+
+    // custatevecDeviceMemHandler_t needs no explicit creation
+
+    // return config's heap pointer
+    return config;
+}
+
+void initCuConfig(CuQuantumConfig* config) {
+
+    // get the existing memPool threshold, above which memory gets freed at every stream synch
+    size_t currMaxMem;
+    cudaMemPoolGetAttribute(config->cuMemPool, cudaMemPoolAttrReleaseThreshold, &currMaxMem);
+
+    // if the memPool threshold is smaller than 1 MiB = 16 qubits, extend it
+    size_t desiredMaxMem = 16*(1LL<<16);
+    if (currMaxMem < desiredMaxMem)
+        cudaMemPoolSetAttribute(config->cuMemPool, cudaMemPoolAttrReleaseThreshold, &desiredMaxMem);
+
+    // bind mempool to deviceMemHandler
+    config->cuMemHandler.ctx = &(config->cuMemPool);
+    config->cuMemHandler.device_alloc = memPoolAlloc;
+    config->cuMemHandler.device_free = memPoolFree;
+    strcpy(config->cuMemHandler.name, "mempool");
+
+    // bind deviceMemHandler to cuQuantum
+    custatevecSetDeviceMemHandler(config->cuQuantumHandle, &(config->cuMemHandler));
+
+    // bind stream to cuQuantum
+    custatevecSetStream(config->cuQuantumHandle, config->cuStream);
+}
+
+void destroyCuConfig(CuQuantumConfig* config) {
+
+    // free config's heap attributes
+    cudaStreamDestroy(config->cuStream);
+    custatevecDestroy(config->cuQuantumHandle);
+
+    // cuMemPool needs no freeing; it already existed
+    // cuMemHandler needs no freeing; it is a struct inside config's heap memory
+
+    // free config's heap memory
+    free(config);
+}
+
+
+
 /*
  * CUQUANTUM WRAPPERS (to reduce boilerplate)
  */
@@ -141,7 +222,7 @@ void custatevec_applyMatrix(Qureg qureg, std::vector<int> ctrls, std::vector<int> targs,
 
     custatevecApplyMatrix(
-        qureg.cuQuantumHandle,
+        qureg.cuConfig->cuQuantumHandle,
         qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec,
         matr.data(), CU_AMP_IN_MATRIX_PREC, CUSTATEVEC_MATRIX_LAYOUT_ROW, adj,
         targs.data(), targs.size(),
@@ -166,7 +247,7 @@ void custatevec_applyDiagonal(Qureg qureg, std::vector<int> ctrls, std::vector<int> targs,
 
     custatevecApplyGeneralizedPermutationMatrix(
-        qureg.cuQuantumHandle,
+        qureg.cuConfig->cuQuantumHandle,
         qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec,
         perm, elems, CU_AMP_IN_MATRIX_PREC, adj,
         targs.data(), targs.size(),
@@ -424,50 +505,6 @@ extern "C" {
  * ENVIRONMENT MANAGEMENT
  */
 
-int GPUSupportsMemPools() {
-
-    // consult only the first device (garuanteed already to exist)
-    int device = 0;
-
-    int supports;
-    cudaDeviceGetAttribute(&supports, cudaDevAttrMemoryPoolsSupported, device);
-    return supports;
-}
-
-int memPoolAlloc(void* ctx, void** ptr, size_t size,
cudaStream_t stream) { - cudaMemPool_t pool = * reinterpret_cast(ctx); - return cudaMallocFromPoolAsync(ptr, size, pool, stream); -} -int memPoolFree(void* ctx, void* ptr, size_t size, cudaStream_t stream) { - return cudaFreeAsync(ptr, stream); -} - -void setupAutoWorkspaces(custatevecHandle_t cuQuantumHandle) { - - // get the current (device's default) stream-ordered memory pool (assuming single GPU) - int device = 0; - cudaMemPool_t memPool; - cudaDeviceGetMemPool(&memPool, device); - - // get its current memory threshold, above which memory gets freed at every stream synch - size_t currMaxMem; - cudaMemPoolGetAttribute(memPool, cudaMemPoolAttrReleaseThreshold, &currMaxMem); - - // if it's smaller than 1 MiB = 16 qubits, extend it - size_t desiredMaxMem = 16*(1LL<<16); - if (currMaxMem < desiredMaxMem) - cudaMemPoolSetAttribute(memPool, cudaMemPoolAttrReleaseThreshold, &desiredMaxMem); - - // create a mem handler around the mem pool - custatevecDeviceMemHandler_t memHandler; - memHandler.ctx = reinterpret_cast(&memPool); - memHandler.device_alloc = memPoolAlloc; - memHandler.device_free = memPoolFree; - - // set cuQuantum to use this handler and pool, to automate workspace memory management - custatevecSetDeviceMemHandler(cuQuantumHandle, &memHandler); -} - QuESTEnv createQuESTEnv(void) { validateGPUExists(GPUExists(), __func__); validateGPUIsCuQuantumCompatible(GPUSupportsMemPools(),__func__); @@ -480,10 +517,10 @@ QuESTEnv createQuESTEnv(void) { env.numSeeds = 0; seedQuESTDefault(&env); - // prepare cuQuantum - custatevecCreate(&env.cuQuantumHandle); - setupAutoWorkspaces(env.cuQuantumHandle); - + // prepare cuQuantum with automatic workspaces + env.cuConfig = createCuConfig(); + initCuConfig(env.cuConfig); + return env; } @@ -491,7 +528,7 @@ void destroyQuESTEnv(QuESTEnv env){ free(env.seeds); // finalise cuQuantum - custatevecDestroy(env.cuQuantumHandle); + destroyCuConfig(env.cuConfig); } @@ -511,8 +548,8 @@ void statevec_createQureg(Qureg *qureg, int numQubits, QuESTEnv env) qureg->numChunks = 1; qureg->isDensityMatrix = 0; - // copy env's cuQuantum handle to qureg - qureg->cuQuantumHandle = env.cuQuantumHandle; + // copy env's cuQuantum config handle + qureg->cuConfig = env.cuConfig; // allocate user-facing CPU memory qureg->stateVec.real = (qreal*) malloc(numAmps * sizeof(qureg->stateVec.real)); @@ -627,7 +664,7 @@ void densmatr_initPlusState(Qureg qureg) void statevec_initZeroState(Qureg qureg) { custatevecInitializeStateVector( - qureg.cuQuantumHandle, + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, @@ -648,7 +685,7 @@ void statevec_initBlankState(Qureg qureg) void statevec_initPlusState(Qureg qureg) { custatevecInitializeStateVector( - qureg.cuQuantumHandle, + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, @@ -768,7 +805,7 @@ void statevec_multiControlledUnitary(Qureg qureg, long long int ctrlQubitsMask, ctrlVals[i] = !(ctrlFlipMask & (1LL<cuQuantumHandle, qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, toCuMatr(u).data(), CU_AMP_IN_MATRIX_PREC, CUSTATEVEC_MATRIX_LAYOUT_ROW, 0, targs, 1, ctrlInds.data(), ctrlVals.data(), ctrlInds.size(), @@ -866,7 +903,7 @@ void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle) std::vector paulis(targs.size(), CUSTATEVEC_PAULI_Z); custatevecApplyPauliRotation( - qureg.cuQuantumHandle, qureg.deviceCuStateVec, + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, 
CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, theta, paulis.data(), targs.data(), targs.size(), nullptr, nullptr, 0); @@ -880,7 +917,7 @@ void statevec_multiControlledMultiRotateZ(Qureg qureg, long long int ctrlMask, l std::vector paulis(targs.size(), CUSTATEVEC_PAULI_Z); custatevecApplyPauliRotation( - qureg.cuQuantumHandle, qureg.deviceCuStateVec, + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, theta, paulis.data(), targs.data(), targs.size(), ctrls.data(), nullptr, ctrls.size()); @@ -912,7 +949,7 @@ void statevec_swapQubitAmps(Qureg qureg, int qb1, int qb2) int numPairs = 1; custatevecSwapIndexBits( - qureg.cuQuantumHandle, qureg.deviceCuStateVec, + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, targPairs, numPairs, nullptr, nullptr, 0); @@ -1100,7 +1137,7 @@ qreal statevec_calcTotalProb(Qureg qureg) int numBasisBits = 1; custatevecAbs2SumOnZBasis( - qureg.cuQuantumHandle, qureg.deviceCuStateVec, + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, &abs2sum0, &abs2sum1, basisBits, numBasisBits); @@ -1117,7 +1154,7 @@ qreal statevec_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome) int numBasisBits = 1; custatevecAbs2SumOnZBasis( - qureg.cuQuantumHandle, qureg.deviceCuStateVec, + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, &prob0, prob1, basisBits, numBasisBits); @@ -1230,7 +1267,7 @@ void statevec_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outc int basisBits[] = {measureQubit}; custatevecCollapseOnZBasis( - qureg.cuQuantumHandle, qureg.deviceCuStateVec, + qureg.cuConfig->cuQuantumHandle, qureg.deviceCuStateVec, CU_AMP_IN_STATE_PREC, qureg.numQubitsInStateVec, outcome, basisBits, 1, outcomeProb); } From c8d9d7702df5da59e369ea2ab2fa54b25c7eb6b4 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Wed, 20 Sep 2023 10:02:02 +0100 Subject: [PATCH 14/15] moved reportStateToScreen to common --- QuEST/src/GPU/QuEST_cuQuantum.cu | 10 ---------- QuEST/src/GPU/QuEST_gpu.cu | 29 ----------------------------- QuEST/src/GPU/QuEST_gpu_common.cu | 30 ++++++++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 39 deletions(-) diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu index 4f81f4659..f26c563f5 100644 --- a/QuEST/src/GPU/QuEST_cuQuantum.cu +++ b/QuEST/src/GPU/QuEST_cuQuantum.cu @@ -737,16 +737,6 @@ void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg q -/* - * DEBUG - */ - -void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank) -{ -} - - - /* * OPERATORS */ diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu index 87990a312..e2aaaefda 100755 --- a/QuEST/src/GPU/QuEST_gpu.cu +++ b/QuEST/src/GPU/QuEST_gpu.cu @@ -358,35 +358,6 @@ void statevec_copySubstateFromGPU(Qureg qureg, long long int startInd, long long if (DEBUG) printf("Finished copying data from GPU\n"); } -/** Print the current state vector of probability amplitudes for a set of qubits to standard out. - For debugging purposes. Each rank should print output serially. 
Only print output for systems <= 5 qubits - */ -void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank){ - long long int index; - int rank; - copyStateFromGPU(qureg); - if (qureg.numQubitsInStateVec<=5){ - for (rank=0; rank 5) { + printf("State reporting disabled for >5 qubits"); + return; + } + + copyStateFromGPU(qureg); + + // distributed GPU not yet supported + if (reportRank != 0) + return; + + printf("Reporting state from rank %d [\n", qureg.chunkId); + printf("real, imag\n"); + + for(long long int i=0; i Date: Wed, 20 Sep 2023 18:46:59 +0900 Subject: [PATCH 15/15] added cuQuantum to doc --- README.md | 14 +++++++++----- examples/README.md | 17 ++++++++++++----- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 9b8da9068..28d7272ac 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,10 @@ [![MIT license](https://img.shields.io/badge/license-MIT-lightgrey.svg)](LICENCE.txt) -The **Quantum Exact Simulation Toolkit** is a high performance simulator of quantum circuits, state-vectors and density matrices. QuEST uses **multithreading**, **GPU acceleration** and **distribution** to run lightning first on laptops, desktops and networked supercomputers. QuEST *just works*; it is stand-alone, requires no installation, and trivial to compile and getting running. +The **Quantum Exact Simulation Toolkit** is a high performance simulator of quantum circuits, state-vectors and density matrices. QuEST uses **multithreading**, **GPU acceleration** and **distribution** to run lightning first on laptops, desktops and networked supercomputers. +QuEST *just works*; it is stand-alone, requires no installation, and is trivial to compile and run. +QuEST hybridises [OpenMP](https://www.openmp.org/) and [MPI](https://www.mpi-forum.org/) with _huge_ compiler support to run on all sorts of multicore, multi-CPU and distributed hardware, uses [HIP](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.3) to run on AMD GPUs, integrates [cuQuantum](https://developer.nvidia.com/cuquantum-sdk) and [Thrust](https://developer.nvidia.com/thrust) for cutting-edge performance on modern NVIDIA GPUs, and has a custom kernel backend to run on older CUDA-compatible GPUs. +And it hides these deployment modes behind a single, seamless interface. 
[![Languages](https://img.shields.io/badge/C-99-ff69b4.svg)](http://www.open-std.org/jtc1/sc22/wg14/www/standards.html#9899) [![Languages](https://img.shields.io/badge/C++-11-ff69b4.svg)](https://isocpp.org/wiki/faq/cpp11) @@ -36,16 +39,17 @@ The **Quantum Exact Simulation Toolkit** is a high performance simulator of quan ![OS](https://img.shields.io/badge/os-Linux-9cbd3c.svg) ![OS](https://img.shields.io/badge/os-Windows-9cbd3c.svg) [![Platforms](https://img.shields.io/badge/multithreaded-OpenMP-6699ff.svg)](https://www.openmp.org/) +[![Platforms](https://img.shields.io/badge/distributed-MPI-6699ff.svg)](https://www.mpi-forum.org/) [![Platforms](https://img.shields.io/badge/GPU-CUDA-6699ff.svg)](https://developer.nvidia.com/cuda-zone) [![Platforms](https://img.shields.io/badge/GPU-AMD-6699ff.svg)](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.3) -[![Platforms](https://img.shields.io/badge/distributed-MPI-6699ff.svg)](https://www.mpi-forum.org/) +[![Platforms](https://img.shields.io/badge/GPU-cuQuantum-6699ff.svg)](https://developer.nvidia.com/cuquantum-sdk) QuEST is developed by the [QTechTheory](http://qtechtheory.org/) group at the University of Oxford, and [these authors](https://github.com/QuEST-Kit/QuEST/blob/master/AUTHORS.txt). To learn more: - see the [tutorial](https://github.com/QuEST-Kit/QuEST/blob/master/examples/README.md) - view the [documentation](https://quest-kit.github.io/QuEST/modules.html) -- visit our [website](https://quest.qtechtheory.org/) +- visit the [website](https://quest.qtechtheory.org/) - see some [examples](https://github.com/QuEST-Kit/QuEST/blob/master/examples/) -- read our [whitepaper](https://www.nature.com/articles/s41598-019-47174-9), which featured in Scientific Report's [Top 100 in Physics](https://www.nature.com/collections/ecehgdfcba/) :trophy: +- read the [whitepaper](https://www.nature.com/articles/s41598-019-47174-9), which featured in Scientific Report's [Top 100 in Physics](https://www.nature.com/collections/ecehgdfcba/) :trophy: [![DOI](https://img.shields.io/badge/DOI-10.1038%2Fs41598--019--47174--9-yellow.svg)](https://doi.org/10.1038/s41598-019-47174-9) [![Email](https://img.shields.io/badge/email-quest@materials.ox.ac.uk-red.svg)](mailto:quest@materials.ox.ac.uk) @@ -131,7 +135,7 @@ QuEST supports: - [testing utilities](https://quest-kit.github.io/QuEST/group__testutilities.html) -> **For developers**: To regenerate the doc after making changes to the code, run `doxygen doxyconfig/config` in the root directory. This will generate documentation in `Doxygen_doc/html`, the contents of which should be copied into [`docs/`](/docs/) +> **For developers**: QuEST's doc is automatically regenerated when the `master` branch is updated via [Github Actions](https://github.com/features/actions). To locally regenerate the doc, run `doxygen doxyconfig/config` in the root directory, which generates html documentation in `Doxygen_doc/html`. --------------------------------- diff --git a/examples/README.md b/examples/README.md index 019353e17..1252666dd 100644 --- a/examples/README.md +++ b/examples/README.md @@ -7,7 +7,6 @@ Tutorial - [Running](#running) - [Testing](#testing) -> Before getting started, it is best to [test](#testing) QuEST on your hardware. # Coding @@ -174,6 +173,8 @@ QuEST uses the [Mersenne Twister](http://www.math.sci.hiroshima-u.ac.jp/~m-mat/M # Compiling +See [this page](https://quest.qtechtheory.org/download/) to obtain the necessary compilers. 
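For orientation before the detailed options below, a typical out-of-source configuration might look as follows. This is illustrative only: `80` is an example compute capability (e.g. an A100), and the exact flag set for each backend is explained in the remainder of this section.

```console
mkdir build && cd build
cmake -DGPUACCELERATED=1 -DUSE_CUQUANTUM=1 -DGPU_COMPUTE_CAPABILITY=80 ..
make
```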
QuEST uses [CMake](https://cmake.org/) (version `3.7` or higher) as its build system. Configure the build by supplying the below `-D[VAR=VALUE]` options after the `cmake ..` command. You can alternatively compile via [GNU Make](https://www.gnu.org/software/make/) directly with the provided [makefile](makefile).

> **Windows** users should install [CMake](https://cmake.org/download/) and [Build Tools](https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2019), and run the below commands in the *Developer Command Prompt for VS*

@@ -242,9 +243,9 @@ If your project contains multiple source files, separate them with semi-colons.
 - To compile for GPU, use
   ```console
-  -DGPUACCELERATED=1 -DGPU_COMPUTE_CAPABILITY=[CC] ..
+  -DGPUACCELERATED=1 -DGPU_COMPUTE_CAPABILITY=[CC]
   ```
   where `[CC]` is the compute capability of your GPU, written without a decimal point. This can be looked up at the [NVIDIA website](https://developer.nvidia.com/cuda-gpus), and to check you have selected the right one, you should run the [unit tests](#testing).
   > Note that CUDA is not compatible with all compilers. To force `cmake` to use a
   > compatible compiler, override `CMAKE_C_COMPILER` and `CMAKE_CXX_COMPILER`.
   > For example, to compile for the [Quadro P6000](https://www.pny.com/nvidia-quadro-p6000)
   > ```
   > -DCMAKE_C_COMPILER=gcc-6 -DCMAKE_CXX_COMPILER=g++-6
   > ```

+  QuEST can also leverage NVIDIA's [cuQuantum](https://developer.nvidia.com/cuquantum-sdk) and [Thrust](https://developer.nvidia.com/thrust) libraries for optimised GPU simulation on modern GPUs. You must first install cuQuantum (which includes the `cuStateVec` sub-library used by QuEST), available [here](https://developer.nvidia.com/cuQuantum-downloads). When compiling QuEST, in addition to the above compiler options, simply specify
+  ```console
+  -DUSE_CUQUANTUM=1
+  ```

   QuEST can also run on AMD GPUs using HIP. For the HIP documentation see the [HIP programming guide](https://docs.amd.com/bundle/HIP-Programming-Guide-v5.3/page/Introduction_to_HIP_Programming_Guide.html). To compile for AMD GPUs, use
   ```console
-  -DGPUACCELERATED=1 -DUSE_HIP=1 -DGPU_ARCH=[ARCH] ..
+  -DGPUACCELERATED=1 -DUSE_HIP=1 -DGPU_ARCH=[ARCH]
   ```
-  where `[ARCH]` is the architecture of your GPU, for example `gfx90a`. A table for AMD GPU architectures can be looked up [here](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table).
+  where `[ARCH]` is the architecture of your GPU, for example `gfx90a`. A table for AMD GPU architectures can be looked up [here](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table). To check you have used the correct `GPU_ARCH`, you should run the [unit tests](#testing).

 - You can additionally customise the floating point precision used by QuEST's `qreal` type, via
   ```console
@@ -310,6 +316,7 @@ Once compiled as above, the compiled executable can be locally run from within t
   export OMP_NUM_THREADS=16
   mpirun -np 8 ./myExecutable
   ```
+  In some circumstances, like when large-memory multi-core nodes have multiple CPU sockets, it is worthwhile to deploy _multiple_ MPI processes to each node.

 - In GPU mode, the executable is launched directly via
   ```console