diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu
index 3a2508d1..174146d0 100644
--- a/QuEST/src/GPU/QuEST_cuQuantum.cu
+++ b/QuEST/src/GPU/QuEST_cuQuantum.cu
@@ -5,7 +5,8 @@
  * This makes no use of the ComplexArray qureg.deviceStateVec, used by the bespoke GPU kernels,
  * which is not malloc'd in this deployment. Instead, this cuQuantum backend mallocs and uses
  * two dedicated arrays of 'cuAmp' complex primitives; qureg.cuStateVec (CPU memory) and
- * qureg.deviceCuStateVec (GPU memory)
+ * qureg.deviceCuStateVec (GPU memory). Note that some API functions are implemented directly
+ * in QuEST_gpu_common.cu, in a way agnostic to cuQuantum vs bespoke kernels.
  *
  * @author Tyson Jones
  */
@@ -49,10 +50,29 @@
 # define CU_AMP_IN_MATRIX_PREC void // invalid
 #endif
 
-// convenient operator overloads for cuAmp, for doing complex artihmetic
-cuAmp operator - (const cuAmp& a) {
+// convenient operator overloads for cuAmp, for doing complex arithmetic.
+// some of these are defined to be used by Thrust's backend, because we
+// avoided Thrust's complex<qreal> (see QuEST_precision.h for explanation).
+// notice we are manually performing the arithmetic using qreals and
+// re-packing the result into TO_CU_AMP, rather than using cuComplex.h's
+// functions like cuCadd(). This is because such functions are precision
+// specific (grr) and do exactly the same thing themselves!
+__host__ __device__ inline cuAmp operator - (const cuAmp& a) {
     return TO_CU_AMP(-cuAmpReal(a), -cuAmpImag(a));
 }
+__host__ __device__ inline cuAmp operator * (const cuAmp& a, const std::size_t n) {
+    return TO_CU_AMP(n*cuAmpReal(a), n*cuAmpImag(a));
+}
+__host__ __device__ inline cuAmp operator + (const cuAmp& a, const cuAmp& b) {
+    return TO_CU_AMP(cuAmpReal(a) + cuAmpReal(b), cuAmpImag(a) + cuAmpImag(b));
+}
+__host__ __device__ inline cuAmp operator * (const cuAmp& a, const cuAmp& b) {
+    qreal aRe = cuAmpReal(a);
+    qreal aIm = cuAmpImag(a);
+    qreal bRe = cuAmpReal(b);
+    qreal bIm = cuAmpImag(b);
+    return TO_CU_AMP(aRe*bRe - aIm*bIm, aRe*bIm + aIm*bRe);
+}
 
 // convert user-facing Complex to cuQuantum-facing cuAmp
 cuAmp toCuAmp(Complex c) {
@@ -96,12 +116,6 @@ std::vector<int> getIndsFromMask(long long int mask, int numBits) {
 
 
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-
 /*
  * CUQUANTUM WRAPPERS (to reduce boilerplate)
  */
@@ -130,6 +144,46 @@ void custatevec_applyMatrix(Qureg qureg, std::vector<int> ctrls, std::vector<int> targs, cuMatr matr) {
 
 
 
+/*
+ * THRUST TEMPLATES
+ */
+
+struct functor_setWeightedQureg
+{
+    const cuAmp fac1;
+    const cuAmp fac2;
+    const cuAmp facOut;
+
+    functor_setWeightedQureg(cuAmp fac1, cuAmp fac2, cuAmp facOut) :
+        fac1(fac1), fac2(fac2), facOut(facOut) {}
+
+    template <typename Tuple> __host__ __device__ void operator()(Tuple t) {
+        thrust::get<2>(t) = facOut*thrust::get<2>(t) + fac1*thrust::get<0>(t) + fac2*thrust::get<1>(t);
+    }
+};
+
+thrust::device_ptr<cuAmp> getStartPtr(Qureg qureg) {
+
+    return thrust::device_pointer_cast(qureg.deviceCuStateVec);
+}
+
+thrust::device_ptr<cuAmp> getEndPtr(Qureg qureg) {
+
+    return getStartPtr(qureg) + qureg.numAmpsTotal;
+}
+
+
+
+/*
+ * Start C-linkage, so that below functions may only use C signatures (not C++)
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
 /*
  * ENVIRONMENT MANAGEMENT
  */
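The std::size_t overload evidently serves thrust::sequence (used by statevec_initDebugState below, which forms amp[n] = init + n*step); the others serve the weighted-sum functor. As a standalone sketch of that zip-iterator idiom, assuming double precision (so cuAmp corresponds to cuDoubleComplex); every identifier below is an illustrative stand-in, not a QuEST name:

    #include <thrust/device_vector.h>
    #include <thrust/for_each.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/tuple.h>
    #include <cuComplex.h>

    typedef cuDoubleComplex Amp;
    #define TO_AMP(re, im) make_cuDoubleComplex(re, im)

    // like the patch, overload arithmetic manually rather than via cuCadd()/cuCmul()
    __host__ __device__ Amp operator + (const Amp& a, const Amp& b) {
        return TO_AMP(cuCreal(a) + cuCreal(b), cuCimag(a) + cuCimag(b));
    }
    __host__ __device__ Amp operator * (const Amp& a, const Amp& b) {
        return TO_AMP(cuCreal(a)*cuCreal(b) - cuCimag(a)*cuCimag(b),
                      cuCreal(a)*cuCimag(b) + cuCimag(a)*cuCreal(b));
    }

    struct weightedSum {
        Amp f1, f2, fOut;
        weightedSum(Amp f1, Amp f2, Amp fOut) : f1(f1), f2(f2), fOut(fOut) {}

        // t = (amp of vector a, amp of vector b, amp of out)
        template <typename Tuple> __host__ __device__ void operator()(Tuple t) {
            thrust::get<2>(t) = fOut*thrust::get<2>(t) + f1*thrust::get<0>(t) + f2*thrust::get<1>(t);
        }
    };

    int main() {
        thrust::device_vector<Amp> a(8, TO_AMP(1,0)), b(8, TO_AMP(0,1)), out(8, TO_AMP(0,0));

        // out = 2a + ib, computed in one fused pass over all three vectors
        thrust::for_each(
            thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), out.begin())),
            thrust::make_zip_iterator(thrust::make_tuple(a.end(),   b.end(),   out.end())),
            weightedSum(TO_AMP(2,0), TO_AMP(0,1), TO_AMP(0,0)));
        return 0;
    }

The fused pass is the point of the functor: reading three vectors and writing one in a single kernel, rather than launching one kernel per arithmetic term.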
@@ -328,53 +382,84 @@ qreal statevec_getImagAmp(Qureg qureg, long long int index)
 
 
 
 /*
  * STATE INITIALISATION
  */
 
-void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
-{
-}
-
 void densmatr_initPlusState(Qureg qureg)
 {
+    qreal val = 1./(1LL << qureg.numQubitsRepresented);
+    thrust::fill(getStartPtr(qureg), getEndPtr(qureg), TO_CU_AMP(val, 0));
 }
 
-void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
+void statevec_initZeroState(Qureg qureg)
 {
+    custatevecInitializeStateVector(
+        qureg.cuQuantumHandle,
+        qureg.deviceCuStateVec,
+        CU_AMP_IN_STATE_PREC,
+        qureg.numQubitsInStateVec,
+        CUSTATEVEC_STATE_VECTOR_TYPE_ZERO);
 }
 
 void statevec_initBlankState(Qureg qureg)
 {
-}
+    // init to |0> = {1, 0, 0, ...}
+    statevec_initZeroState(qureg);
 
-void statevec_initZeroState(Qureg qureg)
-{
+    // overwrite amps[0] to 0
+    qreal re[] = {0};
+    qreal im[] = {0};
+    statevec_setAmps(qureg, 0, re, im, 1);
 }
 
 void statevec_initPlusState(Qureg qureg)
 {
+    custatevecInitializeStateVector(
+        qureg.cuQuantumHandle,
+        qureg.deviceCuStateVec,
+        CU_AMP_IN_STATE_PREC,
+        qureg.numQubitsInStateVec,
+        CUSTATEVEC_STATE_VECTOR_TYPE_UNIFORM);
 }
 
 void statevec_initClassicalState(Qureg qureg, long long int stateInd)
 {
-}
+    // init to |null> = {0, 0, 0, ...}
+    statevec_initBlankState(qureg);
 
-void statevec_initDebugState(Qureg qureg)
-{
+    // overwrite amps[stateInd] to 1
+    qreal re[] = {1};
+    qreal im[] = {0};
+    statevec_setAmps(qureg, stateInd, re, im, 1);
 }
 
-void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome)
+void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
 {
-}
+    // init to |null> = {0, 0, 0, ...}
+    statevec_initBlankState(qureg);
 
-int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env)
-{
-    return -1;
+    // index of the desired state in the flat density matrix
+    long long int densityDim = 1LL << qureg.numQubitsRepresented;
+    long long int densityInd = (densityDim + 1)*stateInd;
+
+    // overwrite amps[densityInd] to 1
+    qreal re[] = {1};
+    qreal im[] = {0};
+    statevec_setAmps(qureg, densityInd, re, im, 1);
 }
 
-void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil)
+void statevec_initDebugState(Qureg qureg)
 {
+    // |n> -> (.2n + (.2n+.1)i) |n>
+    cuAmp init = TO_CU_AMP(0, .1);
+    cuAmp step = TO_CU_AMP(.2, .2);
+    thrust::sequence(getStartPtr(qureg), getEndPtr(qureg), init, step);
 }
 
 void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out)
 {
+    // out -> facOut out + fac1 qureg1 + fac2 qureg2
+    thrust::for_each(
+        thrust::make_zip_iterator(thrust::make_tuple(getStartPtr(qureg1), getStartPtr(qureg2), getStartPtr(out))),
+        thrust::make_zip_iterator(thrust::make_tuple(getEndPtr(qureg1), getEndPtr(qureg2), getEndPtr(out))),
+        functor_setWeightedQureg(toCuAmp(fac1), toCuAmp(fac2), toCuAmp(facOut)));
 }
 
 
@@ -387,11 +472,6 @@
 void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank) {
 }
 
-int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision)
-{
-    return -1;
-}
-
 
 
 /*
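The (densityDim + 1)*stateInd arithmetic in densmatr_initClassicalState relies on the density matrix being stored as one flat vector with |n> = |c>|r>, i.e. n = c*densityDim + r, so the diagonal entry |s><s| lands at (densityDim + 1)*s. A quick host-only check of that index math (illustrative code, not part of the patch):

    #include <stdio.h>

    int main() {
        int numQubits = 2;
        long long densityDim = 1LL << numQubits;          // a 4x4 density matrix
        for (long long s = 0; s < densityDim; s++) {
            long long densityInd = (densityDim + 1) * s;  // 0, 5, 10, 15
            long long r = densityInd % densityDim;        // recover |c>|r>
            long long c = densityInd / densityDim;
            printf("|%lld><%lld| -> flat %lld (r=%lld, c=%lld)\n", s, s, densityInd, r, c);
        }
        return 0;
    }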
diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu
index b253336f..a44219ae 100755
--- a/QuEST/src/GPU/QuEST_gpu.cu
+++ b/QuEST/src/GPU/QuEST_gpu.cu
@@ -207,36 +207,6 @@ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) {
         cudaMemcpyDeviceToDevice);
 }
 
-__global__ void densmatr_initPureStateKernel(
-    long long int numPureAmps,
-    qreal *targetVecReal, qreal *targetVecImag,
-    qreal *copyVecReal, qreal *copyVecImag)
-{
-    // this is a particular index of the pure copyQureg
-    long long int index = blockIdx.x*blockDim.x + threadIdx.x;
-    if (index>=numPureAmps) return;
-
-    qreal realRow = copyVecReal[index];
-    qreal imagRow = copyVecImag[index];
-    for (long long int col=0; col < numPureAmps; col++) {
-        qreal realCol =   copyVecReal[col];
-        qreal imagCol = - copyVecImag[col]; // minus for conjugation
-        targetVecReal[col*numPureAmps + index] = realRow*realCol - imagRow*imagCol;
-        targetVecImag[col*numPureAmps + index] = realRow*imagCol + imagRow*realCol;
-    }
-}
-
-void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
-{
-    int threadsPerCUDABlock, CUDABlocks;
-    threadsPerCUDABlock = 128;
-    CUDABlocks = ceil((qreal)(copyQureg.numAmpsPerChunk)/threadsPerCUDABlock);
-    densmatr_initPureStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
-        copyQureg.numAmpsPerChunk,
-        targetQureg.deviceStateVec.real, targetQureg.deviceStateVec.imag,
-        copyQureg.deviceStateVec.real,   copyQureg.deviceStateVec.imag);
-}
-
 __global__ void densmatr_initPlusStateKernel(long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag){
     long long int index;
 
@@ -3793,7 +3763,6 @@ __global__ void statevec_applyParamNamedPhaseFuncOverridesKernel(
         }
     }
 
-
     // negate phase to conjugate operator
     if (conj)
         phase *= -1;
@@ -3865,81 +3834,6 @@ void statevec_applyParamNamedPhaseFuncOverrides(
     cudaFree(d_params);
 }
 
-__global__ void densmatr_setQuregToPauliHamilKernel(
-    Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms
-) {
-    long long int n = blockIdx.x*blockDim.x + threadIdx.x;
-    if (n>=qureg.numAmpsPerChunk) return;
-
-    // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j]
-    const int pauliRealElems[] = { 1,0, 0,1,   0,1, 1,0,   0,0,  0,0,   1,0, 0,-1 };
-    const int pauliImagElems[] = { 0,0, 0,0,   0,0, 0,0,   0,-1, 1,0,   0,0, 0,0  };
-
-    // |n> = |c>|r>
-    const int numQubits = qureg.numQubitsRepresented;
-    const long long int r = n & ((1LL << numQubits) - 1);
-    const long long int c = n >> numQubits;
-
-    // new amplitude of |n>
-    qreal elemRe = 0;
-    qreal elemIm = 0;
-
-    for (long long int t=0; t<numSumTerms; t++) {
-
-        // prepare kronecker product of the term's paulis
-        int kronRe = 1;
-        int kronIm = 0;
-        int pInd = t * numQubits;
-
-        for (int q=0; q<numQubits; q++) {
-
-            // get element (i,j) of the q-th pauli
-            int i = (r >> q) & 1;
-            int j = (c >> q) & 1;
-            int p = (int) pauliCodes[pInd++];
-            int k = (p<<2) + (i<<1) + j;
-            int pauliRe = pauliRealElems[k];
-            int pauliIm = pauliImagElems[k];
-
-            // kron *= pauli
-            int tmp = (pauliRe*kronRe) - (pauliIm*kronIm);
-            kronIm = (pauliRe*kronIm) + (pauliIm*kronRe);
-            kronRe = tmp;
-        }
-
-        // elem = sum_t coeffs[t] pauliKronecker[r][c]
-        elemRe += termCoeffs[t] * kronRe;
-        elemIm += termCoeffs[t] * kronIm;
-    }
-
-    // overwrite the density matrix entry
-    qureg.deviceStateVec.real[n] = elemRe;
-    qureg.deviceStateVec.imag[n] = elemIm;
-}
-
-void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) {
-
-    // copy hamil into GPU memory
-    enum pauliOpType* d_pauliCodes;
-    size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes;
-    cudaMalloc(&d_pauliCodes, mem_pauliCodes);
-    cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice);
-
-    qreal* d_termCoeffs;
-    size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs;
-    cudaMalloc(&d_termCoeffs, mem_termCoeffs);
-    cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);
-
-    int numThreadsPerBlock = 128;
-    int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock);
-    densmatr_setQuregToPauliHamilKernel<<<numBlocks, numThreadsPerBlock>>>(
-        qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);
-
-    // free tmp GPU memory
-    cudaFree(d_pauliCodes);
-    cudaFree(d_termCoeffs);
-}
-
 
 
 #ifdef __cplusplus
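For reference, the relocated kernel computes the conjugate outer product rho[row][col] = psi[row] * conj(psi[col]), writing entry (row, col) to flat index col*numPureAmps + row. A minimal host-side mirror of that arithmetic, under illustrative names:

    #include <stdio.h>

    int main() {
        // |psi> = (|0> + i|1>) / sqrt(2)
        double re[] = { 0.70710678, 0.0 };
        double im[] = { 0.0,        0.70710678 };
        int dim = 2;

        for (int row = 0; row < dim; row++)
            for (int col = 0; col < dim; col++) {
                // (a+bi)(c-di) = (ac+bd) + (bc-ad)i, the minus conjugating psi[col]
                double rhoRe = re[row]*re[col] + im[row]*im[col];
                double rhoIm = im[row]*re[col] - re[row]*im[col];
                printf("rho[%d][%d] (flat %d) = %g %+gi\n",
                        row, col, col*dim + row, rhoRe, rhoIm);
            }
        return 0;   // prints the expected [[.5, -.5i], [.5i, .5]]
    }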
diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu
index f07b105a..0426a1c4 100644
--- a/QuEST/src/GPU/QuEST_gpu_common.cu
+++ b/QuEST/src/GPU/QuEST_gpu_common.cu
@@ -30,6 +30,49 @@ extern "C" {
 
 
 
+/*
+ * GPU CONFIG
+ */
+
+const int NUM_THREADS_PER_BLOCK = 128;
+
+
+
+/*
+ * BACKEND AGNOSTIC COMPLEX ARITHMETIC
+ */
+
+#define PROD_REAL(aRe, aIm, bRe, bIm) \
+    ((aRe)*(bRe) - (aIm)*(bIm))
+#define PROD_IMAG(aRe, aIm, bRe, bIm) \
+    ((aRe)*(bIm) + (aIm)*(bRe))
+
+#ifdef USE_CUQUANTUM
+    #define GET_AMP(aRe, aIm, qureg, i) \
+        aRe = qureg.deviceCuStateVec[i].x; \
+        aIm = qureg.deviceCuStateVec[i].y;
+#else
+    #define GET_AMP(aRe, aIm, qureg, i) \
+        aRe = qureg.deviceStateVec.real[i]; \
+        aIm = qureg.deviceStateVec.imag[i];
+#endif
+
+#ifdef USE_CUQUANTUM
+    #define SET_AMP(qureg, i, aRe, aIm) \
+        qureg.deviceCuStateVec[i].x = aRe; \
+        qureg.deviceCuStateVec[i].y = aIm;
+#else
+    #define SET_AMP(qureg, i, aRe, aIm) \
+        qureg.deviceStateVec.real[i] = aRe; \
+        qureg.deviceStateVec.imag[i] = aIm;
+#endif
+
+
+
+/*
+ * ENVIRONMENT MANAGEMENT
+ */
+
 int GPUExists(void) {
     int deviceCount, device;
     int gpuDeviceCount = 0;
@@ -110,9 +153,116 @@ void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) {
 
 
 
+/*
+ * STATE INITIALISATION
+ */
+
+__global__ void densmatr_initPureStateKernel(Qureg dens, Qureg pure)
+{
+    // this is a particular index of the pure state
+    long long int pureInd = blockIdx.x*blockDim.x + threadIdx.x;
+    if (pureInd >= pure.numAmpsPerChunk) return;
+
+    qreal aRe, aIm;
+    GET_AMP( aRe, aIm, pure, pureInd );
+
+    for (long long int col=0; col<pure.numAmpsPerChunk; col++) {
+
+        qreal bRe, bIm;
+        GET_AMP( bRe, bIm, pure, col );
+        bIm *= -1; // conjugate
+
+        qreal cRe = PROD_REAL( aRe, aIm, bRe, bIm );
+        qreal cIm = PROD_IMAG( aRe, aIm, bRe, bIm );
+
+        SET_AMP( dens, col*pure.numAmpsPerChunk + pureInd, cRe, cIm );
+    }
+}
+
+void densmatr_initPureState(Qureg dens, Qureg pure)
+{
+    int numBlocks = ceil(pure.numAmpsPerChunk / (qreal) NUM_THREADS_PER_BLOCK);
+    densmatr_initPureStateKernel<<<numBlocks, NUM_THREADS_PER_BLOCK>>>(dens, pure);
+}
+
+__global__ void densmatr_setQuregToPauliHamilKernel(
+    Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms
+) {
+    long long int n = blockIdx.x*blockDim.x + threadIdx.x;
+    if (n>=qureg.numAmpsPerChunk) return;
+
+    // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j]
+    const int pauliRealElems[] = { 1,0, 0,1,   0,1, 1,0,   0,0,  0,0,   1,0, 0,-1 };
+    const int pauliImagElems[] = { 0,0, 0,0,   0,0, 0,0,   0,-1, 1,0,   0,0, 0,0  };
+
+    // |n> = |c>|r>
+    const int numQubits = qureg.numQubitsRepresented;
+    const long long int r = n & ((1LL << numQubits) - 1);
+    const long long int c = n >> numQubits;
+
+    // new amplitude of |n>
+    qreal elemRe = 0;
+    qreal elemIm = 0;
+
+    for (long long int t=0; t<numSumTerms; t++) {
+
+        // prepare kronecker product of the term's paulis
+        int kronRe = 1;
+        int kronIm = 0;
+        int pInd = t * numQubits;
+
+        for (int q=0; q<numQubits; q++) {
+
+            // get element (i,j) of the q-th pauli
+            int i = (r >> q) & 1;
+            int j = (c >> q) & 1;
+            int p = (int) pauliCodes[pInd++];
+            int k = (p<<2) + (i<<1) + j;
+            int pauliRe = pauliRealElems[k];
+            int pauliIm = pauliImagElems[k];
+
+            // kron *= pauli
+            int tmp = (pauliRe*kronRe) - (pauliIm*kronIm);
+            kronIm = (pauliRe*kronIm) + (pauliIm*kronRe);
+            kronRe = tmp;
+        }
+
+        // elem = sum_t coeffs[t] pauliKronecker[r][c]
+        elemRe += termCoeffs[t] * kronRe;
+        elemIm += termCoeffs[t] * kronIm;
+    }
+
+    // overwrite the density matrix entry
+    SET_AMP( qureg, n, elemRe, elemIm );
+}
+
+void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) {
+
+    // copy hamil into GPU memory
+    enum pauliOpType* d_pauliCodes;
+    size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes;
+    cudaMalloc(&d_pauliCodes, mem_pauliCodes);
+    cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice);
+
+    qreal* d_termCoeffs;
+    size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs;
+    cudaMalloc(&d_termCoeffs, mem_termCoeffs);
+    cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);
+
+    int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) NUM_THREADS_PER_BLOCK);
+    densmatr_setQuregToPauliHamilKernel<<<numBlocks, NUM_THREADS_PER_BLOCK>>>(
+        qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);
+
+    // free tmp GPU memory
+    cudaFree(d_pauliCodes);
+    cudaFree(d_termCoeffs);
+}
+
+
+
+/*
+ * MANAGING NON-QUREG TYPES
+ */
+
 DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) {
 
@@ -125,7 +275,6 @@ DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) {
     // allocate CPU memory (initialised to zero)
     op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
     op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
-    // @TODO no handling of rank>1 allocation (no distributed GPU)
 
     // check cpu memory allocation was successful
     validateDiagonalOpAllocation(&op, env, __func__);
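The (p<<2) + (i<<1) + j indexing above treats each lookup table as four row-major 2x2 matrices, one per Pauli code p = 0,1,2,3 for I,X,Y,Z. A host-only sanity check that the tables reproduce the Pauli matrices (illustrative, not part of the patch):

    #include <stdio.h>

    int main() {
        const int pauliRealElems[] = { 1,0, 0,1,  0,1, 1,0,  0,0,  0,0,  1,0, 0,-1 };
        const int pauliImagElems[] = { 0,0, 0,0,  0,0, 0,0,  0,-1, 1,0,  0,0, 0,0  };
        const char names[] = "IXYZ";

        for (int p = 0; p < 4; p++) {
            printf("%c = ", names[p]);
            for (int i = 0; i < 2; i++)
                for (int j = 0; j < 2; j++) {
                    int k = (p<<2) + (i<<1) + j;    // flat index of element (i,j)
                    printf("(%d%+di) ", pauliRealElems[k], pauliImagElems[k]);
                }
            printf("\n");   // e.g. Y prints (0+0i) (0-1i) (0+1i) (0+0i)
        }
        return 0;
    }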
@@ -201,9 +350,8 @@ void agnostic_initDiagonalOpFromPauliHamil(DiagonalOp op, PauliHamil hamil) {
     cudaMalloc(&d_termCoeffs, mem_termCoeffs);
     cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);
 
-    int numThreadsPerBlock = 128;
-    int numBlocks = ceil(op.numElemsPerChunk / (qreal) numThreadsPerBlock);
-    agnostic_initDiagonalOpFromPauliHamilKernel<<<numBlocks, numThreadsPerBlock>>>(
+    int numBlocks = ceil(op.numElemsPerChunk / (qreal) NUM_THREADS_PER_BLOCK);
+    agnostic_initDiagonalOpFromPauliHamilKernel<<<numBlocks, NUM_THREADS_PER_BLOCK>>>(
         op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);
 
     // copy populated operator into to RAM
diff --git a/QuEST/src/GPU/QuEST_gpu_common.h b/QuEST/src/GPU/QuEST_gpu_common.h
index d2b58df9..cf6998cb 100644
--- a/QuEST/src/GPU/QuEST_gpu_common.h
+++ b/QuEST/src/GPU/QuEST_gpu_common.h
@@ -1,3 +1,11 @@
+// Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details
+
+/** @file
+ * GPU backend routines which are invoked by both the cuQuantum and bespoke-kernel backends.
+ * This excludes backend-agnostic API implementations which do not need a header declaration.
+ *
+ * @author Tyson Jones
+ */
 
 # ifndef QUEST_GPU_COMMON_H
 # define QUEST_GPU_COMMON_H
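The payoff of GET_AMP / SET_AMP is that one kernel body compiles against either amplitude layout: cuQuantum's interleaved cuAmp array, or the bespoke backend's separate real/imag arrays. A compilable sketch of that pattern, using a stand-in ToyQureg rather than QuEST's Qureg (build with nvcc, optionally with -DUSE_CUQUANTUM):

    #include <cuComplex.h>

    typedef double qreal;

    struct ToyComplexArray { qreal *real; qreal *imag; };

    struct ToyQureg {
    #ifdef USE_CUQUANTUM
        cuDoubleComplex *deviceCuStateVec;   // interleaved (re, im) pairs
    #else
        ToyComplexArray deviceStateVec;      // separate real/imag arrays
    #endif
        long long int numAmpsPerChunk;
    };

    #ifdef USE_CUQUANTUM
        #define GET_AMP(aRe, aIm, q, i) \
            aRe = q.deviceCuStateVec[i].x; \
            aIm = q.deviceCuStateVec[i].y;
        #define SET_AMP(q, i, aRe, aIm) \
            q.deviceCuStateVec[i].x = aRe; \
            q.deviceCuStateVec[i].y = aIm;
    #else
        #define GET_AMP(aRe, aIm, q, i) \
            aRe = q.deviceStateVec.real[i]; \
            aIm = q.deviceStateVec.imag[i];
        #define SET_AMP(q, i, aRe, aIm) \
            q.deviceStateVec.real[i] = aRe; \
            q.deviceStateVec.imag[i] = aIm;
    #endif

    // one kernel body, valid for either backend: conjugates every amplitude
    __global__ void conjugateAllAmpsKernel(ToyQureg q) {
        long long int n = blockIdx.x*blockDim.x + threadIdx.x;
        if (n >= q.numAmpsPerChunk) return;

        qreal re, im;
        GET_AMP( re, im, q, n );
        SET_AMP( q, n, re, -im );
    }

    int main() {
        ToyQureg q;
        q.numAmpsPerChunk = 8;
    #ifdef USE_CUQUANTUM
        cudaMalloc(&q.deviceCuStateVec, q.numAmpsPerChunk * sizeof *q.deviceCuStateVec);
    #else
        cudaMalloc(&q.deviceStateVec.real, q.numAmpsPerChunk * sizeof(qreal));
        cudaMalloc(&q.deviceStateVec.imag, q.numAmpsPerChunk * sizeof(qreal));
    #endif
        conjugateAllAmpsKernel<<<1, 128>>>(q);
        return cudaDeviceSynchronize() != cudaSuccess;
    }

This is the same design choice the patch makes for densmatr_initPureState and densmatr_setQuregToPauliHamil above: the kernels were previously duplicated per backend, and now live once in QuEST_gpu_common.cu behind the macros.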