From 95ce6a59db896e1f8aaf72369b2ed43513268d2e Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Wed, 20 Sep 2023 05:45:30 +0100 Subject: [PATCH] added phase functions --- QuEST/src/GPU/QuEST_cuQuantum.cu | 25 -- QuEST/src/GPU/QuEST_gpu.cu | 452 ------------------------- QuEST/src/GPU/QuEST_gpu_common.cu | 543 +++++++++++++++++++++++++++++- 3 files changed, 537 insertions(+), 483 deletions(-) diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu index 0557b5b1..7422345c 100644 --- a/QuEST/src/GPU/QuEST_cuQuantum.cu +++ b/QuEST/src/GPU/QuEST_cuQuantum.cu @@ -857,7 +857,6 @@ void statevec_multiControlledPhaseShift(Qureg qureg, int *controlQubits, int num std::vector targs{controlQubits[0]}; std::vector ctrls(controlQubits + 1, controlQubits + numControlQubits); custatevec_applyDiagonal(qureg, ctrls, targs, elems); - } void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle) @@ -986,30 +985,6 @@ void densmatr_applyDiagonalOp(Qureg qureg, DiagonalOp op) thrust::for_each(startIndIter, endIndIter, functor); } -void statevec_applyPhaseFuncOverrides( - Qureg qureg, int* qubits, int numQubits, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int numTerms, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - int conj) -{ -} - -void statevec_applyMultiVarPhaseFuncOverrides( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int* numTermsPerReg, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - int conj) -{ -} - -void statevec_applyParamNamedPhaseFuncOverrides( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - enum phaseFunc phaseFuncName, qreal* params, int numParams, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - int conj) -{ -} - /* diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu index 0171d157..87990a31 100755 --- a/QuEST/src/GPU/QuEST_gpu.cu +++ b/QuEST/src/GPU/QuEST_gpu.cu @@ -3189,458 +3189,6 @@ Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) { return expecVal; } -__global__ void statevec_applyPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int numQubits, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int numTerms, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - // determine phase index of {qubits} - long long int phaseInd = 0LL; - if (encoding == UNSIGNED) { - for (int q=0; q>>( - qureg, d_qubits, numQubits, encoding, - d_coeffs, d_exponents, numTerms, - d_overrideInds, d_overridePhases, numOverrides, - conj); - - // cleanup device memory - cudaFree(d_qubits); - cudaFree(d_coeffs); - cudaFree(d_exponents); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); -} - -__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - qreal* coeffs, qreal* exponents, int* numTermsPerReg, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - long long int *phaseInds, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index 
(non-distributed, so it's just local index) - long long int globalAmpInd = index; - - /* - * each thread needs to write to a local: - * long long int phaseInds[numRegs]; - * but instead has access to shared array phaseInds, with below stride and offset - */ - size_t stride = gridDim.x*blockDim.x; - size_t offset = blockIdx.x*blockDim.x + threadIdx.x; - - // determine phase indices - int flatInd = 0; - if (encoding == UNSIGNED) { - for (int r=0; r>>( - qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, - d_coeffs, d_exponents, d_numTermsPerReg, - d_overrideInds, d_overridePhases, numOverrides, - d_phaseInds, - conj); - - // free device memory - cudaFree(d_qubits); - cudaFree(d_coeffs); - cudaFree(d_exponents); - cudaFree(d_numQubitsPerReg); - cudaFree(d_numTermsPerReg); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); - cudaFree(d_phaseInds); -} - -__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel( - Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, - enum phaseFunc phaseFuncName, qreal* params, int numParams, - long long int* overrideInds, qreal* overridePhases, int numOverrides, - long long int* phaseInds, - int conj -) { - long long int index = blockIdx.x*blockDim.x + threadIdx.x; - if (index>=qureg.numAmpsPerChunk) return; - - // determine global amplitude index (non-distributed, so it's just local index) - long long int globalAmpInd = index; - - /* - * each thread needs to write to a local: - * long long int phaseInds[numRegs]; - * but instead has access to shared array phaseInds, with below stride and offset - */ - size_t stride = gridDim.x*blockDim.x; - size_t offset = blockIdx.x*blockDim.x + threadIdx.x; - - // determine phase indices - if (encoding == UNSIGNED) { - int flatInd = 0; - for (int r=0; r 0) cudaMalloc(&d_params, mem_params); - - // copy function args into GPU memory - cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); - cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice); - cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice); - cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice); - if (numParams > 0) - cudaMemcpy(d_params, params, mem_params, cudaMemcpyHostToDevice); - - int threadsPerCUDABlock = 128; - int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock); - - // allocate thread-local working space {phaseInds} - long long int *d_phaseInds; - size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks; - cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds); - - // call kernel - statevec_applyParamNamedPhaseFuncOverridesKernel<<>>( - qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, - phaseFuncName, d_params, numParams, - d_overrideInds, d_overridePhases, numOverrides, - d_phaseInds, - conj); - - // free device memory - cudaFree(d_qubits); - cudaFree(d_numQubitsPerReg); - cudaFree(d_overrideInds); - cudaFree(d_overridePhases); - cudaFree(d_phaseInds); - if (numParams > 0) - cudaFree(d_params); -} - #ifdef __cplusplus diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu index c9bfea65..0e984d80 100644 --- a/QuEST/src/GPU/QuEST_gpu_common.cu +++ b/QuEST/src/GPU/QuEST_gpu_common.cu @@ -36,6 +36,10 @@ extern "C" { const int NUM_THREADS_PER_BLOCK = 128; +__forceinline__ __device__ long long int getThreadInd() { + return blockIdx.x*blockDim.x + threadIdx.x; +} + /* @@ -159,7 +163,7 @@ void seedQuEST(QuESTEnv *env, unsigned 
long int *seedArray, int numSeeds) { __global__ void densmatr_initPureStateKernel(Qureg dens, Qureg pure) { - long long int pureInd = blockIdx.x*blockDim.x + threadIdx.x; + long long int pureInd = getThreadInd(); if (pureInd >= pure.numAmpsPerChunk) return; qreal aRe, aIm; @@ -188,7 +192,7 @@ void densmatr_initPureState(Qureg dens, Qureg pure) __global__ void densmatr_setQuregToPauliHamilKernel( Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms ) { - long long int n = blockIdx.x*blockDim.x + threadIdx.x; + long long int n = getThreadInd(); if (n>=qureg.numAmpsPerChunk) return; // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j] @@ -288,7 +292,7 @@ __global__ void statevec_calcProbOfAllOutcomesKernel( qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits ) { // each thread handles one amplitude (all amplitudes are involved) - long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x; + long long int ampInd = getThreadInd(); if (ampInd >= qureg.numAmpsTotal) return; qreal ampRe, ampIm; @@ -340,7 +344,7 @@ __global__ void densmatr_calcProbOfAllOutcomesKernel( qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits ) { // each thread handles one diagonal amplitude - long long int diagInd = blockIdx.x*blockDim.x + threadIdx.x; + long long int diagInd = getThreadInd(); long long int numDiags = (1LL << qureg.numQubitsRepresented); if (diagInd >= numDiags) return; @@ -403,7 +407,7 @@ __global__ void densmatr_mixTwoQubitDepolarisingKernel( long long int part3, long long int part4, long long int part5, long long int rowCol1, long long int rowCol2) { - long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x; + long long int scanInd = getThreadInd(); if (scanInd >= qureg.numAmpsPerChunk) return; long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4); @@ -519,7 +523,7 @@ __global__ void agnostic_initDiagonalOpFromPauliHamilKernel( DiagonalOp op, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms ) { // each thread processes one diagonal element - long long int elemInd = blockIdx.x*blockDim.x + threadIdx.x; + long long int elemInd = getThreadInd(); if (elemInd >= op.numElemsPerChunk) return; @@ -592,6 +596,533 @@ void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* r +/* + * PHASE FUNCTIONS + */ + +__forceinline__ __device__ int getBit(long long int num, int ind) { + return (num >> ind) & 1; +} + +__forceinline__ __device__ void applyPhaseToAmp(Qureg qureg, long long int index, qreal phase, int conj) +{ + // negate phase to conjugate operator + phase *= (1 - 2*conj); + qreal c = cos(phase); + qreal s = sin(phase); + + // modify amp to amp * exp(i phase) + qreal re, im; + GET_AMP( re, im, qureg, index ); + SET_AMP( qureg, index, re*c - im*s, re*s + im*c ); +} + +__forceinline__ __device__ void getMultiRegPhaseIndOffsets(size_t* stride, size_t* offset) +{ + // determine the phaseInds tuple relevant for this thread + *stride = gridDim.x*blockDim.x; + *offset = blockIdx.x*blockDim.x + threadIdx.x; +} + +__forceinline__ __device__ long long int getPhaseInd(long long int globalAmpInd, int* qubits, int numQubits, enum bitEncoding encoding) +{ + long long int phaseInd = 0LL; + + if (encoding == UNSIGNED) { + for (int q=0; q=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, so it's just local index) + long long int globalAmpInd = index; + + // determine phaseInd from the qubit encoding + long long int 
phaseInd = getPhaseInd(globalAmpInd, qubits, numQubits, encoding); + + // determine if this phase index has an overriden value (i < numOverrides) + int overInd = getIndOfPhaseOverride(phaseInd, overrideInds, numOverrides); + + // determine phase from {coeffs}, {exponents} (unless overriden) + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + else + for (int t=0; t>>( + qureg, d_qubits, numQubits, encoding, + d_coeffs, d_exponents, numTerms, + d_overrideInds, d_overridePhases, numOverrides, + conj); + + // cleanup device memory + cudaFree(d_qubits); + cudaFree(d_coeffs); + cudaFree(d_exponents); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); +} + +__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + qreal* coeffs, qreal* exponents, int* numTermsPerReg, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + long long int *phaseInds, + int conj +) { + long long int index = getThreadInd(); + if (index>=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, so it's just local index) + long long int globalAmpInd = index; + + // determine phase indices (each thread has phaseInds[numRegs] sub-array) + setMultiRegPhaseInds(phaseInds, globalAmpInd, qubits, numQubitsPerReg, numRegs, encoding); + + // determine if this phase index has an overriden value + long long int overInd = getIndOfMultiRegPhaseOverride(phaseInds, numRegs, overrideInds, numOverrides); + + // either use the overriden phase... + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + + else { + // else determine the phaseInds tuple relevant for this thread + size_t stride, offset; + getMultiRegPhaseIndOffsets(&stride, &offset); + + // and compute the phase from coeffs and exponents + long long int flatInd = 0; + for (int r=0; r>>( + qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, + d_coeffs, d_exponents, d_numTermsPerReg, + d_overrideInds, d_overridePhases, numOverrides, + d_phaseInds, + conj); + + // free device memory + cudaFree(d_qubits); + cudaFree(d_coeffs); + cudaFree(d_exponents); + cudaFree(d_numQubitsPerReg); + cudaFree(d_numTermsPerReg); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); + cudaFree(d_phaseInds); +} + +__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + enum phaseFunc phaseFuncName, qreal* params, int numParams, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + long long int* phaseInds, + int conj +) { + long long int index = getThreadInd(); + if (index>=qureg.numAmpsPerChunk) return; + + // determine global amplitude index (non-distributed, so it's just local index) + long long int globalAmpInd = index; + + // determine phase indices (each thread has phaseInds[numRegs] sub-array) + setMultiRegPhaseInds(phaseInds, globalAmpInd, qubits, numQubitsPerReg, numRegs, encoding); + + // determine if this phase index has an overriden value + long long int overInd = getIndOfMultiRegPhaseOverride(phaseInds, numRegs, overrideInds, numOverrides); + + // determine the phase, or the overriden one + qreal phase = 0; + if (overInd < numOverrides) + phase = overridePhases[overInd]; + else + phase = getPhaseFromParamNamedFunc(phaseInds, numRegs, phaseFuncName, params, numParams); + + // modify amp to amp * exp(i phase) + applyPhaseToAmp(qureg, index, phase, 
conj); +} + +void statevec_applyParamNamedPhaseFuncOverrides( + Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding, + enum phaseFunc phaseFuncName, qreal* params, int numParams, + long long int* overrideInds, qreal* overridePhases, int numOverrides, + int conj +) { + // determine size of arrays, for cloning into GPU memory + size_t mem_numQubitsPerReg = numRegs * sizeof *numQubitsPerReg; + size_t mem_overridePhases = numOverrides * sizeof *overridePhases; + size_t mem_overrideInds = numOverrides * numRegs * sizeof *overrideInds; + size_t mem_params = numParams * sizeof *params; + size_t mem_qubits = 0; + for (int r=0; r 0) cudaMalloc(&d_params, mem_params); + + // copy function args into GPU memory + cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice); + cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice); + cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice); + cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice); + if (numParams > 0) + cudaMemcpy(d_params, params, mem_params, cudaMemcpyHostToDevice); + + int threadsPerCUDABlock = 128; + int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock); + + // allocate thread-local working space {phaseInds} + long long int *d_phaseInds; + size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks; + cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds); + + // call kernel + statevec_applyParamNamedPhaseFuncOverridesKernel<<>>( + qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding, + phaseFuncName, d_params, numParams, + d_overrideInds, d_overridePhases, numOverrides, + d_phaseInds, + conj); + + // free device memory + cudaFree(d_qubits); + cudaFree(d_numQubitsPerReg); + cudaFree(d_overrideInds); + cudaFree(d_overridePhases); + cudaFree(d_phaseInds); + if (numParams > 0) + cudaFree(d_params); +} + + + #ifdef __cplusplus } #endif
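Reviewer note, appended after the patch body (not itself part of the diff): the GPU backends relocated above are normally reached through QuEST's public phase-function front-end rather than called directly. Below is a minimal usage sketch, assuming the standard QuEST v3 API function applyPhaseFuncOverrides (which dispatches to the statevec_applyPhaseFuncOverrides backend now living in QuEST_gpu_common.cu) and a GPU build of the library; the register size, coefficients, exponents and override values are purely illustrative.

    #include "QuEST.h"

    int main() {
        QuESTEnv env = createQuESTEnv();
        Qureg qureg = createQureg(4, env);
        initPlusState(qureg);

        // apply phase f(x) = 1/x, where x is the UNSIGNED value of qubits {0,1,2},
        // overriding the otherwise-divergent x = 0 phase to 0
        int qubits[] = {0, 1, 2};
        qreal coeffs[] = {1};
        qreal exponents[] = {-1};
        long long int overrideInds[] = {0};
        qreal overridePhases[] = {0};
        applyPhaseFuncOverrides(qureg, qubits, 3, UNSIGNED,
            coeffs, exponents, 1, overrideInds, overridePhases, 1);

        destroyQureg(qureg, env);
        destroyQuESTEnv(env);
        return 0;
    }

The multi-register and named variants (e.g. applyMultiVarPhaseFuncOverrides and applyParamNamedPhaseFuncOverrides) are driven the same way, ultimately landing in the multi-register kernels added above, which stage each thread's phaseInds tuple in the strided d_phaseInds workspace.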