Skip to content

Commit

Permalink
added state initialisers
Browse files Browse the repository at this point in the history
although we are missing imports to avoid git conflict:

# include <thrust/sequence.h>
# include <thrust/iterator/zip_iterator.h>
# include <thrust/for_each.h>
  • Loading branch information
TysonRayJones authored Aug 24, 2023
1 parent 7b7c197 commit 564851e
Show file tree
Hide file tree
Showing 4 changed files with 271 additions and 141 deletions.
142 changes: 111 additions & 31 deletions QuEST/src/GPU/QuEST_cuQuantum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
* This makes no use of the ComplexArray qureg.deviceStateVec, used by the bespoke GPU kernels,
* which is not malloc'd in this deployment. Instead, this cuQuantum backend mallocs and uses
* two dedicated arrays of 'cuAmp' complex primitives; qureg.cuStateVec (CPU memory) and
* qureg.deviceCuStateVec (GPU memory)
* qureg.deviceCuStateVec (GPU memory). Note that some API functions are implemented directly
* in QuEST_gpu_common.cu, in a way agnostic to cuQuantum vs bespoke kernels.
*
* @author Tyson Jones
*/
Expand Down Expand Up @@ -49,10 +50,29 @@
# define CU_AMP_IN_MATRIX_PREC void // invalid
#endif

// convenient operator overloads for cuAmp, for doing complex artihmetic
cuAmp operator - (const cuAmp& a) {
// convenient operator overloads for cuAmp, for doing complex artihmetic.
// some of these are defined to be used by Thrust's backend, because we
// avoided Thrust's complex<qreal> (see QuEST_precision.h for explanation).
// notice we are manually performing the arithmetic using qreals and
// re-packing the result into TO_CU_AMP, rather than using cuComplex.h's
// functions like cuCadd(). This is because such functions are precision
// specific (grr) and do exactly the same thing themselves!
__host__ __device__ inline cuAmp operator - (const cuAmp& a) {
return TO_CU_AMP(-cuAmpReal(a), -cuAmpImag(a));
}
__host__ __device__ inline cuAmp operator * (const cuAmp& a, const std::size_t n) {
return TO_CU_AMP(n*cuAmpReal(a), n*cuAmpImag(a));
}
__host__ __device__ inline cuAmp operator + (const cuAmp& a, const cuAmp& b) {
return TO_CU_AMP(cuAmpReal(a) + cuAmpReal(b), cuAmpImag(a) + cuAmpImag(b));
}
__host__ __device__ inline cuAmp operator * (const cuAmp& a, const cuAmp& b) {
qreal aRe = cuAmpReal(a);
qreal aIm = cuAmpImag(a);
qreal bRe = cuAmpReal(b);
qreal bIm = cuAmpImag(b);
return TO_CU_AMP(aRe*bRe - aIm*bIm, aRe*bIm + aIm*bRe);
}

// convert user-facing Complex to cuQuantum-facing cuAmp
cuAmp toCuAmp(Complex c) {
Expand Down Expand Up @@ -96,12 +116,6 @@ std::vector<int> getIndsFromMask(long long int mask, int numBits) {



#ifdef __cplusplus
extern "C" {
#endif



/*
* CUQUANTUM WRAPPERS (to reduce boilerplate)
*/
Expand Down Expand Up @@ -130,6 +144,46 @@ void custatevec_applyMatrix(Qureg qureg, std::vector<int> ctrls, std::vector<int



/*
* THRUST WRAPPERS AND FUNCTORS (to reduce boilerplate, and for custom modification of statevectors)
*/

struct functor_setWeightedQureg
{
// functor requires 3 complex scalars
const cuAmp fac1;
const cuAmp fac2;
const cuAmp facOut;
functor_setWeightedQureg(cuAmp _fac1, cuAmp _fac2, cuAmp _facOut) :
fac1(_fac1), fac2(_fac2), facOut(_facOut) {}

// and modifies out to (facOut out + fac1 qureg1 + fac2 qureg2)
template <typename Tuple> __host__ __device__ void operator()(Tuple t) {
thrust::get<2>(t) = facOut*thrust::get<2>(t) + fac1*thrust::get<0>(t) + fac2*thrust::get<1>(t);
}
};

thrust::device_ptr<cuAmp> getStartPtr(Qureg qureg) {

return thrust::device_pointer_cast(qureg.deviceCuStateVec);
}

thrust::device_ptr<cuAmp> getEndPtr(Qureg qureg) {

return getStartPtr(qureg) + qureg.numAmpsTotal;
}



/*
* Start C-linkage, so that below functions may only use C signatures (not C++)
*/
#ifdef __cplusplus
extern "C" {
#endif



/*
* ENVIRONMENT MANAGEMENT
*/
Expand Down Expand Up @@ -328,53 +382,84 @@ qreal statevec_getImagAmp(Qureg qureg, long long int index)
* STATE INITIALISATION
*/

void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
{
}

void densmatr_initPlusState(Qureg qureg)
{
qreal val = 1./(1LL << qureg.numQubitsRepresented);
thrust::fill(getStartPtr(qureg), getEndPtr(qureg), TO_CU_AMP(val, 0));
}

void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
void statevec_initZeroState(Qureg qureg)
{
custatevecInitializeStateVector(
qureg.cuQuantumHandle,
qureg.deviceCuStateVec,
CU_AMP_IN_STATE_PREC,
qureg.numQubitsInStateVec,
CUSTATEVEC_STATE_VECTOR_TYPE_ZERO);
}

void statevec_initBlankState(Qureg qureg)
{
}
// init to |0> = {1, 0, 0, ...}
statevec_initZeroState(qureg);

void statevec_initZeroState(Qureg qureg)
{
// overwrite amps[0] to 0
qreal re[] = {0};
qreal im[] = {0};
statevec_setAmps(qureg, 0, re, im, 1);
}

void statevec_initPlusState(Qureg qureg)
{
custatevecInitializeStateVector(
qureg.cuQuantumHandle,
qureg.deviceCuStateVec,
CU_AMP_IN_STATE_PREC,
qureg.numQubitsInStateVec,
CUSTATEVEC_STATE_VECTOR_TYPE_UNIFORM);
}

void statevec_initClassicalState(Qureg qureg, long long int stateInd)
{
}
// init to |null> = {0, 0, 0, ...}
statevec_initBlankState(qureg);

void statevec_initDebugState(Qureg qureg)
{
// overwrite amps[stateInd] to 1
qreal re[] = {1};
qreal im[] = {0};
statevec_setAmps(qureg, stateInd, re, im, 1);
}

void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome)
void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
{
}
// init to |null> = {0, 0, 0, ...}
statevec_initBlankState(qureg);

int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env)
{
return -1;
// index of the desired state in the flat density matrix
long long int densityDim = 1LL << qureg.numQubitsRepresented;
long long int densityInd = (densityDim + 1)*stateInd;

// overwrite amps[densityInd] to 1
qreal re[] = {1};
qreal im[] = {0};
statevec_setAmps(qureg, densityInd, re, im, 1);
}

void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil)
void statevec_initDebugState(Qureg qureg)
{
// |n> -> (.2n + (.2n+.1)i) |n>
cuAmp init = TO_CU_AMP(0, .1);
cuAmp step = TO_CU_AMP(.2, .2);
thrust::sequence(getStartPtr(qureg), getEndPtr(qureg), init, step);
}

void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out)
{
// out -> facOut + fac1 qureg1 + fac2 qureg2
thrust::for_each(
thrust::make_zip_iterator(thrust::make_tuple(getStartPtr(qureg1), getStartPtr(qureg2), getStartPtr(out))),
thrust::make_zip_iterator(thrust::make_tuple(getEndPtr(qureg1), getEndPtr(qureg2), getEndPtr(out))),
functor_setWeightedQureg(toCuAmp(fac1), toCuAmp(fac2), toCuAmp(facOut)));
}


Expand All @@ -387,11 +472,6 @@ void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank)
{
}

int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision)
{
return -1;
}



/*
Expand Down
106 changes: 0 additions & 106 deletions QuEST/src/GPU/QuEST_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -207,36 +207,6 @@ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) {
cudaMemcpyDeviceToDevice);
}

__global__ void densmatr_initPureStateKernel(
long long int numPureAmps,
qreal *targetVecReal, qreal *targetVecImag,
qreal *copyVecReal, qreal *copyVecImag)
{
// this is a particular index of the pure copyQureg
long long int index = blockIdx.x*blockDim.x + threadIdx.x;
if (index>=numPureAmps) return;

qreal realRow = copyVecReal[index];
qreal imagRow = copyVecImag[index];
for (long long int col=0; col < numPureAmps; col++) {
qreal realCol = copyVecReal[col];
qreal imagCol = - copyVecImag[col]; // minus for conjugation
targetVecReal[col*numPureAmps + index] = realRow*realCol - imagRow*imagCol;
targetVecImag[col*numPureAmps + index] = realRow*imagCol + imagRow*realCol;
}
}

void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
{
int threadsPerCUDABlock, CUDABlocks;
threadsPerCUDABlock = 128;
CUDABlocks = ceil((qreal)(copyQureg.numAmpsPerChunk)/threadsPerCUDABlock);
densmatr_initPureStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
copyQureg.numAmpsPerChunk,
targetQureg.deviceStateVec.real, targetQureg.deviceStateVec.imag,
copyQureg.deviceStateVec.real, copyQureg.deviceStateVec.imag);
}

__global__ void densmatr_initPlusStateKernel(long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag){
long long int index;

Expand Down Expand Up @@ -3793,7 +3763,6 @@ __global__ void statevec_applyParamNamedPhaseFuncOverridesKernel(
}
}


// negate phase to conjugate operator
if (conj)
phase *= -1;
Expand Down Expand Up @@ -3865,81 +3834,6 @@ void statevec_applyParamNamedPhaseFuncOverrides(
cudaFree(d_params);
}

__global__ void densmatr_setQuregToPauliHamilKernel(
Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms
) {
long long int n = blockIdx.x*blockDim.x + threadIdx.x;
if (n>=qureg.numAmpsPerChunk) return;

// flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j]
const int pauliRealElems[] = { 1,0, 0,1, 0,1, 1,0, 0,0, 0,0, 1,0, 0,-1 };
const int pauliImagElems[] = { 0,0, 0,0, 0,0, 0,0, 0,-1,1,0, 0,0, 0,0 };

// |n> = |c>|r>
const int numQubits = qureg.numQubitsRepresented;
const long long int r = n & ((1LL << numQubits) - 1);
const long long int c = n >> numQubits;

// new amplitude of |n>
qreal elemRe = 0;
qreal elemIm = 0;

for (long long int t=0; t<numSumTerms; t++) {

// pauliKronecker[r][c] = prod_q Pauli[q][q-th bit of r and c]
int kronRe = 1;
int kronIm = 0;
long long int pInd = t * numQubits;

for (int q=0; q<numQubits; q++) {

// get element of Pauli matrix
int i = (r >> q) & 1;
int j = (c >> q) & 1;
int p = (int) pauliCodes[pInd++];
int k = (p<<2) + (i<<1) + j;
int pauliRe = pauliRealElems[k];
int pauliIm = pauliImagElems[k];

// kron *= pauli
int tmp = (pauliRe*kronRe) - (pauliIm*kronIm);
kronIm = (pauliRe*kronIm) + (pauliIm*kronRe);
kronRe = tmp;
}

// elem = sum_t coeffs[t] pauliKronecker[r][c]
elemRe += termCoeffs[t] * kronRe;
elemIm += termCoeffs[t] * kronIm;
}

// overwrite the density matrix entry
qureg.deviceStateVec.real[n] = elemRe;
qureg.deviceStateVec.imag[n] = elemIm;
}

void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) {

// copy hamil into GPU memory
enum pauliOpType* d_pauliCodes;
size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes;
cudaMalloc(&d_pauliCodes, mem_pauliCodes);
cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice);

qreal* d_termCoeffs;
size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs;
cudaMalloc(&d_termCoeffs, mem_termCoeffs);
cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);

int numThreadsPerBlock = 128;
int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock);
densmatr_setQuregToPauliHamilKernel<<<numBlocks, numThreadsPerBlock>>>(
qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);

// free tmp GPU memory
cudaFree(d_pauliCodes);
cudaFree(d_termCoeffs);
}



#ifdef __cplusplus
Expand Down
Loading

0 comments on commit 564851e

Please sign in to comment.