diff --git a/QuEST/src/GPU/QuEST_cuQuantum.cu b/QuEST/src/GPU/QuEST_cuQuantum.cu
index 3a2508d1..174146d0 100644
--- a/QuEST/src/GPU/QuEST_cuQuantum.cu
+++ b/QuEST/src/GPU/QuEST_cuQuantum.cu
@@ -5,7 +5,8 @@
  * This makes no use of the ComplexArray qureg.deviceStateVec, used by the bespoke GPU kernels,
  * which is not malloc'd in this deployment. Instead, this cuQuantum backend mallocs and uses
  * two dedicated arrays of 'cuAmp' complex primitives; qureg.cuStateVec (CPU memory) and
- * qureg.deviceCuStateVec (GPU memory)
+ * qureg.deviceCuStateVec (GPU memory). Note that some API functions are implemented directly
+ * in QuEST_gpu_common.cu, in a way agnostic to cuQuantum vs bespoke kernels.
  *
  * @author Tyson Jones
  */
@@ -49,10 +50,29 @@
 # define CU_AMP_IN_MATRIX_PREC void // invalid
 #endif
 
-// convenient operator overloads for cuAmp, for doing complex artihmetic
-cuAmp operator - (const cuAmp& a) {
+// convenient operator overloads for cuAmp, for doing complex arithmetic.
+// some of these are defined to be used by Thrust's backend, because we
+// avoided Thrust's complex<qreal> (see QuEST_precision.h for explanation).
+// notice we are manually performing the arithmetic using qreals and
+// re-packing the result into TO_CU_AMP, rather than using cuComplex.h's
+// functions like cuCadd(). This is because such functions are precision
+// specific (grr) and do exactly the same thing themselves!
+__host__ __device__ inline cuAmp operator - (const cuAmp& a) {
     return TO_CU_AMP(-cuAmpReal(a), -cuAmpImag(a));
 }
+__host__ __device__ inline cuAmp operator * (const cuAmp& a, const std::size_t n) {
+    return TO_CU_AMP(n*cuAmpReal(a), n*cuAmpImag(a));
+}
+__host__ __device__ inline cuAmp operator + (const cuAmp& a, const cuAmp& b) {
+    return TO_CU_AMP(cuAmpReal(a) + cuAmpReal(b), cuAmpImag(a) + cuAmpImag(b));
+}
+__host__ __device__ inline cuAmp operator * (const cuAmp& a, const cuAmp& b) {
+    qreal aRe = cuAmpReal(a);
+    qreal aIm = cuAmpImag(a);
+    qreal bRe = cuAmpReal(b);
+    qreal bIm = cuAmpImag(b);
+    return TO_CU_AMP(aRe*bRe - aIm*bIm, aRe*bIm + aIm*bRe);
+}
 
 // convert user-facing Complex to cuQuantum-facing cuAmp
 cuAmp toCuAmp(Complex c) {
@@ -96,12 +116,6 @@ std::vector<int> getIndsFromMask(long long int mask, int numBits) {
 
 
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-
 /*
  * CUQUANTUM WRAPPERS (to reduce boilerplate)
  */
@@ -130,6 +144,46 @@ void custatevec_applyMatrix(Qureg qureg, std::vector<int> ctrls, std::vector<int> targs, cuMatr matr) {
 
 
 
+/*
+ * THRUST TEMPLATES
+ */
+
+struct functor_setWeightedQureg
+{
+    const cuAmp fac1;
+    const cuAmp fac2;
+    const cuAmp facOut;
+
+    functor_setWeightedQureg(cuAmp fac1, cuAmp fac2, cuAmp facOut) :
+        fac1(fac1), fac2(fac2), facOut(facOut) {}
+
+    template <typename Tuple> __host__ __device__ void operator()(Tuple t) {
+        thrust::get<2>(t) = facOut*thrust::get<2>(t) + fac1*thrust::get<0>(t) + fac2*thrust::get<1>(t);
+    }
+};
+
+thrust::device_ptr<cuAmp> getStartPtr(Qureg qureg) {
+
+    return thrust::device_pointer_cast(qureg.deviceCuStateVec);
+}
+
+thrust::device_ptr<cuAmp> getEndPtr(Qureg qureg) {
+
+    return getStartPtr(qureg) + qureg.numAmpsTotal;
+}
+
+
+
+/*
+ * Start C-linkage, so that below functions may only use C signatures (not C++)
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
 /*
  * ENVIRONMENT MANAGEMENT
  */
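The std::size_t overload evidently serves thrust::sequence (used by statevec_initDebugState below, which forms amp[n] = init + n*step); the others serve the weighted-sum functor. As a standalone sketch of that zip-iterator idiom, assuming double precision (so cuAmp corresponds to cuDoubleComplex); every identifier below is an illustrative stand-in, not a QuEST name:

    #include <thrust/device_vector.h>
    #include <thrust/for_each.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/tuple.h>
    #include <cuComplex.h>

    typedef cuDoubleComplex Amp;
    #define TO_AMP(re, im) make_cuDoubleComplex(re, im)

    // like the patch, overload arithmetic manually rather than via cuCadd()/cuCmul()
    __host__ __device__ Amp operator + (const Amp& a, const Amp& b) {
        return TO_AMP(cuCreal(a) + cuCreal(b), cuCimag(a) + cuCimag(b));
    }
    __host__ __device__ Amp operator * (const Amp& a, const Amp& b) {
        return TO_AMP(cuCreal(a)*cuCreal(b) - cuCimag(a)*cuCimag(b),
                      cuCreal(a)*cuCimag(b) + cuCimag(a)*cuCreal(b));
    }

    struct weightedSum {
        Amp f1, f2, fOut;
        weightedSum(Amp f1, Amp f2, Amp fOut) : f1(f1), f2(f2), fOut(fOut) {}

        // t = (amp of vector a, amp of vector b, amp of out)
        template <typename Tuple> __host__ __device__ void operator()(Tuple t) {
            thrust::get<2>(t) = fOut*thrust::get<2>(t) + f1*thrust::get<0>(t) + f2*thrust::get<1>(t);
        }
    };

    int main() {
        thrust::device_vector<Amp> a(8, TO_AMP(1,0)), b(8, TO_AMP(0,1)), out(8, TO_AMP(0,0));

        // out = 2a + ib, computed in one fused pass over all three vectors
        thrust::for_each(
            thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), out.begin())),
            thrust::make_zip_iterator(thrust::make_tuple(a.end(),   b.end(),   out.end())),
            weightedSum(TO_AMP(2,0), TO_AMP(0,1), TO_AMP(0,0)));
        return 0;
    }

The fused pass is the point of the functor: reading three vectors and writing one in a single kernel, rather than launching one kernel per arithmetic term.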
@@ -328,53 +382,84 @@ qreal statevec_getImagAmp(Qureg qureg, long long int index)
 
 
 
 /*
  * STATE INITIALISATION
  */
 
-void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
-{
-}
-
 void densmatr_initPlusState(Qureg qureg)
 {
+    qreal val = 1./(1LL << qureg.numQubitsRepresented);
+    thrust::fill(getStartPtr(qureg), getEndPtr(qureg), TO_CU_AMP(val, 0));
 }
 
-void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
+void statevec_initZeroState(Qureg qureg)
 {
+    custatevecInitializeStateVector(
+        qureg.cuQuantumHandle,
+        qureg.deviceCuStateVec,
+        CU_AMP_IN_STATE_PREC,
+        qureg.numQubitsInStateVec,
+        CUSTATEVEC_STATE_VECTOR_TYPE_ZERO);
 }
 
 void statevec_initBlankState(Qureg qureg)
 {
-}
+    // init to |0> = {1, 0, 0, ...}
+    statevec_initZeroState(qureg);
 
-void statevec_initZeroState(Qureg qureg)
-{
+    // overwrite amps[0] to 0
+    qreal re[] = {0};
+    qreal im[] = {0};
+    statevec_setAmps(qureg, 0, re, im, 1);
 }
 
 void statevec_initPlusState(Qureg qureg)
 {
+    custatevecInitializeStateVector(
+        qureg.cuQuantumHandle,
+        qureg.deviceCuStateVec,
+        CU_AMP_IN_STATE_PREC,
+        qureg.numQubitsInStateVec,
+        CUSTATEVEC_STATE_VECTOR_TYPE_UNIFORM);
 }
 
 void statevec_initClassicalState(Qureg qureg, long long int stateInd)
 {
-}
+    // init to |null> = {0, 0, 0, ...}
+    statevec_initBlankState(qureg);
 
-void statevec_initDebugState(Qureg qureg)
-{
+    // overwrite amps[stateInd] to 1
+    qreal re[] = {1};
+    qreal im[] = {0};
+    statevec_setAmps(qureg, stateInd, re, im, 1);
 }
 
-void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome)
+void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
 {
-}
+    // init to |null> = {0, 0, 0, ...}
+    statevec_initBlankState(qureg);
 
-int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env)
-{
-    return -1;
+    // index of the desired state in the flat density matrix
+    long long int densityDim = 1LL << qureg.numQubitsRepresented;
+    long long int densityInd = (densityDim + 1)*stateInd;
+
+    // overwrite amps[densityInd] to 1
+    qreal re[] = {1};
+    qreal im[] = {0};
+    statevec_setAmps(qureg, densityInd, re, im, 1);
 }
 
-void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil)
+void statevec_initDebugState(Qureg qureg)
 {
+    // |n> -> (.2n + (.2n+.1)i) |n>
+    cuAmp init = TO_CU_AMP(0, .1);
+    cuAmp step = TO_CU_AMP(.2, .2);
+    thrust::sequence(getStartPtr(qureg), getEndPtr(qureg), init, step);
 }
 
 void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out)
 {
+    // out -> facOut out + fac1 qureg1 + fac2 qureg2
+    thrust::for_each(
+        thrust::make_zip_iterator(thrust::make_tuple(getStartPtr(qureg1), getStartPtr(qureg2), getStartPtr(out))),
+        thrust::make_zip_iterator(thrust::make_tuple(getEndPtr(qureg1), getEndPtr(qureg2), getEndPtr(out))),
+        functor_setWeightedQureg(toCuAmp(fac1), toCuAmp(fac2), toCuAmp(facOut)));
 }
 
 
@@ -387,11 +472,6 @@
 void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank) {
 }
 
-int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision)
-{
-    return -1;
-}
-
 
 
 /*
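The (densityDim + 1)*stateInd arithmetic in densmatr_initClassicalState relies on the density matrix being stored as one flat vector with |n> = |c>|r>, i.e. n = c*densityDim + r, so the diagonal entry |s><s| lands at (densityDim + 1)*s. A quick host-only check of that index math (illustrative code, not part of the patch):

    #include <stdio.h>

    int main() {
        int numQubits = 2;
        long long densityDim = 1LL << numQubits;          // a 4x4 density matrix
        for (long long s = 0; s < densityDim; s++) {
            long long densityInd = (densityDim + 1) * s;  // 0, 5, 10, 15
            long long r = densityInd % densityDim;        // recover |c>|r>
            long long c = densityInd / densityDim;
            printf("|%lld><%lld| -> flat %lld (r=%lld, c=%lld)\n", s, s, densityInd, r, c);
        }
        return 0;
    }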
diff --git a/QuEST/src/GPU/QuEST_gpu.cu b/QuEST/src/GPU/QuEST_gpu.cu
index b253336f..a44219ae 100755
--- a/QuEST/src/GPU/QuEST_gpu.cu
+++ b/QuEST/src/GPU/QuEST_gpu.cu
@@ -207,36 +207,6 @@ void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) {
         cudaMemcpyDeviceToDevice);
 }
 
-__global__ void densmatr_initPureStateKernel(
-    long long int numPureAmps,
-    qreal *targetVecReal, qreal *targetVecImag,
-    qreal *copyVecReal, qreal *copyVecImag)
-{
-    // this is a particular index of the pure copyQureg
-    long long int index = blockIdx.x*blockDim.x + threadIdx.x;
-    if (index>=numPureAmps) return;
-
-    qreal realRow = copyVecReal[index];
-    qreal imagRow = copyVecImag[index];
-    for (long long int col=0; col < numPureAmps; col++) {
-        qreal realCol =   copyVecReal[col];
-        qreal imagCol = - copyVecImag[col]; // minus for conjugation
-        targetVecReal[col*numPureAmps + index] = realRow*realCol - imagRow*imagCol;
-        targetVecImag[col*numPureAmps + index] = realRow*imagCol + imagRow*realCol;
-    }
-}
-
-void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
-{
-    int threadsPerCUDABlock, CUDABlocks;
-    threadsPerCUDABlock = 128;
-    CUDABlocks = ceil((qreal)(copyQureg.numAmpsPerChunk)/threadsPerCUDABlock);
-    densmatr_initPureStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
-        copyQureg.numAmpsPerChunk,
-        targetQureg.deviceStateVec.real, targetQureg.deviceStateVec.imag,
-        copyQureg.deviceStateVec.real,   copyQureg.deviceStateVec.imag);
-}
-
 __global__ void densmatr_initPlusStateKernel(long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag){
     long long int index;
 
@@ -3793,7 +3763,6 @@ __global__ void statevec_applyParamNamedPhaseFuncOverridesKernel(
         }
     }
 
-
     // negate phase to conjugate operator
     if (conj)
         phase *= -1;
@@ -3865,81 +3834,6 @@ void statevec_applyParamNamedPhaseFuncOverrides(
     cudaFree(d_params);
 }
 
-__global__ void densmatr_setQuregToPauliHamilKernel(
-    Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms
-) {
-    long long int n = blockIdx.x*blockDim.x + threadIdx.x;
-    if (n>=qureg.numAmpsPerChunk) return;
-
-    // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j]
-    const int pauliRealElems[] = { 1,0, 0,1,   0,1, 1,0,   0,0,  0,0,   1,0, 0,-1 };
-    const int pauliImagElems[] = { 0,0, 0,0,   0,0, 0,0,   0,-1, 1,0,   0,0, 0,0  };
-
-    // |n> = |c>|r>
-    const int numQubits = qureg.numQubitsRepresented;
-    const long long int r = n & ((1LL << numQubits) - 1);
-    const long long int c = n >> numQubits;
-
-    // new amplitude of |n>
-    qreal elemRe = 0;
-    qreal elemIm = 0;
-
-    for (long long int t=0; t<numSumTerms; t++) {
-
-        // prepare kronecker product of the term's paulis
-        int kronRe = 1;
-        int kronIm = 0;
-        int pInd = t * numQubits;
-
-        for (int q=0; q<numQubits; q++) {
-
-            // get element (i,j) of the q-th pauli
-            int i = (r >> q) & 1;
-            int j = (c >> q) & 1;
-            int p = (int) pauliCodes[pInd++];
-            int k = (p<<2) + (i<<1) + j;
-            int pauliRe = pauliRealElems[k];
-            int pauliIm = pauliImagElems[k];
-
-            // kron *= pauli
-            int tmp = (pauliRe*kronRe) - (pauliIm*kronIm);
-            kronIm = (pauliRe*kronIm) + (pauliIm*kronRe);
-            kronRe = tmp;
-        }
-
-        // elem = sum_t coeffs[t] pauliKronecker[r][c]
-        elemRe += termCoeffs[t] * kronRe;
-        elemIm += termCoeffs[t] * kronIm;
-    }
-
-    // overwrite the density matrix entry
-    qureg.deviceStateVec.real[n] = elemRe;
-    qureg.deviceStateVec.imag[n] = elemIm;
-}
-
-void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) {
-
-    // copy hamil into GPU memory
-    enum pauliOpType* d_pauliCodes;
-    size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes;
-    cudaMalloc(&d_pauliCodes, mem_pauliCodes);
-    cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice);
-
-    qreal* d_termCoeffs;
-    size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs;
-    cudaMalloc(&d_termCoeffs, mem_termCoeffs);
-    cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);
-
-    int numThreadsPerBlock = 128;
-    int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock);
-    densmatr_setQuregToPauliHamilKernel<<<numBlocks, numThreadsPerBlock>>>(
-        qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);
-
-    // free tmp GPU memory
-    cudaFree(d_pauliCodes);
-    cudaFree(d_termCoeffs);
-}
-
 
 
 #ifdef __cplusplus
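For reference, the relocated kernel computes the conjugate outer product rho[row][col] = psi[row] * conj(psi[col]), writing entry (row, col) to flat index col*numPureAmps + row. A minimal host-side mirror of that arithmetic, under illustrative names:

    #include <stdio.h>

    int main() {
        // |psi> = (|0> + i|1>) / sqrt(2)
        double re[] = { 0.70710678, 0.0 };
        double im[] = { 0.0,        0.70710678 };
        int dim = 2;

        for (int row = 0; row < dim; row++)
            for (int col = 0; col < dim; col++) {
                // (a+bi)(c-di) = (ac+bd) + (bc-ad)i, the minus conjugating psi[col]
                double rhoRe = re[row]*re[col] + im[row]*im[col];
                double rhoIm = im[row]*re[col] - re[row]*im[col];
                printf("rho[%d][%d] (flat %d) = %g %+gi\n",
                        row, col, col*dim + row, rhoRe, rhoIm);
            }
        return 0;   // prints the expected [[.5, -.5i], [.5i, .5]]
    }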
diff --git a/QuEST/src/GPU/QuEST_gpu_common.cu b/QuEST/src/GPU/QuEST_gpu_common.cu
index f07b105a..0426a1c4 100644
--- a/QuEST/src/GPU/QuEST_gpu_common.cu
+++ b/QuEST/src/GPU/QuEST_gpu_common.cu
@@ -30,6 +30,49 @@ extern "C" {
 
 
 
+/*
+ * GPU CONFIG
+ */
+
+const int NUM_THREADS_PER_BLOCK = 128;
+
+
+
+/*
+ * BACKEND AGNOSTIC COMPLEX ARITHMETIC
+ */
+
+#define PROD_REAL(aRe, aIm, bRe, bIm) \
+    ((aRe)*(bRe) - (aIm)*(bIm))
+#define PROD_IMAG(aRe, aIm, bRe, bIm) \
+    ((aRe)*(bIm) + (aIm)*(bRe))
+
+#ifdef USE_CUQUANTUM
+    #define GET_AMP(aRe, aIm, qureg, i) \
+        aRe = qureg.deviceCuStateVec[i].x; \
+        aIm = qureg.deviceCuStateVec[i].y;
+#else
+    #define GET_AMP(aRe, aIm, qureg, i) \
+        aRe = qureg.deviceStateVec.real[i]; \
+        aIm = qureg.deviceStateVec.imag[i];
+#endif
+
+#ifdef USE_CUQUANTUM
+    #define SET_AMP(qureg, i, aRe, aIm) \
+        qureg.deviceCuStateVec[i].x = aRe; \
+        qureg.deviceCuStateVec[i].y = aIm;
+#else
+    #define SET_AMP(qureg, i, aRe, aIm) \
+        qureg.deviceStateVec.real[i] = aRe; \
+        qureg.deviceStateVec.imag[i] = aIm;
+#endif
+
+
+
+/*
+ * ENVIRONMENT MANAGEMENT
+ */
+
 int GPUExists(void) {
     int deviceCount, device;
     int gpuDeviceCount = 0;
@@ -110,9 +153,116 @@ void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) {
 
 
 
+/*
+ * STATE INITIALISATION
+ */
+
+__global__ void densmatr_initPureStateKernel(Qureg dens, Qureg pure)
+{
+    // this is a particular index of the pure state
+    long long int pureInd = blockIdx.x*blockDim.x + threadIdx.x;
+    if (pureInd >= pure.numAmpsPerChunk) return;
+
+    qreal aRe, aIm;
+    GET_AMP( aRe, aIm, pure, pureInd );
+
+    for (long long int col=0; col<pure.numAmpsPerChunk; col++) {
+
+        qreal bRe, bIm;
+        GET_AMP( bRe, bIm, pure, col );
+        bIm *= -1; // conjugate
+
+        qreal cRe = PROD_REAL( aRe, aIm, bRe, bIm );
+        qreal cIm = PROD_IMAG( aRe, aIm, bRe, bIm );
+
+        SET_AMP( dens, col*pure.numAmpsPerChunk + pureInd, cRe, cIm );
+    }
+}
+
+void densmatr_initPureState(Qureg dens, Qureg pure)
+{
+    int numBlocks = ceil(pure.numAmpsPerChunk / (qreal) NUM_THREADS_PER_BLOCK);
+    densmatr_initPureStateKernel<<<numBlocks, NUM_THREADS_PER_BLOCK>>>(dens, pure);
+}
+
+__global__ void densmatr_setQuregToPauliHamilKernel(
+    Qureg qureg, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms
+) {
+    long long int n = blockIdx.x*blockDim.x + threadIdx.x;
+    if (n>=qureg.numAmpsPerChunk) return;
+
+    // flattened {I,X,Y,Z} matrix elements, where [k] = [p][i][j]
+    const int pauliRealElems[] = { 1,0, 0,1,   0,1, 1,0,   0,0,  0,0,   1,0, 0,-1 };
+    const int pauliImagElems[] = { 0,0, 0,0,   0,0, 0,0,   0,-1, 1,0,   0,0, 0,0  };
+
+    // |n> = |c>|r>
+    const int numQubits = qureg.numQubitsRepresented;
+    const long long int r = n & ((1LL << numQubits) - 1);
+    const long long int c = n >> numQubits;
+
+    // new amplitude of |n>
+    qreal elemRe = 0;
+    qreal elemIm = 0;
+
+    for (long long int t=0; t<numSumTerms; t++) {
+
+        // prepare kronecker product of the term's paulis
+        int kronRe = 1;
+        int kronIm = 0;
+        int pInd = t * numQubits;
+
+        for (int q=0; q<numQubits; q++) {
+
+            // get element (i,j) of the q-th pauli
+            int i = (r >> q) & 1;
+            int j = (c >> q) & 1;
+            int p = (int) pauliCodes[pInd++];
+            int k = (p<<2) + (i<<1) + j;
+            int pauliRe = pauliRealElems[k];
+            int pauliIm = pauliImagElems[k];
+
+            // kron *= pauli
+            int tmp = (pauliRe*kronRe) - (pauliIm*kronIm);
+            kronIm = (pauliRe*kronIm) + (pauliIm*kronRe);
+            kronRe = tmp;
+        }
+
+        // elem = sum_t coeffs[t] pauliKronecker[r][c]
+        elemRe += termCoeffs[t] * kronRe;
+        elemIm += termCoeffs[t] * kronIm;
+    }
+
+    // overwrite the density matrix entry
+    SET_AMP( qureg, n, elemRe, elemIm );
+}
+
+void densmatr_setQuregToPauliHamil(Qureg qureg, PauliHamil hamil) {
+
+    // copy hamil into GPU memory
+    enum pauliOpType* d_pauliCodes;
+    size_t mem_pauliCodes = hamil.numSumTerms * hamil.numQubits * sizeof *d_pauliCodes;
+    cudaMalloc(&d_pauliCodes, mem_pauliCodes);
+    cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice);
+
+    qreal* d_termCoeffs;
+    size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs;
+    cudaMalloc(&d_termCoeffs, mem_termCoeffs);
+    cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);
+
+    int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) NUM_THREADS_PER_BLOCK);
+    densmatr_setQuregToPauliHamilKernel<<<numBlocks, NUM_THREADS_PER_BLOCK>>>(
+        qureg, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);
+
+    // free tmp GPU memory
+    cudaFree(d_pauliCodes);
+    cudaFree(d_termCoeffs);
+}
+
+
+
+/*
+ * MANAGING NON-QUREG TYPES
+ */
+
 DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) {
 
@@ -125,7 +275,6 @@ DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) {
     // allocate CPU memory (initialised to zero)
     op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
     op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
-    // @TODO no handling of rank>1 allocation (no distributed GPU)
 
     // check cpu memory allocation was successful
     validateDiagonalOpAllocation(&op, env, __func__);
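The (p<<2) + (i<<1) + j indexing above treats each lookup table as four row-major 2x2 matrices, one per Pauli code p = 0,1,2,3 for I,X,Y,Z. A host-only sanity check that the tables reproduce the Pauli matrices (illustrative, not part of the patch):

    #include <stdio.h>

    int main() {
        const int pauliRealElems[] = { 1,0, 0,1,  0,1, 1,0,  0,0,  0,0,  1,0, 0,-1 };
        const int pauliImagElems[] = { 0,0, 0,0,  0,0, 0,0,  0,-1, 1,0,  0,0, 0,0  };
        const char names[] = "IXYZ";

        for (int p = 0; p < 4; p++) {
            printf("%c = ", names[p]);
            for (int i = 0; i < 2; i++)
                for (int j = 0; j < 2; j++) {
                    int k = (p<<2) + (i<<1) + j;    // flat index of element (i,j)
                    printf("(%d%+di) ", pauliRealElems[k], pauliImagElems[k]);
                }
            printf("\n");   // e.g. Y prints (0+0i) (0-1i) (0+1i) (0+0i)
        }
        return 0;
    }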
@@ -201,9 +350,8 @@ void agnostic_initDiagonalOpFromPauliHamil(DiagonalOp op, PauliHamil hamil) {
     cudaMalloc(&d_termCoeffs, mem_termCoeffs);
     cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);
 
-    int numThreadsPerBlock = 128;
-    int numBlocks = ceil(op.numElemsPerChunk / (qreal) numThreadsPerBlock);
-    agnostic_initDiagonalOpFromPauliHamilKernel<<<numBlocks, numThreadsPerBlock>>>(
+    int numBlocks = ceil(op.numElemsPerChunk / (qreal) NUM_THREADS_PER_BLOCK);
+    agnostic_initDiagonalOpFromPauliHamilKernel<<<numBlocks, NUM_THREADS_PER_BLOCK>>>(
         op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);
 
     // copy populated operator into to RAM
diff --git a/QuEST/src/GPU/QuEST_gpu_common.h b/QuEST/src/GPU/QuEST_gpu_common.h
index d2b58df9..cf6998cb 100644
--- a/QuEST/src/GPU/QuEST_gpu_common.h
+++ b/QuEST/src/GPU/QuEST_gpu_common.h
@@ -1,3 +1,11 @@
+// Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details
+
+/** @file
+ * GPU backend routines which are invoked by both the cuQuantum and bespoke-kernel backends.
+ * This excludes backend-agnostic API implementations which do not need a header declaration.
+ *
+ * @author Tyson Jones
+ */
 
 # ifndef QUEST_GPU_COMMON_H
 # define QUEST_GPU_COMMON_H
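The payoff of GET_AMP / SET_AMP is that one kernel body compiles against either amplitude layout: cuQuantum's interleaved cuAmp array, or the bespoke backend's separate real/imag arrays. A compilable sketch of that pattern, using a stand-in ToyQureg rather than QuEST's Qureg (build with nvcc, optionally with -DUSE_CUQUANTUM):

    #include <cuComplex.h>

    typedef double qreal;

    struct ToyComplexArray { qreal *real; qreal *imag; };

    struct ToyQureg {
    #ifdef USE_CUQUANTUM
        cuDoubleComplex *deviceCuStateVec;   // interleaved (re, im) pairs
    #else
        ToyComplexArray deviceStateVec;      // separate real/imag arrays
    #endif
        long long int numAmpsPerChunk;
    };

    #ifdef USE_CUQUANTUM
        #define GET_AMP(aRe, aIm, q, i) \
            aRe = q.deviceCuStateVec[i].x; \
            aIm = q.deviceCuStateVec[i].y;
        #define SET_AMP(q, i, aRe, aIm) \
            q.deviceCuStateVec[i].x = aRe; \
            q.deviceCuStateVec[i].y = aIm;
    #else
        #define GET_AMP(aRe, aIm, q, i) \
            aRe = q.deviceStateVec.real[i]; \
            aIm = q.deviceStateVec.imag[i];
        #define SET_AMP(q, i, aRe, aIm) \
            q.deviceStateVec.real[i] = aRe; \
            q.deviceStateVec.imag[i] = aIm;
    #endif

    // one kernel body, valid for either backend: conjugates every amplitude
    __global__ void conjugateAllAmpsKernel(ToyQureg q) {
        long long int n = blockIdx.x*blockDim.x + threadIdx.x;
        if (n >= q.numAmpsPerChunk) return;

        qreal re, im;
        GET_AMP( re, im, q, n );
        SET_AMP( q, n, re, -im );
    }

    int main() {
        ToyQureg q;
        q.numAmpsPerChunk = 8;
    #ifdef USE_CUQUANTUM
        cudaMalloc(&q.deviceCuStateVec, q.numAmpsPerChunk * sizeof *q.deviceCuStateVec);
    #else
        cudaMalloc(&q.deviceStateVec.real, q.numAmpsPerChunk * sizeof(qreal));
        cudaMalloc(&q.deviceStateVec.imag, q.numAmpsPerChunk * sizeof(qreal));
    #endif
        conjugateAllAmpsKernel<<<1, 128>>>(q);
        return cudaDeviceSynchronize() != cudaSuccess;
    }

This is the same design choice the patch makes for densmatr_initPureState and densmatr_setQuregToPauliHamil above: the kernels were previously duplicated per backend, and now live once in QuEST_gpu_common.cu behind the macros.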