Add templates for double and float types
ellaellela committed Aug 13, 2019
1 parent 79c2afa commit 5c95eaa
Showing 41 changed files with 2,410 additions and 724 deletions.
98 changes: 77 additions & 21 deletions cuSten/src/kernels/2d_x_np_fun_kernel.cu
@@ -37,14 +37,18 @@
// Function pointer definition
// ---------------------------------------------------------------------

/*! typedef double (*devArg1X)(double*, double*, int);
/*! typedef elemType (*devArg1X)(elemType*, elemType*, int);
\brief The function pointer containing the user-defined function to be applied <br>
Input 1: The pointer to input data to the function <br>
Input 2: The pointer to the coefficients provided by the user <br>
Input 3: The current index position (centre of the stencil to be applied)
*/

typedef double (*devArg1X)(double*, double*, int);
template <typename elemType>
struct templateFunc
{
typedef elemType (*devArg1X)(elemType*, elemType*, int);
};
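
For context, a user device function compatible with this pointer type could look like the following minimal sketch; the name userStencil and the three-point weighting are hypothetical, not part of this commit. Its address travels into the kernel as the elemType* func argument and is recovered through the nested typedef, which exists because a dependent function-pointer type needs a named home inside a template.

// Hypothetical user function matching devArg1X: data points into the
// shared-memory tile, coe holds the user-supplied weights, and pos is
// the centre index of the stencil.
template <typename elemType>
__device__ elemType userStencil(elemType* data, elemType* coe, int pos)
{
	return coe[0] * data[pos - 1] + coe[1] * data[pos] + coe[2] * data[pos + 1];
}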

// ---------------------------------------------------------------------
// Kernel Definition
@@ -65,12 +69,13 @@ typedef double (*devArg1X)(double*, double*, int);
\param nx Total number of points in the x direction
*/

template <typename elemType>
__global__ void kernel2DXnpFun
(
double* dataOutput,
double* dataInput,
double* coe,
double* func,
elemType* dataOutput,
elemType* dataInput,
elemType* coe,
elemType* func,
const int numStenLeft,
const int numStenRight,
const int numCoe,
@@ -83,8 +88,8 @@ __global__ void kernel2DXnpFun
// Allocate the shared memory
extern __shared__ int memory[];

double* arrayLocal = (double*)&memory;
double* coeLocal = (double*)&arrayLocal[nxLocal * nyLocal];
elemType* arrayLocal = (elemType*)&memory;
elemType* coeLocal = (elemType*)&arrayLocal[nxLocal * nyLocal];

// Move the weights into shared memory
#pragma unroll
@@ -94,7 +99,7 @@
}

// True matrix index
int globalIdx = blockDim.x * blockIdx.x + threadIdx.x;
int globalIdy = blockDim.y * blockIdx.y + threadIdx.y;

// Local matrix index
@@ -125,7 +130,7 @@

stenSet = localIdy * nxLocal + localIdx;

dataOutput[globalIdy * nx + globalIdx] = ((devArg1X)func)(arrayLocal, coeLocal, stenSet);
dataOutput[globalIdy * nx + globalIdx] = ((typename templateFunc<elemType>::devArg1X)func)(arrayLocal, coeLocal, stenSet);
}

// Set all left boundary blocks
@@ -145,7 +150,7 @@
{
stenSet = localIdy * nxLocal + threadIdx.x;

dataOutput[globalIdy * nx + globalIdx] = ((devArg1X)func)(arrayLocal, coeLocal, stenSet);
dataOutput[globalIdy * nx + globalIdx] = ((typename templateFunc<elemType>::devArg1X)func)(arrayLocal, coeLocal, stenSet);
}
}

@@ -166,7 +171,7 @@
{
stenSet = localIdy * nxLocal + localIdx;

dataOutput[globalIdy * nx + globalIdx] = ((devArg1X)func)(arrayLocal, coeLocal, stenSet);
dataOutput[globalIdy * nx + globalIdx] = ((typename templateFunc<elemType>::devArg1X)func)(arrayLocal, coeLocal, stenSet);
}
}
}
@@ -181,9 +186,10 @@ __global__ void kernel2DXnpFun
\param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU
*/

template <typename elemType>
void cuStenCompute2DXnpFun
(
cuSten_t* pt_cuSten,
cuSten_t<elemType>* pt_cuSten,
bool offload
)
{
@@ -199,14 +205,14 @@ void cuStenCompute2DXnpFun
dim3 gridDim(pt_cuSten->xGrid, pt_cuSten->yGrid);

// Load the weights
cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numCoe * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numCoe * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]);

// Preload the first block
cudaStreamSynchronize(pt_cuSten->streams[1]);

// Prefetch the tile data
cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]);

// Record the event
cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]);
@@ -246,8 +252,8 @@ void cuStenCompute2DXnpFun
// Offload should the user want to
if (offload == 1)
{
cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]);
cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]);
cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]);
cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]);
}

// Load the next tile
@@ -257,8 +263,8 @@ void cuStenCompute2DXnpFun
cudaStreamSynchronize(pt_cuSten->streams[1]);

// Prefetch the necessary tiles
cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]);

// Record the event
cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]);
@@ -277,6 +283,56 @@ void cuStenCompute2DXnpFun
}
}

// ---------------------------------------------------------------------
// Explicit instantiation
// ---------------------------------------------------------------------
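// Note: the templates above are defined in this .cu file rather than in
// a header, so concrete double and float versions are generated here
// explicitly; without them the linker would have no symbols to resolve.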

template
__global__ void kernel2DXnpFun<double>
(
double*,
double*,
double*,
double*,
const int,
const int,
const int,
const int,
const int,
const int,
const int
);

template
void cuStenCompute2DXnpFun<double>
(
cuSten_t<double>*,
bool
);

template
__global__ void kernel2DXnpFun<float>
(
float*,
float*,
float*,
float*,
const int,
const int,
const int,
const int,
const int,
const int,
const int
);

template
void cuStenCompute2DXnpFun<float>
(
cuSten_t<float>*,
bool
);

// ---------------------------------------------------------------------
// End of file
// ---------------------------------------------------------------------
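
As a rough illustration of a call site after this change (not taken from the commit; the handle setup is elided and true stands in for the library's HOST flag, assumed to equal 1):

cuSten_t<float> stencil;
// ... populate stencil via the matching create routine (not shown) ...
cuStenCompute2DXnpFun<float>(&stencil, true); // offload: prefetch results back to the CPU
cudaDeviceSynchronize();                      // wait for the final tile to finish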
83 changes: 66 additions & 17 deletions cuSten/src/kernels/2d_x_np_kernel.cu
@@ -52,11 +52,12 @@
\param nx Total number of points in the x direction
*/

template <typename elemType>
__global__ void kernel2DXnp
(
double* dataOutput,
double* dataInput,
const double* weights,
elemType* dataOutput,
elemType* dataInput,
const elemType* weights,
const int numSten,
const int numStenLeft,
const int numStenRight,
@@ -69,8 +70,8 @@ __global__ void kernel2DXnp
// Allocate the shared memory
extern __shared__ int memory[];

double* arrayLocal = (double*)&memory;
double* weigthsLocal = (double*)&arrayLocal[nxLocal * nyLocal];
elemType* arrayLocal = (elemType*)&memory;
elemType* weigthsLocal = (elemType*)&arrayLocal[nxLocal * nyLocal];

// Move the weights into shared memory
#pragma unroll
@@ -80,15 +81,15 @@ __global__ void kernel2DXnp
}

// True matrix index
int globalIdx = blockDim.x * blockIdx.x + threadIdx.x;
int globalIdy = blockDim.y * blockIdx.y + threadIdx.y;

// Local matrix index
int localIdx = threadIdx.x + numStenLeft;
int localIdy = threadIdx.y;

// Local sum variable
double sum = 0.0;
elemType sum = 0.0;

// Set index for summing stencil
int stenSet;
@@ -187,10 +188,10 @@ __global__ void kernel2DXnp
\param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU
*/

template <typename elemType>
void cuStenCompute2DXnp
(
cuSten_t* pt_cuSten,

cuSten_t<elemType>* pt_cuSten,
bool offload
)
{
@@ -210,14 +211,14 @@ void cuStenCompute2DXnp
int local_ny = pt_cuSten->BLOCK_Y;

// Load the weights
cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]);

// Preload the first block
cudaStreamSynchronize(pt_cuSten->streams[1]);

// Prefetch the tile data
cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]);

// Record the event
cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]);
@@ -238,8 +239,8 @@ void cuStenCompute2DXnp
// Offload should the user want to
if (offload == 1)
{
cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]);
cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]);
cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]);
cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]);
}

// Load the next tile
@@ -249,8 +250,8 @@ void cuStenCompute2DXnp
cudaStreamSynchronize(pt_cuSten->streams[1]);

// Prefetch the necessary tiles
cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]);
cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]);

// Record the event
cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]);
@@ -269,6 +270,54 @@ void cuStenCompute2DXnp
}
}

// ---------------------------------------------------------------------
// Explicit instantiation
// ---------------------------------------------------------------------

template
__global__ void kernel2DXnp<double>
(
double*,
double*,
const double*,
const int,
const int,
const int,
const int,
const int,
const int,
const int
);

template
void cuStenCompute2DXnp<double>
(
cuSten_t<double>*,
bool
);

template
__global__ void kernel2DXnp<float>
(
float*,
float*,
const float*,
const int,
const int,
const int,
const int,
const int,
const int,
const int
);

template
void cuStenCompute2DXnp<float>
(
cuSten_t<float>*,
bool
);

// ---------------------------------------------------------------------
// End of file
// ---------------------------------------------------------------------
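
One detail both kernels in this commit rely on: the dynamic shared-memory block is declared as extern __shared__ int memory[] and then re-partitioned as elemType, so the launch must budget bytes of the element type for the padded tile plus the weights. A sketch of that sizing, with hypothetical values (the library computes these internally; the function-pointer variant uses numCoe in place of numSten):

const int BLOCK_X = 32, BLOCK_Y = 8;                       // hypothetical block shape
const int numSten = 3, numStenLeft = 1, numStenRight = 1;  // hypothetical stencil extent
const int nxLocal = BLOCK_X + numStenLeft + numStenRight;  // tile width plus halo
const int nyLocal = BLOCK_Y;
size_t sharedBytes = (nxLocal * nyLocal + numSten) * sizeof(float);
// sharedBytes is then the third launch parameter:
// kernel2DXnp<float><<<gridDim, blockDim, sharedBytes, stream>>>(...)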