From 5c95eaa36ccd0863a464d58cb82ca305be10e5d2 Mon Sep 17 00:00:00 2001 From: Marin M Date: Wed, 8 May 2019 18:23:38 +0200 Subject: [PATCH] Add templates for double and float types --- cuSten/src/kernels/2d_x_np_fun_kernel.cu | 98 +++++-- cuSten/src/kernels/2d_x_np_kernel.cu | 83 ++++-- cuSten/src/kernels/2d_x_p_fun_kernel.cu | 95 +++++-- cuSten/src/kernels/2d_x_p_kernel.cu | 81 ++++-- cuSten/src/kernels/2d_xyADVWENO_p_kernel.cu | 215 ++++++++++---- cuSten/src/kernels/2d_xy_np_fun_kernel.cu | 145 +++++++--- cuSten/src/kernels/2d_xy_np_kernel.cu | 115 ++++++-- cuSten/src/kernels/2d_xy_p_fun_kernel.cu | 125 +++++++-- cuSten/src/kernels/2d_xy_p_kernel.cu | 111 ++++++-- cuSten/src/kernels/2d_y_np_fun_kernel.cu | 129 ++++++--- cuSten/src/kernels/2d_y_np_kernel.cu | 105 +++++-- cuSten/src/kernels/2d_y_p_fun_kernel.cu | 121 ++++++-- cuSten/src/kernels/2d_y_p_kernel.cu | 101 +++++-- cuSten/src/kernels/stencil_kernels.h | 52 ++-- cuSten/src/struct/cuSten_struct_functions.h | 263 ++++++++++-------- cuSten/src/struct/cuSten_struct_type.h | 28 +- .../struct/custenCreateDestroy2DXYADVWENOp.cu | 102 +++++-- .../src/struct/custenCreateDestroy2DXYnp.cu | 95 ++++++- .../struct/custenCreateDestroy2DXYnpFun.cu | 99 ++++++- cuSten/src/struct/custenCreateDestroy2DXYp.cu | 95 ++++++- .../src/struct/custenCreateDestroy2DXYpFun.cu | 99 ++++++- cuSten/src/struct/custenCreateDestroy2DXnp.cu | 86 +++++- .../src/struct/custenCreateDestroy2DXnpFun.cu | 91 +++++- cuSten/src/struct/custenCreateDestroy2DXp.cu | 85 +++++- .../src/struct/custenCreateDestroy2DXpFun.cu | 101 +++++-- cuSten/src/struct/custenCreateDestroy2DYnp.cu | 89 +++++- .../src/struct/custenCreateDestroy2DYnpFun.cu | 93 ++++++- cuSten/src/struct/custenCreateDestroy2DYp.cu | 89 +++++- .../src/struct/custenCreateDestroy2DYpFun.cu | 95 ++++++- examples/src/2d_x_np.cu | 4 +- examples/src/2d_x_np_fun.cu | 4 +- examples/src/2d_x_p.cu | 4 +- examples/src/2d_xyWENOADV_p.cu | 4 +- examples/src/2d_xy_np.cu | 4 +- examples/src/2d_xy_np_fun.cu | 4 +- examples/src/2d_xy_p.cu | 4 +- examples/src/2d_xy_p_fun.cu | 4 +- examples/src/2d_y_np.cu | 4 +- examples/src/2d_y_np_fun.cu | 4 +- examples/src/2d_y_p.cu | 4 +- examples/src/2d_y_p_fun.cu | 4 +- 41 files changed, 2410 insertions(+), 724 deletions(-) diff --git a/cuSten/src/kernels/2d_x_np_fun_kernel.cu b/cuSten/src/kernels/2d_x_np_fun_kernel.cu index 26939af..9439e61 100644 --- a/cuSten/src/kernels/2d_x_np_fun_kernel.cu +++ b/cuSten/src/kernels/2d_x_np_fun_kernel.cu @@ -37,14 +37,18 @@ // Function pointer definition // --------------------------------------------------------------------- -/*! typedef double (*devArg1X)(double*, double*, int); +/*! typedef elemType (*devArg1X)(elemType*, elemType*, int); \brief The function pointer containing the user defined function to be applied
Input 1: The pointer to input data to the function
Input 2: The pointer to the coefficients provided by the user
Input 3: The current index position (centre of the stencil to be applied) */ -typedef double (*devArg1X)(double*, double*, int); +template +struct templateFunc +{ + typedef elemType (*devArg1X)(elemType*, elemType*, int); +}; // --------------------------------------------------------------------- // Kernel Definition @@ -65,12 +69,13 @@ typedef double (*devArg1X)(double*, double*, int); \param nx Total number of points in the x direction */ +template __global__ void kernel2DXnpFun ( - double* dataOutput, - double* dataInput, - double* coe, - double* func, + elemType* dataOutput, + elemType* dataInput, + elemType* coe, + elemType* func, const int numStenLeft, const int numStenRight, const int numCoe, @@ -83,8 +88,8 @@ __global__ void kernel2DXnpFun // Allocate the shared memory extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* coeLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* coeLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -94,7 +99,7 @@ __global__ void kernel2DXnpFun } // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -125,7 +130,7 @@ __global__ void kernel2DXnpFun stenSet = localIdy * nxLocal + localIdx; - dataOutput[globalIdy * nx + globalIdx] = ((devArg1X)func)(arrayLocal, coeLocal, stenSet); + dataOutput[globalIdy * nx + globalIdx] = ((typename templateFunc::devArg1X)func)(arrayLocal, coeLocal, stenSet); } // Set all left boundary blocks @@ -145,7 +150,7 @@ __global__ void kernel2DXnpFun { stenSet = localIdy * nxLocal + threadIdx.x; - dataOutput[globalIdy * nx + globalIdx] = ((devArg1X)func)(arrayLocal, coeLocal, stenSet); + dataOutput[globalIdy * nx + globalIdx] = ((typename templateFunc::devArg1X)func)(arrayLocal, coeLocal, stenSet); } } @@ -166,7 +171,7 @@ __global__ void kernel2DXnpFun { stenSet = localIdy * nxLocal + localIdx; - dataOutput[globalIdy * nx + globalIdx] = ((devArg1X)func)(arrayLocal, coeLocal, stenSet); + dataOutput[globalIdy * nx + globalIdx] = ((typename templateFunc::devArg1X)func)(arrayLocal, coeLocal, stenSet); } } } @@ -181,9 +186,10 @@ __global__ void kernel2DXnpFun \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DXnpFun ( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, bool offload ) { @@ -199,14 +205,14 @@ void cuStenCompute2DXnpFun dim3 gridDim(pt_cuSten->xGrid, pt_cuSten->yGrid); // Load the weights - cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numCoe * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numCoe * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Preload the first block cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -246,8 +252,8 @@ void cuStenCompute2DXnpFun // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next tile @@ -257,8 +263,8 @@ void cuStenCompute2DXnpFun cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the necessary tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -277,6 +283,56 @@ void cuStenCompute2DXnpFun } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DXnpFun +( + double*, + double*, + double*, + double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXnpFun +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DXnpFun +( + float*, + float*, + float*, + float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXnpFun +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_x_np_kernel.cu b/cuSten/src/kernels/2d_x_np_kernel.cu index e94fdf3..f42750f 100644 --- a/cuSten/src/kernels/2d_x_np_kernel.cu +++ b/cuSten/src/kernels/2d_x_np_kernel.cu @@ -52,11 +52,12 @@ \param nx Total number of points in the x direction */ +template __global__ void kernel2DXnp ( - double* dataOutput, - double* dataInput, - const double* weights, + elemType* dataOutput, + elemType* dataInput, + const elemType* weights, const int numSten, const int numStenLeft, const int numStenRight, @@ -69,8 +70,8 @@ __global__ void kernel2DXnp // Allocate the shared memory extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* weigthsLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* weigthsLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -80,7 +81,7 @@ __global__ void kernel2DXnp } // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -88,7 +89,7 @@ __global__ void kernel2DXnp int localIdy = threadIdx.y; // Local sum variable - double sum = 0.0; + elemType sum = 0.0; // Set index for summing stencil int stenSet; @@ -187,10 +188,10 @@ __global__ void kernel2DXnp \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DXnp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -210,14 +211,14 @@ void cuStenCompute2DXnp int local_ny = pt_cuSten->BLOCK_Y; // Load the weights - cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Preload the first block cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -238,8 +239,8 @@ void cuStenCompute2DXnp // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next tile @@ -249,8 +250,8 @@ void cuStenCompute2DXnp cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the necessary tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -269,6 +270,54 @@ void cuStenCompute2DXnp } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DXnp +( + double*, + double*, + const double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXnp +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DXnp +( + float*, + float*, + const float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXnp +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_x_p_fun_kernel.cu b/cuSten/src/kernels/2d_x_p_fun_kernel.cu index da3e4d3..16c84ed 100644 --- a/cuSten/src/kernels/2d_x_p_fun_kernel.cu +++ b/cuSten/src/kernels/2d_x_p_fun_kernel.cu @@ -37,14 +37,18 @@ // Function pointer definition // --------------------------------------------------------------------- -/*! typedef double (*devArg1X)(double*, double*, int); +/*! typedef elemType (*devArg1X)(elemType*, elemType*, int); \brief The function pointer containing the user defined function to be applied
Input 1: The pointer to input data to the function
Input 2: The pointer to the coefficients provided by the user
Input 3: The current index position (centre of the stencil to be applied) */ -typedef double (*devArg1X)(double*, double*, int); +template +struct templateFunc +{ + typedef elemType (*devArg1X)(elemType*, elemType*, int); +}; // --------------------------------------------------------------------- // Kernel Definition @@ -65,12 +69,13 @@ typedef double (*devArg1X)(double*, double*, int); \param nx Total number of points in the x direction */ +template __global__ void kernel2DXpFun ( - double* dataOutput, - double* dataInput, - const double* coe, - const double* func, + elemType* dataOutput, + elemType* dataInput, + const elemType* coe, + const elemType* func, const int numStenLeft, const int numStenRight, const int numCoe, @@ -83,8 +88,8 @@ __global__ void kernel2DXpFun // Allocate the shared memory extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* coeLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* coeLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -94,7 +99,7 @@ __global__ void kernel2DXpFun } // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -156,7 +161,7 @@ __global__ void kernel2DXpFun stenSet = localIdy * nxLocal + localIdx; - dataOutput[globalIdy * nx + globalIdx] = ((devArg1X)func)(arrayLocal, coeLocal, stenSet); + dataOutput[globalIdy * nx + globalIdx] = ((typename templateFunc::devArg1X)func)(arrayLocal, coeLocal, stenSet); } // --------------------------------------------------------------------- @@ -169,10 +174,10 @@ __global__ void kernel2DXpFun \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ -void cuSenCompute2DXpFun +template +void cuStenCompute2DXpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -191,8 +196,8 @@ void cuSenCompute2DXpFun cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -227,8 +232,8 @@ void cuSenCompute2DXpFun // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next tile @@ -238,8 +243,8 @@ void cuSenCompute2DXpFun cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the necessary tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -258,6 +263,56 @@ void cuSenCompute2DXpFun } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DXpFun +( + double*, + double*, + const double*, + const double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXpFun +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DXpFun +( + float*, + float*, + const float*, + const float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXpFun +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_x_p_kernel.cu b/cuSten/src/kernels/2d_x_p_kernel.cu index d8fd340..68ac762 100644 --- a/cuSten/src/kernels/2d_x_p_kernel.cu +++ b/cuSten/src/kernels/2d_x_p_kernel.cu @@ -52,11 +52,12 @@ \param nx Total number of points in the x direction */ +template __global__ void kernel2DXp ( - double* dataOutput, - double* dataInput, - const double* weights, + elemType* dataOutput, + elemType* dataInput, + const elemType* weights, const int numSten, const int numStenLeft, const int numStenRight, @@ -72,8 +73,8 @@ __global__ void kernel2DXp extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* weigthsLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* weigthsLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -87,7 +88,7 @@ __global__ void kernel2DXp // ----------------------------- // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -95,7 +96,7 @@ __global__ void kernel2DXp int localIdy = threadIdx.y; // Local sum variable - double sum = 0.0; + elemType sum = 0.0; // Set index for summing stencil int stenSet; @@ -190,10 +191,10 @@ __global__ void kernel2DXp \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DXp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -219,8 +220,8 @@ void cuStenCompute2DXp cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -241,8 +242,8 @@ void cuStenCompute2DXp // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next tile @@ -252,8 +253,8 @@ void cuStenCompute2DXp cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the necessary tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -272,6 +273,54 @@ void cuStenCompute2DXp } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DXp +( + double*, + double*, + const double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXp +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DXp +( + float*, + float*, + const float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXp +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_xyADVWENO_p_kernel.cu b/cuSten/src/kernels/2d_xyADVWENO_p_kernel.cu index 78b61b1..eafc4a6 100644 --- a/cuSten/src/kernels/2d_xyADVWENO_p_kernel.cu +++ b/cuSten/src/kernels/2d_xyADVWENO_p_kernel.cu @@ -48,21 +48,23 @@ */ // Notation from Level Set Methods - Fedkiw -__device__ double wenoSten + +template +__device__ elemType wenoSten ( - double v1, - double v2, - double v3, - double v4, - double v5 + elemType v1, + elemType v2, + elemType v3, + elemType v4, + elemType v5 ) { - double epsilon = 1e-06; - double phi1, phi2, phi3; - double s1, s2, s3; - double alpha1, alpha2, alpha3; - double denom; - double w1, w2, w3; + elemType epsilon = 1e-06; + elemType phi1, phi2, phi3; + elemType s1, s2, s3; + elemType alpha1, alpha2, alpha3; + elemType denom; + elemType w1, w2, w3; phi1 = (1.0 / 3.0) * v1 - (7.0 / 6.0) * v2 + (11.0 / 6.0) * v3; phi2 = - (1.0 / 6.0) * v2 + (5.0 / 6.0) * v3 + (1.0 / 3.0) * v4; @@ -114,20 +116,21 @@ __device__ double wenoSten \param nyTile Number of y direction points on tile */ +template __global__ void kernel2DXYWENOADVp ( - double* dataOutput, + elemType* dataOutput, - double* dataInput, + elemType* dataInput, - double* uVel, - double* vVel, + elemType* uVel, + elemType* vVel, - double* boundaryTop, - double* boundaryBottom, + elemType* boundaryTop, + elemType* boundaryBottom, - const double coeDx, - const double coeDy, + const elemType coeDx, + const elemType coeDy, const int numSten, @@ -155,14 +158,14 @@ __global__ void kernel2DXYWENOADVp extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; + elemType* arrayLocal = (elemType*)&memory; // ----------------------------- // Set the indexing // ----------------------------- // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -275,10 +278,10 @@ __global__ void kernel2DXYWENOADVp // ----------------------------- // Inputs - double v1, v2, v3, v4, v5; + elemType v1, v2, v3, v4, v5; // Output - double Fx, Fy; + elemType Fx, Fy; // X direction fluxes __syncthreads(); @@ -406,10 +409,10 @@ __global__ void kernel2DXYWENOADVp \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DXYWENOADVp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -428,14 +431,14 @@ void cuStenCompute2DXYWENOADVp cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->uVel[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->vVel[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->uVel[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->vVel[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the boundary data - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -496,14 +499,14 @@ void cuStenCompute2DXYWENOADVp cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the necessary tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->uVel[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->vVel[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->uVel[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->vVel[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the next boundaries - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -512,12 +515,12 @@ void cuStenCompute2DXYWENOADVp // Offload the previous set of tiles, this is to ensure we don't get page faults if (offload == 1 && tile > 0) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); - cudaMemPrefetchAsync(pt_cuSten->uVel[tile - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); - cudaMemPrefetchAsync(pt_cuSten->vVel[tile - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile - 1], pt_cuSten->numBoundaryTop * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile - 1], pt_cuSten->numBoundaryBottom * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->uVel[tile - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->vVel[tile - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile - 1], pt_cuSten->numBoundaryTop * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile - 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); } // Permute streams @@ -535,15 +538,125 @@ void cuStenCompute2DXYWENOADVp // Offload the final set if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[pt_cuSten->numTiles - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[pt_cuSten->numTiles - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); - cudaMemPrefetchAsync(pt_cuSten->uVel[pt_cuSten->numTiles - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); - cudaMemPrefetchAsync(pt_cuSten->vVel[pt_cuSten->numTiles - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[pt_cuSten->numTiles - 1], pt_cuSten->numBoundaryTop * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[pt_cuSten->numTiles - 1], pt_cuSten->numBoundaryBottom * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[pt_cuSten->numTiles - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[pt_cuSten->numTiles - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->uVel[pt_cuSten->numTiles - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->vVel[pt_cuSten->numTiles - 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[pt_cuSten->numTiles - 1], pt_cuSten->numBoundaryTop * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[pt_cuSten->numTiles - 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[pt_cuSten->numStreams - 1]); } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__device__ double wenoSten +( + double, + double, + double, + double, + double +); + +template +__global__ void kernel2DXYWENOADVp +( + double*, + + double*, + + double*, + double*, + + double*, + double*, + + const double, + const double, + + const int, + + const int, + const int, + const int, + + const int, + const int, + const int, + + const int, + const int, + + const int, + const int, + + const int, + const int +); + +template +void cuStenCompute2DXYWENOADVp +( + cuSten_t*, + bool +); + +template +__device__ float wenoSten +( + float, + float, + float, + float, + float +); + +template +__global__ void kernel2DXYWENOADVp +( + float*, + + float*, + + float*, + float*, + + float*, + float*, + + const float, + const float, + + const int, + + const int, + const int, + const int, + + const int, + const int, + const int, + + const int, + const int, + + const int, + const int, + + const int, + const int +); + +template +void cuStenCompute2DXYWENOADVp +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_xy_np_fun_kernel.cu b/cuSten/src/kernels/2d_xy_np_fun_kernel.cu index 16e6440..0922d31 100644 --- a/cuSten/src/kernels/2d_xy_np_fun_kernel.cu +++ b/cuSten/src/kernels/2d_xy_np_fun_kernel.cu @@ -37,7 +37,7 @@ // Function pointer definition // --------------------------------------------------------------------- -/*! typedef double (*devArg1XY)(double*, double*, int); +/*! typedef elemType (*devArg1XY)(elemType*, elemType*, int); \brief The function pointer containing the user defined function to be applied
Input 1: The pointer to input data to the function
Input 2: The pointer to the coefficients provided by the user
@@ -47,7 +47,11 @@ Input 6: Size of stencil in y direction */ -typedef double (*devArg1XY)(double*, double*, int, int, int, int); +template +struct templateFunc +{ + typedef elemType (*devArg1XY)(elemType*, elemType*, int, int, int, int); +}; // --------------------------------------------------------------------- // Kernel Definition @@ -78,14 +82,15 @@ typedef double (*devArg1XY)(double*, double*, int, int, int, int); \param tileBottom Check if the current tile is at the bottom of the domain */ +template __global__ void kernel2DXYnpFun ( - double* dataOutput, - double* dataInput, - double* boundaryTop, - double* boundaryBottom, - const double* coe, - double* func, + elemType* dataOutput, + elemType* dataInput, + elemType* boundaryTop, + elemType* boundaryBottom, + const elemType* coe, + elemType* func, const int numSten, const int numStenHoriz, const int numStenLeft, @@ -109,8 +114,8 @@ __global__ void kernel2DXYnpFun extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* coeLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* coeLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -124,7 +129,7 @@ __global__ void kernel2DXYnpFun // ----------------------------- // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -132,7 +137,7 @@ __global__ void kernel2DXYnpFun int localIdy = threadIdx.y + numStenTop; // Local sum variable - double sum = 0.0; + elemType sum = 0.0; // Set index for summing stencil int stenSet; @@ -198,7 +203,7 @@ __global__ void kernel2DXYnpFun stenSet = threadIdx.y * nxLocal + threadIdx.x; - sum = ((devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); + sum = ((typename templateFunc::devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); // Ensure the compute is complete __syncthreads(); @@ -280,7 +285,7 @@ __global__ void kernel2DXYnpFun stenSet = threadIdx.y * nxLocal + threadIdx.x; - sum = ((devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); + sum = ((typename templateFunc::devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); // Ensure the compute is complete __syncthreads(); @@ -361,7 +366,7 @@ __global__ void kernel2DXYnpFun stenSet = threadIdx.y * nxLocal + threadIdx.x; - sum = ((devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); + sum = ((typename templateFunc::devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); // Ensure the compute is complete __syncthreads(); @@ -441,7 +446,7 @@ __global__ void kernel2DXYnpFun stenSet = threadIdx.y * nxLocal + threadIdx.x; - sum = ((devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); + sum = ((typename templateFunc::devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); // Ensure the compute is complete __syncthreads(); @@ -539,7 +544,7 @@ __global__ void kernel2DXYnpFun stenSet = threadIdx.y * nxLocal + threadIdx.x; - sum = ((devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); + sum = ((typename templateFunc::devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); // Ensure the compute is complete __syncthreads(); @@ -634,7 +639,7 @@ __global__ void kernel2DXYnpFun stenSet = threadIdx.y * nxLocal + threadIdx.x; - sum = ((devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); + sum = ((typename templateFunc::devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); // Ensure the compute is complete __syncthreads(); @@ -708,7 +713,7 @@ __global__ void kernel2DXYnpFun stenSet = threadIdx.y * nxLocal + threadIdx.x; - sum = ((devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); + sum = ((typename templateFunc::devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); // Ensure the compute is complete __syncthreads(); @@ -775,7 +780,7 @@ __global__ void kernel2DXYnpFun stenSet = threadIdx.y * nxLocal + threadIdx.x; - sum = ((devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); + sum = ((typename templateFunc::devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); // Ensure the compute is complete __syncthreads(); @@ -860,7 +865,7 @@ __global__ void kernel2DXYnpFun stenSet = threadIdx.y * nxLocal + threadIdx.x; - sum = ((devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); + sum = ((typename templateFunc::devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); // Ensure the compute is complete __syncthreads(); @@ -883,10 +888,10 @@ __global__ void kernel2DXYnpFun \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DXYnpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -902,18 +907,18 @@ void cuStenCompute2DXYnpFun dim3 gridDim(pt_cuSten->xGrid, pt_cuSten->yGrid); // Load the coefficients - cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numSten * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numSten * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Ensure the current stream is free cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the boundary data - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -998,8 +1003,8 @@ void cuStenCompute2DXYnpFun // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next set of data @@ -1009,12 +1014,12 @@ void cuStenCompute2DXYnpFun cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the next boundaries - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -1033,6 +1038,76 @@ void cuStenCompute2DXYnpFun } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DXYnpFun +( + double*, + double*, + double*, + double*, + const double*, + double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXYnpFun +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DXYnpFun +( + float*, + float*, + float*, + float*, + const float*, + float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXYnpFun +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_xy_np_kernel.cu b/cuSten/src/kernels/2d_xy_np_kernel.cu index b0455a8..41ed5cf 100644 --- a/cuSten/src/kernels/2d_xy_np_kernel.cu +++ b/cuSten/src/kernels/2d_xy_np_kernel.cu @@ -61,13 +61,14 @@ \param tileBottom Check if the current tile is at the bottom of the domain */ +template __global__ void kernel2DXYnp ( - double* dataOutput, - double* dataInput, - double* boundaryTop, - double* boundaryBottom, - const double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* boundaryTop, + elemType* boundaryBottom, + const elemType* weights, const int numSten, const int numStenHoriz, const int numStenLeft, @@ -91,8 +92,8 @@ __global__ void kernel2DXYnp extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* weigthsLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* weigthsLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -106,7 +107,7 @@ __global__ void kernel2DXYnp // ----------------------------- // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -114,7 +115,7 @@ __global__ void kernel2DXYnp int localIdy = threadIdx.y + numStenTop; // Local sum variable - double sum = 0.0; + elemType sum = 0.0; // Set index for summing stencil int stenSet; @@ -969,10 +970,10 @@ __global__ void kernel2DXYnp \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DXYnp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -988,18 +989,18 @@ void cuStenCompute2DXYnp dim3 gridDim(pt_cuSten->xGrid, pt_cuSten->yGrid); // Load the weights - cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Ensure the current stream is free cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the boundary data - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -1082,8 +1083,8 @@ void cuStenCompute2DXYnp // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next set of data @@ -1093,12 +1094,12 @@ void cuStenCompute2DXYnp cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the next boundaries - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -1117,6 +1118,74 @@ void cuStenCompute2DXYnp } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DXYnp +( + double*, + double*, + double*, + double*, + const double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXYnp +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DXYnp +( + float*, + float*, + float*, + float*, + const float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXYnp +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_xy_p_fun_kernel.cu b/cuSten/src/kernels/2d_xy_p_fun_kernel.cu index 541d0df..85eb8a4 100644 --- a/cuSten/src/kernels/2d_xy_p_fun_kernel.cu +++ b/cuSten/src/kernels/2d_xy_p_fun_kernel.cu @@ -38,7 +38,7 @@ // Function pointer definition // --------------------------------------------------------------------- -/*! typedef double (*devArg1XY)(double*, double*, int, int, int, int); +/*! typedef elemType (*devArg1XY)(elemType*, elemType*, int, int, int, int); \brief The function pointer containing the user defined function to be applied
Input 1: The pointer to input data to the function
Input 2: The pointer to the coefficients provided by the user
@@ -48,7 +48,11 @@ Input 6: Size of stencil in y direction */ -typedef double (*devArg1XY)(double*, double*, int, int, int, int); +template +struct templateFunc +{ + typedef elemType (*devArg1XY)(elemType*, elemType*, int, int, int, int); +}; // --------------------------------------------------------------------- // Kernel Definition @@ -79,14 +83,15 @@ typedef double (*devArg1XY)(double*, double*, int, int, int, int); \param tileBottom Check if the current tile is at the bottom of the domain */ +template __global__ void kernel2DXYpFun ( - double* dataOutput, - double* dataInput, - double* boundaryTop, - double* boundaryBottom, - const double* coe, - double* func, + elemType* dataOutput, + elemType* dataInput, + elemType* boundaryTop, + elemType* boundaryBottom, + const elemType* coe, + elemType* func, const int numSten, const int numStenHoriz, const int numStenLeft, @@ -108,8 +113,8 @@ __global__ void kernel2DXYpFun extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* coeLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* coeLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -123,7 +128,7 @@ __global__ void kernel2DXYpFun // ----------------------------- // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -131,7 +136,7 @@ __global__ void kernel2DXYpFun int localIdy = threadIdx.y + numStenTop; // Local sum variable - double sum = 0.0; + elemType sum = 0.0; // Set index for summing stencil int stenSet; @@ -523,7 +528,7 @@ __global__ void kernel2DXYpFun __syncthreads(); - sum = ((devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); + sum = ((typename templateFunc::devArg1XY)func)(arrayLocal, coeLocal, stenSet, nxLocal, numStenHoriz, numStenVert); __syncthreads(); @@ -546,10 +551,10 @@ __global__ void kernel2DXYpFun \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DXYpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -565,18 +570,18 @@ void cuStenCompute2DXYpFun dim3 gridDim(pt_cuSten->xGrid, pt_cuSten->yGrid); // Load the weights - cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numSten * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numSten * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Ensure the current stream is free cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the boundary data - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); sprintf(msgStringBuffer, "Can't prefectch %d", pt_cuSten->deviceNum); checkError(msgStringBuffer); @@ -633,8 +638,8 @@ void cuStenCompute2DXYpFun // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next set of data @@ -644,12 +649,12 @@ void cuStenCompute2DXYpFun cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the next boundaries - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -668,6 +673,72 @@ void cuStenCompute2DXYpFun } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DXYpFun +( + double*, + double*, + double*, + double*, + const double*, + double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXYpFun +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DXYpFun +( + float*, + float*, + float*, + float*, + const float*, + float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXYpFun +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_xy_p_kernel.cu b/cuSten/src/kernels/2d_xy_p_kernel.cu index 7c9c171..da6b2e1 100644 --- a/cuSten/src/kernels/2d_xy_p_kernel.cu +++ b/cuSten/src/kernels/2d_xy_p_kernel.cu @@ -60,13 +60,14 @@ \param nyTile Number of y direction points on tile */ +template __global__ void kernel2DXYp ( - double* dataOutput, - double* dataInput, - double* boundaryTop, - double* boundaryBottom, - const double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* boundaryTop, + elemType* boundaryBottom, + const elemType* weights, const int numSten, const int numStenHoriz, const int numStenLeft, @@ -88,8 +89,8 @@ __global__ void kernel2DXYp extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* weigthsLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* weigthsLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -103,7 +104,7 @@ __global__ void kernel2DXYp // ----------------------------- // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -111,7 +112,7 @@ __global__ void kernel2DXYp int localIdy = threadIdx.y + numStenTop; // Local sum variable - double sum = 0.0; + elemType sum = 0.0; // Set index for summing stencil int stenSet; @@ -539,10 +540,10 @@ __global__ void kernel2DXYp \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DXYp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -558,18 +559,18 @@ void cuStenCompute2DXYp dim3 gridDim(pt_cuSten->xGrid, pt_cuSten->yGrid); // Load the weights - cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Ensure the current stream is free cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the boundary data - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -621,8 +622,8 @@ void cuStenCompute2DXYp // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next set of data @@ -632,12 +633,12 @@ void cuStenCompute2DXYp cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the next boundaries - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -656,6 +657,70 @@ void cuStenCompute2DXYp } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DXYp +( + double*, + double*, + double*, + double*, + const double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXYp +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DXYp +( + float*, + float*, + float*, + float*, + const float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DXYp +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_y_np_fun_kernel.cu b/cuSten/src/kernels/2d_y_np_fun_kernel.cu index c57ef75..9239e37 100644 --- a/cuSten/src/kernels/2d_y_np_fun_kernel.cu +++ b/cuSten/src/kernels/2d_y_np_fun_kernel.cu @@ -37,7 +37,7 @@ // Function pointer definition // --------------------------------------------------------------------- -/*! typedef double (*devArg1Y)(double*, double*, int, int); +/*! typedef elemType (*devArg1Y)(elemType*, elemType*, int, int); \brief The function pointer containing the user defined function to be applied
Input 1: The pointer to input data to the function
Input 2: The pointer to the coefficients provided by the user
@@ -45,7 +45,11 @@ Input 4: Value to be used to jump between rows. (j + 1, j - 1 etc.) */ -typedef double (*devArg1Y)(double*, double*, int, int); +template +struct templateFunc +{ + typedef elemType (*devArg1Y)(elemType*, elemType*, int, int); +}; // --------------------------------------------------------------------- // Kernel Definition @@ -71,14 +75,15 @@ typedef double (*devArg1Y)(double*, double*, int, int); \param tileBottom Check if the current tile is at the bottom of the domain */ -static __global__ void kernel2DYnpFun +template +__global__ void kernel2DYnpFun ( - double* dataOutput, - double* dataInput, - double* boundaryTop, - double* boundaryBottom, - const double* coe, - const double* func, + elemType* dataOutput, + elemType* dataInput, + elemType* boundaryTop, + elemType* boundaryBottom, + const elemType* coe, + const elemType* func, const int numSten, const int numStenTop, const int numStenBottom, @@ -94,8 +99,8 @@ static __global__ void kernel2DYnpFun // Allocate the shared memory extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* coeLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* coeLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -105,7 +110,7 @@ static __global__ void kernel2DYnpFun } // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -113,7 +118,7 @@ static __global__ void kernel2DYnpFun int localIdy = threadIdx.y + numStenTop; // Local sum variable - double sum = 0.0; + elemType sum = 0.0; // Set index for summing stencil int stenSet; @@ -137,7 +142,7 @@ static __global__ void kernel2DYnpFun stenSet = localIdy * nxLocal + localIdx; - sum = ((devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); + sum = ((typename templateFunc::devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); __syncthreads(); @@ -164,7 +169,7 @@ static __global__ void kernel2DYnpFun stenSet = localIdy * nxLocal + localIdx; - sum = ((devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); + sum = ((typename templateFunc::devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); __syncthreads(); @@ -183,7 +188,7 @@ static __global__ void kernel2DYnpFun stenSet = localIdy * nxLocal + localIdx; - sum = ((devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); + sum = ((typename templateFunc::devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); __syncthreads(); @@ -216,7 +221,7 @@ static __global__ void kernel2DYnpFun stenSet = localIdy * nxLocal + localIdx; - sum = ((devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); + sum = ((typename templateFunc::devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); __syncthreads(); @@ -235,7 +240,7 @@ static __global__ void kernel2DYnpFun stenSet = localIdy * nxLocal + localIdx; - sum = ((devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); + sum = ((typename templateFunc::devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); __syncthreads(); @@ -257,10 +262,10 @@ static __global__ void kernel2DYnpFun \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DYnpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -276,18 +281,18 @@ void cuStenCompute2DYnpFun dim3 gridDim(pt_cuSten->xGrid, pt_cuSten->yGrid); // Load the weights - cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numSten * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numSten * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Ensure the current stream is free cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the boundary data - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -357,8 +362,8 @@ void cuStenCompute2DYnpFun // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next set of data @@ -368,12 +373,12 @@ void cuStenCompute2DYnpFun cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the next boundaries - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record teh event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -392,6 +397,66 @@ void cuStenCompute2DYnpFun } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DYnpFun +( + double*, + double*, + double*, + double*, + const double*, + const double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DYnpFun +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DYnpFun +( + float*, + float*, + float*, + float*, + const float*, + const float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DYnpFun +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_y_np_kernel.cu b/cuSten/src/kernels/2d_y_np_kernel.cu index 832ca27..f3a1692 100644 --- a/cuSten/src/kernels/2d_y_np_kernel.cu +++ b/cuSten/src/kernels/2d_y_np_kernel.cu @@ -56,13 +56,14 @@ \param tileBottom Check if the current tile is at the bottom of the domain */ +template __global__ void kernel2DYnp ( - double* dataNew, - double* dataOld, - double* boundaryTop, - double* boundaryBottom, - const double* weights, + elemType* dataNew, + elemType* dataOld, + elemType* boundaryTop, + elemType* boundaryBottom, + const elemType* weights, const int numSten, const int numStenTop, const int numStenBottom, @@ -78,8 +79,8 @@ __global__ void kernel2DYnp // Allocate the shared memory extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* weigthsLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* weigthsLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -89,7 +90,7 @@ __global__ void kernel2DYnp } // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -97,7 +98,7 @@ __global__ void kernel2DYnp int localIdy = threadIdx.y + numStenTop; // Local sum variable - double sum = 0.0; + elemType sum = 0.0; // Set index for summing stencil int stenSet; @@ -261,10 +262,10 @@ __global__ void kernel2DYnp \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DYnp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -280,18 +281,18 @@ void cuStenCompute2DYnp dim3 gridDim(pt_cuSten->xGrid, pt_cuSten->yGrid); // Load the weights - cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Ensure the current stream is free cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the boundary data - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -343,8 +344,8 @@ void cuStenCompute2DYnp // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next set of data @@ -354,12 +355,12 @@ void cuStenCompute2DYnp cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the next boundaries - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -378,6 +379,64 @@ void cuStenCompute2DYnp } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DYnp +( + double*, + double*, + double*, + double*, + const double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DYnp +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DYnp +( + float*, + float*, + float*, + float*, + const float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DYnp +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_y_p_fun_kernel.cu b/cuSten/src/kernels/2d_y_p_fun_kernel.cu index fd85946..520a88e 100644 --- a/cuSten/src/kernels/2d_y_p_fun_kernel.cu +++ b/cuSten/src/kernels/2d_y_p_fun_kernel.cu @@ -38,7 +38,7 @@ // Function pointer definition // --------------------------------------------------------------------- -/*! typedef double (*devArg1Y)(double*, double*, int, int); +/*! typedef elemType (*devArg1Y)(elemType*, elemType*, int, int); \brief The function pointer containing the user defined function to be applied
Input 1: The pointer to input data to the function
Input 2: The pointer to the coefficients provided by the user
@@ -46,7 +46,11 @@ Input 4: Value to be used to jump between rows. (j + 1, j - 1 etc.) */ -typedef double (*devArg1Y)(double*, double*, int, int); +template +struct templateFunc +{ + typedef elemType (*devArg1Y)(elemType*, elemType*, int, int); +}; // --------------------------------------------------------------------- // Kernel Definition @@ -70,15 +74,16 @@ typedef double (*devArg1Y)(double*, double*, int, int); \param nyTile Number of y direction points on tile */ +template __global__ void kernel2DYpFun ( - double* dataNew, - double* dataOld, - double* boundaryTop, - double* boundaryBottom, - double* func, - const double* coe, + elemType* dataNew, + elemType* dataOld, + elemType* boundaryTop, + elemType* boundaryBottom, + elemType* func, + const elemType* coe, const int numCoe, const int numStenTop, const int numStenBottom, @@ -92,8 +97,8 @@ __global__ void kernel2DYpFun // Allocate the shared memory extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* coeLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* coeLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -103,7 +108,7 @@ __global__ void kernel2DYpFun } // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -111,7 +116,7 @@ __global__ void kernel2DYpFun int localIdy = threadIdx.y + numStenTop; // Local sum variable - double sum = 0.0; + elemType sum = 0.0; // Set index for summing stencil int stenSet; @@ -135,7 +140,7 @@ __global__ void kernel2DYpFun stenSet = localIdy * nxLocal + localIdx; - sum = ((devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); + sum = ((typename templateFunc::devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); __syncthreads(); @@ -161,7 +166,7 @@ __global__ void kernel2DYpFun stenSet = localIdy * nxLocal + localIdx; - sum = ((devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); + sum = ((typename templateFunc::devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); __syncthreads(); @@ -187,7 +192,7 @@ __global__ void kernel2DYpFun stenSet = localIdy * nxLocal + localIdx; - sum = ((devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); + sum = ((typename templateFunc::devArg1Y)func)(arrayLocal, coeLocal, stenSet, nxLocal); __syncthreads(); @@ -205,10 +210,10 @@ __global__ void kernel2DYpFun \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DYpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -224,18 +229,18 @@ void cuStenCompute2DYpFun dim3 gridDim(pt_cuSten->xGrid, pt_cuSten->yGrid); // Load the weights - cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numCoe * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->coe, pt_cuSten->numCoe * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Ensure the current stream is free cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the boundary data - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -259,8 +264,8 @@ void cuStenCompute2DYpFun // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next set of data @@ -270,12 +275,12 @@ void cuStenCompute2DYpFun cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the next boundaries - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -294,6 +299,64 @@ void cuStenCompute2DYpFun } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DYpFun +( + + double*, + double*, + double*, + double*, + double*, + const double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DYpFun +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DYpFun +( + + float*, + float*, + float*, + float*, + float*, + const float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DYpFun +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/2d_y_p_kernel.cu b/cuSten/src/kernels/2d_y_p_kernel.cu index 5c50bba..f17d808 100644 --- a/cuSten/src/kernels/2d_y_p_kernel.cu +++ b/cuSten/src/kernels/2d_y_p_kernel.cu @@ -56,13 +56,14 @@ \param tileBottom Check if the current tile is at the bottom of the domain */ +template __global__ void kernel2DYp ( - double* dataOutput, - double* dataInput, - double* boundaryTop, - double* boundaryBottom, - const double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* boundaryTop, + elemType* boundaryBottom, + const elemType* weights, const int numSten, const int numStenTop, const int numStenBottom, @@ -76,8 +77,8 @@ __global__ void kernel2DYp // Allocate the shared memory extern __shared__ int memory[]; - double* arrayLocal = (double*)&memory; - double* weigthsLocal = (double*)&arrayLocal[nxLocal * nyLocal]; + elemType* arrayLocal = (elemType*)&memory; + elemType* weigthsLocal = (elemType*)&arrayLocal[nxLocal * nyLocal]; // Move the weigths into shared memory #pragma unroll @@ -87,7 +88,7 @@ __global__ void kernel2DYp } // True matrix index - int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; + int globalIdx = blockDim.x * blockIdx.x + threadIdx.x; int globalIdy = blockDim.y * blockIdx.y + threadIdx.y; // Local matrix index @@ -95,7 +96,7 @@ __global__ void kernel2DYp int localIdy = threadIdx.y + numStenTop; // Local sum variable - double sum = 0.0; + elemType sum = 0.0; // Set index for summing stencil int stenSet; @@ -202,10 +203,10 @@ __global__ void kernel2DYp \param offload Set to HOST to move data back to CPU or DEVICE to keep on the GPU */ +template void cuStenCompute2DYp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ) { @@ -221,18 +222,18 @@ void cuStenCompute2DYp dim3 gridDim(pt_cuSten->xGrid, pt_cuSten->yGrid); // Load the weights - cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->weights, pt_cuSten->numSten * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Ensure the current stream is free cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tile data - cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[0], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the boundary data - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[0], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[0], pt_cuSten->streams[1]); @@ -255,8 +256,8 @@ void cuStenCompute2DYp // Offload should the user want to if (offload == 1) { - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), cudaCpuDeviceId, pt_cuSten->streams[0]); } // Load the next set of data @@ -266,12 +267,12 @@ void cuStenCompute2DYp cudaStreamSynchronize(pt_cuSten->streams[1]); // Prefetch the tiles - cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataOutput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->dataInput[tile + 1], pt_cuSten->nx * pt_cuSten->nyTile * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Prefetch the next boundaries - cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); - cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(double), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryTop[tile + 1], pt_cuSten->numBoundaryTop * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); + cudaMemPrefetchAsync(pt_cuSten->boundaryBottom[tile + 1], pt_cuSten->numBoundaryBottom * sizeof(elemType), pt_cuSten->deviceNum, pt_cuSten->streams[1]); // Record the event cudaEventRecord(pt_cuSten->events[1], pt_cuSten->streams[1]); @@ -290,6 +291,60 @@ void cuStenCompute2DYp } } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +__global__ void kernel2DYp +( + double*, + double*, + double*, + double*, + const double*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DYp +( + cuSten_t*, + bool +); + +template +__global__ void kernel2DYp +( + float*, + float*, + float*, + float*, + const float*, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int +); + +template +void cuStenCompute2DYp +( + cuSten_t*, + bool +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/kernels/stencil_kernels.h b/cuSten/src/kernels/stencil_kernels.h index d9b5d00..014f77e 100644 --- a/cuSten/src/kernels/stencil_kernels.h +++ b/cuSten/src/kernels/stencil_kernels.h @@ -49,10 +49,10 @@ // 2D x direction non periodic // ---------------------------------------- +template void cuStenCompute2DXnp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -60,10 +60,10 @@ void cuStenCompute2DXnp // 2D x direction periodic // ---------------------------------------- +template void cuStenCompute2DXp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -71,10 +71,10 @@ void cuStenCompute2DXp // 2D x direction non periodic - user function // ---------------------------------------- +template void cuStenCompute2DXnpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -82,10 +82,10 @@ void cuStenCompute2DXnpFun // 2D x direction periodic - user function // ---------------------------------------- +template void cuStenCompute2DXpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -93,10 +93,10 @@ void cuStenCompute2DXpFun // 2D y direction periodic // ---------------------------------------- +template void cuStenCompute2DYp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -104,10 +104,10 @@ void cuStenCompute2DYp // 2D y direction periodic - user function // ---------------------------------------- +template void cuStenCompute2DYpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -115,10 +115,10 @@ void cuStenCompute2DYpFun // 2D y direction non periodic // ---------------------------------------- +template void cuStenCompute2DYnp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -126,10 +126,10 @@ void cuStenCompute2DYnp // 2D y direction non periodic - user function // ---------------------------------------- +template void cuStenCompute2DYnpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -137,10 +137,10 @@ void cuStenCompute2DYnpFun // 2D xy direction periodic // ---------------------------------------- +template void cuStenCompute2DXYp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -148,10 +148,10 @@ void cuStenCompute2DXYp // 2D xy direction periodic - user function // ---------------------------------------- +template void cuStenCompute2DXYpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -159,10 +159,10 @@ void cuStenCompute2DXYpFun // 2D xy direction non periodic // ---------------------------------------- +template void cuStenCompute2DXYnp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -170,10 +170,10 @@ void cuStenCompute2DXYnp // 2D xy direction non periodic - user function // ---------------------------------------- +template void cuStenCompute2DXYnpFun ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); @@ -181,10 +181,10 @@ void cuStenCompute2DXYnpFun // 2D xy WENO periodic // ---------------------------------------- +template void cuStenCompute2DXYWENOADVp ( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, bool offload ); diff --git a/cuSten/src/struct/cuSten_struct_functions.h b/cuSten/src/struct/cuSten_struct_functions.h index 51b0fdf..31787f9 100644 --- a/cuSten/src/struct/cuSten_struct_functions.h +++ b/cuSten/src/struct/cuSten_struct_functions.h @@ -64,17 +64,18 @@ \param numStenRight Number of points on the right side of the stencil */ +template void cuStenCreate2DXnp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* weights, int numSten, int numStenLeft, int numStenRight @@ -86,10 +87,10 @@ void cuStenCreate2DXnp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXnp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DXnp @@ -97,8 +98,9 @@ void cuStenSwap2DXnp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXnp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // ---------------------------------------- @@ -122,22 +124,23 @@ void cuStenDestroy2DXnp( \param Pointer to user function */ +template void cuStenCreate2DXnpFun( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dateOutput, - double* dateInput, - double* coe, + elemType* dateOutput, + elemType* dateInput, + elemType* coe, int numSten, int numStenLeft, int numStenRight, int numCoe, - double* func + elemType* func ); /*! \fun void cuStenSwap2DXnpFun @@ -146,10 +149,10 @@ void cuStenCreate2DXnpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXnpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DXnpFun @@ -157,8 +160,9 @@ void cuStenSwap2DXnpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXnpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // ---------------------------------------- @@ -181,17 +185,18 @@ void cuStenDestroy2DXnpFun( \param numStenRight Number of points on the right side of the stencil */ +template void cuStenCreate2DXp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataNew, - double* dataOld, - double* weights, + elemType* dataNew, + elemType* dataOld, + elemType* weights, int numSten, int numStenLeft, int numStenRight @@ -203,10 +208,10 @@ void cuStenCreate2DXp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DXnp @@ -214,8 +219,9 @@ void cuStenSwap2DXp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // ---------------------------------------- @@ -239,30 +245,23 @@ void cuStenDestroy2DXp( \param Pointer to user function */ +template void cuStenCreate2DXpFun( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, int deviceNum, - int numTiles, - int nx, int ny, - int BLOCK_X, int BLOCK_Y, - - double* dataNew, - double* dataOld, - double* coe, - + elemType* dataNew, + elemType* dataOld, + elemType* coe, int numSten, int numStenLeft, int numStenRight, - int numCoe, - - double* func + elemType* func ); /*! \fun void cuStenSwap2DXpFun @@ -271,10 +270,10 @@ void cuStenCreate2DXpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DXpFun @@ -282,8 +281,9 @@ void cuStenSwap2DXpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // ---------------------------------------- @@ -306,21 +306,21 @@ void cuStenDestroy2DXpFun( \param dataInput Pointer to data input to the function */ -void cuStenCreate2DXYWENOADVp -( - cuSten_t* pt_cuSten, +template +void cuStenCreate2DXYWENOADVp( + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double dx, - double dy, - double* u, - double* v, - double* dataOutput, - double* dataInput + elemType dx, + elemType dy, + elemType* u, + elemType* v, + elemType* dataOutput, + elemType* dataInput ); /*! \fun void cuStenSwap2DXYWENOADVp @@ -329,10 +329,10 @@ void cuStenCreate2DXYWENOADVp \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXYWENOADVp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DXYWENOADVp @@ -340,9 +340,10 @@ void cuStenSwap2DXYWENOADVp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXYWENOADVp ( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // ---------------------------------------- @@ -368,17 +369,18 @@ void cuStenDestroy2DXYWENOADVp \param numStenBottom Number of points on the bottom of the stencil */ +template void cuStenCreate2DXYnp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* weights, int numStenHoriz, int numStenLeft, int numStenRight, @@ -393,10 +395,10 @@ void cuStenCreate2DXYnp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXYnp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DXYnp @@ -404,8 +406,9 @@ void cuStenSwap2DXYnp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXYnp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // ---------------------------------------- @@ -432,24 +435,25 @@ void cuStenDestroy2DXYnp( \param Pointer to user function */ +template void cuStenCreate2DXYnpFun( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataNew, - double* dataOld, - double* coe, + elemType* dataNew, + elemType* dataOld, + elemType* coe, int numStenHoriz, int numStenLeft, int numStenRight, int numStenVert, int numStenTop, int numStenBottom, - double* func + elemType* func ); /*! \fun void cuStenSwap2DXYnpFun @@ -458,10 +462,10 @@ void cuStenCreate2DXYnpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXYnpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DXYnpFun @@ -469,8 +473,9 @@ void cuStenSwap2DXYnpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXYnpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); @@ -497,17 +502,18 @@ void cuStenDestroy2DXYnpFun( \param numStenBottom Number of points on the bottom of the stencil */ +template void cuStenCreate2DXYp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataNew, - double* dataOld, - double* weights, + elemType* dataNew, + elemType* dataOld, + elemType* weights, int numStenHoriz, int numStenLeft, int numStenRight, @@ -522,10 +528,10 @@ void cuStenCreate2DXYp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXYp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DXYp @@ -533,8 +539,9 @@ void cuStenSwap2DXYp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXYp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // ---------------------------------------- @@ -561,24 +568,25 @@ void cuStenDestroy2DXYp( \param Pointer to user function */ +template void cuStenCreate2DXYpFun( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* coe, + elemType* dataOutput, + elemType* dataInput, + elemType* coe, int numStenHoriz, int numStenLeft, int numStenRight, int numStenVert, int numStenTop, int numStenBottom, - double* func + elemType* func ); /*! \fun void cuStenSwap2DYpFun @@ -587,10 +595,10 @@ void cuStenCreate2DXYpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXYpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DXYpFun @@ -598,8 +606,9 @@ void cuStenSwap2DXYpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXYpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); @@ -623,17 +632,18 @@ void cuStenDestroy2DXYpFun( \param numStenBottom Number of points on the bottom of the stencil */ +template void cuStenCreate2DYnp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* weights, int numSten, int numStenTop, int numStenBottom @@ -645,10 +655,10 @@ void cuStenCreate2DYnp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DYnp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DYnp @@ -656,8 +666,9 @@ void cuStenSwap2DYnp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DYnp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // ---------------------------------------- @@ -681,21 +692,22 @@ void cuStenDestroy2DYnp( \param Pointer to user function */ +template void cuStenCreate2DYnpFun( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* coe, + elemType* dataOutput, + elemType* dataInput, + elemType* coe, int numSten, int numStenTop, int numStenBottom, - double* func + elemType* func ); /*! \fun void cuStenSwap2DYnpFun @@ -704,10 +716,10 @@ void cuStenCreate2DYnpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DYnpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DYnpFun @@ -715,8 +727,9 @@ void cuStenSwap2DYnpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DYnpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // ---------------------------------------- @@ -739,17 +752,18 @@ void cuStenDestroy2DYnpFun( \param numStenBottom Number of points on the bottom of the stencil */ +template void cuStenCreate2DYp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* weights, int numSten, int numStenTop, int numStenBottom @@ -761,10 +775,10 @@ void cuStenCreate2DYp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DYp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DYp @@ -772,8 +786,9 @@ void cuStenSwap2DYp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DYp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // ---------------------------------------- @@ -798,22 +813,23 @@ void cuStenDestroy2DYp( \param Pointer to user function */ +template void cuStenCreate2DYpFun( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* coe, + elemType* dataOutput, + elemType* dataInput, + elemType* coe, int numSten, int numStenTop, int numStenBottom, int numCoe, - double* func + elemType* func ); /*! \fun void cuStenSwap2DYpFun @@ -822,10 +838,10 @@ void cuStenCreate2DYpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DYpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ); /*! \fun void cuStenDestroy2DYnpFun @@ -833,8 +849,9 @@ void cuStenSwap2DYpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DYpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ); // --------------------------------------------------------------------- diff --git a/cuSten/src/struct/cuSten_struct_type.h b/cuSten/src/struct/cuSten_struct_type.h index a3294b6..a1ecde3 100644 --- a/cuSten/src/struct/cuSten_struct_type.h +++ b/cuSten/src/struct/cuSten_struct_type.h @@ -80,8 +80,8 @@ * @var devFunc Pointer to user defined function pointer */ - -typedef struct +template +struct cuSten_t { int deviceNum; int numStreams; @@ -101,24 +101,24 @@ typedef struct int xGrid; int yGrid; int mem_shared; - double** dataInput; - double** dataOutput; - double** uVel; - double** vVel; - double* weights; - double* coe; - double coeDx; - double coeDy; + elemType** dataInput; + elemType** dataOutput; + elemType** uVel; + elemType** vVel; + elemType* weights; + elemType* coe; + elemType coeDx; + elemType coeDy; int numCoe; int nxLocal; int nyLocal; - double** boundaryTop; - double** boundaryBottom; + elemType** boundaryTop; + elemType** boundaryBottom; int numBoundaryTop; int numBoundaryBottom; cudaStream_t* streams; cudaEvent_t* events; - double* devFunc; -} cuSten_t; + elemType* devFunc; +}; #endif diff --git a/cuSten/src/struct/custenCreateDestroy2DXYADVWENOp.cu b/cuSten/src/struct/custenCreateDestroy2DXYADVWENOp.cu index 9fe2f19..0add1bd 100644 --- a/cuSten/src/struct/custenCreateDestroy2DXYADVWENOp.cu +++ b/cuSten/src/struct/custenCreateDestroy2DXYADVWENOp.cu @@ -54,21 +54,21 @@ \param dataInput Pointer to data input to the function */ -void cuStenCreate2DXYWENOADVp -( - cuSten_t* pt_cuSten, +template +void cuStenCreate2DXYWENOADVp( + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double dx, - double dy, - double* u, - double* v, - double* dataOutput, - double* dataInput + elemType dx, + elemType dy, + elemType* u, + elemType* v, + elemType* dataOutput, + elemType* dataInput ) { // Buffer used for error checking @@ -153,7 +153,7 @@ void cuStenCreate2DXYWENOADVp pt_cuSten->nyLocal = pt_cuSten->BLOCK_Y + pt_cuSten->numStenTop + pt_cuSten->numStenBottom; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(double) + pt_cuSten->numSten * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(elemType) + pt_cuSten->numSten * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -163,16 +163,16 @@ void cuStenCreate2DXYWENOADVp pt_cuSten->yGrid = (pt_cuSten->nyTile % pt_cuSten->BLOCK_Y == 0) ? (pt_cuSten->nyTile / pt_cuSten->BLOCK_Y) : (pt_cuSten->nyTile / pt_cuSten->BLOCK_Y + 1); // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each input x velocity tile - pt_cuSten->uVel = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->uVel = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each input v velocity tile - pt_cuSten->vVel = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->vVel = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -199,10 +199,10 @@ void cuStenCreate2DXYWENOADVp // 3 or greater // Allocate top boundary memory - pt_cuSten->boundaryTop = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryTop = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate bottom boundary memory - pt_cuSten->boundaryBottom = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryBottom = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); switch(pt_cuSten->numTiles) { @@ -263,10 +263,10 @@ void cuStenCreate2DXYWENOADVp \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXYWENOADVp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -322,9 +322,10 @@ void cuStenSwap2DXYWENOADVp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXYWENOADVp ( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -371,7 +372,66 @@ void cuStenDestroy2DXYWENOADVp free(pt_cuSten->boundaryBottom); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DXYWENOADVp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double, + double, + double*, + double*, + double*, + double* +); + +template +void cuStenSwap2DXYWENOADVp( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DXYWENOADVp( + cuSten_t* +); + +template +void cuStenCreate2DXYWENOADVp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float, + float, + float*, + float*, + float*, + float* +); + +template +void cuStenSwap2DXYWENOADVp( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DXYWENOADVp( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DXYnp.cu b/cuSten/src/struct/custenCreateDestroy2DXYnp.cu index e1c7718..1e5e9c5 100644 --- a/cuSten/src/struct/custenCreateDestroy2DXYnp.cu +++ b/cuSten/src/struct/custenCreateDestroy2DXYnp.cu @@ -57,17 +57,18 @@ \param numStenBottom Number of points on the bottom of the stencil */ +template void cuStenCreate2DXYnp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* weights, int numStenHoriz, int numStenLeft, int numStenRight, @@ -149,7 +150,7 @@ void cuStenCreate2DXYnp( pt_cuSten->nyLocal = pt_cuSten->BLOCK_Y + pt_cuSten->numStenTop + pt_cuSten->numStenBottom; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(double) + pt_cuSten->numSten * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(elemType) + pt_cuSten->numSten * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -162,10 +163,10 @@ void cuStenCreate2DXYnp( pt_cuSten->weights = weights; // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -186,10 +187,10 @@ void cuStenCreate2DXYnp( // 3 or greater // Allocate top boundary memory - pt_cuSten->boundaryTop = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryTop = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate bottom boundary memory - pt_cuSten->boundaryBottom = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryBottom = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); switch(pt_cuSten->numTiles) { @@ -250,10 +251,10 @@ void cuStenCreate2DXYnp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXYnp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -309,8 +310,9 @@ void cuStenSwap2DXYnp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXYnp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -357,7 +359,72 @@ void cuStenDestroy2DXYnp( free(pt_cuSten->boundaryBottom); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DXYnp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int, + int, + int, + int +); + +template +void cuStenSwap2DXYnp( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DXYnp( + cuSten_t* +); + +template +void cuStenCreate2DXYnp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int, + int, + int, + int +); + +template +void cuStenSwap2DXYnp( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DXYnp( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DXYnpFun.cu b/cuSten/src/struct/custenCreateDestroy2DXYnpFun.cu index 4a8d0c4..bee1cee 100644 --- a/cuSten/src/struct/custenCreateDestroy2DXYnpFun.cu +++ b/cuSten/src/struct/custenCreateDestroy2DXYnpFun.cu @@ -58,24 +58,25 @@ \param Pointer to user function */ +template void cuStenCreate2DXYnpFun( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataNew, - double* dataOld, - double* coe, + elemType* dataNew, + elemType* dataOld, + elemType* coe, int numStenHoriz, int numStenLeft, int numStenRight, int numStenVert, int numStenTop, int numStenBottom, - double* func + elemType* func ) { // Buffer used for error checking @@ -151,7 +152,7 @@ void cuStenCreate2DXYnpFun( pt_cuSten->nyLocal = pt_cuSten->BLOCK_Y + pt_cuSten->numStenTop + pt_cuSten->numStenBottom; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(double) + pt_cuSten->numSten * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(elemType) + pt_cuSten->numSten * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -164,10 +165,10 @@ void cuStenCreate2DXYnpFun( pt_cuSten->coe = coe; // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -188,10 +189,10 @@ void cuStenCreate2DXYnpFun( // 3 or greater // Allocate top boundary memory - pt_cuSten->boundaryTop = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryTop = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate bottom boundary memory - pt_cuSten->boundaryBottom = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryBottom = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); switch(pt_cuSten->numTiles) { @@ -255,10 +256,10 @@ void cuStenCreate2DXYnpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXYnpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -314,8 +315,9 @@ void cuStenSwap2DXYnpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXYnpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -362,7 +364,74 @@ void cuStenDestroy2DXYnpFun( free(pt_cuSten->boundaryBottom); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DXYnpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int, + int, + int, + int, + double* +); + +template +void cuStenSwap2DXYnpFun( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DXYnpFun( + cuSten_t* +); + +template +void cuStenCreate2DXYnpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int, + int, + int, + int, + float* +); + +template +void cuStenSwap2DXYnpFun( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DXYnpFun( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DXYp.cu b/cuSten/src/struct/custenCreateDestroy2DXYp.cu index da138c1..5e66d20 100644 --- a/cuSten/src/struct/custenCreateDestroy2DXYp.cu +++ b/cuSten/src/struct/custenCreateDestroy2DXYp.cu @@ -57,17 +57,18 @@ \param numStenBottom Number of points on the bottom of the stencil */ +template void cuStenCreate2DXYp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataNew, - double* dataOld, - double* weights, + elemType* dataNew, + elemType* dataOld, + elemType* weights, int numStenHoriz, int numStenLeft, int numStenRight, @@ -149,7 +150,7 @@ void cuStenCreate2DXYp( pt_cuSten->nyLocal = pt_cuSten->BLOCK_Y + pt_cuSten->numStenTop + pt_cuSten->numStenBottom; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(double) + pt_cuSten->numSten * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(elemType) + pt_cuSten->numSten * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -162,10 +163,10 @@ void cuStenCreate2DXYp( pt_cuSten->weights = weights; // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -186,10 +187,10 @@ void cuStenCreate2DXYp( // 3 or greater // Allocate top boundary memory - pt_cuSten->boundaryTop = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryTop = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate bottom boundary memory - pt_cuSten->boundaryBottom = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryBottom = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); switch(pt_cuSten->numTiles) { @@ -250,10 +251,10 @@ void cuStenCreate2DXYp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXYp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -309,8 +310,9 @@ void cuStenSwap2DXYp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXYp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -357,7 +359,72 @@ void cuStenDestroy2DXYp( free(pt_cuSten->boundaryBottom); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DXYp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int, + int, + int, + int +); + +template +void cuStenSwap2DXYp( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DXYp( + cuSten_t* +); + +template +void cuStenCreate2DXYp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int, + int, + int, + int +); + +template +void cuStenSwap2DXYp( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DXYp( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DXYpFun.cu b/cuSten/src/struct/custenCreateDestroy2DXYpFun.cu index 3290d56..8d6aa8a 100644 --- a/cuSten/src/struct/custenCreateDestroy2DXYpFun.cu +++ b/cuSten/src/struct/custenCreateDestroy2DXYpFun.cu @@ -59,24 +59,25 @@ \param Pointer to user function */ +template void cuStenCreate2DXYpFun( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* coe, + elemType* dataOutput, + elemType* dataInput, + elemType* coe, int numStenHoriz, int numStenLeft, int numStenRight, int numStenVert, int numStenTop, int numStenBottom, - double* func + elemType* func ) { // Buffer used for error checking @@ -152,7 +153,7 @@ void cuStenCreate2DXYpFun( pt_cuSten->nyLocal = pt_cuSten->BLOCK_Y + pt_cuSten->numStenTop + pt_cuSten->numStenBottom; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(double) + pt_cuSten->numSten * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(elemType) + pt_cuSten->numSten * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -165,10 +166,10 @@ void cuStenCreate2DXYpFun( pt_cuSten->coe = coe; // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -189,10 +190,10 @@ void cuStenCreate2DXYpFun( // 3 or greater // Allocate top boundary memory - pt_cuSten->boundaryTop = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryTop = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate bottom boundary memory - pt_cuSten->boundaryBottom = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryBottom = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); switch(pt_cuSten->numTiles) { @@ -256,10 +257,10 @@ void cuStenCreate2DXYpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXYpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -315,8 +316,9 @@ void cuStenSwap2DXYpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXYpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -363,7 +365,74 @@ void cuStenDestroy2DXYpFun( free(pt_cuSten->boundaryBottom); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DXYpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int, + int, + int, + int, + double* +); + +template +void cuStenSwap2DXYpFun( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DXYpFun( + cuSten_t* +); + +template +void cuStenCreate2DXYpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int, + int, + int, + int, + float* +); + +template +void cuStenSwap2DXYpFun( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DXYpFun( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DXnp.cu b/cuSten/src/struct/custenCreateDestroy2DXnp.cu index ba4c82f..983f290 100644 --- a/cuSten/src/struct/custenCreateDestroy2DXnp.cu +++ b/cuSten/src/struct/custenCreateDestroy2DXnp.cu @@ -54,17 +54,18 @@ \param numStenRight Number of points on the right side of the stencil */ +template void cuStenCreate2DXnp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* weights, int numSten, int numStenLeft, int numStenRight @@ -131,7 +132,7 @@ void cuStenCreate2DXnp( pt_cuSten->numStenRight = numStenRight; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->BLOCK_Y * pt_cuSten->BLOCK_X + pt_cuSten->BLOCK_Y * (pt_cuSten->numStenLeft + pt_cuSten->numStenRight)) * sizeof(double) + pt_cuSten->numSten * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->BLOCK_Y * pt_cuSten->BLOCK_X + pt_cuSten->BLOCK_Y * (pt_cuSten->numStenLeft + pt_cuSten->numStenRight)) * sizeof(elemType) + pt_cuSten->numSten * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -144,10 +145,10 @@ void cuStenCreate2DXnp( pt_cuSten->weights = weights; // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(double)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(double)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -173,10 +174,10 @@ void cuStenCreate2DXnp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXnp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -195,8 +196,9 @@ void cuStenSwap2DXnp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXnp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -237,7 +239,67 @@ void cuStenDestroy2DXnp( free(pt_cuSten->dataOutput); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DXnp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int +); + +template +void cuStenSwap2DXnp( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DXnp( + cuSten_t* +); + +template +void cuStenCreate2DXnp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int +); + +template +void cuStenSwap2DXnp( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DXnp( + cuSten_t* +); + // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DXnpFun.cu b/cuSten/src/struct/custenCreateDestroy2DXnpFun.cu index b5f073e..d6b9d13 100644 --- a/cuSten/src/struct/custenCreateDestroy2DXnpFun.cu +++ b/cuSten/src/struct/custenCreateDestroy2DXnpFun.cu @@ -55,22 +55,23 @@ \param Pointer to user function */ +template void cuStenCreate2DXnpFun( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dateOutput, - double* dateInput, - double* coe, + elemType* dateOutput, + elemType* dateInput, + elemType* coe, int numSten, int numStenLeft, int numStenRight, int numCoe, - double* func + elemType* func ) { // Buffer used for error checking @@ -144,7 +145,7 @@ void cuStenCreate2DXnpFun( pt_cuSten->nyLocal = pt_cuSten->BLOCK_Y; // Set the amount of shared memory required - pt_cuSten->mem_shared = pt_cuSten->nxLocal * pt_cuSten->nyLocal * sizeof(double) + numCoe * sizeof(double); + pt_cuSten->mem_shared = pt_cuSten->nxLocal * pt_cuSten->nyLocal * sizeof(elemType) + numCoe * sizeof(elemType); // Find number of points per tile pt_cuSten->nx = pt_cuSten->nx; @@ -157,10 +158,10 @@ void cuStenCreate2DXnpFun( pt_cuSten->yGrid = (pt_cuSten->nyTile % pt_cuSten->BLOCK_Y == 0) ? (pt_cuSten->nyTile / pt_cuSten->BLOCK_Y) : (pt_cuSten->nyTile / pt_cuSten->BLOCK_Y + 1); // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -190,10 +191,10 @@ void cuStenCreate2DXnpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXnpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -212,8 +213,9 @@ void cuStenSwap2DXnpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXnpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -253,7 +255,70 @@ void cuStenDestroy2DXnpFun( free(pt_cuSten->dataOutput); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DXnpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int, + int, + double* +); + +template +void cuStenSwap2DXnpFun( + cuSten_t*, + double* dataInput +); + +template +void cuStenDestroy2DXnpFun( + cuSten_t* +); + +template +void cuStenCreate2DXnpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int, + int, + float* +); + +template +void cuStenSwap2DXnpFun( + cuSten_t*, + float* dataInput +); + +template +void cuStenDestroy2DXnpFun( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DXp.cu b/cuSten/src/struct/custenCreateDestroy2DXp.cu index 8aee53e..6d33865 100644 --- a/cuSten/src/struct/custenCreateDestroy2DXp.cu +++ b/cuSten/src/struct/custenCreateDestroy2DXp.cu @@ -54,17 +54,18 @@ \param numStenRight Number of points on the right side of the stencil */ +template void cuStenCreate2DXp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataNew, - double* dataOld, - double* weights, + elemType* dataNew, + elemType* dataOld, + elemType* weights, int numSten, int numStenLeft, int numStenRight @@ -131,7 +132,7 @@ void cuStenCreate2DXp( pt_cuSten->numStenRight = numStenRight; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->BLOCK_Y * pt_cuSten->BLOCK_X + pt_cuSten->BLOCK_Y * (pt_cuSten->numStenLeft + pt_cuSten->numStenRight)) * sizeof(double) + pt_cuSten->numSten * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->BLOCK_Y * pt_cuSten->BLOCK_X + pt_cuSten->BLOCK_Y * (pt_cuSten->numStenLeft + pt_cuSten->numStenRight)) * sizeof(elemType) + pt_cuSten->numSten * sizeof(elemType); // Find number of points per tile pt_cuSten->nx = pt_cuSten->nx; @@ -147,10 +148,10 @@ void cuStenCreate2DXp( pt_cuSten->weights = weights; // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -176,10 +177,10 @@ void cuStenCreate2DXp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -198,8 +199,9 @@ void cuStenSwap2DXp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -240,7 +242,66 @@ void cuStenDestroy2DXp( free(pt_cuSten->dataOutput); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DXp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int +); + +template +void cuStenSwap2DXp( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DXp( + cuSten_t* +); + +template +void cuStenCreate2DXp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int +); + +template +void cuStenSwap2DXp( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DXp( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DXpFun.cu b/cuSten/src/struct/custenCreateDestroy2DXpFun.cu index 2e343fb..334338d 100644 --- a/cuSten/src/struct/custenCreateDestroy2DXpFun.cu +++ b/cuSten/src/struct/custenCreateDestroy2DXpFun.cu @@ -57,30 +57,23 @@ \param Pointer to user function */ +template void cuStenCreate2DXpFun( - cuSten_t* pt_cuSten, - + cuSten_t* pt_cuSten, int deviceNum, - int numTiles, - int nx, int ny, - int BLOCK_X, int BLOCK_Y, - - double* dataNew, - double* dataOld, - double* coe, - + elemType* dataNew, + elemType* dataOld, + elemType* coe, int numSten, int numStenLeft, int numStenRight, - int numCoe, - - double* func + elemType* func ) { // Buffer used for error checking @@ -154,7 +147,7 @@ void cuStenCreate2DXpFun( pt_cuSten->nyLocal = pt_cuSten->BLOCK_Y + pt_cuSten->numStenTop + pt_cuSten->numStenBottom; // Set the amount of shared memory required - pt_cuSten->mem_shared = pt_cuSten->nxLocal * pt_cuSten->nyLocal * sizeof(double) + numCoe * sizeof(double); + pt_cuSten->mem_shared = pt_cuSten->nxLocal * pt_cuSten->nyLocal * sizeof(elemType) + numCoe * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -164,10 +157,10 @@ void cuStenCreate2DXpFun( pt_cuSten->yGrid = (pt_cuSten->nyTile % pt_cuSten->BLOCK_Y == 0) ? (pt_cuSten->nyTile / pt_cuSten->BLOCK_Y) : (pt_cuSten->nyTile / pt_cuSten->BLOCK_Y + 1); // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -182,8 +175,6 @@ void cuStenCreate2DXpFun( pt_cuSten->dataOutput[tile] = &dataNew[tile * offset]; } - - pt_cuSten->devFunc = func; } @@ -198,10 +189,10 @@ void cuStenCreate2DXpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DXpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -220,8 +211,9 @@ void cuStenSwap2DXpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DXpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -261,7 +253,70 @@ void cuStenDestroy2DXpFun( free(pt_cuSten->dataOutput); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DXpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int, + int, + double* +); + +template +void cuStenSwap2DXpFun( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DXpFun( + cuSten_t* +); + +template +void cuStenCreate2DXpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int, + int, + float* +); + +template +void cuStenSwap2DXpFun( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DXpFun( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DYnp.cu b/cuSten/src/struct/custenCreateDestroy2DYnp.cu index 20b866f..ce18b7f 100644 --- a/cuSten/src/struct/custenCreateDestroy2DYnp.cu +++ b/cuSten/src/struct/custenCreateDestroy2DYnp.cu @@ -54,17 +54,18 @@ \param numStenBottom Number of points on the bottom of the stencil */ +template void cuStenCreate2DYnp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* weights, int numSten, int numStenTop, int numStenBottom @@ -137,7 +138,7 @@ void cuStenCreate2DYnp( pt_cuSten->nyLocal = pt_cuSten->BLOCK_Y + pt_cuSten->numStenTop + pt_cuSten->numStenBottom; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(double) + pt_cuSten->numSten * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(elemType) + pt_cuSten->numSten * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -150,10 +151,10 @@ void cuStenCreate2DYnp( pt_cuSten->weights = weights; // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -174,10 +175,10 @@ void cuStenCreate2DYnp( // 3 or greater // Allocate top boundary memory - pt_cuSten->boundaryTop = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryTop = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate bottom boundary memory - pt_cuSten->boundaryBottom = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryBottom = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); switch(pt_cuSten->numTiles) { @@ -233,10 +234,10 @@ void cuStenCreate2DYnp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DYnp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -292,8 +293,9 @@ void cuStenSwap2DYnp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DYnp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -340,7 +342,66 @@ void cuStenDestroy2DYnp( free(pt_cuSten->boundaryBottom); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DYnp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int +); + +template +void cuStenSwap2DYnp( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DYnp( + cuSten_t* +); + +template +void cuStenCreate2DYnp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int +); + +template +void cuStenSwap2DYnp( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DYnp( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DYnpFun.cu b/cuSten/src/struct/custenCreateDestroy2DYnpFun.cu index e685d9b..93205f2 100644 --- a/cuSten/src/struct/custenCreateDestroy2DYnpFun.cu +++ b/cuSten/src/struct/custenCreateDestroy2DYnpFun.cu @@ -55,21 +55,22 @@ \param Pointer to user function */ +template void cuStenCreate2DYnpFun( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* coe, + elemType* dataOutput, + elemType* dataInput, + elemType* coe, int numSten, int numStenTop, int numStenBottom, - double* func + elemType* func ) { // Buffer used for error checking @@ -139,7 +140,7 @@ void cuStenCreate2DYnpFun( pt_cuSten->nyLocal = pt_cuSten->BLOCK_Y + pt_cuSten->numStenTop + pt_cuSten->numStenBottom; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(double) + pt_cuSten->numSten * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(elemType) + pt_cuSten->numSten * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -152,10 +153,10 @@ void cuStenCreate2DYnpFun( pt_cuSten->coe = coe; // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -176,10 +177,10 @@ void cuStenCreate2DYnpFun( // 3 or greater // Allocate top boundary memory - pt_cuSten->boundaryTop = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryTop = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate bottom boundary memory - pt_cuSten->boundaryBottom = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryBottom = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); switch(pt_cuSten->numTiles) { @@ -238,10 +239,10 @@ void cuStenCreate2DYnpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DYnpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -298,8 +299,9 @@ void cuStenSwap2DYnpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DYnpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -346,7 +348,68 @@ void cuStenDestroy2DYnpFun( free(pt_cuSten->boundaryBottom); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DYnpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int, + double* +); + +template +void cuStenSwap2DYnpFun( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DYnpFun( + cuSten_t* +); + +template +void cuStenCreate2DYnpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int, + float* +); + +template +void cuStenSwap2DYnpFun( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DYnpFun( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DYp.cu b/cuSten/src/struct/custenCreateDestroy2DYp.cu index 544acc4..7978413 100644 --- a/cuSten/src/struct/custenCreateDestroy2DYp.cu +++ b/cuSten/src/struct/custenCreateDestroy2DYp.cu @@ -54,17 +54,18 @@ \param numStenBottom Number of points on the bottom of the stencil */ +template void cuStenCreate2DYp( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* weights, + elemType* dataOutput, + elemType* dataInput, + elemType* weights, int numSten, int numStenTop, int numStenBottom @@ -137,7 +138,7 @@ void cuStenCreate2DYp( pt_cuSten->nyLocal = pt_cuSten->BLOCK_Y + pt_cuSten->numStenTop + pt_cuSten->numStenBottom; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(double) + pt_cuSten->numSten * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(elemType) + pt_cuSten->numSten * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -150,10 +151,10 @@ void cuStenCreate2DYp( pt_cuSten->weights = weights; // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -174,10 +175,10 @@ void cuStenCreate2DYp( // 3 or greater // Allocate top boundary memory - pt_cuSten->boundaryTop = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryTop = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate bottom boundary memory - pt_cuSten->boundaryBottom = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryBottom = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); switch(pt_cuSten->numTiles) { @@ -232,10 +233,10 @@ void cuStenCreate2DYp( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DYp( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -291,8 +292,9 @@ void cuStenSwap2DYp( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DYp( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -339,7 +341,66 @@ void cuStenDestroy2DYp( free(pt_cuSten->boundaryBottom); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DYp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int +); + +template +void cuStenSwap2DYp( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DYp( + cuSten_t* +); + +template +void cuStenCreate2DYp( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int +); + +template +void cuStenSwap2DYp( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DYp( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/cuSten/src/struct/custenCreateDestroy2DYpFun.cu b/cuSten/src/struct/custenCreateDestroy2DYpFun.cu index 3d94bca..5ee4ada 100644 --- a/cuSten/src/struct/custenCreateDestroy2DYpFun.cu +++ b/cuSten/src/struct/custenCreateDestroy2DYpFun.cu @@ -56,22 +56,23 @@ \param Pointer to user function */ +template void cuStenCreate2DYpFun( - cuSten_t* pt_cuSten, + cuSten_t* pt_cuSten, int deviceNum, int numTiles, int nx, int ny, int BLOCK_X, int BLOCK_Y, - double* dataOutput, - double* dataInput, - double* coe, + elemType* dataOutput, + elemType* dataInput, + elemType* coe, int numSten, int numStenTop, int numStenBottom, int numCoe, - double* func + elemType* func ) { // Buffer used for error checking @@ -144,7 +145,7 @@ void cuStenCreate2DYpFun( pt_cuSten->numCoe = numCoe; // Set the amount of shared memory required - pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(double) + pt_cuSten->numCoe * sizeof(double); + pt_cuSten->mem_shared = (pt_cuSten->nxLocal * pt_cuSten->nyLocal) * sizeof(elemType) + pt_cuSten->numCoe * sizeof(elemType); // Find number of points per tile pt_cuSten->nyTile = pt_cuSten->ny / pt_cuSten->numTiles; @@ -157,10 +158,10 @@ void cuStenCreate2DYpFun( pt_cuSten->coe = coe; // Allocate the pointers for each input tile - pt_cuSten->dataInput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataInput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate the pointers for each output tile - pt_cuSten->dataOutput = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->dataOutput = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // // Tile offset index int offset = pt_cuSten->nx * pt_cuSten->nyTile; @@ -181,10 +182,10 @@ void cuStenCreate2DYpFun( // 3 or greater // Allocate top boundary memory - pt_cuSten->boundaryTop = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryTop = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); // Allocate bottom boundary memory - pt_cuSten->boundaryBottom = (double**)malloc(pt_cuSten->numTiles * sizeof(double)); + pt_cuSten->boundaryBottom = (elemType**)malloc(pt_cuSten->numTiles * sizeof(elemType)); switch(pt_cuSten->numTiles) { @@ -242,10 +243,10 @@ void cuStenCreate2DYpFun( \param dataInput Pointer to data input to the on the next compute */ +template void cuStenSwap2DYpFun( - cuSten_t* pt_cuSten, - - double* dataInput + cuSten_t* pt_cuSten, + elemType* dataInput ) { for (int tile = 0; tile < pt_cuSten->numTiles; tile++) @@ -301,8 +302,9 @@ void cuStenSwap2DYpFun( \param pt_cuSten Pointer to cuSten type provided by user */ +template void cuStenDestroy2DYpFun( - cuSten_t* pt_cuSten + cuSten_t* pt_cuSten ) { // Buffer used for error checking @@ -349,7 +351,70 @@ void cuStenDestroy2DYpFun( free(pt_cuSten->boundaryBottom); } +// --------------------------------------------------------------------- +// Explicit instantiation +// --------------------------------------------------------------------- + +template +void cuStenCreate2DYpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + double*, + double*, + double*, + int, + int, + int, + int, + double* +); + +template +void cuStenSwap2DYpFun( + cuSten_t*, + double* +); + +template +void cuStenDestroy2DYpFun( + cuSten_t* +); + +template +void cuStenCreate2DYpFun( + cuSten_t*, + int, + int, + int, + int, + int, + int, + float*, + float*, + float*, + int, + int, + int, + int, + float* +); + +template +void cuStenSwap2DYpFun( + cuSten_t*, + float* +); + +template +void cuStenDestroy2DYpFun( + cuSten_t* +); // --------------------------------------------------------------------- // End of file -// --------------------------------------------------------------------- \ No newline at end of file +// --------------------------------------------------------------------- diff --git a/examples/src/2d_x_np.cu b/examples/src/2d_x_np.cu index 7b38a3a..0ddb4d1 100644 --- a/examples/src/2d_x_np.cu +++ b/examples/src/2d_x_np.cu @@ -118,7 +118,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t xDirCompute; + cuSten_t xDirCompute; // Initialise the instance of the stencil cuStenCreate2DXnp(&xDirCompute, deviceNum, numTiles, nx, ny, BLOCK_X, BLOCK_Y, dataNew, dataOld, weights, numSten, numStenLeft, numStenRight); @@ -159,4 +159,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_x_np_fun.cu b/examples/src/2d_x_np_fun.cu index fde7c68..b797df1 100644 --- a/examples/src/2d_x_np_fun.cu +++ b/examples/src/2d_x_np_fun.cu @@ -125,7 +125,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t xDirCompute; + cuSten_t xDirCompute; // Copy the function pointer to the device double* func; @@ -170,4 +170,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_x_p.cu b/examples/src/2d_x_p.cu index 3bcf692..7adbc43 100644 --- a/examples/src/2d_x_p.cu +++ b/examples/src/2d_x_p.cu @@ -118,7 +118,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t xDirCompute; + cuSten_t xDirCompute; // Initialise the instance of the stencil cuStenCreate2DXp(&xDirCompute, deviceNum, numTiles, nx, ny, BLOCK_X, BLOCK_Y, dataNew, dataOld, weights, numSten, numStenLeft, numStenRight); @@ -159,4 +159,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_xyWENOADV_p.cu b/examples/src/2d_xyWENOADV_p.cu index b4bf83b..f68e067 100644 --- a/examples/src/2d_xyWENOADV_p.cu +++ b/examples/src/2d_xyWENOADV_p.cu @@ -110,7 +110,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t xyWENOCompute; + cuSten_t xyWENOCompute; // Initialise the instance of the stencil cuStenCreate2DXYWENOADVp( @@ -166,4 +166,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_xy_np.cu b/examples/src/2d_xy_np.cu index 5c06d88..8e36692 100644 --- a/examples/src/2d_xy_np.cu +++ b/examples/src/2d_xy_np.cu @@ -124,7 +124,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t xyDirCompute; + cuSten_t xyDirCompute; // Initialise the instance of the stencil cuStenCreate2DXYnp( @@ -194,4 +194,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_xy_np_fun.cu b/examples/src/2d_xy_np_fun.cu index 95d030f..a3e34b8 100644 --- a/examples/src/2d_xy_np_fun.cu +++ b/examples/src/2d_xy_np_fun.cu @@ -172,7 +172,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t xyDirCompute; + cuSten_t xyDirCompute; // Copy the function to device memory double* func; @@ -249,4 +249,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_xy_p.cu b/examples/src/2d_xy_p.cu index c59038e..7b094a0 100644 --- a/examples/src/2d_xy_p.cu +++ b/examples/src/2d_xy_p.cu @@ -124,7 +124,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t xyDirCompute; + cuSten_t xyDirCompute; // Initialise the instance of the stencil cuStenCreate2DXYp( @@ -181,4 +181,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_xy_p_fun.cu b/examples/src/2d_xy_p_fun.cu index acb8f92..b2cef39 100644 --- a/examples/src/2d_xy_p_fun.cu +++ b/examples/src/2d_xy_p_fun.cu @@ -171,7 +171,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t xyDirCompute; + cuSten_t xyDirCompute; // Copy the function to device memory double* func; @@ -248,4 +248,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_y_np.cu b/examples/src/2d_y_np.cu index 7899f68..a03bd89 100644 --- a/examples/src/2d_y_np.cu +++ b/examples/src/2d_y_np.cu @@ -116,7 +116,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t yDirCompute; + cuSten_t yDirCompute; // Initialise the instance of the stencil cuStenCreate2DYnp(&yDirCompute, deviceNum, numTiles, nx, ny, BLOCK_X, BLOCK_Y, dataNew, dataOld, weights, numSten, numStenTop, numStenBottom); @@ -157,4 +157,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_y_np_fun.cu b/examples/src/2d_y_np_fun.cu index 9e38c43..7f3391c 100644 --- a/examples/src/2d_y_np_fun.cu +++ b/examples/src/2d_y_np_fun.cu @@ -145,7 +145,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t yDirCompute; + cuSten_t yDirCompute; double* func; cudaMemcpyFromSymbol(&func, devFunc, sizeof(devArg1Y)); @@ -189,4 +189,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_y_p.cu b/examples/src/2d_y_p.cu index 3757e8d..3415f86 100644 --- a/examples/src/2d_y_p.cu +++ b/examples/src/2d_y_p.cu @@ -112,7 +112,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t yDirCompute; + cuSten_t yDirCompute; // Initialise the instance of the stencil cuStenCreate2DYp(&yDirCompute, deviceNum, numTiles, nx, ny, BLOCK_X, BLOCK_Y, dataNew, dataOld, weights, numSten, numStenTop, numStenBottom); @@ -153,4 +153,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +} diff --git a/examples/src/2d_y_p_fun.cu b/examples/src/2d_y_p_fun.cu index c14f108..47fa7d8 100644 --- a/examples/src/2d_y_p_fun.cu +++ b/examples/src/2d_y_p_fun.cu @@ -141,7 +141,7 @@ int main() // ----------------------------- // Set up the compute device structs - cuSten_t yDirCompute; + cuSten_t yDirCompute; double* func; cudaMemcpyFromSymbol(&func, devFunc, sizeof(devArg1Y)); @@ -185,4 +185,4 @@ int main() // Return 0 when the program completes return 0; -} \ No newline at end of file +}