-
Notifications
You must be signed in to change notification settings - Fork 9
/
gpu_library.cu
83 lines (70 loc) · 2.11 KB
/
gpu_library.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#include <sstream>
#include <iostream>
#include <cuda_runtime.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <cuda_runtime.h>
// In-place elementwise scaling: vec[i] *= scalar for every i in
// [0, num_elements). Expects a 1-D launch; surplus threads past the
// end of the array fall through the bounds guard and do nothing.
template <typename T>
__global__ void kernel(T *vec, T scalar, int num_elements)
{
    unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < num_elements) {
        vec[i] *= scalar;
    }
}
// Launch `kernel<T>` over `num_elements` elements of the device array `vec`
// (in-place multiply by `scalar`) on the default stream.
//
// Preconditions: `vec` is a device pointer with at least `num_elements`
// elements. A non-positive `num_elements` is a no-op (a 0-block launch
// would otherwise be an invalid configuration).
//
// Throws std::runtime_error on launch failure. The original code threw a
// bare std::string, which does not derive from std::exception and so would
// escape both `catch (const std::exception&)` handlers and pybind11's
// exception translation.
template <typename T>
void run_kernel
(T *vec, T scalar, int num_elements)
{
  if (num_elements <= 0) {
    return;
  }
  constexpr unsigned int threads_per_block = 256;
  dim3 dimBlock(threads_per_block, 1, 1);
  // Integer ceil-division; exact for all sizes, unlike the previous
  // floating-point ceil((T)num_elements / dimBlock.x).
  dim3 dimGrid((num_elements + threads_per_block - 1) / threads_per_block, 1, 1);
  kernel<T><<<dimGrid, dimBlock>>>
    (vec, scalar, num_elements);
  // Kernel launches do not return errors directly; fetch the launch status.
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    std::stringstream strstr;
    strstr << "run_kernel launch failed" << std::endl;
    strstr << "dimBlock: " << dimBlock.x << ", " << dimBlock.y << std::endl;
    strstr << "dimGrid: " << dimGrid.x << ", " << dimGrid.y << std::endl;
    strstr << cudaGetErrorString(error);
    throw std::runtime_error(strstr.str());
  }
}
// Multiply every element of a 1-D numpy array in place by `scalar`,
// doing the work on the GPU: H2D copy, kernel launch, D2H copy back.
//
// Throws std::runtime_error if the array is not 1-D or if any CUDA call
// fails. Device memory is released on every path — the original leaked
// `gpu_ptr` whenever a copy or the kernel launch threw before cudaFree.
template <typename T>
void map_array(pybind11::array_t<T> vec, T scalar)
{
  pybind11::buffer_info ha = vec.request();
  if (ha.ndim != 1) {
    std::stringstream strstr;
    strstr << "ha.ndim != 1" << std::endl;
    strstr << "ha.ndim: " << ha.ndim << std::endl;
    throw std::runtime_error(strstr.str());
  }
  int size = ha.shape[0];
  // size_t avoids int overflow of size * sizeof(T) for large arrays.
  size_t size_bytes = static_cast<size_t>(size) * sizeof(T);
  T *gpu_ptr = nullptr;
  cudaError_t error = cudaMalloc(&gpu_ptr, size_bytes);
  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }
  try {
    T* ptr = reinterpret_cast<T*>(ha.ptr);
    error = cudaMemcpy(gpu_ptr, ptr, size_bytes, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
      throw std::runtime_error(cudaGetErrorString(error));
    }
    run_kernel<T>(gpu_ptr, scalar, size);
    error = cudaMemcpy(ptr, gpu_ptr, size_bytes, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) {
      throw std::runtime_error(cudaGetErrorString(error));
    }
  } catch (...) {
    cudaFree(gpu_ptr);  // best-effort release; the original error is what matters
    throw;
  }
  error = cudaFree(gpu_ptr);
  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }
}
// Python module definition: exposes map_array<double> to Python as
// gpu_library.multiply_with_scalar(array, scalar), which scales a 1-D
// float64 numpy array in place on the GPU.
PYBIND11_MODULE(gpu_library, m)
{
  m.def("multiply_with_scalar", map_array<double>);
}