-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathconvolution.cu
172 lines (143 loc) · 7.65 KB
/
convolution.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#include <cudnn.h>
#include <iostream>
#include "src/helper.h"
/*
 * cuDNN convolution smoke test.
 *
 * Builds descriptors for a 64x1x28x28 float NCHW input convolved with twenty
 * 5x5 filters (no padding, stride 1, dilation 1, cross-correlation mode),
 * selects forward / backward-filter / backward-data algorithms, allocates the
 * device buffers plus one workspace large enough for all three algorithms,
 * and then runs one forward pass (convolution + bias add) followed by one
 * backward pass (bias, filter and data gradients).
 *
 * The "[<line>]" messages on stdout are progress markers only.
 *
 * checkCudnnErrors / checkCudaErrors are provided by src/helper.h.
 * NOTE(review): they are assumed to report the failing status and stop the
 * program; confirm against the helper header.
 *
 * Returns 0 on success, 1 if cuDNN reports no usable algorithm.
 */
int main()
{
    cudnnHandle_t cudnn;
    cudnnTensorDescriptor_t input_desc;
    cudnnTensorDescriptor_t output_desc;
    cudnnFilterDescriptor_t filter_desc;
    cudnnConvolutionDescriptor_t conv_desc;
    cudnnTensorDescriptor_t bias_desc;

    cudnnConvolutionFwdAlgo_t falgo;
    cudnnConvolutionBwdFilterAlgo_t b_falgo;
    cudnnConvolutionBwdDataAlgo_t b_dalgo;

    float *d_input = nullptr;
    float *d_output = nullptr;
    float *d_filter = nullptr;
    float *d_bias = nullptr;

    // input size (N, C, H, W)
    int input_n = 64;
    int input_c = 1;
    int input_h = 28;
    int input_w = 28;

    // output size; n/h/w are placeholders that are overwritten by
    // cudnnGetConvolution2dForwardOutputDim below
    int output_n = input_n;
    int output_c = 20;
    int output_h = 1;
    int output_w = 1;

    // kernel size
    int filter_h = 5;
    int filter_w = 5;

    // alpha, beta scaling factors passed to the cuDNN calls
    float one = 1.f;
    float zero = 0.f;

    std::cout << "[" << __LINE__ << "]" << std::endl;

    checkCudnnErrors(cudnnCreate(&cudnn));

    std::cout << "[" << __LINE__ << "]" << std::endl;

    /* Create Resources */
    checkCudnnErrors(cudnnCreateTensorDescriptor(&input_desc));
    checkCudnnErrors(cudnnCreateTensorDescriptor(&output_desc));
    checkCudnnErrors(cudnnCreateFilterDescriptor(&filter_desc));
    checkCudnnErrors(cudnnCreateConvolutionDescriptor(&conv_desc));
    checkCudnnErrors(cudnnCreateTensorDescriptor(&bias_desc));

    std::cout << "[" << __LINE__ << "]" << std::endl;

    // Initialize resources
    checkCudnnErrors(cudnnSetTensor4dDescriptor(input_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                                                input_n, input_c, input_h, input_w));
    checkCudnnErrors(cudnnSetFilter4dDescriptor(filter_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
                                                output_c, input_c, filter_h, filter_w));
    checkCudnnErrors(cudnnSetConvolution2dDescriptor(conv_desc,
                                                     0, 0,  // padding (h, w)
                                                     1, 1,  // stride (h, w)
                                                     1, 1,  // dilation (h, w)
                                                     CUDNN_CROSS_CORRELATION,
                                                     CUDNN_DATA_FLOAT));
    checkCudnnErrors(cudnnGetConvolution2dForwardOutputDim(conv_desc, input_desc, filter_desc,
                                                           &output_n, &output_c, &output_h, &output_w));
    checkCudnnErrors(cudnnSetTensor4dDescriptor(output_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                                                output_n, output_c, output_h, output_w));
    checkCudnnErrors(cudnnSetTensor4dDescriptor(bias_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                                                1, output_c, 1, 1));

    int weight_size = output_c * input_c * filter_h * filter_w;
    int bias_size = output_c;

    std::cout << "input size: " << input_n << " " << input_c << " " << input_h << " " << input_w << std::endl;
    std::cout << "output size: " << output_n << " " << output_c << " " << output_h << " " << output_w << std::endl;

    std::cout << "[" << __LINE__ << "]" << std::endl;

    // convolution: pick algorithms and track the largest workspace (in bytes)
    size_t workspace_size = 0;
    size_t temp_size = 0;
    float *d_workspace = nullptr;

    // The legacy cudnnGetConvolution*Algorithm entry points were removed in
    // cuDNN 8, so every release from 8 onwards must use the *_v7 queries.
#if CUDNN_MAJOR >= 8
    // convolution (fwd)
    int algo_max_count = 0;
    int requested_count = 0;
    int returned_count = 0;  // the *_v7 queries require a non-null out-parameter
    cudnnConvolutionFwdAlgoPerf_t fwd_algo_perf_results[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
    checkCudnnErrors(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn, &algo_max_count));
    std::cout << ": Available Algorithm Count [FWD]: " << algo_max_count << std::endl;
    // never request more entries than the result array can hold
    requested_count = algo_max_count < static_cast<int>(CUDNN_CONVOLUTION_FWD_ALGO_COUNT)
                          ? algo_max_count
                          : static_cast<int>(CUDNN_CONVOLUTION_FWD_ALGO_COUNT);
    checkCudnnErrors(cudnnGetConvolutionForwardAlgorithm_v7(cudnn,
                                                            input_desc, filter_desc, conv_desc, output_desc,
                                                            requested_count, &returned_count, fwd_algo_perf_results));
    if (returned_count < 1)
    {
        std::cerr << "cuDNN returned no forward convolution algorithm" << std::endl;
        return 1;
    }
    falgo = fwd_algo_perf_results[0].algo;
#else
    checkCudnnErrors(cudnnGetConvolutionForwardAlgorithm(cudnn, input_desc, filter_desc, conv_desc, output_desc,
                                                         CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &falgo));
#endif
    checkCudnnErrors(cudnnGetConvolutionForwardWorkspaceSize(cudnn, input_desc, filter_desc, conv_desc, output_desc,
                                                             falgo, &temp_size));
    workspace_size = max(workspace_size, temp_size);

#if CUDNN_MAJOR >= 8
    // convolution (bwd - filter)
    cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_algo_perf_results[CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT];
    checkCudnnErrors(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnn, &algo_max_count));
    std::cout << ": Available Algorithm Count [BWD-filter]: " << algo_max_count << std::endl;
    requested_count = algo_max_count < static_cast<int>(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT)
                          ? algo_max_count
                          : static_cast<int>(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT);
    returned_count = 0;
    checkCudnnErrors(cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnn,
                                                                   input_desc, output_desc, conv_desc, filter_desc,
                                                                   requested_count, &returned_count,
                                                                   bwd_filter_algo_perf_results));
    if (returned_count < 1)
    {
        std::cerr << "cuDNN returned no backward-filter convolution algorithm" << std::endl;
        return 1;
    }
    b_falgo = bwd_filter_algo_perf_results[0].algo;
#else
    // convolution (bwd - filter)
    checkCudnnErrors(cudnnGetConvolutionBackwardFilterAlgorithm(cudnn, input_desc, output_desc, conv_desc, filter_desc,
                                                                CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &b_falgo));
#endif
    checkCudnnErrors(cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn, input_desc, output_desc, conv_desc,
                                                                    filter_desc, b_falgo, &temp_size));
    workspace_size = max(workspace_size, temp_size);

#if CUDNN_MAJOR >= 8
    // convolution (bwd - data)
    cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_perf_results[CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT];
    checkCudnnErrors(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnn, &algo_max_count));
    std::cout << ": Available Algorithm Count [BWD-data]: " << algo_max_count << std::endl;
    requested_count = algo_max_count < static_cast<int>(CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT)
                          ? algo_max_count
                          : static_cast<int>(CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT);
    returned_count = 0;
    checkCudnnErrors(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn,
                                                                 filter_desc, output_desc, conv_desc, input_desc,
                                                                 requested_count, &returned_count,
                                                                 bwd_data_algo_perf_results));
    if (returned_count < 1)
    {
        std::cerr << "cuDNN returned no backward-data convolution algorithm" << std::endl;
        return 1;
    }
    b_dalgo = bwd_data_algo_perf_results[0].algo;
#else
    // convolution (bwd - data)
    checkCudnnErrors(cudnnGetConvolutionBackwardDataAlgorithm(cudnn, filter_desc, output_desc, conv_desc, input_desc,
                                                              CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &b_dalgo));
#endif
    checkCudnnErrors(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn, filter_desc, output_desc, conv_desc,
                                                                  input_desc, b_dalgo, &temp_size));
    workspace_size = max(workspace_size, temp_size);

    std::cout << "workspace size: " << workspace_size << std::endl;

    std::cout << "[" << __LINE__ << "]" << std::endl;

    // allocate memory space
    size_t input_bytes = sizeof(float) * input_n * input_c * input_h * input_w;
    size_t filter_bytes = sizeof(float) * weight_size;
    size_t output_bytes = sizeof(float) * output_n * output_c * output_h * output_w;
    size_t bias_bytes = sizeof(float) * bias_size;

    checkCudaErrors(cudaMalloc((void**)&d_input, input_bytes));
    checkCudaErrors(cudaMalloc((void**)&d_filter, filter_bytes));
    checkCudaErrors(cudaMalloc((void**)&d_output, output_bytes));
    // workspace_size is already a byte count, so it is not scaled by sizeof(float)
    checkCudaErrors(cudaMalloc((void**)&d_workspace, workspace_size));
    checkCudaErrors(cudaMalloc((void**)&d_bias, bias_bytes));

    // cudaMalloc does not initialize memory; zero the buffers the forward
    // pass reads so the run is deterministic
    checkCudaErrors(cudaMemset(d_input, 0, input_bytes));
    checkCudaErrors(cudaMemset(d_filter, 0, filter_bytes));
    checkCudaErrors(cudaMemset(d_bias, 0, bias_bytes));

    std::cout << "[" << __LINE__ << "]" << std::endl;

    // Forward: output = conv(input, filter), then output += bias
    checkCudnnErrors(cudnnConvolutionForward(cudnn, &one, input_desc, d_input, filter_desc, d_filter, conv_desc,
                                             falgo, d_workspace, workspace_size, &zero, output_desc, d_output));
    checkCudnnErrors(cudnnAddTensor(cudnn, &one, bias_desc, d_bias, &one, output_desc, d_output));
    checkCudaErrors(cudaGetLastError());

    std::cout << "[" << __LINE__ << "]" << std::endl;

    // backward: d_output is reused as the incoming gradient, and the results
    // overwrite d_bias, d_filter and d_input in place
    checkCudnnErrors(cudnnConvolutionBackwardBias(cudnn, &one, output_desc, d_output, &zero, bias_desc, d_bias));
    checkCudnnErrors(cudnnConvolutionBackwardFilter(cudnn, &one, input_desc, d_input, output_desc, d_output, conv_desc,
                                                    b_falgo, d_workspace, workspace_size, &zero, filter_desc, d_filter));
    checkCudnnErrors(cudnnConvolutionBackwardData(cudnn, &one, filter_desc, d_filter, output_desc, d_output, conv_desc,
                                                  b_dalgo, d_workspace, workspace_size, &zero, input_desc, d_input));
    checkCudaErrors(cudaGetLastError());
    // wait for the queued GPU work so asynchronous failures surface before teardown
    checkCudaErrors(cudaDeviceSynchronize());

    std::cout << "[" << __LINE__ << "]" << std::endl;

    checkCudnnErrors(cudnnDestroyTensorDescriptor(input_desc));
    checkCudnnErrors(cudnnDestroyTensorDescriptor(output_desc));
    checkCudnnErrors(cudnnDestroyFilterDescriptor(filter_desc));
    checkCudnnErrors(cudnnDestroyConvolutionDescriptor(conv_desc));
    checkCudnnErrors(cudnnDestroyTensorDescriptor(bias_desc));

    std::cout << "[" << __LINE__ << "]" << std::endl;

    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_filter));
    checkCudaErrors(cudaFree(d_output));
    checkCudaErrors(cudaFree(d_workspace));
    checkCudaErrors(cudaFree(d_bias));
    checkCudnnErrors(cudnnDestroy(cudnn));

    std::cout << "[" << __LINE__ << "]" << std::endl;

    return 0;
}