forked from hannes-brt/cudnn-python-wrappers
-
Notifications
You must be signed in to change notification settings - Fork 2
/
example.py
executable file
·109 lines (86 loc) · 3.3 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
import libcudnn, ctypes
import numpy as np
# Create the cuDNN context used by the convolution calls further down
cudnn_context = libcudnn.cudnnCreate()

# Look up the enum values that configure the tensors and the convolution
tensor_format = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
data_type = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
convolution_fwd_pref = libcudnn.cudnnConvolutionFwdPreference[
    'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST']

# Pair of CUDA events that bracket the timed region
start = drv.Event()
end = drv.Event()
def start_bench():
    """Open a timed region by recording the module-level ``start`` event."""
    start.record()
def end_bench(op):
    """Close the timed region opened by start_bench() and print its duration.

    Records the module-level ``end`` event, waits for it, and prints the
    time elapsed since ``start`` as reported by ``end.time_since(start)``.

    Args:
        op: Label of the operation that was timed (e.g. "fprop"). It is
            prefixed to the printed timing so each measurement can be
            told apart; previously the label was accepted but dropped.
    """
    end.record()
    # Wait for the event before querying the elapsed time.
    end.synchronize()
    msecs = end.time_since(start)
    print("%s: %7.3f msecs" % (op, msecs))
# Problem size: batch of n_input images, filters_in input channels,
# filters_out output feature maps
n_input = 64
filters_in = 128
filters_out = 128
height_in = 112
width_in = 112
height_filter = 7
width_filter = 7
pad_h = 3
pad_w = 3
vertical_stride = 1
horizontal_stride = 1
upscalex = 1
upscaley = 1

# cuDNN blends the convolution result into the output as
#     Y = alpha * conv(X, filters) + beta * Y
# The output buffer is allocated uninitialised (gpuarray.empty), so beta
# must be 0 to keep its prior garbage contents out of the result.
alpha = 1.0
beta = 0.0
# Input tensor: random float32 values, shaped (n, c, h, w), copied to the GPU
X = gpuarray.to_gpu(
    np.random.rand(n_input, filters_in, height_in, width_in).astype(np.float32))

# Filter tensor: random float32 values, shaped (out, in, h, w)
filters = gpuarray.to_gpu(
    np.random.rand(filters_out, filters_in, height_filter, width_filter)
    .astype(np.float32))

# Describe the input tensor to cuDNN
X_desc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensor4dDescriptor(
    X_desc, tensor_format, data_type,
    n_input, filters_in, height_in, width_in)

# Describe the filter bank to cuDNN
filters_desc = libcudnn.cudnnCreateFilterDescriptor()
libcudnn.cudnnSetFilter4dDescriptor(
    filters_desc, data_type,
    filters_out, filters_in, height_filter, width_filter)

# Describe the convolution itself (padding, strides, upscale, mode)
conv_desc = libcudnn.cudnnCreateConvolutionDescriptor()
libcudnn.cudnnSetConvolution2dDescriptor(
    conv_desc,
    pad_h, pad_w,
    vertical_stride, horizontal_stride,
    upscalex, upscaley,
    convolution_mode)

# Ask cuDNN for the output shape; the first two values are n_input and
# filters_out, which are already known, so only height and width are kept
_, _, height_output, width_output = \
    libcudnn.cudnnGetConvolution2dForwardOutputDim(conv_desc, X_desc, filters_desc)

# Output tensor and its descriptor
Y = gpuarray.empty(
    (n_input, filters_out, height_output, width_output), np.float32)
Y_desc = libcudnn.cudnnCreateTensorDescriptor()
libcudnn.cudnnSetTensor4dDescriptor(
    Y_desc, tensor_format, data_type,
    n_input, filters_out, height_output, width_output)

# Wrap the device addresses as void pointers for the ctypes-based wrapper
X_data, filters_data, Y_data = (
    ctypes.c_void_p(int(gpu_array.gpudata)) for gpu_array in (X, filters, Y))

# Let cuDNN choose the forward algorithm for this configuration
algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
    cudnn_context, X_desc, filters_desc, conv_desc, Y_desc,
    convolution_fwd_pref, 0)
print("Cudnn algorithm = %d" % algo.value)

# Allocate the scratch workspace the chosen algorithm asks for; a size of
# zero means no workspace is needed and a null pointer is passed instead
ws_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
    cudnn_context, X_desc, filters_desc, conv_desc, Y_desc, algo)
ws_bytes = ws_size.value
if ws_bytes > 0:
    ws_ptr = drv.mem_alloc(ws_bytes)
else:
    ws_ptr = 0
ws_data = ctypes.c_void_p(int(ws_ptr))

# Timed forward convolution
start_bench()
libcudnn.cudnnConvolutionForward(
    cudnn_context, alpha,
    X_desc, X_data,
    filters_desc, filters_data,
    conv_desc, algo,
    ws_data, ws_bytes,
    beta,
    Y_desc, Y_data)
end_bench("fprop")

# Drop the reference to the workspace allocation
ws_ptr = None
# Clean up: destroy every descriptor, then the cuDNN context itself
for tensor_desc in (X_desc, Y_desc):
    libcudnn.cudnnDestroyTensorDescriptor(tensor_desc)
libcudnn.cudnnDestroyFilterDescriptor(filters_desc)
libcudnn.cudnnDestroyConvolutionDescriptor(conv_desc)
libcudnn.cudnnDestroy(cudnn_context)