# Custom Kernels and Performance

Tools for writing custom CUDA kernels and optimizing GPU performance. CuPy provides high-level interfaces for creating custom GPU operations when built-in functions are insufficient.

## Capabilities

### Element-wise Kernels

Create custom element-wise operations that apply to each array element.

```python { .api }
class ElementwiseKernel:
    """
    Custom element-wise CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, CUDA C++ code for operation
    - name: str, kernel name
    - preamble: str, code inserted before kernel
    - loop_prep: str, code before element loop
    - after_loop: str, code after element loop
    """
    def __init__(self, in_params, out_params, operation, name='kernel', preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """
        Execute kernel on input arrays.

        Parameters:
        - args: input and output arrays matching parameter specification
        - size: int, number of elements to process
        - stream: cupy.cuda.Stream, CUDA stream

        Returns:
        cupy.ndarray: Output array(s)
        """

class ReductionKernel:
    """
    Custom reduction CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, expression to map input to intermediate values
    - reduce_expr: str, expression to reduce intermediate values
    - post_map_expr: str, expression to post-process results
    - identity: str, identity value for reduction
    - name: str, kernel name
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr, identity, name='kernel'): ...

    def __call__(self, *args, **kwargs):
        """Execute reduction kernel."""

class RawKernel:
    """
    Raw CUDA kernel from source code.

    Parameters:
    - code: str, complete CUDA C++ kernel source
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, compilation backend
    """
    def __init__(self, code, name, options=(), backend='nvrtc'): ...

    def __call__(self, grid, block, args, *, shared_mem=0, stream=None):
        """
        Launch raw CUDA kernel.

        Parameters:
        - grid: tuple, grid dimensions
        - block: tuple, block dimensions
        - args: tuple, kernel arguments
        - shared_mem: int, shared memory size
        - stream: cupy.cuda.Stream, CUDA stream
        """

class RawModule:
    """
    Raw CUDA module containing multiple kernels.

    Parameters:
    - code: str, complete CUDA module source
    - options: tuple, compiler options
    - backend: str, compilation backend
    """
    def __init__(self, code, options=(), backend='nvrtc'): ...

    def get_function(self, name):
        """
        Get kernel function by name.

        Parameters:
        - name: str, function name

        Returns:
        RawKernel: Kernel function
        """
```

### Performance Utilities

Tools for optimizing GPU performance.

```python { .api }
def fuse(*args, **kwargs):
    """
    Decorator for kernel fusion optimization.

    Parameters:
    - args: positional arguments for fusion
    - kwargs: keyword arguments for fusion

    Returns:
    function: Fused function decorator
    """

def clear_memo():
    """Clear memoization cache."""

def memoize(for_each_device=False):
    """
    Memoization decorator for caching function results.

    Parameters:
    - for_each_device: bool, separate cache per device

    Returns:
    function: Memoization decorator
    """
```

## Usage Examples

### Element-wise Kernel

```python
import cupy as cp

# Custom element-wise operation
multiply_add = cp.ElementwiseKernel(
    'T x, T y, T z',  # Input parameters
    'T w',            # Output parameters
    'w = x * y + z',  # Operation
    'multiply_add'    # Kernel name
)

# Use the kernel
a = cp.random.random((1000, 1000))
b = cp.random.random((1000, 1000))
c = cp.random.random((1000, 1000))
result = multiply_add(a, b, c)
```

### Reduction Kernel

```python
# Custom reduction operation (sum of squares)
sum_of_squares = cp.ReductionKernel(
    'T x',            # Input parameter
    'T y',            # Output parameter
    'x * x',          # Map expression
    'a + b',          # Reduce expression
    'y = a',          # Post-map expression
    '0',              # Identity value
    'sum_of_squares'  # Kernel name
)

# Use the reduction kernel
data = cp.random.random((10000,))
result = sum_of_squares(data, axis=None)
```

### Raw CUDA Kernel

```python
# Raw CUDA kernel for advanced operations
cuda_code = '''
extern "C" __global__
void vector_add(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

kernel = cp.RawKernel(cuda_code, 'vector_add')

# Use raw kernel
n = 1000000
a_gpu = cp.random.random((n,), dtype=cp.float32)
b_gpu = cp.random.random((n,), dtype=cp.float32)
c_gpu = cp.zeros((n,), dtype=cp.float32)

threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

kernel((blocks_per_grid,), (threads_per_block,),
       (a_gpu, b_gpu, c_gpu, n))
```