# Custom Kernels

Create custom GPU kernels for specialized operations not covered by standard array functions. CuPy supports element-wise kernels, reduction kernels, and raw CUDA kernels, all compiled just in time on first call.

## Capabilities

### Element-wise Kernels

Create custom element-wise operations that apply a function to each element of the input arrays.

```python { .api }
class ElementwiseKernel:
    """
    Create a custom element-wise kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, C++ code applied to each element
    - name: str, kernel name
    - reduce_dims: bool, whether to collapse array dimensions for faster indexing when possible
    - options: tuple, compiler options
    - preamble: str, code inserted before the kernel definition
    - loop_prep: str, code inserted before the element loop
    - after_loop: str, code inserted after the element loop
    """
    def __init__(self, in_params, out_params, operation, name='kernel',
                 reduce_dims=True, options=(), preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """
        Execute the kernel with the given arguments.

        Parameters:
        - args: input (and optionally output) arrays matching in_params/out_params
        - size: int, output size override
        - stream: Stream, execution stream

        Returns:
        cupy.ndarray or tuple: Output array(s) matching out_params
        """
```

### Reduction Kernels

Create kernels that reduce arrays along specified axes.

```python { .api }
class ReductionKernel:
    """
    Create a custom reduction kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, expression mapping each input element to an intermediate value
    - reduce_expr: str, expression combining two intermediate values (named a and b)
    - post_map_expr: str, expression mapping the reduced value to the output
    - identity: str, identity value for the reduction
    - name: str, kernel name
    - reduce_type: str, intermediate data type
    - reduce_dims: bool, whether to collapse array dimensions for faster indexing when possible
    - preamble: str, code inserted before the kernel definition
    - options: tuple, compiler options
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr, identity, name='reduce_kernel', reduce_type=None,
                 reduce_dims=True, preamble='', options=()): ...

    def __call__(self, *args, **kwargs):
        """
        Execute the reduction kernel.

        Parameters:
        - args: input arrays
        - axis: int or tuple, axes to reduce over
        - keepdims: bool, keep reduced dimensions with size one
        - stream: Stream, execution stream

        Returns:
        cupy.ndarray: Reduced result
        """
```

### Raw Kernels

Create raw CUDA kernels with full control over GPU execution.

```python { .api }
class RawKernel:
    """
    Create a raw CUDA kernel from source code.

    Parameters:
    - code: str, CUDA C++ source code
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, compiler backend ('nvrtc' or 'nvcc')
    - translate_cucomplex: bool, translate cuComplex types to their CUDA C++ counterparts
    """
    def __init__(self, code, name, options=(), backend='nvrtc', translate_cucomplex=False): ...

    def __call__(self, grid, block, args, **kwargs):
        """
        Launch the raw kernel.

        Parameters:
        - grid: tuple, grid dimensions (number of blocks)
        - block: tuple, block dimensions (threads per block)
        - args: tuple, kernel arguments
        - stream: Stream, execution stream
        - shared_mem: int, dynamic shared memory size in bytes
        """

class RawModule:
    """
    Create a CUDA module from source code or a source file.

    Parameters:
    - code: str, CUDA C++ source code
    - path: str, path to a source file
    - options: tuple, compiler options
    - backend: str, compiler backend ('nvrtc' or 'nvcc')
    - translate_cucomplex: bool, translate cuComplex types to their CUDA C++ counterparts
    """
    def __init__(self, code=None, path=None, options=(), backend='nvrtc', translate_cucomplex=False): ...

    def get_function(self, name):
        """Get a kernel function from the module by name."""
```
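
`RawModule` is convenient when several kernels share one compilation unit: compile once, then fetch each kernel by name. A minimal sketch (the `scale` kernel below is illustrative, not part of the API):

```python
import cupy as cp

# Compile one module that may hold several kernels, then look one up by name.
module = cp.RawModule(code=r'''
extern "C" __global__
void scale(float* x, float factor, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= factor;
}
''')

scale = module.get_function('scale')

n = 1 << 20
x = cp.ones(n, dtype=cp.float32)
# Pass scalars with explicit widths matching the kernel signature
scale(((n + 255) // 256,), (256,), (x, cp.float32(2.0), cp.int32(n)))
```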

### Kernel Fusion

Optimize performance by fusing multiple operations into a single kernel.

```python { .api }
def fuse(*args, **kwargs):
    """
    Decorator that fuses the array operations in a function into one kernel.

    Usage:
        @cupy.fuse()
        def fused_operation(x, y):
            return cupy.sin(x) + cupy.cos(y)

    Parameters:
    - kernel_name: str, name for the fused kernel

    Returns:
    function: Fused kernel function
    """
```

## Usage Examples

### Element-wise Kernel

```python
import cupy as cp

# Create custom element-wise operation
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # Input parameters
    'float32 z',             # Output parameters
    'z = x + y * 2',         # Operation
    'custom_add'             # Kernel name
)

# Use the kernel
a = cp.random.rand(1000, 1000).astype(cp.float32)
b = cp.random.rand(1000, 1000).astype(cp.float32)
result = add_kernel(a, b)

# More complex element-wise kernel
complex_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    '''
    float temp = sin(x) * cos(y);
    z = temp * temp + sqrt(x * y);
    ''',
    'complex_math'
)

result2 = complex_kernel(a, b)
```
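
The examples above fix the dtype to `float32`. Type placeholders let a single definition serve several dtypes; a short sketch using the `T` placeholder (`squared_diff` is an illustrative name):

```python
import cupy as cp

# A type-generic kernel: `T` is resolved from the argument dtypes at call
# time, so the same definition works for float32, float64, and so on.
squared_diff = cp.ElementwiseKernel(
    'T x, T y',
    'T z',
    'z = (x - y) * (x - y)',
    'squared_diff'
)

a = cp.arange(10, dtype=cp.float32)
b = cp.arange(10, dtype=cp.float32)[::-1]
print(squared_diff(a, b))  # works unchanged for float64 inputs too
```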

### Reduction Kernel

```python
import cupy as cp

# Create custom reduction (sum of squares)
sum_of_squares = cp.ReductionKernel(
    'float32 x',        # Input
    'float32 out',      # Output
    'x * x',            # Map: square each element
    'a + b',            # Reduce: sum the squares
    'out = a',          # Post-map: write the reduced value to the output
    '0',                # Identity: 0 for addition
    'sum_of_squares'    # Name
)

# Use reduction kernel
data = cp.random.rand(1000000).astype(cp.float32)
result = sum_of_squares(data)

# Reduction over two input arrays
norm_kernel = cp.ReductionKernel(
    'float32 x, float32 y',
    'float32 out',
    'x * x + y * y',    # Map: squared magnitude
    'a + b',            # Reduce: sum
    'out = a',          # Post-map
    '0',                # Identity
    'vector_norm_squared'
)

x = cp.random.rand(1000).astype(cp.float32)
y = cp.random.rand(1000).astype(cp.float32)
norm_squared = norm_kernel(x, y)
```
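
Reduction kernels also accept an `axis` keyword for partial reductions over multi-dimensional input; a brief sketch reusing `sum_of_squares` from above:

```python
# Partial reduction: reduce a 2-D array along one axis at a time
matrix = cp.random.rand(100, 50).astype(cp.float32)
row_result = sum_of_squares(matrix, axis=1)  # shape (100,)
col_result = sum_of_squares(matrix, axis=0)  # shape (50,)
```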

### Raw CUDA Kernel

```python
import cupy as cp

# Raw CUDA kernel source
cuda_source = '''
extern "C" __global__
void matrix_add(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

# Create raw kernel
matrix_add_kernel = cp.RawKernel(cuda_source, 'matrix_add')

# Prepare data
n = 1000000
a = cp.random.rand(n).astype(cp.float32)
b = cp.random.rand(n).astype(cp.float32)
c = cp.zeros(n, dtype=cp.float32)

# Launch kernel
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

matrix_add_kernel(
    (blocks_per_grid,),      # Grid size
    (threads_per_block,),    # Block size
    (a, b, c, cp.int32(n))   # Arguments; give scalars an explicit width
                             # (a plain Python int is passed as 64-bit)
)
```
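
A quick sanity check against the equivalent array expression confirms the launch geometry covers every element:

```python
# The raw kernel should agree with the built-in element-wise add
assert cp.allclose(c, a + b)
```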

### Advanced Raw Kernel with Shared Memory

```python
import cupy as cp

# Advanced CUDA kernel using dynamic shared memory
advanced_source = '''
extern "C" __global__
void block_reduce_sum(float* input, float* output, int n) {
    extern __shared__ float sdata[];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Load data into shared memory
    sdata[tid] = (idx < n) ? input[idx] : 0.0f;
    __syncthreads();

    // Parallel reduction in shared memory
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // Write this block's partial sum
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}
'''

# Create and use advanced kernel
reduce_kernel = cp.RawKernel(advanced_source, 'block_reduce_sum')

# Setup
n = 1024 * 1024
data = cp.random.rand(n).astype(cp.float32)

threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
output = cp.zeros(blocks_per_grid, dtype=cp.float32)

# Launch with dynamic shared memory: one float per thread
shared_mem_size = threads_per_block * data.dtype.itemsize
reduce_kernel(
    (blocks_per_grid,),
    (threads_per_block,),
    (data, output, cp.int32(n)),  # pass the scalar with an explicit 32-bit width
    shared_mem=shared_mem_size
)

# Sum the partial block results
total_sum = cp.sum(output)
```
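
As a sanity check, the two-stage result should agree with `cp.sum` up to float32 rounding:

```python
# Compare against CuPy's built-in sum (allow for float32 rounding differences)
assert cp.allclose(total_sum, cp.sum(data), rtol=1e-4)
```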

### Kernel Fusion

```python
import cupy as cp

# Define fused operation
@cp.fuse()
def fused_math(x, y, z):
    """Fuse multiple element-wise operations into a single kernel."""
    temp1 = cp.sin(x) + cp.cos(y)
    temp2 = cp.exp(z) * temp1
    return cp.sqrt(temp2 + 1.0)

# Use fused kernel
x = cp.random.rand(1000, 1000)
y = cp.random.rand(1000, 1000)
z = cp.random.rand(1000, 1000)

# This executes as a single fused kernel
result = fused_math(x, y, z)

# Compare with the unfused version: several kernel launches,
# with intermediate arrays written to global memory
def unfused_math(x, y, z):
    temp1 = cp.sin(x) + cp.cos(y)
    temp2 = cp.exp(z) * temp1
    return cp.sqrt(temp2 + 1.0)

# The fused version is typically faster because intermediate
# results never round-trip through global memory
```
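
To measure the benefit on your own hardware, `cupyx.profiler.benchmark` times GPU code with proper device synchronization; a sketch (actual numbers depend on array size and GPU):

```python
from cupyx.profiler import benchmark

# Reports CPU and GPU times with correct synchronization
print(benchmark(fused_math, (x, y, z), n_repeat=100))
print(benchmark(unfused_math, (x, y, z), n_repeat=100))
```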

## Performance Tips

### Kernel Optimization

```python
import cupy as cp

# Use appropriate data types
float32_kernel = cp.ElementwiseKernel(
    'float32 x',  # float32 is usually much faster than float64 on consumer GPUs
    'float32 y',
    'y = sin(x) * cos(x)',
    'trig_kernel'
)

# Minimize memory transfers
def efficient_processing(data):
    """Keep data on the GPU throughout processing."""
    # Bad: multiple CPU-GPU transfers
    # cpu_data = cp.asnumpy(data)
    # processed = process_on_cpu(cpu_data)
    # gpu_result = cp.array(processed)

    # Good: keep data on the GPU
    # (custom_gpu_kernel is a placeholder for your own kernel)
    gpu_result = custom_gpu_kernel(data)
    return gpu_result

# Use shared memory for data reuse
shared_mem_kernel = cp.RawKernel('''
extern "C" __global__ void optimized_kernel(float* data, int n) {
    __shared__ float cache[256];  // Statically sized shared memory
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;

    // Cooperative loading into shared memory
    if (idx < n) cache[tid] = data[idx];
    __syncthreads();

    // Process using shared memory
    if (idx < n) {
        data[idx] = cache[tid] * 2.0f;  // Example operation
    }
}
''', 'optimized_kernel')
```