# Custom Kernels

Framework for creating custom GPU kernels in CuPy, enabling high-performance operations tailored to specific needs. Provides ElementwiseKernel for element-wise operations, ReductionKernel for reductions, and RawKernel/RawModule for arbitrary CUDA code.

## Capabilities

### ElementwiseKernel

```python { .api }
class ElementwiseKernel:
    """
    User-defined elementwise kernel.

    Parameters:
    - in_params: string of input parameters
    - out_params: string of output parameters
    - operation: string of element-wise operation
    - name: kernel name
    - reduce_dims: whether to reduce dimensions
    - options: compiler options
    - preamble: code to insert at beginning
    - loop_prep: code to insert before loop
    - after_loop: code to insert after loop
    """
    def __init__(self, in_params, out_params, operation, name='kernel',
                 reduce_dims=True, options=(), preamble='', loop_prep='',
                 after_loop='', **kwargs): ...

    def __call__(self, *args, **kwargs): ...
```
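
The parameter strings also accept the generic type placeholder `T`, and the output argument can be omitted, in which case a new array is allocated and returned. A minimal sketch, closely following the `squared_diff` example from the CuPy documentation:

```python
import cupy as cp

# 'T' is a type placeholder resolved from the argument dtypes at call time
squared_diff = cp.ElementwiseKernel(
    'T x, T y',
    'T z',
    'z = (x - y) * (x - y)',
    'squared_diff'
)

x = cp.arange(10, dtype=cp.float32)
y = cp.full(10, 3, dtype=cp.float32)

# No output array given: one is allocated and returned
z = squared_diff(x, y)
print(z)
```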

### ReductionKernel

```python { .api }
class ReductionKernel:
    """
    User-defined reduction kernel.

    Parameters:
    - in_params: string of input parameters
    - out_params: string of output parameters
    - map_expr: expression applied to each input element before reduction
    - reduce_expr: binary expression combining two intermediate values a and b
    - post_map_expr: expression applied to the reduced value a to produce the output
    - identity: identity value of the reduction, given as a string (e.g. '0')
    - name: kernel name
    - reduce_type: type used for the reduction variable
    - reduce_dims: whether to reduce dimensions
    - preamble: code to insert at beginning
    - options: compiler options
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr, identity, name='kernel',
                 reduce_type=None, reduce_dims=True, preamble='',
                 options=(), **kwargs): ...

    def __call__(self, *args, **kwargs): ...
```
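
The four expressions work together: `map_expr` is applied to each element, `reduce_expr` folds pairs of intermediate values `a` and `b`, and `post_map_expr` turns the final value `a` into the output. A sketch based on the `l2norm` example from the CuPy documentation:

```python
import cupy as cp

# Row-wise L2 norm: square, sum, then take the square root
l2norm_kernel = cp.ReductionKernel(
    'T x',          # in_params
    'T y',          # out_params
    'x * x',        # map_expr
    'a + b',        # reduce_expr
    'y = sqrt(a)',  # post_map_expr
    '0',            # identity
    'l2norm'        # kernel name
)

x = cp.arange(10, dtype=cp.float32).reshape(2, 5)
print(l2norm_kernel(x, axis=1))  # one norm per row
```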

### RawKernel

```python { .api }
class RawKernel:
    """
    User-defined raw CUDA kernel.

    Parameters:
    - code: CUDA kernel code
    - name: kernel function name
    - options: compiler options
    - backend: compiler backend ('nvrtc' or 'nvcc')
    - translate_cucomplex: translate cuComplex types
    - enable_cooperative_groups: enable cooperative groups
    - jitify: use Jitify for compilation
    """
    def __init__(self, code, name, options=(), backend='nvrtc',
                 translate_cucomplex=False, enable_cooperative_groups=False,
                 jitify=False, **kwargs): ...

    def __call__(self, grid, block, args, *, shared_mem=0): ...
```
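
A minimal example of the typical RawKernel workflow, adapted from the `my_add` kernel in the CuPy documentation; grid and block sizes, bounds checking, and argument types are entirely the caller's responsibility:

```python
import cupy as cp

add_code = r'''
extern "C" __global__
void my_add(const float* x1, const float* x2, float* y) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    y[tid] = x1[tid] + x2[tid];
}
'''
my_add = cp.RawKernel(add_code, 'my_add')

x1 = cp.arange(25, dtype=cp.float32).reshape(5, 5)
x2 = cp.arange(25, dtype=cp.float32).reshape(5, 5)
y = cp.zeros((5, 5), dtype=cp.float32)

my_add((5,), (5,), (x1, x2, y))  # 5 blocks of 5 threads each
print(y)
```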

### RawModule

```python { .api }
class RawModule:
    """
    User-defined raw CUDA module.

    Parameters:
    - code: CUDA module code
    - options: compiler options
    - backend: compiler backend ('nvrtc' or 'nvcc')
    - translate_cucomplex: translate cuComplex types
    - enable_cooperative_groups: enable cooperative groups
    - jitify: use Jitify for compilation
    """
    def __init__(self, code, options=(), backend='nvrtc',
                 translate_cucomplex=False, enable_cooperative_groups=False,
                 jitify=False, **kwargs): ...

    def get_function(self, name): ...
```
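
RawModule also supports C++ templated kernels via `name_expressions` (an NVRTC feature in recent CuPy versions): list the instantiations when building the module, then request them with the same strings from `get_function`. A sketch assuming that feature is available:

```python
import cupy as cp

template_code = r'''
template<typename T>
__global__ void multiply_by_two(T* x, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        x[tid] *= T(2);
    }
}
'''

mod = cp.RawModule(code=template_code,
                   options=('--std=c++11',),
                   name_expressions=['multiply_by_two<float>',
                                     'multiply_by_two<double>'])

kern = mod.get_function('multiply_by_two<float>')
x = cp.arange(8, dtype=cp.float32)
kern((1,), (8,), (x, cp.int32(x.size)))
print(x)
```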

### Kernel Creation Functions

```python { .api }
def create_ufunc(name, ops, routine=None, preamble='', doc=None,
                 default_casting=None, loop_prep='', out_ops=None):
    """
    Create a universal function from per-dtype operations.

    Parameters:
    - name: function name
    - ops: operation specifications (type signatures, optionally paired with a routine)
    - routine: default C routine used by ops entries that do not supply their own
    - preamble: preamble code
    - doc: documentation string
    - default_casting: default casting rule
    - loop_prep: loop preparation code
    - out_ops: output operations

    Returns:
    ufunc: created universal function
    """

def create_reduction_func(name, ops, routine=None, identity=None,
                          preamble='', **kwargs):
    """
    Create a reduction function from per-dtype operations.

    Parameters:
    - name: function name
    - ops: operation specifications
    - routine: default routine used by ops entries that do not supply their own
    - identity: identity value
    - preamble: preamble code

    Returns:
    function: created reduction function
    """
```
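
Note that these factories live in `cupy._core` rather than the top-level `cupy` namespace; they are the machinery CuPy uses to build its own ufuncs and are not a stable public API. The hypothetical sketch below assumes that internal import path and the ops format used throughout CuPy's source (NumPy-style type signatures such as `'ff->f'`, with `in0`/`in1`/`out0` as the generated variable names); treat it as illustrative only:

```python
import cupy as cp
from cupy._core import create_ufunc  # internal API, may change between versions

# Hypothetical ufunc with float32 and float64 variants sharing one routine
my_scaled_add = create_ufunc(
    'my_scaled_add',
    ('ff->f', 'dd->d'),
    'out0 = in0 + 2 * in1',
)

a = cp.arange(4, dtype=cp.float32)
b = cp.ones(4, dtype=cp.float32)
print(my_scaled_add(a, b))
```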

### Utilities

```python { .api }
def clear_memo():
    """Clear memoization cache."""

def memoize(for_each_device=False):
    """
    Memoization decorator.

    Parameters:
    - for_each_device: memoize for each device separately

    Returns:
    decorator: memoization decorator
    """
```
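
`cupy.memoize` is commonly used to cache kernel construction so that, for a given set of arguments (and, with `for_each_device=True`, a given device), the kernel object is built only once. A small sketch using a hypothetical helper function:

```python
import cupy as cp

@cp.memoize(for_each_device=True)
def get_scale_kernel(dtype_name):
    # Constructed once per (dtype_name, device); later calls return the cached kernel
    return cp.ElementwiseKernel(
        f'{dtype_name} x, {dtype_name} s',
        f'{dtype_name} y',
        'y = s * x',
        f'scale_{dtype_name}'
    )

kern = get_scale_kernel('float32')
x = cp.arange(5, dtype=cp.float32)
y = cp.empty_like(x)
kern(x, cp.float32(2.0), y)
print(y)
```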

## Usage Examples

### ElementwiseKernel Examples

```python
import cupy as cp

# Simple element-wise operation
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # input parameters
    'float32 z',             # output parameters
    'z = x + y',             # operation
    'add_kernel'             # kernel name
)

# Use the kernel
a = cp.random.random(1000).astype(cp.float32)
b = cp.random.random(1000).astype(cp.float32)
c = cp.empty(1000, dtype=cp.float32)

add_kernel(a, b, c)

# Verify result
expected = a + b
error = cp.linalg.norm(c - expected)
print(f"Add kernel error: {error}")
```

### Complex ElementwiseKernel

```python
import cupy as cp

# More complex element-wise operation with conditionals
clamp_kernel = cp.ElementwiseKernel(
    'float32 x, float32 min_val, float32 max_val',
    'float32 y',
    '''
    if (x < min_val) {
        y = min_val;
    } else if (x > max_val) {
        y = max_val;
    } else {
        y = x;
    }
    ''',
    'clamp_kernel'
)

# Test the kernel
data = cp.random.uniform(-2, 2, 1000).astype(cp.float32)
result = cp.empty_like(data)

clamp_kernel(data, -1.0, 1.0, result)

# Verify clamping
print(f"Min value: {cp.min(result)}")  # Should be >= -1.0
print(f"Max value: {cp.max(result)}")  # Should be <= 1.0
```

### ElementwiseKernel with Preamble

```python
import cupy as cp

# Kernel with helper functions in preamble
sigmoid_kernel = cp.ElementwiseKernel(
    'float32 x',
    'float32 y',
    'y = sigmoid(x)',
    'sigmoid_kernel',
    preamble='''
    __device__ float sigmoid(float x) {
        return 1.0f / (1.0f + expf(-x));
    }
    '''
)

# Use the kernel
x = cp.linspace(-5, 5, 1000).astype(cp.float32)
y = cp.empty_like(x)

sigmoid_kernel(x, y)

# Compare with CuPy implementation
expected = 1 / (1 + cp.exp(-x))
error = cp.linalg.norm(y - expected)
print(f"Sigmoid kernel error: {error}")
```

### ReductionKernel Examples

```python
import cupy as cp

# Custom sum reduction
sum_kernel = cp.ReductionKernel(
    'float32 x',   # input
    'float32 y',   # output
    'x',           # map expression (identity)
    'a + b',       # reduce expression
    'y = a',       # post-map expression
    '0',           # identity value
    'sum_kernel'   # kernel name
)

# Test the kernel
data = cp.random.random(10000).astype(cp.float32)
result = sum_kernel(data)

# Compare with built-in sum
expected = cp.sum(data)
error = abs(result - expected)
print(f"Sum kernel error: {error}")
```

### Complex ReductionKernel

```python
import cupy as cp

# Root mean square reduction
rms_kernel = cp.ReductionKernel(
    'float32 x',
    'float32 y',
    'x * x',                         # map: square each element
    'a + b',                         # reduce: sum squares
    'y = sqrt(a / _in_ind.size())',  # post: sqrt of mean
    '0',                             # identity
    'rms_kernel'
)

# Test the kernel
data = cp.random.random(1000).astype(cp.float32)
rms_result = rms_kernel(data)

# Compare with manual calculation
manual_rms = cp.sqrt(cp.mean(data * data))
error = abs(rms_result - manual_rms)
print(f"RMS kernel error: {error}")
```

### RawKernel Examples

```python
import cupy as cp

# Matrix multiplication kernel
matmul_code = '''
extern "C" __global__
void matmul(float* A, float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
'''

matmul_kernel = cp.RawKernel(matmul_code, 'matmul')

# Test the kernel
M, N, K = 512, 512, 512
A = cp.random.random((M, K)).astype(cp.float32)
B = cp.random.random((K, N)).astype(cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

# Launch kernel
block = (16, 16)
grid = ((N + block[0] - 1) // block[0], (M + block[1] - 1) // block[1])

# RawKernel does not convert argument types, so pass scalars that match the
# declared C types (32-bit ints here)
matmul_kernel(grid, block, (A, B, C, cp.int32(M), cp.int32(N), cp.int32(K)))

# Compare with CuPy matmul
expected = cp.dot(A, B)
error = cp.linalg.norm(C - expected) / cp.linalg.norm(expected)
print(f"Matmul kernel relative error: {error}")
```

### RawKernel with Shared Memory

```python
import cupy as cp

# Convolution kernel with shared memory
conv_code = '''
extern "C" __global__
void conv2d(float* input, float* kernel, float* output,
            int height, int width, int kernel_size) {

    extern __shared__ float shared_mem[];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int bdx = blockDim.x;
    int bdy = blockDim.y;

    int row = by * bdy + ty;
    int col = bx * bdx + tx;

    int shared_width = bdx + kernel_size - 1;
    int shared_height = bdy + kernel_size - 1;

    // Load data into shared memory (simplified: only the block's own tile is
    // loaded, so the halo cells read below stay uninitialized -- illustration only)
    if (row < height && col < width) {
        shared_mem[ty * shared_width + tx] = input[row * width + col];
    }

    __syncthreads();

    // Perform convolution
    if (row < height && col < width) {
        float sum = 0.0f;

        for (int ky = 0; ky < kernel_size; ky++) {
            for (int kx = 0; kx < kernel_size; kx++) {
                int y_idx = ty + ky;
                int x_idx = tx + kx;

                if (y_idx < shared_height && x_idx < shared_width) {
                    sum += shared_mem[y_idx * shared_width + x_idx] *
                           kernel[ky * kernel_size + kx];
                }
            }
        }

        output[row * width + col] = sum;
    }
}
'''

conv_kernel = cp.RawKernel(conv_code, 'conv2d')

# Test the kernel (simplified example)
height, width = 256, 256
kernel_size = 3

input_data = cp.random.random((height, width)).astype(cp.float32)
kernel_data = cp.random.random((kernel_size, kernel_size)).astype(cp.float32)
output_data = cp.zeros((height, width), dtype=cp.float32)

# Calculate shared memory size
block = (16, 16)
shared_size = (block[0] + kernel_size - 1) * (block[1] + kernel_size - 1) * 4  # 4 bytes per float

grid = ((width + block[0] - 1) // block[0], (height + block[1] - 1) // block[1])

# Scalars are passed as 32-bit ints to match the kernel's int parameters
conv_kernel(grid, block,
            (input_data, kernel_data, output_data,
             cp.int32(height), cp.int32(width), cp.int32(kernel_size)),
            shared_mem=shared_size)
```
419
420
### RawModule for Multiple Kernels
421
422
```python
423
import cupy as cp
424
425
# Module with multiple related kernels
426
vector_ops_code = '''
427
extern "C" __global__
428
void vector_add(float* a, float* b, float* c, int n) {
429
int idx = blockDim.x * blockIdx.x + threadIdx.x;
430
if (idx < n) {
431
c[idx] = a[idx] + b[idx];
432
}
433
}
434
435
extern "C" __global__
436
void vector_mult(float* a, float* b, float* c, int n) {
437
int idx = blockDim.x * blockIdx.x + threadIdx.x;
438
if (idx < n) {
439
c[idx] = a[idx] * b[idx];
440
}
441
}
442
443
extern "C" __global__
444
void vector_norm(float* a, float* result, int n) {
445
extern __shared__ float sdata[];
446
447
int tid = threadIdx.x;
448
int idx = blockDim.x * blockIdx.x + threadIdx.x;
449
450
// Load and square
451
sdata[tid] = (idx < n) ? a[idx] * a[idx] : 0.0f;
452
__syncthreads();
453
454
// Reduction in shared memory
455
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
456
if (tid < s) {
457
sdata[tid] += sdata[tid + s];
458
}
459
__syncthreads();
460
}
461
462
// Write result
463
if (tid == 0) {
464
atomicAdd(result, sdata[0]);
465
}
466
}
467
'''
468
469
vector_module = cp.RawModule(code=vector_ops_code)
470
471
# Get individual kernels
472
add_kernel = vector_module.get_function('vector_add')
473
mult_kernel = vector_module.get_function('vector_mult')
474
norm_kernel = vector_module.get_function('vector_norm')
475
476
# Test the kernels
477
n = 1000000
478
a = cp.random.random(n).astype(cp.float32)
479
b = cp.random.random(n).astype(cp.float32)
480
c = cp.zeros(n, dtype=cp.float32)
481
482
block_size = 256
483
grid_size = (n + block_size - 1) // block_size
484
485
# Vector addition
486
add_kernel((grid_size,), (block_size,), (a, b, c, n))
487
add_expected = a + b
488
add_error = cp.linalg.norm(c - add_expected)
489
print(f"Add error: {add_error}")
490
491
# Vector multiplication
492
mult_kernel((grid_size,), (block_size,), (a, b, c, n))
493
mult_expected = a * b
494
mult_error = cp.linalg.norm(c - mult_expected)
495
print(f"Mult error: {mult_error}")
496
497
# Vector norm
498
norm_result = cp.zeros(1, dtype=cp.float32)
499
shared_mem_size = block_size * 4 # 4 bytes per float
500
501
norm_kernel((grid_size,), (block_size,), (a, norm_result, n),
502
shared_mem=shared_mem_size)
503
504
norm_expected = cp.linalg.norm(a)**2
505
norm_error = abs(float(norm_result) - float(norm_expected))
506
print(f"Norm error: {norm_error}")
507
```

### Performance Optimization

```python
import cupy as cp
import time

# Compare ElementwiseKernel vs built-in operations
n = 10000000

a = cp.random.random(n).astype(cp.float32)
b = cp.random.random(n).astype(cp.float32)

# Built-in operation timing (warm up first so compilation is not timed)
c_builtin = a + b
cp.cuda.Device().synchronize()

start = time.time()
for _ in range(100):
    c_builtin = a + b
cp.cuda.Device().synchronize()
builtin_time = time.time() - start

# Custom kernel timing
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    'z = x + y',
    'custom_add'
)

c_custom = cp.empty_like(a)

# Warm up: the first call compiles the kernel
add_kernel(a, b, c_custom)
cp.cuda.Device().synchronize()

start = time.time()
for _ in range(100):
    add_kernel(a, b, c_custom)
cp.cuda.Device().synchronize()
custom_time = time.time() - start

print(f"Built-in time: {builtin_time:.4f}s")
print(f"Custom kernel time: {custom_time:.4f}s")
print(f"Speedup: {builtin_time/custom_time:.2f}x")

# Verify correctness
error = cp.linalg.norm(c_builtin - c_custom)
print(f"Accuracy error: {error}")
```
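
For less error-prone timing, recent CuPy versions also ship a benchmarking helper that handles warm-up and GPU synchronization; a sketch assuming `cupyx.profiler.benchmark` is available:

```python
import cupy as cp
from cupyx.profiler import benchmark

add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y', 'float32 z', 'z = x + y', 'custom_add')

a = cp.random.random(10_000_000).astype(cp.float32)
b = cp.random.random(10_000_000).astype(cp.float32)
c = cp.empty_like(a)

# benchmark() runs warm-up iterations and reports CPU and GPU times separately
print(benchmark(lambda: add_kernel(a, b, c), n_repeat=100))
print(benchmark(lambda: a + b, n_repeat=100))
```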