0
# Custom Kernels
1
2
Advanced kernel creation mechanisms for implementing custom GPU operations using CUDA C/C++ code or element-wise operations. Enables high-performance custom computations that go beyond built-in CuPy functions.
3
4
## Capabilities
5
6
### ElementwiseKernel
7
8
Create custom element-wise operations that apply functions to array elements in parallel.
9
10
```python { .api }
11
class ElementwiseKernel:
12
"""
13
Custom element-wise kernel for parallel array operations.
14
15
Parameters:
16
- in_params: str, input parameter specification
17
- out_params: str, output parameter specification
18
- operation: str, CUDA C code for element-wise operation
19
- name: str, kernel name
20
- options: tuple, NVCC compiler options
21
- preamble: str, code to prepend to kernel
22
- loop_prep: str, code before main loop
23
- after_loop: str, code after main loop
24
"""
25
def __init__(self, in_params, out_params, operation, name='kernel',
26
options=(), preamble='', loop_prep='', after_loop=''): ...
27
28
def __call__(self, *args, **kwargs):
29
"""
30
Execute kernel on input arrays.
31
32
Parameters:
33
- *args: input arrays matching in_params specification
34
- size: int, number of elements to process
35
- stream: cupy.cuda.Stream, CUDA stream for execution
36
37
Returns:
38
cupy.ndarray or tuple: Output arrays
39
"""
40
```
41
42
### RawKernel
43
44
Create kernels from raw CUDA C/C++ source code for maximum flexibility and performance.
45
46
```python { .api }
47
class RawKernel:
48
"""
49
Raw CUDA kernel from C/C++ source code.
50
51
Parameters:
52
- code: str, CUDA C/C++ source code
53
- name: str, kernel function name in source code
54
- options: tuple, NVCC compiler options
55
- backend: str, compilation backend ('nvcc' or 'nvrtc')
56
- translate_cucomplex: bool, translate cuComplex types
57
"""
58
def __init__(self, code, name, options=(), backend='nvcc', translate_cucomplex=True): ...
59
60
def __call__(self, grid, block, args, **kwargs):
61
"""
62
Launch kernel with specified grid and block dimensions.
63
64
Parameters:
65
- grid: tuple, grid dimensions (blocks)
66
- block: tuple, block dimensions (threads per block)
67
- args: tuple, kernel arguments
68
- shared_mem: int, shared memory size in bytes
69
- stream: cupy.cuda.Stream, CUDA stream for execution
70
"""
71
```
72
73
### RawModule
74
75
Load and manage CUDA modules containing multiple kernels and device functions.
76
77
```python { .api }
78
class RawModule:
79
"""
80
CUDA module containing multiple functions.
81
82
Parameters:
83
- code: str, CUDA C/C++ source code
84
- options: tuple, NVCC compiler options
85
- backend: str, compilation backend
86
- translate_cucomplex: bool, translate cuComplex types
87
"""
88
def __init__(self, code, options=(), backend='nvcc', translate_cucomplex=True): ...
89
90
def get_function(self, name):
91
"""
92
Get kernel function by name.
93
94
Parameters:
95
- name: str, function name
96
97
Returns:
98
RawKernel: Kernel function object
99
"""
100
```
101
102
### ReductionKernel
103
104
Create custom reduction operations that aggregate array elements using associative operations.
105
106
```python { .api }
107
class ReductionKernel:
108
"""
109
Custom reduction kernel for parallel aggregation operations.
110
111
Parameters:
112
- in_params: str, input parameter specification
113
- out_params: str, output parameter specification
114
- map_expr: str, mapping expression applied to each element
115
- reduce_expr: str, reduction expression for combining values
116
- post_map_expr: str, expression applied after mapping
117
- identity: str, identity value for reduction
118
- name: str, kernel name
119
- reduce_type: str, intermediate reduction data type
120
- options: tuple, NVCC compiler options
121
- preamble: str, code to prepend to kernel
122
"""
123
def __init__(self, in_params, out_params, map_expr, reduce_expr,
124
post_map_expr='', identity='', name='kernel',
125
reduce_type=None, options=(), preamble=''): ...
126
127
def __call__(self, *args, **kwargs):
128
"""
129
Execute reduction kernel.
130
131
Parameters:
132
- *args: input arrays matching in_params specification
133
- axis: int/tuple, axis along which to reduce
134
- keepdims: bool, keep reduced dimensions
135
- stream: cupy.cuda.Stream, CUDA stream for execution
136
137
Returns:
138
cupy.ndarray: Reduced result
139
"""
140
```
141
142
### Fusion
143
144
Fuse multiple operations into single kernels for improved performance.
145
146
```python { .api }
147
def fuse(*args, **kwargs):
148
"""
149
Decorator for fusing multiple CuPy operations.
150
151
Parameters:
152
- kernel_name: str, name for fused kernel
153
154
Returns:
155
function: Fused function that executes as single kernel
156
"""
157
158
@fuse()
159
def fused_function(x, y):
160
"""Example fused function combining multiple operations."""
161
return cp.sqrt(x**2 + y**2) * cp.sin(x + y)
162
```
163
164
## Usage Examples
165
166
### ElementwiseKernel Examples
167
168
```python
169
import cupy as cp
170
171
# Simple element-wise operation
172
add_kernel = cp.ElementwiseKernel(
173
'float32 x, float32 y', # Input parameters
174
'float32 z', # Output parameters
175
'z = x + y', # Operation
176
'add_kernel' # Kernel name
177
)
178
179
# Use the kernel
180
a = cp.random.random((1000, 1000), dtype=cp.float32)
181
b = cp.random.random((1000, 1000), dtype=cp.float32)
182
result = add_kernel(a, b)
183
184
# More complex element-wise operation
185
complex_kernel = cp.ElementwiseKernel(
186
'float32 x, float32 y, float32 alpha',
187
'float32 z',
188
'''
189
float32 temp = x * x + y * y;
190
z = alpha * sqrt(temp) + sin(x + y);
191
''',
192
'complex_kernel'
193
)
194
195
result = complex_kernel(a, b, 2.5)
196
```
197
198
### RawKernel Examples
199
200
```python
201
# Matrix addition kernel
202
matrix_add_code = '''
203
extern "C" __global__
204
void matrix_add(float* a, float* b, float* c, int n) {
205
int idx = blockIdx.x * blockDim.x + threadIdx.x;
206
if (idx < n) {
207
c[idx] = a[idx] + b[idx];
208
}
209
}
210
'''
211
212
matrix_add_kernel = cp.RawKernel(matrix_add_code, 'matrix_add')
213
214
# Launch kernel
215
n = 1000000
216
a_gpu = cp.random.random(n, dtype=cp.float32)
217
b_gpu = cp.random.random(n, dtype=cp.float32)
218
c_gpu = cp.zeros(n, dtype=cp.float32)
219
220
# Calculate grid and block dimensions
221
block_size = 256
222
grid_size = (n + block_size - 1) // block_size
223
224
matrix_add_kernel((grid_size,), (block_size,), (a_gpu, b_gpu, c_gpu, n))
225
226
# More advanced kernel with shared memory
227
shared_memory_code = '''
228
extern "C" __global__
229
void reduce_sum(float* input, float* output, int n) {
230
extern __shared__ float shared_data[];
231
232
int tid = threadIdx.x;
233
int idx = blockIdx.x * blockDim.x + threadIdx.x;
234
235
// Load data into shared memory
236
shared_data[tid] = (idx < n) ? input[idx] : 0.0f;
237
__syncthreads();
238
239
// Perform reduction in shared memory
240
for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
241
if (tid < stride) {
242
shared_data[tid] += shared_data[tid + stride];
243
}
244
__syncthreads();
245
}
246
247
// Write result
248
if (tid == 0) {
249
output[blockIdx.x] = shared_data[0];
250
}
251
}
252
'''
253
254
reduce_kernel = cp.RawKernel(shared_memory_code, 'reduce_sum')
255
256
# Use kernel with shared memory
257
input_data = cp.random.random(1000000, dtype=cp.float32)
258
output_size = (len(input_data) + block_size - 1) // block_size
259
output_data = cp.zeros(output_size, dtype=cp.float32)
260
261
shared_mem_size = block_size * 4 # 4 bytes per float
262
reduce_kernel((output_size,), (block_size,), (input_data, output_data, len(input_data)),
263
shared_mem=shared_mem_size)
264
```
265
266
### ReductionKernel Examples
267
268
```python
269
# Custom sum reduction
270
sum_kernel = cp.ReductionKernel(
271
'T x', # Input parameter
272
'T y', # Output parameter
273
'x', # Map expression (identity)
274
'a + b', # Reduction expression
275
'0', # Identity value
276
'sum_kernel' # Kernel name
277
)
278
279
data = cp.random.random((1000, 1000))
280
total = sum_kernel(data)
281
row_sums = sum_kernel(data, axis=1)
282
283
# Custom standard deviation reduction
284
std_kernel = cp.ReductionKernel(
285
'T x, T mean', # Input parameters
286
'T y', # Output parameter
287
'(x - mean) * (x - mean)', # Map expression (squared differences)
288
'a + b', # Reduction expression (sum)
289
'0', # Identity value
290
'std_kernel' # Kernel name
291
)
292
293
# Calculate standard deviation
294
data = cp.random.normal(0, 1, (1000, 1000))
295
mean_val = cp.mean(data, axis=1, keepdims=True)
296
variance = std_kernel(data, mean_val, axis=1) / (data.shape[1] - 1)
297
std_dev = cp.sqrt(variance)
298
```
299
300
### RawModule Examples
301
302
```python
303
# Module with multiple functions
304
module_code = '''
305
extern "C" {
306
307
__device__ float square(float x) {
308
return x * x;
309
}
310
311
__global__ void vector_norm(float* input, float* output, int n) {
312
int idx = blockIdx.x * blockDim.x + threadIdx.x;
313
if (idx < n) {
314
output[idx] = sqrt(square(input[idx]));
315
}
316
}
317
318
__global__ void vector_scale(float* input, float* output, float scale, int n) {
319
int idx = blockIdx.x * blockDim.x + threadIdx.x;
320
if (idx < n) {
321
output[idx] = input[idx] * scale;
322
}
323
}
324
325
}
326
'''
327
328
module = cp.RawModule(code=module_code)
329
norm_kernel = module.get_function('vector_norm')
330
scale_kernel = module.get_function('vector_scale')
331
332
# Use kernels from module
333
input_vec = cp.random.random(100000, dtype=cp.float32)
334
output_vec = cp.zeros_like(input_vec)
335
336
# Calculate norms
337
grid_size = (len(input_vec) + 255) // 256
338
norm_kernel((grid_size,), (256,), (input_vec, output_vec, len(input_vec)))
339
340
# Scale vector
341
scaled_vec = cp.zeros_like(input_vec)
342
scale_kernel((grid_size,), (256,), (input_vec, scaled_vec, 2.5, len(input_vec)))
343
```
344
345
### Function Fusion Examples
346
347
```python
348
# Fuse multiple operations for better performance
349
@cp.fuse(kernel_name='fused_operations')
350
def complex_computation(x, y, z):
351
"""Fused function combining multiple mathematical operations."""
352
temp1 = cp.sin(x) * cp.cos(y)
353
temp2 = cp.exp(-z**2)
354
return temp1 * temp2 + cp.sqrt(x**2 + y**2)
355
356
# Use fused function
357
x = cp.random.random((1000, 1000))
358
y = cp.random.random((1000, 1000))
359
z = cp.random.random((1000, 1000))
360
361
result = complex_computation(x, y, z) # Executes as single fused kernel
362
363
# Compare with unfused version
364
def unfused_computation(x, y, z):
365
"""Same computation without fusion."""
366
temp1 = cp.sin(x) * cp.cos(y)
367
temp2 = cp.exp(-z**2)
368
return temp1 * temp2 + cp.sqrt(x**2 + y**2)
369
370
# Fused version is typically faster due to reduced memory traffic
371
```
372
373
### Performance Optimization
374
375
```python
376
# Kernel with optimized memory access patterns
377
optimized_code = '''
378
extern "C" __global__
379
void optimized_transpose(float* input, float* output, int rows, int cols) {
380
__shared__ float tile[32][32+1]; // +1 to avoid bank conflicts
381
382
int x = blockIdx.x * blockDim.x + threadIdx.x;
383
int y = blockIdx.y * blockDim.y + threadIdx.y;
384
385
// Coalesced read from global memory
386
if (x < cols && y < rows) {
387
tile[threadIdx.y][threadIdx.x] = input[y * cols + x];
388
}
389
390
__syncthreads();
391
392
// Compute transposed coordinates
393
x = blockIdx.y * blockDim.y + threadIdx.x;
394
y = blockIdx.x * blockDim.x + threadIdx.y;
395
396
// Coalesced write to global memory
397
if (x < rows && y < cols) {
398
output[y * rows + x] = tile[threadIdx.x][threadIdx.y];
399
}
400
}
401
'''
402
403
transpose_kernel = cp.RawKernel(optimized_code, 'optimized_transpose')
404
405
# Use optimized kernel
406
matrix = cp.random.random((4096, 4096), dtype=cp.float32)
407
transposed = cp.zeros((4096, 4096), dtype=cp.float32)
408
409
block_dim = (32, 32)
410
grid_dim = ((matrix.shape[1] + 31) // 32, (matrix.shape[0] + 31) // 32)
411
412
transpose_kernel(grid_dim, block_dim, (matrix, transposed,
413
matrix.shape[0], matrix.shape[1]))
414
```
415
416
## Best Practices
417
418
### Kernel Development Guidelines
419
420
```python
421
# 1. Use appropriate data types
422
kernel_float32 = cp.ElementwiseKernel(
423
'float32 x, float32 y', # Specify exact precision needed
424
'float32 z',
425
'z = x + y',
426
'add_f32'
427
)
428
429
# 2. Optimize memory access patterns
430
# Good: Coalesced access
431
coalesced_kernel = cp.RawKernel('''
432
extern "C" __global__ void coalesced_access(float* data, int n) {
433
int idx = blockIdx.x * blockDim.x + threadIdx.x;
434
if (idx < n) {
435
data[idx] = data[idx] * 2.0f; // Sequential access
436
}
437
}
438
''', 'coalesced_access')
439
440
# 3. Use shared memory for data reuse
441
shared_mem_kernel = cp.RawKernel('''
442
extern "C" __global__ void use_shared_memory(float* input, float* output, int n) {
443
extern __shared__ float shared[];
444
int tid = threadIdx.x;
445
int idx = blockIdx.x * blockDim.x + tid;
446
447
// Load to shared memory
448
shared[tid] = (idx < n) ? input[idx] : 0.0f;
449
__syncthreads();
450
451
// Process using shared memory
452
if (idx < n) {
453
output[idx] = shared[tid] * 2.0f;
454
}
455
}
456
''', 'use_shared_memory')
457
458
# 4. Handle boundary conditions properly
459
boundary_safe_kernel = cp.ElementwiseKernel(
460
'raw T input, int32 size',
461
'T output',
462
'''
463
int idx = i; // Current thread index
464
if (idx < size) {
465
output = input[idx] * 2;
466
}
467
''',
468
'boundary_safe'
469
)
470
```