# Custom Kernels

Framework for creating custom GPU kernels in CuPy, enabling high-performance operations tailored to specific needs. Provides ElementwiseKernel for element-wise operations, ReductionKernel for reductions, and RawKernel/RawModule for arbitrary CUDA code.

## Capabilities

### ElementwiseKernel

```python { .api }
class ElementwiseKernel:
    """
    User-defined elementwise kernel.

    Parameters:
    - in_params: string of input parameters
    - out_params: string of output parameters
    - operation: string of element-wise operation
    - name: kernel name
    - reduce_dims: whether to reduce dimensions
    - options: compiler options
    - preamble: code to insert at beginning
    - loop_prep: code to insert before loop
    - after_loop: code to insert after loop
    """
    def __init__(self, in_params, out_params, operation, name='kernel',
                 reduce_dims=True, options=(), preamble='', loop_prep='',
                 after_loop='', **kwargs): ...

    def __call__(self, *args, **kwargs): ...
```
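
The parameter strings also accept the generic type placeholder `T`, and the output argument can be omitted, in which case a new array is allocated and returned. A minimal sketch, closely following the `squared_diff` example from the CuPy documentation:

```python
import cupy as cp

# 'T' is a type placeholder resolved from the argument dtypes at call time
squared_diff = cp.ElementwiseKernel(
    'T x, T y',
    'T z',
    'z = (x - y) * (x - y)',
    'squared_diff'
)

x = cp.arange(10, dtype=cp.float32)
y = cp.full(10, 3, dtype=cp.float32)

# No output array given: one is allocated and returned
z = squared_diff(x, y)
print(z)
```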

### ReductionKernel

```python { .api }
class ReductionKernel:
    """
    User-defined reduction kernel.

    Parameters:
    - in_params: string of input parameters
    - out_params: string of output parameters
    - map_expr: expression applied to each input element before reduction
    - reduce_expr: binary expression combining two intermediate values a and b
    - post_map_expr: expression applied to the reduced value a to produce the output
    - identity: identity value of the reduction, given as a string (e.g. '0')
    - name: kernel name
    - reduce_type: type used for the reduction variable
    - reduce_dims: whether to reduce dimensions
    - preamble: code to insert at beginning
    - options: compiler options
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr, identity, name='kernel',
                 reduce_type=None, reduce_dims=True, preamble='',
                 options=(), **kwargs): ...

    def __call__(self, *args, **kwargs): ...
```
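
The four expressions work together: `map_expr` is applied to each element, `reduce_expr` folds pairs of intermediate values `a` and `b`, and `post_map_expr` turns the final value `a` into the output. A sketch based on the `l2norm` example from the CuPy documentation:

```python
import cupy as cp

# Row-wise L2 norm: square, sum, then take the square root
l2norm_kernel = cp.ReductionKernel(
    'T x',          # in_params
    'T y',          # out_params
    'x * x',        # map_expr
    'a + b',        # reduce_expr
    'y = sqrt(a)',  # post_map_expr
    '0',            # identity
    'l2norm'        # kernel name
)

x = cp.arange(10, dtype=cp.float32).reshape(2, 5)
print(l2norm_kernel(x, axis=1))  # one norm per row
```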

### RawKernel

```python { .api }
class RawKernel:
    """
    User-defined raw CUDA kernel.

    Parameters:
    - code: CUDA kernel code
    - name: kernel function name
    - options: compiler options
    - backend: compiler backend ('nvrtc' or 'nvcc')
    - translate_cucomplex: translate cuComplex types
    - enable_cooperative_groups: enable cooperative groups
    - jitify: use Jitify for compilation
    """
    def __init__(self, code, name, options=(), backend='nvrtc',
                 translate_cucomplex=False, enable_cooperative_groups=False,
                 jitify=False, **kwargs): ...

    def __call__(self, grid, block, args, *, shared_mem=0): ...
```
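
A minimal example of the typical RawKernel workflow, adapted from the `my_add` kernel in the CuPy documentation; grid and block sizes, bounds checking, and argument types are entirely the caller's responsibility:

```python
import cupy as cp

add_code = r'''
extern "C" __global__
void my_add(const float* x1, const float* x2, float* y) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    y[tid] = x1[tid] + x2[tid];
}
'''
my_add = cp.RawKernel(add_code, 'my_add')

x1 = cp.arange(25, dtype=cp.float32).reshape(5, 5)
x2 = cp.arange(25, dtype=cp.float32).reshape(5, 5)
y = cp.zeros((5, 5), dtype=cp.float32)

my_add((5,), (5,), (x1, x2, y))  # 5 blocks of 5 threads each
print(y)
```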

### RawModule

```python { .api }
class RawModule:
    """
    User-defined raw CUDA module.

    Parameters:
    - code: CUDA module code
    - options: compiler options
    - backend: compiler backend ('nvrtc' or 'nvcc')
    - translate_cucomplex: translate cuComplex types
    - enable_cooperative_groups: enable cooperative groups
    - jitify: use Jitify for compilation
    """
    def __init__(self, code, options=(), backend='nvrtc',
                 translate_cucomplex=False, enable_cooperative_groups=False,
                 jitify=False, **kwargs): ...

    def get_function(self, name): ...
```
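
RawModule also supports C++ templated kernels via `name_expressions` (an NVRTC feature in recent CuPy versions): list the instantiations when building the module, then request them with the same strings from `get_function`. A sketch assuming that feature is available:

```python
import cupy as cp

template_code = r'''
template<typename T>
__global__ void multiply_by_two(T* x, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        x[tid] *= T(2);
    }
}
'''

mod = cp.RawModule(code=template_code,
                   options=('--std=c++11',),
                   name_expressions=['multiply_by_two<float>',
                                     'multiply_by_two<double>'])

kern = mod.get_function('multiply_by_two<float>')
x = cp.arange(8, dtype=cp.float32)
kern((1,), (8,), (x, cp.int32(x.size)))
print(x)
```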

### Kernel Creation Functions

```python { .api }
def create_ufunc(name, ops, routine=None, preamble='', doc=None,
                 default_casting=None, loop_prep='', out_ops=None):
    """
    Create a universal function from per-dtype operations.

    Parameters:
    - name: function name
    - ops: operation specifications (type signatures, optionally paired with a routine)
    - routine: default C routine used by ops entries that do not supply their own
    - preamble: preamble code
    - doc: documentation string
    - default_casting: default casting rule
    - loop_prep: loop preparation code
    - out_ops: output operations

    Returns:
    ufunc: created universal function
    """

def create_reduction_func(name, ops, routine=None, identity=None,
                          preamble='', **kwargs):
    """
    Create a reduction function from per-dtype operations.

    Parameters:
    - name: function name
    - ops: operation specifications
    - routine: default routine used by ops entries that do not supply their own
    - identity: identity value
    - preamble: preamble code

    Returns:
    function: created reduction function
    """
```
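
Note that these factories live in `cupy._core` rather than the top-level `cupy` namespace; they are the machinery CuPy uses to build its own ufuncs and are not a stable public API. The hypothetical sketch below assumes that internal import path and the ops format used throughout CuPy's source (NumPy-style type signatures such as `'ff->f'`, with `in0`/`in1`/`out0` as the generated variable names); treat it as illustrative only:

```python
import cupy as cp
from cupy._core import create_ufunc  # internal API, may change between versions

# Hypothetical ufunc with float32 and float64 variants sharing one routine
my_scaled_add = create_ufunc(
    'my_scaled_add',
    ('ff->f', 'dd->d'),
    'out0 = in0 + 2 * in1',
)

a = cp.arange(4, dtype=cp.float32)
b = cp.ones(4, dtype=cp.float32)
print(my_scaled_add(a, b))
```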

### Utilities

```python { .api }
def clear_memo():
    """Clear memoization cache."""

def memoize(for_each_device=False):
    """
    Memoization decorator.

    Parameters:
    - for_each_device: memoize for each device separately

    Returns:
    decorator: memoization decorator
    """
```
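
`cupy.memoize` is commonly used to cache kernel construction so that, for a given set of arguments (and, with `for_each_device=True`, a given device), the kernel object is built only once. A small sketch using a hypothetical helper function:

```python
import cupy as cp

@cp.memoize(for_each_device=True)
def get_scale_kernel(dtype_name):
    # Constructed once per (dtype_name, device); later calls return the cached kernel
    return cp.ElementwiseKernel(
        f'{dtype_name} x, {dtype_name} s',
        f'{dtype_name} y',
        'y = s * x',
        f'scale_{dtype_name}'
    )

kern = get_scale_kernel('float32')
x = cp.arange(5, dtype=cp.float32)
y = cp.empty_like(x)
kern(x, cp.float32(2.0), y)
print(y)
```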

## Usage Examples

### ElementwiseKernel Examples

```python
import cupy as cp

# Simple element-wise operation
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # input parameters
    'float32 z',             # output parameters
    'z = x + y',             # operation
    'add_kernel'             # kernel name
)

# Use the kernel
a = cp.random.random(1000).astype(cp.float32)
b = cp.random.random(1000).astype(cp.float32)
c = cp.empty(1000, dtype=cp.float32)

add_kernel(a, b, c)

# Verify result
expected = a + b
error = cp.linalg.norm(c - expected)
print(f"Add kernel error: {error}")
```

### Complex ElementwiseKernel

```python
import cupy as cp

# More complex element-wise operation with conditionals
clamp_kernel = cp.ElementwiseKernel(
    'float32 x, float32 min_val, float32 max_val',
    'float32 y',
    '''
    if (x < min_val) {
        y = min_val;
    } else if (x > max_val) {
        y = max_val;
    } else {
        y = x;
    }
    ''',
    'clamp_kernel'
)

# Test the kernel
data = cp.random.uniform(-2, 2, 1000).astype(cp.float32)
result = cp.empty_like(data)

clamp_kernel(data, -1.0, 1.0, result)

# Verify clamping
print(f"Min value: {cp.min(result)}")  # Should be >= -1.0
print(f"Max value: {cp.max(result)}")  # Should be <= 1.0
```

### ElementwiseKernel with Preamble

```python
import cupy as cp

# Kernel with helper functions in preamble
sigmoid_kernel = cp.ElementwiseKernel(
    'float32 x',
    'float32 y',
    'y = sigmoid(x)',
    'sigmoid_kernel',
    preamble='''
    __device__ float sigmoid(float x) {
        return 1.0f / (1.0f + expf(-x));
    }
    '''
)

# Use the kernel
x = cp.linspace(-5, 5, 1000).astype(cp.float32)
y = cp.empty_like(x)

sigmoid_kernel(x, y)

# Compare with CuPy implementation
expected = 1 / (1 + cp.exp(-x))
error = cp.linalg.norm(y - expected)
print(f"Sigmoid kernel error: {error}")
```

### ReductionKernel Examples

```python
import cupy as cp

# Custom sum reduction
sum_kernel = cp.ReductionKernel(
    'float32 x',   # input
    'float32 y',   # output
    'x',           # map expression (identity)
    'a + b',       # reduce expression
    'y = a',       # post-map expression
    '0',           # identity value
    'sum_kernel'   # kernel name
)

# Test the kernel
data = cp.random.random(10000).astype(cp.float32)
result = sum_kernel(data)

# Compare with built-in sum
expected = cp.sum(data)
error = abs(result - expected)
print(f"Sum kernel error: {error}")
```

### Complex ReductionKernel

```python
import cupy as cp

# Root mean square reduction
rms_kernel = cp.ReductionKernel(
    'float32 x',
    'float32 y',
    'x * x',                         # map: square each element
    'a + b',                         # reduce: sum squares
    'y = sqrt(a / _in_ind.size())',  # post: sqrt of mean
    '0',                             # identity
    'rms_kernel'
)

# Test the kernel
data = cp.random.random(1000).astype(cp.float32)
rms_result = rms_kernel(data)

# Compare with manual calculation
manual_rms = cp.sqrt(cp.mean(data * data))
error = abs(rms_result - manual_rms)
print(f"RMS kernel error: {error}")
```

### RawKernel Examples

```python
import cupy as cp

# Matrix multiplication kernel
matmul_code = '''
extern "C" __global__
void matmul(float* A, float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
'''

matmul_kernel = cp.RawKernel(matmul_code, 'matmul')

# Test the kernel
M, N, K = 512, 512, 512
A = cp.random.random((M, K)).astype(cp.float32)
B = cp.random.random((K, N)).astype(cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

# Launch kernel
block = (16, 16)
grid = ((N + block[0] - 1) // block[0], (M + block[1] - 1) // block[1])

# RawKernel does not convert argument types, so pass scalars that match the
# declared C types (32-bit ints here)
matmul_kernel(grid, block, (A, B, C, cp.int32(M), cp.int32(N), cp.int32(K)))

# Compare with CuPy matmul
expected = cp.dot(A, B)
error = cp.linalg.norm(C - expected) / cp.linalg.norm(expected)
print(f"Matmul kernel relative error: {error}")
```

### RawKernel with Shared Memory

```python
import cupy as cp

# Convolution kernel with shared memory
conv_code = '''
extern "C" __global__
void conv2d(float* input, float* kernel, float* output,
            int height, int width, int kernel_size) {

    extern __shared__ float shared_mem[];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int bdx = blockDim.x;
    int bdy = blockDim.y;

    int row = by * bdy + ty;
    int col = bx * bdx + tx;

    int shared_width = bdx + kernel_size - 1;
    int shared_height = bdy + kernel_size - 1;

    // Load data into shared memory (simplified: only the block's own tile is
    // loaded, so the halo cells read below stay uninitialized -- illustration only)
    if (row < height && col < width) {
        shared_mem[ty * shared_width + tx] = input[row * width + col];
    }

    __syncthreads();

    // Perform convolution
    if (row < height && col < width) {
        float sum = 0.0f;

        for (int ky = 0; ky < kernel_size; ky++) {
            for (int kx = 0; kx < kernel_size; kx++) {
                int y_idx = ty + ky;
                int x_idx = tx + kx;

                if (y_idx < shared_height && x_idx < shared_width) {
                    sum += shared_mem[y_idx * shared_width + x_idx] *
                           kernel[ky * kernel_size + kx];
                }
            }
        }

        output[row * width + col] = sum;
    }
}
'''

conv_kernel = cp.RawKernel(conv_code, 'conv2d')

# Test the kernel (simplified example)
height, width = 256, 256
kernel_size = 3

input_data = cp.random.random((height, width)).astype(cp.float32)
kernel_data = cp.random.random((kernel_size, kernel_size)).astype(cp.float32)
output_data = cp.zeros((height, width), dtype=cp.float32)

# Calculate shared memory size
block = (16, 16)
shared_size = (block[0] + kernel_size - 1) * (block[1] + kernel_size - 1) * 4  # 4 bytes per float

grid = ((width + block[0] - 1) // block[0], (height + block[1] - 1) // block[1])

# Scalars are passed as 32-bit ints to match the kernel's int parameters
conv_kernel(grid, block,
            (input_data, kernel_data, output_data,
             cp.int32(height), cp.int32(width), cp.int32(kernel_size)),
            shared_mem=shared_size)
```
419
420
### RawModule for Multiple Kernels
421
422
```python
423
import cupy as cp
424
425
# Module with multiple related kernels
426
vector_ops_code = '''
427
extern "C" __global__
428
void vector_add(float* a, float* b, float* c, int n) {
429
int idx = blockDim.x * blockIdx.x + threadIdx.x;
430
if (idx < n) {
431
c[idx] = a[idx] + b[idx];
432
}
433
}
434
435
extern "C" __global__
436
void vector_mult(float* a, float* b, float* c, int n) {
437
int idx = blockDim.x * blockIdx.x + threadIdx.x;
438
if (idx < n) {
439
c[idx] = a[idx] * b[idx];
440
}
441
}
442
443
extern "C" __global__
444
void vector_norm(float* a, float* result, int n) {
445
extern __shared__ float sdata[];
446
447
int tid = threadIdx.x;
448
int idx = blockDim.x * blockIdx.x + threadIdx.x;
449
450
// Load and square
451
sdata[tid] = (idx < n) ? a[idx] * a[idx] : 0.0f;
452
__syncthreads();
453
454
// Reduction in shared memory
455
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
456
if (tid < s) {
457
sdata[tid] += sdata[tid + s];
458
}
459
__syncthreads();
460
}
461
462
// Write result
463
if (tid == 0) {
464
atomicAdd(result, sdata[0]);
465
}
466
}
467
'''
468
469
vector_module = cp.RawModule(code=vector_ops_code)
470
471
# Get individual kernels
472
add_kernel = vector_module.get_function('vector_add')
473
mult_kernel = vector_module.get_function('vector_mult')
474
norm_kernel = vector_module.get_function('vector_norm')
475
476
# Test the kernels
477
n = 1000000
478
a = cp.random.random(n).astype(cp.float32)
479
b = cp.random.random(n).astype(cp.float32)
480
c = cp.zeros(n, dtype=cp.float32)
481
482
block_size = 256
483
grid_size = (n + block_size - 1) // block_size
484
485
# Vector addition
486
add_kernel((grid_size,), (block_size,), (a, b, c, n))
487
add_expected = a + b
488
add_error = cp.linalg.norm(c - add_expected)
489
print(f"Add error: {add_error}")
490
491
# Vector multiplication
492
mult_kernel((grid_size,), (block_size,), (a, b, c, n))
493
mult_expected = a * b
494
mult_error = cp.linalg.norm(c - mult_expected)
495
print(f"Mult error: {mult_error}")
496
497
# Vector norm
498
norm_result = cp.zeros(1, dtype=cp.float32)
499
shared_mem_size = block_size * 4 # 4 bytes per float
500
501
norm_kernel((grid_size,), (block_size,), (a, norm_result, n),
502
shared_mem=shared_mem_size)
503
504
norm_expected = cp.linalg.norm(a)**2
505
norm_error = abs(float(norm_result) - float(norm_expected))
506
print(f"Norm error: {norm_error}")
507
```

### Performance Optimization

```python
import cupy as cp
import time

# Compare ElementwiseKernel vs built-in operations
n = 10000000

a = cp.random.random(n).astype(cp.float32)
b = cp.random.random(n).astype(cp.float32)

# Built-in operation timing (warm up first so compilation is not timed)
c_builtin = a + b
cp.cuda.Device().synchronize()

start = time.time()
for _ in range(100):
    c_builtin = a + b
cp.cuda.Device().synchronize()
builtin_time = time.time() - start

# Custom kernel timing
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    'z = x + y',
    'custom_add'
)

c_custom = cp.empty_like(a)

# Warm up: the first call compiles the kernel
add_kernel(a, b, c_custom)
cp.cuda.Device().synchronize()

start = time.time()
for _ in range(100):
    add_kernel(a, b, c_custom)
cp.cuda.Device().synchronize()
custom_time = time.time() - start

print(f"Built-in time: {builtin_time:.4f}s")
print(f"Custom kernel time: {custom_time:.4f}s")
print(f"Speedup: {builtin_time/custom_time:.2f}x")

# Verify correctness
error = cp.linalg.norm(c_builtin - c_custom)
print(f"Accuracy error: {error}")
```
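
For less error-prone timing, recent CuPy versions also ship a benchmarking helper that handles warm-up and GPU synchronization; a sketch assuming `cupyx.profiler.benchmark` is available:

```python
import cupy as cp
from cupyx.profiler import benchmark

add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y', 'float32 z', 'z = x + y', 'custom_add')

a = cp.random.random(10_000_000).astype(cp.float32)
b = cp.random.random(10_000_000).astype(cp.float32)
c = cp.empty_like(a)

# benchmark() runs warm-up iterations and reports CPU and GPU times separately
print(benchmark(lambda: add_kernel(a, b, c), n_repeat=100))
print(benchmark(lambda: a + b, n_repeat=100))
```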