# Custom Kernels and Performance

Tools for writing custom CUDA kernels and optimizing GPU performance. CuPy provides high-level interfaces for creating custom GPU operations when built-in functions are insufficient.

## Capabilities

### Element-wise Kernels

Create custom element-wise operations that apply to each array element.

```python { .api }
class ElementwiseKernel:
    """
    Custom element-wise CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, CUDA C++ code for operation
    - name: str, kernel name
    - preamble: str, code inserted before kernel
    - loop_prep: str, code before element loop
    - after_loop: str, code after element loop
    """
    def __init__(self, in_params, out_params, operation, name='kernel', preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """
        Execute kernel on input arrays.

        Parameters:
        - args: input and output arrays matching parameter specification
        - size: int, number of elements to process
        - stream: cupy.cuda.Stream, CUDA stream

        Returns:
        cupy.ndarray: Output array(s)
        """

class ReductionKernel:
    """
    Custom reduction CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, expression to map input to intermediate values
    - reduce_expr: str, expression to reduce intermediate values
    - post_map_expr: str, expression to post-process results
    - identity: str, identity value for reduction
    - name: str, kernel name
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr, identity, name='kernel'): ...

    def __call__(self, *args, **kwargs):
        """Execute reduction kernel."""

class RawKernel:
    """
    Raw CUDA kernel from source code.

    Parameters:
    - code: str, complete CUDA C++ kernel source
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, compilation backend
    """
    def __init__(self, code, name, options=(), backend='nvrtc'): ...

    def __call__(self, grid, block, args, *, shared_mem=0, stream=None):
        """
        Launch raw CUDA kernel.

        Parameters:
        - grid: tuple, grid dimensions
        - block: tuple, block dimensions
        - args: tuple, kernel arguments
        - shared_mem: int, shared memory size
        - stream: cupy.cuda.Stream, CUDA stream
        """

class RawModule:
    """
    Raw CUDA module containing multiple kernels.

    Parameters:
    - code: str, complete CUDA module source
    - options: tuple, compiler options
    - backend: str, compilation backend
    """
    def __init__(self, code, options=(), backend='nvrtc'): ...

    def get_function(self, name):
        """
        Get kernel function by name.

        Parameters:
        - name: str, function name

        Returns:
        RawKernel: Kernel function
        """
```

### Performance Utilities

Tools for optimizing GPU performance.

```python { .api }
def fuse(*args, **kwargs):
    """
    Decorator for kernel fusion optimization.

    Parameters:
    - args: positional arguments for fusion
    - kwargs: keyword arguments for fusion

    Returns:
    function: Fused function decorator
    """

def clear_memo():
    """Clear memoization cache."""

def memoize(for_each_device=False):
    """
    Memoization decorator for caching function results.

    Parameters:
    - for_each_device: bool, separate cache per device

    Returns:
    function: Memoization decorator
    """
```

## Usage Examples

### Element-wise Kernel

```python
import cupy as cp

# Custom element-wise operation
multiply_add = cp.ElementwiseKernel(
    'T x, T y, T z',  # Input parameters
    'T w',            # Output parameters
    'w = x * y + z',  # Operation
    'multiply_add'    # Kernel name
)

# Use the kernel
a = cp.random.random((1000, 1000))
b = cp.random.random((1000, 1000))
c = cp.random.random((1000, 1000))
result = multiply_add(a, b, c)
```

### Reduction Kernel

```python
# Custom reduction operation (sum of squares)
sum_of_squares = cp.ReductionKernel(
    'T x',            # Input parameter
    'T y',            # Output parameter
    'x * x',          # Map expression
    'a + b',          # Reduce expression
    'y = a',          # Post-map expression
    '0',              # Identity value
    'sum_of_squares'  # Kernel name
)

# Use the reduction kernel
data = cp.random.random((10000,))
result = sum_of_squares(data, axis=None)
```

### Raw CUDA Kernel

```python
# Raw CUDA kernel for advanced operations
cuda_code = '''
extern "C" __global__
void vector_add(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

kernel = cp.RawKernel(cuda_code, 'vector_add')

# Use raw kernel
n = 1000000
a_gpu = cp.random.random((n,), dtype=cp.float32)
b_gpu = cp.random.random((n,), dtype=cp.float32)
c_gpu = cp.zeros((n,), dtype=cp.float32)

threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

kernel((blocks_per_grid,), (threads_per_block,),
       (a_gpu, b_gpu, c_gpu, n))
```