# Kernel Compilation

Dynamic CUDA kernel compilation with source code generation, caching, and module management for both inline and file-based CUDA source code. PyCUDA enables runtime compilation of CUDA C/C++ code directly from Python.

## Capabilities

### Source Module Compilation

Compile CUDA source code into executable modules with automatic error handling and caching.

```python { .api }
class SourceModule:
    def __init__(self, source: str, nvcc: str = "nvcc", options: list = None,
                 keep: bool = False, no_extern_c: bool = False,
                 arch: str = None, code: str = None, cache_dir: str = None,
                 include_dirs: list = None):
        """
        Compile CUDA source code into module.

        Parameters:
        - source: str, CUDA C/C++ source code
        - nvcc: str, path to nvcc compiler
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files
        - no_extern_c: bool, disable extern "C" wrapper
        - arch: str, target architecture (e.g., "sm_50")
        - code: str, target code generation (e.g., "compute_50")
        - cache_dir: str, directory for caching compiled modules
        - include_dirs: list, additional include directories
        """

    def get_function(self, name: str) -> Function:
        """
        Get kernel function from module.

        Parameters:
        - name: str, function name in CUDA source

        Returns:
        Function: callable kernel function
        """

    def get_global(self, name: str) -> tuple[DeviceAllocation, int]:
        """
        Get global variable from module.

        Parameters:
        - name: str, variable name in CUDA source

        Returns:
        tuple: (device_pointer, size_in_bytes)
        """

    def get_texref(self, name: str) -> TextureReference:
        """
        Get texture reference from module.

        Parameters:
        - name: str, texture reference name

        Returns:
        TextureReference: texture reference object
        """
```

### Dynamic Module Generation

Generate CUDA modules programmatically with dynamic source generation.

```python { .api }
class DynamicModule:
    def __init__(self, template: str = None):
        """
        Create dynamic module with optional template.

        Parameters:
        - template: str, template source code (optional)
        """

    def add_to_preamble(self, pa: str) -> None:
        """
        Add code to module preamble.

        Parameters:
        - pa: str, code to add to preamble
        """

    def add_function(self, func: DynamicFunction) -> None:
        """
        Add function to module.

        Parameters:
        - func: DynamicFunction, function to add
        """

    def compile(self, nvcc: str = "nvcc", options: list = None,
                keep: bool = False, no_extern_c: bool = False) -> CudaModule:
        """
        Compile dynamic module.

        Parameters:
        - nvcc: str, path to nvcc compiler
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files
        - no_extern_c: bool, disable extern "C" wrapper

        Returns:
        CudaModule: compiled module
        """

class DynamicSourceModule(DynamicModule):
    def __init__(self, template: str = None, nvcc: str = "nvcc",
                 options: list = None, keep: bool = False,
                 no_extern_c: bool = False, arch: str = None,
                 code: str = None, cache_dir: str = None):
        """
        Dynamic module that compiles automatically.

        Parameters:
        - template: str, template source code (optional)
        - nvcc: str, path to nvcc compiler
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files
        - no_extern_c: bool, disable extern "C" wrapper
        - arch: str, target architecture
        - code: str, target code generation
        - cache_dir: str, caching directory
        """
```

### Compilation Functions

Low-level compilation functions for advanced use cases.

```python { .api }
def compile(source: str, nvcc: str = "nvcc", options: list = None,
            keep: bool = False, no_extern_c: bool = False,
            arch: str = None, code: str = None, cache_dir: str = None,
            include_dirs: list = None, target: str = "cubin") -> bytes:
    """
    Compile CUDA source to binary.

    Parameters:
    - source: str, CUDA source code
    - nvcc: str, path to nvcc compiler
    - options: list, compiler options
    - keep: bool, keep intermediate files
    - no_extern_c: bool, disable extern "C" wrapper
    - arch: str, target architecture
    - code: str, target code generation
    - cache_dir: str, cache directory
    - include_dirs: list, include directories
    - target: str, compilation target ("cubin", "ptx", "fatbin")

    Returns:
    bytes: compiled binary
    """

def compile_plain(source: str, options: list = None, keep: bool = False,
                  nvcc: str = "nvcc", cache_dir: str = None,
                  target: str = "cubin") -> bytes:
    """
    Simple compilation without extern "C" wrapper.

    Parameters:
    - source: str, CUDA source code
    - options: list, compiler options
    - keep: bool, keep intermediate files
    - nvcc: str, path to nvcc compiler
    - cache_dir: str, cache directory
    - target: str, compilation target

    Returns:
    bytes: compiled binary
    """

def preprocess_source(source: str, options: list = None, nvcc: str = "nvcc") -> str:
    """
    Preprocess CUDA source code.

    Parameters:
    - source: str, CUDA source code
    - options: list, preprocessor options
    - nvcc: str, path to nvcc compiler

    Returns:
    str: preprocessed source code
    """

def get_nvcc_version(nvcc: str = "nvcc") -> tuple[int, int]:
    """
    Get NVCC compiler version.

    Parameters:
    - nvcc: str, path to nvcc compiler

    Returns:
    tuple: (major, minor) version numbers
    """
```

### Kernel Function Interface

Execute compiled kernel functions with various launch configurations.

```python { .api }
class Function:
    def __call__(self, *args, **kwargs) -> None:
        """
        Launch kernel function.

        Parameters:
        - args: kernel arguments (must match function signature)
        - block: tuple, block dimensions (x, y, z)
        - grid: tuple, grid dimensions (x, y, z)
        - stream: Stream, CUDA stream (optional)
        - shared: int, shared memory bytes (optional)
        - texrefs: list, texture references (optional)
        """

    def prepare(self, arg_types: list, block: tuple = None) -> PreparedFunction:
        """
        Prepare function for faster repeated launches.

        Parameters:
        - arg_types: list, argument type strings (e.g., ["P", "i", "f"])
        - block: tuple, default block dimensions (optional)

        Returns:
        PreparedFunction: prepared function for fast launches
        """

    @property
    def max_threads_per_block(self) -> int:
        """Maximum threads per block for this function."""

    @property
    def shared_size_bytes(self) -> int:
        """Shared memory size in bytes."""

    @property
    def const_size_bytes(self) -> int:
        """Constant memory size in bytes."""

    @property
    def local_size_bytes(self) -> int:
        """Local memory size in bytes."""

    @property
    def num_regs(self) -> int:
        """Number of registers used per thread."""

class PreparedFunction:
    def __call__(self, *args, **kwargs) -> None:
        """Launch prepared function."""

    def prepared_call(self, grid: tuple, *args) -> None:
        """
        Launch with grid dimensions.

        Parameters:
        - grid: tuple, grid dimensions (x, y, z)
        - args: kernel arguments
        """

    def prepared_async_call(self, grid: tuple, stream: Stream, *args) -> None:
        """
        Launch asynchronously in stream.

        Parameters:
        - grid: tuple, grid dimensions (x, y, z)
        - stream: Stream, CUDA stream
        - args: kernel arguments
        """

    def prepared_timed_call(self, grid: tuple, *args) -> float:
        """
        Launch and return execution time.

        Parameters:
        - grid: tuple, grid dimensions (x, y, z)
        - args: kernel arguments

        Returns:
        float: execution time in seconds
        """
```

### Texture Memory

Manage CUDA texture memory for optimized data access patterns.

```python { .api }
class TextureReference:
    def set_array(self, ary: Array) -> None:
        """
        Bind texture to CUDA array.

        Parameters:
        - ary: Array, CUDA array to bind
        """

    def set_address(self, devptr: DeviceAllocation, size: int) -> int:
        """
        Bind texture to linear memory.

        Parameters:
        - devptr: DeviceAllocation, device memory pointer
        - size: int, memory size in bytes

        Returns:
        int: texture offset in bytes
        """

    def set_format(self, fmt: int, num_components: int) -> None:
        """
        Set texture format.

        Parameters:
        - fmt: int, element format
        - num_components: int, number of components per element
        """

    def set_address_mode(self, dim: int, mode: int) -> None:
        """
        Set addressing mode for dimension.

        Parameters:
        - dim: int, dimension (0, 1, or 2)
        - mode: int, addressing mode
        """

    def set_filter_mode(self, mode: int) -> None:
        """
        Set filtering mode.

        Parameters:
        - mode: int, filter mode (point or linear)
        """

    def set_flags(self, flags: int) -> None:
        """
        Set texture flags.

        Parameters:
        - flags: int, texture flags
        """

def make_multichannel_2d_array(matrix: np.ndarray, order: str = "C") -> Array:
    """
    Create 2D CUDA array from matrix.

    Parameters:
    - matrix: numpy.ndarray, input matrix
    - order: str, memory order ("C" or "F")

    Returns:
    Array: CUDA array for texture binding
    """

class Array:
    def __init__(self, format: ArrayFormat, w: int, h: int = 0, d: int = 0):
        """
        Create CUDA array.

        Parameters:
        - format: ArrayFormat, array format
        - w: int, width
        - h: int, height (for 2D/3D arrays)
        - d: int, depth (for 3D arrays)
        """

    def free(self) -> None:
        """Free CUDA array memory."""
```

## Usage Examples

### Basic Kernel Compilation

```python
# Simple vector addition kernel
kernel_source = """
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
"""

# Compile module
mod = SourceModule(kernel_source)
vector_add = mod.get_function("vector_add")

# Launch kernel
vector_add(gpu_a, gpu_b, gpu_c, np.int32(n),
           block=(256, 1, 1), grid=((n + 255) // 256, 1))
```

### Prepared Function Example

```python
# Prepare function for repeated launches
prepared_add = vector_add.prepare(["P", "P", "P", "i"])

# Fast repeated launches
for i in range(100):
    prepared_add.prepared_call((grid_size, 1), gpu_a, gpu_b, gpu_c, np.int32(n))
```

### Template-based Dynamic Compilation

```python
template = """
#define BLOCK_SIZE ${block_size}

__global__ void process_data(float *data, int n) {
    __shared__ float cache[BLOCK_SIZE];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        cache[threadIdx.x] = data[idx];
        __syncthreads();

        // Process data...
        data[idx] = cache[threadIdx.x] * 2.0f;
    }
}
"""

# Create module with template substitution
from string import Template
source = Template(template).substitute(block_size=256)
mod = SourceModule(source)
```