# Kernel Compilation

Dynamic CUDA kernel compilation with source code generation, caching, and module management for both inline and file-based CUDA source code. PyCUDA enables runtime compilation of CUDA C/C++ code directly from Python.

## Capabilities

### Source Module Compilation

Compile CUDA source code into executable modules with automatic error handling and caching.

```python { .api }
class SourceModule:
    def __init__(self, source: str, nvcc: str = "nvcc", options: list = None,
                 keep: bool = False, no_extern_c: bool = False,
                 arch: str = None, code: str = None, cache_dir: str = None,
                 include_dirs: list = None):
        """
        Compile CUDA source code into module.

        Parameters:
        - source: str, CUDA C/C++ source code
        - nvcc: str, path to nvcc compiler
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files
        - no_extern_c: bool, disable extern "C" wrapper
        - arch: str, target architecture (e.g., "sm_50")
        - code: str, target code generation (e.g., "compute_50")
        - cache_dir: str, directory for caching compiled modules
        - include_dirs: list, additional include directories
        """

    def get_function(self, name: str) -> Function:
        """
        Get kernel function from module.

        Parameters:
        - name: str, function name in CUDA source

        Returns:
        Function: callable kernel function
        """

    def get_global(self, name: str) -> tuple[DeviceAllocation, int]:
        """
        Get global variable from module.

        Parameters:
        - name: str, variable name in CUDA source

        Returns:
        tuple: (device_pointer, size_in_bytes)
        """

    def get_texref(self, name: str) -> TextureReference:
        """
        Get texture reference from module.

        Parameters:
        - name: str, texture reference name

        Returns:
        TextureReference: texture reference object
        """
```

### Dynamic Module Generation

Generate CUDA modules programmatically with dynamic source generation.

```python { .api }
class DynamicModule:
    def __init__(self, template: str = None):
        """
        Create dynamic module with optional template.

        Parameters:
        - template: str, template source code (optional)
        """

    def add_to_preamble(self, pa: str) -> None:
        """
        Add code to module preamble.

        Parameters:
        - pa: str, code to add to preamble
        """

    def add_function(self, func: DynamicFunction) -> None:
        """
        Add function to module.

        Parameters:
        - func: DynamicFunction, function to add
        """

    def compile(self, nvcc: str = "nvcc", options: list = None,
                keep: bool = False, no_extern_c: bool = False) -> CudaModule:
        """
        Compile dynamic module.

        Parameters:
        - nvcc: str, path to nvcc compiler
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files
        - no_extern_c: bool, disable extern "C" wrapper

        Returns:
        CudaModule: compiled module
        """

class DynamicSourceModule(DynamicModule):
    def __init__(self, template: str = None, nvcc: str = "nvcc",
                 options: list = None, keep: bool = False,
                 no_extern_c: bool = False, arch: str = None,
                 code: str = None, cache_dir: str = None):
        """
        Dynamic module that compiles automatically.

        Parameters:
        - template: str, template source code (optional)
        - nvcc: str, path to nvcc compiler
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files
        - no_extern_c: bool, disable extern "C" wrapper
        - arch: str, target architecture
        - code: str, target code generation
        - cache_dir: str, caching directory
        """
```

### Compilation Functions

Low-level compilation functions for advanced use cases.

```python { .api }
def compile(source: str, nvcc: str = "nvcc", options: list = None,
            keep: bool = False, no_extern_c: bool = False,
            arch: str = None, code: str = None, cache_dir: str = None,
            include_dirs: list = None, target: str = "cubin") -> bytes:
    """
    Compile CUDA source to binary.

    Parameters:
    - source: str, CUDA source code
    - nvcc: str, path to nvcc compiler
    - options: list, compiler options
    - keep: bool, keep intermediate files
    - no_extern_c: bool, disable extern "C" wrapper
    - arch: str, target architecture
    - code: str, target code generation
    - cache_dir: str, cache directory
    - include_dirs: list, include directories
    - target: str, compilation target ("cubin", "ptx", "fatbin")

    Returns:
    bytes: compiled binary
    """

def compile_plain(source: str, options: list = None, keep: bool = False,
                  nvcc: str = "nvcc", cache_dir: str = None,
                  target: str = "cubin") -> bytes:
    """
    Simple compilation without extern "C" wrapper.

    Parameters:
    - source: str, CUDA source code
    - options: list, compiler options
    - keep: bool, keep intermediate files
    - nvcc: str, path to nvcc compiler
    - cache_dir: str, cache directory
    - target: str, compilation target

    Returns:
    bytes: compiled binary
    """

def preprocess_source(source: str, options: list = None, nvcc: str = "nvcc") -> str:
    """
    Preprocess CUDA source code.

    Parameters:
    - source: str, CUDA source code
    - options: list, preprocessor options
    - nvcc: str, path to nvcc compiler

    Returns:
    str: preprocessed source code
    """

def get_nvcc_version(nvcc: str = "nvcc") -> tuple[int, int]:
    """
    Get NVCC compiler version.

    Parameters:
    - nvcc: str, path to nvcc compiler

    Returns:
    tuple: (major, minor) version numbers
    """
```

### Kernel Function Interface

Execute compiled kernel functions with various launch configurations.

```python { .api }
class Function:
    def __call__(self, *args, **kwargs) -> None:
        """
        Launch kernel function.

        Parameters:
        - args: kernel arguments (must match function signature)
        - block: tuple, block dimensions (x, y, z)
        - grid: tuple, grid dimensions (x, y, z)
        - stream: Stream, CUDA stream (optional)
        - shared: int, shared memory bytes (optional)
        - texrefs: list, texture references (optional)
        """

    def prepare(self, arg_types: list, block: tuple = None) -> PreparedFunction:
        """
        Prepare function for faster repeated launches.

        Parameters:
        - arg_types: list, argument type strings (e.g., ["P", "i", "f"])
        - block: tuple, default block dimensions (optional)

        Returns:
        PreparedFunction: prepared function for fast launches
        """

    @property
    def max_threads_per_block(self) -> int:
        """Maximum threads per block for this function."""

    @property
    def shared_size_bytes(self) -> int:
        """Shared memory size in bytes."""

    @property
    def const_size_bytes(self) -> int:
        """Constant memory size in bytes."""

    @property
    def local_size_bytes(self) -> int:
        """Local memory size in bytes."""

    @property
    def num_regs(self) -> int:
        """Number of registers used per thread."""

class PreparedFunction:
    def __call__(self, *args, **kwargs) -> None:
        """Launch prepared function."""

    def prepared_call(self, grid: tuple, *args) -> None:
        """
        Launch with grid dimensions.

        Parameters:
        - grid: tuple, grid dimensions (x, y, z)
        - args: kernel arguments
        """

    def prepared_async_call(self, grid: tuple, stream: Stream, *args) -> None:
        """
        Launch asynchronously in stream.

        Parameters:
        - grid: tuple, grid dimensions (x, y, z)
        - stream: Stream, CUDA stream
        - args: kernel arguments
        """

    def prepared_timed_call(self, grid: tuple, *args) -> float:
        """
        Launch and return execution time.

        Parameters:
        - grid: tuple, grid dimensions (x, y, z)
        - args: kernel arguments

        Returns:
        float: execution time in seconds
        """
```

### Texture Memory

Manage CUDA texture memory for optimized data access patterns.

```python { .api }
class TextureReference:
    def set_array(self, ary: Array) -> None:
        """
        Bind texture to CUDA array.

        Parameters:
        - ary: Array, CUDA array to bind
        """

    def set_address(self, devptr: DeviceAllocation, size: int) -> int:
        """
        Bind texture to linear memory.

        Parameters:
        - devptr: DeviceAllocation, device memory pointer
        - size: int, memory size in bytes

        Returns:
        int: texture offset in bytes
        """

    def set_format(self, fmt: int, num_components: int) -> None:
        """
        Set texture format.

        Parameters:
        - fmt: int, element format
        - num_components: int, number of components per element
        """

    def set_address_mode(self, dim: int, mode: int) -> None:
        """
        Set addressing mode for dimension.

        Parameters:
        - dim: int, dimension (0, 1, or 2)
        - mode: int, addressing mode
        """

    def set_filter_mode(self, mode: int) -> None:
        """
        Set filtering mode.

        Parameters:
        - mode: int, filter mode (point or linear)
        """

    def set_flags(self, flags: int) -> None:
        """
        Set texture flags.

        Parameters:
        - flags: int, texture flags
        """

def make_multichannel_2d_array(matrix: np.ndarray, order: str = "C") -> Array:
    """
    Create 2D CUDA array from matrix.

    Parameters:
    - matrix: numpy.ndarray, input matrix
    - order: str, memory order ("C" or "F")

    Returns:
    Array: CUDA array for texture binding
    """

class Array:
    def __init__(self, format: ArrayFormat, w: int, h: int = 0, d: int = 0):
        """
        Create CUDA array.

        Parameters:
        - format: ArrayFormat, array format
        - w: int, width
        - h: int, height (for 2D/3D arrays)
        - d: int, depth (for 3D arrays)
        """

    def free(self) -> None:
        """Free CUDA array memory."""
```

## Usage Examples

### Basic Kernel Compilation

```python
# Simple vector addition kernel
kernel_source = """
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
"""

# Compile module
mod = SourceModule(kernel_source)
vector_add = mod.get_function("vector_add")

# Launch kernel
vector_add(gpu_a, gpu_b, gpu_c, np.int32(n),
           block=(256, 1, 1), grid=((n + 255) // 256, 1))
```

### Prepared Function Example

```python
# Prepare function for repeated launches
prepared_add = vector_add.prepare(["P", "P", "P", "i"])

# Fast repeated launches
for i in range(100):
    prepared_add.prepared_call((grid_size, 1), gpu_a, gpu_b, gpu_c, np.int32(n))
```

### Template-based Dynamic Compilation

```python
template = """
#define BLOCK_SIZE ${block_size}

__global__ void process_data(float *data, int n) {
    __shared__ float cache[BLOCK_SIZE];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        cache[threadIdx.x] = data[idx];
        __syncthreads();

        // Process data...
        data[idx] = cache[threadIdx.x] * 2.0f;
    }
}
"""

# Create module with template substitution
from string import Template
source = Template(template).substitute(block_size=256)
mod = SourceModule(source)
```