# CUDA Interface

Direct interface to the CUDA runtime: memory management, stream processing, and custom kernel development for advanced GPU programming. Provides low-level access to CUDA features for performance optimization and custom computations.

## Capabilities

### Device Management

Functions and classes for managing CUDA devices and contexts.

```python { .api }
class cuda.Device:
    """
    CUDA device context manager.

    Parameters:
    - device: int, device ID
    """
    def __init__(self, device=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...
    @property
    def id(self): ...
    def synchronize(self): ...

def cuda.get_device_id():
    """
    Get current device ID.

    Returns:
    int, current CUDA device ID
    """

def cuda.is_available():
    """
    Check if CUDA is available.

    Returns:
    bool, True if CUDA is available
    """
```
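
As a minimal sketch of how these pieces fit together (assuming the `cupy` package imported as `cp`, as in the usage examples below):

```python
import cupy as cp

# Only touch the GPU when one is actually present
if cp.cuda.is_available():
    with cp.cuda.Device(0) as dev:
        x = cp.arange(10) ** 2   # allocated and computed on device 0
        dev.synchronize()        # block until all work on device 0 finishes
        print(dev.id)            # -> 0
```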


### Memory Management

Comprehensive GPU memory allocation and management with memory pools.

```python { .api }
class cuda.MemoryPointer:
    """
    Pointer to GPU memory.

    Parameters:
    - mem: Memory object
    - offset: int, byte offset from base
    """
    def __init__(self, mem, offset): ...
    @property
    def device(self): ...
    @property
    def ptr(self): ...
    def copy_from_device(self, src, size): ...
    def copy_from_host(self, src, size): ...
    def copy_to_host(self, dst, size): ...
    def memset(self, value, size): ...

class cuda.Memory:
    """
    GPU memory allocation.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...
    @property
    def device(self): ...

class cuda.MemoryPool:
    """
    GPU memory pool for efficient allocation.

    Parameters:
    - allocator: function, memory allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free(self, ptr, size): ...
    def free_all_blocks(self): ...
    def free_all_free(self): ...
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def total_bytes(self): ...
    def set_limit(self, size=None, fraction=None): ...
    def get_limit(self): ...

def cuda.alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    MemoryPointer, pointer to allocated memory
    """

def cuda.set_allocator(allocator=None):
    """
    Set memory allocator.

    Parameters:
    - allocator: function, allocator function or None for default
    """

def cuda.get_allocator():
    """
    Get current memory allocator.

    Returns:
    function, current allocator function
    """

class cuda.ManagedMemory:
    """
    Unified memory allocation accessible from CPU and GPU.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...

def cuda.malloc_managed(size):
    """
    Allocate unified/managed memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    MemoryPointer, pointer to managed memory
    """
```
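
A short sketch of routing all array allocations through a pool backed by managed memory (a standard CuPy pattern; assumes the `cupy` namespace used in the usage examples below):

```python
import cupy as cp

# Back a memory pool with managed (unified) memory and make it the
# default allocator for subsequent array allocations
pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
cp.cuda.set_allocator(pool.malloc)

a = cp.zeros(1024)                       # served from the managed pool
print(pool.used_bytes(), pool.total_bytes())

cp.cuda.set_allocator(None)              # restore the default allocator
```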


### Pinned Memory

CPU memory pinning for faster host-device transfers.

```python { .api }
class cuda.PinnedMemory:
    """
    Pinned (page-locked) host memory for fast transfers.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...

class cuda.PinnedMemoryPointer:
    """
    Pointer to pinned host memory.

    Parameters:
    - mem: PinnedMemory object
    - offset: int, byte offset
    """
    def __init__(self, mem, offset): ...

class cuda.PinnedMemoryPool:
    """
    Memory pool for pinned host memory.

    Parameters:
    - allocator: function, allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free_all_blocks(self): ...
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def total_bytes(self): ...

def cuda.alloc_pinned_memory(size):
    """
    Allocate pinned host memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    PinnedMemoryPointer, pointer to pinned memory
    """

def cuda.set_pinned_memory_allocator(allocator=None):
    """
    Set pinned memory allocator.

    Parameters:
    - allocator: function, allocator function or None
    """
```
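
For illustration, a pinned-memory pool can be installed behind `alloc_pinned_memory`, and the pinned buffer viewed as a NumPy array for staging (a sketch following the API above; assumes the `cupy` namespace):

```python
import cupy as cp
import numpy as np

# Route pinned allocations through a pool so repeated transfers reuse buffers
pinned_pool = cp.cuda.PinnedMemoryPool()
cp.cuda.set_pinned_memory_allocator(pinned_pool.malloc)

mem = cp.cuda.alloc_pinned_memory(4 * 1024)       # room for 1024 float32 values
host = np.frombuffer(mem, np.float32, 1024)       # NumPy view of the pinned buffer
host[:] = np.arange(1024, dtype=np.float32)
dev = cp.asarray(host)                            # fast host-to-device copy
```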


### Streams and Events

Asynchronous execution control with CUDA streams and events.

```python { .api }
class cuda.Stream:
    """
    CUDA stream for asynchronous operations.

    Parameters:
    - null: bool, create null stream
    - non_blocking: bool, create non-blocking stream
    - priority: int, stream priority
    """
    def __init__(self, null=False, non_blocking=False, priority=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...
    @property
    def ptr(self): ...
    def synchronize(self): ...
    def add_callback(self, callback, arg): ...
    def record(self, event=None): ...
    def wait_event(self, event): ...

class cuda.Event:
    """
    CUDA event for synchronization and timing.

    Parameters:
    - block: bool, blocking event
    - disable_timing: bool, disable timing capability
    - interprocess: bool, enable interprocess sharing
    """
    def __init__(self, block=False, disable_timing=False, interprocess=False): ...
    @property
    def ptr(self): ...
    def record(self, stream=None): ...
    def synchronize(self): ...
    def query(self): ...
    def elapsed_time(self, end_event): ...

def cuda.get_current_stream():
    """
    Get current CUDA stream.

    Returns:
    Stream, current stream object
    """

def cuda.get_elapsed_time(start_event, end_event):
    """
    Get elapsed time between events.

    Parameters:
    - start_event: Event, start event
    - end_event: Event, end event

    Returns:
    float, elapsed time in milliseconds
    """

class cuda.ExternalStream:
    """
    Wrap external CUDA stream pointer.

    Parameters:
    - ptr: int, CUDA stream pointer
    """
    def __init__(self, ptr): ...
```
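
A small timing sketch using the function form `cuda.get_elapsed_time` (the method form appears in the usage examples further down; `cp` is assumed to be the `cupy` namespace):

```python
import cupy as cp

stream = cp.cuda.Stream(non_blocking=True)
start, end = cp.cuda.Event(), cp.cuda.Event()

with stream:
    start.record()   # records on the current (non-blocking) stream
    y = cp.random.random((2000, 2000)) @ cp.random.random((2000, 2000))
    end.record()

end.synchronize()
print(f"{cp.cuda.get_elapsed_time(start, end):.2f} ms")
```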


### Custom Kernels

Support for user-defined CUDA kernels and GPU code compilation.

```python { .api }
class ElementwiseKernel:
    """
    User-defined elementwise CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, CUDA C++ code for element operation
    - name: str, kernel name
    - reduce_dims: bool, reduce dimensions
    - return_tuple: bool, return tuple of outputs
    - no_return: bool, no return value
    - preamble: str, code before kernel
    - loop_prep: str, code before loop
    - after_loop: str, code after loop
    - options: tuple, compiler options
    """
    def __init__(self, in_params, out_params, operation, name='kernel', **kwargs): ...
    def __call__(self, *args, **kwargs): ...

class ReductionKernel:
    """
    User-defined reduction CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, mapping expression
    - reduce_expr: str, reduction expression
    - post_map_expr: str, post-mapping expression
    - identity: str, identity value
    - name: str, kernel name
    - reduce_type: str, reduction data type
    - reduce_dims: bool, reduce dimensions
    - preamble: str, code before kernel
    - options: tuple, compiler options
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr, identity, name='kernel', **kwargs): ...
    def __call__(self, *args, **kwargs): ...

class RawKernel:
    """
    Raw CUDA kernel from source code.

    Parameters:
    - code: str, CUDA C++ source code
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, backend ('nvcc' or 'nvrtc')
    - translate_cucomplex: bool, translate cuComplex types
    """
    def __init__(self, code, name, options=(), backend='auto', translate_cucomplex=True): ...
    def __call__(self, grid, block, args, **kwargs): ...

class RawModule:
    """
    CUDA module containing multiple kernels.

    Parameters:
    - code: str, CUDA C++ source code
    - options: tuple, compiler options
    - backend: str, backend ('nvcc' or 'nvrtc')
    - name_expressions: tuple, kernel name expressions
    - log_stream: stream, compilation log output
    - translate_cucomplex: bool, translate cuComplex types
    """
    def __init__(self, code, options=(), backend='auto', **kwargs): ...
    def get_function(self, name): ...

def compile_with_cache(source, options=(), arch=None, cache_dir=None, **kwargs):
    """
    Compile CUDA source with caching.

    Parameters:
    - source: str, CUDA source code
    - options: tuple, compiler options
    - arch: str, target architecture
    - cache_dir: str, cache directory

    Returns:
    RawModule, compiled module
    """
```
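
`RawModule` groups several kernels compiled from one source; `get_function` retrieves each by name. A minimal sketch (the kernel names `scale` and `shift` are illustrative):

```python
import cupy as cp

module_code = r'''
extern "C" __global__ void scale(float* x, float f, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= f;
}
extern "C" __global__ void shift(float* x, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] += s;
}
'''

mod = cp.RawModule(code=module_code)
scale = mod.get_function('scale')
shift = mod.get_function('shift')

x = cp.ones(256, dtype=cp.float32)
scale((1,), (256,), (x, cp.float32(3.0), cp.int32(256)))
shift((1,), (256,), (x, cp.float32(1.0), cp.int32(256)))  # x is now 4.0 everywhere
```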


### Context Managers

Utility context managers for resource management.

```python { .api }
def cuda.using_allocator(allocator=None):
    """
    Context manager for temporary allocator change.

    Parameters:
    - allocator: function, allocator to use temporarily

    Returns:
    context manager
    """

def cuda.profile():
    """
    Context manager for CUDA profiling (deprecated).

    Returns:
    context manager
    """
```
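
`using_allocator` scopes an allocator change to a block, which is safer than a global `set_allocator` call. A brief sketch (assuming the `cupy` namespace):

```python
import cupy as cp

managed_pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)

# Arrays created inside the block come from the managed pool;
# the previous allocator is restored on exit
with cp.cuda.using_allocator(managed_pool.malloc):
    tmp = cp.empty(10**6, dtype=cp.float32)

regular = cp.empty(10)   # allocated with the default allocator again
```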


## Usage Examples

### Basic Device and Memory Management

```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print(f"CUDA devices: {cp.cuda.runtime.getDeviceCount()}")
    print(f"Current device: {cp.cuda.get_device_id()}")

# Use specific device
with cp.cuda.Device(0):
    # All operations on device 0
    a = cp.array([1, 2, 3, 4, 5])

# Memory pool management
mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes()} bytes")
print(f"Total: {mempool.total_bytes()} bytes")

# Free unused memory
mempool.free_all_free()

# Set memory limit (50% of GPU memory)
mempool.set_limit(fraction=0.5)
```


### Stream-based Asynchronous Processing

```python
import cupy as cp

# Create streams
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create events for synchronization
event1 = cp.cuda.Event()
event2 = cp.cuda.Event()

# Asynchronous operations
with stream1:
    a = cp.random.random((1000, 1000))
    cp.cuda.get_current_stream().record(event1)

with stream2:
    # Wait for stream1 to complete
    cp.cuda.get_current_stream().wait_event(event1)
    b = cp.random.random((1000, 1000))

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Measure execution time
start = cp.cuda.Event()
end = cp.cuda.Event()

start.record()
result = cp.dot(a, b.T)
end.record()
end.synchronize()

elapsed_time = start.elapsed_time(end)
print(f"Execution time: {elapsed_time} ms")
```


### Custom Kernel Development

```python
import cupy as cp

# ElementwiseKernel for simple operations
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    'z = x + y * 2',
    'add_scaled'
)

a = cp.random.random(1000000, dtype=cp.float32)
b = cp.random.random(1000000, dtype=cp.float32)
result = add_kernel(a, b)

# ReductionKernel for reductions
sum_kernel = cp.ReductionKernel(
    'float32 x',    # in_params
    'float32 sum',  # out_params
    'x',            # map_expr: pass each element through
    'a + b',        # reduce_expr: combine two partial results
    'sum = a',      # post_map_expr: write the reduced value
    '0',            # identity
    'custom_sum'
)

total = sum_kernel(a)

# RawKernel for complex operations
raw_kernel_code = r'''
extern "C" __global__
void matrix_multiply(float* a, float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < n && j < n) {
        float sum = 0.0f;
        for (int k = 0; k < n; k++) {
            sum += a[i * n + k] * b[k * n + j];
        }
        c[i * n + j] = sum;
    }
}
'''

kernel = cp.RawKernel(raw_kernel_code, 'matrix_multiply')

# Execute raw kernel
n = 512
a = cp.random.random((n, n), dtype=cp.float32)
b = cp.random.random((n, n), dtype=cp.float32)
c = cp.zeros((n, n), dtype=cp.float32)

block_size = (16, 16)
grid_size = ((n + block_size[0] - 1) // block_size[0],
             (n + block_size[1] - 1) // block_size[1])

# Pass the scalar with an explicit width so it matches the kernel's `int n`
kernel(grid_size, block_size, (a, b, c, cp.int32(n)))
```


### Memory Transfer Optimization

```python
import cupy as cp
import numpy as np

# Use pinned memory for faster transfers
pinned_mempool = cp.get_default_pinned_memory_pool()

# Create large CPU array
cpu_array = np.random.random((10000, 10000)).astype(np.float32)

# Transfer on a dedicated stream
with cp.cuda.Stream() as stream:
    # Transfers and kernels issued inside this block use `stream`
    gpu_array = cp.asarray(cpu_array)

    # Process on GPU
    result = cp.fft.fft2(gpu_array)

    # Transfer back to CPU
    cpu_result = cp.asnumpy(result, stream=stream)

# Explicit pinned memory usage: stage the host data in page-locked memory
# so the host-to-device copy can run asynchronously
pinned_mem = cp.cuda.alloc_pinned_memory(cpu_array.nbytes)
pinned_view = np.frombuffer(pinned_mem, cpu_array.dtype,
                            cpu_array.size).reshape(cpu_array.shape)
pinned_view[...] = cpu_array          # copy CPU array into pinned memory
gpu_array = cp.asarray(pinned_view)   # then copy pinned memory to the GPU
```