
# CUDA Interface

Direct access to CUDA functionality for fine-grained GPU control, memory management, device handling, and performance optimization. Provides low-level CUDA operations while maintaining Python integration.

## Capabilities

### Device Management

```python { .api }
def is_available():
    """
    Check if CUDA is available.

    Returns:
        bool: True if CUDA devices are available
    """

def get_device_id():
    """Get current device ID."""

class Device:
    """
    CUDA device context manager.

    Parameters:
    - device: device ID or None for current device
    """
    def __init__(self, device=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...

def get_cublas_handle():
    """Get cuBLAS handle for current device."""
```

### Memory Management

```python { .api }
def alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: size in bytes

    Returns:
        MemoryPointer: pointer to allocated memory
    """

class Memory:
    """GPU memory object."""
    def __init__(self): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...

class MemoryPointer:
    """Pointer to GPU memory."""
    def __init__(self): ...
    @property
    def device(self): ...

class MemoryPool:
    """
    Memory pool for GPU memory allocation.

    Parameters:
    - allocator: memory allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free(self, ptr, size): ...
    def free_all_blocks(self): ...
    def free_all_free(self): ...
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def free_bytes(self): ...
    def total_bytes(self): ...

class MemoryAsync:
    """Asynchronous memory allocation."""

class MemoryAsyncPool:
    """Asynchronous memory pool."""
    def __init__(self): ...

class ManagedMemory:
    """CUDA managed memory allocation."""

class UnownedMemory:
    """Reference to unowned memory."""

class BaseMemory:
    """Base class for memory objects."""

def malloc_managed(size, device=None):
    """Allocate managed memory."""

def malloc_async(size, stream=None):
    """Allocate memory asynchronously."""

def set_allocator(allocator):
    """Set default memory allocator."""

def get_allocator():
    """Get current memory allocator."""

class PythonFunctionAllocator:
    """Python function-based allocator."""
    def __init__(self, func, arg): ...

class CFunctionAllocator:
    """C function-based allocator."""
    def __init__(self, func_ptr, arg_ptr): ...
```
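
The pool and allocator APIs above compose: a `MemoryPool`'s bound `malloc` can be installed process-wide with `set_allocator`. A minimal sketch, mirroring the managed-memory pool pattern from CuPy's own documentation:

```python
import cupy as cp

# Back a pool with managed (unified) memory and make it the default allocator
pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
cp.cuda.set_allocator(pool.malloc)

a = cp.arange(10)  # this allocation now comes from `pool`
print(pool.used_bytes(), pool.total_bytes())
```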

### Pinned Memory

```python { .api }
def alloc_pinned_memory(size):
    """
    Allocate pinned (page-locked) memory.

    Parameters:
    - size: size in bytes

    Returns:
        PinnedMemoryPointer: pointer to pinned memory
    """

class PinnedMemory:
    """Pinned memory object."""

class PinnedMemoryPointer:
    """Pointer to pinned memory."""

class PinnedMemoryPool:
    """
    Memory pool for pinned memory.

    Parameters:
    - allocator: memory allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free(self, ptr, size): ...

def set_pinned_memory_allocator(allocator):
    """Set pinned memory allocator."""
```
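
Pinned pools plug in the same way on the host side via `set_pinned_memory_allocator`; a minimal sketch:

```python
import cupy as cp
import numpy as np

# Serve host-side staging buffers from a pinned (page-locked) pool
pinned_pool = cp.cuda.PinnedMemoryPool()
cp.cuda.set_pinned_memory_allocator(pinned_pool.malloc)

# Host-to-device copies now stage through pinned buffers
x = cp.asarray(np.arange(1 << 20, dtype=np.float32))
print(pinned_pool.n_free_blocks())
```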

### Streams and Events

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous operations.

    Parameters:
    - null: whether to use null stream
    - non_blocking: whether stream is non-blocking
    - ptds: per-thread default stream
    """
    def __init__(self, null=False, non_blocking=False, ptds=False): ...
    def synchronize(self): ...
    def add_callback(self, callback, arg): ...
    def record(self, event=None): ...
    def wait_event(self, event): ...
    @property
    def ptr(self): ...

class ExternalStream:
    """
    External CUDA stream wrapper.

    Parameters:
    - ptr: stream pointer
    """
    def __init__(self, ptr): ...

class Event:
    """
    CUDA event for timing and synchronization.

    Parameters:
    - blocking: whether event blocks
    - disable_timing: disable timing capability
    - interprocess: enable interprocess sharing
    """
    def __init__(self, blocking=False, disable_timing=False, interprocess=False): ...
    def record(self, stream=None): ...
    def synchronize(self): ...
    def query(self): ...
    def elapsed_time(self, end_event): ...

def get_current_stream():
    """Get current CUDA stream."""

def get_elapsed_time(start_event, end_event):
    """Get elapsed time between events."""
```

### Kernel Compilation and Execution

```python { .api }
class Function:
    """CUDA function object."""
    def __init__(self): ...
    def __call__(self, grid, block, args, **kwargs): ...

class Module:
    """CUDA module object."""
    def __init__(self): ...
    def get_function(self, name): ...

def compile_with_cache(source, options=(), arch=None, cache_dir=None,
                       prepend_cupy_headers=True, backend='nvcc',
                       translate_cucomplex=True, enable_cooperative_groups=False,
                       name_expressions=None, log_stream=None,
                       cache_in_memory=False, jitify=False):
    """
    Compile CUDA source code with caching.

    Parameters:
    - source: CUDA source code
    - options: compiler options
    - arch: target architecture
    - cache_dir: cache directory
    - prepend_cupy_headers: whether to prepend CuPy headers
    - backend: compiler backend
    - translate_cucomplex: translate cuComplex types
    - enable_cooperative_groups: enable cooperative groups
    - name_expressions: name expressions for template kernel instantiations
    - log_stream: log stream for compilation messages
    - cache_in_memory: cache compiled modules in memory
    - jitify: use Jitify for compilation

    Returns:
        Module: compiled CUDA module
    """
```

### Context Management

```python { .api }
def using_allocator(allocator=None):
    """
    Context manager for using a specific allocator.

    Parameters:
    - allocator: memory allocator function

    Returns:
        context manager
    """
```
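
Unlike `set_allocator`, `using_allocator` scopes the allocator to a block; a minimal sketch that isolates scratch allocations in their own pool:

```python
import cupy as cp

pool = cp.cuda.MemoryPool()  # a pool whose usage we can track in isolation

with cp.cuda.using_allocator(pool.malloc):
    tmp = cp.zeros((1024, 1024))  # allocated from `pool`

print(f"Scratch pool holds {pool.used_bytes()} bytes")
```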

### Memory Hooks

```python { .api }
class MemoryHook:
    """
    Base class for memory allocation hooks.

    Instances are activated as context managers; each hook method
    receives keyword arguments such as device_id, mem_size, and mem_ptr.
    """
    def alloc_preprocess(self, **kwargs): ...
    def alloc_postprocess(self, **kwargs): ...
    def free_preprocess(self, **kwargs): ...
    def free_postprocess(self, **kwargs): ...
```

### Library Interfaces

```python { .api }
# Sub-modules providing CUDA library access
import cupy.cuda.driver    # CUDA Driver API
import cupy.cuda.runtime   # CUDA Runtime API
import cupy.cuda.cublas    # cuBLAS library
import cupy.cuda.curand    # cuRAND library
import cupy.cuda.cusolver  # cuSOLVER library
import cupy.cuda.cusparse  # cuSPARSE library
import cupy.cuda.nvrtc     # NVRTC library
import cupy.cuda.profiler  # CUDA Profiler
import cupy.cuda.nvtx      # NVIDIA Tools Extension (optional)
import cupy.cuda.thrust    # Thrust library (optional)
import cupy.cuda.cub       # CUB library
import cupy.cuda.jitify    # Jitify library (optional)
```
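
These sub-modules are thin wrappers over the corresponding C APIs. As an illustration, a few `cupy.cuda.runtime` calls that are useful for introspection (a sketch using documented wrappers; `getDeviceProperties` returns a dict of device attributes):

```python
import cupy as cp

ndev = cp.cuda.runtime.getDeviceCount()
props = cp.cuda.runtime.getDeviceProperties(0)  # dict of device attributes
free, total = cp.cuda.runtime.memGetInfo()      # free/total bytes on current device

print(f"{ndev} device(s); device 0: {props['name']}")
print(f"Memory: {free} / {total} bytes free")
```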

### Environment Information

```python { .api }
def get_cuda_path():
    """Get CUDA installation path."""

def get_nvcc_path():
    """Get NVCC compiler path."""

def get_rocm_path():
    """Get ROCm installation path."""

def get_hipcc_path():
    """Get HIPCC compiler path."""
```
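
These helpers are handy when diagnosing toolchain problems; a minimal sketch, assuming the functions are exposed under `cp.cuda` as listed above (each may return None when the component is not found):

```python
import cupy as cp

print(f"CUDA path: {cp.cuda.get_cuda_path()}")
print(f"nvcc path: {cp.cuda.get_nvcc_path()}")
```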

## Usage Examples

### Device Management

```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print("CUDA is available")
    device_count = cp.cuda.runtime.getDeviceCount()
    print(f"Number of devices: {device_count}")
else:
    print("CUDA is not available")

# Get current device
current_device = cp.cuda.get_device_id()
print(f"Current device: {current_device}")

# Use a specific device
with cp.cuda.Device(1):  # Use device 1
    data = cp.random.random((1000, 1000))
    result = cp.sum(data)
    print(f"Computed on device: {cp.cuda.get_device_id()}")
```

### Memory Management

```python
import cupy as cp

# Get the default memory pool
mempool = cp.get_default_memory_pool()

# Check memory usage
print(f"Used bytes: {mempool.used_bytes()}")
print(f"Total bytes: {mempool.total_bytes()}")

# Allocate raw memory
raw_memory = cp.cuda.alloc(1024 * 1024)  # 1 MB
print(f"Allocated memory at: {raw_memory.ptr}")

# Use a custom allocator that logs and delegates to a dedicated pool
pool = cp.cuda.MemoryPool()

def custom_allocator(size):
    print(f"Allocating {size} bytes")
    return pool.malloc(size)

with cp.cuda.using_allocator(custom_allocator):
    array = cp.zeros(1000)  # Uses the custom allocator

# Release cached blocks back to the device
mempool.free_all_blocks()  # free_all_free() is a deprecated alias
```

### Pinned Memory

```python
import cupy as cp
import numpy as np

# Allocate pinned memory for faster host-device transfers
pinned_mem = cp.cuda.alloc_pinned_memory(1000 * 8)  # 1000 float64 values

# Wrap the pinned buffer in a NumPy array
pinned_array = np.frombuffer(pinned_mem, dtype=np.float64)
pinned_array[:] = np.random.random(1000)

# Transfer to GPU (faster from pinned memory)
gpu_array = cp.asarray(pinned_array)

# Pinned memory pool
pinned_pool = cp.get_default_pinned_memory_pool()
print(f"Free pinned blocks: {pinned_pool.n_free_blocks()}")
```

### Streams and Asynchronous Operations

```python
import cupy as cp

# Create CUDA streams
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create events for timing
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

# Asynchronous operations
with stream1:
    start_event.record()

    # Compute on stream1
    data1 = cp.random.random((5000, 5000))
    result1 = cp.linalg.svd(data1)

    end_event.record()

with stream2:
    # Compute on stream2 concurrently
    data2 = cp.random.random((3000, 3000))
    result2 = cp.fft.fft2(data2)

# Wait for completion and get timing
stream1.synchronize()
stream2.synchronize()

elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)
print(f"Stream1 computation took: {elapsed_time} ms")
```
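
Events can also order work across streams without blocking the host: `Stream.wait_event` makes work launched afterwards in one stream wait for an event recorded in another. A minimal sketch:

```python
import cupy as cp

producer = cp.cuda.Stream()
consumer = cp.cuda.Stream()
ready = cp.cuda.Event()

with producer:
    x = cp.random.random((2000, 2000))
    ready.record()  # marks the point where x is fully written

consumer.wait_event(ready)  # work launched on consumer after this waits for `ready`
with consumer:
    y = x * 2.0

consumer.synchronize()
```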

### Custom CUDA Kernels

```python
import cupy as cp

# Simple CUDA kernel source
kernel_source = '''
extern "C" __global__
void vector_add(float* a, float* b, float* c, int n) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

# Compile kernel
module = cp.cuda.compile_with_cache(kernel_source)
kernel = module.get_function('vector_add')

# Prepare data
n = 1000000
a = cp.random.random(n, dtype=cp.float32)
b = cp.random.random(n, dtype=cp.float32)
c = cp.zeros(n, dtype=cp.float32)

# Launch kernel
block_size = 256
grid_size = (n + block_size - 1) // block_size

# Pass the scalar as a 32-bit int to match the kernel's `int n` parameter
kernel((grid_size,), (block_size,), (a, b, c, cp.int32(n)))

# Verify result
expected = a + b
error = cp.linalg.norm(c - expected)
print(f"Kernel result error: {error}")
```
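
`compile_with_cache` is deprecated in recent CuPy releases; `cupy.RawModule` (or `cupy.RawKernel`) is the modern route to the same result. A minimal sketch of the equivalent vector-add:

```python
import cupy as cp

source = r'''
extern "C" __global__
void vector_add(const float* a, const float* b, float* c, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}
'''

# RawModule compiles via NVRTC and caches the binary automatically
module = cp.RawModule(code=source)
kernel = module.get_function('vector_add')

n = 1000
a = cp.random.random(n, dtype=cp.float32)
b = cp.random.random(n, dtype=cp.float32)
c = cp.zeros(n, dtype=cp.float32)

# Same launch convention: grid dims, block dims, argument tuple
kernel(((n + 255) // 256,), (256,), (a, b, c, cp.int32(n)))
assert cp.allclose(c, a + b)
```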

### Memory Hooks for Profiling

```python
import cupy as cp

class ProfilingHook(cp.cuda.MemoryHook):
    name = 'ProfilingHook'

    def __init__(self):
        self.alloc_count = 0
        self.free_count = 0
        self.total_allocated = 0

    def alloc_preprocess(self, **kwargs):
        size = kwargs.get('mem_size', 0)
        self.alloc_count += 1
        self.total_allocated += size
        print(f"Allocating {size} bytes (total: {self.total_allocated})")

    def free_preprocess(self, **kwargs):
        self.free_count += 1
        print(f"Freeing memory (free count: {self.free_count})")

# A hook is active inside its context manager
hook = ProfilingHook()
with hook:
    # Operations will now be logged
    data = cp.random.random((1000, 1000))
    result = cp.sum(data)
    del data, result  # Trigger memory free

print(f"Allocations: {hook.alloc_count}, Frees: {hook.free_count}")
```
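
CuPy also ships ready-made hooks in `cupy.cuda.memory_hooks`; `LineProfileHook`, for instance, attributes allocations to the Python call sites that triggered them. A minimal sketch:

```python
import cupy as cp
from cupy.cuda import memory_hooks

# Attribute GPU allocations to the Python lines that caused them
with memory_hooks.LineProfileHook() as hook:
    a = cp.zeros((1000, 1000))
    b = a * 3.0

hook.print_report()
```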

### Multi-GPU Operations

```python
import cupy as cp

# Check available devices
device_count = cp.cuda.runtime.getDeviceCount()
print(f"Available devices: {device_count}")

if device_count > 1:
    # Split computation across multiple GPUs
    data = cp.random.random((10000, 10000))  # lives on the current device (0)

    # Split data
    mid = data.shape[0] // 2

    # Process first half on device 0
    with cp.cuda.Device(0):
        data1 = data[:mid].copy()
        result1 = cp.linalg.svd(data1, compute_uv=False)

    # Process second half on device 1
    with cp.cuda.Device(1):
        data2 = cp.asarray(data[mid:])  # copy across devices
        result2 = cp.linalg.svd(data2, compute_uv=False)

    # Combine results on device 0 (result2 must be moved back first)
    with cp.cuda.Device(0):
        combined_result = cp.concatenate([result1, cp.asarray(result2)])
```

### Performance Profiling

```python
import cupy as cp

# The profile() context manager is deprecated (use cupyx.profiler instead):
# with cp.cuda.profile():
#     ...  # operations to profile

# Manual timing with events
def time_operation(func, *args, **kwargs):
    start = cp.cuda.Event()
    end = cp.cuda.Event()

    start.record()
    result = func(*args, **kwargs)
    end.record()

    end.synchronize()
    elapsed = cp.cuda.get_elapsed_time(start, end)
    return result, elapsed

# Time different operations
data = cp.random.random((5000, 5000))

svd_result, svd_time = time_operation(cp.linalg.svd, data, compute_uv=False)
fft_result, fft_time = time_operation(cp.fft.fft2, data)

print(f"SVD time: {svd_time:.2f} ms")
print(f"FFT time: {fft_time:.2f} ms")
```
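
For repeatable numbers, recent CuPy provides `cupyx.profiler.benchmark`, which warms up and then times both the CPU and GPU sides over many runs; a minimal sketch:

```python
import cupy as cp
from cupyx.profiler import benchmark

data = cp.random.random((5000, 5000))

# Reports CPU and GPU times aggregated over n_repeat runs
print(benchmark(cp.fft.fft2, (data,), n_repeat=10))
```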