# CUDA Integration

Direct access to CUDA features, including device management, memory allocation, streams, events, and custom kernel compilation. CuPy exposes this low-level CUDA functionality for performance tuning and custom GPU programming.

## Capabilities

### Device Management

Control and query GPU devices and their properties.

```python { .api }
class Device:
    """CUDA device management.

    Provides context management and device switching capabilities.
    """
    def __init__(self, device=None):
        """Initialize device context.

        Args:
            device: Device ID or None for current device
        """

    def __enter__(self):
        """Enter device context."""

    def __exit__(self, *args):
        """Exit device context."""

    def use(self):
        """Make this device current."""

def get_device_count():
    """Get number of CUDA devices."""

def get_device_id():
    """Get current device ID."""

class DeviceMemInfo:
    """Device memory information."""
    def __init__(self):
        pass

    def total(self):
        """Total device memory."""

    def free(self):
        """Free device memory."""

    def used(self):
        """Used device memory."""
```
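
When a `with` block is awkward (for example, when switching devices inside a long-running loop), `Device.use()` makes a device current without entering a context manager. A minimal sketch, assuming a machine with at least two GPUs:

```python
import cupy as cp

if cp.cuda.get_device_count() > 1:
    cp.cuda.Device(1).use()   # device 1 becomes current
    x = cp.arange(10)         # allocated on device 1
    cp.cuda.Device(0).use()   # switch back to device 0
```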

### Memory Management

Control GPU memory allocation and deallocation.

```python { .api }
class MemoryPool:
    """GPU memory pool for efficient allocation.

    Manages GPU memory allocation and reuse to minimize allocation overhead.
    """
    def __init__(self, allocator=None):
        """Initialize memory pool.

        Args:
            allocator: Custom allocator function
        """

    def malloc(self, size):
        """Allocate memory from pool."""

    def free(self, ptr, size):
        """Return memory to pool."""

    def free_all_blocks(self):
        """Free all cached memory blocks."""

    def n_free_blocks(self):
        """Number of free blocks in pool."""

    def used_bytes(self):
        """Total bytes currently allocated."""

    def total_bytes(self):
        """Total bytes managed by pool."""

class PinnedMemoryPool:
    """Pinned (page-locked) CPU memory pool for faster CPU-GPU transfers."""
    def __init__(self, allocator=None):
        pass

def get_default_memory_pool():
    """Get default GPU memory pool."""

def get_default_pinned_memory_pool():
    """Get default pinned memory pool."""

def set_allocator(allocator=None):
    """Set memory allocator."""

class MemoryPointer:
    """Pointer to device memory."""
    def __init__(self, mem, offset):
        pass

    def __int__(self):
        """Get memory address as integer."""

    def copy_from_device(self, src, size):
        """Copy from device memory."""

    def copy_from_host(self, src, size):
        """Copy from host memory."""

    def copy_to_host(self, dst, size):
        """Copy to host memory."""

def alloc(size):
    """Allocate device memory."""

def malloc_managed(size):
    """Allocate managed (unified) memory."""
```
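
A custom pool can be installed as the process-wide allocator via `set_allocator`, so that subsequent array allocations are served (and cached) by that pool. A minimal sketch using the classes above:

```python
import cupy as cp

# Route subsequent CuPy allocations through a dedicated pool
pool = cp.cuda.MemoryPool()
cp.cuda.set_allocator(pool.malloc)

x = cp.ones((1024, 1024), dtype=cp.float32)  # served by `pool`
print(pool.used_bytes(), pool.total_bytes())
```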

### Stream Management

Control CUDA streams for asynchronous operations.

```python { .api }
class Stream:
    """CUDA stream for asynchronous operations.

    Enables overlapping of computation and memory transfers.
    """
    def __init__(self, null=False, non_blocking=False, ptds=False):
        """Initialize CUDA stream.

        Args:
            null: Use null stream
            non_blocking: Non-blocking stream
            ptds: Per-thread default stream
        """

    def __enter__(self):
        """Enter stream context."""

    def __exit__(self, *args):
        """Exit stream context."""

    def synchronize(self):
        """Synchronize stream."""

    def query(self):
        """Query stream completion status."""

    def wait_event(self, event):
        """Make stream wait for event."""

    def record(self, event):
        """Record event in stream."""

def get_current_stream():
    """Get current CUDA stream."""

class ExternalStream:
    """Wrap external CUDA stream."""
    def __init__(self, ptr):
        pass
```
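
`Stream.wait_event` expresses a cross-stream dependency without blocking the host: the waiting stream's work begins only after the recorded event completes. A minimal sketch, assuming the `Stream` and `Event` APIs described in this document:

```python
import cupy as cp

producer = cp.cuda.Stream()
consumer = cp.cuda.Stream()
done = cp.cuda.Event()

with producer:
    a = cp.random.random((1000, 1000))
    b = a * 2                 # queued on `producer`
    producer.record(done)     # mark completion of the producer work

consumer.wait_event(done)     # `consumer` work starts only after `done` fires
with consumer:
    c = b + 1                 # safely consumes `b`

consumer.synchronize()
```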

### Event Management

CUDA events for synchronization and timing.

```python { .api }
class Event:
    """CUDA event for synchronization and timing.

    Provides fine-grained synchronization between operations.
    """
    def __init__(self, block=True, disable_timing=False, interprocess=False):
        """Initialize CUDA event.

        Args:
            block: Blocking event
            disable_timing: Disable timing capability
            interprocess: Enable interprocess sharing
        """

    def record(self, stream=None):
        """Record event in stream."""

    def synchronize(self):
        """Synchronize on event."""

    def query(self):
        """Query event completion."""

    def elapsed_time(self, end_event):
        """Get elapsed time to another event."""

def synchronize():
    """Synchronize all device operations."""
```

### Custom Kernel Compilation

Compile and execute custom CUDA kernels.

```python { .api }
def compile_with_cache(source, name, options=(), arch=None, cache_dir=None,
                       prepend_cupy_headers=True, backend='nvcc',
                       translate_cucomplex=True, enable_cooperative_groups=False,
                       name_expressions=None, log_stream=None,
                       cache_in_memory=False, jitify=False):
    """Compile CUDA source code with caching.

    Args:
        source: CUDA C/C++ source code
        name: Kernel function name
        options: Compiler options
        arch: Target architecture
        cache_dir: Cache directory
        prepend_cupy_headers: Include CuPy headers
        backend: Compiler backend ('nvcc', 'nvrtc')
        translate_cucomplex: Translate complex types
        enable_cooperative_groups: Enable cooperative groups
        name_expressions: Template name expressions
        log_stream: Compilation log stream
        cache_in_memory: Cache in memory
        jitify: Use Jitify for compilation

    Returns:
        cupy.cuda.Function: Compiled kernel function
    """

class Function:
    """Compiled CUDA kernel function."""
    def __init__(self, module, name):
        pass

    def __call__(self, grid, block, args, **kwargs):
        """Launch kernel.

        Args:
            grid: Grid dimensions
            block: Block dimensions
            args: Kernel arguments
            **kwargs: Additional launch parameters
        """

class Module:
    """CUDA module containing compiled code."""
    def __init__(self, cubin):
        pass

    def get_function(self, name):
        """Get function from module."""

def get_compute_capability(device=None):
    """Get compute capability of device."""
```
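
`get_compute_capability` is useful for gating kernel features on the target GPU's architecture. A minimal sketch; the return format assumed here is a string such as `'80'` (major and minor digits concatenated), which is an assumption rather than something this document specifies:

```python
import cupy as cp

cc = cp.cuda.get_compute_capability()  # e.g. '80' on an A100 (assumed format)
if int(cc) >= 70:
    print("Device supports tensor-core instructions")
```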

### Runtime API Access

Direct access to CUDA Runtime API functions.

```python { .api }
class Runtime:
    """CUDA Runtime API wrapper."""

    @staticmethod
    def deviceGetAttribute(attr, device):
        """Get device attribute."""

    @staticmethod
    def deviceGetProperties(device):
        """Get device properties."""

    @staticmethod
    def memGetInfo():
        """Get memory information."""

    @staticmethod
    def deviceSynchronize():
        """Synchronize device."""

    @staticmethod
    def getLastError():
        """Get last CUDA error."""

    @staticmethod
    def peekAtLastError():
        """Peek at last CUDA error."""

def runtime_version():
    """Get CUDA runtime version."""

def driver_version():
    """Get CUDA driver version."""
```
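
A short sketch of version and memory queries using the wrappers above. The `(free, total)` tuple returned by `memGetInfo` follows the CUDA Runtime convention and is an assumption here, as is the integer version encoding (e.g. 12020 for CUDA 12.2):

```python
import cupy as cp

print(f"Runtime version: {cp.cuda.runtime_version()}")
print(f"Driver version: {cp.cuda.driver_version()}")

free, total = cp.cuda.Runtime.memGetInfo()  # assumed (free, total) in bytes
print(f"Free: {free / 1024**3:.2f} GB of {total / 1024**3:.2f} GB")

cp.cuda.Runtime.deviceSynchronize()  # block until all queued work completes
```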

### Profiler Integration

CUDA profiler control and markers.

```python { .api }
class ProfilerRange:
    """CUDA profiler range marker."""
    def __init__(self, message, color_id=None):
        pass

    def __enter__(self):
        pass

    def __exit__(self, *args):
        pass

def nvtx_mark(message, color=None):
    """Add NVTX marker."""

def nvtx_range_push(message, color=None):
    """Push NVTX range."""

def nvtx_range_pop():
    """Pop NVTX range."""

def profiler_start():
    """Start CUDA profiler."""

def profiler_stop():
    """Stop CUDA profiler."""
```
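
`profiler_start`/`profiler_stop` limit capture to a region of interest when running under a profiler such as Nsight Systems, and `nvtx_range_push`/`nvtx_range_pop` are the imperative counterpart of `ProfilerRange`. A minimal sketch using the functions above:

```python
import cupy as cp

cp.cuda.profiler_start()          # begin profiler capture here

cp.cuda.nvtx_range_push("warm-up")
_ = cp.dot(cp.ones((512, 512)), cp.ones((512, 512)))
cp.cuda.nvtx_range_pop()

cp.cuda.nvtx_range_push("measured region")
_ = cp.dot(cp.ones((4096, 4096)), cp.ones((4096, 4096)))
cp.cuda.nvtx_range_pop()

cp.cuda.profiler_stop()           # end capture
```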

## Usage Examples

### Device Management

```python
import cupy as cp

# Query device information
device_count = cp.cuda.get_device_count()
current_device = cp.cuda.get_device_id()

print(f"Available devices: {device_count}")
print(f"Current device: {current_device}")

# Switch devices
if device_count > 1:
    with cp.cuda.Device(1):
        # Operations on device 1
        x = cp.array([1, 2, 3])
        print(f"Array on device: {x.device}")

# Query memory information
mem_info = cp.cuda.DeviceMemInfo()
print(f"Total GPU memory: {mem_info.total() / 1024**3:.2f} GB")
print(f"Free GPU memory: {mem_info.free() / 1024**3:.2f} GB")
```

### Memory Pool Management

```python
# Get default memory pool
pool = cp.get_default_memory_pool()

# Monitor memory usage
print(f"Used bytes: {pool.used_bytes()}")
print(f"Total bytes: {pool.total_bytes()}")

# Allocate large array
large_array = cp.zeros((1000, 1000, 1000), dtype=cp.float32)

print(f"After allocation - Used: {pool.used_bytes() / 1024**3:.2f} GB")

# Free memory
del large_array
pool.free_all_blocks()  # Free cached blocks

print(f"After cleanup - Used: {pool.used_bytes() / 1024**3:.2f} GB")
```

### Stream-based Asynchronous Operations

```python
# Create streams for asynchronous operations
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create input arrays
a = cp.random.random((1000, 1000))
b = cp.random.random((1000, 1000))

# Launch operations on different streams
with stream1:
    c = cp.dot(a, b)  # Matrix multiplication on stream1

with stream2:
    d = a + b  # Addition on stream2

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Or synchronize all operations
cp.cuda.synchronize()
```

### Event-based Synchronization

```python
# Create events for timing and synchronization
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

# Record start time
start_event.record()

# Perform operations
result = cp.dot(cp.random.random((2000, 2000)),
                cp.random.random((2000, 2000)))

# Record end time
end_event.record()
end_event.synchronize()

# Get elapsed time
elapsed_time = start_event.elapsed_time(end_event)
print(f"Operation took {elapsed_time:.2f} ms")
```

435

436

### Custom CUDA Kernels

437

438

```python

439

# Define custom CUDA kernel

440

kernel_code = r'''

441

extern "C" __global__

442

void add_kernel(const float* x, const float* y, float* z, int n) {

443

int tid = blockDim.x * blockIdx.x + threadIdx.x;

444

if (tid < n) {

445

z[tid] = x[tid] + y[tid];

446

}

447

}

448

'''

449

450

# Compile kernel

451

add_kernel = cp.cuda.compile_with_cache(kernel_code, 'add_kernel')

452

453

# Prepare data

454

n = 1000000

455

x = cp.random.random(n, dtype=cp.float32)

456

y = cp.random.random(n, dtype=cp.float32)

457

z = cp.zeros(n, dtype=cp.float32)

458

459

# Launch kernel

460

threads_per_block = 256

461

blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

462

463

add_kernel((blocks_per_grid,), (threads_per_block,), (x, y, z, n))

464

465

# Verify result

466

expected = x + y

467

assert cp.allclose(z, expected)

468

```

469

470

### Raw Memory Operations

471

472

```python

473

# Allocate raw device memory

474

size = 1024 * 1024 * 4 # 4MB

475

raw_ptr = cp.cuda.alloc(size)

476

477

# Create array from raw pointer

478

arr = cp.ndarray((1024, 1024), dtype=cp.float32,

479

memptr=cp.cuda.MemoryPointer(raw_ptr, 0))

480

481

# Use the array

482

arr.fill(42.0)

483

print(f"Mean value: {arr.mean()}")

484

485

# Memory will be freed when raw_ptr goes out of scope

486

```

### Unified Memory

```python
# Allocate managed (unified) memory; malloc_managed returns a MemoryPointer
size = 1000 * 1000 * 4  # Size in bytes
memptr = cp.cuda.malloc_managed(size)

# Create array using managed memory
managed_arr = cp.ndarray((1000, 1000), dtype=cp.float32, memptr=memptr)

# The backing memory is accessible from both CPU and GPU
managed_arr.fill(3.14)

# Synchronize before CPU access
cp.cuda.synchronize()

print(f"Shape: {managed_arr.shape}, Mean: {managed_arr.mean()}")
```

508

509

### Performance Profiling

510

511

```python

512

# Use profiler ranges for performance analysis

513

with cp.cuda.ProfilerRange("Matrix Multiplication", color_id=1):

514

large_a = cp.random.random((5000, 5000))

515

large_b = cp.random.random((5000, 5000))

516

result = cp.dot(large_a, large_b)

517

518

# Add individual markers

519

cp.cuda.nvtx_mark("Starting FFT computation")

520

signal = cp.random.random(1024*1024)

521

fft_result = cp.fft.fft(signal)

522

cp.cuda.nvtx_mark("FFT computation complete")

523

```