or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

array-operations.md · cuda-integration.md · fft-operations.md · index.md · io-operations.md · linear-algebra.md · mathematical-functions.md · polynomial-functions.md · random-generation.md · scipy-compatibility.md

docs/cuda-integration.md

0

# CUDA Integration

1

2

Direct CUDA functionality providing low-level GPU programming capabilities, memory management, device control, custom kernel integration, and asynchronous execution. This module bridges the gap between high-level array operations and CUDA's powerful parallel computing features.

3

4

## Capabilities

5

6

### Device Management

7

8

Control and query CUDA devices for multi-GPU systems and device selection.

9

10

```python { .api }

11

class Device:

12

"""

13

CUDA device representation and context management.

14

"""

15

def __init__(self, device=None):

16

"""

17

Initialize device object.

18

19

Parameters:

20

- device: int or Device, device ID or device object

21

"""

22

23

def __enter__(self):

24

"""Context manager entry."""

25

26

def __exit__(self, *args):

27

"""Context manager exit."""

28

29

def use(self):

30

"""

31

Use this device in a `with` statement.

32

33

Returns:

34

- context manager for device usage

35

"""

36

37

@property

38

def id(self):

39

"""Device ID."""

40

41

def synchronize(self):

42

"""Synchronize device."""

43

44

def get_device_id():

45

"""

46

Get current device ID.

47

48

Returns:

49

- int: Current device ID

50

"""

51

52

def is_available():

53

"""

54

Check if CUDA is available.

55

56

Returns:

57

- bool: True if CUDA is available

58

"""

59

60

def get_cublas_handle():

61

"""

62

Get cuBLAS handle for current device.

63

64

Returns:

65

- cuBLAS handle object

66

"""

67

```

68

69

### Memory Management

70

71

GPU memory allocation, pools, and efficient memory reuse strategies.

72

73

```python { .api }

74

def alloc(size):

75

"""

76

Allocate GPU memory.

77

78

Parameters:

79

- size: int, number of bytes to allocate

80

81

Returns:

82

- MemoryPointer: Pointer to allocated memory

83

"""

84

85

def malloc_managed(size):

86

"""

87

Allocate managed (unified) memory.

88

89

Parameters:

90

- size: int, number of bytes to allocate

91

92

Returns:

93

- ManagedMemory: Managed memory object

94

"""

95

96

def malloc_async(size, stream=None):

97

"""

98

Allocate memory asynchronously.

99

100

Parameters:

101

- size: int, number of bytes to allocate

102

- stream: Stream, optional, CUDA stream for allocation

103

104

Returns:

105

- MemoryAsync: Asynchronously allocated memory

106

"""

107

108

class MemoryPointer:

109

"""

110

Pointer to device memory with automatic deallocation.

111

"""

112

def __init__(self, mem, owner):

113

"""

114

Initialize memory pointer.

115

116

Parameters:

117

- mem: raw memory pointer

118

- owner: memory owner object

119

"""

120

121

@property

122

def ptr(self):

123

"""Raw memory pointer address."""

124

125

@property

126

def size(self):

127

"""Memory size in bytes."""

128

129

def copy_from_device(self, src, size):

130

"""Copy from device memory."""

131

132

def copy_from_host(self, src, size):

133

"""Copy from host memory."""

134

135

def copy_to_host(self, dst, size):

136

"""Copy to host memory."""

137

138

def memset(self, value, size):

139

"""Set memory values."""

140

141

class MemoryPool:

142

"""

143

Memory pool for efficient GPU memory management.

144

"""

145

def __init__(self, allocator=None):

146

"""

147

Initialize memory pool.

148

149

Parameters:

150

- allocator: callable, custom allocator function

151

"""

152

153

def malloc(self, size):

154

"""

155

Allocate memory from pool.

156

157

Parameters:

158

- size: int, number of bytes

159

160

Returns:

161

- MemoryPointer: Allocated memory

162

"""

163

164

def free_all_blocks(self):

165

"""Release all free (unused) blocks held by the pool; memory still in use is not freed."""

166

167

def free_all_free(self):

168

"""Free all free blocks (deprecated; use free_all_blocks() instead)."""

169

170

def used_bytes(self):

171

"""

172

Get used memory bytes.

173

174

Returns:

175

- int: Used memory in bytes

176

"""

177

178

def total_bytes(self):

179

"""

180

Get total allocated bytes.

181

182

Returns:

183

- int: Total memory in bytes

184

"""

185

186

def set_allocator(allocator):

187

"""

188

Set global memory allocator.

189

190

Parameters:

191

- allocator: callable or None, allocator function

192

"""

193

194

def get_allocator():

195

"""

196

Get current memory allocator.

197

198

Returns:

199

- callable: Current allocator function

200

"""

201

```

202

203

### Pinned Memory

204

205

Page-locked host memory for efficient CPU-GPU transfers.

206

207

```python { .api }

208

def alloc_pinned_memory(size):

209

"""

210

Allocate pinned host memory.

211

212

Parameters:

213

- size: int, number of bytes to allocate

214

215

Returns:

216

- PinnedMemoryPointer: Pointer to pinned memory

217

"""

218

219

class PinnedMemoryPointer:

220

"""

221

Pointer to pinned host memory.

222

"""

223

def __init__(self, mem, size):

224

"""

225

Initialize pinned memory pointer.

226

227

Parameters:

228

- mem: raw memory pointer

229

- size: int, memory size in bytes

230

"""

231

232

@property

233

def ptr(self):

234

"""Raw memory pointer."""

235

236

@property

237

def size(self):

238

"""Memory size in bytes."""

239

240

class PinnedMemoryPool:

241

"""

242

Memory pool for pinned host memory.

243

"""

244

def malloc(self, size):

245

"""

246

Allocate pinned memory from pool.

247

248

Parameters:

249

- size: int, number of bytes

250

251

Returns:

252

- PinnedMemoryPointer: Allocated pinned memory

253

"""

254

255

def free_all_blocks(self):

256

"""Release all free (unused) pinned-memory blocks held by the pool."""

257

258

def used_bytes(self):

259

"""

260

Used memory in bytes.

261

262

Returns:

263

- int: Used memory

264

"""

265

266

def total_bytes(self):

267

"""

268

Total allocated memory in bytes.

269

270

Returns:

271

- int: Total memory

272

"""

273

274

def set_pinned_memory_allocator(allocator):

275

"""

276

Set pinned memory allocator.

277

278

Parameters:

279

- allocator: callable or None, allocator function

280

"""

281

```

282

283

### Streams and Events

284

285

Asynchronous execution control and synchronization primitives.

286

287

```python { .api }

288

class Stream:

289

"""

290

CUDA stream for asynchronous operations.

291

"""

292

def __init__(self, null=False, non_blocking=False, ptds=False):

293

"""

294

Initialize CUDA stream.

295

296

Parameters:

297

- null: bool, use null stream

298

- non_blocking: bool, create non-blocking stream

299

- ptds: bool, per-thread default stream

300

"""

301

302

def __enter__(self):

303

"""Context manager entry."""

304

305

def __exit__(self, *args):

306

"""Context manager exit."""

307

308

def use(self):

309

"""

310

Use stream in context manager.

311

312

Returns:

313

- context manager for stream usage

314

"""

315

316

def synchronize(self):

317

"""Wait for stream operations to complete."""

318

319

def add_callback(self, callback, arg=None):

320

"""

321

Add callback to stream.

322

323

Parameters:

324

- callback: callable, callback function

325

- arg: object, optional argument to callback

326

"""

327

328

@property

329

def ptr(self):

330

"""Raw CUDA stream pointer."""

331

332

class ExternalStream:

333

"""

334

Wrapper for externally created CUDA stream.

335

"""

336

def __init__(self, ptr):

337

"""

338

Initialize external stream.

339

340

Parameters:

341

- ptr: int, raw CUDA stream pointer

342

"""

343

344

def get_current_stream():

345

"""

346

Get current CUDA stream.

347

348

Returns:

349

- Stream: Current stream object

350

"""

351

352

class Event:

353

"""

354

CUDA event for synchronization and timing.

355

"""

356

def __init__(self, block=True, disable_timing=False, interprocess=False):

357

"""

358

Initialize CUDA event.

359

360

Parameters:

361

- block: bool, blocking event

362

- disable_timing: bool, disable timing measurement

363

- interprocess: bool, enable interprocess sharing

364

"""

365

366

def record(self, stream=None):

367

"""

368

Record event in stream.

369

370

Parameters:

371

- stream: Stream, optional, stream to record in

372

"""

373

374

def synchronize(self):

375

"""Wait for event completion."""

376

377

def elapsed_time(self, end_event):

378

"""

379

Compute elapsed time to another event.

380

381

Parameters:

382

- end_event: Event, end event

383

384

Returns:

385

- float: Elapsed time in milliseconds

386

"""

387

388

@property

389

def ptr(self):

390

"""Raw CUDA event pointer."""

391

392

def get_elapsed_time(start_event, end_event):

393

"""

394

Get elapsed time between events.

395

396

Parameters:

397

- start_event: Event, start event

398

- end_event: Event, end event

399

400

Returns:

401

- float: Elapsed time in milliseconds

402

"""

403

```

404

405

### CUDA Graphs

406

407

Capture and replay sequences of operations for performance optimization.

408

409

```python { .api }

410

class Graph:

411

"""

412

CUDA graph for capturing and replaying operation sequences.

413

"""

414

def __init__(self):

415

"""Initialize empty CUDA graph."""

416

417

def capture_begin(self, stream=None, mode='global'):

418

"""

419

Begin graph capture.

420

421

Parameters:

422

- stream: Stream, stream to capture

423

- mode: str, capture mode ('global', 'thread_local', 'relaxed')

424

"""

425

426

def capture_end(self, stream=None):

427

"""

428

End graph capture.

429

430

Parameters:

431

- stream: Stream, stream being captured

432

"""

433

434

def launch(self, stream=None):

435

"""

436

Launch captured graph.

437

438

Parameters:

439

- stream: Stream, stream to launch in

440

"""

441

```

442

443

### Custom Kernels

444

445

Integration of user-defined CUDA kernels for specialized computations.

446

447

```python { .api }

448

class ElementwiseKernel:

449

"""

450

User-defined elementwise CUDA kernel.

451

"""

452

def __init__(self, in_params, out_params, operation, name='kernel', reduce_dims=True, options=(), loop_prep='', after_loop='', preamble='', **kwargs):

453

"""

454

Initialize elementwise kernel.

455

456

Parameters:

457

- in_params: str, input parameter declarations

458

- out_params: str, output parameter declarations

459

- operation: str, kernel operation code

460

- name: str, kernel name

461

- reduce_dims: bool, reduce dimensions automatically

462

- options: tuple, compiler options

463

- loop_prep: str, code before main loop

464

- after_loop: str, code after main loop

465

- preamble: str, code before kernel function

466

"""

467

468

def __call__(self, *args, **kwargs):

469

"""

470

Execute kernel with given arguments.

471

472

Parameters:

473

- *args: input and output arrays

474

- size: int, optional, number of elements to process

475

- stream: Stream, optional, execution stream

476

477

Returns:

478

- output arrays or None

479

"""

480

481

class ReductionKernel:

482

"""

483

User-defined reduction CUDA kernel.

484

"""

485

def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr='', identity=None, name='reduce_kernel', reduce_type=None, reduce_dims=True, options=(), preamble='', **kwargs):

486

"""

487

Initialize reduction kernel.

488

489

Parameters:

490

- in_params: str, input parameter declarations

491

- out_params: str, output parameter declarations

492

- map_expr: str, mapping expression

493

- reduce_expr: str, reduction expression

494

- post_map_expr: str, post-mapping expression

495

- identity: str, identity value for reduction

496

- name: str, kernel name

497

- reduce_type: type, reduction data type

498

- reduce_dims: bool, reduce dimensions

499

- options: tuple, compiler options

500

- preamble: str, preamble code

501

"""

502

503

def __call__(self, *args, **kwargs):

504

"""Execute reduction kernel."""

505

506

class RawKernel:

507

"""

508

User-defined raw CUDA kernel from source code.

509

"""

510

def __init__(self, code, name, options=(), backend='auto', translate_cucomplex=True, **kwargs):

511

"""

512

Initialize raw kernel from CUDA source.

513

514

Parameters:

515

- code: str, CUDA kernel source code

516

- name: str, kernel function name

517

- options: tuple, compilation options

518

- backend: str, compilation backend

519

- translate_cucomplex: bool, translate complex types

520

"""

521

522

def __call__(self, grid, block, *args, **kwargs):

523

"""

524

Launch kernel with specified grid and block dimensions.

525

526

Parameters:

527

- grid: tuple, grid dimensions

528

- block: tuple, block dimensions

529

- *args: kernel arguments

530

- shared_mem: int, shared memory size

531

- stream: Stream, execution stream

532

"""

533

534

class RawModule:

535

"""

536

CUDA module containing multiple kernels and functions.

537

"""

538

def __init__(self, code, options=(), backend='auto', translate_cucomplex=True, **kwargs):

539

"""

540

Initialize module from CUDA source code.

541

542

Parameters:

543

- code: str, CUDA module source code

544

- options: tuple, compilation options

545

- backend: str, compilation backend

546

- translate_cucomplex: bool, translate complex types

547

"""

548

549

def get_function(self, name):

550

"""

551

Get function from module.

552

553

Parameters:

554

- name: str, function name

555

556

Returns:

557

- Function: CUDA function object

558

"""

559

560

class Function:

561

"""

562

CUDA function from compiled module.

563

"""

564

def __call__(self, grid, block, *args, **kwargs):

565

"""

566

Launch function.

567

568

Parameters:

569

- grid: tuple, grid dimensions

570

- block: tuple, block dimensions

571

- *args: function arguments

572

- shared_mem: int, shared memory size

573

- stream: Stream, execution stream

574

"""

575

576

@property

577

def max_threads_per_block(self):

578

"""Maximum threads per block for this function."""

579

580

@property

581

def num_regs(self):

582

"""Number of registers used by function."""

583

```

584

585

### Compilation

586

587

Dynamic CUDA code compilation and caching.

588

589

```python { .api }

590

def compile_with_cache(source, options=(), arch=None, cache_dir=None, prepend_cupy_headers=True, backend='auto', translate_cucomplex=True, **kwargs):

591

"""

592

Compile CUDA source code with caching.

593

594

Parameters:

595

- source: str, CUDA source code

596

- options: tuple, compilation options

597

- arch: str, target architecture

598

- cache_dir: str, cache directory path

599

- prepend_cupy_headers: bool, add CuPy headers

600

- backend: str, compilation backend

601

- translate_cucomplex: bool, translate complex types

602

603

Returns:

604

- bytes: Compiled module binary

605

"""

606

```

607

608

### Context Managers

609

610

Convenient context managers for resource management.

611

612

```python { .api }

613

def using_allocator(allocator=None):

614

"""

615

Context manager for temporary allocator change.

616

617

Parameters:

618

- allocator: callable or None, temporary allocator

619

620

Returns:

621

- context manager

622

"""

623

624

def profile():

625

"""

626

Context manager for CUDA profiling (deprecated).

627

628

Returns:

629

- context manager

630

"""

631

```

632

633

### Environment and Paths

634

635

System configuration and tool detection.

636

637

```python { .api }

638

def get_cuda_path():

639

"""

640

Get CUDA installation path.

641

642

Returns:

643

- str: Path to CUDA installation

644

"""

645

646

def get_nvcc_path():

647

"""

648

Get nvcc compiler path.

649

650

Returns:

651

- str: Path to nvcc compiler

652

"""

653

654

def get_rocm_path():

655

"""

656

Get ROCm installation path.

657

658

Returns:

659

- str: Path to ROCm installation

660

"""

661

662

def get_hipcc_path():

663

"""

664

Get hipcc compiler path.

665

666

Returns:

667

- str: Path to hipcc compiler

668

"""

669

```

670

671

## Backend APIs

672

673

Direct access to CUDA runtime and driver APIs.

674

675

```python { .api }

676

# CUDA Runtime API

677

from cupy_backends.cuda.api import runtime

678

679

# CUDA Driver API

680

from cupy_backends.cuda.api import driver

681

682

# cuBLAS library

683

from cupy_backends.cuda.libs import cublas

684

685

# cuRAND library

686

from cupy_backends.cuda.libs import curand

687

688

# cuSOLVER library

689

from cupy_backends.cuda.libs import cusolver

690

691

# cuSPARSE library

692

from cupy_backends.cuda.libs import cusparse

693

694

# NVRTC (Runtime Compilation)

695

from cupy_backends.cuda.libs import nvrtc

696

697

# CUDA Profiler

698

from cupy_backends.cuda.libs import profiler

699

```

700

701

## Usage Examples

702

703

### Device Management and Memory

704

705

```python

706

import cupy as cp

707

708

# Check CUDA availability

709

if cp.cuda.is_available():

710

print(f"CUDA devices available: {cp.cuda.runtime.getDeviceCount()}")

711

712

# Use specific device

713

with cp.cuda.Device(0):

714

# All operations use device 0

715

arr = cp.array([1, 2, 3, 4, 5])

716

result = cp.sum(arr)

717

718

# Memory pool management

719

mempool = cp.get_default_memory_pool()

720

print(f"Used memory: {mempool.used_bytes()} bytes")

721

print(f"Total memory: {mempool.total_bytes()} bytes")

722

723

# Free unused memory

724

mempool.free_all_free()

725

```

726

727

### Asynchronous Operations with Streams

728

729

```python

730

import cupy as cp

731

732

# Create streams for async operations

733

stream1 = cp.cuda.Stream()

734

stream2 = cp.cuda.Stream()

735

736

# Async operations on different streams

737

with stream1:

738

a1 = cp.random.random((1000, 1000))

739

result1 = cp.matmul(a1, a1.T)

740

741

with stream2:

742

a2 = cp.random.random((1000, 1000))

743

result2 = cp.matmul(a2, a2.T)

744

745

# Synchronize streams

746

stream1.synchronize()

747

stream2.synchronize()

748

749

# Event-based synchronization

750

start_event = cp.cuda.Event()

751

end_event = cp.cuda.Event()

752

753

start_event.record()

754

# ... GPU operations ...

755

end_event.record()

756

757

# Measure elapsed time

758

end_event.synchronize()

759

elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)

760

print(f"Elapsed time: {elapsed_time} ms")

761

```

762

763

### Custom CUDA Kernels

764

765

```python

766

import cupy as cp

767

768

# Elementwise kernel example

769

add_kernel = cp.ElementwiseKernel(

770

'float32 x, float32 y', # input parameters

771

'float32 z', # output parameters

772

'z = x + y * 2', # operation

773

'add_kernel' # kernel name

774

)

775

776

# Use the kernel

777

a = cp.array([1, 2, 3, 4], dtype=cp.float32)

778

b = cp.array([5, 6, 7, 8], dtype=cp.float32)

779

c = cp.empty_like(a)

780

781

add_kernel(a, b, c)

782

print("Custom kernel result:", c)

783

784

# Raw CUDA kernel

785

raw_kernel_code = '''

786

extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {

787

int idx = blockIdx.x * blockDim.x + threadIdx.x;

788

if (idx < n) {

789

c[idx] = a[idx] + b[idx];

790

}

791

}

792

'''

793

794

raw_kernel = cp.RawKernel(raw_kernel_code, 'vector_add')

795

796

# Launch raw kernel

797

n = 1000

798

a_gpu = cp.random.random(n, dtype=cp.float32)

799

b_gpu = cp.random.random(n, dtype=cp.float32)

800

c_gpu = cp.empty(n, dtype=cp.float32)

801

802

threads_per_block = 256

803

blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

804

805

raw_kernel((blocks_per_grid,), (threads_per_block,),

806

(a_gpu, b_gpu, c_gpu, n))

807

```

808

809

### Memory Transfer Optimization

810

811

```python

812

import cupy as cp

813

import numpy as np

814

815

# Pinned memory for faster transfers

816

size = 10000000

817

pinned_mem = cp.cuda.alloc_pinned_memory(size * 4) # 4 bytes per float32

818

819

# Create numpy array using pinned memory

820

pinned_array = np.frombuffer(pinned_mem, dtype=np.float32).reshape(-1)

821

pinned_array[:] = np.random.random(size)

822

823

# Fast transfer from pinned memory to GPU

824

gpu_array = cp.asarray(pinned_array)

825

826

# Async transfer with streams

827

stream = cp.cuda.Stream()

828

with stream:

829

gpu_result = cp.sum(gpu_array)

830

831

# Transfer result back asynchronously

832

result_pinned = cp.cuda.pinned_memory.alloc_pinned_memory(4)

833

gpu_result.get(out=np.frombuffer(result_pinned, dtype=np.float32))

834

```

835

836

### CUDA Graphs for Performance

837

838

```python

839

import cupy as cp

840

841

# Capture operations in a graph

842

graph = cp.cuda.Graph()

843

stream = cp.cuda.Stream()

844

845

# Begin graph capture

846

graph.capture_begin(stream)

847

848

with stream:

849

# Operations to capture

850

a = cp.random.random((1000, 1000))

851

b = cp.random.random((1000, 1000))

852

c = cp.matmul(a, b)

853

result = cp.sum(c)

854

855

# End capture

856

graph.capture_end(stream)

857

858

# Launch graph multiple times (very efficient)

859

for _ in range(100):

860

graph.launch(stream)

861

862

stream.synchronize()

863

```