# CUDA Programming Interface

Direct access to CUDA features including custom kernels, memory management, streams, and device control. This interface enables low-level GPU programming within Python, providing full control over GPU resources and custom kernel execution.

## Capabilities

### Custom Kernel Creation

Create and execute custom CUDA kernels for specialized GPU computations.

```python { .api }
class RawKernel:
    """
    Raw CUDA kernel wrapper for executing custom CUDA C/C++ code.

    Enables direct execution of CUDA kernels written in C/C++ from Python,
    providing maximum flexibility for GPU programming.
    """

    def __init__(self, code, name, options=(), backend='nvrtc', translate_cucomplex=False):
        """
        Initialize raw CUDA kernel.

        Parameters:
        - code: str, CUDA C/C++ source code
        - name: str, kernel function name
        - options: tuple, compiler options
        - backend: str, compilation backend ('nvrtc' or 'nvcc')
        - translate_cucomplex: bool, translate cuComplex types
        """

    def __call__(self, grid, block, args, **kwargs):
        """
        Execute kernel with specified grid and block dimensions.

        Parameters:
        - grid: tuple, grid dimensions (blocks)
        - block: tuple, block dimensions (threads per block)
        - args: tuple, kernel arguments
        - shared_mem: int, shared memory size in bytes, optional
        - stream: cupy.cuda.Stream, CUDA stream, optional

        Returns:
        None
        """

class ElementwiseKernel:
    """
    Element-wise operation kernel for array computations.

    Simplifies creation of kernels that operate on array elements
    independently, automatically handling array indexing and broadcasting.
    """

    def __init__(self, in_params, out_params, operation, name='kernel', **kwargs):
        """
        Initialize element-wise kernel.

        Parameters:
        - in_params: str, input parameter declarations
        - out_params: str, output parameter declarations
        - operation: str, CUDA C operation code
        - name: str, kernel name
        - options: tuple, compiler options, optional
        - reduce_dims: bool, reduce dimensions, optional
        """

    def __call__(self, *args, **kwargs):
        """
        Execute element-wise kernel on input arrays.

        Parameters:
        - args: arrays, input and output arrays
        - size: int, array size override, optional
        - stream: cupy.cuda.Stream, CUDA stream, optional

        Returns:
        cupy.ndarray: output array result
        """

class ReductionKernel:
    """
    Reduction operation kernel for aggregating array values.

    Efficiently performs reduction operations (sum, max, min, etc.)
    across array dimensions with optimized GPU memory access patterns.
    """

    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr='', **kwargs):
        """
        Initialize reduction kernel.

        Parameters:
        - in_params: str, input parameter declarations
        - out_params: str, output parameter declarations
        - map_expr: str, mapping expression for each element
        - reduce_expr: str, reduction operation expression
        - post_map_expr: str, post-processing expression, optional
        - identity: str, identity value for reduction, optional
        - options: tuple, compiler options, optional
        """

    def __call__(self, *args, **kwargs):
        """
        Execute reduction kernel on input arrays.

        Parameters:
        - args: arrays, input and output arrays
        - axis: int or tuple, reduction axes, optional
        - keepdims: bool, keep dimensions, optional
        - stream: cupy.cuda.Stream, CUDA stream, optional

        Returns:
        cupy.ndarray: reduced result array
        """
```
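
The usage examples later in this document cover `ElementwiseKernel` and `RawKernel` but not `ReductionKernel`, so a minimal sketch follows, assuming a CuPy-compatible constructor that takes the map, reduce, post-map, identity, and name arguments positionally:

```python
import cupy as cp

# Minimal ReductionKernel sketch (assumes CuPy-compatible positional arguments)
l2norm_kernel = cp.ReductionKernel(
    'T x',          # input params
    'T y',          # output params
    'x * x',        # map: square each element
    'a + b',        # reduce: sum the squared values
    'y = sqrt(a)',  # post-map: take the square root of the sum
    '0',            # identity value for the reduction
    'l2norm'        # kernel name
)

x = cp.arange(10, dtype=cp.float32).reshape(2, 5)
print(l2norm_kernel(x, axis=1))  # per-row L2 norms
```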

### Memory Management

Direct GPU memory allocation, deallocation, and transfer operations.

```python { .api }
class MemoryPointer:
    """
    Pointer to GPU memory location.

    Low-level interface to GPU memory providing direct access
    to memory addresses and sizes for advanced memory management.
    """

    ptr: int        # Memory address
    size: int       # Memory size in bytes
    device: Device  # Associated device

def get_default_memory_pool():
    """
    Get default GPU memory pool.

    Returns:
    cupy.cuda.MemoryPool: Default memory pool for GPU allocations
    """

def get_default_pinned_memory_pool():
    """
    Get default pinned memory pool.

    Returns:
    cupy.cuda.PinnedMemoryPool: Default memory pool for pinned host memory
    """

class MemoryPool:
    """
    GPU memory pool for efficient memory allocation.

    Manages GPU memory allocation and deallocation to reduce
    overhead from frequent malloc/free operations.
    """

    def malloc(self, size):
        """
        Allocate GPU memory.

        Parameters:
        - size: int, memory size in bytes

        Returns:
        MemoryPointer: pointer to allocated memory
        """

    def free(self, ptr, size):
        """
        Free GPU memory.

        Parameters:
        - ptr: int, memory address
        - size: int, memory size in bytes
        """

    def used_bytes(self):
        """
        Get used memory in bytes.

        Returns:
        int: used memory size
        """

    def total_bytes(self):
        """
        Get total allocated memory in bytes.

        Returns:
        int: total allocated memory size
        """

class PinnedMemoryPool:
    """
    Pinned host memory pool for fast CPU-GPU transfers.

    Manages pinned (page-locked) host memory that can be
    transferred to/from GPU more efficiently than pageable memory.
    """

    def malloc(self, size):
        """
        Allocate pinned host memory.

        Parameters:
        - size: int, memory size in bytes

        Returns:
        PinnedMemoryPointer: pointer to allocated pinned memory
        """
```
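
For a concrete sense of how pool allocation behaves, here is a short sketch, assuming the CuPy-style API above where `malloc` returns a `MemoryPointer` whose memory is returned to the pool once the last reference is dropped:

```python
import cupy as cp

pool = cp.get_default_memory_pool()

# Allocate 1 MiB from the pool; the returned object is a MemoryPointer
memptr = pool.malloc(1024 * 1024)
print(hex(memptr.ptr))  # raw device address

# Pool statistics reflect the live allocation
print(pool.used_bytes(), pool.total_bytes())

# Dropping the last reference returns the block to the pool for reuse
del memptr
print(pool.used_bytes())
```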

### Device Management

Control GPU devices and their properties.

```python { .api }
class Device:
    """
    CUDA device representation and control.

    Provides interface to query device properties and control
    the active GPU device for computations.
    """

    def __init__(self, device_id=None):
        """
        Initialize device object.

        Parameters:
        - device_id: int, device ID, optional (uses current device)
        """

    id: int  # Device ID

    def use(self):
        """
        Set this device as current device.
        """

    def synchronize(self):
        """
        Synchronize device execution.
        """

    @property
    def compute_capability(self):
        """
        Get device compute capability.

        Returns:
        tuple: (major, minor) compute capability version
        """

def get_device_count():
    """
    Get number of available GPU devices.

    Returns:
    int: number of GPU devices
    """

def get_device_id():
    """
    Get current device ID.

    Returns:
    int: current device ID
    """
```
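
A short sketch of explicit device selection and synchronization using the API above (the usage examples later rely on `Device` as a context manager instead; both patterns assume CuPy-compatible behavior):

```python
import cupy as cp

dev = cp.cuda.Device(0)
dev.use()          # make device 0 current for subsequent allocations and kernels
x = cp.random.random((1024, 1024), dtype=cp.float32)
y = x @ x          # matrix multiply launches asynchronously
dev.synchronize()  # block until all queued work on device 0 has finished
print(dev.id, dev.compute_capability)
```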

### Stream Management

CUDA streams for asynchronous operations and overlapping computation with data transfer.

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous operations.

    Enables asynchronous kernel execution and memory transfers,
    allowing overlapping of computation and data movement.
    """

    def __init__(self, non_blocking=False):
        """
        Initialize CUDA stream.

        Parameters:
        - non_blocking: bool, create non-blocking stream
        """

    def synchronize(self):
        """
        Synchronize stream execution.

        Blocks until all operations in the stream complete.
        """

    def record(self, event=None):
        """
        Record event in stream.

        Parameters:
        - event: cupy.cuda.Event, event to record, optional

        Returns:
        cupy.cuda.Event: recorded event
        """

    def wait_event(self, event):
        """
        Wait for event in another stream.

        Parameters:
        - event: cupy.cuda.Event, event to wait for
        """

class Event:
    """
    CUDA event for synchronization between streams.

    Provides synchronization points that can be recorded
    in one stream and waited for in another.
    """

    def __init__(self, blocking=False, timing=False, interprocess=False):
        """
        Initialize CUDA event.

        Parameters:
        - blocking: bool, create blocking event
        - timing: bool, enable timing capability
        - interprocess: bool, enable interprocess capability
        """

    def record(self, stream=None):
        """
        Record event in stream.

        Parameters:
        - stream: cupy.cuda.Stream, stream to record in, optional
        """

    def synchronize(self):
        """
        Synchronize on event.

        Blocks until all work captured by the event has completed.
        """

    def elapsed_time(self, end_event):
        """
        Get elapsed time between events.

        Parameters:
        - end_event: cupy.cuda.Event, end event

        Returns:
        float: elapsed time in milliseconds
        """
```
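
Events with timing enabled can measure GPU execution time. A sketch follows; note that in CuPy the measurement is exposed as the module-level `cupy.cuda.get_elapsed_time(start, end)`, equivalent to the `elapsed_time` method described above:

```python
import cupy as cp

a = cp.random.random((4096, 4096), dtype=cp.float32)

start = cp.cuda.Event()
end = cp.cuda.Event()

start.record()     # timestamp before the work is queued
b = a @ a          # asynchronous kernel launch
end.record()       # timestamp after the work is queued
end.synchronize()  # wait until the end event has actually completed

# Elapsed GPU time in milliseconds (cupy.cuda.get_elapsed_time in CuPy)
print(cp.cuda.get_elapsed_time(start, end))
```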

### Runtime API Access

Direct access to CUDA Runtime API functions.

```python { .api }
def is_available():
    """
    Check if CUDA is available.

    Returns:
    bool: True if CUDA is available
    """

def get_cuda_path():
    """
    Get CUDA installation path.

    Returns:
    str: CUDA installation directory path
    """

def get_nvcc_path():
    """
    Get NVCC compiler path.

    Returns:
    str: NVCC compiler executable path
    """
```
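
A quick environment-check sketch using these functions; the `cp.cuda` namespace is an assumption based on where CuPy exposes them:

```python
import cupy as cp

if cp.cuda.is_available():
    print("CUDA path:", cp.cuda.get_cuda_path())  # assumed namespace
    print("nvcc path:", cp.cuda.get_nvcc_path())  # assumed namespace
    print("Devices:", cp.cuda.runtime.getDeviceCount())
else:
    print("No CUDA-capable GPU detected")
```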

## Usage Examples

### Custom Kernel Development

```python
import cupy as cp

# Define custom CUDA kernel
elementwise_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # Input parameters
    'float32 z',             # Output parameters
    'z = x * x + y * y',     # Operation
    'squared_sum'            # Kernel name
)

# Create input arrays
a = cp.random.random(1000000).astype(cp.float32)
b = cp.random.random(1000000).astype(cp.float32)

# Execute custom kernel
result = elementwise_kernel(a, b)

# Equivalent NumPy-style operation for comparison
result_numpy_style = a * a + b * b
print(cp.allclose(result, result_numpy_style))
```

### Advanced Raw Kernel

```python
# Raw CUDA kernel with custom C++ code
raw_kernel_code = '''
extern "C" __global__
void matrix_multiply(const float* A, const float* B, float* C,
                     int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
'''

# Compile kernel
raw_kernel = cp.RawKernel(raw_kernel_code, 'matrix_multiply')

# Prepare matrices
M, N, K = 512, 512, 512
A = cp.random.random((M, K), dtype=cp.float32)
B = cp.random.random((K, N), dtype=cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

# Configure kernel execution
block_size = (16, 16)
grid_size = ((N + block_size[0] - 1) // block_size[0],
             (M + block_size[1] - 1) // block_size[1])

# Execute kernel; scalars are passed as 32-bit ints to match the `int` parameters
raw_kernel(grid_size, block_size, (A, B, C, cp.int32(M), cp.int32(N), cp.int32(K)))

# Verify result
C_reference = cp.dot(A, B)
print(f"Max error: {cp.max(cp.abs(C - C_reference))}")
```

### Memory Management

```python
# Advanced memory management
mempool = cp.get_default_memory_pool()
pinned_mempool = cp.get_default_pinned_memory_pool()

print(f"GPU memory used: {mempool.used_bytes()} bytes")
print(f"GPU memory total: {mempool.total_bytes()} bytes")

# Create large arrays to observe memory usage
large_arrays = []
for i in range(10):
    arr = cp.random.random((1000, 1000))
    large_arrays.append(arr)
    print(f"After array {i}: {mempool.used_bytes()} bytes used")

# Free memory by deleting references
del large_arrays
print(f"After deletion: {mempool.used_bytes()} bytes used")

# Force garbage collection and memory cleanup
import gc
gc.collect()
mempool.free_all_blocks()
print(f"After cleanup: {mempool.used_bytes()} bytes used")
```

### Asynchronous Operations with Streams

```python
# Create multiple streams for overlapping operations
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Prepare data
n = 10000000
a = cp.random.random(n).astype(cp.float32)
b = cp.random.random(n).astype(cp.float32)
c = cp.zeros(n, dtype=cp.float32)
d = cp.zeros(n, dtype=cp.float32)

# Asynchronous operations in different streams
with stream1:
    # Operation 1 in stream1
    result1 = cp.add(a, b)

with stream2:
    # Operation 2 in stream2 (can run concurrently)
    result2 = cp.multiply(a, b)

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Event-based synchronization
event = cp.cuda.Event()

with stream1:
    cp.add(a, b, out=c)
    event.record()  # Record completion of operation

with stream2:
    stream2.wait_event(event)   # Wait for stream1 operation
    cp.multiply(c, 2.0, out=d)  # Use result from stream1

stream2.synchronize()
```

### Device Management

```python
# Query available devices
device_count = cp.cuda.runtime.getDeviceCount()
print(f"Available GPU devices: {device_count}")

# Get current device info
current_device = cp.cuda.Device()
print(f"Current device ID: {current_device.id}")
print(f"Compute capability: {current_device.compute_capability}")

# Multi-GPU operations (if multiple GPUs available)
if device_count > 1:
    # Use first GPU
    with cp.cuda.Device(0):
        array_gpu0 = cp.random.random((1000, 1000))
        result_gpu0 = cp.sum(array_gpu0)

    # Use second GPU
    with cp.cuda.Device(1):
        array_gpu1 = cp.random.random((1000, 1000))
        result_gpu1 = cp.sum(array_gpu1)

    print(f"Result from GPU 0: {result_gpu0}")
    print(f"Result from GPU 1: {result_gpu1}")
```