# CUDA Interface

Low-level CUDA functionality providing direct access to GPU device management, memory allocation, stream control, and integration with CUDA libraries. Enables fine-grained control over GPU resources and execution.

## Capabilities

### Device Management

Control and query GPU devices for multi-GPU computing.

```python { .api }
class Device:
    """
    CUDA device context manager.

    Parameters:
    - device: int or None, device ID to use
    """
    def __init__(self, device=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...

    @property
    def id(self) -> int:
        """Device ID."""

    def synchronize(self):
        """Synchronize the device."""

    def use(self):
        """Make this device current."""

def get_device_id() -> int:
    """Get current device ID."""

def set_device_id(device_id: int):
    """Set current device ID."""

def get_device_count() -> int:
    """Get number of available CUDA devices."""

def is_available() -> bool:
    """Check if CUDA is available."""

def get_compute_capability(device=None) -> tuple:
    """Get compute capability of device."""

def get_device_properties(device=None) -> dict:
    """Get properties of CUDA device."""
```

### Memory Management

Advanced GPU memory allocation and management with memory pools.

```python { .api }
class MemoryPool:
    """
    GPU memory pool for efficient allocation.
    """
    def __init__(self): ...

    def malloc(self, size: int):
        """
        Allocate GPU memory.

        Parameters:
        - size: int, number of bytes to allocate

        Returns:
        MemoryPointer: Pointer to allocated memory
        """

    def free_all_blocks(self):
        """Free all memory blocks in pool."""

    def free_all_free_blocks(self):
        """Free all unused memory blocks."""

    def get_limit(self) -> int:
        """Get memory pool size limit."""

    def set_limit(self, size: int):
        """Set memory pool size limit."""

    def used_bytes(self) -> int:
        """Number of bytes currently in use."""

    def total_bytes(self) -> int:
        """Total number of bytes allocated."""

class PinnedMemoryPool:
    """
    Pinned memory pool for CPU memory.
    """
    def __init__(self): ...
    def malloc(self, size: int): ...
    def free_all_blocks(self): ...

class MemoryPointer:
    """
    Pointer to GPU memory.
    """
    def __init__(self, mem, offset): ...

    @property
    def device(self) -> Device: ...

    @property
    def ptr(self) -> int:
        """Raw pointer value."""

    def copy_from_device(self, src, size): ...
    def copy_from_host(self, src, size): ...
    def copy_to_host(self, dst, size): ...

def get_allocator():
    """Get current memory allocator function."""

def set_allocator(allocator=None):
    """Set memory allocator function."""

def get_pinned_memory_allocator():
    """Get current pinned memory allocator."""

def set_pinned_memory_allocator(allocator=None):
    """Set pinned memory allocator function."""

def malloc(size: int) -> MemoryPointer:
    """Allocate GPU memory."""

def free(ptr: MemoryPointer):
    """Free GPU memory."""

def malloc_managed(size: int) -> MemoryPointer:
    """Allocate unified memory."""

def mem_info() -> tuple:
    """Get memory information (free, total)."""
```

### Stream Management

CUDA streams for asynchronous execution and memory transfers.

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous execution.

    Parameters:
    - null: bool, create null stream
    - non_blocking: bool, create non-blocking stream
    - ptds: bool, per-thread default stream
    """
    def __init__(self, null=False, non_blocking=False, ptds=False): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...

    def synchronize(self):
        """Synchronize stream execution."""

    def add_callback(self, callback, arg=None):
        """Add callback to stream."""

    def record(self, event=None):
        """Record event in stream."""

    def wait_event(self, event):
        """Make stream wait for event."""

    @property
    def ptr(self) -> int:
        """Raw stream pointer."""

def get_current_stream() -> Stream:
    """Get current CUDA stream."""

def get_default_stream() -> Stream:
    """Get default CUDA stream."""
```

### Event Management

CUDA events for synchronization and timing.

```python { .api }
class Event:
    """
    CUDA event for synchronization.

    Parameters:
    - blocking: bool, create blocking event
    - disable_timing: bool, disable timing capability
    - interprocess: bool, enable interprocess sharing
    """
    def __init__(self, blocking=False, disable_timing=False, interprocess=False): ...

    def record(self, stream=None):
        """Record event in stream."""

    def synchronize(self):
        """Synchronize on event."""

    def elapsed_time(self, end_event) -> float:
        """Compute elapsed time to another event."""

    @property
    def ptr(self) -> int:
        """Raw event pointer."""

def synchronize():
    """Synchronize all CUDA operations."""
```

### CUDA Library Interfaces

Access to major CUDA libraries for specialized computations.

```python { .api }
# cuBLAS - Basic Linear Algebra Subprograms
class cublas:
    """cuBLAS library interface."""

    @staticmethod
    def getVersion() -> int: ...

    @staticmethod
    def create() -> int: ...

    @staticmethod
    def destroy(handle: int): ...

# cuSOLVER - Dense and Sparse Linear Algebra
class cusolver:
    """cuSOLVER library interface."""

    @staticmethod
    def getVersion() -> tuple: ...

# cuSPARSE - Sparse Matrix Operations
class cusparse:
    """cuSPARSE library interface."""

    @staticmethod
    def getVersion() -> int: ...

# cuRAND - Random Number Generation
class curand:
    """cuRAND library interface."""

    @staticmethod
    def getVersion() -> int: ...

# cuFFT - Fast Fourier Transform
class cufft:
    """cuFFT library interface."""

    @staticmethod
    def getVersion() -> int: ...

# NCCL - Collective Communications
class nccl:
    """NCCL library interface."""

    @staticmethod
    def get_version() -> int: ...
```
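
A minimal sketch of exercising these interfaces, using only the names declared above. The `cp.cuda.<library>` access path and the handle-free `getVersion()` signatures are assumptions of this spec, not necessarily the underlying CUDA libraries' exact APIs:

```python
import cupy as cp

# Report the version of each wrapped CUDA library
print("cuBLAS:", cp.cuda.cublas.getVersion())
print("cuSOLVER:", cp.cuda.cusolver.getVersion())
print("cuFFT:", cp.cuda.cufft.getVersion())

# cuBLAS handles are created and destroyed explicitly
handle = cp.cuda.cublas.create()
try:
    pass  # pass `handle` to cuBLAS routines here
finally:
    cp.cuda.cublas.destroy(handle)
```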

### Runtime Information

Query CUDA runtime and driver information.

```python { .api }
def get_cuda_path() -> str:
    """Get CUDA installation path."""

def get_nvcc_path() -> str:
    """Get nvcc compiler path."""

def runtime_version() -> int:
    """Get CUDA runtime version."""

def driver_version() -> int:
    """Get CUDA driver version."""

def get_local_mem_info() -> dict:
    """Get local memory information."""

def get_memory_info() -> tuple:
    """Get device memory information."""
```
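
A short sketch of these queries, assuming the functions are exposed under `cp.cuda` as declared above:

```python
import cupy as cp

# Toolchain locations
print("CUDA path:", cp.cuda.get_cuda_path())
print("nvcc path:", cp.cuda.get_nvcc_path())

# Versions are encoded as integers (1000 * major + 10 * minor,
# e.g. 12020 for CUDA 12.2)
print("Runtime:", cp.cuda.runtime_version())
print("Driver:", cp.cuda.driver_version())

# Free and total device memory in bytes
free, total = cp.cuda.get_memory_info()
print(f"{free / 2**30:.2f} GiB free of {total / 2**30:.2f} GiB")
```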

## Usage Examples

### Device Management

```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print(f"CUDA devices available: {cp.cuda.get_device_count()}")

# Use a specific device
with cp.cuda.Device(0):
    # Operations run on device 0
    data = cp.zeros((1000, 1000))
    result = cp.sum(data)

# Switch devices
cp.cuda.set_device_id(1)
data_dev1 = cp.ones((500, 500))
```
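
The query helpers declared above fit naturally here; a brief sketch, assuming `get_compute_capability` returns a `(major, minor)` tuple as the interface suggests:

```python
# Inspect each visible device before choosing where to run
for device_id in range(cp.cuda.get_device_count()):
    major, minor = cp.cuda.get_compute_capability(device_id)
    props = cp.cuda.get_device_properties(device_id)
    print(f"GPU {device_id}: compute capability {major}.{minor}")
    print(f"  properties: {props}")
```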

### Memory Management

```python
# Use the default memory pools
memory_pool = cp.get_default_memory_pool()
pinned_memory_pool = cp.get_default_pinned_memory_pool()

# Monitor memory usage
print(f"Used: {memory_pool.used_bytes()} bytes")
print(f"Total: {memory_pool.total_bytes()} bytes")

# Set memory limit
memory_pool.set_limit(size=2**30)  # 1 GiB limit

# Free unused memory
memory_pool.free_all_free_blocks()

# Direct memory allocation
ptr = cp.cuda.malloc(1024)  # Allocate 1 KiB
cp.cuda.free(ptr)  # Free memory
```
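
To route all subsequent allocations through a pool of your own, install it as the allocator; a minimal sketch using `set_allocator` from the interface above:

```python
# Create a dedicated pool and make it the global allocator
pool = cp.cuda.MemoryPool()
cp.cuda.set_allocator(pool.malloc)

# Subsequent array allocations draw from `pool`
x = cp.arange(10)
print(pool.used_bytes(), "bytes in use")

# Reset the allocator (per the signature above, None is the default)
cp.cuda.set_allocator(None)
```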

### Asynchronous Operations with Streams

```python
# Create streams for concurrent execution
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Asynchronous operations
with stream1:
    data1 = cp.random.random((1000, 1000))
    result1 = cp.dot(data1, data1.T)

with stream2:
    data2 = cp.random.random((1000, 1000))
    result2 = cp.linalg.svd(data2)

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Event-based synchronization
event = cp.cuda.Event()
with stream1:
    event.record()

with stream2:
    stream2.wait_event(event)  # Wait for stream1
```
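
Host-side callbacks can also be attached to a stream via the `add_callback` method declared above; a sketch, where the three-argument callback shape is an assumption of this example:

```python
def on_done(stream, status, user_data):
    # Runs on the host once prior work in the stream has finished
    print(f"stream finished ({user_data}), status={status}")

with stream1:
    _ = cp.sum(cp.arange(10**6))
stream1.add_callback(on_done, "reduction")
stream1.synchronize()
```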

### Performance Timing

```python
# Time operations using events
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

start_event.record()

# GPU operations
data = cp.random.random((5000, 5000))
result = cp.linalg.inv(data)

end_event.record()
end_event.synchronize()

elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)
print(f"Operation took {elapsed_time:.2f} ms")
```
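
Equivalently, the `Event.elapsed_time` method declared in the API section above computes the same interval:

```python
print(f"Operation took {start_event.elapsed_time(end_event):.2f} ms")
```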

### Memory Transfer Control

```python
import numpy as np

# Pinned (page-locked) host memory for faster transfers
pinned_mem = cp.cuda.PinnedMemoryPool().malloc(1024)

# Host-to-device transfer
cpu_data = np.random.random((1000, 1000))
gpu_data = cp.asarray(cpu_data)  # CPU to GPU

# Transfer back to CPU asynchronously
stream = cp.cuda.Stream()
cpu_result = cp.asnumpy(gpu_data, stream=stream)
stream.synchronize()
```
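
Pinning can also be applied globally so host staging buffers come from a pinned pool; a minimal sketch with `set_pinned_memory_allocator` from the interface above:

```python
# Serve pinned host memory from a dedicated pool
pinned_pool = cp.cuda.PinnedMemoryPool()
cp.cuda.set_pinned_memory_allocator(pinned_pool.malloc)

# Transfers that stage through host memory can now use pinned buffers
gpu = cp.asarray(np.ones((256, 256)))
```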

### Multi-GPU Computing

```python
# Distribute computation across multiple GPUs
n_devices = cp.cuda.get_device_count()

if n_devices > 1:
    # Split work across devices
    data_size = 10000
    chunk_size = data_size // n_devices

    results = []
    streams = []

    for device_id in range(n_devices):
        with cp.cuda.Device(device_id):
            stream = cp.cuda.Stream()
            streams.append(stream)

            with stream:
                start = device_id * chunk_size
                end = start + chunk_size
                chunk = cp.arange(start, end)
                result = cp.sum(chunk ** 2)
                results.append(result)

    # Synchronize all devices (each stream on its own device)
    for device_id, stream in enumerate(streams):
        with cp.cuda.Device(device_id):
            stream.synchronize()

    # Combine results
    total_result = sum(cp.asnumpy(r) for r in results)
```