
# CuPy Extensions


Additional functionality through CuPy-X (the `cupyx` namespace), including SciPy compatibility, JIT compilation, specialized operations, and advanced GPU programming features. These extensions provide enhanced capabilities beyond core CuPy functionality.


## Capabilities


### Core Extensions


CuPy-specific mathematical functions and runtime utilities.

```python { .api }
def rsqrt(x, out=None):
    """Reciprocal square root function: 1 / sqrt(x).

    Optimized GPU implementation providing better performance
    than computing the reciprocal and square root separately.

    Parameters:
    - x: array-like, input array
    - out: ndarray, optional output array

    Returns:
    cupy.ndarray: reciprocal square root of input
    """

def get_runtime_info(full=False):
    """Get CuPy runtime information.

    Parameters:
    - full: bool, whether to include detailed information

    Returns:
    dict: runtime information including CUDA version, device details
    """
```

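A minimal usage sketch for these utilities (assumes a CUDA-capable device is available):

```python
import cupy as cp
import cupyx

x = cp.array([1.0, 4.0, 16.0], dtype=cp.float32)

# Fused reciprocal square root: 1 / sqrt(x)
y = cupyx.rsqrt(x)
print(y)  # [1.0, 0.5, 0.25]

# Inspect the runtime (CUDA version, devices, linked libraries)
info = cupyx.get_runtime_info()
print(info)
```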

### Scatter Operations


Efficient scatter operations for advanced indexing and updates.

```python { .api }
def scatter_add(a, indices, b, axis=None):
    """Add values to array at specified indices.

    Performs atomic addition operations, making it safe for
    concurrent updates to the same memory locations.

    Parameters:
    - a: ndarray, destination array to be updated
    - indices: array-like, indices where values should be added
    - b: array-like, values to add
    - axis: int, axis along which to scatter

    Returns:
    None: modifies array a in-place
    """

def scatter_max(a, indices, b, axis=None):
    """Take element-wise maximum between array values and scattered values.

    Parameters:
    - a: ndarray, destination array to be updated
    - indices: array-like, indices where comparison should occur
    - b: array-like, values to compare
    - axis: int, axis along which to scatter

    Returns:
    None: modifies array a in-place
    """

def scatter_min(a, indices, b, axis=None):
    """Take element-wise minimum between array values and scattered values.

    Parameters:
    - a: ndarray, destination array to be updated
    - indices: array-like, indices where comparison should occur
    - b: array-like, values to compare
    - axis: int, axis along which to scatter

    Returns:
    None: modifies array a in-place
    """
```


### Pinned Memory Arrays

Arrays in page-locked (pinned) host memory that are accessible to the GPU for efficient transfers.

```python { .api }
def empty_pinned(shape, dtype=float, order='C'):
    """Return new pinned array with uninitialized entries.

    Pinned memory enables faster transfers between CPU and GPU
    and allows for asynchronous memory operations.

    Parameters:
    - shape: int or sequence of ints, shape of array
    - dtype: data-type, type of array elements
    - order: memory layout order ('C' or 'F')

    Returns:
    numpy.ndarray: array in pinned host memory
    """

def zeros_pinned(shape, dtype=float, order='C'):
    """Return new pinned array of zeros.

    Parameters:
    - shape: int or sequence of ints, shape of array
    - dtype: data-type, type of array elements
    - order: memory layout order

    Returns:
    numpy.ndarray: zeros array in pinned host memory
    """

def empty_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """Return new pinned array with same shape and type as given array."""

def zeros_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """Return pinned array of zeros with same shape and type as given array."""
```


### Error State Management


Control floating-point error handling behavior.

```python { .api }
def errstate(**kwargs):
    """Context manager for floating-point error handling.

    Temporarily override how floating-point errors are handled,
    including division by zero, overflow, underflow, and invalid operations.

    Parameters:
    - all: str, set behavior for all error types
    - divide: str, behavior for division by zero
    - over: str, behavior for overflow
    - under: str, behavior for underflow
    - invalid: str, behavior for invalid operations

    Error behaviors: 'ignore', 'warn', 'raise', 'call', 'print', 'log'

    Returns:
    context manager
    """

def geterr():
    """Get current floating-point error handling settings.

    Returns:
    dict: current error handling settings for each error type
    """

def seterr(all=None, divide=None, over=None, under=None, invalid=None):
    """Set floating-point error handling behavior.

    Parameters:
    - all: str, set behavior for all error types
    - divide: str, behavior for division by zero
    - over: str, behavior for overflow
    - under: str, behavior for underflow
    - invalid: str, behavior for invalid operations

    Returns:
    dict: previous error handling settings
    """
```


### Synchronization Control


Advanced control over GPU synchronization behavior.

```python { .api }
def allow_synchronize(allow=True):
    """Context manager to control synchronization behavior.

    Allows fine-grained control over when GPU operations
    synchronize with the CPU, useful for performance optimization.

    Parameters:
    - allow: bool, whether to allow synchronization

    Returns:
    context manager
    """

class DeviceSynchronized(RuntimeError):
    """Raised when device synchronization is detected while disallowed.

    Inside an ``allow_synchronize(False)`` block, any operation that
    would synchronize the device with the host raises this error,
    which helps locate hidden synchronization points in
    performance-critical sections and multi-GPU applications.
    """
```

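A short sketch of using `allow_synchronize` to flag hidden synchronization points; here `float(...)` forces a device-to-host copy, which synchronizes:

```python
import cupy as cp
import cupyx

a = cp.arange(10)

# Disallow synchronization to surface hidden host transfers
with cupyx.allow_synchronize(False):
    try:
        # Converting to a Python float copies to the host and synchronizes
        total = float(cp.sum(a))
    except cupyx.DeviceSynchronized as e:
        print(f"Hidden synchronization detected: {e}")
```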

### JIT Compilation


Just-in-time compilation capabilities for dynamic kernel generation.

```python { .api }
def rawkernel(mode='python', device=False):
    """Decorator for creating raw kernels from Python functions.

    Enables writing CUDA kernels using Python syntax with JIT
    compilation to optimized GPU code.

    Parameters:
    - mode: str, compilation mode ('python', 'cuda')
    - device: bool, whether function runs on device

    Returns:
    callable: decorated kernel function
    """

# CUDA built-in variables (available inside JIT kernels)
threadIdx = None  # Thread index within block (x, y, z components)
blockDim = None   # Block dimensions (x, y, z components)
blockIdx = None   # Block index within grid (x, y, z components)
gridDim = None    # Grid dimensions (x, y, z components)

def syncthreads():
    """Synchronize all threads within a CUDA block.

    Ensures all threads in the current block reach this point
    before any thread continues execution.
    """

def shared_memory(dtype, shape):
    """Allocate shared memory array within CUDA block.

    Parameters:
    - dtype: data type of shared memory elements
    - shape: int or tuple, shape of shared memory array

    Returns:
    shared memory array accessible to all threads in block
    """

def atomic_add(array, index, value):
    """Perform atomic addition operation.

    Safely adds value to array[index] in a thread-safe manner,
    preventing race conditions in parallel execution.

    Parameters:
    - array: target array
    - index: index to update
    - value: value to add

    Returns:
    previous value at array[index]
    """
```


### SciPy Compatibility


Comprehensive SciPy-compatible functionality with GPU acceleration.

```python { .api }
def get_array_module(*args):
    """Get appropriate array module (NumPy or CuPy) for arguments.

    Automatically selects the correct module based on input types,
    enabling generic code that works with both CPU and GPU arrays.

    Parameters:
    - args: array arguments to analyze

    Returns:
    module: cupy or numpy module appropriate for inputs
    """
```

278

279

### Sparse Matrix Support

280

281

GPU-accelerated sparse matrix operations and formats.

```python { .api }
class spmatrix:
    """Base class for sparse matrices.

    Provides common interface for all sparse matrix formats
    with GPU-accelerated operations and memory efficiency.
    """

class csr_matrix:
    """Compressed Sparse Row matrix format.

    Efficient sparse matrix format for arithmetic operations
    and fast row slicing, optimized for GPU computation.
    """

    def __init__(self, arg1, shape=None, dtype=None, copy=False):
        """Initialize CSR sparse matrix.

        Parameters:
        - arg1: array-like, sparse matrix data
        - shape: tuple, matrix dimensions
        - dtype: data type
        - copy: bool, whether to copy data
        """

class csc_matrix:
    """Compressed Sparse Column matrix format.

    Efficient for column slicing and matrix-vector products
    where the matrix is accessed column-wise.
    """

class coo_matrix:
    """COOrdinate sparse matrix format.

    Efficient for sparse matrix construction and format conversion,
    stores (row, col, data) triplets.
    """

class dia_matrix:
    """DIAgonal sparse matrix format.

    Efficient storage for matrices with few diagonals,
    optimized for diagonal matrix operations.
    """

def issparse(x):
    """Check if input is a sparse matrix.

    Parameters:
    - x: input object to check

    Returns:
    bool: True if x is a sparse matrix
    """

def eye(m, n=None, k=0, dtype=float, format=None):
    """Create sparse identity matrix.

    Parameters:
    - m: int, number of rows
    - n: int, number of columns (defaults to m)
    - k: int, diagonal offset
    - dtype: data type
    - format: str, sparse matrix format

    Returns:
    sparse matrix: identity matrix in specified format
    """

def diags(diagonals, offsets=0, shape=None, format=None, dtype=None):
    """Construct sparse matrix from diagonals.

    Parameters:
    - diagonals: array-like, diagonal values
    - offsets: int or array-like, diagonal offsets
    - shape: tuple, matrix shape
    - format: str, output format
    - dtype: data type

    Returns:
    sparse matrix: matrix with specified diagonals
    """
```


### Advanced Extensions


Specialized functionality for scientific computing and machine learning.

```python { .api }
# Linear Algebra Extensions (cupyx.scipy.linalg)
class LinalgExtensions:
    """Extended linear algebra operations beyond NumPy/SciPy standard."""

# Signal Processing Extensions (cupyx.scipy.signal)
class SignalExtensions:
    """Signal processing functions with GPU acceleration."""

# Image Processing Extensions (cupyx.scipy.ndimage)
class NDImageExtensions:
    """N-dimensional image processing operations."""

# Special Functions Extensions (cupyx.scipy.special)
class SpecialExtensions:
    """Mathematical special functions with GPU support."""

# Statistics Extensions (cupyx.scipy.stats)
class StatsExtensions:
    """Statistical distributions and tests with GPU acceleration."""
```

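These submodules are used directly, mirroring their SciPy counterparts. A brief sketch (function availability varies with the CuPy version and the installed CUDA libraries):

```python
import cupy as cp
import cupyx.scipy.ndimage as ndi
import cupyx.scipy.special as special

# GPU Gaussian blur of a 2-D "image"
image = cp.random.random((256, 256), dtype=cp.float32)
blurred = ndi.gaussian_filter(image, sigma=2.0)
print(blurred.shape)  # (256, 256)

# GPU special functions, element-wise
x = cp.linspace(-3, 3, 7)
print(special.erf(x))
```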

## Usage Examples


### Scatter Operations

```python
import cupy as cp
import cupyx

# Create destination array
result = cp.zeros(10, dtype=cp.float32)
indices = cp.array([1, 3, 1, 7, 3])
values = cp.array([1.0, 2.0, 3.0, 4.0, 5.0])

# Scatter add - accumulates values at repeated indices
cupyx.scatter_add(result, indices, values)
print(f"After scatter_add: {result}")  # [0, 4, 0, 7, 0, 0, 0, 4, 0, 0]

# Scatter max - keeps maximum value at each index
result.fill(0)
cupyx.scatter_max(result, indices, values)
print(f"After scatter_max: {result}")  # [0, 3, 0, 5, 0, 0, 0, 4, 0, 0]

# Use in neural network-like operations
def sparse_embedding_lookup(embeddings, indices):
    """Embedding lookup that accumulates rows with scatter operations."""
    output = cp.zeros((len(indices), embeddings.shape[1]), dtype=embeddings.dtype)
    for i, idx in enumerate(indices.tolist()):  # host-side loop over indices
        cupyx.scatter_add(output[i:i+1], 0, embeddings[idx:idx+1])
    return output

# Example usage
embedding_dim = 128
vocab_size = 10000
embeddings = cp.random.normal(0, 1, (vocab_size, embedding_dim))
word_indices = cp.array([5, 42, 1337, 9999])

embedded = sparse_embedding_lookup(embeddings, word_indices)
print(f"Embedded shape: {embedded.shape}")
```


### JIT Compilation

```python
import cupy as cp
import cupyx.jit as jit

@jit.rawkernel()
def matrix_multiply_jit(A, B, C, M, N, K):
    """JIT-compiled matrix multiplication kernel."""
    i = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    j = jit.blockIdx.y * jit.blockDim.y + jit.threadIdx.y

    if i < M and j < N:
        temp = 0.0
        for k in range(K):
            temp += A[i, k] * B[k, j]
        C[i, j] = temp

# Example usage
M, N, K = 512, 512, 256
A = cp.random.random((M, K), dtype=cp.float32)
B = cp.random.random((K, N), dtype=cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

# Configure launch parameters
threads_per_block = (16, 16)
blocks_per_grid = ((M + 15) // 16, (N + 15) // 16)

# Launch JIT kernel
matrix_multiply_jit[blocks_per_grid, threads_per_block](A, B, C, M, N, K)

# Verify result
expected = cp.dot(A, B)
print(f"JIT result matches cp.dot: {cp.allclose(C, expected)}")

# Advanced JIT example with shared memory
@jit.rawkernel()
def reduction_with_shared_memory(input_array, output_array, n):
    """Parallel reduction using shared memory."""
    # Allocate shared memory
    shared_data = jit.shared_memory(cp.float32, 256)

    tid = jit.threadIdx.x
    bid = jit.blockIdx.x
    i = bid * jit.blockDim.x + tid

    # Load data into shared memory
    if i < n:
        shared_data[tid] = input_array[i]
    else:
        shared_data[tid] = 0.0

    jit.syncthreads()

    # Perform reduction in shared memory
    stride = jit.blockDim.x // 2
    while stride > 0:
        if tid < stride:
            shared_data[tid] += shared_data[tid + stride]
        jit.syncthreads()
        stride //= 2

    # Write result for this block
    if tid == 0:
        jit.atomic_add(output_array, 0, shared_data[0])

# Test reduction kernel
data = cp.random.random(10000, dtype=cp.float32)
result = cp.zeros(1, dtype=cp.float32)

block_size = 256
grid_size = (len(data) + block_size - 1) // block_size

reduction_with_shared_memory[grid_size, block_size](data, result, len(data))
print(f"Reduction result: {result[0]}")
print(f"Expected (cp.sum): {cp.sum(data)}")
```


### Sparse Matrix Operations

```python
import cupy as cp
import cupyx.scipy.sparse as sp

# Create sparse matrices
# COO format - good for construction
# (cupyx sparse matrices require floating-point or complex dtypes)
row = cp.array([0, 1, 2, 0, 1, 2])
col = cp.array([0, 1, 2, 1, 2, 0])
data = cp.array([1, 2, 3, 4, 5, 6], dtype=cp.float32)

coo_mat = sp.coo_matrix((data, (row, col)), shape=(3, 3))
print(f"COO matrix:\n{coo_mat.toarray()}")

# Convert to CSR for efficient arithmetic
csr_mat = coo_mat.tocsr()
print(f"CSR format - data: {csr_mat.data}")
print(f"CSR format - indices: {csr_mat.indices}")
print(f"CSR format - indptr: {csr_mat.indptr}")

# Sparse matrix operations
vector = cp.array([1, 2, 3], dtype=cp.float32)
result = csr_mat.dot(vector)  # Matrix-vector multiplication
print(f"Matrix-vector product: {result}")

# Create large sparse matrix efficiently
n = 10000
# Tridiagonal matrix
main_diag = cp.ones(n) * 2
off_diag = cp.ones(n - 1) * -1

tridiag = sp.diags([off_diag, main_diag, off_diag],
                   offsets=[-1, 0, 1],
                   shape=(n, n),
                   format='csr')

print(f"Tridiagonal matrix shape: {tridiag.shape}")
print(f"Non-zero elements: {tridiag.nnz}")

# Solve linear system (requires SciPy compatibility)
b = cp.ones(n)
# x = sp.linalg.spsolve(tridiag, b)  # Would solve Ax = b
```
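The commented `spsolve` line above hints at the sparse linear algebra layer. As a sketch, recent CuPy versions also expose iterative solvers such as `cupyx.scipy.sparse.linalg.cg` (availability depends on the CuPy version and CUDA libraries installed):

```python
import cupy as cp
import cupyx.scipy.sparse as sp
import cupyx.scipy.sparse.linalg as spla

n = 10000
main_diag = cp.ones(n) * 2
off_diag = cp.ones(n - 1) * -1
A = sp.diags([off_diag, main_diag, off_diag], offsets=[-1, 0, 1], format='csr')
b = cp.ones(n)

# Conjugate gradient fits here: this tridiagonal A is symmetric positive definite
x, info = spla.cg(A, b, tol=1e-8)
print(f"Converged: {info == 0}")
print(f"Residual norm: {float(cp.linalg.norm(A @ x - b)):.2e}")
```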


### Pinned Memory for Optimal Transfers

```python
import cupy as cp
import cupyx
import time
import numpy as np

# Compare regular vs pinned memory transfer performance
size = 10000000  # 10M elements

# Regular CPU array
regular_cpu = np.random.random(size).astype(np.float32)

# Pinned CPU array
pinned_cpu = cupyx.zeros_pinned(size, dtype=cp.float32)
pinned_cpu[:] = np.random.random(size).astype(np.float32)

# Benchmark transfers
def benchmark_transfer(cpu_array, n_iterations=100):
    """Benchmark CPU-GPU transfer performance."""
    times = []

    for _ in range(n_iterations):
        start = time.time()
        gpu_array = cp.asarray(cpu_array)
        cp.cuda.Stream.null.synchronize()
        times.append(time.time() - start)

    return np.mean(times), np.std(times)

# Regular memory transfer
regular_mean, regular_std = benchmark_transfer(regular_cpu)

# Pinned memory transfer
pinned_mean, pinned_std = benchmark_transfer(pinned_cpu)

print(f"Regular memory: {regular_mean:.4f} ± {regular_std:.4f} seconds")
print(f"Pinned memory: {pinned_mean:.4f} ± {pinned_std:.4f} seconds")
print(f"Speedup: {regular_mean / pinned_mean:.2f}x")

# Asynchronous transfer example
async_stream = cp.cuda.Stream()

with async_stream:
    # Start asynchronous transfer
    gpu_async = cp.asarray(pinned_cpu)

    # Do other work while transfer happens
    other_work = cp.random.random((1000, 1000))
    result = cp.sum(other_work)

# Synchronize to ensure transfer is complete
async_stream.synchronize()

print(f"Asynchronous transfer completed, result: {result}")
```


### Error State Management

```python
import cupy as cp
import cupyx

# Example with potential floating point errors
a = cp.array([1.0, 0.0, -1.0])
b = cp.array([0.0, 0.0, 0.0])

# Default behavior (may warn or ignore)
result1 = a / b
print(f"Default behavior: {result1}")  # [inf, nan, -inf]

# Temporarily change error handling
with cupyx.errstate(divide='raise', invalid='raise'):
    try:
        result2 = a / b
    except FloatingPointError as e:
        print(f"Caught error: {e}")

# Check current error state
current_state = cupyx.geterr()
print(f"Current error handling: {current_state}")

# Set specific error handling
old_state = cupyx.seterr(divide='warn', over='ignore', invalid='print')
print(f"Previous state: {old_state}")

# Example with overflow
large_numbers = cp.array([1e308, 1e308])
overflow_result = large_numbers * 10  # Will overflow to inf

# Restore previous state
cupyx.seterr(**old_state)

# Advanced error handling for numerical algorithms
def safe_divide(a, b, default=0.0):
    """Safe division with custom error handling."""
    with cupyx.errstate(divide='ignore', invalid='ignore'):
        result = a / b
        # Replace inf and nan with default value
        result = cp.where(cp.isfinite(result), result, default)
    return result

# Test safe division
numerator = cp.array([1.0, 2.0, 3.0])
denominator = cp.array([2.0, 0.0, 4.0])
safe_result = safe_divide(numerator, denominator, default=0.0)
print(f"Safe division result: {safe_result}")  # [0.5, 0.0, 0.75]
```


### High-Performance Computing Patterns

```python
import cupy as cp
import cupyx
import numpy as np
import time

def gpu_accelerated_monte_carlo(n_samples=10000000):
    """Monte Carlo π estimation with CuPy extensions."""

    # Avoid explicit square roots for better performance
    def estimate_pi_optimized(samples):
        x = cp.random.uniform(-1, 1, samples)
        y = cp.random.uniform(-1, 1, samples)

        # Comparing squared distances to 1 is equivalent to a
        # sqrt-based comparison and cheaper
        distances_squared = x * x + y * y
        inside_circle = distances_squared <= 1.0

        return 4.0 * cp.mean(inside_circle)

    # Benchmark with synchronization control
    with cupyx.allow_synchronize(False):
        start_time = time.time()
        pi_estimate = estimate_pi_optimized(n_samples)

    # Explicit synchronization when needed (outside the block,
    # where synchronization is allowed again)
    cp.cuda.Stream.null.synchronize()
    end_time = time.time()

    return float(pi_estimate), end_time - start_time

# Run Monte Carlo estimation
pi_est, computation_time = gpu_accelerated_monte_carlo(50000000)
error = abs(pi_est - cp.pi)

print(f"π estimate: {pi_est:.6f}")
print(f"Error: {error:.6f}")
print(f"Computation time: {computation_time:.4f} seconds")
print(f"Rate: {50000000 / computation_time / 1e6:.2f} M samples/second")

# Memory-efficient streaming computation
def streaming_mean_computation(total_size, chunk_size=1000000):
    """Compute mean of very large dataset using streaming."""
    running_sum = 0.0
    running_count = 0

    # Use pinned memory for efficient transfers
    chunk_pinned = cupyx.empty_pinned(chunk_size, dtype=cp.float32)

    for start_idx in range(0, total_size, chunk_size):
        end_idx = min(start_idx + chunk_size, total_size)
        current_chunk_size = end_idx - start_idx

        # Generate data in pinned memory
        chunk_pinned[:current_chunk_size] = np.random.normal(
            5.0, 2.0, current_chunk_size
        ).astype(np.float32)

        # Transfer to GPU and process
        gpu_chunk = cp.asarray(chunk_pinned[:current_chunk_size])
        chunk_sum = cp.sum(gpu_chunk)

        running_sum += float(chunk_sum)
        running_count += current_chunk_size

        if start_idx % (chunk_size * 10) == 0:
            current_mean = running_sum / running_count
            print(f"Processed {running_count:,} samples, current mean: {current_mean:.4f}")

    return running_sum / running_count

# Process 100M samples in chunks
final_mean = streaming_mean_computation(100000000, chunk_size=2000000)
print(f"Final streaming mean: {final_mean:.6f}")
print(f"Expected mean: 5.0")
```