# CuPy Extensions

Additional functionality provided by the `cupyx` namespace, including SciPy-compatible routines, JIT compilation, specialized operations, and advanced GPU programming features. These extensions provide capabilities beyond core CuPy functionality.

## Capabilities

### Core Extensions

CuPy-specific mathematical functions and runtime utilities.

```python { .api }
def rsqrt(x, out=None):
    """Reciprocal square root function: 1 / sqrt(x).

    Optimized GPU implementation providing better performance
    than computing the reciprocal and square root separately.

    Parameters:
    - x: array-like, input array
    - out: ndarray, optional output array

    Returns:
    cupy.ndarray: reciprocal square root of input
    """

def get_runtime_info(full=False):
    """Get CuPy runtime information.

    Parameters:
    - full: bool, whether to include detailed information

    Returns:
    dict: runtime information including CUDA version, device details
    """
```
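A quick usage sketch for the core helpers above; it assumes `cupyx.rsqrt` and `cupyx.get_runtime_info` are available in your CuPy build:

```python
import cupy as cp
import cupyx

# Reciprocal square root in a single fused kernel
x = cp.array([1.0, 4.0, 9.0], dtype=cp.float32)
print(cupyx.rsqrt(x))        # [1.0, 0.5, 0.333...]
print(1.0 / cp.sqrt(x))      # same values, but two separate kernels

# Inspect the runtime (CUDA version, devices, memory pools, ...)
print(cupyx.get_runtime_info())
```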
### Scatter Operations

Efficient scatter operations for advanced indexing and updates.

```python { .api }
def scatter_add(a, indices, b, axis=None):
    """Add values to array at specified indices.

    Performs atomic addition operations, making it safe for
    concurrent updates to the same memory locations.

    Parameters:
    - a: ndarray, destination array to be updated
    - indices: array-like, indices where values should be added
    - b: array-like, values to add
    - axis: int, axis along which to scatter

    Returns:
    None: modifies array a in-place
    """

def scatter_max(a, indices, b, axis=None):
    """Take element-wise maximum between array values and scattered values.

    Parameters:
    - a: ndarray, destination array to be updated
    - indices: array-like, indices where comparison should occur
    - b: array-like, values to compare
    - axis: int, axis along which to scatter

    Returns:
    None: modifies array a in-place
    """

def scatter_min(a, indices, b, axis=None):
    """Take element-wise minimum between array values and scattered values.

    Parameters:
    - a: ndarray, destination array to be updated
    - indices: array-like, indices where comparison should occur
    - b: array-like, values to compare
    - axis: int, axis along which to scatter

    Returns:
    None: modifies array a in-place
    """
```
### Pinned Memory Arrays

Arrays in host memory that are accessible to the GPU for efficient transfers.

```python { .api }
def empty_pinned(shape, dtype=float, order='C'):
    """Return new pinned array with uninitialized entries.

    Pinned memory enables faster transfers between CPU and GPU
    and allows for asynchronous memory operations.

    Parameters:
    - shape: int or sequence of ints, shape of array
    - dtype: data-type, type of array elements
    - order: memory layout order ('C' or 'F')

    Returns:
    numpy.ndarray: array in pinned host memory
    """

def zeros_pinned(shape, dtype=float, order='C'):
    """Return new pinned array of zeros.

    Parameters:
    - shape: int or sequence of ints, shape of array
    - dtype: data-type, type of array elements
    - order: memory layout order

    Returns:
    numpy.ndarray: zeros array in pinned host memory
    """

def empty_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """Return new pinned array with same shape and type as given array."""

def zeros_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """Return pinned array of zeros with same shape and type as given array."""
```
### Error State Management

Control floating-point error handling behavior.

```python { .api }
def errstate(**kwargs):
    """Context manager for floating-point error handling.

    Temporarily override how floating-point errors are handled,
    including division by zero, overflow, underflow, and invalid operations.

    Parameters:
    - all: str, set behavior for all error types
    - divide: str, behavior for division by zero
    - over: str, behavior for overflow
    - under: str, behavior for underflow
    - invalid: str, behavior for invalid operations

    Error behaviors: 'ignore', 'warn', 'raise', 'call', 'print', 'log'

    Returns:
    context manager
    """

def geterr():
    """Get current floating-point error handling settings.

    Returns:
    dict: current error handling settings for each error type
    """

def seterr(all=None, divide=None, over=None, under=None, invalid=None):
    """Set floating-point error handling behavior.

    Parameters:
    - all: str, set behavior for all error types
    - divide: str, behavior for division by zero
    - over: str, behavior for overflow
    - under: str, behavior for underflow
    - invalid: str, behavior for invalid operations

    Returns:
    dict: previous error handling settings
    """
```
### Synchronization Control

Advanced control over GPU synchronization behavior.

```python { .api }
def allow_synchronize(allow=True):
    """Context manager to control synchronization behavior.

    Allows fine-grained control over when GPU operations synchronize
    with the CPU, useful for performance optimization and for detecting
    unintended synchronization points.

    Parameters:
    - allow: bool, whether to allow synchronization

    Returns:
    context manager
    """

class DeviceSynchronized:
    """Exception raised when device synchronization happens while disallowed.

    Raised inside an allow_synchronize(False) block whenever an operation
    forces the device to synchronize, helping locate synchronization points
    in multi-GPU applications and performance-critical sections.
    """
```
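A small sketch of catching an accidental synchronization with the guard above; it assumes `DeviceSynchronized` is the exception raised when synchronization occurs inside an `allow_synchronize(False)` block:

```python
import cupy as cp
import cupyx

x = cp.arange(10)

try:
    with cupyx.allow_synchronize(False):
        y = x * 2                # stays on the GPU, no synchronization needed
        total = float(y.sum())   # device-to-host conversion forces a synchronize
except cupyx.DeviceSynchronized:
    print("caught an unintended synchronization point")
```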
### JIT Compilation

Just-in-time compilation capabilities for dynamic kernel generation.

```python { .api }
def rawkernel(mode='python', device=False):
    """Decorator for creating raw kernels from Python functions.

    Enables writing CUDA kernels using Python syntax with JIT
    compilation to optimized GPU code.

    Parameters:
    - mode: str, compilation mode ('python', 'cuda')
    - device: bool, whether function runs on device

    Returns:
    callable: decorated kernel function
    """

# CUDA Built-in Types and Functions
threadIdx = None  # Thread index within block (x, y, z components)
blockDim = None   # Block dimensions (x, y, z components)
blockIdx = None   # Block index within grid (x, y, z components)
gridDim = None    # Grid dimensions (x, y, z components)

def syncthreads():
    """Synchronize all threads within a CUDA block.

    Ensures all threads in the current block reach this point
    before any thread continues execution.
    """

def shared_memory(dtype, shape):
    """Allocate shared memory array within CUDA block.

    Parameters:
    - dtype: data type of shared memory elements
    - shape: int or tuple, shape of shared memory array

    Returns:
    shared memory array accessible to all threads in block
    """

def atomic_add(array, index, value):
    """Perform atomic addition operation.

    Safely adds value to array[index] in a thread-safe manner,
    preventing race conditions in parallel execution.

    Parameters:
    - array: target array
    - index: index to update
    - value: value to add

    Returns:
    previous value at array[index]
    """
```
### SciPy Compatibility

Comprehensive SciPy-compatible functionality with GPU acceleration.

```python { .api }
def get_array_module(*args):
    """Get appropriate array module (NumPy or CuPy) for arguments.

    Automatically selects the correct module based on input types,
    enabling generic code that works with both CPU and GPU arrays.

    Parameters:
    - args: array arguments to analyze

    Returns:
    module: cupy or numpy module appropriate for inputs
    """
```
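A short sketch of module dispatch, shown with `cupy.get_array_module`, which returns `numpy` or `cupy` depending on the inputs (the `cupyx.scipy.get_array_module` variant does the same for the SciPy namespaces):

```python
import numpy as np
import cupy as cp

def softmax(x):
    # Dispatch to numpy or cupy depending on where x lives
    xp = cp.get_array_module(x)
    e = xp.exp(x - x.max())
    return e / e.sum()

print(softmax(np.array([1.0, 2.0, 3.0])))  # computed on the CPU with NumPy
print(softmax(cp.array([1.0, 2.0, 3.0])))  # computed on the GPU with CuPy
```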
### Sparse Matrix Support

GPU-accelerated sparse matrix operations and formats.

```python { .api }
class spmatrix:
    """Base class for sparse matrices.

    Provides common interface for all sparse matrix formats
    with GPU-accelerated operations and memory efficiency.
    """

class csr_matrix:
    """Compressed Sparse Row matrix format.

    Efficient sparse matrix format for arithmetic operations
    and fast row slicing, optimized for GPU computation.
    """

    def __init__(self, arg1, shape=None, dtype=None, copy=False):
        """Initialize CSR sparse matrix.

        Parameters:
        - arg1: array-like, sparse matrix data
        - shape: tuple, matrix dimensions
        - dtype: data type
        - copy: bool, whether to copy data
        """

class csc_matrix:
    """Compressed Sparse Column matrix format.

    Efficient for column slicing and matrix-vector products
    where the matrix is accessed column-wise.
    """

class coo_matrix:
    """COOrdinate sparse matrix format.

    Efficient for sparse matrix construction and format conversion,
    stores (row, col, data) triplets.
    """

class dia_matrix:
    """DIAgonal sparse matrix format.

    Efficient storage for matrices with few diagonals,
    optimized for diagonal matrix operations.
    """

def issparse(x):
    """Check if input is a sparse matrix.

    Parameters:
    - x: input object to check

    Returns:
    bool: True if x is a sparse matrix
    """

def eye(m, n=None, k=0, dtype=float, format=None):
    """Create sparse identity matrix.

    Parameters:
    - m: int, number of rows
    - n: int, number of columns (defaults to m)
    - k: int, diagonal offset
    - dtype: data type
    - format: str, sparse matrix format

    Returns:
    sparse matrix: identity matrix in specified format
    """

def diags(diagonals, offsets=0, shape=None, format=None, dtype=None):
    """Construct sparse matrix from diagonals.

    Parameters:
    - diagonals: array-like, diagonal values
    - offsets: int or array-like, diagonal offsets
    - shape: tuple, matrix shape
    - format: str, output format
    - dtype: data type

    Returns:
    sparse matrix: matrix with specified diagonals
    """
```
### Advanced Extensions

Specialized functionality for scientific computing and machine learning.

```python { .api }
# Linear Algebra Extensions (cupyx.scipy.linalg)
class LinalgExtensions:
    """Extended linear algebra operations beyond NumPy/SciPy standard."""

# Signal Processing Extensions (cupyx.scipy.signal)
class SignalExtensions:
    """Signal processing functions with GPU acceleration."""

# Image Processing Extensions (cupyx.scipy.ndimage)
class NDImageExtensions:
    """N-dimensional image processing operations."""

# Special Functions Extensions (cupyx.scipy.special)
class SpecialExtensions:
    """Mathematical special functions with GPU support."""

# Statistics Extensions (cupyx.scipy.stats)
class StatsExtensions:
    """Statistical distributions and tests with GPU acceleration."""
```
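These submodules are drop-in, GPU-backed counterparts of their SciPy namesakes. A brief sketch, assuming `cupyx.scipy.ndimage.gaussian_filter` and `cupyx.scipy.special.erf` are available in your build (both mirror the SciPy signatures):

```python
import cupy as cp
import cupyx.scipy.ndimage as ndi
import cupyx.scipy.special as special

# Smooth a noisy 2-D image on the GPU (mirrors scipy.ndimage.gaussian_filter)
image = cp.random.random((512, 512), dtype=cp.float32)
smoothed = ndi.gaussian_filter(image, sigma=2.0)
print(smoothed.shape)

# Evaluate a special function element-wise (mirrors scipy.special.erf)
x = cp.linspace(-3, 3, 7)
print(special.erf(x))
```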
## Usage Examples

### Scatter Operations

```python
import cupy as cp
import cupyx

# Create destination array
result = cp.zeros(10, dtype=cp.float32)
indices = cp.array([1, 3, 1, 7, 3])
values = cp.array([1.0, 2.0, 3.0, 4.0, 5.0])

# Scatter add - accumulates values at repeated indices
cupyx.scatter_add(result, indices, values)
print(f"After scatter_add: {result}")  # [0, 4, 0, 7, 0, 0, 0, 4, 0, 0]

# Scatter max - keeps maximum value at each index
result.fill(0)
cupyx.scatter_max(result, indices, values)
print(f"After scatter_max: {result}")  # [0, 3, 0, 5, 0, 0, 0, 4, 0, 0]

# Use in neural network-like operations
def sparse_embedding_lookup(embeddings, indices):
    """Embedding lookup built from scatter operations (for illustration)."""
    output = cp.zeros((len(indices), embeddings.shape[1]), dtype=embeddings.dtype)
    for i, idx in enumerate(indices):
        cupyx.scatter_add(output[i:i+1], 0, embeddings[idx:idx+1], axis=0)
    return output

# Example usage
embedding_dim = 128
vocab_size = 10000
embeddings = cp.random.normal(0, 1, (vocab_size, embedding_dim))
word_indices = cp.array([5, 42, 1337, 9999])

embedded = sparse_embedding_lookup(embeddings, word_indices)
print(f"Embedded shape: {embedded.shape}")
```
### JIT Compilation

```python
import cupy as cp
import cupyx.jit as jit

@jit.rawkernel()
def matrix_multiply_jit(A, B, C, M, N, K):
    """JIT-compiled matrix multiplication kernel."""
    i = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    j = jit.blockIdx.y * jit.blockDim.y + jit.threadIdx.y

    if i < M and j < N:
        temp = 0.0
        for k in range(K):
            temp += A[i, k] * B[k, j]
        C[i, j] = temp

# Example usage
M, N, K = 512, 512, 256
A = cp.random.random((M, K), dtype=cp.float32)
B = cp.random.random((K, N), dtype=cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

# Configure launch parameters
threads_per_block = (16, 16)
blocks_per_grid = ((M + 15) // 16, (N + 15) // 16)

# Launch JIT kernel
matrix_multiply_jit[blocks_per_grid, threads_per_block](A, B, C, M, N, K)

# Verify result
expected = cp.dot(A, B)
print(f"JIT result matches cp.dot: {cp.allclose(C, expected)}")

# Advanced JIT example with shared memory
@jit.rawkernel()
def reduction_with_shared_memory(input_array, output_array, n):
    """Parallel reduction using shared memory."""
    # Allocate shared memory
    shared_data = jit.shared_memory(cp.float32, 256)

    tid = jit.threadIdx.x
    bid = jit.blockIdx.x
    i = bid * jit.blockDim.x + tid

    # Load data into shared memory
    if i < n:
        shared_data[tid] = input_array[i]
    else:
        shared_data[tid] = 0.0

    jit.syncthreads()

    # Perform reduction in shared memory
    stride = jit.blockDim.x // 2
    while stride > 0:
        if tid < stride:
            shared_data[tid] += shared_data[tid + stride]
        jit.syncthreads()
        stride //= 2

    # Write result for this block
    if tid == 0:
        jit.atomic_add(output_array, 0, shared_data[0])

# Test reduction kernel
data = cp.random.random(10000, dtype=cp.float32)
result = cp.zeros(1, dtype=cp.float32)

block_size = 256
grid_size = (len(data) + block_size - 1) // block_size

reduction_with_shared_memory[grid_size, block_size](data, result, len(data))
print(f"Reduction result: {result[0]}")
print(f"Expected (cp.sum): {cp.sum(data)}")
```
### Sparse Matrix Operations

```python
import cupy as cp
import cupyx.scipy.sparse as sp

# Create sparse matrices
# COO format - good for construction
row = cp.array([0, 1, 2, 0, 1, 2])
col = cp.array([0, 1, 2, 1, 2, 0])
# cupyx sparse matrices expect floating-point (or complex) data
data = cp.array([1, 2, 3, 4, 5, 6], dtype=cp.float32)

coo_matrix = sp.coo_matrix((data, (row, col)), shape=(3, 3))
print(f"COO matrix:\n{coo_matrix.toarray()}")

# Convert to CSR for efficient arithmetic
csr_matrix = coo_matrix.tocsr()
print(f"CSR format - data: {csr_matrix.data}")
print(f"CSR format - indices: {csr_matrix.indices}")
print(f"CSR format - indptr: {csr_matrix.indptr}")

# Sparse matrix operations
vector = cp.array([1.0, 2.0, 3.0], dtype=cp.float32)
result = csr_matrix.dot(vector)  # Matrix-vector multiplication
print(f"Matrix-vector product: {result}")

# Create large sparse matrix efficiently
n = 10000
# Tridiagonal matrix
main_diag = cp.ones(n) * 2
off_diag = cp.ones(n - 1) * -1

tridiag = sp.diags([off_diag, main_diag, off_diag],
                   offsets=[-1, 0, 1],
                   shape=(n, n),
                   format='csr')

print(f"Tridiagonal matrix shape: {tridiag.shape}")
print(f"Non-zero elements: {tridiag.nnz}")

# Solve linear system (requires cupyx.scipy.sparse.linalg)
b = cp.ones(n)
# x = sp.linalg.spsolve(tridiag, b)  # Would solve Ax = b
```
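A minimal sketch of actually solving the tridiagonal system above, assuming `cupyx.scipy.sparse.linalg` with `spsolve` (and the iterative `cg` solver) is available in your CuPy build:

```python
import cupy as cp
import cupyx.scipy.sparse as sp
import cupyx.scipy.sparse.linalg as splinalg

n = 10000
tridiag = sp.diags([cp.full(n - 1, -1.0), cp.full(n, 2.0), cp.full(n - 1, -1.0)],
                   offsets=[-1, 0, 1], format='csr')
b = cp.ones(n)

# Direct solve of the sparse system Ax = b
x = splinalg.spsolve(tridiag, b)

# Iterative alternative (conjugate gradient) for symmetric positive-definite A
x_cg, info = splinalg.cg(tridiag, b)

# Residual check for the direct solve
print(float(cp.abs(tridiag.dot(x) - b).max()), info)
```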
### Pinned Memory for Optimal Transfers

```python
import cupy as cp
import cupyx
import numpy as np
import time

# Compare regular vs pinned memory transfer performance
size = 10000000  # 10M elements

# Regular CPU array
regular_cpu = np.random.random(size).astype(np.float32)

# Pinned CPU array
pinned_cpu = cupyx.zeros_pinned(size, dtype=cp.float32)
pinned_cpu[:] = np.random.random(size).astype(np.float32)

# Benchmark transfers
def benchmark_transfer(cpu_array, n_iterations=100):
    """Benchmark CPU-GPU transfer performance."""
    times = []

    for _ in range(n_iterations):
        start = time.time()
        gpu_array = cp.asarray(cpu_array)
        cp.cuda.Stream.null.synchronize()
        times.append(time.time() - start)

    return np.mean(times), np.std(times)

# Regular memory transfer
regular_mean, regular_std = benchmark_transfer(regular_cpu)

# Pinned memory transfer
pinned_mean, pinned_std = benchmark_transfer(pinned_cpu)

print(f"Regular memory: {regular_mean:.4f} ± {regular_std:.4f} seconds")
print(f"Pinned memory: {pinned_mean:.4f} ± {pinned_std:.4f} seconds")
print(f"Speedup: {regular_mean / pinned_mean:.2f}x")

# Asynchronous transfer example
async_stream = cp.cuda.Stream()

with async_stream:
    # Start asynchronous transfer
    gpu_async = cp.asarray(pinned_cpu)

    # Do other work while transfer happens
    other_work = cp.random.random((1000, 1000))
    result = cp.sum(other_work)

# Synchronize to ensure transfer is complete
async_stream.synchronize()

print(f"Asynchronous transfer completed, result: {result}")
```
### Error State Management

```python
import cupy as cp
import cupyx

# Example with potential floating point errors
a = cp.array([1.0, 0.0, -1.0])
b = cp.array([0.0, 0.0, 0.0])

# Default behavior (may warn or ignore)
result1 = a / b
print(f"Default behavior: {result1}")  # [inf, nan, -inf]

# Temporarily change error handling
with cupyx.errstate(divide='raise', invalid='raise'):
    try:
        result2 = a / b
    except FloatingPointError as e:
        print(f"Caught error: {e}")

# Check current error state
current_state = cupyx.geterr()
print(f"Current error handling: {current_state}")

# Set specific error handling
old_state = cupyx.seterr(divide='warn', over='ignore', invalid='print')
print(f"Previous state: {old_state}")

# Example with overflow
large_numbers = cp.array([1e308, 1e308])
overflow_result = large_numbers * 10  # Will overflow to inf

# Restore previous state
cupyx.seterr(**old_state)

# Advanced error handling for numerical algorithms
def safe_divide(a, b, default=0.0):
    """Safe division with custom error handling."""
    with cupyx.errstate(divide='ignore', invalid='ignore'):
        result = a / b
        # Replace inf and nan with default value
        result = cp.where(cp.isfinite(result), result, default)
    return result

# Test safe division
numerator = cp.array([1.0, 2.0, 3.0])
denominator = cp.array([2.0, 0.0, 4.0])
safe_result = safe_divide(numerator, denominator, default=0.0)
print(f"Safe division result: {safe_result}")  # [0.5, 0.0, 0.75]
```
### High-Performance Computing Patterns

```python
import cupy as cp
import cupyx
import numpy as np
import time

def gpu_accelerated_monte_carlo(n_samples=10000000):
    """Monte Carlo π estimation with CuPy extensions."""

    def estimate_pi_optimized(samples):
        x = cp.random.uniform(-1, 1, samples)
        y = cp.random.uniform(-1, 1, samples)

        # Compare squared distances directly; this avoids computing sqrt at all
        distances_squared = x*x + y*y
        inside_circle = distances_squared <= 1.0

        return 4.0 * cp.mean(inside_circle)

    # Guard against unintended synchronization while the work is queued
    with cupyx.allow_synchronize(False):
        start_time = time.time()
        pi_estimate = estimate_pi_optimized(n_samples)

    # Explicit synchronization once outside the guarded section
    cp.cuda.Stream.null.synchronize()
    end_time = time.time()

    return float(pi_estimate), end_time - start_time

# Run Monte Carlo estimation
pi_est, computation_time = gpu_accelerated_monte_carlo(50000000)
error = abs(pi_est - cp.pi)

print(f"π estimate: {pi_est:.6f}")
print(f"Error: {error:.6f}")
print(f"Computation time: {computation_time:.4f} seconds")
print(f"Rate: {50000000 / computation_time / 1e6:.2f} M samples/second")

# Memory-efficient streaming computation
def streaming_mean_computation(total_size, chunk_size=1000000):
    """Compute mean of very large dataset using streaming."""
    running_sum = 0.0
    running_count = 0

    # Use pinned memory for efficient transfers
    chunk_pinned = cupyx.empty_pinned(chunk_size, dtype=cp.float32)

    for start_idx in range(0, total_size, chunk_size):
        end_idx = min(start_idx + chunk_size, total_size)
        current_chunk_size = end_idx - start_idx

        # Generate data in pinned memory
        chunk_pinned[:current_chunk_size] = np.random.normal(
            5.0, 2.0, current_chunk_size
        ).astype(np.float32)

        # Transfer to GPU and process
        gpu_chunk = cp.asarray(chunk_pinned[:current_chunk_size])
        chunk_sum = cp.sum(gpu_chunk)

        running_sum += float(chunk_sum)
        running_count += current_chunk_size

        if start_idx % (chunk_size * 10) == 0:
            current_mean = running_sum / running_count
            print(f"Processed {running_count:,} samples, current mean: {current_mean:.4f}")

    return running_sum / running_count

# Process 100M samples in chunks
final_mean = streaming_mean_computation(100000000, chunk_size=2000000)
print(f"Final streaming mean: {final_mean:.6f}")
print(f"Expected mean: 5.0")
```