0
# Extended Functionality (cupyx)
1
2
The `cupyx` namespace provides functionality that goes beyond CuPy's NumPy-compatible core: GPU-specific array operations, pinned host memory helpers, SciPy-compatible functions (`cupyx.scipy`), JIT compilation of custom kernels (`cupyx.jit`), and profiling and benchmarking utilities (`cupyx.profiler`).
3
4
## Capabilities
5
6
### Enhanced Array Operations
7
8
Specialized operations not available in standard NumPy.
9
10
```python { .api }
11
def scatter_add(a, indices, updates, axis=None):
12
"""
13
Add updates to array at specified indices.
14
15
Parameters:
16
- a: cupy.ndarray, target array to update
17
- indices: cupy.ndarray, indices where updates are applied
18
- updates: cupy.ndarray, values to add
19
- axis: int or None, axis along which to scatter
20
21
Returns:
22
None: `a` is updated in place; no new array is returned
23
"""
24
25
def scatter_max(a, indices, updates, axis=None):
26
"""Apply element-wise maximum at scattered indices."""
27
28
def scatter_min(a, indices, updates, axis=None):
29
"""Apply element-wise minimum at scattered indices."""
30
31
def rsqrt(x):
32
"""
33
Reciprocal square root (1/sqrt(x)).
34
35
Parameters:
36
- x: array-like, input array with positive values
37
38
Returns:
39
cupy.ndarray: Reciprocal square root of each element
40
"""
41
```
42
43
### Error State Management
44
45
Control floating-point error handling behavior.
46
47
```python { .api }
48
def errstate(**kwargs):
49
"""
50
Context manager for floating-point error handling.
51
52
Parameters:
53
- all: str, set behavior for all error types
54
- divide: str, behavior for division by zero
55
- over: str, behavior for overflow
56
- under: str, behavior for underflow
57
- invalid: str, behavior for invalid operations
58
59
Error behaviors: 'ignore', 'warn', 'raise', 'call', 'print', 'log'
60
61
Usage:
62
with cupyx.errstate(divide='ignore'):
63
result = a / b # Division by zero won't raise error
64
"""
65
66
def geterr():
67
"""
68
Get current error handling behavior.
69
70
Returns:
71
dict: Current error handling settings
72
"""
73
74
def seterr(**kwargs):
75
"""
76
Set error handling behavior.
77
78
Returns:
79
dict: Previous error handling settings
80
"""
81
```
82
83
### Synchronization Control
84
85
Control whether operations are allowed to synchronize the GPU with the CPU.
86
87
```python { .api }
88
def allow_synchronize(allow=True):
89
"""
90
Context manager to control synchronization behavior.
91
92
Parameters:
93
- allow: bool, whether to allow synchronization
94
95
Usage:
96
with cupyx.allow_synchronize(False):
97
# Any device synchronization inside this block raises cupyx.DeviceSynchronized
98
result = cupy.matmul(a, b)
99
"""
100
101
class DeviceSynchronized:
102
"""Context manager for device synchronization."""
103
def __enter__(self):
104
"""Enter synchronized context."""
105
106
def __exit__(self, *args):
107
"""Exit synchronized context."""
108
```
109
110
### Pinned Memory Arrays
111
112
Create arrays in pinned host memory for faster GPU transfers.
113
114
```python { .api }
115
def empty_pinned(shape, dtype=float, order='C'):
116
"""
117
Create empty array in pinned host memory.
118
119
Parameters:
120
- shape: int or tuple, array shape
121
- dtype: data type, array data type
122
- order: {'C', 'F'}, memory layout
123
124
Returns:
125
numpy.ndarray: Pinned memory array
126
"""
127
128
def empty_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
129
"""Create empty pinned array with same shape as existing array."""
130
131
def zeros_pinned(shape, dtype=float, order='C'):
132
"""Create zeros array in pinned host memory."""
133
134
def zeros_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
135
"""Create zeros pinned array with same shape as existing array."""
136
```
137
138
### Generalized Universal Functions
139
140
Create custom ufuncs with advanced broadcasting and type handling.
141
142
```python { .api }
143
class GeneralizedUFunc:
144
"""
145
Create generalized universal function.
146
147
Parameters:
148
- definition: str, function signature and operation
149
- name: str, function name
150
- doc: str, documentation string
151
"""
152
def __init__(self, definition, name=None, doc=None): ...
153
154
def __call__(self, *args, **kwargs):
155
"""Execute generalized ufunc."""
156
```
157
158
### Runtime Information
159
160
Get detailed information about CuPy runtime environment.
161
162
```python { .api }
163
def get_runtime_info(full=False):
164
"""
165
Get CuPy runtime information.
166
167
Parameters:
168
- full: bool, include detailed information
169
170
Returns:
171
str: Runtime information including CUDA version, device info, memory usage
172
"""
173
```
174
175
### SciPy-Compatible Functions (cupyx.scipy)
176
177
GPU-accelerated versions of SciPy functionality.
178
179
```python { .api }
180
def get_array_module(*args):
181
"""
182
Get appropriate array module for SciPy functions.
183
184
Returns:
185
module: cupyx.scipy if CuPy arrays present, otherwise scipy
186
"""
187
```
188
189
### JIT Compilation (cupyx.jit)
190
191
Just-in-time compilation for custom GPU kernels.
192
193
```python { .api }
194
def rawkernel(func=None, *, device=False):
195
"""
196
Decorator for JIT compilation of raw CUDA kernels.
197
198
Parameters:
199
- func: function, kernel function to compile
200
- device: bool, whether this is a device function
201
202
Usage:
203
@cupyx.jit.rawkernel()
204
def my_kernel(x, y, size):
205
tid = jit.threadIdx.x + jit.blockIdx.x * jit.blockDim.x
206
if tid < size:
207
y[tid] = x[tid] * 2
208
"""
209
210
# CUDA threading model access
211
threadIdx: object # Thread index within block
212
blockDim: object # Block dimensions
213
blockIdx: object # Block index within grid
214
gridDim: object # Grid dimensions
215
warpsize: int # Warp size constant
216
217
# Built-in functions for JIT kernels
218
def syncthreads():
219
"""Synchronize threads in block."""
220
221
def syncwarp(mask=0xffffffff):
222
"""Synchronize threads in warp."""
223
224
def range(start, stop=None, step=None):
225
"""Range function for JIT kernels."""
226
227
# Atomic operations
228
def atomic_add(array, index, value):
229
"""Atomic addition."""
230
231
def atomic_sub(array, index, value):
232
"""Atomic subtraction."""
233
234
def atomic_max(array, index, value):
235
"""Atomic maximum."""
236
237
def atomic_min(array, index, value):
238
"""Atomic minimum."""
239
240
def atomic_cas(array, index, compare, value):
241
"""Atomic compare-and-swap."""
242
```
243
244
### Profiling (cupyx.profiler)
245
246
Performance profiling and benchmarking tools.
247
248
```python { .api }
249
def profile():
250
"""
251
Context manager for CUDA profiling.
252
253
Usage:
254
with cupyx.profiler.profile():
255
# Code to profile
256
result = cupy.matmul(a, b)
257
"""
258
259
def benchmark(func, args=(), kwargs=None, n_warmup=1, n_repeat=1, name=None, n_sync=1):
260
"""
261
Benchmark function performance.
262
263
Parameters:
264
- func: callable, function to benchmark
265
- args: tuple, function arguments
266
- kwargs: dict, function keyword arguments
267
- n_warmup: int, number of warmup runs
268
- n_repeat: int, number of timing runs
269
- name: str, benchmark name
270
- n_sync: int, number of synchronizations per run
271
272
Returns:
273
_PerfCaseResult: per-run timings, exposed as the `cpu_times` and `gpu_times` arrays (in seconds)
274
"""
275
276
def time_range(message=None, color_id=None, *, sync=False):
277
"""
278
Context manager that marks a named NVTX range so the enclosed code shows up as a labeled region in the CUDA profiler.
279
280
Parameters:
281
- message: str, range description
282
- color_id: int, color for profiler display
283
- sync: bool, synchronize before timing
284
"""
285
```
286
287
## Usage Examples
288
289
### Enhanced Array Operations
290
291
```python
292
import cupy as cp
293
import cupyx
294
295
# Scatter operations for sparse updates
296
indices = cp.array([0, 2, 4, 6, 8])
297
updates = cp.array([10, 20, 30, 40, 50])
298
target = cp.zeros(10)
299
300
# Add updates at specified indices
301
cupyx.scatter_add(target, indices, updates)  # updates `target` in place; returns None
301
print(target)  # [10. 0. 20. 0. 30. 0. 40. 0. 50. 0.]
303
304
# Reciprocal square root (common in ML)
305
x = cp.array([1.0, 4.0, 9.0, 16.0])
306
rsqrt_result = cupyx.rsqrt(x) # [1.0, 0.5, 0.333, 0.25]
307
```
308
309
### Error State Management
310
311
```python
312
import cupy as cp
313
import cupyx
314
315
# Handle division by zero gracefully
316
a = cp.array([1.0, 2.0, 3.0])
317
b = cp.array([1.0, 0.0, 3.0])
318
319
# Without error handling (would raise warning)
320
# result = a / b
321
322
# With error handling
323
with cupyx.errstate(divide='ignore', invalid='ignore'):
324
result = a / b # [1.0, inf, 1.0] - no warning
325
326
# Check current error state
327
current_settings = cupyx.geterr()
328
print(current_settings)
329
```
330
331
### Pinned Memory for Fast Transfers
332
333
```python
334
import cupy as cp
335
import cupyx
336
import numpy as np
337
338
# Create pinned memory arrays for faster CPU-GPU transfers
339
pinned_array = cupyx.zeros_pinned((1000, 1000), dtype=np.float32)
340
341
# Fill with data (on CPU)
342
pinned_array[:] = np.random.rand(1000, 1000).astype(np.float32)
343
344
# Fast transfer to GPU
345
gpu_array = cp.asarray(pinned_array)
346
347
# Process on GPU
348
result = cp.matmul(gpu_array, gpu_array.T)
349
350
# Fast transfer back to pinned memory
351
result_pinned = cupyx.zeros_like_pinned(pinned_array)
352
result_pinned[:] = cp.asnumpy(result)
353
```
354
355
### JIT Compilation
356
357
```python
358
import cupy as cp
359
import cupyx.jit as jit
360
361
# JIT-compiled custom kernel
362
@jit.rawkernel()
363
def elementwise_multiply(x, y, out, size):
364
"""Custom element-wise multiplication kernel."""
365
tid = jit.threadIdx.x + jit.blockIdx.x * jit.blockDim.x
366
if tid < size:
367
out[tid] = x[tid] * y[tid]
368
369
# Use JIT kernel
370
a = cp.random.rand(1000000)
371
b = cp.random.rand(1000000)
372
result = cp.zeros_like(a)
373
374
# Launch kernel
375
threads_per_block = 256
376
blocks_per_grid = (len(a) + threads_per_block - 1) // threads_per_block
377
378
elementwise_multiply[blocks_per_grid, threads_per_block](a, b, result, len(a))
379
380
# More advanced JIT kernel with shared memory
381
@jit.rawkernel()
382
def block_sum(data, output, n):
383
"""Sum elements within each block using shared memory."""
384
# Shared memory declaration
385
shared = jit.shared_memory(cp.float32, 256)
386
387
tid = jit.threadIdx.x
388
bid = jit.blockIdx.x
389
idx = bid * jit.blockDim.x + tid
390
391
# Load data into shared memory
392
if idx < n:
393
shared[tid] = data[idx]
394
else:
395
shared[tid] = 0.0
396
397
jit.syncthreads()
398
399
# Parallel reduction
400
s = jit.blockDim.x // 2
401
while s > 0:
402
if tid < s:
403
shared[tid] += shared[tid + s]
404
jit.syncthreads()
405
s //= 2
406
407
# Write result
408
if tid == 0:
409
output[bid] = shared[0]
410
```
411
412
### Performance Profiling
413
414
```python
415
import cupy as cp
416
import cupyx.profiler as profiler
417
418
# Benchmark different implementations
419
def matmul_standard(a, b):
420
return cp.matmul(a, b)
421
422
def matmul_dot(a, b):
423
return cp.dot(a, b)
424
425
# Setup test data
426
a = cp.random.rand(1000, 1000)
427
b = cp.random.rand(1000, 1000)
428
429
# Benchmark both implementations
430
stats1 = profiler.benchmark(matmul_standard, (a, b), n_repeat=10)
431
stats2 = profiler.benchmark(matmul_dot, (a, b), n_repeat=10)
432
433
print(f"matmul: {stats1.gpu_times.mean() * 1e3:.4f} ms")
434
print(f"dot: {stats2.gpu_times.mean() * 1e3:.4f} ms")
435
436
# Profile code sections
437
with profiler.profile():
438
with profiler.time_range("Matrix multiply"):
439
result1 = cp.matmul(a, b)
440
441
with profiler.time_range("SVD decomposition"):
442
u, s, vh = cp.linalg.svd(result1)
443
444
with profiler.time_range("Eigenvalue computation"):
445
eigenvals = cp.linalg.eigvals(result1 @ result1.T)
446
```
447
448
### Runtime Information
449
450
```python
451
import cupyx
452
453
# Get basic runtime info
454
info = cupyx.get_runtime_info()
455
print(info)
456
457
# Get detailed runtime info
458
detailed_info = cupyx.get_runtime_info(full=True)
459
print(detailed_info)
460
461
# Example output includes:
462
# - CUDA version
463
# - Device properties
464
# - Memory information
465
# - Library versions
466
# - Compilation settings
467
```