# CuPy Extensions

Additional functionality provided by the `cupyx` namespace, including SciPy-compatible routines, JIT compilation, specialized operations, and advanced GPU programming features. These extensions provide capabilities beyond core CuPy functionality.

## Capabilities

### Core Extensions

CuPy-specific mathematical functions and runtime utilities.

```python { .api }
def rsqrt(x, out=None):
    """Reciprocal square root function: 1 / sqrt(x).

    Optimized GPU implementation providing better performance
    than computing the reciprocal and square root separately.

    Parameters:
    - x: array-like, input array
    - out: ndarray, optional output array

    Returns:
    cupy.ndarray: reciprocal square root of input
    """

def get_runtime_info(full=False):
    """Get CuPy runtime information.

    Parameters:
    - full: bool, whether to include detailed information

    Returns:
    dict: runtime information including CUDA version, device details
    """
```
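A quick usage sketch for the core helpers above; it assumes `cupyx.rsqrt` and `cupyx.get_runtime_info` are available in your CuPy build:

```python
import cupy as cp
import cupyx

# Reciprocal square root in a single fused kernel
x = cp.array([1.0, 4.0, 9.0], dtype=cp.float32)
print(cupyx.rsqrt(x))        # [1.0, 0.5, 0.333...]
print(1.0 / cp.sqrt(x))      # same values, but two separate kernels

# Inspect the runtime (CUDA version, devices, memory pools, ...)
print(cupyx.get_runtime_info())
```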
### Scatter Operations

Efficient scatter operations for advanced indexing and updates.

```python { .api }
def scatter_add(a, indices, b, axis=None):
    """Add values to array at specified indices.

    Performs atomic addition operations, making it safe for
    concurrent updates to the same memory locations.

    Parameters:
    - a: ndarray, destination array to be updated
    - indices: array-like, indices where values should be added
    - b: array-like, values to add
    - axis: int, axis along which to scatter

    Returns:
    None: modifies array a in-place
    """

def scatter_max(a, indices, b, axis=None):
    """Take element-wise maximum between array values and scattered values.

    Parameters:
    - a: ndarray, destination array to be updated
    - indices: array-like, indices where comparison should occur
    - b: array-like, values to compare
    - axis: int, axis along which to scatter

    Returns:
    None: modifies array a in-place
    """

def scatter_min(a, indices, b, axis=None):
    """Take element-wise minimum between array values and scattered values.

    Parameters:
    - a: ndarray, destination array to be updated
    - indices: array-like, indices where comparison should occur
    - b: array-like, values to compare
    - axis: int, axis along which to scatter

    Returns:
    None: modifies array a in-place
    """
```
### Pinned Memory Arrays

Arrays in host memory that are accessible to the GPU for efficient transfers.

```python { .api }
def empty_pinned(shape, dtype=float, order='C'):
    """Return new pinned array with uninitialized entries.

    Pinned memory enables faster transfers between CPU and GPU
    and allows for asynchronous memory operations.

    Parameters:
    - shape: int or sequence of ints, shape of array
    - dtype: data-type, type of array elements
    - order: memory layout order ('C' or 'F')

    Returns:
    numpy.ndarray: array in pinned host memory
    """

def zeros_pinned(shape, dtype=float, order='C'):
    """Return new pinned array of zeros.

    Parameters:
    - shape: int or sequence of ints, shape of array
    - dtype: data-type, type of array elements
    - order: memory layout order

    Returns:
    numpy.ndarray: zeros array in pinned host memory
    """

def empty_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """Return new pinned array with same shape and type as given array."""

def zeros_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """Return pinned array of zeros with same shape and type as given array."""
```
### Error State Management

Control floating-point error handling behavior.

```python { .api }
def errstate(**kwargs):
    """Context manager for floating-point error handling.

    Temporarily override how floating-point errors are handled,
    including division by zero, overflow, underflow, and invalid operations.

    Parameters:
    - all: str, set behavior for all error types
    - divide: str, behavior for division by zero
    - over: str, behavior for overflow
    - under: str, behavior for underflow
    - invalid: str, behavior for invalid operations

    Error behaviors: 'ignore', 'warn', 'raise', 'call', 'print', 'log'

    Returns:
    context manager
    """

def geterr():
    """Get current floating-point error handling settings.

    Returns:
    dict: current error handling settings for each error type
    """

def seterr(all=None, divide=None, over=None, under=None, invalid=None):
    """Set floating-point error handling behavior.

    Parameters:
    - all: str, set behavior for all error types
    - divide: str, behavior for division by zero
    - over: str, behavior for overflow
    - under: str, behavior for underflow
    - invalid: str, behavior for invalid operations

    Returns:
    dict: previous error handling settings
    """
```
### Synchronization Control

Advanced control over GPU synchronization behavior.

```python { .api }
def allow_synchronize(allow=True):
    """Context manager to control synchronization behavior.

    Allows fine-grained control over when GPU operations synchronize
    with the CPU, useful for performance optimization and for detecting
    unintended synchronization points.

    Parameters:
    - allow: bool, whether to allow synchronization

    Returns:
    context manager
    """

class DeviceSynchronized:
    """Exception raised when device synchronization happens while disallowed.

    Raised inside an allow_synchronize(False) block whenever an operation
    forces the device to synchronize, helping locate synchronization points
    in multi-GPU applications and performance-critical sections.
    """
```
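A small sketch of catching an accidental synchronization with the guard above; it assumes `DeviceSynchronized` is the exception raised when synchronization occurs inside an `allow_synchronize(False)` block:

```python
import cupy as cp
import cupyx

x = cp.arange(10)

try:
    with cupyx.allow_synchronize(False):
        y = x * 2                # stays on the GPU, no synchronization needed
        total = float(y.sum())   # device-to-host conversion forces a synchronize
except cupyx.DeviceSynchronized:
    print("caught an unintended synchronization point")
```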
### JIT Compilation

Just-in-time compilation capabilities for dynamic kernel generation.

```python { .api }
def rawkernel(mode='python', device=False):
    """Decorator for creating raw kernels from Python functions.

    Enables writing CUDA kernels using Python syntax with JIT
    compilation to optimized GPU code.

    Parameters:
    - mode: str, compilation mode ('python', 'cuda')
    - device: bool, whether function runs on device

    Returns:
    callable: decorated kernel function
    """

# CUDA Built-in Types and Functions
threadIdx = None  # Thread index within block (x, y, z components)
blockDim = None   # Block dimensions (x, y, z components)
blockIdx = None   # Block index within grid (x, y, z components)
gridDim = None    # Grid dimensions (x, y, z components)

def syncthreads():
    """Synchronize all threads within a CUDA block.

    Ensures all threads in the current block reach this point
    before any thread continues execution.
    """

def shared_memory(dtype, shape):
    """Allocate shared memory array within CUDA block.

    Parameters:
    - dtype: data type of shared memory elements
    - shape: int or tuple, shape of shared memory array

    Returns:
    shared memory array accessible to all threads in block
    """

def atomic_add(array, index, value):
    """Perform atomic addition operation.

    Safely adds value to array[index] in a thread-safe manner,
    preventing race conditions in parallel execution.

    Parameters:
    - array: target array
    - index: index to update
    - value: value to add

    Returns:
    previous value at array[index]
    """
```
### SciPy Compatibility

Comprehensive SciPy-compatible functionality with GPU acceleration.

```python { .api }
def get_array_module(*args):
    """Get appropriate array module (NumPy or CuPy) for arguments.

    Automatically selects the correct module based on input types,
    enabling generic code that works with both CPU and GPU arrays.

    Parameters:
    - args: array arguments to analyze

    Returns:
    module: cupy or numpy module appropriate for inputs
    """
```
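A short sketch of module dispatch, shown with `cupy.get_array_module`, which returns `numpy` or `cupy` depending on the inputs (the `cupyx.scipy.get_array_module` variant does the same for the SciPy namespaces):

```python
import numpy as np
import cupy as cp

def softmax(x):
    # Dispatch to numpy or cupy depending on where x lives
    xp = cp.get_array_module(x)
    e = xp.exp(x - x.max())
    return e / e.sum()

print(softmax(np.array([1.0, 2.0, 3.0])))  # computed on the CPU with NumPy
print(softmax(cp.array([1.0, 2.0, 3.0])))  # computed on the GPU with CuPy
```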
### Sparse Matrix Support

GPU-accelerated sparse matrix operations and formats.

```python { .api }
class spmatrix:
    """Base class for sparse matrices.

    Provides common interface for all sparse matrix formats
    with GPU-accelerated operations and memory efficiency.
    """

class csr_matrix:
    """Compressed Sparse Row matrix format.

    Efficient sparse matrix format for arithmetic operations
    and fast row slicing, optimized for GPU computation.
    """

    def __init__(self, arg1, shape=None, dtype=None, copy=False):
        """Initialize CSR sparse matrix.

        Parameters:
        - arg1: array-like, sparse matrix data
        - shape: tuple, matrix dimensions
        - dtype: data type
        - copy: bool, whether to copy data
        """

class csc_matrix:
    """Compressed Sparse Column matrix format.

    Efficient for column slicing and matrix-vector products
    where the matrix is accessed column-wise.
    """

class coo_matrix:
    """COOrdinate sparse matrix format.

    Efficient for sparse matrix construction and format conversion,
    stores (row, col, data) triplets.
    """

class dia_matrix:
    """DIAgonal sparse matrix format.

    Efficient storage for matrices with few diagonals,
    optimized for diagonal matrix operations.
    """

def issparse(x):
    """Check if input is a sparse matrix.

    Parameters:
    - x: input object to check

    Returns:
    bool: True if x is a sparse matrix
    """

def eye(m, n=None, k=0, dtype=float, format=None):
    """Create sparse identity matrix.

    Parameters:
    - m: int, number of rows
    - n: int, number of columns (defaults to m)
    - k: int, diagonal offset
    - dtype: data type
    - format: str, sparse matrix format

    Returns:
    sparse matrix: identity matrix in specified format
    """

def diags(diagonals, offsets=0, shape=None, format=None, dtype=None):
    """Construct sparse matrix from diagonals.

    Parameters:
    - diagonals: array-like, diagonal values
    - offsets: int or array-like, diagonal offsets
    - shape: tuple, matrix shape
    - format: str, output format
    - dtype: data type

    Returns:
    sparse matrix: matrix with specified diagonals
    """
```
### Advanced Extensions

Specialized functionality for scientific computing and machine learning.

```python { .api }
# Linear Algebra Extensions (cupyx.scipy.linalg)
class LinalgExtensions:
    """Extended linear algebra operations beyond NumPy/SciPy standard."""

# Signal Processing Extensions (cupyx.scipy.signal)
class SignalExtensions:
    """Signal processing functions with GPU acceleration."""

# Image Processing Extensions (cupyx.scipy.ndimage)
class NDImageExtensions:
    """N-dimensional image processing operations."""

# Special Functions Extensions (cupyx.scipy.special)
class SpecialExtensions:
    """Mathematical special functions with GPU support."""

# Statistics Extensions (cupyx.scipy.stats)
class StatsExtensions:
    """Statistical distributions and tests with GPU acceleration."""
```
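These submodules are drop-in, GPU-backed counterparts of their SciPy namesakes. A brief sketch, assuming `cupyx.scipy.ndimage.gaussian_filter` and `cupyx.scipy.special.erf` are available in your build (both mirror the SciPy signatures):

```python
import cupy as cp
import cupyx.scipy.ndimage as ndi
import cupyx.scipy.special as special

# Smooth a noisy 2-D image on the GPU (mirrors scipy.ndimage.gaussian_filter)
image = cp.random.random((512, 512), dtype=cp.float32)
smoothed = ndi.gaussian_filter(image, sigma=2.0)
print(smoothed.shape)

# Evaluate a special function element-wise (mirrors scipy.special.erf)
x = cp.linspace(-3, 3, 7)
print(special.erf(x))
```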
## Usage Examples

### Scatter Operations

```python
import cupy as cp
import cupyx

# Create destination array
result = cp.zeros(10, dtype=cp.float32)
indices = cp.array([1, 3, 1, 7, 3])
values = cp.array([1.0, 2.0, 3.0, 4.0, 5.0])

# Scatter add - accumulates values at repeated indices
cupyx.scatter_add(result, indices, values)
print(f"After scatter_add: {result}")  # [0, 4, 0, 7, 0, 0, 0, 4, 0, 0]

# Scatter max - keeps maximum value at each index
result.fill(0)
cupyx.scatter_max(result, indices, values)
print(f"After scatter_max: {result}")  # [0, 3, 0, 5, 0, 0, 0, 4, 0, 0]

# Use in neural network-like operations
def sparse_embedding_lookup(embeddings, indices):
    """Embedding lookup built from scatter operations (for illustration)."""
    output = cp.zeros((len(indices), embeddings.shape[1]), dtype=embeddings.dtype)
    for i, idx in enumerate(indices):
        cupyx.scatter_add(output[i:i+1], 0, embeddings[idx:idx+1], axis=0)
    return output

# Example usage
embedding_dim = 128
vocab_size = 10000
embeddings = cp.random.normal(0, 1, (vocab_size, embedding_dim))
word_indices = cp.array([5, 42, 1337, 9999])

embedded = sparse_embedding_lookup(embeddings, word_indices)
print(f"Embedded shape: {embedded.shape}")
```
### JIT Compilation

```python
import cupy as cp
import cupyx.jit as jit

@jit.rawkernel()
def matrix_multiply_jit(A, B, C, M, N, K):
    """JIT-compiled matrix multiplication kernel."""
    i = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    j = jit.blockIdx.y * jit.blockDim.y + jit.threadIdx.y

    if i < M and j < N:
        temp = 0.0
        for k in range(K):
            temp += A[i, k] * B[k, j]
        C[i, j] = temp

# Example usage
M, N, K = 512, 512, 256
A = cp.random.random((M, K), dtype=cp.float32)
B = cp.random.random((K, N), dtype=cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

# Configure launch parameters
threads_per_block = (16, 16)
blocks_per_grid = ((M + 15) // 16, (N + 15) // 16)

# Launch JIT kernel
matrix_multiply_jit[blocks_per_grid, threads_per_block](A, B, C, M, N, K)

# Verify result
expected = cp.dot(A, B)
print(f"JIT result matches cp.dot: {cp.allclose(C, expected)}")

# Advanced JIT example with shared memory
@jit.rawkernel()
def reduction_with_shared_memory(input_array, output_array, n):
    """Parallel reduction using shared memory."""
    # Allocate shared memory
    shared_data = jit.shared_memory(cp.float32, 256)

    tid = jit.threadIdx.x
    bid = jit.blockIdx.x
    i = bid * jit.blockDim.x + tid

    # Load data into shared memory
    if i < n:
        shared_data[tid] = input_array[i]
    else:
        shared_data[tid] = 0.0

    jit.syncthreads()

    # Perform reduction in shared memory
    stride = jit.blockDim.x // 2
    while stride > 0:
        if tid < stride:
            shared_data[tid] += shared_data[tid + stride]
        jit.syncthreads()
        stride //= 2

    # Write result for this block
    if tid == 0:
        jit.atomic_add(output_array, 0, shared_data[0])

# Test reduction kernel
data = cp.random.random(10000, dtype=cp.float32)
result = cp.zeros(1, dtype=cp.float32)

block_size = 256
grid_size = (len(data) + block_size - 1) // block_size

reduction_with_shared_memory[grid_size, block_size](data, result, len(data))
print(f"Reduction result: {result[0]}")
print(f"Expected (cp.sum): {cp.sum(data)}")
```
### Sparse Matrix Operations

```python
import cupy as cp
import cupyx.scipy.sparse as sp

# Create sparse matrices
# COO format - good for construction
row = cp.array([0, 1, 2, 0, 1, 2])
col = cp.array([0, 1, 2, 1, 2, 0])
# cupyx sparse matrices expect floating-point (or complex) data
data = cp.array([1, 2, 3, 4, 5, 6], dtype=cp.float32)

coo_matrix = sp.coo_matrix((data, (row, col)), shape=(3, 3))
print(f"COO matrix:\n{coo_matrix.toarray()}")

# Convert to CSR for efficient arithmetic
csr_matrix = coo_matrix.tocsr()
print(f"CSR format - data: {csr_matrix.data}")
print(f"CSR format - indices: {csr_matrix.indices}")
print(f"CSR format - indptr: {csr_matrix.indptr}")

# Sparse matrix operations
vector = cp.array([1.0, 2.0, 3.0], dtype=cp.float32)
result = csr_matrix.dot(vector)  # Matrix-vector multiplication
print(f"Matrix-vector product: {result}")

# Create large sparse matrix efficiently
n = 10000
# Tridiagonal matrix
main_diag = cp.ones(n) * 2
off_diag = cp.ones(n - 1) * -1

tridiag = sp.diags([off_diag, main_diag, off_diag],
                   offsets=[-1, 0, 1],
                   shape=(n, n),
                   format='csr')

print(f"Tridiagonal matrix shape: {tridiag.shape}")
print(f"Non-zero elements: {tridiag.nnz}")

# Solve linear system (requires cupyx.scipy.sparse.linalg)
b = cp.ones(n)
# x = sp.linalg.spsolve(tridiag, b)  # Would solve Ax = b
```
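A minimal sketch of actually solving the tridiagonal system above, assuming `cupyx.scipy.sparse.linalg` with `spsolve` (and the iterative `cg` solver) is available in your CuPy build:

```python
import cupy as cp
import cupyx.scipy.sparse as sp
import cupyx.scipy.sparse.linalg as splinalg

n = 10000
tridiag = sp.diags([cp.full(n - 1, -1.0), cp.full(n, 2.0), cp.full(n - 1, -1.0)],
                   offsets=[-1, 0, 1], format='csr')
b = cp.ones(n)

# Direct solve of the sparse system Ax = b
x = splinalg.spsolve(tridiag, b)

# Iterative alternative (conjugate gradient) for symmetric positive-definite A
x_cg, info = splinalg.cg(tridiag, b)

# Residual check for the direct solve
print(float(cp.abs(tridiag.dot(x) - b).max()), info)
```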
### Pinned Memory for Optimal Transfers

```python
import cupy as cp
import cupyx
import numpy as np
import time

# Compare regular vs pinned memory transfer performance
size = 10000000  # 10M elements

# Regular CPU array
regular_cpu = np.random.random(size).astype(np.float32)

# Pinned CPU array
pinned_cpu = cupyx.zeros_pinned(size, dtype=cp.float32)
pinned_cpu[:] = np.random.random(size).astype(np.float32)

# Benchmark transfers
def benchmark_transfer(cpu_array, n_iterations=100):
    """Benchmark CPU-GPU transfer performance."""
    times = []

    for _ in range(n_iterations):
        start = time.time()
        gpu_array = cp.asarray(cpu_array)
        cp.cuda.Stream.null.synchronize()
        times.append(time.time() - start)

    return np.mean(times), np.std(times)

# Regular memory transfer
regular_mean, regular_std = benchmark_transfer(regular_cpu)

# Pinned memory transfer
pinned_mean, pinned_std = benchmark_transfer(pinned_cpu)

print(f"Regular memory: {regular_mean:.4f} ± {regular_std:.4f} seconds")
print(f"Pinned memory: {pinned_mean:.4f} ± {pinned_std:.4f} seconds")
print(f"Speedup: {regular_mean / pinned_mean:.2f}x")

# Asynchronous transfer example
async_stream = cp.cuda.Stream()

with async_stream:
    # Start asynchronous transfer
    gpu_async = cp.asarray(pinned_cpu)

    # Do other work while transfer happens
    other_work = cp.random.random((1000, 1000))
    result = cp.sum(other_work)

# Synchronize to ensure transfer is complete
async_stream.synchronize()

print(f"Asynchronous transfer completed, result: {result}")
```
### Error State Management

```python
import cupy as cp
import cupyx

# Example with potential floating point errors
a = cp.array([1.0, 0.0, -1.0])
b = cp.array([0.0, 0.0, 0.0])

# Default behavior (may warn or ignore)
result1 = a / b
print(f"Default behavior: {result1}")  # [inf, nan, -inf]

# Temporarily change error handling
with cupyx.errstate(divide='raise', invalid='raise'):
    try:
        result2 = a / b
    except FloatingPointError as e:
        print(f"Caught error: {e}")

# Check current error state
current_state = cupyx.geterr()
print(f"Current error handling: {current_state}")

# Set specific error handling
old_state = cupyx.seterr(divide='warn', over='ignore', invalid='print')
print(f"Previous state: {old_state}")

# Example with overflow
large_numbers = cp.array([1e308, 1e308])
overflow_result = large_numbers * 10  # Will overflow to inf

# Restore previous state
cupyx.seterr(**old_state)

# Advanced error handling for numerical algorithms
def safe_divide(a, b, default=0.0):
    """Safe division with custom error handling."""
    with cupyx.errstate(divide='ignore', invalid='ignore'):
        result = a / b
        # Replace inf and nan with default value
        result = cp.where(cp.isfinite(result), result, default)
    return result

# Test safe division
numerator = cp.array([1.0, 2.0, 3.0])
denominator = cp.array([2.0, 0.0, 4.0])
safe_result = safe_divide(numerator, denominator, default=0.0)
print(f"Safe division result: {safe_result}")  # [0.5, 0.0, 0.75]
```
### High-Performance Computing Patterns

```python
import cupy as cp
import cupyx
import numpy as np
import time

def gpu_accelerated_monte_carlo(n_samples=10000000):
    """Monte Carlo π estimation with CuPy extensions."""

    def estimate_pi_optimized(samples):
        x = cp.random.uniform(-1, 1, samples)
        y = cp.random.uniform(-1, 1, samples)

        # Compare squared distances directly; this avoids computing sqrt at all
        distances_squared = x*x + y*y
        inside_circle = distances_squared <= 1.0

        return 4.0 * cp.mean(inside_circle)

    # Guard against unintended synchronization while the work is queued
    with cupyx.allow_synchronize(False):
        start_time = time.time()
        pi_estimate = estimate_pi_optimized(n_samples)

    # Explicit synchronization once outside the guarded section
    cp.cuda.Stream.null.synchronize()
    end_time = time.time()

    return float(pi_estimate), end_time - start_time

# Run Monte Carlo estimation
pi_est, computation_time = gpu_accelerated_monte_carlo(50000000)
error = abs(pi_est - cp.pi)

print(f"π estimate: {pi_est:.6f}")
print(f"Error: {error:.6f}")
print(f"Computation time: {computation_time:.4f} seconds")
print(f"Rate: {50000000 / computation_time / 1e6:.2f} M samples/second")

# Memory-efficient streaming computation
def streaming_mean_computation(total_size, chunk_size=1000000):
    """Compute mean of very large dataset using streaming."""
    running_sum = 0.0
    running_count = 0

    # Use pinned memory for efficient transfers
    chunk_pinned = cupyx.empty_pinned(chunk_size, dtype=cp.float32)

    for start_idx in range(0, total_size, chunk_size):
        end_idx = min(start_idx + chunk_size, total_size)
        current_chunk_size = end_idx - start_idx

        # Generate data in pinned memory
        chunk_pinned[:current_chunk_size] = np.random.normal(
            5.0, 2.0, current_chunk_size
        ).astype(np.float32)

        # Transfer to GPU and process
        gpu_chunk = cp.asarray(chunk_pinned[:current_chunk_size])
        chunk_sum = cp.sum(gpu_chunk)

        running_sum += float(chunk_sum)
        running_count += current_chunk_size

        if start_idx % (chunk_size * 10) == 0:
            current_mean = running_sum / running_count
            print(f"Processed {running_count:,} samples, current mean: {current_mean:.4f}")

    return running_sum / running_count

# Process 100M samples in chunks
final_mean = streaming_mean_computation(100000000, chunk_size=2000000)
print(f"Final streaming mean: {final_mean:.6f}")
print(f"Expected mean: 5.0")
```