CuPy: NumPy & SciPy for GPU - CUDA 11.x optimized distribution providing GPU-accelerated computing with Python
—
CuPy provides comprehensive performance profiling and benchmarking tools through the cupyx.profiler module, enabling developers to measure execution times, analyze GPU utilization, identify performance bottlenecks, and optimize CUDA applications for maximum throughput and efficiency.
Core timing utilities for measuring execution performance of CuPy operations and custom kernels.
def benchmark(func, args=(), kwargs=None, n_warmup=1, n_repeat=3, n_run=1):
    """
    Benchmark a function with comprehensive timing statistics.

    Executes the function multiple times and provides detailed
    timing statistics including mean, standard deviation, min/max
    execution times, and GPU/CPU timing analysis.

    Parameters:
        func: callable - Function to benchmark
        args: tuple, optional - Positional arguments for function
        kwargs: dict, optional - Keyword arguments for function
            (``None`` means no keyword arguments)
        n_warmup: int, optional - Number of untimed warmup runs (default 1)
        n_repeat: int, optional - Number of timing repetitions (default 3)
        n_run: int, optional - Number of function calls per repetition (default 1)

    Returns:
        dict: Benchmark results with timing statistics.  Keys used elsewhere
            in this document include ``mean``, ``std``, ``min``, ``max`` and
            ``throughput``.
    """
def time_range():
    """
    Context manager for measuring execution time ranges.

    Returns a context manager that measures the time between
    entry and exit, accounting for GPU synchronization.

    Returns:
        TimeRangeContext: Context manager for timing.
    """

class TimeRangeContext:
    """
    Context manager for timing code execution ranges.

    Provides precise timing measurements for GPU operations
    with proper synchronization handling.
    """
    def __enter__(self): ...
    def __exit__(self, *args): ...

    @property
    def elapsed_time(self):
        """Elapsed time of the measured range, in seconds."""
def profile():
    """
    Context manager for comprehensive profiling.

    Enables detailed profiling including NVTX markers,
    memory usage tracking, and kernel execution analysis.

    Returns:
        ProfileContext: Context manager for profiling.
    """
class ProfileContext:
    """
    Context manager for comprehensive performance profiling.

    Collects detailed performance metrics including timing,
    memory usage, kernel launches, and GPU utilization.
    """
    def __enter__(self): ...
    def __exit__(self, *args): ...

    def print_report(self):
        """Print detailed profiling report."""

    def save_report(self, filename):
        """Save profiling report to file."""

# Tools for analyzing GPU memory usage patterns and identifying memory bottlenecks.
def get_memory_info():
    """
    Get current GPU memory usage information.

    Returns:
        dict: Memory usage statistics including total, used, and free memory.
    """
def memory_profile():
    """
    Context manager for memory usage profiling.

    Tracks memory allocations and deallocations during execution
    to identify memory usage patterns and potential leaks.

    Returns:
        MemoryProfileContext: Context manager for memory profiling.
    """
class MemoryProfileContext:
    """
    Context manager for tracking memory usage patterns.

    Monitors GPU memory allocations, deallocations, and peak usage
    during code execution.
    """
    def __enter__(self): ...
    def __exit__(self, *args): ...

    @property
    def peak_memory(self):
        """Peak memory usage during profiling (in bytes, per the usage examples)."""

    @property
    def memory_allocations(self):
        """List of memory allocation events recorded during profiling."""

    def print_memory_report(self):
        """Print detailed memory usage report."""
def trace_memory(enabled=True):
    """
    Enable or disable memory allocation tracing.

    Parameters:
        enabled: bool - Whether to enable memory tracing (default True).
    """
def get_memory_trace():
    """
    Get memory allocation trace information.

    Returns:
        list: Memory allocation trace events collected while tracing was enabled.
    """

# NVIDIA Tools Extension (NVTX) integration for advanced profiling with external tools.
def nvtx_push(message, color=None):
    """
    Push an NVTX range marker.

    Creates a named range marker for profiling tools like Nsight
    to identify code sections and their performance characteristics.

    Parameters:
        message: str - Range description
        color: int, optional - Color code for the range (e.g. ``0xFF0000``)
    """

def nvtx_pop():
    """Pop the most recently pushed NVTX range marker."""

def nvtx_mark(message, color=None):
    """
    Create an NVTX point marker (an instantaneous event rather than a range).

    Parameters:
        message: str - Marker description
        color: int, optional - Color code for the marker
    """

def nvtx_range_push(message, color=None):
    """
    Push a named NVTX range (alias for nvtx_push).

    Parameters:
        message: str - Range name
        color: int, optional - Color code
    """

def nvtx_range_pop():
    """Pop the current NVTX range (alias for nvtx_pop)."""
class NVTXRange:
    """
    Context manager for NVTX range markers.

    Automatically pushes an NVTX range marker on entry and pops it
    on exit, for convenient profiling of code blocks.
    """
    def __init__(self, message, color=None):
        """
        Parameters:
            message: str - Range description
            color: int, optional - Color code
        """
    def __enter__(self): ...
    def __exit__(self, *args): ...
def nvtx(message=None, color=None):
    """
    Decorator or context manager for NVTX range marking.

    Can be used as a decorator for functions or as a context manager
    for code blocks to automatically add NVTX markers.

    Parameters:
        message: str, optional - Range description
        color: int, optional - Color code
    """

# Tools for analyzing individual kernel performance and optimization opportunities.
def kernel_profile():
    """
    Context manager for kernel-specific profiling.

    Tracks individual kernel launches, execution times,
    and performance characteristics.

    Returns:
        KernelProfileContext: Context manager for kernel profiling.
    """
class KernelProfileContext:
    """
    Context manager for detailed kernel performance analysis.

    Collects metrics for individual kernel launches including
    execution time, occupancy, memory throughput, and compute utilization.
    """
    def __enter__(self): ...
    def __exit__(self, *args): ...

    @property
    def kernel_stats(self):
        """Statistics for kernels executed inside the context."""

    def print_kernel_report(self):
        """Print detailed kernel analysis report."""
def get_kernel_info(kernel):
    """
    Get information about a compiled kernel.

    Parameters:
        kernel: RawKernel or similar - Kernel object to inspect.

    Returns:
        dict: Kernel information including occupancy and resource usage.
    """
def analyze_occupancy(kernel, block_size, shared_mem=0):
    """
    Analyze theoretical occupancy for a kernel configuration.

    Parameters:
        kernel: kernel object - Kernel to analyze
        block_size: int - Block size (threads per block)
        shared_mem: int, optional - Shared memory usage per block, in bytes
            (default 0)

    Returns:
        dict: Occupancy analysis results.  Keys used elsewhere in this
            document include ``theoretical_occupancy``, ``blocks_per_sm``
            and ``threads_per_sm``.
    """

# Tools for comparing performance between different implementations and configurations.
def compare_implementations(*funcs, args=(), kwargs=None, names=None):
    """
    Compare performance of multiple function implementations.

    Benchmarks multiple functions with identical inputs and provides
    comparative analysis of their performance characteristics.

    Parameters:
        *funcs: callable - Functions to compare
        args: tuple, optional - Arguments passed to every function
        kwargs: dict, optional - Keyword arguments passed to every function
        names: list, optional - Display names for each function

    Returns:
        dict: Comparative benchmark results, keyed by implementation name.
    """
def parameter_sweep(func, param_ranges, fixed_args=(), fixed_kwargs=None):
    """
    Perform parameter sweep benchmarking.

    Tests function performance across different parameter values
    to identify optimal configurations.

    Parameters:
        func: callable - Function to benchmark
        param_ranges: dict - Parameter names mapped to the value ranges to test
        fixed_args: tuple, optional - Fixed positional arguments
        fixed_kwargs: dict, optional - Fixed keyword arguments

    Returns:
        dict: Parameter sweep results, one entry per parameter combination.
    """
def scaling_analysis(func, data_sizes, *args, **kwargs):
    """
    Analyze performance scaling with different data sizes.

    Parameters:
        func: callable - Function to analyze
        data_sizes: list - Different input sizes to test
        *args: Additional function arguments
        **kwargs: Additional function keyword arguments

    Returns:
        dict: Scaling analysis results, keyed by input size.
    """

import cupy as cp
from cupyx import profiler

# Simple function benchmarking
def matrix_multiply(a, b):
    return cp.dot(a, b)

# Create test matrices
size = 2048
a = cp.random.rand(size, size, dtype=cp.float32)
b = cp.random.rand(size, size, dtype=cp.float32)

# Benchmark the function: 3 untimed warmup calls, then 10 timed repetitions
# of a single call each.
results = profiler.benchmark(
    matrix_multiply,
    args=(a, b),
    n_warmup=3,
    n_repeat=10,
    n_run=1
)
print(f"Mean execution time: {results['mean']:.4f} seconds")
print(f"Standard deviation: {results['std']:.4f} seconds")
print(f"Min time: {results['min']:.4f} seconds")
print(f"Max time: {results['max']:.4f} seconds")
print(f"Throughput: {results['throughput']:.2f} GFLOPS")

# Using time_range for custom timing
with profiler.time_range() as timer:
    # Complex computation sequence
    x = cp.random.rand(10000, 10000)
    y = cp.fft.fft2(x)
    z = cp.abs(y) ** 2
    result = cp.sum(z)
    # Ensure all operations complete
    cp.cuda.synchronize()
print(f"Total execution time: {timer.elapsed_time:.4f} seconds")

# Multiple timing ranges: each section is timed separately and the
# elapsed time is read from the timer after its context exits.
operations = {}
with profiler.time_range() as timer:
    data = cp.random.rand(5000, 5000)
operations['data_generation'] = timer.elapsed_time
with profiler.time_range() as timer:
    processed = cp.sin(data) * cp.cos(data)
operations['trigonometric'] = timer.elapsed_time
with profiler.time_range() as timer:
    result = cp.linalg.svd(processed[:1000, :1000])
operations['svd'] = timer.elapsed_time
for op, time in operations.items():
    print(f"{op}: {time:.4f} seconds")

# Memory usage analysis
with profiler.memory_profile() as mem_prof:
    # Allocate large arrays
    arrays = []
    for i in range(10):
        arr = cp.random.rand(1000, 1000)
        arrays.append(arr)
    # Perform operations that may fragment memory
    results = []
    for arr in arrays:
        processed = cp.fft.fft2(arr)
        filtered = cp.abs(processed) > 0.5
        results.append(cp.sum(filtered))
    # Cleanup some arrays
    del arrays[:5]
# peak_memory is reported in bytes; convert to GB for display.
print(f"Peak memory usage: {mem_prof.peak_memory / 1024**3:.2f} GB")
mem_prof.print_memory_report()

# Memory trace analysis
profiler.trace_memory(True)
# Operations to trace
large_array = cp.zeros((10000, 10000))
temp_arrays = [cp.random.rand(1000, 1000) for _ in range(50)]
del temp_arrays  # Free memory
# Get memory trace
trace = profiler.get_memory_trace()
print(f"Number of memory operations: {len(trace)}")
profiler.trace_memory(False)

# Using NVTX markers for external profiling tools
@profiler.nvtx("matrix_operations", color=0xFF0000)
def complex_matrix_operations(data):
    """Run a three-phase matrix pipeline with an NVTX range around each phase."""
    with profiler.NVTXRange("preprocessing", color=0x00FF00):
        # Data preprocessing: standardize then rescale
        normalized = (data - cp.mean(data)) / cp.std(data)
        scaled = normalized * 2.0
    with profiler.NVTXRange("computation", color=0x0000FF):
        # Main computation
        result = cp.linalg.matrix_power(scaled, 3)
        eigenvals = cp.linalg.eigvals(result)
    with profiler.NVTXRange("postprocessing", color=0xFFFF00):
        # Postprocessing
        sorted_vals = cp.sort(eigenvals)
        final_result = cp.real(sorted_vals)
    return final_result

# Use the profiled function.
# FIX: cupy.random.rand only supports float32/float64 dtypes, so the original
# `cp.random.rand(500, 500, dtype=cp.complex64)` raises an error.  Build the
# complex matrix from separate real and imaginary parts instead.
test_matrix = (
    cp.random.rand(500, 500) + 1j * cp.random.rand(500, 500)
).astype(cp.complex64)
result = complex_matrix_operations(test_matrix)

# Manual NVTX markers (push/pop must be balanced around each phase)
profiler.nvtx_mark("Starting algorithm", color=0xFF00FF)
profiler.nvtx_push("Algorithm Phase 1", color=0x00FFFF)
# Phase 1 operations
phase1_data = cp.random.rand(1000, 1000)
phase1_result = cp.sum(phase1_data, axis=0)
profiler.nvtx_pop()
profiler.nvtx_push("Algorithm Phase 2", color=0xFF8000)
# Phase 2 operations
phase2_result = cp.cumsum(phase1_result)
profiler.nvtx_pop()
profiler.nvtx_mark("Algorithm completed", color=0x8000FF)

# Full profiling session
with profiler.profile() as prof:
    # Data preparation
    print("Preparing data...")
    data_size = 8192
    matrix_a = cp.random.rand(data_size, data_size, dtype=cp.float32)
    matrix_b = cp.random.rand(data_size, data_size, dtype=cp.float32)
    # Matrix multiplication
    print("Performing matrix multiplication...")
    result_mm = cp.dot(matrix_a, matrix_b)
    # FFT operations
    print("Performing FFT...")
    # FIX: cupy.random.rand only supports float32/float64 dtypes, so the
    # original `dtype=cp.complex64` call raises.  Compose the complex input
    # from real and imaginary parts instead.
    fft_data = (
        cp.random.rand(data_size, data_size)
        + 1j * cp.random.rand(data_size, data_size)
    ).astype(cp.complex64)
    fft_result = cp.fft.fft2(fft_data)
    # Reduction operations
    print("Performing reductions...")
    sum_result = cp.sum(result_mm)
    mean_result = cp.mean(fft_result)
    # Linear algebra
    print("Performing linear algebra...")
    smaller_matrix = matrix_a[:1000, :1000]
    eigenvals = cp.linalg.eigvals(smaller_matrix)
    # Custom kernel
    print("Running custom kernel...")
    # FIX: cupy.ElementwiseKernel is a kernel constructor, not a decorator.
    # The original `@cp.ElementwiseKernel(...)` form would call the compiled
    # kernel with a Python function as its argument.  Construct the kernel
    # directly and call it with arrays.
    magnitude_kernel = cp.ElementwiseKernel(
        'T x, T y', 'T z', 'z = sqrt(x*x + y*y)', 'magnitude_kernel'
    )
    mag_result = magnitude_kernel(matrix_a, matrix_b)

# Print comprehensive report
prof.print_report()
# Save report to file
prof.save_report("profiling_report.txt")

# Analyze custom kernel performance
# Shared-memory tree reduction kernel: each block sums a chunk of `input`
# into one element of `output`.  (The CUDA source string is kept verbatim.)
kernel_code = r'''
extern "C" __global__
void optimized_reduction(float* input, float* output, int n) {
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? input[i] : 0.0f;
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) output[blockIdx.x] = sdata[0];
}
'''
reduction_kernel = cp.RawKernel(kernel_code, 'optimized_reduction')

# Analyze kernel performance
with profiler.kernel_profile() as kernel_prof:
    input_data = cp.random.rand(1000000, dtype=cp.float32)
    block_size = 256
    # Round the grid up so every element is covered.
    grid_size = (input_data.size + block_size - 1) // block_size
    output = cp.zeros(grid_size, dtype=cp.float32)
    # Launch kernel multiple times
    for _ in range(100):
        # NOTE(review): the kernel's `n` is a C int; passing the plain Python
        # int `input_data.size` may need an explicit cp.int32(...) cast
        # depending on the CuPy version — confirm.
        reduction_kernel(
            (grid_size,),
            (block_size,),
            (input_data, output, input_data.size),
            shared_mem=block_size * 4  # one float (4 bytes) per thread
        )
kernel_prof.print_kernel_report()

# Occupancy analysis
occupancy_info = profiler.analyze_occupancy(
    reduction_kernel,
    block_size=256,
    shared_mem=256 * 4
)
print("Occupancy Analysis:")
print(f"Theoretical occupancy: {occupancy_info['theoretical_occupancy']:.2%}")
print(f"Blocks per SM: {occupancy_info['blocks_per_sm']}")
print(f"Threads per SM: {occupancy_info['threads_per_sm']}")

# Compare different matrix multiplication implementations
# Three equivalent matrix-multiply implementations to compare.
def cupy_dot(a, b):
    return cp.dot(a, b)

def cupy_matmul(a, b):
    return cp.matmul(a, b)

def cupy_einsum(a, b):
    return cp.einsum('ij,jk->ik', a, b)

# Prepare test matrices
size = 2048
a = cp.random.rand(size, size, dtype=cp.float32)
b = cp.random.rand(size, size, dtype=cp.float32)

# Compare implementations
comparison = profiler.compare_implementations(
    cupy_dot, cupy_matmul, cupy_einsum,
    args=(a, b),
    names=['cp.dot', 'cp.matmul', 'cp.einsum']
)
print("Performance Comparison:")
for name, stats in comparison.items():
    print(f"{name:12}: {stats['mean']:.4f}s ± {stats['std']:.4f}s")

# Parameter sweep for optimal block size
def custom_kernel_test(data, block_size):
    # Custom kernel with configurable block size
    threads_per_block = block_size
    blocks_per_grid = (data.size + threads_per_block - 1) // threads_per_block
    result = cp.zeros(blocks_per_grid)
    # Kernel launch would go here
    return result

data = cp.random.rand(1000000)
param_ranges = {'block_size': [64, 128, 256, 512, 1024]}
sweep_results = profiler.parameter_sweep(
    custom_kernel_test,
    param_ranges,
    fixed_args=(data,)
)
print("Parameter Sweep Results:")
for params, timing in sweep_results.items():
    print(f"Block size {params['block_size']}: {timing['mean']:.4f}s")

# Analyze how performance scales with data size
def scaling_test_function(data):
    # Test function that should scale with data size
    result = cp.fft.fft(data)
    magnitude = cp.abs(result)
    return cp.sum(magnitude)

# Test with different data sizes
data_sizes = [1000, 5000, 10000, 50000, 100000, 500000, 1000000]
scaling_results = profiler.scaling_analysis(
    scaling_test_function,
    data_sizes,
    dtype=cp.complex64
)
print("Scaling Analysis:")
print("Size\t\tTime (s)\tThroughput (MB/s)")
for size, stats in scaling_results.items():
    throughput = (size * 8) / (stats['mean'] * 1024**2)  # Complex64 = 8 bytes
    print(f"{size:8}\t{stats['mean']:.4f}\t\t{throughput:.2f}")

# Memory bandwidth test
def memory_bandwidth_test(size):
    """Test memory bandwidth with different array sizes."""
    data = cp.random.rand(size, dtype=cp.float32)
    return cp.sum(data)

memory_sizes = [10**i for i in range(4, 8)]  # 10K to 10M elements
bandwidth_results = profiler.scaling_analysis(
    memory_bandwidth_test,
    memory_sizes
)
print("\nMemory Bandwidth Analysis:")
for size, stats in bandwidth_results.items():
    bandwidth_gbps = (size * 4) / (stats['mean'] * 1024**3)  # Float32 = 4 bytes
    print(f"Size: {size:8} elements, Bandwidth: {bandwidth_gbps:.2f} GB/s")

# Production profiling workflow
class ProductionProfiler:
    """
    Lightweight profiling harness for production code paths.

    Wraps the ``cupyx.profiler`` timing utilities behind a single switch so
    profiling can be turned off in production with negligible overhead.
    Collected section timings are stored in ``self.profiles`` keyed by name.
    """

    class _NullTimer:
        """No-op stand-in returned when profiling is disabled.

        Mirrors the real timer's interface (``elapsed_time`` attribute and
        context-manager protocol) without measuring anything.
        """
        elapsed_time = 0.0

        def __enter__(self):
            return self

        def __exit__(self, *args):
            return False

    def __init__(self, enable_profiling=True):
        # enable_profiling: master switch; when False every profiling entry
        # point degrades to direct execution with no measurement.
        self.enable_profiling = enable_profiling
        self.profiles = {}

    def profile_section(self, name):
        """Context manager for profiling code sections.

        Returns a real timer when profiling is enabled; otherwise a no-op
        timer whose ``elapsed_time`` is always 0.0.

        FIX: the original returned ``profiler.time_range()`` in both
        branches, so the commented "No-op profiler" path was identical to
        the enabled path and the switch had no effect here.
        """
        if not self.enable_profiling:
            return self._NullTimer()
        return profiler.time_range()

    def benchmark_operation(self, name, func, *args, **kwargs):
        """Run ``func(*args, **kwargs)``; when enabled, record its elapsed
        time under ``name``.  Returns the function's result either way."""
        if not self.enable_profiling:
            return func(*args, **kwargs)
        with profiler.time_range() as timer:
            result = func(*args, **kwargs)
        self.profiles[name] = timer.elapsed_time
        return result

    def print_summary(self):
        """Print collected timings, longest first, with percent of total."""
        if not self.profiles:
            print("No profiling data collected")
            return
        print("Performance Summary:")
        print("-" * 40)
        total_time = sum(self.profiles.values())
        # Sort sections by elapsed time, descending, so hot spots come first.
        for name, elapsed in sorted(self.profiles.items(), key=lambda x: x[1], reverse=True):
            percentage = (elapsed / total_time) * 100
            print(f"{name:25}: {elapsed:.4f}s ({percentage:.1f}%)")
        print("-" * 40)
        print(f"Total time: {total_time:.4f}s")
# Use production profiler
profiler_instance = ProductionProfiler(enable_profiling=True)

# Profile different operations
data = profiler_instance.benchmark_operation(
    "data_generation",
    cp.random.rand,
    5000, 5000
)
fft_result = profiler_instance.benchmark_operation(
    "fft_computation",
    cp.fft.fft2,
    data
)
with profiler_instance.profile_section("postprocessing") as timer:
    magnitude = cp.abs(fft_result)
    result = cp.sum(magnitude)
# profile_section does not record automatically; store the timing by hand.
profiler_instance.profiles["postprocessing"] = timer.elapsed_time

# Print comprehensive summary
profiler_instance.print_summary()

Performance profiling in CuPy provides essential tools for optimizing GPU applications, identifying bottlenecks, measuring execution characteristics, and ensuring optimal utilization of GPU resources across different computational workloads and hardware configurations.
Install with the Tessl CLI:
npx tessl i tessl/pypi-cupy-cuda11x