Statistical Analysis and Reporting

Overview

pytest-benchmark provides comprehensive statistical analysis of benchmark results, with multiple output formats including console tables, CSV exports, histograms, and cProfile integration. The statistical engine computes measures of central tendency and variability and performs outlier detection.
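
For orientation, a minimal test shows where these statistics come from (a sketch; the function under test is arbitrary):

def fib(n):
    return n if n < 2 else fib(n - 1) + fib(n - 2)

def test_fib_performance(benchmark):
    # benchmark() runs fib repeatedly; min/mean/stddev/... are
    # computed from the recorded per-round timings
    result = benchmark(fib, 10)
    assert result == 55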

Statistical Measures

Stats Class

class Stats:
    """Statistical calculations for benchmark timing data."""
    
    fields = (
        'min', 'max', 'mean', 'stddev', 'rounds', 'median', 
        'iqr', 'q1', 'q3', 'iqr_outliers', 'stddev_outliers', 
        'outliers', 'ld15iqr', 'hd15iqr', 'ops', 'total'
    )
    
    def __init__(self):
        """Initialize with empty data list."""
        self.data: list[float] = []
    
    def update(self, duration: float) -> None:
        """Add a timing measurement to the dataset."""
    
    def as_dict(self) -> dict:
        """Return all statistics as a dictionary."""

Core Statistical Properties

@property
def min(self) -> float:
    """Minimum execution time in seconds."""

@property  
def max(self) -> float:
    """Maximum execution time in seconds."""

@property
def mean(self) -> float:
    """Arithmetic mean execution time in seconds."""

@property
def median(self) -> float:
    """Median execution time in seconds."""

@property
def stddev(self) -> float:
    """Standard deviation of execution times in seconds."""

@property
def rounds(self) -> int:
    """Number of timing rounds executed."""

@property
def total(self) -> float:
    """Total execution time across all rounds in seconds."""

Quartile and Outlier Analysis

@property
def q1(self) -> float:
    """First quartile (25th percentile) in seconds."""

@property
def q3(self) -> float:
    """Third quartile (75th percentile) in seconds."""

@property
def iqr(self) -> float:
    """Interquartile range (Q3 - Q1) in seconds."""

@property
def ld15iqr(self) -> float:
    """Lowest datum within 1.5 IQR of Q1 (Tukey's method)."""

@property
def hd15iqr(self) -> float:
    """Highest datum within 1.5 IQR of Q3 (Tukey's method)."""

@property
def iqr_outliers(self) -> int:
    """Count of outliers beyond 1.5 IQR from quartiles."""

@property
def stddev_outliers(self) -> int:
    """Count of outliers beyond one standard deviation from mean."""

@property
def outliers(self) -> str:
    """Formatted string describing outlier counts."""

@property
def ops(self) -> float:
    """Operations per second (1 / mean)."""

Table Display

TableResults Class

class TableResults:
    """Formats benchmark results as console tables."""
    
    def __init__(self, benchmarks: list, columns: list, sort_key: str, logger, scale_unit: callable):
        """
        Initialize table formatter.
        
        Args:
            benchmarks: List of benchmark result objects
            columns: List of column names to display
            sort_key: Column to sort results by
            logger: Logger for output
            scale_unit: Function to determine time unit scaling
        """
    
    def display(self, tr) -> None:
        """Display formatted table to terminal."""

Column Options

# Available table columns:
COLUMNS = [
    'min',          # Minimum time
    'max',          # Maximum time  
    'mean',         # Mean time
    'stddev',       # Standard deviation
    'median',       # Median time
    'iqr',          # Interquartile range
    'outliers',     # Outlier summary
    'ops',          # Operations per second
    'rounds',       # Number of rounds
    'iterations'    # Iterations per round
]

Display Examples

# Default table output
pytest --benchmark-only

# Custom columns
pytest --benchmark-columns=min,max,mean,ops,rounds

# Sort by different metric
pytest --benchmark-sort=ops

# Group results differently
pytest --benchmark-group-by=func

CSV Export

CSVResults Class

class CSVResults:
    """Export benchmark results to CSV format."""
    
    def __init__(self, benchmarks: list, filename: str, logger):
        """
        Initialize CSV exporter.
        
        Args:
            benchmarks: List of benchmark results
            filename: Output CSV filename
            logger: Logger instance
        """
    
    def save(self) -> None:
        """Save results to CSV file."""

CSV Usage

# Export to CSV
pytest --benchmark-csv=results.csv

# CSV with timestamp
pytest --benchmark-csv=benchmark_$(date +%Y%m%d_%H%M%S).csv

# Multiple exports
pytest --benchmark-csv=summary.csv --benchmark-json=detailed.json

CSV Format

# Example CSV output structure:
name,min,max,mean,stddev,rounds,median,iqr,q1,q3,iqr_outliers,stddev_outliers,ops,total
test_function[param1],0.001,0.002,0.0015,0.0003,10,0.0014,0.0004,0.0012,0.0016,0,1,666.67,0.015
test_function[param2],0.002,0.003,0.0025,0.0004,10,0.0024,0.0005,0.0021,0.0026,1,0,400.0,0.025
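
The exported file is plain CSV, so it can be post-processed with the standard library (a sketch assuming the results.csv produced above):

import csv

with open("results.csv", newline="") as fh:
    for row in csv.DictReader(fh):
        # each row is one benchmark; numeric fields arrive as strings
        print(row["name"], float(row["mean"]), float(row["ops"]))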

Histogram Generation

Histogram Module

# Histogram generation requires pygal and pygaljs
# Install with: pip install pytest-benchmark[histogram]

def generate_histogram(benchmarks: list, filename_prefix: str) -> None:
    """
    Generate SVG histograms for benchmark results.
    
    Args:
        benchmarks: List of benchmark results
        filename_prefix: Prefix for output SVG files
    """

Histogram Usage

# Generate histograms
pytest --benchmark-histogram=charts

# Custom prefix with path
pytest --benchmark-histogram=results/benchmark_charts

# Histograms with comparison
pytest --benchmark-compare=baseline --benchmark-histogram=comparison

Histogram Output

# Generated files:
charts-test_function.svg        # Individual test histogram
charts-comparison.svg           # Comparison chart (if --benchmark-compare used)

cProfile Integration

cProfile Options

# Enable cProfile with sort column
--benchmark-cprofile COLUMN

# Available sort columns:
--benchmark-cprofile ncalls             # Number of calls
--benchmark-cprofile ncalls_recursion   # Calls including recursion
--benchmark-cprofile tottime            # Total time excluding subcalls
--benchmark-cprofile tottime_per        # Total time per call
--benchmark-cprofile cumtime            # Cumulative time including subcalls
--benchmark-cprofile cumtime_per        # Cumulative time per call
--benchmark-cprofile function_name      # Function name

cProfile Configuration

# Control profiling behavior
--benchmark-cprofile-loops LOOPS     # Iterations to profile (default: 1)
--benchmark-cprofile-top COUNT       # Top N functions to display (default: 25)
--benchmark-cprofile-dump PREFIX     # Save profile dumps to files

cProfile Usage Examples

# Basic profiling sorted by cumulative time
pytest --benchmark-cprofile=cumtime

# Detailed profiling with more functions shown
pytest --benchmark-cprofile=tottime --benchmark-cprofile-top=50

# Save profile dumps for external analysis
pytest --benchmark-cprofile=cumtime --benchmark-cprofile-dump=profiles

Profile Output

def test_profile_example(benchmark):
    def complex_function():
        # This will be profiled
        data = [x**2 for x in range(10000)]
        return sum(data)
    
    result = benchmark(complex_function)
    assert result == 333283335000

# Command: pytest --benchmark-cprofile=cumtime --benchmark-cprofile-top=10
# Output shows top functions by cumulative time

Time Unit Scaling

Automatic Unit Selection

def time_unit(seconds: float) -> tuple[str, float]:
    """
    Automatically select appropriate time unit.
    
    Args:
        seconds: Time value in seconds
        
    Returns:
        tuple: (unit_symbol, scale_factor)
        
    Examples:
        time_unit(0.000001) -> ('u', 1000000)  # microseconds
        time_unit(0.001) -> ('m', 1000)       # milliseconds  
        time_unit(1.0) -> ('', 1)             # seconds
    """

Manual Unit Selection

# Force specific time units
pytest --benchmark-time-unit=ns     # nanoseconds
pytest --benchmark-time-unit=us     # microseconds  
pytest --benchmark-time-unit=ms     # milliseconds
pytest --benchmark-time-unit=s      # seconds
pytest --benchmark-time-unit=auto   # automatic (default)

Metadata and Context

Metadata Class

class Metadata:
    """Container for benchmark metadata and statistics."""
    
    def __init__(self, fixture, iterations: int, options: dict):
        """
        Initialize benchmark metadata.
        
        Args:
            fixture: BenchmarkFixture instance
            iterations: Number of iterations per round
            options: Benchmark configuration options
        """
    
    def as_dict(self, include_data: bool = False) -> dict:
        """
        Export metadata as dictionary.
        
        Args:
            include_data: Whether to include raw timing data
            
        Returns:
            dict: Complete benchmark metadata
        """

Context Information

# Benchmark context automatically includes:
{
    "name": str,           # Test function name
    "fullname": str,       # Full pytest node ID
    "group": str,          # Benchmark group
    "params": dict,        # Test parameters
    "param": str,          # Parameter string
    "extra_info": dict,    # Additional metadata
    "stats": dict,         # Statistical measures
    "options": dict        # Benchmark options used
}
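
The `extra_info` entry is filled from the fixture's `extra_info` dict, which tests may populate themselves; for example:

def test_with_context(benchmark):
    # entries set here appear under "extra_info" in JSON/CSV reports
    benchmark.extra_info['dataset_size'] = 10000
    benchmark.extra_info['algorithm'] = 'v2'
    result = benchmark(sum, range(10000))
    assert result == 49995000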

Custom Reporting

Report Generation Hooks

def pytest_benchmark_generate_json(config, benchmarks, include_data, machine_info, commit_info) -> dict:
    """
    Generate JSON report data.
    
    Args:
        config: pytest configuration
        benchmarks: List of benchmark results
        include_data: Whether to include raw timing data
        machine_info: Machine information dict
        commit_info: Git commit information dict
        
    Returns:
        dict: Complete JSON report structure
    """

Custom Display Hooks

def pytest_benchmark_group_stats(config, benchmarks, group_by):
    """Custom grouping logic for result display."""
    
def pytest_benchmark_scale_unit(config, unit, benchmarks, best, worst, sort):
    """Custom unit scaling for result display."""

Analysis Examples

Performance Trend Analysis

def test_trend_analysis(benchmark):
    """Example of capturing trend data."""
    def algorithm_v1():
        return sum(x**2 for x in range(1000))
    
    result = benchmark(algorithm_v1)
    
    # Results automatically include statistical analysis:
    # - Mean execution time with confidence intervals
    # - Outlier detection and classification  
    # - Operations per second calculation
    # - Comparison with previous runs (if --benchmark-compare used)
    
    assert result == 332833500

Statistical Validation

def test_statistical_validation(benchmark):
    """Validate statistical measures."""
    def consistent_function():
        # Function with predictable performance
        return sum(range(100))
    
    result = benchmark(consistent_function)
    
    # Access statistics after benchmarking (in some pytest-benchmark versions
    # the Stats object is nested one level deeper, at benchmark.stats.stats)
    stats = benchmark.stats
    assert stats.min > 0
    assert stats.max >= stats.min
    assert stats.mean >= stats.min
    assert stats.stddev >= 0
    assert stats.rounds >= 1
    
    assert result == 4950

Comparative Analysis

# Generate comparative reports
pytest --benchmark-save=implementation_a tests/
pytest --benchmark-save=implementation_b tests/
pytest-benchmark compare implementation_a implementation_b --csv=comparison.csv
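
Saved runs are JSON files under the `.benchmarks` storage directory, so they can also be analyzed directly (a sketch; the exact subdirectory layout varies by platform):

import glob
import json

for path in glob.glob(".benchmarks/**/*.json", recursive=True):
    with open(path) as fh:
        report = json.load(fh)
    for bench in report["benchmarks"]:
        print(path, bench["name"], bench["stats"]["mean"])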

Troubleshooting Statistics

Calibration Issues

def test_calibration_debugging(benchmark):
    def micro_function():
        return 42
    
    # Very fast functions may have calibration challenges
    # Use pedantic mode for precise control
    result = benchmark.pedantic(
        target=micro_function,
        rounds=1000,      # Many rounds for statistical significance  
        iterations=10000  # Many iterations per round
    )
    assert result == 42

Timer Resolution

# Debug timer precision issues
pytest --benchmark-verbose --benchmark-calibration-precision=100

Outlier Investigation

def test_outlier_analysis(benchmark):
    def variable_function():
        # Function with deliberately variable performance
        import random
        import time
        time.sleep(random.uniform(0.001, 0.002))
        return sum(range(100))
    
    result = benchmark(variable_function)
    
    # Check outlier statistics
    stats = benchmark.stats
    if stats.iqr_outliers > stats.rounds * 0.1:
        # More than 10% outliers - investigate environment
        pass
    
    assert result == 4950

tile.json