or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

analysis-reporting.md, aspect-benchmarking.md, cli-tools.md, configuration.md, core-benchmarking.md, index.md, storage-comparison.md

docs/analysis-reporting.md

0

# Statistical Analysis and Reporting

1

2

## Overview

3

4

pytest-benchmark provides comprehensive statistical analysis of benchmark results with multiple output formats including tables, CSV exports, histograms, and cProfile integration. The statistical engine computes various measures of central tendency, variability, and outlier detection.

5

6

## Statistical Measures

7

8

### Stats Class

9

10

```python { .api }

11

class Stats:

12

"""Statistical calculations for benchmark timing data."""

13

14

fields = (

15

'min', 'max', 'mean', 'stddev', 'rounds', 'median',

16

'iqr', 'q1', 'q3', 'iqr_outliers', 'stddev_outliers',

17

'outliers', 'ld15iqr', 'hd15iqr', 'ops', 'total'

18

)

19

20

def __init__(self):

21

"""Initialize with empty data list."""

22

self.data: list[float] = []

23

24

def update(self, duration: float) -> None:

25

"""Add a timing measurement to the dataset."""

26

27

def as_dict(self) -> dict:

28

"""Return all statistics as a dictionary."""

29

```

30

31

### Core Statistical Properties

32

33

```python { .api }

34

@property

35

def min(self) -> float:

36

"""Minimum execution time in seconds."""

37

38

@property

39

def max(self) -> float:

40

"""Maximum execution time in seconds."""

41

42

@property

43

def mean(self) -> float:

44

"""Arithmetic mean execution time in seconds."""

45

46

@property

47

def median(self) -> float:

48

"""Median execution time in seconds."""

49

50

@property

51

def stddev(self) -> float:

52

"""Standard deviation of execution times in seconds."""

53

54

@property

55

def rounds(self) -> int:

56

"""Number of timing rounds executed."""

57

58

@property

59

def total(self) -> float:

60

"""Total execution time across all rounds in seconds."""

61

```

62

63

### Quartile and Outlier Analysis

64

65

```python { .api }

66

@property

67

def q1(self) -> float:

68

"""First quartile (25th percentile) in seconds."""

69

70

@property

71

def q3(self) -> float:

72

"""Third quartile (75th percentile) in seconds."""

73

74

@property

75

def iqr(self) -> float:

76

"""Interquartile range (Q3 - Q1) in seconds."""

77

78

@property

79

def ld15iqr(self) -> float:

80

"""Lowest datum within 1.5 IQR of Q1 (Tukey's method)."""

81

82

@property

83

def hd15iqr(self) -> float:

84

"""Highest datum within 1.5 IQR of Q3 (Tukey's method)."""

85

86

@property

87

def iqr_outliers(self) -> int:

88

"""Count of outliers beyond 1.5 IQR from quartiles."""

89

90

@property

91

def stddev_outliers(self) -> int:

92

"""Count of outliers beyond one standard deviation from mean."""

93

94

@property

95

def outliers(self) -> str:

96

"""Formatted string describing outlier counts."""

97

98

@property

99

def ops(self) -> float:

100

"""Operations per second (1 / mean)."""

101

```

102

103

## Table Display

104

105

### TableResults Class

106

107

```python { .api }

108

class TableResults:

109

"""Formats benchmark results as console tables."""

110

111

def __init__(self, benchmarks: list, columns: list, sort_key: str, logger, scale_unit: callable):

112

"""

113

Initialize table formatter.

114

115

Args:

116

benchmarks: List of benchmark result objects

117

columns: List of column names to display

118

sort_key: Column to sort results by

119

logger: Logger for output

120

scale_unit: Function to determine time unit scaling

121

"""

122

123

def display(self, tr) -> None:

124

"""Display formatted table to terminal."""

125

```

126

127

### Column Options

128

129

```python { .api }

130

# Available table columns:

131

COLUMNS = [

132

'min', # Minimum time

133

'max', # Maximum time

134

'mean', # Mean time

135

'stddev', # Standard deviation

136

'median', # Median time

137

'iqr', # Interquartile range

138

'outliers', # Outlier summary

139

'ops', # Operations per second

140

'rounds', # Number of rounds

141

'iterations' # Iterations per round

142

]

143

```

144

145

### Display Examples

146

147

```bash

148

# Default table output

149

pytest --benchmark-only

150

151

# Custom columns

152

pytest --benchmark-columns=min,max,mean,ops,rounds

153

154

# Sort by different metric

155

pytest --benchmark-sort=ops

156

157

# Group results differently

158

pytest --benchmark-group-by=func

159

```

160

161

## CSV Export

162

163

### CSVResults Class

164

165

```python { .api }

166

class CSVResults:

167

"""Export benchmark results to CSV format."""

168

169

def __init__(self, benchmarks: list, filename: str, logger):

170

"""

171

Initialize CSV exporter.

172

173

Args:

174

benchmarks: List of benchmark results

175

filename: Output CSV filename

176

logger: Logger instance

177

"""

178

179

def save(self) -> None:

180

"""Save results to CSV file."""

181

```

182

183

### CSV Usage

184

185

```bash

186

# Export to CSV

187

pytest --benchmark-csv=results.csv

188

189

# CSV with timestamp

190

pytest --benchmark-csv=benchmark_$(date +%Y%m%d_%H%M%S).csv

191

192

# Multiple exports

193

pytest --benchmark-csv=summary.csv --benchmark-json=detailed.json

194

```

195

196

### CSV Format

197

198

```csv

199

# Example CSV output structure:

200

name,min,max,mean,stddev,rounds,median,iqr,q1,q3,iqr_outliers,stddev_outliers,ops,total

201

test_function[param1],0.001,0.002,0.0015,0.0003,10,0.0014,0.0004,0.0012,0.0016,0,1,666.67,0.015

202

test_function[param2],0.002,0.003,0.0025,0.0004,10,0.0024,0.0005,0.0021,0.0026,1,0,400.0,0.025

203

```

204

205

## Histogram Generation

206

207

### Histogram Module

208

209

```python { .api }

210

# Histogram generation requires pygal and pygaljs

211

# Install with: pip install pytest-benchmark[histogram]

212

213

def generate_histogram(benchmarks: list, filename_prefix: str) -> None:

214

"""

215

Generate SVG histograms for benchmark results.

216

217

Args:

218

benchmarks: List of benchmark results

219

filename_prefix: Prefix for output SVG files

220

"""

221

```

222

223

### Histogram Usage

224

225

```bash

226

# Generate histograms

227

pytest --benchmark-histogram=charts

228

229

# Custom prefix with path

230

pytest --benchmark-histogram=results/benchmark_charts

231

232

# Histograms with comparison

233

pytest --benchmark-compare=baseline --benchmark-histogram=comparison

234

```

235

236

### Histogram Output

237

238

```bash

239

# Generated files:

240

charts-test_function.svg # Individual test histogram

241

charts-comparison.svg # Comparison chart (if --benchmark-compare used)

242

```

243

244

## cProfile Integration

245

246

### cProfile Options

247

248

```bash { .api }

249

# Enable cProfile with sort column

250

--benchmark-cprofile COLUMN

251

252

# Available sort columns:

253

--benchmark-cprofile ncalls # Number of calls

254

--benchmark-cprofile ncalls_recursion # Calls including recursion

255

--benchmark-cprofile tottime # Total time excluding subcalls

256

--benchmark-cprofile tottime_per # Total time per call

257

--benchmark-cprofile cumtime # Cumulative time including subcalls

258

--benchmark-cprofile cumtime_per # Cumulative time per call

259

--benchmark-cprofile function_name # Function name

260

```

261

262

### cProfile Configuration

263

264

```bash

265

# Control profiling behavior

266

--benchmark-cprofile-loops LOOPS # Iterations to profile (default: 1)

267

--benchmark-cprofile-top COUNT # Top N functions to display (default: 25)

268

--benchmark-cprofile-dump PREFIX # Save profile dumps to files

269

```

270

271

### cProfile Usage Examples

272

273

```bash

274

# Basic profiling sorted by cumulative time

275

pytest --benchmark-cprofile=cumtime

276

277

# Detailed profiling with more functions shown

278

pytest --benchmark-cprofile=tottime --benchmark-cprofile-top=50

279

280

# Save profile dumps for external analysis

281

pytest --benchmark-cprofile=cumtime --benchmark-cprofile-dump=profiles

282

```

283

284

### Profile Output

285

286

```python

287

def test_profile_example(benchmark):

288

def complex_function():

289

# This will be profiled

290

data = [x**2 for x in range(10000)]

291

return sum(data)

292

293

result = benchmark(complex_function)

294

assert result == 333283335000

295

296

# Command: pytest --benchmark-cprofile=cumtime --benchmark-cprofile-top=10

297

# Output shows top functions by cumulative time

298

```

299

300

## Time Unit Scaling

301

302

### Automatic Unit Selection

303

304

```python { .api }

305

def time_unit(seconds: float) -> tuple[str, float]:

306

"""

307

Automatically select appropriate time unit.

308

309

Args:

310

seconds: Time value in seconds

311

312

Returns:

313

tuple: (unit_symbol, scale_factor)

314

315

Examples:

316

time_unit(0.000001) -> ('u', 1000000) # microseconds

317

time_unit(0.001) -> ('m', 1000) # milliseconds

318

time_unit(1.0) -> ('', 1) # seconds

319

"""

320

```

321

322

### Manual Unit Selection

323

324

```bash

325

# Force specific time units

326

pytest --benchmark-time-unit=ns # nanoseconds

327

pytest --benchmark-time-unit=us # microseconds

328

pytest --benchmark-time-unit=ms # milliseconds

329

pytest --benchmark-time-unit=s # seconds

330

pytest --benchmark-time-unit=auto # automatic (default)

331

```

332

333

## Metadata and Context

334

335

### Metadata Class

336

337

```python { .api }

338

class Metadata:

339

"""Container for benchmark metadata and statistics."""

340

341

def __init__(self, fixture, iterations: int, options: dict):

342

"""

343

Initialize benchmark metadata.

344

345

Args:

346

fixture: BenchmarkFixture instance

347

iterations: Number of iterations per round

348

options: Benchmark configuration options

349

"""

350

351

def as_dict(self, include_data: bool = False) -> dict:

352

"""

353

Export metadata as dictionary.

354

355

Args:

356

include_data: Whether to include raw timing data

357

358

Returns:

359

dict: Complete benchmark metadata

360

"""

361

```

362

363

### Context Information

364

365

```python { .api }

366

# Benchmark context automatically includes:

367

{

368

"name": str, # Test function name

369

"fullname": str, # Full pytest node ID

370

"group": str, # Benchmark group

371

"params": dict, # Test parameters

372

"param": str, # Parameter string

373

"extra_info": dict, # Additional metadata

374

"stats": dict, # Statistical measures

375

"options": dict # Benchmark options used

376

}

377

```

378

379

## Custom Reporting

380

381

### Report Generation Hooks

382

383

```python { .api }

384

def pytest_benchmark_generate_json(config, benchmarks, include_data, machine_info, commit_info) -> dict:

385

"""

386

Generate JSON report data.

387

388

Args:

389

config: pytest configuration

390

benchmarks: List of benchmark results

391

include_data: Whether to include raw timing data

392

machine_info: Machine information dict

393

commit_info: Git commit information dict

394

395

Returns:

396

dict: Complete JSON report structure

397

"""

398

```

399

400

### Custom Display Hooks

401

402

```python

403

def pytest_benchmark_group_stats(config, benchmarks, group_by):

404

"""Custom grouping logic for result display."""

405

406

def pytest_benchmark_scale_unit(config, unit, benchmarks, best, worst, sort):

407

"""Custom unit scaling for result display."""

408

```

409

410

## Analysis Examples

411

412

### Performance Trend Analysis

413

414

```python

415

def test_trend_analysis(benchmark):

416

"""Example of capturing trend data."""

417

def algorithm_v1():

418

return sum(x**2 for x in range(1000))

419

420

result = benchmark(algorithm_v1)

421

422

# Results automatically include statistical analysis:

423

# - Mean execution time with confidence intervals

424

# - Outlier detection and classification

425

# - Operations per second calculation

426

# - Comparison with previous runs (if --benchmark-compare used)

427

428

assert result == 332833500

429

```

430

431

### Statistical Validation

432

433

```python

434

def test_statistical_validation(benchmark):

435

"""Validate statistical measures."""

436

def consistent_function():

437

# Function with predictable performance

438

return sum(range(100))

439

440

result = benchmark(consistent_function)

441

442

# Access statistics after benchmarking

443

stats = benchmark.stats

444

assert stats.min > 0

445

assert stats.max >= stats.min

446

assert stats.mean >= stats.min

447

assert stats.stddev >= 0

448

assert stats.rounds >= 1

449

450

assert result == 4950

451

```

452

453

### Comparative Analysis

454

455

```bash

456

# Generate comparative reports

457

pytest --benchmark-save=implementation_a tests/

458

pytest --benchmark-save=implementation_b tests/

459

pytest-benchmark compare implementation_a implementation_b --csv=comparison.csv

460

```

461

462

## Troubleshooting Statistics

463

464

### Calibration Issues

465

466

```python

467

def test_calibration_debugging(benchmark):

468

def micro_function():

469

return 42

470

471

# Very fast functions may have calibration challenges

472

# Use pedantic mode for precise control

473

result = benchmark.pedantic(

474

target=micro_function,

475

rounds=1000, # Many rounds for statistical significance

476

iterations=10000 # Many iterations per round

477

)

478

assert result == 42

479

```

480

481

### Timer Resolution

482

483

```bash

484

# Debug timer precision issues

485

pytest --benchmark-verbose --benchmark-calibration-precision=100

486

```

487

488

### Outlier Investigation

489

490

```python

491

def test_outlier_analysis(benchmark):

492

def variable_function():

493

# Function with variable performance

494

import time
import random

495

time.sleep(random.uniform(0.001, 0.002))

496

return sum(range(100))

497

498

result = benchmark(variable_function)

499

500

# Check outlier statistics

501

stats = benchmark.stats

502

if stats.iqr_outliers > stats.rounds * 0.1:

503

# More than 10% outliers - investigate environment

504

pass

505

506

assert result == 4950

507

```