or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

browser-automation.mdcli-tools.mdconfiguration.mdcore-types.mdcrawlers.mderror-handling.mdevents.mdfingerprinting.mdhttp-clients.mdindex.mdrequest-management.mdsessions.mdstatistics.mdstorage.md

statistics.mddocs/

0

# Statistics

1

2

Performance monitoring and statistics collection for tracking crawling progress and system resource usage. Statistics provide insights into crawler performance, request success rates, and resource utilization.

3

4

## Capabilities

5

6

### Statistics Collector

7

8

Main statistics collection system for monitoring crawler performance and resource usage.

9

10

```python { .api }

11

class Statistics:

12

def __init__(self): ...

13

14

def get_state(self) -> StatisticsState:

15

"""

16

Get current statistics state.

17

18

Returns:

19

StatisticsState with current metrics

20

"""

21

22

def reset(self) -> None:

23

"""Reset all statistics counters."""

24

25

def start_job(self, job_name: str) -> None:

26

"""Start tracking a named job."""

27

28

def finish_job(self, job_name: str) -> None:

29

"""Finish tracking a named job."""

30

31

def increment_requests_finished(self) -> None:

32

"""Increment successful request counter."""

33

34

def increment_requests_failed(self) -> None:

35

"""Increment failed request counter."""

36

37

def increment_requests_retries(self) -> None:

38

"""Increment retry counter."""

39

40

def set_requests_total(self, total: int) -> None:

41

"""Set total expected requests."""

42

43

def log_system_info(self, interval: timedelta = timedelta(seconds=60)) -> None:

44

"""Start logging system information at intervals."""

45

46

def calculate_statistics_percentile(

47

self,

48

values: list[float],

49

percentile: float

50

) -> float:

51

"""Calculate percentile of values."""

52

53

@property

54

def requests_finished(self) -> int:

55

"""Number of successfully finished requests."""

56

57

@property

58

def requests_failed(self) -> int:

59

"""Number of failed requests."""

60

61

@property

62

def requests_total(self) -> int | None:

63

"""Total expected requests."""

64

65

@property

66

def requests_avg_failed_per_minute(self) -> float:

67

"""Average failed requests per minute."""

68

69

@property

70

def requests_avg_finished_per_minute(self) -> float:

71

"""Average successful requests per minute."""

72

73

@property

74

def crawl_duration_millis(self) -> int:

75

"""Total crawl duration in milliseconds."""

76

```

77

78

### Statistics State

79

80

Current state snapshot containing all performance metrics and counters.

81

82

```python { .api }

83

class StatisticsState:

84

def __init__(

85

self,

86

*,

87

requests_finished: int = 0,

88

requests_failed: int = 0,

89

requests_retries: int = 0,

90

requests_total: int | None = None,

91

crawl_duration_millis: int = 0,

92

requests_avg_failed_per_minute: float = 0.0,

93

requests_avg_finished_per_minute: float = 0.0,

94

requests_total_duration_millis: int = 0,

95

requests_min_duration_millis: int = 0,

96

requests_max_duration_millis: int = 0,

97

requests_avg_duration_millis: float = 0.0,

98

stats_id: str | None = None,

99

cpu_usage_percent: float = 0.0,

100

memory_usage_bytes: int = 0,

101

memory_usage_mb: float = 0.0

102

): ...

103

104

@property

105

def requests_finished(self) -> int:

106

"""Number of successfully finished requests."""

107

108

@property

109

def requests_failed(self) -> int:

110

"""Number of failed requests."""

111

112

@property

113

def requests_retries(self) -> int:

114

"""Total number of retries across all requests."""

115

116

@property

117

def requests_total(self) -> int | None:

118

"""Total expected requests."""

119

120

@property

121

def crawl_duration_millis(self) -> int:

122

"""Total crawl duration in milliseconds."""

123

124

@property

125

def requests_avg_failed_per_minute(self) -> float:

126

"""Average failed requests per minute."""

127

128

@property

129

def requests_avg_finished_per_minute(self) -> float:

130

"""Average successful requests per minute."""

131

132

@property

133

def requests_avg_duration_millis(self) -> float:

134

"""Average request duration in milliseconds."""

135

136

@property

137

def cpu_usage_percent(self) -> float:

138

"""Current CPU usage percentage."""

139

140

@property

141

def memory_usage_mb(self) -> float:

142

"""Current memory usage in megabytes."""

143

144

    def to_dict(self) -> dict[str, Any]:

145

"""Convert state to dictionary for serialization."""

146

```

147

148

### Final Statistics

149

150

Comprehensive final statistics summary generated at the end of crawling operations.

151

152

```python { .api }

153

class FinalStatistics:

154

def __init__(

155

self,

156

*,

157

requests_finished: int,

158

requests_failed: int,

159

retry_histogram: list[int],

160

requests_avg_failed_per_minute: float,

161

requests_avg_finished_per_minute: float,

162

requests_total_duration_millis: int,

163

requests_min_duration_millis: int,

164

requests_max_duration_millis: int,

165

requests_avg_duration_millis: float,

166

crawl_duration_millis: int,

167

stats_id: str | None = None

168

): ...

169

170

@property

171

def requests_finished(self) -> int:

172

"""Total successfully finished requests."""

173

174

@property

175

def requests_failed(self) -> int:

176

"""Total failed requests."""

177

178

@property

179

def requests_total(self) -> int:

180

"""Total requests processed (finished + failed)."""

181

182

@property

183

def retry_histogram(self) -> list[int]:

184

"""Histogram of retry counts [0_retries, 1_retry, 2_retries, ...]."""

185

186

@property

187

def requests_avg_failed_per_minute(self) -> float:

188

"""Average failed requests per minute."""

189

190

@property

191

def requests_avg_finished_per_minute(self) -> float:

192

"""Average successful requests per minute."""

193

194

@property

195

def requests_min_duration_millis(self) -> int:

196

"""Minimum request duration in milliseconds."""

197

198

@property

199

def requests_max_duration_millis(self) -> int:

200

"""Maximum request duration in milliseconds."""

201

202

@property

203

def requests_avg_duration_millis(self) -> float:

204

"""Average request duration in milliseconds."""

205

206

@property

207

def crawl_duration_millis(self) -> int:

208

"""Total crawl duration in milliseconds."""

209

210

@property

211

def success_rate(self) -> float:

212

"""Success rate as percentage (0-100)."""

213

214

def log_summary(self) -> None:

215

"""Log formatted summary of final statistics."""

216

217

    def to_dict(self) -> dict[str, Any]:

218

"""Convert to dictionary for serialization."""

219

```

220

221

## Usage Examples

222

223

### Basic Statistics Monitoring

224

225

```python

226

import asyncio

227

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

228

from crawlee.statistics import Statistics

229

230

async def main():

231

# Create crawler with custom statistics

232

stats = Statistics()

233

crawler = HttpCrawler(statistics=stats)

234

235

@crawler.router.default_handler

236

async def handler(context: HttpCrawlingContext):

237

# Process request

238

data = {

239

'url': context.request.url,

240

'status': context.response.status_code

241

}

242

await context.push_data(data)

243

244

# Run crawler

245

urls = [f'https://httpbin.org/delay/{i}' for i in range(1, 6)]

246

final_stats = await crawler.run(urls)

247

248

# Access statistics during crawling

249

current_state = stats.get_state()

250

print(f"Requests finished: {current_state.requests_finished}")

251

print(f"Requests failed: {current_state.requests_failed}")

252

print(f"Average duration: {current_state.requests_avg_duration_millis:.2f}ms")

253

254

# Final statistics

255

print(f"\nFinal Statistics:")

256

print(f"Success rate: {final_stats.success_rate:.1f}%")

257

print(f"Total duration: {final_stats.crawl_duration_millis}ms")

258

print(f"Retry histogram: {final_stats.retry_histogram}")

259

260

asyncio.run(main())

261

```

262

263

### Real-time Statistics Monitoring

264

265

```python

266

import asyncio

267

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

268

from crawlee.statistics import Statistics

269

270

async def monitor_statistics(stats: Statistics, interval: int = 10):

271

"""Monitor statistics in real-time during crawling."""

272

while True:

273

await asyncio.sleep(interval)

274

275

state = stats.get_state()

276

277

print(f"\n--- Statistics Update ---")

278

print(f"Finished: {state.requests_finished}")

279

print(f"Failed: {state.requests_failed}")

280

print(f"Success rate: {state.requests_finished / (state.requests_finished + state.requests_failed + 0.001) * 100:.1f}%")

281

print(f"Avg requests/min: {state.requests_avg_finished_per_minute:.1f}")

282

print(f"CPU usage: {state.cpu_usage_percent:.1f}%")

283

print(f"Memory usage: {state.memory_usage_mb:.1f}MB")

284

print(f"Duration: {state.crawl_duration_millis}ms")

285

286

# Stop monitoring when crawling is complete

287

if state.requests_total and (state.requests_finished + state.requests_failed) >= state.requests_total:

288

break

289

290

async def main():

291

stats = Statistics()

292

293

# Enable system info logging

294

stats.log_system_info(interval=timedelta(seconds=5))

295

296

crawler = HttpCrawler(

297

statistics=stats,

298

max_requests_per_crawl=50

299

)

300

301

@crawler.router.default_handler

302

async def handler(context: HttpCrawlingContext):

303

# Simulate varying processing time

304

await asyncio.sleep(random.uniform(0.1, 2.0))

305

306

data = {

307

'url': context.request.url,

308

'timestamp': datetime.now().isoformat()

309

}

310

await context.push_data(data)

311

312

# Start statistics monitoring

313

monitor_task = asyncio.create_task(monitor_statistics(stats, interval=5))

314

315

# Start crawling

316

urls = ['https://httpbin.org/delay/1'] * 20

317

final_stats = await crawler.run(urls)

318

319

# Wait for monitoring to finish

320

await monitor_task

321

322

# Print final summary

323

print(f"\n=== Final Summary ===")

324

final_stats.log_summary()

325

326

import random

327

from datetime import datetime, timedelta

328

asyncio.run(main())

329

```

330

331

### Custom Statistics Collection

332

333

```python

334

import asyncio

335

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

336

from crawlee.statistics import Statistics

337

338

class CustomStatistics(Statistics):

339

"""Extended statistics with custom metrics."""

340

341

def __init__(self):

342

super().__init__()

343

self.status_code_counts = {}

344

self.domain_counts = {}

345

self.response_sizes = []

346

347

def record_response(self, url: str, status_code: int, size: int):

348

"""Record custom response metrics."""

349

# Count status codes

350

self.status_code_counts[status_code] = self.status_code_counts.get(status_code, 0) + 1

351

352

# Count domains

353

from urllib.parse import urlparse

354

domain = urlparse(url).netloc

355

self.domain_counts[domain] = self.domain_counts.get(domain, 0) + 1

356

357

# Track response sizes

358

self.response_sizes.append(size)

359

360

    def get_custom_summary(self) -> dict[str, object]:

361

"""Get summary of custom metrics."""

362

avg_size = sum(self.response_sizes) / len(self.response_sizes) if self.response_sizes else 0

363

364

return {

365

'status_codes': dict(self.status_code_counts),

366

'domains': dict(self.domain_counts),

367

'response_size_avg': avg_size,

368

'response_size_min': min(self.response_sizes) if self.response_sizes else 0,

369

'response_size_max': max(self.response_sizes) if self.response_sizes else 0

370

}

371

372

async def main():

373

custom_stats = CustomStatistics()

374

375

crawler = HttpCrawler(statistics=custom_stats)

376

377

@crawler.router.default_handler

378

async def handler(context: HttpCrawlingContext):

379

response = context.response

380

381

# Record custom metrics

382

response_size = len(response.content)

383

custom_stats.record_response(

384

url=response.url,

385

status_code=response.status_code,

386

size=response_size

387

)

388

389

data = {

390

'url': response.url,

391

'status': response.status_code,

392

'size': response_size

393

}

394

await context.push_data(data)

395

396

# Test with various URLs

397

urls = [

398

'https://httpbin.org/json',

399

'https://httpbin.org/html',

400

'https://httpbin.org/xml',

401

'https://httpbin.org/status/404',

402

'https://httpbin.org/status/500'

403

]

404

405

await crawler.run(urls)

406

407

# Get custom statistics

408

custom_summary = custom_stats.get_custom_summary()

409

410

print("Custom Statistics:")

411

print(f"Status codes: {custom_summary['status_codes']}")

412

print(f"Domains: {custom_summary['domains']}")

413

print(f"Avg response size: {custom_summary['response_size_avg']:.0f} bytes")

414

print(f"Min response size: {custom_summary['response_size_min']} bytes")

415

print(f"Max response size: {custom_summary['response_size_max']} bytes")

416

417

asyncio.run(main())

418

```

419

420

### Statistics Persistence

421

422

```python

423

import asyncio

424

import json

425

from datetime import datetime

426

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

427

from crawlee.statistics import Statistics

428

from crawlee.storages import KeyValueStore

429

430

async def save_statistics_periodically(stats: Statistics, store: KeyValueStore, interval: int = 30):

431

"""Save statistics to storage periodically."""

432

while True:

433

await asyncio.sleep(interval)

434

435

state = stats.get_state()

436

timestamp = datetime.now().isoformat()

437

438

# Save current state

439

await store.set_value(

440

f'stats_{timestamp}',

441

state.to_dict()

442

)

443

444

print(f"Statistics saved at {timestamp}")

445

446

async def main():

447

stats = Statistics()

448

stats_store = await KeyValueStore.open('crawl-statistics')

449

450

crawler = HttpCrawler(statistics=stats, max_requests_per_crawl=30)

451

452

@crawler.router.default_handler

453

async def handler(context: HttpCrawlingContext):

454

await asyncio.sleep(random.uniform(0.5, 2.0)) # Simulate work

455

456

data = {'url': context.request.url, 'processed_at': datetime.now().isoformat()}

457

await context.push_data(data)

458

459

# Start periodic statistics saving

460

save_task = asyncio.create_task(

461

save_statistics_periodically(stats, stats_store, interval=10)

462

)

463

464

# Run crawler

465

urls = ['https://httpbin.org/delay/1'] * 25

466

final_stats = await crawler.run(urls)

467

468

# Cancel periodic saving

469

save_task.cancel()

470

471

# Save final statistics

472

await stats_store.set_value('final_stats', final_stats.to_dict())

473

474

print("Final statistics saved to storage")

475

476

# Demonstrate reading saved statistics

477

saved_final = await stats_store.get_value('final_stats')

478

print(f"Retrieved final stats: Success rate = {saved_final['success_rate']:.1f}%")

479

480

import random

481

asyncio.run(main())

482

```

483

484

### Performance Analysis

485

486

```python

487

import asyncio

488

import time

489

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

490

from crawlee.statistics import Statistics

491

492

async def analyze_performance(stats: Statistics, final_stats):

493

"""Analyze crawler performance metrics."""

494

495

print("=== Performance Analysis ===")

496

497

# Basic metrics

498

total_requests = final_stats.requests_total

499

success_rate = final_stats.success_rate

500

avg_duration = final_stats.requests_avg_duration_millis

501

502

print(f"Total requests: {total_requests}")

503

print(f"Success rate: {success_rate:.1f}%")

504

print(f"Average request duration: {avg_duration:.2f}ms")

505

506

# Throughput analysis

507

crawl_duration_seconds = final_stats.crawl_duration_millis / 1000

508

throughput = total_requests / crawl_duration_seconds if crawl_duration_seconds > 0 else 0

509

510

print(f"Crawl duration: {crawl_duration_seconds:.1f}s")

511

print(f"Throughput: {throughput:.2f} requests/second")

512

513

# Retry analysis

514

retry_histogram = final_stats.retry_histogram

515

total_retries = sum(i * count for i, count in enumerate(retry_histogram))

516

517

print(f"Total retries: {total_retries}")

518

print(f"Retry distribution: {retry_histogram}")

519

520

if len(retry_histogram) > 1:

521

retry_rate = sum(retry_histogram[1:]) / total_requests * 100

522

print(f"Requests requiring retries: {retry_rate:.1f}%")

523

524

# Performance rating

525

if success_rate > 95 and avg_duration < 2000:

526

rating = "Excellent"

527

elif success_rate > 90 and avg_duration < 5000:

528

rating = "Good"

529

elif success_rate > 80:

530

rating = "Fair"

531

else:

532

rating = "Poor"

533

534

print(f"Performance rating: {rating}")

535

536

async def main():

537

stats = Statistics()

538

539

# Configure crawler with different settings for performance testing

540

crawler = HttpCrawler(

541

statistics=stats,

542

max_requests_per_crawl=20,

543

max_request_retries=3,

544

max_concurrent_requests=5

545

)

546

547

request_times = []

548

549

@crawler.router.default_handler

550

async def handler(context: HttpCrawlingContext):

551

start_time = time.time()

552

553

# Simulate varying response times

554

delay = random.choice([0.5, 1.0, 1.5, 2.0, 3.0])

555

await asyncio.sleep(delay)

556

557

end_time = time.time()

558

request_times.append((end_time - start_time) * 1000) # Convert to ms

559

560

# Occasionally fail to test retry logic

561

if random.random() < 0.1: # 10% failure rate

562

raise Exception("Simulated failure")

563

564

data = {

565

'url': context.request.url,

566

'duration_ms': (end_time - start_time) * 1000

567

}

568

await context.push_data(data)

569

570

# Run performance test

571

urls = ['https://httpbin.org/delay/1'] * 20

572

final_stats = await crawler.run(urls)

573

574

# Analyze performance

575

await analyze_performance(stats, final_stats)

576

577

# Additional custom analysis

578

if request_times:

579

print(f"\nCustom timing analysis:")

580

print(f"Min request time: {min(request_times):.2f}ms")

581

print(f"Max request time: {max(request_times):.2f}ms")

582

print(f"Median request time: {sorted(request_times)[len(request_times)//2]:.2f}ms")

583

584

import random

585

asyncio.run(main())

586

```