# Statistics

Performance monitoring and statistics collection for tracking crawling progress and system resource usage. Statistics provide insights into crawler performance, request success rates, and resource utilization.

## Capabilities

### Statistics Collector

Main statistics collection system for monitoring crawler performance and resource usage.

```python { .api }
class Statistics:
    """Collects crawler performance metrics and system resource usage."""

    def __init__(self): ...

    def get_state(self) -> StatisticsState:
        """
        Get current statistics state.

        Returns:
            StatisticsState with current metrics
        """

    def reset(self) -> None:
        """Reset all statistics counters."""

    def start_job(self, job_name: str) -> None:
        """Start tracking a named job."""

    def finish_job(self, job_name: str) -> None:
        """Finish tracking a named job."""

    def increment_requests_finished(self) -> None:
        """Increment successful request counter."""

    def increment_requests_failed(self) -> None:
        """Increment failed request counter."""

    def increment_requests_retries(self) -> None:
        """Increment retry counter."""

    def set_requests_total(self, total: int) -> None:
        """Set total expected requests."""

    def log_system_info(self, interval: timedelta = timedelta(seconds=60)) -> None:
        """Start logging system information at intervals."""

    def calculate_statistics_percentile(
        self,
        values: list[float],
        percentile: float
    ) -> float:
        """Calculate percentile of values."""

    @property
    def requests_finished(self) -> int:
        """Number of successfully finished requests."""

    @property
    def requests_failed(self) -> int:
        """Number of failed requests."""

    @property
    def requests_total(self) -> int | None:
        """Total expected requests."""

    @property
    def requests_avg_failed_per_minute(self) -> float:
        """Average failed requests per minute."""

    @property
    def requests_avg_finished_per_minute(self) -> float:
        """Average successful requests per minute."""

    @property
    def crawl_duration_millis(self) -> int:
        """Total crawl duration in milliseconds."""
```

### Statistics State

Current state snapshot containing all performance metrics and counters.

```python { .api }
83
class StatisticsState:
84
def __init__(
85
self,
86
*,
87
requests_finished: int = 0,
88
requests_failed: int = 0,
89
requests_retries: int = 0,
90
requests_total: int | None = None,
91
crawl_duration_millis: int = 0,
92
requests_avg_failed_per_minute: float = 0.0,
93
requests_avg_finished_per_minute: float = 0.0,
94
requests_total_duration_millis: int = 0,
95
requests_min_duration_millis: int = 0,
96
requests_max_duration_millis: int = 0,
97
requests_avg_duration_millis: float = 0.0,
98
stats_id: str | None = None,
99
cpu_usage_percent: float = 0.0,
100
memory_usage_bytes: int = 0,
101
memory_usage_mb: float = 0.0
102
): ...
103
104
@property
105
def requests_finished(self) -> int:
106
"""Number of successfully finished requests."""
107
108
@property
109
def requests_failed(self) -> int:
110
"""Number of failed requests."""
111
112
@property
113
def requests_retries(self) -> int:
114
"""Total number of retries across all requests."""
115
116
@property
117
def requests_total(self) -> int | None:
118
"""Total expected requests."""
119
120
@property
121
def crawl_duration_millis(self) -> int:
122
"""Total crawl duration in milliseconds."""
123
124
@property
125
def requests_avg_failed_per_minute(self) -> float:
126
"""Average failed requests per minute."""
127
128
@property
129
def requests_avg_finished_per_minute(self) -> float:
130
"""Average successful requests per minute."""
131
132
@property
133
def requests_avg_duration_millis(self) -> float:
134
"""Average request duration in milliseconds."""
135
136
@property
137
def cpu_usage_percent(self) -> float:
138
"""Current CPU usage percentage."""
139
140
@property
141
def memory_usage_mb(self) -> float:
142
"""Current memory usage in megabytes."""
143
144
def to_dict(self) -> dict[str, any]:
145
"""Convert state to dictionary for serialization."""
146
```

### Final Statistics

Comprehensive final statistics summary generated at the end of crawling operations.

```python { .api }
153
class FinalStatistics:
154
def __init__(
155
self,
156
*,
157
requests_finished: int,
158
requests_failed: int,
159
retry_histogram: list[int],
160
requests_avg_failed_per_minute: float,
161
requests_avg_finished_per_minute: float,
162
requests_total_duration_millis: int,
163
requests_min_duration_millis: int,
164
requests_max_duration_millis: int,
165
requests_avg_duration_millis: float,
166
crawl_duration_millis: int,
167
stats_id: str | None = None
168
): ...
169
170
@property
171
def requests_finished(self) -> int:
172
"""Total successfully finished requests."""
173
174
@property
175
def requests_failed(self) -> int:
176
"""Total failed requests."""
177
178
@property
179
def requests_total(self) -> int:
180
"""Total requests processed (finished + failed)."""
181
182
@property
183
def retry_histogram(self) -> list[int]:
184
"""Histogram of retry counts [0_retries, 1_retry, 2_retries, ...]."""
185
186
@property
187
def requests_avg_failed_per_minute(self) -> float:
188
"""Average failed requests per minute."""
189
190
@property
191
def requests_avg_finished_per_minute(self) -> float:
192
"""Average successful requests per minute."""
193
194
@property
195
def requests_min_duration_millis(self) -> int:
196
"""Minimum request duration in milliseconds."""
197
198
@property
199
def requests_max_duration_millis(self) -> int:
200
"""Maximum request duration in milliseconds."""
201
202
@property
203
def requests_avg_duration_millis(self) -> float:
204
"""Average request duration in milliseconds."""
205
206
@property
207
def crawl_duration_millis(self) -> int:
208
"""Total crawl duration in milliseconds."""
209
210
@property
211
def success_rate(self) -> float:
212
"""Success rate as percentage (0-100)."""
213
214
def log_summary(self) -> None:
215
"""Log formatted summary of final statistics."""
216
217
def to_dict(self) -> dict[str, any]:
218
"""Convert to dictionary for serialization."""
219
```

## Usage Examples

### Basic Statistics Monitoring

```python
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics


async def main():
    # Create crawler with custom statistics
    stats = Statistics()
    crawler = HttpCrawler(statistics=stats)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        # Record basic response details for each processed request.
        data = {
            'url': context.request.url,
            'status': context.response.status_code
        }
        await context.push_data(data)

    # Run crawler
    urls = [f'https://httpbin.org/delay/{i}' for i in range(1, 6)]
    final_stats = await crawler.run(urls)

    # Inspect the collected statistics once the run has completed.
    current_state = stats.get_state()
    print(f"Requests finished: {current_state.requests_finished}")
    print(f"Requests failed: {current_state.requests_failed}")
    print(f"Average duration: {current_state.requests_avg_duration_millis:.2f}ms")

    # Final statistics returned by crawler.run()
    print("\nFinal Statistics:")
    print(f"Success rate: {final_stats.success_rate:.1f}%")
    print(f"Total duration: {final_stats.crawl_duration_millis}ms")
    print(f"Retry histogram: {final_stats.retry_histogram}")


asyncio.run(main())
```

### Real-time Statistics Monitoring

```python
import asyncio
import random
from datetime import datetime, timedelta

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics


async def monitor_statistics(stats: Statistics, interval: int = 10):
    """Monitor statistics in real-time during crawling."""
    while True:
        await asyncio.sleep(interval)

        state = stats.get_state()
        handled = state.requests_finished + state.requests_failed

        print("\n--- Statistics Update ---")
        print(f"Finished: {state.requests_finished}")
        print(f"Failed: {state.requests_failed}")
        # The +0.001 guards against division by zero before any request completes.
        print(f"Success rate: {state.requests_finished / (handled + 0.001) * 100:.1f}%")
        print(f"Avg requests/min: {state.requests_avg_finished_per_minute:.1f}")
        print(f"CPU usage: {state.cpu_usage_percent:.1f}%")
        print(f"Memory usage: {state.memory_usage_mb:.1f}MB")
        print(f"Duration: {state.crawl_duration_millis}ms")

        # Stop monitoring when crawling is complete
        if state.requests_total and handled >= state.requests_total:
            break


async def main():
    stats = Statistics()

    # Enable system info logging
    stats.log_system_info(interval=timedelta(seconds=5))

    crawler = HttpCrawler(
        statistics=stats,
        max_requests_per_crawl=50
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        # Simulate varying processing time
        await asyncio.sleep(random.uniform(0.1, 2.0))

        data = {
            'url': context.request.url,
            'timestamp': datetime.now().isoformat()
        }
        await context.push_data(data)

    # Start statistics monitoring
    monitor_task = asyncio.create_task(monitor_statistics(stats, interval=5))

    # Start crawling
    urls = ['https://httpbin.org/delay/1'] * 20
    final_stats = await crawler.run(urls)

    # Wait for monitoring to finish
    await monitor_task

    # Print final summary
    print("\n=== Final Summary ===")
    final_stats.log_summary()


asyncio.run(main())
```

### Custom Statistics Collection

```python
import asyncio
from typing import Any
from urllib.parse import urlparse

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics


class CustomStatistics(Statistics):
    """Extended statistics with custom metrics."""

    def __init__(self):
        super().__init__()
        self.status_code_counts = {}
        self.domain_counts = {}
        self.response_sizes = []

    def record_response(self, url: str, status_code: int, size: int):
        """Record custom response metrics."""
        # Count status codes
        self.status_code_counts[status_code] = self.status_code_counts.get(status_code, 0) + 1

        # Count requests per domain
        domain = urlparse(url).netloc
        self.domain_counts[domain] = self.domain_counts.get(domain, 0) + 1

        # Track response sizes
        self.response_sizes.append(size)

    def get_custom_summary(self) -> dict[str, Any]:
        """Get summary of custom metrics."""
        avg_size = sum(self.response_sizes) / len(self.response_sizes) if self.response_sizes else 0

        return {
            'status_codes': dict(self.status_code_counts),
            'domains': dict(self.domain_counts),
            'response_size_avg': avg_size,
            'response_size_min': min(self.response_sizes) if self.response_sizes else 0,
            'response_size_max': max(self.response_sizes) if self.response_sizes else 0
        }


async def main():
    custom_stats = CustomStatistics()

    crawler = HttpCrawler(statistics=custom_stats)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        response = context.response

        # Record custom metrics
        response_size = len(response.content)
        custom_stats.record_response(
            url=response.url,
            status_code=response.status_code,
            size=response_size
        )

        data = {
            'url': response.url,
            'status': response.status_code,
            'size': response_size
        }
        await context.push_data(data)

    # Test with various URLs
    urls = [
        'https://httpbin.org/json',
        'https://httpbin.org/html',
        'https://httpbin.org/xml',
        'https://httpbin.org/status/404',
        'https://httpbin.org/status/500'
    ]

    await crawler.run(urls)

    # Get custom statistics
    custom_summary = custom_stats.get_custom_summary()

    print("Custom Statistics:")
    print(f"Status codes: {custom_summary['status_codes']}")
    print(f"Domains: {custom_summary['domains']}")
    print(f"Avg response size: {custom_summary['response_size_avg']:.0f} bytes")
    print(f"Min response size: {custom_summary['response_size_min']} bytes")
    print(f"Max response size: {custom_summary['response_size_max']} bytes")


asyncio.run(main())
```

### Statistics Persistence

```python
import asyncio
import contextlib
import json
import random
from datetime import datetime

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore


async def save_statistics_periodically(stats: Statistics, store: KeyValueStore, interval: int = 30):
    """Save statistics to storage periodically."""
    while True:
        await asyncio.sleep(interval)

        state = stats.get_state()
        timestamp = datetime.now().isoformat()

        # Persist the current snapshot under a timestamped key.
        await store.set_value(
            f'stats_{timestamp}',
            state.to_dict()
        )

        print(f"Statistics saved at {timestamp}")


async def main():
    stats = Statistics()
    stats_store = await KeyValueStore.open('crawl-statistics')

    crawler = HttpCrawler(statistics=stats, max_requests_per_crawl=30)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        await asyncio.sleep(random.uniform(0.5, 2.0))  # Simulate work

        data = {'url': context.request.url, 'processed_at': datetime.now().isoformat()}
        await context.push_data(data)

    # Start periodic statistics saving
    save_task = asyncio.create_task(
        save_statistics_periodically(stats, stats_store, interval=10)
    )

    # Run crawler
    urls = ['https://httpbin.org/delay/1'] * 25
    final_stats = await crawler.run(urls)

    # Cancel periodic saving and wait for the task to actually finish.
    save_task.cancel()
    with contextlib.suppress(asyncio.CancelledError):
        await save_task

    # Save final statistics
    await stats_store.set_value('final_stats', final_stats.to_dict())

    print("Final statistics saved to storage")

    # Demonstrate reading saved statistics
    saved_final = await stats_store.get_value('final_stats')
    print(f"Retrieved final stats: Success rate = {saved_final['success_rate']:.1f}%")


asyncio.run(main())
```

### Performance Analysis

```python
import asyncio
import random
import time

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics


async def analyze_performance(stats: Statistics, final_stats):
    """Analyze crawler performance metrics."""

    print("=== Performance Analysis ===")

    # Basic metrics
    total_requests = final_stats.requests_total
    success_rate = final_stats.success_rate
    avg_duration = final_stats.requests_avg_duration_millis

    print(f"Total requests: {total_requests}")
    print(f"Success rate: {success_rate:.1f}%")
    print(f"Average request duration: {avg_duration:.2f}ms")

    # Throughput analysis
    crawl_duration_seconds = final_stats.crawl_duration_millis / 1000
    throughput = total_requests / crawl_duration_seconds if crawl_duration_seconds > 0 else 0

    print(f"Crawl duration: {crawl_duration_seconds:.1f}s")
    print(f"Throughput: {throughput:.2f} requests/second")

    # Retry analysis: histogram index i = retry count, value = number of requests.
    retry_histogram = final_stats.retry_histogram
    total_retries = sum(i * count for i, count in enumerate(retry_histogram))

    print(f"Total retries: {total_retries}")
    print(f"Retry distribution: {retry_histogram}")

    if len(retry_histogram) > 1:
        retry_rate = sum(retry_histogram[1:]) / total_requests * 100
        print(f"Requests requiring retries: {retry_rate:.1f}%")

    # Simple qualitative rating based on success rate and latency.
    if success_rate > 95 and avg_duration < 2000:
        rating = "Excellent"
    elif success_rate > 90 and avg_duration < 5000:
        rating = "Good"
    elif success_rate > 80:
        rating = "Fair"
    else:
        rating = "Poor"

    print(f"Performance rating: {rating}")


async def main():
    stats = Statistics()

    # Configure crawler with different settings for performance testing
    crawler = HttpCrawler(
        statistics=stats,
        max_requests_per_crawl=20,
        max_request_retries=3,
        max_concurrent_requests=5
    )

    request_times = []

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        start_time = time.time()

        # Simulate varying response times
        delay = random.choice([0.5, 1.0, 1.5, 2.0, 3.0])
        await asyncio.sleep(delay)

        end_time = time.time()
        request_times.append((end_time - start_time) * 1000)  # Convert to ms

        # Occasionally fail to test retry logic
        if random.random() < 0.1:  # 10% failure rate
            raise Exception("Simulated failure")

        data = {
            'url': context.request.url,
            'duration_ms': (end_time - start_time) * 1000
        }
        await context.push_data(data)

    # Run performance test
    urls = ['https://httpbin.org/delay/1'] * 20
    final_stats = await crawler.run(urls)

    # Analyze performance
    await analyze_performance(stats, final_stats)

    # Additional custom analysis
    if request_times:
        print("\nCustom timing analysis:")
        print(f"Min request time: {min(request_times):.2f}ms")
        print(f"Max request time: {max(request_times):.2f}ms")
        print(f"Median request time: {sorted(request_times)[len(request_times)//2]:.2f}ms")


asyncio.run(main())
```