# Statistics

Performance monitoring and statistics collection for tracking crawling progress and system resource usage. Statistics provide insights into crawler performance, request success rates, and resource utilization.

## Capabilities

### Statistics Collector

Main statistics collection system for monitoring crawler performance and resource usage.

```python { .api }
class Statistics:
    """Collects crawler performance metrics and system resource usage."""

    def __init__(self): ...

    def get_state(self) -> StatisticsState:
        """
        Get current statistics state.

        Returns:
            StatisticsState with current metrics
        """

    def reset(self) -> None:
        """Reset all statistics counters."""

    def start_job(self, job_name: str) -> None:
        """Start tracking a named job."""

    def finish_job(self, job_name: str) -> None:
        """Finish tracking a named job."""

    def increment_requests_finished(self) -> None:
        """Increment successful request counter."""

    def increment_requests_failed(self) -> None:
        """Increment failed request counter."""

    def increment_requests_retries(self) -> None:
        """Increment retry counter."""

    def set_requests_total(self, total: int) -> None:
        """Set total expected requests."""

    def log_system_info(self, interval: timedelta = timedelta(seconds=60)) -> None:
        """Start logging system information at intervals."""

    def calculate_statistics_percentile(
        self,
        values: list[float],
        percentile: float
    ) -> float:
        """Calculate percentile of values."""

    @property
    def requests_finished(self) -> int:
        """Number of successfully finished requests."""

    @property
    def requests_failed(self) -> int:
        """Number of failed requests."""

    @property
    def requests_total(self) -> int | None:
        """Total expected requests."""

    @property
    def requests_avg_failed_per_minute(self) -> float:
        """Average failed requests per minute."""

    @property
    def requests_avg_finished_per_minute(self) -> float:
        """Average successful requests per minute."""

    @property
    def crawl_duration_millis(self) -> int:
        """Total crawl duration in milliseconds."""
```

### Statistics State

Current state snapshot containing all performance metrics and counters.

```python { .api }
83
class StatisticsState:
84
def __init__(
85
self,
86
*,
87
requests_finished: int = 0,
88
requests_failed: int = 0,
89
requests_retries: int = 0,
90
requests_total: int | None = None,
91
crawl_duration_millis: int = 0,
92
requests_avg_failed_per_minute: float = 0.0,
93
requests_avg_finished_per_minute: float = 0.0,
94
requests_total_duration_millis: int = 0,
95
requests_min_duration_millis: int = 0,
96
requests_max_duration_millis: int = 0,
97
requests_avg_duration_millis: float = 0.0,
98
stats_id: str | None = None,
99
cpu_usage_percent: float = 0.0,
100
memory_usage_bytes: int = 0,
101
memory_usage_mb: float = 0.0
102
): ...
103
104
@property
105
def requests_finished(self) -> int:
106
"""Number of successfully finished requests."""
107
108
@property
109
def requests_failed(self) -> int:
110
"""Number of failed requests."""
111
112
@property
113
def requests_retries(self) -> int:
114
"""Total number of retries across all requests."""
115
116
@property
117
def requests_total(self) -> int | None:
118
"""Total expected requests."""
119
120
@property
121
def crawl_duration_millis(self) -> int:
122
"""Total crawl duration in milliseconds."""
123
124
@property
125
def requests_avg_failed_per_minute(self) -> float:
126
"""Average failed requests per minute."""
127
128
@property
129
def requests_avg_finished_per_minute(self) -> float:
130
"""Average successful requests per minute."""
131
132
@property
133
def requests_avg_duration_millis(self) -> float:
134
"""Average request duration in milliseconds."""
135
136
@property
137
def cpu_usage_percent(self) -> float:
138
"""Current CPU usage percentage."""
139
140
@property
141
def memory_usage_mb(self) -> float:
142
"""Current memory usage in megabytes."""
143
144
def to_dict(self) -> dict[str, any]:
145
"""Convert state to dictionary for serialization."""
146
```

### Final Statistics

Comprehensive final statistics summary generated at the end of crawling operations.

```python { .api }
153
class FinalStatistics:
154
def __init__(
155
self,
156
*,
157
requests_finished: int,
158
requests_failed: int,
159
retry_histogram: list[int],
160
requests_avg_failed_per_minute: float,
161
requests_avg_finished_per_minute: float,
162
requests_total_duration_millis: int,
163
requests_min_duration_millis: int,
164
requests_max_duration_millis: int,
165
requests_avg_duration_millis: float,
166
crawl_duration_millis: int,
167
stats_id: str | None = None
168
): ...
169
170
@property
171
def requests_finished(self) -> int:
172
"""Total successfully finished requests."""
173
174
@property
175
def requests_failed(self) -> int:
176
"""Total failed requests."""
177
178
@property
179
def requests_total(self) -> int:
180
"""Total requests processed (finished + failed)."""
181
182
@property
183
def retry_histogram(self) -> list[int]:
184
"""Histogram of retry counts [0_retries, 1_retry, 2_retries, ...]."""
185
186
@property
187
def requests_avg_failed_per_minute(self) -> float:
188
"""Average failed requests per minute."""
189
190
@property
191
def requests_avg_finished_per_minute(self) -> float:
192
"""Average successful requests per minute."""
193
194
@property
195
def requests_min_duration_millis(self) -> int:
196
"""Minimum request duration in milliseconds."""
197
198
@property
199
def requests_max_duration_millis(self) -> int:
200
"""Maximum request duration in milliseconds."""
201
202
@property
203
def requests_avg_duration_millis(self) -> float:
204
"""Average request duration in milliseconds."""
205
206
@property
207
def crawl_duration_millis(self) -> int:
208
"""Total crawl duration in milliseconds."""
209
210
@property
211
def success_rate(self) -> float:
212
"""Success rate as percentage (0-100)."""
213
214
def log_summary(self) -> None:
215
"""Log formatted summary of final statistics."""
216
217
def to_dict(self) -> dict[str, any]:
218
"""Convert to dictionary for serialization."""
219
```

## Usage Examples

### Basic Statistics Monitoring

```python
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics


async def main():
    # Create crawler with custom statistics
    stats = Statistics()
    crawler = HttpCrawler(statistics=stats)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        # Record basic response details for each processed request.
        data = {
            'url': context.request.url,
            'status': context.response.status_code
        }
        await context.push_data(data)

    # Run crawler
    urls = [f'https://httpbin.org/delay/{i}' for i in range(1, 6)]
    final_stats = await crawler.run(urls)

    # Inspect the collected statistics once the run has completed.
    current_state = stats.get_state()
    print(f"Requests finished: {current_state.requests_finished}")
    print(f"Requests failed: {current_state.requests_failed}")
    print(f"Average duration: {current_state.requests_avg_duration_millis:.2f}ms")

    # Final statistics returned by crawler.run()
    print("\nFinal Statistics:")
    print(f"Success rate: {final_stats.success_rate:.1f}%")
    print(f"Total duration: {final_stats.crawl_duration_millis}ms")
    print(f"Retry histogram: {final_stats.retry_histogram}")


asyncio.run(main())
```

### Real-time Statistics Monitoring

```python
import asyncio
import random
from datetime import datetime, timedelta

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics


async def monitor_statistics(stats: Statistics, interval: int = 10):
    """Monitor statistics in real-time during crawling."""
    while True:
        await asyncio.sleep(interval)

        state = stats.get_state()
        handled = state.requests_finished + state.requests_failed

        print("\n--- Statistics Update ---")
        print(f"Finished: {state.requests_finished}")
        print(f"Failed: {state.requests_failed}")
        # The +0.001 guards against division by zero before any request completes.
        print(f"Success rate: {state.requests_finished / (handled + 0.001) * 100:.1f}%")
        print(f"Avg requests/min: {state.requests_avg_finished_per_minute:.1f}")
        print(f"CPU usage: {state.cpu_usage_percent:.1f}%")
        print(f"Memory usage: {state.memory_usage_mb:.1f}MB")
        print(f"Duration: {state.crawl_duration_millis}ms")

        # Stop monitoring when crawling is complete
        if state.requests_total and handled >= state.requests_total:
            break


async def main():
    stats = Statistics()

    # Enable system info logging
    stats.log_system_info(interval=timedelta(seconds=5))

    crawler = HttpCrawler(
        statistics=stats,
        max_requests_per_crawl=50
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        # Simulate varying processing time
        await asyncio.sleep(random.uniform(0.1, 2.0))

        data = {
            'url': context.request.url,
            'timestamp': datetime.now().isoformat()
        }
        await context.push_data(data)

    # Start statistics monitoring
    monitor_task = asyncio.create_task(monitor_statistics(stats, interval=5))

    # Start crawling
    urls = ['https://httpbin.org/delay/1'] * 20
    final_stats = await crawler.run(urls)

    # Wait for monitoring to finish
    await monitor_task

    # Print final summary
    print("\n=== Final Summary ===")
    final_stats.log_summary()


asyncio.run(main())
```

### Custom Statistics Collection

```python
import asyncio
from typing import Any
from urllib.parse import urlparse

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics


class CustomStatistics(Statistics):
    """Extended statistics with custom metrics."""

    def __init__(self):
        super().__init__()
        self.status_code_counts = {}
        self.domain_counts = {}
        self.response_sizes = []

    def record_response(self, url: str, status_code: int, size: int):
        """Record custom response metrics."""
        # Count status codes
        self.status_code_counts[status_code] = self.status_code_counts.get(status_code, 0) + 1

        # Count requests per domain
        domain = urlparse(url).netloc
        self.domain_counts[domain] = self.domain_counts.get(domain, 0) + 1

        # Track response sizes
        self.response_sizes.append(size)

    def get_custom_summary(self) -> dict[str, Any]:
        """Get summary of custom metrics."""
        avg_size = sum(self.response_sizes) / len(self.response_sizes) if self.response_sizes else 0

        return {
            'status_codes': dict(self.status_code_counts),
            'domains': dict(self.domain_counts),
            'response_size_avg': avg_size,
            'response_size_min': min(self.response_sizes) if self.response_sizes else 0,
            'response_size_max': max(self.response_sizes) if self.response_sizes else 0
        }


async def main():
    custom_stats = CustomStatistics()

    crawler = HttpCrawler(statistics=custom_stats)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        response = context.response

        # Record custom metrics
        response_size = len(response.content)
        custom_stats.record_response(
            url=response.url,
            status_code=response.status_code,
            size=response_size
        )

        data = {
            'url': response.url,
            'status': response.status_code,
            'size': response_size
        }
        await context.push_data(data)

    # Test with various URLs
    urls = [
        'https://httpbin.org/json',
        'https://httpbin.org/html',
        'https://httpbin.org/xml',
        'https://httpbin.org/status/404',
        'https://httpbin.org/status/500'
    ]

    await crawler.run(urls)

    # Get custom statistics
    custom_summary = custom_stats.get_custom_summary()

    print("Custom Statistics:")
    print(f"Status codes: {custom_summary['status_codes']}")
    print(f"Domains: {custom_summary['domains']}")
    print(f"Avg response size: {custom_summary['response_size_avg']:.0f} bytes")
    print(f"Min response size: {custom_summary['response_size_min']} bytes")
    print(f"Max response size: {custom_summary['response_size_max']} bytes")


asyncio.run(main())
```

### Statistics Persistence

```python
import asyncio
import contextlib
import json
import random
from datetime import datetime

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore


async def save_statistics_periodically(stats: Statistics, store: KeyValueStore, interval: int = 30):
    """Save statistics to storage periodically."""
    while True:
        await asyncio.sleep(interval)

        state = stats.get_state()
        timestamp = datetime.now().isoformat()

        # Persist the current snapshot under a timestamped key.
        await store.set_value(
            f'stats_{timestamp}',
            state.to_dict()
        )

        print(f"Statistics saved at {timestamp}")


async def main():
    stats = Statistics()
    stats_store = await KeyValueStore.open('crawl-statistics')

    crawler = HttpCrawler(statistics=stats, max_requests_per_crawl=30)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        await asyncio.sleep(random.uniform(0.5, 2.0))  # Simulate work

        data = {'url': context.request.url, 'processed_at': datetime.now().isoformat()}
        await context.push_data(data)

    # Start periodic statistics saving
    save_task = asyncio.create_task(
        save_statistics_periodically(stats, stats_store, interval=10)
    )

    # Run crawler
    urls = ['https://httpbin.org/delay/1'] * 25
    final_stats = await crawler.run(urls)

    # Cancel periodic saving and wait for the task to actually finish.
    save_task.cancel()
    with contextlib.suppress(asyncio.CancelledError):
        await save_task

    # Save final statistics
    await stats_store.set_value('final_stats', final_stats.to_dict())

    print("Final statistics saved to storage")

    # Demonstrate reading saved statistics
    saved_final = await stats_store.get_value('final_stats')
    print(f"Retrieved final stats: Success rate = {saved_final['success_rate']:.1f}%")


asyncio.run(main())
```

### Performance Analysis

```python
import asyncio
import random
import time

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics


async def analyze_performance(stats: Statistics, final_stats):
    """Analyze crawler performance metrics."""

    print("=== Performance Analysis ===")

    # Basic metrics
    total_requests = final_stats.requests_total
    success_rate = final_stats.success_rate
    avg_duration = final_stats.requests_avg_duration_millis

    print(f"Total requests: {total_requests}")
    print(f"Success rate: {success_rate:.1f}%")
    print(f"Average request duration: {avg_duration:.2f}ms")

    # Throughput analysis
    crawl_duration_seconds = final_stats.crawl_duration_millis / 1000
    throughput = total_requests / crawl_duration_seconds if crawl_duration_seconds > 0 else 0

    print(f"Crawl duration: {crawl_duration_seconds:.1f}s")
    print(f"Throughput: {throughput:.2f} requests/second")

    # Retry analysis: histogram index i = retry count, value = number of requests.
    retry_histogram = final_stats.retry_histogram
    total_retries = sum(i * count for i, count in enumerate(retry_histogram))

    print(f"Total retries: {total_retries}")
    print(f"Retry distribution: {retry_histogram}")

    if len(retry_histogram) > 1:
        retry_rate = sum(retry_histogram[1:]) / total_requests * 100
        print(f"Requests requiring retries: {retry_rate:.1f}%")

    # Simple qualitative rating based on success rate and latency.
    if success_rate > 95 and avg_duration < 2000:
        rating = "Excellent"
    elif success_rate > 90 and avg_duration < 5000:
        rating = "Good"
    elif success_rate > 80:
        rating = "Fair"
    else:
        rating = "Poor"

    print(f"Performance rating: {rating}")


async def main():
    stats = Statistics()

    # Configure crawler with different settings for performance testing
    crawler = HttpCrawler(
        statistics=stats,
        max_requests_per_crawl=20,
        max_request_retries=3,
        max_concurrent_requests=5
    )

    request_times = []

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        start_time = time.time()

        # Simulate varying response times
        delay = random.choice([0.5, 1.0, 1.5, 2.0, 3.0])
        await asyncio.sleep(delay)

        end_time = time.time()
        request_times.append((end_time - start_time) * 1000)  # Convert to ms

        # Occasionally fail to test retry logic
        if random.random() < 0.1:  # 10% failure rate
            raise Exception("Simulated failure")

        data = {
            'url': context.request.url,
            'duration_ms': (end_time - start_time) * 1000
        }
        await context.push_data(data)

    # Run performance test
    urls = ['https://httpbin.org/delay/1'] * 20
    final_stats = await crawler.run(urls)

    # Analyze performance
    await analyze_performance(stats, final_stats)

    # Additional custom analysis
    if request_times:
        print("\nCustom timing analysis:")
        print(f"Min request time: {min(request_times):.2f}ms")
        print(f"Max request time: {max(request_times):.2f}ms")
        print(f"Median request time: {sorted(request_times)[len(request_times)//2]:.2f}ms")


asyncio.run(main())
```