A comprehensive web scraping and browser automation library for Python that mimics human-like behavior and helps bypass bot protections.
Performance monitoring and statistics collection for tracking crawling progress and system resource usage. The statistics classes provide insight into request success and failure rates, request durations, throughput, and resource utilization (CPU and memory).
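A minimal sketch of the typical pattern, using only the classes and parameters documented in this section (the URL is a placeholder):
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics

async def main() -> None:
    # The crawler records metrics into the Statistics instance it is given.
    stats = Statistics()
    crawler = HttpCrawler(statistics=stats)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    # run() returns a FinalStatistics summary; get_state() exposes live counters.
    final_stats = await crawler.run(['https://httpbin.org/get'])
    print(stats.get_state().requests_finished)
    final_stats.log_summary()

asyncio.run(main())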
Main statistics collection system for monitoring crawler performance and resource usage.
class Statistics:
def __init__(self): ...
def get_state(self) -> StatisticsState:
"""
Get current statistics state.
Returns:
StatisticsState with current metrics
"""
def reset(self) -> None:
"""Reset all statistics counters."""
def start_job(self, job_name: str) -> None:
"""Start tracking a named job."""
def finish_job(self, job_name: str) -> None:
"""Finish tracking a named job."""
def increment_requests_finished(self) -> None:
"""Increment successful request counter."""
def increment_requests_failed(self) -> None:
"""Increment failed request counter."""
def increment_requests_retries(self) -> None:
"""Increment retry counter."""
def set_requests_total(self, total: int) -> None:
"""Set total expected requests."""
def log_system_info(self, interval: timedelta = timedelta(seconds=60)) -> None:
"""Start logging system information at intervals."""
def calculate_statistics_percentile(
self,
values: list[float],
percentile: float
) -> float:
"""Calculate percentile of values."""
@property
def requests_finished(self) -> int:
"""Number of successfully finished requests."""
@property
def requests_failed(self) -> int:
"""Number of failed requests."""
@property
def requests_total(self) -> int | None:
"""Total expected requests."""
@property
def requests_avg_failed_per_minute(self) -> float:
"""Average failed requests per minute."""
@property
def requests_avg_finished_per_minute(self) -> float:
"""Average successful requests per minute."""
@property
def crawl_duration_millis(self) -> int:
"""Total crawl duration in milliseconds."""Current state snapshot containing all performance metrics and counters.
class StatisticsState:
def __init__(
self,
*,
requests_finished: int = 0,
requests_failed: int = 0,
requests_retries: int = 0,
requests_total: int | None = None,
crawl_duration_millis: int = 0,
requests_avg_failed_per_minute: float = 0.0,
requests_avg_finished_per_minute: float = 0.0,
requests_total_duration_millis: int = 0,
requests_min_duration_millis: int = 0,
requests_max_duration_millis: int = 0,
requests_avg_duration_millis: float = 0.0,
stats_id: str | None = None,
cpu_usage_percent: float = 0.0,
memory_usage_bytes: int = 0,
memory_usage_mb: float = 0.0
): ...
@property
def requests_finished(self) -> int:
"""Number of successfully finished requests."""
@property
def requests_failed(self) -> int:
"""Number of failed requests."""
@property
def requests_retries(self) -> int:
"""Total number of retries across all requests."""
@property
def requests_total(self) -> int | None:
"""Total expected requests."""
@property
def crawl_duration_millis(self) -> int:
"""Total crawl duration in milliseconds."""
@property
def requests_avg_failed_per_minute(self) -> float:
"""Average failed requests per minute."""
@property
def requests_avg_finished_per_minute(self) -> float:
"""Average successful requests per minute."""
@property
def requests_avg_duration_millis(self) -> float:
"""Average request duration in milliseconds."""
@property
def cpu_usage_percent(self) -> float:
"""Current CPU usage percentage."""
@property
def memory_usage_mb(self) -> float:
"""Current memory usage in megabytes."""
def to_dict(self) -> dict[str, Any]:
"""Convert state to dictionary for serialization."""
Comprehensive final statistics summary generated at the end of crawling operations.
class FinalStatistics:
def __init__(
self,
*,
requests_finished: int,
requests_failed: int,
retry_histogram: list[int],
requests_avg_failed_per_minute: float,
requests_avg_finished_per_minute: float,
requests_total_duration_millis: int,
requests_min_duration_millis: int,
requests_max_duration_millis: int,
requests_avg_duration_millis: float,
crawl_duration_millis: int,
stats_id: str | None = None
): ...
@property
def requests_finished(self) -> int:
"""Total successfully finished requests."""
@property
def requests_failed(self) -> int:
"""Total failed requests."""
@property
def requests_total(self) -> int:
"""Total requests processed (finished + failed)."""
@property
def retry_histogram(self) -> list[int]:
"""Histogram of retry counts [0_retries, 1_retry, 2_retries, ...]."""
@property
def requests_avg_failed_per_minute(self) -> float:
"""Average failed requests per minute."""
@property
def requests_avg_finished_per_minute(self) -> float:
"""Average successful requests per minute."""
@property
def requests_min_duration_millis(self) -> int:
"""Minimum request duration in milliseconds."""
@property
def requests_max_duration_millis(self) -> int:
"""Maximum request duration in milliseconds."""
@property
def requests_avg_duration_millis(self) -> float:
"""Average request duration in milliseconds."""
@property
def crawl_duration_millis(self) -> int:
"""Total crawl duration in milliseconds."""
@property
def success_rate(self) -> float:
"""Success rate as percentage (0-100)."""
def log_summary(self) -> None:
"""Log formatted summary of final statistics."""
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for serialization."""
import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
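# Example: basic usage. Read the live state via get_state() and the
# FinalStatistics object returned by crawler.run().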
async def main():
# Create crawler with custom statistics
stats = Statistics()
crawler = HttpCrawler(statistics=stats)
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext):
# Process request
data = {
'url': context.request.url,
'status': context.response.status_code
}
await context.push_data(data)
# Run crawler
urls = [f'https://httpbin.org/delay/{i}' for i in range(1, 6)]
final_stats = await crawler.run(urls)
# Inspect the statistics state after the run (get_state() can also be called while crawling)
current_state = stats.get_state()
print(f"Requests finished: {current_state.requests_finished}")
print(f"Requests failed: {current_state.requests_failed}")
print(f"Average duration: {current_state.requests_avg_duration_millis:.2f}ms")
# Final statistics
print(f"\nFinal Statistics:")
print(f"Success rate: {final_stats.success_rate:.1f}%")
print(f"Total duration: {final_stats.crawl_duration_millis}ms")
print(f"Retry histogram: {final_stats.retry_histogram}")
asyncio.run(main())
import asyncio
import random
from datetime import datetime, timedelta
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
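# Example: real-time monitoring. A background task periodically prints the
# current statistics state while the crawler runs, alongside system info logging.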
async def monitor_statistics(stats: Statistics, interval: int = 10):
"""Monitor statistics in real-time during crawling."""
while True:
await asyncio.sleep(interval)
state = stats.get_state()
print(f"\n--- Statistics Update ---")
print(f"Finished: {state.requests_finished}")
print(f"Failed: {state.requests_failed}")
print(f"Success rate: {state.requests_finished / (state.requests_finished + state.requests_failed + 0.001) * 100:.1f}%")
print(f"Avg requests/min: {state.requests_avg_finished_per_minute:.1f}")
print(f"CPU usage: {state.cpu_usage_percent:.1f}%")
print(f"Memory usage: {state.memory_usage_mb:.1f}MB")
print(f"Duration: {state.crawl_duration_millis}ms")
# Stop monitoring when crawling is complete
if state.requests_total and (state.requests_finished + state.requests_failed) >= state.requests_total:
break
async def main():
stats = Statistics()
# Enable system info logging
stats.log_system_info(interval=timedelta(seconds=5))
crawler = HttpCrawler(
statistics=stats,
max_requests_per_crawl=50
)
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext):
# Simulate varying processing time
await asyncio.sleep(random.uniform(0.1, 2.0))
data = {
'url': context.request.url,
'timestamp': datetime.now().isoformat()
}
await context.push_data(data)
# Start statistics monitoring
monitor_task = asyncio.create_task(monitor_statistics(stats, interval=5))
# Start crawling
urls = ['https://httpbin.org/delay/1'] * 20
final_stats = await crawler.run(urls)
# Stop the monitoring task now that crawling is complete
monitor_task.cancel()
# Print final summary
print(f"\n=== Final Summary ===")
final_stats.log_summary()
asyncio.run(main())
import asyncio
from typing import Any
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
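# Example: extending Statistics with custom metrics (status codes, domains,
# and response sizes) recorded from the request handler.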
class CustomStatistics(Statistics):
"""Extended statistics with custom metrics."""
def __init__(self):
super().__init__()
self.status_code_counts = {}
self.domain_counts = {}
self.response_sizes = []
def record_response(self, url: str, status_code: int, size: int):
"""Record custom response metrics."""
# Count status codes
self.status_code_counts[status_code] = self.status_code_counts.get(status_code, 0) + 1
# Count domains
from urllib.parse import urlparse
domain = urlparse(url).netloc
self.domain_counts[domain] = self.domain_counts.get(domain, 0) + 1
# Track response sizes
self.response_sizes.append(size)
def get_custom_summary(self) -> dict[str, Any]:
"""Get summary of custom metrics."""
avg_size = sum(self.response_sizes) / len(self.response_sizes) if self.response_sizes else 0
return {
'status_codes': dict(self.status_code_counts),
'domains': dict(self.domain_counts),
'response_size_avg': avg_size,
'response_size_min': min(self.response_sizes) if self.response_sizes else 0,
'response_size_max': max(self.response_sizes) if self.response_sizes else 0
}
async def main():
custom_stats = CustomStatistics()
crawler = HttpCrawler(statistics=custom_stats)
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext):
response = context.response
# Record custom metrics
response_size = len(response.content)
custom_stats.record_response(
url=response.url,
status_code=response.status_code,
size=response_size
)
data = {
'url': response.url,
'status': response.status_code,
'size': response_size
}
await context.push_data(data)
# Test with various URLs
urls = [
'https://httpbin.org/json',
'https://httpbin.org/html',
'https://httpbin.org/xml',
'https://httpbin.org/status/404',
'https://httpbin.org/status/500'
]
await crawler.run(urls)
# Get custom statistics
custom_summary = custom_stats.get_custom_summary()
print("Custom Statistics:")
print(f"Status codes: {custom_summary['status_codes']}")
print(f"Domains: {custom_summary['domains']}")
print(f"Avg response size: {custom_summary['response_size_avg']:.0f} bytes")
print(f"Min response size: {custom_summary['response_size_min']} bytes")
print(f"Max response size: {custom_summary['response_size_max']} bytes")
asyncio.run(main())
import asyncio
import random
import json
from datetime import datetime
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore
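# Example: persisting statistics. Periodically snapshot the state into a
# KeyValueStore and save the final statistics once the crawl finishes.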
async def save_statistics_periodically(stats: Statistics, store: KeyValueStore, interval: int = 30):
"""Save statistics to storage periodically."""
while True:
await asyncio.sleep(interval)
state = stats.get_state()
timestamp = datetime.now().isoformat()
# Save current state
await store.set_value(
f'stats_{timestamp}',
state.to_dict()
)
print(f"Statistics saved at {timestamp}")
async def main():
stats = Statistics()
stats_store = await KeyValueStore.open('crawl-statistics')
crawler = HttpCrawler(statistics=stats, max_requests_per_crawl=30)
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext):
await asyncio.sleep(random.uniform(0.5, 2.0)) # Simulate work
data = {'url': context.request.url, 'processed_at': datetime.now().isoformat()}
await context.push_data(data)
# Start periodic statistics saving
save_task = asyncio.create_task(
save_statistics_periodically(stats, stats_store, interval=10)
)
# Run crawler
urls = ['https://httpbin.org/delay/1'] * 25
final_stats = await crawler.run(urls)
# Cancel periodic saving
save_task.cancel()
# Save final statistics
await stats_store.set_value('final_stats', final_stats.to_dict())
print("Final statistics saved to storage")
# Demonstrate reading saved statistics
saved_final = await stats_store.get_value('final_stats')
print(f"Retrieved final stats: Success rate = {saved_final['success_rate']:.1f}%")
asyncio.run(main())
import asyncio
import random
import time
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
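# Example: performance analysis. Derive throughput, retry distribution, and a
# simple rating from the FinalStatistics returned by the crawler.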
async def analyze_performance(stats: Statistics, final_stats):
"""Analyze crawler performance metrics."""
print("=== Performance Analysis ===")
# Basic metrics
total_requests = final_stats.requests_total
success_rate = final_stats.success_rate
avg_duration = final_stats.requests_avg_duration_millis
print(f"Total requests: {total_requests}")
print(f"Success rate: {success_rate:.1f}%")
print(f"Average request duration: {avg_duration:.2f}ms")
# Throughput analysis
crawl_duration_seconds = final_stats.crawl_duration_millis / 1000
throughput = total_requests / crawl_duration_seconds if crawl_duration_seconds > 0 else 0
print(f"Crawl duration: {crawl_duration_seconds:.1f}s")
print(f"Throughput: {throughput:.2f} requests/second")
# Retry analysis
retry_histogram = final_stats.retry_histogram
total_retries = sum(i * count for i, count in enumerate(retry_histogram))
print(f"Total retries: {total_retries}")
print(f"Retry distribution: {retry_histogram}")
if len(retry_histogram) > 1:
retry_rate = sum(retry_histogram[1:]) / total_requests * 100
print(f"Requests requiring retries: {retry_rate:.1f}%")
# Performance rating
if success_rate > 95 and avg_duration < 2000:
rating = "Excellent"
elif success_rate > 90 and avg_duration < 5000:
rating = "Good"
elif success_rate > 80:
rating = "Fair"
else:
rating = "Poor"
print(f"Performance rating: {rating}")
async def main():
stats = Statistics()
# Configure crawler with different settings for performance testing
crawler = HttpCrawler(
statistics=stats,
max_requests_per_crawl=20,
max_request_retries=3,
max_concurrent_requests=5
)
request_times = []
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext):
start_time = time.time()
# Simulate varying response times
delay = random.choice([0.5, 1.0, 1.5, 2.0, 3.0])
await asyncio.sleep(delay)
end_time = time.time()
request_times.append((end_time - start_time) * 1000) # Convert to ms
# Occasionally fail to test retry logic
if random.random() < 0.1: # 10% failure rate
raise Exception("Simulated failure")
data = {
'url': context.request.url,
'duration_ms': (end_time - start_time) * 1000
}
await context.push_data(data)
# Run performance test
urls = ['https://httpbin.org/delay/1'] * 20
final_stats = await crawler.run(urls)
# Analyze performance
await analyze_performance(stats, final_stats)
# Additional custom analysis
if request_times:
print(f"\nCustom timing analysis:")
print(f"Min request time: {min(request_times):.2f}ms")
print(f"Max request time: {max(request_times):.2f}ms")
print(f"Median request time: {sorted(request_times)[len(request_times)//2]:.2f}ms")
asyncio.run(main())
Install with Tessl CLI
npx tessl i tessl/pypi-crawlee