A comprehensive web scraping and browser automation library for Python that mimics human-like behavior and helps bypass bot protections.
Performance monitoring and statistics collection for tracking crawling progress and system resource usage. The statistics classes provide insight into request success and failure rates, request durations, throughput, and resource utilization (CPU and memory).
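A minimal sketch of the typical pattern, using only the classes and parameters documented in this section (the URL is a placeholder):
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics

async def main() -> None:
    # The crawler records metrics into the Statistics instance it is given.
    stats = Statistics()
    crawler = HttpCrawler(statistics=stats)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    # run() returns a FinalStatistics summary; get_state() exposes live counters.
    final_stats = await crawler.run(['https://httpbin.org/get'])
    print(stats.get_state().requests_finished)
    final_stats.log_summary()

asyncio.run(main())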
Main statistics collection system for monitoring crawler performance and resource usage.
class Statistics:
def __init__(self): ...
def get_state(self) -> StatisticsState:
"""
Get current statistics state.
Returns:
StatisticsState with current metrics
"""
def reset(self) -> None:
"""Reset all statistics counters."""
def start_job(self, job_name: str) -> None:
"""Start tracking a named job."""
def finish_job(self, job_name: str) -> None:
"""Finish tracking a named job."""
def increment_requests_finished(self) -> None:
"""Increment successful request counter."""
def increment_requests_failed(self) -> None:
"""Increment failed request counter."""
def increment_requests_retries(self) -> None:
"""Increment retry counter."""
def set_requests_total(self, total: int) -> None:
"""Set total expected requests."""
def log_system_info(self, interval: timedelta = timedelta(seconds=60)) -> None:
"""Start logging system information at intervals."""
def calculate_statistics_percentile(
self,
values: list[float],
percentile: float
) -> float:
"""Calculate percentile of values."""
@property
def requests_finished(self) -> int:
"""Number of successfully finished requests."""
@property
def requests_failed(self) -> int:
"""Number of failed requests."""
@property
def requests_total(self) -> int | None:
"""Total expected requests."""
@property
def requests_avg_failed_per_minute(self) -> float:
"""Average failed requests per minute."""
@property
def requests_avg_finished_per_minute(self) -> float:
"""Average successful requests per minute."""
@property
def crawl_duration_millis(self) -> int:
"""Total crawl duration in milliseconds."""Current state snapshot containing all performance metrics and counters.
class StatisticsState:
def __init__(
self,
*,
requests_finished: int = 0,
requests_failed: int = 0,
requests_retries: int = 0,
requests_total: int | None = None,
crawl_duration_millis: int = 0,
requests_avg_failed_per_minute: float = 0.0,
requests_avg_finished_per_minute: float = 0.0,
requests_total_duration_millis: int = 0,
requests_min_duration_millis: int = 0,
requests_max_duration_millis: int = 0,
requests_avg_duration_millis: float = 0.0,
stats_id: str | None = None,
cpu_usage_percent: float = 0.0,
memory_usage_bytes: int = 0,
memory_usage_mb: float = 0.0
): ...
@property
def requests_finished(self) -> int:
"""Number of successfully finished requests."""
@property
def requests_failed(self) -> int:
"""Number of failed requests."""
@property
def requests_retries(self) -> int:
"""Total number of retries across all requests."""
@property
def requests_total(self) -> int | None:
"""Total expected requests."""
@property
def crawl_duration_millis(self) -> int:
"""Total crawl duration in milliseconds."""
@property
def requests_avg_failed_per_minute(self) -> float:
"""Average failed requests per minute."""
@property
def requests_avg_finished_per_minute(self) -> float:
"""Average successful requests per minute."""
@property
def requests_avg_duration_millis(self) -> float:
"""Average request duration in milliseconds."""
@property
def cpu_usage_percent(self) -> float:
"""Current CPU usage percentage."""
@property
def memory_usage_mb(self) -> float:
"""Current memory usage in megabytes."""
def to_dict(self) -> dict[str, Any]:
"""Convert state to dictionary for serialization."""
Comprehensive final statistics summary generated at the end of crawling operations.
class FinalStatistics:
def __init__(
self,
*,
requests_finished: int,
requests_failed: int,
retry_histogram: list[int],
requests_avg_failed_per_minute: float,
requests_avg_finished_per_minute: float,
requests_total_duration_millis: int,
requests_min_duration_millis: int,
requests_max_duration_millis: int,
requests_avg_duration_millis: float,
crawl_duration_millis: int,
stats_id: str | None = None
): ...
@property
def requests_finished(self) -> int:
"""Total successfully finished requests."""
@property
def requests_failed(self) -> int:
"""Total failed requests."""
@property
def requests_total(self) -> int:
"""Total requests processed (finished + failed)."""
@property
def retry_histogram(self) -> list[int]:
"""Histogram of retry counts [0_retries, 1_retry, 2_retries, ...]."""
@property
def requests_avg_failed_per_minute(self) -> float:
"""Average failed requests per minute."""
@property
def requests_avg_finished_per_minute(self) -> float:
"""Average successful requests per minute."""
@property
def requests_min_duration_millis(self) -> int:
"""Minimum request duration in milliseconds."""
@property
def requests_max_duration_millis(self) -> int:
"""Maximum request duration in milliseconds."""
@property
def requests_avg_duration_millis(self) -> float:
"""Average request duration in milliseconds."""
@property
def crawl_duration_millis(self) -> int:
"""Total crawl duration in milliseconds."""
@property
def success_rate(self) -> float:
"""Success rate as percentage (0-100)."""
def log_summary(self) -> None:
"""Log formatted summary of final statistics."""
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for serialization."""
import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
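# Example: basic usage. Read the live state via get_state() and the
# FinalStatistics object returned by crawler.run().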
async def main():
# Create crawler with custom statistics
stats = Statistics()
crawler = HttpCrawler(statistics=stats)
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext):
# Process request
data = {
'url': context.request.url,
'status': context.response.status_code
}
await context.push_data(data)
# Run crawler
urls = [f'https://httpbin.org/delay/{i}' for i in range(1, 6)]
final_stats = await crawler.run(urls)
# Inspect the statistics state after the run (get_state() can also be called while crawling)
current_state = stats.get_state()
print(f"Requests finished: {current_state.requests_finished}")
print(f"Requests failed: {current_state.requests_failed}")
print(f"Average duration: {current_state.requests_avg_duration_millis:.2f}ms")
# Final statistics
print(f"\nFinal Statistics:")
print(f"Success rate: {final_stats.success_rate:.1f}%")
print(f"Total duration: {final_stats.crawl_duration_millis}ms")
print(f"Retry histogram: {final_stats.retry_histogram}")
asyncio.run(main())
import asyncio
import random
from datetime import datetime, timedelta
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
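# Example: real-time monitoring. A background task periodically prints the
# current statistics state while the crawler runs, alongside system info logging.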
async def monitor_statistics(stats: Statistics, interval: int = 10):
"""Monitor statistics in real-time during crawling."""
while True:
await asyncio.sleep(interval)
state = stats.get_state()
print(f"\n--- Statistics Update ---")
print(f"Finished: {state.requests_finished}")
print(f"Failed: {state.requests_failed}")
print(f"Success rate: {state.requests_finished / (state.requests_finished + state.requests_failed + 0.001) * 100:.1f}%")
print(f"Avg requests/min: {state.requests_avg_finished_per_minute:.1f}")
print(f"CPU usage: {state.cpu_usage_percent:.1f}%")
print(f"Memory usage: {state.memory_usage_mb:.1f}MB")
print(f"Duration: {state.crawl_duration_millis}ms")
# Stop monitoring when crawling is complete
if state.requests_total and (state.requests_finished + state.requests_failed) >= state.requests_total:
break
async def main():
stats = Statistics()
# Enable system info logging
stats.log_system_info(interval=timedelta(seconds=5))
crawler = HttpCrawler(
statistics=stats,
max_requests_per_crawl=50
)
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext):
# Simulate varying processing time
await asyncio.sleep(random.uniform(0.1, 2.0))
data = {
'url': context.request.url,
'timestamp': datetime.now().isoformat()
}
await context.push_data(data)
# Start statistics monitoring
monitor_task = asyncio.create_task(monitor_statistics(stats, interval=5))
# Start crawling
urls = ['https://httpbin.org/delay/1'] * 20
final_stats = await crawler.run(urls)
# Stop the monitoring task now that crawling is complete
monitor_task.cancel()
# Print final summary
print(f"\n=== Final Summary ===")
final_stats.log_summary()
asyncio.run(main())
import asyncio
from typing import Any
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
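# Example: extending Statistics with custom metrics (status codes, domains,
# and response sizes) recorded from the request handler.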
class CustomStatistics(Statistics):
"""Extended statistics with custom metrics."""
def __init__(self):
super().__init__()
self.status_code_counts = {}
self.domain_counts = {}
self.response_sizes = []
def record_response(self, url: str, status_code: int, size: int):
"""Record custom response metrics."""
# Count status codes
self.status_code_counts[status_code] = self.status_code_counts.get(status_code, 0) + 1
# Count domains
from urllib.parse import urlparse
domain = urlparse(url).netloc
self.domain_counts[domain] = self.domain_counts.get(domain, 0) + 1
# Track response sizes
self.response_sizes.append(size)
def get_custom_summary(self) -> dict[str, Any]:
"""Get summary of custom metrics."""
avg_size = sum(self.response_sizes) / len(self.response_sizes) if self.response_sizes else 0
return {
'status_codes': dict(self.status_code_counts),
'domains': dict(self.domain_counts),
'response_size_avg': avg_size,
'response_size_min': min(self.response_sizes) if self.response_sizes else 0,
'response_size_max': max(self.response_sizes) if self.response_sizes else 0
}
async def main():
custom_stats = CustomStatistics()
crawler = HttpCrawler(statistics=custom_stats)
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext):
response = context.response
# Record custom metrics
response_size = len(response.content)
custom_stats.record_response(
url=response.url,
status_code=response.status_code,
size=response_size
)
data = {
'url': response.url,
'status': response.status_code,
'size': response_size
}
await context.push_data(data)
# Test with various URLs
urls = [
'https://httpbin.org/json',
'https://httpbin.org/html',
'https://httpbin.org/xml',
'https://httpbin.org/status/404',
'https://httpbin.org/status/500'
]
await crawler.run(urls)
# Get custom statistics
custom_summary = custom_stats.get_custom_summary()
print("Custom Statistics:")
print(f"Status codes: {custom_summary['status_codes']}")
print(f"Domains: {custom_summary['domains']}")
print(f"Avg response size: {custom_summary['response_size_avg']:.0f} bytes")
print(f"Min response size: {custom_summary['response_size_min']} bytes")
print(f"Max response size: {custom_summary['response_size_max']} bytes")
asyncio.run(main())
import asyncio
import random
import json
from datetime import datetime
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore
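# Example: persisting statistics. Periodically snapshot the state into a
# KeyValueStore and save the final statistics once the crawl finishes.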
async def save_statistics_periodically(stats: Statistics, store: KeyValueStore, interval: int = 30):
"""Save statistics to storage periodically."""
while True:
await asyncio.sleep(interval)
state = stats.get_state()
timestamp = datetime.now().isoformat()
# Save current state
await store.set_value(
f'stats_{timestamp}',
state.to_dict()
)
print(f"Statistics saved at {timestamp}")
async def main():
stats = Statistics()
stats_store = await KeyValueStore.open('crawl-statistics')
crawler = HttpCrawler(statistics=stats, max_requests_per_crawl=30)
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext):
await asyncio.sleep(random.uniform(0.5, 2.0)) # Simulate work
data = {'url': context.request.url, 'processed_at': datetime.now().isoformat()}
await context.push_data(data)
# Start periodic statistics saving
save_task = asyncio.create_task(
save_statistics_periodically(stats, stats_store, interval=10)
)
# Run crawler
urls = ['https://httpbin.org/delay/1'] * 25
final_stats = await crawler.run(urls)
# Cancel periodic saving
save_task.cancel()
# Save final statistics
await stats_store.set_value('final_stats', final_stats.to_dict())
print("Final statistics saved to storage")
# Demonstrate reading saved statistics
saved_final = await stats_store.get_value('final_stats')
print(f"Retrieved final stats: Success rate = {saved_final['success_rate']:.1f}%")
asyncio.run(main())
import asyncio
import random
import time
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import Statistics
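# Example: performance analysis. Derive throughput, retry distribution, and a
# simple rating from the FinalStatistics returned by the crawler.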
async def analyze_performance(stats: Statistics, final_stats):
"""Analyze crawler performance metrics."""
print("=== Performance Analysis ===")
# Basic metrics
total_requests = final_stats.requests_total
success_rate = final_stats.success_rate
avg_duration = final_stats.requests_avg_duration_millis
print(f"Total requests: {total_requests}")
print(f"Success rate: {success_rate:.1f}%")
print(f"Average request duration: {avg_duration:.2f}ms")
# Throughput analysis
crawl_duration_seconds = final_stats.crawl_duration_millis / 1000
throughput = total_requests / crawl_duration_seconds if crawl_duration_seconds > 0 else 0
print(f"Crawl duration: {crawl_duration_seconds:.1f}s")
print(f"Throughput: {throughput:.2f} requests/second")
# Retry analysis
retry_histogram = final_stats.retry_histogram
total_retries = sum(i * count for i, count in enumerate(retry_histogram))
print(f"Total retries: {total_retries}")
print(f"Retry distribution: {retry_histogram}")
if len(retry_histogram) > 1:
retry_rate = sum(retry_histogram[1:]) / total_requests * 100
print(f"Requests requiring retries: {retry_rate:.1f}%")
# Performance rating
if success_rate > 95 and avg_duration < 2000:
rating = "Excellent"
elif success_rate > 90 and avg_duration < 5000:
rating = "Good"
elif success_rate > 80:
rating = "Fair"
else:
rating = "Poor"
print(f"Performance rating: {rating}")
async def main():
stats = Statistics()
# Configure crawler with different settings for performance testing
crawler = HttpCrawler(
statistics=stats,
max_requests_per_crawl=20,
max_request_retries=3,
max_concurrent_requests=5
)
request_times = []
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext):
start_time = time.time()
# Simulate varying response times
delay = random.choice([0.5, 1.0, 1.5, 2.0, 3.0])
await asyncio.sleep(delay)
end_time = time.time()
request_times.append((end_time - start_time) * 1000) # Convert to ms
# Occasionally fail to test retry logic
if random.random() < 0.1: # 10% failure rate
raise Exception("Simulated failure")
data = {
'url': context.request.url,
'duration_ms': (end_time - start_time) * 1000
}
await context.push_data(data)
# Run performance test
urls = ['https://httpbin.org/delay/1'] * 20
final_stats = await crawler.run(urls)
# Analyze performance
await analyze_performance(stats, final_stats)
# Additional custom analysis
if request_times:
print(f"\nCustom timing analysis:")
print(f"Min request time: {min(request_times):.2f}ms")
print(f"Max request time: {max(request_times):.2f}ms")
print(f"Median request time: {sorted(request_times)[len(request_times)//2]:.2f}ms")
asyncio.run(main())
Install with Tessl CLI
npx tessl i tessl/pypi-crawlee