CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-crawlee

A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass

Overview
Eval results
Files

docs/fingerprinting.md

Fingerprinting

Browser fingerprint generation and header randomization for enhanced stealth capabilities and bot protection bypass. Fingerprinting capabilities help make HTTP requests and browser sessions appear more human-like to avoid detection.

Capabilities

Fingerprint Generator

Base class for generating browser fingerprints with realistic device characteristics.

class FingerprintGenerator:
    """Base interface for browser fingerprint generators.

    Implementations produce a fingerprint and derive HTTP headers from it.
    ``Any`` in the signatures below is ``typing.Any`` (the builtin ``any`` is
    a function, not a type).
    """

    async def generate_fingerprint(self, **options: Any) -> dict[str, Any]:
        """
        Generate browser fingerprint with realistic characteristics.

        Args:
            **options: Fingerprint generation options

        Returns:
            Dictionary containing fingerprint data
        """

    def get_headers(self, fingerprint: dict[str, Any]) -> HttpHeaders:
        """
        Generate HTTP headers from fingerprint.

        Args:
            fingerprint: Generated fingerprint data

        Returns:
            HttpHeaders object with realistic headers
        """

Default Fingerprint Generator

Default implementation using browserforge for generating realistic browser fingerprints.

class DefaultFingerprintGenerator(FingerprintGenerator):
    """Default FingerprintGenerator implementation (browserforge-backed per this guide).

    Every constructor argument is keyword-only and optional; ``None`` leaves
    that aspect unspecified.
    """

    def __init__(
        self,
        *,
        # Browser family to generate for, e.g. 'chrome'.
        browser_name: str | None = None,
        # Browser version to generate for.
        # NOTE(review): expected version format is not shown here - confirm.
        browser_version: str | None = None,
        # Device class, e.g. 'desktop' or 'mobile'.
        device_category: str | None = None,
        # Operating system, e.g. 'windows' or 'android'.
        operating_system: str | None = None,
        # Locale tag, e.g. 'en-US'.
        locale: str | None = None
    ): ...

    async def generate_fingerprint(
        self,
        **options
    ) -> BrowserFingerprintData:
        """Generate realistic browser fingerprint."""

    # Read-only view of the ``browser_name`` passed to the constructor
    # (presumably unchanged - TODO confirm).
    @property
    def browser_name(self) -> str | None: ...

    # Read-only view of the ``device_category`` passed to the constructor
    # (presumably unchanged - TODO confirm).
    @property
    def device_category(self) -> str | None: ...

Header Generator

Specialized generator for creating realistic HTTP headers with proper ordering and values.

class HeaderGenerator:
    """Generator of browser-like HTTP headers.

    ``Any`` in the signatures below is ``typing.Any``.
    """

    def __init__(
        self,
        *,
        browser_name: str | None = None,
        browser_version: str | None = None,
        operating_system: str | None = None,
        device: str | None = None,
        locale: str | None = None
    ): ...

    def get_headers(
        self,
        *,
        url: str | None = None,
        method: HttpMethod = "GET",
        **options: str | None
    ) -> HttpHeaders:
        """
        Generate realistic HTTP headers.

        Args:
            url: Target URL for headers
            method: HTTP method
            **options: Per-call header overrides passed as individual keyword
                arguments (for example ``referer=...`` or ``accept=...``).
                The keyword names mirror the ``HeaderGeneratorOptions``
                fields; each value is a string, not an options object.

        Returns:
            HttpHeaders with realistic browser headers
        """

    def get_fingerprint_headers(
        self,
        fingerprint: dict[str, Any]
    ) -> HttpHeaders:
        """Generate headers from existing fingerprint data."""

Configuration Types

Configuration classes for customizing fingerprint and header generation.

class HeaderGeneratorOptions:
    """Optional per-header overrides for header generation.

    Every field is keyword-only and defaults to ``None``.
    NOTE(review): each field presumably maps to the HTTP header of the same
    name (``accept`` -> ``Accept``, ``sec_fetch_dest`` -> ``Sec-Fetch-Dest``,
    ...) and ``None`` presumably means "let the generator choose" - confirm
    against the implementation.
    """

    def __init__(
        self,
        *,
        accept: str | None = None,
        accept_encoding: str | None = None,
        accept_language: str | None = None,
        cache_control: str | None = None,
        referer: str | None = None,
        sec_fetch_dest: str | None = None,
        sec_fetch_mode: str | None = None,
        sec_fetch_site: str | None = None,
        sec_fetch_user: str | None = None,
        upgrade_insecure_requests: str | None = None,
        user_agent: str | None = None
    ): ...

    # Only two accessors are listed here; whether the remaining fields have
    # matching properties is not shown - TODO confirm.
    @property
    def accept(self) -> str | None: ...

    @property
    def user_agent(self) -> str | None: ...
class ScreenOptions:
    """Optional screen constraints for fingerprint generation.

    Every field is keyword-only and defaults to ``None``.
    NOTE(review): whether these are exact values or bounds is not shown
    here - confirm against the implementation.
    """

    def __init__(
        self,
        *,
        # Screen width in pixels.
        width: int | None = None,
        # Screen height in pixels.
        height: int | None = None,
        # Device pixel ratio.
        pixel_ratio: float | None = None
    ): ...

    @property
    def width(self) -> int | None:
        """Screen width in pixels."""

    @property
    def height(self) -> int | None:
        """Screen height in pixels."""

    @property
    def pixel_ratio(self) -> float | None:
        """Device pixel ratio."""

Fingerprint Data Types

Data structures containing generated fingerprint information.

class BrowserFingerprintData:
    user_agent: str
    viewport: ViewportSize
    screen: ScreenSize
    headers: dict[str, str]
    webgl_vendor: str | None
    webgl_renderer: str | None
    languages: list[str]
    timezone: str
    platform: str
    cookie_enabled: bool
    do_not_track: bool | None
    plugins: list[PluginData]
class ViewportSize:
    width: int
    height: int
class ScreenSize:
    width: int
    height: int
    available_width: int
    available_height: int
    color_depth: int
    pixel_depth: int
class PluginData:
    name: str
    filename: str
    description: str

Usage Examples

Basic Fingerprint Generation

import asyncio
from crawlee.fingerprint_suite import DefaultFingerprintGenerator

async def main():
    """Generate one fingerprint, print its main fields, then print its headers."""
    # Create fingerprint generator
    generator = DefaultFingerprintGenerator(
        browser_name='chrome',
        device_category='desktop',
        operating_system='windows'
    )

    # Generate fingerprint
    fingerprint = await generator.generate_fingerprint()

    print(f"User Agent: {fingerprint.user_agent}")
    print(f"Viewport: {fingerprint.viewport.width}x{fingerprint.viewport.height}")
    print(f"Screen: {fingerprint.screen.width}x{fingerprint.screen.height}")
    print(f"Platform: {fingerprint.platform}")
    print(f"Languages: {fingerprint.languages}")
    print(f"Timezone: {fingerprint.timezone}")

    # Generate headers from fingerprint
    headers = generator.get_headers(fingerprint)
    # dict() copies any mapping into a plain dict for display, so this does
    # not depend on a to_dict() helper being available on the headers object.
    print(f"Generated headers: {dict(headers)}")

asyncio.run(main())

Header Generation

from crawlee.fingerprint_suite import HeaderGenerator, HeaderGeneratorOptions

# Configure a header generator for Chrome on macOS with an en-US locale
generator = HeaderGenerator(browser_name='chrome', operating_system='macos', locale='en-US')

# Headers for one specific request; the extra keywords are per-call overrides
request_overrides = {
    'referer': 'https://example.com',
    'accept': 'application/json',
}
headers = generator.get_headers(
    url='https://example.com/api/data',
    method='GET',
    **request_overrides,
)

print("Generated headers:")
for key, value in headers.items():
    print(f"  {key}: {value}")

# Reusable set of header options
options = HeaderGeneratorOptions(
    accept='text/html,application/xhtml+xml',
    accept_language='en-US,en;q=0.9',
    cache_control='max-age=0',
    sec_fetch_dest='document',
    sec_fetch_mode='navigate',
)

# NOTE(review): vars(options) forwards the instance's attribute dict as-is.
# This assumes the options object stores its fields under the public keyword
# names and that None is acceptable for the unset ones - confirm.
headers = generator.get_headers(url='https://example.com', method='GET', **vars(options))

Using with HTTP Crawler

import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.fingerprint_suite import DefaultFingerprintGenerator

async def main():
    """Crawl two httpbin endpoints and record a generated fingerprint per page."""
    # Create fingerprint generator
    fingerprint_generator = DefaultFingerprintGenerator(
        browser_name='chrome',
        device_category='mobile',
        operating_system='android'
    )

    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        # Generate new fingerprint for each handled request.
        # The handler reads context.response below, so the request has
        # already been sent by the time this runs. The fingerprint is only
        # recorded alongside the result; it is not applied to the outgoing
        # request, and no headers are derived from it here.
        fingerprint = await fingerprint_generator.generate_fingerprint()

        # Log fingerprint info
        context.log.info(f"Using fingerprint: {fingerprint.user_agent}")
        context.log.info(f"Screen: {fingerprint.screen.width}x{fingerprint.screen.height}")

        # Process response
        data = {
            'url': context.request.url,
            'user_agent': fingerprint.user_agent,
            'screen_size': f"{fingerprint.screen.width}x{fingerprint.screen.height}",
            'status': context.response.status_code
        }

        await context.push_data(data)

    await crawler.run(['https://httpbin.org/user-agent', 'https://httpbin.org/headers'])

asyncio.run(main())

Session-Specific Fingerprints

import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.fingerprint_suite import DefaultFingerprintGenerator
from crawlee.sessions import SessionPool

async def main():
    """Crawl with a session pool, keeping one fingerprint per session."""
    # Create fingerprint generator
    generator = DefaultFingerprintGenerator()

    # Create session pool
    session_pool = SessionPool(max_pool_size=5)

    crawler = HttpCrawler(
        session_pool=session_pool,
        use_session_pool=True
    )

    # Store fingerprints per session, keyed by session id
    session_fingerprints = {}

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        session_id = context.session.id

        # Generate fingerprint once per session
        if session_id not in session_fingerprints:
            session_fingerprints[session_id] = await generator.generate_fingerprint()
            context.log.info(f"Generated new fingerprint for session {session_id}")

        # Every later request handled under the same session sees the same
        # fingerprint object.
        fingerprint = session_fingerprints[session_id]

        data = {
            'url': context.request.url,
            'session_id': session_id,
            'user_agent': fingerprint.user_agent,
            'consistent_fingerprint': True
        }

        await context.push_data(data)

    # Multiple requests will reuse fingerprints per session.
    # Each URL carries a distinct query string: identical URLs would be
    # collapsed into a single request by the crawler's request
    # de-duplication, leaving nothing to demonstrate.
    urls = [f'https://httpbin.org/headers?request={i}' for i in range(10)]
    await crawler.run(urls)

asyncio.run(main())

Custom Fingerprint Generator

import asyncio
import random
from typing import Any

from crawlee.fingerprint_suite import FingerprintGenerator, HeaderGenerator

class CustomFingerprintGenerator(FingerprintGenerator):
    """Custom fingerprint generator with specific characteristics.

    Fingerprints are plain dictionaries assembled from small fixed pools of
    user agents, screen resolutions and timezones.
    """

    # (width, height) pairs a generated screen is drawn from.
    _SCREEN_RESOLUTIONS = (
        (1920, 1080),
        (1366, 768),
        (1440, 900),
        (1600, 900),
    )

    # Timezone names a generated fingerprint is drawn from.
    _TIMEZONES = ('America/New_York', 'Europe/London', 'America/Los_Angeles')

    def __init__(self):
        super().__init__()
        self.header_generator = HeaderGenerator()
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]

    async def generate_fingerprint(self, **options: Any) -> dict[str, Any]:
        """Generate custom fingerprint with specific characteristics.

        Args:
            **options: Accepted for interface compatibility; not used.

        Returns:
            Dictionary with ``user_agent``, ``viewport``, ``screen``,
            ``languages``, ``timezone``, ``platform``, ``cookie_enabled`` and
            ``do_not_track`` entries.
        """
        # Select random user agent
        user_agent = random.choice(self.user_agents)

        screen_width, screen_height = random.choice(self._SCREEN_RESOLUTIONS)

        # Generate viewport (slightly smaller than screen)
        viewport_width = screen_width - random.randint(0, 100)
        viewport_height = screen_height - random.randint(100, 200)

        return {
            'user_agent': user_agent,
            'viewport': {
                'width': viewport_width,
                'height': viewport_height
            },
            'screen': {
                'width': screen_width,
                'height': screen_height,
                'color_depth': 24,
                'pixel_depth': 24
            },
            'languages': ['en-US', 'en'],
            'timezone': random.choice(self._TIMEZONES),
            'platform': self._get_platform_from_ua(user_agent),
            'cookie_enabled': True,
            'do_not_track': random.choice([None, False])
        }

    def get_headers(self, fingerprint: dict[str, Any]) -> dict[str, str]:
        """Generate headers from fingerprint.

        Args:
            fingerprint: Dictionary as returned by ``generate_fingerprint``;
                ``user_agent`` and ``languages`` are required,
                ``do_not_track`` is optional.

        Returns:
            Header name to value mapping. ``DNT`` is included only when the
            fingerprint states an explicit preference.
        """
        do_not_track = fingerprint.get('do_not_track')
        # A missing/None preference means "not expressed": the header is then
        # left out entirely rather than sent as an explicit '0'.
        dnt_header = {} if do_not_track is None else {'DNT': '1' if do_not_track else '0'}

        return {
            'User-Agent': fingerprint['user_agent'],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': self._format_accept_language(fingerprint['languages']),
            'Accept-Encoding': 'gzip, deflate, br',
            **dnt_header,
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        }

    @staticmethod
    def _format_accept_language(languages: list[str]) -> str:
        """Build an Accept-Language value with descending quality weights.

        The first language carries no weight; each following one gets
        q=0.9, 0.8, ... (never below 0.1). ``['en-US', 'en']`` therefore
        yields ``'en-US,en;q=0.9'``, and an empty list yields ``''``.
        """
        parts = []
        for position, language in enumerate(languages):
            if position == 0:
                parts.append(language)
            else:
                quality = max(1.0 - 0.1 * position, 0.1)
                parts.append(f'{language};q={quality:.1f}')
        return ','.join(parts)

    def _get_platform_from_ua(self, user_agent: str) -> str:
        """Extract platform from user agent.

        Returns ``'Win32'``, ``'MacIntel'`` or ``'Linux x86_64'`` depending on
        which marker the user agent contains, else ``'Unknown'``.
        """
        if 'Windows' in user_agent:
            return 'Win32'
        if 'Macintosh' in user_agent:
            return 'MacIntel'
        if 'Linux' in user_agent:
            return 'Linux x86_64'
        return 'Unknown'

async def main():
    """Print three fingerprints produced by the custom generator."""
    # Use custom fingerprint generator
    generator = CustomFingerprintGenerator()

    # Generate multiple fingerprints
    for i in range(3):
        fingerprint = await generator.generate_fingerprint()
        headers = generator.get_headers(fingerprint)

        # Assemble the whole report first, then emit it in a single print.
        report = (
            f"\nFingerprint {i+1}:",
            f"  User-Agent: {fingerprint['user_agent']}",
            f"  Screen: {fingerprint['screen']['width']}x{fingerprint['screen']['height']}",
            f"  Viewport: {fingerprint['viewport']['width']}x{fingerprint['viewport']['height']}",
            f"  Platform: {fingerprint['platform']}",
            f"  Timezone: {fingerprint['timezone']}",
            f"  Accept-Language: {headers.get('Accept-Language', 'N/A')}",
        )
        print("\n".join(report))

asyncio.run(main())

Integration with Playwright

import asyncio
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.fingerprint_suite import DefaultFingerprintGenerator

async def main():
    """Crawl one page with Playwright after applying a generated fingerprint."""
    # Function-scope imports: this example uses json and random, and neither
    # is imported at the top of the snippet.
    import json
    import random

    generator = DefaultFingerprintGenerator()

    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext):
        page = context.page

        # Generate fingerprint
        fingerprint = await generator.generate_fingerprint()

        # Apply fingerprint to browser page
        await page.set_viewport_size({
            'width': fingerprint.viewport.width,
            'height': fingerprint.viewport.height
        })

        # Serialize Python values as JSON so they are valid, correctly escaped
        # JavaScript literals (a Python repr only happens to parse for simple
        # values).
        languages_js = json.dumps(list(fingerprint.languages))
        platform_js = json.dumps(fingerprint.platform)
        cookie_enabled_js = json.dumps(bool(fingerprint.cookie_enabled))

        # Picked once per handled page, so every getTimezoneOffset() call on
        # that page returns the same value.
        # NOTE(review): the offset is random and not derived from
        # fingerprint.timezone - the two may disagree.
        timezone_offset_minutes = random.randint(-720, 720)

        # Override JavaScript properties to match fingerprint
        await page.add_init_script(f"""
            // Override screen properties
            Object.defineProperty(screen, 'width', {{ get: () => {fingerprint.screen.width} }});
            Object.defineProperty(screen, 'height', {{ get: () => {fingerprint.screen.height} }});
            Object.defineProperty(screen, 'availWidth', {{ get: () => {fingerprint.screen.width} }});
            Object.defineProperty(screen, 'availHeight', {{ get: () => {fingerprint.screen.height - 40} }});
            Object.defineProperty(screen, 'colorDepth', {{ get: () => {fingerprint.screen.color_depth} }});

            // Override navigator properties
            Object.defineProperty(navigator, 'languages', {{ get: () => {languages_js} }});
            Object.defineProperty(navigator, 'platform', {{ get: () => {platform_js} }});
            Object.defineProperty(navigator, 'cookieEnabled', {{ get: () => {cookie_enabled_js} }});

            // Override timezone
            Date.prototype.getTimezoneOffset = function() {{
                return {timezone_offset_minutes};
            }};
        """)

        # Navigate with fingerprint applied
        await page.goto(context.request.url)

        # Extract data
        data = {
            'url': context.request.url,
            'title': await page.title(),
            'fingerprint_applied': True,
            'viewport': f"{fingerprint.viewport.width}x{fingerprint.viewport.height}",
            'user_agent': fingerprint.user_agent
        }

        await context.push_data(data)

    await crawler.run(['https://httpbin.org/headers'])

asyncio.run(main())

Install with Tessl CLI

npx tessl i tessl/pypi-crawlee

docs

browser-automation.md

cli-tools.md

configuration.md

core-types.md

crawlers.md

error-handling.md

events.md

fingerprinting.md

http-clients.md

index.md

request-management.md

sessions.md

statistics.md

storage.md

tile.json