A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass
Browser fingerprint generation and header randomization for enhanced stealth capabilities and bot protection bypass. Fingerprinting capabilities help make HTTP requests and browser sessions appear more human-like to avoid detection.
Base class for generating browser fingerprints with realistic device characteristics.
class FingerprintGenerator:
# Coroutine stub: awaiting it yields the fingerprint mapping.
# `Any` is typing.Any; the builtin function `any` is not a type.
async def generate_fingerprint(self, **options) -> dict[str, Any]:
"""
Generate browser fingerprint with realistic characteristics.
Args:
**options: Fingerprint generation options
Returns:
Dictionary containing fingerprint data
"""
def get_headers(self, fingerprint: dict[str, any]) -> HttpHeaders:
"""
Generate HTTP headers from fingerprint.
Args:
fingerprint: Generated fingerprint data
Returns:
HttpHeaders object with realistic headers
"""

Default implementation using browserforge for generating realistic browser fingerprints.
class DefaultFingerprintGenerator(FingerprintGenerator):
# Keyword-only constructor (note the bare `*`): every constraint is optional
# and defaults to None.
def __init__(
self,
*,
browser_name: str | None = None,
browser_version: str | None = None,
device_category: str | None = None,
operating_system: str | None = None,
locale: str | None = None
): ...
# Coroutine stub; annotated to return a BrowserFingerprintData object, whereas
# the FingerprintGenerator base method is annotated to return a plain dict.
async def generate_fingerprint(
self,
**options
) -> BrowserFingerprintData:
"""Generate realistic browser fingerprint."""
# Property accessor (no setter is shown in this listing); annotated as possibly None.
# Presumably mirrors the `browser_name` constructor argument - confirm.
@property
def browser_name(self) -> str | None: ...
@property
def device_category(self) -> str | None: ...

Specialized generator for creating realistic HTTP headers with proper ordering and values.
class HeaderGenerator:
# Keyword-only constructor (note the bare `*`): every argument is optional and
# defaults to None.
# NOTE(review): the device argument is named `device` here but
# `device_category` on DefaultFingerprintGenerator - confirm this is intended.
def __init__(
self,
*,
browser_name: str | None = None,
browser_version: str | None = None,
operating_system: str | None = None,
device: str | None = None,
locale: str | None = None
): ...
# All arguments are keyword-only; `method` defaults to "GET".
# NOTE(review): `**options: HeaderGeneratorOptions` annotates each keyword
# VALUE as a HeaderGeneratorOptions, yet the usage examples pass plain strings
# (e.g. referer=..., accept=...). Presumably the per-value type should be
# `str | None` - confirm against the implementation.
def get_headers(
self,
*,
url: str | None = None,
method: HttpMethod = "GET",
**options: HeaderGeneratorOptions
) -> HttpHeaders:
"""
Generate realistic HTTP headers.
Args:
url: Target URL for headers
method: HTTP method
**options: Additional options
Returns:
HttpHeaders with realistic browser headers
"""
def get_fingerprint_headers(
self,
fingerprint: dict[str, any]
) -> HttpHeaders:
"""Generate headers from existing fingerprint data."""

Configuration classes for customizing fingerprint and header generation.
class HeaderGeneratorOptions:
# Keyword-only constructor: each argument is an optional string defaulting to
# None. Argument names are snake_case; judging by the names they correspond to
# HTTP request headers (e.g. `sec_fetch_dest`) - confirm the exact mapping.
def __init__(
self,
*,
accept: str | None = None,
accept_encoding: str | None = None,
accept_language: str | None = None,
cache_control: str | None = None,
referer: str | None = None,
sec_fetch_dest: str | None = None,
sec_fetch_mode: str | None = None,
sec_fetch_site: str | None = None,
sec_fetch_user: str | None = None,
upgrade_insecure_requests: str | None = None,
user_agent: str | None = None
): ...
# Property accessor (no setter is shown in this listing); annotated as possibly None.
# Presumably mirrors the `accept` constructor argument - confirm.
@property
def accept(self) -> str | None: ...
@property
def user_agent(self) -> str | None: ...

class ScreenOptions:
# Keyword-only constructor: width/height are optional ints and pixel_ratio an
# optional float; all default to None.
def __init__(
self,
*,
width: int | None = None,
height: int | None = None,
pixel_ratio: float | None = None
): ...
# Property accessor (no setter is shown in this listing); annotated as possibly None.
# Presumably mirrors the `width` constructor argument - confirm.
@property
def width(self) -> int | None:
"""Screen width in pixels."""
# Property accessor (no setter is shown in this listing); annotated as possibly None.
# Presumably mirrors the `height` constructor argument - confirm.
@property
def height(self) -> int | None:
"""Screen height in pixels."""
@property
def pixel_ratio(self) -> float | None:
"""Device pixel ratio."""

Data structures containing generated fingerprint information.
class BrowserFingerprintData:
user_agent: str
viewport: ViewportSize
screen: ScreenSize
headers: dict[str, str]
webgl_vendor: str | None
webgl_renderer: str | None
languages: list[str]
timezone: str
platform: str
cookie_enabled: bool
do_not_track: bool | None
plugins: list[PluginData]

class ViewportSize:
width: int
height: int

class ScreenSize:
width: int
height: int
available_width: int
available_height: int
color_depth: int
pixel_depth: int

class PluginData:
name: str
filename: str
description: str

import asyncio
from crawlee.fingerprint_suite import DefaultFingerprintGenerator
async def main():
    """Generate one desktop Chrome/Windows fingerprint and print its details.

    Prints the user agent, viewport, screen, platform, languages and timezone
    of the generated fingerprint, then the headers derived from it.
    """
    # Constraints handed to the generator as keyword arguments.
    profile = {
        'browser_name': 'chrome',
        'device_category': 'desktop',
        'operating_system': 'windows',
    }
    fp_generator = DefaultFingerprintGenerator(**profile)

    fingerprint = await fp_generator.generate_fingerprint()

    # Report the fingerprint one attribute at a time.
    print(f"User Agent: {fingerprint.user_agent}")
    print(f"Viewport: {fingerprint.viewport.width}x{fingerprint.viewport.height}")
    print(f"Screen: {fingerprint.screen.width}x{fingerprint.screen.height}")
    print(f"Platform: {fingerprint.platform}")
    print(f"Languages: {fingerprint.languages}")
    print(f"Timezone: {fingerprint.timezone}")

    # Derive HTTP headers from the same fingerprint and show them as a dict.
    headers = fp_generator.get_headers(fingerprint)
    print(f"Generated headers: {headers.to_dict()}")
asyncio.run(main())

from crawlee.fingerprint_suite import HeaderGenerator, HeaderGeneratorOptions
# Header generator constrained to Chrome on macOS with a US-English locale.
generator = HeaderGenerator(
    **{
        'browser_name': 'chrome',
        'operating_system': 'macos',
        'locale': 'en-US',
    }
)

# Headers for a GET to one API URL, with explicit referer and accept overrides.
request_overrides = dict(referer='https://example.com', accept='application/json')
headers = generator.get_headers(
    url='https://example.com/api/data',
    method='GET',
    **request_overrides,
)

# Dump every generated header, one per line.
print("Generated headers:")
for key, value in headers.items():
    print(f" {key}: {value}")

# Reusable bundle of header overrides.
options = HeaderGeneratorOptions(
    **{
        'accept': 'text/html,application/xhtml+xml',
        'accept_language': 'en-US,en;q=0.9',
        'cache_control': 'max-age=0',
        'sec_fetch_dest': 'document',
        'sec_fetch_mode': 'navigate',
    }
)
headers = generator.get_headers(
url='https://example.com',
method='GET',
**options.__dict__
)

import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.fingerprint_suite import DefaultFingerprintGenerator
async def main():
    """Crawl two httpbin endpoints, generating a mobile fingerprint per handled request.

    For every handled request the handler generates a new Android/Chrome
    fingerprint, logs it, and pushes a small record describing it.
    """
    mobile_profile = {
        'browser_name': 'chrome',
        'device_category': 'mobile',
        'operating_system': 'android',
    }
    fingerprint_generator = DefaultFingerprintGenerator(**mobile_profile)
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        # A new fingerprint is produced on every handler invocation.
        fingerprint = await fingerprint_generator.generate_fingerprint()

        # NOTE(review): `headers` is built but never attached to the request
        # or used below - confirm how the headers are meant to be applied.
        headers = fingerprint_generator.get_headers(fingerprint)

        context.log.info(f"Using fingerprint: {fingerprint.user_agent}")
        context.log.info(f"Screen: {fingerprint.screen.width}x{fingerprint.screen.height}")

        # Record what was used alongside the response status.
        await context.push_data(
            {
                'url': context.request.url,
                'user_agent': fingerprint.user_agent,
                'screen_size': f"{fingerprint.screen.width}x{fingerprint.screen.height}",
                'status': context.response.status_code,
            }
        )

    start_urls = ['https://httpbin.org/user-agent', 'https://httpbin.org/headers']
    await crawler.run(start_urls)
asyncio.run(main())

import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.fingerprint_suite import DefaultFingerprintGenerator
from crawlee.sessions import SessionPool
async def main():
    """Crawl with a session pool, keeping one fingerprint per session id.

    The handler caches the first fingerprint generated for a session in a
    dict keyed by session id and reuses it for later requests of that session.
    """
    generator = DefaultFingerprintGenerator()
    crawler = HttpCrawler(
        session_pool=SessionPool(max_pool_size=5),
        use_session_pool=True,
    )

    # session id -> fingerprint generated for that session
    session_fingerprints = {}

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        session_id = context.session.id

        try:
            fingerprint = session_fingerprints[session_id]
        except KeyError:
            # First request seen for this session: create and remember one.
            fingerprint = await generator.generate_fingerprint()
            session_fingerprints[session_id] = fingerprint
            context.log.info(f"Generated new fingerprint for session {session_id}")

        # NOTE(review): `headers` is built but never attached to the request
        # or used below - confirm how the headers are meant to be applied.
        headers = generator.get_headers(fingerprint)

        await context.push_data(
            {
                'url': context.request.url,
                'session_id': session_id,
                'user_agent': fingerprint.user_agent,
                'consistent_fingerprint': True,
            }
        )

    # NOTE(review): all ten entries are the same URL; if the crawler
    # de-duplicates requests by URL only one will be fetched - confirm.
    await crawler.run(['https://httpbin.org/headers'] * 10)
asyncio.run(main())

import asyncio
import random
from typing import Any

from crawlee.fingerprint_suite import FingerprintGenerator, HeaderGenerator
class CustomFingerprintGenerator(FingerprintGenerator):
    """Custom fingerprint generator with specific characteristics.

    Builds plain-dict fingerprints by sampling from a small pool of desktop
    Chrome user agents, common screen resolutions and timezones, and derives
    a matching set of HTTP headers from such a fingerprint.
    """

    # Candidate (width, height) screen resolutions; constant, so built once.
    _SCREEN_RESOLUTIONS = (
        (1920, 1080),
        (1366, 768),
        (1440, 900),
        (1600, 900),
    )
    _TIMEZONES = ('America/New_York', 'Europe/London', 'America/Los_Angeles')
    # Vertical pixels assumed to be taken by OS chrome (taskbar/dock) when
    # reporting the available screen height.
    _RESERVED_SCREEN_HEIGHT = 40

    def __init__(self):
        super().__init__()
        self.header_generator = HeaderGenerator()
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        ]

    async def generate_fingerprint(self, **options) -> dict[str, Any]:
        """Generate custom fingerprint with specific characteristics.

        Args:
            **options: Accepted for interface compatibility; not used.

        Returns:
            A dict with the keys ``user_agent``, ``viewport``, ``screen``,
            ``languages``, ``timezone``, ``platform``, ``cookie_enabled`` and
            ``do_not_track``.
        """
        user_agent = random.choice(self.user_agents)
        screen_width, screen_height = random.choice(self._SCREEN_RESOLUTIONS)

        # The viewport is slightly smaller than the screen (browser chrome).
        viewport_width = screen_width - random.randint(0, 100)
        viewport_height = screen_height - random.randint(100, 200)

        return {
            'user_agent': user_agent,
            'viewport': {
                'width': viewport_width,
                'height': viewport_height,
            },
            'screen': {
                'width': screen_width,
                'height': screen_height,
                # Same fields ScreenSize declares for the available area.
                'available_width': screen_width,
                'available_height': screen_height - self._RESERVED_SCREEN_HEIGHT,
                'color_depth': 24,
                'pixel_depth': 24,
            },
            'languages': ['en-US', 'en'],
            'timezone': random.choice(self._TIMEZONES),
            'platform': self._get_platform_from_ua(user_agent),
            'cookie_enabled': True,
            # None means "preference not set"; False means tracking allowed.
            'do_not_track': random.choice([None, False]),
        }

    def get_headers(self, fingerprint: dict[str, Any]) -> dict[str, str]:
        """Generate headers from fingerprint.

        Args:
            fingerprint: Mapping holding at least ``user_agent`` and
                ``languages``; ``do_not_track`` is optional.

        Returns:
            Header name -> value mapping. ``Accept-Language`` is omitted when
            ``languages`` is empty, and ``DNT`` is omitted when
            ``do_not_track`` is missing or None.

        Raises:
            KeyError: If ``user_agent`` or ``languages`` is missing.
        """
        headers: dict[str, str] = {
            'User-Agent': fingerprint['user_agent'],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }

        accept_language = self._format_accept_language(fingerprint['languages'])
        if accept_language:
            headers['Accept-Language'] = accept_language

        headers['Accept-Encoding'] = 'gzip, deflate, br'

        # An unset preference must not be reported as an explicit "0".
        do_not_track = fingerprint.get('do_not_track')
        if do_not_track is not None:
            headers['DNT'] = '1' if do_not_track else '0'

        headers.update({
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0',
        })
        return headers

    @staticmethod
    def _format_accept_language(languages) -> str:
        """Join languages into an Accept-Language value with descending q-values.

        The first language carries no q-value; each following one gets 0.1
        less than the previous (0.9, 0.8, ...), never dropping below 0.1.
        ``['en-US', 'en']`` yields ``'en-US,en;q=0.9'``; an empty sequence
        yields ``''``.
        """
        parts = []
        for index, language in enumerate(languages):
            if index == 0:
                parts.append(language)
            else:
                quality = max(1.0 - 0.1 * index, 0.1)
                parts.append(f'{language};q={quality:.1f}')
        return ','.join(parts)

    def _get_platform_from_ua(self, user_agent: str) -> str:
        """Extract platform from user agent.

        Returns 'Win32', 'MacIntel' or 'Linux x86_64' for the first of
        'Windows', 'Macintosh', 'Linux' found in the string, else 'Unknown'.
        """
        markers = (
            ('Windows', 'Win32'),
            ('Macintosh', 'MacIntel'),
            ('Linux', 'Linux x86_64'),
        )
        for marker, platform in markers:
            if marker in user_agent:
                return platform
        return 'Unknown'
async def main():
    """Generate three custom fingerprints and print a summary of each.

    For every fingerprint the matching headers are derived as well, and the
    Accept-Language header (or 'N/A' when absent) is printed with the rest.
    """
    custom_generator = CustomFingerprintGenerator()

    i = 0
    while i < 3:
        fingerprint = await custom_generator.generate_fingerprint()
        headers = custom_generator.get_headers(fingerprint)

        # One block of output per fingerprint, numbered from 1.
        print(f"\nFingerprint {i+1}:")
        print(f" User-Agent: {fingerprint['user_agent']}")
        print(f" Screen: {fingerprint['screen']['width']}x{fingerprint['screen']['height']}")
        print(f" Viewport: {fingerprint['viewport']['width']}x{fingerprint['viewport']['height']}")
        print(f" Platform: {fingerprint['platform']}")
        print(f" Timezone: {fingerprint['timezone']}")
        print(f" Accept-Language: {headers.get('Accept-Language', 'N/A')}")

        i += 1
asyncio.run(main())

import asyncio
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.fingerprint_suite import DefaultFingerprintGenerator
def _timezone_offset_minutes(timezone_name):
    """Return what JavaScript's ``getTimezoneOffset()`` reports now for an IANA zone.

    The value is in minutes and positive for zones behind UTC. Returns None
    when the zone name cannot be resolved (unknown name, or no timezone
    database available).
    """
    from datetime import datetime
    from zoneinfo import ZoneInfo, ZoneInfoNotFoundError

    try:
        zone = ZoneInfo(timezone_name)
    except (ZoneInfoNotFoundError, ValueError, TypeError, OSError):
        return None

    offset = datetime.now(tz=zone).utcoffset()
    if offset is None:
        return None
    # JavaScript reports UTC minus local time, hence the sign flip.
    return -int(offset.total_seconds() // 60)


def _build_init_script(fingerprint):
    """Build the JavaScript init script that makes page properties match *fingerprint*.

    Every value is JSON-encoded so that strings, lists and booleans become
    valid JavaScript literals. The timezone override is emitted only when the
    fingerprint's timezone can be resolved to an offset.
    """
    import json

    screen = fingerprint.screen
    overrides = [
        ('screen', 'width', screen.width),
        ('screen', 'height', screen.height),
        ('screen', 'availWidth', screen.available_width),
        ('screen', 'availHeight', screen.available_height),
        ('screen', 'colorDepth', screen.color_depth),
        ('navigator', 'languages', list(fingerprint.languages)),
        ('navigator', 'platform', fingerprint.platform),
        ('navigator', 'cookieEnabled', bool(fingerprint.cookie_enabled)),
    ]
    statements = [
        f"Object.defineProperty({target}, '{prop}', {{ get: () => {json.dumps(value)} }});"
        for target, prop, value in overrides
    ]

    # Keep the reported offset consistent with the fingerprint's timezone
    # instead of inventing an unrelated random one.
    offset = _timezone_offset_minutes(fingerprint.timezone)
    if offset is not None:
        statements.append(
            f"Date.prototype.getTimezoneOffset = function() {{ return {offset}; }};"
        )
    return '\n'.join(statements)


async def main():
    """Crawl one page with Playwright after applying a generated fingerprint.

    The handler sets the viewport, installs an init script that overrides
    screen/navigator/timezone properties, navigates to the request URL and
    pushes a record with the page title and the fingerprint that was applied.
    """
    generator = DefaultFingerprintGenerator()
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext):
        page = context.page

        fingerprint = await generator.generate_fingerprint()

        # Apply fingerprint to browser page
        await page.set_viewport_size({
            'width': fingerprint.viewport.width,
            'height': fingerprint.viewport.height,
        })

        # Override JavaScript properties to match fingerprint
        await page.add_init_script(_build_init_script(fingerprint))

        # Navigate with fingerprint applied
        await page.goto(context.request.url)

        data = {
            'url': context.request.url,
            'title': await page.title(),
            'fingerprint_applied': True,
            'viewport': f"{fingerprint.viewport.width}x{fingerprint.viewport.height}",
            'user_agent': fingerprint.user_agent,
        }
        await context.push_data(data)

    await crawler.run(['https://httpbin.org/headers'])
asyncio.run(main())

Install with Tessl CLI
npx tessl i tessl/pypi-crawlee