0
# Fingerprinting
1
2
Browser fingerprint generation and HTTP header randomization for stealthier crawling and for getting past bot protection. These capabilities make HTTP requests and browser sessions look more like those of real users, which lowers the chance of being detected as a bot.
3
4
## Capabilities
5
6
### Fingerprint Generator
7
8
Base class for generating browser fingerprints with realistic device characteristics.
9
10
```python { .api }
11
from typing import Any


class FingerprintGenerator:
    """Base class for generating browser fingerprints and the headers derived from them."""

    async def generate_fingerprint(self, **options) -> dict[str, Any]:
        """
        Generate browser fingerprint with realistic characteristics.

        Args:
            **options: Fingerprint generation options

        Returns:
            Dictionary containing fingerprint data
        """

    def get_headers(self, fingerprint: dict[str, Any]) -> HttpHeaders:
        """
        Generate HTTP headers from fingerprint.

        Args:
            fingerprint: Generated fingerprint data

        Returns:
            HttpHeaders object with realistic headers
        """
33
```
34
35
### Default Fingerprint Generator
36
37
Default implementation using browserforge for generating realistic browser fingerprints.
38
39
```python { .api }
40
class DefaultFingerprintGenerator(FingerprintGenerator):
    """Fingerprint generator whose constructor accepts optional, keyword-only profile settings."""

    def __init__(
        self,
        *,
        browser_name: str | None = None,
        browser_version: str | None = None,
        device_category: str | None = None,
        operating_system: str | None = None,
        locale: str | None = None,
    ): ...

    async def generate_fingerprint(
        self,
        **options,
    ) -> BrowserFingerprintData:
        """Generate realistic browser fingerprint."""

    @property
    def browser_name(self) -> str | None: ...

    @property
    def device_category(self) -> str | None: ...
62
```
63
64
### Header Generator
65
66
Specialized generator for creating realistic HTTP headers with proper ordering and values.
67
68
```python { .api }
69
from typing import Any


class HeaderGenerator:
    """Generator of HTTP headers; all constructor settings are optional and keyword-only."""

    def __init__(
        self,
        *,
        browser_name: str | None = None,
        browser_version: str | None = None,
        operating_system: str | None = None,
        device: str | None = None,
        locale: str | None = None,
    ): ...

    def get_headers(
        self,
        *,
        url: str | None = None,
        method: HttpMethod = "GET",
        # Each keyword value is a single header value, so the per-value type is
        # ``str | None``; the keyword names mirror the HeaderGeneratorOptions fields.
        **options: str | None,
    ) -> HttpHeaders:
        """
        Generate realistic HTTP headers.

        Args:
            url: Target URL for headers
            method: HTTP method
            **options: Additional options, passed as individual keyword
                arguments (e.g. ``referer=...``, ``accept=...``)

        Returns:
            HttpHeaders with realistic browser headers
        """

    def get_fingerprint_headers(
        self,
        fingerprint: dict[str, Any],
    ) -> HttpHeaders:
        """Generate headers from existing fingerprint data."""
104
```
105
106
### Configuration Types
107
108
Configuration classes for customizing fingerprint and header generation.
109
110
```python { .api }
111
class HeaderGeneratorOptions:
112
def __init__(
113
self,
114
*,
115
accept: str | None = None,
116
accept_encoding: str | None = None,
117
accept_language: str | None = None,
118
cache_control: str | None = None,
119
referer: str | None = None,
120
sec_fetch_dest: str | None = None,
121
sec_fetch_mode: str | None = None,
122
sec_fetch_site: str | None = None,
123
sec_fetch_user: str | None = None,
124
upgrade_insecure_requests: str | None = None,
125
user_agent: str | None = None
126
): ...
127
128
@property
129
def accept(self) -> str | None: ...
130
131
@property
132
def user_agent(self) -> str | None: ...
133
```
134
135
```python { .api }
136
class ScreenOptions:
137
def __init__(
138
self,
139
*,
140
width: int | None = None,
141
height: int | None = None,
142
pixel_ratio: float | None = None
143
): ...
144
145
@property
146
def width(self) -> int | None:
147
"""Screen width in pixels."""
148
149
@property
150
def height(self) -> int | None:
151
"""Screen height in pixels."""
152
153
@property
154
def pixel_ratio(self) -> float | None:
155
"""Device pixel ratio."""
156
```
157
158
### Fingerprint Data Types
159
160
Data structures containing generated fingerprint information.
161
162
```python { .api }
163
class BrowserFingerprintData:
    """Fields of a generated browser fingerprint."""

    user_agent: str
    viewport: ViewportSize
    screen: ScreenSize
    headers: dict[str, str]
    # WebGL identifiers are optional.
    webgl_vendor: str | None
    webgl_renderer: str | None
    languages: list[str]
    timezone: str
    platform: str
    cookie_enabled: bool
    # Tri-state: may be None in addition to True/False.
    do_not_track: bool | None
    plugins: list[PluginData]
176
```
177
178
```python { .api }
179
class ViewportSize:
    """Viewport dimensions."""

    width: int
    height: int
182
```
183
184
```python { .api }
185
class ScreenSize:
    """Screen dimensions, available area and depth values."""

    width: int
    height: int
    available_width: int
    available_height: int
    color_depth: int
    pixel_depth: int
192
```
193
194
```python { .api }
195
class PluginData:
    """Descriptive fields of a single browser plugin entry."""

    name: str
    filename: str
    description: str
199
```
200
201
## Usage Examples
202
203
### Basic Fingerprint Generation
204
205
```python
206
import asyncio
207
from crawlee.fingerprint_suite import DefaultFingerprintGenerator
208
209
async def main():
    """Generate one fingerprint, print its main fields, then print the derived headers."""
    # Create fingerprint generator
    generator = DefaultFingerprintGenerator(
        browser_name='chrome',
        device_category='desktop',
        operating_system='windows'
    )

    # Generate fingerprint
    fingerprint = await generator.generate_fingerprint()

    print(f"User Agent: {fingerprint.user_agent}")
    print(f"Viewport: {fingerprint.viewport.width}x{fingerprint.viewport.height}")
    print(f"Screen: {fingerprint.screen.width}x{fingerprint.screen.height}")
    print(f"Platform: {fingerprint.platform}")
    print(f"Languages: {fingerprint.languages}")
    print(f"Timezone: {fingerprint.timezone}")

    # Generate headers from fingerprint
    headers = generator.get_headers(fingerprint)
    print(f"Generated headers: {headers.to_dict()}")
230
231
asyncio.run(main())
232
```
233
234
### Header Generation
235
236
```python
237
from crawlee.fingerprint_suite import HeaderGenerator, HeaderGeneratorOptions
238
239
# Create header generator
240
generator = HeaderGenerator(
    browser_name='chrome',
    operating_system='macos',
    locale='en-US'
)

# Generate headers for specific URL
headers = generator.get_headers(
    url='https://example.com/api/data',
    method='GET',
    referer='https://example.com',
    accept='application/json'
)

print("Generated headers:")
for key, value in headers.items():
    print(f" {key}: {value}")

# Custom header options
options = HeaderGeneratorOptions(
    accept='text/html,application/xhtml+xml',
    accept_language='en-US,en;q=0.9',
    cache_control='max-age=0',
    sec_fetch_dest='document',
    sec_fetch_mode='navigate'
)

# NOTE(review): unpacking ``__dict__`` assumes the options object stores its
# fields as plain instance attributes named exactly like the constructor
# keywords - confirm against the HeaderGeneratorOptions implementation.
headers = generator.get_headers(
    url='https://example.com',
    method='GET',
    **options.__dict__
)
272
```
273
274
### Using with HTTP Crawler
275
276
```python
277
import asyncio
278
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
279
from crawlee.fingerprint_suite import DefaultFingerprintGenerator
280
281
async def main():
    """Crawl two URLs and store, per page, the fingerprint generated inside the handler."""
    # Create fingerprint generator
    fingerprint_generator = DefaultFingerprintGenerator(
        browser_name='chrome',
        device_category='mobile',
        operating_system='android'
    )

    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        # Generate new fingerprint for each request
        fingerprint = await fingerprint_generator.generate_fingerprint()

        # Build the headers that match this fingerprint.
        # NOTE: the handler already has a response (see ``context.response``
        # below), and this snippet never attaches ``headers`` to a request;
        # they are built here for illustration only.
        headers = fingerprint_generator.get_headers(fingerprint)

        # Log fingerprint info
        context.log.info(f"Using fingerprint: {fingerprint.user_agent}")
        context.log.info(f"Screen: {fingerprint.screen.width}x{fingerprint.screen.height}")

        # Process response
        data = {
            'url': context.request.url,
            'user_agent': fingerprint.user_agent,
            'screen_size': f"{fingerprint.screen.width}x{fingerprint.screen.height}",
            'status': context.response.status_code
        }

        await context.push_data(data)

    await crawler.run(['https://httpbin.org/user-agent', 'https://httpbin.org/headers'])
314
315
asyncio.run(main())
316
```
317
318
### Session-Specific Fingerprints
319
320
```python
321
import asyncio
322
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
323
from crawlee.fingerprint_suite import DefaultFingerprintGenerator
324
from crawlee.sessions import SessionPool
325
326
async def main():
    """Crawl with a session pool, generating one fingerprint per session ID and reusing it."""
    # Create fingerprint generator
    generator = DefaultFingerprintGenerator()

    # Create session pool
    session_pool = SessionPool(max_pool_size=5)

    crawler = HttpCrawler(
        session_pool=session_pool,
        use_session_pool=True
    )

    # Store fingerprints per session
    session_fingerprints = {}

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        session_id = context.session.id

        # Generate fingerprint once per session
        if session_id not in session_fingerprints:
            fingerprint = await generator.generate_fingerprint()
            session_fingerprints[session_id] = fingerprint
            context.log.info(f"Generated new fingerprint for session {session_id}")

        fingerprint = session_fingerprints[session_id]

        # Build headers from the session's fingerprint.
        # NOTE: this snippet never attaches ``headers`` to a request; they are
        # built here for illustration only.
        headers = generator.get_headers(fingerprint)

        data = {
            'url': context.request.url,
            'session_id': session_id,
            'user_agent': fingerprint.user_agent,
            'consistent_fingerprint': True
        }

        await context.push_data(data)

    # Multiple requests will reuse fingerprints per session
    urls = ['https://httpbin.org/headers'] * 10
    await crawler.run(urls)
368
369
asyncio.run(main())
370
```
371
372
### Custom Fingerprint Generator
373
374
```python
375
import asyncio
376
import random
377
from crawlee.fingerprint_suite import FingerprintGenerator, HeaderGenerator
378
379
from typing import Any


class CustomFingerprintGenerator(FingerprintGenerator):
    """Custom fingerprint generator with specific characteristics."""

    def __init__(self):
        """Set up the header generator and the pool of user agents to pick from."""
        super().__init__()
        self.header_generator = HeaderGenerator()
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]

    async def generate_fingerprint(self, **options) -> dict[str, Any]:
        """Generate custom fingerprint with specific characteristics.

        Args:
            **options: Accepted for interface compatibility; not used here.

        Returns:
            Dictionary with ``user_agent``, ``viewport``, ``screen``,
            ``languages``, ``timezone``, ``platform``, ``cookie_enabled`` and
            ``do_not_track`` entries.
        """
        # Select random user agent
        user_agent = random.choice(self.user_agents)

        # Define screen resolutions
        screen_resolutions = [
            (1920, 1080),
            (1366, 768),
            (1440, 900),
            (1600, 900)
        ]

        screen_width, screen_height = random.choice(screen_resolutions)

        # Generate viewport (slightly smaller than screen)
        viewport_width = screen_width - random.randint(0, 100)
        viewport_height = screen_height - random.randint(100, 200)

        fingerprint = {
            'user_agent': user_agent,
            'viewport': {
                'width': viewport_width,
                'height': viewport_height
            },
            'screen': {
                'width': screen_width,
                'height': screen_height,
                'color_depth': 24,
                'pixel_depth': 24
            },
            'languages': ['en-US', 'en'],
            'timezone': random.choice(['America/New_York', 'Europe/London', 'America/Los_Angeles']),
            'platform': self._get_platform_from_ua(user_agent),
            'cookie_enabled': True,
            'do_not_track': random.choice([None, False])
        }

        return fingerprint

    def get_headers(self, fingerprint: dict[str, Any]) -> dict[str, str]:
        """Generate headers from fingerprint.

        Args:
            fingerprint: Dictionary with at least ``user_agent`` and
                ``languages``; ``do_not_track`` is optional.

        Returns:
            Header name to header value mapping.
        """
        headers = {
            'User-Agent': fingerprint['user_agent'],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': ','.join(fingerprint['languages']) + ';q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
        }

        # ``do_not_track`` is tri-state: None means no preference was set, in
        # which case no DNT header is sent at all rather than an explicit "0".
        do_not_track = fingerprint.get('do_not_track')
        if do_not_track is not None:
            headers['DNT'] = '1' if do_not_track else '0'

        headers.update({
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        })
        return headers

    def _get_platform_from_ua(self, user_agent: str) -> str:
        """Extract platform from user agent."""
        if 'Windows' in user_agent:
            return 'Win32'
        elif 'Macintosh' in user_agent:
            return 'MacIntel'
        elif 'Linux' in user_agent:
            return 'Linux x86_64'
        else:
            return 'Unknown'
458
459
async def main():
    """Generate three fingerprints with the custom generator and print a summary of each."""
    # Use custom fingerprint generator
    generator = CustomFingerprintGenerator()

    # Generate multiple fingerprints (numbered from 1 for display)
    for number in range(1, 4):
        fingerprint = await generator.generate_fingerprint()
        headers = generator.get_headers(fingerprint)

        print(f"\nFingerprint {number}:")
        print(f" User-Agent: {fingerprint['user_agent']}")
        print(f" Screen: {fingerprint['screen']['width']}x{fingerprint['screen']['height']}")
        print(f" Viewport: {fingerprint['viewport']['width']}x{fingerprint['viewport']['height']}")
        print(f" Platform: {fingerprint['platform']}")
        print(f" Timezone: {fingerprint['timezone']}")
        print(f" Accept-Language: {headers.get('Accept-Language', 'N/A')}")
475
476
asyncio.run(main())
477
```
478
479
### Integration with Playwright
480
481
```python
482
import asyncio
483
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
484
from crawlee.fingerprint_suite import DefaultFingerprintGenerator
485
486
def js_timezone_offset_minutes(timezone: str) -> int:
    """Return the value JavaScript's ``Date.prototype.getTimezoneOffset()`` gives for *timezone*.

    JavaScript reports minutes *behind* UTC, so the sign is the opposite of the
    zone's UTC offset (for example UTC+05:30 yields -330).

    Args:
        timezone: IANA timezone name such as ``'Europe/London'``.

    Returns:
        Offset in minutes, evaluated at the current moment.
    """
    # Imported locally so the snippet does not rely on the example's import block.
    from datetime import datetime
    from zoneinfo import ZoneInfo

    utc_offset = datetime.now(ZoneInfo(timezone)).utcoffset()
    return -int(utc_offset.total_seconds() // 60)


async def main():
    """Crawl one page with Playwright after overriding page properties to match a fingerprint."""
    import json

    generator = DefaultFingerprintGenerator()

    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext):
        page = context.page

        # Generate fingerprint
        fingerprint = await generator.generate_fingerprint()

        # Apply fingerprint to browser page
        await page.set_viewport_size({
            'width': fingerprint.viewport.width,
            'height': fingerprint.viewport.height
        })

        # json.dumps yields valid, correctly quoted JavaScript literals, unlike
        # interpolating Python reprs directly into the script.
        languages_js = json.dumps(fingerprint.languages)
        platform_js = json.dumps(fingerprint.platform)
        cookie_enabled_js = json.dumps(fingerprint.cookie_enabled)

        # Derive the offset from the fingerprint's own timezone so the page's
        # clock agrees with the rest of the fingerprint. The value is fixed at
        # script-injection time.
        timezone_offset = js_timezone_offset_minutes(fingerprint.timezone)

        # Override JavaScript properties to match fingerprint
        await page.add_init_script(f"""
            // Override screen properties
            Object.defineProperty(screen, 'width', {{ get: () => {fingerprint.screen.width} }});
            Object.defineProperty(screen, 'height', {{ get: () => {fingerprint.screen.height} }});
            Object.defineProperty(screen, 'availWidth', {{ get: () => {fingerprint.screen.available_width} }});
            Object.defineProperty(screen, 'availHeight', {{ get: () => {fingerprint.screen.available_height} }});
            Object.defineProperty(screen, 'colorDepth', {{ get: () => {fingerprint.screen.color_depth} }});

            // Override navigator properties
            Object.defineProperty(navigator, 'languages', {{ get: () => {languages_js} }});
            Object.defineProperty(navigator, 'platform', {{ get: () => {platform_js} }});
            Object.defineProperty(navigator, 'cookieEnabled', {{ get: () => {cookie_enabled_js} }});

            // Override timezone
            Date.prototype.getTimezoneOffset = function() {{
                return {timezone_offset};
            }};
        """)

        # Navigate with fingerprint applied
        await page.goto(context.request.url)

        # Extract data
        data = {
            'url': context.request.url,
            'title': await page.title(),
            'fingerprint_applied': True,
            'viewport': f"{fingerprint.viewport.width}x{fingerprint.viewport.height}",
            'user_agent': fingerprint.user_agent
        }

        await context.push_data(data)

    await crawler.run(['https://httpbin.org/headers'])
539
540
asyncio.run(main())
541
```