# Configuration

Global configuration management and request routing systems for fine-tuned control over crawling behavior. Configuration components provide centralized settings management, environment variable integration, proxy management, and request routing capabilities.

## Capabilities

### Global Configuration

Centralized configuration system with environment variable support and default value management.

```python { .api }
class Configuration:
    def __init__(self, **settings): ...

    def get(self, key: str, default: any = None) -> any:
        """Get configuration value with optional default."""

    def set(self, key: str, value: any) -> None:
        """Set configuration value."""

    def get_bool(self, key: str, default: bool = False) -> bool:
        """Get boolean configuration value."""

    def get_int(self, key: str, default: int = 0) -> int:
        """Get integer configuration value."""

    def get_float(self, key: str, default: float = 0.0) -> float:
        """Get float configuration value."""

    @property
    def storage_dir(self) -> str:
        """Default storage directory path."""

    @property
    def max_pool_size(self) -> int:
        """Default maximum pool size."""

    @property
    def request_handler_timeout(self) -> int:
        """Default request handler timeout in seconds."""
```

### Request Router

Request routing system for directing requests to appropriate handlers based on labels and patterns.

```python { .api }
class Router:
    def __init__(self): ...

    def default_handler(self, handler: RequestHandler) -> RequestHandler:
        """
        Register default request handler.

        Args:
            handler: Handler function for requests

        Returns:
            The registered handler
        """

    def route(
        self,
        label: str,
        handler: RequestHandler,
        *,
        method: HttpMethod | None = None
    ) -> RequestHandler:
        """
        Register handler for specific request label.

        Args:
            label: Request label to match
            handler: Handler function
            method: Optional HTTP method filter

        Returns:
            The registered handler
        """

    def error_handler(self, handler: ErrorRequestHandler) -> ErrorRequestHandler:
        """
        Register error handler for failed requests.

        Args:
            handler: Error handler function

        Returns:
            The registered handler
        """

    def get_handler(self, request: Request) -> RequestHandler | None:
        """Get appropriate handler for request."""

    def get_error_handler(self) -> ErrorRequestHandler | None:
        """Get registered error handler."""
```

### Proxy Configuration

Proxy server configuration and rotation system for enhanced anonymity and geo-targeting.

```python { .api }
class ProxyConfiguration:
    def __init__(
        self,
        proxy_urls: list[str] | None = None,
        *,
        new_url_function: Callable[[], str] | None = None,
        country_code: str | None = None,
        session_id: str | None = None
    ): ...

    async def new_proxy_info(
        self,
        *,
        session_id: str | None = None,
        request: Request | None = None
    ) -> ProxyInfo | None:
        """
        Get new proxy for request.

        Args:
            session_id: Session identifier for proxy affinity
            request: Request being processed

        Returns:
            ProxyInfo object or None if no proxy needed
        """

    def new_url(self) -> str | None:
        """Generate new proxy URL using configured strategy."""

    @property
    def proxy_urls(self) -> list[str]: ...

    @property
    def country_code(self) -> str | None: ...
```

```python { .api }
class ProxyInfo:
    def __init__(
        self,
        *,
        url: str,
        hostname: str | None = None,
        port: int | None = None,
        username: str | None = None,
        password: str | None = None,
        country_code: str | None = None,
        session_id: str | None = None
    ): ...

    @property
    def url(self) -> str:
        """Full proxy URL."""

    @property
    def hostname(self) -> str | None:
        """Proxy hostname."""

    @property
    def port(self) -> int | None:
        """Proxy port number."""

    @property
    def username(self) -> str | None:
        """Proxy authentication username."""

    @property
    def password(self) -> str | None:
        """Proxy authentication password."""

    @property
    def country_code(self) -> str | None:
        """ISO country code for proxy location."""

    @property
    def session_id(self) -> str | None:
        """Session identifier for proxy affinity."""
```

## Handler Types

Type definitions for request handlers and error handlers used with the Router.

```python { .api }
RequestHandler = Callable[[BasicCrawlingContext], Awaitable[None]]

ErrorRequestHandler = Callable[
    [BasicCrawlingContext, Exception], Awaitable[None]
]
```

## Usage Examples

### Global Configuration

```python
from crawlee.configuration import Configuration
import os

# Create configuration with defaults
config = Configuration(
    storage_dir='./crawlee_storage',
    max_concurrent_requests=10,
    request_timeout=30
)

# Environment variables override defaults
# CRAWLEE_STORAGE_DIR, CRAWLEE_MAX_CONCURRENT_REQUESTS, etc.
os.environ['CRAWLEE_STORAGE_DIR'] = '/tmp/my_crawls'

# Get configuration values
storage_dir = config.get('storage_dir')
print(f"Storage directory: {storage_dir}")  # /tmp/my_crawls

# Type-specific getters
max_requests = config.get_int('max_concurrent_requests', 5)
enable_logging = config.get_bool('enable_logging', True)

# Set values programmatically
config.set('user_agent', 'My Custom Bot 1.0')
```

### Request Routing

```python
import asyncio
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.router import Router

async def main():
    crawler = BeautifulSoupCrawler()

    # Use the crawler's built-in router
    router = crawler.router

    # Default handler for unlabeled requests
    @router.default_handler
    async def default_handler(context: BeautifulSoupCrawlingContext):
        context.log.info(f"Processing default: {context.request.url}")

        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'type': 'default'
        }

        await context.push_data(data)

    # Handler for product pages
    @router.route('product')
    async def product_handler(context: BeautifulSoupCrawlingContext):
        context.log.info(f"Processing product: {context.request.url}")

        # Extract product-specific data
        name = context.soup.select_one('.product-name')
        price = context.soup.select_one('.price')

        data = {
            'url': context.request.url,
            'name': name.get_text().strip() if name else None,
            'price': price.get_text().strip() if price else None,
            'type': 'product'
        }

        await context.push_data(data)

        # Enqueue related products
        await context.enqueue_links(
            selector='.related-product a',
            label='product'
        )

    # Handler for category pages
    @router.route('category')
    async def category_handler(context: BeautifulSoupCrawlingContext):
        context.log.info(f"Processing category: {context.request.url}")

        # Extract category info
        category_name = context.soup.select_one('h1')

        data = {
            'url': context.request.url,
            'category': category_name.get_text().strip() if category_name else None,
            'type': 'category'
        }

        await context.push_data(data)

        # Enqueue product links with product label
        await context.enqueue_links(
            selector='.product-link',
            label='product'
        )

    # Error handler for all failed requests
    @router.error_handler
    async def error_handler(context: BeautifulSoupCrawlingContext, error: Exception):
        context.log.error(f"Error processing {context.request.url}: {error}")

        # Log error details
        await context.push_data({
            'url': context.request.url,
            'error': str(error),
            'type': 'error'
        })

    # Start crawling with labeled requests
    from crawlee import Request

    requests = [
        Request('https://store.example.com/', label='category'),
        Request('https://store.example.com/products/item1', label='product'),
        'https://store.example.com/about',  # No label = default handler
    ]

    await crawler.run(requests)

asyncio.run(main())
```

### Proxy Configuration

```python
import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration

async def main():
    # Configure proxy rotation
    proxy_config = ProxyConfiguration([
        'http://proxy1:8080',
        'http://user:pass@proxy2:8080',
        'http://proxy3:8080',
        'socks5://socks-proxy:1080'
    ])

    # Create crawler with proxy configuration
    crawler = HttpCrawler(
        proxy_configuration=proxy_config,
        max_requests_per_crawl=20
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        # Each request may use different proxy
        proxy_info = context.proxy_info
        if proxy_info:
            context.log.info(f"Using proxy: {proxy_info.hostname}:{proxy_info.port}")
            if proxy_info.country_code:
                context.log.info(f"Proxy country: {proxy_info.country_code}")

        data = {
            'url': context.request.url,
            'status': context.response.status_code,
            'proxy_used': proxy_info.url if proxy_info else None
        }

        await context.push_data(data)

    await crawler.run(['https://httpbin.org/ip'] * 10)

asyncio.run(main())
```

### Custom Proxy Function

```python
import asyncio
import random
from crawlee.proxy_configuration import ProxyConfiguration

def generate_proxy_url() -> str:
    """Custom function to generate proxy URLs dynamically."""
    proxy_providers = [
        'proxy-pool-1.example.com:8080',
        'proxy-pool-2.example.com:8080',
        'proxy-pool-3.example.com:8080'
    ]

    selected = random.choice(proxy_providers)
    return f"http://user:pass@{selected}"

async def main():
    # Use custom proxy generation function
    proxy_config = ProxyConfiguration(
        new_url_function=generate_proxy_url
    )

    # Test proxy generation
    for i in range(5):
        proxy_info = await proxy_config.new_proxy_info()
        print(f"Generated proxy {i+1}: {proxy_info.url}")

asyncio.run(main())
```

### Environment-Based Configuration

```python
import os
from crawlee.configuration import Configuration

# Set environment variables
os.environ.update({
    'CRAWLEE_STORAGE_DIR': './data',
    'CRAWLEE_MAX_CONCURRENT_REQUESTS': '20',
    'CRAWLEE_REQUEST_TIMEOUT': '60',
    'CRAWLEE_ENABLE_PROXY': 'true',
    'CRAWLEE_LOG_LEVEL': 'DEBUG'
})

# Configuration automatically reads from environment
config = Configuration()

print(f"Storage dir: {config.storage_dir}")
print(f"Max concurrent: {config.get_int('max_concurrent_requests')}")
print(f"Request timeout: {config.get_int('request_timeout')}")
print(f"Proxy enabled: {config.get_bool('enable_proxy')}")
print(f"Log level: {config.get('log_level')}")

# Override with custom values
config.set('custom_setting', 'my_value')
print(f"Custom setting: {config.get('custom_setting')}")
```