
# Configuration

Global configuration management and request routing systems for fine-tuned control over crawling behavior. Configuration components provide centralized settings management, environment variable integration, proxy management, and request routing capabilities.

## Capabilities

### Global Configuration

Centralized configuration system with environment variable support and default value management.

```python { .api }
class Configuration:
    def __init__(self, **settings): ...

    def get(self, key: str, default: any = None) -> any:
        """Get configuration value with optional default."""

    def set(self, key: str, value: any) -> None:
        """Set configuration value."""

    def get_bool(self, key: str, default: bool = False) -> bool:
        """Get boolean configuration value."""

    def get_int(self, key: str, default: int = 0) -> int:
        """Get integer configuration value."""

    def get_float(self, key: str, default: float = 0.0) -> float:
        """Get float configuration value."""

    @property
    def storage_dir(self) -> str:
        """Default storage directory path."""

    @property
    def max_pool_size(self) -> int:
        """Default maximum pool size."""

    @property
    def request_handler_timeout(self) -> int:
        """Default request handler timeout in seconds."""
```

41

42

### Request Router

43

44

Request routing system for directing requests to appropriate handlers based on labels and patterns.

45

46

```python { .api }

47

class Router:

48

def __init__(self): ...

49

50

def default_handler(self, handler: RequestHandler) -> RequestHandler:

51

"""

52

Register default request handler.

53

54

Args:

55

handler: Handler function for requests

56

57

Returns:

58

The registered handler

59

"""

60

61

def route(

62

self,

63

label: str,

64

handler: RequestHandler,

65

*,

66

method: HttpMethod | None = None

67

) -> RequestHandler:

68

"""

69

Register handler for specific request label.

70

71

Args:

72

label: Request label to match

73

handler: Handler function

74

method: Optional HTTP method filter

75

76

Returns:

77

The registered handler

78

"""

79

80

def error_handler(self, handler: ErrorRequestHandler) -> ErrorRequestHandler:

81

"""

82

Register error handler for failed requests.

83

84

Args:

85

handler: Error handler function

86

87

Returns:

88

The registered handler

89

"""

90

91

def get_handler(self, request: Request) -> RequestHandler | None:

92

"""Get appropriate handler for request."""

93

94

def get_error_handler(self) -> ErrorRequestHandler | None:

95

"""Get registered error handler."""

96

```

97

98

### Proxy Configuration

99

100

Proxy server configuration and rotation system for enhanced anonymity and geo-targeting.

101

102

```python { .api }

103

class ProxyConfiguration:

104

def __init__(

105

self,

106

proxy_urls: list[str] | None = None,

107

*,

108

new_url_function: Callable[[], str] | None = None,

109

country_code: str | None = None,

110

session_id: str | None = None

111

): ...

112

113

async def new_proxy_info(

114

self,

115

*,

116

session_id: str | None = None,

117

request: Request | None = None

118

) -> ProxyInfo | None:

119

"""

120

Get new proxy for request.

121

122

Args:

123

session_id: Session identifier for proxy affinity

124

request: Request being processed

125

126

Returns:

127

ProxyInfo object or None if no proxy needed

128

"""

129

130

def new_url(self) -> str | None:

131

"""Generate new proxy URL using configured strategy."""

132

133

@property

134

def proxy_urls(self) -> list[str]: ...

135

136

@property

137

def country_code(self) -> str | None: ...

138

```

139

140

```python { .api }

141

class ProxyInfo:

142

def __init__(

143

self,

144

*,

145

url: str,

146

hostname: str | None = None,

147

port: int | None = None,

148

username: str | None = None,

149

password: str | None = None,

150

country_code: str | None = None,

151

session_id: str | None = None

152

): ...

153

154

@property

155

def url(self) -> str:

156

"""Full proxy URL."""

157

158

@property

159

def hostname(self) -> str | None:

160

"""Proxy hostname."""

161

162

@property

163

def port(self) -> int | None:

164

"""Proxy port number."""

165

166

@property

167

def username(self) -> str | None:

168

"""Proxy authentication username."""

169

170

@property

171

def password(self) -> str | None:

172

"""Proxy authentication password."""

173

174

@property

175

def country_code(self) -> str | None:

176

"""ISO country code for proxy location."""

177

178

@property

179

def session_id(self) -> str | None:

180

"""Session identifier for proxy affinity."""

181

```

182

183

## Handler Types

184

185

Type definitions for request handlers and error handlers used with the Router.

186

187

```python { .api }

188

RequestHandler = Callable[[BasicCrawlingContext], Awaitable[None]]

189

190

ErrorRequestHandler = Callable[

191

[BasicCrawlingContext, Exception], Awaitable[None]

192

]

193

```

194

195

## Usage Examples

196

197

### Global Configuration

198

199

```python

200

from crawlee.configuration import Configuration

201

import os

202

203

# Create configuration with defaults

204

config = Configuration(

205

storage_dir='./crawlee_storage',

206

max_concurrent_requests=10,

207

request_timeout=30

208

)

209

210

# Environment variables override defaults

211

# CRAWLEE_STORAGE_DIR, CRAWLEE_MAX_CONCURRENT_REQUESTS, etc.

212

os.environ['CRAWLEE_STORAGE_DIR'] = '/tmp/my_crawls'

213

214

# Get configuration values

215

storage_dir = config.get('storage_dir')

216

print(f"Storage directory: {storage_dir}") # /tmp/my_crawls

217

218

# Type-specific getters

219

max_requests = config.get_int('max_concurrent_requests', 5)

220

enable_logging = config.get_bool('enable_logging', True)

221

222

# Set values programmatically

223

config.set('user_agent', 'My Custom Bot 1.0')

224

```

225

226

### Request Routing

227

228

```python

229

import asyncio

230

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

231

from crawlee.router import Router

232

233

async def main():

234

crawler = BeautifulSoupCrawler()

235

236

# Use the crawler's built-in router

237

router = crawler.router

238

239

# Default handler for unlabeled requests

240

@router.default_handler

241

async def default_handler(context: BeautifulSoupCrawlingContext):

242

context.log.info(f"Processing default: {context.request.url}")

243

244

data = {

245

'url': context.request.url,

246

'title': context.soup.title.string if context.soup.title else None,

247

'type': 'default'

248

}

249

250

await context.push_data(data)

251

252

# Handler for product pages

253

@router.route('product')

254

async def product_handler(context: BeautifulSoupCrawlingContext):

255

context.log.info(f"Processing product: {context.request.url}")

256

257

# Extract product-specific data

258

name = context.soup.select_one('.product-name')

259

price = context.soup.select_one('.price')

260

261

data = {

262

'url': context.request.url,

263

'name': name.get_text().strip() if name else None,

264

'price': price.get_text().strip() if price else None,

265

'type': 'product'

266

}

267

268

await context.push_data(data)

269

270

# Enqueue related products

271

await context.enqueue_links(

272

selector='.related-product a',

273

label='product'

274

)

275

276

# Handler for category pages

277

@router.route('category')

278

async def category_handler(context: BeautifulSoupCrawlingContext):

279

context.log.info(f"Processing category: {context.request.url}")

280

281

# Extract category info

282

category_name = context.soup.select_one('h1')

283

284

data = {

285

'url': context.request.url,

286

'category': category_name.get_text().strip() if category_name else None,

287

'type': 'category'

288

}

289

290

await context.push_data(data)

291

292

# Enqueue product links with product label

293

await context.enqueue_links(

294

selector='.product-link',

295

label='product'

296

)

297

298

# Error handler for all failed requests

299

@router.error_handler

300

async def error_handler(context: BeautifulSoupCrawlingContext, error: Exception):

301

context.log.error(f"Error processing {context.request.url}: {error}")

302

303

# Log error details

304

await context.push_data({

305

'url': context.request.url,

306

'error': str(error),

307

'type': 'error'

308

})

309

310

# Start crawling with labeled requests

311

from crawlee import Request

312

313

requests = [

314

Request('https://store.example.com/', label='category'),

315

Request('https://store.example.com/products/item1', label='product'),

316

'https://store.example.com/about', # No label = default handler

317

]

318

319

await crawler.run(requests)

320

321

asyncio.run(main())

322

```

323

324

### Proxy Configuration

325

326

```python

327

import asyncio

328

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

329

from crawlee.proxy_configuration import ProxyConfiguration

330

331

async def main():

332

# Configure proxy rotation

333

proxy_config = ProxyConfiguration([

334

'http://proxy1:8080',

335

'http://user:pass@proxy2:8080',

336

'http://proxy3:8080',

337

'socks5://socks-proxy:1080'

338

])

339

340

# Create crawler with proxy configuration

341

crawler = HttpCrawler(

342

proxy_configuration=proxy_config,

343

max_requests_per_crawl=20

344

)

345

346

@crawler.router.default_handler

347

async def handler(context: HttpCrawlingContext):

348

# Each request may use different proxy

349

proxy_info = context.proxy_info

350

if proxy_info:

351

context.log.info(f"Using proxy: {proxy_info.hostname}:{proxy_info.port}")

352

if proxy_info.country_code:

353

context.log.info(f"Proxy country: {proxy_info.country_code}")

354

355

data = {

356

'url': context.request.url,

357

'status': context.response.status_code,

358

'proxy_used': proxy_info.url if proxy_info else None

359

}

360

361

await context.push_data(data)

362

363

await crawler.run(['https://httpbin.org/ip'] * 10)

364

365

asyncio.run(main())

366

```

367

368

### Custom Proxy Function

369

370

```python

371

import asyncio

372

import random

373

from crawlee.proxy_configuration import ProxyConfiguration

374

375

def generate_proxy_url() -> str:

376

"""Custom function to generate proxy URLs dynamically."""

377

proxy_providers = [

378

'proxy-pool-1.example.com:8080',

379

'proxy-pool-2.example.com:8080',

380

'proxy-pool-3.example.com:8080'

381

]

382

383

selected = random.choice(proxy_providers)

384

return f"http://user:pass@{selected}"

385

386

async def main():

387

# Use custom proxy generation function

388

proxy_config = ProxyConfiguration(

389

new_url_function=generate_proxy_url

390

)

391

392

# Test proxy generation

393

for i in range(5):

394

proxy_info = await proxy_config.new_proxy_info()

395

print(f"Generated proxy {i+1}: {proxy_info.url}")

396

397

asyncio.run(main())

398

```

399

400

### Environment-Based Configuration

401

402

```python

403

import os

404

from crawlee.configuration import Configuration

405

406

# Set environment variables

407

os.environ.update({

408

'CRAWLEE_STORAGE_DIR': './data',

409

'CRAWLEE_MAX_CONCURRENT_REQUESTS': '20',

410

'CRAWLEE_REQUEST_TIMEOUT': '60',

411

'CRAWLEE_ENABLE_PROXY': 'true',

412

'CRAWLEE_LOG_LEVEL': 'DEBUG'

413

})

414

415

# Configuration automatically reads from environment

416

config = Configuration()

417

418

print(f"Storage dir: {config.storage_dir}")

419

print(f"Max concurrent: {config.get_int('max_concurrent_requests')}")

420

print(f"Request timeout: {config.get_int('request_timeout')}")

421

print(f"Proxy enabled: {config.get_bool('enable_proxy')}")

422

print(f"Log level: {config.get('log_level')}")

423

424

# Override with custom values

425

config.set('custom_setting', 'my_value')

426

print(f"Custom setting: {config.get('custom_setting')}")

427

```