0
# Configuration and Proxies
1
2
Configuration and proxy management provide global settings control and distributed crawling capabilities with proxy rotation, authentication, and fault tolerance.
3
4
## Capabilities
5
6
### Configuration
7
8
Global configuration management for Crawlee settings and behavior control.
9
10
```typescript { .api }
11
/**
 * Global configuration management for Crawlee.
 *
 * Stores configuration values by string key and exposes accessors for the
 * storage client, the event manager, and memory/system information.
 */
class Configuration {
  /** Get the global configuration instance. */
  static getGlobalConfig(): Configuration;

  /**
   * Get a configuration value.
   * @param key - Name of the configuration option to read.
   * @returns The stored value, typed as `T` (not validated; defaults to `any`).
   */
  get<T = any>(key: string): T;

  /**
   * Set a configuration value.
   * @param key - Name of the configuration option to write.
   * @param value - New value for the option.
   */
  set(key: string, value: any): void;

  /** Reset configuration to defaults. */
  reset(): void;

  /** Get all configuration values as a key/value dictionary. */
  getAll(): Dictionary<any>;

  /** Initialize configuration from environment. */
  buildConfigFromEnv(): void;

  /** Get the storage client. */
  getStorageClient(): StorageClient;

  /** Get the event manager instance. */
  getEventManager(): EventManager;

  /** Get memory information. */
  getMemoryInfo(): MemoryInfo;

  /** Get system status information. */
  getSystemInfo(): SystemInfo;
}
45
```
46
47
### Configuration Keys
48
49
Common configuration options available through the Configuration class.
50
51
```typescript { .api }
52
/** Common configuration options and the value type each one holds. */
interface ConfigurationKeys {
  /** Default dataset ID */
  defaultDatasetId: string;

  /** Default key-value store ID */
  defaultKeyValueStoreId: string;

  /** Default request queue ID */
  defaultRequestQueueId: string;

  /** Local storage directory */
  localDataDirectory: string;

  /** Whether to purge local data on startup */
  purgeOnStart: boolean;

  /** Maximum memory usage in MB */
  memoryMbytes: number;

  /** Available memory ratio threshold */
  availableMemoryRatio: number;

  /** Maximum old space size for Node.js */
  maxOldSpaceSize: number;

  /** Log level */
  logLevel: 'DEBUG' | 'INFO' | 'WARNING' | 'ERROR' | 'OFF';

  /** Whether to use headless browser mode */
  headless: boolean;

  /** Chrome executable path (optional) */
  chromeExecutablePath?: string;

  /** Default browser viewport */
  defaultBrowserViewport: { width: number; height: number };

  /** System monitoring interval (milliseconds, per the `Millis` suffix) */
  systemInfoIntervalMillis: number;

  /** Input charset */
  inputCharset: string;

  /** Default user agent */
  defaultUserAgent: string;

  /** HTTP timeout (seconds, per the `Secs` suffix) */
  defaultRequestTimeoutSecs: number;

  /** Maximum HTTP redirects */
  maxRequestRedirects: number;

  /** Whether to persist storage state */
  persistStorage: boolean;
}
107
```
108
109
**Usage Examples:**
110
111
```typescript
112
import { Configuration, CheerioCrawler } from "crawlee";

// Get global configuration
const config = Configuration.getGlobalConfig();

// Configure storage settings
config.set('defaultDatasetId', 'my-crawl-results');
config.set('localDataDirectory', './crawlee_storage');
config.set('purgeOnStart', false);

// Configure memory limits
config.set('memoryMbytes', 4096);
config.set('availableMemoryRatio', 0.1);

// Configure browser settings
config.set('headless', true);
config.set('defaultBrowserViewport', { width: 1920, height: 1080 });

// Configure logging
config.set('logLevel', 'INFO');

// Configure HTTP settings
config.set('defaultRequestTimeoutSecs', 30);
config.set('maxRequestRedirects', 10);

// Use configuration in crawlers
const crawler = new CheerioCrawler({
  requestHandler: async ({ request }) => {
    // Pass the type argument: `get` defaults to `any`, which would leave
    // the arithmetic below unchecked.
    const memoryLimitMb = config.get<number>('memoryMbytes');
    const currentMemory = config.getMemoryInfo();

    // Warn once usage exceeds 90% of the configured limit (MB -> bytes).
    const warnThresholdBytes = memoryLimitMb * 1024 * 1024 * 0.9;
    if (currentMemory.usedBytes > warnThresholdBytes) {
      console.warn('Approaching memory limit');
    }

    // Process request...
  },
});

// Environment-based configuration
// NOTE(review): confirm whether this overrides the values set explicitly above.
config.buildConfigFromEnv();

// Check configuration values
console.log('Local data directory:', config.get('localDataDirectory'));
console.log('Log level:', config.get('logLevel'));
console.log('All config:', config.getAll());
158
```
159
160
### ProxyConfiguration
161
162
Proxy configuration management with support for multiple proxy sources and rotation.
163
164
```typescript { .api }
165
/**
 * Proxy configuration and management with rotation support.
 */
class ProxyConfiguration {
  /**
   * @param options - Optional proxy sources and rotation settings.
   */
  constructor(options?: ProxyConfigurationOptions);

  /** Initialize proxy configuration. */
  initialize(): Promise<void>;

  /**
   * Get a new proxy URL.
   * @param sessionId - Optional session identifier.
   * @returns A proxy URL, or `undefined`.
   */
  newUrl(sessionId?: number | string): Promise<string | undefined>;

  /**
   * Get new proxy information.
   * @param sessionId - Optional session identifier.
   * @returns A `ProxyInfo` object, or `undefined`.
   */
  newProxyInfo(sessionId?: number | string): Promise<ProxyInfo | undefined>;

  /** Get proxy statistics. */
  getProxyStats(): ProxyStats;

  /**
   * Mark a proxy as bad.
   * @param proxyInfo - The proxy to mark.
   * @param errorMessage - Optional description of the failure.
   */
  markProxyBad(proxyInfo: ProxyInfo, errorMessage?: string): void;

  /** Reset proxy statistics. */
  resetProxyStats(): void;
}
189
```
190
191
### ProxyConfigurationOptions
192
193
Configuration options for proxy management.
194
195
```typescript { .api }
196
/** Options accepted by the `ProxyConfiguration` constructor; all fields are optional. */
interface ProxyConfigurationOptions {
  /** Array of proxy URLs */
  proxyUrls?: string[];

  /** Function that returns proxy URLs; receives the optional session ID */
  newUrlFunction?: (sessionId?: number | string) => Promise<string | undefined>;

  /** Apify Proxy groups to use */
  groups?: string[];

  /** Apify Proxy country code */
  countryCode?: string;

  /** Custom password for Apify Proxy */
  password?: string;

  /** Session persistence time in seconds */
  sessionStickinessTimeSecs?: number;

  /** Apify Proxy options */
  apifyProxyOptions?: ApifyProxyOptions;

  /** Whether to rotate proxies */
  rotateProxies?: boolean;

  /** Proxy rotation strategy */
  rotationStrategy?: ProxyRotationStrategy;
}
224
225
/** Apify Proxy specific options; all fields are optional. */
interface ApifyProxyOptions {
  /** Apify proxy groups */
  groups?: string[];

  /** Country code for geo-targeting */
  countryCode?: string;

  /** Custom session ID format: maps a request to a session ID string */
  sessionIdFunction?: (request: Request) => string;

  /** Whether to use Apify Proxy */
  useApifyProxy?: boolean;

  /** Apify Proxy password */
  password?: string;
}
241
242
/** Available proxy rotation strategies; each member's value equals its name. */
enum ProxyRotationStrategy {
  ROUND_ROBIN = 'ROUND_ROBIN',
  RANDOM = 'RANDOM',
  SESSION_STICKY = 'SESSION_STICKY',
}
247
```
248
249
**Usage Examples:**
250
251
```typescript
252
import { ProxyConfiguration, ProxyRotationStrategy, PuppeteerCrawler } from "crawlee";

// Basic proxy configuration with static URLs
const proxyConfiguration = new ProxyConfiguration({
  proxyUrls: [
    'http://user:pass@proxy1.example.com:8000',
    'http://user:pass@proxy2.example.com:8000',
    'http://user:pass@proxy3.example.com:8000',
  ],
  rotationStrategy: ProxyRotationStrategy.ROUND_ROBIN,
});

// Initialize before use
await proxyConfiguration.initialize();

// Use with crawler
const crawler = new PuppeteerCrawler({
  proxyConfiguration,
  requestHandler: async ({ page, request, proxyInfo }) => {
    console.log(`Using proxy: ${proxyInfo?.url}`);

    try {
      await page.goto(request.url);
      // Process page...
    } catch (error) {
      // The caught value is `unknown` under strict settings; narrow it
      // before reading `.message`.
      const message = error instanceof Error ? error.message : String(error);

      // `proxyInfo` may be undefined, so only mark a proxy as bad when one
      // was actually used for this request.
      if (proxyInfo && message.includes('proxy')) {
        proxyConfiguration.markProxyBad(proxyInfo, message);
      }
      throw error;
    }
  },
});

// Custom proxy function
const dynamicProxyConfig = new ProxyConfiguration({
  newUrlFunction: async (sessionId) => {
    // Fetch proxy from external service
    const response = await fetch('https://proxy-service.com/get-proxy');
    if (!response.ok) {
      // Fail loudly instead of building a proxy URL from an error payload.
      throw new Error(`Proxy service responded with HTTP ${response.status}`);
    }
    const proxy = await response.json();
    return `http://${proxy.username}:${proxy.password}@${proxy.host}:${proxy.port}`;
  },
  sessionStickinessTimeSecs: 300, // 5 minutes
});

// Apify Proxy configuration
const apifyProxyConfig = new ProxyConfiguration({
  groups: ['RESIDENTIAL', 'DATACENTER'],
  countryCode: 'US',
  sessionStickinessTimeSecs: 600,
  apifyProxyOptions: {
    password: process.env.APIFY_PROXY_PASSWORD,
    // `userData` is optional on a request, hence the optional chaining.
    sessionIdFunction: (request) => `session_${request.userData?.category}`,
  },
});

// Monitor proxy performance
setInterval(() => {
  const stats = proxyConfiguration.getProxyStats();
  console.log(`Proxy stats: ${stats.successCount}/${stats.totalCount} successful`);
  console.log(`Bad proxies: ${stats.badProxyCount}`);
}, 30000);
314
```
315
316
### ProxyInfo
317
318
Information about a specific proxy instance.
319
320
```typescript { .api }
321
/** Information about a specific proxy instance. */
interface ProxyInfo {
  /** Full proxy URL */
  url: string;

  /** Proxy hostname */
  hostname: string;

  /** Proxy port number */
  port: number;

  /** Proxy protocol (http, https, socks5) */
  protocol: string;

  /** Authentication credentials (optional) */
  auth?: {
    username: string;
    password: string;
  };

  /** Session ID for this proxy */
  sessionId?: string | number;

  /** Additional proxy metadata */
  metadata?: Dictionary<any>;

  /** When this proxy was created */
  createdAt?: Date;

  /** Proxy geographic location */
  country?: string;

  /** Proxy provider information */
  provider?: string;
}
355
356
/** Aggregate and per-proxy usage statistics. */
interface ProxyStats {
  /** Total proxy requests made */
  totalCount: number;

  /** Successful proxy requests */
  successCount: number;

  /** Failed proxy requests */
  errorCount: number;

  /** Number of bad proxies marked */
  badProxyCount: number;

  /** Success rate ratio (0-1) */
  successRate: number;

  /** Average response time (unit not specified here) */
  averageResponseTime: number;

  /** Stats by proxy URL */
  proxyStats: Dictionary<{
    requests: number;
    successes: number;
    errors: number;
    lastUsed: Date;
    averageResponseTime: number;
  }>;
}
384
```
385
386
### Event Management
387
388
Event system for monitoring and reacting to crawler and configuration events.
389
390
```typescript { .api }
391
/**
 * Event manager for handling system and crawler events.
 */
class EventManager {
  /**
   * Register an event listener.
   * @param eventName - Name of the event to listen for.
   * @param listener - Callback invoked with the emitted arguments.
   */
  on(eventName: string, listener: EventListener): void;

  /**
   * Register a one-time event listener.
   * @param eventName - Name of the event to listen for.
   * @param listener - Callback invoked with the emitted arguments.
   */
  once(eventName: string, listener: EventListener): void;

  /**
   * Remove an event listener.
   * @param eventName - Name of the event the listener was registered for.
   * @param listener - The listener to remove.
   */
  off(eventName: string, listener: EventListener): void;

  /**
   * Emit an event.
   * @param eventName - Name of the event to emit.
   * @param args - Arguments forwarded to the listeners.
   */
  emit(eventName: string, ...args: any[]): void;

  /**
   * Get the number of listeners registered for an event.
   * @param eventName - Name of the event to inspect.
   */
  listenerCount(eventName: string): number;

  /**
   * Remove all listeners for an event.
   * @param eventName - Event to clear. NOTE(review): presumably clears every
   *   event when omitted; confirm against the implementation.
   */
  removeAllListeners(eventName?: string): void;
}
413
414
/** Callback signature for event listeners; may be synchronous or return a promise. */
type EventListener = (...eventArgs: any[]) => Promise<void> | void;
415
```
416
417
**Usage Examples:**
418
419
```typescript
420
import { Configuration, CheerioCrawler, sleep } from "crawlee";

const config = Configuration.getGlobalConfig();
const eventManager = config.getEventManager();

// Listen for system events
eventManager.on('memoryWarning', (memoryInfo) => {
  console.warn('Memory usage high:', memoryInfo.ratio);
  // Implement memory pressure handling
});

eventManager.on('proxyError', (proxyInfo, error) => {
  console.error(`Proxy ${proxyInfo.url} failed:`, error.message);
  // Log proxy failures for analysis
});

eventManager.on('sessionRetired', (session) => {
  console.log(`Session ${session.id} was retired`);
  // Track session lifecycle
});

// Emit custom events
const crawler = new CheerioCrawler({
  requestHandler: async ({ request, response }) => {
    if (response.statusCode === 429) {
      eventManager.emit('rateLimitHit', {
        url: request.url,
        retryAfter: response.headers['retry-after'],
      });
    }

    // Process request...
  },
});

// Fallback delay used when the `retry-after` header is missing or not a
// plain number of seconds (it may also be an HTTP date).
const DEFAULT_BACKOFF_SECS = 1;

// React to custom events
eventManager.on('rateLimitHit', async ({ url, retryAfter }) => {
  // Parse with an explicit radix and guard against NaN so we never call
  // sleep(NaN).
  const parsedSecs = Number.parseInt(String(retryAfter), 10);
  const backoffSecs =
    Number.isFinite(parsedSecs) && parsedSecs >= 0 ? parsedSecs : DEFAULT_BACKOFF_SECS;

  console.log(`Rate limit hit on ${url}, backing off for ${backoffSecs}s`);
  // Implement backoff strategy
  // NOTE(review): `emit` returns void, so this delay presumably does not
  // pause the crawler itself; confirm before relying on it for throttling.
  await sleep(backoffSecs * 1000);
});
461
```
462
463
### Memory and System Monitoring
464
465
Built-in monitoring for system resources and crawler performance.
466
467
```typescript { .api }
468
/** Memory information covering system memory and the Node.js heap. */
interface MemoryInfo {
  /** Total system memory in bytes */
  totalBytes: number;

  /** Free memory in bytes */
  freeBytes: number;

  /** Used memory in bytes */
  usedBytes: number;

  /** Available memory in bytes */
  availableBytes: number;

  /** Memory usage ratio (0-1) */
  ratio: number;

  /** Node.js heap information */
  heapUsed: number;
  heapTotal: number;
  heapLimit: number;

  /** External memory usage */
  external: number;

  /** Memory usage by category */
  breakdown: {
    rss: number;
    heapUsed: number;
    heapTotal: number;
    external: number;
  };
}
500
501
/** System status information: CPU, memory, OS, and Node.js process. */
interface SystemInfo {
  /** CPU usage information */
  cpu: {
    usage: number;
    loadAverage: number[];
    cores: number;
  };

  /** Memory information */
  memory: MemoryInfo;

  /** Operating system information */
  os: {
    platform: string;
    arch: string;
    release: string;
    uptime: number;
  };

  /** Node.js process information */
  process: {
    pid: number;
    uptime: number;
    memoryUsage: NodeJS.MemoryUsage;
    cpuUsage: NodeJS.CpuUsage;
  };

  /** Timestamp of measurement */
  timestamp: Date;
}
531
```
532
533
**Usage Examples:**
534
535
```typescript
536
import { Configuration, CheerioCrawler } from "crawlee";

const config = Configuration.getGlobalConfig();

// Monitor system resources
setInterval(() => {
  const memInfo = config.getMemoryInfo();
  const sysInfo = config.getSystemInfo();

  console.log(`Memory usage: ${(memInfo.ratio * 100).toFixed(1)}%`);
  console.log(`CPU usage: ${(sysInfo.cpu.usage * 100).toFixed(1)}%`);
  console.log(`Heap used: ${(memInfo.heapUsed / 1024 / 1024).toFixed(0)}MB`);

  // Trigger cleanup if memory usage is high
  if (memInfo.ratio > 0.9) {
    console.warn('High memory usage, triggering garbage collection');
    // `global.gc` is only defined when Node.js is started with --expose-gc.
    if (global.gc) {
      global.gc();
    }
  }
}, 10000);

// Configure based on system capabilities: leave one core for the system,
// but always allow at least one concurrent request.
const cpuCores = config.getSystemInfo().cpu.cores;

// Use system monitoring in crawler
const crawler = new CheerioCrawler({
  requestHandler: async ({ request }) => {
    const memInfo = config.getMemoryInfo();

    // Adapt behavior based on memory usage
    if (memInfo.ratio > 0.8) {
      console.log('High memory usage, reducing processing');
      // Skip heavy processing or reduce data collection
      return;
    }

    // Normal processing...
  },

  maxConcurrency: Math.max(1, cpuCores - 1),
});

// Set the memory limit to 80% of total system memory (bytes -> GB -> MB)
const totalMemoryGB = config.getMemoryInfo().totalBytes / (1024 ** 3);
config.set('memoryMbytes', Math.floor(totalMemoryGB * 0.8 * 1024));
584
```
585
586
### Storage Client Integration
587
588
Configuration integration with storage clients for advanced storage operations.
589
590
```typescript { .api }
591
/** Entry point to the per-storage clients and to the client's own options. */
interface StorageClient {
  /** Dataset client for advanced dataset operations */
  datasets(): DatasetClient;

  /** Key-value store client */
  keyValueStores(): KeyValueStoreClient;

  /** Request queue client */
  requestQueues(): RequestQueueClient;

  /** Update client configuration */
  setOptions(options: StorageClientOptions): void;

  /** Get current configuration */
  getOptions(): StorageClientOptions;
}
607
608
/** Options for configuring a `StorageClient`; all fields are optional. */
interface StorageClientOptions {
  /** Storage API base URL */
  baseUrl?: string;

  /** Authentication token */
  token?: string;

  /** Request timeout in seconds */
  timeoutSecs?: number;

  /** Maximum retry attempts */
  maxRetries?: number;

  /** Local storage directory */
  localDataDirectory?: string;

  /** Whether to use cloud storage */
  cloudStorage?: boolean;
}
627
```
628
629
**Usage Examples:**
630
631
```typescript
632
import { Configuration } from "crawlee";

const config = Configuration.getGlobalConfig();
const storageClient = config.getStorageClient();

// Configure storage client
storageClient.setOptions({
  baseUrl: 'https://api.apify.com/v2',
  token: process.env.APIFY_TOKEN,
  timeoutSecs: 30,
  maxRetries: 3,
  cloudStorage: true,
});

// Use advanced dataset operations
const datasetClient = storageClient.datasets();

// Custom dataset operations
await datasetClient.pushData('my-dataset', [
  { url: 'example.com', title: 'Example' },
]);

const datasetInfo = await datasetClient.getDataset('my-dataset');
console.log(`Dataset has ${datasetInfo.itemCount} items`);

// Export data with advanced options
await datasetClient.exportDataset('my-dataset', {
  format: 'csv',
  fields: ['url', 'title'],
  clean: true,
});
663
```
664
665
## Types
666
667
```typescript { .api }
668
/** Plain object keyed by string whose values all have type `T`. */
interface Dictionary<T = any> {
  [key: string]: T;
}
671
672
/** Minimal event-emitter contract; registration methods return `this`. */
interface EventEmitter {
  on(event: string, listener: Function): this;
  once(event: string, listener: Function): this;
  emit(event: string, ...args: any[]): boolean;
  off(event: string, listener: Function): this;
  removeAllListeners(event?: string): this;
}
679
680
/** Shape of a crawl request; `UserData` types the optional `userData` payload. */
interface Request<UserData = Dictionary> {
  url: string;
  loadedUrl?: string;
  uniqueKey: string;
  userData?: UserData;
  label?: string;
  method?: string;
  headers?: Dictionary<string>;
  payload?: string;
}
690
691
/** Crawler options referenced in this document; all fields are optional. */
interface CrawlerOptions {
  proxyConfiguration?: ProxyConfiguration;
  sessionPoolOptions?: SessionPoolOptions;
  maxConcurrency?: number;
  maxRequestRetries?: number;
  requestTimeoutSecs?: number;
}
698
699
/**
 * Memory usage figures for a Node.js process.
 * NOTE(review): field names match Node's `process.memoryUsage()` result,
 * so values are presumably in bytes; confirm.
 */
interface NodeJSMemoryUsage {
  rss: number;
  heapTotal: number;
  heapUsed: number;
  external: number;
  arrayBuffers: number;
}
706
707
/**
 * CPU time figures for a Node.js process, split into user and system time.
 * NOTE(review): field names match Node's `process.cpuUsage()` result;
 * confirm the unit.
 */
interface NodeJSCpuUsage {
  user: number;
  system: number;
}
711
712
/** Supported log levels. */
type LogLevel =
  | 'DEBUG'
  | 'INFO'
  | 'WARNING'
  | 'ERROR'
  | 'OFF';
713
714
/** Browser viewport settings; only `width` and `height` are required. */
interface BrowserViewport {
  width: number;
  height: number;
  deviceScaleFactor?: number;
  isMobile?: boolean;
  hasTouch?: boolean;
  isLandscape?: boolean;
}
722
```