or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

browser-crawling.md, configuration-proxies.md, core-crawling.md, http-crawling.md, index.md, session-management.md, storage.md, utilities.md

docs/configuration-proxies.md

0

# Configuration and Proxies

1

2

Configuration and proxy management provide global settings control and distributed crawling capabilities with proxy rotation, authentication, and fault tolerance.

3

4

## Capabilities

5

6

### Configuration

7

8

Global configuration management for Crawlee settings and behavior control.

9

10

```typescript { .api }

11

/**

12

* Global configuration management for Crawlee

13

*/

14

class Configuration {

15

/** Get the global configuration instance */

16

static getGlobalConfig(): Configuration;

17

18

/** Get a configuration value */

19

get<T = any>(key: string): T;

20

21

/** Set a configuration value */

22

set(key: string, value: any): void;

23

24

/** Reset configuration to defaults */

25

reset(): void;

26

27

/** Get all configuration values */

28

getAll(): Dictionary<any>;

29

30

/** Initialize configuration from environment */

31

buildConfigFromEnv(): void;

32

33

/** Get the storage client instance */

34

getStorageClient(): StorageClient;

35

36

/** Get event manager instance */

37

getEventManager(): EventManager;

38

39

/** Get current memory usage information */

40

getMemoryInfo(): MemoryInfo;

41

42

/** Get current system status information */

43

getSystemInfo(): SystemInfo;

44

}

45

```

46

47

### Configuration Keys

48

49

Common configuration options available through the Configuration class.

50

51

```typescript { .api }

52

interface ConfigurationKeys {

53

/** Default dataset ID */

54

defaultDatasetId: string;

55

56

/** Default key-value store ID */

57

defaultKeyValueStoreId: string;

58

59

/** Default request queue ID */

60

defaultRequestQueueId: string;

61

62

/** Local storage directory */

63

localDataDirectory: string;

64

65

/** Whether to purge local data on startup */

66

purgeOnStart: boolean;

67

68

/** Maximum memory usage in MB */

69

memoryMbytes: number;

70

71

/** Available memory ratio threshold */

72

availableMemoryRatio: number;

73

74

/** Maximum old space size for Node.js */

75

maxOldSpaceSize: number;

76

77

/** Log level */

78

logLevel: 'DEBUG' | 'INFO' | 'WARNING' | 'ERROR' | 'OFF';

79

80

/** Whether to use headless browser mode */

81

headless: boolean;

82

83

/** Chrome executable path */

84

chromeExecutablePath?: string;

85

86

/** Default browser viewport */

87

defaultBrowserViewport: { width: number; height: number };

88

89

/** System monitoring interval in milliseconds */

90

systemInfoIntervalMillis: number;

91

92

/** Input charset */

93

inputCharset: string;

94

95

/** Default user agent */

96

defaultUserAgent: string;

97

98

/** Default HTTP request timeout in seconds */

99

defaultRequestTimeoutSecs: number;

100

101

/** Maximum HTTP redirects */

102

maxRequestRedirects: number;

103

104

/** Whether to persist storage state */

105

persistStorage: boolean;

106

}

107

```

108

109

**Usage Examples:**

110

111

```typescript

112

import { Configuration, CheerioCrawler } from "crawlee";

113

114

// Get global configuration

115

const config = Configuration.getGlobalConfig();

116

117

// Configure storage settings

118

config.set('defaultDatasetId', 'my-crawl-results');

119

config.set('localDataDirectory', './crawlee_storage');

120

config.set('purgeOnStart', false);

121

122

// Configure memory limits

123

config.set('memoryMbytes', 4096);

124

config.set('availableMemoryRatio', 0.1);

125

126

// Configure browser settings

127

config.set('headless', true);

128

config.set('defaultBrowserViewport', { width: 1920, height: 1080 });

129

130

// Configure logging

131

config.set('logLevel', 'INFO');

132

133

// Configure HTTP settings

134

config.set('defaultRequestTimeoutSecs', 30);

135

config.set('maxRequestRedirects', 10);

136

137

// Use configuration in crawlers

138

const crawler = new CheerioCrawler({

139

requestHandler: async ({ request }) => {

140

const memoryLimit = config.get('memoryMbytes');

141

const currentMemory = config.getMemoryInfo();

142

143

if (currentMemory.usedBytes > memoryLimit * 1024 * 1024 * 0.9) {

144

console.warn('Approaching memory limit');

145

}

146

147

// Process request...

148

},

149

});

150

151

// Environment-based configuration

152

config.buildConfigFromEnv();

153

154

// Check configuration values

155

console.log('Local data directory:', config.get('localDataDirectory'));

156

console.log('Log level:', config.get('logLevel'));

157

console.log('All config:', config.getAll());

158

```

159

160

### ProxyConfiguration

161

162

Proxy configuration management with support for multiple proxy sources and rotation.

163

164

```typescript { .api }

165

/**

166

* Proxy configuration and management with rotation support

167

*/

168

class ProxyConfiguration {

169

constructor(options?: ProxyConfigurationOptions);

170

171

/** Initialize proxy configuration */

172

initialize(): Promise<void>;

173

174

/** Get a new proxy URL */

175

newUrl(sessionId?: number | string): Promise<string | undefined>;

176

177

/** Get new proxy information */

178

newProxyInfo(sessionId?: number | string): Promise<ProxyInfo | undefined>;

179

180

/** Get proxy statistics */

181

getProxyStats(): ProxyStats;

182

183

/** Mark a proxy as bad */

184

markProxyBad(proxyInfo: ProxyInfo, errorMessage?: string): void;

185

186

/** Reset proxy statistics */

187

resetProxyStats(): void;

188

}

189

```

190

191

### ProxyConfigurationOptions

192

193

Configuration options for proxy management.

194

195

```typescript { .api }

196

interface ProxyConfigurationOptions {

197

/** Array of proxy URLs */

198

proxyUrls?: string[];

199

200

/** Function that returns proxy URLs */

201

newUrlFunction?: (sessionId?: number | string) => Promise<string | undefined>;

202

203

/** Apify Proxy groups to use */

204

groups?: string[];

205

206

/** Apify Proxy country code */

207

countryCode?: string;

208

209

/** Custom password for Apify Proxy */

210

password?: string;

211

212

/** Session persistence time in seconds */

213

sessionStickinessTimeSecs?: number;

214

215

/** Apify Proxy options */

216

apifyProxyOptions?: ApifyProxyOptions;

217

218

/** Whether to rotate proxies */

219

rotateProxies?: boolean;

220

221

/** Proxy rotation strategy */

222

rotationStrategy?: ProxyRotationStrategy;

223

}

224

225

interface ApifyProxyOptions {

226

/** Apify proxy groups */

227

groups?: string[];

228

229

/** Country code for geo-targeting */

230

countryCode?: string;

231

232

/** Function that derives a custom session ID from a request */

233

sessionIdFunction?: (request: Request) => string;

234

235

/** Whether to use Apify Proxy */

236

useApifyProxy?: boolean;

237

238

/** Apify Proxy password */

239

password?: string;

240

}

241

242

enum ProxyRotationStrategy {

243

ROUND_ROBIN = 'ROUND_ROBIN',

244

RANDOM = 'RANDOM',

245

SESSION_STICKY = 'SESSION_STICKY',

246

}

247

```

248

249

**Usage Examples:**

250

251

```typescript

252

import { ProxyConfiguration, PuppeteerCrawler } from "crawlee";

253

254

// Basic proxy configuration with static URLs

255

const proxyConfiguration = new ProxyConfiguration({

256

proxyUrls: [

257

'http://user:pass@proxy1.example.com:8000',

258

'http://user:pass@proxy2.example.com:8000',

259

'http://user:pass@proxy3.example.com:8000',

260

],

261

rotationStrategy: ProxyRotationStrategy.ROUND_ROBIN,

262

});

263

264

// Initialize before use

265

await proxyConfiguration.initialize();

266

267

// Use with crawler

268

const crawler = new PuppeteerCrawler({

269

proxyConfiguration,

270

requestHandler: async ({ page, request, proxyInfo }) => {

271

console.log(`Using proxy: ${proxyInfo?.url}`);

272

273

try {

274

await page.goto(request.url);

275

// Process page...

276

} catch (error) {

277

if (error.message.includes('proxy')) {

278

// Mark proxy as bad

279

proxyConfiguration.markProxyBad(proxyInfo, error.message);

280

}

281

throw error;

282

}

283

},

284

});

285

286

// Custom proxy function

287

const dynamicProxyConfig = new ProxyConfiguration({

288

newUrlFunction: async (sessionId) => {

289

// Fetch proxy from external service

290

const response = await fetch('https://proxy-service.com/get-proxy');

291

const proxy = await response.json();

292

return `http://${proxy.username}:${proxy.password}@${proxy.host}:${proxy.port}`;

293

},

294

sessionStickinessTimeSecs: 300, // 5 minutes

295

});

296

297

// Apify Proxy configuration

298

const apifyProxyConfig = new ProxyConfiguration({

299

groups: ['RESIDENTIAL', 'DATACENTER'],

300

countryCode: 'US',

301

sessionStickinessTimeSecs: 600,

302

apifyProxyOptions: {

303

password: process.env.APIFY_PROXY_PASSWORD,

304

sessionIdFunction: (request) => `session_${request.userData.category}`,

305

},

306

});

307

308

// Monitor proxy performance

309

setInterval(() => {

310

const stats = proxyConfiguration.getProxyStats();

311

console.log(`Proxy stats: ${stats.successCount}/${stats.totalCount} successful`);

312

console.log(`Bad proxies: ${stats.badProxyCount}`);

313

}, 30000);

314

```

315

316

### ProxyInfo

317

318

Information about a specific proxy instance.

319

320

```typescript { .api }

321

interface ProxyInfo {

322

/** Full proxy URL */

323

url: string;

324

325

/** Proxy hostname */

326

hostname: string;

327

328

/** Proxy port number */

329

port: number;

330

331

/** Proxy protocol (http, https, socks5) */

332

protocol: string;

333

334

/** Authentication credentials */

335

auth?: {

336

username: string;

337

password: string;

338

};

339

340

/** Session ID for this proxy */

341

sessionId?: string | number;

342

343

/** Additional proxy metadata */

344

metadata?: Dictionary<any>;

345

346

/** When this proxy was created */

347

createdAt?: Date;

348

349

/** Proxy geographic location */

350

country?: string;

351

352

/** Proxy provider information */

353

provider?: string;

354

}

355

356

interface ProxyStats {

357

/** Total proxy requests made */

358

totalCount: number;

359

360

/** Successful proxy requests */

361

successCount: number;

362

363

/** Failed proxy requests */

364

errorCount: number;

365

366

/** Number of bad proxies marked */

367

badProxyCount: number;

368

369

/** Success rate ratio (0-1) */

370

successRate: number;

371

372

/** Average response time */

373

averageResponseTime: number;

374

375

/** Stats by proxy URL */

376

proxyStats: Dictionary<{

377

requests: number;

378

successes: number;

379

errors: number;

380

lastUsed: Date;

381

averageResponseTime: number;

382

}>;

383

}

384

```

385

386

### Event Management

387

388

Event system for monitoring and reacting to crawler and configuration events.

389

390

```typescript { .api }

391

/**

392

* Event manager for handling system and crawler events

393

*/

394

class EventManager {

395

/** Register an event listener */

396

on(eventName: string, listener: EventListener): void;

397

398

/** Register a one-time event listener */

399

once(eventName: string, listener: EventListener): void;

400

401

/** Remove an event listener */

402

off(eventName: string, listener: EventListener): void;

403

404

/** Emit an event */

405

emit(eventName: string, ...args: any[]): void;

406

407

/** Get the number of listeners registered for an event */

408

listenerCount(eventName: string): number;

409

410

/** Remove all listeners for an event */

411

removeAllListeners(eventName?: string): void;

412

}

413

414

type EventListener = (...args: any[]) => void | Promise<void>;

415

```

416

417

**Usage Examples:**

418

419

```typescript

420

import { Configuration, CheerioCrawler } from "crawlee";

421

422

const config = Configuration.getGlobalConfig();

423

const eventManager = config.getEventManager();

424

425

// Listen for system events

426

eventManager.on('memoryWarning', (memoryInfo) => {

427

console.warn('Memory usage high:', memoryInfo.ratio);

428

// Implement memory pressure handling

429

});

430

431

eventManager.on('proxyError', (proxyInfo, error) => {

432

console.error(`Proxy ${proxyInfo.url} failed:`, error.message);

433

// Log proxy failures for analysis

434

});

435

436

eventManager.on('sessionRetired', (session) => {

437

console.log(`Session ${session.id} was retired`);

438

// Track session lifecycle

439

});

440

441

// Emit custom events

442

const crawler = new CheerioCrawler({

443

requestHandler: async ({ request, response }) => {

444

if (response.statusCode === 429) {

445

eventManager.emit('rateLimitHit', {

446

url: request.url,

447

retryAfter: response.headers['retry-after'],

448

});

449

}

450

451

// Process request...

452

},

453

});

454

455

// React to custom events

456

eventManager.on('rateLimitHit', async ({ url, retryAfter }) => {

457

console.log(`Rate limit hit on ${url}, backing off for ${retryAfter}s`);

458

// Implement backoff strategy

459

await sleep(parseInt(retryAfter) * 1000);

460

});

461

```

462

463

### Memory and System Monitoring

464

465

Built-in monitoring for system resources and crawler performance.

466

467

```typescript { .api }

468

interface MemoryInfo {

469

/** Total system memory in bytes */

470

totalBytes: number;

471

472

/** Free memory in bytes */

473

freeBytes: number;

474

475

/** Used memory in bytes */

476

usedBytes: number;

477

478

/** Available memory in bytes */

479

availableBytes: number;

480

481

/** Memory usage ratio (0-1) */

482

ratio: number;

483

484

/** Node.js heap information */

485

heapUsed: number;

486

heapTotal: number;

487

heapLimit: number;

488

489

/** External memory usage */

490

external: number;

491

492

/** Memory usage by category */

493

breakdown: {

494

rss: number;

495

heapUsed: number;

496

heapTotal: number;

497

external: number;

498

};

499

}

500

501

interface SystemInfo {

502

/** CPU usage information */

503

cpu: {

504

usage: number;

505

loadAverage: number[];

506

cores: number;

507

};

508

509

/** Memory information */

510

memory: MemoryInfo;

511

512

/** Operating system information */

513

os: {

514

platform: string;

515

arch: string;

516

release: string;

517

uptime: number;

518

};

519

520

/** Node.js process information */

521

process: {

522

pid: number;

523

uptime: number;

524

memoryUsage: NodeJS.MemoryUsage;

525

cpuUsage: NodeJS.CpuUsage;

526

};

527

528

/** Timestamp of measurement */

529

timestamp: Date;

530

}

531

```

532

533

**Usage Examples:**

534

535

```typescript

536

import { Configuration, CheerioCrawler } from "crawlee";

537

538

const config = Configuration.getGlobalConfig();

539

540

// Monitor system resources

541

setInterval(() => {

542

const memInfo = config.getMemoryInfo();

543

const sysInfo = config.getSystemInfo();

544

545

console.log(`Memory usage: ${(memInfo.ratio * 100).toFixed(1)}%`);

546

console.log(`CPU usage: ${(sysInfo.cpu.usage * 100).toFixed(1)}%`);

547

console.log(`Heap used: ${(memInfo.heapUsed / 1024 / 1024).toFixed(0)}MB`);

548

549

// Trigger cleanup if memory usage is high

550

if (memInfo.ratio > 0.9) {

551

console.warn('High memory usage, triggering garbage collection');

552

if (global.gc) {

553

global.gc();

554

}

555

}

556

}, 10000);

557

558

// Use system monitoring in crawler

559

const crawler = new CheerioCrawler({

560

requestHandler: async ({ request }) => {

561

const memInfo = config.getMemoryInfo();

562

563

// Adapt behavior based on memory usage

564

if (memInfo.ratio > 0.8) {

565

console.log('High memory usage, reducing processing');

566

// Skip heavy processing or reduce data collection

567

return;

568

}

569

570

// Normal processing...

571

},

572

573

// Configure based on system capabilities

574

maxConcurrency: (() => {

575

const sysInfo = config.getSystemInfo();

576

const cores = sysInfo.cpu.cores;

577

return Math.max(1, cores - 1); // Leave one core for system

578

})(),

579

});

580

581

// Set memory thresholds based on total system memory

582

const totalMemoryGB = config.getMemoryInfo().totalBytes / (1024 ** 3);

583

config.set('memoryMbytes', Math.floor(totalMemoryGB * 0.8 * 1024)); // Use 80% of total system memory

584

```

585

586

### Storage Client Integration

587

588

Configuration integration with storage clients for advanced storage operations.

589

590

```typescript { .api }

591

interface StorageClient {

592

/** Dataset client for advanced dataset operations */

593

datasets(): DatasetClient;

594

595

/** Key-value store client */

596

keyValueStores(): KeyValueStoreClient;

597

598

/** Request queue client */

599

requestQueues(): RequestQueueClient;

600

601

/** Update client configuration */

602

setOptions(options: StorageClientOptions): void;

603

604

/** Get current configuration */

605

getOptions(): StorageClientOptions;

606

}

607

608

interface StorageClientOptions {

609

/** Storage API base URL */

610

baseUrl?: string;

611

612

/** Authentication token */

613

token?: string;

614

615

/** Request timeout in seconds */

616

timeoutSecs?: number;

617

618

/** Maximum retry attempts */

619

maxRetries?: number;

620

621

/** Local storage directory */

622

localDataDirectory?: string;

623

624

/** Whether to use cloud storage */

625

cloudStorage?: boolean;

626

}

627

```

628

629

**Usage Examples:**

630

631

```typescript

632

import { Configuration } from "crawlee";

633

634

const config = Configuration.getGlobalConfig();

635

const storageClient = config.getStorageClient();

636

637

// Configure storage client

638

storageClient.setOptions({

639

baseUrl: 'https://api.apify.com/v2',

640

token: process.env.APIFY_TOKEN,

641

timeoutSecs: 30,

642

maxRetries: 3,

643

cloudStorage: true,

644

});

645

646

// Use advanced dataset operations

647

const datasetClient = storageClient.datasets();

648

649

// Custom dataset operations

650

await datasetClient.pushData('my-dataset', [

651

{ url: 'example.com', title: 'Example' }

652

]);

653

654

const datasetInfo = await datasetClient.getDataset('my-dataset');

655

console.log(`Dataset has ${datasetInfo.itemCount} items`);

656

657

// Export data with advanced options

658

await datasetClient.exportDataset('my-dataset', {

659

format: 'csv',

660

fields: ['url', 'title'],

661

clean: true,

662

});

663

```

664

665

## Types

666

667

```typescript { .api }

668

interface Dictionary<T = any> {

669

[key: string]: T;

670

}

671

672

interface EventEmitter {

673

on(event: string, listener: Function): this;

674

once(event: string, listener: Function): this;

675

emit(event: string, ...args: any[]): boolean;

676

off(event: string, listener: Function): this;

677

removeAllListeners(event?: string): this;

678

}

679

680

interface Request<UserData = Dictionary> {

681

url: string;

682

loadedUrl?: string;

683

uniqueKey: string;

684

userData?: UserData;

685

label?: string;

686

method?: string;

687

headers?: Dictionary<string>;

688

payload?: string;

689

}

690

691

interface CrawlerOptions {

692

proxyConfiguration?: ProxyConfiguration;

693

sessionPoolOptions?: SessionPoolOptions;

694

maxConcurrency?: number;

695

maxRequestRetries?: number;

696

requestTimeoutSecs?: number;

697

}

698

699

interface NodeJSMemoryUsage {

700

rss: number;

701

heapTotal: number;

702

heapUsed: number;

703

external: number;

704

arrayBuffers: number;

705

}

706

707

interface NodeJSCpuUsage {

708

user: number;

709

system: number;

710

}

711

712

type LogLevel = 'DEBUG' | 'INFO' | 'WARNING' | 'ERROR' | 'OFF';

713

714

interface BrowserViewport {

715

width: number;

716

height: number;

717

deviceScaleFactor?: number;

718

isMobile?: boolean;

719

hasTouch?: boolean;

720

isLandscape?: boolean;

721

}

722

```