or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

browser-crawling.md · configuration-proxies.md · core-crawling.md · http-crawling.md · index.md · session-management.md · storage.md · utilities.md

docs/core-crawling.md

0

# Core Crawling

1

2

Core crawling functionality provides the foundation classes and utilities that all other crawlers build upon. This includes the base `BasicCrawler` class, autoscaling capabilities, error handling, and request routing.

3

4

## Capabilities

5

6

### BasicCrawler

7

8

The foundation crawler class that all other crawlers extend. Provides core functionality for request management, autoscaling, session handling, and error recovery.

9

10

```typescript { .api }

11

/**

12

* Base crawler class that provides the foundation for all crawler implementations

13

* @template Context - The context type passed to request handlers

14

*/

15

class BasicCrawler<Context = BasicCrawlingContext> {

16

constructor(options: BasicCrawlerOptions<Context>);

17

18

/** Run the crawler until all requests are processed */

19

run(): Promise<FinalStatistics>;

20

21

/** Add requests to the crawler queue */

22

addRequests(requests: (string | RequestOptions)[]): Promise<void>;

23

24

/** Export data from the default dataset */

25

exportData<T>(options?: DatasetExportOptions): Promise<T[]>;

26

27

/** Get data from the default dataset */

28

getData<T>(options?: DatasetDataOptions): Promise<T[]>;

29

30

/** Set a value in the default key-value store */

31

setValue(key: string, value: any, options?: RecordOptions): Promise<void>;

32

33

/** Get a value from the default key-value store */

34

getValue<T>(key: string): Promise<T | null>;

35

36

/** Register event handlers */

37

use(handler: CrawlerAddons<Context>): void;

38

39

/** Get crawler statistics */

40

readonly stats: Statistics;

41

42

/** Get final statistics after crawler finishes */

43

readonly finalStatistics?: FinalStatistics;

44

}

45

```

46

47

**Usage Examples:**

48

49

```typescript

50

import { BasicCrawler } from "crawlee";

51

52

const crawler = new BasicCrawler({

53

requestHandler: async ({ request, log }) => {

54

log.info(`Processing ${request.url}`);

55

56

// Custom processing logic here

57

const response = await fetch(request.url);

58

const data = await response.text();

59

60

// Save processed data

61

await crawler.setValue(`page-${request.uniqueKey}`, data);

62

},

63

maxConcurrency: 10,

64

maxRequestRetries: 3,

65

});

66

67

// Add requests and run

68

await crawler.addRequests(['https://example.com']);

69

const stats = await crawler.run();

70

71

console.log(`Processed ${stats.requestsFinished} requests`);

72

```

73

74

### BasicCrawlerOptions

75

76

Configuration options for the BasicCrawler.

77

78

```typescript { .api }

79

interface BasicCrawlerOptions<Context = BasicCrawlingContext> {

80

/** List of requests to process */

81

requestList?: RequestList;

82

83

/** Queue of requests to process */

84

requestQueue?: RequestQueue;

85

86

/** Function to handle each request */

87

requestHandler: (context: Context) => Promise<void>;

88

89

/** Handler for failed requests that won't be retried */

90

failedRequestHandler?: (context: Context, error: Error) => Promise<void>;

91

92

/** Maximum number of retries per request */

93

maxRequestRetries?: number;

94

95

/** Maximum number of requests to process */

96

maxRequestsPerCrawl?: number;

97

98

/** Maximum number of concurrent requests */

99

maxConcurrency?: number;

100

101

/** Minimum number of concurrent requests */

102

minConcurrency?: number;

103

104

/** Options for the autoscaled pool */

105

autoscaledPoolOptions?: AutoscaledPoolOptions;

106

107

/** Options for the session pool */

108

sessionPoolOptions?: SessionPoolOptions;

109

110

/** Whether to use session pool for requests */

111

useSessionPool?: boolean;

112

113

/** Whether to persist cookies per session */

114

persistCookiesPerSession?: boolean;

115

116

/** Configuration for proxy usage */

117

proxyConfiguration?: ProxyConfiguration;

118

119

/** Whether to keep the crawler alive and waiting for more requests even when the queue is empty */

120

keepAlive?: boolean;

121

122

/** Custom statistics instance */

123

statistics?: Statistics;

124

125

/** Custom logger instance */

126

log?: Log;

127

128

/** Options for handling reclaimed requests */

129

reclaimRequestHandler?: (context: Context) => Promise<void>;

130

}

131

```

132

133

### BasicCrawlingContext

134

135

The context object passed to request handlers, containing request information and helper methods.

136

137

```typescript { .api }

138

interface BasicCrawlingContext<UserData = Dictionary> {

139

/** The current request being processed */

140

request: Request<UserData>;

141

142

/** Current session if session pool is used */

143

session?: Session;

144

145

/** Proxy information for the request */

146

proxyInfo?: ProxyInfo;

147

148

/** HTTP response object (when applicable) */

149

response?: IncomingMessage;

150

151

/** Reference to the crawler instance */

152

crawler: BasicCrawler;

153

154

/** Logger instance scoped to this request */

155

log: Log;

156

157

/** Send HTTP request with current session and proxy */

158

sendRequest<T>(overrideOptions?: Partial<OptionsInit>): Promise<T>;

159

160

/** Extract and enqueue links from current page */

161

enqueueLinks(options?: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;

162

163

/** Push data to the default dataset */

164

pushData(data: Dictionary | Dictionary[]): Promise<void>;

165

166

/** Store value in the default key-value store */

167

setValue(key: string, value: any, options?: RecordOptions): Promise<void>;

168

169

/** Get value from the default key-value store */

170

getValue<T>(key: string): Promise<T | null>;

171

}

172

```

173

174

### AutoscaledPool

175

176

Manages automatic scaling of concurrent tasks based on system resources and performance.

177

178

```typescript { .api }

179

/**

180

* Manages parallel asynchronous tasks with automatic scaling based on system resources

181

*/

182

class AutoscaledPool {

183

constructor(options: AutoscaledPoolOptions);

184

185

/** Start running the pool */

186

run(): Promise<void>;

187

188

/** Abort all running and pending tasks */

189

abort(): Promise<void>;

190

191

/** Pause the pool (finish running tasks but don't start new ones) */

192

pause(): Promise<void>;

193

194

/** Resume a paused pool */

195

resume(): Promise<void>;

196

197

/** Notify the pool about task completion or failure */

198

notify(): void;

199

200

/** Current concurrency level */

201

readonly currentConcurrency: number;

202

203

/** Desired concurrency level calculated by autoscaling */

204

readonly desiredConcurrency: number;

205

206

/** Minimum concurrency level */

207

readonly minConcurrency: number;

208

209

/** Maximum concurrency level */

210

readonly maxConcurrency: number;

211

}

212

```

213

214

### AutoscaledPoolOptions

215

216

Configuration options for the AutoscaledPool.

217

218

```typescript { .api }

219

interface AutoscaledPoolOptions {

220

/** Function that runs a single task */

221

runTaskFunction: () => Promise<void>;

222

223

/** Function that checks if more tasks are available */

224

isTaskReadyFunction?: () => Promise<boolean>;

225

226

/** Function that checks if all tasks are finished */

227

isFinishedFunction?: () => Promise<boolean>;

228

229

/** Minimum number of concurrent tasks */

230

minConcurrency?: number;

231

232

/** Maximum number of concurrent tasks */

233

maxConcurrency?: number;

234

235

/** Initial number of concurrent tasks */

236

desiredConcurrency?: number;

237

238

/** Fraction by which to increase desired concurrency when scaling up */

239

scaleUpStepRatio?: number;

240

241

/** Fraction by which to decrease desired concurrency when scaling down */

242

scaleDownStepRatio?: number;

243

244

/** How long to maintain high concurrency after scaling up */

245

maintainConcurrencyTimeoutSecs?: number;

246

247

/** How long to wait before scaling down */

248

tasksReadyTimeoutSecs?: number;

249

250

/** CPU usage threshold for scaling decisions */

251

targetCpuRatio?: number;

252

253

/** How often to log the autoscaling state (in seconds) */

254

loggingIntervalSecs?: number;

255

256

/** Custom logger instance */

257

log?: Log;

258

}

259

```

260

261

### Error Classes

262

263

Specialized error types for controlling crawler behavior and retry logic.

264

265

```typescript { .api }

266

/**

267

* Thrown when a request should not be retried

268

* Use this for permanent failures like 404 errors or validation failures

269

*/

270

class NonRetryableError extends Error {

271

constructor(message?: string);

272

}

273

274

/**

275

* Thrown when a critical error occurs that should stop the entire crawler

276

* Use this for fatal errors like invalid configuration or system failures

277

*/

278

class CriticalError extends Error {

279

constructor(message?: string);

280

}

281

282

/**

283

* Thrown when a request should be retried immediately

284

* Use this to force retries even if max retries would normally be exceeded

285

*/

286

class RetryRequestError extends Error {

287

constructor(message?: string);

288

}

289

290

/**

291

* Thrown when a session becomes invalid and should be rotated

292

* Use this when encountering IP blocks or session-related failures

293

*/

294

class SessionError extends Error {

295

constructor(message?: string);

296

}

297

298

/**

299

* Thrown when no route handler matches the request's label

300

* Internal use by the router system

301

*/

302

class MissingRouteError extends Error {

303

constructor(message?: string);

304

}

305

```

306

307

**Usage Examples:**

308

309

```typescript

310

import { BasicCrawler, NonRetryableError, SessionError } from "crawlee";

311

312

const crawler = new BasicCrawler({

313

requestHandler: async ({ request, response, session }) => {

314

if (response?.statusCode === 404) {

315

// Don't retry 404s

316

throw new NonRetryableError(`Page not found: ${request.url}`);

317

}

318

319

if (response?.statusCode === 403) {

320

// Rotate session on access denied

321

throw new SessionError(`Access denied, rotating session: ${request.url}`);

322

}

323

324

// Process the request normally

325

// ...

326

},

327

});

328

```

329

330

### Router

331

332

URL pattern-based request routing system for handling different types of pages.

333

334

```typescript { .api }

335

/**

336

* Routes requests to different handlers based on URL patterns and labels

337

* @template Context - The crawler context type

338

* @template UserData - The request user data type

339

*/

340

class Router<Context = BasicCrawlingContext, UserData = Dictionary> {

341

constructor();

342

343

/** Add a handler for requests matching the pattern */

344

addHandler<Data extends UserData = UserData>(

345

pattern: string | RegExp | RouteHandler<Context, Data>,

346

handler?: RouteHandler<Context, Data>

347

): void;

348

349

/** Add a default handler for requests that don't match any pattern */

350

addDefaultHandler<Data extends UserData = UserData>(

351

handler: RouteHandler<Context, Data>

352

): void;

353

354

/** Find and return the handler for a given request */

355

findMatchingHandler(request: Request): RouteHandler<Context, UserData> | null;

356

357

/** Create a request handler function to use with crawlers */

358

createRequestHandler(): (context: Context) => Promise<void>;

359

}

360

```

361

362

**Usage Examples:**

363

364

```typescript

365

import { CheerioCrawler, Router } from "crawlee";

366

367

const router = new Router<CheerioCrawlingContext>();

368

369

// Handle product pages

370

router.addHandler('PRODUCT', async ({ $, request, enqueueLinks }) => {

371

const title = $('h1').text();

372

const price = $('.price').text();

373

374

await Dataset.pushData({ title, price, url: request.url });

375

});

376

377

// Handle category pages

378

router.addHandler(/\/category\/.*/, async ({ $, enqueueLinks }) => {

379

await enqueueLinks({

380

selector: '.product-link',

381

label: 'PRODUCT',

382

});

383

});

384

385

// Handle any unmatched pages

386

router.addDefaultHandler(async ({ request, log }) => {

387

log.warn(`No handler for ${request.url}`);

388

});

389

390

const crawler = new CheerioCrawler({

391

requestHandler: router.createRequestHandler(),

392

});

393

```

394

395

### System Status and Monitoring

396

397

Classes for monitoring system resources and crawler performance.

398

399

```typescript { .api }

400

/**

401

* Provides current and historical system status information

402

*/

403

class SystemStatus {

404

constructor(options?: SystemStatusOptions);

405

406

/** Get current system status snapshot */

407

getCurrentStatus(): SystemInfo;

408

409

/** Get historical status data */

410

getHistoricalStatus(): SystemInfo[];

411

412

/** Start monitoring system status */

413

startCapturing(intervalMillis?: number): void;

414

415

/** Stop monitoring system status */

416

stopCapturing(): void;

417

}

418

419

/**

420

* Takes snapshots of system resources for autoscaling decisions

421

*/

422

class Snapshotter {

423

constructor(options?: SnapshotterOptions);

424

425

/** Take a snapshot of current system resources */

426

start(): void;

427

428

/** Stop taking snapshots */

429

stop(): void;

430

431

/** Get CPU usage ratio (0-1) */

432

getCpuRatio(): number;

433

434

/** Get memory usage ratio (0-1) */

435

getMemoryRatio(): number;

436

437

/** Check if system is overloaded */

438

isOverloaded(): boolean;

439

}

440

441

interface SystemInfo {

442

/** CPU usage as a ratio (0-1) */

443

cpuUsage: number;

444

445

/** Memory usage in bytes */

446

memoryUsage: number;

447

448

/** Available memory in bytes */

449

memoryAvailable: number;

450

451

/** Timestamp of the measurement */

452

createdAt: Date;

453

454

/** Whether the system is considered overloaded */

455

isOverloaded: boolean;

456

}

457

```

458

459

### Event Handlers

460

461

Event handling system for crawler lifecycle management.

462

463

```typescript { .api }

464

interface CrawlerAddons<Context> {

465

/** Called when the crawler starts */

466

crawlerStarting?: (crawler: BasicCrawler) => Promise<void>;

467

468

/** Called when the crawler finishes */

469

crawlerFinishing?: (crawler: BasicCrawler) => Promise<void>;

470

471

/** Called when a request starts processing */

472

requestStarting?: (context: Context) => Promise<void>;

473

474

/** Called when a request finishes successfully */

475

requestFinished?: (context: Context) => Promise<void>;

476

477

/** Called when a request fails */

478

requestFailed?: (context: Context, error: Error) => Promise<void>;

479

480

/** Called when a session is rotated */

481

sessionRotating?: (context: Context) => Promise<void>;

482

}

483

484

/**

485

* Handler function type for routing

486

*/

487

type RouteHandler<Context, UserData = Dictionary> = (

488

context: Context,

489

request: Request<UserData>

490

) => Promise<void>;

491

```

492

493

**Usage Examples:**

494

495

```typescript

496

import { BasicCrawler } from "crawlee";

497

498

const crawler = new BasicCrawler({

499

requestHandler: async ({ request }) => {

500

// Main processing logic

501

},

502

});

503

504

// Register event handlers

505

crawler.use({

506

crawlerStarting: async (crawler) => {

507

console.log('Crawler is starting...');

508

},

509

510

crawlerFinishing: async (crawler) => {

511

console.log('Crawler finished!');

512

console.log(`Final stats:`, crawler.finalStatistics);

513

},

514

515

requestFailed: async (context, error) => {

516

console.error(`Request failed: ${context.request.url}`, error);

517

},

518

});

519

```

520

521

## Types

522

523

```typescript { .api }

524

interface Statistics {

525

/** Number of requests that finished successfully */

526

requestsFinished: number;

527

528

/** Number of requests that failed permanently */

529

requestsFailed: number;

530

531

/** Total number of request retries */

532

requestsRetries: number;

533

534

/** Average requests per minute (finished) */

535

requestsFinishedPerMinute: number;

536

537

/** Average failed requests per minute */

538

requestsFailedPerMinute: number;

539

540

/** Minimum request duration in milliseconds */

541

requestMinDurationMillis: number;

542

543

/** Maximum request duration in milliseconds */

544

requestMaxDurationMillis: number;

545

546

/** Total duration of all requests in milliseconds */

547

requestTotalDurationMillis: number;

548

549

/** When the crawler started */

550

crawlerStartedAt: Date;

551

552

/** When the crawler finished */

553

crawlerFinishedAt?: Date;

554

555

/** Unique identifier for these statistics */

556

statsId: string;

557

}

558

559

type FinalStatistics = Statistics & {

560

crawlerFinishedAt: Date;

561

}

562

563

interface RouteDefinition {

564

pattern: string | RegExp;

565

method: 'GET' | 'POST' | 'PUT' | 'DELETE' | '*';

566

handler: RouteHandler<any, any>;

567

}

568

569

interface SystemStatusOptions {

570

/** How often to capture system status (in milliseconds) */

571

intervalMillis?: number;

572

573

/** Maximum number of historical entries to keep */

574

maxEntries?: number;

575

}

576

577

interface SnapshotterOptions {

578

/** How often to take snapshots (in seconds) */

579

intervalSecs?: number;

580

581

/** Number of snapshots to average for CPU/memory calculations */

582

windowSize?: number;

583

}

584

```