or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

browser-crawling.md configuration-proxies.md core-crawling.md http-crawling.md index.md session-management.md storage.md utilities.md

docs/http-crawling.md

0

# HTTP Crawling

1

2

HTTP crawling provides server-side HTML parsing and scraping capabilities without the overhead of full browser automation. These crawlers use various DOM parsing libraries to extract data efficiently from web pages.

3

4

## Capabilities

5

6

### HttpCrawler

7

8

Base HTTP crawler that extends BasicCrawler with HTTP-specific functionality for making requests and handling responses.

9

10

```typescript { .api }

11

/**

12

* HTTP crawler for server-side request processing without browser automation

13

*/

14

class HttpCrawler<Context extends HttpCrawlingContext = HttpCrawlingContext> extends BasicCrawler<Context> {

15

constructor(options: HttpCrawlerOptions);

16

}

17

```

18

19

### HttpCrawlerOptions

20

21

Configuration options for the HttpCrawler.

22

23

```typescript { .api }

24

interface HttpCrawlerOptions extends BasicCrawlerOptions<HttpCrawlingContext> {

25

/** HTTP client options for making requests */

26

requestHandlerOptions?: Partial<OptionsInit>;

27

28

/** Additional HTTP status codes to treat as errors */

29

additionalHttpErrorStatusCodes?: number[];

30

31

/** Whether to ignore HTTP error status codes */

32

ignoreHttpErrorStatusCodes?: boolean;

33

34

/** Pre-navigation hooks to modify requests before sending */

35

preNavigationHooks?: Array<(crawlingContext: HttpCrawlingContext, requestAsBrowserOptions: OptionsInit) => Promise<void>>;

36

37

/** Post-navigation hooks to process responses after receiving */

38

postNavigationHooks?: Array<(crawlingContext: HttpCrawlingContext, response: Response) => Promise<void>>;

39

40

/** HTTP client configuration */

41

httpClient?: BaseHttpClient;

42

43

/** Whether to persist cookies between requests */

44

persistCookiesPerSession?: boolean;

45

46

/** Custom User-Agent string */

47

userAgent?: string;

48

49

/** Custom request transformation function */

50

requestTransform?: (options: OptionsInit) => Promise<OptionsInit>;

51

52

/** Custom response transformation function */

53

responseTransform?: (response: Response) => Promise<Response>;

54

}

55

```

56

57

### HttpCrawlingContext

58

59

The context object passed to HTTP crawler request handlers.

60

61

```typescript { .api }

62

interface HttpCrawlingContext<UserData = Dictionary> extends BasicCrawlingContext<UserData> {

63

/** The HTTP response object */

64

response: Response;

65

66

/** Response body as text */

67

body: string;

68

69

/** Response headers */

70

headers: Dictionary<string>;

71

72

/** Content type of the response */

73

contentType: string;

74

75

/** Send HTTP request with custom options */

76

sendRequest<T = any>(overrideOptions?: Partial<OptionsInit>): Promise<T>;

77

}

78

```

79

80

**Usage Examples:**

81

82

```typescript

83

import { HttpCrawler } from "crawlee";

84

85

const crawler = new HttpCrawler({

86

requestHandler: async ({ request, response, body }) => {

87

console.log(`Status: ${response.statusCode} for ${request.url}`);

88

console.log(`Body length: ${body.length}`);

89

90

// Parse HTML manually or use simple text processing

91

const titleMatch = body.match(/<title>(.*?)<\/title>/i);

92

const title = titleMatch ? titleMatch[1] : 'No title';

93

94

await crawler.pushData({

95

url: request.url,

96

title,

97

statusCode: response.statusCode,

98

});

99

},

100

additionalHttpErrorStatusCodes: [429], // Treat 429 as error

101

userAgent: 'MyCustomCrawler/1.0',

102

});

103

```

104

105

### CheerioCrawler

106

107

Server-side HTML parsing crawler using the Cheerio library for jQuery-like DOM manipulation.

108

109

```typescript { .api }

110

/**

111

* Cheerio-based crawler for server-side HTML parsing with jQuery-like syntax

112

*/

113

class CheerioCrawler extends HttpCrawler {

114

constructor(options: CheerioCrawlerOptions);

115

}

116

```

117

118

### CheerioCrawlerOptions

119

120

Configuration options for the CheerioCrawler.

121

122

```typescript { .api }

123

interface CheerioCrawlerOptions extends HttpCrawlerOptions {

124

/** Handler function that receives Cheerio context */

125

requestHandler: (context: CheerioCrawlingContext) => Promise<void>;

126

127

/** Cheerio parsing options */

128

cheerioParseOptions?: CheerioParseOptions;

129

130

/** Force a specific encoding for response bodies */

131

forceResponseEncoding?: string;

132

133

/** Low-level HTML parser options */

134

parserOptions?: {

135

xmlMode?: boolean;

136

decodeEntities?: boolean;

137

lowerCaseAttributeNames?: boolean;

138

};

139

}

140

```

141

142

### CheerioCrawlingContext

143

144

The context object passed to Cheerio crawler request handlers.

145

146

```typescript { .api }

147

interface CheerioCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {

148

/** Cheerio root object for DOM manipulation */

149

$: CheerioRoot;

150

151

/** Get text content from the current page */

152

body: string;

153

154

/** Parse additional HTML with Cheerio */

155

parseWithCheerio(html: string): CheerioRoot;

156

157

/** Enqueue links found on the page */

158

enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;

159

}

160

```

161

162

**Usage Examples:**

163

164

```typescript

165

import { CheerioCrawler, Dataset } from "crawlee";

166

167

const crawler = new CheerioCrawler({

168

requestHandler: async ({ $, request, enqueueLinks, pushData }) => {

169

// Extract data using jQuery-like syntax

170

const title = $('title').text();

171

const description = $('meta[name="description"]').attr('content');

172

173

// Extract all product information

174

const products = [];

175

$('.product').each((index, element) => {

176

const product = $(element);

177

products.push({

178

name: product.find('.product-name').text().trim(),

179

price: product.find('.price').text().trim(),

180

image: product.find('img').attr('src'),

181

});

182

});

183

184

// Save extracted data

185

await pushData({

186

url: request.loadedUrl,

187

title,

188

description,

189

products,

190

extractedAt: new Date(),

191

});

192

193

// Find and enqueue pagination links

194

await enqueueLinks({

195

selector: 'a.page-link',

196

label: 'LIST',

197

});

198

199

// Find and enqueue product detail links

200

await enqueueLinks({

201

selector: '.product a',

202

label: 'DETAIL',

203

});

204

},

205

206

// Handle product detail pages

207

router.addHandler('DETAIL', async ({ $, request, pushData }) => {

208

const productDetails = {

209

url: request.loadedUrl,

210

name: $('.product-title').text(),

211

fullDescription: $('.description').text(),

212

specifications: {},

213

reviews: [],

214

};

215

216

// Extract specifications

217

$('.spec-row').each((_, element) => {

218

const key = $(element).find('.spec-name').text().trim();

219

const value = $(element).find('.spec-value').text().trim();

220

productDetails.specifications[key] = value;

221

});

222

223

// Extract reviews

224

$('.review').each((_, element) => {

225

productDetails.reviews.push({

226

rating: $(element).find('.rating').attr('data-rating'),

227

text: $(element).find('.review-text').text().trim(),

228

author: $(element).find('.reviewer-name').text().trim(),

229

});

230

});

231

232

await pushData(productDetails);

233

}),

234

235

maxConcurrency: 5,

236

maxRequestRetries: 3,

237

});

238

```

239

240

### JSDOMCrawler

241

242

Server-side DOM manipulation crawler using JSDOM for full DOM API support.

243

244

```typescript { .api }

245

/**

246

* JSDOM-based crawler for server-side DOM manipulation with full DOM API

247

*/

248

class JSDOMCrawler extends HttpCrawler {

249

constructor(options: JSDOMCrawlerOptions);

250

}

251

```

252

253

### JSDOMCrawlerOptions

254

255

Configuration options for the JSDOMCrawler.

256

257

```typescript { .api }

258

interface JSDOMCrawlerOptions extends HttpCrawlerOptions {

259

/** Handler function that receives JSDOM context */

260

requestHandler: (context: JSDOMCrawlingContext) => Promise<void>;

261

262

/** JSDOM constructor options */

263

jsdomOptions?: ConstructorOptions;

264

265

/** Whether to run scripts in JSDOM */

266

runScripts?: 'dangerously' | 'outside-only';

267

268

/** Custom resource loader for JSDOM */

269

resourceLoader?: ResourceLoader;

270

271

/** Virtual console options */

272

virtualConsole?: VirtualConsole;

273

}

274

```

275

276

### JSDOMCrawlingContext

277

278

The context object passed to JSDOM crawler request handlers.

279

280

```typescript { .api }

281

interface JSDOMCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {

282

/** The JSDOM window object */

283

window: DOMWindow;

284

285

/** The document object */

286

document: Document;

287

288

/** Shortcut to document.querySelector */

289

$(selector: string): Element | null;

290

291

/** Shortcut to document.querySelectorAll */

292

$$(selector: string): NodeListOf<Element>;

293

294

/** Enqueue links found on the page */

295

enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;

296

}

297

```

298

299

**Usage Examples:**

300

301

```typescript

302

import { JSDOMCrawler } from "crawlee";

303

304

const crawler = new JSDOMCrawler({

305

requestHandler: async ({ window, document, $, $$, request, pushData, enqueueLinks }) => {

306

// Use full DOM API

307

const title = document.title;

308

const metaTags = document.getElementsByTagName('meta');

309

310

// Use convenience selectors

311

const mainContent = $('.main-content');

312

const allLinks = $$('a[href]');

313

314

// Execute JavaScript-like operations

315

const productList = Array.from($$('.product')).map(element => ({

316

name: element.querySelector('.name')?.textContent?.trim(),

317

price: element.querySelector('.price')?.textContent?.trim(),

318

inStock: element.classList.contains('in-stock'),

319

}));

320

321

// Access computed styles if needed

322

const computedStyle = window.getComputedStyle(mainContent);

323

324

await pushData({

325

url: request.loadedUrl,

326

title,

327

productCount: productList.length,

328

products: productList,

329

hasMainContent: !!mainContent,

330

});

331

332

// Enqueue links

333

await enqueueLinks({

334

selector: 'a[href*="/category/"]',

335

label: 'CATEGORY',

336

});

337

},

338

339

jsdomOptions: {

340

runScripts: 'dangerously', // Enable JavaScript execution

341

resources: 'usable', // Load external resources

342

},

343

});

344

```

345

346

### LinkedOMCrawler

347

348

Fast server-side DOM manipulation crawler using LinkedOM for performance-optimized parsing.

349

350

```typescript { .api }

351

/**

352

* LinkedOM-based crawler for fast server-side DOM manipulation

353

*/

354

class LinkedOMCrawler extends HttpCrawler {

355

constructor(options: LinkedOMCrawlerOptions);

356

}

357

```

358

359

### LinkedOMCrawlerOptions

360

361

Configuration options for the LinkedOMCrawler.

362

363

```typescript { .api }

364

interface LinkedOMCrawlerOptions extends HttpCrawlerOptions {

365

/** Handler function that receives LinkedOM context */

366

requestHandler: (context: LinkedOMCrawlingContext) => Promise<void>;

367

368

/** LinkedOM parsing options */

369

linkedomOptions?: {

370

/** Include comment nodes in parsing */

371

includeComments?: boolean;

372

/** Include text nodes in parsing */

373

includeTextNodes?: boolean;

374

};

375

}

376

```

377

378

### LinkedOMCrawlingContext

379

380

The context object passed to LinkedOM crawler request handlers.

381

382

```typescript { .api }

383

interface LinkedOMCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {

384

/** The LinkedOM window object */

385

window: Window;

386

387

/** The document object */

388

document: Document;

389

390

/** Shortcut to document.querySelector */

391

$(selector: string): Element | null;

392

393

/** Shortcut to document.querySelectorAll */

394

$$(selector: string): NodeListOf<Element>;

395

396

/** Enqueue links found on the page */

397

enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;

398

}

399

```

400

401

**Usage Examples:**

402

403

```typescript

404

import { LinkedOMCrawler } from "crawlee";

405

406

const crawler = new LinkedOMCrawler({

407

requestHandler: async ({ window, document, $, $$, request, pushData }) => {

408

// LinkedOM provides fast DOM manipulation

409

const title = document.title;

410

const description = $('meta[name="description"]')?.getAttribute('content');

411

412

// Fast element selection and text extraction

413

const headlines = Array.from($$('h1, h2, h3')).map(el => ({

414

tag: el.tagName.toLowerCase(),

415

text: el.textContent?.trim(),

416

level: parseInt(el.tagName.slice(1)),

417

}));

418

419

// Fast table parsing

420

const tableData = [];

421

$$('table tr').forEach(row => {

422

const cells = Array.from(row.querySelectorAll('td, th')).map(cell =>

423

cell.textContent?.trim()

424

);

425

if (cells.length > 0) {

426

tableData.push(cells);

427

}

428

});

429

430

await pushData({

431

url: request.loadedUrl,

432

title,

433

description,

434

headlines,

435

tableData,

436

});

437

},

438

439

maxConcurrency: 20, // LinkedOM is fast, can handle higher concurrency

440

});

441

```

442

443

### File Download Crawler

444

445

Specialized crawler for efficient file downloading using HTTP streams.

446

447

```typescript { .api }

448

/**

449

* Specialized crawler for downloading files efficiently

450

*/

451

class FileDownload extends HttpCrawler<FileDownloadCrawlingContext> {

452

constructor(options: FileDownloadOptions);

453

}

454

455

/**

456

* Create a router for file download handling

457

*/

458

function createFileRouter<Context extends FileDownloadCrawlingContext>(): Router<Context>;

459

460

/**

461

* Transform stream that monitors download speed and aborts if too slow

462

*/

463

function MinimumSpeedStream(options: MinimumSpeedStreamOptions): Transform;

464

465

/**

466

* Transform stream that logs download progress

467

*/

468

function ByteCounterStream(options: ByteCounterStreamOptions): Transform;

469

470

interface FileDownloadOptions<UserData = any, JSONData = any> {

471

/** Request handler for processing downloaded files */

472

requestHandler?: FileDownloadRequestHandler<UserData, JSONData>;

473

474

/** Stream handler for processing download streams */

475

streamHandler?: StreamHandler;

476

477

/** All standard HttpCrawlerOptions are supported */

478

requestList?: RequestList;

479

requestQueue?: RequestQueue;

480

maxRequestRetries?: number;

481

maxRequestsPerCrawl?: number;

482

maxConcurrency?: number;

483

navigationTimeoutSecs?: number;

484

}

485

486

interface FileDownloadCrawlingContext<UserData = any, JSONData = any>

487

extends HttpCrawlingContext<UserData> {

488

/** The download stream (when using streamHandler) */

489

stream?: Readable;

490

}

491

492

interface MinimumSpeedStreamOptions {

493

/** Minimum speed in KB/s */

494

minSpeedKbps: number;

495

496

/** Time window for speed calculation in ms (default: 10000) */

497

historyLengthMs?: number;

498

499

/** How often to check speed in ms (default: 5000) */

500

checkProgressInterval?: number;

501

}

502

503

interface ByteCounterStreamOptions {

504

/** Function to call with bytes transferred */

505

logTransferredBytes: (bytes: number) => void;

506

507

/** How often to log progress in ms (default: 5000) */

508

loggingInterval?: number;

509

}

510

```

511

512

**Usage Examples:**

513

514

```typescript

515

import { FileDownload, createFileRouter, MinimumSpeedStream, ByteCounterStream } from "crawlee";
import { writeFileSync, createWriteStream } from "node:fs";
import path from "node:path";
import { finished } from "node:stream/promises";

516

517

// Basic file download with requestHandler

518

const fileDownloader = new FileDownload({

519

requestHandler: async ({ body, request, pushData }) => {

520

// Save file to disk

521

const fileName = request.url.replace(/[^a-z0-9\.]/gi, '_');

522

writeFileSync(`./downloads/${fileName}`, body);

523

524

await pushData({

525

url: request.url,

526

fileName,

527

size: body.length,

528

downloadedAt: new Date(),

529

});

530

},

531

});

532

533

// Run with list of file URLs

534

await fileDownloader.run([

535

'http://www.example.com/document.pdf',

536

'http://www.example.com/image.jpg',

537

'http://www.example.com/video.mp4',

538

]);

539

540

// Advanced streaming with progress monitoring

541

const streamDownloader = new FileDownload({

542

streamHandler: async ({ stream, request, log }) => {

543

const filePath = `./downloads/${path.basename(request.url)}`;

544

const fileStream = createWriteStream(filePath);

545

546

// Add progress monitoring

547

const progressStream = ByteCounterStream({

548

logTransferredBytes: (bytes) => {

549

log.info(`Downloaded ${(bytes / 1024 / 1024).toFixed(2)} MB`);

550

},

551

loggingInterval: 2000,

552

});

553

554

// Add speed monitoring

555

const speedStream = MinimumSpeedStream({

556

minSpeedKbps: 100, // Minimum 100 KB/s

557

historyLengthMs: 10000,

558

checkProgressInterval: 3000,

559

});

560

561

// Pipe stream through monitors to file

562

stream

563

.pipe(progressStream)

564

.pipe(speedStream)

565

.pipe(fileStream);

566

567

// Wait for completion

568

await finished(fileStream);

569

log.info(`File saved: ${filePath}`);

570

},

571

});

572

573

// Using router for different file types

574

const router = createFileRouter();

575

576

router.addHandler('PDF', async ({ body, request, pushData }) => {

577

// Handle PDF files

578

const fileName = `pdf_${Date.now()}.pdf`;

579

writeFileSync(`./pdfs/${fileName}`, body);

580

await pushData({ type: 'pdf', fileName, url: request.url });

581

});

582

583

router.addHandler('IMAGE', async ({ body, request, pushData }) => {

584

// Handle image files

585

const fileName = `img_${Date.now()}.jpg`;

586

writeFileSync(`./images/${fileName}`, body);

587

await pushData({ type: 'image', fileName, url: request.url });

588

});

589

590

router.addDefaultHandler(async ({ body, request, pushData }) => {

591

// Handle other file types

592

const fileName = `file_${Date.now()}`;

593

writeFileSync(`./files/${fileName}`, body);

594

await pushData({ type: 'other', fileName, url: request.url });

595

});

596

597

const routerDownloader = new FileDownload({

598

requestHandler: router,

599

});

600

601

// Add requests with labels for routing

602

await routerDownloader.addRequests([

603

{ url: 'http://example.com/doc.pdf', label: 'PDF' },

604

{ url: 'http://example.com/photo.jpg', label: 'IMAGE' },

605

{ url: 'http://example.com/data.csv', label: 'OTHER' },

606

]);

607

```

608

609

## Types

610

611

```typescript { .api }

612

interface Response {

613

/** HTTP status code */

614

statusCode: number;

615

616

/** HTTP status message */

617

statusMessage: string;

618

619

/** Response headers */

620

headers: Dictionary<string | string[]>;

621

622

/** Response body as string */

623

body: string;

624

625

/** Response body as buffer */

626

rawBody: Buffer;

627

628

/** Whether the request was redirected */

629

isRedirect: boolean;

630

631

/** Final URL after redirects */

632

url: string;

633

634

/** Request timing information */

635

timings: {

636

start: number;

637

socket: number;

638

lookup: number;

639

connect: number;

640

secureConnect: number;

641

upload: number;

642

response: number;

643

end: number;

644

};

645

}

646

647

interface OptionsInit {

648

/** HTTP method */

649

method?: HttpMethod;

650

651

/** Request headers */

652

headers?: Dictionary<string>;

653

654

/** Request body */

655

body?: string | Buffer;

656

657

/** Request timeout in milliseconds */

658

timeout?: number;

659

660

/** Whether to follow redirects */

661

followRedirect?: boolean;

662

663

/** Maximum number of redirects to follow */

664

maxRedirects?: number;

665

666

/** Proxy URL */

667

proxy?: string;

668

669

/** User agent string */

670

userAgent?: string;

671

672

/** Whether to validate SSL certificates */

673

rejectUnauthorized?: boolean;

674

}

675

676

interface CheerioParseOptions {

677

/** Whether to parse as XML */

678

xmlMode?: boolean;

679

680

/** Whether to decode HTML entities */

681

decodeEntities?: boolean;

682

683

/** Whether to lowercase attribute names */

684

lowerCaseAttributeNames?: boolean;

685

686

/** Whether to recognize CDATA sections */

687

recognizeCDATA?: boolean;

688

689

/** Whether to recognize self-closing tags */

690

recognizeSelfClosing?: boolean;

691

}

692

693

interface CrawlerEnqueueLinksOptions {

694

/** CSS selector for finding links */

695

selector?: string;

696

697

/** Base URL for resolving relative links */

698

baseUrl?: string;

699

700

/** URLs to exclude from enqueueing */

701

exclude?: (string | RegExp)[];

702

703

/** Glob patterns for URLs to include */

704

globs?: string[];

705

706

/** Pseudo-URLs for matching links */

707

pseudoUrls?: string[];

708

709

/** Label to assign to enqueued requests */

710

label?: string;

711

712

/** Additional data to attach to requests */

713

userData?: Dictionary;

714

715

/** Function to transform each request before it is enqueued */

716

transformRequestFunction?: (request: RequestOptions) => RequestOptions;

717

718

/** Request queue to add requests to */

719

requestQueue?: RequestQueue;

720

721

/** Maximum number of links to enqueue */

722

limit?: number;

723

}

724

725

type HttpMethod = 'GET' | 'POST' | 'PUT' | 'DELETE' | 'HEAD' | 'OPTIONS' | 'PATCH';

726

type CheerioRoot = ReturnType<typeof cheerio.load>;

727

```