0
# HTTP Crawling
1
2
HTTP crawling provides server-side HTML parsing and scraping capabilities without the overhead of full browser automation. These crawlers use various DOM parsing libraries to extract data efficiently from web pages.
3
4
## Capabilities
5
6
### HttpCrawler
7
8
Base HTTP crawler that extends BasicCrawler with HTTP-specific functionality for making requests and handling responses.
9
10
```typescript { .api }
11
/**
12
* HTTP crawler for server-side request processing without browser automation
13
*/
14
class HttpCrawler extends BasicCrawler<HttpCrawlingContext> {
15
constructor(options: HttpCrawlerOptions);
16
}
17
```
18
19
### HttpCrawlerOptions
20
21
Configuration options for the HttpCrawler.
22
23
```typescript { .api }
24
interface HttpCrawlerOptions extends BasicCrawlerOptions<HttpCrawlingContext> {
25
/** HTTP client options for making requests */
26
requestHandlerOptions?: Partial<OptionsInit>;
27
28
/** Additional HTTP status codes to treat as errors (beyond the defaults) */
29
additionalHttpErrorStatusCodes?: number[];
30
31
/** Whether to ignore HTTP error status codes */
32
ignoreHttpErrorStatusCodes?: boolean;
33
34
/** Pre-navigation hooks to modify requests before sending */
35
preNavigationHooks?: Array<(crawlingContext: HttpCrawlingContext, requestAsBrowserOptions: OptionsInit) => Promise<void>>;
36
37
/** Post-navigation hooks to process responses after receiving */
38
postNavigationHooks?: Array<(crawlingContext: HttpCrawlingContext, response: Response) => Promise<void>>;
39
40
/** HTTP client configuration */
41
httpClient?: BaseHttpClient;
42
43
/** Whether to persist cookies between requests */
44
persistCookiesPerSession?: boolean;
45
46
/** Custom User-Agent string */
47
userAgent?: string;
48
49
/** Custom request transformation function */
50
requestTransform?: (options: OptionsInit) => Promise<OptionsInit>;
51
52
/** Custom response transformation function */
53
responseTransform?: (response: Response) => Promise<Response>;
54
}
55
```
56
57
### HttpCrawlingContext
58
59
The context object passed to HTTP crawler request handlers.
60
61
```typescript { .api }
62
interface HttpCrawlingContext<UserData = Dictionary> extends BasicCrawlingContext<UserData> {
63
/** The HTTP response object */
64
response: Response;
65
66
/** Response body as text */
67
body: string;
68
69
/** Response headers */
70
headers: Dictionary<string>;
71
72
/** Content type of the response */
73
contentType: string;
74
75
/** Send HTTP request with custom options */
76
sendRequest<T = any>(overrideOptions?: Partial<OptionsInit>): Promise<T>;
77
}
78
```
79
80
**Usage Examples:**
81
82
```typescript
83
import { HttpCrawler } from "crawlee";
84
85
const crawler = new HttpCrawler({
86
requestHandler: async ({ request, response, body }) => {
87
console.log(`Status: ${response.statusCode} for ${request.url}`);
88
console.log(`Body length: ${body.length}`);
89
90
// Parse HTML manually or use simple text processing
91
const titleMatch = body.match(/<title>(.*?)<\/title>/i);
92
const title = titleMatch ? titleMatch[1] : 'No title';
93
94
await crawler.pushData({
95
url: request.url,
96
title,
97
statusCode: response.statusCode,
98
});
99
},
100
additionalHttpErrorStatusCodes: [429], // Treat 429 as error
101
userAgent: 'MyCustomCrawler/1.0',
102
});
103
```
104
105
### CheerioCrawler
106
107
Server-side HTML parsing crawler using the Cheerio library for jQuery-like DOM manipulation.
108
109
```typescript { .api }
110
/**
111
* Cheerio-based crawler for server-side HTML parsing with jQuery-like syntax
112
*/
113
class CheerioCrawler extends HttpCrawler {
114
constructor(options: CheerioCrawlerOptions);
115
}
116
```
117
118
### CheerioCrawlerOptions
119
120
Configuration options for the CheerioCrawler.
121
122
```typescript { .api }
123
interface CheerioCrawlerOptions extends HttpCrawlerOptions {
124
/** Handler function that receives Cheerio context */
125
requestHandler: (context: CheerioCrawlingContext) => Promise<void>;
126
127
/** Cheerio parsing options */
128
cheerioParseOptions?: CheerioParseOptions;
129
130
/** Force a specific character encoding for the response body */
131
forceResponseEncoding?: string;
132
133
/** Low-level HTML parser options passed to Cheerio */
134
parserOptions?: {
135
xmlMode?: boolean;
136
decodeEntities?: boolean;
137
lowerCaseAttributeNames?: boolean;
138
};
139
}
140
```
141
142
### CheerioCrawlingContext
143
144
The context object passed to Cheerio crawler request handlers.
145
146
```typescript { .api }
147
interface CheerioCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {
148
/** Cheerio root object for DOM manipulation */
149
$: CheerioRoot;
150
151
/** Get text content from the current page */
152
body: string;
153
154
/** Parse additional HTML with Cheerio */
155
parseWithCheerio(html: string): CheerioRoot;
156
157
/** Enqueue links found on the page */
158
enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
159
}
160
```
161
162
**Usage Examples:**
163
164
```typescript
165
import { CheerioCrawler, createCheerioRouter } from "crawlee";
166
167
const crawler = new CheerioCrawler({
168
requestHandler: async ({ $, request, enqueueLinks, pushData }) => {
169
// Extract data using jQuery-like syntax
170
const title = $('title').text();
171
const description = $('meta[name="description"]').attr('content');
172
173
// Extract all product information
174
const products = [];
175
$('.product').each((index, element) => {
176
const product = $(element);
177
products.push({
178
name: product.find('.product-name').text().trim(),
179
price: product.find('.price').text().trim(),
180
image: product.find('img').attr('src'),
181
});
182
});
183
184
// Save extracted data
185
await pushData({
186
url: request.loadedUrl,
187
title,
188
description,
189
products,
190
extractedAt: new Date(),
191
});
192
193
// Find and enqueue pagination links
194
await enqueueLinks({
195
selector: 'a.page-link',
196
label: 'LIST',
197
});
198
199
// Find and enqueue product detail links
200
await enqueueLinks({
201
selector: '.product a',
202
label: 'DETAIL',
203
});
204
  },

  maxConcurrency: 5,
  maxRequestRetries: 3,
});

// Handle product detail pages with a separate router
// (created via createCheerioRouter from 'crawlee'; a router can itself be
// passed as a crawler's requestHandler to dispatch requests by label).
const router = createCheerioRouter();

router.addHandler('DETAIL', async ({ $, request, pushData }) => {
  const productDetails = {
    url: request.loadedUrl,
    name: $('.product-title').text(),
    fullDescription: $('.description').text(),
    specifications: {},
    reviews: [],
  };

  // Extract specifications
  $('.spec-row').each((_, element) => {
    const key = $(element).find('.spec-name').text().trim();
    const value = $(element).find('.spec-value').text().trim();
    productDetails.specifications[key] = value;
  });

  // Extract reviews
  $('.review').each((_, element) => {
    productDetails.reviews.push({
      rating: $(element).find('.rating').attr('data-rating'),
      text: $(element).find('.review-text').text().trim(),
      author: $(element).find('.reviewer-name').text().trim(),
    });
  });

  await pushData(productDetails);
});
238
```
239
240
### JSDOMCrawler
241
242
Server-side DOM manipulation crawler using JSDOM for full DOM API support.
243
244
```typescript { .api }
245
/**
246
* JSDOM-based crawler for server-side DOM manipulation with full DOM API
247
*/
248
class JSDOMCrawler extends HttpCrawler {
249
constructor(options: JSDOMCrawlerOptions);
250
}
251
```
252
253
### JSDOMCrawlerOptions
254
255
Configuration options for the JSDOMCrawler.
256
257
```typescript { .api }
258
interface JSDOMCrawlerOptions extends HttpCrawlerOptions {
259
/** Handler function that receives JSDOM context */
260
requestHandler: (context: JSDOMCrawlingContext) => Promise<void>;
261
262
/** JSDOM constructor options */
263
jsdomOptions?: ConstructorOptions;
264
265
/** Whether to run scripts in JSDOM */
266
runScripts?: 'dangerously' | 'outside-only';
267
268
/** Custom resource loader for JSDOM */
269
resourceLoader?: ResourceLoader;
270
271
/** Virtual console options */
272
virtualConsole?: VirtualConsole;
273
}
274
```
275
276
### JSDOMCrawlingContext
277
278
The context object passed to JSDOM crawler request handlers.
279
280
```typescript { .api }
281
interface JSDOMCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {
282
/** The JSDOM window object */
283
window: DOMWindow;
284
285
/** The document object */
286
document: Document;
287
288
/** Shortcut to document.querySelector */
289
$(selector: string): Element | null;
290
291
/** Shortcut to document.querySelectorAll */
292
$$(selector: string): NodeListOf<Element>;
293
294
/** Enqueue links found on the page */
295
enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
296
}
297
```
298
299
**Usage Examples:**
300
301
```typescript
302
import { JSDOMCrawler } from "crawlee";
303
304
const crawler = new JSDOMCrawler({
305
requestHandler: async ({ window, document, $, $$, request, pushData, enqueueLinks }) => {
306
// Use full DOM API
307
const title = document.title;
308
const metaTags = document.getElementsByTagName('meta');
309
310
// Use convenience selectors
311
const mainContent = $('.main-content');
312
const allLinks = $$('a[href]');
313
314
// Execute JavaScript-like operations
315
const productList = Array.from($$('.product')).map(element => ({
316
name: element.querySelector('.name')?.textContent?.trim(),
317
price: element.querySelector('.price')?.textContent?.trim(),
318
inStock: element.classList.contains('in-stock'),
319
}));
320
321
// Access computed styles if needed
322
const computedStyle = window.getComputedStyle(mainContent);
323
324
await pushData({
325
url: request.loadedUrl,
326
title,
327
productCount: productList.length,
328
products: productList,
329
hasMainContent: !!mainContent,
330
});
331
332
// Enqueue links
333
await enqueueLinks({
334
selector: 'a[href*="/category/"]',
335
label: 'CATEGORY',
336
});
337
},
338
339
jsdomOptions: {
340
runScripts: 'dangerously', // Enable JavaScript execution
341
resources: 'usable', // Load external resources
342
},
343
});
344
```
345
346
### LinkedOMCrawler
347
348
Fast server-side DOM manipulation crawler using LinkedOM for performance-optimized parsing.
349
350
```typescript { .api }
351
/**
352
* LinkedOM-based crawler for fast server-side DOM manipulation
353
*/
354
class LinkedOMCrawler extends HttpCrawler {
355
constructor(options: LinkedOMCrawlerOptions);
356
}
357
```
358
359
### LinkedOMCrawlerOptions
360
361
Configuration options for the LinkedOMCrawler.
362
363
```typescript { .api }
364
interface LinkedOMCrawlerOptions extends HttpCrawlerOptions {
365
/** Handler function that receives LinkedOM context */
366
requestHandler: (context: LinkedOMCrawlingContext) => Promise<void>;
367
368
/** LinkedOM parsing options */
369
linkedomOptions?: {
370
/** Include comment nodes in parsing */
371
includeComments?: boolean;
372
/** Include text nodes in parsing */
373
includeTextNodes?: boolean;
374
};
375
}
376
```
377
378
### LinkedOMCrawlingContext
379
380
The context object passed to LinkedOM crawler request handlers.
381
382
```typescript { .api }
383
interface LinkedOMCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {
384
/** The LinkedOM window object */
385
window: Window;
386
387
/** The document object */
388
document: Document;
389
390
/** Shortcut to document.querySelector */
391
$(selector: string): Element | null;
392
393
/** Shortcut to document.querySelectorAll */
394
$$(selector: string): NodeListOf<Element>;
395
396
/** Enqueue links found on the page */
397
enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
398
}
399
```
400
401
**Usage Examples:**
402
403
```typescript
404
import { LinkedOMCrawler } from "crawlee";
405
406
const crawler = new LinkedOMCrawler({
407
requestHandler: async ({ window, document, $, $$, request, pushData }) => {
408
// LinkedOM provides fast DOM manipulation
409
const title = document.title;
410
const description = $('meta[name="description"]')?.getAttribute('content');
411
412
// Fast element selection and text extraction
413
const headlines = Array.from($$('h1, h2, h3')).map(el => ({
414
tag: el.tagName.toLowerCase(),
415
text: el.textContent?.trim(),
416
level: parseInt(el.tagName.slice(1)),
417
}));
418
419
// Fast table parsing
420
const tableData = [];
421
$$('table tr').forEach(row => {
422
const cells = Array.from(row.querySelectorAll('td, th')).map(cell =>
423
cell.textContent?.trim()
424
);
425
if (cells.length > 0) {
426
tableData.push(cells);
427
}
428
});
429
430
await pushData({
431
url: request.loadedUrl,
432
title,
433
description,
434
headlines,
435
tableData,
436
});
437
},
438
439
maxConcurrency: 20, // LinkedOM is fast, can handle higher concurrency
440
});
441
```
442
443
### File Download Crawler
444
445
Specialized crawler for efficient file downloading using HTTP streams.
446
447
```typescript { .api }
448
/**
449
* Specialized crawler for downloading files efficiently
450
*/
451
class FileDownload extends HttpCrawler<FileDownloadCrawlingContext> {
452
constructor(options: FileDownloadOptions);
453
}
454
455
/**
456
* Create a router for file download handling
457
*/
458
function createFileRouter<Context extends FileDownloadCrawlingContext>(): Router<Context>;
459
460
/**
461
* Transform stream that monitors download speed and aborts if too slow
462
*/
463
function MinimumSpeedStream(options: MinimumSpeedStreamOptions): Transform;
464
465
/**
466
* Transform stream that logs download progress
467
*/
468
function ByteCounterStream(options: ByteCounterStreamOptions): Transform;
469
470
interface FileDownloadOptions<UserData = any, JSONData = any> {
471
/** Request handler for processing downloaded files */
472
requestHandler?: FileDownloadRequestHandler<UserData, JSONData>;
473
474
/** Stream handler for processing download streams */
475
streamHandler?: StreamHandler;
476
477
/** All standard HttpCrawlerOptions are supported */
478
requestList?: RequestList;
479
requestQueue?: RequestQueue;
480
maxRequestRetries?: number;
481
maxRequestsPerCrawl?: number;
482
maxConcurrency?: number;
483
navigationTimeoutSecs?: number;
484
}
485
486
interface FileDownloadCrawlingContext<UserData = any, JSONData = any>
487
extends HttpCrawlingContext<UserData> {
488
/** The download stream (when using streamHandler) */
489
stream?: Readable;
490
}
491
492
interface MinimumSpeedStreamOptions {
493
/** Minimum speed in KB/s */
494
minSpeedKbps: number;
495
496
/** Time window for speed calculation in ms (default: 10000) */
497
historyLengthMs?: number;
498
499
/** How often to check speed in ms (default: 5000) */
500
checkProgressInterval?: number;
501
}
502
503
interface ByteCounterStreamOptions {
504
/** Function to call with bytes transferred */
505
logTransferredBytes: (bytes: number) => void;
506
507
/** How often to log progress in ms (default: 5000) */
508
loggingInterval?: number;
509
}
510
```
511
512
**Usage Examples:**
513
514
```typescript
515
import { FileDownload, createFileRouter } from "crawlee";
import { writeFileSync, createWriteStream } from "node:fs";
import path from "node:path";
import { finished } from "node:stream/promises";
516
517
// Basic file download with requestHandler
518
const fileDownloader = new FileDownload({
519
requestHandler: async ({ body, request, pushData }) => {
520
// Save file to disk
521
const fileName = request.url.replace(/[^a-z0-9\.]/gi, '_');
522
writeFileSync(`./downloads/${fileName}`, body);
523
524
await pushData({
525
url: request.url,
526
fileName,
527
size: body.length,
528
downloadedAt: new Date(),
529
});
530
},
531
});
532
533
// Run with list of file URLs
534
await fileDownloader.run([
535
'http://www.example.com/document.pdf',
536
'http://www.example.com/image.jpg',
537
'http://www.example.com/video.mp4',
538
]);
539
540
// Advanced streaming with progress monitoring
541
const streamDownloader = new FileDownload({
542
streamHandler: async ({ stream, request, log }) => {
543
const filePath = `./downloads/${path.basename(request.url)}`;
544
const fileStream = createWriteStream(filePath);
545
546
// Add progress monitoring
547
const progressStream = ByteCounterStream({
548
logTransferredBytes: (bytes) => {
549
log.info(`Downloaded ${(bytes / 1024 / 1024).toFixed(2)} MB`);
550
},
551
loggingInterval: 2000,
552
});
553
554
// Add speed monitoring
555
const speedStream = MinimumSpeedStream({
556
minSpeedKbps: 100, // Minimum 100 KB/s
557
historyLengthMs: 10000,
558
checkProgressInterval: 3000,
559
});
560
561
// Pipe stream through monitors to file
562
stream
563
.pipe(progressStream)
564
.pipe(speedStream)
565
.pipe(fileStream);
566
567
// Wait for completion
568
await finished(fileStream);
569
log.info(`File saved: ${filePath}`);
570
},
571
});
572
573
// Using router for different file types
574
const router = createFileRouter();
575
576
router.addHandler('PDF', async ({ body, request, pushData }) => {
577
// Handle PDF files
578
const fileName = `pdf_${Date.now()}.pdf`;
579
writeFileSync(`./pdfs/${fileName}`, body);
580
await pushData({ type: 'pdf', fileName, url: request.url });
581
});
582
583
router.addHandler('IMAGE', async ({ body, request, pushData }) => {
584
// Handle image files
585
const fileName = `img_${Date.now()}.jpg`;
586
writeFileSync(`./images/${fileName}`, body);
587
await pushData({ type: 'image', fileName, url: request.url });
588
});
589
590
router.addDefaultHandler(async ({ body, request, pushData }) => {
591
// Handle other file types
592
const fileName = `file_${Date.now()}`;
593
writeFileSync(`./files/${fileName}`, body);
594
await pushData({ type: 'other', fileName, url: request.url });
595
});
596
597
const routerDownloader = new FileDownload({
598
requestHandler: router,
599
});
600
601
// Add requests with labels for routing
602
await routerDownloader.addRequests([
603
{ url: 'http://example.com/doc.pdf', label: 'PDF' },
604
{ url: 'http://example.com/photo.jpg', label: 'IMAGE' },
605
{ url: 'http://example.com/data.csv', label: 'OTHER' },
606
]);
607
```
608
609
## Types
610
611
```typescript { .api }
612
interface Response {
613
/** HTTP status code */
614
statusCode: number;
615
616
/** HTTP status message */
617
statusMessage: string;
618
619
/** Response headers */
620
headers: Dictionary<string | string[]>;
621
622
/** Response body as string */
623
body: string;
624
625
/** Response body as buffer */
626
rawBody: Buffer;
627
628
/** Whether the request was redirected */
629
isRedirect: boolean;
630
631
/** Final URL after redirects */
632
url: string;
633
634
/** Request timing information */
635
timings: {
636
start: number;
637
socket: number;
638
lookup: number;
639
connect: number;
640
secureConnect: number;
641
upload: number;
642
response: number;
643
end: number;
644
};
645
}
646
647
interface OptionsInit {
648
/** HTTP method */
649
method?: HttpMethod;
650
651
/** Request headers */
652
headers?: Dictionary<string>;
653
654
/** Request body */
655
body?: string | Buffer;
656
657
/** Request timeout in milliseconds */
658
timeout?: number;
659
660
/** Whether to follow redirects */
661
followRedirect?: boolean;
662
663
/** Maximum number of redirects to follow */
664
maxRedirects?: number;
665
666
/** Proxy URL */
667
proxy?: string;
668
669
/** User agent string */
670
userAgent?: string;
671
672
/** Whether to validate SSL certificates */
673
rejectUnauthorized?: boolean;
674
}
675
676
interface CheerioParseOptions {
677
/** Whether to parse as XML */
678
xmlMode?: boolean;
679
680
/** Whether to decode HTML entities */
681
decodeEntities?: boolean;
682
683
/** Whether to lowercase attribute names */
684
lowerCaseAttributeNames?: boolean;
685
686
/** Whether to recognize CDATA sections */
687
recognizeCDATA?: boolean;
688
689
/** Whether to recognize self-closing tags */
690
recognizeSelfClosing?: boolean;
691
}
692
693
interface CrawlerEnqueueLinksOptions {
694
/** CSS selector for finding links */
695
selector?: string;
696
697
/** Base URL for resolving relative links */
698
baseUrl?: string;
699
700
/** URLs to exclude from enqueueing */
701
exclude?: (string | RegExp)[];
702
703
/** Glob patterns for URLs to include */
704
globs?: string[];
705
706
/** Pseudo-URLs for matching links */
707
pseudoUrls?: string[];
708
709
/** Label to assign to enqueued requests */
710
label?: string;
711
712
/** Additional data to attach to requests */
713
userData?: Dictionary;
714
715
/** Whether to transform relative URLs to absolute */
716
transformRequestFunction?: (request: RequestOptions) => RequestOptions;
717
718
/** Request queue to add requests to */
719
requestQueue?: RequestQueue;
720
721
/** Maximum number of links to enqueue */
722
limit?: number;
723
}
724
725
type HttpMethod = 'GET' | 'POST' | 'PUT' | 'DELETE' | 'HEAD' | 'OPTIONS' | 'PATCH';
726
type CheerioRoot = ReturnType<typeof cheerio.load>;
727
```