0
# Browser Crawling
1
2
Browser crawling provides full browser automation capabilities for handling JavaScript-heavy websites, dynamic content, and complex user interactions. This includes Puppeteer and Playwright integration with efficient browser pool management.
3
4
## Capabilities
5
6
### BrowserCrawler
7
8
Base browser crawler class that extends BasicCrawler with browser automation capabilities.
9
10
```typescript { .api }
11
/**
12
* Base browser crawler for browser automation with Puppeteer or Playwright
13
*/
14
class BrowserCrawler extends BasicCrawler<BrowserCrawlingContext> {
15
constructor(options: BrowserCrawlerOptions);
16
}
17
```
18
19
### BrowserCrawlerOptions
20
21
Configuration options for the BrowserCrawler.
22
23
```typescript { .api }
24
interface BrowserCrawlerOptions extends BasicCrawlerOptions<BrowserCrawlingContext> {
25
/** Browser launcher options */
26
launchContext?: LaunchContext;
27
28
/** Browser pool configuration */
29
browserPoolOptions?: BrowserPoolOptions;
30
31
/** Whether to block certain resource types for faster loading */
32
blockRequests?: boolean;
33
34
/** List of URL patterns to block */
35
blockedUrlPatterns?: string[];
36
37
/** Pre-navigation hooks to run before page navigation */
38
preNavigationHooks?: Array<(crawlingContext: BrowserCrawlingContext, gotoOptions: DirectNavigationOptions) => Promise<void>>;
39
40
/** Post-navigation hooks to run after page navigation */
41
postNavigationHooks?: Array<(crawlingContext: BrowserCrawlingContext) => Promise<void>>;
42
43
/** Custom page function to run on each page */
44
pageFunction?: (context: BrowserCrawlingContext) => Promise<void>;
45
46
/** Navigation timeout in seconds */
47
navigationTimeoutSecs?: number;
48
49
/** Whether to keep browser context alive between requests */
50
keepAlive?: boolean;
51
52
/** Handler function that receives the browser crawling context for each page */
53
requestHandler?: (context: BrowserCrawlingContext) => Promise<void>;
54
}
55
```
56
57
### BrowserCrawlingContext
58
59
The context object passed to browser crawler request handlers.
60
61
```typescript { .api }
62
interface BrowserCrawlingContext<UserData = Dictionary> extends BasicCrawlingContext<UserData> {
63
/** The browser page object */
64
page: Page;
65
66
/** Browser context */
67
browserContext: BrowserContext;
68
69
/** The response object from navigation */
70
response?: Response;
71
72
/** Enqueue links found on the page */
73
enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
74
75
/** Save a snapshot of the page (HTML + screenshot) */
76
saveSnapshot(options?: SaveSnapshotOptions): Promise<void>;
77
78
/** Scroll page to load infinite content */
79
infiniteScroll(options?: InfiniteScrollOptions): Promise<void>;
80
81
/** Wait for a selector to appear */
82
waitForSelector(selector: string, options?: WaitForSelectorOptions): Promise<ElementHandle | null>;
83
84
/** Click elements matching selector */
85
clickElements(selector: string, options?: ClickElementsOptions): Promise<void>;
86
}
87
```
88
89
### PuppeteerCrawler
90
91
Browser crawler using Puppeteer for Chrome/Chromium automation.
92
93
```typescript { .api }
94
/**
95
* Puppeteer-based browser crawler for Chrome/Chromium automation
96
*/
97
class PuppeteerCrawler extends BrowserCrawler {
98
constructor(options: PuppeteerCrawlerOptions);
99
}
100
```
101
102
### PuppeteerCrawlerOptions
103
104
Configuration options specific to PuppeteerCrawler.
105
106
```typescript { .api }
107
interface PuppeteerCrawlerOptions extends BrowserCrawlerOptions {
108
/** Handler function that receives Puppeteer context */
109
requestHandler: (context: PuppeteerCrawlingContext) => Promise<void>;
110
111
/** Puppeteer launch options */
112
launchContext?: PuppeteerLaunchContext;
113
114
/** Whether to use Puppeteer request interception */
115
useRequestInterception?: boolean;
116
117
/** Request interception handler */
118
interceptRequestHandler?: InterceptHandler;
119
120
/** Whether to block requests for faster crawling */
121
blockRequests?: boolean;
122
123
/** Custom viewport settings */
124
viewport?: Viewport;
125
126
/** Whether to use Chrome headless mode */
127
headless?: boolean | 'new';
128
129
/** Additional Chrome launch arguments */
130
args?: string[];
131
}
132
```
133
134
### PuppeteerCrawlingContext
135
136
The context object passed to Puppeteer crawler request handlers.
137
138
```typescript { .api }
139
interface PuppeteerCrawlingContext<UserData = Dictionary> extends BrowserCrawlingContext<UserData> {
140
/** The Puppeteer page object */
141
page: PuppeteerPage;
142
143
/** Browser context */
144
browserContext: PuppeteerBrowserContext;
145
146
/** The Puppeteer response object */
147
response?: PuppeteerResponse;
148
149
/** Enqueue links by clicking elements */
150
enqueueLinksByClickingElements(options: EnqueueLinksByClickingElementsOptions): Promise<BatchAddRequestsResult>;
151
152
/** Compile and evaluate script on page */
153
compileScript(pageFunction: string | Function, options?: CompileScriptOptions): Promise<any>;
154
}
155
```
156
157
**Usage Examples:**
158
159
```typescript
160
import { PuppeteerCrawler, Dataset } from "crawlee";
161
162
const crawler = new PuppeteerCrawler({
163
launchContext: {
164
launchOptions: {
165
headless: true,
166
args: ['--no-sandbox', '--disable-setuid-sandbox'],
167
},
168
},
169
170
requestHandler: async ({ page, request, enqueueLinks, infiniteScroll, saveSnapshot }) => {
171
// Wait for dynamic content to load
172
await page.waitForSelector('.product-list', { timeout: 10000 });
173
174
// Handle infinite scrolling
175
await infiniteScroll({
176
maxScrollHeight: 5000,
177
scrollDownAndUp: true,
178
});
179
180
// Extract data using browser APIs
181
const products = await page.evaluate(() => {
182
return Array.from(document.querySelectorAll('.product')).map(product => ({
183
name: product.querySelector('.name')?.textContent?.trim(),
184
price: product.querySelector('.price')?.textContent?.trim(),
185
image: product.querySelector('img')?.src,
186
rating: product.querySelector('.rating')?.getAttribute('data-rating'),
187
}));
188
});
189
190
// Take screenshot for debugging
191
await saveSnapshot({
192
key: `screenshot-${request.uniqueKey}`,
193
saveHtml: true,
194
});
195
196
await Dataset.pushData({
197
url: request.loadedUrl,
198
products,
199
extractedAt: new Date(),
200
});
201
202
// Find and click "Load More" buttons
203
await page.click('.load-more-btn').catch(() => {
204
// Ignore if button doesn't exist
205
});
206
207
// Enqueue pagination links
208
await enqueueLinks({
209
selector: 'a[href*="page="]',
210
label: 'LIST',
211
});
212
},
213
214
// Enable request blocking for faster crawling
215
blockRequests: true,
216
blockedUrlPatterns: [
217
'**/*.css',
218
'**/*.jpg',
219
'**/*.jpeg',
220
'**/*.png',
221
'**/*.svg',
222
'**/*.gif',
223
'**/*.woff',
224
'**/*.pdf',
225
'**/*.zip',
226
],
227
228
maxConcurrency: 3, // Lower concurrency for browser crawling
229
navigationTimeoutSecs: 30,
230
});
231
```
232
233
### PlaywrightCrawler
234
235
Browser crawler using Playwright for multi-browser automation.
236
237
```typescript { .api }
238
/**
239
* Playwright-based browser crawler supporting Chromium, Firefox, and WebKit
240
*/
241
class PlaywrightCrawler extends BrowserCrawler {
242
constructor(options: PlaywrightCrawlerOptions);
243
}
244
```
245
246
### PlaywrightCrawlerOptions
247
248
Configuration options specific to PlaywrightCrawler.
249
250
```typescript { .api }
251
interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions {
252
/** Handler function that receives Playwright context */
253
requestHandler: (context: PlaywrightCrawlingContext) => Promise<void>;
254
255
/** Playwright launch context */
256
launchContext?: PlaywrightLaunchContext;
257
258
/** Browser type to use (chromium, firefox, webkit) */
259
browserName?: 'chromium' | 'firefox' | 'webkit';
260
261
/** Whether to use browser context fingerprinting */
262
useFingerprints?: boolean;
263
264
/** Additional browser launch options */
265
launchOptions?: LaunchOptions;
266
267
/** Experiment with different rendering strategies */
268
experimentalContainers?: boolean;
269
}
270
```
271
272
### PlaywrightCrawlingContext
273
274
The context object passed to Playwright crawler request handlers.
275
276
```typescript { .api }
277
interface PlaywrightCrawlingContext<UserData = Dictionary> extends BrowserCrawlingContext<UserData> {
278
/** The Playwright page object */
279
page: PlaywrightPage;
280
281
/** Browser context */
282
browserContext: PlaywrightBrowserContext;
283
284
/** The Playwright response object */
285
response?: PlaywrightResponse;
286
287
/** Wait for network to be idle */
288
waitForNetworkIdle(options?: WaitForNetworkIdleOptions): Promise<void>;
289
290
/** Handle dialogs (alerts, confirms, prompts) */
291
handleDialog(handler: (dialog: Dialog) => Promise<void>): void;
292
}
293
```
294
295
**Usage Examples:**
296
297
```typescript
298
import { PlaywrightCrawler, Dataset } from "crawlee";
299
300
const crawler = new PlaywrightCrawler({
301
launchContext: {
302
launcher: 'chromium', // or 'firefox', 'webkit'
303
launchOptions: {
304
headless: true,
305
viewport: { width: 1920, height: 1080 },
306
},
307
},
308
309
requestHandler: async ({ page, request, enqueueLinks, waitForNetworkIdle }) => {
310
// Handle JavaScript-heavy pages
311
await waitForNetworkIdle({ timeout: 30000 });
312
313
// Interact with dynamic forms
314
await page.fill('input[name="search"]', 'example query');
315
await page.click('button[type="submit"]');
316
await page.waitForSelector('.results', { timeout: 10000 });
317
318
// Extract data after JavaScript execution
319
const results = await page.locator('.result-item').evaluateAll(items => {
320
return items.map(item => ({
321
title: item.querySelector('.title')?.textContent?.trim(),
322
description: item.querySelector('.description')?.textContent?.trim(),
323
link: item.querySelector('a')?.href,
324
}));
325
});
326
327
await page.screenshot({
328
path: `screenshots/${request.uniqueKey}.png`,
329
fullPage: true,
330
});
331
332
await Dataset.pushData({
333
url: request.loadedUrl,
334
results,
335
totalCount: results.length,
336
});
337
338
// Handle pagination with JavaScript
339
const hasNextPage = await page.locator('.next-page:not(.disabled)').count() > 0;
340
if (hasNextPage) {
341
await page.click('.next-page');
342
await enqueueLinks({
343
selector: '.next-page',
344
label: 'LIST',
345
});
346
}
347
},
348
349
browserName: 'chromium',
350
maxConcurrency: 2,
351
});
352
```
353
354
### AdaptivePlaywrightCrawler
355
356
Intelligent crawler that automatically switches between HTTP and browser rendering based on page requirements.
357
358
```typescript { .api }
359
/**
360
* Adaptive crawler that switches between HTTP and browser rendering automatically
361
*/
362
class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
363
constructor(options: AdaptivePlaywrightCrawlerOptions);
364
}
365
```
366
367
### AdaptivePlaywrightCrawlerOptions
368
369
Configuration options for the AdaptivePlaywrightCrawler.
370
371
```typescript { .api }
372
interface AdaptivePlaywrightCrawlerOptions extends PlaywrightCrawlerOptions {
373
/** Strategy for determining rendering type */
374
renderingTypeDecisionMaker?: RenderingTypePredictor;
375
376
/** HTTP crawler options for static pages */
377
httpCrawlerOptions?: HttpCrawlerOptions;
378
379
/** Threshold for switching to browser rendering */
380
browserRenderingThreshold?: number;
381
382
/** Whether to cache rendering decisions */
383
cacheDecisions?: boolean;
384
}
385
```
386
387
### RenderingTypePredictor
388
389
Service that predicts the optimal rendering strategy for websites.
390
391
```typescript { .api }
392
/**
393
* Predicts whether a website requires browser rendering or can use HTTP
394
*/
395
class RenderingTypePredictor {
396
constructor();
397
398
/** Predict rendering type for a URL */
399
predictRenderingType(url: string): Promise<RenderingType>;
400
401
/** Store rendering decision for future use */
402
storeResult(url: string, renderingType: RenderingType): void;
403
404
/** Get cached decision if available */
405
getCachedResult(url: string): RenderingType | null;
406
}
407
408
type RenderingType = 'http' | 'browser' | 'hybrid';
409
```
410
411
### BrowserPool
412
413
Manages browser instances efficiently for optimal resource usage.
414
415
```typescript { .api }
416
/**
417
* Pool for managing browser instances with automatic lifecycle management
418
*/
419
class BrowserPool {
420
constructor(options?: BrowserPoolOptions);
421
422
/** Get a browser page from the pool */
423
newPage(options?: NewPageOptions): Promise<{ page: Page; browser: Browser }>;
424
425
/** Return a page to the pool */
426
retire(page: Page): Promise<void>;
427
428
/** Destroy all browsers in the pool */
429
destroy(): Promise<void>;
430
431
/** Get current pool statistics */
432
getStatistics(): BrowserPoolStatistics;
433
}
434
```
435
436
### BrowserPoolOptions
437
438
Configuration options for BrowserPool.
439
440
```typescript { .api }
441
interface BrowserPoolOptions {
442
/** Maximum number of open pages per browser instance */
443
maxOpenPagesPerBrowser?: number;
444
445
/** Browser plugins to use */
446
browserPlugins?: BrowserPlugin[];
447
448
/** Browser fingerprinting options */
449
fingerprintOptions?: FingerprintGeneratorOptions;
450
451
/** Whether to use fingerprints */
452
useFingerprints?: boolean;
453
454
/** Browser launch context */
455
launchContext?: LaunchContext;
456
457
/** Number of pages a browser may open before it is retired */
458
retireBrowserAfterPageCount?: number;
459
460
/** Maximum browser idle time before retirement */
461
closeInactiveBrowserAfterSecs?: number;
462
}
463
464
interface BrowserPoolStatistics {
465
/** Number of active browsers */
466
activeBrowsers: number;
467
468
/** Number of active pages */
469
activePages: number;
470
471
/** Number of retired browsers */
472
retiredBrowsers: number;
473
474
/** Total pages created */
475
totalPagesCreated: number;
476
}
477
```
478
479
### Browser Launchers
480
481
Specialized launchers for different browser automation libraries.
482
483
```typescript { .api }
484
/**
485
* Puppeteer browser launcher
486
*/
487
class PuppeteerLauncher {
488
constructor(options?: PuppeteerLauncherOptions);
489
490
/** Launch a Puppeteer browser */
491
launch(options?: LaunchOptions): Promise<Browser>;
492
}
493
494
/**
495
* Playwright browser launcher
496
*/
497
class PlaywrightLauncher {
498
constructor(options?: PlaywrightLauncherOptions);
499
500
/** Launch a Playwright browser */
501
launch(options?: LaunchOptions): Promise<Browser>;
502
}
503
```
504
505
### Utility Functions
506
507
Browser automation helper functions.
508
509
```typescript { .api }
510
const puppeteerUtils: {
511
/** Block requests matching patterns */
512
blockRequests(page: PuppeteerPage, options?: BlockRequestsOptions): Promise<void>;
513
514
/** Cache responses for faster loading */
515
cacheResponses(page: PuppeteerPage, cache: Map<string, any>): Promise<void>;
516
517
/** Compile and inject JavaScript into page */
518
compileScript(scriptString: string, context?: any): CompiledScriptFunction;
519
520
/** Navigate with retries and error handling */
521
gotoExtended(page: PuppeteerPage, request: Request, options?: DirectNavigationOptions): Promise<Response | null>;
522
523
/** Infinite scroll implementation */
524
infiniteScroll(page: PuppeteerPage, options?: InfiniteScrollOptions): Promise<void>;
525
526
/** Save page snapshot (HTML + screenshot) */
527
saveSnapshot(page: PuppeteerPage, options?: SaveSnapshotOptions): Promise<void>;
528
529
/** Enqueue links by clicking elements */
530
enqueueLinksByClickingElements(options: EnqueueLinksByClickingElementsOptions): Promise<BatchAddRequestsResult>;
531
};
532
533
const playwrightUtils: {
534
/** Block requests matching patterns */
535
blockRequests(page: PlaywrightPage, options?: BlockRequestsOptions): Promise<void>;
536
537
/** Navigate with retries and error handling */
538
gotoExtended(page: PlaywrightPage, request: Request, options?: DirectNavigationOptions): Promise<Response | null>;
539
540
/** Infinite scroll implementation */
541
infiniteScroll(page: PlaywrightPage, options?: InfiniteScrollOptions): Promise<void>;
542
543
/** Save page snapshot (HTML + screenshot) */
544
saveSnapshot(page: PlaywrightPage, options?: SaveSnapshotOptions): Promise<void>;
545
546
/** Wait for network to be idle */
547
waitForNetworkIdle(page: PlaywrightPage, options?: WaitForNetworkIdleOptions): Promise<void>;
548
};
549
```
550
551
**Usage Examples:**
552
553
```typescript
554
import { PuppeteerCrawler, puppeteerUtils } from "crawlee";
555
556
const crawler = new PuppeteerCrawler({
557
preNavigationHooks: [
558
async ({ page }, gotoOptions) => {
559
// Block unnecessary resources
560
await puppeteerUtils.blockRequests(page, {
561
urlPatterns: ['.css', '.jpg', '.png'],
562
});
563
564
// Set custom headers
565
await page.setExtraHTTPHeaders({
566
'Accept-Language': 'en-US,en;q=0.9',
567
});
568
},
569
],
570
571
postNavigationHooks: [
572
async ({ page }) => {
573
// Wait for dynamic content
574
await page.waitForSelector('.dynamic-content', { timeout: 5000 });
575
576
// Inject custom scripts
577
await page.addScriptTag({
578
content: 'window.customFlag = true;',
579
});
580
},
581
],
582
583
requestHandler: async ({ page, request, infiniteScroll, saveSnapshot }) => {
584
// Use utility functions
585
await infiniteScroll({
586
maxScrollHeight: 10000,
587
waitForSecs: 2,
588
});
589
590
// Take snapshot for debugging
591
await saveSnapshot({
592
key: `snapshot-${Date.now()}`,
593
saveHtml: true,
594
saveScreenshot: true,
595
});
596
597
// Extract data...
598
},
599
});
600
```
601
602
## Types
603
604
```typescript { .api }
605
interface LaunchContext {
606
/** Browser launcher instance */
607
launcher?: any;
608
609
/** Browser launch options */
610
launchOptions?: LaunchOptions;
611
612
/** Browser type identifier */
613
browserName?: BrowserName;
614
615
/** Whether to use incognito browser contexts */
616
useIncognito?: boolean;
617
618
/** Proxy configuration */
619
proxyUrl?: string;
620
621
/** User data directory for persistent sessions */
622
userDataDir?: string;
623
}
624
625
interface DirectNavigationOptions {
626
/** Navigation timeout */
627
timeout?: number;
628
629
/** Wait until condition */
630
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle0' | 'networkidle2';
631
632
/** Referer header */
633
referer?: string;
634
}
635
636
interface InfiniteScrollOptions {
637
/** Maximum height to scroll */
638
maxScrollHeight?: number;
639
640
/** Time to wait between scrolls, in seconds */
641
waitForSecs?: number;
642
643
/** Scroll down and back up */
644
scrollDownAndUp?: boolean;
645
646
/** Custom scroll function */
647
scrollFunction?: string;
648
649
/** Stop scrolling condition */
650
stopScrollCallback?: () => boolean;
651
}
652
653
interface SaveSnapshotOptions {
654
/** Key to save under */
655
key: string;
656
657
/** Save HTML content */
658
saveHtml?: boolean;
659
660
/** Save screenshot */
661
saveScreenshot?: boolean;
662
663
/** Screenshot options */
664
screenshotOptions?: {
665
fullPage?: boolean;
666
quality?: number;
667
type?: 'png' | 'jpeg';
668
};
669
670
/** Key-value store to save to */
671
keyValueStore?: KeyValueStore;
672
}
673
674
interface BlockRequestsOptions {
675
/** URL patterns to block */
676
urlPatterns?: string[];
677
678
/** Extra URL patterns to block */
679
extraUrlPatterns?: string[];
680
681
/** Whether to block CSS */
682
blockCssRequests?: boolean;
683
684
/** Whether to block fonts */
685
blockFontRequests?: boolean;
686
687
/** Whether to block images */
688
blockImageRequests?: boolean;
689
690
/** Custom request handler */
691
requestHandler?: (request: any) => boolean;
692
}
693
694
interface ClickElementsOptions {
695
/** Maximum number of elements to click */
696
limit?: number;
697
698
/** Delay between clicks */
699
delay?: number;
700
701
/** Whether to wait for navigation after clicking */
702
waitForNavigation?: boolean;
703
704
/** Timeout for clicking each element */
705
timeout?: number;
706
}
707
708
interface EnqueueLinksByClickingElementsOptions extends CrawlerEnqueueLinksOptions {
709
/** Elements to click for finding links */
710
selector: string;
711
712
/** Wait for selector after clicking */
713
waitForSelector?: string;
714
715
/** Maximum number of clicks */
716
clickLimit?: number;
717
}
718
719
interface WaitForNetworkIdleOptions {
720
/** Timeout for network idle */
721
timeout?: number;
722
723
/** Time to wait with no network requests */
724
idleTime?: number;
725
}
726
727
interface CompileScriptOptions {
728
/** Context variables to inject */
729
context?: any;
730
731
/** Whether to return a promise */
732
async?: boolean;
733
}
734
735
type CompiledScriptFunction = (...args: any[]) => Promise<any>;
736
737
enum BrowserName {
738
CHROMIUM = 'chromium',
739
CHROME = 'chrome',
740
FIREFOX = 'firefox',
741
WEBKIT = 'webkit',
742
SAFARI = 'webkit',
743
}
744
745
interface Viewport {
746
/** Width in pixels */
747
width: number;
748
749
/** Height in pixels */
750
height: number;
751
752
/** Device scale factor */
753
deviceScaleFactor?: number;
754
755
/** Whether it's a mobile device */
756
isMobile?: boolean;
757
758
/** Whether it has touch support */
759
hasTouch?: boolean;
760
761
/** Whether it's in landscape mode */
762
isLandscape?: boolean;
763
}
764
765
interface FingerprintGeneratorOptions {
766
/** Browser fingerprints to generate */
767
browsers?: BrowserName[];
768
769
/** Operating systems to simulate */
770
operatingSystems?: OperatingSystemsName[];
771
772
/** Device categories to simulate */
773
devices?: DeviceCategory[];
774
775
/** Locale settings */
776
locales?: string[];
777
}
778
779
enum DeviceCategory {
780
DESKTOP = 'desktop',
781
MOBILE = 'mobile',
782
}
783
784
enum OperatingSystemsName {
785
WINDOWS = 'windows',
786
MACOS = 'macos',
787
LINUX = 'linux',
788
ANDROID = 'android',
789
IOS = 'ios',
790
}
791
```