The scalable web crawling and scraping library for JavaScript/Node.js that enables development of data extraction and web automation jobs with headless Chrome and Puppeteer.
Browser crawling provides full browser automation capabilities for handling JavaScript-heavy websites, dynamic content, and complex user interactions. This includes Puppeteer and Playwright integration with efficient browser pool management.
Base browser crawler class that extends BasicCrawler with browser automation capabilities.
/**
 * Base browser crawler for browser automation with Puppeteer or Playwright.
 * Extends BasicCrawler with full-browser navigation, hooks, and pool management.
 */
class BrowserCrawler extends BasicCrawler<BrowserCrawlingContext> {
constructor(options: BrowserCrawlerOptions);
}Configuration options for the BrowserCrawler.
interface BrowserCrawlerOptions extends BasicCrawlerOptions<BrowserCrawlingContext> {
/** Browser launcher options */
launchContext?: LaunchContext;
/** Browser pool configuration */
browserPoolOptions?: BrowserPoolOptions;
/** Whether to block certain resource types for faster loading */
blockRequests?: boolean;
/** Glob patterns of URLs to block when request blocking is enabled */
blockedUrlPatterns?: string[];
/** Pre-navigation hooks to run before page navigation */
preNavigationHooks?: Array<(crawlingContext: BrowserCrawlingContext, gotoOptions: DirectNavigationOptions) => Promise<void>>;
/** Post-navigation hooks to run after page navigation */
postNavigationHooks?: Array<(crawlingContext: BrowserCrawlingContext) => Promise<void>>;
/** Custom page function to run on each page */
pageFunction?: (context: BrowserCrawlingContext) => Promise<void>;
/** Navigation timeout in seconds (despite the name, not milliseconds — field is `...Secs`) */
navigationTimeoutSecs?: number;
/** Whether to keep browser context alive between requests */
keepAlive?: boolean;
/** Handler invoked for each request with the browser crawling context */
requestHandler?: (context: BrowserCrawlingContext) => Promise<void>;
}The context object passed to browser crawler request handlers.
interface BrowserCrawlingContext<UserData = Dictionary> extends BasicCrawlingContext<UserData> {
/** The browser page object */
page: Page;
/** Browser context */
browserContext: BrowserContext;
/** The response object from navigation */
response?: Response;
/** Enqueue links found on the page */
enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
/** Save a snapshot of the page (screenshot and/or HTML, per SaveSnapshotOptions) */
saveSnapshot(options?: SaveSnapshotOptions): Promise<void>;
/** Scroll page to load infinite content */
infiniteScroll(options?: InfiniteScrollOptions): Promise<void>;
/** Wait for a selector to appear; resolves null on timeout rather than throwing — TODO confirm */
waitForSelector(selector: string, options?: WaitForSelectorOptions): Promise<ElementHandle | null>;
/** Click elements matching selector */
clickElements(selector: string, options?: ClickElementsOptions): Promise<void>;
}Browser crawler using Puppeteer for Chrome/Chromium automation.
/**
 * Puppeteer-based browser crawler for Chrome/Chromium automation.
 * Specializes BrowserCrawler with Puppeteer-specific context and options.
 */
class PuppeteerCrawler extends BrowserCrawler {
constructor(options: PuppeteerCrawlerOptions);
}Configuration options specific to PuppeteerCrawler.
interface PuppeteerCrawlerOptions extends BrowserCrawlerOptions {
/** Handler function that receives Puppeteer context */
requestHandler: (context: PuppeteerCrawlingContext) => Promise<void>;
/** Puppeteer launch options */
launchContext?: PuppeteerLaunchContext;
/** Whether to use Puppeteer request interception */
useRequestInterception?: boolean;
/** Handler invoked for each intercepted request (used with useRequestInterception) */
interceptRequestHandler?: InterceptHandler;
/** Whether to block requests for faster crawling */
blockRequests?: boolean;
/** Custom viewport settings */
viewport?: Viewport;
/** Whether to use Chrome headless mode ('new' selects the new headless implementation) */
headless?: boolean | 'new';
/** Additional Chrome launch arguments */
args?: string[];
}The context object passed to Puppeteer crawler request handlers.
interface PuppeteerCrawlingContext<UserData = Dictionary> extends BrowserCrawlingContext<UserData> {
/** The Puppeteer page object */
page: PuppeteerPage;
/** Browser context */
browserContext: PuppeteerBrowserContext;
/** The Puppeteer response object */
response?: PuppeteerResponse;
/** Enqueue links revealed by clicking the given elements (e.g. JS-only navigation) */
enqueueLinksByClickingElements(options: EnqueueLinksByClickingElementsOptions): Promise<BatchAddRequestsResult>;
/** Compile a script and evaluate it in the page context; returns the script's result */
compileScript(pageFunction: string | Function, options?: CompileScriptOptions): Promise<any>;
}Usage Examples:
import { PuppeteerCrawler, Dataset } from "crawlee";
const crawler = new PuppeteerCrawler({
launchContext: {
launchOptions: {
headless: true,
args: ['--no-sandbox', '--disable-setuid-sandbox'],
},
},
requestHandler: async ({ page, request, enqueueLinks, infiniteScroll, saveSnapshot }) => {
// Wait for dynamic content to load
await page.waitForSelector('.product-list', { timeout: 10000 });
// Handle infinite scrolling
await infiniteScroll({
maxScrollHeight: 5000,
scrollDownAndUp: true,
});
// Extract data using browser APIs (runs inside the page, no Node scope access)
const products = await page.evaluate(() => {
return Array.from(document.querySelectorAll('.product')).map(product => ({
name: product.querySelector('.name')?.textContent?.trim(),
price: product.querySelector('.price')?.textContent?.trim(),
image: product.querySelector('img')?.src,
rating: product.querySelector('.rating')?.getAttribute('data-rating'),
}));
});
// Take screenshot for debugging
await saveSnapshot({
key: `screenshot-${request.uniqueKey}`,
saveHtml: true,
});
await Dataset.pushData({
url: request.loadedUrl,
products,
extractedAt: new Date(),
});
// Find and click "Load More" buttons
await page.click('.load-more-btn').catch(() => {
// Ignore if button doesn't exist
});
// Enqueue pagination links
await enqueueLinks({
selector: 'a[href*="page="]',
label: 'LIST',
});
},
// Enable request blocking for faster crawling
blockRequests: true,
blockedUrlPatterns: [
'**/*.css',
'**/*.jpg',
'**/*.jpeg',
'**/*.png',
'**/*.svg',
'**/*.gif',
'**/*.woff',
'**/*.pdf',
'**/*.zip',
],
maxConcurrency: 3, // Lower concurrency for browser crawling
navigationTimeoutSecs: 30,
});Browser crawler using Playwright for multi-browser automation.
/**
 * Playwright-based browser crawler supporting Chrome, Firefox, and Safari
 * (chromium, firefox, and webkit engines respectively).
 */
class PlaywrightCrawler extends BrowserCrawler {
constructor(options: PlaywrightCrawlerOptions);
}Configuration options specific to PlaywrightCrawler.
interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions {
/** Handler function that receives Playwright context */
requestHandler: (context: PlaywrightCrawlingContext) => Promise<void>;
/** Playwright launch context */
launchContext?: PlaywrightLaunchContext;
/** Browser type to use (chromium, firefox, webkit) */
browserName?: 'chromium' | 'firefox' | 'webkit';
/** Whether to use browser context fingerprinting */
useFingerprints?: boolean;
/** Additional browser launch options */
launchOptions?: LaunchOptions;
/** Enable experimental container-based isolation — NOTE(review): semantics not shown here, confirm against library docs */
experimentalContainers?: boolean;
}The context object passed to Playwright crawler request handlers.
interface PlaywrightCrawlingContext<UserData = Dictionary> extends BrowserCrawlingContext<UserData> {
/** The Playwright page object */
page: PlaywrightPage;
/** Browser context */
browserContext: PlaywrightBrowserContext;
/** The Playwright response object */
response?: PlaywrightResponse;
/** Wait for network to be idle */
waitForNetworkIdle(options?: WaitForNetworkIdleOptions): Promise<void>;
/** Register a handler for dialogs (alerts, confirms, prompts) */
handleDialog(handler: (dialog: Dialog) => Promise<void>): void;
}Usage Examples:
// Fix: the example calls Dataset.pushData but previously only imported PlaywrightCrawler.
import { PlaywrightCrawler, Dataset } from "crawlee";
const crawler = new PlaywrightCrawler({
launchContext: {
launcher: 'chromium', // or 'firefox', 'webkit'
launchOptions: {
headless: true,
viewport: { width: 1920, height: 1080 },
},
},
requestHandler: async ({ page, request, enqueueLinks, waitForNetworkIdle }) => {
// Handle JavaScript-heavy pages
await waitForNetworkIdle({ timeout: 30000 });
// Interact with dynamic forms
await page.fill('input[name="search"]', 'example query');
await page.click('button[type="submit"]');
await page.waitForSelector('.results', { timeout: 10000 });
// Extract data after JavaScript execution (callback runs in the page context)
const results = await page.locator('.result-item').evaluateAll(items => {
return items.map(item => ({
title: item.querySelector('.title')?.textContent?.trim(),
description: item.querySelector('.description')?.textContent?.trim(),
link: item.querySelector('a')?.href,
}));
});
await page.screenshot({
path: `screenshots/${request.uniqueKey}.png`,
fullPage: true,
});
await Dataset.pushData({
url: request.loadedUrl,
results,
totalCount: results.length,
});
// Handle pagination with JavaScript
const hasNextPage = await page.locator('.next-page:not(.disabled)').count() > 0;
if (hasNextPage) {
await page.click('.next-page');
await enqueueLinks({
selector: '.next-page',
label: 'LIST',
});
}
},
browserName: 'chromium',
maxConcurrency: 2,
});Intelligent crawler that automatically switches between HTTP and browser rendering based on page requirements.
/**
 * Adaptive crawler that switches between HTTP and browser rendering automatically,
 * using a rendering-type predictor to pick the cheaper strategy per page.
 */
class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
constructor(options: AdaptivePlaywrightCrawlerOptions);
}Configuration options for the AdaptivePlaywrightCrawler.
interface AdaptivePlaywrightCrawlerOptions extends PlaywrightCrawlerOptions {
/** Strategy for determining rendering type */
renderingTypeDecisionMaker?: RenderingTypePredictor;
/** HTTP crawler options for static pages */
httpCrawlerOptions?: HttpCrawlerOptions;
/** Threshold for switching to browser rendering — NOTE(review): units/scale not shown here, confirm */
browserRenderingThreshold?: number;
/** Whether to cache rendering decisions */
cacheDecisions?: boolean;
}Service that predicts the optimal rendering strategy for websites.
/**
 * Predicts whether a website requires browser rendering or can use plain HTTP,
 * and caches past decisions per URL for reuse.
 */
class RenderingTypePredictor {
constructor();
/** Predict rendering type for a URL */
predictRenderingType(url: string): Promise<RenderingType>;
/** Store rendering decision for future use */
storeResult(url: string, renderingType: RenderingType): void;
/** Get cached decision if available, null otherwise */
getCachedResult(url: string): RenderingType | null;
}
type RenderingType = 'http' | 'browser' | 'hybrid';Manages browser instances efficiently for optimal resource usage.
/**
 * Pool for managing browser instances with automatic lifecycle management.
 */
class BrowserPool {
constructor(options?: BrowserPoolOptions);
/** Open a new page in a pooled browser; returns the page and its owning browser */
newPage(options?: NewPageOptions): Promise<{ page: Page; browser: Browser }>;
/** Retire a page so the pool can wind down its browser — NOTE(review): signature suggests retirement, not reuse; confirm */
retire(page: Page): Promise<void>;
/** Destroy all browsers in the pool */
destroy(): Promise<void>;
/** Get current pool statistics */
getStatistics(): BrowserPoolStatistics;
}Configuration options for BrowserPool.
interface BrowserPoolOptions {
/** Maximum number of open pages allowed per browser instance */
maxOpenPagesPerBrowser?: number;
/** Browser plugins to use */
browserPlugins?: BrowserPlugin[];
/** Browser fingerprinting options */
fingerprintOptions?: FingerprintGeneratorOptions;
/** Whether to use fingerprints */
useFingerprints?: boolean;
/** Browser launch context */
launchContext?: LaunchContext;
/** Retire a browser after it has served this many pages */
retireBrowserAfterPageCount?: number;
/** Maximum browser idle time (seconds) before retirement — NOTE(review): replaces a duplicate `maxOpenPagesPerBrowser` declaration that was a TS error; confirm intended name against library docs */
closeInactiveBrowserAfterSecs?: number;
}
interface BrowserPoolStatistics {
/** Number of active browsers */
activeBrowsers: number;
/** Number of active pages */
activePages: number;
/** Number of retired browsers */
retiredBrowsers: number;
/** Total pages created over the pool's lifetime */
totalPagesCreated: number;
}Specialized launchers for different browser automation libraries.
/**
 * Puppeteer browser launcher.
 */
class PuppeteerLauncher {
constructor(options?: PuppeteerLauncherOptions);
/** Launch a Puppeteer browser */
launch(options?: LaunchOptions): Promise<Browser>;
}
/**
 * Playwright browser launcher.
 */
class PlaywrightLauncher {
constructor(options?: PlaywrightLauncherOptions);
/** Launch a Playwright browser */
launch(options?: LaunchOptions): Promise<Browser>;
}Browser automation helper functions.
const puppeteerUtils: {
/** Block requests matching patterns */
blockRequests(page: PuppeteerPage, options?: BlockRequestsOptions): Promise<void>;
/** Cache responses for faster loading */
cacheResponses(page: PuppeteerPage, cache: Map<string, any>): Promise<void>;
/** Compile a script string into a callable function bound to the page */
compileScript(scriptString: string, context?: any): CompiledScriptFunction;
/** Navigate with retries and error handling; resolves null when no response is produced */
gotoExtended(page: PuppeteerPage, request: Request, options?: DirectNavigationOptions): Promise<Response | null>;
/** Infinite scroll implementation */
infiniteScroll(page: PuppeteerPage, options?: InfiniteScrollOptions): Promise<void>;
/** Save page snapshot (HTML + screenshot) */
saveSnapshot(page: PuppeteerPage, options?: SaveSnapshotOptions): Promise<void>;
/** Enqueue links by clicking elements */
enqueueLinksByClickingElements(options: EnqueueLinksByClickingElementsOptions): Promise<BatchAddRequestsResult>;
};
const playwrightUtils: {
/** Block requests matching patterns */
blockRequests(page: PlaywrightPage, options?: BlockRequestsOptions): Promise<void>;
/** Navigate with retries and error handling; resolves null when no response is produced */
gotoExtended(page: PlaywrightPage, request: Request, options?: DirectNavigationOptions): Promise<Response | null>;
/** Infinite scroll implementation */
infiniteScroll(page: PlaywrightPage, options?: InfiniteScrollOptions): Promise<void>;
/** Save page snapshot (HTML + screenshot) */
saveSnapshot(page: PlaywrightPage, options?: SaveSnapshotOptions): Promise<void>;
/** Wait for network to be idle */
waitForNetworkIdle(page: PlaywrightPage, options?: WaitForNetworkIdleOptions): Promise<void>;
};Usage Examples:
import { PuppeteerCrawler, puppeteerUtils } from "crawlee";
const crawler = new PuppeteerCrawler({
preNavigationHooks: [
async ({ page }, gotoOptions) => {
// Block unnecessary resources before navigation starts
await puppeteerUtils.blockRequests(page, {
urlPatterns: ['.css', '.jpg', '.png'],
});
// Set custom headers
await page.setExtraHTTPHeaders({
'Accept-Language': 'en-US,en;q=0.9',
});
},
],
postNavigationHooks: [
async ({ page }) => {
// Wait for dynamic content
await page.waitForSelector('.dynamic-content', { timeout: 5000 });
// Inject custom scripts
await page.addScriptTag({
content: 'window.customFlag = true;',
});
},
],
requestHandler: async ({ page, request, infiniteScroll, saveSnapshot }) => {
// Use utility functions
await infiniteScroll({
maxScrollHeight: 10000,
waitForSecs: 2,
});
// Take snapshot for debugging
await saveSnapshot({
key: `snapshot-${Date.now()}`,
saveHtml: true,
saveScreenshot: true,
});
// Extract data...
},
});interface LaunchContext {
/** Browser launcher instance */
launcher?: any;
/** Browser launch options */
launchOptions?: LaunchOptions;
/** Browser type identifier */
browserName?: BrowserName;
/** Whether to open pages in an incognito browser context */
useIncognito?: boolean;
/** Proxy configuration */
proxyUrl?: string;
/** User data directory for persistent sessions */
userDataDir?: string;
}
interface DirectNavigationOptions {
/** Navigation timeout in milliseconds */
timeout?: number;
/** Lifecycle event to wait for before navigation is considered complete */
waitUntil?: 'load' | 'domcontentloaded' | 'networkidle0' | 'networkidle2';
/** Referer header */
referer?: string;
}
interface InfiniteScrollOptions {
/** Maximum height to scroll */
maxScrollHeight?: number;
/** Time to wait between scrolls (seconds) */
waitForSecs?: number;
/** Scroll down and back up */
scrollDownAndUp?: boolean;
/** Custom scroll function */
scrollFunction?: string;
/** Callback; scrolling stops when it returns true — TODO confirm polarity */
stopScrollCallback?: () => boolean;
}
interface SaveSnapshotOptions {
/** Key-value store key to save the snapshot under */
key: string;
/** Save HTML content */
saveHtml?: boolean;
/** Save screenshot */
saveScreenshot?: boolean;
/** Screenshot options */
screenshotOptions?: {
fullPage?: boolean;
quality?: number;
type?: 'png' | 'jpeg';
};
/** Key-value store to save to */
keyValueStore?: KeyValueStore;
}
interface BlockRequestsOptions {
/** URL patterns to block */
urlPatterns?: string[];
/** Extra URL patterns to block in addition to the defaults */
extraUrlPatterns?: string[];
/** Whether to block CSS */
blockCssRequests?: boolean;
/** Whether to block fonts */
blockFontRequests?: boolean;
/** Whether to block images */
blockImageRequests?: boolean;
/** Custom predicate; return value decides blocking — TODO confirm true means block */
requestHandler?: (request: any) => boolean;
}
interface ClickElementsOptions {
/** Maximum number of elements to click */
limit?: number;
/** Delay between clicks (milliseconds — TODO confirm units) */
delay?: number;
/** Whether to wait for navigation after clicking */
waitForNavigation?: boolean;
/** Timeout for clicking each element */
timeout?: number;
}
interface EnqueueLinksByClickingElementsOptions extends CrawlerEnqueueLinksOptions {
/** CSS selector of elements to click for discovering links */
selector: string;
/** Wait for this selector to appear after clicking */
waitForSelector?: string;
/** Maximum number of clicks */
clickLimit?: number;
}
interface WaitForNetworkIdleOptions {
/** Overall timeout for reaching network idle */
timeout?: number;
/** Time the network must stay quiet to count as idle */
idleTime?: number;
}
interface CompileScriptOptions {
/** Context variables to inject into the compiled script */
context?: any;
/** Whether the compiled script is treated as async */
async?: boolean;
}
type CompiledScriptFunction = (...args: any[]) => Promise<any>;
enum BrowserName {
CHROMIUM = 'chromium',
CHROME = 'chrome',
FIREFOX = 'firefox',
WEBKIT = 'webkit',
// Alias: Safari is driven via the WebKit engine, hence the shared value
SAFARI = 'webkit',
}
interface Viewport {
/** Width in pixels */
width: number;
/** Height in pixels */
height: number;
/** Device scale factor */
deviceScaleFactor?: number;
/** Whether it's a mobile device */
isMobile?: boolean;
/** Whether it has touch support */
hasTouch?: boolean;
/** Whether it's in landscape mode */
isLandscape?: boolean;
}
interface FingerprintGeneratorOptions {
/** Browsers whose fingerprints may be generated */
browsers?: BrowserName[];
/** Operating systems to simulate */
operatingSystems?: OperatingSystemsName[];
/** Device categories to simulate */
devices?: DeviceCategory[];
/** Locale settings */
locales?: string[];
}
enum DeviceCategory {
DESKTOP = 'desktop',
MOBILE = 'mobile',
}
enum OperatingSystemsName {
WINDOWS = 'windows',
MACOS = 'macos',
LINUX = 'linux',
ANDROID = 'android',
IOS = 'ios',
}Install with Tessl CLI
npx tessl i tessl/npm-crawlee