The scalable web crawling and scraping library for JavaScript/Node.js that enables development of data extraction and web automation jobs with headless Chrome and Puppeteer.

```bash
npx @tessl/cli install tessl/npm-crawlee@3.15.0
```

Crawlee is a comprehensive web crawling and scraping library for Node.js that enables development of robust data extraction and web automation jobs. It provides a unified interface for various crawling strategies, from simple HTTP requests to full browser automation with headless Chrome, Puppeteer, and Playwright.
```bash
npm install crawlee
```

For ES modules:

```typescript
import {
// Core crawlers
BasicCrawler,
HttpCrawler,
CheerioCrawler,
JSDOMCrawler,
LinkedOMCrawler,
PuppeteerCrawler,
PlaywrightCrawler,
FileDownload,
// Storage
Dataset,
KeyValueStore,
RequestQueue,
RequestList,
RecoverableState,
// Session management
SessionPool,
Session,
// Configuration and proxies
Configuration,
ProxyConfiguration,
// Error handling
NonRetryableError,
CriticalError,
MissingRouteError,
RetryRequestError,
SessionError,
BrowserLaunchError,
// State management
useState,
purgeDefaultStorages,
// Utilities
utils,
enqueueLinks,
sleep
} from "crawlee";
```

For CommonJS:

```typescript
const {
// Core crawlers
BasicCrawler,
HttpCrawler,
CheerioCrawler,
JSDOMCrawler,
LinkedOMCrawler,
PuppeteerCrawler,
PlaywrightCrawler,
FileDownload,
// Storage
Dataset,
KeyValueStore,
RequestQueue,
RequestList,
RecoverableState,
// Session management
SessionPool,
Session,
// Configuration and proxies
Configuration,
ProxyConfiguration,
// Error handling
NonRetryableError,
CriticalError,
MissingRouteError,
RetryRequestError,
SessionError,
BrowserLaunchError,
// State management
useState,
purgeDefaultStorages,
// Utilities
utils,
enqueueLinks,
sleep
} = require("crawlee");
```

Basic usage with CheerioCrawler:

```typescript
import { CheerioCrawler, Dataset } from "crawlee";
const crawler = new CheerioCrawler({
requestHandler: async ({ $, request, enqueueLinks }) => {
// Extract data from the page
const title = $('title').text();
const price = $('.price').text();
// Save data to dataset
await Dataset.pushData({
url: request.loadedUrl,
title,
price,
});
// Find and enqueue new links
await enqueueLinks({
selector: 'a[href^="/products/"]',
label: 'PRODUCT',
});
},
});
// Add initial URLs
await crawler.addRequests(['https://example.com/products']);
// Run the crawler
await crawler.run();
```

Crawlee is built around several key architectural components:

Crawlers are organized in an inheritance hierarchy (BasicCrawler → HttpCrawler → CheerioCrawler/JSDOMCrawler/LinkedOMCrawler, and BasicCrawler → BrowserCrawler → PuppeteerCrawler/PlaywrightCrawler).

Foundation classes for building custom crawlers with autoscaling, request management, and error handling.
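
A minimal sketch of a custom BasicCrawler (the URL is a placeholder); the handler fetches each request with the built-in sendRequest helper and stores a summary:

```typescript
import { BasicCrawler } from "crawlee";

const crawler = new BasicCrawler({
    maxRequestRetries: 2,
    requestHandler: async ({ request, sendRequest, log, pushData }) => {
        // sendRequest performs a plain HTTP request for the current URL
        const { body } = await sendRequest();
        log.info(`Fetched ${request.url} (${body.length} bytes)`);
        await pushData({ url: request.url, length: body.length });
    },
});

await crawler.addRequests(["https://example.com"]);
await crawler.run();
```
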
```typescript
class BasicCrawler<Context = BasicCrawlingContext> {
constructor(options: BasicCrawlerOptions<Context>);
run(): Promise<FinalStatistics>;
addRequests(requests: (string | RequestOptions)[]): Promise<void>;
}
class AutoscaledPool {
constructor(options: AutoscaledPoolOptions);
run(): Promise<void>;
abort(): Promise<void>;
pause(): Promise<void>;
resume(): Promise<void>;
}
```

Server-side HTML parsing crawlers for efficient data extraction without browser overhead.
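
For example, an HttpCrawler can consume a JSON API without any HTML parsing; this sketch assumes a placeholder endpoint:

```typescript
import { HttpCrawler } from "crawlee";

const crawler = new HttpCrawler({
    requestHandler: async ({ request, body, contentType, pushData }) => {
        // body holds the raw response; parse JSON endpoints manually
        if (contentType.type === "application/json") {
            await pushData({ url: request.url, data: JSON.parse(body.toString()) });
        }
    },
});

await crawler.run(["https://example.com/api/items"]);
```
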
```typescript
class HttpCrawler extends BasicCrawler<HttpCrawlingContext> {
constructor(options: HttpCrawlerOptions);
}
class CheerioCrawler extends HttpCrawler {
constructor(options: CheerioCrawlerOptions);
}
class JSDOMCrawler extends HttpCrawler {
constructor(options: JSDOMCrawlerOptions);
}
```

Full browser automation with Puppeteer and Playwright for JavaScript-heavy websites.
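
A PlaywrightCrawler sketch for a JavaScript-rendered page (URL and selector are placeholders):

```typescript
import { PlaywrightCrawler } from "crawlee";

const crawler = new PlaywrightCrawler({
    headless: true,
    requestHandler: async ({ page, request, enqueueLinks, pushData }) => {
        // Wait for client-side rendering, then read data from the live DOM
        await page.waitForSelector("h1");
        const title = await page.title();
        await pushData({ url: request.loadedUrl, title });
        // Follow links discovered on the rendered page
        await enqueueLinks({ strategy: "same-domain" });
    },
});

await crawler.run(["https://example.com"]);
```
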
```typescript
class BrowserCrawler extends BasicCrawler<BrowserCrawlingContext> {
constructor(options: BrowserCrawlerOptions);
}
class PuppeteerCrawler extends BrowserCrawler {
constructor(options: PuppeteerCrawlerOptions);
}
class PlaywrightCrawler extends BrowserCrawler {
constructor(options: PlaywrightCrawlerOptions);
}
```

Persistent storage solutions for structured data, key-value pairs, and request management.
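
A short sketch of the three storage primitives (the store and dataset names are illustrative):

```typescript
import { Dataset, KeyValueStore, RequestQueue } from "crawlee";

// Named dataset for structured results
const dataset = await Dataset.open("products");
await dataset.pushData({ sku: "abc-123", price: 19.99 });

// Key-value store for arbitrary records (state, screenshots, configs)
const store = await KeyValueStore.open();
await store.setValue("LAST_RUN", { finishedAt: new Date().toISOString() });
const lastRun = await store.getValue("LAST_RUN");

// Request queue for URLs waiting to be crawled
const queue = await RequestQueue.open();
await queue.addRequest({ url: "https://example.com", label: "START" });
```
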
```typescript
class Dataset {
static open(idOrName?: string): Promise<Dataset>;
pushData(data: Dictionary | Dictionary[]): Promise<void>;
getData(options?: DatasetDataOptions): Promise<DatasetData>;
}
class KeyValueStore {
static open(idOrName?: string): Promise<KeyValueStore>;
setValue(key: string, value: any, options?: RecordOptions): Promise<void>;
getValue<T>(key: string): Promise<T | null>;
}
class RequestQueue {
static open(idOrName?: string): Promise<RequestQueue>;
addRequest(request: RequestOptions | string): Promise<QueueOperationInfo>;
fetchNextRequest(): Promise<Request | null>;
}
```

Helper functions for URL extraction, social media parsing, and system detection.
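
A sketch of the helpers in action, following the utils namespace outlined below (the HTML snippet is made up):

```typescript
import { utils, sleep } from "crawlee";

const html = '<a href="https://twitter.com/apify">@apify</a> contact@example.com';

// Extract social handles and contact details from raw HTML/text
const handles = utils.social.parseHandlesFromHtml(html);
const emails = utils.social.emailsFromText(html);

// Pause politely between requests
await sleep(1000);

console.log(handles.twitters, emails);
```
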
```typescript
const utils: {
sleep(millis?: number): Promise<void>;
enqueueLinks(options: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;
social: {
parseHandlesFromHtml(html: string): SocialHandles;
emailsFromText(text: string): string[];
phonesFromText(text: string): string[];
};
downloadListOfUrls(options: DownloadListOfUrlsOptions): Promise<string[]>;
parseOpenGraph(html: string): Dictionary;
isDocker(): boolean;
isLambda(): boolean;
};
```

Session rotation and proxy management for handling anti-bot measures.
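
A sketch of session usage on a crawler; treating a 403 response as a block is an illustrative policy, not a Crawlee rule:

```typescript
import { CheerioCrawler } from "crawlee";

const crawler = new CheerioCrawler({
    useSessionPool: true,
    persistCookiesPerSession: true,
    sessionPoolOptions: { maxPoolSize: 50 },
    requestHandler: async ({ session, response }) => {
        // Retire sessions that appear to be blocked so fresh ones are used
        if (response.statusCode === 403) {
            session?.retire();
            throw new Error("Blocked, retrying with a fresh session");
        }
        session?.markGood();
    },
});
```
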
```typescript
class SessionPool {
constructor(options?: SessionPoolOptions);
getSession(request?: Request): Promise<Session>;
markSessionBad(session: Session): Promise<void>;
}
class Session {
constructor(options: SessionOptions);
getCookieString(url: string): string;
setPuppeteerCookies(page: Page, domain?: string): Promise<void>;
}
```

Global configuration management and proxy handling for distributed crawling.
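
For example, a static list of proxies (placeholder URLs) can be rotated across requests:

```typescript
import { CheerioCrawler, ProxyConfiguration } from "crawlee";

const proxyConfiguration = new ProxyConfiguration({
    // Placeholder proxy servers; Crawlee rotates them across sessions
    proxyUrls: ["http://proxy-1.example.com:8000", "http://proxy-2.example.com:8000"],
});

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: async ({ proxyInfo, request, log }) => {
        log.info(`Fetched ${request.url} via ${proxyInfo?.url}`);
    },
});
```
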
```typescript
class Configuration {
static getGlobalConfig(): Configuration;
get(key: string): any;
set(key: string, value: any): void;
}
class ProxyConfiguration {
constructor(options?: ProxyConfigurationOptions);
newUrl(sessionId?: number | string): Promise<string | undefined>;
newProxyInfo(sessionId?: number | string): Promise<ProxyInfo | undefined>;
}
```

Comprehensive error handling system with specialized error types for different failure scenarios.
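
A sketch of how these error types influence retry behavior inside a request handler:

```typescript
import { CheerioCrawler, NonRetryableError, CriticalError } from "crawlee";

const crawler = new CheerioCrawler({
    requestHandler: async ({ request, response }) => {
        if (response.statusCode === 404) {
            // Do not waste retries on pages that are permanently gone
            throw new NonRetryableError(`Not found: ${request.url}`);
        }
        if (response.statusCode === 401) {
            // Abort the whole crawl when credentials are rejected
            throw new CriticalError("Authentication failed");
        }
    },
    failedRequestHandler: async ({ request }, error) => {
        // Called after all retries are exhausted
        console.error(`Request ${request.url} failed: ${error.message}`);
    },
});
```
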
```typescript
/**
* Base error for requests that should not be retried
*/
class NonRetryableError extends Error {
constructor(message?: string);
}
/**
* Critical error that extends NonRetryableError
*/
class CriticalError extends NonRetryableError {
constructor(message?: string);
}
/**
* Error indicating a missing route handler
*/
class MissingRouteError extends CriticalError {
constructor(message?: string);
}
/**
 * Error indicating that a request should be retried
*/
class RetryRequestError extends Error {
constructor(message?: string, options?: { retryAfter?: number });
}
/**
* Session-related error extending RetryRequestError
*/
class SessionError extends RetryRequestError {
constructor(session: Session, message?: string, options?: { retryAfter?: number });
}
/**
* Browser launch error for browser pool issues
*/
class BrowserLaunchError extends CriticalError {
constructor(message?: string);
}
/**
* Cookie parsing error for session management
*/
class CookieParseError extends Error {
constructor(message?: string);
}
```

Core types used throughout the API:

```typescript
interface BasicCrawlerOptions<Context> {
requestList?: RequestList;
requestQueue?: RequestQueue;
requestHandler: (context: Context) => Promise<void>;
maxRequestRetries?: number;
maxRequestsPerCrawl?: number;
maxConcurrency?: number;
autoscaledPoolOptions?: AutoscaledPoolOptions;
sessionPoolOptions?: SessionPoolOptions;
useSessionPool?: boolean;
persistCookiesPerSession?: boolean;
}
interface BasicCrawlingContext<UserData = Dictionary> {
request: Request<UserData>;
session?: Session;
proxyInfo?: ProxyInfo;
response?: IncomingMessage;
crawler: BasicCrawler;
log: Log;
sendRequest<T>(overrideOptions?: Partial<OptionsInit>): Promise<T>;
enqueueLinks(options?: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;
pushData(data: Dictionary | Dictionary[]): Promise<void>;
setValue(key: string, value: any, options?: RecordOptions): Promise<void>;
getValue<T>(key: string): Promise<T | null>;
}
interface Request<UserData = Dictionary> {
url: string;
loadedUrl?: string;
uniqueKey: string;
method?: HttpMethod;
payload?: string;
noRetry?: boolean;
retryCount?: number;
errorMessages?: string[];
headers?: Dictionary;
userData?: UserData;
handledAt?: Date;
label?: string;
keepUrlFragment?: boolean;
}
interface ProxyInfo {
url: string;
hostname: string;
port: number;
auth?: {
username: string;
password: string;
};
protocol: string;
sessionId?: string | number;
}
interface FinalStatistics {
requestsFinished: number;
requestsFailed: number;
requestsRetries: number;
requestsFailedPerMinute: number;
requestsFinishedPerMinute: number;
requestMinDurationMillis: number;
requestMaxDurationMillis: number;
requestTotalDurationMillis: number;
crawlerStartedAt: Date;
crawlerFinishedAt: Date;
statsId: string;
}