Crawlee is a scalable web crawling and scraping library for JavaScript/Node.js that enables the development of data extraction and web automation jobs with headless Chrome and Puppeteer.
Crawlee provides comprehensive storage solutions for persisting scraped data, managing request queues, and handling key-value storage. The storage system supports both cloud-based and local storage backends.
Datasets store structured data in JSON format, ideal for storing scraped results and enabling easy export to various formats.
/**
 * Dataset for storing structured data (JSON objects)
 */
class Dataset {
  /** Open an existing dataset or create a new one */
  static open(idOrName?: string): Promise<Dataset>;
  /** Get the default dataset instance */
  static getDefaultDataset(): Promise<Dataset>;
  /** Push data to the dataset */
  pushData(data: Dictionary | Dictionary[]): Promise<void>;
  /** Get data from the dataset */
  getData(options?: DatasetDataOptions): Promise<DatasetData>;
  /** Get dataset information */
  getInfo(): Promise<DatasetInfo>;
  /** Export dataset to various formats */
  exportTo(options: DatasetExportOptions): Promise<void>;
  /** Delete the dataset */
  drop(): Promise<void>;
  /** Convert dataset to a stream */
  stream(options?: DatasetStreamOptions): NodeJS.ReadableStream;
  /** The dataset ID */
  readonly id: string;
  /** The dataset name */
  readonly name?: string;
}

Options for retrieving data from datasets.
interface DatasetDataOptions {
  /** Number of items to retrieve */
  limit?: number;
  /** Number of items to skip */
  offset?: number;
  /** Whether to return data in clean JSON format */
  clean?: boolean;
  /** Fields to include in results */
  fields?: string[];
  /** Whether to return data in descending order */
  desc?: boolean;
  /** JSON streaming options */
  streaming?: boolean;
}

Options for exporting dataset data to different formats.
interface DatasetExportOptions {
  /** Format to export to */
  format: 'json' | 'csv' | 'xlsx' | 'xml' | 'rss';
  /** Fields to include in export */
  fields?: string[];
  /** Whether to exclude empty fields */
  omitEmptyFields?: boolean;
  /** Whether to exclude null values */
  omitNullValues?: boolean;
  /** Key to store the exported file under */
  key: string;
  /** Key-value store to save to */
  keyValueStore?: KeyValueStore;
  /** Maximum number of items to export */
  limit?: number;
  /** Number of items to skip */
  offset?: number;
}

Usage Examples:
import { Dataset, CheerioCrawler } from "crawlee";
const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData }) => {
    // Extract product data
    const products = [];
    $('.product').each((_, element) => {
      products.push({
        name: $(element).find('.name').text().trim(),
        price: $(element).find('.price').text().trim(),
        // Resolve relative links against the final (post-redirect) URL
        url: new URL($(element).find('a').attr('href'), request.loadedUrl).href,
        extractedAt: new Date().toISOString(),
      });
    });
    // Save to default dataset
    await pushData(products);
  },
});
await crawler.run();
// Work with the dataset after crawling
const dataset = await Dataset.getDefaultDataset();
// Get all data
const { items } = await dataset.getData();
console.log(`Extracted ${items.length} products`);
// Export to CSV
await dataset.exportTo({
  format: 'csv',
  key: 'products.csv',
  fields: ['name', 'price', 'url'],
});
// Get specific data with filtering
const recentItems = await dataset.getData({
  limit: 100,
  clean: true,
  desc: true,
});
// Stream large datasets
const stream = dataset.stream({
  limit: 10000,
});
stream.on('data', (item) => {
  console.log('Processing item:', item.name);
});

Key-value stores handle unstructured data, files, and configuration storage with support for various data formats.
/**
 * Key-value store for storing unstructured data and files
 */
class KeyValueStore {
  /** Open an existing store or create a new one */
  static open(idOrName?: string): Promise<KeyValueStore>;
  /** Get the default key-value store */
  static getDefaultKeyValueStore(): Promise<KeyValueStore>;
  /** Set a value for a key */
  setValue(key: string, value: any, options?: RecordOptions): Promise<void>;
  /** Get a value by key */
  getValue<T = any>(key: string): Promise<T | null>;
  /** Get a public URL for a stored file */
  getPublicUrl(key: string): string;
  /** Delete a record */
  delete(key: string): Promise<void>;
  /** Get store information */
  getInfo(): Promise<KeyValueStoreInfo>;
  /** List all keys in the store */
  listKeys(options?: ListKeysOptions): Promise<KeyValueStoreKeys>;
  /** Delete the entire store */
  drop(): Promise<void>;
  /** The store ID */
  readonly id: string;
  /** The store name */
  readonly name?: string;
}

Options for storing records in key-value stores.
interface RecordOptions {
  /** Content type of the stored data */
  contentType?: string;
  /** Whether to gzip the content */
  gzip?: boolean;
  /** Custom metadata */
  metadata?: Dictionary;
}

Usage Examples:
import { KeyValueStore, CheerioCrawler } from "crawlee";
const crawler = new CheerioCrawler({
  requestHandler: async ({ request, $, setValue }) => {
    // Save HTML content
    await setValue(`html-${request.uniqueKey}`, $.html());
    // Save screenshot (if using browser crawler)
    // NOTE(review): screenshotBuffer is not defined in this snippet — it is
    // assumed to come from a browser crawler context (e.g. page.screenshot()).
    await setValue(`screenshot-${request.uniqueKey}`, screenshotBuffer, {
      contentType: 'image/png',
    });
    // Save structured data
    await setValue(`metadata-${request.uniqueKey}`, {
      title: $('title').text(),
      description: $('meta[name="description"]').attr('content'),
      extractedAt: new Date(),
    });
    // Save with custom options
    await setValue('large-file', largeJsonData, {
      gzip: true,
      contentType: 'application/json',
    });
  },
});
// Work with key-value store after crawling
const store = await KeyValueStore.getDefaultKeyValueStore();
// Retrieve stored data
const htmlContent = await store.getValue('html-page-1');
const metadata = await store.getValue('metadata-page-1');
// List all keys
const { keys } = await store.listKeys({ prefix: 'screenshot-' });
console.log(`Found ${keys.length} screenshots`);
// Get public URLs for files
const screenshotUrl = store.getPublicUrl('screenshot-page-1');
console.log(`Screenshot available at: ${screenshotUrl}`);
// Clean up old data — listKeys() returns key *records*, so destructure the
// string key out of each entry before matching/deleting.
for (const { key } of keys) {
  if (key.includes('temp-')) {
    await store.delete(key);
  }
}

Request queues manage crawling requests in FIFO order with support for priorities, deduplication, and persistence.
/**
 * Request queue for managing crawling requests in FIFO order
 */
class RequestQueue {
  /** Open an existing queue or create a new one */
  static open(idOrName?: string): Promise<RequestQueue>;
  /** Get the default request queue */
  static getDefaultRequestQueue(): Promise<RequestQueue>;
  /** Add a single request to the queue */
  addRequest(request: RequestOptions | string, options?: RequestQueueOptions): Promise<QueueOperationInfo>;
  /** Add multiple requests to the queue */
  addRequests(requests: (RequestOptions | string)[], options?: RequestQueueOptions): Promise<BatchAddRequestsResult>;
  /** Get the next request from the queue */
  fetchNextRequest(): Promise<Request | null>;
  /** Mark a request as handled */
  markRequestHandled(request: Request): Promise<QueueOperationInfo>;
  /** Return a request to the queue for retry */
  reclaimRequest(request: Request, options?: ReclaimRequestOptions): Promise<QueueOperationInfo>;
  /** Check if the queue is empty */
  isEmpty(): Promise<boolean>;
  /** Check if the queue is finished (no pending requests) */
  isFinished(): Promise<boolean>;
  /** Get queue information and statistics */
  getInfo(): Promise<RequestQueueInfo>;
  /** Delete the queue */
  drop(): Promise<void>;
  /** The queue ID */
  readonly id: string;
  /** The queue name */
  readonly name?: string;
}

Options for creating requests to add to queues.
interface RequestOptions<UserData = Dictionary> {
  /** Request URL */
  url: string;
  /** Unique key for deduplication */
  uniqueKey?: string;
  /** HTTP method */
  method?: HttpMethod;
  /** Request payload for POST/PUT requests */
  payload?: string;
  /** HTTP headers */
  headers?: Dictionary;
  /** Custom user data */
  userData?: UserData;
  /** Request label for routing */
  label?: string;
  /** If true, the request will not be retried when it fails */
  noRetry?: boolean;
  /** Priority (higher numbers = higher priority) */
  priority?: number;
  /** Whether to keep URL fragments */
  keepUrlFragment?: boolean;
}

Usage Examples:
import { RequestQueue, BasicCrawler } from "crawlee";
// Initialize queue before starting crawler
const requestQueue = await RequestQueue.open('my-crawl-queue');
// Add initial requests (plain URL strings and full request objects may be mixed)
await requestQueue.addRequests([
  'https://example.com/page1',
  'https://example.com/page2',
  {
    url: 'https://example.com/api/data',
    method: 'POST',
    payload: JSON.stringify({ query: 'products' }),
    headers: { 'Content-Type': 'application/json' },
    label: 'API',
    userData: { type: 'api-call' },
  },
]);
const crawler = new BasicCrawler({
  requestQueue,
  requestHandler: async ({ request, enqueueLinks }) => {
    if (request.label === 'API') {
      // Handle API requests differently
      console.log(`Processing API request: ${request.url}`);
      return;
    }
    // Add more requests dynamically
    await enqueueLinks({
      selector: 'a[href]',
      transformRequestFunction: (req) => ({
        ...req,
        priority: req.url.includes('/important/') ? 10 : 1,
      }),
    });
  },
});
// Monitor queue status
const info = await requestQueue.getInfo();
console.log(`Queue has ${info.totalRequestCount} total requests`);
console.log(`${info.handledRequestCount} handled, ${info.pendingRequestCount} pending`);
await crawler.run();

Request lists provide finite, static collections of requests for bounded crawling scenarios.
/**
 * Static list of requests for finite crawling scenarios
 */
class RequestList {
  /** Create a new request list from URLs or request objects */
  static open(sources: (string | RequestOptions)[], options?: RequestListOptions): Promise<RequestList>;
  /** Get the next request from the list */
  fetchNextRequest(): Promise<Request | null>;
  /** Mark a request as handled */
  markRequestHandled(request: Request): Promise<void>;
  /** Return a request to the list for retry */
  reclaimRequest(request: Request): Promise<void>;
  /** Check if all requests have been processed */
  isFinished(): Promise<boolean>;
  /** Check if the list is empty */
  isEmpty(): Promise<boolean>;
  /** Get the total number of requests */
  length(): number;
  /** Get the number of handled requests */
  handledCount(): number;
  /** Get list information and statistics */
  getState(): RequestListState;
}

Options for creating request lists.
interface RequestListOptions {
  /** Whether to keep duplicates */
  keepDuplicateUrls?: boolean;
  /** Whether to check duplicates by URL only */
  checksumOptions?: {
    forceUrlEncoding?: boolean;
    includeFragment?: boolean;
  };
  /** Custom unique key function */
  uniqueKey?: (requestOptions: RequestOptions) => string;
  /** Persist state to key-value store */
  persistStateKey?: string;
  /** Key-value store for persistence */
  persistStateKeyValueStore?: KeyValueStore;
}

Usage Examples:
import { Dataset, RequestList, CheerioCrawler } from "crawlee";
// Create request list from mixed sources
const requestList = await RequestList.open([
  'https://example.com/category/electronics',
  'https://example.com/category/books',
  {
    url: 'https://example.com/category/clothing',
    userData: { category: 'fashion' },
    label: 'CATEGORY',
  },
  {
    url: 'https://example.com/special-page',
    priority: 10,
    label: 'PRIORITY',
  },
]);
const crawler = new CheerioCrawler({
  requestList,
  requestHandler: async ({ request, $ }) => {
    console.log(`Processing ${request.label || 'page'}: ${request.url}`);
    if (request.userData?.category) {
      console.log(`Category: ${request.userData.category}`);
    }
    // Extract data specific to the page type
    const data = {
      url: request.loadedUrl,
      title: $('title').text(),
      timestamp: new Date(),
    };
    // Dataset must be imported for this call to resolve
    await Dataset.pushData(data);
  },
  // Don't add new requests - just process the static list
  maxConcurrency: 5,
});
// Monitor progress
console.log(`Total requests: ${requestList.length()}`);
await crawler.run();
console.log(`Processed ${requestList.handledCount()} requests`);
console.log(`Finished: ${await requestList.isFinished()}`);

Central management for all storage instances and their lifecycle.
/**
 * Manages storage instances and their lifecycle
 */
class StorageManager {
constructor(options?: StorageManagerOptions);
/** Get or create a dataset */
dataset(idOrName?: string): Promise<Dataset>;
/** Get or create a key-value store */
keyValueStore(idOrName?: string): Promise<KeyValueStore>;
/** Get or create a request queue */
requestQueue(idOrName?: string): Promise<RequestQueue>;
/** Close all storage instances */
closeAll(): Promise<void>;
/** List all datasets */
listDatasets(): Promise<DatasetCollectionInfo>;
/** List all key-value stores */
listKeyValueStores(): Promise<KeyValueStoreCollectionInfo>;
/** List all request queues */
listRequestQueues(): Promise<RequestQueueCollectionInfo>;
}
interface StorageManagerOptions {
  /** Storage client to use */
  storageClient?: StorageClient;
  /** Local data directory */
  localDataDirectory?: string;
  /** Whether to purge storage on startup */
  purgeOnStart?: boolean;
}

Low-level storage client for advanced storage operations.
/**
 * Low-level storage client for advanced operations.
 * Implementations back the storage classes with either local or
 * cloud-based persistence (see the storage overview above).
 */
interface StorageClient {
/** Dataset operations */
datasets(): DatasetClient;
/** Key-value store operations */
keyValueStores(): KeyValueStoreClient;
/** Request queue operations */
requestQueues(): RequestQueueClient;
/** Set storage options */
setOptions(options: StorageClientOptions): void;
}
interface StorageClientOptions {
  /** Base URL for storage API */
  baseUrl?: string;
  /** Authentication token */
  token?: string;
  /** Default timeout for requests */
  timeoutSecs?: number;
  /** Maximum retries for failed requests */
  maxRetries?: number;
}

Advanced state management capabilities for persistent and recoverable crawler state.
/**
 * Class for managing persistent recoverable state
 */
class RecoverableState<TStateModel = Record<string, unknown>> {
  constructor(options: RecoverableStateOptions<TStateModel>);
  /** Initialize and load persisted state */
  initialize(): Promise<TStateModel>;
  /** Clean up resources and persist state */
  teardown(): Promise<void>;
  /** Get current state value */
  get currentValue(): TStateModel;
  /** Reset state to default values */
  reset(): Promise<void>;
  /** Manually persist current state */
  persistState(eventData?: { isMigrating: boolean }): Promise<void>;
}
/**
 * Simple state management with automatic persistence
 */
function useState<State extends Dictionary = Dictionary>(
  name?: string,
  defaultValue?: State,
  options?: UseStateOptions
): Promise<AutoSavedValue<State>>;
/**
 * Purge default storage directories
 */
function purgeDefaultStorages(options?: PurgeDefaultStorageOptions): Promise<void>;
function purgeDefaultStorages(config?: Configuration, client?: StorageClient): Promise<void>;

Usage Examples:
import { RecoverableState, useState, purgeDefaultStorages, CheerioCrawler, Configuration } from "crawlee";
// Advanced recoverable state for complex crawler state
interface CrawlerState {
  processedUrls: Set<string>;
  categoryProgress: Record<string, number>;
  lastCheckpoint: Date;
  errorCounts: Record<string, number>;
}
const state = new RecoverableState<CrawlerState>({
  defaultState: {
    processedUrls: new Set<string>(),
    categoryProgress: {},
    lastCheckpoint: new Date(),
    errorCounts: {},
  },
  persistStateKey: 'CRAWLER_STATE',
  persistenceEnabled: true,
  persistStateKvsName: 'crawler-checkpoints',
  // Custom serialization for Set objects (Sets are not JSON-serializable)
  serialize: (state) => JSON.stringify({
    ...state,
    processedUrls: Array.from(state.processedUrls),
  }),
  deserialize: (json) => {
    const parsed = JSON.parse(json);
    return {
      ...parsed,
      processedUrls: new Set(parsed.processedUrls),
      lastCheckpoint: new Date(parsed.lastCheckpoint),
    };
  },
});
// Initialize state at crawler start
await state.initialize();
const crawler = new CheerioCrawler({
  requestHandler: async ({ request, $, enqueueLinks }) => {
    const currentState = state.currentValue;
    // Skip if already processed
    if (currentState.processedUrls.has(request.url)) {
      return;
    }
    // Process page and update state
    // NOTE: extractCategory is a user-defined helper, not part of crawlee
    const category = extractCategory(request.url);
    currentState.processedUrls.add(request.url);
    currentState.categoryProgress[category] = (currentState.categoryProgress[category] || 0) + 1;
    currentState.lastCheckpoint = new Date();
    // Find and enqueue new links
    await enqueueLinks({ selector: 'a[href]' });
    console.log(`Processed ${currentState.processedUrls.size} URLs so far`);
  },
  failedRequestHandler: async ({ request }) => {
    const currentState = state.currentValue;
    const domain = new URL(request.url).hostname;
    currentState.errorCounts[domain] = (currentState.errorCounts[domain] || 0) + 1;
  },
});
// Clean up state on finish
crawler.teardown = async () => {
  await state.teardown();
};
// Simple state management with useState
const simpleState = await useState('SIMPLE_CRAWLER_STATE', {
  totalProcessed: 0,
  startTime: new Date(),
  categories: {} as Record<string, number>,
});
const simpleCrawler = new CheerioCrawler({
  requestHandler: async ({ request }) => {
    // useState automatically persists changes
    simpleState.totalProcessed++;
    const category = extractCategory(request.url);
    simpleState.categories[category] = (simpleState.categories[category] || 0) + 1;
    console.log(`Total processed: ${simpleState.totalProcessed}`);
  },
});
// Clean up storage before starting (optional)
await purgeDefaultStorages({
  onlyPurgeOnce: true,
});

Additional utilities for storage management and cleanup.
/**
 * Get request ID from unique key for local storage
 */
function getRequestId(uniqueKey: string): string;

Usage Examples:
import { getRequestId, RequestQueue } from "crawlee";
// Create consistent request IDs for caching
const url = "https://example.com/page";
const uniqueKey = `${url}-${Date.now()}`;
const requestId = getRequestId(uniqueKey);
console.log(`Request ID: ${requestId}`); // e.g., "a1b2c3d4e5f6g7h"
// Use in custom request queue implementations
class CustomRequestQueue extends RequestQueue {
  private cache = new Map<string, any>();
  async addRequest(request: RequestOptions) {
    const id = getRequestId(request.uniqueKey || request.url);
    // Check cache before adding
    if (this.cache.has(id)) {
      console.log('Request already cached');
      // Return the cached operation info so the override stays compatible
      // with the base signature (which always resolves to QueueOperationInfo)
      return this.cache.get(id);
    }
    const result = await super.addRequest(request);
    this.cache.set(id, result);
    return result;
  }
}

interface DatasetInfo {
  /** Dataset ID */
  id: string;
  /** Dataset name */
  name?: string;
  /** Creation time */
  createdAt: Date;
  /** Last modification time */
  modifiedAt: Date;
  /** Number of items in dataset */
  itemCount: number;
  /** Number of non-empty ("clean") items in the dataset */
  cleanItemCount: number;
}
interface DatasetData<T = Dictionary> {
/** Array of data items */
items: T[];
/** Total number of items available */
total: number;
/** Number of items in this response */
count: number;
/** Starting offset of items */
offset: number;
/** Maximum items per response */
limit: number;
}
interface KeyValueStoreInfo {
/** Store ID */
id: string;
/** Store name */
name?: string;
/** Creation time */
createdAt: Date;
/** Last modification time */
modifiedAt: Date;
}
interface KeyValueStoreKeys {
/** Array of key information */
keys: Array<{
key: string;
size: number;
contentType?: string;
modifiedAt: Date;
}>;
/** Total number of keys */
total: number;
/** Number of keys in this response */
count: number;
/** Starting offset */
offset: number;
/** Maximum keys per response */
limit: number;
/** Whether there are more keys */
isTruncated: boolean;
}
interface RequestQueueInfo {
/** Queue ID */
id: string;
/** Queue name */
name?: string;
/** Creation time */
createdAt: Date;
/** Last modification time */
modifiedAt: Date;
/** Total number of requests ever added */
totalRequestCount: number;
/** Number of handled requests */
handledRequestCount: number;
/** Number of pending requests */
pendingRequestCount: number;
}
interface QueueOperationInfo {
/** Request ID */
requestId: string;
/** Whether the request was already marked as handled */
wasAlreadyHandled: boolean;
/** Whether the request was already present in the queue */
wasAlreadyPresent: boolean;
/** Unique key of the request */
uniqueKey: string;
}
interface BatchAddRequestsResult {
/** Requests that were newly added to the queue */
addedRequests: QueueOperationInfo[];
/** Requests that were already present in the queue */
existingRequests: QueueOperationInfo[];
/** Number of requests that were processed */
processedRequests: number;
/** Number of requests that could not be processed */
unprocessedRequests: number;
}
interface RequestListState {
/** Total number of requests */
total: number;
/** Number of finished requests */
finished: number;
/** Number of pending requests */
pending: number;
/** Number of reclaimed requests */
reclaimed: number;
/** List of finished request IDs */
finishedRequestIds: string[];
/** List of reclaimed request IDs */
reclaimedRequestIds: string[];
}
interface ReclaimRequestOptions {
/** Whether to put the request in front of the queue (retried sooner) */
forefront?: boolean;
}
interface ListKeysOptions {
/** Maximum number of keys to return */
limit?: number;
/** Prefix to filter keys */
prefix?: string;
/** Starting position for pagination */
offset?: number;
}
interface DatasetStreamOptions {
/** Number of items to stream */
limit?: number;
/** Starting offset */
offset?: number;
/** Whether to return clean JSON */
clean?: boolean;
}
type HttpMethod = 'GET' | 'POST' | 'PUT' | 'DELETE' | 'HEAD' | 'OPTIONS' | 'PATCH';
interface RecoverableStateOptions<TStateModel = Record<string, unknown>> {
  /** Default state used if no persisted state is found */
  defaultState: TStateModel;
  /** The key under which the state is stored */
  persistStateKey: string;
  /** Flag to enable or disable state persistence */
  persistenceEnabled?: boolean;
  /** KeyValueStore name for persistence */
  persistStateKvsName?: string;
  /** KeyValueStore ID for persistence */
  persistStateKvsId?: string;
  /** Logger instance */
  logger?: Log;
  /** Configuration instance */
  config?: Configuration;
  /** Custom serialization function */
  serialize?: (state: TStateModel) => string;
  /** Custom deserialization function */
  deserialize?: (serializedState: string) => TStateModel;
}
interface UseStateOptions {
  /** Configuration instance */
  config?: Configuration;
  /** KeyValueStore name for state storage */
  keyValueStoreName?: string | null;
}
interface PurgeDefaultStorageOptions {
  /** If true, purge only once per execution */
  onlyPurgeOnce?: boolean;
  /** Configuration instance */
  config?: Configuration;
  /** Storage client instance */
  client?: StorageClient;
}
/**
 * State object returned by useState: the state itself augmented with
 * persistence helpers. Declared as an intersection type because a
 * TypeScript interface cannot extend a bare type parameter (TS2312).
 */
type AutoSavedValue<T> = T & {
  /** Manually save the current state */
  save(): Promise<void>;
  /** Reset to initial value */
  reset(): Promise<void>;
};

Install with Tessl CLI
npx tessl i tessl/npm-crawlee