Crawlee is a scalable web crawling and scraping library for JavaScript/Node.js that enables the development of data extraction and web automation jobs with headless Chrome and Puppeteer.
Crawlee provides comprehensive storage solutions for persisting scraped data, managing request queues, and handling key-value storage. The storage system supports both cloud-based and local storage backends.
Datasets store structured data in JSON format, ideal for storing scraped results and enabling easy export to various formats.
/**
 * Dataset for storing structured data (JSON objects)
 */
class Dataset {
  /** Open an existing dataset or create a new one */
  static open(idOrName?: string): Promise<Dataset>;
  /** Get the default dataset instance */
  static getDefaultDataset(): Promise<Dataset>;
  /** Push data to the dataset */
  pushData(data: Dictionary | Dictionary[]): Promise<void>;
  /** Get data from the dataset */
  getData(options?: DatasetDataOptions): Promise<DatasetData>;
  /** Get dataset information */
  getInfo(): Promise<DatasetInfo>;
  /** Export dataset to various formats */
  exportTo(options: DatasetExportOptions): Promise<void>;
  /** Delete the dataset */
  drop(): Promise<void>;
  /** Convert dataset to a stream */
  stream(options?: DatasetStreamOptions): NodeJS.ReadableStream;
  /** The dataset ID */
  readonly id: string;
  /** The dataset name */
  readonly name?: string;
}

Options for retrieving data from datasets.
interface DatasetDataOptions {
  /** Number of items to retrieve */
  limit?: number;
  /** Number of items to skip */
  offset?: number;
  /** Whether to return data in clean JSON format */
  clean?: boolean;
  /** Fields to include in results */
  fields?: string[];
  /** Whether to return data in descending order */
  desc?: boolean;
  /** JSON streaming options */
  streaming?: boolean;
}

Options for exporting dataset data to different formats.
interface DatasetExportOptions {
  /** Format to export to */
  format: 'json' | 'csv' | 'xlsx' | 'xml' | 'rss';
  /** Fields to include in export */
  fields?: string[];
  /** Whether to exclude empty fields */
  omitEmptyFields?: boolean;
  /** Whether to exclude null values */
  omitNullValues?: boolean;
  /** Key to store the exported file under */
  key: string;
  /** Key-value store to save to */
  keyValueStore?: KeyValueStore;
  /** Maximum number of items to export */
  limit?: number;
  /** Number of items to skip */
  offset?: number;
}

Usage Examples:
import { Dataset, CheerioCrawler } from "crawlee";
const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData }) => {
    // Extract product data
    const products = [];
    $('.product').each((_, element) => {
      products.push({
        name: $(element).find('.name').text().trim(),
        price: $(element).find('.price').text().trim(),
        // Resolve relative links against the final (post-redirect) URL
        url: new URL($(element).find('a').attr('href'), request.loadedUrl).href,
        extractedAt: new Date().toISOString(),
      });
    });
    // Save to default dataset
    await pushData(products);
  },
});
await crawler.run();
// Work with the dataset after crawling
const dataset = await Dataset.getDefaultDataset();
// Get all data
const { items } = await dataset.getData();
console.log(`Extracted ${items.length} products`);
// Export to CSV
await dataset.exportTo({
  format: 'csv',
  key: 'products.csv',
  fields: ['name', 'price', 'url'],
});
// Get specific data with filtering
const recentItems = await dataset.getData({
  limit: 100,
  clean: true,
  desc: true,
});
// Stream large datasets
const stream = dataset.stream({
  limit: 10000,
});
stream.on('data', (item) => {
  console.log('Processing item:', item.name);
});

Key-value stores handle unstructured data, files, and configuration storage with support for various data formats.
/**
 * Key-value store for storing unstructured data and files
 */
class KeyValueStore {
  /** Open an existing store or create a new one */
  static open(idOrName?: string): Promise<KeyValueStore>;
  /** Get the default key-value store */
  static getDefaultKeyValueStore(): Promise<KeyValueStore>;
  /** Set a value for a key */
  setValue(key: string, value: any, options?: RecordOptions): Promise<void>;
  /** Get a value by key */
  getValue<T = any>(key: string): Promise<T | null>;
  /** Get a public URL for a stored file */
  getPublicUrl(key: string): string;
  /** Delete a record */
  delete(key: string): Promise<void>;
  /** Get store information */
  getInfo(): Promise<KeyValueStoreInfo>;
  /** List all keys in the store */
  listKeys(options?: ListKeysOptions): Promise<KeyValueStoreKeys>;
  /** Delete the entire store */
  drop(): Promise<void>;
  /** The store ID */
  readonly id: string;
  /** The store name */
  readonly name?: string;
}

Options for storing records in key-value stores.
interface RecordOptions {
  /** Content type of the stored data */
  contentType?: string;
  /** Whether to gzip the content */
  gzip?: boolean;
  /** Custom metadata */
  metadata?: Dictionary;
}

Usage Examples:
import { KeyValueStore, CheerioCrawler } from "crawlee";
const crawler = new CheerioCrawler({
  requestHandler: async ({ request, $, setValue }) => {
    // Save HTML content
    await setValue(`html-${request.uniqueKey}`, $.html());
    // Save screenshot (if using browser crawler)
    // NOTE(review): screenshotBuffer is not defined in this snippet — it is
    // assumed to come from a browser crawler context (e.g. page.screenshot()).
    await setValue(`screenshot-${request.uniqueKey}`, screenshotBuffer, {
      contentType: 'image/png',
    });
    // Save structured data
    await setValue(`metadata-${request.uniqueKey}`, {
      title: $('title').text(),
      description: $('meta[name="description"]').attr('content'),
      extractedAt: new Date(),
    });
    // Save with custom options
    await setValue('large-file', largeJsonData, {
      gzip: true,
      contentType: 'application/json',
    });
  },
});
// Work with key-value store after crawling
const store = await KeyValueStore.getDefaultKeyValueStore();
// Retrieve stored data
const htmlContent = await store.getValue('html-page-1');
const metadata = await store.getValue('metadata-page-1');
// List all keys
const { keys } = await store.listKeys({ prefix: 'screenshot-' });
console.log(`Found ${keys.length} screenshots`);
// Get public URLs for files
const screenshotUrl = store.getPublicUrl('screenshot-page-1');
console.log(`Screenshot available at: ${screenshotUrl}`);
// Clean up old data — listKeys() returns key *records*, so destructure the
// string key out of each entry before matching/deleting.
for (const { key } of keys) {
  if (key.includes('temp-')) {
    await store.delete(key);
  }
}

Request queues manage crawling requests in FIFO order with support for priorities, deduplication, and persistence.
/**
 * Request queue for managing crawling requests in FIFO order
 */
class RequestQueue {
  /** Open an existing queue or create a new one */
  static open(idOrName?: string): Promise<RequestQueue>;
  /** Get the default request queue */
  static getDefaultRequestQueue(): Promise<RequestQueue>;
  /** Add a single request to the queue */
  addRequest(request: RequestOptions | string, options?: RequestQueueOptions): Promise<QueueOperationInfo>;
  /** Add multiple requests to the queue */
  addRequests(requests: (RequestOptions | string)[], options?: RequestQueueOptions): Promise<BatchAddRequestsResult>;
  /** Get the next request from the queue */
  fetchNextRequest(): Promise<Request | null>;
  /** Mark a request as handled */
  markRequestHandled(request: Request): Promise<QueueOperationInfo>;
  /** Return a request to the queue for retry */
  reclaimRequest(request: Request, options?: ReclaimRequestOptions): Promise<QueueOperationInfo>;
  /** Check if the queue is empty */
  isEmpty(): Promise<boolean>;
  /** Check if the queue is finished (no pending requests) */
  isFinished(): Promise<boolean>;
  /** Get queue information and statistics */
  getInfo(): Promise<RequestQueueInfo>;
  /** Delete the queue */
  drop(): Promise<void>;
  /** The queue ID */
  readonly id: string;
  /** The queue name */
  readonly name?: string;
}

Options for creating requests to add to queues.
interface RequestOptions<UserData = Dictionary> {
  /** Request URL */
  url: string;
  /** Unique key for deduplication */
  uniqueKey?: string;
  /** HTTP method */
  method?: HttpMethod;
  /** Request payload for POST/PUT requests */
  payload?: string;
  /** HTTP headers */
  headers?: Dictionary;
  /** Custom user data */
  userData?: UserData;
  /** Request label for routing */
  label?: string;
  /** If true, the request will not be retried when it fails */
  noRetry?: boolean;
  /** Priority (higher numbers = higher priority) */
  priority?: number;
  /** Whether to keep URL fragments */
  keepUrlFragment?: boolean;
}

Usage Examples:
import { RequestQueue, BasicCrawler } from "crawlee";
// Initialize queue before starting crawler
const requestQueue = await RequestQueue.open('my-crawl-queue');
// Add initial requests (plain URL strings and full request objects may be mixed)
await requestQueue.addRequests([
  'https://example.com/page1',
  'https://example.com/page2',
  {
    url: 'https://example.com/api/data',
    method: 'POST',
    payload: JSON.stringify({ query: 'products' }),
    headers: { 'Content-Type': 'application/json' },
    label: 'API',
    userData: { type: 'api-call' },
  },
]);
const crawler = new BasicCrawler({
  requestQueue,
  requestHandler: async ({ request, enqueueLinks }) => {
    if (request.label === 'API') {
      // Handle API requests differently
      console.log(`Processing API request: ${request.url}`);
      return;
    }
    // Add more requests dynamically
    await enqueueLinks({
      selector: 'a[href]',
      transformRequestFunction: (req) => ({
        ...req,
        priority: req.url.includes('/important/') ? 10 : 1,
      }),
    });
  },
});
// Monitor queue status
const info = await requestQueue.getInfo();
console.log(`Queue has ${info.totalRequestCount} total requests`);
console.log(`${info.handledRequestCount} handled, ${info.pendingRequestCount} pending`);
await crawler.run();

Request lists provide finite, static collections of requests for bounded crawling scenarios.
/**
 * Static list of requests for finite crawling scenarios
 */
class RequestList {
  /** Create a new request list from URLs or request objects */
  static open(sources: (string | RequestOptions)[], options?: RequestListOptions): Promise<RequestList>;
  /** Get the next request from the list */
  fetchNextRequest(): Promise<Request | null>;
  /** Mark a request as handled */
  markRequestHandled(request: Request): Promise<void>;
  /** Return a request to the list for retry */
  reclaimRequest(request: Request): Promise<void>;
  /** Check if all requests have been processed */
  isFinished(): Promise<boolean>;
  /** Check if the list is empty */
  isEmpty(): Promise<boolean>;
  /** Get the total number of requests */
  length(): number;
  /** Get the number of handled requests */
  handledCount(): number;
  /** Get list information and statistics */
  getState(): RequestListState;
}

Options for creating request lists.
interface RequestListOptions {
  /** Whether to keep duplicates */
  keepDuplicateUrls?: boolean;
  /** Whether to check duplicates by URL only */
  checksumOptions?: {
    forceUrlEncoding?: boolean;
    includeFragment?: boolean;
  };
  /** Custom unique key function */
  uniqueKey?: (requestOptions: RequestOptions) => string;
  /** Persist state to key-value store */
  persistStateKey?: string;
  /** Key-value store for persistence */
  persistStateKeyValueStore?: KeyValueStore;
}

Usage Examples:
import { Dataset, RequestList, CheerioCrawler } from "crawlee";
// Create request list from mixed sources
const requestList = await RequestList.open([
  'https://example.com/category/electronics',
  'https://example.com/category/books',
  {
    url: 'https://example.com/category/clothing',
    userData: { category: 'fashion' },
    label: 'CATEGORY',
  },
  {
    url: 'https://example.com/special-page',
    priority: 10,
    label: 'PRIORITY',
  },
]);
const crawler = new CheerioCrawler({
  requestList,
  requestHandler: async ({ request, $ }) => {
    console.log(`Processing ${request.label || 'page'}: ${request.url}`);
    if (request.userData?.category) {
      console.log(`Category: ${request.userData.category}`);
    }
    // Extract data specific to the page type
    const data = {
      url: request.loadedUrl,
      title: $('title').text(),
      timestamp: new Date(),
    };
    // Dataset must be imported for this call to resolve
    await Dataset.pushData(data);
  },
  // Don't add new requests - just process the static list
  maxConcurrency: 5,
});
// Monitor progress
console.log(`Total requests: ${requestList.length()}`);
await crawler.run();
console.log(`Processed ${requestList.handledCount()} requests`);
console.log(`Finished: ${await requestList.isFinished()}`);

Central management for all storage instances and their lifecycle.
/**
 * Manages storage instances and their lifecycle
 */
class StorageManager {
constructor(options?: StorageManagerOptions);
/** Get or create a dataset */
dataset(idOrName?: string): Promise<Dataset>;
/** Get or create a key-value store */
keyValueStore(idOrName?: string): Promise<KeyValueStore>;
/** Get or create a request queue */
requestQueue(idOrName?: string): Promise<RequestQueue>;
/** Close all storage instances */
closeAll(): Promise<void>;
/** List all datasets */
listDatasets(): Promise<DatasetCollectionInfo>;
/** List all key-value stores */
listKeyValueStores(): Promise<KeyValueStoreCollectionInfo>;
/** List all request queues */
listRequestQueues(): Promise<RequestQueueCollectionInfo>;
}
interface StorageManagerOptions {
  /** Storage client to use */
  storageClient?: StorageClient;
  /** Local data directory */
  localDataDirectory?: string;
  /** Whether to purge storage on startup */
  purgeOnStart?: boolean;
}

Low-level storage client for advanced storage operations.
/**
 * Low-level storage client for advanced operations.
 * Implementations back the storage classes with either local or
 * cloud-based persistence (see the storage overview above).
 */
interface StorageClient {
/** Dataset operations */
datasets(): DatasetClient;
/** Key-value store operations */
keyValueStores(): KeyValueStoreClient;
/** Request queue operations */
requestQueues(): RequestQueueClient;
/** Set storage options */
setOptions(options: StorageClientOptions): void;
}
interface StorageClientOptions {
  /** Base URL for storage API */
  baseUrl?: string;
  /** Authentication token */
  token?: string;
  /** Default timeout for requests */
  timeoutSecs?: number;
  /** Maximum retries for failed requests */
  maxRetries?: number;
}

Advanced state management capabilities for persistent and recoverable crawler state.
/**
 * Class for managing persistent recoverable state
 */
class RecoverableState<TStateModel = Record<string, unknown>> {
  constructor(options: RecoverableStateOptions<TStateModel>);
  /** Initialize and load persisted state */
  initialize(): Promise<TStateModel>;
  /** Clean up resources and persist state */
  teardown(): Promise<void>;
  /** Get current state value */
  get currentValue(): TStateModel;
  /** Reset state to default values */
  reset(): Promise<void>;
  /** Manually persist current state */
  persistState(eventData?: { isMigrating: boolean }): Promise<void>;
}
/**
 * Simple state management with automatic persistence
 */
function useState<State extends Dictionary = Dictionary>(
  name?: string,
  defaultValue?: State,
  options?: UseStateOptions
): Promise<AutoSavedValue<State>>;
/**
 * Purge default storage directories
 */
function purgeDefaultStorages(options?: PurgeDefaultStorageOptions): Promise<void>;
function purgeDefaultStorages(config?: Configuration, client?: StorageClient): Promise<void>;

Usage Examples:
import { RecoverableState, useState, purgeDefaultStorages, CheerioCrawler, Configuration } from "crawlee";
// Advanced recoverable state for complex crawler state
interface CrawlerState {
  processedUrls: Set<string>;
  categoryProgress: Record<string, number>;
  lastCheckpoint: Date;
  errorCounts: Record<string, number>;
}
const state = new RecoverableState<CrawlerState>({
  defaultState: {
    processedUrls: new Set<string>(),
    categoryProgress: {},
    lastCheckpoint: new Date(),
    errorCounts: {},
  },
  persistStateKey: 'CRAWLER_STATE',
  persistenceEnabled: true,
  persistStateKvsName: 'crawler-checkpoints',
  // Custom serialization for Set objects (Sets are not JSON-serializable)
  serialize: (state) => JSON.stringify({
    ...state,
    processedUrls: Array.from(state.processedUrls),
  }),
  deserialize: (json) => {
    const parsed = JSON.parse(json);
    return {
      ...parsed,
      processedUrls: new Set(parsed.processedUrls),
      lastCheckpoint: new Date(parsed.lastCheckpoint),
    };
  },
});
// Initialize state at crawler start
await state.initialize();
const crawler = new CheerioCrawler({
  requestHandler: async ({ request, $, enqueueLinks }) => {
    const currentState = state.currentValue;
    // Skip if already processed
    if (currentState.processedUrls.has(request.url)) {
      return;
    }
    // Process page and update state
    // NOTE: extractCategory is a user-defined helper, not part of crawlee
    const category = extractCategory(request.url);
    currentState.processedUrls.add(request.url);
    currentState.categoryProgress[category] = (currentState.categoryProgress[category] || 0) + 1;
    currentState.lastCheckpoint = new Date();
    // Find and enqueue new links
    await enqueueLinks({ selector: 'a[href]' });
    console.log(`Processed ${currentState.processedUrls.size} URLs so far`);
  },
  failedRequestHandler: async ({ request }) => {
    const currentState = state.currentValue;
    const domain = new URL(request.url).hostname;
    currentState.errorCounts[domain] = (currentState.errorCounts[domain] || 0) + 1;
  },
});
// Clean up state on finish
crawler.teardown = async () => {
  await state.teardown();
};
// Simple state management with useState
const simpleState = await useState('SIMPLE_CRAWLER_STATE', {
  totalProcessed: 0,
  startTime: new Date(),
  categories: {} as Record<string, number>,
});
const simpleCrawler = new CheerioCrawler({
  requestHandler: async ({ request }) => {
    // useState automatically persists changes
    simpleState.totalProcessed++;
    const category = extractCategory(request.url);
    simpleState.categories[category] = (simpleState.categories[category] || 0) + 1;
    console.log(`Total processed: ${simpleState.totalProcessed}`);
  },
});
// Clean up storage before starting (optional)
await purgeDefaultStorages({
  onlyPurgeOnce: true,
});

Additional utilities for storage management and cleanup.
/**
 * Get request ID from unique key for local storage
 */
function getRequestId(uniqueKey: string): string;

Usage Examples:
import { getRequestId, RequestQueue } from "crawlee";
// Create consistent request IDs for caching
const url = "https://example.com/page";
const uniqueKey = `${url}-${Date.now()}`;
const requestId = getRequestId(uniqueKey);
console.log(`Request ID: ${requestId}`); // e.g., "a1b2c3d4e5f6g7h"
// Use in custom request queue implementations
class CustomRequestQueue extends RequestQueue {
  private cache = new Map<string, any>();
  async addRequest(request: RequestOptions) {
    const id = getRequestId(request.uniqueKey || request.url);
    // Check cache before adding
    if (this.cache.has(id)) {
      console.log('Request already cached');
      // Return the cached operation info so the override stays compatible
      // with the base signature (which always resolves to QueueOperationInfo)
      return this.cache.get(id);
    }
    const result = await super.addRequest(request);
    this.cache.set(id, result);
    return result;
  }
}

interface DatasetInfo {
  /** Dataset ID */
  id: string;
  /** Dataset name */
  name?: string;
  /** Creation time */
  createdAt: Date;
  /** Last modification time */
  modifiedAt: Date;
  /** Number of items in dataset */
  itemCount: number;
  /** Number of non-empty ("clean") items in the dataset */
  cleanItemCount: number;
}
interface DatasetData<T = Dictionary> {
/** Array of data items */
items: T[];
/** Total number of items available */
total: number;
/** Number of items in this response */
count: number;
/** Starting offset of items */
offset: number;
/** Maximum items per response */
limit: number;
}
interface KeyValueStoreInfo {
/** Store ID */
id: string;
/** Store name */
name?: string;
/** Creation time */
createdAt: Date;
/** Last modification time */
modifiedAt: Date;
}
interface KeyValueStoreKeys {
/** Array of key information */
keys: Array<{
key: string;
size: number;
contentType?: string;
modifiedAt: Date;
}>;
/** Total number of keys */
total: number;
/** Number of keys in this response */
count: number;
/** Starting offset */
offset: number;
/** Maximum keys per response */
limit: number;
/** Whether there are more keys */
isTruncated: boolean;
}
interface RequestQueueInfo {
/** Queue ID */
id: string;
/** Queue name */
name?: string;
/** Creation time */
createdAt: Date;
/** Last modification time */
modifiedAt: Date;
/** Total number of requests ever added */
totalRequestCount: number;
/** Number of handled requests */
handledRequestCount: number;
/** Number of pending requests */
pendingRequestCount: number;
}
interface QueueOperationInfo {
/** Request ID */
requestId: string;
/** Whether the request was already marked as handled */
wasAlreadyHandled: boolean;
/** Whether the request was already present in the queue */
wasAlreadyPresent: boolean;
/** Unique key of the request */
uniqueKey: string;
}
interface BatchAddRequestsResult {
/** Requests that were newly added to the queue */
addedRequests: QueueOperationInfo[];
/** Requests that were already present in the queue */
existingRequests: QueueOperationInfo[];
/** Number of requests that were processed */
processedRequests: number;
/** Number of requests that could not be processed */
unprocessedRequests: number;
}
interface RequestListState {
/** Total number of requests */
total: number;
/** Number of finished requests */
finished: number;
/** Number of pending requests */
pending: number;
/** Number of reclaimed requests */
reclaimed: number;
/** List of finished request IDs */
finishedRequestIds: string[];
/** List of reclaimed request IDs */
reclaimedRequestIds: string[];
}
interface ReclaimRequestOptions {
/** Whether to put the request in front of the queue (retried sooner) */
forefront?: boolean;
}
interface ListKeysOptions {
/** Maximum number of keys to return */
limit?: number;
/** Prefix to filter keys */
prefix?: string;
/** Starting position for pagination */
offset?: number;
}
interface DatasetStreamOptions {
/** Number of items to stream */
limit?: number;
/** Starting offset */
offset?: number;
/** Whether to return clean JSON */
clean?: boolean;
}
type HttpMethod = 'GET' | 'POST' | 'PUT' | 'DELETE' | 'HEAD' | 'OPTIONS' | 'PATCH';
interface RecoverableStateOptions<TStateModel = Record<string, unknown>> {
  /** Default state used if no persisted state is found */
  defaultState: TStateModel;
  /** The key under which the state is stored */
  persistStateKey: string;
  /** Flag to enable or disable state persistence */
  persistenceEnabled?: boolean;
  /** KeyValueStore name for persistence */
  persistStateKvsName?: string;
  /** KeyValueStore ID for persistence */
  persistStateKvsId?: string;
  /** Logger instance */
  logger?: Log;
  /** Configuration instance */
  config?: Configuration;
  /** Custom serialization function */
  serialize?: (state: TStateModel) => string;
  /** Custom deserialization function */
  deserialize?: (serializedState: string) => TStateModel;
}
interface UseStateOptions {
  /** Configuration instance */
  config?: Configuration;
  /** KeyValueStore name for state storage */
  keyValueStoreName?: string | null;
}
interface PurgeDefaultStorageOptions {
  /** If true, purge only once per execution */
  onlyPurgeOnce?: boolean;
  /** Configuration instance */
  config?: Configuration;
  /** Storage client instance */
  client?: StorageClient;
}
/**
 * State object returned by useState: the state itself augmented with
 * persistence helpers. Declared as an intersection type because a
 * TypeScript interface cannot extend a bare type parameter (TS2312).
 */
type AutoSavedValue<T> = T & {
  /** Manually save the current state */
  save(): Promise<void>;
  /** Reset to initial value */
  reset(): Promise<void>;
};

Install with Tessl CLI
npx tessl i tessl/npm-crawlee