CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/npm-crawlee

The scalable web crawling and scraping library for JavaScript/Node.js that enables development of data extraction and web automation jobs with headless Chrome and Puppeteer.

Overview
Eval results
Files

session-management.mddocs/

Session Management

Session management provides capabilities for handling cookies, user agents, and proxy rotation to avoid blocking and rate limiting during large-scale crawling operations.

Capabilities

Session

Individual session containing cookies, proxy information, and state for a single logical browsing session.

/**
 * Represents a single session with cookies, proxy configuration, and state.
 * Constructed from a SessionOptions object.
 */
class Session {
  constructor(options: SessionOptions);

  /** Unique session ID */
  readonly id: string;

  /** Current cookie jar for this session */
  readonly cookieJar: CookieJar;

  /** Custom user data attached to this session (free-form dictionary) */
  readonly userData: Dictionary;

  /** Whether this session is blocked/retired */
  readonly isBlocked: boolean;

  /** Number of errors encountered by this session */
  readonly errorScore: number;

  /** When this session was created */
  readonly createdAt: Date;

  /** When this session expires (optional) */
  readonly expiresAt?: Date;

  /** Get cookie string for a URL */
  getCookieString(url: string): string;

  /** Set cookies from response headers */
  setCookiesFromResponse(response: Response): void;

  /** Set cookies for Puppeteer page, optionally restricted to a domain */
  setPuppeteerCookies(page: PuppeteerPage, domain?: string): Promise<void>;

  /** Set cookies for Playwright page, optionally restricted to a domain */
  setPlaywrightCookies(page: PlaywrightPage, domain?: string): Promise<void>;

  /**
   * Mark this session as bad after an unsuccessful use.
   * NOTE(review): presumably this counts toward errorScore rather than
   * blocking the session immediately — confirm against the implementation.
   */
  markBad(errorMessage?: string): void;

  /** Retire this session (soft block) */
  retire(): void;

  /** Get session state for persistence (see SessionState) */
  getState(): SessionState;

  /** Check if session is usable */
  isUsable(): boolean;
}

SessionOptions

Configuration options for creating sessions.

/** Options accepted by the Session constructor; every field is optional. */
interface SessionOptions {
  /** Unique session ID */
  id?: string;

  /** Session pool that owns this session */
  sessionPool?: SessionPool;

  /** User agent string */
  userAgent?: string;

  /** Custom user data */
  userData?: Dictionary;

  /** Proxy information for this session */
  proxyInfo?: ProxyInfo;

  /** Cookie jar instance */
  cookieJar?: CookieJar;

  /** Maximum age in seconds */
  maxAgeSecs?: number;

  /** Maximum number of errors before blocking */
  maxErrorScore?: number;

  /**
   * Amount by which the error score is decremented.
   * NOTE(review): presumably applied after a successful use of the session —
   * confirm against the Crawlee SessionOptions reference.
   */
  errorScoreDecrement?: number;
}

SessionState

Serializable state of a session for persistence.

/** Serializable snapshot of a session, as returned by Session.getState(). */
interface SessionState {
  /** Session ID */
  id: string;

  /** Cookies held by the session, as an array of Cookie objects */
  cookies: Cookie[];

  /** User agent string */
  userAgent: string;

  /** Custom user data */
  userData: Dictionary;

  /** Current error score */
  errorScore: number;

  /** Whether session is blocked */
  isBlocked: boolean;

  /** Creation timestamp, serialized as a string */
  createdAt: string;

  /** Expiration timestamp, serialized as a string (optional) */
  expiresAt?: string;

  /** Proxy URL if used */
  proxyUrl?: string;
}

Usage Examples:

import { Session, CheerioCrawler } from "crawlee";

// Create a session manually
const session = new Session({
  userAgent: 'Mozilla/5.0 (compatible; CustomBot/1.0)',
  userData: { loginStatus: 'guest' },
  maxAgeSecs: 3600, // 1 hour
});

// Use session in crawler
const crawler = new CheerioCrawler({
  useSessionPool: true,
  // NOTE: the `session` destructured here is the handler's own parameter; it
  // shadows the manually created `session` constant declared above.
  requestHandler: async ({ session, request, response }) => {
    console.log(`Using session ${session.id} for ${request.url}`);

    // Handle login detection: flag the session and stop processing this page
    if (response.url.includes('/login')) {
      session.userData.loginRequired = true;
      session.markBad('Login required');
      return;
    }

    // Save successful interaction
    if (response.statusCode === 200) {
      session.userData.lastSuccessful = new Date();
    }

    // Process response...
  },
});

// Work with session state (this refers to the manually created session above)
const sessionState = session.getState();
console.log('Session cookies:', sessionState.cookies.length);
console.log('Session score:', sessionState.errorScore);

// Check session health
if (!session.isUsable()) {
  console.log('Session is no longer usable');
}

SessionPool

Pool for managing multiple sessions with automatic rotation and lifecycle management.

/**
 * Pool for managing sessions with automatic rotation and error handling.
 * Constructed from an optional SessionPoolOptions object.
 */
class SessionPool {
  constructor(options?: SessionPoolOptions);

  /** Get a session, optionally for a specific request */
  getSession(request?: Request): Promise<Session>;

  /** Get session by ID; returns undefined when no session matches */
  getSessionById(sessionId: string): Session | undefined;

  /** Mark a session as having errors */
  markSessionBad(session: Session): Promise<void>;

  /** Retire a session (remove from active use) */
  retire(session: Session): Promise<void>;

  /** Retire all sessions (clear the pool) */
  retireAllSessions(): Promise<void>;

  /** Manually add a session to the pool */
  addSession(session: Session): void;

  /** Get pool statistics (see SessionPoolState) */
  getState(): SessionPoolState;

  /** Persist session pool state */
  persistState(): Promise<void>;

  /** Tear down the session pool */
  teardown(): Promise<void>;

  /** Total number of sessions in pool */
  readonly sessionsCount: number;

  /** Number of usable sessions */
  readonly usableSessionsCount: number;

  /** Number of retired sessions */
  readonly retiredSessionsCount: number;
}

SessionPoolOptions

Configuration options for SessionPool.

/** Options accepted by the SessionPool constructor; every field is optional. */
interface SessionPoolOptions {
  /** Maximum number of sessions in the pool */
  maxPoolSize?: number;

  /** Options applied to the sessions created by the pool */
  sessionOptions?: SessionOptions;

  /** ID of the key-value store used to persist sessions */
  persistStateKeyValueStoreId?: string;

  /** Key for persisting session pool state */
  persistStateKey?: string;

  /** Custom factory function that creates a new Session for the pool */
  createSessionFunction?: (sessionPool: SessionPool, options?: SessionOptions) => Session;

  /** Custom function that validates a session; resolves to a boolean */
  validateSessionFunction?: (session: Session) => Promise<boolean>;

  /** Options controlling the user agents used by sessions (see UserAgentPoolOptions) */
  userAgentPoolOptions?: UserAgentPoolOptions;

  /** Proxy configuration for sessions */
  proxyConfiguration?: ProxyConfiguration;

  /** Session retirement rules */
  sessionRetirementRules?: SessionRetirementRules;
}

SessionPoolState

State information about the session pool.

/** Statistics snapshot of a session pool, as returned by SessionPool.getState(). */
interface SessionPoolState {
  /** Total sessions in pool */
  totalSessions: number;

  /** Usable sessions count */
  usableSessions: number;

  /** Retired sessions count */
  retiredSessions: number;

  /** Blocked sessions count */
  blockedSessions: number;

  /**
   * Sessions by error score.
   * NOTE(review): presumably maps an error score to the number of sessions
   * with that score — confirm.
   */
  sessionsByErrorScore: Dictionary<number>;

  /** Average session age (unit not specified here — TODO confirm) */
  averageSessionAge: number;

  /** Pool health ratio (0-1) */
  poolHealth: number;
}

Usage Examples:

import { Dataset, PuppeteerCrawler, SessionPool } from "crawlee";

// Create session pool with configuration
const sessionPool = new SessionPool({
  maxPoolSize: 100,
  sessionOptions: {
    maxAgeSecs: 1800, // 30 minutes
    maxErrorScore: 3,
  },
  persistStateKey: 'my-crawler-sessions',
  userAgentPoolOptions: {
    userAgentStrings: [
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
    ],
  },
});

const crawler = new PuppeteerCrawler({
  sessionPool,
  sessionPoolOptions: {
    maxPoolSize: 50,
  },

  requestHandler: async ({ page, request, session }) => {
    console.log(`Using session ${session.id}`);

    // Handle different response scenarios
    try {
      await page.goto(request.url);

      // Check for blocking indicators
      const isBlocked = await page.$('.captcha, .blocked-message');
      if (isBlocked) {
        session.markBad('Blocked by anti-bot measures');
        return;
      }

      // Check for rate limiting
      const isRateLimited = await page.$('.rate-limit');
      if (isRateLimited) {
        session.userData.rateLimited = true;
        // Don't mark as bad, just note it
      }

      // Extract data...
      const title = await page.title();
      await Dataset.pushData({ url: request.url, title });

    } catch (error) {
      // Handle session-related errors. The caught value is `unknown` under
      // strict compiler settings, so narrow it before reading `.message`.
      if (error instanceof Error && error.message.includes('timeout')) {
        session.userData.timeouts = (session.userData.timeouts || 0) + 1;
        if (session.userData.timeouts > 3) {
          session.markBad('Too many timeouts');
        }
      }
      throw error;
    }
  },

  // Custom failed request handler for session management.
  // The error is passed as the second argument, not as part of the context.
  failedRequestHandler: async ({ request, session }, error) => {
    console.log(`Request failed for session ${session.id}: ${error.message}`);

    // Mark session bad for certain error types
    if (error.message.includes('403') || error.message.includes('blocked')) {
      await sessionPool.markSessionBad(session);
    }
  },
});

// Monitor session pool. Keep the timer handle so it can be cleared later;
// an active interval would otherwise keep the process alive after the crawl.
const poolMonitor = setInterval(async () => {
  const state = sessionPool.getState();
  console.log(`Session pool: ${state.usableSessions}/${state.totalSessions} usable`);
  console.log(`Pool health: ${(state.poolHealth * 100).toFixed(1)}%`);

  // Retire old sessions if pool health is low
  if (state.poolHealth < 0.3) {
    console.log('Pool health low, retiring all sessions');
    await sessionPool.retireAllSessions();
  }
}, 30000);

try {
  await crawler.run();
} finally {
  // Clean up even when the crawl fails: stop monitoring, then release the pool
  clearInterval(poolMonitor);
  await sessionPool.teardown();
}

Cookie Management

Working with cookies across different session types.

/** A single cookie; only `name` and `value` are required. */
interface Cookie {
  /** Cookie name */
  name: string;

  /** Cookie value */
  value: string;

  /** Domain for the cookie */
  domain?: string;

  /** Path for the cookie */
  path?: string;

  /** Expiration date */
  expires?: Date;

  /** Max age in seconds */
  maxAge?: number;

  /** Whether cookie is secure */
  secure?: boolean;

  /** Whether cookie is HTTP only */
  httpOnly?: boolean;

  /** SameSite policy: one of 'Strict', 'Lax' or 'None' */
  sameSite?: 'Strict' | 'Lax' | 'None';
}

/** Container of cookies, queried and updated by URL. */
interface CookieJar {
  /** Get all cookies that apply to a URL */
  getCookies(url: string): Cookie[];

  /** Set a cookie, given as a Cookie object or a string, for a URL */
  setCookie(cookie: Cookie | string, url: string): void;

  /** Get cookies for a URL as a header string */
  getCookieString(url: string): string;

  /**
   * Remove cookies by name, optionally restricted to a domain.
   * NOTE(review): the boolean presumably reports whether anything was removed — confirm.
   */
  removeCookie(name: string, domain?: string): boolean;

  /** Remove all cookies */
  removeAllCookies(): void;
}

Usage Examples:

import { PuppeteerCrawler, Session } from "crawlee";

const session = new Session({
  userAgent: 'CustomBot/1.0',
});

// Working with cookies manually
session.cookieJar.setCookie({
  name: 'session_id',
  value: 'abc123',
  domain: '.example.com',
  path: '/',
  secure: true,
  httpOnly: true,
}, 'https://example.com');

// Get cookies for a specific URL
const cookies = session.cookieJar.getCookies('https://api.example.com');
console.log('Cookies for API:', cookies);

// Use with different browser types
const crawler = new PuppeteerCrawler({
  useSessionPool: true,
  preNavigationHooks: [
    // `session` here is the hook's own parameter, not the constant above
    async ({ session, page }) => {
      // Set cookies before navigation
      await session.setPuppeteerCookies(page, '.example.com');
    },
  ],

  requestHandler: async ({ session, page }) => {
    // Save cookies after navigation. The page's current URL is used as the
    // cookie origin: Puppeteer exposes it through the `page.url()` method,
    // whereas `response.url` on a Puppeteer response is a method, not a string.
    const currentUrl = page.url();
    const newCookies = await page.cookies();
    for (const cookie of newCookies) {
      session.cookieJar.setCookie(cookie, currentUrl);
    }
  },
});

User Agent Management

Managing user agents for sessions to appear more human-like.

/** Options controlling which user agents sessions may use; every field is optional. */
interface UserAgentPoolOptions {
  /** List of user agent strings to choose from */
  userAgentStrings?: string[];

  /** Whether to rotate user agents */
  rotateUserAgents?: boolean;

  /** User agent categories to use (see UserAgentCategory) */
  categories?: UserAgentCategory[];

  /** Operating systems to simulate (accepted values not listed here — TODO confirm) */
  operatingSystems?: string[];

  /** Browser types to simulate (accepted values not listed here — TODO confirm) */
  browsers?: string[];
}

/** Device categories selectable through UserAgentPoolOptions.categories. */
enum UserAgentCategory {
  DESKTOP = 'desktop',
  MOBILE = 'mobile',
  TABLET = 'tablet',
}

Usage Examples:

import { Session, SessionPool, UserAgentCategory } from "crawlee";

const sessionPool = new SessionPool({
  userAgentPoolOptions: {
    userAgentStrings: [
      // Chrome on Windows
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
      // Safari on macOS
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
      // Firefox on Linux
      'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0',
    ],
    rotateUserAgents: true,
    categories: [UserAgentCategory.DESKTOP],
  },
});

// Custom user agent selection
const customSessionPool = new SessionPool({
  createSessionFunction: (pool, options) => {
    const userAgents = [
      'Bot/1.0 (compatible; DataExtractor)',
      'Crawler/2.0 (+http://example.com/bot)',
    ];

    // Pick the user agent first so the metadata below describes the value
    // actually assigned to the session, not the one passed in `options`
    // (which is overridden by this selection).
    const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];

    return new Session({
      ...options,
      userAgent,
      userData: {
        // Keep any user data supplied by the pool instead of discarding it
        ...options?.userData,
        browserType: userAgent.includes('Chrome') ? 'chrome' : 'firefox',
      },
    });
  },
});

Session Retirement Rules

Advanced configuration for when to retire sessions.

/** Rules deciding when a pool retires a session; every field is optional. */
interface SessionRetirementRules {
  /** Maximum session age, in minutes, before retirement */
  maxSessionAgeMinutes?: number;

  /** Maximum error score before retirement */
  maxErrorScore?: number;

  /** Retire on specific HTTP status codes */
  retireOnStatusCodes?: number[];

  /** Retire on specific error patterns, given as regular expressions */
  retireOnErrorPatterns?: RegExp[];

  /** Custom retirement function; returns true when the session should be retired */
  shouldRetireSession?: (session: Session, context?: any) => boolean;

  /** How often to check for retirement, in seconds */
  retirementCheckIntervalSecs?: number;
}

Usage Examples:

import { CheerioCrawler, Session, SessionPool } from "crawlee";

const sessionPool = new SessionPool({
  sessionRetirementRules: {
    maxSessionAgeMinutes: 30,
    maxErrorScore: 5,
    retireOnStatusCodes: [403, 429, 503],
    retireOnErrorPatterns: [/blocked/i, /captcha/i, /rate.?limit/i],

    shouldRetireSession: (session, context) => {
      // Custom retirement logic
      const timeouts = session.userData.timeouts || 0;
      const redirects = session.userData.redirects || 0;

      // Retire if too many timeouts or suspicious redirects
      return timeouts > 3 || redirects > 10;
    },

    retirementCheckIntervalSecs: 300, // Check every 5 minutes
  },
});

// Monitor and react to session retirement
const crawler = new CheerioCrawler({
  sessionPool,

  requestHandler: async ({ session, response }) => {
    // Track session metrics
    if (response.statusCode >= 300 && response.statusCode < 400) {
      session.userData.redirects = (session.userData.redirects || 0) + 1;
    }

    // Process request...
  },

  // The error is passed as the second argument, not as part of the context.
  failedRequestHandler: async ({ session }, error) => {
    // Custom error handling that affects retirement. `code` is not declared
    // on the base Error type, so check for its presence before comparing.
    if ('code' in error && error.code === 'ETIMEDOUT') {
      session.userData.timeouts = (session.userData.timeouts || 0) + 1;
    }

    console.log(`Session ${session.id} error count: ${session.errorScore}`);
  },
});

Types

/** Describes the proxy used by a session. */
interface ProxyInfo {
  /** Proxy URL */
  url: string;

  /** Proxy hostname */
  hostname: string;

  /** Proxy port */
  port: number;

  /** Proxy protocol */
  protocol: string;

  /** Authentication credentials */
  auth?: {
    username: string;
    password: string;
  };

  /** Session ID associated with this proxy */
  sessionId?: string | number;

  // NOTE(review): `password` and `username` below duplicate the fields of
  // `auth` above; which of the two is authoritative is not stated here — confirm.

  /** Password for the proxy */
  password?: string;

  /** Username for the proxy */
  username?: string;
}

/** HTTP response as seen by session and crawler handlers. */
interface Response {
  /** HTTP status code */
  statusCode: number;

  /** Response URL (after redirects) */
  url: string;

  /** Response headers; a header value is a string or an array of strings */
  headers: Dictionary<string | string[]>;

  /** Response body as a string (optional) */
  body?: string;

  /** Raw response body as a Buffer (optional) */
  rawBody?: Buffer;
}

/** Generic string-keyed map; values are of type T, which defaults to `any`. */
interface Dictionary<T = any> {
  [key: string]: T;
}

/** A crawl request; UserData types the optional `userData` payload. */
interface Request<UserData = Dictionary> {
  /** Request URL */
  url: string;

  /** Loaded URL (after redirects) */
  loadedUrl?: string;

  /** Unique identifier for deduplication */
  uniqueKey: string;

  /** HTTP method */
  method?: 'GET' | 'POST' | 'PUT' | 'DELETE' | 'HEAD' | 'OPTIONS' | 'PATCH';

  /** Request payload */
  payload?: string;

  /** Custom user data */
  userData?: UserData;

  /** Request label for routing */
  label?: string;

  /** When true, this request is NOT retried on failure */
  noRetry?: boolean;

  /** Number of retry attempts */
  retryCount?: number;

  /** HTTP headers */
  headers?: Dictionary<string>;

  /** When this request was handled */
  handledAt?: Date;
}

Install with Tessl CLI

npx tessl i tessl/npm-crawlee

docs

browser-crawling.md

configuration-proxies.md

core-crawling.md

http-crawling.md

index.md

session-management.md

storage.md

utilities.md

tile.json