Crawlee is a scalable web crawling and scraping library for JavaScript/Node.js that enables development of data extraction and web automation jobs with headless Chrome and Puppeteer.
Session management provides capabilities for handling cookies, user agents, and proxy rotation to avoid blocking and rate limiting during large-scale crawling operations.
Individual session containing cookies, proxy information, and state for a single logical browsing session.
/**
 * Represents a single logical browsing session: holds cookies, proxy
 * configuration, custom user data, and an error score used to decide when
 * the session should be retired or blocked.
 */
class Session {
constructor(options: SessionOptions);
/** Unique session ID */
readonly id: string;
/** Current cookie jar for this session */
readonly cookieJar: CookieJar;
/** Custom user data attached to this session (note: not the user agent) */
readonly userData: Dictionary;
/** Whether this session is blocked/retired */
readonly isBlocked: boolean;
/** Accumulated error score for this session (drives blocking) */
readonly errorScore: number;
/** When this session was created */
readonly createdAt: Date;
/** When this session expires; undefined when no expiry is set */
readonly expiresAt?: Date;
/** Get the Cookie header string for a URL */
getCookieString(url: string): string;
/** Store cookies from a response's Set-Cookie headers into the jar */
setCookiesFromResponse(response: Response): void;
/** Apply this session's cookies to a Puppeteer page */
setPuppeteerCookies(page: PuppeteerPage, domain?: string): Promise<void>;
/** Apply this session's cookies to a Playwright page */
setPlaywrightCookies(page: PlaywrightPage, domain?: string): Promise<void>;
/** Record an error against this session (raises the error score; may lead to blocking) */
markBad(errorMessage?: string): void;
/** Retire this session (soft block; removed from active rotation) */
retire(): void;
/** Get a serializable snapshot of this session for persistence */
getState(): SessionState;
/** Check whether the session can still be used (e.g. not blocked or expired) */
isUsable(): boolean;
}Configuration options for creating sessions.
/**
 * Configuration options for creating a Session.
 */
interface SessionOptions {
/** Unique session ID */
id?: string;
/** Session pool that owns this session */
sessionPool?: SessionPool;
/** User agent string */
userAgent?: string;
/** Custom user data */
userData?: Dictionary;
/** Proxy information for this session */
proxyInfo?: ProxyInfo;
/** Cookie jar instance */
cookieJar?: CookieJar;
/** Maximum age of the session, in seconds */
maxAgeSecs?: number;
/** Maximum error score before the session is blocked */
maxErrorScore?: number;
/** Amount subtracted from the error score (presumably on successful use) — confirm against implementation */
errorScoreDecrement?: number;
}Serializable state of a session for persistence.
/**
 * Serializable snapshot of a Session, as returned by Session.getState().
 */
interface SessionState {
/** Session ID */
id: string;
/** Cookies stored in this session */
cookies: Cookie[];
/** User agent string */
userAgent: string;
/** Custom user data */
userData: Dictionary;
/** Current error score */
errorScore: number;
/** Whether session is blocked */
isBlocked: boolean;
/** Creation timestamp as a string — presumably ISO 8601; confirm */
createdAt: string;
/** Expiration timestamp as a string — presumably ISO 8601; confirm */
expiresAt?: string;
/** Proxy URL if used */
proxyUrl?: string;
}Usage Examples:
import { Session, CheerioCrawler } from "crawlee";
// Create a session manually
const session = new Session({
userAgent: 'Mozilla/5.0 (compatible; CustomBot/1.0)',
userData: { loginStatus: 'guest' },
maxAgeSecs: 3600, // 1 hour
});
// Use a session pool inside a crawler; the handler receives a pool-managed session
const crawler = new CheerioCrawler({
useSessionPool: true,
requestHandler: async ({ session, request, response }) => {
console.log(`Using session ${session.id} for ${request.url}`);
// Handle login detection: a redirect to /login means this session lost auth
if (response.url.includes('/login')) {
session.userData.loginRequired = true;
session.markBad('Login required');
return;
}
// Save successful interaction timestamp in the session's user data
if (response.statusCode === 200) {
session.userData.lastSuccessful = new Date();
}
// Process response...
},
});
// Work with session state
// NOTE(review): this inspects the manually created session above,
// not the pool-managed sessions used by the crawler.
const sessionState = session.getState();
console.log('Session cookies:', sessionState.cookies.length);
console.log('Session score:', sessionState.errorScore);
// Check session health
if (!session.isUsable()) {
console.log('Session is no longer usable');
}Pool for managing multiple sessions with automatic rotation and lifecycle management.
/**
 * Pool that manages multiple sessions: hands out sessions for requests,
 * tracks errors, retires bad sessions, and can persist its state.
 */
class SessionPool {
constructor(options?: SessionPoolOptions);
/** Get a session to use for a request */
getSession(request?: Request): Promise<Session>;
/** Get session by ID */
getSessionById(sessionId: string): Session | undefined;
/** Mark a session as having errors */
markSessionBad(session: Session): Promise<void>;
/** Retire a session (remove from active use) */
retire(session: Session): Promise<void>;
/** Retire all sessions (clear the pool) */
retireAllSessions(): Promise<void>;
/** Manually add a session to the pool */
addSession(session: Session): void;
/** Get pool statistics */
getState(): SessionPoolState;
/** Persist session pool state (to a key-value store) */
persistState(): Promise<void>;
/** Tear down the session pool and release its resources */
teardown(): Promise<void>;
/** Total number of sessions in pool */
readonly sessionsCount: number;
/** Number of usable sessions */
readonly usableSessionsCount: number;
/** Number of retired sessions */
readonly retiredSessionsCount: number;
}Configuration options for SessionPool.
/**
 * Configuration options for SessionPool.
 */
interface SessionPoolOptions {
/** Maximum number of sessions in the pool */
maxPoolSize?: number;
/** Default options applied to sessions created by the pool */
sessionOptions?: SessionOptions;
/** ID of the key-value store used to persist session pool state */
persistStateKeyValueStoreId?: string;
/** Key under which session pool state is persisted */
persistStateKey?: string;
/** Custom factory invoked by the pool to create a new session */
createSessionFunction?: (sessionPool: SessionPool, options?: SessionOptions) => Session;
/** Custom check invoked to validate a session before use */
validateSessionFunction?: (session: Session) => Promise<boolean>;
/** Options for user agent selection/rotation */
userAgentPoolOptions?: UserAgentPoolOptions;
/** Proxy configuration for sessions */
proxyConfiguration?: ProxyConfiguration;
/** Rules controlling when sessions are retired */
sessionRetirementRules?: SessionRetirementRules;
}State information about the session pool.
/**
 * Statistics snapshot of a SessionPool, as returned by SessionPool.getState().
 */
interface SessionPoolState {
/** Total sessions in pool */
totalSessions: number;
/** Usable sessions count */
usableSessions: number;
/** Retired sessions count */
retiredSessions: number;
/** Blocked sessions count */
blockedSessions: number;
/** Histogram of session counts keyed by error score */
sessionsByErrorScore: Dictionary<number>;
/** Average session age (units not specified here — confirm seconds vs. ms) */
averageSessionAge: number;
/** Pool health ratio (0-1, higher is healthier) */
poolHealth: number;
}Usage Examples:
// NOTE(review): Dataset is used in the request handler below but was missing
// from this import list.
import { SessionPool, PuppeteerCrawler, Dataset } from "crawlee";
// Create session pool with configuration
const sessionPool = new SessionPool({
maxPoolSize: 100,
sessionOptions: {
maxAgeSecs: 1800, // 30 minutes
maxErrorScore: 3,
},
persistStateKey: 'my-crawler-sessions',
userAgentPoolOptions: {
userAgentStrings: [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
],
},
});
const crawler = new PuppeteerCrawler({
sessionPool,
// NOTE(review): passing both a pre-built `sessionPool` and `sessionPoolOptions`
// is contradictory (maxPoolSize 100 vs. 50) — confirm which one the crawler honors.
sessionPoolOptions: {
maxPoolSize: 50,
},
requestHandler: async ({ page, request, session }) => {
console.log(`Using session ${session.id}`);
// Handle different response scenarios
try {
await page.goto(request.url);
// Check for blocking indicators (captcha / block pages)
const isBlocked = await page.$('.captcha, .blocked-message');
if (isBlocked) {
session.markBad('Blocked by anti-bot measures');
return;
}
// Check for rate limiting
const isRateLimited = await page.$('.rate-limit');
if (isRateLimited) {
session.userData.rateLimited = true;
// Don't mark as bad, just note it
}
// Extract data...
const title = await page.title();
await Dataset.pushData({ url: request.url, title });
} catch (error) {
// Count timeouts per session; mark the session bad after too many
if (error.message.includes('timeout')) {
session.userData.timeouts = (session.userData.timeouts || 0) + 1;
if (session.userData.timeouts > 3) {
session.markBad('Too many timeouts');
}
}
throw error;
}
},
// Custom failed request handler for session management
failedRequestHandler: async ({ request, session, error }) => {
console.log(`Request failed for session ${session.id}: ${error.message}`);
// Mark session bad for certain error types
if (error.message.includes('403') || error.message.includes('blocked')) {
await sessionPool.markSessionBad(session);
}
},
});
// Monitor session pool health every 30 seconds
setInterval(async () => {
const state = sessionPool.getState();
console.log(`Session pool: ${state.usableSessions}/${state.totalSessions} usable`);
console.log(`Pool health: ${(state.poolHealth * 100).toFixed(1)}%`);
// Retire old sessions if pool health is low
if (state.poolHealth < 0.3) {
console.log('Pool health low, retiring all sessions');
await sessionPool.retireAllSessions();
}
}, 30000);
await crawler.run();
// Clean up
await sessionPool.teardown();

Working with cookies across different session types.
/**
 * A single HTTP cookie.
 */
interface Cookie {
/** Cookie name */
name: string;
/** Cookie value */
value: string;
/** Domain for the cookie */
domain?: string;
/** Path for the cookie */
path?: string;
/** Expiration date */
expires?: Date;
/** Max age in seconds */
maxAge?: number;
/** Whether the cookie is only sent over HTTPS */
secure?: boolean;
/** Whether the cookie is inaccessible to client-side scripts */
httpOnly?: boolean;
/** SameSite policy */
sameSite?: 'Strict' | 'Lax' | 'None';
}
/**
 * Container for cookies, scoped per URL/domain.
 */
interface CookieJar {
/** Get all cookies that apply to a URL */
getCookies(url: string): Cookie[];
/** Set a cookie (object or Set-Cookie string) for a URL */
setCookie(cookie: Cookie | string, url: string): void;
/** Get cookies as a Cookie header string for a URL */
getCookieString(url: string): string;
/** Remove a cookie by name (optionally scoped to a domain); returns whether one was removed */
removeCookie(name: string, domain?: string): boolean;
/** Remove all cookies */
removeAllCookies(): void;
}Usage Examples:
// NOTE(review): PuppeteerCrawler is used below but was missing from this import.
import { Session, PuppeteerCrawler } from "crawlee";
const session = new Session({
userAgent: 'CustomBot/1.0',
});
// Working with cookies manually
session.cookieJar.setCookie({
name: 'session_id',
value: 'abc123',
domain: '.example.com',
path: '/',
secure: true,
httpOnly: true,
}, 'https://example.com');
// Get cookies for a specific URL
const cookies = session.cookieJar.getCookies('https://api.example.com');
console.log('Cookies for API:', cookies);
// Use with different browser types
const crawler = new PuppeteerCrawler({
useSessionPool: true,
preNavigationHooks: [
async ({ session, page }) => {
// Set cookies on the browser page before navigation
await session.setPuppeteerCookies(page, '.example.com');
},
],
requestHandler: async ({ session, page, response }) => {
// Save cookies back into the session jar after navigation
const newCookies = await page.cookies();
newCookies.forEach(cookie => {
session.cookieJar.setCookie(cookie, response.url);
});
},
});

Managing user agents for sessions to appear more human-like.
/**
 * Options controlling how user agents are chosen for sessions.
 */
interface UserAgentPoolOptions {
/** List of user agent strings to choose from */
userAgentStrings?: string[];
/** Whether to rotate user agents */
rotateUserAgents?: boolean;
/** User agent categories to use */
categories?: UserAgentCategory[];
/** Operating systems to simulate */
operatingSystems?: string[];
/** Browser types to simulate */
browsers?: string[];
}
/**
 * Device categories used to group user agent strings.
 */
enum UserAgentCategory {
DESKTOP = 'desktop',
MOBILE = 'mobile',
TABLET = 'tablet',
}Usage Examples:
// NOTE(review): Session and UserAgentCategory are used below but were missing
// from this import list.
import { SessionPool, Session, UserAgentCategory } from "crawlee";
const sessionPool = new SessionPool({
userAgentPoolOptions: {
userAgentStrings: [
// Chrome on Windows
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
// Safari on macOS
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
// Firefox on Linux
'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0',
],
rotateUserAgents: true,
categories: [UserAgentCategory.DESKTOP],
},
});
// Custom user agent selection via a session factory
const customSessionPool = new SessionPool({
createSessionFunction: (pool, options) => {
const userAgents = [
'Bot/1.0 (compatible; DataExtractor)',
'Crawler/2.0 (+http://example.com/bot)',
];
return new Session({
...options,
// Pick a user agent at random for each new session
userAgent: userAgents[Math.floor(Math.random() * userAgents.length)],
userData: {
browserType: options?.userAgent?.includes('Chrome') ? 'chrome' : 'firefox',
},
});
},
});

Advanced configuration for when to retire sessions.
/**
 * Advanced rules controlling when sessions are retired.
 */
interface SessionRetirementRules {
/** Maximum age in minutes before retirement */
maxSessionAgeMinutes?: number;
/** Maximum error score before retirement */
maxErrorScore?: number;
/** Retire when a response has one of these HTTP status codes */
retireOnStatusCodes?: number[];
/** Retire when an error message matches any of these patterns */
retireOnErrorPatterns?: RegExp[];
/** Custom retirement predicate; return true to retire the session */
shouldRetireSession?: (session: Session, context?: any) => boolean;
/** How often to check sessions for retirement, in seconds */
retirementCheckIntervalSecs?: number;
}Usage Examples:
// NOTE(review): CheerioCrawler is used below and must be imported; the unused
// Session import was dropped.
import { SessionPool, CheerioCrawler } from "crawlee";
const sessionPool = new SessionPool({
sessionRetirementRules: {
maxSessionAgeMinutes: 30,
maxErrorScore: 5,
retireOnStatusCodes: [403, 429, 503],
retireOnErrorPatterns: [/blocked/i, /captcha/i, /rate.?limit/i],
shouldRetireSession: (session, context) => {
// Custom retirement logic based on per-session counters
const timeouts = session.userData.timeouts || 0;
const redirects = session.userData.redirects || 0;
// Retire if too many timeouts or suspicious redirects
return timeouts > 3 || redirects > 10;
},
retirementCheckIntervalSecs: 300, // Check every 5 minutes
},
});
// Monitor and react to session retirement
const crawler = new CheerioCrawler({
sessionPool,
requestHandler: async ({ session, response }) => {
// Track session metrics: count 3xx redirects per session
if (response.statusCode >= 300 && response.statusCode < 400) {
session.userData.redirects = (session.userData.redirects || 0) + 1;
}
// Process request...
},
failedRequestHandler: async ({ session, error }) => {
// Custom error handling that feeds the retirement predicate above
if (error.code === 'ETIMEDOUT') {
session.userData.timeouts = (session.userData.timeouts || 0) + 1;
}
console.log(`Session ${session.id} error count: ${session.errorScore}`);
},
});

/**
 * Proxy details associated with a session.
 */
interface ProxyInfo {
/** Full proxy URL */
url: string;
/** Proxy hostname */
hostname: string;
/** Proxy port */
port: number;
/** Proxy protocol */
protocol: string;
/** Authentication credentials */
auth?: {
username: string;
password: string;
};
/** Session ID associated with this proxy */
sessionId?: string | number;
/** Password for the proxy (duplicates auth.password — confirm which is authoritative) */
password?: string;
/** Username for the proxy (duplicates auth.username — confirm which is authoritative) */
username?: string;
}
/**
 * Minimal shape of an HTTP response as consumed by session helpers.
 */
interface Response {
/** HTTP status code */
statusCode: number;
/** Response URL (after redirects) */
url: string;
/** Response headers */
headers: Dictionary<string | string[]>;
/** Response body as text */
body?: string;
/** Raw response body bytes */
rawBody?: Buffer;
}
/**
 * Generic string-keyed map type.
 */
interface Dictionary<T = any> {
[key: string]: T;
}
/**
 * A crawl request; UserData parameterizes the custom data carried with it.
 */
interface Request<UserData = Dictionary> {
/** Request URL */
url: string;
/** Loaded URL (after redirects) */
loadedUrl?: string;
/** Unique identifier for deduplication */
uniqueKey: string;
/** HTTP method */
method?: 'GET' | 'POST' | 'PUT' | 'DELETE' | 'HEAD' | 'OPTIONS' | 'PATCH';
/** Request payload (body) */
payload?: string;
/** Custom user data */
userData?: UserData;
/** Request label for routing to handlers */
label?: string;
/** When true, this request is not retried on failure */
noRetry?: boolean;
/** Number of retry attempts so far */
retryCount?: number;
/** HTTP headers */
headers?: Dictionary<string>;
/** When this request was handled */
handledAt?: Date;
}Install with Tessl CLI
npx tessl i tessl/npm-crawlee