0
# Session Management
1
2
Session management provides capabilities for handling cookies, user agents, and proxy rotation to avoid blocking and rate limiting during large-scale crawling operations.
3
4
## Capabilities
5
6
### Session
7
8
Individual session containing cookies, proxy information, and state for a single logical browsing session.
9
10
```typescript { .api }
11
/**
12
* Represents a single session with cookies, proxy configuration, and state
13
*/
14
class Session {
15
constructor(options: SessionOptions);
16
17
/** Unique session ID */
18
readonly id: string;
19
20
/** Current cookie jar for this session */
21
readonly cookieJar: CookieJar;
22
23
/** Custom user data attached to this session */
24
readonly userData: Dictionary;
25
26
/** Whether this session is blocked/retired */
27
readonly isBlocked: boolean;
28
29
/** Number of errors encountered by this session */
30
readonly errorScore: number;
31
32
/** When this session was created */
33
readonly createdAt: Date;
34
35
/** When this session expires */
36
readonly expiresAt?: Date;
37
38
/** Get cookie string for a URL */
39
getCookieString(url: string): string;
40
41
/** Set cookies from response headers */
42
setCookiesFromResponse(response: Response): void;
43
44
/** Set cookies for Puppeteer page */
45
setPuppeteerCookies(page: PuppeteerPage, domain?: string): Promise<void>;
46
47
/** Set cookies for Playwright page */
48
setPlaywrightCookies(page: PlaywrightPage, domain?: string): Promise<void>;
49
50
/** Report an unsuccessful use of this session, raising its error score */
51
markBad(errorMessage?: string): void;
52
53
/** Retire this session (soft block) */
54
retire(): void;
55
56
/** Get session state for persistence */
57
getState(): SessionState;
58
59
/** Check if session is usable */
60
isUsable(): boolean;
61
}
62
```
63
64
### SessionOptions
65
66
Configuration options for creating sessions.
67
68
```typescript { .api }
69
interface SessionOptions {
70
/** Unique session ID */
71
id?: string;
72
73
/** Session pool that owns this session */
74
sessionPool?: SessionPool;
75
76
/** User agent string */
77
userAgent?: string;
78
79
/** Custom user data */
80
userData?: Dictionary;
81
82
/** Proxy information for this session */
83
proxyInfo?: ProxyInfo;
84
85
/** Cookie jar instance */
86
cookieJar?: CookieJar;
87
88
/** Maximum age in seconds */
89
maxAgeSecs?: number;
90
91
/** Maximum number of errors before blocking */
92
maxErrorScore?: number;
93
94
/** Amount by which the error score is decremented */
95
errorScoreDecrement?: number;
96
}
97
```
98
99
### SessionState
100
101
Serializable state of a session for persistence.
102
103
```typescript { .api }
104
interface SessionState {
105
/** Session ID */
106
id: string;
107
108
/** Cookies stored in the session */
109
cookies: Cookie[];
110
111
/** User agent string */
112
userAgent: string;
113
114
/** Custom user data */
115
userData: Dictionary;
116
117
/** Current error score */
118
errorScore: number;
119
120
/** Whether session is blocked */
121
isBlocked: boolean;
122
123
/** Creation timestamp */
124
createdAt: string;
125
126
/** Expiration timestamp */
127
expiresAt?: string;
128
129
/** Proxy URL if used */
130
proxyUrl?: string;
131
}
132
```
133
134
**Usage Examples:**
135
136
```typescript
137
import { Session, CheerioCrawler } from "crawlee";
138
139
// Create a session manually
140
const session = new Session({
141
userAgent: 'Mozilla/5.0 (compatible; CustomBot/1.0)',
142
userData: { loginStatus: 'guest' },
143
maxAgeSecs: 3600, // 1 hour
144
});
145
146
// Use session in crawler
147
const crawler = new CheerioCrawler({
148
useSessionPool: true,
149
requestHandler: async ({ session, request, response }) => {
150
console.log(`Using session ${session.id} for ${request.url}`);
151
152
// Handle login detection
153
if (response.url.includes('/login')) {
154
session.userData.loginRequired = true;
155
session.markBad('Login required');
156
return;
157
}
158
159
// Save successful interaction
160
if (response.statusCode === 200) {
161
session.userData.lastSuccessful = new Date();
162
}
163
164
// Process response...
165
},
166
});
167
168
// Work with session state
169
const sessionState = session.getState();
170
console.log('Session cookies:', sessionState.cookies.length);
171
console.log('Session score:', sessionState.errorScore);
172
173
// Check session health
174
if (!session.isUsable()) {
175
console.log('Session is no longer usable');
176
}
177
```
178
179
### SessionPool
180
181
Pool for managing multiple sessions with automatic rotation and lifecycle management.
182
183
```typescript { .api }
184
/**
185
* Pool for managing sessions with automatic rotation and error handling
186
*/
187
class SessionPool {
188
constructor(options?: SessionPoolOptions);
189
190
/** Get a session for a request */
191
getSession(request?: Request): Promise<Session>;
192
193
/** Get session by ID */
194
getSessionById(sessionId: string): Session | undefined;
195
196
/** Mark a session as having errors */
197
markSessionBad(session: Session): Promise<void>;
198
199
/** Retire a session (remove from active use) */
200
retire(session: Session): Promise<void>;
201
202
/** Retire all sessions (clear the pool) */
203
retireAllSessions(): Promise<void>;
204
205
/** Manually add a session to the pool */
206
addSession(session: Session): void;
207
208
/** Get pool statistics */
209
getState(): SessionPoolState;
210
211
/** Persist session pool state */
212
persistState(): Promise<void>;
213
214
/** Tear down the session pool */
215
teardown(): Promise<void>;
216
217
/** Total number of sessions in pool */
218
readonly sessionsCount: number;
219
220
/** Number of usable sessions */
221
readonly usableSessionsCount: number;
222
223
/** Number of retired sessions */
224
readonly retiredSessionsCount: number;
225
}
226
```
227
228
### SessionPoolOptions
229
230
Configuration options for SessionPool.
231
232
```typescript { .api }
233
interface SessionPoolOptions {
234
/** Maximum number of sessions in the pool */
235
maxPoolSize?: number;
236
237
/** Options applied to sessions created by the pool */
238
sessionOptions?: SessionOptions;
239
240
/** Persist sessions to key-value store */
241
persistStateKeyValueStoreId?: string;
242
243
/** Key for persisting session pool state */
244
persistStateKey?: string;
245
246
/** Custom function used to create new sessions */
247
createSessionFunction?: (sessionPool: SessionPool, options?: SessionOptions) => Session;
248
249
/** Custom function used to validate a session before use */
250
validateSessionFunction?: (session: Session) => Promise<boolean>;
251
252
/** Custom user agent generation */
253
userAgentPoolOptions?: UserAgentPoolOptions;
254
255
/** Proxy configuration for sessions */
256
proxyConfiguration?: ProxyConfiguration;
257
258
/** Session retirement rules */
259
sessionRetirementRules?: SessionRetirementRules;
260
}
261
```
262
263
### SessionPoolState
264
265
State information about the session pool.
266
267
```typescript { .api }
268
interface SessionPoolState {
269
/** Total sessions in pool */
270
totalSessions: number;
271
272
/** Usable sessions count */
273
usableSessions: number;
274
275
/** Retired sessions count */
276
retiredSessions: number;
277
278
/** Blocked sessions count */
279
blockedSessions: number;
280
281
/** Sessions by error score */
282
sessionsByErrorScore: Dictionary<number>;
283
284
/** Average session age */
285
averageSessionAge: number;
286
287
/** Pool health ratio (0-1) */
288
poolHealth: number;
289
}
290
```
291
292
**Usage Examples:**
293
294
```typescript
295
import { SessionPool, PuppeteerCrawler, Dataset } from "crawlee";
296
297
// Create session pool with configuration
298
const sessionPool = new SessionPool({
299
maxPoolSize: 100,
300
sessionOptions: {
301
maxAgeSecs: 1800, // 30 minutes
302
maxErrorScore: 3,
303
},
304
persistStateKey: 'my-crawler-sessions',
305
userAgentPoolOptions: {
306
userAgentStrings: [
307
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
308
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
309
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
310
],
311
},
312
});
313
314
const crawler = new PuppeteerCrawler({
315
sessionPool,
316
sessionPoolOptions: {
317
maxPoolSize: 50,
318
},
319
320
requestHandler: async ({ page, request, session }) => {
321
console.log(`Using session ${session.id}`);
322
323
// Handle different response scenarios
324
try {
325
await page.goto(request.url);
326
327
// Check for blocking indicators
328
const isBlocked = await page.$('.captcha, .blocked-message');
329
if (isBlocked) {
330
session.markBad('Blocked by anti-bot measures');
331
return;
332
}
333
334
// Check for rate limiting
335
const isRateLimited = await page.$('.rate-limit');
336
if (isRateLimited) {
337
session.userData.rateLimited = true;
338
// Don't mark as bad, just note it
339
}
340
341
// Extract data...
342
const title = await page.title();
343
await Dataset.pushData({ url: request.url, title });
344
345
} catch (error) {
346
// Handle session-related errors
347
if (error.message.includes('timeout')) {
348
session.userData.timeouts = (session.userData.timeouts || 0) + 1;
349
if (session.userData.timeouts > 3) {
350
session.markBad('Too many timeouts');
351
}
352
}
353
throw error;
354
}
355
},
356
357
// Custom failed request handler for session management
358
failedRequestHandler: async ({ request, session, error }) => {
359
console.log(`Request failed for session ${session.id}: ${error.message}`);
360
361
// Mark session bad for certain error types
362
if (error.message.includes('403') || error.message.includes('blocked')) {
363
await sessionPool.markSessionBad(session);
364
}
365
},
366
});
367
368
// Monitor session pool
369
setInterval(async () => {
370
const state = sessionPool.getState();
371
console.log(`Session pool: ${state.usableSessions}/${state.totalSessions} usable`);
372
console.log(`Pool health: ${(state.poolHealth * 100).toFixed(1)}%`);
373
374
// Retire old sessions if pool health is low
375
if (state.poolHealth < 0.3) {
376
console.log('Pool health low, retiring all sessions');
377
await sessionPool.retireAllSessions();
378
}
379
}, 30000);
380
381
await crawler.run();
382
383
// Clean up
384
await sessionPool.teardown();
385
```
386
387
### Cookie Management
388
389
Working with cookies across different session types.
390
391
```typescript { .api }
392
interface Cookie {
393
/** Cookie name */
394
name: string;
395
396
/** Cookie value */
397
value: string;
398
399
/** Domain for the cookie */
400
domain?: string;
401
402
/** Path for the cookie */
403
path?: string;
404
405
/** Expiration date */
406
expires?: Date;
407
408
/** Max age in seconds */
409
maxAge?: number;
410
411
/** Whether cookie is secure */
412
secure?: boolean;
413
414
/** Whether cookie is HTTP only */
415
httpOnly?: boolean;
416
417
/** SameSite policy */
418
sameSite?: 'Strict' | 'Lax' | 'None';
419
}
420
421
interface CookieJar {
422
/** Get all cookies that apply to a URL */
423
getCookies(url: string): Cookie[];
424
425
/** Set a cookie */
426
setCookie(cookie: Cookie | string, url: string): void;
427
428
/** Get cookies as header string */
429
getCookieString(url: string): string;
430
431
/** Remove cookies */
432
removeCookie(name: string, domain?: string): boolean;
433
434
/** Remove all cookies */
435
removeAllCookies(): void;
436
}
437
```
438
439
**Usage Examples:**
440
441
```typescript
442
import { Session, PuppeteerCrawler } from "crawlee";
443
444
const session = new Session({
445
userAgent: 'CustomBot/1.0',
446
});
447
448
// Working with cookies manually
449
session.cookieJar.setCookie({
450
name: 'session_id',
451
value: 'abc123',
452
domain: '.example.com',
453
path: '/',
454
secure: true,
455
httpOnly: true,
456
}, 'https://example.com');
457
458
// Get cookies for a specific URL
459
const cookies = session.cookieJar.getCookies('https://api.example.com');
460
console.log('Cookies for API:', cookies);
461
462
// Use with different browser types
463
const crawler = new PuppeteerCrawler({
464
useSessionPool: true,
465
preNavigationHooks: [
466
async ({ session, page }) => {
467
// Set cookies before navigation
468
await session.setPuppeteerCookies(page, '.example.com');
469
},
470
],
471
472
requestHandler: async ({ session, page, response }) => {
473
// Save cookies after navigation
474
const newCookies = await page.cookies();
475
newCookies.forEach(cookie => {
476
session.cookieJar.setCookie(cookie, response.url);
477
});
478
},
479
});
480
```
481
482
### User Agent Management
483
484
Managing user agents for sessions to appear more human-like.
485
486
```typescript { .api }
487
interface UserAgentPoolOptions {
488
/** List of user agent strings to choose from */
489
userAgentStrings?: string[];
490
491
/** Whether to rotate user agents */
492
rotateUserAgents?: boolean;
493
494
/** User agent categories to use */
495
categories?: UserAgentCategory[];
496
497
/** Operating systems to simulate */
498
operatingSystems?: string[];
499
500
/** Browser types to simulate */
501
browsers?: string[];
502
}
503
504
enum UserAgentCategory {
505
DESKTOP = 'desktop',
506
MOBILE = 'mobile',
507
TABLET = 'tablet',
508
}
509
```
510
511
**Usage Examples:**
512
513
```typescript
514
import { Session, SessionPool, UserAgentCategory } from "crawlee";
515
516
const sessionPool = new SessionPool({
517
userAgentPoolOptions: {
518
userAgentStrings: [
519
// Chrome on Windows
520
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
521
// Safari on macOS
522
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
523
// Firefox on Linux
524
'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0',
525
],
526
rotateUserAgents: true,
527
categories: [UserAgentCategory.DESKTOP],
528
},
529
});
530
531
// Custom user agent selection
532
const customSessionPool = new SessionPool({
533
createSessionFunction: (pool, options) => {
534
const userAgents = [
535
'Bot/1.0 (compatible; DataExtractor)',
536
'Crawler/2.0 (+http://example.com/bot)',
537
];
538
539
return new Session({
540
...options,
541
userAgent: userAgents[Math.floor(Math.random() * userAgents.length)],
542
userData: {
543
browserType: options?.userAgent?.includes('Chrome') ? 'chrome' : 'firefox',
544
},
545
});
546
},
547
});
548
```
549
550
### Session Retirement Rules
551
552
Advanced configuration for when to retire sessions.
553
554
```typescript { .api }
555
interface SessionRetirementRules {
556
/** Maximum session age in minutes before retirement */
557
maxSessionAgeMinutes?: number;
558
559
/** Maximum error score before retirement */
560
maxErrorScore?: number;
561
562
/** Retire on specific HTTP status codes */
563
retireOnStatusCodes?: number[];
564
565
/** Retire on specific error patterns */
566
retireOnErrorPatterns?: RegExp[];
567
568
/** Custom retirement function */
569
shouldRetireSession?: (session: Session, context?: any) => boolean;
570
571
/** How often to check for retirement, in seconds */
572
retirementCheckIntervalSecs?: number;
573
}
574
```
575
576
**Usage Examples:**
577
578
```typescript
579
import { SessionPool, Session, CheerioCrawler } from "crawlee";
580
581
const sessionPool = new SessionPool({
582
sessionRetirementRules: {
583
maxSessionAgeMinutes: 30,
584
maxErrorScore: 5,
585
retireOnStatusCodes: [403, 429, 503],
586
retireOnErrorPatterns: [/blocked/i, /captcha/i, /rate.?limit/i],
587
588
shouldRetireSession: (session, context) => {
589
// Custom retirement logic
590
const timeouts = session.userData.timeouts || 0;
591
const redirects = session.userData.redirects || 0;
592
593
// Retire if too many timeouts or suspicious redirects
594
return timeouts > 3 || redirects > 10;
595
},
596
597
retirementCheckIntervalSecs: 300, // Check every 5 minutes
598
},
599
});
600
601
// Monitor and react to session retirement
602
const crawler = new CheerioCrawler({
603
sessionPool,
604
605
requestHandler: async ({ session, response }) => {
606
// Track session metrics
607
if (response.statusCode >= 300 && response.statusCode < 400) {
608
session.userData.redirects = (session.userData.redirects || 0) + 1;
609
}
610
611
// Process request...
612
},
613
614
failedRequestHandler: async ({ session, error }) => {
615
// Custom error handling that affects retirement
616
if (error.code === 'ETIMEDOUT') {
617
session.userData.timeouts = (session.userData.timeouts || 0) + 1;
618
}
619
620
console.log(`Session ${session.id} error count: ${session.errorScore}`);
621
},
622
});
623
```
624
625
## Types
626
627
```typescript { .api }
628
interface ProxyInfo {
629
/** Proxy URL */
630
url: string;
631
632
/** Proxy hostname */
633
hostname: string;
634
635
/** Proxy port */
636
port: number;
637
638
/** Proxy protocol */
639
protocol: string;
640
641
/** Authentication credentials */
642
auth?: {
643
username: string;
644
password: string;
645
};
646
647
/** Session ID associated with this proxy */
648
sessionId?: string | number;
649
650
/** Password for the proxy */
651
password?: string;
652
653
/** Username for the proxy */
654
username?: string;
655
}
656
657
interface Response {
658
/** HTTP status code */
659
statusCode: number;
660
661
/** Response URL (after redirects) */
662
url: string;
663
664
/** Response headers */
665
headers: Dictionary<string | string[]>;
666
667
/** Response body */
668
body?: string;
669
670
/** Raw response body */
671
rawBody?: Buffer;
672
}
673
674
interface Dictionary<T = any> {
675
[key: string]: T;
676
}
677
678
interface Request<UserData = Dictionary> {
679
/** Request URL */
680
url: string;
681
682
/** Loaded URL (after redirects) */
683
loadedUrl?: string;
684
685
/** Unique identifier for deduplication */
686
uniqueKey: string;
687
688
/** HTTP method */
689
method?: 'GET' | 'POST' | 'PUT' | 'DELETE' | 'HEAD' | 'OPTIONS' | 'PATCH';
690
691
/** Request payload */
692
payload?: string;
693
694
/** Custom user data */
695
userData?: UserData;
696
697
/** Request label for routing */
698
label?: string;
699
700
/** Whether to retry this request on failure */
701
noRetry?: boolean;
702
703
/** Number of retry attempts */
704
retryCount?: number;
705
706
/** HTTP headers */
707
headers?: Dictionary<string>;
708
709
/** When this request was handled */
710
handledAt?: Date;
711
}
712
```