or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

browser-crawling.md configuration-proxies.md core-crawling.md http-crawling.md index.md session-management.md storage.md utilities.md

docs/session-management.md

0

# Session Management

1

2

Session management provides capabilities for handling cookies, user agents, and proxy rotation to avoid blocking and rate limiting during large-scale crawling operations.

3

4

## Capabilities

5

6

### Session

7

8

Individual session containing cookies, proxy information, and state for a single logical browsing session.

9

10

```typescript { .api }

11

/**

12

* Represents a single session with cookies, proxy configuration, and state

13

*/

14

class Session {

15

constructor(options: SessionOptions);

16

17

/** Unique session ID */

18

readonly id: string;

19

20

/** Current cookie jar for this session */

21

readonly cookieJar: CookieJar;

22

23

/** Custom user data attached to this session */

24

readonly userData: Dictionary;

25

26

/** Whether this session is blocked/retired */

27

readonly isBlocked: boolean;

28

29

/** Number of errors encountered by this session */

30

readonly errorScore: number;

31

32

/** When this session was created */

33

readonly createdAt: Date;

34

35

/** When this session expires */

36

readonly expiresAt?: Date;

37

38

/** Get cookie string for a URL */

39

getCookieString(url: string): string;

40

41

/** Set cookies from response headers */

42

setCookiesFromResponse(response: Response): void;

43

44

/** Set cookies for Puppeteer page */

45

setPuppeteerCookies(page: PuppeteerPage, domain?: string): Promise<void>;

46

47

/** Set cookies for Playwright page */

48

setPlaywrightCookies(page: PlaywrightPage, domain?: string): Promise<void>;

49

50

/** Mark this session as blocked */

51

markBad(errorMessage?: string): void;

52

53

/** Retire this session (soft block) */

54

retire(): void;

55

56

/** Get session state for persistence */

57

getState(): SessionState;

58

59

/** Check if session is usable */

60

isUsable(): boolean;

61

}

62

```

63

64

### SessionOptions

65

66

Configuration options for creating sessions.

67

68

```typescript { .api }

69

interface SessionOptions {

70

/** Unique session ID */

71

id?: string;

72

73

/** Session pool that owns this session */

74

sessionPool?: SessionPool;

75

76

/** User agent string */

77

userAgent?: string;

78

79

/** Custom user data */

80

userData?: Dictionary;

81

82

/** Proxy information for this session */

83

proxyInfo?: ProxyInfo;

84

85

/** Cookie jar instance */

86

cookieJar?: CookieJar;

87

88

/** Maximum age in seconds */

89

maxAgeSecs?: number;

90

91

/** Maximum number of errors before blocking */

92

maxErrorScore?: number;

93

94

/** Amount by which the error score is lowered after a successful request */

95

errorScoreDecrement?: number;

96

}

97

```

98

99

### SessionState

100

101

Serializable state of a session for persistence.

102

103

```typescript { .api }

104

interface SessionState {

105

/** Session ID */

106

id: string;

107

108

/** Cookies stored in this session, as an array of Cookie objects */

109

cookies: Cookie[];

110

111

/** User agent string */

112

userAgent: string;

113

114

/** Custom user data */

115

userData: Dictionary;

116

117

/** Current error score */

118

errorScore: number;

119

120

/** Whether session is blocked */

121

isBlocked: boolean;

122

123

/** Creation timestamp */

124

createdAt: string;

125

126

/** Expiration timestamp */

127

expiresAt?: string;

128

129

/** Proxy URL if used */

130

proxyUrl?: string;

131

}

132

```

133

134

**Usage Examples:**

135

136

```typescript

137

import { Session, CheerioCrawler } from "crawlee";

138

139

// Create a session manually

140

const session = new Session({

141

userAgent: 'Mozilla/5.0 (compatible; CustomBot/1.0)',

142

userData: { loginStatus: 'guest' },

143

maxAgeSecs: 3600, // 1 hour

144

});

145

146

// Use session in crawler

147

const crawler = new CheerioCrawler({

148

useSessionPool: true,

149

requestHandler: async ({ session, request, response }) => {

150

console.log(`Using session ${session.id} for ${request.url}`);

151

152

// Handle login detection

153

if (response.url.includes('/login')) {

154

session.userData.loginRequired = true;

155

session.markBad('Login required');

156

return;

157

}

158

159

// Save successful interaction

160

if (response.statusCode === 200) {

161

session.userData.lastSuccessful = new Date();

162

}

163

164

// Process response...

165

},

166

});

167

168

// Work with session state

169

const sessionState = session.getState();

170

console.log('Session cookies:', sessionState.cookies.length);

171

console.log('Session score:', sessionState.errorScore);

172

173

// Check session health

174

if (!session.isUsable()) {

175

console.log('Session is no longer usable');

176

}

177

```

178

179

### SessionPool

180

181

Pool for managing multiple sessions with automatic rotation and lifecycle management.

182

183

```typescript { .api }

184

/**

185

* Pool for managing sessions with automatic rotation and error handling

186

*/

187

class SessionPool {

188

constructor(options?: SessionPoolOptions);

189

190

/** Get a session for a request */

191

getSession(request?: Request): Promise<Session>;

192

193

/** Get session by ID */

194

getSessionById(sessionId: string): Session | undefined;

195

196

/** Mark a session as having errors */

197

markSessionBad(session: Session): Promise<void>;

198

199

/** Retire a session (remove from active use) */

200

retire(session: Session): Promise<void>;

201

202

/** Retire all sessions (clear the pool) */

203

retireAllSessions(): Promise<void>;

204

205

/** Manually add a session to the pool */

206

addSession(session: Session): void;

207

208

/** Get pool statistics */

209

getState(): SessionPoolState;

210

211

/** Persist session pool state */

212

persistState(): Promise<void>;

213

214

/** Tear down the session pool */

215

teardown(): Promise<void>;

216

217

/** Total number of sessions in pool */

218

readonly sessionsCount: number;

219

220

/** Number of usable sessions */

221

readonly usableSessionsCount: number;

222

223

/** Number of retired sessions */

224

readonly retiredSessionsCount: number;

225

}

226

```

227

228

### SessionPoolOptions

229

230

Configuration options for SessionPool.

231

232

```typescript { .api }

233

interface SessionPoolOptions {

234

/** Maximum number of sessions in the pool */

235

maxPoolSize?: number;

236

237

/** Options applied to each session created by the pool */

238

sessionOptions?: SessionOptions;

239

240

/** Persist sessions to key-value store */

241

persistStateKeyValueStoreId?: string;

242

243

/** Key for persisting session pool state */

244

persistStateKey?: string;

245

246

/** Custom factory function used to create new sessions */

247

createSessionFunction?: (sessionPool: SessionPool, options?: SessionOptions) => Session;

248

249

/** Custom async function that decides whether a session is valid for use */

250

validateSessionFunction?: (session: Session) => Promise<boolean>;

251

252

/** Custom user agent generation */

253

userAgentPoolOptions?: UserAgentPoolOptions;

254

255

/** Proxy configuration for sessions */

256

proxyConfiguration?: ProxyConfiguration;

257

258

/** Session retirement rules */

259

sessionRetirementRules?: SessionRetirementRules;

260

}

261

```

262

263

### SessionPoolState

264

265

State information about the session pool.

266

267

```typescript { .api }

268

interface SessionPoolState {

269

/** Total sessions in pool */

270

totalSessions: number;

271

272

/** Usable sessions count */

273

usableSessions: number;

274

275

/** Retired sessions count */

276

retiredSessions: number;

277

278

/** Blocked sessions count */

279

blockedSessions: number;

280

281

/** Sessions by error score */

282

sessionsByErrorScore: Dictionary<number>;

283

284

/** Average session age */

285

averageSessionAge: number;

286

287

/** Pool health ratio (0-1) */

288

poolHealth: number;

289

}

290

```

291

292

**Usage Examples:**

293

294

```typescript

295

import { SessionPool, PuppeteerCrawler } from "crawlee";

296

297

// Create session pool with configuration

298

const sessionPool = new SessionPool({

299

maxPoolSize: 100,

300

sessionOptions: {

301

maxAgeSecs: 1800, // 30 minutes

302

maxErrorScore: 3,

303

},

304

persistStateKey: 'my-crawler-sessions',

305

userAgentPoolOptions: {

306

userAgentStrings: [

307

'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',

308

'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',

309

'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',

310

],

311

},

312

});

313

314

const crawler = new PuppeteerCrawler({

315

sessionPool,

316

sessionPoolOptions: {

317

maxPoolSize: 50,

318

},

319

320

requestHandler: async ({ page, request, session }) => {

321

console.log(`Using session ${session.id}`);

322

323

// Handle different response scenarios

324

try {

325

await page.goto(request.url);

326

327

// Check for blocking indicators

328

const isBlocked = await page.$('.captcha, .blocked-message');

329

if (isBlocked) {

330

session.markBad('Blocked by anti-bot measures');

331

return;

332

}

333

334

// Check for rate limiting

335

const isRateLimited = await page.$('.rate-limit');

336

if (isRateLimited) {

337

session.userData.rateLimited = true;

338

// Don't mark as bad, just note it

339

}

340

341

// Extract data...

342

const title = await page.title();

343

await Dataset.pushData({ url: request.url, title });

344

345

} catch (error) {

346

// Handle session-related errors

347

if (error.message.includes('timeout')) {

348

session.userData.timeouts = (session.userData.timeouts || 0) + 1;

349

if (session.userData.timeouts > 3) {

350

session.markBad('Too many timeouts');

351

}

352

}

353

throw error;

354

}

355

},

356

357

// Custom failed request handler for session management

358

failedRequestHandler: async ({ request, session, error }) => {

359

console.log(`Request failed for session ${session.id}: ${error.message}`);

360

361

// Mark session bad for certain error types

362

if (error.message.includes('403') || error.message.includes('blocked')) {

363

await sessionPool.markSessionBad(session);

364

}

365

},

366

});

367

368

// Monitor session pool

369

setInterval(async () => {

370

const state = sessionPool.getState();

371

console.log(`Session pool: ${state.usableSessions}/${state.totalSessions} usable`);

372

console.log(`Pool health: ${(state.poolHealth * 100).toFixed(1)}%`);

373

374

// Retire old sessions if pool health is low

375

if (state.poolHealth < 0.3) {

376

console.log('Pool health low, retiring all sessions');

377

await sessionPool.retireAllSessions();

378

}

379

}, 30000);

380

381

await crawler.run();

382

383

// Clean up

384

await sessionPool.teardown();

385

```

386

387

### Cookie Management

388

389

Working with cookies across different session types.

390

391

```typescript { .api }

392

interface Cookie {

393

/** Cookie name */

394

name: string;

395

396

/** Cookie value */

397

value: string;

398

399

/** Domain for the cookie */

400

domain?: string;

401

402

/** Path for the cookie */

403

path?: string;

404

405

/** Expiration date */

406

expires?: Date;

407

408

/** Max age in seconds */

409

maxAge?: number;

410

411

/** Whether cookie is secure */

412

secure?: boolean;

413

414

/** Whether cookie is HTTP only */

415

httpOnly?: boolean;

416

417

/** SameSite policy */

418

sameSite?: 'Strict' | 'Lax' | 'None';

419

}

420

421

interface CookieJar {

422

/** Get all cookies that apply to a URL */

423

getCookies(url: string): Cookie[];

424

425

/** Set a cookie */

426

setCookie(cookie: Cookie | string, url: string): void;

427

428

/** Get cookies as header string */

429

getCookieString(url: string): string;

430

431

/** Remove cookies */

432

removeCookie(name: string, domain?: string): boolean;

433

434

/** Remove all cookies */

435

removeAllCookies(): void;

436

}

437

```

438

439

**Usage Examples:**

440

441

```typescript

442

import { Session } from "crawlee";

443

444

const session = new Session({

445

userAgent: 'CustomBot/1.0',

446

});

447

448

// Working with cookies manually

449

session.cookieJar.setCookie({

450

name: 'session_id',

451

value: 'abc123',

452

domain: '.example.com',

453

path: '/',

454

secure: true,

455

httpOnly: true,

456

}, 'https://example.com');

457

458

// Get cookies for a specific URL

459

const cookies = session.cookieJar.getCookies('https://api.example.com');

460

console.log('Cookies for API:', cookies);

461

462

// Use with different browser types

463

const crawler = new PuppeteerCrawler({

464

useSessionPool: true,

465

preNavigationHooks: [

466

async ({ session, page }) => {

467

// Set cookies before navigation

468

await session.setPuppeteerCookies(page, '.example.com');

469

},

470

],

471

472

requestHandler: async ({ session, page, response }) => {

473

// Save cookies after navigation

474

const newCookies = await page.cookies();

475

newCookies.forEach(cookie => {

476

session.cookieJar.setCookie(cookie, response.url);

477

});

478

},

479

});

480

```

481

482

### User Agent Management

483

484

Managing user agents for sessions to appear more human-like.

485

486

```typescript { .api }

487

interface UserAgentPoolOptions {

488

/** List of user agent strings to choose from */

489

userAgentStrings?: string[];

490

491

/** Whether to rotate user agents */

492

rotateUserAgents?: boolean;

493

494

/** User agent categories to use */

495

categories?: UserAgentCategory[];

496

497

/** Operating systems to simulate */

498

operatingSystems?: string[];

499

500

/** Browser types to simulate */

501

browsers?: string[];

502

}

503

504

enum UserAgentCategory {

505

DESKTOP = 'desktop',

506

MOBILE = 'mobile',

507

TABLET = 'tablet',

508

}

509

```

510

511

**Usage Examples:**

512

513

```typescript

514

import { SessionPool } from "crawlee";

515

516

const sessionPool = new SessionPool({

517

userAgentPoolOptions: {

518

userAgentStrings: [

519

// Chrome on Windows

520

'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',

521

// Safari on macOS

522

'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',

523

// Firefox on Linux

524

'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0',

525

],

526

rotateUserAgents: true,

527

categories: [UserAgentCategory.DESKTOP],

528

},

529

});

530

531

// Custom user agent selection

532

const customSessionPool = new SessionPool({

533

createSessionFunction: (pool, options) => {

534

const userAgents = [

535

'Bot/1.0 (compatible; DataExtractor)',

536

'Crawler/2.0 (+http://example.com/bot)',

537

];

538

539

return new Session({

540

...options,

541

userAgent: userAgents[Math.floor(Math.random() * userAgents.length)],

542

userData: {

543

browserType: options?.userAgent?.includes('Chrome') ? 'chrome' : 'firefox',

544

},

545

});

546

},

547

});

548

```

549

550

### Session Retirement Rules

551

552

Advanced configuration for when to retire sessions.

553

554

```typescript { .api }

555

interface SessionRetirementRules {

556

/** Maximum age before retirement */

557

maxSessionAgeMinutes?: number;

558

559

/** Maximum error score before retirement */

560

maxErrorScore?: number;

561

562

/** Retire on specific HTTP status codes */

563

retireOnStatusCodes?: number[];

564

565

/** Retire on specific error patterns */

566

retireOnErrorPatterns?: RegExp[];

567

568

/** Custom retirement function */

569

shouldRetireSession?: (session: Session, context?: any) => boolean;

570

571

/** How often to check for retirement */

572

retirementCheckIntervalSecs?: number;

573

}

574

```

575

576

**Usage Examples:**

577

578

```typescript

579

import { SessionPool, Session } from "crawlee";

580

581

const sessionPool = new SessionPool({

582

sessionRetirementRules: {

583

maxSessionAgeMinutes: 30,

584

maxErrorScore: 5,

585

retireOnStatusCodes: [403, 429, 503],

586

retireOnErrorPatterns: [/blocked/i, /captcha/i, /rate.?limit/i],

587

588

shouldRetireSession: (session, context) => {

589

// Custom retirement logic

590

const timeouts = session.userData.timeouts || 0;

591

const redirects = session.userData.redirects || 0;

592

593

// Retire if too many timeouts or suspicious redirects

594

return timeouts > 3 || redirects > 10;

595

},

596

597

retirementCheckIntervalSecs: 300, // Check every 5 minutes

598

},

599

});

600

601

// Monitor and react to session retirement

602

const crawler = new CheerioCrawler({

603

sessionPool,

604

605

requestHandler: async ({ session, response }) => {

606

// Track session metrics

607

if (response.statusCode >= 300 && response.statusCode < 400) {

608

session.userData.redirects = (session.userData.redirects || 0) + 1;

609

}

610

611

// Process request...

612

},

613

614

failedRequestHandler: async ({ session, error }) => {

615

// Custom error handling that affects retirement

616

if (error.code === 'ETIMEDOUT') {

617

session.userData.timeouts = (session.userData.timeouts || 0) + 1;

618

}

619

620

console.log(`Session ${session.id} error count: ${session.errorScore}`);

621

},

622

});

623

```

624

625

## Types

626

627

```typescript { .api }

628

interface ProxyInfo {

629

/** Proxy URL */

630

url: string;

631

632

/** Proxy hostname */

633

hostname: string;

634

635

/** Proxy port */

636

port: number;

637

638

/** Proxy protocol */

639

protocol: string;

640

641

/** Authentication credentials */

642

auth?: {

643

username: string;

644

password: string;

645

};

646

647

/** Session ID associated with this proxy */

648

sessionId?: string | number;

649

650

/** Password for the proxy */

651

password?: string;

652

653

/** Username for the proxy */

654

username?: string;

655

}

656

657

interface Response {

658

/** HTTP status code */

659

statusCode: number;

660

661

/** Response URL (after redirects) */

662

url: string;

663

664

/** Response headers */

665

headers: Dictionary<string | string[]>;

666

667

/** Response body */

668

body?: string;

669

670

/** Raw response body */

671

rawBody?: Buffer;

672

}

673

674

interface Dictionary<T = any> {

675

[key: string]: T;

676

}

677

678

interface Request<UserData = Dictionary> {

679

/** Request URL */

680

url: string;

681

682

/** Loaded URL (after redirects) */

683

loadedUrl?: string;

684

685

/** Unique identifier for deduplication */

686

uniqueKey: string;

687

688

/** HTTP method */

689

method?: 'GET' | 'POST' | 'PUT' | 'DELETE' | 'HEAD' | 'OPTIONS' | 'PATCH';

690

691

/** Request payload */

692

payload?: string;

693

694

/** Custom user data */

695

userData?: UserData;

696

697

/** Request label for routing */

698

label?: string;

699

700

/** Whether to retry this request on failure */

701

noRetry?: boolean;

702

703

/** Number of retry attempts */

704

retryCount?: number;

705

706

/** HTTP headers */

707

headers?: Dictionary<string>;

708

709

/** When this request was handled */

710

handledAt?: Date;

711

}

712

```