0
# Sessions
1
2
Session and cookie management with rotation capabilities for maintaining state across requests and avoiding detection. Sessions provide persistent state management, cookie handling, and user agent rotation for more human-like crawling behavior.
3
4
## Capabilities
5
6
### Session
7
8
Individual session object managing cookies, user agent, and request state for a single logical browsing session.
9
10
```python { .api }
11
class Session:
    """Individual session managing cookies, user agent, and request state for one logical browsing session."""

    def __init__(
        self,
        session_pool: SessionPool,
        *,
        id: str | None = None,
        max_age: timedelta = timedelta(hours=1),
        max_usage_count: int = 50,
        max_error_score: float = 3.0
    ):
        """
        Create a session.

        Args:
            session_pool: Pool this session is associated with
            id: Session identifier (NOTE(review): presumably generated when None - confirm)
            max_age: Age limit; see `is_expired`
            max_usage_count: Usage limit; see `is_expired`
            max_error_score: Error score limit (NOTE(review): presumably compared against `error_score` - confirm)
        """

    @property
    def id(self) -> str:
        """Unique session identifier."""

    @property
    def cookies(self) -> SessionCookies:
        """Cookie jar for this session."""

    @property
    def user_agent(self) -> str:
        """User agent string for this session."""

    @property
    def usage_count(self) -> int:
        """Number of requests made with this session."""

    @property
    def error_score(self) -> float:
        """Accumulated error score (higher = more problematic)."""

    @property
    def is_blocked(self) -> bool:
        """True if session appears to be blocked."""

    @property
    def is_expired(self) -> bool:
        """True if session has exceeded age or usage limits."""

    def mark_blocked(self) -> None:
        """Mark session as blocked/detected."""

    def retire(self) -> None:
        """Remove session from pool and mark as retired."""

    # `Any` (typing), not the builtin function `any`, is the correct value type here.
    def get_state(self) -> dict[str, Any]:
        """Get session state for persistence."""

    def set_state(self, state: dict[str, Any]) -> None:
        """Restore session state from persistence."""
61
```
62
63
### Session Pool
64
65
Pool managing multiple sessions with automatic rotation, creation, and cleanup of sessions to maintain anonymity.
66
67
```python { .api }
68
class SessionPool:
    """Pool managing multiple sessions with rotation, creation, and cleanup."""

    def __init__(
        self,
        *,
        max_pool_size: int = 1000,
        create_session_function: Callable[[], Session] | None = None,
        persist_state_key: str | None = None,
        persist_state_key_value_store_id: str | None = None
    ):
        """
        Create a session pool.

        Args:
            max_pool_size: Maximum number of sessions held by the pool
            create_session_function: Optional zero-argument factory returning a Session
            persist_state_key: Key for persisted pool state (NOTE(review): presumably persistence is off when None - confirm)
            persist_state_key_value_store_id: Identifier of the key-value store used for persistence (NOTE(review): confirm default store when None)
        """

    async def get_session(self, session_id: str | None = None) -> Session:
        """
        Get session from pool, creating new one if needed.

        Args:
            session_id: Specific session ID to retrieve

        Returns:
            Session object
        """

    async def retire_session(self, session: Session) -> None:
        """Remove session from pool."""

    def get_session_count(self) -> int:
        """Get number of sessions in pool."""

    # `Any` (typing), not the builtin function `any`, is the correct value type here.
    def get_state(self) -> dict[str, Any]:
        """Get pool state for persistence."""

    async def persist_state(self) -> None:
        """Save pool state to storage."""

    async def initialize(self) -> None:
        """Initialize pool and restore state if configured."""

    async def teardown(self) -> None:
        """Clean up pool resources."""

    @property
    def max_pool_size(self) -> int:
        """Maximum number of sessions held by the pool."""
109
```
110
111
### Session Cookies
112
113
Cookie management within sessions supporting standard HTTP cookie operations with domain and path handling.
114
115
```python { .api }
116
class SessionCookies:
    """Cookie container for a session with domain- and path-aware operations."""

    def __init__(self): ...

    def add_cookie(
        self,
        cookie: CookieParam,
        *,
        url: str | None = None
    ) -> None:
        """
        Add cookie to session.

        Args:
            cookie: Cookie data
            url: URL context for cookie domain/path
        """

    def get_cookie(
        self,
        name: str,
        domain: str | None = None,
        path: str | None = None
    ) -> Cookie | None:
        """
        Get cookie by name and optional domain/path.

        Args:
            name: Cookie name
            domain: Cookie domain
            path: Cookie path

        Returns:
            Cookie object or None if not found
        """

    def delete_cookie(
        self,
        name: str,
        domain: str | None = None,
        path: str | None = None
    ) -> None:
        """
        Delete cookie by name and optional domain/path.

        Args:
            name: Cookie name
            domain: Cookie domain
            path: Cookie path
        """

    def clear(self) -> None:
        """Remove all cookies."""

    def get_cookies_for_url(self, url: str) -> list[Cookie]:
        """Get all cookies applicable to given URL."""

    # `Any` (typing), not the builtin function `any`, is the correct value type here.
    def to_dict(self) -> dict[str, Any]:
        """Serialize cookies to dictionary."""

    def from_dict(self, data: dict[str, Any]) -> None:
        """Restore cookies from dictionary."""

    def __len__(self) -> int: ...

    def __iter__(self) -> Iterator[Cookie]: ...
174
```
175
176
### Cookie Types
177
178
Type definitions for cookie parameters and cookie objects.
179
180
```python { .api }
181
CookieParam = Union[
182
dict[str, str | int | float | bool | None],
183
Cookie
184
]
185
```
186
187
```python { .api }
188
class Cookie:
    """A single HTTP cookie and the attributes that scope it."""

    def __init__(
        self,
        name: str,
        value: str,
        *,
        domain: str | None = None,
        path: str = "/",
        expires: datetime | None = None,
        max_age: int | None = None,
        secure: bool = False,
        http_only: bool = False,
        same_site: Literal["Strict", "Lax", "None"] | None = None
    ):
        """
        Create a cookie.

        Args:
            name: Cookie name
            value: Cookie value
            domain: Cookie domain
            path: Cookie path
            expires: Expiration time
            max_age: Max-Age attribute (NOTE(review): presumably seconds - confirm)
            secure: Secure attribute
            http_only: HttpOnly attribute
            same_site: SameSite attribute
        """

    @property
    def name(self) -> str:
        """Cookie name."""

    @property
    def value(self) -> str:
        """Cookie value."""

    @property
    def domain(self) -> str | None:
        """Cookie domain, if any."""

    @property
    def path(self) -> str:
        """Cookie path."""

    @property
    def expires(self) -> datetime | None:
        """Expiration time, if any."""

    @property
    def secure(self) -> bool:
        """Secure attribute."""

    @property
    def http_only(self) -> bool:
        """HttpOnly attribute."""

    def is_expired(self) -> bool:
        """Check if cookie has expired."""

    def matches_url(self, url: str) -> bool:
        """Check if cookie should be sent with given URL."""
229
```
230
231
## Usage Examples
232
233
### Basic Session Usage
234
235
```python
236
import asyncio
from crawlee.sessions import SessionPool, Session


def should_retire_session() -> bool:
    """Placeholder hook: put your own blocked-session detection here."""
    return False


async def main():
    """Acquire a session, inspect it, give it a cookie, and retire it if it looks blocked."""
    # Set up the pool that hands out sessions
    pool = SessionPool(max_pool_size=100)
    await pool.initialize()

    # Take one session out of the pool
    session = await pool.get_session()

    print(f"Session ID: {session.id}")
    print(f"User Agent: {session.user_agent}")
    print(f"Usage count: {session.usage_count}")

    # Store a cookie on the session
    session_cookie = {
        'name': 'sessionid',
        'value': 'abc123',
        'domain': 'example.com'
    }
    session.cookies.add_cookie(session_cookie)

    # Count the cookies that apply to example.com
    print(f"Cookies for example.com: {len(session.cookies.get_cookies_for_url('https://example.com'))}")

    # Flag and retire the session when it appears to be blocked
    if should_retire_session():
        session.mark_blocked()
        await pool.retire_session(session)

    await pool.teardown()


asyncio.run(main())
273
```
274
275
### Session with HTTP Crawler
276
277
```python
278
import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.sessions import SessionPool


async def main():
    """Crawl two pages through a small session pool, flagging sessions that receive a 403."""
    start_urls = ['https://example.com/page1', 'https://example.com/page2']

    # Small pool configured with a persist_state_key
    pool = SessionPool(max_pool_size=10, persist_state_key='my-crawler-sessions')

    # Hand the pool to the crawler
    crawler = HttpCrawler(session_pool=pool, use_session_pool=True)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        session = context.session

        context.log.info(f"Using session: {session.id}")
        context.log.info(f"Session usage: {session.usage_count}")

        # Attach the authentication cookie when the session does not have one yet
        if not session.cookies.get_cookie('auth_token'):
            auth_cookie = {
                'name': 'auth_token',
                'value': 'your_auth_token_here',
                'domain': 'example.com'
            }
            session.cookies.add_cookie(auth_cookie)

        # Record what was fetched and with which session
        await context.push_data({
            'url': context.request.url,
            'session_id': session.id,
            'status': context.response.status_code
        })

        # A 403 suggests the session has been blocked
        if context.response.status_code == 403:
            context.log.warning(f"Session {session.id} may be blocked")
            session.mark_blocked()

    await crawler.run(start_urls)


asyncio.run(main())
327
```
328
329
### Custom Session Creation
330
331
```python
332
import asyncio
from datetime import timedelta  # required by max_age below; the snippet raised NameError without it

from crawlee.sessions import SessionPool, Session


def create_custom_session() -> Session:
    """Custom session factory with specific configuration."""
    session = Session(
        session_pool=None,  # Will be set by pool
        max_age=timedelta(minutes=30),
        max_usage_count=25,
        max_error_score=2.0
    )

    # Add custom cookies or configuration.
    # Sub-values are joined with '&' because ';' is not a legal character
    # inside a cookie value (it terminates the cookie pair on the wire).
    session.cookies.add_cookie({
        'name': 'preferences',
        'value': 'theme=dark&lang=en',
        'domain': '.example.com'
    })

    return session


async def main():
    """Build a pool that uses the custom factory and check the preconfigured cookie."""
    pool = SessionPool(
        max_pool_size=50,
        create_session_function=create_custom_session
    )

    await pool.initialize()

    # Get custom-configured session
    session = await pool.get_session()

    # Verify custom cookie was added
    prefs_cookie = session.cookies.get_cookie('preferences', domain='.example.com')
    print(f"Custom cookie: {prefs_cookie.value if prefs_cookie else 'Not found'}")

    await pool.teardown()


asyncio.run(main())
371
```
372
373
### Session State Persistence
374
375
```python
376
import asyncio
from crawlee.sessions import SessionPool


async def main():
    """Run a pool whose state is persisted between runs."""
    # Pool configured for state persistence
    pool = SessionPool(
        persist_state_key_value_store_id='session-store',
        persist_state_key='crawler-sessions',
        max_pool_size=100
    )

    # Initializing restores state saved by a previous run
    await pool.initialize()

    # Acquire sessions for crawling...
    first_session = await pool.get_session()
    second_session = await pool.get_session()

    print(f"Pool has {pool.get_session_count()} sessions")

    # Explicit save of the current pool state
    await pool.persist_state()

    # Tearing down persists state as well
    await pool.teardown()

    print("Session state saved for next run")


asyncio.run(main())
405
```
406
407
### Cookie Management
408
409
```python
410
import asyncio
from datetime import datetime, timedelta, timezone

from crawlee.sessions import SessionPool


async def main():
    """Add, query, and delete cookies on a single session."""
    pool = SessionPool()
    await pool.initialize()

    session = await pool.get_session()

    # CookieParam dict values are limited to str | int | float | bool | None,
    # so the expiry is passed as an integer timestamp rather than a datetime.
    # NOTE(review): assumes the library reads 'expires' as epoch seconds - confirm.
    expires_at = int((datetime.now(timezone.utc) + timedelta(hours=1)).timestamp())

    # Add various types of cookies
    session.cookies.add_cookie({
        'name': 'session_id',
        'value': 'abc123',
        'domain': 'example.com',
        'path': '/',
        'expires': expires_at,
        'secure': True,
        'http_only': True
    })

    session.cookies.add_cookie({
        'name': 'preferences',
        'value': 'theme=dark',
        'domain': '.example.com',
        'path': '/settings'
    })

    # Get cookies for specific URL
    url = 'https://example.com/settings/profile'
    cookies = session.cookies.get_cookies_for_url(url)

    print(f"Cookies for {url}:")
    for cookie in cookies:
        print(f"  {cookie.name}={cookie.value}")

    # Remove specific cookie
    session.cookies.delete_cookie('preferences', domain='.example.com')

    # Check remaining cookies
    print(f"Remaining cookies: {len(session.cookies)}")

    await pool.teardown()


asyncio.run(main())
455
```