A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass
Session and cookie management with rotation capabilities for maintaining state across requests and avoiding detection. Sessions provide persistent state management, cookie handling, and user agent rotation for more human-like crawling behavior.
Individual session object managing cookies, user agent, and request state for a single logical browsing session.
class Session:
def __init__(
self,
session_pool: SessionPool,
*,
id: str | None = None,
max_age: timedelta = timedelta(hours=1),
max_usage_count: int = 50,
max_error_score: float = 3.0
): ...
@property
def id(self) -> str:
"""Unique session identifier."""
@property
def cookies(self) -> SessionCookies:
"""Cookie jar for this session."""
@property
def user_agent(self) -> str:
"""User agent string for this session."""
@property
def usage_count(self) -> int:
"""Number of requests made with this session."""
@property
def error_score(self) -> float:
"""Accumulated error score (higher = more problematic)."""
@property
def is_blocked(self) -> bool:
"""True if session appears to be blocked."""
@property
def is_expired(self) -> bool:
"""True if session has exceeded age or usage limits."""
def mark_blocked(self) -> None:
"""Mark session as blocked/detected."""
def retire(self) -> None:
"""Remove session from pool and mark as retired."""
def get_state(self) -> dict[str, any]:
"""Get session state for persistence."""
def set_state(self, state: dict[str, any]) -> None:
"""Restore session state from persistence."""Pool managing multiple sessions with automatic rotation, creation, and cleanup of sessions to maintain anonymity.
class SessionPool:
def __init__(
self,
*,
max_pool_size: int = 1000,
create_session_function: Callable[[], Session] | None = None,
persist_state_key: str | None = None,
persist_state_key_value_store_id: str | None = None
): ...
async def get_session(self, session_id: str | None = None) -> Session:
"""
Get session from pool, creating new one if needed.
Args:
session_id: Specific session ID to retrieve
Returns:
Session object
"""
async def retire_session(self, session: Session) -> None:
"""Remove session from pool."""
def get_session_count(self) -> int:
"""Get number of sessions in pool."""
def get_state(self) -> dict[str, any]:
"""Get pool state for persistence."""
async def persist_state(self) -> None:
"""Save pool state to storage."""
async def initialize(self) -> None:
"""Initialize pool and restore state if configured."""
async def teardown(self) -> None:
"""Clean up pool resources."""
@property
def max_pool_size(self) -> int: ...Cookie management within sessions supporting standard HTTP cookie operations with domain and path handling.
class SessionCookies:
def __init__(self): ...
def add_cookie(
self,
cookie: CookieParam,
*,
url: str | None = None
) -> None:
"""
Add cookie to session.
Args:
cookie: Cookie data
url: URL context for cookie domain/path
"""
def get_cookie(
self,
name: str,
domain: str | None = None,
path: str | None = None
) -> Cookie | None:
"""
Get cookie by name and optional domain/path.
Args:
name: Cookie name
domain: Cookie domain
path: Cookie path
Returns:
Cookie object or None if not found
"""
def delete_cookie(
self,
name: str,
domain: str | None = None,
path: str | None = None
) -> None:
"""Delete cookie by name."""
def clear(self) -> None:
"""Remove all cookies."""
def get_cookies_for_url(self, url: str) -> list[Cookie]:
"""Get all cookies applicable to given URL."""
def to_dict(self) -> dict[str, any]:
"""Serialize cookies to dictionary."""
def from_dict(self, data: dict[str, any]) -> None:
"""Restore cookies from dictionary."""
def __len__(self) -> int: ...
def __iter__(self) -> Iterator[Cookie]: ...Type definitions for cookie parameters and cookie objects.
CookieParam = Union[
dict[str, str | int | float | bool | None],
Cookie
]class Cookie:
def __init__(
self,
name: str,
value: str,
*,
domain: str | None = None,
path: str = "/",
expires: datetime | None = None,
max_age: int | None = None,
secure: bool = False,
http_only: bool = False,
same_site: Literal["Strict", "Lax", "None"] | None = None
): ...
@property
def name(self) -> str: ...
@property
def value(self) -> str: ...
@property
def domain(self) -> str | None: ...
@property
def path(self) -> str: ...
@property
def expires(self) -> datetime | None: ...
@property
def secure(self) -> bool: ...
@property
def http_only(self) -> bool: ...
def is_expired(self) -> bool:
"""Check if cookie has expired."""
def matches_url(self, url: str) -> bool:
"""Check if cookie should be sent with given URL."""import asyncio
import asyncio

from crawlee.sessions import SessionPool, Session

async def main():
    """Demonstrate basic session pool usage: create, inspect, and retire."""
    # Create session pool
    pool = SessionPool(max_pool_size=100)
    await pool.initialize()

    # Get session from pool
    session = await pool.get_session()
    print(f"Session ID: {session.id}")
    print(f"User Agent: {session.user_agent}")
    print(f"Usage count: {session.usage_count}")

    # Add cookies to session
    session.cookies.add_cookie({
        'name': 'sessionid',
        'value': 'abc123',
        'domain': 'example.com'
    })

    # Use session multiple times
    print(f"Cookies for example.com: {len(session.cookies.get_cookies_for_url('https://example.com'))}")

    # Mark session as blocked if detected
    if should_retire_session():
        session.mark_blocked()
        await pool.retire_session(session)

    await pool.teardown()

def should_retire_session() -> bool:
    # Your logic to detect if session is blocked
    return False

asyncio.run(main())

import asyncio
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.sessions import SessionPool

async def main():
    """Use a session pool with an HttpCrawler, retiring blocked sessions."""
    # Configure session pool
    session_pool = SessionPool(
        max_pool_size=10,
        persist_state_key='my-crawler-sessions'
    )

    # Create crawler with session pool
    crawler = HttpCrawler(
        session_pool=session_pool,
        use_session_pool=True
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        session = context.session
        context.log.info(f"Using session: {session.id}")
        context.log.info(f"Session usage: {session.usage_count}")

        # Add authentication cookie if needed
        if not session.cookies.get_cookie('auth_token'):
            session.cookies.add_cookie({
                'name': 'auth_token',
                'value': 'your_auth_token_here',
                'domain': 'example.com'
            })

        # Extract data
        data = {
            'url': context.request.url,
            'session_id': session.id,
            'status': context.response.status_code
        }
        await context.push_data(data)

        # Mark session as blocked if we get blocked
        if context.response.status_code == 403:
            context.log.warning(f"Session {session.id} may be blocked")
            session.mark_blocked()

    await crawler.run(['https://example.com/page1', 'https://example.com/page2'])

asyncio.run(main())

import asyncio
import asyncio
# NOTE: the original example used timedelta without importing it.
from datetime import timedelta

from crawlee.sessions import SessionPool, Session

def create_custom_session() -> Session:
    """Custom session factory with specific configuration."""
    session = Session(
        session_pool=None,  # Will be set by pool
        max_age=timedelta(minutes=30),
        max_usage_count=25,
        max_error_score=2.0
    )
    # Add custom cookies or configuration
    session.cookies.add_cookie({
        'name': 'preferences',
        'value': 'theme=dark;lang=en',
        'domain': '.example.com'
    })
    return session

async def main():
    """Create a pool that builds sessions through the custom factory."""
    pool = SessionPool(
        max_pool_size=50,
        create_session_function=create_custom_session
    )
    await pool.initialize()

    # Get custom-configured session
    session = await pool.get_session()

    # Verify custom cookie was added
    prefs_cookie = session.cookies.get_cookie('preferences', domain='.example.com')
    print(f"Custom cookie: {prefs_cookie.value if prefs_cookie else 'Not found'}")

    await pool.teardown()

asyncio.run(main())

import asyncio
import asyncio

from crawlee.sessions import SessionPool

async def main():
    """Persist pool state across runs via a key-value store."""
    # Create pool with state persistence
    pool = SessionPool(
        max_pool_size=100,
        persist_state_key='crawler-sessions',
        persist_state_key_value_store_id='session-store'
    )

    # Initialize will restore previous session state
    await pool.initialize()

    # Use sessions for crawling...
    session1 = await pool.get_session()
    session2 = await pool.get_session()
    print(f"Pool has {pool.get_session_count()} sessions")

    # Manually persist state
    await pool.persist_state()

    # Teardown will also persist state
    await pool.teardown()
    print("Session state saved for next run")

asyncio.run(main())

import asyncio
import asyncio

from crawlee.sessions import SessionPool
from datetime import datetime, timedelta

async def main():
    """Exercise the session cookie jar: add, query by URL, and delete."""
    pool = SessionPool()
    await pool.initialize()
    session = await pool.get_session()

    # Add various types of cookies
    session.cookies.add_cookie({
        'name': 'session_id',
        'value': 'abc123',
        'domain': 'example.com',
        'path': '/',
        'expires': datetime.now() + timedelta(hours=1),
        'secure': True,
        'http_only': True
    })
    session.cookies.add_cookie({
        'name': 'preferences',
        'value': 'theme=dark',
        'domain': '.example.com',
        'path': '/settings'
    })

    # Get cookies for specific URL
    url = 'https://example.com/settings/profile'
    cookies = session.cookies.get_cookies_for_url(url)
    print(f"Cookies for {url}:")
    for cookie in cookies:
        print(f"  {cookie.name}={cookie.value}")

    # Remove specific cookie
    session.cookies.delete_cookie('preferences', domain='.example.com')

    # Check remaining cookies
    print(f"Remaining cookies: {len(session.cookies)}")

    await pool.teardown()

asyncio.run(main())

# Install with Tessl CLI
npx tessl i tessl/pypi-crawlee