A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass
Session and cookie management with rotation capabilities for maintaining state across requests and avoiding detection. Sessions provide persistent state management, cookie handling, and user agent rotation for more human-like crawling behavior.
Individual session object managing cookies, user agent, and request state for a single logical browsing session.
class Session:
def __init__(
self,
session_pool: SessionPool,
*,
id: str | None = None,
max_age: timedelta = timedelta(hours=1),
max_usage_count: int = 50,
max_error_score: float = 3.0
): ...
@property
def id(self) -> str:
"""Unique session identifier."""
@property
def cookies(self) -> SessionCookies:
"""Cookie jar for this session."""
@property
def user_agent(self) -> str:
"""User agent string for this session."""
@property
def usage_count(self) -> int:
"""Number of requests made with this session."""
@property
def error_score(self) -> float:
"""Accumulated error score (higher = more problematic)."""
@property
def is_blocked(self) -> bool:
"""True if session appears to be blocked."""
@property
def is_expired(self) -> bool:
"""True if session has exceeded age or usage limits."""
def mark_blocked(self) -> None:
"""Mark session as blocked/detected."""
def retire(self) -> None:
"""Remove session from pool and mark as retired."""
def get_state(self) -> dict[str, any]:
"""Get session state for persistence."""
def set_state(self, state: dict[str, any]) -> None:
"""Restore session state from persistence."""Pool managing multiple sessions with automatic rotation, creation, and cleanup of sessions to maintain anonymity.
class SessionPool:
def __init__(
self,
*,
max_pool_size: int = 1000,
create_session_function: Callable[[], Session] | None = None,
persist_state_key: str | None = None,
persist_state_key_value_store_id: str | None = None
): ...
async def get_session(self, session_id: str | None = None) -> Session:
"""
Get session from pool, creating new one if needed.
Args:
session_id: Specific session ID to retrieve
Returns:
Session object
"""
async def retire_session(self, session: Session) -> None:
"""Remove session from pool."""
def get_session_count(self) -> int:
"""Get number of sessions in pool."""
def get_state(self) -> dict[str, any]:
"""Get pool state for persistence."""
async def persist_state(self) -> None:
"""Save pool state to storage."""
async def initialize(self) -> None:
"""Initialize pool and restore state if configured."""
async def teardown(self) -> None:
"""Clean up pool resources."""
@property
def max_pool_size(self) -> int: ...Cookie management within sessions supporting standard HTTP cookie operations with domain and path handling.
class SessionCookies:
def __init__(self): ...
def add_cookie(
self,
cookie: CookieParam,
*,
url: str | None = None
) -> None:
"""
Add cookie to session.
Args:
cookie: Cookie data
url: URL context for cookie domain/path
"""
def get_cookie(
self,
name: str,
domain: str | None = None,
path: str | None = None
) -> Cookie | None:
"""
Get cookie by name and optional domain/path.
Args:
name: Cookie name
domain: Cookie domain
path: Cookie path
Returns:
Cookie object or None if not found
"""
def delete_cookie(
self,
name: str,
domain: str | None = None,
path: str | None = None
) -> None:
"""Delete cookie by name."""
def clear(self) -> None:
"""Remove all cookies."""
def get_cookies_for_url(self, url: str) -> list[Cookie]:
"""Get all cookies applicable to given URL."""
def to_dict(self) -> dict[str, any]:
"""Serialize cookies to dictionary."""
def from_dict(self, data: dict[str, any]) -> None:
"""Restore cookies from dictionary."""
def __len__(self) -> int: ...
def __iter__(self) -> Iterator[Cookie]: ...Type definitions for cookie parameters and cookie objects.
CookieParam = Union[
dict[str, str | int | float | bool | None],
Cookie
]class Cookie:
def __init__(
self,
name: str,
value: str,
*,
domain: str | None = None,
path: str = "/",
expires: datetime | None = None,
max_age: int | None = None,
secure: bool = False,
http_only: bool = False,
same_site: Literal["Strict", "Lax", "None"] | None = None
): ...
@property
def name(self) -> str: ...
@property
def value(self) -> str: ...
@property
def domain(self) -> str | None: ...
@property
def path(self) -> str: ...
@property
def expires(self) -> datetime | None: ...
@property
def secure(self) -> bool: ...
@property
def http_only(self) -> bool: ...
def is_expired(self) -> bool:
"""Check if cookie has expired."""
def matches_url(self, url: str) -> bool:
"""Check if cookie should be sent with given URL."""import asyncio
import asyncio

from crawlee.sessions import SessionPool, Session

async def main():
    """Demonstrate basic session pool usage: create, inspect, and retire."""
    # Create session pool
    pool = SessionPool(max_pool_size=100)
    await pool.initialize()

    # Get session from pool
    session = await pool.get_session()
    print(f"Session ID: {session.id}")
    print(f"User Agent: {session.user_agent}")
    print(f"Usage count: {session.usage_count}")

    # Add cookies to session
    session.cookies.add_cookie({
        'name': 'sessionid',
        'value': 'abc123',
        'domain': 'example.com'
    })

    # Use session multiple times
    print(f"Cookies for example.com: {len(session.cookies.get_cookies_for_url('https://example.com'))}")

    # Mark session as blocked if detected
    if should_retire_session():
        session.mark_blocked()
        await pool.retire_session(session)

    await pool.teardown()

def should_retire_session() -> bool:
    # Your logic to detect if session is blocked
    return False

asyncio.run(main())

import asyncio
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.sessions import SessionPool

async def main():
    """Use a session pool with an HttpCrawler, retiring blocked sessions."""
    # Configure session pool
    session_pool = SessionPool(
        max_pool_size=10,
        persist_state_key='my-crawler-sessions'
    )

    # Create crawler with session pool
    crawler = HttpCrawler(
        session_pool=session_pool,
        use_session_pool=True
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        session = context.session
        context.log.info(f"Using session: {session.id}")
        context.log.info(f"Session usage: {session.usage_count}")

        # Add authentication cookie if needed
        if not session.cookies.get_cookie('auth_token'):
            session.cookies.add_cookie({
                'name': 'auth_token',
                'value': 'your_auth_token_here',
                'domain': 'example.com'
            })

        # Extract data
        data = {
            'url': context.request.url,
            'session_id': session.id,
            'status': context.response.status_code
        }
        await context.push_data(data)

        # Mark session as blocked if we get blocked
        if context.response.status_code == 403:
            context.log.warning(f"Session {session.id} may be blocked")
            session.mark_blocked()

    await crawler.run(['https://example.com/page1', 'https://example.com/page2'])

asyncio.run(main())

import asyncio
import asyncio
# NOTE: the original example used timedelta without importing it.
from datetime import timedelta

from crawlee.sessions import SessionPool, Session

def create_custom_session() -> Session:
    """Custom session factory with specific configuration."""
    session = Session(
        session_pool=None,  # Will be set by pool
        max_age=timedelta(minutes=30),
        max_usage_count=25,
        max_error_score=2.0
    )
    # Add custom cookies or configuration
    session.cookies.add_cookie({
        'name': 'preferences',
        'value': 'theme=dark;lang=en',
        'domain': '.example.com'
    })
    return session

async def main():
    """Create a pool that builds sessions through the custom factory."""
    pool = SessionPool(
        max_pool_size=50,
        create_session_function=create_custom_session
    )
    await pool.initialize()

    # Get custom-configured session
    session = await pool.get_session()

    # Verify custom cookie was added
    prefs_cookie = session.cookies.get_cookie('preferences', domain='.example.com')
    print(f"Custom cookie: {prefs_cookie.value if prefs_cookie else 'Not found'}")

    await pool.teardown()

asyncio.run(main())

import asyncio
import asyncio

from crawlee.sessions import SessionPool

async def main():
    """Persist pool state across runs via a key-value store."""
    # Create pool with state persistence
    pool = SessionPool(
        max_pool_size=100,
        persist_state_key='crawler-sessions',
        persist_state_key_value_store_id='session-store'
    )

    # Initialize will restore previous session state
    await pool.initialize()

    # Use sessions for crawling...
    session1 = await pool.get_session()
    session2 = await pool.get_session()
    print(f"Pool has {pool.get_session_count()} sessions")

    # Manually persist state
    await pool.persist_state()

    # Teardown will also persist state
    await pool.teardown()
    print("Session state saved for next run")

asyncio.run(main())

import asyncio
import asyncio

from crawlee.sessions import SessionPool
from datetime import datetime, timedelta

async def main():
    """Exercise the session cookie jar: add, query by URL, and delete."""
    pool = SessionPool()
    await pool.initialize()
    session = await pool.get_session()

    # Add various types of cookies
    session.cookies.add_cookie({
        'name': 'session_id',
        'value': 'abc123',
        'domain': 'example.com',
        'path': '/',
        'expires': datetime.now() + timedelta(hours=1),
        'secure': True,
        'http_only': True
    })
    session.cookies.add_cookie({
        'name': 'preferences',
        'value': 'theme=dark',
        'domain': '.example.com',
        'path': '/settings'
    })

    # Get cookies for specific URL
    url = 'https://example.com/settings/profile'
    cookies = session.cookies.get_cookies_for_url(url)
    print(f"Cookies for {url}:")
    for cookie in cookies:
        print(f"  {cookie.name}={cookie.value}")

    # Remove specific cookie
    session.cookies.delete_cookie('preferences', domain='.example.com')

    # Check remaining cookies
    print(f"Remaining cookies: {len(session.cookies)}")

    await pool.teardown()

asyncio.run(main())

# Install with Tessl CLI
npx tessl i tessl/pypi-crawlee