tessl/pypi-internetarchive

A Python interface to archive.org for programmatic access to the Internet Archive's digital library

—

Pending

Overview

Eval results

Files

Session Management

Name: tessl/pypi-internetarchive
Author: tessl

Session management in the Internet Archive Python library provides persistent configuration, authentication, and HTTP adapter customization for efficient operations across multiple API calls.

Capabilities

Creating Sessions

Create new ArchiveSession objects with custom configuration, authentication, and HTTP settings.

def get_session(
    config=None,
    config_file=None,
    debug=False,
    http_adapter_kwargs=None,
):
    """
    Build a fresh ArchiveSession that keeps its configuration across tasks.

    Args:
        config (dict, optional): Session configuration. Recognized keys:
            - 's3': dict holding the 'access' and 'secret' keys used for
              IA-S3 authentication
            - 'general': dict holding the 'secure' and 'host' connection
              settings
            - 'cookies': dict of Archive.org cookies used for authentication
        config_file (str, optional): Location of the configuration file.
        debug (bool): Turn on debug logging for every session operation.
        http_adapter_kwargs (dict, optional): Keyword arguments for the
            HTTPAdapter:
            - 'max_retries': int or urllib3.Retry object
            - 'pool_connections': int, number of connection pools to cache
            - 'pool_maxsize': int, maximum connections in pool
            - 'socket_options': list of socket options

    Returns:
        ArchiveSession: The session object used for API interactions.
    """

class ArchiveSession:
    """
    Primary session class; derives from requests.Session and layers
    Archive.org-specific functionality on top of it.
    """

    def __init__(
        self,
        config=None,
        config_file="",
        debug=False,
        http_adapter_kwargs=None,
    ):
        """
        Set up the session from its configuration and HTTP settings.

        Args:
            config (dict, optional): Configuration dictionary.
            config_file (str): Location of the configuration file.
            debug (bool): Turn on debug logging.
            http_adapter_kwargs (dict, optional): Arguments for the HTTP
                adapter.
        """

Session Properties

Access session configuration, authentication details, and connection settings.

class ArchiveSession:
    """Properties exposing configuration, authentication, and connection settings."""

    @property
    def config(self):
        """dict: The full configuration dictionary."""

    @property
    def secure(self):
        """bool: True when HTTPS is used (default: True)."""

    @property
    def host(self):
        """str: Host name for Archive.org (default: 'archive.org')."""

    @property
    def user_email(self):
        """str: Email address of the logged-in user (when authenticated)."""

    @property
    def access_key(self):
        """str: Access key for IA-S3 (when configured)."""

    @property
    def secret_key(self):
        """str: Secret key for IA-S3 (when configured)."""

    @property
    def headers(self):
        """dict: HTTP headers applied to requests by default."""

    @property
    def protocol(self):
        """str: HTTP protocol in use ('https' or 'http')."""

Item and Metadata Operations

Retrieve items and metadata through the session object.

class ArchiveSession:
    """Item and metadata retrieval through the session object."""

    def get_item(
        self,
        identifier,
        item_metadata=None,
        request_kwargs=None,
    ):
        """
        Fetch an Item or Collection object.

        Args:
            identifier (str): Identifier of the Archive.org item.
            item_metadata (dict, optional): Item metadata that has already
                been fetched.
            request_kwargs (dict, optional): Extra request arguments.

        Returns:
            Item or Collection: An Item object, or a Collection when the
            item is a collection.
        """

    def get_metadata(self, identifier, request_kwargs=None):
        """
        Fetch an item's metadata from the Archive.org API.

        Args:
            identifier (str): Identifier of the Archive.org item.
            request_kwargs (dict, optional): Extra request arguments.

        Returns:
            dict: The item's metadata dictionary.
        """

Search Operations

Perform searches through the session with advanced options.

class ArchiveSession:
    """Search operations performed through the session."""

    def search_items(
        self,
        query,
        fields=None,
        sorts=None,
        params=None,
        full_text_search=False,
        dsl_fts=False,
        request_kwargs=None,
        max_retries=None,
    ):
        """
        Run an item search with advanced filtering and configuration.

        Args:
            query (str): Search query written in Archive.org syntax.
            fields (list, optional): Metadata fields to include in results.
            sorts (list, optional): Sort criteria (e.g., ['downloads desc']).
            params (dict, optional): Extra URL parameters.
            full_text_search (bool): Turn on full-text search across item
                content.
            dsl_fts (bool): Turn on DSL-based full-text search.
            request_kwargs (dict, optional): Extra request arguments.
            max_retries (int, optional): Upper bound on retry attempts.

        Returns:
            Search: Search object used to iterate over the results.
        """

Task Management

Submit and manage Archive.org catalog tasks through the session.

class ArchiveSession:
    """Submission and management of Archive.org catalog tasks."""

    def submit_task(
        self,
        identifier,
        cmd,
        comment="",
        priority=0,
        data=None,
        headers=None,
        reduced_priority=False,
        request_kwargs=None,
    ):
        """
        Send a task to the Archive.org catalog system.

        Args:
            identifier (str): Identifier of the item the task is for.
            cmd (str): Task command (e.g., 'derive.php', 'fixer.php').
            comment (str): Comment attached to the task.
            priority (int): Task priority from -5 to 10; larger values mean
                higher priority.
            data (dict, optional): Extra task data.
            headers (dict, optional): Extra HTTP headers.
            reduced_priority (bool): Submit to the reduced priority queue.
            request_kwargs (dict, optional): Extra request arguments.

        Returns:
            Response: The HTTP response to the task submission.
        """

    def get_tasks(self, identifier="", params=None, request_kwargs=None):
        """
        Fetch tasks from the Archive.org catalog.

        Args:
            identifier (str, optional): Restrict results to this item
                identifier.
            params (dict, optional): Extra query parameters:
                - 'catalog': bool, include queued/running tasks
                - 'history': bool, include completed tasks
                - 'summary': bool, return task count summary
            request_kwargs (dict, optional): Extra request arguments.

        Returns:
            set: A set of CatalogTask objects.
        """

    def get_my_catalog(self, params=None, request_kwargs=None):
        """
        Fetch the queued and running tasks of the current user.

        Args:
            params (dict, optional): Extra query parameters.
            request_kwargs (dict, optional): Extra request arguments.

        Returns:
            set: A set of CatalogTask objects belonging to the current user.
        """

    def get_task_log(self, task_id, request_kwargs=None):
        """
        Fetch the log output of one task.

        Args:
            task_id (int): ID of the task.
            request_kwargs (dict, optional): Extra request arguments.

        Returns:
            str: The content of the task log.
        """

    def iter_history(self, identifier=None, params=None, request_kwargs=None):
        """
        Iterate through completed tasks.

        Args:
            identifier (str, optional): Restrict results to this item
                identifier.
            params (dict, optional): Extra query parameters.
            request_kwargs (dict, optional): Extra request arguments.

        Yields:
            CatalogTask: Task objects for completed tasks.
        """

    def iter_catalog(self, identifier=None, params=None, request_kwargs=None):
        """
        Iterate through queued and running tasks.

        Args:
            identifier (str, optional): Restrict results to this item
                identifier.
            params (dict, optional): Extra query parameters.
            request_kwargs (dict, optional): Extra request arguments.

        Yields:
            CatalogTask: Task objects for queued/running tasks.
        """

    def get_tasks_summary(self, identifier="", params=None, request_kwargs=None):
        """
        Fetch a summary of task counts grouped by status.

        Args:
            identifier (str, optional): Restrict results to this item
                identifier.
            params (dict, optional): Extra query parameters.
            request_kwargs (dict, optional): Extra request arguments.

        Returns:
            dict: Task counts keyed by status (queued, running, finished,
            etc.).
        """

User Operations

Get information about the authenticated user.

class ArchiveSession:
    """Information about the authenticated user."""

    def whoami(self):
        """
        Look up the email address of the logged-in user.

        Returns:
            str: The user's email address; an empty string when not
            authenticated.
        """

HTTP Configuration

Configure HTTP adapters and logging for the session.

class ArchiveSession:
    """HTTP adapter and logging configuration for the session."""

    def mount_http_adapter(
        self,
        protocol=None,
        max_retries=None,
        status_forcelist=None,
        host=None,
    ):
        """
        Mount an HTTP adapter with custom retry and error handling.

        Args:
            protocol (str, optional): Protocol the adapter is mounted for
                ('http', 'https').
            max_retries (int or Retry, optional): Retry configuration.
            status_forcelist (list, optional): HTTP status codes that
                should be retried.
            host (str, optional): Specific host the adapter is mounted for.
        """

    def set_file_logger(self, log_level, path, logger_name="internetarchive"):
        """
        Set up file logging for the session.

        Args:
            log_level (int or str): Logging level (DEBUG, INFO, WARNING,
                ERROR).
            path (str): Location of the log file.
            logger_name (str): Name of the logger (default:
                'internetarchive').
        """

Usage Examples

Basic Session Creation

import internetarchive

# Session built with the default configuration
session = internetarchive.get_session()

# Session built from an explicit configuration, assembled section by section
s3_credentials = {'access': 'your-access-key', 'secret': 'your-secret-key'}
general_settings = {'secure': True, 'host': 'archive.org'}
config = {'s3': s3_credentials, 'general': general_settings}
session = internetarchive.get_session(config=config)

Session with HTTP Configuration

# Imported here so the example runs on its own: get_session() below is
# looked up on the internetarchive module.
import internetarchive
from urllib3.util.retry import Retry

# Configure HTTP adapter with custom retry logic: up to 5 attempts with a
# backoff factor of 1, retrying on the listed 5xx status codes.
http_adapter_kwargs = {
    'max_retries': Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504]
    ),
    'pool_connections': 10,
    'pool_maxsize': 20
}

# Create a session with debug logging enabled and the adapter settings above.
session = internetarchive.get_session(
    debug=True,
    http_adapter_kwargs=http_adapter_kwargs
)

Using Session for Multiple Operations

import internetarchive

# A single session is built up front and shared by every call below.
session = internetarchive.get_session()

# Item lookup, search, and task listing all go through that one session.
item = session.get_item('example-item')
search = session.search_items('collection:opensource')
tasks = session.get_tasks('example-item')

# A truthy user_email is taken to mean the session is authenticated.
if not session.user_email:
    print("Not authenticated")
else:
    print(f"Authenticated as: {session.user_email}")

Install with Tessl CLI