A Python interface to archive.org for programmatic access to the Internet Archive's digital library
—
Session management in the Internet Archive Python library provides persistent configuration, authentication, and HTTP adapter customization for efficient operations across multiple API calls.
Create new ArchiveSession objects with custom configuration, authentication, and HTTP settings.
def get_session(config=None, config_file=None, debug=False, http_adapter_kwargs=None):
    """Return a new ArchiveSession object for persistent configuration across tasks.

    Args:
        config (dict, optional): Configuration dictionary. Defaults to None.
            Recognized keys:

            - 's3': dict with 'access' and 'secret' keys for IA-S3 authentication
            - 'general': dict with 'secure', 'host' for connection settings
            - 'cookies': dict with Archive.org cookies for authentication
        config_file (str, optional): Path to configuration file.
            Defaults to None.
        debug (bool): Enable debug logging for all session operations.
            Defaults to False.
        http_adapter_kwargs (dict, optional): Keyword arguments for
            HTTPAdapter. Defaults to None. Recognized keys:

            - 'max_retries': int or urllib3.Retry object
            - 'pool_connections': int, number of connection pools to cache
            - 'pool_maxsize': int, maximum connections in pool
            - 'socket_options': list of socket options

    Returns:
        ArchiveSession: Session object for API interactions.
    """
class ArchiveSession:
"""
Main session class inheriting from requests.Session with Archive.org-specific functionality.
"""
def __init__(self, config=None, config_file="", debug=False, http_adapter_kwargs=None):
"""
Initialize ArchiveSession with configuration and HTTP settings.
Args:
config (dict, optional): Configuration dictionary
config_file (str): Path to configuration file
debug (bool): Enable debug logging
http_adapter_kwargs (dict, optional): HTTP adapter arguments
"""

Access session configuration, authentication details, and connection settings.
class ArchiveSession:
@property
def config(self):
    """dict: Complete configuration dictionary."""

@property
def secure(self):
    """bool: Whether to use HTTPS (default: True)."""

@property
def host(self):
    """str: Archive.org host (default: 'archive.org')."""

@property
def user_email(self):
    """str: Email of logged-in user (if authenticated)."""

@property
def access_key(self):
    """str: IA-S3 access key (if configured)."""

@property
def secret_key(self):
    """str: IA-S3 secret key (if configured)."""

@property
def headers(self):
    """dict: Default HTTP headers for requests."""
@property
def protocol(self):
"""str: HTTP protocol ('https' or 'http')."""

Retrieve items and metadata through the session object.
class ArchiveSession:
def get_item(self, identifier, item_metadata=None, request_kwargs=None):
    """Get an Item or Collection object.

    Args:
        identifier (str): Archive.org item identifier.
        item_metadata (dict, optional): Pre-fetched item metadata.
            Defaults to None.
        request_kwargs (dict, optional): Additional request arguments.
            Defaults to None.

    Returns:
        Item or Collection: Item object (Collection if item is a collection).
    """
def get_metadata(self, identifier, request_kwargs=None):
"""
Get item metadata from Archive.org API.
Args:
identifier (str): Archive.org item identifier
request_kwargs (dict, optional): Additional request arguments
Returns:
dict: Item metadata dictionary
"""

Perform searches through the session with advanced options.
class ArchiveSession:
def search_items(self, query, fields=None, sorts=None, params=None, full_text_search=False, dsl_fts=False, request_kwargs=None, max_retries=None):
"""
Search for items with advanced filtering and configuration.
Args:
query (str): Search query using Archive.org syntax
fields (list, optional): Metadata fields to return
sorts (list, optional): Sort criteria (e.g., ['downloads desc'])
params (dict, optional): Additional URL parameters
full_text_search (bool): Enable full-text search across item content
dsl_fts (bool): Enable DSL-based full-text search
request_kwargs (dict, optional): Additional request arguments
max_retries (int, optional): Maximum retry attempts
Returns:
Search: Search object for iterating over results
"""

Submit and manage Archive.org catalog tasks through the session.
class ArchiveSession:
def submit_task(self, identifier, cmd, comment="", priority=0, data=None, headers=None, reduced_priority=False, request_kwargs=None):
    """Submit a task to Archive.org catalog system.

    Args:
        identifier (str): Item identifier for the task.
        cmd (str): Task command (e.g., 'derive.php', 'fixer.php').
        comment (str): Task comment. Defaults to "".
        priority (int): Task priority (-5 to 10, higher is more priority).
            Defaults to 0.
        data (dict, optional): Additional task data. Defaults to None.
        headers (dict, optional): Additional HTTP headers. Defaults to None.
        reduced_priority (bool): Use reduced priority queue.
            Defaults to False.
        request_kwargs (dict, optional): Additional request arguments.
            Defaults to None.

    Returns:
        Response: HTTP response from task submission.
    """
def get_tasks(self, identifier="", params=None, request_kwargs=None):
    """Get tasks from Archive.org catalog.

    Args:
        identifier (str, optional): Filter by item identifier.
            Defaults to "".
        params (dict, optional): Additional query parameters.
            Defaults to None. Recognized keys:

            - 'catalog': bool, include queued/running tasks
            - 'history': bool, include completed tasks
            - 'summary': bool, return task count summary
        request_kwargs (dict, optional): Additional request arguments.
            Defaults to None.

    Returns:
        set: Set of CatalogTask objects.
    """
def get_my_catalog(self, params=None, request_kwargs=None):
    """Get current user's queued and running tasks.

    Args:
        params (dict, optional): Additional query parameters.
            Defaults to None.
        request_kwargs (dict, optional): Additional request arguments.
            Defaults to None.

    Returns:
        set: Set of CatalogTask objects for current user.
    """
def get_task_log(self, task_id, request_kwargs=None):
    """Get log output for a specific task.

    Args:
        task_id (int): Task ID.
        request_kwargs (dict, optional): Additional request arguments.
            Defaults to None.

    Returns:
        str: Task log content.
    """
def iter_history(self, identifier=None, params=None, request_kwargs=None):
    """Iterate over completed tasks.

    Args:
        identifier (str, optional): Filter by item identifier.
            Defaults to None.
        params (dict, optional): Additional query parameters.
            Defaults to None.
        request_kwargs (dict, optional): Additional request arguments.
            Defaults to None.

    Yields:
        CatalogTask: Completed task objects.
    """
def iter_catalog(self, identifier=None, params=None, request_kwargs=None):
    """Iterate over queued and running tasks.

    Args:
        identifier (str, optional): Filter by item identifier.
            Defaults to None.
        params (dict, optional): Additional query parameters.
            Defaults to None.
        request_kwargs (dict, optional): Additional request arguments.
            Defaults to None.

    Yields:
        CatalogTask: Queued/running task objects.
    """
def get_tasks_summary(self, identifier="", params=None, request_kwargs=None):
"""
Get task count summary by status.
Args:
identifier (str, optional): Filter by item identifier
params (dict, optional): Additional query parameters
request_kwargs (dict, optional): Additional request arguments
Returns:
dict: Task counts by status (queued, running, finished, etc.)
"""

Get information about the authenticated user.
class ArchiveSession:
def whoami(self):
"""
Get the email address of the logged-in user.
Returns:
str: User email address, or empty string if not authenticated
"""

Configure HTTP adapters and logging for the session.
class ArchiveSession:
def mount_http_adapter(self, protocol=None, max_retries=None, status_forcelist=None, host=None):
    """Mount HTTP adapter with custom retry and error handling.

    Args:
        protocol (str, optional): Protocol to mount for ('http', 'https').
            Defaults to None.
        max_retries (int or Retry, optional): Retry configuration.
            Defaults to None.
        status_forcelist (list, optional): HTTP status codes to retry.
            Defaults to None.
        host (str, optional): Specific host to mount adapter for.
            Defaults to None.
    """
def set_file_logger(self, log_level, path, logger_name="internetarchive"):
"""
Configure file logging for the session.
Args:
log_level (int or str): Logging level (DEBUG, INFO, WARNING, ERROR)
path (str): Path to log file
logger_name (str): Logger name (default: 'internetarchive')
"""

import internetarchive
# Create session with default configuration
session = internetarchive.get_session()

# Create session with custom configuration
config = {
    # IA-S3 credentials (placeholder values; substitute your own keys)
    's3': {
        'access': 'your-access-key',
        'secret': 'your-secret-key'
    },
    # Connection settings
    'general': {
        'secure': True,
        'host': 'archive.org'
    }
}
session = internetarchive.get_session(config=config)

from urllib3.util.retry import Retry
# Configure HTTP adapter with custom retry logic
http_adapter_kwargs = {
    # Retry policy: at most 5 retries, applied to the listed 5xx statuses
    'max_retries': Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504]
    ),
    # Connection pool sizing
    'pool_connections': 10,
    'pool_maxsize': 20
}
session = internetarchive.get_session(
debug=True,
http_adapter_kwargs=http_adapter_kwargs
)

import internetarchive
# Create session once
session = internetarchive.get_session()

# Use session for multiple operations
item = session.get_item('example-item')  # Item (or Collection) object
search = session.search_items('collection:opensource')  # Search object
tasks = session.get_tasks('example-item')  # set of CatalogTask objects
# Check authentication status
if session.user_email:
print(f"Authenticated as: {session.user_email}")
else:
print("Not authenticated")

Install with Tessl CLI
npx tessl i tessl/pypi-internetarchive