A Python interface to archive.org for programmatic access to the Internet Archive's digital library
npx @tessl/cli install tessl/pypi-internetarchive@5.5.0
A comprehensive Python interface to archive.org for programmatic access to the Internet Archive's vast digital library. This library enables developers to search, download, upload, and manage items in the Internet Archive through both a Python API and command-line tools.
pip install internetarchive
import internetarchive
Common imports for specific functionality:
from internetarchive import get_item, search_items, get_session
from internetarchive import Item, Search, ArchiveSession
import internetarchive
# Get an item from the Internet Archive
item = internetarchive.get_item('govlawgacode20071')
print(f"Item exists: {item.exists}")
print(f"Item title: {item.metadata.get('title')}")
# Download files from an item
item.download()
# Search for items
search = internetarchive.search_items('collection:nasa')
for result in search:
    print(f"Found: {result['identifier']} - {result.get('title', 'No title')}")
# Upload files to create or update an item
internetarchive.upload('my-item-identifier',
                       files=['local-file.txt'],
                       metadata={'title': 'My Item', 'creator': 'Your Name'})
The Internet Archive Python library follows a layered architecture:
ia command
This design enables both high-level convenience functions and low-level session-based access patterns, supporting everything from simple file downloads to complex metadata operations and bulk processing workflows.
Create and manage persistent sessions with configuration, authentication, and HTTP adapter customization for efficient bulk operations.
def get_session(config=None, config_file=None, debug=False, http_adapter_kwargs=None):
"""
Return a new ArchiveSession object for persistent configuration across tasks.
Args:
config (dict, optional): Configuration dictionary
config_file (str, optional): Path to configuration file
debug (bool): Enable debug logging
http_adapter_kwargs (dict, optional): HTTP adapter keyword arguments
Returns:
ArchiveSession: Session object for API interactions
"""
Access, download, upload, and manage Archive.org items with comprehensive metadata support and file filtering capabilities.
def get_item(identifier, config=None, config_file=None, archive_session=None, debug=False, http_adapter_kwargs=None, request_kwargs=None):
"""
Get an Item object by Archive.org identifier.
Args:
identifier (str): The globally unique Archive.org item identifier
config (dict, optional): Configuration dictionary
config_file (str, optional): Path to configuration file
archive_session (ArchiveSession, optional): Existing session object
debug (bool): Enable debug logging
http_adapter_kwargs (dict, optional): HTTP adapter kwargs
request_kwargs (dict, optional): Request kwargs
Returns:
Item: Item object for the specified identifier
"""
def upload(identifier, files, metadata=None, headers=None, access_key=None, secret_key=None, queue_derive=None, verbose=False, verify=False, checksum=False, delete=False, retries=None, retries_sleep=None, debug=False, validate_identifier=False, request_kwargs=None, **get_item_kwargs):
"""
Upload files to an Archive.org item (creates item if it doesn't exist).
Args:
identifier (str): Item identifier to upload to
files (list): List of file paths or file-like objects to upload
metadata (dict, optional): Item metadata
headers (dict, optional): HTTP headers
Various authentication and upload options...
Returns:
list: List of Request/Response objects from upload operations
"""
def download(identifier, files=None, formats=None, glob_pattern=None, dry_run=False, verbose=False, ignore_existing=False, checksum=False, checksum_archive=False, destdir=None, no_directory=False, retries=None, item_index=None, ignore_errors=False, on_the_fly=False, return_responses=False, no_change_timestamp=False, timeout=None, **get_item_kwargs):
"""
Download files from an Archive.org item with extensive filtering options.
Args:
identifier (str): Item identifier to download from
files (list, optional): Specific files to download
formats (list, optional): File formats to download
glob_pattern (str, optional): Glob pattern for file selection
Various download configuration options...
Returns:
list: List of Request/Response objects from download operations
"""
Search the Internet Archive with advanced query syntax, field selection, sorting, and full-text search capabilities.
def search_items(query, fields=None, sorts=None, params=None, full_text_search=False, dsl_fts=False, archive_session=None, config=None, config_file=None, http_adapter_kwargs=None, request_kwargs=None, max_retries=None):
"""
Search for items on Archive.org with advanced filtering options.
Args:
query (str): Search query string
fields (list, optional): Fields to return in results
sorts (list, optional): Sort criteria
params (dict, optional): Additional search parameters
full_text_search (bool): Enable full-text search
dsl_fts (bool): Enable DSL full-text search
Various session and request options...
Returns:
Search: Search object for iterating over results
"""
Access and manage individual files within Archive.org items, including download, deletion, and metadata access.
def get_files(identifier, files=None, formats=None, glob_pattern=None, exclude_pattern=None, on_the_fly=False, **get_item_kwargs):
"""
Get File objects from an item with optional filtering.
Args:
identifier (str): Item identifier
files (list, optional): Specific files to retrieve
formats (list, optional): File formats to filter by
glob_pattern (str, optional): Glob pattern for file selection
exclude_pattern (str, optional): Glob pattern for exclusion
on_the_fly (bool): Include on-the-fly files
Returns:
list: List of File objects
"""
def delete(identifier, files=None, formats=None, glob_pattern=None, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, **kwargs):
"""
Delete files from an Archive.org item.
Args:
identifier (str): Item identifier
files (list, optional): Specific files to delete
formats (list, optional): File formats to delete
glob_pattern (str, optional): Glob pattern for file selection
cascade_delete (bool): Delete derived files
Various authentication and request options...
Returns:
list: List of Request/Response objects from delete operations
"""
View and modify item metadata with support for appending, targeting specific metadata sections, and batch operations.
def modify_metadata(identifier, metadata, target=None, append=False, append_list=False, priority=0, access_key=None, secret_key=None, debug=False, request_kwargs=None, **get_item_kwargs):
"""
Modify metadata of an existing Archive.org item.
Args:
identifier (str): Item identifier
metadata (dict): Metadata changes to apply
target (str, optional): Target metadata section
append (bool): Append to existing metadata
append_list (bool): Append to metadata lists
priority (int): Task priority
Various authentication and request options...
Returns:
Request or Response: Metadata modification result
"""
Manage Archive.org catalog tasks including derive operations, item processing, and task monitoring.
def get_tasks(identifier="", params=None, config=None, config_file=None, archive_session=None, http_adapter_kwargs=None, request_kwargs=None):
"""
Get tasks from the Archive.org catalog system.
Args:
identifier (str, optional): Filter tasks by item identifier
params (dict, optional): Additional task query parameters
Various session and request options...
Returns:
set: Set of CatalogTask objects
"""
Configure the library with Archive.org credentials and retrieve user information.
def configure(username="", password="", config_file="", host="archive.org"):
"""
Configure internetarchive with Archive.org credentials.
Args:
username (str): Archive.org username
password (str): Archive.org password
config_file (str): Path to config file
host (str): Archive.org host
Returns:
str: Path to configuration file
"""
def get_username(access_key, secret_key):
"""
Get Archive.org username from IA-S3 key pair.
Args:
access_key (str): IA-S3 access key
secret_key (str): IA-S3 secret key
Returns:
str: Archive.org username
"""
def get_user_info(access_key, secret_key):
"""
Get detailed user information from IA-S3 key pair.
Args:
access_key (str): IA-S3 access key
secret_key (str): IA-S3 secret key
Returns:
dict: User information dictionary
"""
Configuration and Authentication
Administrative functions for managing Archive.org user accounts. Requires administrative privileges.
Note: The Account class is not part of the main public API but can be imported directly from internetarchive.account.
# Import required for Account class
from internetarchive.account import Account
class Account:
"""
Administrative interface for managing Archive.org user accounts.
Note: Requires administrative privileges.
"""
@classmethod
def from_account_lookup(cls, identifier_type: str, identifier: str, session=None):
"""
Factory method to get Account by identifier type and value.
Args:
identifier_type (str): Type of identifier ('email', 'screenname', 'itemname')
identifier (str): The identifier value (e.g., 'user@example.com')
session (ArchiveSession, optional): Session object to use
Returns:
Account: Account object with user information
Raises:
AccountAPIError: If account lookup fails or access denied
"""
def lock(self, comment: str):
"""Lock the account with a comment."""
def unlock(self, comment: str):
"""Unlock the account with a comment."""
def to_dict(self):
"""Convert account data to dictionary."""
Comprehensive command-line tools accessible through the ia command for all major Archive.org operations.
# CLI Commands (accessed via command line):
# ia configure - Configure credentials
# ia upload - Upload files to items
# ia download - Download files from items
# ia delete - Delete files from items
# ia metadata - View/modify item metadata
# ia search - Search Archive.org
# ia list - List item files
# ia tasks - Manage catalog tasks
# ia copy - Copy files between items
# ia move - Move files between items
# ia account - Account management
# ia reviews - Manage item reviews
# ia flag - Flag items for review
class ArchiveSession:
"""Main session class for Internet Archive operations."""
class Item:
"""Represents an Archive.org item."""
class Collection:
"""Represents an Archive.org collection (extends Item)."""
class File:
"""Represents a file within an Archive.org item."""
class Search:
"""Represents a search query and results."""
class Catalog:
"""Interface to Archive.org catalog/tasks system."""
class CatalogTask:
"""Represents a catalog task."""
class Account:
"""Account management interface (requires admin privileges)."""
# Package metadata
__version__: str
"""Current version of the internetarchive package (5.5.0)."""
# Exceptions
class AuthenticationError(Exception):
"""Authentication failed."""
class ItemLocateError(Exception):
"""Item cannot be located (dark or non-existent)."""
class InvalidChecksumError(Exception):
"""File corrupt, checksums don't match."""
class AccountAPIError(Exception):
"""Account API-related errors."""