A Python interface to archive.org for programmatic access to the Internet Archive's digital library
—
Item operations provide comprehensive access to Archive.org items, including retrieval, download, upload, and management of items and their files.
Get Item objects to access metadata and files and to perform operations on Archive.org items.
def get_item(identifier, config=None, config_file=None, archive_session=None, debug=False, http_adapter_kwargs=None, request_kwargs=None):
"""
Get an Item object by Archive.org identifier.
Args:
identifier (str): The globally unique Archive.org item identifier
config (dict, optional): Configuration dictionary for session creation
config_file (str, optional): Path to configuration file
archive_session (ArchiveSession, optional): Existing session object to use
debug (bool): Enable debug logging
http_adapter_kwargs (dict, optional): HTTP adapter keyword arguments
request_kwargs (dict, optional): Additional request arguments
Returns:
Item: Item object for the specified identifier (or Collection if item is a collection)
Raises:
ItemLocateError: If item cannot be located or is dark
"""
class Item:
"""
Represents an Archive.org item with metadata, files, and operations.
"""
def __init__(self, archive_session, identifier, item_metadata=None):
"""
Initialize Item object.
Args:
archive_session (ArchiveSession): Session object
identifier (str): Item identifier
item_metadata (dict, optional): Pre-fetched metadata
"""

Access item metadata, files, and status information.
class Item:
@property
def identifier(self):
"""str: Item identifier."""
@property
def metadata(self):
"""dict: Complete item metadata."""
@property
def files(self):
"""list: List of file metadata dictionaries."""
@property
def exists(self):
"""bool: Whether the item exists and is accessible."""
@property
def session(self):
"""ArchiveSession: Session object used by this item."""
@property
def urls(self):
"""URLs: Object providing access to various item URLs."""
@property
def collection(self):
"""list: Collections this item belongs to."""
@property
def wikilink(self):
"""str: MediaWiki-formatted link (if item has title)."""
# Archive.org specific properties
@property
def created(self):
"""int: Unix timestamp of item creation."""
@property
def d1(self):
"""str: Primary server."""
@property
def d2(self):
"""str: Secondary server."""
@property
def dir(self):
"""str: Item directory path."""
@property
def files_count(self):
"""int: Number of files in item."""
@property
def item_size(self):
"""int: Total size of all files in bytes."""
@property
def reviews(self):
"""list: Item reviews."""
@property
def server(self):
"""str: Item server."""
@property
def uniq(self):
"""int: Unique item number."""
@property
def updated(self):
"""int: Unix timestamp of last update."""
@property
def tasks(self):
"""int: Number of tasks associated with item."""
@property
def is_dark(self):
"""bool: Whether item is dark (restricted access)."""

Refresh item data and check identifier availability.
class Item:
def refresh(self, item_metadata=None, **kwargs):
"""
Refresh item metadata from Archive.org.
Args:
item_metadata (dict, optional): Use specific metadata instead of fetching
**kwargs: Additional arguments passed to get_metadata
"""
def identifier_available(self):
"""
Check if the item identifier is available for use.
Returns:
bool: True if identifier is available, False if taken
"""

Access individual files and collections of files within the item.
class Item:
def get_file(self, file_name):
"""
Get a File object for a specific file in the item.
Args:
file_name (str): Name of the file
Returns:
File: File object, or None if file doesn't exist
"""
def get_files(self, files=None, formats=None, glob_pattern=None, exclude_pattern=None, on_the_fly=False):
"""
Get File objects with optional filtering.
Args:
files (list, optional): Specific file names to retrieve
formats (list, optional): File formats to include (e.g., ['pdf', 'epub'])
glob_pattern (str, optional): Glob pattern for file selection
exclude_pattern (str, optional): Glob pattern for exclusion
on_the_fly (bool): Include on-the-fly derived files
Yields:
File: File objects matching the criteria
"""

Upload files to items, creating new items or updating existing ones.
def upload(identifier, files, metadata=None, headers=None, access_key=None, secret_key=None, queue_derive=None, verbose=False, verify=False, checksum=False, delete=False, retries=None, retries_sleep=None, debug=False, validate_identifier=False, request_kwargs=None, **get_item_kwargs):
"""
Upload files to an Archive.org item (creates item if it doesn't exist).
Args:
identifier (str): Item identifier to upload to
files (list): Files to upload - can be:
- File paths (str)
- File-like objects
- Tuples of (remote_name, local_path_or_file_object)
- Dictionaries with 'name' and file content
metadata (dict, optional): Item metadata to set/update
headers (dict, optional): HTTP headers for upload requests
access_key (str, optional): IA-S3 access key (overrides config)
secret_key (str, optional): IA-S3 secret key (overrides config)
queue_derive (bool, optional): Queue derive task after upload
verbose (bool): Enable verbose output
verify (bool): Verify checksums after upload
checksum (bool): Calculate and verify MD5 checksums
delete (bool): Delete local files after successful upload
retries (int, optional): Number of retry attempts
retries_sleep (int, optional): Seconds to sleep between retries
debug (bool): Enable debug logging
validate_identifier (bool): Validate identifier format
request_kwargs (dict, optional): Additional request arguments
**get_item_kwargs: Additional arguments for get_item
Returns:
list: List of Request/Response objects from upload operations
Raises:
ValueError: If identifier is invalid
AuthenticationError: If authentication fails
"""
class Item:
def upload(self, files, metadata=None, headers=None, access_key=None, secret_key=None, queue_derive=None, verbose=False, verify=False, checksum=False, delete=False, retries=None, retries_sleep=None, debug=False, request_kwargs=None):
"""
Upload files to this item using the same parameters as the upload function.
Returns:
list: List of Request/Response objects from upload operations
"""

Download files from items with extensive filtering and configuration options.
def download(identifier, files=None, formats=None, glob_pattern=None, dry_run=False, verbose=False, ignore_existing=False, checksum=False, checksum_archive=False, destdir=None, no_directory=False, retries=None, item_index=None, ignore_errors=False, on_the_fly=False, return_responses=False, no_change_timestamp=False, timeout=None, **get_item_kwargs):
"""
Download files from an Archive.org item with extensive filtering options.
Args:
identifier (str): Item identifier to download from
files (list, optional): Specific files to download
formats (list, optional): File formats to download (e.g., ['pdf', 'txt'])
glob_pattern (str, optional): Glob pattern for file selection
dry_run (bool): Show what would be downloaded without downloading
verbose (bool): Enable verbose output
ignore_existing (bool): Re-download files that already exist locally
checksum (bool): Verify file checksums after download
checksum_archive (bool): Verify checksums from archive
destdir (str, optional): Destination directory (default: current directory)
no_directory (bool): Don't create item directory, save files directly to destdir
retries (int, optional): Number of retry attempts per file
item_index (int, optional): Download only files modified after this item index
ignore_errors (bool): Continue downloading other files if some fail
on_the_fly (bool): Include on-the-fly derived files
return_responses (bool): Return response objects instead of downloading
no_change_timestamp (bool): Don't update file timestamps to match archive
timeout (int, optional): Request timeout in seconds
**get_item_kwargs: Additional arguments for get_item
Returns:
list: List of Request/Response objects from download operations
Raises:
ItemLocateError: If item cannot be located
"""
class Item:
def download(self, files=None, formats=None, glob_pattern=None, dry_run=False, verbose=False, ignore_existing=False, checksum=False, checksum_archive=False, destdir=None, no_directory=False, retries=None, item_index=None, ignore_errors=False, on_the_fly=False, return_responses=False, no_change_timestamp=False, timeout=None):
"""
Download files from this item using the same parameters as the download function.
Returns:
list: List of Request/Response objects from download operations
"""

Modify item metadata with various update strategies.
class Item:
def modify_metadata(self, metadata, target=None, append=False, append_list=False, priority=0, access_key=None, secret_key=None, debug=False, request_kwargs=None):
"""
Modify metadata of this item.
Args:
metadata (dict): Metadata changes to apply
target (str, optional): Target specific metadata section
append (bool): Append values to existing metadata fields
append_list (bool): Append to metadata list fields
priority (int): Task priority for metadata update
access_key (str, optional): IA-S3 access key
secret_key (str, optional): IA-S3 secret key
debug (bool): Enable debug logging
request_kwargs (dict, optional): Additional request arguments
Returns:
Request or Response: Metadata modification result
Raises:
AuthenticationError: If authentication fails
"""

Submit various Archive.org tasks for item processing.
class Item:
def derive(self, priority=0, remove_derived=None, reduced_priority=False, data=None, headers=None, request_kwargs=None):
"""
Submit derive task to generate derived files.
Args:
priority (int): Task priority (-5 to 10)
remove_derived (list, optional): Derived formats to remove
reduced_priority (bool): Use reduced priority queue
data (dict, optional): Additional task data
headers (dict, optional): Additional HTTP headers
request_kwargs (dict, optional): Additional request arguments
Returns:
Response: Task submission response
"""
def fixer(self, ops=None, priority=None, reduced_priority=False, data=None, headers=None, request_kwargs=None):
"""
Submit fixer task to fix item issues.
Args:
ops (list, optional): Fixer operations to perform
priority (int, optional): Task priority
reduced_priority (bool): Use reduced priority queue
data (dict, optional): Additional task data
headers (dict, optional): Additional HTTP headers
request_kwargs (dict, optional): Additional request arguments
Returns:
Response: Task submission response
"""
def dark(self, comment, priority=None, data=None, reduced_priority=False, request_kwargs=None):
"""
Dark the item (restrict access).
Args:
comment (str): Reason for darking the item
priority (int, optional): Task priority
data (dict, optional): Additional task data
reduced_priority (bool): Use reduced priority queue
request_kwargs (dict, optional): Additional request arguments
Returns:
Response: Task submission response
"""
def undark(self, comment, priority=None, reduced_priority=False, data=None, request_kwargs=None):
"""
Undark the item (restore access).
Args:
comment (str): Reason for undarking the item
priority (int, optional): Task priority
reduced_priority (bool): Use reduced priority queue
data (dict, optional): Additional task data
request_kwargs (dict, optional): Additional request arguments
Returns:
Response: Task submission response
"""

Manage item reviews and monitor task status.
class Item:
def get_review(self):
"""
Get review information for this item.
Returns:
Response: Review data response
"""
def get_task_summary(self, params=None, request_kwargs=None):
"""
Get task count summary for this item.
Args:
params (dict, optional): Additional query parameters
request_kwargs (dict, optional): Additional request arguments
Returns:
dict: Task counts by status
"""
def no_tasks_pending(self, params=None, request_kwargs=None):
"""
Check if item has no pending tasks.
Args:
params (dict, optional): Additional query parameters
request_kwargs (dict, optional): Additional request arguments
Returns:
bool: True if no tasks are pending
"""
def get_all_item_tasks(self, params=None, request_kwargs=None):
"""
Get all tasks (completed and pending) for this item.
Args:
params (dict, optional): Additional query parameters
request_kwargs (dict, optional): Additional request arguments
Returns:
list: List of CatalogTask objects
"""
def get_history(self, params=None, request_kwargs=None):
"""
Get completed tasks for this item.
Args:
params (dict, optional): Additional query parameters
request_kwargs (dict, optional): Additional request arguments
Returns:
list: List of completed CatalogTask objects
"""
def get_catalog(self, params=None, request_kwargs=None):
"""
Get pending tasks for this item.
Args:
params (dict, optional): Additional query parameters
request_kwargs (dict, optional): Additional request arguments
Returns:
list: List of pending CatalogTask objects
"""

Get various URLs associated with the item.
class URLs:
"""Object providing access to various item URLs."""
@property
def details(self):
"""str: Item details page URL."""
@property
def metadata(self):
"""str: Item metadata API URL."""
@property
def download(self):
"""str: Item download directory URL."""
@property
def history(self):
"""str: Item history page URL."""
@property
def edit(self):
"""str: Item edit page URL."""
@property
def editxml(self):
"""str: Item XML edit page URL."""
@property
def manage(self):
"""str: Item management page URL."""

Add, manage, and moderate reviews for items.
def review(self, title: str, body: str, stars=None):
"""
Add a review to the item.
Args:
title (str): Review title
body (str): Review content/body
stars (int, optional): Star rating for the review (1-5)
Returns:
requests.Response: Response object from review submission
Example:
>>> item = internetarchive.get_item('my-item')
>>> item.review('Great content!', 'This item has excellent resources.', stars=5)
"""
def index_review(self, username=None, screenname=None, itemname=None):
"""
Set a review to be indexed (make it visible in search results).
Args:
username (str, optional): Username of the reviewer
screenname (str, optional): Screen name of the reviewer
itemname (str, optional): Item name of the reviewer
Returns:
requests.Response: Response object from the indexing operation
Note:
Requires appropriate privileges for review moderation.
"""
def noindex_review(self, username=None, screenname=None, itemname=None):
"""
Set a review to not be indexed (hide it from search results).
Args:
username (str, optional): Username of the reviewer
screenname (str, optional): Screen name of the reviewer
itemname (str, optional): Item name of the reviewer
Returns:
requests.Response: Response object from the operation
Note:
Requires appropriate privileges for review moderation.
"""
def delete_review(self, username=None, screenname=None, itemname=None):
"""
Delete a review from the item.
Args:
username (str, optional): Username of the reviewer
screenname (str, optional): Screen name of the reviewer
itemname (str, optional): Item name of the reviewer
Returns:
requests.Response: Response object from the deletion operation
Note:
Requires appropriate privileges for review management.
"""

Add and manage administrative flags for items.
def add_flag(self, category: str, user=None):
"""
Add a flag to the item for administrative purposes.
Args:
category (str): Flag category (e.g., 'copyright', 'spam', 'inappropriate')
user (str, optional): User adding the flag (defaults to current user)
Returns:
requests.Response: Response object from the flag addition
Example:
>>> item = internetarchive.get_item('problematic-item')
>>> item.add_flag('copyright', user='moderator')
"""
def delete_flag(self, category: str, user=None):
"""
Remove a flag from the item.
Args:
category (str): Flag category to remove
user (str, optional): User removing the flag (defaults to current user)
Returns:
requests.Response: Response object from the flag removal
"""
def get_flags(self):
"""
Retrieve all flags associated with the item.
Returns:
requests.Response: Response object containing flag data
Example:
>>> item = internetarchive.get_item('my-item')
>>> flags_response = item.get_flags()
>>> flags_data = flags_response.json()
"""

Upload individual files with extensive configuration options.
def upload_file(self, body, key=None, metadata=None, file_metadata=None, headers=None, access_key=None, secret_key=None, queue_derive=False, verbose=False, verify=False, checksum=False, delete=False, retries=None, retries_sleep=None, debug=False, validate_identifier=False, request_kwargs=None):
"""
Upload a single file to the item with fine-grained control.
Args:
body (str or file-like): File path or file-like object to upload
key (str, optional): Remote filename (defaults to local filename)
metadata (dict, optional): Item metadata to set during upload
file_metadata (dict, optional): File-level metadata
headers (dict, optional): Additional HTTP headers
access_key (str, optional): IA-S3 access key
secret_key (str, optional): IA-S3 secret key
queue_derive (bool): Whether to queue derive task after upload
verbose (bool): Enable verbose output
verify (bool): Verify checksums after upload
checksum (bool): Calculate MD5 checksums
delete (bool): Delete local file after upload success
retries (int, optional): Number of retry attempts
retries_sleep (int, optional): Sleep time between retries in seconds
debug (bool): Enable debug mode
validate_identifier (bool): Validate identifier format
request_kwargs (dict, optional): Additional request arguments
Returns:
requests.Request or requests.Response: Request object (if debug=True) or Response object
Example:
>>> item = internetarchive.get_item('my-item')
>>> response = item.upload_file(
... 'document.pdf',
... key='renamed-document.pdf',
... file_metadata={'title': 'Important Document'},
... verify=True,
... checksum=True
... )
"""

Manage item membership in simplelists and collections.
def remove_from_simplelist(self, parent: str, list: str):
"""
Remove the item from a simplelist collection.
Args:
parent (str): Parent collection identifier
list (str): List name to remove item from
Returns:
requests.Response: Response object from the removal operation
Example:
>>> item = internetarchive.get_item('my-item')
>>> item.remove_from_simplelist('my-collection', 'featured-items')
"""

Extended functionality for collection items.
class Collection:
"""
Represents an Archive.org collection (extends Item).
"""
@property
def searches(self):
"""dict: Dictionary of Search objects for collection contents."""

import internetarchive
# Get an item
item = internetarchive.get_item('govlawgacode20071')
# Check if item exists
if item.exists:
print(f"Item title: {item.metadata.get('title')}")
print(f"Item has {item.files_count} files")
print(f"Total size: {item.item_size} bytes")
# Get specific file
pdf_file = item.get_file('govlawgacode20071.pdf')
if pdf_file:
print(f"PDF file size: {pdf_file.size}")

import internetarchive
# Upload files with metadata
response = internetarchive.upload(
'my-new-item',
files=['document.pdf', 'image.jpg'],
metadata={
'title': 'My Document Collection',
'creator': 'Your Name',
'description': 'A collection of important documents',
'collection': 'opensource'
}
)
print(f"Upload completed: {len(response)} files uploaded")

import internetarchive
# Download only PDF files
internetarchive.download(
'example-item',
formats=['pdf'],
destdir='./downloads',
verbose=True,
checksum=True
)
# Download files matching pattern
internetarchive.download(
'example-item',
glob_pattern='*.txt',
ignore_existing=True
)

Install with Tessl CLI
npx tessl i tessl/pypi-internetarchive