A Python interface to archive.org for programmatic access to the Internet Archive's digital library
—
File management operations provide access to individual files within Archive.org items, including file retrieval, download, deletion, and metadata access.
Access File objects representing individual files within Archive.org items.
def get_files(identifier, files=None, formats=None, glob_pattern=None, exclude_pattern=None, on_the_fly=False, **get_item_kwargs):
"""
Get File objects from an item with optional filtering.
Args:
identifier (str): Item identifier
files (list, optional): Specific file names to retrieve
formats (list, optional): File formats to filter by (e.g., ['pdf', 'txt', 'jpg'])
glob_pattern (str, optional): Glob pattern for file selection (e.g., '*.pdf', 'chapter*.txt')
exclude_pattern (str, optional): Glob pattern for exclusion
on_the_fly (bool): Include on-the-fly derived files
**get_item_kwargs: Additional arguments passed to get_item
Returns:
list: List of File objects matching the criteria
"""
class File:
"""
Represents a file within an Archive.org item.
"""
def __init__(self, item, name, file_metadata=None):
"""
Initialize File object.
Args:
item (Item): Parent Item object
name (str): Filename
file_metadata (dict, optional): Pre-fetched file metadata
"""

Access file metadata, URLs, and status information.
class File:
@property
def item(self):
"""Item: Parent Item object."""
@property
def identifier(self):
"""str: Item identifier (same as parent item)."""
@property
def name(self):
"""str: Filename."""
@property
def url(self):
"""str: Direct download URL for the file."""
@property
def auth(self):
"""S3Auth: S3 authentication object if credentials are available."""
@property
def exists(self):
"""bool: Whether the file exists in the item."""
@property
def metadata(self):
"""dict: File metadata dictionary."""
# Standard file properties
@property
def size(self):
"""int: File size in bytes."""
@property
def format(self):
"""str: File format/type."""
@property
def md5(self):
"""str: MD5 checksum of the file."""
@property
def sha1(self):
"""str: SHA1 checksum of the file."""
@property
def mtime(self):
"""str: Last modification time."""
@property
def crc32(self):
"""str: CRC32 checksum of the file."""
@property
def source(self):
"""str: Source of the file (original or derived)."""

Download individual files with various options.
class File:
def download(self, file_path=None, verbose=None, ignore_existing=None, checksum=None, checksum_archive=None, destdir=None, retries=None, ignore_errors=None, no_change_timestamp=None, timeout=None, **kwargs):
"""
Download this file.
Args:
file_path (str, optional): Local path to save file (defaults to filename)
verbose (bool, optional): Enable verbose output
ignore_existing (bool, optional): Re-download if file already exists
checksum (bool, optional): Verify checksum after download
checksum_archive (bool, optional): Use archive-provided checksums
destdir (str, optional): Destination directory
retries (int, optional): Number of retry attempts
ignore_errors (bool, optional): Continue on errors
no_change_timestamp (bool, optional): Don't update file timestamp
timeout (int, optional): Request timeout in seconds
**kwargs: Additional download options
Returns:
Request or Response: Download operation result
Raises:
InvalidChecksumError: If checksum verification fails
requests.RequestException: If download fails
"""

Delete files from Archive.org items.
def delete(identifier, files=None, formats=None, glob_pattern=None, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, **kwargs):
"""
Delete files from an Archive.org item.
Args:
identifier (str): Item identifier
files (list, optional): Specific files to delete
formats (list, optional): File formats to delete (e.g., ['pdf', 'jpg'])
glob_pattern (str, optional): Glob pattern for file selection
cascade_delete (bool): Delete derived files along with source files
access_key (str, optional): IA-S3 access key (overrides config)
secret_key (str, optional): IA-S3 secret key (overrides config)
verbose (bool): Enable verbose output
debug (bool): Enable debug logging
**kwargs: Additional arguments passed to get_item
Returns:
list: List of Request/Response objects from delete operations
Raises:
AuthenticationError: If authentication fails
ItemLocateError: If item cannot be located
"""
class File:
def delete(self, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, request_kwargs=None):
"""
Delete this file from the Archive.org item.
Args:
cascade_delete (bool): Delete derived files along with this file
access_key (str, optional): IA-S3 access key
secret_key (str, optional): IA-S3 secret key
verbose (bool): Enable verbose output
debug (bool): Enable debug logging
request_kwargs (dict, optional): Additional request arguments
Returns:
Request or Response: Delete operation result
Raises:
AuthenticationError: If authentication fails
"""

import internetarchive
# Get all files from an item
files = internetarchive.get_files('example-item')
for file in files:
print(f"File: {file.name}")
print(f"Size: {file.size} bytes")
print(f"Format: {file.format}")
print(f"MD5: {file.md5}")
print("---")

import internetarchive
# Get only PDF files
pdf_files = internetarchive.get_files('example-item', formats=['pdf'])
# Get files matching pattern
text_files = internetarchive.get_files('example-item', glob_pattern='*.txt')
# Get specific files
specific_files = internetarchive.get_files(
'example-item',
files=['document.pdf', 'readme.txt']
)
# Exclude certain patterns
filtered_files = internetarchive.get_files(
'example-item',
exclude_pattern='*_thumb.jpg'
)

import internetarchive
# Download specific file
item = internetarchive.get_item('example-item')
file = item.get_file('document.pdf')
if file:
# Download with verification
file.download(
file_path='./downloads/document.pdf',
checksum=True,
verbose=True
)
# Download all files of specific format
for file in item.get_files(formats=['pdf']):
file.download(destdir='./pdf_downloads')

import internetarchive
# Download all images from an item
item = internetarchive.get_item('photo-collection')
image_formats = ['jpg', 'jpeg', 'png', 'gif']
for file in item.get_files(formats=image_formats):
print(f"Downloading {file.name} ({file.size} bytes)")
file.download(
destdir='./images',
ignore_existing=True,
checksum=True
)

import internetarchive
# Delete specific files
internetarchive.delete(
'my-item',
files=['unwanted.pdf', 'old-version.txt'],
verbose=True
)
# Delete files by format
internetarchive.delete(
'my-item',
formats=['tmp'], # Delete all temporary files
cascade_delete=True
)
# Delete using pattern
internetarchive.delete(
'my-item',
glob_pattern='*_backup.*'
)

import internetarchive
from collections import defaultdict
# Analyze file types in an item
item = internetarchive.get_item('example-item')
format_stats = defaultdict(lambda: {'count': 0, 'total_size': 0})
for file in item.get_files():
format_name = file.format or 'unknown'
format_stats[format_name]['count'] += 1
format_stats[format_name]['total_size'] += file.size or 0
print("File Format Analysis:")
for fmt, stats in sorted(format_stats.items()):
avg_size = stats['total_size'] / stats['count'] if stats['count'] > 0 else 0
print(f"{fmt}: {stats['count']} files, {stats['total_size']:,} bytes total, {avg_size:.0f} bytes average")

import internetarchive
import hashlib
# Verify file integrity
item = internetarchive.get_item('example-item')
file = item.get_file('important-document.pdf')
if file and file.md5:
# Download and verify
response = file.download(file_path='temp_file.pdf', checksum=True)
# Manual checksum verification
with open('temp_file.pdf', 'rb') as f:
local_md5 = hashlib.md5(f.read()).hexdigest()
if local_md5 == file.md5:
print("File integrity verified")
else:
print("Checksum mismatch - file may be corrupted")

Install with Tessl CLI
npx tessl i tessl/pypi-internetarchive