tessl/pypi-internetarchive

A Python interface to archive.org for programmatic access to the Internet Archive's digital library

—

Pending

Overview

Eval results

Files

File Management

Name: tessl/pypi-internetarchive
Author: tessl

File management operations provide access to individual files within Archive.org items, including file retrieval, download, deletion, and metadata access.

Capabilities

File Retrieval

Access File objects representing individual files within Archive.org items.

def get_files(identifier, files=None, formats=None, glob_pattern=None, exclude_pattern=None, on_the_fly=False, **get_item_kwargs):
    """
    Get File objects from an item with optional filtering.

    Called with only an identifier, this returns all files of the item.
    Each filter argument defaults to None, meaning that filter is not supplied.

    Args:
        identifier (str): Item identifier
        files (list, optional): Specific file names to retrieve
        formats (list, optional): File formats to filter by (e.g., ['pdf', 'txt', 'jpg'])
        glob_pattern (str, optional): Glob pattern for file selection (e.g., '*.pdf', 'chapter*.txt')
        exclude_pattern (str, optional): Glob pattern for files to exclude (e.g., '*_thumb.jpg')
        on_the_fly (bool, optional): Include on-the-fly derived files. Defaults to False.
        **get_item_kwargs: Additional keyword arguments passed to get_item

    Returns:
        list: List of File objects matching the criteria
    """

class File:
    """
    Represents a single file within an Archive.org item.
    """

    def __init__(self, item, name, file_metadata=None):
        """
        Initialize a File object.

        Args:
            item (Item): Parent Item object
            name (str): Filename
            file_metadata (dict, optional): Pre-fetched file metadata.
                Defaults to None.
        """

File Properties

Access file metadata, URLs, and status information.

class File:
    # Identity, location and status of the file
    @property
    def item(self):
        """Item: Parent Item object this file belongs to."""

    @property
    def identifier(self):
        """str: Item identifier (same as that of the parent item)."""

    @property
    def name(self):
        """str: Filename."""

    @property
    def url(self):
        """str: Direct download URL for the file."""

    @property
    def auth(self):
        """S3Auth: S3 authentication object if credentials are available."""

    @property
    def exists(self):
        """bool: Whether the file exists in the item."""

    @property
    def metadata(self):
        """dict: File metadata dictionary."""

    # Standard file properties
    @property
    def size(self):
        """int: File size in bytes."""

    @property
    def format(self):
        """str: File format/type."""

    @property
    def md5(self):
        """str: MD5 checksum of the file."""

    @property
    def sha1(self):
        """str: SHA1 checksum of the file."""

    @property
    def mtime(self):
        """str: Last modification time."""

    @property
    def crc32(self):
        """str: CRC32 checksum of the file."""

    @property
    def source(self):
        """str: Source of the file (original or derived)."""

File Download

Download individual files with various options.

class File:
    def download(self, file_path=None, verbose=None, ignore_existing=None, checksum=None, checksum_archive=None, destdir=None, retries=None, ignore_errors=None, no_change_timestamp=None, timeout=None, **kwargs):
        """
        Download this file.

        Every named option defaults to None.

        Args:
            file_path (str, optional): Local path to save file (defaults to filename)
            verbose (bool, optional): Enable verbose output
            ignore_existing (bool, optional): Re-download if file already exists
                NOTE(review): this wording reads opposite to the flag name, and
                the bulk-download usage passes ignore_existing=True together
                with checksum=True. Confirm against the upstream library
                whether the flag skips or overwrites existing local files.
            checksum (bool, optional): Verify checksum after download
            checksum_archive (bool, optional): Use archive-provided checksums
            destdir (str, optional): Destination directory
            retries (int, optional): Number of retry attempts
            ignore_errors (bool, optional): Continue on errors
            no_change_timestamp (bool, optional): Don't update file timestamp
            timeout (int, optional): Request timeout in seconds
            **kwargs: Additional download options

        Returns:
            Request or Response: Download operation result

        Raises:
            InvalidChecksumError: If checksum verification fails
            requests.RequestException: If download fails
        """

File Deletion

Delete files from Archive.org items.

def delete(identifier, files=None, formats=None, glob_pattern=None, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, **kwargs):
    """
    Delete files from an Archive.org item.

    The files to delete are selected with files, formats and/or glob_pattern.

    Args:
        identifier (str): Item identifier
        files (list, optional): Specific files to delete
        formats (list, optional): File formats to delete (e.g., ['pdf', 'jpg'])
        glob_pattern (str, optional): Glob pattern for file selection (e.g., '*_backup.*')
        cascade_delete (bool, optional): Delete derived files along with source files.
            Defaults to False.
        access_key (str, optional): IA-S3 access key (overrides config)
        secret_key (str, optional): IA-S3 secret key (overrides config)
        verbose (bool, optional): Enable verbose output. Defaults to False.
        debug (bool, optional): Enable debug logging. Defaults to False.
        **kwargs: Additional arguments passed to get_item

    Returns:
        list: List of Request/Response objects from delete operations

    Raises:
        AuthenticationError: If authentication fails
        ItemLocateError: If item cannot be located
    """

class File:
    def delete(self, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, request_kwargs=None):
        """
        Delete this file from the Archive.org item.

        Args:
            cascade_delete (bool, optional): Delete derived files along with this file.
                Defaults to False.
            access_key (str, optional): IA-S3 access key
            secret_key (str, optional): IA-S3 secret key
            verbose (bool, optional): Enable verbose output. Defaults to False.
            debug (bool, optional): Enable debug logging. Defaults to False.
            request_kwargs (dict, optional): Additional request arguments.
                Defaults to None.

        Returns:
            Request or Response: Delete operation result

        Raises:
            AuthenticationError: If authentication fails
        """

Usage Examples

Basic File Access

import internetarchive

# Fetch every file that belongs to the item
files = internetarchive.get_files('example-item')

# Report the key metadata fields of each file, one field per line
for file in files:
    summary = (
        f"File: {file.name}",
        f"Size: {file.size} bytes",
        f"Format: {file.format}",
        f"MD5: {file.md5}",
        "---",
    )
    print(*summary, sep="\n")

File Filtering

import internetarchive

# Every query below targets the same item
item_id = 'example-item'

# Restrict the result to one format: PDF files only
pdf_files = internetarchive.get_files(item_id, formats=['pdf'])

# Select files whose names match a glob pattern
text_files = internetarchive.get_files(item_id, glob_pattern='*.txt')

# Ask for an explicit list of file names
wanted_names = ['document.pdf', 'readme.txt']
specific_files = internetarchive.get_files(item_id, files=wanted_names)

# Leave out anything matching an exclusion pattern
filtered_files = internetarchive.get_files(item_id, exclude_pattern='*_thumb.jpg')

File Download Operations

import internetarchive

# Download specific file
item = internetarchive.get_item('example-item')
file = item.get_file('document.pdf')

# Guard on `exists` (whether the file exists in the item) rather than on the
# File object alone.
# NOTE(review): presumably get_file returns a File object even for a name the
# item does not contain - confirm against the upstream library.
if file and file.exists:
    # Download with verification
    file.download(
        file_path='./downloads/document.pdf',
        checksum=True,
        verbose=True,
    )

# Download all files of specific format
# A separate loop name keeps `file` above bound to document.pdf.
for pdf_file in item.get_files(formats=['pdf']):
    pdf_file.download(destdir='./pdf_downloads')

Bulk File Operations

import internetarchive

# Fetch the item whose images should be downloaded
item = internetarchive.get_item('photo-collection')

# Formats treated as images, and the options shared by every download call
image_formats = ['jpg', 'jpeg', 'png', 'gif']
download_options = {
    'destdir': './images',
    'ignore_existing': True,
    'checksum': True,
}

for file in item.get_files(formats=image_formats):
    print(f"Downloading {file.name} ({file.size} bytes)")
    file.download(**download_options)

File Deletion

import internetarchive

# All three deletions below act on the same item
item_id = 'my-item'

# Remove two files by exact name, with verbose output
unwanted = ['unwanted.pdf', 'old-version.txt']
internetarchive.delete(item_id, files=unwanted, verbose=True)

# Remove every file of the 'tmp' format (all temporary files),
# along with the files derived from them
internetarchive.delete(item_id, formats=['tmp'], cascade_delete=True)

# Remove files whose names match a glob pattern
internetarchive.delete(item_id, glob_pattern='*_backup.*')

File Metadata Analysis

import internetarchive
from collections import defaultdict

# Tally file count and cumulative size per format for one item
item = internetarchive.get_item('example-item')

# Each format seen for the first time starts from zeroed counters
format_stats = defaultdict(lambda: {'count': 0, 'total_size': 0})

for file in item.get_files():
    # Files without a format are grouped under 'unknown'
    bucket = format_stats[file.format or 'unknown']
    bucket['count'] += 1
    # A missing size contributes nothing to the total
    bucket['total_size'] += file.size or 0

print("File Format Analysis:")
for fmt in sorted(format_stats):
    stats = format_stats[fmt]
    if stats['count'] > 0:
        avg_size = stats['total_size'] / stats['count']
    else:
        avg_size = 0
    print(f"{fmt}: {stats['count']} files, {stats['total_size']:,} bytes total, {avg_size:.0f} bytes average")

Working with Checksums

import internetarchive
import hashlib

# Verify file integrity
item = internetarchive.get_item('example-item')
file = item.get_file('important-document.pdf')

# Proceed only when the archive reports an MD5 to compare against
if file and file.md5:
    # Download and verify
    file.download(file_path='temp_file.pdf', checksum=True)

    # Manual checksum verification.
    # Hash in fixed-size chunks so a large file is never held in memory whole.
    digest = hashlib.md5()
    with open('temp_file.pdf', 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            digest.update(chunk)
    local_md5 = digest.hexdigest()

    if local_md5 == file.md5:
        print("File integrity verified")
    else:
        print("Checksum mismatch - file may be corrupted")

Install with Tessl CLI