tessl/pypi-internetarchive

A Python interface to archive.org for programmatic access to the Internet Archive's digital library

—

Pending

Overview

Eval results

Files

Item Operations

Name: tessl/pypi-internetarchive
Author: tessl

Item operations provide comprehensive access to Archive.org items, including retrieval, download, upload, and management of items and their files.

Capabilities

Item Retrieval

Get Item objects to access metadata, files, and perform operations on Archive.org items.

def get_item(identifier, config=None, config_file=None, archive_session=None, debug=False, http_adapter_kwargs=None, request_kwargs=None):
    """
    Get an Item object by Archive.org identifier.

    Signature reference only; the implementation is not shown here.

    Args:
        identifier (str): The globally unique Archive.org item identifier
        config (dict, optional): Configuration dictionary for session creation.
            Defaults to None.
        config_file (str, optional): Path to configuration file. Defaults to None.
        archive_session (ArchiveSession, optional): Existing session object to use.
            Defaults to None.
            NOTE(review): presumably a new session is created from
            config/config_file when this is omitted - confirm.
        debug (bool): Enable debug logging. Defaults to False.
        http_adapter_kwargs (dict, optional): HTTP adapter keyword arguments.
            Defaults to None.
        request_kwargs (dict, optional): Additional request arguments.
            Defaults to None.

    Returns:
        Item: Item object for the specified identifier (or Collection if item is a collection)

    Raises:
        ItemLocateError: If item cannot be located or is dark
    """

class Item:
    """
    Represents an Archive.org item with metadata, files, and operations.

    Only the constructor is listed in this block; the bodies are omitted
    (signature reference).
    """

    def __init__(self, archive_session, identifier, item_metadata=None):
        """
        Initialize Item object.

        Args:
            archive_session (ArchiveSession): Session object
            identifier (str): Item identifier
            item_metadata (dict, optional): Pre-fetched metadata. Defaults to None.
                NOTE(review): presumably the metadata is fetched through the
                session when this is omitted - confirm.
        """

Item Properties

Access item metadata, files, and status information.

class Item:
    """Properties exposed by Item (signature reference; bodies omitted)."""

    @property
    def identifier(self):
        """str: Item identifier."""

    @property
    def metadata(self):
        """dict: Complete item metadata."""

    @property
    def files(self):
        """list: List of file metadata dictionaries (one dict per file)."""

    @property
    def exists(self):
        """bool: Whether the item exists and is accessible."""

    @property
    def session(self):
        """ArchiveSession: Session object used by this item."""

    @property
    def urls(self):
        """URLs: Object providing access to various item URLs."""

    @property
    def collection(self):
        """list: Collections this item belongs to."""

    @property
    def wikilink(self):
        """str: MediaWiki-formatted link (if item has title)."""

    # Archive.org-specific properties
    @property
    def created(self):
        """int: Unix timestamp of item creation."""

    @property
    def d1(self):
        """str: Primary server."""

    @property
    def d2(self):
        """str: Secondary server."""

    @property
    def dir(self):
        """str: Item directory path."""

    @property
    def files_count(self):
        """int: Number of files in item."""

    @property
    def item_size(self):
        """int: Total size of all files in bytes."""

    @property
    def reviews(self):
        """list: Item reviews."""

    @property
    def server(self):
        """str: Item server."""

    @property
    def uniq(self):
        """int: Unique item number."""

    @property
    def updated(self):
        """int: Unix timestamp of last update."""

    @property
    def tasks(self):
        """int: Number of tasks associated with item."""

    @property
    def is_dark(self):
        """bool: Whether item is dark (restricted access)."""

Item Management

Refresh item data and check identifier availability.

class Item:
    """Item methods for refreshing metadata and checking identifier availability (bodies omitted)."""

    def refresh(self, item_metadata=None, **kwargs):
        """
        Refresh item metadata from Archive.org.

        Args:
            item_metadata (dict, optional): Use specific metadata instead of fetching.
                Defaults to None, in which case the metadata is fetched.
            **kwargs: Additional arguments passed to get_metadata
        """

    def identifier_available(self):
        """
        Check if the item identifier is available for use.

        Returns:
            bool: True if identifier is available, False if taken
        """

File Access

Access individual files and collections of files within the item.

class Item:
    """Item methods for accessing the files of an item (bodies omitted)."""

    def get_file(self, file_name):
        """
        Get a File object for a specific file in the item.

        Args:
            file_name (str): Name of the file

        Returns:
            File: File object, or None if file doesn't exist
        """

    def get_files(self, files=None, formats=None, glob_pattern=None, exclude_pattern=None, on_the_fly=False):
        """
        Get File objects with optional filtering.

        Documented as yielding its results, so the return value is an
        iterator; wrap the call in ``list(...)`` if a list is needed.

        Args:
            files (list, optional): Specific file names to retrieve. Defaults to None.
            formats (list, optional): File formats to include (e.g., ['pdf', 'epub']).
                Defaults to None.
            glob_pattern (str, optional): Glob pattern for file selection.
                Defaults to None.
            exclude_pattern (str, optional): Glob pattern for exclusion.
                Defaults to None.
            on_the_fly (bool): Include on-the-fly derived files. Defaults to False.

        NOTE(review): how several filters combine when given together is not
        stated here - confirm against the library documentation.

        Yields:
            File: File objects matching the criteria
        """

Upload Operations

Upload files to items, creating new items or updating existing ones.

def upload(identifier, files, metadata=None, headers=None, access_key=None, secret_key=None, queue_derive=None, verbose=False, verify=False, checksum=False, delete=False, retries=None, retries_sleep=None, debug=False, validate_identifier=False, request_kwargs=None, **get_item_kwargs):
    """
    Upload files to an Archive.org item (creates item if it doesn't exist).

    Signature reference only; the implementation is not shown here.

    Args:
        identifier (str): Item identifier to upload to
        files (list): Files to upload - can be:
            - File paths (str)
            - File-like objects
            - Tuples of (remote_name, local_path_or_file_object)
            - Dictionaries with 'name' and file content
        metadata (dict, optional): Item metadata to set/update. Defaults to None.
        headers (dict, optional): HTTP headers for upload requests. Defaults to None.
        access_key (str, optional): IA-S3 access key (overrides config)
        secret_key (str, optional): IA-S3 secret key (overrides config)
        queue_derive (bool, optional): Queue derive task after upload.
            Defaults to None.
        verbose (bool): Enable verbose output. Defaults to False.
        verify (bool): Verify checksums after upload. Defaults to False.
        checksum (bool): Calculate and verify MD5 checksums. Defaults to False.
        delete (bool): Delete local files after successful upload. Defaults to False.
        retries (int, optional): Number of retry attempts. Defaults to None.
        retries_sleep (int, optional): Seconds to sleep between retries.
            Defaults to None.
        debug (bool): Enable debug logging. Defaults to False.
        validate_identifier (bool): Validate identifier format. Defaults to False.
        request_kwargs (dict, optional): Additional request arguments
        **get_item_kwargs: Additional arguments for get_item

    Returns:
        list: List of Request/Response objects from upload operations
            NOTE(review): presumably Request objects are returned when
            debug=True, as documented for Item.upload_file - confirm.

    Raises:
        ValueError: If identifier is invalid
        AuthenticationError: If authentication fails
    """

class Item:
    """Item method for uploading several files to the item (body omitted)."""

    def upload(self, files, metadata=None, headers=None, access_key=None, secret_key=None, queue_derive=None, verbose=False, verify=False, checksum=False, delete=False, retries=None, retries_sleep=None, debug=False, request_kwargs=None):
        """
        Upload files to this item.

        Takes the same parameters as the module-level upload function, except
        that ``identifier``, ``validate_identifier`` and ``**get_item_kwargs``
        are not part of this signature (the target is this item).

        Returns:
            list: List of Request/Response objects from upload operations
        """

Download Operations

Download files from items with extensive filtering and configuration options.

def download(identifier, files=None, formats=None, glob_pattern=None, dry_run=False, verbose=False, ignore_existing=False, checksum=False, checksum_archive=False, destdir=None, no_directory=False, retries=None, item_index=None, ignore_errors=False, on_the_fly=False, return_responses=False, no_change_timestamp=False, timeout=None, **get_item_kwargs):
    """
    Download files from an Archive.org item with extensive filtering options.

    Signature reference only; the implementation is not shown here.

    Args:
        identifier (str): Item identifier to download from
        files (list, optional): Specific files to download. Defaults to None.
        formats (list, optional): File formats to download (e.g., ['pdf', 'txt'])
        glob_pattern (str, optional): Glob pattern for file selection
        dry_run (bool): Show what would be downloaded without downloading.
            Defaults to False.
        verbose (bool): Enable verbose output. Defaults to False.
        ignore_existing (bool): Re-download files that already exist locally.
            Defaults to False.
            NOTE(review): the parameter name suggests the opposite (existing
            local files are skipped rather than fetched again) - confirm
            against the library documentation before relying on this.
        checksum (bool): Verify file checksums after download. Defaults to False.
        checksum_archive (bool): Verify checksums from archive. Defaults to False.
        destdir (str, optional): Destination directory (default: current directory)
        no_directory (bool): Don't create item directory, save files directly to destdir
        retries (int, optional): Number of retry attempts per file
        item_index (int, optional): Download only files modified after this item index
        ignore_errors (bool): Continue downloading other files if some fail
        on_the_fly (bool): Include on-the-fly derived files. Defaults to False.
        return_responses (bool): Return response objects instead of downloading
        no_change_timestamp (bool): Don't update file timestamps to match archive
        timeout (int, optional): Request timeout in seconds. Defaults to None.
        **get_item_kwargs: Additional arguments for get_item

    Returns:
        list: List of Request/Response objects from download operations

    Raises:
        ItemLocateError: If item cannot be located
    """

class Item:
    """Item method for downloading files from the item (body omitted)."""

    def download(self, files=None, formats=None, glob_pattern=None, dry_run=False, verbose=False, ignore_existing=False, checksum=False, checksum_archive=False, destdir=None, no_directory=False, retries=None, item_index=None, ignore_errors=False, on_the_fly=False, return_responses=False, no_change_timestamp=False, timeout=None):
        """
        Download files from this item.

        Takes the same parameters as the module-level download function,
        except that ``identifier`` and ``**get_item_kwargs`` are not part of
        this signature (the source is this item).

        Returns:
            list: List of Request/Response objects from download operations
        """

Metadata Operations

Modify item metadata with various update strategies.

class Item:
    """Item method for modifying item metadata (body omitted)."""

    def modify_metadata(self, metadata, target=None, append=False, append_list=False, priority=0, access_key=None, secret_key=None, debug=False, request_kwargs=None):
        """
        Modify metadata of this item.

        Args:
            metadata (dict): Metadata changes to apply
            target (str, optional): Target specific metadata section.
                Defaults to None.
            append (bool): Append values to existing metadata fields.
                Defaults to False.
            append_list (bool): Append to metadata list fields. Defaults to False.
            priority (int): Task priority for metadata update. Defaults to 0.
            access_key (str, optional): IA-S3 access key
            secret_key (str, optional): IA-S3 secret key
            debug (bool): Enable debug logging. Defaults to False.
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            Request or Response: Metadata modification result
                NOTE(review): presumably a Request is returned when
                debug=True, as documented for upload_file - confirm.

        Raises:
            AuthenticationError: If authentication fails
        """

Task Operations

Submit various Archive.org tasks for item processing.

class Item:
    """Item methods that submit Archive.org tasks for the item (bodies omitted)."""

    def derive(self, priority=0, remove_derived=None, reduced_priority=False, data=None, headers=None, request_kwargs=None):
        """
        Submit derive task to generate derived files.

        Args:
            priority (int): Task priority (-5 to 10). Defaults to 0.
            remove_derived (list, optional): Derived formats to remove
            reduced_priority (bool): Use reduced priority queue. Defaults to False.
            data (dict, optional): Additional task data
            headers (dict, optional): Additional HTTP headers
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            Response: Task submission response
        """

    def fixer(self, ops=None, priority=None, reduced_priority=False, data=None, headers=None, request_kwargs=None):
        """
        Submit fixer task to fix item issues.

        Args:
            ops (list, optional): Fixer operations to perform
            priority (int, optional): Task priority. Defaults to None
                (unlike derive, whose default is 0).
            reduced_priority (bool): Use reduced priority queue. Defaults to False.
            data (dict, optional): Additional task data
            headers (dict, optional): Additional HTTP headers
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            Response: Task submission response
        """

    def dark(self, comment, priority=None, data=None, reduced_priority=False, request_kwargs=None):
        """
        Dark the item (restrict access).

        Note that ``data`` precedes ``reduced_priority`` here, while undark
        lists them in the opposite order; pass both by keyword.

        Args:
            comment (str): Reason for darking the item
            priority (int, optional): Task priority
            data (dict, optional): Additional task data
            reduced_priority (bool): Use reduced priority queue. Defaults to False.
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            Response: Task submission response
        """

    def undark(self, comment, priority=None, reduced_priority=False, data=None, request_kwargs=None):
        """
        Undark the item (restore access).

        Note that ``reduced_priority`` precedes ``data`` here, while dark
        lists them in the opposite order; pass both by keyword.

        Args:
            comment (str): Reason for undarking the item
            priority (int, optional): Task priority
            reduced_priority (bool): Use reduced priority queue. Defaults to False.
            data (dict, optional): Additional task data
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            Response: Task submission response
        """

Review and Task Management

Manage item reviews and monitor task status.

class Item:
    """Item methods for fetching review data and querying task status (bodies omitted)."""

    def get_review(self):
        """
        Get review information for this item.

        Returns:
            Response: Review data response
        """

    def get_task_summary(self, params=None, request_kwargs=None):
        """
        Get task count summary for this item.

        Args:
            params (dict, optional): Additional query parameters
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            dict: Task counts by status
        """

    def no_tasks_pending(self, params=None, request_kwargs=None):
        """
        Check if item has no pending tasks.

        Args:
            params (dict, optional): Additional query parameters
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            bool: True if no tasks are pending
        """

    def get_all_item_tasks(self, params=None, request_kwargs=None):
        """
        Get all tasks (completed and pending) for this item.

        See get_history for completed tasks only and get_catalog for pending
        tasks only.

        Args:
            params (dict, optional): Additional query parameters
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            list: List of CatalogTask objects
        """

    def get_history(self, params=None, request_kwargs=None):
        """
        Get completed tasks for this item.

        Pending tasks are returned by get_catalog instead.

        Args:
            params (dict, optional): Additional query parameters
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            list: List of completed CatalogTask objects
        """

    def get_catalog(self, params=None, request_kwargs=None):
        """
        Get pending tasks for this item.

        Completed tasks are returned by get_history instead.

        Args:
            params (dict, optional): Additional query parameters
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            list: List of pending CatalogTask objects
        """

URL Access

Get various URLs associated with the item.

class URLs:
    """
    Object providing access to various item URLs.

    This is the type documented for the ``Item.urls`` property. Bodies are
    omitted (signature reference).
    """

    @property
    def details(self):
        """str: Item details page URL."""

    @property
    def metadata(self):
        """str: Item metadata API URL."""

    @property
    def download(self):
        """str: Item download directory URL."""

    @property
    def history(self):
        """str: Item history page URL."""

    @property
    def edit(self):
        """str: Item edit page URL."""

    @property
    def editxml(self):
        """str: Item XML edit page URL."""

    @property
    def manage(self):
        """str: Item management page URL."""

Review Management

Add, manage, and moderate reviews for items. The functions below are `Item` methods and are called on an item object, e.g. `item.review(...)`.

def review(self, title: str, body: str, stars=None):
    """
    Add a review to the item.

    This is an Item method (shown here without its enclosing class); call it
    on an item object as in the example below.

    Args:
        title (str): Review title
        body (str): Review content/body
        stars (int, optional): Star rating for the review (1-5). Defaults to None.

    Returns:
        requests.Response: Response object from review submission

    Example:
        >>> item = internetarchive.get_item('my-item')
        >>> item.review('Great content!', 'This item has excellent resources.', stars=5)
    """

def index_review(self, username=None, screenname=None, itemname=None):
    """
    Set a review to be indexed (make it visible in search results).

    This is an Item method (shown here without its enclosing class). The
    three arguments identify the reviewer whose review is affected; all
    default to None.

    Args:
        username (str, optional): Username of the reviewer
        screenname (str, optional): Screen name of the reviewer
        itemname (str, optional): Item name of the reviewer

    Returns:
        requests.Response: Response object from the indexing operation

    Note:
        Requires appropriate privileges for review moderation.
    """

def noindex_review(self, username=None, screenname=None, itemname=None):
    """
    Set a review to not be indexed (hide it from search results).

    This is an Item method (shown here without its enclosing class) and the
    counterpart of index_review. The three arguments identify the reviewer
    whose review is affected; all default to None.

    Args:
        username (str, optional): Username of the reviewer
        screenname (str, optional): Screen name of the reviewer
        itemname (str, optional): Item name of the reviewer

    Returns:
        requests.Response: Response object from the operation

    Note:
        Requires appropriate privileges for review moderation.
    """

def delete_review(self, username=None, screenname=None, itemname=None):
    """
    Delete a review from the item.

    This is an Item method (shown here without its enclosing class). The
    three arguments identify the reviewer whose review is deleted; all
    default to None.

    Args:
        username (str, optional): Username of the reviewer
        screenname (str, optional): Screen name of the reviewer
        itemname (str, optional): Item name of the reviewer

    Returns:
        requests.Response: Response object from the deletion operation

    Note:
        Requires appropriate privileges for review management.
    """

Flag Management

Add and manage administrative flags for items.

def add_flag(self, category: str, user=None):
    """
    Add a flag to the item for administrative purposes.

    This is an Item method (shown here without its enclosing class); call it
    on an item object as in the example below.

    Args:
        category (str): Flag category (e.g., 'copyright', 'spam', 'inappropriate')
        user (str, optional): User adding the flag (defaults to current user)

    Returns:
        requests.Response: Response object from the flag addition

    Example:
        >>> item = internetarchive.get_item('problematic-item')
        >>> item.add_flag('copyright', user='moderator')
    """

def delete_flag(self, category: str, user=None):
    """
    Remove a flag from the item.

    This is an Item method (shown here without its enclosing class) and the
    counterpart of add_flag.

    Args:
        category (str): Flag category to remove
        user (str, optional): User removing the flag (defaults to current user)

    Returns:
        requests.Response: Response object from the flag removal
    """

def get_flags(self):
    """
    Retrieve all flags associated with the item.

    This is an Item method (shown here without its enclosing class). The
    flag data is carried in the response body; decode it with ``.json()``
    as in the example below.

    Returns:
        requests.Response: Response object containing flag data

    Example:
        >>> item = internetarchive.get_item('my-item')
        >>> flags_response = item.get_flags()
        >>> flags_data = flags_response.json()
    """

Single File Upload

Upload individual files with extensive configuration options.

def upload_file(self, body, key=None, metadata=None, file_metadata=None, headers=None, access_key=None, secret_key=None, queue_derive=False, verbose=False, verify=False, checksum=False, delete=False, retries=None, retries_sleep=None, debug=False, validate_identifier=False, request_kwargs=None):
    """
    Upload a single file to the item with fine-grained control.

    This is an Item method (shown here without its enclosing class); call it
    on an item object as in the example below.

    Args:
        body (str or file-like): File path or file-like object to upload
        key (str, optional): Remote filename (defaults to local filename)
        metadata (dict, optional): Item metadata to set during upload
        file_metadata (dict, optional): File-level metadata
        headers (dict, optional): Additional HTTP headers
        access_key (str, optional): IA-S3 access key
        secret_key (str, optional): IA-S3 secret key
        queue_derive (bool): Whether to queue derive task after upload.
            Defaults to False here, whereas upload() and Item.upload()
            default it to None.
        verbose (bool): Enable verbose output. Defaults to False.
        verify (bool): Verify checksums after upload. Defaults to False.
        checksum (bool): Calculate MD5 checksums. Defaults to False.
        delete (bool): Delete local file after upload success. Defaults to False.
        retries (int, optional): Number of retry attempts
        retries_sleep (int, optional): Sleep time between retries in seconds
        debug (bool): Enable debug mode. Defaults to False.
        validate_identifier (bool): Validate identifier format. Defaults to False.
        request_kwargs (dict, optional): Additional request arguments

    Returns:
        requests.Request or requests.Response: Request object (if debug=True) or Response object

    Example:
        >>> item = internetarchive.get_item('my-item')
        >>> response = item.upload_file(
        ...     'document.pdf',
        ...     key='renamed-document.pdf',
        ...     file_metadata={'title': 'Important Document'},
        ...     verify=True,
        ...     checksum=True
        ... )
    """

Collection Management

Manage item membership in simplelists and collections.

def remove_from_simplelist(self, parent: str, list: str):
    """
    Remove the item from a simplelist collection.

    This is an Item method (shown here without its enclosing class). The
    ``list`` parameter shadows the Python builtin of the same name; it is
    part of the documented signature, so it is kept as is.

    Args:
        parent (str): Parent collection identifier
        list (str): List name to remove item from

    Returns:
        requests.Response: Response object from the removal operation

    Example:
        >>> item = internetarchive.get_item('my-item')
        >>> item.remove_from_simplelist('my-collection', 'featured-items')
    """

Collection Operations

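Extended functionality for items that are collections. `get_item` returns a `Collection` instead of a plain `Item` when the identifier refers to a collection.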

class Collection:
    """
    Represents an Archive.org collection (extends Item).

    Only the collection-specific property is listed here; the base class is
    not spelled out in this stub's header.
    """

    @property
    def searches(self):
        """dict: Dictionary of Search objects for collection contents."""

Usage Examples

Basic Item Operations

import internetarchive

# Look the item up by its Archive.org identifier.
identifier = 'govlawgacode20071'
item = internetarchive.get_item(identifier)

# Report a few details, but only when the item exists.
if item.exists:
    print(f"Item title: {item.metadata.get('title')}")
    print(f"Item has {item.files_count} files")
    print(f"Total size: {item.item_size} bytes")

# get_file is documented to return None when the file is missing,
# so guard before reading attributes from it.
pdf_file = item.get_file('govlawgacode20071.pdf')
if pdf_file:
    print(f"PDF file size: {pdf_file.size}")

Upload Example

import internetarchive

# Describe the item that the files will be uploaded to.
item_metadata = {
    'title': 'My Document Collection',
    'creator': 'Your Name',
    'description': 'A collection of important documents',
    'collection': 'opensource',
}

# Local files to send.
local_files = ['document.pdf', 'image.jpg']

# upload() is documented to return a list of Request/Response objects.
response = internetarchive.upload(
    'my-new-item',
    files=local_files,
    metadata=item_metadata,
)

print(f"Upload completed: {len(response)} files uploaded")

Download with Filtering

import internetarchive

# Both calls below download from the same item.
identifier = 'example-item'

# First pass: PDF files only, saved under ./downloads, with verbose
# output and the checksum option enabled.
internetarchive.download(
    identifier,
    checksum=True,
    verbose=True,
    destdir='./downloads',
    formats=['pdf'],
)

# Second pass: files whose names match the glob pattern.
internetarchive.download(
    identifier,
    ignore_existing=True,
    glob_pattern='*.txt',
)

Install with Tessl CLI