tessl/pypi-internetarchive

A Python interface to archive.org for programmatic access to the Internet Archive's digital library

—

Pending

Overview

Eval results

Files

Metadata Operations

Name: tessl/pypi-internetarchive
Author: tessl

Metadata operations enable viewing and modifying Archive.org item metadata with support for various update strategies including appending, targeting specific sections, and batch operations.

Capabilities

Metadata Modification

Modify item metadata with flexible update strategies and priority control.

def modify_metadata(
    identifier,
    metadata,
    target=None,
    append=False,
    append_list=False,
    priority=0,
    access_key=None,
    secret_key=None,
    debug=False,
    request_kwargs=None,
    **get_item_kwargs
):
    """Apply metadata changes to an existing Archive.org item.

    Args:
        identifier (str): Identifier of the item to modify.
        metadata (dict): Field names mapped to the values to apply. Keys may be:
            - standard fields such as 'title', 'creator', 'description',
              'date' and 'subject';
            - 'collection', given as a str or a list of collection identifiers;
            - any other valid metadata field name (custom fields).
        target (str, optional): Metadata section to change:
            - 'metadata' (default): the item's main metadata;
            - 'collection': collection membership only;
            - 'files/<filename>': metadata of one specific file.
        append (bool): When True, add the given values to the existing field
            contents rather than replacing them.
        append_list (bool): When True, add the given values to list-valued
            metadata fields (for example 'subject').
        priority (int): Task priority for the update, from -5 to 10; a larger
            number means higher priority.
        access_key (str, optional): IA-S3 access key; overrides the configured one.
        secret_key (str, optional): IA-S3 secret key; overrides the configured one.
        debug (bool): Turn on debug logging.
        request_kwargs (dict, optional): Extra arguments for the request.
        **get_item_kwargs: Extra arguments forwarded to get_item.

    Returns:
        Request or Response: Result of the metadata modification.

    Raises:
        AuthenticationError: Authentication failed.
        ItemLocateError: The item could not be located.
        ValueError: The metadata format is invalid.
    """

Item Metadata Access

Access and refresh item metadata through Item objects.

class Item:
    def modify_metadata(
        self,
        metadata,
        target=None,
        append=False,
        append_list=False,
        priority=0,
        access_key=None,
        secret_key=None,
        debug=False,
        request_kwargs=None
    ):
        """Change this item's metadata.

        Accepts the same parameters as the module-level modify_metadata
        function, minus the item identifier.

        Returns:
            Request or Response: Result of the metadata modification.
        """

    def refresh(self, item_metadata=None, **kwargs):
        """Reload this item's metadata from Archive.org.

        Args:
            item_metadata (dict, optional): Metadata to use as-is instead of
                fetching it.
            **kwargs: Extra arguments forwarded to get_metadata.

        Note:
            The item's metadata property is updated with the fresh data from
            Archive.org.
        """

    @property
    def metadata(self):
        """dict: The complete metadata dictionary of the item.

        Keys include:
            - 'identifier': item identifier
            - 'title': item title
            - 'creator': creator/author information
            - 'description': item description
            - 'date': creation/publication date
            - 'subject': subject tags/keywords
            - 'collection': collections the item belongs to
            - 'mediatype': media type (texts, movies, audio, etc.)
            - 'uploader': user who uploaded the item
            - 'addeddate': when the item was added to Archive.org
            - 'publicdate': when the item became publicly available
            - plus many other Archive.org specific fields
        """

Session Metadata Operations

Retrieve metadata through ArchiveSession objects.

class ArchiveSession:
    def get_metadata(self, identifier, request_kwargs=None):
        """Fetch an item's metadata from the Archive.org API.

        Args:
            identifier (str): Identifier of the Archive.org item.
            request_kwargs (dict, optional): Extra arguments for the request,
                such as:
                - 'timeout': request timeout in seconds
                - 'headers': additional HTTP headers

        Returns:
            dict: The item metadata dictionary returned by the API.

        Raises:
            ItemLocateError: The item could not be located.
            requests.RequestException: The API request failed.
        """

Metadata Field Reference

Standard Metadata Fields

Common metadata fields supported by Archive.org:

# Core descriptive fields (field name -> type of its value)
metadata_fields = dict(
    title=str,        # Item title
    creator=str,      # Creator/author name
    description=str,  # Item description
    date=str,         # Creation/publication date (YYYY-MM-DD)
    subject=list,     # Subject tags/keywords
    collection=list,  # Collection identifiers
    mediatype=str,    # Media type (texts, movies, audio, etc.)
    language=str,     # Language code (eng, fra, etc.)
    publisher=str,    # Publisher name
    contributor=str,  # Contributors
    coverage=str,     # Geographic/temporal coverage
    rights=str,       # Rights/license information
    source=str,       # Source information
    relation=str,     # Related items
    format=str,       # Physical format
    type=str,         # Resource type
)

# Archive.org specific fields; every one of them holds a str value
archive_fields = dict.fromkeys(
    (
        'identifier',   # Unique item identifier (read-only)
        'uploader',     # Username of uploader (read-only)
        'addeddate',    # Date added to archive (read-only)
        'publicdate',   # Date made public (read-only)
        'updatedate',   # Last update date (read-only)
        'scanner',      # Scanning equipment used
        'sponsor',      # Digitization sponsor
        'contributor',  # Additional contributors
        'call_number',  # Library call number
        'isbn',         # ISBN for books
        'oclc',         # OCLC number
        'lccn',         # Library of Congress Control Number
    ),
    str,
)

Usage Examples

Basic Metadata Modification

import internetarchive

# Replace several descriptive fields of one item in a single call
updated_fields = {
    'title': 'Updated Title',
    'creator': 'New Author Name',
    'description': 'Updated description of the item',
    'subject': ['keyword1', 'keyword2', 'new-topic'],
}
internetarchive.modify_metadata('my-item-identifier', metadata=updated_fields)

Append to Existing Metadata

import internetarchive

# Append to existing subjects without replacing
# (append_list=True adds the values to the list field instead of overwriting it)
internetarchive.modify_metadata(
    'my-item-identifier',
    metadata={
        'subject': ['additional-keyword', 'another-topic']
    },
    append_list=True
)

# Append to description
# The value starts with two real newline characters ('\n\n') so the appended
# text begins after a blank line; a doubled backslash here would instead add
# the literal characters "\n" to the description.
internetarchive.modify_metadata(
    'my-item-identifier',
    metadata={
        'description': '\n\nAdditional information appended to existing description.'
    },
    append=True
)

Collection Management

import internetarchive

item_id = 'my-item-identifier'

# Put the item into the listed collections
internetarchive.modify_metadata(
    item_id,
    metadata={'collection': ['opensource', 'community']},
)

# Add one more collection to the ones the item already has (append)
internetarchive.modify_metadata(
    item_id,
    metadata={'collection': ['new-collection']},
    append_list=True,
)

File-Specific Metadata

import internetarchive

# Change the metadata of a single file by targeting 'files/<filename>'
chapter_fields = {
    'title': 'Chapter 1: Introduction',
    'creator': 'Specific Author',
}
internetarchive.modify_metadata(
    'my-item-identifier',
    metadata=chapter_fields,
    target='files/chapter1.pdf',
)

Priority and Authentication

import internetarchive

# Submit the update at a raised task priority, passing explicit IA-S3 keys
# instead of relying on the configured ones
critical_changes = {
    'title': 'Critical Update',
    'description': 'Updated with high priority',
}
internetarchive.modify_metadata(
    'important-item',
    metadata=critical_changes,
    priority=5,  # Higher priority
    access_key='your-access-key',
    secret_key='your-secret-key',
)

Working with Item Objects

import internetarchive

# Get item and modify metadata
item = internetarchive.get_item('my-item-identifier')

# Check current metadata
print(f"Current title: {item.metadata.get('title')}")
print(f"Current creator: {item.metadata.get('creator')}")

# Update metadata
item.modify_metadata({
    'title': 'New Title',
    'description': 'Updated description'
})

# Refresh to get updated metadata
item.refresh()
print(f"Updated title: {item.metadata.get('title')}")

Batch Metadata Operations

import internetarchive

# Update multiple items with similar metadata
items_to_update = ['item1', 'item2', 'item3']
common_metadata = {
    'creator': 'Updated Author',
    'subject': ['batch-update', 'corrected-metadata']
}

for identifier in items_to_update:
    try:
        internetarchive.modify_metadata(
            identifier,
            metadata=common_metadata,
            priority=1
        )
        print(f"Updated {identifier}")
    except Exception as e:
        print(f"Failed to update {identifier}: {e}")

Metadata Validation and Cleanup

import internetarchive

# Get item metadata for analysis
item = internetarchive.get_item('example-item')
metadata = item.metadata

# Clean up and standardize metadata
cleanup_metadata = {}

# Standardize date format
if 'date' in metadata:
    date_str = metadata['date']
    # Convert various date formats to YYYY-MM-DD
    if len(date_str) == 4:  # Year only
        cleanup_metadata['date'] = f"{date_str}-01-01"

# Ensure subjects are properly formatted
if 'subject' in metadata:
    subjects = metadata['subject']
    if isinstance(subjects, str):
        # Convert single string to list
        cleanup_metadata['subject'] = [subjects]
    else:
        # Clean up subject list
        cleanup_metadata['subject'] = [s.strip().lower() for s in subjects if s.strip()]

# Apply cleanup if needed
if cleanup_metadata:
    item.modify_metadata(cleanup_metadata)
    print(f"Applied metadata cleanup: {cleanup_metadata}")

Metadata Field Analysis

import internetarchive
from collections import Counter

# Analyze metadata across multiple items
search = internetarchive.search_items(
    'collection:opensource',
    fields=['identifier', 'title', 'creator', 'subject']
)

# Collect metadata statistics
all_subjects = []
creator_count = Counter()

for result in search:
    # Count creators
    if 'creator' in result:
        creator_count[result['creator']] += 1

    # Collect all subjects
    if 'subject' in result:
        subjects = result['subject']
        if isinstance(subjects, list):
            all_subjects.extend(subjects)
        else:
            all_subjects.append(subjects)

# Analysis results
print("Top 10 creators:")
for creator, count in creator_count.most_common(10):
    print(f"  {creator}: {count} items")

print("\nTop 10 subjects:")
subject_count = Counter(all_subjects)
for subject, count in subject_count.most_common(10):
    print(f"  {subject}: {count} items")

Install with Tessl CLI