A Python interface to archive.org for programmatic access to the Internet Archive's digital library
—
Metadata operations enable viewing and modifying Archive.org item metadata with support for various update strategies including appending, targeting specific sections, and batch operations.
Modify item metadata with flexible update strategies and priority control.
def modify_metadata(identifier, metadata, target=None, append=False, append_list=False, priority=0, access_key=None, secret_key=None, debug=False, request_kwargs=None, **get_item_kwargs):
"""
Modify metadata of an existing Archive.org item.
Args:
identifier (str): Item identifier
metadata (dict): Metadata changes to apply with keys:
- Standard fields: 'title', 'creator', 'description', 'date', 'subject'
- Collection: 'collection' (str or list of collection identifiers)
- Custom fields: Any valid metadata field name
target (str, optional): Target specific metadata section:
- 'metadata' (default): Modify main metadata
- 'collection': Modify collection membership only
- 'files/<filename>': Modify specific file metadata
append (bool): Append values to existing metadata fields instead of replacing
append_list (bool): Append to metadata list fields (like 'subject')
priority (int): Task priority for metadata update (-5 to 10, higher = more priority)
access_key (str, optional): IA-S3 access key (overrides config)
secret_key (str, optional): IA-S3 secret key (overrides config)
debug (bool): Enable debug logging
request_kwargs (dict, optional): Additional request arguments
**get_item_kwargs: Additional arguments passed to get_item
Returns:
Request or Response: Metadata modification result
Raises:
AuthenticationError: If authentication fails
ItemLocateError: If item cannot be located
ValueError: If metadata format is invalid
"""

Access and refresh item metadata through Item objects.
class Item:
def modify_metadata(self, metadata, target=None, append=False, append_list=False, priority=0, access_key=None, secret_key=None, debug=False, request_kwargs=None):
"""
Modify metadata of this item using the same parameters as the module function.
Returns:
Request or Response: Metadata modification result
"""
def refresh(self, item_metadata=None, **kwargs):
"""
Refresh item metadata from Archive.org.
Args:
item_metadata (dict, optional): Use specific metadata instead of fetching
**kwargs: Additional arguments passed to get_metadata
Note:
Updates the item's metadata property with fresh data from Archive.org
"""
@property
def metadata(self):
"""
dict: Complete item metadata dictionary containing:
- 'identifier': Item identifier
- 'title': Item title
- 'creator': Creator/author information
- 'description': Item description
- 'date': Creation/publication date
- 'subject': Subject tags/keywords
- 'collection': Collections this item belongs to
- 'mediatype': Media type (texts, movies, audio, etc.)
- 'uploader': User who uploaded the item
- 'addeddate': When item was added to Archive.org
- 'publicdate': When item became publicly available
- And many other Archive.org specific fields
"""

Retrieve metadata through ArchiveSession objects.
class ArchiveSession:
def get_metadata(self, identifier, request_kwargs=None):
"""
Get item metadata from Archive.org API.
Args:
identifier (str): Archive.org item identifier
request_kwargs (dict, optional): Additional request arguments:
- 'timeout': Request timeout in seconds
- 'headers': Additional HTTP headers
Returns:
dict: Item metadata dictionary from API
Raises:
ItemLocateError: If item cannot be located
requests.RequestException: If API request fails
"""

Common metadata fields supported by Archive.org:
# Core descriptive fields
metadata_fields = {
'title': str, # Item title
'creator': str, # Creator/author name
'description': str, # Item description
'date': str, # Creation/publication date (YYYY-MM-DD)
'subject': list, # Subject tags/keywords
'collection': list, # Collection identifiers
'mediatype': str, # Media type (texts, movies, audio, etc.)
'language': str, # Language code (eng, fra, etc.)
'publisher': str, # Publisher name
'contributor': str, # Contributors
'coverage': str, # Geographic/temporal coverage
'rights': str, # Rights/license information
'source': str, # Source information
'relation': str, # Related items
'format': str, # Physical format
'type': str, # Resource type
}
# Archive.org specific fields
archive_fields = {
'identifier': str, # Unique item identifier (read-only)
'uploader': str, # Username of uploader (read-only)
'addeddate': str, # Date added to archive (read-only)
'publicdate': str, # Date made public (read-only)
'updatedate': str, # Last update date (read-only)
'scanner': str, # Scanning equipment used
'sponsor': str, # Digitization sponsor
'contributor': str, # Additional contributors
'call_number': str, # Library call number
'isbn': str, # ISBN for books
'oclc': str, # OCLC number
'lccn': str, # Library of Congress Control Number
}

import internetarchive
# Update basic metadata
internetarchive.modify_metadata(
'my-item-identifier',
metadata={
'title': 'Updated Title',
'creator': 'New Author Name',
'description': 'Updated description of the item',
'subject': ['keyword1', 'keyword2', 'new-topic']
}
)

import internetarchive
# Append to existing subjects without replacing
internetarchive.modify_metadata(
'my-item-identifier',
metadata={
'subject': ['additional-keyword', 'another-topic']
},
append_list=True
)
# Append to description
internetarchive.modify_metadata(
'my-item-identifier',
metadata={
'description': '\n\nAdditional information appended to existing description.'
},
append=True
)

import internetarchive
# Add item to collections
internetarchive.modify_metadata(
'my-item-identifier',
metadata={
'collection': ['opensource', 'community']
}
)
# Add to existing collections (append)
internetarchive.modify_metadata(
'my-item-identifier',
metadata={
'collection': ['new-collection']
},
append_list=True
)

import internetarchive
# Modify metadata for a specific file
internetarchive.modify_metadata(
'my-item-identifier',
metadata={
'title': 'Chapter 1: Introduction',
'creator': 'Specific Author'
},
target='files/chapter1.pdf'
)

import internetarchive
# High-priority metadata update with specific credentials
internetarchive.modify_metadata(
'important-item',
metadata={
'title': 'Critical Update',
'description': 'Updated with high priority'
},
priority=5, # Higher priority
access_key='your-access-key',
secret_key='your-secret-key'
)

```python
import internetarchive

# Get item and modify metadata
item = internetarchive.get_item('my-item-identifier')

# Check current metadata
print(f"Current title: {item.metadata.get('title')}")
print(f"Current creator: {item.metadata.get('creator')}")

# Update metadata
item.modify_metadata({
    'title': 'New Title',
    'description': 'Updated description'
})

# Refresh to get updated metadata
item.refresh()
print(f"Updated title: {item.metadata.get('title')}")
```

### Batch Metadata Operations

```python
import internetarchive

# Update multiple items with similar metadata
items_to_update = ['item1', 'item2', 'item3']
common_metadata = {
    'creator': 'Updated Author',
    'subject': ['batch-update', 'corrected-metadata']
}

for identifier in items_to_update:
    try:
        internetarchive.modify_metadata(
            identifier,
            metadata=common_metadata,
            priority=1
        )
        print(f"Updated {identifier}")
    except Exception as e:
        print(f"Failed to update {identifier}: {e}")
```

### Metadata Validation and Cleanup

```python
import internetarchive

# Get item metadata for analysis
item = internetarchive.get_item('example-item')
metadata = item.metadata

# Clean up and standardize metadata
cleanup_metadata = {}

# Standardize date format
if 'date' in metadata:
    date_str = metadata['date']
    # Convert various date formats to YYYY-MM-DD
    if len(date_str) == 4:  # Year only
        cleanup_metadata['date'] = f"{date_str}-01-01"

# Ensure subjects are properly formatted
if 'subject' in metadata:
    subjects = metadata['subject']
    if isinstance(subjects, str):
        # Convert single string to list
        cleanup_metadata['subject'] = [subjects]
    else:
        # Clean up subject list
        cleanup_metadata['subject'] = [s.strip().lower() for s in subjects if s.strip()]

# Apply cleanup if needed
if cleanup_metadata:
    item.modify_metadata(cleanup_metadata)
    print(f"Applied metadata cleanup: {cleanup_metadata}")
```

### Metadata Field Analysis

```python
import internetarchive
from collections import Counter

# Analyze metadata across multiple items
search = internetarchive.search_items(
    'collection:opensource',
    fields=['identifier', 'title', 'creator', 'subject']
)

# Collect metadata statistics
all_subjects = []
creator_count = Counter()

for result in search:
    # Count creators
    if 'creator' in result:
        creator_count[result['creator']] += 1

    # Collect all subjects
    if 'subject' in result:
        subjects = result['subject']
        if isinstance(subjects, list):
            all_subjects.extend(subjects)
        else:
            all_subjects.append(subjects)

# Analysis results
print("Top 10 creators:")
for creator, count in creator_count.most_common(10):
    print(f" {creator}: {count} items")

print("\nTop 10 subjects:")
subject_count = Counter(all_subjects)
for subject, count in subject_count.most_common(10):
    print(f" {subject}: {count} items")
```

Install with Tessl CLI
npx tessl i tessl/pypi-internetarchive