A Python interface to archive.org for programmatic access to the Internet Archive's digital library
—
Search operations provide powerful querying capabilities for discovering items in the Internet Archive using various search criteria, field selection, sorting, and full-text search options.
Search for items using Archive.org's search syntax with results returned as an iterable Search object.
def search_items(query, fields=None, sorts=None, params=None, full_text_search=False, dsl_fts=False, archive_session=None, config=None, config_file=None, http_adapter_kwargs=None, request_kwargs=None, max_retries=None):
"""
Search for items on Archive.org with advanced filtering options.
Args:
query (str): Search query using Archive.org syntax:
- Basic: 'collection:nasa'
- Field search: 'creator:"Neil Armstrong"'
- Boolean: 'collection:nasa AND mediatype:movies'
- Date ranges: 'date:[1969-01-01 TO 1969-12-31]'
- Wildcards: 'title:apollo*'
fields (list, optional): Metadata fields to return in results:
- Common: ['identifier', 'title', 'creator', 'date', 'description']
- All available fields returned if None
sorts (list, optional): Sort criteria:
- ['downloads desc'] - Most downloaded first
- ['date desc'] - Newest first
- ['titleSorter asc'] - Alphabetical by title
- ['reviewdate desc', 'identifier asc'] - Multiple sorts
params (dict, optional): Additional URL parameters:
- 'rows': int, results per page (default: 25, max: 10000)
- 'page': int, page number (1-based)
- 'cursor': str, cursor for pagination
- 'save': bool, save search for future use
full_text_search (bool): Enable full-text search across item content
dsl_fts (bool): Enable DSL-based full-text search for advanced queries
archive_session (ArchiveSession, optional): Existing session to use
config (dict, optional): Configuration for new session
config_file (str, optional): Config file for new session
http_adapter_kwargs (dict, optional): HTTP adapter arguments
request_kwargs (dict, optional): Additional request arguments
max_retries (int, optional): Maximum retry attempts for failed requests
Returns:
Search: Search object for iterating over results
Raises:
ValueError: If query is invalid
requests.RequestException: If search request fails
"""
class Search:
"""
Represents a search query and provides access to results.
"""
def __init__(self, archive_session, query, fields=None, sorts=None, params=None, full_text_search=None, dsl_fts=None, request_kwargs=None, max_retries=None):
"""
Initialize Search object.
Args:
archive_session (ArchiveSession): Session object
query (str): Search query string
fields (list, optional): Fields to return
sorts (list, optional): Sort criteria
params (dict, optional): URL parameters
full_text_search (bool, optional): Enable full-text search
dsl_fts (bool, optional): Enable DSL full-text search
request_kwargs (dict, optional): Request arguments
max_retries (int, optional): Maximum retries
"""

Access search configuration and result information.
class Search:
@property
def session(self):
"""ArchiveSession: Session object used for this search."""
@property
def query(self):
"""str: Search query string."""
@property
def fields(self):
"""list: Metadata fields being returned."""
@property
def sorts(self):
"""list: Sort criteria applied to results."""
@property
def params(self):
"""dict: URL parameters for the search."""
@property
def fts(self):
"""bool: Whether full-text search is enabled."""
@property
def dsl_fts(self):
"""bool: Whether DSL full-text search is enabled."""
@property
def num_found(self):
"""int: Total number of results found (not just returned)."""

Iterate over search results in different formats.
class Search:
def __iter__(self):
"""
Iterate over search results as dictionaries.
Yields:
dict: Result dictionaries with requested fields
"""
def iter_as_results(self):
"""
Explicitly iterate over search results as dictionaries.
Yields:
dict: Result dictionaries with metadata fields
"""
def iter_as_items(self):
"""
Iterate over search results as Item objects.
Yields:
Item: Item objects for each search result
Note:
Creates Item objects which may trigger additional API calls
for metadata. Use iter_as_results() for better performance
when you only need the search result fields.
"""

import internetarchive
# Search by collection
search = internetarchive.search_items('collection:nasa')
# Search by media type
search = internetarchive.search_items('mediatype:movies')
# Search by creator
search = internetarchive.search_items('creator:"Internet Archive"')
# Search by title with wildcards
search = internetarchive.search_items('title:apollo*')

import internetarchive
# Boolean queries
search = internetarchive.search_items(
'collection:nasa AND mediatype:movies AND date:[1969-01-01 TO 1969-12-31]'
)
# Multiple collections
search = internetarchive.search_items('collection:(nasa OR loc)')
# Exclude results
search = internetarchive.search_items('collection:nasa NOT mediatype:data')
# Full-text search
search = internetarchive.search_items(
'moon landing',
full_text_search=True
)

import internetarchive
# Select specific fields
search = internetarchive.search_items(
'collection:nasa',
fields=['identifier', 'title', 'creator', 'date', 'downloads']
)
# Sort by popularity
search = internetarchive.search_items(
'collection:movies',
sorts=['downloads desc', 'reviewdate desc']
)
# Sort alphabetically
search = internetarchive.search_items(
'collection:books',
sorts=['titleSorter asc']
)

import internetarchive
# Large result sets
search = internetarchive.search_items(
'collection:opensource',
params={'rows': 1000} # Get up to 1000 results per page
)
# Specific page
search = internetarchive.search_items(
'collection:nasa',
params={'page': 5, 'rows': 50}
)
# Using cursor for efficient pagination
search = internetarchive.search_items(
'collection:books',
params={'cursor': 'next_cursor_value'}
)

import internetarchive
# Search for NASA collection items
search = internetarchive.search_items('collection:nasa')
print(f"Found {search.num_found} total results")
# Iterate over first page of results
for result in search:
print(f"ID: {result['identifier']}")
if 'title' in result:
print(f"Title: {result['title']}")
print(f"Downloads: {result.get('downloads', 'N/A')}")
print("---")

import internetarchive
# Search and get Item objects
search = internetarchive.search_items(
'collection:nasa AND mediatype:movies',
fields=['identifier', 'title', 'creator']
)
# Convert results to Item objects for full functionality
for item in search.iter_as_items():
print(f"Processing item: {item.identifier}")
# Access full metadata (triggers API call)
print(f"Full title: {item.metadata.get('title')}")
print(f"File count: {item.files_count}")
# Download first PDF file if available
for file in item.get_files(formats=['pdf']):
file.download()
break

import internetarchive
# Create session for multiple searches
session = internetarchive.get_session()
# Search with session for better performance
search1 = session.search_items(
'collection:movies AND year:2020',
fields=['identifier', 'title', 'year'],
sorts=['downloads desc']
)
search2 = session.search_items(
'creator:"Internet Archive" AND mediatype:texts',
fields=['identifier', 'title', 'creator', 'date']
)
# Process multiple searches
for search in [search1, search2]:
print(f"Query: {search.query}")
print(f"Results: {search.num_found}")
# Get top 10 results
count = 0
for result in search:
print(f" {result['identifier']}: {result.get('title', 'No title')}")
count += 1
if count >= 10:
break
print()

import internetarchive
# Search within document content
search = internetarchive.search_items(
'artificial intelligence machine learning',
full_text_search=True,
fields=['identifier', 'title', 'description']
)
print(f"Full-text search found {search.num_found} documents")
for result in search:
print(f"Document: {result['identifier']}")
print(f"Title: {result.get('title', 'No title')}")
if 'description' in result:
print(f"Description: {result['description'][:200]}...")
print("---")

import internetarchive
# Search specific collections with targeted fields
collections_queries = {
'software': {
'query': 'collection:softwarelibrary',
'fields': ['identifier', 'title', 'creator', 'emulator']
},
'books': {
'query': 'collection:books AND language:eng',
'fields': ['identifier', 'title', 'creator', 'publisher', 'date']
},
'audio': {
'query': 'collection:etree AND year:2023',
'fields': ['identifier', 'title', 'creator', 'date', 'venue']
}
}
for collection_name, config in collections_queries.items():
search = internetarchive.search_items(
config['query'],
fields=config['fields'],
sorts=['downloads desc']
)
print(f"{collection_name.upper()} Collection ({search.num_found} items):")
count = 0
for result in search:
print(f" {result['identifier']}: {result.get('title', 'No title')}")
count += 1
if count >= 5: # Show top 5
break
print()

Install with Tessl CLI:
npx tessl i tessl/pypi-internetarchive