Streaming WARC (and ARC) IO library for reading and writing web archive files
Core functionality for reading and iterating through WARC and ARC files with automatic format detection, decompression, and record parsing. The library provides streaming access to archive records without loading entire files into memory.
The main interface for reading WARC and ARC files, providing iterator access to records with automatic format detection and decompression support.
class ArchiveIterator:
def __init__(self, fileobj, no_record_parse=False, verify_http=False,
arc2warc=False, ensure_http_headers=False,
block_size=16384, check_digests=False):
"""
Iterator over records in WARC and ARC files.
Args:
fileobj: File-like object to read from
no_record_parse (bool): Skip record parsing if True
verify_http (bool): Verify HTTP headers if True
arc2warc (bool): Convert ARC records to WARC format if True
ensure_http_headers (bool): Ensure HTTP headers are present if True
block_size (int): Buffer size for reading (default 16384)
check_digests (bool): Verify record digests if True
"""
def __iter__(self):
"""Return iterator."""
def __next__(self):
"""Get next record."""
def close(self):
"""Close iterator and cleanup resources."""
def get_record_offset(self):
"""
Get current record file offset.
Returns:
int: Byte offset of current record in file
"""
def get_record_length(self):
"""
Get current record length.
Returns:
int: Length of current record in bytes
"""
def read_to_end(self, record=None):
"""
Read remainder of the stream.
Args:
record: Optional record to read (uses current record if None)
"""

Specialized iterators for specific archive formats when you know the format in advance.
class WARCIterator(ArchiveIterator):
def __init__(self, *args, **kwargs):
"""ArchiveIterator specialized for WARC format."""
class ARCIterator(ArchiveIterator):
def __init__(self, *args, **kwargs):
"""ArchiveIterator specialized for ARC format."""

Utility class for handling non-seekable streams that need position tracking.
class UnseekableYetTellable:
def __init__(self, fh):
"""
Wrapper for streams that can't seek but need tell() functionality.
Args:
fh: File handle to wrap
"""
def tell(self):
"""
Return current offset.
Returns:
int: Current position in stream
"""
def read(self, size=-1):
"""
Read data and track offset.
Args:
size (int): Number of bytes to read (-1 for all)
Returns:
bytes: Data read from stream
"""

from warcio import ArchiveIterator
# Read from a WARC file
with open('example.warc.gz', 'rb') as stream:
for record in ArchiveIterator(stream):
print(f"Record Type: {record.rec_type}")
print(f"URI: {record.rec_headers.get_header('WARC-Target-URI')}")
if record.http_headers:
print(f"HTTP Status: {record.http_headers.get_statuscode()}")
print(f"Content-Type: {record.http_headers.get_header('Content-Type')}")
# Read record content
content = record.content_stream().read()
print(f"Content Length: {len(content)}")
print("---")

from warcio import ArchiveIterator
# Read with digest verification and HTTP header checking
with open('example.warc.gz', 'rb') as stream:
iterator = ArchiveIterator(
stream,
verify_http=True, # Verify HTTP headers
check_digests=True, # Verify record digests
ensure_http_headers=True # Ensure HTTP headers are present
)
for record in iterator:
# Get record position information
offset = iterator.get_record_offset()
length = iterator.get_record_length()
print(f"Record at offset {offset}, length {length}")
# Process record
if record.rec_type == 'response':
print(f"Response from: {record.rec_headers.get_header('WARC-Target-URI')}")
iterator.close()

from warcio import ArchiveIterator
# Read ARC file and convert records to WARC format
with open('example.arc.gz', 'rb') as stream:
for record in ArchiveIterator(stream, arc2warc=True):
# Record is now in WARC format even if source was ARC
print(f"WARC Record Type: {record.rec_type}")
print(f"WARC-Date: {record.rec_headers.get_header('WARC-Date')}")

from warcio.archiveiterator import WARCIterator, ARCIterator
# Use format-specific iterator when format is known
with open('example.warc.gz', 'rb') as stream:
for record in WARCIterator(stream):
print(f"WARC Record: {record.rec_type}")
# For ARC files
with open('example.arc.gz', 'rb') as stream:
for record in ARCIterator(stream):
print(f"ARC Record: {record.rec_type}")

from warcio import ArchiveIterator
from warcio.archiveiterator import UnseekableYetTellable
import requests
# Read from HTTP stream
response = requests.get('https://example.com/archive.warc.gz', stream=True)
wrapped_stream = UnseekableYetTellable(response.raw)
for record in ArchiveIterator(wrapped_stream):
print(f"Position: {wrapped_stream.tell()}")
print(f"Record: {record.rec_type}")

Install with Tessl CLI
npx tessl i tessl/pypi-warcio