Streaming WARC (and ARC) IO library for reading and writing web archive files

    npx @tessl/cli install tessl/pypi-warcio@1.7.0

A comprehensive Python library for reading and writing WARC (Web ARChive) and ARC (ARChive) files. warcio provides streaming I/O capabilities with automatic format detection, compression handling, and HTTP traffic capture functionality, serving as the foundation for web archiving and digital preservation workflows.

Installation:

    pip install warcio

Basic import:

    from warcio import StatusAndHeaders, ArchiveIterator, WARCWriter

Individual components:
from warcio.statusandheaders import StatusAndHeaders, StatusAndHeadersParser
from warcio.archiveiterator import ArchiveIterator, WARCIterator, ARCIterator
from warcio.warcwriter import WARCWriter, BufferWARCWriter
from warcio.recordbuilder import RecordBuilder
from warcio.capture_http import capture_http
from warcio.utils import Digester, BUFF_SIZE
from warcio.exceptions import ArchiveLoadFailed
from warcio.indexer import Indexer
from warcio.checker import Checker
from warcio.extractor import Extractor
from warcio.recompressor import Recompressor

Usage example:

from warcio import ArchiveIterator, WARCWriter, StatusAndHeaders
from warcio.recordbuilder import RecordBuilder
from warcio.capture_http import capture_http
import requests
import io
# Reading WARC files
with open('example.warc.gz', 'rb') as stream:
    for record in ArchiveIterator(stream):
        if record.rec_type == 'response':
            print(f"URL: {record.rec_headers.get_header('WARC-Target-URI')}")
            print(f"Status: {record.http_headers.get_statuscode()}")
            print(f"Content-Type: {record.http_headers.get_header('Content-Type')}")
            # Access decompressed content
            content = record.content_stream().read()

# Writing WARC files manually
output_buffer = io.BytesIO()
writer = WARCWriter(output_buffer)
builder = RecordBuilder()

# Create a response record
record = builder.create_warc_record(
    uri='http://example.com',
    record_type='response',
    payload=io.BytesIO(b'Hello, World!'),
    http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')])
)
writer.write_record(record)
# HTTP capture (common usage)
with capture_http('example.warc.gz') as writer:
    requests.get('https://example.com/')  # Automatically captured to WARC

warcio follows a layered architecture designed for streaming processing:
This design enables efficient processing of large archive files without loading entire contents into memory, supporting both WARC 1.0/1.1 and legacy ARC formats.
Core functionality for reading and iterating through WARC and ARC files with automatic format detection, decompression, and record parsing.
class ArchiveIterator:
    """Iterate over records in a WARC/ARC stream with automatic format
    detection and decompression (signature stubs only)."""
    def __init__(self, fileobj, no_record_parse=False, verify_http=False,
                 arc2warc=False, ensure_http_headers=False,
                 block_size=16384, check_digests=False): ...
    def __iter__(self): ...
    def __next__(self): ...
    def close(self): ...
    def get_record_offset(self): ...
    def get_record_length(self): ...
class WARCIterator(ArchiveIterator):
    """ArchiveIterator restricted to WARC-format input (stub)."""
    def __init__(self, *args, **kwargs): ...
class ARCIterator(ArchiveIterator):
    def __init__(self, *args, **kwargs): ...

Functionality for creating and writing WARC files, including record building, header management, and compression.
class WARCWriter:
    """Write WARC records to a file-like object, gzip-compressed by default
    (signature stubs only)."""
    def __init__(self, filebuf, gzip=True, warc_version=None, header_filter=None): ...
    def write_record(self, record, params=None): ...
    def write_request_response_pair(self, req, resp, params=None): ...
class BufferWARCWriter(WARCWriter):
    """WARCWriter that writes into an internal buffer instead of a caller-supplied
    file object (signature stubs only)."""
    def __init__(self, gzip=True, warc_version=None, header_filter=None): ...
    def get_contents(self): ...
    def get_stream(self): ...
class RecordBuilder:
    """Construct WARC records (responses, revisits, warcinfo) for writing
    (signature stubs only)."""
    def __init__(self, warc_version=None, header_filter=None): ...
    def create_warc_record(self, uri, record_type, payload=None, length=None,
                           warc_content_type='', warc_headers_dict=None,
                           warc_headers=None, http_headers=None): ...
    def create_revisit_record(self, uri, digest, refers_to_uri, refers_to_date,
                              http_headers=None, warc_headers_dict=None): ...
    def create_warcinfo_record(self, filename, info): ...

Comprehensive HTTP header parsing, manipulation, and formatting with support for status lines and case-insensitive access.
class StatusAndHeaders:
    """Hold a status line plus an ordered list of (name, value) header pairs
    (signature stubs only)."""
    def __init__(self, statusline, headers, protocol='', total_len=0,
                 is_http_request=False): ...
    def get_header(self, name, default_value=None): ...
    def add_header(self, name, value): ...
    def replace_header(self, name, value): ...
    def remove_header(self, name): ...
    def get_statuscode(self): ...
class StatusAndHeadersParser:
    """Parse a status line and headers from a stream (signature stubs only)."""
    def __init__(self, statuslist, verify=True): ...
    def parse(self, stream, full_statusline=None): ...

Live HTTP traffic recording capabilities that capture requests and responses directly to WARC format.
def capture_http(warc_writer=None, filter_func=None, append=True,
                 record_ip=True, **kwargs): ...

Advanced stream processing with compression, digest verification, and buffered reading capabilities.
class BufferedReader:
    """Buffered, optionally decompressing reader over an underlying stream
    (signature stubs only)."""
    def __init__(self, stream, block_size=16384, decomp_type=None,
                 starting_data=None, read_all_members=False): ...
    def read(self, length=None): ...
    def readline(self, length=None): ...
class LimitReader:
    """Reader that caps reads from an underlying stream at a byte limit
    (signature stubs only)."""
    def __init__(self, stream, limit): ...
    def read(self, length=None): ...
    def readline(self, length=None): ...
class DigestVerifyingReader:
    def __init__(self, stream, limit, digest_checker, record_type=None,
                 payload_digest=None, block_digest=None, segment_number=None): ...

Comprehensive time handling for web archive timestamps with support for multiple date formats and timezone handling.
# Date/time conversion helpers for archive timestamps (signature stubs only).
def iso_date_to_datetime(string, tz_aware=False): ...  # ISO-8601 string -> datetime
def http_date_to_datetime(string, tz_aware=False): ...  # HTTP date string -> datetime
def datetime_to_http_date(the_datetime): ...  # datetime -> HTTP date string
def datetime_to_iso_date(the_datetime, use_micros=False): ...  # datetime -> ISO-8601 string
def timestamp_now(): ...  # current time as a timestamp string (format not shown here -- confirm)
def timestamp_to_datetime(string, tz_aware=False): ...

Built-in command line utilities for indexing, checking, extracting, and recompressing WARC/ARC files.
class Indexer:
    """CLI helper that indexes records from input archives to an output
    (signature stubs only)."""
    def __init__(self, fields, inputs, output, verify_http=False): ...
    def process_all(self): ...
class Checker:
    """CLI helper that validates WARC/ARC files (signature stubs only)."""
    def __init__(self, cmd): ...
    def process_all(self): ...
class Extractor:
    """CLI helper that extracts a single record at a byte offset
    (signature stubs only)."""
    def __init__(self, filename, offset): ...
    def extract(self, payload_only, headers_only): ...
class Recompressor:
    """CLI helper that recompresses an archive file (signature stubs only)."""
    def __init__(self, filename, output, verbose=False): ...
    def recompress(self): ...


class ArcWarcRecord:
    """Represents a parsed WARC/ARC record (signature stubs only)."""
    # NOTE(review): parameter name `format` shadows the builtin; kept to
    # preserve the documented interface.
    def __init__(self, format, rec_type, rec_headers, raw_stream,
                 http_headers=None, content_type=None, length=None,
                 payload_length=-1, digest_checker=None): ...
    def content_stream(self): ...
class Digester:
    """Hash digest calculator (signature stubs only)."""
    def __init__(self, type_='sha1'): ...
    def update(self, buff): ...
    def __str__(self): ...
class DigestChecker:
    """Digest validation checker (signature stubs only)."""
    def __init__(self, kind=None): ...
    @property
    def passed(self): ...
    @property
    def problems(self): ...
# Exception Classes
class ArchiveLoadFailed(Exception):
    """Exception for archive loading failures."""
    def __init__(self, reason): ...


class ChunkedDataException(Exception):
    """Exception for chunked data parsing errors."""
    def __init__(self, msg, data=b''): ...


class StatusAndHeadersParserException(Exception):
    """Exception for status/headers parsing errors."""
    def __init__(self, msg, statusline): ...
# Constants
BUFF_SIZE = 16384  # default buffer size in bytes (16 KiB); matches the block_size defaults above