Streaming WARC (and ARC) IO library for reading and writing web archive files
Comprehensive functionality for creating and writing WARC files, including record building, header management, compression, and digest calculation. The library provides both streaming writers and in-memory buffer writers for different use cases.
Main class for writing WARC records to files or streams with optional compression.
class WARCWriter:
def __init__(self, filebuf, gzip=True, warc_version=None, header_filter=None):
"""
WARC writer for creating WARC files.
Args:
filebuf: File-like object to write to
gzip (bool): Enable gzip compression (default True)
warc_version (str): WARC version to use (default None, which selects WARC/1.0; pass '1.1' for WARC/1.1)
header_filter (callable): Optional function to filter headers
"""
def write_record(self, record, params=None):
"""
Write a WARC record to the output stream.
Args:
record: ArcWarcRecord to write
params: Optional parameters for writing
"""
def write_request_response_pair(self, req, resp, params=None):
"""
Write a request/response pair with proper linking.
Args:
req: Request record
resp: Response record
params: Optional parameters for writing
"""
WARC writer that writes to an in-memory buffer for testing or temporary storage.
class BufferWARCWriter(WARCWriter):
def __init__(self, gzip=True, warc_version=None, header_filter=None):
"""
WARC writer that writes to in-memory buffer.
Args:
gzip (bool): Enable gzip compression (default True)
warc_version (str): WARC version to use
header_filter (callable): Optional function to filter headers
"""
def get_contents(self):
"""
Get buffer contents as bytes.
Returns:
bytes: Complete WARC file contents
"""
def get_stream(self):
"""
Get buffer as stream positioned at beginning.
Returns:
io.BytesIO: Stream containing WARC data
"""
Factory class for creating various types of WARC records with proper headers and metadata.
class RecordBuilder:
def __init__(self, warc_version=None, header_filter=None):
"""
Builder for creating WARC records.
Args:
warc_version (str): WARC version to use (default None)
header_filter (callable): Optional function to filter headers
"""
def create_warc_record(self, uri, record_type, payload=None, length=None,
warc_content_type='', warc_headers_dict=None,
warc_headers=None, http_headers=None):
"""
Create a general WARC record.
Args:
uri (str): Target URI for the record
record_type (str): WARC record type ('response', 'request', etc.)
payload: Record payload as file-like object or bytes
length (int): Content length (calculated if None)
warc_content_type (str): WARC content type (default '')
warc_headers_dict (dict): Additional WARC headers as dict
warc_headers: Additional WARC headers as StatusAndHeaders
http_headers: HTTP headers as StatusAndHeaders object
Returns:
ArcWarcRecord: Created WARC record
"""
def create_revisit_record(self, uri, digest, refers_to_uri, refers_to_date,
http_headers=None, warc_headers_dict=None):
"""
Create a revisit record that references an earlier record.
Args:
uri (str): Target URI
digest (str): Digest of referenced record
refers_to_uri (str): URI of referenced record
refers_to_date (str): Date of referenced record
http_headers: HTTP headers as StatusAndHeaders object
warc_headers_dict (dict): Additional WARC headers
Returns:
ArcWarcRecord: Created revisit record
"""
def create_warcinfo_record(self, filename, info):
"""
Create a warcinfo record with file metadata.
Args:
filename (str): Name of the WARC file
info (dict or str): Metadata information
Returns:
ArcWarcRecord: Created warcinfo record
"""
def curr_warc_date(self):
"""
Get current date in WARC format.
Returns:
str: Current timestamp in WARC date format
"""
def ensure_digest(self, record, block=True, payload=True):
"""
Ensure record has proper digests calculated.
Args:
record: Record to add digests to
block (bool): Calculate block digest if True
payload (bool): Calculate payload digest if True
"""
# RecordBuilder Constants
REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest'
REVISIT_PROFILE_1_1 = 'http://netpreserve.org/warc/1.1/revisit/identical-payload-digest'
WARC_1_0 = 'WARC/1.0'
WARC_1_1 = 'WARC/1.1'
WARC_VERSION = WARC_1_0
NO_PAYLOAD_DIGEST_TYPES = ('warcinfo', 'revisit')
Base classes and utilities for WARC writing with compression support.
class BaseWARCWriter:
def __init__(self, gzip=True, warc_version=None, header_filter=None):
"""
Base class for WARC writers.
Args:
gzip (bool): Enable gzip compression
warc_version (str): WARC version
header_filter (callable): Header filter function
"""
def write_request_response_pair(self, req, resp, params=None):
"""Write request/response pair with proper linking."""
def write_record(self, record, params=None):
"""Write single record (abstract method)."""
class GzippingWrapper:
def __init__(self, out):
"""
Wrapper that gzip-compresses data on write.
Args:
out: Output stream to write compressed data to
"""
def write(self, buff):
"""
Write and compress data.
Args:
buff (bytes): Data to compress and write
"""
def flush(self):
"""Flush compressed data to output stream."""
from warcio import WARCWriter
from warcio.recordbuilder import RecordBuilder
from warcio.statusandheaders import StatusAndHeaders
import io
# Create a WARC file
output_buffer = io.BytesIO()
writer = WARCWriter(output_buffer)
builder = RecordBuilder()
# Create warcinfo record
warcinfo_record = builder.create_warcinfo_record(
filename='example.warc',
info={'software': 'warcio', 'format': 'WARC File Format 1.1'}
)
writer.write_record(warcinfo_record)
# Create response record
http_headers = StatusAndHeaders('200 OK', [
('Content-Type', 'text/html'),
('Content-Length', '13')
])
response_record = builder.create_warc_record(
uri='http://example.com',
record_type='response',
payload=io.BytesIO(b'Hello, World!'),
http_headers=http_headers
)
writer.write_record(response_record)
# Get the WARC data
warc_data = output_buffer.getvalue()
print(f"Created WARC file of {len(warc_data)} bytes")
from warcio import WARCWriter
from warcio.recordbuilder import RecordBuilder
from warcio.statusandheaders import StatusAndHeaders
import io
output_buffer = io.BytesIO()
writer = WARCWriter(output_buffer)
builder = RecordBuilder()
# Create request record
request_headers = StatusAndHeaders('GET / HTTP/1.1', [
('Host', 'example.com'),
('User-Agent', 'warcio-client/1.0')
], is_http_request=True)
request_record = builder.create_warc_record(
uri='http://example.com/',
record_type='request',
http_headers=request_headers
)
# Create response record
response_headers = StatusAndHeaders('200 OK', [
('Content-Type', 'text/html'),
('Content-Length', '13')
])
response_record = builder.create_warc_record(
uri='http://example.com/',
record_type='response',
payload=io.BytesIO(b'Hello, World!'),
http_headers=response_headers
)
# Write as linked pair
writer.write_request_response_pair(request_record, response_record)
from warcio.warcwriter import BufferWARCWriter
from warcio.recordbuilder import RecordBuilder
from warcio.statusandheaders import StatusAndHeaders
import io
# Use buffer writer for in-memory operations
writer = BufferWARCWriter()
builder = RecordBuilder()
# Create and write record
record = builder.create_warc_record(
uri='http://example.com',
record_type='response',
payload=io.BytesIO(b'Hello, World!'),
http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')])
)
writer.write_record(record)
# Get contents as bytes
warc_bytes = writer.get_contents()
# Or get as stream for further processing
warc_stream = writer.get_stream()
from warcio.recordbuilder import RecordBuilder
from warcio.statusandheaders import StatusAndHeaders
import io
builder = RecordBuilder()
# Create original response record
original_record = builder.create_warc_record(
uri='http://example.com',
record_type='response',
payload=io.BytesIO(b'Original content'),
http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')])
)
# Get the payload digest from the original record
original_digest = original_record.rec_headers.get_header('WARC-Payload-Digest')
original_date = original_record.rec_headers.get_header('WARC-Date')
# Create revisit record referencing the original
revisit_record = builder.create_revisit_record(
uri='http://example.com',
digest=original_digest,
refers_to_uri='http://example.com',
refers_to_date=original_date,
http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')])
)
from warcio.recordbuilder import RecordBuilder
from warcio.statusandheaders import StatusAndHeaders
import io
builder = RecordBuilder()
# Create record with custom WARC headers
custom_warc_headers = {
'WARC-IP-Address': '192.168.1.1',
'WARC-Block-Digest': 'sha1:AAAAAAAAAAAAAAAAAAAAAAAAAAA=',
'Custom-Header': 'custom-value'
}
record = builder.create_warc_record(
uri='http://example.com',
record_type='response',
payload=io.BytesIO(b'Hello, World!'),
http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')]),
warc_headers_dict=custom_warc_headers
)
# Ensure digests are calculated
builder.ensure_digest(record, block=True, payload=True)
from warcio import WARCWriter
import io
# Create uncompressed WARC file
output_buffer = io.BytesIO()
writer = WARCWriter(output_buffer, gzip=False) # Disable compression
# Write records normally
# ... record creation and writing code ...
Install with Tessl CLI
npx tessl i tessl/pypi-warcio