CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-warcio

Streaming WARC (and ARC) IO library for reading and writing web archive files

Overview
Eval results
Files

docs/warc-writing.md

WARC Writing

This page covers creating and writing WARC files: building records, managing headers, gzip compression, and digest calculation. The library provides both a streaming writer (for files or other output streams) and an in-memory buffer writer, so you can pick the one that fits your use case.

Capabilities

WARC Writer

Main class for writing WARC records to files or streams with optional compression.

class WARCWriter:
    def __init__(self, filebuf, gzip=True, warc_version=None, header_filter=None):
        """
        Create a writer that emits WARC records to a file-like object.
        
        Args:
            filebuf: File-like object to write to
            gzip (bool): Enable gzip compression (default True)
            warc_version (str): WARC version to use (default None).
                NOTE(review): None presumably selects the library default,
                which RecordBuilder.WARC_VERSION lists as 'WARC/1.0' (not
                the newest version) -- confirm against the warcio source.
            header_filter (callable): Optional function to filter headers
        """
    
    def write_record(self, record, params=None):
        """
        Write a single WARC record to the output stream.
        
        Args:
            record: ArcWarcRecord to write
            params: Optional parameters for writing
        """
    
    def write_request_response_pair(self, req, resp, params=None):
        """
        Write a request record and its response record as a linked pair.
        
        Args:
            req: Request record
            resp: Response record
            params: Optional parameters for writing
        """

Buffer WARC Writer

WARC writer that writes to an in-memory buffer for testing or temporary storage.

class BufferWARCWriter(WARCWriter):
    def __init__(self, gzip=True, warc_version=None, header_filter=None):
        """
        Create a WARC writer backed by an in-memory buffer.

        Unlike WARCWriter, no output file object is passed in; the written
        data is retrieved afterwards with get_contents() or get_stream().
        
        Args:
            gzip (bool): Enable gzip compression (default True)
            warc_version (str): WARC version to use (default None)
            header_filter (callable): Optional function to filter headers
        """
    
    def get_contents(self):
        """
        Return everything written so far as a single bytes object.
        
        Returns:
            bytes: Complete WARC file contents
        """
    
    def get_stream(self):
        """
        Return the buffer as a stream positioned at the beginning.
        
        Returns:
            io.BytesIO: Stream containing WARC data
        """

Record Builder

Factory class for creating various types of WARC records with proper headers and metadata.

class RecordBuilder:
    def __init__(self, warc_version=None, header_filter=None):
        """
        Create a builder for WARC records.
        
        Args:
            warc_version (str): WARC version to use (default None).
                NOTE(review): None presumably falls back to WARC_VERSION
                below ('WARC/1.0') -- confirm against the warcio source.
            header_filter (callable): Optional function to filter headers
        """
    
    def create_warc_record(self, uri, record_type, payload=None, length=None,
                          warc_content_type='', warc_headers_dict=None, 
                          warc_headers=None, http_headers=None):
        """
        Create a general WARC record of the given type.
        
        Args:
            uri (str): Target URI for the record
            record_type (str): WARC record type ('response', 'request', etc.)
            payload: Record payload as file-like object or bytes
            length (int): Content length (calculated if None)
            warc_content_type (str): WARC content type (default '')
            warc_headers_dict (dict): Additional WARC headers as dict
            warc_headers: Additional WARC headers as StatusAndHeaders
            http_headers: HTTP headers as StatusAndHeaders object
            
        Returns:
            ArcWarcRecord: Created WARC record
        """
    
    def create_revisit_record(self, uri, digest, refers_to_uri, refers_to_date,
                             http_headers=None, warc_headers_dict=None):
        """
        Create a revisit record that references an earlier record.
        
        Args:
            uri (str): Target URI
            digest (str): Digest of referenced record
            refers_to_uri (str): URI of referenced record
            refers_to_date (str): Date of referenced record
            http_headers: HTTP headers as StatusAndHeaders object
            warc_headers_dict (dict): Additional WARC headers
            
        Returns:
            ArcWarcRecord: Created revisit record
        """
    
    def create_warcinfo_record(self, filename, info):
        """
        Create a warcinfo record carrying metadata about the WARC file.
        
        Args:
            filename (str): Name of the WARC file
            info (dict or str): Metadata information
            
        Returns:
            ArcWarcRecord: Created warcinfo record
        """
    
    def curr_warc_date(self):
        """
        Return the current date/time formatted as a WARC date.
        
        Returns:
            str: Current timestamp in WARC date format
        """
    
    def ensure_digest(self, record, block=True, payload=True):
        """
        Ensure the record has the requested digests calculated.
        
        Args:
            record: Record to add digests to
            block (bool): Calculate block digest if True
            payload (bool): Calculate payload digest if True
        """
    
    # RecordBuilder Constants
    # Revisit profile URIs (identical-payload-digest) for WARC 1.0 and 1.1
    REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest'
    REVISIT_PROFILE_1_1 = 'http://netpreserve.org/warc/1.1/revisit/identical-payload-digest'
    # WARC version strings
    WARC_1_0 = 'WARC/1.0'
    WARC_1_1 = 'WARC/1.1'
    # Default version is WARC/1.0, not the newer 1.1
    WARC_VERSION = WARC_1_0
    # Record types listed as having no payload digest
    # NOTE(review): presumably ensure_digest() skips the payload digest for these -- confirm
    NO_PAYLOAD_DIGEST_TYPES = ('warcinfo', 'revisit')

Base Writer and Compression

Base classes and utilities for WARC writing with compression support.

class BaseWARCWriter:
    def __init__(self, gzip=True, warc_version=None, header_filter=None):
        """
        Base class for WARC writers.
        
        Args:
            gzip (bool): Enable gzip compression (default True)
            warc_version (str): WARC version to use (default None)
            header_filter (callable): Optional function to filter headers
        """
    
    def write_request_response_pair(self, req, resp, params=None):
        """
        Write request/response pair with proper linking.

        Args:
            req: Request record
            resp: Response record
            params: Optional parameters for writing
        """
    
    def write_record(self, record, params=None):
        """
        Write single record (abstract method).

        Args:
            record: Record to write
            params: Optional parameters for writing
        """

class GzippingWrapper:
    def __init__(self, out):
        """
        Wrap an output stream so that data written to it is gzip-compressed.
        
        Args:
            out: Output stream to write compressed data to
        """
    
    def write(self, buff):
        """
        Compress the given data and write it to the wrapped stream.
        
        Args:
            buff (bytes): Data to compress and write
        """
    
    def flush(self):
        """Flush compressed data to output stream."""

Usage Examples

Basic WARC File Creation

import io

from warcio import WARCWriter
from warcio.recordbuilder import RecordBuilder
from warcio.statusandheaders import StatusAndHeaders

# Write WARC records into an in-memory buffer (gzip compression is on by default)
output_buffer = io.BytesIO()
writer = WARCWriter(output_buffer)
builder = RecordBuilder()

# Create warcinfo record describing the file.
# No warc_version is passed above, so the default WARC/1.0 applies
# (RecordBuilder.WARC_VERSION); the 'format' field is kept consistent with it,
# and the filename carries .gz because the output is gzip-compressed.
warcinfo_record = builder.create_warcinfo_record(
    filename='example.warc.gz',
    info={'software': 'warcio', 'format': 'WARC File Format 1.0'}
)
writer.write_record(warcinfo_record)

# Create response record.
# protocol= supplies the HTTP version; without it the stored status line
# would be just "200 OK" instead of "HTTP/1.1 200 OK".
http_headers = StatusAndHeaders('200 OK', [
    ('Content-Type', 'text/html'),
    ('Content-Length', '13')
], protocol='HTTP/1.1')

response_record = builder.create_warc_record(
    uri='http://example.com',
    record_type='response',
    payload=io.BytesIO(b'Hello, World!'),
    http_headers=http_headers
)
writer.write_record(response_record)

# Get the WARC data
warc_data = output_buffer.getvalue()
print(f"Created WARC file of {len(warc_data)} bytes")

Request/Response Pair Creation

import io

from warcio import WARCWriter
from warcio.recordbuilder import RecordBuilder
from warcio.statusandheaders import StatusAndHeaders

output_buffer = io.BytesIO()
writer = WARCWriter(output_buffer)
builder = RecordBuilder()

# Create request record.
# is_http_request=True marks the first argument as a full request line.
request_headers = StatusAndHeaders('GET / HTTP/1.1', [
    ('Host', 'example.com'),
    ('User-Agent', 'warcio-client/1.0')
], is_http_request=True)

request_record = builder.create_warc_record(
    uri='http://example.com/',
    record_type='request',
    http_headers=request_headers
)

# Create response record.
# protocol= supplies the HTTP version; without it the stored status line
# would be just "200 OK" instead of "HTTP/1.1 200 OK".
response_headers = StatusAndHeaders('200 OK', [
    ('Content-Type', 'text/html'),
    ('Content-Length', '13')
], protocol='HTTP/1.1')

response_record = builder.create_warc_record(
    uri='http://example.com/',
    record_type='response',
    payload=io.BytesIO(b'Hello, World!'),
    http_headers=response_headers
)

# Write as linked pair
writer.write_request_response_pair(request_record, response_record)

Buffer Writer Usage

import io

from warcio.warcwriter import BufferWARCWriter
from warcio.recordbuilder import RecordBuilder
from warcio.statusandheaders import StatusAndHeaders

# Use buffer writer for in-memory operations (no output file object needed)
writer = BufferWARCWriter()
builder = RecordBuilder()

# Create and write record.
# protocol= supplies the HTTP version; without it the stored status line
# would be just "200 OK" instead of "HTTP/1.1 200 OK".
record = builder.create_warc_record(
    uri='http://example.com',
    record_type='response',
    payload=io.BytesIO(b'Hello, World!'),
    http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')],
                                  protocol='HTTP/1.1')
)
writer.write_record(record)

# Get contents as bytes
warc_bytes = writer.get_contents()

# Or get as stream for further processing
warc_stream = writer.get_stream()

Revisit Record Creation

import io  # required for io.BytesIO below

from warcio.recordbuilder import RecordBuilder
from warcio.statusandheaders import StatusAndHeaders

builder = RecordBuilder()

# Create original response record.
# protocol= supplies the HTTP version; without it the stored status line
# would be just "200 OK" instead of "HTTP/1.1 200 OK".
original_record = builder.create_warc_record(
    uri='http://example.com',
    record_type='response',
    payload=io.BytesIO(b'Original content'),
    http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')],
                                  protocol='HTTP/1.1')
)

# Get the payload digest and capture date from the original record
original_digest = original_record.rec_headers.get_header('WARC-Payload-Digest')
original_date = original_record.rec_headers.get_header('WARC-Date')

# Create revisit record referencing the original
revisit_record = builder.create_revisit_record(
    uri='http://example.com',
    digest=original_digest,
    refers_to_uri='http://example.com',
    refers_to_date=original_date,
    http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')],
                                  protocol='HTTP/1.1')
)

Custom WARC Headers

import io

from warcio.recordbuilder import RecordBuilder
from warcio.statusandheaders import StatusAndHeaders

builder = RecordBuilder()

# Create record with custom WARC headers.
# Do not hard-code WARC-Block-Digest here: a placeholder value would not match
# the record content. Let ensure_digest() below compute it from the real block.
custom_warc_headers = {
    'WARC-IP-Address': '192.168.1.1',
    'Custom-Header': 'custom-value'
}

# protocol= supplies the HTTP version; without it the stored status line
# would be just "200 OK" instead of "HTTP/1.1 200 OK".
record = builder.create_warc_record(
    uri='http://example.com',
    record_type='response',
    payload=io.BytesIO(b'Hello, World!'),
    http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')],
                                  protocol='HTTP/1.1'),
    warc_headers_dict=custom_warc_headers
)

# Ensure block and payload digests are calculated
builder.ensure_digest(record, block=True, payload=True)

Uncompressed WARC Files

from warcio import WARCWriter
import io

# Create uncompressed WARC file (written to an in-memory buffer here)
output_buffer = io.BytesIO()
writer = WARCWriter(output_buffer, gzip=False)  # gzip defaults to True; False disables compression

# Write records normally, e.g. with writer.write_record(record)
# ... record creation and writing code ...

Install with Tessl CLI

npx tessl i tessl/pypi-warcio

docs

archive-reading.md

cli-tools.md

http-capture.md

http-headers.md

index.md

stream-processing.md

time-utilities.md

warc-writing.md

tile.json