tessl/pypi-warcio

Streaming WARC (and ARC) IO library for reading and writing web archive files

Overview

Eval results

Files

HTTP Capture

Name: tessl/pypi-warcio
Author: tessl

Records live HTTP traffic directly to WARC format by monkey-patching Python's http.client library, so requests and responses made by existing applications and libraries are captured transparently. Because the patch is applied when warcio.capture_http is imported, import it before requests or any other library built on http.client.

Capabilities

HTTP Traffic Capture

Context manager that enables live HTTP traffic recording to WARC files with support for filtering and customization.

def capture_http(warc_writer=None, filter_func=None, append=True,
                 record_ip=True, **kwargs):
    """
    Context manager for capturing HTTP traffic to WARC format.

    Args:
        warc_writer: Destination for captured records. May be a WARCWriter
            instance, a filename string (a WARCWriter is opened on that file
            for the duration of the block), or None, in which case an
            in-memory BufferWARCWriter is created.
        filter_func (callable): Optional hook invoked as
            ``filter_func(request, response, recorder)`` for each captured
            request/response pair. It returns the ``(request, response)``
            records to write (possibly modified or replaced), or
            ``(None, None)`` to skip the pair.
        append (bool): Only meaningful when ``warc_writer`` is a filename.
            True (default) appends to that file; False creates a new file.
        record_ip (bool): Whether to add the ``WARC-IP-Address`` header to
            captured records (default True).
        **kwargs: Additional arguments passed to the writer when
            capture_http creates it itself (filename or None case).

    Returns:
        Context manager that yields the warc_writer instance in use.

    Example:
        with capture_http() as writer:
            # HTTP requests made here will be recorded
            response = requests.get('http://example.com')
    """

Advanced HTTP Capture Classes

These internal classes are used by capture_http. They are listed here for advanced customization and for direct access to the recording functionality.

class RequestRecorder:
    """Collects HTTP request/response data and writes it to a WARC writer."""

    def __init__(self, writer, filter_func=None, record_ip=True):
        """
        Set up a recorder bound to a WARC writer.

        Args:
            writer: WARCWriter instance that receives the records
            filter_func (callable): Optional filter applied to requests/responses
            record_ip (bool): Whether IP addresses should be recorded
        """

    def start_tunnel(self):
        """Begin recording for a tunnelled connection (HTTP CONNECT method)."""

    def start(self):
        """Begin a recording session."""

    def set_remote_ip(self, remote_ip):
        """
        Remember the remote IP address of the current connection.

        Args:
            remote_ip (str): IP address to record
        """

    def write_request(self, buff):
        """
        Add request data to the buffer.

        Args:
            buff (bytes): Request data to write
        """

    def write_response(self, buff):
        """
        Add response data to the buffer.

        Args:
            buff (bytes): Response data to write
        """

    def done(self):
        """Finish recording and write the WARC records."""

class RecordingHTTPConnection:
    """HTTP connection whose traffic is recorded to WARC."""

    def __init__(self, *args, **kwargs):
        """Create the recording connection; arguments are accepted as given."""

    def send(self, data):
        """
        Send data over the connection while recording it.

        Args:
            data (bytes): Data to send
        """

    def putrequest(self, *args, **kwargs):
        """Issue the HTTP request while recording it."""

class RecordingHTTPResponse:
    """HTTP response whose data is recorded as it is read."""

    def __init__(self, recorder, *args, **kwargs):
        """
        Create a response tied to a recorder.

        Args:
            recorder: RequestRecorder instance
        """

class RecordingStream:
    """Wrapper around a stream that records data as it is read."""

    def __init__(self, fp, recorder):
        """
        Wrap a file-like object for recording.

        Args:
            fp: File-like object to wrap
            recorder: RequestRecorder instance
        """

    def read(self, amt=None):
        """
        Read data from the stream and record it.

        Args:
            amt (int): Amount to read

        Returns:
            bytes: Data read from stream
        """

    def readinto(self, buff):
        """
        Read data into the given buffer and record it.

        Args:
            buff: Buffer to read into

        Returns:
            int: Number of bytes read
        """

    def readline(self, maxlen=-1):
        """
        Read one line from the stream and record it.

        Args:
            maxlen (int): Maximum line length

        Returns:
            bytes: Line data
        """

Usage Examples

Basic HTTP Capture

from warcio.capture_http import capture_http
import requests

# Record everything requested inside the block into an in-memory WARC buffer
with capture_http() as writer:
    # Requests issued here are recorded automatically
    response1 = requests.get('http://example.com')
    response2 = requests.post('http://httpbin.org/post', data={'key': 'value'})

    for number, reply in enumerate((response1, response2), start=1):
        print(f"Response {number} status: {reply.status_code}")

# Pull the serialized WARC bytes out of the buffer
warc_data = writer.get_contents()
captured_size = len(warc_data)
print(f"Captured {captured_size} bytes of WARC data")

# Persist the captured bytes
with open('captured.warc.gz', 'wb') as f:
    f.write(warc_data)

Capture to File

from warcio.capture_http import capture_http
from warcio import WARCWriter
import requests

# Stream captured records straight into a file on disk
with open('live_capture.warc.gz', 'wb') as output_file:
    with capture_http(warc_writer=WARCWriter(output_file)) as writer:
        # Each request below is written to the file as it completes
        for target in ('http://example.com', 'http://httpbin.org/get'):
            requests.get(target)

print("HTTP traffic saved to live_capture.warc.gz")

Filtered Capture

from warcio.capture_http import capture_http
import requests

def should_record(request, response, recorder):
    """
    Filter function to control which requests/responses are recorded.

    capture_http invokes the filter with the request record, the response
    record and the recorder, and expects a pair of records back.

    Args:
        request: WARC request record for the captured exchange
        response: WARC response record for the captured exchange
        recorder: RequestRecorder handling the exchange (unused here)

    Returns:
        tuple: (request, response) to record the pair, (None, None) to skip
    """
    skip = (None, None)
    url = request.rec_headers.get_header('WARC-Target-URI') or ''

    # Skip requests to certain paths; checked first so that a favicon on an
    # allowed domain is still excluded
    if '/favicon.ico' in url:
        return skip

    # Only record requests to specific domains
    if 'example.com' in url or 'httpbin.org' in url:
        return request, response

    # Anything else is outside the allow-list
    return skip

# URLs to request while the filter is active
filtered_targets = (
    'http://example.com',              # Will be recorded
    'http://example.com/favicon.ico',  # Will be skipped
    'http://other-domain.com',         # Will be skipped
    'http://httpbin.org/get',          # Will be recorded
)

# Capture with filtering
with capture_http(filter_func=should_record) as writer:
    for target in filtered_targets:
        requests.get(target)

warc_data = writer.get_contents()
filtered_size = len(warc_data)
print(f"Filtered capture: {filtered_size} bytes")

Capture with IP Recording

from warcio.capture_http import capture_http
import requests

# Helpers for reading the captured WARC data back
from warcio import ArchiveIterator
import io

# IP address recording is on by default; it is passed explicitly here
with capture_http(record_ip=True) as writer:
    requests.get('http://example.com')

# Grab the captured WARC bytes
warc_data = writer.get_contents()

# Walk the records and report every recorded IP address header
for record in ArchiveIterator(io.BytesIO(warc_data)):
    ip_address = record.rec_headers.get_header('WARC-IP-Address')
    if not ip_address:
        continue
    print(f"Recorded IP: {ip_address}")

Multiple Session Capture

from warcio.capture_http import capture_http
import requests

# A session keeps connections and cookies across requests
session = requests.Session()
session.headers.update({'User-Agent': 'WARC-Capture-Bot/1.0'})

cookie_base = 'http://httpbin.org/cookies'

with capture_http() as writer:
    # Requests made through the session
    response1 = session.get(cookie_base + '/set/session/abc123')
    response2 = session.get(cookie_base)  # Will include session cookie

    # Requests made outside the session are captured too
    response3 = requests.get('http://example.com')

print("Session-based requests captured")

Working with urllib

from warcio.capture_http import capture_http
import urllib.request
import urllib.parse

# urllib.request sits on top of http.client, so its traffic is captured too
with capture_http() as writer:
    # Plain GET
    with urllib.request.urlopen('http://example.com') as response:
        content = response.read()
        print(f"Read {len(content)} bytes")

    # POST with a form-encoded body
    form_fields = {'key': 'value'}
    data = urllib.parse.urlencode(form_fields).encode('utf-8')
    req = urllib.request.Request('http://httpbin.org/post', data=data)
    with urllib.request.urlopen(req) as response:
        result = response.read()

warc_data = writer.get_contents()
captured_size = len(warc_data)
print(f"urllib requests captured: {captured_size} bytes")

Advanced Filtering with Request Details

from warcio.capture_http import capture_http
import requests

def advanced_filter(request, response, recorder):
    """
    Advanced filter with access to more request details.

    capture_http invokes the filter with the request record, the response
    record and the recorder, and expects a pair of records back.

    Details read from the request record:
    - URL: the 'WARC-Target-URI' WARC header
    - HTTP method (GET, POST, etc.): the parsed request start line
    - Request headers: the parsed HTTP headers

    Args:
        request: WARC request record for the captured exchange
        response: WARC response record for the captured exchange
        recorder: RequestRecorder handling the exchange (unused here)

    Returns:
        tuple: (request, response) to record the pair, (None, None) to skip
    """
    skip = (None, None)
    http_headers = request.http_headers

    # For request records the parsed start line exposes the HTTP method as
    # `protocol` -- NOTE(review): confirm against the warcio version in use
    method = http_headers.protocol or 'GET'

    # Only record GET requests
    if method != 'GET':
        return skip

    # Skip requests with certain user agents
    user_agent = http_headers.get_header('User-Agent') or ''
    if 'bot' in user_agent.lower():
        return skip

    # Only record specific domains
    url = request.rec_headers.get_header('WARC-Target-URI') or ''
    allowed_domains = ['example.com', 'httpbin.org']
    if not any(domain in url for domain in allowed_domains):
        return skip

    return request, response

allowed_site = 'http://example.com'

with capture_http(filter_func=advanced_filter) as writer:
    # Recorded: a GET to an allowed domain
    requests.get(allowed_site)

    # Skipped: the method is POST
    requests.post(allowed_site, data={'test': 'data'})

    # Skipped: the user agent identifies a bot
    requests.get(allowed_site, headers={'User-Agent': 'TestBot/1.0'})

Combining with Existing WARC Files

from warcio.capture_http import capture_http
from warcio import WARCWriter
import requests

# 'ab' opens the existing WARC so new records land after the current content
with open('existing.warc.gz', 'ab') as output_file:
    file_writer = WARCWriter(output_file)

    # Requests captured here are added to the end of the existing file
    with capture_http(warc_writer=file_writer, append=True) as writer:
        requests.get('http://example.com/new-page')

print("New requests appended to existing WARC file")

Error Handling in Capture

from warcio.capture_http import capture_http
import requests

with capture_http() as writer:
    try:
        # This request fails before any HTTP exchange takes place
        # (the host name cannot be resolved)
        response = requests.get('http://nonexistent-domain.invalid', timeout=5)
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        # No response was received, so there is no request/response pair to
        # write for this attempt -- NOTE(review): confirm with the warcio
        # version in use

    # The capture stays active: later successful requests are still recorded
    response = requests.get('http://example.com')

# The WARC holds the exchanges that completed; the failure above does not
# interrupt the capture
warc_data = writer.get_contents()
print(f"Captured data after a failed request: {len(warc_data)} bytes")

Install with Tessl CLI