Streaming WARC (and ARC) IO library for reading and writing web archive files.
This module provides live HTTP traffic recording: it captures requests and responses directly to WARC format by monkey-patching Python's http.client library, enabling transparent recording of HTTP traffic from existing applications and libraries.
The capture_http context manager enables live HTTP traffic recording to WARC files, with support for filtering and customization.
def capture_http(warc_writer=None, filter_func=None, append=True,
                 record_ip=True, **kwargs):
    """
    Context manager for capturing HTTP traffic to WARC format.

    Args:
        warc_writer: WARCWriter instance to write records to
            (creates a BufferWARCWriter if None).
        filter_func (callable): Optional function to filter which
            requests/responses to record.
        append (bool): Whether to append to existing WARC writer
            (default True).
        record_ip (bool): Whether to record IP addresses in WARC
            headers (default True).
        **kwargs: Additional arguments passed to WARCWriter if created.

    Returns:
        Context manager that yields the warc_writer instance.

    Example:
        with capture_http() as writer:
            # HTTP requests made here will be recorded
            response = requests.get('http://example.com')
    """


# Internal classes used by capture_http for advanced customization and
# direct access to recording functionality.
class RequestRecorder:
    """Records HTTP requests and responses to a WARC writer."""

    def __init__(self, writer, filter_func=None, record_ip=True):
        """
        Args:
            writer: WARCWriter instance to write records to.
            filter_func (callable): Optional filter function for
                requests/responses.
            record_ip (bool): Whether to record IP addresses.
        """

    def start_tunnel(self):
        """Start HTTP tunnel recording (for the CONNECT method)."""

    def start(self):
        """Start a recording session."""

    def set_remote_ip(self, remote_ip):
        """
        Set the remote IP address for the current connection.

        Args:
            remote_ip (str): IP address to record.
        """

    def write_request(self, buff):
        """
        Write request data to the buffer.

        Args:
            buff (bytes): Request data to write.
        """

    def write_response(self, buff):
        """
        Write response data to the buffer.

        Args:
            buff (bytes): Response data to write.
        """

    def done(self):
        """Complete recording and write the WARC records."""
class RecordingHTTPConnection:
    """HTTP connection that records its traffic to WARC."""

    def __init__(self, *args, **kwargs):
        """HTTP connection that records traffic to WARC."""

    def send(self, data):
        """
        Send data while recording.

        Args:
            data (bytes): Data to send.
        """

    def putrequest(self, *args, **kwargs):
        """Send the HTTP request while recording."""
class RecordingHTTPResponse:
    """HTTP response that records data as it is read."""

    def __init__(self, recorder, *args, **kwargs):
        """
        Args:
            recorder: RequestRecorder instance used to record the data.
        """
class RecordingStream:
    """Stream wrapper that records data as it is read."""

    def __init__(self, fp, recorder):
        """
        Args:
            fp: File-like object to wrap.
            recorder: RequestRecorder instance.
        """

    def read(self, amt=None):
        """
        Read and record data.

        Args:
            amt (int): Amount to read.

        Returns:
            bytes: Data read from the stream.
        """

    def readinto(self, buff):
        """
        Read into a buffer and record the data.

        Args:
            buff: Buffer to read into.

        Returns:
            int: Number of bytes read.
        """

    def readline(self, maxlen=-1):
        """
        Read a line and record the data.

        Args:
            maxlen (int): Maximum line length.

        Returns:
            bytes: Line data.
        """
# Example: capture HTTP traffic to an in-memory buffer.
from warcio.capture_http import capture_http

import requests

# Capture HTTP traffic to a buffer
with capture_http() as writer:
    # Make HTTP requests - they will be automatically recorded
    response1 = requests.get('http://example.com')
    response2 = requests.post('http://httpbin.org/post', data={'key': 'value'})
    print(f"Response 1 status: {response1.status_code}")
    print(f"Response 2 status: {response2.status_code}")

# Get captured WARC data
warc_data = writer.get_contents()
print(f"Captured {len(warc_data)} bytes of WARC data")

# Save to file
with open('captured.warc.gz', 'wb') as f:
    f.write(warc_data)
# Example: capture HTTP traffic directly to a WARC file.
from warcio.capture_http import capture_http
from warcio import WARCWriter

import requests

# Capture directly to file
with open('live_capture.warc.gz', 'wb') as output_file:
    writer = WARCWriter(output_file)
    with capture_http(warc_writer=writer) as writer:
        # Make requests that will be written directly to file
        requests.get('http://example.com')
        requests.get('http://httpbin.org/get')

print("HTTP traffic saved to live_capture.warc.gz")
# Example: capture with a filter function controlling what is recorded.
from warcio.capture_http import capture_http

import requests


def should_record(request_data):
    """
    Filter function to control which requests/responses are recorded.

    Args:
        request_data: Dictionary containing request information.

    Returns:
        bool: True to record, False to skip.
    """
    url = request_data.get('url', '')
    # Skip requests to certain paths (checked first so the favicon is
    # skipped even on an allowed domain)
    if '/favicon.ico' in url:
        return False
    # Only record requests to specific domains; everything else is skipped
    return 'example.com' in url or 'httpbin.org' in url


# Capture with filtering
with capture_http(filter_func=should_record) as writer:
    requests.get('http://example.com')              # Will be recorded
    requests.get('http://example.com/favicon.ico')  # Will be skipped
    requests.get('http://other-domain.com')         # Will be skipped
    requests.get('http://httpbin.org/get')          # Will be recorded

warc_data = writer.get_contents()
print(f"Filtered capture: {len(warc_data)} bytes")
# Example: capture with IP address recording and read the result back.
from warcio.capture_http import capture_http
from warcio import ArchiveIterator

import io
import requests

# Capture with IP address recording enabled (default)
with capture_http(record_ip=True) as writer:
    requests.get('http://example.com')

# Check the WARC data for IP addresses
warc_data = writer.get_contents()

# Read the WARC data back to see the IP addresses in the headers
for record in ArchiveIterator(io.BytesIO(warc_data)):
    ip_address = record.rec_headers.get_header('WARC-IP-Address')
    if ip_address:
        print(f"Recorded IP: {ip_address}")
# Example: capture requests made through a persistent requests.Session.
from warcio.capture_http import capture_http

import requests

# Create a session for persistent connections
session = requests.Session()
session.headers.update({'User-Agent': 'WARC-Capture-Bot/1.0'})

with capture_http() as writer:
    # Use session for multiple requests
    response1 = session.get('http://httpbin.org/cookies/set/session/abc123')
    response2 = session.get('http://httpbin.org/cookies')  # Will include session cookie
    # Regular requests also captured
    response3 = requests.get('http://example.com')

print("Session-based requests captured")
# Example: capture requests made through urllib.
from warcio.capture_http import capture_http

import urllib.request
import urllib.parse

# Capture urllib requests
with capture_http() as writer:
    # urllib.request automatically uses http.client under the hood
    with urllib.request.urlopen('http://example.com') as response:
        content = response.read()
        print(f"Read {len(content)} bytes")

    # POST request with urllib
    data = urllib.parse.urlencode({'key': 'value'}).encode('utf-8')
    req = urllib.request.Request('http://httpbin.org/post', data=data)
    with urllib.request.urlopen(req) as response:
        result = response.read()

warc_data = writer.get_contents()
print(f"urllib requests captured: {len(warc_data)} bytes")
# Example: advanced filtering on method, headers, and domain.
from warcio.capture_http import capture_http

import requests


def advanced_filter(request_data):
    """
    Advanced filter with access to more request details.

    Available keys in request_data:
        - 'url': Request URL
        - 'method': HTTP method (GET, POST, etc.)
        - 'headers': Request headers dict
    """
    method = request_data.get('method', 'GET')
    url = request_data.get('url', '')
    headers = request_data.get('headers', {})

    # Only record GET requests
    if method != 'GET':
        return False

    # Skip requests with certain user agents
    user_agent = headers.get('User-Agent', '')
    if 'bot' in user_agent.lower():
        return False

    # Only record specific domains
    allowed_domains = ['example.com', 'httpbin.org']
    if not any(domain in url for domain in allowed_domains):
        return False

    return True


with capture_http(filter_func=advanced_filter) as writer:
    # This will be recorded (GET to allowed domain)
    requests.get('http://example.com')
    # This will be skipped (POST request)
    requests.post('http://example.com', data={'test': 'data'})
    # This will be skipped (bot user agent)
    requests.get('http://example.com', headers={'User-Agent': 'TestBot/1.0'})
# Example: append newly captured requests to an existing WARC file.
from warcio.capture_http import capture_http
from warcio import WARCWriter

import requests

# Open existing WARC file for appending
with open('existing.warc.gz', 'ab') as output_file:  # Note: 'ab' for append
    writer = WARCWriter(output_file)
    # Capture new requests and append to existing file
    with capture_http(warc_writer=writer, append=True) as writer:
        requests.get('http://example.com/new-page')

print("New requests appended to existing WARC file")
# Example: failed request attempts are captured alongside successes.
from warcio.capture_http import capture_http

import requests

with capture_http() as writer:
    try:
        # Even failed requests are captured
        response = requests.get('http://nonexistent-domain.invalid', timeout=5)
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        # The failed request attempt is still recorded in the WARC

    # Successful requests continue to be captured
    response = requests.get('http://example.com')

# Both successful and failed request attempts are in the WARC
warc_data = writer.get_contents()
print(f"Captured data including failed requests: {len(warc_data)} bytes")
Install with the Tessl CLI: npx tessl i tessl/pypi-warcio