Tessl Tile for pypi/warcio@1.7.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

archive-reading.md cli-tools.md http-capture.md http-headers.md index.md stream-processing.md time-utilities.md warc-writing.md

docs/http-capture.md

0
# HTTP Capture
1

2
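warcio can record live HTTP traffic, writing requests and responses directly to WARC format by monkey-patching Python's `http.client` library. This enables transparent recording of HTTP traffic from existing applications and libraries built on `http.client`, such as `requests` and `urllib`. Import `warcio.capture_http` before importing `requests` (as the examples below do) so that the patched connection classes are the ones `requests` picks up.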
3

4
## Capabilities
5

6
### HTTP Traffic Capture
7

8
Context manager that enables live HTTP traffic recording to WARC files with support for filtering and customization.
9

10
```python { .api }
11
def capture_http(warc_writer=None, filter_func=None, append=True, 
12
                record_ip=True, **kwargs):
13
    """
14
    Context manager for capturing HTTP traffic to WARC format.
15
    
16
    Args:
17
        warc_writer: WARCWriter instance to write records to, or a filename (str) to open for writing; if None, a BufferWARCWriter is created
18
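        filter_func (callable): Optional function called as filter_func(request, response, recorder) for each captured exchange; return (request, response) to record the pair, or (None, None) to skip it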
19
        append (bool): When warc_writer is a filename, open that file in append mode rather than creating a new one (default True)
20
        record_ip (bool): Whether to record IP addresses in WARC headers (default True)
21
        **kwargs: Additional arguments passed to the writer that capture_http creates (e.g. the BufferWARCWriter used when warc_writer is None)
22
        
23
    Returns:
24
        Context manager that yields the warc_writer instance
25
        
26
    Example:
27
        with capture_http() as writer:
28
            # HTTP requests made here will be recorded
29
            response = requests.get('http://example.com')
30
    """
31
```
32

33
### Advanced HTTP Capture Classes
34

35
Internal classes used by capture_http for advanced customization and direct access to recording functionality.
36

37
```python { .api }
38
class RequestRecorder:
39
    def __init__(self, writer, filter_func=None, record_ip=True):
40
        """
41
        Records HTTP requests and responses to WARC writer.
42
        
43
        Args:
44
            writer: WARCWriter instance to write records to
45
            filter_func (callable): Optional filter function for requests/responses
46
            record_ip (bool): Whether to record IP addresses
47
        """
48
    
49
    def start_tunnel(self):
50
        """Start HTTP tunnel recording (for CONNECT method)."""
51
    
52
    def start(self):
53
        """Start recording session."""
54
    
55
    def set_remote_ip(self, remote_ip):
56
        """
57
        Set remote IP address for current connection.
58
        
59
        Args:
60
            remote_ip (str): IP address to record
61
        """
62
    
63
    def write_request(self, buff):
64
        """
65
        Write request data to buffer.
66
        
67
        Args:
68
            buff (bytes): Request data to write
69
        """
70
    
71
    def write_response(self, buff):
72
        """
73
        Write response data to buffer.
74
        
75
        Args:
76
            buff (bytes): Response data to write
77
        """
78
    
79
    def done(self):
80
        """Complete recording and write WARC records."""
81

82
class RecordingHTTPConnection:
83
    def __init__(self, *args, **kwargs):
84
        """HTTP connection that records traffic to WARC."""
85
    
86
    def send(self, data):
87
        """
88
        Send data while recording.
89
        
90
        Args:
91
            data (bytes): Data to send
92
        """
93
    
94
    def putrequest(self, *args, **kwargs):
95
        """Send HTTP request while recording."""
96

97
class RecordingHTTPResponse:
98
    def __init__(self, recorder, *args, **kwargs):
99
        """
100
        HTTP response that records data as it's read.
101
        
102
        Args:
103
            recorder: RequestRecorder instance
104
        """
105

106
class RecordingStream:
107
    def __init__(self, fp, recorder):
108
        """
109
        Stream wrapper that records data as it's read.
110
        
111
        Args:
112
            fp: File-like object to wrap
113
            recorder: RequestRecorder instance
114
        """
115
    
116
    def read(self, amt=None):
117
        """
118
        Read and record data.
119
        
120
        Args:
121
            amt (int): Amount to read
122
            
123
        Returns:
124
            bytes: Data read from stream
125
        """
126
    
127
    def readinto(self, buff):
128
        """
129
        Read into buffer and record data.
130
        
131
        Args:
132
            buff: Buffer to read into
133
            
134
        Returns:
135
            int: Number of bytes read
136
        """
137
    
138
    def readline(self, maxlen=-1):
139
        """
140
        Read line and record data.
141
        
142
        Args:
143
            maxlen (int): Maximum line length
144
            
145
        Returns:
146
            bytes: Line data
147
        """
148
```
149

150
## Usage Examples
151

152
### Basic HTTP Capture
153

154
```python
155
from warcio.capture_http import capture_http
156
import requests
157

158
# Capture HTTP traffic to a buffer
159
with capture_http() as writer:
160
    # Make HTTP requests - they will be automatically recorded
161
    response1 = requests.get('http://example.com')
162
    response2 = requests.post('http://httpbin.org/post', data={'key': 'value'})
163
    
164
    print(f"Response 1 status: {response1.status_code}")
165
    print(f"Response 2 status: {response2.status_code}")
166

167
# Get captured WARC data
168
warc_data = writer.get_contents()
169
print(f"Captured {len(warc_data)} bytes of WARC data")
170

171
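# Save to file (NOTE(review): the buffer created by capture_http() appears to be uncompressed unless gzip=True is passed, in which case '.warc' is the accurate extension - confirm)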
172
with open('captured.warc.gz', 'wb') as f:
173
    f.write(warc_data)
174
```
175

176
### Capture to File
177

178
```python
179
from warcio.capture_http import capture_http
180
from warcio import WARCWriter
181
import requests
182

183
# Capture directly to file
184
with open('live_capture.warc.gz', 'wb') as output_file:
185
    writer = WARCWriter(output_file)
186
    
187
    with capture_http(warc_writer=writer) as writer:
188
        # Make requests that will be written directly to file
189
        requests.get('http://example.com')
190
        requests.get('http://httpbin.org/get')
191
        
192
print("HTTP traffic saved to live_capture.warc.gz")
193
```
194

195
### Filtered Capture
196

197
```python
198
from warcio.capture_http import capture_http
199
import requests
200

201
def should_record(request_data):
202
    """
203
    Filter function to control which requests/responses are recorded.
204
    
205
    Args:
206
        request_data: Dictionary containing request information
207
        
208
    Returns:
209
        bool: True to record, False to skip
210
    """
211
    url = request_data.get('url', '')
212
    
213
    # Only record requests to specific domains
214
    if 'example.com' in url or 'httpbin.org' in url:
215
        return True
216
    
217
    # Skip requests to certain paths
218
    if '/favicon.ico' in url:
219
        return False
220
        
221
    return True
222

223
# Capture with filtering
224
with capture_http(filter_func=should_record) as writer:
225
    requests.get('http://example.com')  # Will be recorded
226
    requests.get('http://example.com/favicon.ico')  # Recorded as written: the domain check in should_record returns True before the favicon check runs
227
    requests.get('http://other-domain.com')  # Recorded as written: should_record falls through to its final 'return True'
228
    requests.get('http://httpbin.org/get')  # Will be recorded
229

230
warc_data = writer.get_contents()
231
print(f"Filtered capture: {len(warc_data)} bytes")
232
```
233

234
### Capture with IP Recording
235

236
```python
237
from warcio.capture_http import capture_http
238
import requests
239

240
# Capture with IP address recording enabled (default)
241
with capture_http(record_ip=True) as writer:
242
    requests.get('http://example.com')
243

244
# Check the WARC data for IP addresses
245
warc_data = writer.get_contents()
246

247
# You can then read the WARC data to see IP addresses in headers
248
from warcio import ArchiveIterator
249
import io
250

251
for record in ArchiveIterator(io.BytesIO(warc_data)):
252
    ip_address = record.rec_headers.get_header('WARC-IP-Address')
253
    if ip_address:
254
        print(f"Recorded IP: {ip_address}")
255
```
256

257
### Multiple Session Capture
258

259
```python
260
from warcio.capture_http import capture_http
261
import requests
262

263
# Create a session for persistent connections
264
session = requests.Session()
265
session.headers.update({'User-Agent': 'WARC-Capture-Bot/1.0'})
266

267
with capture_http() as writer:
268
    # Use session for multiple requests
269
    response1 = session.get('http://httpbin.org/cookies/set/session/abc123')
270
    response2 = session.get('http://httpbin.org/cookies')  # Will include session cookie
271
    
272
    # Regular requests also captured
273
    response3 = requests.get('http://example.com')
274

275
print("Session-based requests captured")
276
```
277

278
### Working with urllib
279

280
```python
281
from warcio.capture_http import capture_http
282
import urllib.request
283
import urllib.parse
284

285
# Capture urllib requests
286
with capture_http() as writer:
287
    # urllib.request automatically uses http.client under the hood
288
    with urllib.request.urlopen('http://example.com') as response:
289
        content = response.read()
290
        print(f"Read {len(content)} bytes")
291
    
292
    # POST request with urllib
293
    data = urllib.parse.urlencode({'key': 'value'}).encode('utf-8')
294
    req = urllib.request.Request('http://httpbin.org/post', data=data)
295
    with urllib.request.urlopen(req) as response:
296
        result = response.read()
297

298
warc_data = writer.get_contents()
299
print(f"urllib requests captured: {len(warc_data)} bytes")
300
```
301

302
### Advanced Filtering with Request Details
303

304
```python
305
from warcio.capture_http import capture_http
306
import requests
307

308
def advanced_filter(request_data):
309
    """
310
    Advanced filter with access to more request details.
311
    
312
    Available keys in request_data:
313
    - 'url': Request URL
314
    - 'method': HTTP method (GET, POST, etc.)
315
    - 'headers': Request headers dict
316
    """
317
    method = request_data.get('method', 'GET')
318
    url = request_data.get('url', '')
319
    headers = request_data.get('headers', {})
320
    
321
    # Only record GET requests
322
    if method != 'GET':
323
        return False
324
    
325
    # Skip requests with certain user agents
326
    user_agent = headers.get('User-Agent', '')
327
    if 'bot' in user_agent.lower():
328
        return False
329
    
330
    # Only record specific domains
331
    allowed_domains = ['example.com', 'httpbin.org']
332
    if not any(domain in url for domain in allowed_domains):
333
        return False
334
    
335
    return True
336

337
with capture_http(filter_func=advanced_filter) as writer:
338
    # This will be recorded (GET to allowed domain)
339
    requests.get('http://example.com')
340
    
341
    # This will be skipped (POST request)
342
    requests.post('http://example.com', data={'test': 'data'})
343
    
344
    # This will be skipped (bot user agent)
345
    requests.get('http://example.com', headers={'User-Agent': 'TestBot/1.0'})
346
```
347

348
### Combining with Existing WARC Files
349

350
```python
351
from warcio.capture_http import capture_http
352
from warcio import WARCWriter
353
import requests
354

355
# Open existing WARC file for appending
356
with open('existing.warc.gz', 'ab') as output_file:  # Note: 'ab' for append
357
    writer = WARCWriter(output_file)
358
    
359
    # Capture new requests and append to existing file
360
    with capture_http(warc_writer=writer, append=True) as writer:
361
        requests.get('http://example.com/new-page')
362
        
363
print("New requests appended to existing WARC file")
364
```
365

366
### Error Handling in Capture
367

368
```python
369
from warcio.capture_http import capture_http
370
import requests
371

372
with capture_http() as writer:
373
    try:
374
        # NOTE(review): a request that fails before any response is received (e.g. this DNS failure) may produce no WARC record - confirm
375
        response = requests.get('http://nonexistent-domain.invalid', timeout=5)
376
    except requests.exceptions.RequestException as e:
377
        print(f"Request failed: {e}")
378
        # Whether the failed attempt appears in the WARC depends on how far the exchange got - TODO confirm
379
    
380
    # Successful requests continue to be captured
381
    response = requests.get('http://example.com')
382

383
# The WARC contains the successful exchange; failed attempts may be absent if no response was received (TODO confirm)
384
warc_data = writer.get_contents()
385
print(f"Captured data including failed requests: {len(warc_data)} bytes")
386
```

Version

Tile

Files

docs/http-capture.md

docs/http-capture.md