0
# HTTP Capture
1
2
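Live HTTP traffic recording that captures requests and responses directly to WARC format by monkey-patching Python's `http.client` library. This enables transparent recording of HTTP traffic from existing applications and libraries built on `http.client` (such as `requests` and `urllib`). Because the patch is applied when `warcio.capture_http` is imported, import it before `requests` or any other HTTP library, as the examples below do.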
3
4
## Capabilities
5
6
### HTTP Traffic Capture
7
8
Context manager that enables live HTTP traffic recording to WARC files with support for filtering and customization.
9
10
```python { .api }
11
def capture_http(warc_writer=None, filter_func=None, append=True,
12
record_ip=True, **kwargs):
13
"""
14
Context manager for capturing HTTP traffic to WARC format.
15
16
Args:
17
warc_writer: WARCWriter instance, or a filename (str) to open and write to (creates BufferWARCWriter if None)
18
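filter_func (callable): Optional callable invoked as filter_func(request, response, recorder) for each captured pair; it returns the (request, response) records to write, or (None, None) to skip them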
19
append (bool): When warc_writer is a filename, open it in append mode ('ab') rather than exclusive-create mode ('xb') (default True)
20
record_ip (bool): Whether to record IP addresses in WARC headers (default True)
21
**kwargs: Additional arguments passed to the BufferWARCWriter or WARCWriter that capture_http creates
22
23
Returns:
24
Context manager that yields the warc_writer instance
25
26
Example:
27
with capture_http() as writer:
28
# HTTP requests made here will be recorded
29
response = requests.get('http://example.com')
30
"""
31
```
32
33
### Advanced HTTP Capture Classes
34
35
Internal classes used by capture_http for advanced customization and direct access to recording functionality.
36
37
```python { .api }
38
class RequestRecorder:
39
def __init__(self, writer, filter_func=None, record_ip=True):
40
"""
41
Records HTTP requests and responses to WARC writer.
42
43
Args:
44
writer: WARCWriter instance to write records to
45
filter_func (callable): Optional filter function for requests/responses
46
record_ip (bool): Whether to record IP addresses
47
"""
48
49
def start_tunnel(self):
50
"""Start HTTP tunnel recording (for CONNECT method)."""
51
52
def start(self):
53
"""Start recording session."""
54
55
def set_remote_ip(self, remote_ip):
56
"""
57
Set remote IP address for current connection.
58
59
Args:
60
remote_ip (str): IP address to record
61
"""
62
63
def write_request(self, buff):
64
"""
65
Write request data to buffer.
66
67
Args:
68
buff (bytes): Request data to write
69
"""
70
71
def write_response(self, buff):
72
"""
73
Write response data to buffer.
74
75
Args:
76
buff (bytes): Response data to write
77
"""
78
79
def done(self):
80
"""Complete recording and write WARC records."""
81
82
class RecordingHTTPConnection:
83
def __init__(self, *args, **kwargs):
84
"""HTTP connection that records traffic to WARC."""
85
86
def send(self, data):
87
"""
88
Send data while recording.
89
90
Args:
91
data (bytes): Data to send
92
"""
93
94
def putrequest(self, *args, **kwargs):
95
"""Send HTTP request while recording."""
96
97
class RecordingHTTPResponse:
98
def __init__(self, recorder, *args, **kwargs):
99
"""
100
HTTP response that records data as it's read.
101
102
Args:
103
recorder: RequestRecorder instance
104
"""
105
106
class RecordingStream:
107
def __init__(self, fp, recorder):
108
"""
109
Stream wrapper that records data as it's read.
110
111
Args:
112
fp: File-like object to wrap
113
recorder: RequestRecorder instance
114
"""
115
116
def read(self, amt=None):
117
"""
118
Read and record data.
119
120
Args:
121
amt (int): Amount to read
122
123
Returns:
124
bytes: Data read from stream
125
"""
126
127
def readinto(self, buff):
128
"""
129
Read into buffer and record data.
130
131
Args:
132
buff: Buffer to read into
133
134
Returns:
135
int: Number of bytes read
136
"""
137
138
def readline(self, maxlen=-1):
139
"""
140
Read line and record data.
141
142
Args:
143
maxlen (int): Maximum line length
144
145
Returns:
146
bytes: Line data
147
"""
148
```
149
150
## Usage Examples
151
152
### Basic HTTP Capture
153
154
```python
155
from warcio.capture_http import capture_http
156
import requests
157
158
# Capture HTTP traffic to a buffer
159
with capture_http() as writer:
160
# Make HTTP requests - they will be automatically recorded
161
response1 = requests.get('http://example.com')
162
response2 = requests.post('http://httpbin.org/post', data={'key': 'value'})
163
164
print(f"Response 1 status: {response1.status_code}")
165
print(f"Response 2 status: {response2.status_code}")
166
167
# Get captured WARC data
168
warc_data = writer.get_contents()
169
print(f"Captured {len(warc_data)} bytes of WARC data")
170
171
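# Save to file (NOTE: the default buffer writer is uncompressed; pass gzip=True to capture_http if the saved file should actually be gzip-compressed)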
172
with open('captured.warc.gz', 'wb') as f:
173
f.write(warc_data)
174
```
175
176
### Capture to File
177
178
```python
179
from warcio.capture_http import capture_http
180
from warcio import WARCWriter
181
import requests
182
183
# Capture directly to file
184
with open('live_capture.warc.gz', 'wb') as output_file:
185
writer = WARCWriter(output_file)
186
187
with capture_http(warc_writer=writer) as writer:
188
# Make requests that will be written directly to file
189
requests.get('http://example.com')
190
requests.get('http://httpbin.org/get')
191
192
print("HTTP traffic saved to live_capture.warc.gz")
193
```
194
195
### Filtered Capture
196
197
```python
198
from warcio.capture_http import capture_http
199
import requests
200
201
def should_record(request_data):
202
"""
203
Filter function to control which requests/responses are recorded.
204
205
Args:
206
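request_data: Dictionary containing request information. NOTE(review): warcio invokes filter_func as filter_func(request, response, recorder) and expects a (request, response) pair back, or (None, None) to skip; confirm this single-argument form against the installed version.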
207
208
Returns:
209
bool: True to record, False to skip
210
"""
211
url = request_data.get('url', '')
212
213
# Only record requests to specific domains
214
if 'example.com' in url or 'httpbin.org' in url:
215
return True
216
217
# Skip requests to certain paths
218
if '/favicon.ico' in url:
219
return False
220
221
return True
222
223
# Capture with filtering
224
with capture_http(filter_func=should_record) as writer:
225
requests.get('http://example.com') # Will be recorded
226
requests.get('http://example.com/favicon.ico') # should_record returns True here: the 'example.com' check matches before the favicon rule is reached
227
requests.get('http://other-domain.com') # should_record returns True here: no rule matches, so it falls through to the final return True
228
requests.get('http://httpbin.org/get') # Will be recorded
229
230
warc_data = writer.get_contents()
231
print(f"Filtered capture: {len(warc_data)} bytes")
232
```
233
234
### Capture with IP Recording
235
236
```python
237
from warcio.capture_http import capture_http
238
import requests
239
240
# Capture with IP address recording enabled (default)
241
with capture_http(record_ip=True) as writer:
242
requests.get('http://example.com')
243
244
# Check the WARC data for IP addresses
245
warc_data = writer.get_contents()
246
247
# You can then read the WARC data to see IP addresses in headers
248
from warcio import ArchiveIterator
249
import io
250
251
for record in ArchiveIterator(io.BytesIO(warc_data)):
252
ip_address = record.rec_headers.get_header('WARC-IP-Address')
253
if ip_address:
254
print(f"Recorded IP: {ip_address}")
255
```
256
257
### Multiple Session Capture
258
259
```python
260
from warcio.capture_http import capture_http
261
import requests
262
263
# Create a session for persistent connections
264
session = requests.Session()
265
session.headers.update({'User-Agent': 'WARC-Capture-Bot/1.0'})
266
267
with capture_http() as writer:
268
# Use session for multiple requests
269
response1 = session.get('http://httpbin.org/cookies/set/session/abc123')
270
response2 = session.get('http://httpbin.org/cookies') # Will include session cookie
271
272
# Regular requests also captured
273
response3 = requests.get('http://example.com')
274
275
print("Session-based requests captured")
276
```
277
278
### Working with urllib
279
280
```python
281
from warcio.capture_http import capture_http
282
import urllib.request
283
import urllib.parse
284
285
# Capture urllib requests
286
with capture_http() as writer:
287
# urllib.request automatically uses http.client under the hood
288
with urllib.request.urlopen('http://example.com') as response:
289
content = response.read()
290
print(f"Read {len(content)} bytes")
291
292
# POST request with urllib
293
data = urllib.parse.urlencode({'key': 'value'}).encode('utf-8')
294
req = urllib.request.Request('http://httpbin.org/post', data=data)
295
with urllib.request.urlopen(req) as response:
296
result = response.read()
297
298
warc_data = writer.get_contents()
299
print(f"urllib requests captured: {len(warc_data)} bytes")
300
```
301
302
### Advanced Filtering with Request Details
303
304
```python
305
from warcio.capture_http import capture_http
306
import requests
307
308
def advanced_filter(request_data):
309
"""
310
Advanced filter with access to more request details.
311
312
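Available keys in request_data (NOTE(review): warcio invokes filter_func as filter_func(request, response, recorder), passing WARC record objects rather than a dict; confirm these keys against the installed version):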
313
- 'url': Request URL
314
- 'method': HTTP method (GET, POST, etc.)
315
- 'headers': Request headers dict
316
"""
317
method = request_data.get('method', 'GET')
318
url = request_data.get('url', '')
319
headers = request_data.get('headers', {})
320
321
# Only record GET requests
322
if method != 'GET':
323
return False
324
325
# Skip requests with certain user agents
326
user_agent = headers.get('User-Agent', '')
327
if 'bot' in user_agent.lower():
328
return False
329
330
# Only record specific domains
331
allowed_domains = ['example.com', 'httpbin.org']
332
if not any(domain in url for domain in allowed_domains):
333
return False
334
335
return True
336
337
with capture_http(filter_func=advanced_filter) as writer:
338
# This will be recorded (GET to allowed domain)
339
requests.get('http://example.com')
340
341
# This will be skipped (POST request)
342
requests.post('http://example.com', data={'test': 'data'})
343
344
# This will be skipped (bot user agent)
345
requests.get('http://example.com', headers={'User-Agent': 'TestBot/1.0'})
346
```
347
348
### Combining with Existing WARC Files
349
350
```python
351
from warcio.capture_http import capture_http
352
from warcio import WARCWriter
353
import requests
354
355
# Open existing WARC file for appending
356
with open('existing.warc.gz', 'ab') as output_file: # Note: 'ab' for append
357
writer = WARCWriter(output_file)
358
359
# Capture new requests and append to existing file
360
with capture_http(warc_writer=writer, append=True) as writer:
361
requests.get('http://example.com/new-page')
362
363
print("New requests appended to existing WARC file")
364
```
365
366
### Error Handling in Capture
367
368
```python
369
from warcio.capture_http import capture_http
370
import requests
371
372
with capture_http() as writer:
373
try:
374
# A request that fails before any response is received (e.g. DNS failure) produces no WARC record
375
response = requests.get('http://nonexistent-domain.invalid', timeout=5)
376
except requests.exceptions.RequestException as e:
377
print(f"Request failed: {e}")
378
# No request/response exchange completed, so nothing is written to the WARC for this attempt
379
380
# Successful requests continue to be captured
381
response = requests.get('http://example.com')
382
383
# Only completed request/response exchanges (here, the successful request) are in the WARC
384
warc_data = writer.get_contents()
385
print(f"Captured data including failed requests: {len(warc_data)} bytes")
386
```