or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

archive-reading.md cli-tools.md http-capture.md http-headers.md index.md stream-processing.md time-utilities.md warc-writing.md

docs/http-capture.md

0

# HTTP Capture

1

2

This module provides live HTTP traffic recording capabilities: it captures requests and responses directly to WARC format by monkey-patching Python's `http.client` library. This enables transparent recording of HTTP traffic from existing applications and libraries.

3

4

## Capabilities

5

6

### HTTP Traffic Capture

7

8

Context manager that enables live HTTP traffic recording to WARC files with support for filtering and customization.

9

10

```python { .api }

11

def capture_http(warc_writer=None, filter_func=None, append=True,

12

record_ip=True, **kwargs):

13

"""

14

Context manager for capturing HTTP traffic to WARC format.

15

16

Args:

17

warc_writer: WARCWriter instance to write records to (creates BufferWARCWriter if None)

18

filter_func (callable): Optional function to filter which requests/responses to record

19

append (bool): Whether to append to existing WARC writer (default True)

20

record_ip (bool): Whether to record IP addresses in WARC headers (default True)

21

**kwargs: Additional arguments passed to WARCWriter if created

22

23

Returns:

24

Context manager that yields the warc_writer instance

25

26

Example:

27

with capture_http() as writer:

28

# HTTP requests made here will be recorded

29

response = requests.get('http://example.com')

30

"""

31

```

32

33

### Advanced HTTP Capture Classes

34

35

Internal classes used by capture_http for advanced customization and direct access to recording functionality.

36

37

```python { .api }

38

class RequestRecorder:

39

def __init__(self, writer, filter_func=None, record_ip=True):

40

"""

41

Records HTTP requests and responses to WARC writer.

42

43

Args:

44

writer: WARCWriter instance to write records to

45

filter_func (callable): Optional filter function for requests/responses

46

record_ip (bool): Whether to record IP addresses

47

"""

48

49

def start_tunnel(self):

50

"""Start HTTP tunnel recording (for CONNECT method)."""

51

52

def start(self):

53

"""Start recording session."""

54

55

def set_remote_ip(self, remote_ip):

56

"""

57

Set remote IP address for current connection.

58

59

Args:

60

remote_ip (str): IP address to record

61

"""

62

63

def write_request(self, buff):

64

"""

65

Write request data to buffer.

66

67

Args:

68

buff (bytes): Request data to write

69

"""

70

71

def write_response(self, buff):

72

"""

73

Write response data to buffer.

74

75

Args:

76

buff (bytes): Response data to write

77

"""

78

79

def done(self):

80

"""Complete recording and write WARC records."""

81

82

class RecordingHTTPConnection:

83

def __init__(self, *args, **kwargs):

84

"""HTTP connection that records traffic to WARC."""

85

86

def send(self, data):

87

"""

88

Send data while recording.

89

90

Args:

91

data (bytes): Data to send

92

"""

93

94

def putrequest(self, *args, **kwargs):

95

"""Send HTTP request while recording."""

96

97

class RecordingHTTPResponse:

98

def __init__(self, recorder, *args, **kwargs):

99

"""

100

HTTP response that records data as it's read.

101

102

Args:

103

recorder: RequestRecorder instance

104

"""

105

106

class RecordingStream:

107

def __init__(self, fp, recorder):

108

"""

109

Stream wrapper that records data as it's read.

110

111

Args:

112

fp: File-like object to wrap

113

recorder: RequestRecorder instance

114

"""

115

116

def read(self, amt=None):

117

"""

118

Read and record data.

119

120

Args:

121

amt (int): Amount to read

122

123

Returns:

124

bytes: Data read from stream

125

"""

126

127

def readinto(self, buff):

128

"""

129

Read into buffer and record data.

130

131

Args:

132

buff: Buffer to read into

133

134

Returns:

135

int: Number of bytes read

136

"""

137

138

def readline(self, maxlen=-1):

139

"""

140

Read line and record data.

141

142

Args:

143

maxlen (int): Maximum line length

144

145

Returns:

146

bytes: Line data

147

"""

148

```

149

150

## Usage Examples

151

152

### Basic HTTP Capture

153

154

```python

155

from warcio.capture_http import capture_http

156

import requests

157

158

# Capture HTTP traffic to a buffer

159

with capture_http() as writer:

160

# Make HTTP requests - they will be automatically recorded

161

response1 = requests.get('http://example.com')

162

response2 = requests.post('http://httpbin.org/post', data={'key': 'value'})

163

164

print(f"Response 1 status: {response1.status_code}")

165

print(f"Response 2 status: {response2.status_code}")

166

167

# Get captured WARC data

168

warc_data = writer.get_contents()

169

print(f"Captured {len(warc_data)} bytes of WARC data")

170

171

# Save to file

172

with open('captured.warc.gz', 'wb') as f:

173

f.write(warc_data)

174

```

175

176

### Capture to File

177

178

```python

179

from warcio.capture_http import capture_http

180

from warcio import WARCWriter

181

import requests

182

183

# Capture directly to file

184

with open('live_capture.warc.gz', 'wb') as output_file:

185

writer = WARCWriter(output_file)

186

187

with capture_http(warc_writer=writer) as writer:

188

# Make requests that will be written directly to file

189

requests.get('http://example.com')

190

requests.get('http://httpbin.org/get')

191

192

print("HTTP traffic saved to live_capture.warc.gz")

193

```

194

195

### Filtered Capture

196

197

```python

198

from warcio.capture_http import capture_http

199

import requests

200

201

def should_record(request_data):

202

"""

203

Filter function to control which requests/responses are recorded.

204

205

Args:

206

request_data: Dictionary containing request information

207

208

Returns:

209

bool: True to record, False to skip

210

"""

211

url = request_data.get('url', '')

212

213

# Only record requests to specific domains

214

if 'example.com' in url or 'httpbin.org' in url:

215

return True

216

217

# Skip requests to certain paths

218

if '/favicon.ico' in url:

219

return False

220

221

return True

222

223

# Capture with filtering

224

with capture_http(filter_func=should_record) as writer:

225

requests.get('http://example.com') # Will be recorded

226

requests.get('http://example.com/favicon.ico') # Will be recorded: the 'example.com' domain check returns True before the favicon check is reached

227

requests.get('http://other-domain.com') # Will be recorded: no rule matches, so should_record falls through to its final `return True`

228

requests.get('http://httpbin.org/get') # Will be recorded

229

230

warc_data = writer.get_contents()

231

print(f"Filtered capture: {len(warc_data)} bytes")

232

```

233

234

### Capture with IP Recording

235

236

```python

237

from warcio.capture_http import capture_http

238

import requests

239

240

# Capture with IP address recording enabled (default)

241

with capture_http(record_ip=True) as writer:

242

requests.get('http://example.com')

243

244

# Check the WARC data for IP addresses

245

warc_data = writer.get_contents()

246

247

# You can then read the WARC data to see IP addresses in headers

248

from warcio import ArchiveIterator

249

import io

250

251

for record in ArchiveIterator(io.BytesIO(warc_data)):

252

ip_address = record.rec_headers.get_header('WARC-IP-Address')

253

if ip_address:

254

print(f"Recorded IP: {ip_address}")

255

```

256

257

### Multiple Session Capture

258

259

```python

260

from warcio.capture_http import capture_http

261

import requests

262

263

# Create a session for persistent connections

264

session = requests.Session()

265

session.headers.update({'User-Agent': 'WARC-Capture-Bot/1.0'})

266

267

with capture_http() as writer:

268

# Use session for multiple requests

269

response1 = session.get('http://httpbin.org/cookies/set/session/abc123')

270

response2 = session.get('http://httpbin.org/cookies') # Will include session cookie

271

272

# Regular requests also captured

273

response3 = requests.get('http://example.com')

274

275

print("Session-based requests captured")

276

```

277

278

### Working with urllib

279

280

```python

281

from warcio.capture_http import capture_http

282

import urllib.request

283

import urllib.parse

284

285

# Capture urllib requests

286

with capture_http() as writer:

287

# urllib.request automatically uses http.client under the hood

288

with urllib.request.urlopen('http://example.com') as response:

289

content = response.read()

290

print(f"Read {len(content)} bytes")

291

292

# POST request with urllib

293

data = urllib.parse.urlencode({'key': 'value'}).encode('utf-8')

294

req = urllib.request.Request('http://httpbin.org/post', data=data)

295

with urllib.request.urlopen(req) as response:

296

result = response.read()

297

298

warc_data = writer.get_contents()

299

print(f"urllib requests captured: {len(warc_data)} bytes")

300

```

301

302

### Advanced Filtering with Request Details

303

304

```python

305

from warcio.capture_http import capture_http

306

import requests

307

308

def advanced_filter(request_data):

309

"""

310

Advanced filter with access to more request details.

311

312

Available keys in request_data:

313

- 'url': Request URL

314

- 'method': HTTP method (GET, POST, etc.)

315

- 'headers': Request headers dict

316

"""

317

method = request_data.get('method', 'GET')

318

url = request_data.get('url', '')

319

headers = request_data.get('headers', {})

320

321

# Only record GET requests

322

if method != 'GET':

323

return False

324

325

# Skip requests with certain user agents

326

user_agent = headers.get('User-Agent', '')

327

if 'bot' in user_agent.lower():

328

return False

329

330

# Only record specific domains

331

allowed_domains = ['example.com', 'httpbin.org']

332

if not any(domain in url for domain in allowed_domains):

333

return False

334

335

return True

336

337

with capture_http(filter_func=advanced_filter) as writer:

338

# This will be recorded (GET to allowed domain)

339

requests.get('http://example.com')

340

341

# This will be skipped (POST request)

342

requests.post('http://example.com', data={'test': 'data'})

343

344

# This will be skipped (bot user agent)

345

requests.get('http://example.com', headers={'User-Agent': 'TestBot/1.0'})

346

```

347

348

### Combining with Existing WARC Files

349

350

```python

351

from warcio.capture_http import capture_http

352

from warcio import WARCWriter

353

import requests

354

355

# Open existing WARC file for appending

356

with open('existing.warc.gz', 'ab') as output_file: # Note: 'ab' for append

357

writer = WARCWriter(output_file)

358

359

# Capture new requests and append to existing file

360

with capture_http(warc_writer=writer, append=True) as writer:

361

requests.get('http://example.com/new-page')

362

363

print("New requests appended to existing WARC file")

364

```

365

366

### Error Handling in Capture

367

368

```python

369

from warcio.capture_http import capture_http

370

import requests

371

372

with capture_http() as writer:

373

try:

374

# Even failed requests are captured

375

response = requests.get('http://nonexistent-domain.invalid', timeout=5)

376

except requests.exceptions.RequestException as e:

377

print(f"Request failed: {e}")

378

# The failed request attempt is still recorded in the WARC

379

380

# Successful requests continue to be captured

381

response = requests.get('http://example.com')

382

383

# Both successful and failed request attempts are in the WARC

384

warc_data = writer.get_contents()

385

print(f"Captured data including failed requests: {len(warc_data)} bytes")

386

```