0
# HTTP Headers
1
2
Comprehensive HTTP header parsing, manipulation, and formatting with support for status lines, case-insensitive access, and proper encoding handling. The library provides both representation and parsing capabilities for HTTP-style headers used in WARC records.
3
4
## Capabilities
5
6
### Status and Headers Representation
7
8
Main class for representing parsed HTTP status lines and headers with dictionary-like access and manipulation methods.
9
10
```python { .api }
11
class StatusAndHeaders:
12
def __init__(self, statusline, headers, protocol='', total_len=0,
13
is_http_request=False):
14
"""
15
Representation of parsed HTTP-style status line and headers.
16
17
Args:
18
statusline (str): HTTP status line (e.g., '200 OK')
19
headers (list): List of (name, value) tuples for headers
20
protocol (str): Protocol string (e.g., 'HTTP/1.1')
21
total_len (int): Total length of original headers
22
is_http_request (bool): True if this is a request (the HTTP verb is split off the start of statusline and stored in protocol)
23
"""
24
25
def get_header(self, name, default_value=None):
26
"""
27
Get header value by name (case-insensitive).
28
29
Args:
30
name (str): Header name to search for
31
default_value: Value to return if header not found
32
33
Returns:
34
str or default_value: Header value if found, default_value otherwise
35
"""
36
37
def add_header(self, name, value):
38
"""
39
Add a new header.
40
41
Args:
42
name (str): Header name
43
value (str): Header value
44
"""
45
46
def replace_header(self, name, value):
47
"""
48
Replace header with new value or add if not present.
49
50
Args:
51
name (str): Header name
52
value (str): New header value
53
54
Returns:
55
str or None: Previous header value if replaced, None if added
56
"""
57
58
def remove_header(self, name):
59
"""
60
Remove header by name (case-insensitive).
61
62
Args:
63
name (str): Header name to remove
64
65
Returns:
66
bool: True if header was removed, False if not found
67
"""
68
69
def get_statuscode(self):
70
"""
71
Extract status code from status line.
72
73
Returns:
74
str: Status code portion of status line
75
"""
76
77
def validate_statusline(self, valid_statusline):
78
"""
79
Validate status line and replace if invalid.
80
81
Args:
82
valid_statusline (str): Replacement status line if current is invalid
83
84
Returns:
85
bool: True if original was valid, False if replaced
86
"""
87
88
def add_range(self, start, part_len, total_len):
89
"""
90
Add HTTP range headers for partial content responses.
91
92
Args:
93
start (int): Start byte position
94
part_len (int): Length of partial content
95
total_len (int): Total content length
96
97
Returns:
98
StatusAndHeaders: Self for method chaining
99
"""
100
101
def compute_headers_buffer(self, header_filter=None):
102
"""
103
Pre-compute headers buffer for efficient serialization.
104
105
Args:
106
header_filter (callable): Optional function to filter headers
107
"""
108
109
def to_str(self, filter_func=None):
110
"""
111
Convert to string representation.
112
113
Args:
114
filter_func (callable): Optional function to filter headers
115
116
Returns:
117
str: String representation of status and headers
118
"""
119
120
def to_bytes(self, filter_func=None, encoding='utf-8'):
121
"""
122
Convert to bytes representation.
123
124
Args:
125
filter_func (callable): Optional function to filter headers
126
encoding (str): Text encoding to use
127
128
Returns:
129
bytes: Byte representation of status and headers
130
"""
131
132
def to_ascii_bytes(self, filter_func=None):
133
"""
134
Convert to ASCII bytes with percent-encoding for non-ASCII characters.
135
136
Args:
137
filter_func (callable): Optional function to filter headers
138
139
Returns:
140
bytes: ASCII-safe byte representation
141
"""
142
143
def percent_encode_non_ascii_headers(self, encoding='UTF-8'):
144
"""
145
Percent-encode non-ASCII header values (RFC 8187 / RFC 5987 extended-parameter style).
146
147
Args:
148
encoding (str): Encoding to use for percent-encoding
149
"""
150
151
# Dictionary-like interface
152
def __getitem__(self, key):
153
"""Get header value by name (same as get_header)."""
154
155
def __setitem__(self, key, value):
156
"""Set header value by name (same as replace_header)."""
157
158
def __delitem__(self, key):
159
"""Delete header by name (same as remove_header)."""
160
161
def __contains__(self, key):
162
"""Check if header exists (case-insensitive)."""
163
```
164
165
### Headers Parser
166
167
Parser for reading HTTP-style status and headers from streams with support for continuation lines and encoding detection.
168
169
```python { .api }
170
class StatusAndHeadersParser:
171
def __init__(self, statuslist, verify=True):
172
"""
173
Parser for HTTP-style status and headers.
174
175
Args:
176
statuslist (list): List of valid status line prefixes
177
verify (bool): Whether to verify status line format
178
"""
179
180
def parse(self, stream, full_statusline=None):
181
"""
182
Parse status line and headers from stream.
183
184
Args:
185
stream: Stream supporting readline() method
186
full_statusline (str): Pre-read status line (optional)
187
188
Returns:
189
StatusAndHeaders: Parsed status and headers object
190
191
Raises:
192
StatusAndHeadersParserException: If parsing fails
193
EOFError: If stream is at end
194
"""
195
196
@staticmethod
197
def split_prefix(key, prefixs):
198
"""
199
Split key string by first matching prefix.
200
201
Args:
202
key (str): String to split
203
prefixs (list): List of prefixes to match against
204
205
Returns:
206
tuple: (matched_prefix, remainder) or None if no match
207
"""
208
209
@staticmethod
210
def make_warc_id(id_=None):
211
"""
212
Generate a WARC record ID.
213
214
Args:
215
id_: Optional UUID to use (generates new one if None)
216
217
Returns:
218
str: WARC record ID in URN format
219
"""
220
221
@staticmethod
222
def decode_header(line):
223
"""
224
Decode header line with proper encoding detection.
225
226
Args:
227
line (bytes or str): Header line to decode
228
229
Returns:
230
str: Decoded header line
231
"""
232
```
233
234
### Parser Exception
235
236
Exception class for header parsing errors, with access to the problematic status line.
237
238
```python { .api }
239
class StatusAndHeadersParserException(Exception):
240
def __init__(self, msg, statusline):
241
"""
242
Exception for status and headers parsing errors.
243
244
Args:
245
msg (str): Error message
246
statusline (str): Problematic status line
247
"""
248
```
249
250
## Usage Examples
251
252
### Basic Header Manipulation
253
254
```python
255
from warcio.statusandheaders import StatusAndHeaders
256
257
# Create status and headers object
258
headers_list = [
259
('Content-Type', 'text/html'),
260
('Content-Length', '1234'),
261
('Server', 'Apache/2.4.41')
262
]
263
264
status_headers = StatusAndHeaders('200 OK', headers_list)
265
266
# Access headers (case-insensitive)
267
content_type = status_headers.get_header('content-type')
268
print(f"Content-Type: {content_type}") # text/html
269
270
# Dictionary-like access
271
content_length = status_headers['Content-Length']
272
print(f"Content-Length: {content_length}") # 1234
273
274
# Check if header exists
275
if 'server' in status_headers:
276
print(f"Server: {status_headers['server']}")
277
278
# Get status code
279
code = status_headers.get_statuscode()
280
print(f"Status Code: {code}") # 200
281
```
282
283
### Header Modification
284
285
```python
286
from warcio.statusandheaders import StatusAndHeaders
287
288
status_headers = StatusAndHeaders('200 OK', [
289
('Content-Type', 'text/html'),
290
('Content-Length', '1234')
291
])
292
293
# Add new header
294
status_headers.add_header('Cache-Control', 'no-cache')
295
296
# Replace existing header
297
old_length = status_headers.replace_header('Content-Length', '5678')
298
print(f"Previous length: {old_length}") # 1234
299
300
# Remove header
301
removed = status_headers.remove_header('Cache-Control')
302
print(f"Header removed: {removed}") # True
303
304
# Dictionary-style modification
305
status_headers['X-Custom-Header'] = 'custom-value'
306
del status_headers['Content-Type']
307
```
308
309
### Request Headers
310
311
```python
312
from warcio.statusandheaders import StatusAndHeaders
313
314
# Create request headers (note is_http_request=True)
315
request_headers = StatusAndHeaders(
316
'GET /path HTTP/1.1',
317
[
318
('Host', 'example.com'),
319
('User-Agent', 'Mozilla/5.0'),
320
('Accept', 'text/html,application/xhtml+xml')
321
],
322
is_http_request=True
323
)
324
325
# The HTTP verb is split off the status line and stored in `protocol`
326
print(f"Method and path: {request_headers.statusline}") # /path HTTP/1.1 (verb removed)
327
print(f"Protocol: {request_headers.protocol}") # GET
328
```
329
330
### Range Headers for Partial Content
331
332
```python
333
from warcio.statusandheaders import StatusAndHeaders
334
335
# Create initial response headers
336
status_headers = StatusAndHeaders('200 OK', [
337
('Content-Type', 'application/octet-stream'),
338
('Content-Length', '10000')
339
])
340
341
# Convert to partial content response
342
status_headers.add_range(start=1000, part_len=2000, total_len=10000)
343
344
print(f"Status: {status_headers.statusline}") # 206 Partial Content
345
print(f"Content-Range: {status_headers.get_header('Content-Range')}")
346
# bytes 1000-2999/10000
347
print(f"Content-Length: {status_headers.get_header('Content-Length')}") # 2000
348
```
349
350
### Headers Parsing
351
352
```python
353
from warcio.statusandheaders import StatusAndHeadersParser
354
import io
355
356
# Create parser for HTTP responses
357
parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
358
359
# Parse headers from stream
360
header_data = b"""HTTP/1.1 200 OK\r
361
Content-Type: text/html\r
362
Content-Length: 1234\r
363
Server: Apache/2.4.41\r
364
\r
365
"""
366
367
stream = io.BytesIO(header_data)
368
status_headers = parser.parse(stream)
369
370
print(f"Status: {status_headers.statusline}") # 200 OK
371
print(f"Protocol: {status_headers.protocol}") # HTTP/1.1
372
print(f"Content-Type: {status_headers.get_header('Content-Type')}") # text/html
373
```
374
375
### Encoding Handling
376
377
```python
378
from warcio.statusandheaders import StatusAndHeaders
379
380
# Headers with non-ASCII content
381
headers_with_unicode = StatusAndHeaders('200 OK', [
382
('Content-Type', 'text/html; charset=utf-8'),
383
('Content-Disposition', 'attachment; filename="tëst.txt"'),
384
('X-Custom', 'Héllo Wörld')
385
])
386
387
# Convert to ASCII bytes (automatically percent-encodes non-ASCII)
388
ascii_bytes = headers_with_unicode.to_ascii_bytes()
389
print("ASCII-safe representation created")
390
391
# Manual percent-encoding of non-ASCII headers
392
headers_with_unicode.percent_encode_non_ascii_headers()
393
print("Non-ASCII headers percent-encoded")
394
```
395
396
### Custom Header Filtering
397
398
```python
399
from warcio.statusandheaders import StatusAndHeaders
400
401
status_headers = StatusAndHeaders('200 OK', [
402
('Content-Type', 'text/html'),
403
('Content-Length', '1234'),
404
('Server', 'Apache/2.4.41'),
405
('X-Debug', 'sensitive-info')
406
])
407
408
# Define filter function to remove debug headers
409
def filter_debug_headers(header_tuple):
410
name, value = header_tuple
411
if name.lower().startswith('x-debug'):
412
return None # Remove this header
413
return header_tuple # Keep this header
414
415
# Convert to string with filtering
416
filtered_headers = status_headers.to_str(filter_func=filter_debug_headers)
417
print("Headers with debug info filtered out")
418
419
# Pre-compute filtered buffer for efficient serialization
420
status_headers.compute_headers_buffer(header_filter=filter_debug_headers)
421
```
422
423
### Status Line Validation
424
425
```python
426
from warcio.statusandheaders import StatusAndHeaders
427
428
# Create headers with potentially invalid status line
429
status_headers = StatusAndHeaders('Invalid Status', [
430
('Content-Type', 'text/html')
431
])
432
433
# Validate and fix if necessary
434
is_valid = status_headers.validate_statusline('200 OK')
435
if not is_valid:
436
print("Status line was invalid and has been replaced")
437
print(f"New status: {status_headers.statusline}") # 200 OK
438
```