or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

archive-reading.md, cli-tools.md, http-capture.md, http-headers.md, index.md, stream-processing.md, time-utilities.md, warc-writing.md

docs/http-headers.md

0

# HTTP Headers

1

2

Comprehensive HTTP header parsing, manipulation, and formatting with support for status lines, case-insensitive access, and proper encoding handling. The library provides both representation and parsing capabilities for HTTP-style headers used in WARC records.

3

4

## Capabilities

5

6

### Status and Headers Representation

7

8

Main class for representing parsed HTTP status lines and headers with dictionary-like access and manipulation methods.

9

10

```python { .api }

11

class StatusAndHeaders:

12

def __init__(self, statusline, headers, protocol='', total_len=0,

13

is_http_request=False):

14

"""

15

Representation of parsed HTTP-style status line and headers.

16

17

Args:

18

statusline (str): HTTP status line (e.g., '200 OK')

19

headers (list): List of (name, value) tuples for headers

20

protocol (str): Protocol string (e.g., 'HTTP/1.1')

21

total_len (int): Total length of original headers

22

is_http_request (bool): True if this is a request (splits verb from statusline)

23

"""

24

25

def get_header(self, name, default_value=None):

26

"""

27

Get header value by name (case-insensitive).

28

29

Args:

30

name (str): Header name to search for

31

default_value: Value to return if header not found

32

33

Returns:

34

str or default_value: Header value if found, default_value otherwise

35

"""

36

37

def add_header(self, name, value):

38

"""

39

Add a new header.

40

41

Args:

42

name (str): Header name

43

value (str): Header value

44

"""

45

46

def replace_header(self, name, value):

47

"""

48

Replace header with new value or add if not present.

49

50

Args:

51

name (str): Header name

52

value (str): New header value

53

54

Returns:

55

str or None: Previous header value if replaced, None if added

56

"""

57

58

def remove_header(self, name):

59

"""

60

Remove header by name (case-insensitive).

61

62

Args:

63

name (str): Header name to remove

64

65

Returns:

66

bool: True if header was removed, False if not found

67

"""

68

69

def get_statuscode(self):

70

"""

71

Extract status code from status line.

72

73

Returns:

74

str: Status code portion of status line

75

"""

76

77

def validate_statusline(self, valid_statusline):

78

"""

79

Validate status line and replace if invalid.

80

81

Args:

82

valid_statusline (str): Replacement status line if current is invalid

83

84

Returns:

85

bool: True if original was valid, False if replaced

86

"""

87

88

def add_range(self, start, part_len, total_len):

89

"""

90

Add HTTP range headers for partial content responses.

91

92

Args:

93

start (int): Start byte position

94

part_len (int): Length of partial content

95

total_len (int): Total content length

96

97

Returns:

98

StatusAndHeaders: Self for method chaining

99

"""

100

101

def compute_headers_buffer(self, header_filter=None):

102

"""

103

Pre-compute headers buffer for efficient serialization.

104

105

Args:

106

header_filter (callable): Optional function to filter headers

107

"""

108

109

def to_str(self, filter_func=None):

110

"""

111

Convert to string representation.

112

113

Args:

114

filter_func (callable): Optional function to filter headers

115

116

Returns:

117

str: String representation of status and headers

118

"""

119

120

def to_bytes(self, filter_func=None, encoding='utf-8'):

121

"""

122

Convert to bytes representation.

123

124

Args:

125

filter_func (callable): Optional function to filter headers

126

encoding (str): Text encoding to use

127

128

Returns:

129

bytes: Byte representation of status and headers

130

"""

131

132

def to_ascii_bytes(self, filter_func=None):

133

"""

134

Convert to ASCII bytes with percent-encoding for non-ASCII characters.

135

136

Args:

137

filter_func (callable): Optional function to filter headers

138

139

Returns:

140

bytes: ASCII-safe byte representation

141

"""

142

143

def percent_encode_non_ascii_headers(self, encoding='UTF-8'):

144

"""

145

Percent-encode non-ASCII header values per RFC specifications.

146

147

Args:

148

encoding (str): Encoding to use for percent-encoding

149

"""

150

151

# Dictionary-like interface

152

def __getitem__(self, key):

153

"""Get header value by name (same as get_header)."""

154

155

def __setitem__(self, key, value):

156

"""Set header value by name (same as replace_header)."""

157

158

def __delitem__(self, key):

159

"""Delete header by name (same as remove_header)."""

160

161

def __contains__(self, key):

162

"""Check if header exists (case-insensitive)."""

163

```

164

165

### Headers Parser

166

167

Parser for reading HTTP-style status and headers from streams with support for continuation lines and encoding detection.

168

169

```python { .api }

170

class StatusAndHeadersParser:

171

def __init__(self, statuslist, verify=True):

172

"""

173

Parser for HTTP-style status and headers.

174

175

Args:

176

statuslist (list): List of valid status line prefixes

177

verify (bool): Whether to verify status line format

178

"""

179

180

def parse(self, stream, full_statusline=None):

181

"""

182

Parse status line and headers from stream.

183

184

Args:

185

stream: Stream supporting readline() method

186

full_statusline (str): Pre-read status line (optional)

187

188

Returns:

189

StatusAndHeaders: Parsed status and headers object

190

191

Raises:

192

StatusAndHeadersParserException: If parsing fails

193

EOFError: If stream is at end

194

"""

195

196

@staticmethod

197

def split_prefix(key, prefixs):

198

"""

199

Split key string by first matching prefix.

200

201

Args:

202

key (str): String to split

203

prefixs (list): List of prefixes to match against

204

205

Returns:

206

tuple: (matched_prefix, remainder) or None if no match

207

"""

208

209

@staticmethod

210

def make_warc_id(id_=None):

211

"""

212

Generate a WARC record ID.

213

214

Args:

215

id_: Optional UUID to use (generates new one if None)

216

217

Returns:

218

str: WARC record ID in URN format

219

"""

220

221

@staticmethod

222

def decode_header(line):

223

"""

224

Decode header line with proper encoding detection.

225

226

Args:

227

line (bytes or str): Header line to decode

228

229

Returns:

230

str: Decoded header line

231

"""

232

```

233

234

### Parser Exception

235

236

Exception class for header parsing errors, with access to the problematic status line.

237

238

```python { .api }

239

class StatusAndHeadersParserException(Exception):

240

def __init__(self, msg, statusline):

241

"""

242

Exception for status and headers parsing errors.

243

244

Args:

245

msg (str): Error message

246

statusline (str): Problematic status line

247

"""

248

```

249

250

## Usage Examples

251

252

### Basic Header Manipulation

253

254

```python

255

from warcio.statusandheaders import StatusAndHeaders

256

257

# Create status and headers object

258

headers_list = [

259

('Content-Type', 'text/html'),

260

('Content-Length', '1234'),

261

('Server', 'Apache/2.4.41')

262

]

263

264

status_headers = StatusAndHeaders('200 OK', headers_list)

265

266

# Access headers (case-insensitive)

267

content_type = status_headers.get_header('content-type')

268

print(f"Content-Type: {content_type}") # text/html

269

270

# Dictionary-like access

271

content_length = status_headers['Content-Length']

272

print(f"Content-Length: {content_length}") # 1234

273

274

# Check if header exists

275

if 'server' in status_headers:

276

print(f"Server: {status_headers['server']}")

277

278

# Get status code

279

code = status_headers.get_statuscode()

280

print(f"Status Code: {code}") # 200

281

```

282

283

### Header Modification

284

285

```python

286

from warcio.statusandheaders import StatusAndHeaders

287

288

status_headers = StatusAndHeaders('200 OK', [

289

('Content-Type', 'text/html'),

290

('Content-Length', '1234')

291

])

292

293

# Add new header

294

status_headers.add_header('Cache-Control', 'no-cache')

295

296

# Replace existing header

297

old_length = status_headers.replace_header('Content-Length', '5678')

298

print(f"Previous length: {old_length}") # 1234

299

300

# Remove header

301

removed = status_headers.remove_header('Cache-Control')

302

print(f"Header removed: {removed}") # True

303

304

# Dictionary-style modification

305

status_headers['X-Custom-Header'] = 'custom-value'

306

del status_headers['Content-Type']

307

```

308

309

### Request Headers

310

311

```python

312

from warcio.statusandheaders import StatusAndHeaders

313

314

# Create request headers (note is_http_request=True)

315

request_headers = StatusAndHeaders(

316

'GET /path HTTP/1.1',

317

[

318

('Host', 'example.com'),

319

('User-Agent', 'Mozilla/5.0'),

320

('Accept', 'text/html,application/xhtml+xml')

321

],

322

is_http_request=True

323

)

324

325

# With is_http_request=True the request verb is split off the status line and stored in `protocol`

326

print(f"Method and path: {request_headers.statusline}") # /path HTTP/1.1

327

print(f"Protocol: {request_headers.protocol}") # GET

328

```

329

330

### Range Headers for Partial Content

331

332

```python

333

from warcio.statusandheaders import StatusAndHeaders

334

335

# Create initial response headers

336

status_headers = StatusAndHeaders('200 OK', [

337

('Content-Type', 'application/octet-stream'),

338

('Content-Length', '10000')

339

])

340

341

# Convert to partial content response

342

status_headers.add_range(start=1000, part_len=2000, total_len=10000)

343

344

print(f"Status: {status_headers.statusline}") # 206 Partial Content

345

print(f"Content-Range: {status_headers.get_header('Content-Range')}")

346

# bytes 1000-2999/10000

347

print(f"Content-Length: {status_headers.get_header('Content-Length')}") # 2000

348

```

349

350

### Headers Parsing

351

352

```python

353

from warcio.statusandheaders import StatusAndHeadersParser

354

import io

355

356

# Create parser for HTTP responses

357

parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])

358

359

# Parse headers from stream

360

header_data = b"""HTTP/1.1 200 OK\r

361

Content-Type: text/html\r

362

Content-Length: 1234\r

363

Server: Apache/2.4.41\r

364

\r

365

"""

366

367

stream = io.BytesIO(header_data)

368

status_headers = parser.parse(stream)

369

370

print(f"Status: {status_headers.statusline}") # 200 OK

371

print(f"Protocol: {status_headers.protocol}") # HTTP/1.1

372

print(f"Content-Type: {status_headers.get_header('Content-Type')}") # text/html

373

```

374

375

### Encoding Handling

376

377

```python

378

from warcio.statusandheaders import StatusAndHeaders

379

380

# Headers with non-ASCII content

381

headers_with_unicode = StatusAndHeaders('200 OK', [

382

('Content-Type', 'text/html; charset=utf-8'),

383

('Content-Disposition', 'attachment; filename="tëst.txt"'),

384

('X-Custom', 'Héllo Wörld')

385

])

386

387

# Convert to ASCII bytes (automatically percent-encodes non-ASCII)

388

ascii_bytes = headers_with_unicode.to_ascii_bytes()

389

print("ASCII-safe representation created")

390

391

# Manual percent-encoding of non-ASCII headers

392

headers_with_unicode.percent_encode_non_ascii_headers()

393

print("Non-ASCII headers percent-encoded")

394

```

395

396

### Custom Header Filtering

397

398

```python

399

from warcio.statusandheaders import StatusAndHeaders

400

401

status_headers = StatusAndHeaders('200 OK', [

402

('Content-Type', 'text/html'),

403

('Content-Length', '1234'),

404

('Server', 'Apache/2.4.41'),

405

('X-Debug', 'sensitive-info')

406

])

407

408

# Define filter function to remove debug headers

409

def filter_debug_headers(header_tuple):

410

name, value = header_tuple

411

if name.lower().startswith('x-debug'):

412

return None # Remove this header

413

return header_tuple # Keep this header

414

415

# Convert to string with filtering

416

filtered_headers = status_headers.to_str(filter_func=filter_debug_headers)

417

print("Headers with debug info filtered out")

418

419

# Pre-compute filtered buffer for efficient serialization

420

status_headers.compute_headers_buffer(header_filter=filter_debug_headers)

421

```

422

423

### Status Line Validation

424

425

```python

426

from warcio.statusandheaders import StatusAndHeaders

427

428

# Create headers with potentially invalid status line

429

status_headers = StatusAndHeaders('Invalid Status', [

430

('Content-Type', 'text/html')

431

])

432

433

# Validate and fix if necessary

434

is_valid = status_headers.validate_statusline('200 OK')

435

if not is_valid:

436

print("Status line was invalid and has been replaced")

437

print(f"New status: {status_headers.statusline}") # 200 OK

438

```