or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

archive-reading.md, cli-tools.md, http-capture.md, http-headers.md, index.md, stream-processing.md, time-utilities.md, warc-writing.md

docs/warc-writing.md

0

# WARC Writing

1

2

Comprehensive functionality for creating and writing WARC files, including record building, header management, compression, and digest calculation. The library provides both streaming writers and in-memory buffer writers for different use cases.

3

4

## Capabilities

5

6

### WARC Writer

7

8

Main class for writing WARC records to files or streams with optional compression.

9

10

```python { .api }

11

class WARCWriter:

12

def __init__(self, filebuf, gzip=True, warc_version=None, header_filter=None):

13

"""

14

WARC writer for creating WARC files.

15

16

Args:

17

filebuf: File-like object to write to

18

gzip (bool): Enable gzip compression (default True)

19

warc_version (str): WARC version to use (default None, which falls back to WARC_VERSION, i.e. 'WARC/1.0')

20

header_filter (callable): Optional function to filter headers

21

"""

22

23

def write_record(self, record, params=None):

24

"""

25

Write a WARC record to the output stream.

26

27

Args:

28

record: ArcWarcRecord to write

29

params: Optional parameters for writing

30

"""

31

32

def write_request_response_pair(self, req, resp, params=None):

33

"""

34

Write a request/response pair with proper linking.

35

36

Args:

37

req: Request record

38

resp: Response record

39

params: Optional parameters for writing

40

"""

41

```

42

43

### Buffer WARC Writer

44

45

WARC writer that writes to an in-memory buffer for testing or temporary storage.

46

47

```python { .api }

48

class BufferWARCWriter(WARCWriter):

49

def __init__(self, gzip=True, warc_version=None, header_filter=None):

50

"""

51

WARC writer that writes to in-memory buffer.

52

53

Args:

54

gzip (bool): Enable gzip compression (default True)

55

warc_version (str): WARC version to use

56

header_filter (callable): Optional function to filter headers

57

"""

58

59

def get_contents(self):

60

"""

61

Get buffer contents as bytes.

62

63

Returns:

64

bytes: Complete WARC file contents

65

"""

66

67

def get_stream(self):

68

"""

69

Get buffer as stream positioned at beginning.

70

71

Returns:

72

io.BytesIO: Stream containing WARC data

73

"""

74

```

75

76

### Record Builder

77

78

Factory class for creating various types of WARC records with proper headers and metadata.

79

80

```python { .api }

81

class RecordBuilder:

82

def __init__(self, warc_version=None, header_filter=None):

83

"""

84

Builder for creating WARC records.

85

86

Args:

87

warc_version (str): WARC version to use (default None)

88

header_filter (callable): Optional function to filter headers

89

"""

90

91

def create_warc_record(self, uri, record_type, payload=None, length=None,

92

warc_content_type='', warc_headers_dict=None,

93

warc_headers=None, http_headers=None):

94

"""

95

Create a general WARC record.

96

97

Args:

98

uri (str): Target URI for the record

99

record_type (str): WARC record type ('response', 'request', etc.)

100

payload: Record payload as file-like object or bytes

101

length (int): Content length (calculated if None)

102

warc_content_type (str): WARC content type (default '')

103

warc_headers_dict (dict): Additional WARC headers as dict

104

warc_headers: Additional WARC headers as StatusAndHeaders

105

http_headers: HTTP headers as StatusAndHeaders object

106

107

Returns:

108

ArcWarcRecord: Created WARC record

109

"""

110

111

def create_revisit_record(self, uri, digest, refers_to_uri, refers_to_date,

112

http_headers=None, warc_headers_dict=None):

113

"""

114

Create a revisit record that references an earlier record.

115

116

Args:

117

uri (str): Target URI

118

digest (str): Digest of referenced record

119

refers_to_uri (str): URI of referenced record

120

refers_to_date (str): Date of referenced record

121

http_headers: HTTP headers as StatusAndHeaders object

122

warc_headers_dict (dict): Additional WARC headers

123

124

Returns:

125

ArcWarcRecord: Created revisit record

126

"""

127

128

def create_warcinfo_record(self, filename, info):

129

"""

130

Create a warcinfo record with file metadata.

131

132

Args:

133

filename (str): Name of the WARC file

134

info (dict or str): Metadata information

135

136

Returns:

137

ArcWarcRecord: Created warcinfo record

138

"""

139

140

def curr_warc_date(self):

141

"""

142

Get current date in WARC format.

143

144

Returns:

145

str: Current timestamp in WARC date format

146

"""

147

148

def ensure_digest(self, record, block=True, payload=True):

149

"""

150

Ensure record has proper digests calculated.

151

152

Args:

153

record: Record to add digests to

154

block (bool): Calculate block digest if True

155

payload (bool): Calculate payload digest if True

156

"""

157

158

# RecordBuilder Constants

159

REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest'

160

REVISIT_PROFILE_1_1 = 'http://netpreserve.org/warc/1.1/revisit/identical-payload-digest'

161

WARC_1_0 = 'WARC/1.0'

162

WARC_1_1 = 'WARC/1.1'

163

WARC_VERSION = WARC_1_0

164

NO_PAYLOAD_DIGEST_TYPES = ('warcinfo', 'revisit')

165

```

166

167

### Base Writer and Compression

168

169

Base classes and utilities for WARC writing with compression support.

170

171

```python { .api }

172

class BaseWARCWriter:

173

def __init__(self, gzip=True, warc_version=None, header_filter=None):

174

"""

175

Base class for WARC writers.

176

177

Args:

178

gzip (bool): Enable gzip compression

179

warc_version (str): WARC version

180

header_filter (callable): Header filter function

181

"""

182

183

def write_request_response_pair(self, req, resp, params=None):

184

"""Write request/response pair with proper linking."""

185

186

def write_record(self, record, params=None):

187

"""Write single record (abstract method)."""

188

189

class GzippingWrapper:

190

def __init__(self, out):

191

"""

192

Wrapper that gzip-compresses data on write.

193

194

Args:

195

out: Output stream to write compressed data to

196

"""

197

198

def write(self, buff):

199

"""

200

Write and compress data.

201

202

Args:

203

buff (bytes): Data to compress and write

204

"""

205

206

def flush(self):

207

"""Flush compressed data to output stream."""

208

```

209

210

## Usage Examples

211

212

### Basic WARC File Creation

213

214

```python

215

from warcio import WARCWriter

216

from warcio.recordbuilder import RecordBuilder

217

from warcio.statusandheaders import StatusAndHeaders

218

import io

219

220

# Create a WARC file

221

output_buffer = io.BytesIO()

222

writer = WARCWriter(output_buffer)

223

builder = RecordBuilder()

224

225

# Create warcinfo record

226

warcinfo_record = builder.create_warcinfo_record(

227

filename='example.warc',

228

info={'software': 'warcio', 'format': 'WARC File Format 1.1'}

229

)

230

writer.write_record(warcinfo_record)

231

232

# Create response record

233

http_headers = StatusAndHeaders('200 OK', [

234

('Content-Type', 'text/html'),

235

('Content-Length', '13')

236

])

237

238

response_record = builder.create_warc_record(

239

uri='http://example.com',

240

record_type='response',

241

payload=io.BytesIO(b'Hello, World!'),

242

http_headers=http_headers

243

)

244

writer.write_record(response_record)

245

246

# Get the WARC data

247

warc_data = output_buffer.getvalue()

248

print(f"Created WARC file of {len(warc_data)} bytes")

249

```

250

251

### Request/Response Pair Creation

252

253

```python

254

from warcio import WARCWriter

255

from warcio.recordbuilder import RecordBuilder

256

from warcio.statusandheaders import StatusAndHeaders

257

import io

258

259

output_buffer = io.BytesIO()

260

writer = WARCWriter(output_buffer)

261

builder = RecordBuilder()

262

263

# Create request record

264

request_headers = StatusAndHeaders('GET / HTTP/1.1', [

265

('Host', 'example.com'),

266

('User-Agent', 'warcio-client/1.0')

267

], is_http_request=True)

268

269

request_record = builder.create_warc_record(

270

uri='http://example.com/',

271

record_type='request',

272

http_headers=request_headers

273

)

274

275

# Create response record

276

response_headers = StatusAndHeaders('200 OK', [

277

('Content-Type', 'text/html'),

278

('Content-Length', '13')

279

])

280

281

response_record = builder.create_warc_record(

282

uri='http://example.com/',

283

record_type='response',

284

payload=io.BytesIO(b'Hello, World!'),

285

http_headers=response_headers

286

)

287

288

# Write as linked pair

289

writer.write_request_response_pair(request_record, response_record)

290

```

291

292

### Buffer Writer Usage

293

294

```python

295

from warcio.warcwriter import BufferWARCWriter

296

from warcio.recordbuilder import RecordBuilder

297

from warcio.statusandheaders import StatusAndHeaders

298

import io

299

300

# Use buffer writer for in-memory operations

301

writer = BufferWARCWriter()

302

builder = RecordBuilder()

303

304

# Create and write record

305

record = builder.create_warc_record(

306

uri='http://example.com',

307

record_type='response',

308

payload=io.BytesIO(b'Hello, World!'),

309

http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')])

310

)

311

writer.write_record(record)

312

313

# Get contents as bytes

314

warc_bytes = writer.get_contents()

315

316

# Or get as stream for further processing

317

warc_stream = writer.get_stream()

318

```

319

320

### Revisit Record Creation

321

322

```python

323

from warcio.recordbuilder import RecordBuilder

324

from warcio.statusandheaders import StatusAndHeaders
import io

325

326

builder = RecordBuilder()

327

328

# Create original response record

329

original_record = builder.create_warc_record(

330

uri='http://example.com',

331

record_type='response',

332

payload=io.BytesIO(b'Original content'),

333

http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')])

334

)

335

336

# Get the payload digest from the original record

337

original_digest = original_record.rec_headers.get_header('WARC-Payload-Digest')

338

original_date = original_record.rec_headers.get_header('WARC-Date')

339

340

# Create revisit record referencing the original

341

revisit_record = builder.create_revisit_record(

342

uri='http://example.com',

343

digest=original_digest,

344

refers_to_uri='http://example.com',

345

refers_to_date=original_date,

346

http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')])

347

)

348

```

349

350

### Custom WARC Headers

351

352

```python

353

from warcio.recordbuilder import RecordBuilder

354

from warcio.statusandheaders import StatusAndHeaders

355

import io

356

357

builder = RecordBuilder()

358

359

# Create record with custom WARC headers

360

custom_warc_headers = {

361

'WARC-IP-Address': '192.168.1.1',

362

'WARC-Block-Digest': 'sha1:AAAAAAAAAAAAAAAAAAAAAAAAAAA=',

363

'Custom-Header': 'custom-value'

364

}

365

366

record = builder.create_warc_record(

367

uri='http://example.com',

368

record_type='response',

369

payload=io.BytesIO(b'Hello, World!'),

370

http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')]),

371

warc_headers_dict=custom_warc_headers

372

)

373

374

# Ensure digests are calculated

375

builder.ensure_digest(record, block=True, payload=True)

376

```

377

378

### Uncompressed WARC Files

379

380

```python

381

from warcio import WARCWriter

382

import io

383

384

# Create uncompressed WARC file

385

output_buffer = io.BytesIO()

386

writer = WARCWriter(output_buffer, gzip=False) # Disable compression

387

388

# Write records normally

389

# ... record creation and writing code ...

390

```