0
# WARC Writing
1
2
This section covers creating and writing WARC files: record building, header management, compression, and digest calculation. The library provides both streaming writers and in-memory buffer writers for different use cases.
3
4
## Capabilities
5
6
### WARC Writer
7
8
Main class for writing WARC records to files or streams with optional compression.
9
10
```python { .api }
11
class WARCWriter:
12
def __init__(self, filebuf, gzip=True, warc_version=None, header_filter=None):
13
"""
14
WARC writer for creating WARC files.
15
16
Args:
17
filebuf: File-like object to write to
18
gzip (bool): Enable gzip compression (default True)
19
warc_version (str): WARC version to use (default None uses WARC/1.0)
20
header_filter (callable): Optional function to filter headers
21
"""
22
23
def write_record(self, record, params=None):
24
"""
25
Write a WARC record to the output stream.
26
27
Args:
28
record: ArcWarcRecord to write
29
params: Optional parameters for writing
30
"""
31
32
def write_request_response_pair(self, req, resp, params=None):
33
"""
34
Write a request/response pair with proper linking.
35
36
Args:
37
req: Request record
38
resp: Response record
39
params: Optional parameters for writing
40
"""
41
```
42
43
### Buffer WARC Writer
44
45
WARC writer that writes to an in-memory buffer for testing or temporary storage.
46
47
```python { .api }
48
class BufferWARCWriter(WARCWriter):
49
def __init__(self, gzip=True, warc_version=None, header_filter=None):
50
"""
51
WARC writer that writes to in-memory buffer.
52
53
Args:
54
gzip (bool): Enable gzip compression (default True)
55
warc_version (str): WARC version to use
56
header_filter (callable): Optional function to filter headers
57
"""
58
59
def get_contents(self):
60
"""
61
Get buffer contents as bytes.
62
63
Returns:
64
bytes: Complete WARC file contents
65
"""
66
67
def get_stream(self):
68
"""
69
Get buffer as stream positioned at beginning.
70
71
Returns:
72
file-like object: Binary stream containing WARC data
73
"""
74
```
75
76
### Record Builder
77
78
Factory class for creating various types of WARC records with proper headers and metadata.
79
80
```python { .api }
81
class RecordBuilder:
82
def __init__(self, warc_version=None, header_filter=None):
83
"""
84
Builder for creating WARC records.
85
86
Args:
87
warc_version (str): WARC version to use (default None)
88
header_filter (callable): Optional function to filter headers
89
"""
90
91
def create_warc_record(self, uri, record_type, payload=None, length=None,
92
warc_content_type='', warc_headers_dict=None,
93
warc_headers=None, http_headers=None):
94
"""
95
Create a general WARC record.
96
97
Args:
98
uri (str): Target URI for the record
99
record_type (str): WARC record type ('response', 'request', etc.)
100
payload: Record payload as file-like object or bytes
101
length (int): Content length (calculated if None)
102
warc_content_type (str): WARC content type (default '')
103
warc_headers_dict (dict): Additional WARC headers as dict
104
warc_headers: Additional WARC headers as StatusAndHeaders
105
http_headers: HTTP headers as StatusAndHeaders object
106
107
Returns:
108
ArcWarcRecord: Created WARC record
109
"""
110
111
def create_revisit_record(self, uri, digest, refers_to_uri, refers_to_date,
112
http_headers=None, warc_headers_dict=None):
113
"""
114
Create a revisit record that references an earlier record.
115
116
Args:
117
uri (str): Target URI
118
digest (str): Digest of referenced record
119
refers_to_uri (str): URI of referenced record
120
refers_to_date (str): Date of referenced record
121
http_headers: HTTP headers as StatusAndHeaders object
122
warc_headers_dict (dict): Additional WARC headers
123
124
Returns:
125
ArcWarcRecord: Created revisit record
126
"""
127
128
def create_warcinfo_record(self, filename, info):
129
"""
130
Create a warcinfo record with file metadata.
131
132
Args:
133
filename (str): Name of the WARC file
134
info (dict or str): Metadata information
135
136
Returns:
137
ArcWarcRecord: Created warcinfo record
138
"""
139
140
def curr_warc_date(self):
141
"""
142
Get current date in WARC format.
143
144
Returns:
145
str: Current timestamp in WARC date format
146
"""
147
148
def ensure_digest(self, record, block=True, payload=True):
149
"""
150
Ensure record has proper digests calculated.
151
152
Args:
153
record: Record to add digests to
154
block (bool): Calculate block digest if True
155
payload (bool): Calculate payload digest if True
156
"""
157
158
# RecordBuilder Constants
159
REVISIT_PROFILE = 'http://netpreserve.org/warc/1.0/revisit/identical-payload-digest'
160
REVISIT_PROFILE_1_1 = 'http://netpreserve.org/warc/1.1/revisit/identical-payload-digest'
161
WARC_1_0 = 'WARC/1.0'
162
WARC_1_1 = 'WARC/1.1'
163
WARC_VERSION = WARC_1_0
164
NO_PAYLOAD_DIGEST_TYPES = ('warcinfo', 'revisit')
165
```
166
167
### Base Writer and Compression
168
169
Base classes and utilities for WARC writing with compression support.
170
171
```python { .api }
172
class BaseWARCWriter:
173
def __init__(self, gzip=True, warc_version=None, header_filter=None):
174
"""
175
Base class for WARC writers.
176
177
Args:
178
gzip (bool): Enable gzip compression
179
warc_version (str): WARC version
180
header_filter (callable): Header filter function
181
"""
182
183
def write_request_response_pair(self, req, resp, params=None):
184
"""Write request/response pair with proper linking."""
185
186
def write_record(self, record, params=None):
187
"""Write single record (abstract method)."""
188
189
class GzippingWrapper:
190
def __init__(self, out):
191
"""
192
Wrapper that gzip-compresses data on write.
193
194
Args:
195
out: Output stream to write compressed data to
196
"""
197
198
def write(self, buff):
199
"""
200
Write and compress data.
201
202
Args:
203
buff (bytes): Data to compress and write
204
"""
205
206
def flush(self):
207
"""Flush compressed data to output stream."""
208
```
209
210
## Usage Examples
211
212
### Basic WARC File Creation
213
214
```python
215
from warcio import WARCWriter
216
from warcio.recordbuilder import RecordBuilder
217
from warcio.statusandheaders import StatusAndHeaders
218
import io
219
220
# Create a WARC file
221
output_buffer = io.BytesIO()
222
writer = WARCWriter(output_buffer)
223
builder = RecordBuilder()
224
225
# Create warcinfo record
226
warcinfo_record = builder.create_warcinfo_record(
227
filename='example.warc.gz',
228
info={'software': 'warcio', 'format': 'WARC File Format 1.0'}
229
)
230
writer.write_record(warcinfo_record)
231
232
# Create response record
233
http_headers = StatusAndHeaders('200 OK', [
234
('Content-Type', 'text/html'),
235
('Content-Length', '13')
236
], protocol='HTTP/1.1')
237
238
response_record = builder.create_warc_record(
239
uri='http://example.com',
240
record_type='response',
241
payload=io.BytesIO(b'Hello, World!'),
242
http_headers=http_headers
243
)
244
writer.write_record(response_record)
245
246
# Get the WARC data
247
warc_data = output_buffer.getvalue()
248
print(f"Created WARC file of {len(warc_data)} bytes")
249
```
250
251
### Request/Response Pair Creation
252
253
```python
254
from warcio import WARCWriter
255
from warcio.recordbuilder import RecordBuilder
256
from warcio.statusandheaders import StatusAndHeaders
257
import io
258
259
output_buffer = io.BytesIO()
260
writer = WARCWriter(output_buffer)
261
builder = RecordBuilder()
262
263
# Create request record
264
request_headers = StatusAndHeaders('GET / HTTP/1.1', [
265
('Host', 'example.com'),
266
('User-Agent', 'warcio-client/1.0')
267
], is_http_request=True)
268
269
request_record = builder.create_warc_record(
270
uri='http://example.com/',
271
record_type='request',
272
http_headers=request_headers
273
)
274
275
# Create response record
276
response_headers = StatusAndHeaders('200 OK', [
277
('Content-Type', 'text/html'),
278
('Content-Length', '13')
279
], protocol='HTTP/1.1')
280
281
response_record = builder.create_warc_record(
282
uri='http://example.com/',
283
record_type='response',
284
payload=io.BytesIO(b'Hello, World!'),
285
http_headers=response_headers
286
)
287
288
# Write as linked pair
289
writer.write_request_response_pair(request_record, response_record)
290
```
291
292
### Buffer Writer Usage
293
294
```python
295
from warcio.warcwriter import BufferWARCWriter
296
from warcio.recordbuilder import RecordBuilder
297
from warcio.statusandheaders import StatusAndHeaders
298
import io
299
300
# Use buffer writer for in-memory operations
301
writer = BufferWARCWriter()
302
builder = RecordBuilder()
303
304
# Create and write record
305
record = builder.create_warc_record(
306
uri='http://example.com',
307
record_type='response',
308
payload=io.BytesIO(b'Hello, World!'),
309
http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')], protocol='HTTP/1.1')
310
)
311
writer.write_record(record)
312
313
# Get contents as bytes
314
warc_bytes = writer.get_contents()
315
316
# Or get as stream for further processing
317
warc_stream = writer.get_stream()
318
```
319
320
### Revisit Record Creation
321
322
```python
323
from warcio.recordbuilder import RecordBuilder
324
from warcio.statusandheaders import StatusAndHeaders
import io
325
326
builder = RecordBuilder()
327
328
# Create original response record
329
original_record = builder.create_warc_record(
330
uri='http://example.com',
331
record_type='response',
332
payload=io.BytesIO(b'Original content'),
333
http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')], protocol='HTTP/1.1')
334
)
335
336
# Get the payload digest from the original record
337
original_digest = original_record.rec_headers.get_header('WARC-Payload-Digest')
338
original_date = original_record.rec_headers.get_header('WARC-Date')
339
340
# Create revisit record referencing the original
341
revisit_record = builder.create_revisit_record(
342
uri='http://example.com',
343
digest=original_digest,
344
refers_to_uri='http://example.com',
345
refers_to_date=original_date,
346
http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')], protocol='HTTP/1.1')
347
)
348
```
349
350
### Custom WARC Headers
351
352
```python
353
from warcio.recordbuilder import RecordBuilder
354
from warcio.statusandheaders import StatusAndHeaders
355
import io
356
357
builder = RecordBuilder()
358
359
# Create record with custom WARC headers
360
custom_warc_headers = {
361
'WARC-IP-Address': '192.168.1.1',
362
'WARC-Block-Digest': 'sha1:AAAAAAAAAAAAAAAAAAAAAAAAAAA=',
363
'Custom-Header': 'custom-value'
364
}
365
366
record = builder.create_warc_record(
367
uri='http://example.com',
368
record_type='response',
369
payload=io.BytesIO(b'Hello, World!'),
370
http_headers=StatusAndHeaders('200 OK', [('Content-Type', 'text/plain')], protocol='HTTP/1.1'),
371
warc_headers_dict=custom_warc_headers
372
)
373
374
# Ensure digests are calculated
375
builder.ensure_digest(record, block=True, payload=True)
376
```
377
378
### Uncompressed WARC Files
379
380
```python
381
from warcio import WARCWriter
382
import io
383
384
# Create uncompressed WARC file
385
output_buffer = io.BytesIO()
386
writer = WARCWriter(output_buffer, gzip=False) # Disable compression
387
388
# Write records normally
389
# ... record creation and writing code ...
390
```