Tessl Tile for pypi/warcio@1.7.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

archive-reading.md cli-tools.md http-capture.md http-headers.md index.md stream-processing.md time-utilities.md warc-writing.md

docs/archive-reading.md

0
# Archive Reading
1

2
Core functionality for reading and iterating through WARC and ARC files with automatic format detection, decompression, and record parsing. The library provides streaming access to archive records without loading entire files into memory.
3

4
## Capabilities
5

6
### Archive Iterator
7

8
The main interface for reading WARC and ARC files, providing iterator access to records with automatic format detection and decompression support.
9

10
```python { .api }
11
class ArchiveIterator:
12
    def __init__(self, fileobj, no_record_parse=False, verify_http=False, 
13
                 arc2warc=False, ensure_http_headers=False, 
14
                 block_size=16384, check_digests=False):
15
        """
16
        Iterator over records in WARC and ARC files.
17
        
18
        Args:
19
            fileobj: File-like object to read from
20
            no_record_parse (bool): Skip record parsing if True
21
            verify_http (bool): Verify HTTP headers if True
22
            arc2warc (bool): Convert ARC records to WARC format if True
23
            ensure_http_headers (bool): Ensure HTTP headers are present if True
24
            block_size (int): Buffer size for reading (default 16384)
25
            check_digests (bool): Verify record digests if True
26
        """
27
    
28
    def __iter__(self):
29
        """Return iterator."""
30
    
31
    def __next__(self):
32
        """Get next record."""
33
    
34
    def close(self):
35
        """Close iterator and cleanup resources."""
36
    
37
    def get_record_offset(self):
38
        """
39
        Get current record file offset.
40
        
41
        Returns:
42
            int: Byte offset of current record in file
43
        """
44
    
45
    def get_record_length(self):
46
        """
47
        Get current record length.
48
        
49
        Returns:
50
            int: Length of current record in bytes
51
        """
52
    
53
    def read_to_end(self, record=None):
54
        """
55
        Read remainder of the stream.
56
        
57
        Args:
58
            record: Optional record to read (uses current record if None)
59
        """
60
```
61

62
### Format-Specific Iterators
63

64
Specialized iterators for specific archive formats when you know the format in advance.
65

66
```python { .api }
67
class WARCIterator(ArchiveIterator):
68
    def __init__(self, *args, **kwargs):
69
        """ArchiveIterator specialized for WARC format."""
70

71
class ARCIterator(ArchiveIterator):
72
    def __init__(self, *args, **kwargs):
73
        """ArchiveIterator specialized for ARC format."""
74
```
75

76
### Stream Wrapper
77

78
Utility class for handling non-seekable streams that need position tracking.
79

80
```python { .api }
81
class UnseekableYetTellable:
82
    def __init__(self, fh):
83
        """
84
        Wrapper for streams that can't seek but need tell() functionality.
85
        
86
        Args:
87
            fh: File handle to wrap
88
        """
89
    
90
    def tell(self):
91
        """
92
        Return current offset.
93
        
94
        Returns:
95
            int: Current position in stream
96
        """
97
    
98
    def read(self, size=-1):
99
        """
100
        Read data and track offset.
101
        
102
        Args:
103
            size (int): Number of bytes to read (-1 for all)
104
            
105
        Returns:
106
            bytes: Data read from stream
107
        """
108
```
109

110
## Usage Examples
111

112
### Basic Archive Reading
113

114
```python
115
from warcio import ArchiveIterator
116

117
# Read from a WARC file
118
with open('example.warc.gz', 'rb') as stream:
119
    for record in ArchiveIterator(stream):
120
        print(f"Record Type: {record.rec_type}")
121
        print(f"URI: {record.rec_headers.get_header('WARC-Target-URI')}")
122
        
123
        if record.http_headers:
124
            print(f"HTTP Status: {record.http_headers.get_statuscode()}")
125
            print(f"Content-Type: {record.http_headers.get_header('Content-Type')}")
126
        
127
        # Read record content
128
        content = record.content_stream().read()
129
        print(f"Content Length: {len(content)}")
130
        print("---")
131
```
132

133
### Reading with Options
134

135
```python
136
from warcio import ArchiveIterator
137

138
# Read with digest verification and HTTP header checking
139
with open('example.warc.gz', 'rb') as stream:
140
    iterator = ArchiveIterator(
141
        stream,
142
        verify_http=True,        # Verify HTTP headers
143
        check_digests=True,      # Verify record digests
144
        ensure_http_headers=True # Ensure HTTP headers are present
145
    )
146
    
147
    for record in iterator:
148
        # Get record position information
149
        offset = iterator.get_record_offset()
150
        length = iterator.get_record_length()
151
        print(f"Record at offset {offset}, length {length}")
152
        
153
        # Process record
154
        if record.rec_type == 'response':
155
            print(f"Response from: {record.rec_headers.get_header('WARC-Target-URI')}")
156
    
157
    iterator.close()
158
```
159

160
### Converting ARC to WARC
161

162
```python
163
from warcio import ArchiveIterator
164

165
# Read ARC file and convert records to WARC format
166
with open('example.arc.gz', 'rb') as stream:
167
    for record in ArchiveIterator(stream, arc2warc=True):
168
        # Record is now in WARC format even if source was ARC
169
        print(f"WARC Record Type: {record.rec_type}")
170
        print(f"WARC-Date: {record.rec_headers.get_header('WARC-Date')}")
171
```
172

173
### Format-Specific Reading
174

175
```python
176
from warcio.archiveiterator import WARCIterator, ARCIterator
177

178
# Use format-specific iterator when format is known
179
with open('example.warc.gz', 'rb') as stream:
180
    for record in WARCIterator(stream):
181
        print(f"WARC Record: {record.rec_type}")
182

183
# For ARC files
184
with open('example.arc.gz', 'rb') as stream:
185
    for record in ARCIterator(stream):
186
        print(f"ARC Record: {record.rec_type}")
187
```
188

189
### Handling Non-Seekable Streams
190

191
```python
192
from warcio import ArchiveIterator
193
from warcio.archiveiterator import UnseekableYetTellable
194
import requests
195

196
# Read from HTTP stream
197
response = requests.get('https://example.com/archive.warc.gz', stream=True)
198
wrapped_stream = UnseekableYetTellable(response.raw)
199

200
for record in ArchiveIterator(wrapped_stream):
201
    print(f"Position: {wrapped_stream.tell()}")
202
    print(f"Record: {record.rec_type}")
203
```

Version

Tile

Files

docs/archive-reading.md

docs/archive-reading.md