or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

archive-reading.md cli-tools.md http-capture.md http-headers.md index.md stream-processing.md time-utilities.md warc-writing.md

archive-reading.md docs/

0

# Archive Reading

1

2

Core functionality for reading and iterating through WARC and ARC files with automatic format detection, decompression, and record parsing. The library provides streaming access to archive records without loading entire files into memory.

3

4

## Capabilities

5

6

### Archive Iterator

7

8

The main interface for reading WARC and ARC files, providing iterator access to records with automatic format detection and decompression support.

9

10

```python { .api }

11

class ArchiveIterator:

12

def __init__(self, fileobj, no_record_parse=False, verify_http=False,

13

arc2warc=False, ensure_http_headers=False,

14

block_size=16384, check_digests=False):

15

"""

16

Iterator over records in WARC and ARC files.

17

18

Args:

19

fileobj: File-like object to read from

20

no_record_parse (bool): Skip record parsing if True

21

verify_http (bool): Verify HTTP headers if True

22

arc2warc (bool): Convert ARC records to WARC format if True

23

ensure_http_headers (bool): Ensure HTTP headers are present if True

24

block_size (int): Buffer size for reading (default 16384)

25

check_digests (bool): Verify record digests if True

26

"""

27

28

def __iter__(self):

29

"""Return iterator."""

30

31

def __next__(self):

32

"""Get next record."""

33

34

def close(self):

35

"""Close iterator and cleanup resources."""

36

37

def get_record_offset(self):

38

"""

39

Get current record file offset.

40

41

Returns:

42

int: Byte offset of current record in file

43

"""

44

45

def get_record_length(self):

46

"""

47

Get current record length.

48

49

Returns:

50

int: Length of current record in bytes

51

"""

52

53

def read_to_end(self, record=None):

54

"""

55

Read remainder of the stream.

56

57

Args:

58

record: Optional record to read (uses current record if None)

59

"""

60

```

61

62

### Format-Specific Iterators

63

64

Specialized iterators for specific archive formats when you know the format in advance.

65

66

```python { .api }

67

class WARCIterator(ArchiveIterator):

68

def __init__(self, *args, **kwargs):

69

"""ArchiveIterator specialized for WARC format."""

70

71

class ARCIterator(ArchiveIterator):

72

def __init__(self, *args, **kwargs):

73

"""ArchiveIterator specialized for ARC format."""

74

```

75

76

### Stream Wrapper

77

78

Utility class for handling non-seekable streams that need position tracking.

79

80

```python { .api }

81

class UnseekableYetTellable:

82

def __init__(self, fh):

83

"""

84

Wrapper for streams that can't seek but need tell() functionality.

85

86

Args:

87

fh: File handle to wrap

88

"""

89

90

def tell(self):

91

"""

92

Return current offset.

93

94

Returns:

95

int: Current position in stream

96

"""

97

98

def read(self, size=-1):

99

"""

100

Read data and track offset.

101

102

Args:

103

size (int): Number of bytes to read (-1 for all)

104

105

Returns:

106

bytes: Data read from stream

107

"""

108

```

109

110

## Usage Examples

111

112

### Basic Archive Reading

113

114

```python

115

from warcio import ArchiveIterator

116

117

# Read from a WARC file

118

with open('example.warc.gz', 'rb') as stream:

119

for record in ArchiveIterator(stream):

120

print(f"Record Type: {record.rec_type}")

121

print(f"URI: {record.rec_headers.get_header('WARC-Target-URI')}")

122

123

if record.http_headers:

124

print(f"HTTP Status: {record.http_headers.get_statuscode()}")

125

print(f"Content-Type: {record.http_headers.get_header('Content-Type')}")

126

127

# Read record content

128

content = record.content_stream().read()

129

print(f"Content Length: {len(content)}")

130

print("---")

131

```

132

133

### Reading with Options

134

135

```python

136

from warcio import ArchiveIterator

137

138

# Read with digest verification and HTTP header checking

139

with open('example.warc.gz', 'rb') as stream:

140

iterator = ArchiveIterator(

141

stream,

142

verify_http=True, # Verify HTTP headers

143

check_digests=True, # Verify record digests

144

ensure_http_headers=True # Ensure HTTP headers are present

145

)

146

147

for record in iterator:

148

# Get record position information

149

offset = iterator.get_record_offset()

150

length = iterator.get_record_length()

151

print(f"Record at offset {offset}, length {length}")

152

153

# Process record

154

if record.rec_type == 'response':

155

print(f"Response from: {record.rec_headers.get_header('WARC-Target-URI')}")

156

157

iterator.close()

158

```

159

160

### Converting ARC to WARC

161

162

```python

163

from warcio import ArchiveIterator

164

165

# Read ARC file and convert records to WARC format

166

with open('example.arc.gz', 'rb') as stream:

167

for record in ArchiveIterator(stream, arc2warc=True):

168

# Record is now in WARC format even if source was ARC

169

print(f"WARC Record Type: {record.rec_type}")

170

print(f"WARC-Date: {record.rec_headers.get_header('WARC-Date')}")

171

```

172

173

### Format-Specific Reading

174

175

```python

176

from warcio.archiveiterator import WARCIterator, ARCIterator

177

178

# Use format-specific iterator when format is known

179

with open('example.warc.gz', 'rb') as stream:

180

for record in WARCIterator(stream):

181

print(f"WARC Record: {record.rec_type}")

182

183

# For ARC files

184

with open('example.arc.gz', 'rb') as stream:

185

for record in ARCIterator(stream):

186

print(f"ARC Record: {record.rec_type}")

187

```

188

189

### Handling Non-Seekable Streams

190

191

```python

192

from warcio import ArchiveIterator

193

from warcio.archiveiterator import UnseekableYetTellable

194

import requests

195

196

# Read from HTTP stream

197

response = requests.get('https://example.com/archive.warc.gz', stream=True)

198

wrapped_stream = UnseekableYetTellable(response.raw)

199

200

for record in ArchiveIterator(wrapped_stream):

201

print(f"Position: {wrapped_stream.tell()}")

202

print(f"Record: {record.rec_type}")

203

```