# File Management

File management operations provide access to individual files within Archive.org items, including file retrieval, download, deletion, and metadata access.

## Capabilities

### File Retrieval

Access File objects representing individual files within Archive.org items.

```python { .api }
def get_files(identifier, files=None, formats=None, glob_pattern=None, exclude_pattern=None, on_the_fly=False, **get_item_kwargs):
    """
    Get File objects from an item with optional filtering.

    Args:
        identifier (str): Item identifier
        files (list, optional): Specific file names to retrieve
        formats (list, optional): File formats to filter by (e.g., ['pdf', 'txt', 'jpg'])
        glob_pattern (str, optional): Glob pattern for file selection (e.g., '*.pdf', 'chapter*.txt')
        exclude_pattern (str, optional): Glob pattern for exclusion
        on_the_fly (bool): Include on-the-fly derived files
        **get_item_kwargs: Additional arguments passed to get_item

    Returns:
        list: List of File objects matching the criteria
    """

class File:
    """
    Represents a file within an Archive.org item.
    """

    def __init__(self, item, name, file_metadata=None):
        """
        Initialize File object.

        Args:
            item (Item): Parent Item object
            name (str): Filename
            file_metadata (dict, optional): Pre-fetched file metadata
        """
```

### File Properties

Access file metadata, URLs, and status information.

```python { .api }
class File:
    @property
    def item(self):
        """Item: Parent Item object."""

    @property
    def identifier(self):
        """str: Item identifier (same as parent item)."""

    @property
    def name(self):
        """str: Filename."""

    @property
    def url(self):
        """str: Direct download URL for the file."""

    @property
    def auth(self):
        """S3Auth: S3 authentication object if credentials are available."""

    @property
    def exists(self):
        """bool: Whether the file exists in the item."""

    @property
    def metadata(self):
        """dict: File metadata dictionary."""

    # Standard file properties
    @property
    def size(self):
        """int: File size in bytes."""

    @property
    def format(self):
        """str: File format/type."""

    @property
    def md5(self):
        """str: MD5 checksum of the file."""

    @property
    def sha1(self):
        """str: SHA1 checksum of the file."""

    @property
    def mtime(self):
        """str: Last modification time."""

    @property
    def crc32(self):
        """str: CRC32 checksum of the file."""

    @property
    def source(self):
        """str: Source of the file (original or derived)."""
```

### File Download

Download individual files with various options.

```python { .api }
class File:
    def download(self, file_path=None, verbose=None, ignore_existing=None, checksum=None, checksum_archive=None, destdir=None, retries=None, ignore_errors=None, no_change_timestamp=None, timeout=None, **kwargs):
        """
        Download this file.

        Args:
            file_path (str, optional): Local path to save file (defaults to filename)
            verbose (bool, optional): Enable verbose output
            ignore_existing (bool, optional): Re-download if file already exists
            checksum (bool, optional): Verify checksum after download
            checksum_archive (bool, optional): Use archive-provided checksums
            destdir (str, optional): Destination directory
            retries (int, optional): Number of retry attempts
            ignore_errors (bool, optional): Continue on errors
            no_change_timestamp (bool, optional): Don't update file timestamp
            timeout (int, optional): Request timeout in seconds
            **kwargs: Additional download options

        Returns:
            Request or Response: Download operation result

        Raises:
            InvalidChecksumError: If checksum verification fails
            requests.RequestException: If download fails
        """
```

### File Deletion

Delete files from Archive.org items.

```python { .api }
def delete(identifier, files=None, formats=None, glob_pattern=None, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, **kwargs):
    """
    Delete files from an Archive.org item.

    Args:
        identifier (str): Item identifier
        files (list, optional): Specific files to delete
        formats (list, optional): File formats to delete (e.g., ['pdf', 'jpg'])
        glob_pattern (str, optional): Glob pattern for file selection
        cascade_delete (bool): Delete derived files along with source files
        access_key (str, optional): IA-S3 access key (overrides config)
        secret_key (str, optional): IA-S3 secret key (overrides config)
        verbose (bool): Enable verbose output
        debug (bool): Enable debug logging
        **kwargs: Additional arguments passed to get_item

    Returns:
        list: List of Request/Response objects from delete operations

    Raises:
        AuthenticationError: If authentication fails
        ItemLocateError: If item cannot be located
    """

class File:
    def delete(self, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, request_kwargs=None):
        """
        Delete this file from the Archive.org item.

        Args:
            cascade_delete (bool): Delete derived files along with this file
            access_key (str, optional): IA-S3 access key
            secret_key (str, optional): IA-S3 secret key
            verbose (bool): Enable verbose output
            debug (bool): Enable debug logging
            request_kwargs (dict, optional): Additional request arguments

        Returns:
            Request or Response: Delete operation result

        Raises:
            AuthenticationError: If authentication fails
        """
```

## Usage Examples

### Basic File Access

```python
import internetarchive

# Get all files from an item
files = internetarchive.get_files('example-item')

# Print a short summary of each file, separated by a divider line
for file in files:
    print(f"File: {file.name}")
    print(f"Size: {file.size} bytes")
    print(f"Format: {file.format}")
    print(f"MD5: {file.md5}")
    print("---")
```

### File Filtering

```python
import internetarchive

# Get only PDF files
pdf_files = internetarchive.get_files('example-item', formats=['pdf'])

# Get files matching pattern
text_files = internetarchive.get_files('example-item', glob_pattern='*.txt')

# Get specific files
specific_files = internetarchive.get_files(
    'example-item',
    files=['document.pdf', 'readme.txt']
)

# Exclude certain patterns
filtered_files = internetarchive.get_files(
    'example-item',
    exclude_pattern='*_thumb.jpg'
)
```

### File Download Operations

```python
import internetarchive

# Download specific file
item = internetarchive.get_item('example-item')
file = item.get_file('document.pdf')

# Check `exists` explicitly: it reports whether the file is actually
# present in the item.
if file.exists:
    # Download with verification
    file.download(
        file_path='./downloads/document.pdf',
        checksum=True,
        verbose=True
    )

# Download all files of specific format
for file in item.get_files(formats=['pdf']):
    file.download(destdir='./pdf_downloads')
```

### Bulk File Operations

```python
import internetarchive

# Download all images from an item
item = internetarchive.get_item('photo-collection')

image_formats = ['jpg', 'jpeg', 'png', 'gif']
for file in item.get_files(formats=image_formats):
    print(f"Downloading {file.name} ({file.size} bytes)")
    file.download(
        destdir='./images',
        ignore_existing=True,
        checksum=True
    )
```

### File Deletion

```python
import internetarchive

# Delete specific files
internetarchive.delete(
    'my-item',
    files=['unwanted.pdf', 'old-version.txt'],
    verbose=True
)

# Delete files by format
internetarchive.delete(
    'my-item',
    formats=['tmp'],  # Delete all temporary files
    cascade_delete=True
)

# Delete using pattern
internetarchive.delete(
    'my-item',
    glob_pattern='*_backup.*'
)
```

### File Metadata Analysis

```python
from collections import defaultdict

import internetarchive

# Analyze file types in an item
item = internetarchive.get_item('example-item')

# Per-format running totals; missing formats start at zero
format_stats = defaultdict(lambda: {'count': 0, 'total_size': 0})

for file in item.get_files():
    # Fall back to placeholders when format or size is missing
    format_name = file.format or 'unknown'
    format_stats[format_name]['count'] += 1
    format_stats[format_name]['total_size'] += file.size or 0

print("File Format Analysis:")
for fmt, stats in sorted(format_stats.items()):
    avg_size = stats['total_size'] / stats['count'] if stats['count'] > 0 else 0
    print(f"{fmt}: {stats['count']} files, {stats['total_size']:,} bytes total, {avg_size:.0f} bytes average")
```

### Working with Checksums

```python
import hashlib

import internetarchive

# Verify file integrity
item = internetarchive.get_item('example-item')
file = item.get_file('important-document.pdf')

# Check `exists` explicitly: it reports whether the file is actually
# present in the item.
if file.exists and file.md5:
    # Download and verify
    file.download(file_path='temp_file.pdf', checksum=True)

    # Manual checksum verification, hashing in chunks so a large file is
    # never read into memory all at once
    digest = hashlib.md5()
    with open('temp_file.pdf', 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            digest.update(chunk)
    local_md5 = digest.hexdigest()

    if local_md5 == file.md5:
        print("File integrity verified")
    else:
        print("Checksum mismatch - file may be corrupted")
```