Tessl Tile for pypi/internetarchive@5.5.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

account-management.md cli-interface.md configuration-auth.md file-management.md index.md item-operations.md metadata-operations.md search-operations.md session-management.md task-management.md

docs/file-management.md

0
# File Management
1

2
File management operations provide access to individual files within Archive.org items, including file retrieval, download, deletion, and metadata access.
3

4
## Capabilities
5

6
### File Retrieval
7

8
Access File objects representing individual files within Archive.org items.
9

10
```python { .api }
11
def get_files(identifier, files=None, formats=None, glob_pattern=None, exclude_pattern=None, on_the_fly=False, **get_item_kwargs):
12
    """
13
    Get File objects from an item with optional filtering.
14
    
15
    Args:
16
        identifier (str): Item identifier
17
        files (list, optional): Specific file names to retrieve
18
        formats (list, optional): File formats to filter by (e.g., ['pdf', 'txt', 'jpg'])
19
        glob_pattern (str, optional): Glob pattern for file selection (e.g., '*.pdf', 'chapter*.txt')
20
        exclude_pattern (str, optional): Glob pattern for exclusion
21
        on_the_fly (bool): Include on-the-fly derived files
22
        **get_item_kwargs: Additional arguments passed to get_item
23
        
24
    Returns:
25
        list: List of File objects matching the criteria
26
    """
27

28
class File:
29
    """
30
    Represents a file within an Archive.org item.
31
    """
32
    
33
    def __init__(self, item, name, file_metadata=None):
34
        """
35
        Initialize File object.
36
        
37
        Args:
38
            item (Item): Parent Item object
39
            name (str): Filename
40
            file_metadata (dict, optional): Pre-fetched file metadata
41
        """
42
```
43

44
### File Properties
45

46
Access file metadata, URLs, and status information.
47

48
```python { .api }
49
class File:
50
    @property
51
    def item(self):
52
        """Item: Parent Item object."""
53
        
54
    @property
55
    def identifier(self):
56
        """str: Item identifier (same as parent item)."""
57
        
58
    @property
59
    def name(self):
60
        """str: Filename."""
61
        
62
    @property
63
    def url(self):
64
        """str: Direct download URL for the file."""
65
        
66
    @property
67
    def auth(self):
68
        """S3Auth: S3 authentication object if credentials are available."""
69
        
70
    @property
71
    def exists(self):
72
        """bool: Whether the file exists in the item."""
73
        
74
    @property
75
    def metadata(self):
76
        """dict: File metadata dictionary."""
77
        
78
    # Standard file properties
79
    @property
80
    def size(self):
81
        """int: File size in bytes."""
82
        
83
    @property
84
    def format(self):
85
        """str: File format/type."""
86
        
87
    @property
88
    def md5(self):
89
        """str: MD5 checksum of the file."""
90
        
91
    @property
92
    def sha1(self):
93
        """str: SHA1 checksum of the file."""
94
        
95
    @property
96
    def mtime(self):
97
        """str: Last modification time."""
98
        
99
    @property
100
    def crc32(self):
101
        """str: CRC32 checksum of the file."""
102
        
103
    @property
104
    def source(self):
105
        """str: Source of the file (original or derived)."""
106
```
107

108
### File Download
109

110
Download individual files with various options.
111

112
```python { .api }
113
class File:
114
    def download(self, file_path=None, verbose=None, ignore_existing=None, checksum=None, checksum_archive=None, destdir=None, retries=None, ignore_errors=None, no_change_timestamp=None, timeout=None, **kwargs):
115
        """
116
        Download this file.
117
        
118
        Args:
119
            file_path (str, optional): Local path to save file (defaults to filename)
120
            verbose (bool, optional): Enable verbose output
121
            ignore_existing (bool, optional): Skip the download if the file already exists locally
122
            checksum (bool, optional): Verify checksum after download
123
            checksum_archive (bool, optional): Use archive-provided checksums
124
            destdir (str, optional): Destination directory
125
            retries (int, optional): Number of retry attempts
126
            ignore_errors (bool, optional): Continue on errors
127
            no_change_timestamp (bool, optional): Don't update file timestamp
128
            timeout (int, optional): Request timeout in seconds
129
            **kwargs: Additional download options
130
            
131
        Returns:
132
            Request or Response: Download operation result
133
            
134
        Raises:
135
            InvalidChecksumError: If checksum verification fails
136
            requests.RequestException: If download fails
137
        """
138
```
139

140
### File Deletion
141

142
Delete files from Archive.org items.
143

144
```python { .api }
145
def delete(identifier, files=None, formats=None, glob_pattern=None, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, **kwargs):
146
    """
147
    Delete files from an Archive.org item.
148
    
149
    Args:
150
        identifier (str): Item identifier
151
        files (list, optional): Specific files to delete
152
        formats (list, optional): File formats to delete (e.g., ['pdf', 'jpg'])
153
        glob_pattern (str, optional): Glob pattern for file selection
154
        cascade_delete (bool): Delete derived files along with source files
155
        access_key (str, optional): IA-S3 access key (overrides config)
156
        secret_key (str, optional): IA-S3 secret key (overrides config)
157
        verbose (bool): Enable verbose output
158
        debug (bool): Enable debug logging
159
        **kwargs: Additional arguments passed to get_item
160
        
161
    Returns:
162
        list: List of Request/Response objects from delete operations
163
        
164
    Raises:
165
        AuthenticationError: If authentication fails
166
        ItemLocateError: If item cannot be located
167
    """
168

169
class File:
170
    def delete(self, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, request_kwargs=None):
171
        """
172
        Delete this file from the Archive.org item.
173
        
174
        Args:
175
            cascade_delete (bool): Delete derived files along with this file
176
            access_key (str, optional): IA-S3 access key
177
            secret_key (str, optional): IA-S3 secret key
178
            verbose (bool): Enable verbose output
179
            debug (bool): Enable debug logging
180
            request_kwargs (dict, optional): Additional request arguments
181
            
182
        Returns:
183
            Request or Response: Delete operation result
184
            
185
        Raises:
186
            AuthenticationError: If authentication fails
187
        """
188
```
189

190
## Usage Examples
191

192
### Basic File Access
193

194
```python
195
import internetarchive
196

197
# Get all files from an item
198
files = internetarchive.get_files('example-item')
199

200
for file in files:
201
    print(f"File: {file.name}")
202
    print(f"Size: {file.size} bytes")
203
    print(f"Format: {file.format}")
204
    print(f"MD5: {file.md5}")
205
    print("---")
206
```
207

208
### File Filtering
209

210
```python
211
import internetarchive
212

213
# Get only PDF files
214
pdf_files = internetarchive.get_files('example-item', formats=['pdf'])
215

216
# Get files matching pattern
217
text_files = internetarchive.get_files('example-item', glob_pattern='*.txt')
218

219
# Get specific files
220
specific_files = internetarchive.get_files(
221
    'example-item', 
222
    files=['document.pdf', 'readme.txt']
223
)
224

225
# Exclude certain patterns
226
filtered_files = internetarchive.get_files(
227
    'example-item',
228
    exclude_pattern='*_thumb.jpg'
229
)
230
```
231

232
### File Download Operations
233

234
```python
235
import internetarchive
236

237
# Download specific file
238
item = internetarchive.get_item('example-item')
239
file = item.get_file('document.pdf')
240

241
if file:
242
    # Download with verification
243
    file.download(
244
        file_path='./downloads/document.pdf',
245
        checksum=True,
246
        verbose=True
247
    )
248

249
# Download all files of specific format
250
for file in item.get_files(formats=['pdf']):
251
    file.download(destdir='./pdf_downloads')
252
```
253

254
### Bulk File Operations
255

256
```python
257
import internetarchive
258

259
# Download all images from an item
260
item = internetarchive.get_item('photo-collection')
261

262
image_formats = ['jpg', 'jpeg', 'png', 'gif']
263
for file in item.get_files(formats=image_formats):
264
    print(f"Downloading {file.name} ({file.size} bytes)")
265
    file.download(
266
        destdir='./images',
267
        ignore_existing=True,
268
        checksum=True
269
    )
270
```
271

272
### File Deletion
273

274
```python
275
import internetarchive
276

277
# Delete specific files
278
internetarchive.delete(
279
    'my-item',
280
    files=['unwanted.pdf', 'old-version.txt'],
281
    verbose=True
282
)
283

284
# Delete files by format
285
internetarchive.delete(
286
    'my-item',
287
    formats=['tmp'],  # Delete all temporary files
288
    cascade_delete=True
289
)
290

291
# Delete using pattern
292
internetarchive.delete(
293
    'my-item',
294
    glob_pattern='*_backup.*'
295
)
296
```
297

298
### File Metadata Analysis
299

300
```python
301
import internetarchive
302
from collections import defaultdict
303

304
# Analyze file types in an item
305
item = internetarchive.get_item('example-item')
306

307
format_stats = defaultdict(lambda: {'count': 0, 'total_size': 0})
308

309
for file in item.get_files():
310
    format_name = file.format or 'unknown'
311
    format_stats[format_name]['count'] += 1
312
    format_stats[format_name]['total_size'] += file.size or 0
313

314
print("File Format Analysis:")
315
for fmt, stats in sorted(format_stats.items()):
316
    avg_size = stats['total_size'] / stats['count'] if stats['count'] > 0 else 0
317
    print(f"{fmt}: {stats['count']} files, {stats['total_size']:,} bytes total, {avg_size:.0f} bytes average")
318
```
319

320
### Working with Checksums
321

322
```python
323
import internetarchive
324
import hashlib
325

326
# Verify file integrity
327
item = internetarchive.get_item('example-item')
328
file = item.get_file('important-document.pdf')
329

330
if file and file.md5:
331
    # Download and verify
332
    response = file.download(file_path='temp_file.pdf', checksum=True)
333
    
334
    # Manual checksum verification
335
    with open('temp_file.pdf', 'rb') as f:
336
        local_md5 = hashlib.md5(f.read()).hexdigest()
337
    
338
    if local_md5 == file.md5:
339
        print("File integrity verified")
340
    else:
341
        print("Checksum mismatch - file may be corrupted")
342
```

Version

Tile

Files

docs/file-management.md

docs/file-management.md