or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

account-management.md cli-interface.md configuration-auth.md file-management.md index.md item-operations.md metadata-operations.md search-operations.md session-management.md task-management.md

docs/file-management.md

0

# File Management

1

2

File management operations provide access to individual files within Archive.org items, including file retrieval, download, deletion, and metadata access.

3

4

## Capabilities

5

6

### File Retrieval

7

8

Access File objects representing individual files within Archive.org items.

9

10

```python { .api }

11

def get_files(identifier, files=None, formats=None, glob_pattern=None, exclude_pattern=None, on_the_fly=False, **get_item_kwargs):

12

"""

13

Get File objects from an item with optional filtering.

14

15

Args:

16

identifier (str): Item identifier

17

files (list, optional): Specific file names to retrieve

18

formats (list, optional): File formats to filter by (e.g., ['pdf', 'txt', 'jpg'])

19

glob_pattern (str, optional): Glob pattern for file selection (e.g., '*.pdf', 'chapter*.txt')

20

exclude_pattern (str, optional): Glob pattern for exclusion

21

on_the_fly (bool): Include on-the-fly derived files

22

**get_item_kwargs: Additional arguments passed to get_item

23

24

Returns:

25

list: List of File objects matching the criteria

26

"""

27

28

class File:

29

"""

30

Represents a file within an Archive.org item.

31

"""

32

33

def __init__(self, item, name, file_metadata=None):

34

"""

35

Initialize File object.

36

37

Args:

38

item (Item): Parent Item object

39

name (str): Filename

40

file_metadata (dict, optional): Pre-fetched file metadata

41

"""

42

```

43

44

### File Properties

45

46

Access file metadata, URLs, and status information.

47

48

```python { .api }

49

class File:

50

@property

51

def item(self):

52

"""Item: Parent Item object."""

53

54

@property

55

def identifier(self):

56

"""str: Item identifier (same as parent item)."""

57

58

@property

59

def name(self):

60

"""str: Filename."""

61

62

@property

63

def url(self):

64

"""str: Direct download URL for the file."""

65

66

@property

67

def auth(self):

68

"""S3Auth: S3 authentication object if credentials are available."""

69

70

@property

71

def exists(self):

72

"""bool: Whether the file exists in the item."""

73

74

@property

75

def metadata(self):

76

"""dict: File metadata dictionary."""

77

78

# Standard file properties

79

@property

80

def size(self):

81

"""int: File size in bytes."""

82

83

@property

84

def format(self):

85

"""str: File format/type."""

86

87

@property

88

def md5(self):

89

"""str: MD5 checksum of the file."""

90

91

@property

92

def sha1(self):

93

"""str: SHA1 checksum of the file."""

94

95

@property

96

def mtime(self):

97

"""str: Last modification time."""

98

99

@property

100

def crc32(self):

101

"""str: CRC32 checksum of the file."""

102

103

@property

104

def source(self):

105

"""str: Source of the file (original or derived)."""

106

```

107

108

### File Download

109

110

Download individual files with various options.

111

112

```python { .api }

113

class File:

114

def download(self, file_path=None, verbose=None, ignore_existing=None, checksum=None, checksum_archive=None, destdir=None, retries=None, ignore_errors=None, no_change_timestamp=None, timeout=None, **kwargs):

115

"""

116

Download this file.

117

118

Args:

119

file_path (str, optional): Local path to save file (defaults to filename)

120

verbose (bool, optional): Enable verbose output

121

ignore_existing (bool, optional): Re-download if file already exists

122

checksum (bool, optional): Verify checksum after download

123

checksum_archive (bool, optional): Use archive-provided checksums

124

destdir (str, optional): Destination directory

125

retries (int, optional): Number of retry attempts

126

ignore_errors (bool, optional): Continue on errors

127

no_change_timestamp (bool, optional): Don't update file timestamp

128

timeout (int, optional): Request timeout in seconds

129

**kwargs: Additional download options

130

131

Returns:

132

Request or Response: Download operation result

133

134

Raises:

135

InvalidChecksumError: If checksum verification fails

136

requests.RequestException: If download fails

137

"""

138

```

139

140

### File Deletion

141

142

Delete files from Archive.org items.

143

144

```python { .api }

145

def delete(identifier, files=None, formats=None, glob_pattern=None, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, **kwargs):

146

"""

147

Delete files from an Archive.org item.

148

149

Args:

150

identifier (str): Item identifier

151

files (list, optional): Specific files to delete

152

formats (list, optional): File formats to delete (e.g., ['pdf', 'jpg'])

153

glob_pattern (str, optional): Glob pattern for file selection

154

cascade_delete (bool): Delete derived files along with source files

155

access_key (str, optional): IA-S3 access key (overrides config)

156

secret_key (str, optional): IA-S3 secret key (overrides config)

157

verbose (bool): Enable verbose output

158

debug (bool): Enable debug logging

159

**kwargs: Additional arguments passed to get_item

160

161

Returns:

162

list: List of Request/Response objects from delete operations

163

164

Raises:

165

AuthenticationError: If authentication fails

166

ItemLocateError: If item cannot be located

167

"""

168

169

class File:

170

def delete(self, cascade_delete=False, access_key=None, secret_key=None, verbose=False, debug=False, request_kwargs=None):

171

"""

172

Delete this file from the Archive.org item.

173

174

Args:

175

cascade_delete (bool): Delete derived files along with this file

176

access_key (str, optional): IA-S3 access key

177

secret_key (str, optional): IA-S3 secret key

178

verbose (bool): Enable verbose output

179

debug (bool): Enable debug logging

180

request_kwargs (dict, optional): Additional request arguments

181

182

Returns:

183

Request or Response: Delete operation result

184

185

Raises:

186

AuthenticationError: If authentication fails

187

"""

188

```

189

190

## Usage Examples

191

192

### Basic File Access

193

194

```python

195

import internetarchive

196

197

# Get all files from an item

198

files = internetarchive.get_files('example-item')

199

200

for file in files:

201

print(f"File: {file.name}")

202

print(f"Size: {file.size} bytes")

203

print(f"Format: {file.format}")

204

print(f"MD5: {file.md5}")

205

print("---")

206

```

207

208

### File Filtering

209

210

```python

211

import internetarchive

212

213

# Get only PDF files

214

pdf_files = internetarchive.get_files('example-item', formats=['pdf'])

215

216

# Get files matching pattern

217

text_files = internetarchive.get_files('example-item', glob_pattern='*.txt')

218

219

# Get specific files

220

specific_files = internetarchive.get_files(

221

'example-item',

222

files=['document.pdf', 'readme.txt']

223

)

224

225

# Exclude certain patterns

226

filtered_files = internetarchive.get_files(

227

'example-item',

228

exclude_pattern='*_thumb.jpg'

229

)

230

```

231

232

### File Download Operations

233

234

```python

235

import internetarchive

236

237

# Download specific file

238

item = internetarchive.get_item('example-item')

239

file = item.get_file('document.pdf')

240

241

if file:

242

# Download with verification

243

file.download(

244

file_path='./downloads/document.pdf',

245

checksum=True,

246

verbose=True

247

)

248

249

# Download all files of specific format

250

for file in item.get_files(formats=['pdf']):

251

file.download(destdir='./pdf_downloads')

252

```

253

254

### Bulk File Operations

255

256

```python

257

import internetarchive

258

259

# Download all images from an item

260

item = internetarchive.get_item('photo-collection')

261

262

image_formats = ['jpg', 'jpeg', 'png', 'gif']

263

for file in item.get_files(formats=image_formats):

264

print(f"Downloading {file.name} ({file.size} bytes)")

265

file.download(

266

destdir='./images',

267

ignore_existing=True,

268

checksum=True

269

)

270

```

271

272

### File Deletion

273

274

```python

275

import internetarchive

276

277

# Delete specific files

278

internetarchive.delete(

279

'my-item',

280

files=['unwanted.pdf', 'old-version.txt'],

281

verbose=True

282

)

283

284

# Delete files by format

285

internetarchive.delete(

286

'my-item',

287

formats=['tmp'], # Delete all temporary files

288

cascade_delete=True

289

)

290

291

# Delete using pattern

292

internetarchive.delete(

293

'my-item',

294

glob_pattern='*_backup.*'

295

)

296

```

297

298

### File Metadata Analysis

299

300

```python

301

import internetarchive

302

from collections import defaultdict

303

304

# Analyze file types in an item

305

item = internetarchive.get_item('example-item')

306

307

format_stats = defaultdict(lambda: {'count': 0, 'total_size': 0})

308

309

for file in item.get_files():

310

format_name = file.format or 'unknown'

311

format_stats[format_name]['count'] += 1

312

format_stats[format_name]['total_size'] += file.size or 0

313

314

print("File Format Analysis:")

315

for fmt, stats in sorted(format_stats.items()):

316

avg_size = stats['total_size'] / stats['count'] if stats['count'] > 0 else 0

317

print(f"{fmt}: {stats['count']} files, {stats['total_size']:,} bytes total, {avg_size:.0f} bytes average")

318

```

319

320

### Working with Checksums

321

322

```python

323

import internetarchive

324

import hashlib

325

326

# Verify file integrity

327

item = internetarchive.get_item('example-item')

328

file = item.get_file('important-document.pdf')

329

330

if file and file.md5:

331

# Download and verify

332

response = file.download(file_path='temp_file.pdf', checksum=True)

333

334

# Manual checksum verification

335

with open('temp_file.pdf', 'rb') as f:

336

local_md5 = hashlib.md5(f.read()).hexdigest()

337

338

if local_md5 == file.md5:

339

print("File integrity verified")

340

else:

341

print("Checksum mismatch - file may be corrupted")

342

```