
# Archive Utilities

Extract compressed archives with support for multiple formats, including ZIP, TAR, and compressed TAR variants.

## Capabilities

### Archive Extraction Function

Extracts archives in a range of formats to a specified directory, detecting the format automatically from the file extension.

```python { .api }
def extractall(path, to=None) -> List[str]:
    """
    Extract archive file with automatic format detection.

    Parameters:
    - path (str): Path to archive file to be extracted.
    - to (str, optional): Directory to extract files to.
      If None, extracts to the parent directory of the archive file.

    Returns:
    List[str]: List of extracted file paths.

    Raises:
    ValueError: When the archive format is not supported
        (no appropriate extractor for the file extension).
    """
```
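
Format detection is driven by the archive's file extension. The sketch below illustrates the kind of extension-to-opener mapping involved, using the standard library's `zipfile` and `tarfile` modules; treat it as an illustration of the behavior, not gdown's verbatim source.

```python
import tarfile
import zipfile

def pick_opener(path):
    # Illustrative sketch of extension-based detection (an assumption about
    # the behavior, not copied from gdown's source).
    if path.endswith(".zip"):
        return zipfile.ZipFile, "r"
    if path.endswith(".tar"):
        return tarfile.open, "r"
    if path.endswith((".tar.gz", ".tgz")):
        return tarfile.open, "r:gz"
    if path.endswith((".tar.bz2", ".tbz")):
        return tarfile.open, "r:bz2"
    raise ValueError(f"Could not extract '{path}': no appropriate extractor found")
```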

## Usage Examples

### Basic Archive Extraction

```python
import gdown

# Extract to same directory as archive
archive_path = "./data.zip"
extracted_files = gdown.extractall(archive_path)

print(f"Extracted {len(extracted_files)} files:")
for file_path in extracted_files:
    print(f"  {file_path}")
```

### Extract to Specific Directory

```python
# Extract to specific target directory
archive_path = "./dataset.tar.gz"
target_dir = "./extracted_data/"

extracted_files = gdown.extractall(archive_path, to=target_dir)
print(f"Extracted to {target_dir}: {len(extracted_files)} files")
```

### Combined Download and Extract

```python
# Download archive and extract in one workflow
import gdown

# Download compressed dataset
url = "https://drive.google.com/uc?id=ARCHIVE_FILE_ID"
archive_path = gdown.download(url, "dataset.zip")

# Extract the downloaded archive
extracted_files = gdown.extractall(archive_path, to="./dataset/")

print(f"Downloaded and extracted {len(extracted_files)} files")
```

### Post-processing Integration

```python
# Use with cached_download for automated workflows
import gdown

def download_and_extract_dataset(url, expected_hash):
    """Download, verify, and extract dataset archive."""

    # Download with integrity verification
    archive_path = gdown.cached_download(
        url,
        hash=expected_hash,
        path="./cache/dataset.tar.gz"
    )

    # Extract archive
    extracted_files = gdown.extractall(archive_path, to="./data/")

    # Process extracted files
    data_files = [f for f in extracted_files if f.endswith('.csv')]
    print(f"Found {len(data_files)} data files")

    return extracted_files

# Usage
files = download_and_extract_dataset(
    "https://drive.google.com/uc?id=FILE_ID",
    "sha256:expected_hash_value"
)
```

### Integration with Post-processing Callback

```python
# Automatic extraction using cached_download postprocess
import gdown

def auto_extract(filepath):
    """Automatically extract archive after download."""
    print(f"Auto-extracting {filepath}")
    return gdown.extractall(filepath, to="./extracted/")

# Download and auto-extract
gdown.cached_download(
    url="https://example.com/data.tar.gz",
    hash="sha256:abc123...",
    postprocess=auto_extract
)
```

## Supported Archive Formats

### ZIP Archives

- **Extension**: `.zip`
- **Description**: Standard ZIP compression format
- **Usage**: Most common for Windows and cross-platform archives

```python
# ZIP file extraction
extracted = gdown.extractall("data.zip", to="./zip_contents/")
```

### TAR Archives

- **Extension**: `.tar`
- **Description**: Uncompressed TAR (tape archive) format
- **Usage**: Common on Unix/Linux systems for packaging files

```python
# TAR file extraction
extracted = gdown.extractall("archive.tar", to="./tar_contents/")
```

### Compressed TAR Archives

#### GZIP Compressed TAR

- **Extensions**: `.tar.gz`, `.tgz`
- **Description**: TAR archive compressed with GZIP
- **Usage**: Very common for source code and Linux packages

```python
# GZIP compressed TAR extraction
extracted = gdown.extractall("package.tar.gz", to="./source/")
extracted = gdown.extractall("backup.tgz", to="./backup/")
```

#### BZIP2 Compressed TAR

- **Extensions**: `.tar.bz2`, `.tbz`
- **Description**: TAR archive compressed with BZIP2 (better compression than GZIP)
- **Usage**: Higher compression ratio, slower processing

```python
# BZIP2 compressed TAR extraction
extracted = gdown.extractall("dataset.tar.bz2", to="./dataset/")
extracted = gdown.extractall("archive.tbz", to="./archive/")
```

163

164

## Directory Structure Handling

165

166

### Extraction Behavior

167

168

Archives are extracted preserving their internal directory structure:

169

170

```
Archive Contents:

data.zip
├── dataset/
│   ├── train/
│   │   ├── file1.txt
│   │   └── file2.txt
│   └── test/
│       └── file3.txt
└── README.md

After extraction to "./extracted/":

./extracted/
├── dataset/
│   ├── train/
│   │   ├── file1.txt
│   │   └── file2.txt
│   └── test/
│       └── file3.txt
└── README.md
```
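
The returned list of paths mirrors this layout. As a quick illustration, the hypothetical helper below (written here only for demonstration; it assumes the returned paths include the target directory, as in the examples above) groups the results by their top-level entry:

```python
import os

def group_by_top_level(extracted_files, target_dir="./extracted/"):
    """Group extracted paths by their first path component under target_dir (illustrative only)."""
    groups = {}
    for path in extracted_files:
        rel = os.path.relpath(path, target_dir)
        top = rel.split(os.sep)[0]
        groups.setdefault(top, []).append(rel)
    return groups

# e.g. {'dataset': ['dataset/train/file1.txt', ...], 'README.md': ['README.md']}
```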

### Path Resolution

```python
# Default: extract to archive's parent directory
archive_path = "/home/user/downloads/data.zip"
files = gdown.extractall(archive_path)  # Extracts to /home/user/downloads/

# Custom: extract to specific directory
files = gdown.extractall(archive_path, to="/home/user/projects/data/")
```

## Error Handling

```python
import gdown

def safe_extract(archive_path, target_dir=None):
    """Safely extract archive with comprehensive error handling."""

    try:
        extracted_files = gdown.extractall(archive_path, to=target_dir)
        print(f"✅ Successfully extracted {len(extracted_files)} files")
        return extracted_files

    except ValueError as e:
        if "no appropriate extractor" in str(e):
            print(f"❌ Unsupported archive format: {archive_path}")
            print("Supported formats: .zip, .tar, .tar.gz, .tgz, .tar.bz2, .tbz")
        else:
            print(f"❌ Extraction error: {e}")
        return None

    except FileNotFoundError:
        print(f"❌ Archive file not found: {archive_path}")
        return None

    except PermissionError:
        print(f"❌ Permission denied accessing: {archive_path}")
        return None

    except Exception as e:
        print(f"❌ Unexpected error during extraction: {e}")
        return None

# Usage
files = safe_extract("./dataset.tar.gz", "./data/")
if files:
    print("Extraction completed successfully")
```

## Advanced Usage Patterns

### Batch Archive Processing

```python
import os
import gdown

def process_archive_directory(archive_dir, extract_base="./extracted/"):
    """Process all archives in a directory."""

    supported_extensions = ('.zip', '.tar', '.tar.gz', '.tgz', '.tar.bz2', '.tbz')
    processed = 0

    for filename in os.listdir(archive_dir):
        if filename.endswith(supported_extensions):
            archive_path = os.path.join(archive_dir, filename)

            # Create extraction directory based on filename
            extract_name = os.path.splitext(filename)[0]
            if extract_name.endswith('.tar'):  # Handle .tar.gz, .tar.bz2
                extract_name = os.path.splitext(extract_name)[0]

            extract_dir = os.path.join(extract_base, extract_name)

            try:
                files = gdown.extractall(archive_path, to=extract_dir)
                print(f"✅ {filename}: {len(files)} files extracted")
                processed += 1
            except Exception as e:
                print(f"❌ {filename}: {e}")

    print(f"Processed {processed} archives")

# Usage
process_archive_directory("./downloads/", "./data/")
```

### Archive Validation

```python
import os
import gdown

def validate_extraction(archive_path, expected_files=None):
    """Validate archive extraction results."""

    try:
        extracted_files = gdown.extractall(archive_path, to="./temp_extract/")

        print(f"Extraction completed: {len(extracted_files)} files")

        if expected_files:
            # Check if all expected files were extracted
            extracted_names = [os.path.basename(f) for f in extracted_files]
            missing = set(expected_files) - set(extracted_names)

            if missing:
                print(f"⚠️ Missing expected files: {missing}")
            else:
                print("✅ All expected files found")

        # Show file types
        extensions = {}
        for file_path in extracted_files:
            ext = os.path.splitext(file_path)[1].lower()
            extensions[ext] = extensions.get(ext, 0) + 1

        print("File types found:")
        for ext, count in sorted(extensions.items()):
            print(f"  {ext or '(no extension)'}: {count} files")

        return extracted_files

    except Exception as e:
        print(f"Extraction failed: {e}")
        return None

# Usage
validate_extraction(
    "dataset.zip",
    expected_files=["README.txt", "data.csv", "config.json"]
)
```

323

324

### Cleanup and Management

325

326

```python
import os
import tempfile

import gdown

def extract_temporarily(archive_path, process_func):
    """Extract archive to temporary directory and clean up after processing."""

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            # Extract to temporary directory
            extracted_files = gdown.extractall(archive_path, to=temp_dir)
            print(f"Extracted {len(extracted_files)} files to temporary directory")

            # Process files
            result = process_func(extracted_files, temp_dir)

            return result

        except Exception as e:
            print(f"Processing failed: {e}")
            return None

        # Temporary directory automatically cleaned up

def process_extracted_files(file_list, base_dir):
    """Example processing function."""
    csv_files = [f for f in file_list if f.endswith('.csv')]
    print(f"Found {len(csv_files)} CSV files for processing")

    # Process CSV files here
    results = []
    for csv_file in csv_files:
        # Process each CSV file
        results.append(f"Processed {os.path.basename(csv_file)}")

    return results

# Usage
results = extract_temporarily("data.tar.gz", process_extracted_files)
print("Processing results:", results)
```

## Best Practices

### Memory Efficient Processing

```python
import gdown

def stream_process_archive(archive_path):
    """Process large archives without keeping all files in memory."""

    # Extract files
    extracted_files = gdown.extractall(archive_path, to="./processing/")

    # Process files one at a time to manage memory
    for file_path in extracted_files:
        if file_path.endswith('.csv'):
            # Process individual CSV file
            print(f"Processing {file_path}")
            # ... process file ...

            # Optionally remove processed file to save disk space
            # os.remove(file_path)

    return len(extracted_files)
```

### Integration with Download Workflows

```python
import gdown

def complete_dataset_workflow(drive_url, expected_hash):
    """Complete workflow: download, verify, extract, and process."""

    # Step 1: Download with verification
    archive_path = gdown.cached_download(
        drive_url,
        hash=expected_hash,
        path="./cache/dataset.tar.gz"
    )

    # Step 2: Extract archive
    extracted_files = gdown.extractall(archive_path, to="./data/")

    # Step 3: Organize extracted files
    organized = {
        'images': [f for f in extracted_files if f.endswith(('.jpg', '.png'))],
        'data': [f for f in extracted_files if f.endswith('.csv')],
        'docs': [f for f in extracted_files if f.endswith(('.txt', '.md'))]
    }

    print("Dataset organized:")
    for category, files in organized.items():
        print(f"  {category}: {len(files)} files")

    return organized

# Usage
dataset = complete_dataset_workflow(
    "https://drive.google.com/uc?id=DATASET_ID",
    "sha256:expected_dataset_hash"
)
```