# Archive Utilities

Extract compressed archives with support for multiple formats including ZIP, TAR, and compressed TAR variants.

## Capabilities

### Archive Extraction Function

Extracts various archive formats to a specified directory with automatic format detection.

```python { .api }
def extractall(path, to=None) -> List[str]:
    """
    Extract an archive file with automatic format detection.

    Parameters:
    - path (str): Path to the archive file to be extracted.
    - to (str, optional): Directory to extract files to.
      If None, extracts to the parent directory of the archive file.

    Returns:
    List[str]: List of extracted file paths.

    Raises:
    ValueError: When the archive format is not supported.
    """
```
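
Format detection is driven by the archive's file extension. The sketch below shows roughly how such extension-based dispatch can be built on the standard library's `zipfile` and `tarfile` modules; it is an illustrative approximation of the behavior described above, not gdown's actual source, and the helper name `_extract_by_extension` is made up for this example.

```python
import os
import tarfile
import zipfile

def _extract_by_extension(path, to=None):
    """Illustrative sketch: pick an extractor based on the file extension."""
    if to is None:
        to = os.path.dirname(path)  # default: the archive's parent directory

    if path.endswith(".zip"):
        opener, mode = zipfile.ZipFile, "r"
    elif path.endswith(".tar"):
        opener, mode = tarfile.open, "r"
    elif path.endswith((".tar.gz", ".tgz")):
        opener, mode = tarfile.open, "r:gz"
    elif path.endswith((".tar.bz2", ".tbz")):
        opener, mode = tarfile.open, "r:bz2"
    else:
        raise ValueError(f"Unsupported archive format: {path}")

    with opener(path, mode) as archive:
        archive.extractall(to)
        if isinstance(archive, zipfile.ZipFile):
            names = archive.namelist()
        else:
            names = archive.getnames()

    return [os.path.join(to, name) for name in names]
```

Note that `tarfile.open` in plain `"r"` mode can also detect the compression on its own; the point of the sketch is simply that the extension selects the opener and mode, which is why an unsupported extension raises `ValueError`.
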
## Usage Examples

### Basic Archive Extraction

```python
import gdown

# Extract to same directory as archive
archive_path = "./data.zip"
extracted_files = gdown.extractall(archive_path)

print(f"Extracted {len(extracted_files)} files:")
for file_path in extracted_files:
    print(f"  {file_path}")
```

### Extract to Specific Directory

```python
# Extract to a specific target directory
archive_path = "./dataset.tar.gz"
target_dir = "./extracted_data/"

extracted_files = gdown.extractall(archive_path, to=target_dir)
print(f"Extracted to {target_dir}: {len(extracted_files)} files")
```

### Combined Download and Extract

```python
# Download an archive and extract it in one workflow
import gdown

# Download the compressed dataset
url = "https://drive.google.com/uc?id=ARCHIVE_FILE_ID"
archive_path = gdown.download(url, "dataset.zip")

# Extract the downloaded archive
extracted_files = gdown.extractall(archive_path, to="./dataset/")

print(f"Downloaded and extracted {len(extracted_files)} files")
```

### Post-processing Integration

```python
# Use with cached_download for automated workflows
import gdown

def download_and_extract_dataset(url, expected_hash):
    """Download, verify, and extract a dataset archive."""

    # Download with integrity verification
    archive_path = gdown.cached_download(
        url,
        hash=expected_hash,
        path="./cache/dataset.tar.gz"
    )

    # Extract the archive
    extracted_files = gdown.extractall(archive_path, to="./data/")

    # Process extracted files
    data_files = [f for f in extracted_files if f.endswith('.csv')]
    print(f"Found {len(data_files)} data files")

    return extracted_files

# Usage
files = download_and_extract_dataset(
    "https://drive.google.com/uc?id=FILE_ID",
    "sha256:expected_hash_value"
)
```

### Integration with Post-processing Callback

```python
# Automatic extraction using cached_download postprocess
import gdown

def auto_extract(filepath):
    """Automatically extract an archive after download."""
    print(f"Auto-extracting {filepath}")
    return gdown.extractall(filepath, to="./extracted/")

# Download and auto-extract
gdown.cached_download(
    url="https://example.com/data.tar.gz",
    hash="sha256:abc123...",
    postprocess=auto_extract
)
```
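
Because the `postprocess` callback receives the downloaded file's path (as the `auto_extract` example above relies on), `gdown.extractall` itself can be passed directly when no custom logging or target directory is needed; the archive is then extracted next to the cached file. A minimal variant of the example above:

```python
import gdown

# Download and extract alongside the cached file, with no custom callback
gdown.cached_download(
    url="https://example.com/data.tar.gz",
    path="./cache/data.tar.gz",
    hash="sha256:abc123...",
    postprocess=gdown.extractall,
)
```
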
## Supported Archive Formats

### ZIP Archives
- **Extension**: `.zip`
- **Description**: Standard ZIP compression format
- **Usage**: Most common for Windows and cross-platform archives

```python
# ZIP file extraction
extracted = gdown.extractall("data.zip", to="./zip_contents/")
```

### TAR Archives
- **Extension**: `.tar`
- **Description**: Uncompressed TAR (tape archive) format
- **Usage**: Common on Unix/Linux systems for packaging files

```python
# TAR file extraction
extracted = gdown.extractall("archive.tar", to="./tar_contents/")
```

### Compressed TAR Archives

#### GZIP Compressed TAR
- **Extensions**: `.tar.gz`, `.tgz`
- **Description**: TAR archive compressed with GZIP
- **Usage**: Very common for source code and Linux packages

```python
# GZIP compressed TAR extraction
extracted = gdown.extractall("package.tar.gz", to="./source/")
extracted = gdown.extractall("backup.tgz", to="./backup/")
```

#### BZIP2 Compressed TAR
- **Extensions**: `.tar.bz2`, `.tbz`
- **Description**: TAR archive compressed with BZIP2 (better compression than GZIP)
- **Usage**: Higher compression ratio, slower processing

```python
# BZIP2 compressed TAR extraction
extracted = gdown.extractall("dataset.tar.bz2", to="./dataset/")
extracted = gdown.extractall("archive.tbz", to="./archive/")
```
## Directory Structure Handling

### Extraction Behavior

Archives are extracted preserving their internal directory structure:

```
Archive contents:
data.zip
├── dataset/
│   ├── train/
│   │   ├── file1.txt
│   │   └── file2.txt
│   └── test/
│       └── file3.txt
└── README.md

After extraction to "./extracted/":
./extracted/
├── dataset/
│   ├── train/
│   │   ├── file1.txt
│   │   └── file2.txt
│   └── test/
│       └── file3.txt
└── README.md
```
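
To confirm the layout after extraction, one option is to walk the target directory and print the tree. A minimal sketch, reusing the `data.zip` archive and `./extracted/` target from the diagram above:

```python
import os

import gdown

# Extract, then walk the target directory to inspect the preserved layout
target = "./extracted/"
gdown.extractall("data.zip", to=target)

for root, _dirs, files in os.walk(target):
    rel = os.path.relpath(root, target)                # "." for the target itself
    depth = 0 if rel == "." else rel.count(os.sep) + 1
    label = target if rel == "." else os.path.basename(root) + "/"
    print("  " * depth + label)
    for name in sorted(files):
        print("  " * (depth + 1) + name)
```

The same information is available without walking the filesystem, since `extractall` returns the extracted paths directly.
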
### Path Resolution

```python
# Default: extract to the archive's parent directory
archive_path = "/home/user/downloads/data.zip"
files = gdown.extractall(archive_path)  # Extracts to /home/user/downloads/

# Custom: extract to a specific directory
files = gdown.extractall(archive_path, to="/home/user/projects/data/")
```

## Error Handling

```python
import gdown

def safe_extract(archive_path, target_dir=None):
    """Safely extract archive with comprehensive error handling."""

    try:
        extracted_files = gdown.extractall(archive_path, to=target_dir)
        print(f"✅ Successfully extracted {len(extracted_files)} files")
        return extracted_files

    except ValueError as e:
        if "no appropriate extractor" in str(e):
            print(f"❌ Unsupported archive format: {archive_path}")
            print("Supported formats: .zip, .tar, .tar.gz, .tgz, .tar.bz2, .tbz")
        else:
            print(f"❌ Extraction error: {e}")
        return None

    except FileNotFoundError:
        print(f"❌ Archive file not found: {archive_path}")
        return None

    except PermissionError:
        print(f"❌ Permission denied accessing: {archive_path}")
        return None

    except Exception as e:
        print(f"❌ Unexpected error during extraction: {e}")
        return None

# Usage
files = safe_extract("./dataset.tar.gz", "./data/")
if files:
    print("Extraction completed successfully")
```
## Advanced Usage Patterns

### Batch Archive Processing

```python
import os
import gdown

def process_archive_directory(archive_dir, extract_base="./extracted/"):
    """Process all archives in a directory."""

    supported_extensions = ('.zip', '.tar', '.tar.gz', '.tgz', '.tar.bz2', '.tbz')
    processed = 0

    for filename in os.listdir(archive_dir):
        if filename.endswith(supported_extensions):
            archive_path = os.path.join(archive_dir, filename)

            # Create extraction directory based on filename
            extract_name = os.path.splitext(filename)[0]
            if extract_name.endswith('.tar'):  # Handle .tar.gz, .tar.bz2
                extract_name = os.path.splitext(extract_name)[0]

            extract_dir = os.path.join(extract_base, extract_name)

            try:
                files = gdown.extractall(archive_path, to=extract_dir)
                print(f"✅ {filename}: {len(files)} files extracted")
                processed += 1
            except Exception as e:
                print(f"❌ {filename}: {e}")

    print(f"Processed {processed} archives")

# Usage
process_archive_directory("./downloads/", "./data/")
```
### Archive Validation

```python
import os
import gdown

def validate_extraction(archive_path, expected_files=None):
    """Validate archive extraction results."""

    try:
        extracted_files = gdown.extractall(archive_path, to="./temp_extract/")

        print(f"Extraction completed: {len(extracted_files)} files")

        if expected_files:
            # Check if all expected files were extracted
            extracted_names = [os.path.basename(f) for f in extracted_files]
            missing = set(expected_files) - set(extracted_names)

            if missing:
                print(f"⚠️ Missing expected files: {missing}")
            else:
                print("✅ All expected files found")

        # Show file types
        extensions = {}
        for file_path in extracted_files:
            ext = os.path.splitext(file_path)[1].lower()
            extensions[ext] = extensions.get(ext, 0) + 1

        print("File types found:")
        for ext, count in sorted(extensions.items()):
            print(f"  {ext or '(no extension)'}: {count} files")

        return extracted_files

    except Exception as e:
        print(f"Extraction failed: {e}")
        return None

# Usage
validate_extraction(
    "dataset.zip",
    expected_files=["README.txt", "data.csv", "config.json"]
)
```
### Cleanup and Management

```python
import os
import tempfile

import gdown

def extract_temporarily(archive_path, process_func):
    """Extract archive to a temporary directory and clean up after processing."""

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            # Extract to temporary directory
            extracted_files = gdown.extractall(archive_path, to=temp_dir)
            print(f"Extracted {len(extracted_files)} files to temporary directory")

            # Process files
            result = process_func(extracted_files, temp_dir)

            return result

        except Exception as e:
            print(f"Processing failed: {e}")
            return None
        # Temporary directory is automatically cleaned up on exit

def process_extracted_files(file_list, base_dir):
    """Example processing function."""
    csv_files = [f for f in file_list if f.endswith('.csv')]
    print(f"Found {len(csv_files)} CSV files for processing")

    # Process CSV files here
    results = []
    for csv_file in csv_files:
        # Process each CSV file
        results.append(f"Processed {os.path.basename(csv_file)}")

    return results

# Usage
results = extract_temporarily("data.tar.gz", process_extracted_files)
print("Processing results:", results)
```
## Best Practices

### Memory Efficient Processing

```python
import gdown

def stream_process_archive(archive_path):
    """Process large archives without keeping all files in memory."""

    # Extract files
    extracted_files = gdown.extractall(archive_path, to="./processing/")

    # Process files one at a time to manage memory
    for file_path in extracted_files:
        if file_path.endswith('.csv'):
            # Process individual CSV file
            print(f"Processing {file_path}")
            # ... process file ...

            # Optionally remove processed file to save disk space
            # os.remove(file_path)

    return len(extracted_files)
```

### Integration with Download Workflows

```python
import gdown

def complete_dataset_workflow(drive_url, expected_hash):
    """Complete workflow: download, verify, extract, and process."""

    # Step 1: Download with verification
    archive_path = gdown.cached_download(
        drive_url,
        hash=expected_hash,
        path="./cache/dataset.tar.gz"
    )

    # Step 2: Extract the archive
    extracted_files = gdown.extractall(archive_path, to="./data/")

    # Step 3: Organize extracted files
    organized = {
        'images': [f for f in extracted_files if f.endswith(('.jpg', '.png'))],
        'data': [f for f in extracted_files if f.endswith('.csv')],
        'docs': [f for f in extracted_files if f.endswith(('.txt', '.md'))]
    }

    print("Dataset organized:")
    for category, files in organized.items():
        print(f"  {category}: {len(files)} files")

    return organized

# Usage
dataset = complete_dataset_workflow(
    "https://drive.google.com/uc?id=DATASET_ID",
    "sha256:expected_dataset_hash"
)
```
```