0
# Archive and Compression Support
1
2
PyStow provides comprehensive support for compressed archives and files, including ZIP, TAR, GZIP, LZMA, and BZ2 formats. It can automatically extract archives, access files within archives, and handle various compression formats transparently.
3
4
## Archive Extraction
5
6
### TAR Archive Extraction
7
8
```python { .api }
9
def ensure_untar(key: str, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:
10
"""Ensure a file is downloaded and untarred.
11
12
Args:
13
key: The name of the module. No funny characters. The envvar <key>_HOME where
14
key is uppercased is checked first before using the default home directory.
15
subkeys: A sequence of additional strings to join. If none are given, returns
16
the directory for this module.
17
url: The URL to download.
18
name: Overrides the name of the file at the end of the URL, if given. Also
19
useful for URLs that don't have proper filenames with extensions.
20
directory: Overrides the name of the directory into which the tar archive is
21
extracted. If none given, will use the stem of the file name that gets
22
downloaded.
23
force: Should the download be done again, even if the path already exists?
24
Defaults to false.
25
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
26
extract_kwargs: Keyword arguments to pass to tarfile.TarFile.extractall.
27
28
Returns:
29
The path of the directory where the file that has been downloaded gets
30
extracted to
31
"""
32
```
33
34
### GZIP Decompression
35
36
```python { .api }
37
def ensure_gunzip(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, autoclean: bool = True, download_kwargs: Mapping[str, Any] | None = None) -> Path:
38
"""Ensure a file is downloaded and gunzipped.
39
40
Args:
41
key: The name of the module. No funny characters. The envvar <key>_HOME where
42
key is uppercased is checked first before using the default home directory.
43
subkeys: A sequence of additional strings to join. If none are given, returns
44
the directory for this module.
45
url: The URL to download.
46
name: Overrides the name of the file at the end of the URL, if given. Also
47
useful for URLs that don't have proper filenames with extensions.
48
force: Should the download be done again, even if the path already exists?
49
Defaults to false.
50
autoclean: Should the gzipped (.gz) file be deleted after decompression? Defaults to true.
51
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
52
53
Returns:
54
The path of the decompressed file (not a directory) produced by gunzipping
55
the downloaded file
56
"""
57
```
58
59
## Compressed Archive Access
60
61
### ZIP File Access
62
63
```python { .api }
64
@contextmanager
65
def ensure_open_zip(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: str = "r", open_kwargs: Mapping[str, Any] | None = None) -> BytesOpener:
66
"""Ensure a file is downloaded then open it with zipfile.
67
68
Args:
69
key: The name of the module. No funny characters. The envvar <key>_HOME
70
where key is uppercased is checked first before using the default home
71
directory.
72
subkeys: A sequence of additional strings to join. If none are given, returns
73
the directory for this module.
74
url: The URL to download.
75
inner_path: The relative path to the file inside the archive
76
name: Overrides the name of the file at the end of the URL, if given. Also
77
useful for URLs that don't have proper filenames with extensions.
78
force: Should the download be done again, even if the path already exists?
79
Defaults to false.
80
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
81
mode: The read mode, passed to zipfile.ZipFile.open
82
open_kwargs: Additional keyword arguments passed to zipfile.ZipFile.open
83
84
Yields:
85
An open file object
86
"""
87
```
88
89
### TAR File Access
90
91
```python { .api }
92
@contextmanager
93
def ensure_open_tarfile(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: str = "r", open_kwargs: Mapping[str, Any] | None = None) -> BytesOpener:
94
"""Ensure a tar file is downloaded and open a file inside it.
95
96
Args:
97
key: The name of the module. No funny characters. The envvar <key>_HOME
98
where key is uppercased is checked first before using the default home
99
directory.
100
subkeys: A sequence of additional strings to join. If none are given, returns
101
the directory for this module.
102
url: The URL to download.
103
inner_path: The relative path to the file inside the archive
104
name: Overrides the name of the file at the end of the URL, if given. Also
105
useful for URLs that don't have proper filenames with extensions.
106
force: Should the download be done again, even if the path already exists?
107
Defaults to false.
108
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
109
mode: The read mode, passed to tarfile.open
110
open_kwargs: Additional keyword arguments passed to tarfile.open
111
112
Yields:
113
An open file object
114
"""
115
```
116
117
## Compression Format Support
118
119
### GZIP Files
120
121
```python { .api }
122
@contextmanager
123
def ensure_open_gz(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rb", "w", "wb", "rt", "wt"] = "rb", open_kwargs: Mapping[str, Any] | None = None) -> Generator[StringIO | BytesIO, None, None]:
124
"""Ensure a gzipped file is downloaded and open it with gzip.
125
126
Args:
127
key: The name of the module. No funny characters. The envvar <key>_HOME
128
where key is uppercased is checked first before using the default home
129
directory.
130
subkeys: A sequence of additional strings to join. If none are given, returns
131
the directory for this module.
132
url: The URL to download.
133
name: Overrides the name of the file at the end of the URL, if given. Also
134
useful for URLs that don't have proper filenames with extensions.
135
force: Should the download be done again, even if the path already exists?
136
Defaults to false.
137
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
138
mode: The read mode, passed to gzip.open
139
open_kwargs: Additional keyword arguments passed to gzip.open
140
141
Yields:
142
An open file object
143
"""
144
```
145
146
### LZMA Files
147
148
```python { .api }
149
@contextmanager
150
def ensure_open_lzma(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rb", "w", "wb", "rt", "wt"] = "rt", open_kwargs: Mapping[str, Any] | None = None) -> Generator[lzma.LZMAFile | io.TextIOWrapper[lzma.LZMAFile], None, None]:
151
"""Ensure an LZMA-compressed file is downloaded and open it with lzma.
152
153
Args:
154
key: The name of the module. No funny characters. The envvar <key>_HOME
155
where key is uppercased is checked first before using the default home
156
directory.
157
subkeys: A sequence of additional strings to join. If none are given, returns
158
the directory for this module.
159
url: The URL to download.
160
name: Overrides the name of the file at the end of the URL, if given. Also
161
useful for URLs that don't have proper filenames with extensions.
162
force: Should the download be done again, even if the path already exists?
163
Defaults to false.
164
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
165
mode: The read mode, passed to lzma.open
166
open_kwargs: Additional keyword arguments passed to lzma.open
167
168
Yields:
169
An open file object
170
"""
171
```
172
173
### BZ2 Files
174
175
```python { .api }
176
@contextmanager
177
def ensure_open_bz2(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None) -> Generator[bz2.BZ2File, None, None]:
178
"""Ensure a BZ2-compressed file is downloaded and open it with bz2.
179
180
Args:
181
key: The name of the module. No funny characters. The envvar <key>_HOME
182
where key is uppercased is checked first before using the default home
183
directory.
184
subkeys: A sequence of additional strings to join. If none are given, returns
185
the directory for this module.
186
url: The URL to download.
187
name: Overrides the name of the file at the end of the URL, if given. Also
188
useful for URLs that don't have proper filenames with extensions.
189
force: Should the download be done again, even if the path already exists?
190
Defaults to false.
191
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
192
mode: The read mode, passed to bz2.open
193
open_kwargs: Additional keyword arguments passed to bz2.open
194
195
Yields:
196
An open file object
197
"""
198
```
199
200
## Archive Data Format Support
201
202
### CSV from Archives
203
204
```python { .api }
205
def ensure_zip_df(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
206
"""Download a zip file and open an inner file as a dataframe with pandas.
207
208
Args:
209
key: The module name
210
subkeys: A sequence of additional strings to join. If none are given, returns
211
the directory for this module.
212
url: The URL to download.
213
inner_path: The relative path to the file inside the archive
214
name: Overrides the name of the file at the end of the URL, if given. Also
215
useful for URLs that don't have proper filenames with extensions.
216
force: Should the download be done again, even if the path already exists?
217
Defaults to false.
218
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
219
read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
220
221
Returns:
222
A pandas DataFrame
223
"""
224
225
def ensure_tar_df(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
226
"""Download a tar file and open an inner file as a dataframe with pandas.
227
228
Args:
229
key: The module name
230
subkeys: A sequence of additional strings to join. If none are given, returns
231
the directory for this module.
232
url: The URL to download.
233
inner_path: The relative path to the file inside the archive
234
name: Overrides the name of the file at the end of the URL, if given. Also
235
useful for URLs that don't have proper filenames with extensions.
236
force: Should the download be done again, even if the path already exists?
237
Defaults to false.
238
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
239
read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
240
241
Returns:
242
A pandas DataFrame
243
"""
244
```
245
246
### XML from Archives
247
248
```python { .api }
249
def ensure_tar_xml(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:
250
"""Download a tar file and open an inner file as an XML with lxml.
251
252
Args:
253
key: The module name
254
subkeys: A sequence of additional strings to join. If none are given, returns
255
the directory for this module.
256
url: The URL to download.
257
inner_path: The relative path to the file inside the archive
258
name: Overrides the name of the file at the end of the URL, if given. Also
259
useful for URLs that don't have proper filenames with extensions.
260
force: Should the download be done again, even if the path already exists?
261
Defaults to false.
262
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
263
parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.
264
265
Returns:
266
An ElementTree object
267
"""
268
```
269
270
### NumPy Arrays from Archives
271
272
```python { .api }
273
def ensure_zip_np(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, load_kwargs: Mapping[str, Any] | None = None) -> numpy.typing.ArrayLike:
274
"""Download a zip file and open an inner file as an array-like with numpy.
275
276
Args:
277
key: The module name
278
subkeys: A sequence of additional strings to join. If none are given, returns
279
the directory for this module.
280
url: The URL to download.
281
inner_path: The relative path to the file inside the archive
282
name: Overrides the name of the file at the end of the URL, if given. Also
283
useful for URLs that don't have proper filenames with extensions.
284
force: Should the download be done again, even if the path already exists?
285
Defaults to false.
286
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
287
load_kwargs: Additional keyword arguments that are passed through to
288
read_zip_np and transitively to numpy.load.
289
290
Returns:
291
An array-like object
292
"""
293
```
294
295
## Usage Examples
296
297
### TAR Archive Extraction
298
299
```python
300
import pystow
301
302
# Download and extract tar archive
303
extracted_dir = pystow.ensure_untar(
304
"myapp", "datasets",
305
url="https://example.com/dataset.tar.gz",
306
directory="dataset_v1" # Custom extraction directory name
307
)
308
309
# Access extracted files
310
data_file = extracted_dir / "data" / "train.csv"
311
```
312
313
### GZIP Decompression
314
315
```python
316
import pystow
317
318
# Download and decompress gzipped file
319
decompressed_file = pystow.ensure_gunzip(
320
"myapp", "data",
321
url="https://example.com/large_file.txt.gz",
322
autoclean=True # Remove .gz file after decompression
323
)
324
325
# Read decompressed content
326
content = decompressed_file.read_text()
327
```
328
329
### Working with ZIP Archives
330
331
```python
332
import pystow
333
334
# Access file inside ZIP archive without extraction
335
with pystow.ensure_open_zip(
336
"myapp", "archives",
337
url="https://example.com/data.zip",
338
inner_path="data/file.txt"
339
) as file:
340
content = file.read()
341
342
# Extract DataFrame from CSV inside ZIP
343
df = pystow.ensure_zip_df(
344
"myapp", "datasets",
345
url="https://example.com/dataset.zip",
346
inner_path="dataset/train.csv",
347
read_csv_kwargs={"sep": ","}
348
)
349
350
# Load NumPy array from ZIP
351
array = pystow.ensure_zip_np(
352
"myapp", "arrays",
353
url="https://example.com/arrays.zip",
354
inner_path="data.npy"
355
)
356
```
357
358
### Working with TAR Archives
359
360
```python
361
import pystow
362
363
# Access file inside TAR archive
364
with pystow.ensure_open_tarfile(
365
"myapp", "archives",
366
url="https://example.com/data.tar.gz",
367
inner_path="data/config.json"
368
) as file:
369
import json
370
config = json.load(file)
371
372
# Extract DataFrame from TAR
373
df = pystow.ensure_tar_df(
374
"myapp", "datasets",
375
url="https://example.com/dataset.tar.bz2",
376
inner_path="dataset/data.csv"
377
)
378
379
# Parse XML from TAR
380
tree = pystow.ensure_tar_xml(
381
"myapp", "documents",
382
url="https://example.com/docs.tar.gz",
383
inner_path="docs/schema.xml"
384
)
385
```
386
387
### Compressed File Formats
388
389
```python
390
import pystow
391
392
# Work with GZIP files
393
with pystow.ensure_open_gz(
394
"myapp", "logs",
395
url="https://example.com/logfile.log.gz",
396
mode="rt" # Text mode
397
) as file:
398
lines = file.readlines()
399
400
# Work with LZMA/XZ files
401
with pystow.ensure_open_lzma(
402
"myapp", "compressed",
403
url="https://example.com/data.txt.xz",
404
mode="rt"
405
) as file:
406
data = file.read()
407
408
# Work with BZ2 files
409
with pystow.ensure_open_bz2(
410
"myapp", "compressed",
411
url="https://example.com/data.bz2",
412
mode="rb"
413
) as file:
414
binary_data = file.read()
415
```
416
417
### Compressed Data Formats
418
419
```python
420
import pystow
421
422
# Load gzipped pickle
423
model = pystow.ensure_pickle_gz(
424
"myapp", "models",
425
url="https://example.com/model.pkl.gz"
426
)
427
428
# Load BZ2-compressed JSON
429
data = pystow.ensure_json_bz2(
430
"myapp", "data",
431
url="https://api.example.com/large_dataset.json.bz2"
432
)
433
434
# Save a pickle (not gzipped by this call)
435
pystow.module("myapp").dump_pickle(
436
"cache",
437
name="processed_data.pkl",
438
obj=large_data_structure
439
)
440
# Then manually compress if needed
441
```
442
443
### Complex Archive Workflows
444
445
```python
446
import pystow
447
import pandas as pd
448
449
# Download archive, extract specific file, process data
450
def process_archive_data(archive_url, inner_file):
451
# Extract DataFrame from archive
452
df = pystow.ensure_zip_df(
453
"myapp", "raw_data",
454
url=archive_url,
455
inner_path=inner_file,
456
read_csv_kwargs={"sep": "\t"}
457
)
458
459
# Process data
460
processed_df = df.groupby("category").agg({
461
"value": "sum",
462
"count": "mean"
463
})
464
465
# Save processed data
466
pystow.dump_df(
467
"myapp", "processed",
468
name="summary.csv",
469
obj=processed_df
470
)
471
472
return processed_df
473
474
# Use the function
475
result = process_archive_data(
476
"https://example.com/dataset.zip",
477
"raw/data.tsv"
478
)
479
```