Tessl Tile for pypi/pystow@0.7.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

archives.md cloud-storage.md configuration.md data-formats.md directory-management.md file-operations.md index.md module-class.md nltk-integration.md web-scraping.md

archives.mddocs/

0
# Archive and Compression Support
1

2
PyStow provides comprehensive support for compressed archives and files, including ZIP, TAR, GZIP, LZMA, and BZ2 formats. It can automatically extract archives, access files within archives, and handle various compression formats transparently.
3

4
## Archive Extraction
5

6
### TAR Archive Extraction
7

8
```python { .api }
9
def ensure_untar(key: str, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:
10
    """Ensure a file is downloaded and untarred.
11
    
12
    Args:
13
        key: The name of the module. No funny characters. The envvar <key>_HOME where
14
            key is uppercased is checked first before using the default home directory.
15
        subkeys: A sequence of additional strings to join. If none are given, returns
16
            the directory for this module.
17
        url: The URL to download.
18
        name: Overrides the name of the file at the end of the URL, if given. Also
19
            useful for URLs that don't have proper filenames with extensions.
20
        directory: Overrides the name of the directory into which the tar archive is
21
            extracted. If none given, will use the stem of the file name that gets
22
            downloaded.
23
        force: Should the download be done again, even if the path already exists?
24
            Defaults to false.
25
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
26
        extract_kwargs: Keyword arguments to pass to tarfile.TarFile.extractall.
27
    
28
    Returns:
29
        The path of the directory where the file that has been downloaded gets
30
        extracted to
31
    """
32
```
33

34
### GZIP Decompression
35

36
```python { .api }
37
def ensure_gunzip(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, autoclean: bool = True, download_kwargs: Mapping[str, Any] | None = None) -> Path:
38
    """Ensure a file is downloaded and gunzipped.
39
    
40
    Args:
41
        key: The name of the module. No funny characters. The envvar <key>_HOME where
42
            key is uppercased is checked first before using the default home directory.
43
        subkeys: A sequence of additional strings to join. If none are given, returns
44
            the directory for this module.
45
        url: The URL to download.
46
        name: Overrides the name of the file at the end of the URL, if given. Also
47
            useful for URLs that don't have proper filenames with extensions.
48
        force: Should the download be done again, even if the path already exists?
49
            Defaults to false.
50
        autoclean: Should the zipped file be deleted?
51
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
52
    
53
    Returns:
54
        The path of the file that the downloaded gzip archive gets
55
        decompressed to
56
    """
57
```
58

59
## Compressed Archive Access
60

61
### ZIP File Access
62

63
```python { .api }
64
@contextmanager
65
def ensure_open_zip(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: str = "r", open_kwargs: Mapping[str, Any] | None = None) -> BytesOpener:
66
    """Ensure a file is downloaded then open it with zipfile.
67
    
68
    Args:
69
        key: The name of the module. No funny characters. The envvar <key>_HOME
70
            where key is uppercased is checked first before using the default home
71
            directory.
72
        subkeys: A sequence of additional strings to join. If none are given, returns
73
            the directory for this module.
74
        url: The URL to download.
75
        inner_path: The relative path to the file inside the archive
76
        name: Overrides the name of the file at the end of the URL, if given. Also
77
            useful for URLs that don't have proper filenames with extensions.
78
        force: Should the download be done again, even if the path already exists?
79
            Defaults to false.
80
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
81
        mode: The read mode, passed to zipfile.open
82
        open_kwargs: Additional keyword arguments passed to zipfile.open
83
    
84
    Yields:
85
        An open file object
86
    """
87
```
88

89
### TAR File Access
90

91
```python { .api }
92
@contextmanager
93
def ensure_open_tarfile(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: str = "r", open_kwargs: Mapping[str, Any] | None = None) -> BytesOpener:
94
    """Ensure a tar file is downloaded and open a file inside it.
95
    
96
    Args:
97
        key: The name of the module. No funny characters. The envvar <key>_HOME
98
            where key is uppercased is checked first before using the default home
99
            directory.
100
        subkeys: A sequence of additional strings to join. If none are given, returns
101
            the directory for this module.
102
        url: The URL to download.
103
        inner_path: The relative path to the file inside the archive
104
        name: Overrides the name of the file at the end of the URL, if given. Also
105
            useful for URLs that don't have proper filenames with extensions.
106
        force: Should the download be done again, even if the path already exists?
107
            Defaults to false.
108
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
109
        mode: The read mode, passed to tarfile.open
110
        open_kwargs: Additional keyword arguments passed to tarfile.open
111
    
112
    Yields:
113
        An open file object
114
    """
115
```
116

117
## Compression Format Support
118

119
### GZIP Files
120

121
```python { .api }
122
@contextmanager
123
def ensure_open_gz(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rb", "w", "wb", "rt", "wt"] = "rb", open_kwargs: Mapping[str, Any] | None = None) -> Generator[StringIO | BytesIO, None, None]:
124
    """Ensure a gzipped file is downloaded and open a file inside it.
125
    
126
    Args:
127
        key: The name of the module. No funny characters. The envvar <key>_HOME
128
            where key is uppercased is checked first before using the default home
129
            directory.
130
        subkeys: A sequence of additional strings to join. If none are given, returns
131
            the directory for this module.
132
        url: The URL to download.
133
        name: Overrides the name of the file at the end of the URL, if given. Also
134
            useful for URLs that don't have proper filenames with extensions.
135
        force: Should the download be done again, even if the path already exists?
136
            Defaults to false.
137
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
138
        mode: The read mode, passed to gzip.open
139
        open_kwargs: Additional keyword arguments passed to gzip.open
140
    
141
    Yields:
142
        An open file object
143
    """
144
```
145

146
### LZMA Files
147

148
```python { .api }
149
@contextmanager
150
def ensure_open_lzma(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rb", "w", "wb", "rt", "wt"] = "rt", open_kwargs: Mapping[str, Any] | None = None) -> Generator[lzma.LZMAFile | io.TextIOWrapper[lzma.LZMAFile], None, None]:
151
    """Ensure an LZMA-compressed file is downloaded and open a file inside it.
152
    
153
    Args:
154
        key: The name of the module. No funny characters. The envvar <key>_HOME
155
            where key is uppercased is checked first before using the default home
156
            directory.
157
        subkeys: A sequence of additional strings to join. If none are given, returns
158
            the directory for this module.
159
        url: The URL to download.
160
        name: Overrides the name of the file at the end of the URL, if given. Also
161
            useful for URLs that don't have proper filenames with extensions.
162
        force: Should the download be done again, even if the path already exists?
163
            Defaults to false.
164
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
165
        mode: The read mode, passed to lzma.open
166
        open_kwargs: Additional keyword arguments passed to lzma.open
167
    
168
    Yields:
169
        An open file object
170
    """
171
```
172

173
### BZ2 Files
174

175
```python { .api }
176
@contextmanager
177
def ensure_open_bz2(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None) -> Generator[bz2.BZ2File, None, None]:
178
    """Ensure a BZ2-compressed file is downloaded and open a file inside it.
179
    
180
    Args:
181
        key: The name of the module. No funny characters. The envvar <key>_HOME
182
            where key is uppercased is checked first before using the default home
183
            directory.
184
        subkeys: A sequence of additional strings to join. If none are given, returns
185
            the directory for this module.
186
        url: The URL to download.
187
        name: Overrides the name of the file at the end of the URL, if given. Also
188
            useful for URLs that don't have proper filenames with extensions.
189
        force: Should the download be done again, even if the path already exists?
190
            Defaults to false.
191
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
192
        mode: The read mode, passed to bz2.open
193
        open_kwargs: Additional keyword arguments passed to bz2.open
194
    
195
    Yields:
196
        An open file object
197
    """
198
```
199

200
## Archive Data Format Support
201

202
### CSV from Archives
203

204
```python { .api }
205
def ensure_zip_df(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
206
    """Download a zip file and open an inner file as a dataframe with pandas.
207
    
208
    Args:
209
        key: The module name
210
        subkeys: A sequence of additional strings to join. If none are given, returns
211
            the directory for this module.
212
        url: The URL to download.
213
        inner_path: The relative path to the file inside the archive
214
        name: Overrides the name of the file at the end of the URL, if given. Also
215
            useful for URLs that don't have proper filenames with extensions.
216
        force: Should the download be done again, even if the path already exists?
217
            Defaults to false.
218
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
219
        read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
220
    
221
    Returns:
222
        A pandas DataFrame
223
    """
224

225
def ensure_tar_df(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
226
    """Download a tar file and open an inner file as a dataframe with pandas.
227
    
228
    Args:
229
        key: The module name
230
        subkeys: A sequence of additional strings to join. If none are given, returns
231
            the directory for this module.
232
        url: The URL to download.
233
        inner_path: The relative path to the file inside the archive
234
        name: Overrides the name of the file at the end of the URL, if given. Also
235
            useful for URLs that don't have proper filenames with extensions.
236
        force: Should the download be done again, even if the path already exists?
237
            Defaults to false.
238
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
239
        read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
240
    
241
    Returns:
242
        A dataframe
243
    """
244
```
245

246
### XML from Archives
247

248
```python { .api }
249
def ensure_tar_xml(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:
250
    """Download a tar file and open an inner file as an XML with lxml.
251
    
252
    Args:
253
        key: The module name
254
        subkeys: A sequence of additional strings to join. If none are given, returns
255
            the directory for this module.
256
        url: The URL to download.
257
        inner_path: The relative path to the file inside the archive
258
        name: Overrides the name of the file at the end of the URL, if given. Also
259
            useful for URLs that don't have proper filenames with extensions.
260
        force: Should the download be done again, even if the path already exists?
261
            Defaults to false.
262
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
263
        parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.
264
    
265
    Returns:
266
        An ElementTree object
267
    """
268
```
269

270
### NumPy Arrays from Archives
271

272
```python { .api }
273
def ensure_zip_np(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, load_kwargs: Mapping[str, Any] | None = None) -> numpy.typing.ArrayLike:
274
    """Download a zip file and open an inner file as an array-like with numpy.
275
    
276
    Args:
277
        key: The module name
278
        subkeys: A sequence of additional strings to join. If none are given, returns
279
            the directory for this module.
280
        url: The URL to download.
281
        inner_path: The relative path to the file inside the archive
282
        name: Overrides the name of the file at the end of the URL, if given. Also
283
            useful for URLs that don't have proper filenames with extensions.
284
        force: Should the download be done again, even if the path already exists?
285
            Defaults to false.
286
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
287
        load_kwargs: Additional keyword arguments that are passed through to
288
            read_zip_np and transitively to numpy.load.
289
    
290
    Returns:
291
        An array-like object
292
    """
293
```
294

295
## Usage Examples
296

297
### TAR Archive Extraction
298

299
```python
300
import pystow
301

302
# Download and extract tar archive
303
extracted_dir = pystow.ensure_untar(
304
    "myapp", "datasets",
305
    url="https://example.com/dataset.tar.gz",
306
    directory="dataset_v1"  # Custom extraction directory name
307
)
308

309
# Access extracted files
310
data_file = extracted_dir / "data" / "train.csv"
311
```
312

313
### GZIP Decompression
314

315
```python
316
import pystow
317

318
# Download and decompress gzipped file
319
decompressed_file = pystow.ensure_gunzip(
320
    "myapp", "data",
321
    url="https://example.com/large_file.txt.gz",
322
    autoclean=True  # Remove .gz file after decompression
323
)
324

325
# Read decompressed content
326
content = decompressed_file.read_text()
327
```
328

329
### Working with ZIP Archives
330

331
```python
332
import pystow
333

334
# Access file inside ZIP archive without extraction
335
with pystow.ensure_open_zip(
336
    "myapp", "archives",
337
    url="https://example.com/data.zip",
338
    inner_path="data/file.txt"
339
) as file:
340
    content = file.read()
341

342
# Extract DataFrame from CSV inside ZIP
343
df = pystow.ensure_zip_df(
344
    "myapp", "datasets", 
345
    url="https://example.com/dataset.zip",
346
    inner_path="dataset/train.csv",
347
    read_csv_kwargs={"sep": ","}
348
)
349

350
# Load NumPy array from ZIP
351
array = pystow.ensure_zip_np(
352
    "myapp", "arrays",
353
    url="https://example.com/arrays.zip", 
354
    inner_path="data.npy"
355
)
356
```
357

358
### Working with TAR Archives
359

360
```python
361
import pystow
362

363
# Access file inside TAR archive
364
with pystow.ensure_open_tarfile(
365
    "myapp", "archives",
366
    url="https://example.com/data.tar.gz",
367
    inner_path="data/config.json"
368
) as file:
369
    import json
370
    config = json.load(file)
371

372
# Extract DataFrame from TAR
373
df = pystow.ensure_tar_df(
374
    "myapp", "datasets",
375
    url="https://example.com/dataset.tar.bz2",
376
    inner_path="dataset/data.csv"
377
)
378

379
# Parse XML from TAR
380
tree = pystow.ensure_tar_xml(
381
    "myapp", "documents",
382
    url="https://example.com/docs.tar.gz",
383
    inner_path="docs/schema.xml"
384
)
385
```
386

387
### Compressed File Formats
388

389
```python
390
import pystow
391

392
# Work with GZIP files
393
with pystow.ensure_open_gz(
394
    "myapp", "logs",
395
    url="https://example.com/logfile.log.gz",
396
    mode="rt"  # Text mode
397
) as file:
398
    lines = file.readlines()
399

400
# Work with LZMA/XZ files  
401
with pystow.ensure_open_lzma(
402
    "myapp", "compressed",
403
    url="https://example.com/data.txt.xz",
404
    mode="rt"
405
) as file:
406
    data = file.read()
407

408
# Work with BZ2 files
409
with pystow.ensure_open_bz2(
410
    "myapp", "compressed", 
411
    url="https://example.com/data.bz2",
412
    mode="rb"
413
) as file:
414
    binary_data = file.read()
415
```
416

417
### Compressed Data Formats
418

419
```python
420
import pystow
421

422
# Load gzipped pickle
423
model = pystow.ensure_pickle_gz(
424
    "myapp", "models",
425
    url="https://example.com/model.pkl.gz"
426
)
427

428
# Load BZ2-compressed JSON
429
data = pystow.ensure_json_bz2(
430
    "myapp", "data",
431
    url="https://api.example.com/large_dataset.json.bz2"
432
)
433

434
# Save a pickle (written uncompressed, as the .pkl name indicates)
435
pystow.module("myapp").dump_pickle(
436
    "cache",
437
    name="processed_data.pkl",
438
    obj=large_data_structure
439
)
440
# Then manually compress if needed
441
```
442

443
### Complex Archive Workflows
444

445
```python
446
import pystow
447
import pandas as pd
448

449
# Download archive, extract specific file, process data
450
def process_archive_data(archive_url, inner_file):
451
    # Extract DataFrame from archive
452
    df = pystow.ensure_zip_df(
453
        "myapp", "raw_data",
454
        url=archive_url,
455
        inner_path=inner_file,
456
        read_csv_kwargs={"sep": "\t"}
457
    )
458
    
459
    # Process data
460
    processed_df = df.groupby("category").agg({
461
        "value": "sum",
462
        "count": "mean"
463
    })
464
    
465
    # Save processed data
466
    pystow.dump_df(
467
        "myapp", "processed",
468
        name="summary.csv",
469
        obj=processed_df
470
    )
471
    
472
    return processed_df
473

474
# Use the function
475
result = process_archive_data(
476
    "https://example.com/dataset.zip",
477
    "raw/data.tsv"
478
)
479
```

Version

Tile

Files

archives.md docs/

archives.mddocs/