Tessl Tile for pypi/pystow@0.7.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

archives.md cloud-storage.md configuration.md data-formats.md directory-management.md file-operations.md index.md module-class.md nltk-integration.md web-scraping.md

data-formats.mddocs/

0
# Data Format Support
1

2
PyStow provides built-in support for common data formats with automatic parsing and serialization. It integrates with popular libraries like pandas, lxml, and rdflib to handle CSV, JSON, XML, RDF, Excel, and Python objects seamlessly.
3

4
## CSV and DataFrames
5

6
### CSV Download and Parsing
7

8
```python { .api }
9
def ensure_csv(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
10
    """Download a CSV and open as a dataframe with pandas.
11
    
12
    Args:
13
        key: The module name
14
        subkeys: A sequence of additional strings to join. If none are given, returns
15
            the directory for this module.
16
        url: The URL to download.
17
        name: Overrides the name of the file at the end of the URL, if given. Also
18
            useful for URLs that don't have proper filenames with extensions.
19
        force: Should the download be done again, even if the path already exists?
20
            Defaults to false.
21
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
22
        read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
23
    
24
    Returns:
25
        A pandas DataFrame
26
    """
27
```
28

29
### Excel Support
30

31
```python { .api }
32
def ensure_excel(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_excel_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
33
    """Download an excel file and open as a dataframe with pandas.
34
    
35
    Args:
36
        key: The module name
37
        subkeys: A sequence of additional strings to join. If none are given, returns
38
            the directory for this module.
39
        url: The URL to download.
40
        name: Overrides the name of the file at the end of the URL, if given. Also
41
            useful for URLs that don't have proper filenames with extensions.
42
        force: Should the download be done again, even if the path already exists?
43
            Defaults to false.
44
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
45
        read_excel_kwargs: Keyword arguments to pass through to pandas.read_excel.
46
    
47
    Returns:
48
        A pandas DataFrame
49
    """
50
```
51

52
### DataFrame Operations
53

54
```python { .api }
55
def load_df(key: str, *subkeys: str, name: str, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
56
    """Open a pre-existing CSV as a dataframe with pandas.
57
    
58
    Args:
59
        key: The module name
60
        subkeys: A sequence of additional strings to join. If none are given, returns
61
            the directory for this module.
62
        name: The name of the CSV file to open. The file must already exist in
63
            the module's directory.
64
        read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
65
    
66
    Returns:
67
        A pandas DataFrame
68
    """
69

70
def dump_df(key: str, *subkeys: str, name: str, obj: pd.DataFrame, sep: str = "\t", index: bool = False, to_csv_kwargs: Mapping[str, Any] | None = None) -> None:
71
    """Dump a dataframe to a TSV file with pandas.
72
    
73
    Args:
74
        key: The module name
75
        subkeys: A sequence of additional strings to join. If none are given, returns
76
            the directory for this module.
77
        name: The name of the file to write the dataframe to, inside the
78
            module's directory.
79
        obj: The dataframe to dump
80
        sep: The separator to use, defaults to a tab
81
        index: Should the index be dumped? Defaults to false.
82
        to_csv_kwargs: Keyword arguments to pass through to pandas.DataFrame.to_csv.
83
    """
84
```
85

86
## JSON Format
87

88
### JSON Download and Parsing
89

90
```python { .api }
91
def ensure_json(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, open_kwargs: Mapping[str, Any] | None = None, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:
92
    """Download JSON and open with json.
93
    
94
    Args:
95
        key: The module name
96
        subkeys: A sequence of additional strings to join. If none are given, returns
97
            the directory for this module.
98
        url: The URL to download.
99
        name: Overrides the name of the file at the end of the URL, if given. Also
100
            useful for URLs that don't have proper filenames with extensions.
101
        force: Should the download be done again, even if the path already exists?
102
            Defaults to false.
103
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
104
        open_kwargs: Additional keyword arguments passed to open
105
        json_load_kwargs: Keyword arguments to pass through to json.load.
106
    
107
    Returns:
108
        A JSON object (list, dict, etc.)
109
    """
110
```
111

112
### Compressed JSON
113

114
```python { .api }
115
def ensure_json_bz2(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, open_kwargs: Mapping[str, Any] | None = None, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:
116
    """Download BZ2-compressed JSON and open with json.
117
    
118
    Args:
119
        key: The module name
120
        subkeys: A sequence of additional strings to join. If none are given, returns
121
            the directory for this module.
122
        url: The URL to download.
123
        name: Overrides the name of the file at the end of the URL, if given. Also
124
            useful for URLs that don't have proper filenames with extensions.
125
        force: Should the download be done again, even if the path already exists?
126
            Defaults to false.
127
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
128
        open_kwargs: Additional keyword arguments passed to bz2.open
129
        json_load_kwargs: Keyword arguments to pass through to json.load.
130
    
131
    Returns:
132
        A JSON object (list, dict, etc.)
133
    """
134
```
135

136
### JSON Operations
137

138
```python { .api }
139
def load_json(key: str, *subkeys: str, name: str, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:
140
    """Open a JSON file with json.
141
    
142
    Args:
143
        key: The module name
144
        subkeys: A sequence of additional strings to join. If none are given, returns
145
            the directory for this module.
146
        name: The name of the file to open
147
        json_load_kwargs: Keyword arguments to pass through to json.load.
148
    
149
    Returns:
150
        A JSON object (list, dict, etc.)
151
    """
152

153
def dump_json(key: str, *subkeys: str, name: str, obj: JSON, open_kwargs: Mapping[str, Any] | None = None, json_dump_kwargs: Mapping[str, Any] | None = None) -> None:
154
    """Dump an object to a file with json.
155
    
156
    Args:
157
        key: The module name
158
        subkeys: A sequence of additional strings to join. If none are given, returns
159
            the directory for this module.
160
        name: The name of the file to open
161
        obj: The object to dump
162
        open_kwargs: Additional keyword arguments passed to open
163
        json_dump_kwargs: Keyword arguments to pass through to json.dump.
164
    """
165
```
166

167
## XML Format
168

169
### XML Download and Parsing
170

171
```python { .api }
172
def ensure_xml(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:
173
    """Download an XML file and open it with lxml.
174
    
175
    Args:
176
        key: The module name
177
        subkeys: A sequence of additional strings to join. If none are given, returns
178
            the directory for this module.
179
        url: The URL to download.
180
        name: Overrides the name of the file at the end of the URL, if given. Also
181
            useful for URLs that don't have proper filenames with extensions.
182
        force: Should the download be done again, even if the path already exists?
183
            Defaults to false.
184
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
185
        parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.
186
    
187
    Returns:
188
        An ElementTree object
189
    """
190
```
191

192
### XML Operations
193

194
```python { .api }
195
def load_xml(key: str, *subkeys: str, name: str, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:
196
    """Load an XML file with lxml.
197
    
198
    Args:
199
        key: The module name
200
        subkeys: A sequence of additional strings to join. If none are given, returns
201
            the directory for this module.
202
        name: The name of the file to open
203
        parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.
204
    
205
    Returns:
206
        An ElementTree object
207
    """
208

209
def dump_xml(key: str, *subkeys: str, name: str, obj: lxml.etree.ElementTree, open_kwargs: Mapping[str, Any] | None = None, write_kwargs: Mapping[str, Any] | None = None) -> None:
210
    """Dump an XML element tree to a file with lxml.
211
    
212
    Args:
213
        key: The module name
214
        subkeys: A sequence of additional strings to join. If none are given, returns
215
            the directory for this module.
216
        name: The name of the file to open
217
        obj: The object to dump
218
        open_kwargs: Additional keyword arguments passed to open
219
        write_kwargs: Keyword arguments to pass through to lxml.etree.ElementTree.write.
220
    """
221
```
222

223
## RDF Format
224

225
### RDF Download and Parsing
226

227
```python { .api }
228
def ensure_rdf(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, precache: bool = True, parse_kwargs: Mapping[str, Any] | None = None) -> rdflib.Graph:
229
    """Download an RDF file and open with rdflib.
230
    
231
    Args:
232
        key: The module name
233
        subkeys: A sequence of additional strings to join. If none are given, returns
234
            the directory for this module.
235
        url: The URL to download.
236
        name: Overrides the name of the file at the end of the URL, if given. Also
237
            useful for URLs that don't have proper filenames with extensions.
238
        force: Should the download be done again, even if the path already exists?
239
            Defaults to false.
240
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
241
        precache: Should the parsed rdflib.Graph be stored as a pickle for
242
            fast loading?
243
        parse_kwargs: Keyword arguments to pass through to pystow.utils.read_rdf
244
            and transitively to rdflib.Graph.parse.
245
    
246
    Returns:
247
        An RDF graph
248
    """
249
```
250

251
### RDF Operations
252

253
```python { .api }
254
def load_rdf(key: str, *subkeys: str, name: str | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> rdflib.Graph:
255
    """Open an RDF file with rdflib.
256
    
257
    Args:
258
        key: The name of the module. No funny characters. The envvar <key>_HOME where
259
            key is uppercased is checked first before using the default home directory.
260
        subkeys: A sequence of additional strings to join. If none are given, returns
261
            the directory for this module.
262
        name: The name of the file to open
263
        parse_kwargs: Keyword arguments to pass through to pystow.utils.read_rdf
264
            and transitively to rdflib.Graph.parse.
265
    
266
    Returns:
267
        An RDF graph
268
    """
269

270
def dump_rdf(key: str, *subkeys: str, name: str, obj: rdflib.Graph, format: str = "turtle", serialize_kwargs: Mapping[str, Any] | None = None) -> None:
271
    """Dump an RDF graph to a file with rdflib.
272
    
273
    Args:
274
        key: The name of the module. No funny characters. The envvar <key>_HOME where
275
            key is uppercased is checked first before using the default home directory.
276
        subkeys: A sequence of additional strings to join. If none are given, returns
277
            the directory for this module.
278
        name: The name of the file to open
279
        obj: The object to dump
280
        format: The format to dump in
281
        serialize_kwargs: Keyword arguments to pass through to rdflib.Graph.serialize.
282
    """
283
```
284

285
## Pickle Format
286

287
### Pickle Operations
288

289
```python { .api }
290
def ensure_pickle(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None, pickle_load_kwargs: Mapping[str, Any] | None = None) -> Any:
291
    """Download a pickle file and open with pickle.
292
    
293
    Args:
294
        key: The module name
295
        subkeys: A sequence of additional strings to join. If none are given, returns
296
            the directory for this module.
297
        url: The URL to download.
298
        name: Overrides the name of the file at the end of the URL, if given. Also
299
            useful for URLs that don't have proper filenames with extensions.
300
        force: Should the download be done again, even if the path already exists?
301
            Defaults to false.
302
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
303
        mode: The read mode, passed to open
304
        open_kwargs: Additional keyword arguments passed to open
305
        pickle_load_kwargs: Keyword arguments to pass through to pickle.load.
306
    
307
    Returns:
308
        Any object
309
    """
310

311
def load_pickle(key: str, *subkeys: str, name: str, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None, pickle_load_kwargs: Mapping[str, Any] | None = None) -> Any:
312
    """Open a pickle file with pickle.
313
    
314
    Args:
315
        key: The module name
316
        subkeys: A sequence of additional strings to join. If none are given, returns
317
            the directory for this module.
318
        name: The name of the file to open
319
        mode: The read mode, passed to open
320
        open_kwargs: Additional keyword arguments passed to open
321
        pickle_load_kwargs: Keyword arguments to pass through to pickle.load.
322
    
323
    Returns:
324
        Any object
325
    """
326

327
def dump_pickle(key: str, *subkeys: str, name: str, obj: Any, mode: Literal["wb"] = "wb", open_kwargs: Mapping[str, Any] | None = None, pickle_dump_kwargs: Mapping[str, Any] | None = None) -> None:
328
    """Dump an object to a file with pickle.
329
    
330
    Args:
331
        key: The module name
332
        subkeys: A sequence of additional strings to join. If none are given, returns
333
            the directory for this module.
334
        name: The name of the file to open
335
        obj: The object to dump
336
        mode: The write mode, passed to open
337
        open_kwargs: Additional keyword arguments passed to open
338
        pickle_dump_kwargs: Keyword arguments to pass through to pickle.dump.
339
    """
340
```
341

342
## Usage Examples
343

344
### CSV and DataFrames
345

346
```python
347
import pystow
348
import pandas as pd
349

350
# Download and parse CSV
351
df = pystow.ensure_csv(
352
    "myapp", "datasets",
353
    url="https://example.com/data.csv",
354
    read_csv_kwargs={"sep": ",", "header": 0}
355
)
356

357
# Load existing CSV
358
df = pystow.load_df("myapp", "processed", name="clean_data.csv")
359

360
# Save DataFrame
361
pystow.dump_df(
362
    "myapp", "outputs",
363
    name="results.tsv",
364
    obj=df,
365
    sep="\t"
366
)
367

368
# Excel files
369
excel_df = pystow.ensure_excel(
370
    "myapp", "reports",
371
    url="https://example.com/report.xlsx",
372
    read_excel_kwargs={"sheet_name": "Summary"}
373
)
374
```
375

376
### JSON Data
377

378
```python
379
import pystow
380

381
# Download and parse JSON
382
config = pystow.ensure_json(
383
    "myapp", "config",
384
    url="https://api.example.com/config.json"
385
)
386

387
# Load existing JSON
388
data = pystow.load_json("myapp", "cache", name="api_response.json")
389

390
# Save JSON data
391
pystow.dump_json(
392
    "myapp", "outputs",
393
    name="results.json",
394
    obj={"status": "complete", "count": 42},
395
    json_dump_kwargs={"indent": 2}
396
)
397

398
# Compressed JSON
399
large_data = pystow.ensure_json_bz2(
400
    "myapp", "datasets",
401
    url="https://example.com/large_dataset.json.bz2"
402
)
403
```
404

405
### XML Processing
406

407
```python
408
import pystow
409
from lxml import etree
410

411
# Download and parse XML
412
tree = pystow.ensure_xml(
413
    "myapp", "schemas",
414
    url="https://example.com/schema.xml"
415
)
416

417
# Access elements
418
root = tree.getroot()
419
elements = root.xpath("//element[@type='important']")
420

421
# Load existing XML
422
local_tree = pystow.load_xml("myapp", "data", name="document.xml")
423

424
# Save XML
425
pystow.dump_xml(
426
    "myapp", "outputs",
427
    name="modified.xml",
428
    obj=tree
429
)
430
```
431

432
### RDF Data
433

434
```python
435
import pystow
436
import rdflib
437

438
# Download and parse RDF with caching
439
graph = pystow.ensure_rdf(
440
    "myapp", "ontologies",
441
    url="https://example.com/ontology.rdf.gz",
442
    parse_kwargs={"format": "xml"},
443
    precache=True  # Cache parsed graph as pickle for speed
444
)
445

446
# Query the graph
447
results = graph.query("""
448
    SELECT ?subject ?predicate ?object
449
    WHERE { ?subject ?predicate ?object }
450
    LIMIT 10
451
""")
452

453
# Save RDF in different format
454
pystow.dump_rdf(
455
    "myapp", "outputs",
456
    name="data.ttl",
457
    obj=graph,
458
    format="turtle"
459
)
460
```
461

462
### Python Objects
463

464
```python
465
import pystow
466

467
# Download and load pickled object
468
model = pystow.ensure_pickle(
469
    "myapp", "models",
470
    url="https://example.com/trained_model.pkl"
471
)
472

473
# Save Python object
474
data_structure = {"key": "value", "list": [1, 2, 3]}
475
pystow.dump_pickle(
476
    "myapp", "cache",
477
    name="data.pkl",
478
    obj=data_structure
479
)
480

481
# Load existing pickle
482
cached_data = pystow.load_pickle("myapp", "cache", name="data.pkl")
483
```

Version

Tile

Files

data-formats.mddocs/

data-formats.mddocs/