0
# Data Format Support
1
2
PyStow provides built-in support for common data formats with automatic parsing and serialization. It integrates with popular libraries like pandas, lxml, and rdflib to handle CSV, JSON, XML, RDF, Excel, and Python objects seamlessly.
3
4
## CSV and DataFrames
5
6
### CSV Download and Parsing
7
8
```python { .api }
9
def ensure_csv(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
10
"""Download a CSV and open as a dataframe with pandas.
11
12
Args:
13
key: The module name
14
subkeys: A sequence of additional strings to join. If none are given, returns
15
the directory for this module.
16
url: The URL to download.
17
name: Overrides the name of the file at the end of the URL, if given. Also
18
useful for URLs that don't have proper filenames with extensions.
19
force: Should the download be done again, even if the path already exists?
20
Defaults to false.
21
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
22
read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
23
24
Returns:
25
A pandas DataFrame
26
"""
27
```
28
29
### Excel Support
30
31
```python { .api }
32
def ensure_excel(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_excel_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
33
"""Download an excel file and open as a dataframe with pandas.
34
35
Args:
36
key: The module name
37
subkeys: A sequence of additional strings to join. If none are given, returns
38
the directory for this module.
39
url: The URL to download.
40
name: Overrides the name of the file at the end of the URL, if given. Also
41
useful for URLs that don't have proper filenames with extensions.
42
force: Should the download be done again, even if the path already exists?
43
Defaults to false.
44
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
45
read_excel_kwargs: Keyword arguments to pass through to pandas.read_excel.
46
47
Returns:
48
A pandas DataFrame
49
"""
50
```
51
52
### DataFrame Operations
53
54
```python { .api }
55
def load_df(key: str, *subkeys: str, name: str, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:
56
"""Open a pre-existing CSV as a dataframe with pandas.
57
58
Args:
59
key: The module name
60
subkeys: A sequence of additional strings to join. If none are given, returns
61
the directory for this module.
62
name: The name of the existing CSV file to open, located in the directory
63
given by key and subkeys.
64
read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.
65
66
Returns:
67
A pandas DataFrame
68
"""
69
70
def dump_df(key: str, *subkeys: str, name: str, obj: pd.DataFrame, sep: str = "\t", index: bool = False, to_csv_kwargs: Mapping[str, Any] | None = None) -> None:
71
"""Dump a dataframe to a TSV file with pandas.
72
73
Args:
74
key: The module name
75
subkeys: A sequence of additional strings to join. If none are given, returns
76
the directory for this module.
77
name: The name of the file to write the dataframe to, located in the directory
78
given by key and subkeys.
79
obj: The dataframe to dump
80
sep: The separator to use, defaults to a tab
81
index: Should the index be dumped? Defaults to false.
82
to_csv_kwargs: Keyword arguments to pass through to pandas.DataFrame.to_csv.
83
"""
84
```
85
86
## JSON Format
87
88
### JSON Download and Parsing
89
90
```python { .api }
91
def ensure_json(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, open_kwargs: Mapping[str, Any] | None = None, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:
92
"""Download JSON and open with json.
93
94
Args:
95
key: The module name
96
subkeys: A sequence of additional strings to join. If none are given, returns
97
the directory for this module.
98
url: The URL to download.
99
name: Overrides the name of the file at the end of the URL, if given. Also
100
useful for URLs that don't have proper filenames with extensions.
101
force: Should the download be done again, even if the path already exists?
102
Defaults to false.
103
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
104
open_kwargs: Additional keyword arguments passed to open
105
json_load_kwargs: Keyword arguments to pass through to json.load.
106
107
Returns:
108
A JSON object (list, dict, etc.)
109
"""
110
```
111
112
### Compressed JSON
113
114
```python { .api }
115
def ensure_json_bz2(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, open_kwargs: Mapping[str, Any] | None = None, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:
116
"""Download BZ2-compressed JSON and open with json.
117
118
Args:
119
key: The module name
120
subkeys: A sequence of additional strings to join. If none are given, returns
121
the directory for this module.
122
url: The URL to download.
123
name: Overrides the name of the file at the end of the URL, if given. Also
124
useful for URLs that don't have proper filenames with extensions.
125
force: Should the download be done again, even if the path already exists?
126
Defaults to false.
127
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
128
open_kwargs: Additional keyword arguments passed to bz2.open
129
json_load_kwargs: Keyword arguments to pass through to json.load.
130
131
Returns:
132
A JSON object (list, dict, etc.)
133
"""
134
```
135
136
### JSON Operations
137
138
```python { .api }
139
def load_json(key: str, *subkeys: str, name: str, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:
140
"""Open a JSON file with json.
141
142
Args:
143
key: The module name
144
subkeys: A sequence of additional strings to join. If none are given, returns
145
the directory for this module.
146
name: The name of the file to open
147
json_load_kwargs: Keyword arguments to pass through to json.load.
148
149
Returns:
150
A JSON object (list, dict, etc.)
151
"""
152
153
def dump_json(key: str, *subkeys: str, name: str, obj: JSON, open_kwargs: Mapping[str, Any] | None = None, json_dump_kwargs: Mapping[str, Any] | None = None) -> None:
154
"""Dump an object to a file with json.
155
156
Args:
157
key: The module name
158
subkeys: A sequence of additional strings to join. If none are given, returns
159
the directory for this module.
160
name: The name of the file to open
161
obj: The object to dump
162
open_kwargs: Additional keyword arguments passed to open
163
json_dump_kwargs: Keyword arguments to pass through to json.dump.
164
"""
165
```
166
167
## XML Format
168
169
### XML Download and Parsing
170
171
```python { .api }
172
def ensure_xml(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:
173
"""Download an XML file and open it with lxml.
174
175
Args:
176
key: The module name
177
subkeys: A sequence of additional strings to join. If none are given, returns
178
the directory for this module.
179
url: The URL to download.
180
name: Overrides the name of the file at the end of the URL, if given. Also
181
useful for URLs that don't have proper filenames with extensions.
182
force: Should the download be done again, even if the path already exists?
183
Defaults to false.
184
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
185
parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.
186
187
Returns:
188
An ElementTree object
189
"""
190
```
191
192
### XML Operations
193
194
```python { .api }
195
def load_xml(key: str, *subkeys: str, name: str, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:
196
"""Load an XML file with lxml.
197
198
Args:
199
key: The module name
200
subkeys: A sequence of additional strings to join. If none are given, returns
201
the directory for this module.
202
name: The name of the file to open
203
parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.
204
205
Returns:
206
An ElementTree object
207
"""
208
209
def dump_xml(key: str, *subkeys: str, name: str, obj: lxml.etree.ElementTree, open_kwargs: Mapping[str, Any] | None = None, write_kwargs: Mapping[str, Any] | None = None) -> None:
210
"""Dump an XML element tree to a file with lxml.
211
212
Args:
213
key: The module name
214
subkeys: A sequence of additional strings to join. If none are given, returns
215
the directory for this module.
216
name: The name of the file to open
217
obj: The object to dump
218
open_kwargs: Additional keyword arguments passed to open
219
write_kwargs: Keyword arguments to pass through to lxml.etree.ElementTree.write.
220
"""
221
```
222
223
## RDF Format
224
225
### RDF Download and Parsing
226
227
```python { .api }
228
def ensure_rdf(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, precache: bool = True, parse_kwargs: Mapping[str, Any] | None = None) -> rdflib.Graph:
229
"""Download an RDF file and open with rdflib.
230
231
Args:
232
key: The module name
233
subkeys: A sequence of additional strings to join. If none are given, returns
234
the directory for this module.
235
url: The URL to download.
236
name: Overrides the name of the file at the end of the URL, if given. Also
237
useful for URLs that don't have proper filenames with extensions.
238
force: Should the download be done again, even if the path already exists?
239
Defaults to false.
240
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
241
precache: Should the parsed rdflib.Graph be stored as a pickle for
242
fast loading?
243
parse_kwargs: Keyword arguments to pass through to pystow.utils.read_rdf
244
and transitively to rdflib.Graph.parse.
245
246
Returns:
247
An RDF graph
248
"""
249
```
250
251
### RDF Operations
252
253
```python { .api }
254
def load_rdf(key: str, *subkeys: str, name: str | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> rdflib.Graph:
255
"""Open an RDF file with rdflib.
256
257
Args:
258
key: The name of the module. No funny characters. The envvar <key>_HOME where
259
key is uppercased is checked first before using the default home directory.
260
subkeys: A sequence of additional strings to join. If none are given, returns
261
the directory for this module.
262
name: The name of the file to open
263
parse_kwargs: Keyword arguments to pass through to pystow.utils.read_rdf
264
and transitively to rdflib.Graph.parse.
265
266
Returns:
267
An RDF graph
268
"""
269
270
def dump_rdf(key: str, *subkeys: str, name: str, obj: rdflib.Graph, format: str = "turtle", serialize_kwargs: Mapping[str, Any] | None = None) -> None:
271
"""Dump an RDF graph to a file with rdflib.
272
273
Args:
274
key: The name of the module. No funny characters. The envvar <key>_HOME where
275
key is uppercased is checked first before using the default home directory.
276
subkeys: A sequence of additional strings to join. If none are given, returns
277
the directory for this module.
278
name: The name of the file to open
279
obj: The object to dump
280
format: The format to dump in
281
serialize_kwargs: Keyword arguments to pass through to rdflib.Graph.serialize.
282
"""
283
```
284
285
## Pickle Format
286
287
### Pickle Operations
288
289
```python { .api }
290
def ensure_pickle(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None, pickle_load_kwargs: Mapping[str, Any] | None = None) -> Any:
291
"""Download a pickle file and open with pickle.
292
293
Args:
294
key: The module name
295
subkeys: A sequence of additional strings to join. If none are given, returns
296
the directory for this module.
297
url: The URL to download.
298
name: Overrides the name of the file at the end of the URL, if given. Also
299
useful for URLs that don't have proper filenames with extensions.
300
force: Should the download be done again, even if the path already exists?
301
Defaults to false.
302
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
303
mode: The read mode, passed to open
304
open_kwargs: Additional keyword arguments passed to open
305
pickle_load_kwargs: Keyword arguments to pass through to pickle.load.
306
307
Returns:
308
Any object
309
"""
310
311
def load_pickle(key: str, *subkeys: str, name: str, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None, pickle_load_kwargs: Mapping[str, Any] | None = None) -> Any:
312
"""Open a pickle file with pickle.
313
314
Args:
315
key: The module name
316
subkeys: A sequence of additional strings to join. If none are given, returns
317
the directory for this module.
318
name: The name of the file to open
319
mode: The read mode, passed to open
320
open_kwargs: Additional keyword arguments passed to open
321
pickle_load_kwargs: Keyword arguments to pass through to pickle.load.
322
323
Returns:
324
Any object
325
"""
326
327
def dump_pickle(key: str, *subkeys: str, name: str, obj: Any, mode: Literal["wb"] = "wb", open_kwargs: Mapping[str, Any] | None = None, pickle_dump_kwargs: Mapping[str, Any] | None = None) -> None:
328
"""Dump an object to a file with pickle.
329
330
Args:
331
key: The module name
332
subkeys: A sequence of additional strings to join. If none are given, returns
333
the directory for this module.
334
name: The name of the file to open
335
obj: The object to dump
336
mode: The write mode, passed to open
337
open_kwargs: Additional keyword arguments passed to open
338
pickle_dump_kwargs: Keyword arguments to pass through to pickle.dump.
339
"""
340
```
341
342
## Usage Examples
343
344
### CSV and DataFrames
345
346
```python
347
import pystow
348
import pandas as pd
349
350
# Download and parse CSV
351
df = pystow.ensure_csv(
352
"myapp", "datasets",
353
url="https://example.com/data.csv",
354
read_csv_kwargs={"sep": ",", "header": 0}
355
)
356
357
# Load existing CSV
358
df = pystow.load_df("myapp", "processed", name="clean_data.csv")
359
360
# Save DataFrame
361
pystow.dump_df(
362
"myapp", "outputs",
363
name="results.tsv",
364
obj=df,
365
sep="\t"
366
)
367
368
# Excel files
369
excel_df = pystow.ensure_excel(
370
"myapp", "reports",
371
url="https://example.com/report.xlsx",
372
read_excel_kwargs={"sheet_name": "Summary"}
373
)
374
```
375
376
### JSON Data
377
378
```python
379
import pystow
380
381
# Download and parse JSON
382
config = pystow.ensure_json(
383
"myapp", "config",
384
url="https://api.example.com/config.json"
385
)
386
387
# Load existing JSON
388
data = pystow.load_json("myapp", "cache", name="api_response.json")
389
390
# Save JSON data
391
pystow.dump_json(
392
"myapp", "outputs",
393
name="results.json",
394
obj={"status": "complete", "count": 42},
395
json_dump_kwargs={"indent": 2}
396
)
397
398
# Compressed JSON
399
large_data = pystow.ensure_json_bz2(
400
"myapp", "datasets",
401
url="https://example.com/large_dataset.json.bz2"
402
)
403
```
404
405
### XML Processing
406
407
```python
408
import pystow
409
from lxml import etree
410
411
# Download and parse XML
412
tree = pystow.ensure_xml(
413
"myapp", "schemas",
414
url="https://example.com/schema.xml"
415
)
416
417
# Access elements
418
root = tree.getroot()
419
elements = root.xpath("//element[@type='important']")
420
421
# Load existing XML
422
local_tree = pystow.load_xml("myapp", "data", name="document.xml")
423
424
# Save XML
425
pystow.dump_xml(
426
"myapp", "outputs",
427
name="modified.xml",
428
obj=tree
429
)
430
```
431
432
### RDF Data
433
434
```python
435
import pystow
436
import rdflib
437
438
# Download and parse RDF with caching
439
graph = pystow.ensure_rdf(
440
"myapp", "ontologies",
441
url="https://example.com/ontology.rdf.gz",
442
parse_kwargs={"format": "xml"},
443
precache=True # Cache parsed graph as pickle for speed
444
)
445
446
# Query the graph
447
results = graph.query("""
448
SELECT ?subject ?predicate ?object
449
WHERE { ?subject ?predicate ?object }
450
LIMIT 10
451
""")
452
453
# Save RDF in different format
454
pystow.dump_rdf(
455
"myapp", "outputs",
456
name="data.ttl",
457
obj=graph,
458
format="turtle"
459
)
460
```
461
462
### Python Objects
463
464
```python
465
import pystow
466
467
# Download and load pickled object (only unpickle data from sources you trust)
468
model = pystow.ensure_pickle(
469
"myapp", "models",
470
url="https://example.com/trained_model.pkl"
471
)
472
473
# Save Python object
474
data_structure = {"key": "value", "list": [1, 2, 3]}
475
pystow.dump_pickle(
476
"myapp", "cache",
477
name="data.pkl",
478
obj=data_structure
479
)
480
481
# Load existing pickle
482
cached_data = pystow.load_pickle("myapp", "cache", name="data.pkl")
483
```