or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

archives.md cloud-storage.md configuration.md data-formats.md directory-management.md file-operations.md index.md module-class.md nltk-integration.md web-scraping.md

data-formats.mddocs/

0

# Data Format Support

1

2

PyStow provides built-in support for common data formats with automatic parsing and serialization. It integrates with popular libraries like pandas, lxml, and rdflib to handle CSV, JSON, XML, RDF, Excel, and Python objects seamlessly.

3

4

## CSV and DataFrames

5

6

### CSV Download and Parsing

7

8

```python { .api }

9

def ensure_csv(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:

10

"""Download a CSV and open as a dataframe with pandas.

11

12

Args:

13

key: The module name

14

subkeys: A sequence of additional strings to join. If none are given, returns

15

the directory for this module.

16

url: The URL to download.

17

name: Overrides the name of the file at the end of the URL, if given. Also

18

useful for URLs that don't have proper filenames with extensions.

19

force: Should the download be done again, even if the path already exists?

20

Defaults to false.

21

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

22

read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.

23

24

Returns:

25

A pandas DataFrame

26

"""

27

```

28

29

### Excel Support

30

31

```python { .api }

32

def ensure_excel(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_excel_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:

33

"""Download an excel file and open as a dataframe with pandas.

34

35

Args:

36

key: The module name

37

subkeys: A sequence of additional strings to join. If none are given, returns

38

the directory for this module.

39

url: The URL to download.

40

name: Overrides the name of the file at the end of the URL, if given. Also

41

useful for URLs that don't have proper filenames with extensions.

42

force: Should the download be done again, even if the path already exists?

43

Defaults to false.

44

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

45

read_excel_kwargs: Keyword arguments to pass through to pandas.read_excel.

46

47

Returns:

48

A pandas DataFrame

49

"""

50

```

51

52

### DataFrame Operations

53

54

```python { .api }

55

def load_df(key: str, *subkeys: str, name: str, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:

56

"""Open a pre-existing CSV as a dataframe with pandas.

57

58

Args:

59

key: The module name

60

subkeys: A sequence of additional strings to join. If none are given, returns

61

the directory for this module.

62

name: The name of the CSV file to open, inside the directory given by key

63

and subkeys.

64

read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.

65

66

Returns:

67

A pandas DataFrame

68

"""

69

70

def dump_df(key: str, *subkeys: str, name: str, obj: pd.DataFrame, sep: str = "\t", index: bool = False, to_csv_kwargs: Mapping[str, Any] | None = None) -> None:

71

"""Dump a dataframe to a TSV file with pandas.

72

73

Args:

74

key: The module name

75

subkeys: A sequence of additional strings to join. If none are given, returns

76

the directory for this module.

77

name: The name of the file to write, inside the directory given by key

78

and subkeys.

79

obj: The dataframe to dump

80

sep: The separator to use, defaults to a tab

81

index: Should the index be dumped? Defaults to false.

82

to_csv_kwargs: Keyword arguments to pass through to pandas.DataFrame.to_csv.

83

"""

84

```

85

86

## JSON Format

87

88

### JSON Download and Parsing

89

90

```python { .api }

91

def ensure_json(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, open_kwargs: Mapping[str, Any] | None = None, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:

92

"""Download JSON and open with json.

93

94

Args:

95

key: The module name

96

subkeys: A sequence of additional strings to join. If none are given, returns

97

the directory for this module.

98

url: The URL to download.

99

name: Overrides the name of the file at the end of the URL, if given. Also

100

useful for URLs that don't have proper filenames with extensions.

101

force: Should the download be done again, even if the path already exists?

102

Defaults to false.

103

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

104

open_kwargs: Additional keyword arguments passed to open

105

json_load_kwargs: Keyword arguments to pass through to json.load.

106

107

Returns:

108

A JSON object (list, dict, etc.)

109

"""

110

```

111

112

### Compressed JSON

113

114

```python { .api }

115

def ensure_json_bz2(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, open_kwargs: Mapping[str, Any] | None = None, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:

116

"""Download BZ2-compressed JSON and open with json.

117

118

Args:

119

key: The module name

120

subkeys: A sequence of additional strings to join. If none are given, returns

121

the directory for this module.

122

url: The URL to download.

123

name: Overrides the name of the file at the end of the URL, if given. Also

124

useful for URLs that don't have proper filenames with extensions.

125

force: Should the download be done again, even if the path already exists?

126

Defaults to false.

127

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

128

open_kwargs: Additional keyword arguments passed to bz2.open

129

json_load_kwargs: Keyword arguments to pass through to json.load.

130

131

Returns:

132

A JSON object (list, dict, etc.)

133

"""

134

```

135

136

### JSON Operations

137

138

```python { .api }

139

def load_json(key: str, *subkeys: str, name: str, json_load_kwargs: Mapping[str, Any] | None = None) -> JSON:

140

"""Open a JSON file with json.

141

142

Args:

143

key: The module name

144

subkeys: A sequence of additional strings to join. If none are given, returns

145

the directory for this module.

146

name: The name of the file to open

147

json_load_kwargs: Keyword arguments to pass through to json.load.

148

149

Returns:

150

A JSON object (list, dict, etc.)

151

"""

152

153

def dump_json(key: str, *subkeys: str, name: str, obj: JSON, open_kwargs: Mapping[str, Any] | None = None, json_dump_kwargs: Mapping[str, Any] | None = None) -> None:

154

"""Dump an object to a file with json.

155

156

Args:

157

key: The module name

158

subkeys: A sequence of additional strings to join. If none are given, returns

159

the directory for this module.

160

name: The name of the file to open

161

obj: The object to dump

162

open_kwargs: Additional keyword arguments passed to open

163

json_dump_kwargs: Keyword arguments to pass through to json.dump.

164

"""

165

```

166

167

## XML Format

168

169

### XML Download and Parsing

170

171

```python { .api }

172

def ensure_xml(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:

173

"""Download an XML file and open it with lxml.

174

175

Args:

176

key: The module name

177

subkeys: A sequence of additional strings to join. If none are given, returns

178

the directory for this module.

179

url: The URL to download.

180

name: Overrides the name of the file at the end of the URL, if given. Also

181

useful for URLs that don't have proper filenames with extensions.

182

force: Should the download be done again, even if the path already exists?

183

Defaults to false.

184

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

185

parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.

186

187

Returns:

188

An ElementTree object

189

"""

190

```

191

192

### XML Operations

193

194

```python { .api }

195

def load_xml(key: str, *subkeys: str, name: str, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:

196

"""Load an XML file with lxml.

197

198

Args:

199

key: The module name

200

subkeys: A sequence of additional strings to join. If none are given, returns

201

the directory for this module.

202

name: The name of the file to open

203

parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.

204

205

Returns:

206

An ElementTree object

207

"""

208

209

def dump_xml(key: str, *subkeys: str, name: str, obj: lxml.etree.ElementTree, open_kwargs: Mapping[str, Any] | None = None, write_kwargs: Mapping[str, Any] | None = None) -> None:

210

"""Dump an XML element tree to a file with lxml.

211

212

Args:

213

key: The module name

214

subkeys: A sequence of additional strings to join. If none are given, returns

215

the directory for this module.

216

name: The name of the file to open

217

obj: The object to dump

218

open_kwargs: Additional keyword arguments passed to open

219

write_kwargs: Keyword arguments to pass through to lxml.etree.ElementTree.write.

220

"""

221

```

222

223

## RDF Format

224

225

### RDF Download and Parsing

226

227

```python { .api }

228

def ensure_rdf(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, precache: bool = True, parse_kwargs: Mapping[str, Any] | None = None) -> rdflib.Graph:

229

"""Download an RDF file and open with rdflib.

230

231

Args:

232

key: The module name

233

subkeys: A sequence of additional strings to join. If none are given, returns

234

the directory for this module.

235

url: The URL to download.

236

name: Overrides the name of the file at the end of the URL, if given. Also

237

useful for URLs that don't have proper filenames with extensions.

238

force: Should the download be done again, even if the path already exists?

239

Defaults to false.

240

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

241

precache: Should the parsed rdflib.Graph be stored as a pickle for

242

fast loading?

243

parse_kwargs: Keyword arguments to pass through to pystow.utils.read_rdf

244

and transitively to rdflib.Graph.parse.

245

246

Returns:

247

An RDF graph

248

"""

249

```

250

251

### RDF Operations

252

253

```python { .api }

254

def load_rdf(key: str, *subkeys: str, name: str | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> rdflib.Graph:

255

"""Open an RDF file with rdflib.

256

257

Args:

258

key: The name of the module. No funny characters. The envvar <key>_HOME where

259

key is uppercased is checked first before using the default home directory.

260

subkeys: A sequence of additional strings to join. If none are given, returns

261

the directory for this module.

262

name: The name of the file to open

263

parse_kwargs: Keyword arguments to pass through to pystow.utils.read_rdf

264

and transitively to rdflib.Graph.parse.

265

266

Returns:

267

An RDF graph

268

"""

269

270

def dump_rdf(key: str, *subkeys: str, name: str, obj: rdflib.Graph, format: str = "turtle", serialize_kwargs: Mapping[str, Any] | None = None) -> None:

271

"""Dump an RDF graph to a file with rdflib.

272

273

Args:

274

key: The name of the module. No funny characters. The envvar <key>_HOME where

275

key is uppercased is checked first before using the default home directory.

276

subkeys: A sequence of additional strings to join. If none are given, returns

277

the directory for this module.

278

name: The name of the file to open

279

obj: The object to dump

280

format: The format to dump in

281

serialize_kwargs: Keyword arguments to pass through to rdflib.Graph.serialize.

282

"""

283

```

284

285

## Pickle Format

286

287

### Pickle Operations

288

289

```python { .api }

290

def ensure_pickle(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None, pickle_load_kwargs: Mapping[str, Any] | None = None) -> Any:

291

"""Download a pickle file and open with pickle.

292

293

Args:

294

key: The module name

295

subkeys: A sequence of additional strings to join. If none are given, returns

296

the directory for this module.

297

url: The URL to download.

298

name: Overrides the name of the file at the end of the URL, if given. Also

299

useful for URLs that don't have proper filenames with extensions.

300

force: Should the download be done again, even if the path already exists?

301

Defaults to false.

302

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

303

mode: The read mode, passed to open

304

open_kwargs: Additional keyword arguments passed to open

305

pickle_load_kwargs: Keyword arguments to pass through to pickle.load.

306

307

Returns:

308

Any object

309

"""

310

311

def load_pickle(key: str, *subkeys: str, name: str, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None, pickle_load_kwargs: Mapping[str, Any] | None = None) -> Any:

312

"""Open a pickle file with pickle.

313

314

Args:

315

key: The module name

316

subkeys: A sequence of additional strings to join. If none are given, returns

317

the directory for this module.

318

name: The name of the file to open

319

mode: The read mode, passed to open

320

open_kwargs: Additional keyword arguments passed to open

321

pickle_load_kwargs: Keyword arguments to pass through to pickle.load.

322

323

Returns:

324

Any object

325

"""

326

327

def dump_pickle(key: str, *subkeys: str, name: str, obj: Any, mode: Literal["wb"] = "wb", open_kwargs: Mapping[str, Any] | None = None, pickle_dump_kwargs: Mapping[str, Any] | None = None) -> None:

328

"""Dump an object to a file with pickle.

329

330

Args:

331

key: The module name

332

subkeys: A sequence of additional strings to join. If none are given, returns

333

the directory for this module.

334

name: The name of the file to open

335

obj: The object to dump

336

mode: The write mode, passed to open

337

open_kwargs: Additional keyword arguments passed to open

338

pickle_dump_kwargs: Keyword arguments to pass through to pickle.dump.

339

"""

340

```

341

342

## Usage Examples

343

344

### CSV and DataFrames

345

346

```python

347

import pystow

348

import pandas as pd

349

350

# Download and parse CSV

351

df = pystow.ensure_csv(

352

"myapp", "datasets",

353

url="https://example.com/data.csv",

354

read_csv_kwargs={"sep": ",", "header": 0}

355

)

356

357

# Load existing CSV

358

df = pystow.load_df("myapp", "processed", name="clean_data.csv")

359

360

# Save DataFrame

361

pystow.dump_df(

362

"myapp", "outputs",

363

name="results.tsv",

364

obj=df,

365

sep="\t"

366

)

367

368

# Excel files

369

excel_df = pystow.ensure_excel(

370

"myapp", "reports",

371

url="https://example.com/report.xlsx",

372

read_excel_kwargs={"sheet_name": "Summary"}

373

)

374

```

375

376

### JSON Data

377

378

```python

379

import pystow

380

381

# Download and parse JSON

382

config = pystow.ensure_json(

383

"myapp", "config",

384

url="https://api.example.com/config.json"

385

)

386

387

# Load existing JSON

388

data = pystow.load_json("myapp", "cache", name="api_response.json")

389

390

# Save JSON data

391

pystow.dump_json(

392

"myapp", "outputs",

393

name="results.json",

394

obj={"status": "complete", "count": 42},

395

json_dump_kwargs={"indent": 2}

396

)

397

398

# Compressed JSON

399

large_data = pystow.ensure_json_bz2(

400

"myapp", "datasets",

401

url="https://example.com/large_dataset.json.bz2"

402

)

403

```

404

405

### XML Processing

406

407

```python

408

import pystow

409

from lxml import etree

410

411

# Download and parse XML

412

tree = pystow.ensure_xml(

413

"myapp", "schemas",

414

url="https://example.com/schema.xml"

415

)

416

417

# Access elements

418

root = tree.getroot()

419

elements = root.xpath("//element[@type='important']")

420

421

# Load existing XML

422

local_tree = pystow.load_xml("myapp", "data", name="document.xml")

423

424

# Save XML

425

pystow.dump_xml(

426

"myapp", "outputs",

427

name="modified.xml",

428

obj=tree

429

)

430

```

431

432

### RDF Data

433

434

```python

435

import pystow

436

import rdflib

437

438

# Download and parse RDF with caching

439

graph = pystow.ensure_rdf(

440

"myapp", "ontologies",

441

url="https://example.com/ontology.rdf.gz",

442

parse_kwargs={"format": "xml"},

443

precache=True # Cache parsed graph as pickle for speed

444

)

445

446

# Query the graph

447

results = graph.query("""

448

SELECT ?subject ?predicate ?object

449

WHERE { ?subject ?predicate ?object }

450

LIMIT 10

451

""")

452

453

# Save RDF in different format

454

pystow.dump_rdf(

455

"myapp", "outputs",

456

name="data.ttl",

457

obj=graph,

458

format="turtle"

459

)

460

```

461

462

### Python Objects

463

464

```python

465

import pystow

466

467

# Download and load pickled object

468

model = pystow.ensure_pickle(

469

"myapp", "models",

470

url="https://example.com/trained_model.pkl"

471

)

472

473

# Save Python object

474

data_structure = {"key": "value", "list": [1, 2, 3]}

475

pystow.dump_pickle(

476

"myapp", "cache",

477

name="data.pkl",

478

obj=data_structure

479

)

480

481

# Load existing pickle

482

cached_data = pystow.load_pickle("myapp", "cache", name="data.pkl")

483

```