# Module Class API

The Module class provides an object-oriented interface for PyStow's directory management and file operations. It encapsulates all functionality within a specific directory context, making it ideal for organizing data within applications.

## Core Module Class

```python { .api }
class Module:
    """The class wrapping the directory lookup implementation."""

    def __init__(self, base: str | Path, ensure_exists: bool = True) -> None:
        """Initialize the module.

        Args:
            base: The base directory for the module
            ensure_exists: Should the base directory be created automatically?
                Defaults to true.
        """

    @classmethod
    def from_key(cls, key: str, *subkeys: str, ensure_exists: bool = True) -> Module:
        """Get a module for the given directory or one of its subdirectories.

        Args:
            key: The name of the module. No funny characters. The envvar <key>_HOME
                where key is uppercased is checked first before using the default home
                directory.
            subkeys: A sequence of additional strings to join. If none are given,
                returns the directory for this module.
            ensure_exists: Should all directories be created automatically? Defaults
                to true.

        Returns:
            A module
        """
```

## Directory Management Methods

```python { .api }
def module(self, *subkeys: str, ensure_exists: bool = True) -> Module:
    """Get a module for a subdirectory of the current module.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        ensure_exists: Should all directories be created automatically? Defaults
            to true.

    Returns:
        A module representing the subdirectory based on the given subkeys.
    """

def join(self, *subkeys: str, name: str | None = None, ensure_exists: bool = True, version: VersionHint = None) -> Path:
    """Get a subdirectory of the current module.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        ensure_exists: Should all directories be created automatically? Defaults
            to true.
        name: The name of the file (optional) inside the folder
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.

    Returns:
        The path of the directory or subdirectory for the given module.
    """

def joinpath_sqlite(self, *subkeys: str, name: str) -> str:
    """Get an SQLite database connection string.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The name of the database file.

    Returns:
        A SQLite path string.
    """
```

## File Download Methods

```python { .api }
def ensure(self, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a file is downloaded.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """

def ensure_custom(self, *subkeys: str, name: str, force: bool = False, provider: Provider, **kwargs: Any) -> Path:
    """Ensure a file is present, and run a custom create function otherwise.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The file name.
        force: Should the file be re-created, even if the path already exists?
        provider: The file provider. Will be run with the path as the first
            positional argument, if the file needs to be generated.
        kwargs: Additional keyword-based parameters passed to the provider.

    Returns:
        The path of the file that has been created (or already exists)

    Raises:
        ValueError: If the provider was called but the file was not created by it.
    """
```

## Archive and Compression Methods

```python { .api }
def ensure_untar(self, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a tar file is downloaded and unarchived.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        directory: Overrides the name of the directory into which the tar archive
            is extracted. If none given, will use the stem of the file name that gets
            downloaded.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        extract_kwargs: Keyword arguments to pass to tarfile.TarFile.extract_all.

    Returns:
        The path of the directory where the file that has been downloaded gets
        extracted to
    """

def ensure_gunzip(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, autoclean: bool = True, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a tar.gz file is downloaded and unarchived.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        autoclean: Should the zipped file be deleted?
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.

    Returns:
        The path of the directory where the file that has been downloaded gets
        extracted to
    """
```

## File I/O Context Managers

The Module class provides all the same context manager methods as the functional API:

- `open()` - Open files with various modes
- `open_gz()` - Open gzipped files
- `ensure_open()` - Download and open files
- `ensure_open_zip()` - Download zip and open inner files
- `ensure_open_lzma()` - Download and open LZMA files
- `ensure_open_tarfile()` - Download tar and open inner files
- `ensure_open_gz()` - Download and open gzipped files
- `ensure_open_bz2()` - Download and open BZ2 files

## Data Format Methods

The Module class provides all data format methods:

### CSV/DataFrame Methods

- `ensure_csv()` - Download CSV as DataFrame
- `load_df()` - Load existing CSV as DataFrame
- `dump_df()` - Save DataFrame to file
- `ensure_excel()` - Download Excel as DataFrame
- `ensure_tar_df()` - Extract CSV from TAR archive
- `ensure_zip_df()` - Extract CSV from ZIP archive

### JSON Methods

- `ensure_json()` - Download and parse JSON
- `ensure_json_bz2()` - Download compressed JSON
- `load_json()` - Load existing JSON file
- `dump_json()` - Save object as JSON

### XML Methods

- `ensure_xml()` - Download and parse XML
- `ensure_tar_xml()` - Extract XML from TAR archive
- `load_xml()` - Load existing XML file
- `dump_xml()` - Save XML ElementTree

### RDF Methods

- `ensure_rdf()` - Download and parse RDF with caching
- `load_rdf()` - Load existing RDF file
- `dump_rdf()` - Save RDF graph

### Pickle Methods

- `ensure_pickle()` - Download and load pickle
- `ensure_pickle_gz()` - Download compressed pickle
- `load_pickle()` - Load existing pickle
- `load_pickle_gz()` - Load compressed pickle
- `dump_pickle()` - Save object as pickle

### NumPy Methods

- `ensure_zip_np()` - Load NumPy array from ZIP
## Cloud Storage Methods

```python { .api }
def ensure_from_s3(self, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, client: botocore.client.BaseClient | None = None, client_kwargs: Mapping[str, Any] | None = None, download_file_kwargs: Mapping[str, Any] | None = None, force: bool = False) -> Path:
    """Ensure a file is downloaded from AWS S3.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        s3_bucket: The S3 bucket name
        s3_key: The S3 key name
        name: Overrides the name of the file at the end of the S3 key, if given.
        client: A botocore client. If none given, one will be created
            automatically
        client_kwargs: Keyword arguments to be passed to the client on
            instantiation.
        download_file_kwargs: Keyword arguments to be passed to
            boto3.s3.transfer.S3Transfer.download_file
        force: Should the download be done again, even if the path already
            exists? Defaults to false.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """

def ensure_from_google(self, *subkeys: str, name: str, file_id: str, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a file is downloaded from Google Drive.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The name of the file
        file_id: The file identifier of the Google file. If your share link is
            https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then
            your file ID is 1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to
            pystow.utils.download_from_google.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
```

## Database Methods

```python { .api }
@contextmanager
def ensure_open_sqlite(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Generator[sqlite3.Connection, None, None]:
    """Ensure and connect to a SQLite database.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.

    Yields:
        An instance of sqlite3.Connection from sqlite3.connect
    """

@contextmanager
def ensure_open_sqlite_gz(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Generator[sqlite3.Connection, None, None]:
    """Ensure and connect to a SQLite database that's gzipped.

    Unfortunately, it's a paid feature to directly read gzipped sqlite files, so
    this automatically gunzips it first.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.

    Yields:
        An instance of sqlite3.Connection from sqlite3.connect
    """
```

## Usage Examples

### Basic Module Usage

```python
import pystow

# Create a module for your application
module = pystow.module("myapp")

# Get subdirectories
data_module = module.module("datasets")
config_module = module.module("config")

# Get file paths
data_file = data_module.join(name="data.csv")
config_file = config_module.join(name="settings.json")
```

### File Operations with Module

```python
import pystow

# Create module
module = pystow.module("myproject")

# Download files
dataset_path = module.ensure(
    "datasets", "raw",
    url="https://example.com/data.csv"
)

# Work with compressed archives
extracted_dir = module.ensure_untar(
    "archives",
    url="https://example.com/dataset.tar.gz",
    directory="dataset_v1"
)

# Custom file creation
processed_path = module.ensure_custom(
    "processed",
    name="summary.txt",
    provider=lambda path: path.write_text("Processing complete"),
    force=False
)
```

### Data Format Operations

```python
import pystow
import pandas as pd

# Create module
module = pystow.module("analytics")

# Work with DataFrames
df = module.ensure_csv(
    "raw_data",
    url="https://example.com/sales.csv"
)

# Process and save
summary_df = df.groupby('region').sum()
module.dump_df(
    "processed",
    name="regional_summary.csv",
    obj=summary_df
)

# Work with JSON
config = module.ensure_json(
    "config",
    url="https://api.example.com/settings.json"
)

# Save processed config
module.dump_json(
    "processed_config",
    name="app_config.json",
    obj=config,
    json_dump_kwargs={"indent": 2}
)
```

### Cloud Storage with Module

```python
import pystow

# Create module
module = pystow.module("research")

# Download from S3
s3_data = module.ensure_from_s3(
    "datasets", "external",
    s3_bucket="public-datasets",
    s3_key="research/dataset_v2.csv"
)

# Download from Google Drive
gdrive_model = module.ensure_from_google(
    "models", "pretrained",
    name="bert_model.tar.gz",
    file_id="1ExAmPlE_fIlE_iD_123456789"
)
```

### Module-Based Project Organization

```python
import pystow
import pandas as pd

class DataPipeline:
    """Data processing pipeline using PyStow modules"""

    def __init__(self, project_name):
        self.module = pystow.module(project_name)
        self.raw_data = self.module.module("raw_data")
        self.processed = self.module.module("processed")
        self.models = self.module.module("models")
        self.outputs = self.module.module("outputs")

    def download_data(self, url, name):
        """Download raw data"""
        return self.raw_data.ensure(url=url, name=name)

    def process_data(self, raw_file, output_name):
        """Process raw data and save"""
        df = pd.read_csv(raw_file)

        # Processing logic here
        processed_df = df.groupby('category').agg({
            'value': 'mean',
            'count': 'sum'
        }).reset_index()

        # Save processed data
        self.processed.dump_df(name=output_name, obj=processed_df)
        return self.processed.join(name=output_name)

    def save_model(self, model, name):
        """Save trained model"""
        self.models.dump_pickle(name=name, obj=model)

    def load_model(self, name):
        """Load trained model"""
        return self.models.load_pickle(name=name)

# Usage
pipeline = DataPipeline("my_ml_project")

# Download data
raw_path = pipeline.download_data(
    url="https://example.com/training_data.csv",
    name="training.csv"
)

# Process data
processed_path = pipeline.process_data(raw_path, "processed_training.csv")

# The module automatically organizes everything:
# ~/.data/my_ml_project/
# ├── raw_data/
# │   └── training.csv
# ├── processed/
# │   └── processed_training.csv
# ├── models/
# └── outputs/
```

### Advanced Module Patterns

```python
import pystow
from contextlib import contextmanager

class ConfigurableModule:
    """Module with configuration-driven behavior"""

    def __init__(self, name, config_module="config"):
        self.module = pystow.module(name)
        self.config_module = config_module

    def get_base_url(self):
        """Get base URL from configuration"""
        return pystow.get_config(self.config_module, "base_url")

    def get_api_key(self):
        """Get API key from configuration"""
        return pystow.get_config(self.config_module, "api_key")

    def download_with_auth(self, endpoint, name):
        """Download with authentication"""
        base_url = self.get_base_url()
        api_key = self.get_api_key()

        return self.module.ensure(
            url=f"{base_url}/{endpoint}",
            name=name,
            download_kwargs={
                "headers": {"Authorization": f"Bearer {api_key}"}
            }
        )

    @contextmanager
    def temp_file(self, name):
        """Context manager for temporary files"""
        temp_path = self.module.join("temp", name=name)
        try:
            yield temp_path
        finally:
            if temp_path.exists():
                temp_path.unlink()

# Usage
app_module = ConfigurableModule("myapp")

# Download with authentication
data_path = app_module.download_with_auth("data/latest.csv", "current_data.csv")

# Use temporary file
with app_module.temp_file("temp_processing.csv") as temp_path:
    # Process data using temp file
    df = pd.read_csv(data_path)
    df.to_csv(temp_path)
    # temp_path is automatically cleaned up
```