# Module Class API

The Module class provides an object-oriented interface for PyStow's directory management and file operations. It encapsulates all functionality within a specific directory context, making it ideal for organizing data within applications.

## Core Module Class

```python { .api }
class Module:
    """The class wrapping the directory lookup implementation."""

    def __init__(self, base: str | Path, ensure_exists: bool = True) -> None:
        """Initialize the module.

        Args:
            base: The base directory for the module
            ensure_exists: Should the base directory be created automatically?
                Defaults to true.
        """

    @classmethod
    def from_key(cls, key: str, *subkeys: str, ensure_exists: bool = True) -> Module:
        """Get a module for the given directory or one of its subdirectories.

        Args:
            key: The name of the module. No funny characters. The envvar <key>_HOME
                where key is uppercased is checked first before using the default home
                directory.
            subkeys: A sequence of additional strings to join. If none are given,
                returns the directory for this module.
            ensure_exists: Should all directories be created automatically? Defaults
                to true.

        Returns:
            A module
        """
```

## Directory Management Methods

```python { .api }
def module(self, *subkeys: str, ensure_exists: bool = True) -> Module:
    """Get a module for a subdirectory of the current module.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        ensure_exists: Should all directories be created automatically? Defaults
            to true.

    Returns:
        A module representing the subdirectory based on the given subkeys.
    """

def join(self, *subkeys: str, name: str | None = None, ensure_exists: bool = True, version: VersionHint = None) -> Path:
    """Get a subdirectory of the current module.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        ensure_exists: Should all directories be created automatically? Defaults
            to true.
        name: The name of the file (optional) inside the folder
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.

    Returns:
        The path of the directory or subdirectory for the given module.
    """

def joinpath_sqlite(self, *subkeys: str, name: str) -> str:
    """Get an SQLite database connection string.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The name of the database file.

    Returns:
        A SQLite path string.
    """
```

## File Download Methods

```python { .api }
def ensure(self, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a file is downloaded.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """

def ensure_custom(self, *subkeys: str, name: str, force: bool = False, provider: Provider, **kwargs: Any) -> Path:
    """Ensure a file is present, and run a custom create function otherwise.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The file name.
        force: Should the file be re-created, even if the path already exists?
        provider: The file provider. Will be run with the path as the first
            positional argument, if the file needs to be generated.
        kwargs: Additional keyword-based parameters passed to the provider.

    Returns:
        The path of the file that has been created (or already exists)

    Raises:
        ValueError: If the provider was called but the file was not created by it.
    """
```

## Archive and Compression Methods

```python { .api }
def ensure_untar(self, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a tar file is downloaded and unarchived.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        directory: Overrides the name of the directory into which the tar archive
            is extracted. If none given, will use the stem of the file name that gets
            downloaded.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        extract_kwargs: Keyword arguments to pass to tarfile.TarFile.extract_all.

    Returns:
        The path of the directory where the file that has been downloaded gets
        extracted to
    """

def ensure_gunzip(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, autoclean: bool = True, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a tar.gz file is downloaded and unarchived.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        autoclean: Should the zipped file be deleted?
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.

    Returns:
        The path of the directory where the file that has been downloaded gets
        extracted to
    """
```

## File I/O Context Managers

The Module class provides all the same context manager methods as the functional API:

- `open()` - Open files with various modes
- `open_gz()` - Open gzipped files
- `ensure_open()` - Download and open files
- `ensure_open_zip()` - Download zip and open inner files
- `ensure_open_lzma()` - Download and open LZMA files
- `ensure_open_tarfile()` - Download tar and open inner files
- `ensure_open_gz()` - Download and open gzipped files
- `ensure_open_bz2()` - Download and open BZ2 files

## Data Format Methods

The Module class provides all data format methods:

### CSV/DataFrame Methods

- `ensure_csv()` - Download CSV as DataFrame
- `load_df()` - Load existing CSV as DataFrame
- `dump_df()` - Save DataFrame to file
- `ensure_excel()` - Download Excel as DataFrame
- `ensure_tar_df()` - Extract CSV from TAR archive
- `ensure_zip_df()` - Extract CSV from ZIP archive

### JSON Methods

- `ensure_json()` - Download and parse JSON
- `ensure_json_bz2()` - Download compressed JSON
- `load_json()` - Load existing JSON file
- `dump_json()` - Save object as JSON

### XML Methods

- `ensure_xml()` - Download and parse XML
- `ensure_tar_xml()` - Extract XML from TAR archive
- `load_xml()` - Load existing XML file
- `dump_xml()` - Save XML ElementTree

### RDF Methods

- `ensure_rdf()` - Download and parse RDF with caching
- `load_rdf()` - Load existing RDF file
- `dump_rdf()` - Save RDF graph

### Pickle Methods

- `ensure_pickle()` - Download and load pickle
- `ensure_pickle_gz()` - Download compressed pickle
- `load_pickle()` - Load existing pickle
- `load_pickle_gz()` - Load compressed pickle
- `dump_pickle()` - Save object as pickle

### NumPy Methods

- `ensure_zip_np()` - Load NumPy array from ZIP
## Cloud Storage Methods

```python { .api }
def ensure_from_s3(self, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, client: botocore.client.BaseClient | None = None, client_kwargs: Mapping[str, Any] | None = None, download_file_kwargs: Mapping[str, Any] | None = None, force: bool = False) -> Path:
    """Ensure a file is downloaded from AWS S3.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        s3_bucket: The S3 bucket name
        s3_key: The S3 key name
        name: Overrides the name of the file at the end of the S3 key, if given.
        client: A botocore client. If none given, one will be created
            automatically
        client_kwargs: Keyword arguments to be passed to the client on
            instantiation.
        download_file_kwargs: Keyword arguments to be passed to
            boto3.s3.transfer.S3Transfer.download_file
        force: Should the download be done again, even if the path already
            exists? Defaults to false.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """

def ensure_from_google(self, *subkeys: str, name: str, file_id: str, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:
    """Ensure a file is downloaded from Google Drive.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        name: The name of the file
        file_id: The file identifier of the Google file. If your share link is
            https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then
            your file ID is 1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to
            pystow.utils.download_from_google.

    Returns:
        The path of the file that has been downloaded (or already exists)
    """
```

## Database Methods

```python { .api }
@contextmanager
def ensure_open_sqlite(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Generator[sqlite3.Connection, None, None]:
    """Ensure and connect to a SQLite database.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.

    Yields:
        An instance of sqlite3.Connection from sqlite3.connect
    """

@contextmanager
def ensure_open_sqlite_gz(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Generator[sqlite3.Connection, None, None]:
    """Ensure and connect to a SQLite database that's gzipped.

    Unfortunately, it's a paid feature to directly read gzipped sqlite files, so
    this automatically gunzips it first.

    Args:
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.

    Yields:
        An instance of sqlite3.Connection from sqlite3.connect
    """
```

## Usage Examples

### Basic Module Usage

```python
import pystow

# Create a module for your application
module = pystow.module("myapp")

# Get subdirectories
data_module = module.module("datasets")
config_module = module.module("config")

# Get file paths
data_file = data_module.join(name="data.csv")
config_file = config_module.join(name="settings.json")
```

### File Operations with Module

```python
import pystow

# Create module
module = pystow.module("myproject")

# Download files
dataset_path = module.ensure(
    "datasets", "raw",
    url="https://example.com/data.csv"
)

# Work with compressed archives
extracted_dir = module.ensure_untar(
    "archives",
    url="https://example.com/dataset.tar.gz",
    directory="dataset_v1"
)

# Custom file creation
processed_path = module.ensure_custom(
    "processed",
    name="summary.txt",
    provider=lambda path: path.write_text("Processing complete"),
    force=False
)
```

### Data Format Operations

```python
import pystow
import pandas as pd

# Create module
module = pystow.module("analytics")

# Work with DataFrames
df = module.ensure_csv(
    "raw_data",
    url="https://example.com/sales.csv"
)

# Process and save
summary_df = df.groupby('region').sum()
module.dump_df(
    "processed",
    name="regional_summary.csv",
    obj=summary_df
)

# Work with JSON
config = module.ensure_json(
    "config",
    url="https://api.example.com/settings.json"
)

# Save processed config
module.dump_json(
    "processed_config",
    name="app_config.json",
    obj=config,
    json_dump_kwargs={"indent": 2}
)
```

### Cloud Storage with Module

```python
import pystow

# Create module
module = pystow.module("research")

# Download from S3
s3_data = module.ensure_from_s3(
    "datasets", "external",
    s3_bucket="public-datasets",
    s3_key="research/dataset_v2.csv"
)

# Download from Google Drive
gdrive_model = module.ensure_from_google(
    "models", "pretrained",
    name="bert_model.tar.gz",
    file_id="1ExAmPlE_fIlE_iD_123456789"
)
```

### Module-Based Project Organization

```python
import pystow
import pandas as pd

class DataPipeline:
    """Data processing pipeline using PyStow modules"""

    def __init__(self, project_name):
        self.module = pystow.module(project_name)
        self.raw_data = self.module.module("raw_data")
        self.processed = self.module.module("processed")
        self.models = self.module.module("models")
        self.outputs = self.module.module("outputs")

    def download_data(self, url, name):
        """Download raw data"""
        return self.raw_data.ensure(url=url, name=name)

    def process_data(self, raw_file, output_name):
        """Process raw data and save"""
        df = pd.read_csv(raw_file)

        # Processing logic here
        processed_df = df.groupby('category').agg({
            'value': 'mean',
            'count': 'sum'
        }).reset_index()

        # Save processed data
        self.processed.dump_df(name=output_name, obj=processed_df)
        return self.processed.join(name=output_name)

    def save_model(self, model, name):
        """Save trained model"""
        self.models.dump_pickle(name=name, obj=model)

    def load_model(self, name):
        """Load trained model"""
        return self.models.load_pickle(name=name)

# Usage
pipeline = DataPipeline("my_ml_project")

# Download data
raw_path = pipeline.download_data(
    url="https://example.com/training_data.csv",
    name="training.csv"
)

# Process data
processed_path = pipeline.process_data(raw_path, "processed_training.csv")

# The module automatically organizes everything:
# ~/.data/my_ml_project/
# ├── raw_data/
# │   └── training.csv
# ├── processed/
# │   └── processed_training.csv
# ├── models/
# └── outputs/
```

### Advanced Module Patterns

```python
import pystow
from contextlib import contextmanager

class ConfigurableModule:
    """Module with configuration-driven behavior"""

    def __init__(self, name, config_module="config"):
        self.module = pystow.module(name)
        self.config_module = config_module

    def get_base_url(self):
        """Get base URL from configuration"""
        return pystow.get_config(self.config_module, "base_url")

    def get_api_key(self):
        """Get API key from configuration"""
        return pystow.get_config(self.config_module, "api_key")

    def download_with_auth(self, endpoint, name):
        """Download with authentication"""
        base_url = self.get_base_url()
        api_key = self.get_api_key()

        return self.module.ensure(
            url=f"{base_url}/{endpoint}",
            name=name,
            download_kwargs={
                "headers": {"Authorization": f"Bearer {api_key}"}
            }
        )

    @contextmanager
    def temp_file(self, name):
        """Context manager for temporary files"""
        temp_path = self.module.join("temp", name=name)
        try:
            yield temp_path
        finally:
            if temp_path.exists():
                temp_path.unlink()

# Usage
app_module = ConfigurableModule("myapp")

# Download with authentication
data_path = app_module.download_with_auth("data/latest.csv", "current_data.csv")

# Use temporary file
with app_module.temp_file("temp_processing.csv") as temp_path:
    # Process data using temp file
    df = pd.read_csv(data_path)
    df.to_csv(temp_path)
    # temp_path is automatically cleaned up
```