or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

archives.md · cloud-storage.md · configuration.md · data-formats.md · directory-management.md · file-operations.md · index.md · module-class.md · nltk-integration.md · web-scraping.md

docs/module-class.md

0

# Module Class API

1

2

The Module class provides an object-oriented interface for PyStow's directory management and file operations. It encapsulates all functionality within a specific directory context, making it ideal for organizing data within applications.

3

4

## Core Module Class

5

6

```python { .api }

7

class Module:

8

"""The class wrapping the directory lookup implementation."""

9

10

def __init__(self, base: str | Path, ensure_exists: bool = True) -> None:

11

"""Initialize the module.

12

13

Args:

14

base: The base directory for the module

15

ensure_exists: Should the base directory be created automatically?

16

Defaults to true.

17

"""

18

19

@classmethod

20

def from_key(cls, key: str, *subkeys: str, ensure_exists: bool = True) -> Module:

21

"""Get a module for the given directory or one of its subdirectories.

22

23

Args:

24

key: The name of the module. No funny characters. The envvar <key>_HOME

25

where key is uppercased is checked first before using the default home

26

directory.

27

subkeys: A sequence of additional strings to join. If none are given,

28

returns the directory for this module.

29

ensure_exists: Should all directories be created automatically? Defaults

30

to true.

31

32

Returns:

33

A module

34

"""

35

```

36

37

## Directory Management Methods

38

39

```python { .api }

40

def module(self, *subkeys: str, ensure_exists: bool = True) -> Module:

41

"""Get a module for a subdirectory of the current module.

42

43

Args:

44

subkeys: A sequence of additional strings to join. If none are given,

45

returns the directory for this module.

46

ensure_exists: Should all directories be created automatically? Defaults

47

to true.

48

49

Returns:

50

A module representing the subdirectory based on the given subkeys.

51

"""

52

53

def join(self, *subkeys: str, name: str | None = None, ensure_exists: bool = True, version: VersionHint = None) -> Path:

54

"""Get a subdirectory of the current module.

55

56

Args:

57

subkeys: A sequence of additional strings to join. If none are given,

58

returns the directory for this module.

59

ensure_exists: Should all directories be created automatically? Defaults

60

to true.

61

name: The name of the file (optional) inside the folder

62

version: The optional version, or no-argument callable that returns an

63

optional version. This is prepended before the subkeys.

64

65

Returns:

66

The path of the directory or subdirectory for the given module.

67

"""

68

69

def joinpath_sqlite(self, *subkeys: str, name: str) -> str:

70

"""Get an SQLite database connection string.

71

72

Args:

73

subkeys: A sequence of additional strings to join. If none are given,

74

returns the directory for this module.

75

name: The name of the database file.

76

77

Returns:

78

A SQLite path string.

79

"""

80

```

81

82

## File Download Methods

83

84

```python { .api }

85

def ensure(self, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:

86

"""Ensure a file is downloaded.

87

88

Args:

89

subkeys: A sequence of additional strings to join. If none are given,

90

returns the directory for this module.

91

url: The URL to download.

92

name: Overrides the name of the file at the end of the URL, if given.

93

Also useful for URLs that don't have proper filenames with extensions.

94

version: The optional version, or no-argument callable that returns an

95

optional version. This is prepended before the subkeys.

96

force: Should the download be done again, even if the path already

97

exists? Defaults to false.

98

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

99

100

Returns:

101

The path of the file that has been downloaded (or already exists)

102

"""

103

104

def ensure_custom(self, *subkeys: str, name: str, force: bool = False, provider: Provider, **kwargs: Any) -> Path:

105

"""Ensure a file is present, and run a custom create function otherwise.

106

107

Args:

108

subkeys: A sequence of additional strings to join. If none are given,

109

returns the directory for this module.

110

name: The file name.

111

force: Should the file be re-created, even if the path already exists?

112

provider: The file provider. Will be run with the path as the first

113

positional argument, if the file needs to be generated.

114

kwargs: Additional keyword-based parameters passed to the provider.

115

116

Returns:

117

The path of the file that has been created (or already exists)

118

119

Raises:

120

ValueError: If the provider was called but the file was not created by it.

121

"""

122

```

123

124

## Archive and Compression Methods

125

126

```python { .api }

127

def ensure_untar(self, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:

128

"""Ensure a tar file is downloaded and unarchived.

129

130

Args:

131

subkeys: A sequence of additional strings to join. If none are given,

132

returns the directory for this module.

133

url: The URL to download.

134

name: Overrides the name of the file at the end of the URL, if given.

135

Also useful for URLs that don't have proper filenames with extensions.

136

directory: Overrides the name of the directory into which the tar archive

137

is extracted. If none given, will use the stem of the file name that gets

138

downloaded.

139

force: Should the download be done again, even if the path already

140

exists? Defaults to false.

141

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

142

extract_kwargs: Keyword arguments to pass to tarfile.TarFile.extractall.

143

144

Returns:

145

The path of the directory where the file that has been downloaded gets

146

extracted to

147

"""

148

149

def ensure_gunzip(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, autoclean: bool = True, download_kwargs: Mapping[str, Any] | None = None) -> Path:

150

"""Ensure a tar.gz file is downloaded and unarchived.

151

152

Args:

153

subkeys: A sequence of additional strings to join. If none are given,

154

returns the directory for this module.

155

url: The URL to download.

156

name: Overrides the name of the file at the end of the URL, if given.

157

Also useful for URLs that don't have proper filenames with extensions.

158

force: Should the download be done again, even if the path already

159

exists? Defaults to false.

160

autoclean: Should the zipped file be deleted?

161

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

162

163

Returns:

164

The path of the file that has been downloaded and

165

gunzipped

166

"""

167

```

168

169

## File I/O Context Managers

170

171

The Module class provides all the same context manager methods as the functional API:

172

173

- `open()` - Open files with various modes

174

- `open_gz()` - Open gzipped files

175

- `ensure_open()` - Download and open files

176

- `ensure_open_zip()` - Download zip and open inner files

177

- `ensure_open_lzma()` - Download and open LZMA files

178

- `ensure_open_tarfile()` - Download tar and open inner files

179

- `ensure_open_gz()` - Download and open gzipped files

180

- `ensure_open_bz2()` - Download and open BZ2 files

181

182

## Data Format Methods

183

184

The Module class provides all data format methods:

185

186

### CSV/DataFrame Methods

187

- `ensure_csv()` - Download CSV as DataFrame

188

- `load_df()` - Load existing CSV as DataFrame

189

- `dump_df()` - Save DataFrame to file

190

- `ensure_excel()` - Download Excel as DataFrame

191

- `ensure_tar_df()` - Extract CSV from TAR archive

192

- `ensure_zip_df()` - Extract CSV from ZIP archive

193

194

### JSON Methods

195

- `ensure_json()` - Download and parse JSON

196

- `ensure_json_bz2()` - Download compressed JSON

197

- `load_json()` - Load existing JSON file

198

- `dump_json()` - Save object as JSON

199

200

### XML Methods

201

- `ensure_xml()` - Download and parse XML

202

- `ensure_tar_xml()` - Extract XML from TAR archive

203

- `load_xml()` - Load existing XML file

204

- `dump_xml()` - Save XML ElementTree

205

206

### RDF Methods

207

- `ensure_rdf()` - Download and parse RDF with caching

208

- `load_rdf()` - Load existing RDF file

209

- `dump_rdf()` - Save RDF graph

210

211

### Pickle Methods

212

- `ensure_pickle()` - Download and load pickle

213

- `ensure_pickle_gz()` - Download compressed pickle

214

- `load_pickle()` - Load existing pickle

215

- `load_pickle_gz()` - Load compressed pickle

216

- `dump_pickle()` - Save object as pickle

217

218

### NumPy Methods

219

- `ensure_zip_np()` - Load NumPy array from ZIP

220

221

## Cloud Storage Methods

222

223

```python { .api }

224

def ensure_from_s3(self, *subkeys: str, s3_bucket: str, s3_key: str | Sequence[str], name: str | None = None, client: botocore.client.BaseClient | None = None, client_kwargs: Mapping[str, Any] | None = None, download_file_kwargs: Mapping[str, Any] | None = None, force: bool = False) -> Path:

225

"""Ensure a file is downloaded from AWS S3.

226

227

Args:

228

subkeys: A sequence of additional strings to join. If none are given,

229

returns the directory for this module.

230

s3_bucket: The S3 bucket name

231

s3_key: The S3 key name

232

name: Overrides the name of the file at the end of the S3 key, if given.

233

client: A botocore client. If none given, one will be created

234

automatically

235

client_kwargs: Keyword arguments to be passed to the client on

236

instantiation.

237

download_file_kwargs: Keyword arguments to be passed to

238

boto3.s3.transfer.S3Transfer.download_file

239

force: Should the download be done again, even if the path already

240

exists? Defaults to false.

241

242

Returns:

243

The path of the file that has been downloaded (or already exists)

244

"""

245

246

def ensure_from_google(self, *subkeys: str, name: str, file_id: str, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Path:

247

"""Ensure a file is downloaded from Google Drive.

248

249

Args:

250

subkeys: A sequence of additional strings to join. If none are given,

251

returns the directory for this module.

252

name: The name of the file

253

file_id: The file identifier of the Google file. If your share link is

254

https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then

255

your file ID is 1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z.

256

force: Should the download be done again, even if the path already

257

exists? Defaults to false.

258

download_kwargs: Keyword arguments to pass through to

259

pystow.utils.download_from_google.

260

261

Returns:

262

The path of the file that has been downloaded (or already exists)

263

"""

264

```

265

266

## Database Methods

267

268

```python { .api }

269

@contextmanager

270

def ensure_open_sqlite(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Generator[sqlite3.Connection, None, None]:

271

"""Ensure and connect to a SQLite database.

272

273

Args:

274

subkeys: A sequence of additional strings to join. If none are given,

275

returns the directory for this module.

276

url: The URL to download.

277

name: Overrides the name of the file at the end of the URL, if given.

278

Also useful for URLs that don't have proper filenames with extensions.

279

force: Should the download be done again, even if the path already

280

exists? Defaults to false.

281

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

282

283

Yields:

284

An instance of sqlite3.Connection from sqlite3.connect

285

"""

286

287

@contextmanager

288

def ensure_open_sqlite_gz(self, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None) -> Generator[sqlite3.Connection, None, None]:

289

"""Ensure and connect to a SQLite database that's gzipped.

290

291

Unfortunately, it's a paid feature to directly read gzipped sqlite files, so

292

this automatically gunzips it first.

293

294

Args:

295

subkeys: A sequence of additional strings to join. If none are given,

296

returns the directory for this module.

297

url: The URL to download.

298

name: Overrides the name of the file at the end of the URL, if given.

299

Also useful for URLs that don't have proper filenames with extensions.

300

force: Should the download be done again, even if the path already

301

exists? Defaults to false.

302

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

303

304

Yields:

305

An instance of sqlite3.Connection from sqlite3.connect

306

"""

307

```

308

309

## Usage Examples

310

311

### Basic Module Usage

312

313

```python

314

import pystow

315

316

# Create a module for your application

317

module = pystow.module("myapp")

318

319

# Get subdirectories

320

data_module = module.module("datasets")

321

config_module = module.module("config")

322

323

# Get file paths

324

data_file = data_module.join(name="data.csv")

325

config_file = config_module.join(name="settings.json")

326

```

327

328

### File Operations with Module

329

330

```python

331

import pystow

332

333

# Create module

334

module = pystow.module("myproject")

335

336

# Download files

337

dataset_path = module.ensure(

338

"datasets", "raw",

339

url="https://example.com/data.csv"

340

)

341

342

# Work with compressed archives

343

extracted_dir = module.ensure_untar(

344

"archives",

345

url="https://example.com/dataset.tar.gz",

346

directory="dataset_v1"

347

)

348

349

# Custom file creation

350

processed_path = module.ensure_custom(

351

"processed",

352

name="summary.txt",

353

provider=lambda path: path.write_text("Processing complete"),

354

force=False

355

)

356

```

357

358

### Data Format Operations

359

360

```python

361

import pystow

362

import pandas as pd

363

364

# Create module

365

module = pystow.module("analytics")

366

367

# Work with DataFrames

368

df = module.ensure_csv(

369

"raw_data",

370

url="https://example.com/sales.csv"

371

)

372

373

# Process and save

374

summary_df = df.groupby('region').sum()

375

module.dump_df(

376

"processed",

377

name="regional_summary.csv",

378

obj=summary_df

379

)

380

381

# Work with JSON

382

config = module.ensure_json(

383

"config",

384

url="https://api.example.com/settings.json"

385

)

386

387

# Save processed config

388

module.dump_json(

389

"processed_config",

390

name="app_config.json",

391

obj=config,

392

json_dump_kwargs={"indent": 2}

393

)

394

```

395

396

### Cloud Storage with Module

397

398

```python

399

import pystow

400

401

# Create module

402

module = pystow.module("research")

403

404

# Download from S3

405

s3_data = module.ensure_from_s3(

406

"datasets", "external",

407

s3_bucket="public-datasets",

408

s3_key="research/dataset_v2.csv"

409

)

410

411

# Download from Google Drive

412

gdrive_model = module.ensure_from_google(

413

"models", "pretrained",

414

name="bert_model.tar.gz",

415

file_id="1ExAmPlE_fIlE_iD_123456789"

416

)

417

```

418

419

### Module-Based Project Organization

420

421

```python

422

import pystow

423

import pandas as pd

424

425

class DataPipeline:

426

"""Data processing pipeline using PyStow modules"""

427

428

def __init__(self, project_name):

429

self.module = pystow.module(project_name)

430

self.raw_data = self.module.module("raw_data")

431

self.processed = self.module.module("processed")

432

self.models = self.module.module("models")

433

self.outputs = self.module.module("outputs")

434

435

def download_data(self, url, name):

436

"""Download raw data"""

437

return self.raw_data.ensure(url=url, name=name)

438

439

def process_data(self, raw_file, output_name):

440

"""Process raw data and save"""

441

df = pd.read_csv(raw_file)

442

443

# Processing logic here

444

processed_df = df.groupby('category').agg({

445

'value': 'mean',

446

'count': 'sum'

447

}).reset_index()

448

449

# Save processed data

450

self.processed.dump_df(name=output_name, obj=processed_df)

451

return self.processed.join(name=output_name)

452

453

def save_model(self, model, name):

454

"""Save trained model"""

455

self.models.dump_pickle(name=name, obj=model)

456

457

def load_model(self, name):

458

"""Load trained model"""

459

return self.models.load_pickle(name=name)

460

461

# Usage

462

pipeline = DataPipeline("my_ml_project")

463

464

# Download data

465

raw_path = pipeline.download_data(

466

url="https://example.com/training_data.csv",

467

name="training.csv"

468

)

469

470

# Process data

471

processed_path = pipeline.process_data(raw_path, "processed_training.csv")

472

473

# The module automatically organizes everything:

474

# ~/.data/my_ml_project/

475

# ├── raw_data/

476

# │ └── training.csv

477

# ├── processed/

478

# │ └── processed_training.csv

479

# ├── models/

480

# └── outputs/

481

```

482

483

### Advanced Module Patterns

484

485

```python

486

import pystow
import pandas as pd

487

from contextlib import contextmanager

488

489

class ConfigurableModule:

490

"""Module with configuration-driven behavior"""

491

492

def __init__(self, name, config_module="config"):

493

self.module = pystow.module(name)

494

self.config_module = config_module

495

496

def get_base_url(self):

497

"""Get base URL from configuration"""

498

return pystow.get_config(self.config_module, "base_url")

499

500

def get_api_key(self):

501

"""Get API key from configuration"""

502

return pystow.get_config(self.config_module, "api_key")

503

504

def download_with_auth(self, endpoint, name):

505

"""Download with authentication"""

506

base_url = self.get_base_url()

507

api_key = self.get_api_key()

508

509

return self.module.ensure(

510

url=f"{base_url}/{endpoint}",

511

name=name,

512

download_kwargs={

513

"headers": {"Authorization": f"Bearer {api_key}"}

514

}

515

)

516

517

@contextmanager

518

def temp_file(self, name):

519

"""Context manager for temporary files"""

520

temp_path = self.module.join("temp", name=name)

521

try:

522

yield temp_path

523

finally:

524

if temp_path.exists():

525

temp_path.unlink()

526

527

# Usage

528

app_module = ConfigurableModule("myapp")

529

530

# Download with authentication

531

data_path = app_module.download_with_auth("data/latest.csv", "current_data.csv")

532

533

# Use temporary file

534

with app_module.temp_file("temp_processing.csv") as temp_path:

535

# Process data using temp file

536

df = pd.read_csv(data_path)

537

df.to_csv(temp_path)

538

# temp_path is automatically cleaned up

539

```