or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

archives.md cloud-storage.md configuration.md data-formats.md directory-management.md file-operations.md index.md module-class.md nltk-integration.md web-scraping.md

archives.md docs/

0

# Archive and Compression Support

1

2

PyStow provides comprehensive support for compressed archives and files, including ZIP, TAR, GZIP, LZMA, and BZ2 formats. It can automatically extract archives, access files within archives, and handle various compression formats transparently.

3

4

## Archive Extraction

5

6

### TAR Archive Extraction

7

8

```python { .api }

9

def ensure_untar(key: str, *subkeys: str, url: str, name: str | None = None, directory: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, extract_kwargs: Mapping[str, Any] | None = None) -> Path:

10

"""Ensure a file is downloaded and untarred.

11

12

Args:

13

key: The name of the module. No funny characters. The envvar <key>_HOME where

14

key is uppercased is checked first before using the default home directory.

15

subkeys: A sequence of additional strings to join. If none are given, returns

16

the directory for this module.

17

url: The URL to download.

18

name: Overrides the name of the file at the end of the URL, if given. Also

19

useful for URLs that don't have proper filenames with extensions.

20

directory: Overrides the name of the directory into which the tar archive is

21

extracted. If none given, will use the stem of the file name that gets

22

downloaded.

23

force: Should the download be done again, even if the path already exists?

24

Defaults to false.

25

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

26

extract_kwargs: Keyword arguments to pass to tarfile.TarFile.extractall.

27

28

Returns:

29

The path of the directory where the file that has been downloaded gets

30

extracted to

31

"""

32

```

33

34

### GZIP Decompression

35

36

```python { .api }

37

def ensure_gunzip(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, autoclean: bool = True, download_kwargs: Mapping[str, Any] | None = None) -> Path:

38

"""Ensure a file is downloaded and gunzipped.

39

40

Args:

41

key: The name of the module. No funny characters. The envvar <key>_HOME where

42

key is uppercased is checked first before using the default home directory.

43

subkeys: A sequence of additional strings to join. If none are given, returns

44

the directory for this module.

45

url: The URL to download.

46

name: Overrides the name of the file at the end of the URL, if given. Also

47

useful for URLs that don't have proper filenames with extensions.

48

force: Should the download be done again, even if the path already exists?

49

Defaults to false.

50

autoclean: Should the gzipped file be deleted after it has been decompressed? Defaults to true.

51

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

52

53

Returns:

54

The path of the decompressed file that results from gunzipping the

55

downloaded file

56

"""

57

```

58

59

## Compressed Archive Access

60

61

### ZIP File Access

62

63

```python { .api }

64

@contextmanager

65

def ensure_open_zip(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: str = "r", open_kwargs: Mapping[str, Any] | None = None) -> BytesOpener:

66

"""Ensure a file is downloaded then open it with zipfile.

67

68

Args:

69

key: The name of the module. No funny characters. The envvar <key>_HOME

70

where key is uppercased is checked first before using the default home

71

directory.

72

subkeys: A sequence of additional strings to join. If none are given, returns

73

the directory for this module.

74

url: The URL to download.

75

inner_path: The relative path to the file inside the archive

76

name: Overrides the name of the file at the end of the URL, if given. Also

77

useful for URLs that don't have proper filenames with extensions.

78

force: Should the download be done again, even if the path already exists?

79

Defaults to false.

80

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

81

mode: The read mode, passed to zipfile.ZipFile.open

82

open_kwargs: Additional keyword arguments passed to zipfile.ZipFile.open

83

84

Yields:

85

An open file object

86

"""

87

```

88

89

### TAR File Access

90

91

```python { .api }

92

@contextmanager

93

def ensure_open_tarfile(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: str = "r", open_kwargs: Mapping[str, Any] | None = None) -> BytesOpener:

94

"""Ensure a tar file is downloaded and open a file inside it.

95

96

Args:

97

key: The name of the module. No funny characters. The envvar <key>_HOME

98

where key is uppercased is checked first before using the default home

99

directory.

100

subkeys: A sequence of additional strings to join. If none are given, returns

101

the directory for this module.

102

url: The URL to download.

103

inner_path: The relative path to the file inside the archive

104

name: Overrides the name of the file at the end of the URL, if given. Also

105

useful for URLs that don't have proper filenames with extensions.

106

force: Should the download be done again, even if the path already exists?

107

Defaults to false.

108

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

109

mode: The read mode, passed to tarfile.open

110

open_kwargs: Additional keyword arguments passed to tarfile.open

111

112

Yields:

113

An open file object

114

"""

115

```

116

117

## Compression Format Support

118

119

### GZIP Files

120

121

```python { .api }

122

@contextmanager

123

def ensure_open_gz(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rb", "w", "wb", "rt", "wt"] = "rb", open_kwargs: Mapping[str, Any] | None = None) -> Generator[StringIO | BytesIO, None, None]:

124

"""Ensure a gzipped file is downloaded and open a file inside it.

125

126

Args:

127

key: The name of the module. No funny characters. The envvar <key>_HOME

128

where key is uppercased is checked first before using the default home

129

directory.

130

subkeys: A sequence of additional strings to join. If none are given, returns

131

the directory for this module.

132

url: The URL to download.

133

name: Overrides the name of the file at the end of the URL, if given. Also

134

useful for URLs that don't have proper filenames with extensions.

135

force: Should the download be done again, even if the path already exists?

136

Defaults to false.

137

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

138

mode: The read mode, passed to gzip.open

139

open_kwargs: Additional keyword arguments passed to gzip.open

140

141

Yields:

142

An open file object

143

"""

144

```

145

146

### LZMA Files

147

148

```python { .api }

149

@contextmanager

150

def ensure_open_lzma(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["r", "rb", "w", "wb", "rt", "wt"] = "rt", open_kwargs: Mapping[str, Any] | None = None) -> Generator[lzma.LZMAFile | io.TextIOWrapper[lzma.LZMAFile], None, None]:

151

"""Ensure an LZMA-compressed file is downloaded and open a file inside it.

152

153

Args:

154

key: The name of the module. No funny characters. The envvar <key>_HOME

155

where key is uppercased is checked first before using the default home

156

directory.

157

subkeys: A sequence of additional strings to join. If none are given, returns

158

the directory for this module.

159

url: The URL to download.

160

name: Overrides the name of the file at the end of the URL, if given. Also

161

useful for URLs that don't have proper filenames with extensions.

162

force: Should the download be done again, even if the path already exists?

163

Defaults to false.

164

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

165

mode: The read mode, passed to lzma.open

166

open_kwargs: Additional keyword arguments passed to lzma.open

167

168

Yields:

169

An open file object

170

"""

171

```

172

173

### BZ2 Files

174

175

```python { .api }

176

@contextmanager

177

def ensure_open_bz2(key: str, *subkeys: str, url: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, mode: Literal["rb"] = "rb", open_kwargs: Mapping[str, Any] | None = None) -> Generator[bz2.BZ2File, None, None]:

178

"""Ensure a BZ2-compressed file is downloaded and open a file inside it.

179

180

Args:

181

key: The name of the module. No funny characters. The envvar <key>_HOME

182

where key is uppercased is checked first before using the default home

183

directory.

184

subkeys: A sequence of additional strings to join. If none are given, returns

185

the directory for this module.

186

url: The URL to download.

187

name: Overrides the name of the file at the end of the URL, if given. Also

188

useful for URLs that don't have proper filenames with extensions.

189

force: Should the download be done again, even if the path already exists?

190

Defaults to false.

191

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

192

mode: The read mode, passed to bz2.open

193

open_kwargs: Additional keyword arguments passed to bz2.open

194

195

Yields:

196

An open file object

197

"""

198

```

199

200

## Archive Data Format Support

201

202

### CSV from Archives

203

204

```python { .api }

205

def ensure_zip_df(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:

206

"""Download a zip file and open an inner file as a dataframe with pandas.

207

208

Args:

209

key: The module name

210

subkeys: A sequence of additional strings to join. If none are given, returns

211

the directory for this module.

212

url: The URL to download.

213

inner_path: The relative path to the file inside the archive

214

name: Overrides the name of the file at the end of the URL, if given. Also

215

useful for URLs that don't have proper filenames with extensions.

216

force: Should the download be done again, even if the path already exists?

217

Defaults to false.

218

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

219

read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.

220

221

Returns:

222

A pandas DataFrame

223

"""

224

225

def ensure_tar_df(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, read_csv_kwargs: Mapping[str, Any] | None = None) -> pd.DataFrame:

226

"""Download a tar file and open an inner file as a dataframe with pandas.

227

228

Args:

229

key: The module name

230

subkeys: A sequence of additional strings to join. If none are given, returns

231

the directory for this module.

232

url: The URL to download.

233

inner_path: The relative path to the file inside the archive

234

name: Overrides the name of the file at the end of the URL, if given. Also

235

useful for URLs that don't have proper filenames with extensions.

236

force: Should the download be done again, even if the path already exists?

237

Defaults to false.

238

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

239

read_csv_kwargs: Keyword arguments to pass through to pandas.read_csv.

240

241

Returns:

242

A pandas DataFrame

243

"""

244

```

245

246

### XML from Archives

247

248

```python { .api }

249

def ensure_tar_xml(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, parse_kwargs: Mapping[str, Any] | None = None) -> lxml.etree.ElementTree:

250

"""Download a tar file and parse an inner file as XML with lxml.

251

252

Args:

253

key: The module name

254

subkeys: A sequence of additional strings to join. If none are given, returns

255

the directory for this module.

256

url: The URL to download.

257

inner_path: The relative path to the file inside the archive

258

name: Overrides the name of the file at the end of the URL, if given. Also

259

useful for URLs that don't have proper filenames with extensions.

260

force: Should the download be done again, even if the path already exists?

261

Defaults to false.

262

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

263

parse_kwargs: Keyword arguments to pass through to lxml.etree.parse.

264

265

Returns:

266

An ElementTree object

267

"""

268

```

269

270

### NumPy Arrays from Archives

271

272

```python { .api }

273

def ensure_zip_np(key: str, *subkeys: str, url: str, inner_path: str, name: str | None = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, load_kwargs: Mapping[str, Any] | None = None) -> numpy.typing.ArrayLike:

274

"""Download a zip file and open an inner file as an array-like with numpy.

275

276

Args:

277

key: The module name

278

subkeys: A sequence of additional strings to join. If none are given, returns

279

the directory for this module.

280

url: The URL to download.

281

inner_path: The relative path to the file inside the archive

282

name: Overrides the name of the file at the end of the URL, if given. Also

283

useful for URLs that don't have proper filenames with extensions.

284

force: Should the download be done again, even if the path already exists?

285

Defaults to false.

286

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

287

load_kwargs: Additional keyword arguments that are passed through to

288

read_zip_np and transitively to numpy.load.

289

290

Returns:

291

An array-like object

292

"""

293

```

294

295

## Usage Examples

296

297

### TAR Archive Extraction

298

299

```python

300

import pystow

301

302

# Download and extract tar archive

303

extracted_dir = pystow.ensure_untar(

304

"myapp", "datasets",

305

url="https://example.com/dataset.tar.gz",

306

directory="dataset_v1" # Custom extraction directory name

307

)

308

309

# Access extracted files

310

data_file = extracted_dir / "data" / "train.csv"

311

```

312

313

### GZIP Decompression

314

315

```python

316

import pystow

317

318

# Download and decompress gzipped file

319

decompressed_file = pystow.ensure_gunzip(

320

"myapp", "data",

321

url="https://example.com/large_file.txt.gz",

322

autoclean=True # Remove .gz file after decompression

323

)

324

325

# Read decompressed content

326

content = decompressed_file.read_text()

327

```

328

329

### Working with ZIP Archives

330

331

```python

332

import pystow

333

334

# Access file inside ZIP archive without extraction

335

with pystow.ensure_open_zip(

336

"myapp", "archives",

337

url="https://example.com/data.zip",

338

inner_path="data/file.txt"

339

) as file:

340

content = file.read()

341

342

# Extract DataFrame from CSV inside ZIP

343

df = pystow.ensure_zip_df(

344

"myapp", "datasets",

345

url="https://example.com/dataset.zip",

346

inner_path="dataset/train.csv",

347

read_csv_kwargs={"sep": ","}

348

)

349

350

# Load NumPy array from ZIP

351

array = pystow.ensure_zip_np(

352

"myapp", "arrays",

353

url="https://example.com/arrays.zip",

354

inner_path="data.npy"

355

)

356

```

357

358

### Working with TAR Archives

359

360

```python

361

import pystow

362

363

# Access file inside TAR archive

364

with pystow.ensure_open_tarfile(

365

"myapp", "archives",

366

url="https://example.com/data.tar.gz",

367

inner_path="data/config.json"

368

) as file:

369

import json

370

config = json.load(file)

371

372

# Extract DataFrame from TAR

373

df = pystow.ensure_tar_df(

374

"myapp", "datasets",

375

url="https://example.com/dataset.tar.bz2",

376

inner_path="dataset/data.csv"

377

)

378

379

# Parse XML from TAR

380

tree = pystow.ensure_tar_xml(

381

"myapp", "documents",

382

url="https://example.com/docs.tar.gz",

383

inner_path="docs/schema.xml"

384

)

385

```

386

387

### Compressed File Formats

388

389

```python

390

import pystow

391

392

# Work with GZIP files

393

with pystow.ensure_open_gz(

394

"myapp", "logs",

395

url="https://example.com/logfile.log.gz",

396

mode="rt" # Text mode

397

) as file:

398

lines = file.readlines()

399

400

# Work with LZMA/XZ files

401

with pystow.ensure_open_lzma(

402

"myapp", "compressed",

403

url="https://example.com/data.txt.xz",

404

mode="rt"

405

) as file:

406

data = file.read()

407

408

# Work with BZ2 files

409

with pystow.ensure_open_bz2(

410

"myapp", "compressed",

411

url="https://example.com/data.bz2",

412

mode="rb"

413

) as file:

414

binary_data = file.read()

415

```

416

417

### Compressed Data Formats

418

419

```python

420

import pystow

421

422

# Load gzipped pickle

423

model = pystow.ensure_pickle_gz(

424

"myapp", "models",

425

url="https://example.com/model.pkl.gz"

426

)

427

428

# Load BZ2-compressed JSON

429

data = pystow.ensure_json_bz2(

430

"myapp", "data",

431

url="https://api.example.com/large_dataset.json.bz2"

432

)

433

434

# Save a pickle (written uncompressed as a plain .pkl file)

435

pystow.module("myapp").dump_pickle(

436

"cache",

437

name="processed_data.pkl",

438

obj=large_data_structure

439

)

440

# Then manually compress if needed

441

```

442

443

### Complex Archive Workflows

444

445

```python

446

import pystow

447

import pandas as pd

448

449

# Download archive, extract specific file, process data

450

def process_archive_data(archive_url, inner_file):

451

# Extract DataFrame from archive

452

df = pystow.ensure_zip_df(

453

"myapp", "raw_data",

454

url=archive_url,

455

inner_path=inner_file,

456

read_csv_kwargs={"sep": "\t"}

457

)

458

459

# Process data

460

processed_df = df.groupby("category").agg({

461

"value": "sum",

462

"count": "mean"

463

})

464

465

# Save processed data

466

pystow.dump_df(

467

"myapp", "processed",

468

name="summary.csv",

469

obj=processed_df

470

)

471

472

return processed_df

473

474

# Use the function

475

result = process_archive_data(

476

"https://example.com/dataset.zip",

477

"raw/data.tsv"

478

)

479

```