or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-data-management.md download-protocols.md file-processing.md index.md utilities-helpers.md

file-processing.md docs/

0

# File Processing

1

2

Post-download processors for automatic decompression, archive extraction, and custom file transformations. These processors execute after successful downloads to prepare files for use.

3

4

## Capabilities

5

6

### File Decompression

7

8

Automatically decompresses files compressed with gzip, bzip2, or xz/lzma algorithms.

9

10

```python { .api }

11

class Decompress:

12

"""Processor to decompress files after download."""

13

14

def __init__(self, method: str = "auto", name: str | None = None):

15

"""

16

Parameters:

17

- method: The decompression method. Can be 'auto' (default), 'gzip', 'bzip2', 'lzma', or 'xz'. If 'auto', the method is determined from the file extension (.gz, .bz2, or .xz)

18

- name: The name that will be used for the decompressed file. If None, '.decomp' is appended to the downloaded file name (for example, data.txt.gz becomes data.txt.gz.decomp)

19

"""

20

21

def __call__(self, fname: str, action: str, pooch: object) -> str:

22

"""

23

Decompress the given file.

24

25

Parameters:

26

- fname: Full path to the downloaded file

27

- action: Either 'download' or 'update' depending on the action taken by the Pooch

28

- pooch: The Pooch instance that is calling this processor

29

30

Returns:

31

The full path to the decompressed file

32

"""

33

```

34

35

### ZIP Archive Extraction

36

37

Extracts ZIP archives and returns paths to all extracted files.

38

39

```python { .api }

40

class Unzip:

41

"""Processor to unzip downloaded ZIP files."""

42

43

def __init__(

44

self,

45

members: list[str] | None = None,

46

extract_dir: str | None = None,

47

password: bytes | None = None

48

):

49

"""

50

Parameters:

51

- members: List of archive members to extract. If None, will extract all members

52

- extract_dir: Directory where the members will be extracted, interpreted relative to the cache location. If None, will extract to a folder next to the ZIP file, named after the archive with '.unzip' appended

53

- password: Password to use for encrypted ZIP files

54

"""

55

56

def __call__(self, fname: str, action: str, pooch: object) -> list[str]:

57

"""

58

Extract the given ZIP file.

59

60

Parameters:

61

- fname: Full path to the downloaded ZIP file

62

- action: Either 'download' or 'update' depending on the action taken by the Pooch

63

- pooch: The Pooch instance that is calling this processor

64

65

Returns:

66

A list with the full paths to all extracted files

67

"""

68

```

69

70

### TAR Archive Extraction

71

72

Extracts TAR archives (including compressed variants like .tar.gz, .tar.bz2, .tar.xz).

73

74

```python { .api }

75

class Untar:

76

"""Processor to untar downloaded TAR files."""

77

78

def __init__(

79

self,

80

members: list[str] | None = None,

81

extract_dir: str | None = None

82

):

83

"""

84

Parameters:

85

- members: List of archive members to extract. If None, will extract all members

86

- extract_dir: Directory where the members will be extracted, interpreted relative to the cache location. If None, will extract to a folder next to the TAR file, named after the archive with '.untar' appended

87

"""

88

89

def __call__(self, fname: str, action: str, pooch: object) -> list[str]:

90

"""

91

Extract the given TAR file.

92

93

Parameters:

94

- fname: Full path to the downloaded TAR file

95

- action: Either 'download' or 'update' depending on the action taken by the Pooch

96

- pooch: The Pooch instance that is calling this processor

97

98

Returns:

99

A list with the full paths to all extracted files

100

"""

101

```

102

103

## Usage Examples

104

105

### Automatic Decompression

106

107

```python

108

import pooch

109

110

# Download and automatically decompress a gzipped file

111

fname = pooch.retrieve(

112

"https://example.com/data.txt.gz",

113

known_hash="md5:abc123...",

114

processor=pooch.Decompress()

115

)

116

# Returns the path to the decompressed file (the downloaded file name with ".decomp" appended, since no name was given)

117

```

118

119

### Specific Decompression Method

120

121

```python

122

import pooch

123

124

# Explicitly specify decompression method and output name

125

fname = pooch.retrieve(

126

"https://example.com/dataset.xz",

127

known_hash="sha256:def456...",

128

processor=pooch.Decompress(method="xz", name="dataset.csv")

129

)

130

# Returns path to dataset.csv

131

```

132

133

### ZIP File Extraction

134

135

```python

136

import pooch

137

138

# Extract all files from a ZIP archive

139

files = pooch.retrieve(

140

"https://example.com/data.zip",

141

known_hash="md5:ghi789...",

142

processor=pooch.Unzip()

143

)

144

# Returns list of paths to all extracted files

145

146

# Extract specific files only

147

files = pooch.retrieve(

148

"https://example.com/data.zip",

149

known_hash="md5:ghi789...",

150

processor=pooch.Unzip(members=["data.csv", "readme.txt"])

151

)

152

# Returns list with paths to data.csv and readme.txt only

153

```

154

155

### TAR Archive Extraction

156

157

```python

158

import pooch

159

160

# Extract a tar.gz archive

161

files = pooch.retrieve(

162

"https://example.com/dataset.tar.gz",

163

known_hash="sha256:jkl012...",

164

processor=pooch.Untar()

165

)

166

# Returns list of paths to all extracted files

167

168

# Extract to specific directory

169

files = pooch.retrieve(

170

"https://example.com/dataset.tar.bz2",

171

known_hash="sha256:mno345...",

172

processor=pooch.Untar(extract_dir="./extracted_data")

173

)

174

```

175

176

### Using with Pooch Manager

177

178

```python

179

import pooch

180

181

# Create data manager with processors

182

data_manager = pooch.create(

183

path=pooch.os_cache("myproject"),

184

base_url="https://example.com/data/",

185

registry={

186

"dataset.csv.gz": "md5:abc123...",

187

"images.zip": "sha256:def456...",

188

"archive.tar.xz": "sha256:ghi789...",

189

}

190

)

191

192

# Fetch and decompress

193

csv_file = data_manager.fetch("dataset.csv.gz", processor=pooch.Decompress())

194

195

# Fetch and extract archive

196

image_files = data_manager.fetch("images.zip", processor=pooch.Unzip())

197

198

# Fetch and extract compressed tar

199

archive_files = data_manager.fetch("archive.tar.xz", processor=pooch.Untar())

200

```

201

202

### Custom Processors

203

204

```python

205

import pooch

206

import os

207

208

class CustomProcessor:

209

"""Custom processor example."""

210

211

def __init__(self, suffix="_processed"):

212

self.suffix = suffix

213

214

def __call__(self, fname, action, pooch):

215

"""Process the downloaded file."""

216

output_name = fname.replace('.txt', f'{self.suffix}.txt')

217

218

# Custom processing logic here

219

with open(fname, 'r') as infile, open(output_name, 'w') as outfile:

220

outfile.write(infile.read().upper())

221

222

return output_name

223

224

# Use custom processor

225

fname = pooch.retrieve(

226

"https://example.com/data.txt",

227

known_hash="md5:abc123...",

228

processor=CustomProcessor(suffix="_uppercase")

229

)

230

```