or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-data-management.md download-protocols.md file-processing.md index.md utilities-helpers.md

utilities-helpers.md docs/

0

# Utilities and Helpers

1

2

Helper functions for cache management, version handling, file hashing, and registry creation. These utilities support data management workflows and provide essential functionality for working with Pooch.

3

4

## Capabilities

5

6

### Cache Management

7

8

Functions for managing cache directories and storage locations across different operating systems.

9

10

```python { .api }

11

def os_cache(project: str) -> str:

12

"""

13

Get the default cache location for the operating system.

14

15

Parameters:

16

- project: The name of your project. The returned cache path uses this name as the folder inside the appropriate OS cache directory (the folder itself is not created by this function)

17

18

Returns:

19

The default cache location for your OS

20

"""

21

```

22

23

### Version Handling

24

25

Utilities for handling version strings and development versions in data management workflows.

26

27

```python { .api }

28

def check_version(version: str, fallback: str = "master") -> str:

29

"""

30

Check if a version string is a development version and format accordingly.

31

32

Parameters:

33

- version: The version string to check

34

- fallback: The name to use for development versions

35

36

Returns:

37

The version string or the fallback if it's a development version

38

"""

39

```

40

41

### File Hashing

42

43

Functions for calculating and verifying file hashes to ensure data integrity.

44

45

```python { .api }

46

def file_hash(fname: str, alg: str = "sha256") -> str:

47

"""

48

Calculate the hash of a given file.

49

50

Parameters:

51

- fname: The path to the file

52

- alg: The hashing algorithm to use. Supported algorithms include 'sha256', 'sha1', 'md5', and others available in hashlib

53

54

Returns:

55

The hash of the file as a hexadecimal string

56

"""

57

```

58

59

### Registry Management

60

61

Functions for creating and managing file registries with hash information.

62

63

```python { .api }

64

def make_registry(directory: str, output: str, recursive: bool = True) -> None:

65

"""

66

Create a registry file with the hashes of all files in a directory.

67

68

Parameters:

69

- directory: The directory for which to create the registry

70

- output: The path to the output registry file

71

- recursive: If True, will include files in subdirectories

72

"""

73

```

74

75

### Logging

76

77

Access to Pooch's internal logging system for debugging and monitoring.

78

79

```python { .api }

80

def get_logger() -> logging.Logger:

81

"""

82

Get the default Pooch logger.

83

84

Returns:

85

The logger object for Pooch

86

"""

87

```

88

89

## Usage Examples

90

91

### Setting Up Cache Directory

92

93

```python

94

import pooch

95

96

# Get OS-appropriate cache directory for your project

97

cache_dir = pooch.os_cache("myproject")

98

print(f"Cache directory: {cache_dir}")

99

100

# Use in data manager creation

101

data_manager = pooch.create(

102

path=cache_dir,

103

base_url="https://example.com/data/",

104

)

105

```

106

107

### Version Management

108

109

```python

110

import pooch

111

112

# Handle version strings for development builds

113

version = "1.2.3+dev"

114

safe_version = pooch.check_version(version, fallback="main")

115

print(f"Using version: {safe_version}") # Will use "main" for dev version

116

117

# Use in URL formatting

118

base_url = "https://github.com/myproject/data/raw/{version}/"

119

formatted_url = base_url.format(version=safe_version)

120

```

121

122

### File Hash Calculation

123

124

```python

125

import pooch

126

127

# Calculate SHA256 hash (default)

128

hash_value = pooch.file_hash("data.csv")

129

print(f"SHA256: {hash_value}")

130

131

# Calculate MD5 hash

132

md5_hash = pooch.file_hash("data.csv", alg="md5")

133

print(f"MD5: {md5_hash}")

134

135

# Use in registry

136

registry = {

137

"data.csv": f"sha256:{hash_value}",

138

"readme.txt": f"md5:{md5_hash}",

139

}

140

```

141

142

### Registry Creation

143

144

```python

145

import pooch

146

147

# Create registry for all files in a directory

148

pooch.make_registry("./data", "./registry.txt", recursive=True)

149

150

# The registry.txt file will contain one "<path> <sha256 hash>" line per file,

151

# with paths relative to ./data, for example:
# file1.csv abc123...

152

# subdir/file2.txt def456...

153

154

# Load registry into Pooch manager

155

data_manager = pooch.create(

156

path=pooch.os_cache("myproject"),

157

base_url="https://example.com/data/",

158

)

159

data_manager.load_registry("./registry.txt")

160

```

161

162

### Logging Configuration

163

164

```python

165

import pooch

166

import logging

167

168

# Get Pooch logger

169

logger = pooch.get_logger()

170

171

# Configure logging level

172

logger.setLevel(logging.DEBUG)

173

174

# Add custom handler

175

handler = logging.FileHandler("pooch.log")

176

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

177

handler.setFormatter(formatter)

178

logger.addHandler(handler)

179

180

# Now Pooch operations will be logged

181

fname = pooch.retrieve(

182

"https://example.com/data.csv",

183

known_hash="md5:abc123...",

184

)

185

```

186

187

### Complete Workflow Example

188

189

```python

190

import pooch

191

import logging

import os

192

193

# Set up project data management

194

project_name = "my-analysis-project"

195

cache_dir = pooch.os_cache(project_name)

196

197

# Create registry from existing data directory

198

if os.path.exists("./reference_data"):

199

pooch.make_registry("./reference_data", "./data_registry.txt")

200

201

# Set up data manager

202

data_manager = pooch.create(

203

path=cache_dir,

204

base_url="https://github.com/myuser/my-analysis-project/raw/{version}/data/",

205

version=pooch.check_version("1.0.0+dev", fallback="main"),

206

)

207

208

# Load registry

209

if os.path.exists("./data_registry.txt"):

210

data_manager.load_registry("./data_registry.txt")

211

212

# Configure logging

213

logger = pooch.get_logger()

214

logger.setLevel(logging.INFO)

215

216

# Fetch data files

217

dataset1 = data_manager.fetch("dataset1.csv")

218

dataset2 = data_manager.fetch("dataset2.zip", processor=pooch.Unzip())

219

220

print(f"Dataset 1: {dataset1}")

221

print(f"Dataset 2 files: {dataset2}")

222

```

223

224

### Hash Verification

225

226

```python

227

import pooch

228

229

# Verify file integrity

230

def verify_file_integrity(filepath, expected_hash):

231

"""Verify a file's integrity against expected hash."""

232

actual_hash = pooch.file_hash(filepath)

233

234

# Handle different hash formats

235

if ":" in expected_hash:

236

alg, expected = expected_hash.split(":", 1)

237

actual_hash = pooch.file_hash(filepath, alg=alg)

238

else:

239

expected = expected_hash

240

241

return actual_hash == expected

242

243

# Example usage

244

file_ok = verify_file_integrity("data.csv", "md5:abc123def456...")

245

if file_ok:

246

print("File integrity verified!")

247

else:

248

print("File may be corrupted!")

249

```