# Hashing and Import Utilities

Utilities for hashing arbitrary Python data and files, importing modules dynamically, and converting between module names and file paths. They support data-integrity checks and runtime module management.

## Capabilities

### Data and File Hashing

Functions for computing secure hashes of arbitrary Python data structures and files.

```python { .api }
def hash_data(data, hasher=NoParam, base=NoParam, types=False, convert=False, extensions=None):
    """
    Hash arbitrary Python data structures.

    Args:
        data: Any Python object (dict, list, str, etc.)
        hasher: Hash algorithm (NoParam uses 'sha512')
        base: Output encoding (NoParam uses 'hex')
        types (bool): Include type information in hash
        convert (bool): Convert data to hashable format
        extensions: Custom extensions for handling special types

    Returns:
        str: Hash digest as string

    Note:
        Data is normalized for consistent hashing across runs.
        Supports nested structures, numpy arrays, and custom objects.
        NoParam defaults: hasher='sha512', base='hex'
    """

def hash_file(fpath, blocksize=1048576, stride=1, maxbytes=None, hasher=NoParam, base=NoParam):
    """
    Hash file contents efficiently.

    Args:
        fpath (str|Path): Path to file
        blocksize (int): Read block size in bytes (default: 1MB)
        stride (int): Read every nth block (default: 1 = all blocks)
        maxbytes (int): Maximum bytes to read (None = entire file)
        hasher: Hash algorithm (NoParam uses 'sha512')
        base: Output encoding (NoParam uses 'hex')

    Returns:
        str: File hash digest

    Raises:
        FileNotFoundError: File does not exist
        IOError: Cannot read file

    Note:
        NoParam defaults: hasher='sha512', base='hex'
    """
```

### Module Import Utilities

Dynamic module importing and path resolution for runtime module loading.

```python { .api }
def import_module_from_name(name, **kwargs):
    """
    Import module by name with error handling.

    Args:
        name (str): Module name (e.g., 'os.path', 'numpy')
        **kwargs: Additional import options

    Returns:
        module: Imported module object

    Raises:
        ImportError: Module cannot be imported
    """

def import_module_from_path(modpath, index=-1):
    """
    Import module from file path.

    Args:
        modpath (str|Path): Path to Python file
        index (int): Module index for namespace packages

    Returns:
        module: Imported module object

    Raises:
        ImportError: Cannot import from path
        FileNotFoundError: File does not exist
    """
```

### Module Path Utilities

Functions for converting between module names and file paths.

```python { .api }
def modname_to_modpath(modname, **kwargs):
    """
    Convert module name to file path.

    Args:
        modname (str): Module name (e.g., 'os.path')
        **kwargs: Additional resolution options

    Returns:
        str|None: Path to module file or None if not found
    """

def modpath_to_modname(fpath, **kwargs):
    """
    Convert file path to module name.

    Args:
        fpath (str|Path): Path to Python file
        **kwargs: Additional conversion options

    Returns:
        str: Module name
    """

def split_modpath(fpath, **kwargs):
    """
    Split module path into components.

    Args:
        fpath (str|Path): Path to Python file
        **kwargs: Additional options

    Returns:
        tuple: (directory, rel_modpath) - the directory that must be on the
            import path, and the module path relative to that directory
    """
```

## Usage Examples

### Data Hashing

```python
import ubelt as ub

# Hash simple data
data = {'name': 'Alice', 'age': 30, 'scores': [95, 87, 92]}
hash_value = ub.hash_data(data)
print(f"Data hash: {hash_value}")

# Different hash algorithms
sha256_hash = ub.hash_data(data, hasher='sha256')
md5_hash = ub.hash_data(data, hasher='md5')
print(f"SHA256: {sha256_hash}")
print(f"MD5: {md5_hash}")

# Different output encodings
hex_hash = ub.hash_data(data, base='hex')
abc_hash = ub.hash_data(data, base='abc')  # lowercase alphabetic digest
print(f"Hex: {hex_hash}")
print(f"Alphabetic: {abc_hash}")

# Truncated hashes (hash_data takes no length argument, so slice the digest)
short_hash = ub.hash_data(data)[:8]
print(f"Short hash: {short_hash}")
```

### Complex Data Hashing

```python
import ubelt as ub
import numpy as np

# Hash complex nested structures
complex_data = {
    'metadata': {
        'version': '1.0',
        'created': '2023-01-01'
    },
    'arrays': [
        np.array([1, 2, 3, 4]),
        np.array([[1, 2], [3, 4]])
    ],
    'config': {
        'learning_rate': 0.001,
        'batch_size': 32,
        'layers': [128, 64, 32]
    }
}

hash_value = ub.hash_data(complex_data)
print(f"Complex data hash: {hash_value}")

# Hashing is consistent across runs
hash2 = ub.hash_data(complex_data)
assert hash_value == hash2  # Same data produces same hash

# Order-independent hashing for dicts
data1 = {'a': 1, 'b': 2}
data2 = {'b': 2, 'a': 1}
hash1 = ub.hash_data(data1)
hash2 = ub.hash_data(data2)
assert hash1 == hash2  # Dict order doesn't matter
```

### File Hashing

```python
import ubelt as ub

# Hash file contents
file_path = 'example.txt'
with open(file_path, 'w') as f:
    f.write('Hello, World!')

file_hash = ub.hash_file(file_path)
print(f"File hash: {file_hash}")

# Hash large files efficiently (uses chunks); assumes 'large_file.bin' exists
large_file_hash = ub.hash_file('large_file.bin', blocksize=65536)

# Verify file integrity
def verify_file(fpath, expected_hash):
    actual_hash = ub.hash_file(fpath)
    return actual_hash == expected_hash

is_valid = verify_file(file_path, file_hash)
print(f"File is valid: {is_valid}")

# Quick hash for caching; assumes 'config.json' exists.
# hash_file takes no length argument, so slice the digest to shorten it.
cache_key = ub.hash_file('config.json')[:8]
print(f"Cache key: {cache_key}")
```

### Dynamic Module Importing

```python
import ubelt as ub

# Import module by name
os_module = ub.import_module_from_name('os')
print(f"OS name: {os_module.name}")

# Import submodules
path_module = ub.import_module_from_name('os.path')
print(f"Current dir: {path_module.abspath('.')}")

# Safe importing with error handling
try:
    numpy = ub.import_module_from_name('numpy')
    print("NumPy is available")
except ImportError:
    print("NumPy not installed")

# Import from file path
script_path = 'my_script.py'
with open(script_path, 'w') as f:
    f.write('''
def greet(name):
    return f"Hello, {name}!"

VERSION = "1.0"
''')

my_module = ub.import_module_from_path(script_path)
print(my_module.greet("World"))
print(f"Version: {my_module.VERSION}")
```

### Module Path Resolution

```python
import ubelt as ub

# Convert module name to path
os_path = ub.modname_to_modpath('os')
print(f"OS module path: {os_path}")

json_path = ub.modname_to_modpath('json')
print(f"JSON module path: {json_path}")

# Convert path to module name
if json_path:
    module_name = ub.modpath_to_modname(json_path)
    print(f"Module name: {module_name}")

# Split module path into components
if json_path:
    components = ub.split_modpath(json_path)
    print(f"Path components: {components}")

# Locate the first site-packages directory on sys.path
import sys
for path in sys.path:
    if 'site-packages' in path:
        print(f"Site packages: {path}")
        break
```

### Data Integrity and Caching

```python
import ubelt as ub
import json

# Cache with data integrity
def cached_computation(data):
    """Cache expensive computation with data hash as key"""
    # hash_data takes no length argument, so slice the digest to shorten it
    data_hash = ub.hash_data(data)[:16]
    cache_file = f'cache_{data_hash}.json'

    try:
        with open(cache_file, 'r') as f:
            cached_result = json.load(f)
        print("Using cached result")
        return cached_result
    except FileNotFoundError:
        print("Computing new result")
        # Expensive computation
        result = sum(x**2 for x in data.get('values', []))

        # Cache the result
        with open(cache_file, 'w') as f:
            json.dump(result, f)

        return result

# Test caching
data1 = {'values': [1, 2, 3, 4, 5], 'metadata': 'test'}
result1 = cached_computation(data1)  # Computes new
result2 = cached_computation(data1)  # Uses cache

# Different data gets different cache
data2 = {'values': [1, 2, 3, 4, 6], 'metadata': 'test'}  # Changed last value
result3 = cached_computation(data2)  # Computes new

print(f"Results: {result1}, {result2}, {result3}")
```

### File Verification and Checksums

```python
import ubelt as ub

# Create checksums for multiple files
files_to_check = ['file1.txt', 'file2.txt', 'file3.txt']

# Create test files
for i, fname in enumerate(files_to_check):
    with open(fname, 'w') as f:
        f.write(f'Content of file {i+1}')

# Generate checksums (hash_file takes no length argument, so slice the digest)
checksums = {}
for fpath in files_to_check:
    checksums[fpath] = ub.hash_file(fpath, hasher='sha256')[:16]

print("File checksums:")
for fpath, checksum in checksums.items():
    print(f"{fpath}: {checksum}")

# Verify files later
def verify_files(expected_checksums):
    """Verify files haven't changed"""
    for fpath, expected in expected_checksums.items():
        try:
            actual = ub.hash_file(fpath, hasher='sha256')[:16]
            if actual == expected:
                print(f"✓ {fpath} is valid")
            else:
                print(f"✗ {fpath} has changed!")
        except FileNotFoundError:
            print(f"✗ {fpath} is missing!")

verify_files(checksums)

# Modify a file and check again
with open('file2.txt', 'a') as f:
    f.write(' - modified')

print("\nAfter modification:")
verify_files(checksums)
```