0
# Download and Caching
1
2
Download files with progress tracking and hash verification, and cache downloaded data and expensive computations on disk.
3
4
## Capabilities
5
6
### File Downloads
7
8
Download files from URLs with progress tracking, hash verification, and caching support.
9
10
```python { .api }
11
def download(url, fpath=None, hash_prefix=None, hasher='sha512', **kwargs):
12
"""
13
Download file from URL with progress and verification.
14
15
Args:
16
url (str): URL to download from
17
fpath (str|Path): Local file path (auto-generated if None)
18
hash_prefix (str): Expected hash prefix for verification
19
hasher (str): Hash algorithm ('sha512', 'sha256', 'md5')
20
verbose (int): Verbosity level
21
chunksize (int): Download chunk size in bytes
22
timeout (float): Connection timeout
23
24
Returns:
25
str: Path to downloaded file
26
27
Raises:
28
URLError: Download failed
29
RuntimeError: Hash verification failed (the file's hash does not start with hash_prefix)
30
"""
31
32
def grabdata(url, fpath=None, dpath=None, fname=None, **kwargs):
33
"""
34
Download and cache data with automatic path handling.
35
36
Args:
37
url (str): URL to download
38
fpath (str): Explicit file path
39
dpath (str): Directory for cached file
40
fname (str): Filename for cached file
41
**kwargs: Additional download options
42
43
Returns:
44
str: Path to cached file
45
"""
46
47
class DownloadManager:
48
"""
49
Manage multiple download operations with queuing and progress tracking.
50
"""
51
def __init__(self, max_workers=4): ...
52
53
def submit(self, url, fpath=None, **kwargs): ...
54
def download_all(self): ...
55
def __enter__(self): ...
56
def __exit__(self, exc_type, exc_val, exc_tb): ...
57
```
58
59
### Computation Caching
60
61
Cache expensive computations to disk with dependency tracking and automatic invalidation.
62
63
```python { .api }
64
class Cacher:
65
"""
66
On-disk caching with dependency tracking.
67
Automatically invalidates cache when dependencies change.
68
"""
69
def __init__(self, fname, depends=None, dpath=None, appname='ubelt', **kwargs):
70
"""
71
Args:
72
fname (str): Cache filename
73
depends: Dependencies that invalidate cache when changed
74
dpath (str): Cache directory
75
appname (str): Application name for cache organization
76
**kwargs: Additional cache options
77
"""
78
79
def tryload(self):
80
"""
81
Try to load cached result.
82
83
Returns:
84
object|None: Cached result, or None on a cache miss or if the cache is invalid
85
"""
86
87
def save(self, data):
88
"""
89
Save data to cache.
90
91
Args:
92
data: Data to cache
93
"""
94
95
def clear(self):
96
"""Clear cached data."""
97
98
def exists(self):
99
"""
100
Check if cache exists and is valid.
101
102
Returns:
103
bool: True if the cache exists and its dependencies are unchanged
104
"""
105
106
def ensure(self, func, *args, **kwargs):
107
"""
108
Ensure cached result exists, computing if necessary.
109
110
Args:
111
func: Function to call if cache miss
112
*args: Arguments for func
113
**kwargs: Keyword arguments for func
114
115
Returns:
116
object: Cached or computed result
117
"""
118
119
class CacheStamp:
120
"""
121
Lightweight cache stamping for file-producing computations.
122
Tracks when outputs are newer than inputs.
123
"""
124
def __init__(self, fname, dpath=None, **kwargs): ...
125
126
def expired(self, *depends):
127
"""
128
Check if cache is expired relative to dependencies.
129
130
Args:
131
*depends: File paths or other dependencies
132
133
Returns:
134
bool: True if cache is expired
135
"""
136
137
def renew(self):
138
"""Update cache timestamp."""
139
140
def clear(self):
141
"""Remove cache stamp."""
142
```
143
144
## Usage Examples
145
146
### File Downloads
147
148
```python
149
import ubelt as ub
150
151
# Simple download
152
url = 'https://example.com/data.zip'
153
fpath = ub.download(url)
154
print(f"Downloaded to: {fpath}")
155
156
# Download with verification
157
url = 'https://example.com/important.tar.gz'
158
expected_hash = 'a1b2c3d4e5f6...' # First few characters of expected hash
159
fpath = ub.download(url, hash_prefix=expected_hash, hasher='sha256')
160
161
# Download to specific location
162
local_path = './downloads/myfile.zip'
163
ub.download(url, fpath=local_path, verbose=2)
164
165
# Download with caching (won't re-download if file exists)
166
cached_file = ub.grabdata(url, dpath='./cache')
167
```
168
169
### Multiple Downloads
170
171
```python
172
import ubelt as ub
173
174
# Download multiple files
175
urls = [
176
'https://example.com/file1.zip',
177
'https://example.com/file2.tar.gz',
178
'https://example.com/file3.json'
179
]
180
181
# Sequential downloads
182
files = []
183
for url in urls:
184
fpath = ub.download(url, dpath='./downloads')
185
files.append(fpath)
186
187
# Parallel downloads with DownloadManager
188
with ub.DownloadManager(max_workers=3) as dm:
189
futures = []
190
for url in urls:
191
future = dm.submit(url, dpath='./downloads')
192
futures.append(future)
193
194
# Get results
195
files = [future.result() for future in futures]
196
```
197
198
### Computation Caching
199
200
```python
201
import ubelt as ub
202
import time
203
204
def expensive_computation(n):
205
"""Simulate expensive computation"""
206
print(f"Computing for n={n}...")
207
time.sleep(2) # Simulate work
208
return n ** 2
209
210
# Basic caching
211
cache = ub.Cacher('computation_cache')
212
result = cache.tryload()
213
if result is None:
214
result = expensive_computation(100)
215
cache.save(result)
216
print(f"Result: {result}")
217
218
# Dependency-based caching
219
input_file = 'input.txt'
220
with open(input_file, 'w') as f:
221
f.write('some input data')
222
223
# Cache depends on input file
224
cache = ub.Cacher('file_processing', depends=[input_file])
225
result = cache.tryload()
226
if result is None:
227
# Process the file
228
with open(input_file, 'r') as f:
229
data = f.read()
230
result = data.upper() # Simple processing
231
cache.save(result)
232
233
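# NOTE: `depends` is hashed as given, so this cache is keyed on the path string, not the file's contents; use depends=[ub.hash_file(input_file)] to invalidate when input.txt changes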
234
235
# Using ensure for cleaner code
236
def process_data(filename):
237
with open(filename, 'r') as f:
238
return f.read().upper()
239
240
cache = ub.Cacher('processing', depends=[input_file])
241
result = cache.ensure(process_data, input_file)
242
```
243
244
### Cache Stamps for File Operations
245
246
```python
247
import ubelt as ub
248
249
# Stamp-based caching for file generation
250
input_files = ['input1.txt', 'input2.txt', 'config.json']
251
output_file = 'processed_output.json'
252
253
stamp = ub.CacheStamp('processing_stamp')
254
255
if stamp.expired(*input_files, output_file):
256
print("Processing files...")
257
# Do expensive file processing
258
processed_data = {'result': 'processed'}
259
260
# Write output
261
import json
262
with open(output_file, 'w') as f:
263
json.dump(processed_data, f)
264
265
# Update stamp
266
stamp.renew()
267
else:
268
print("Using cached output")
269
270
# Output file exists and is newer than inputs
271
```
272
273
### Advanced Caching Patterns
274
275
```python
276
import ubelt as ub
277
278
# Cache with custom dependencies
279
def get_data_hash():
280
"""Get hash of current data state"""
281
return ub.hash_data({'version': '1.2', 'config': 'prod'})
282
283
# Cache that depends on data state, not just files
284
cache = ub.Cacher('model_cache', depends=[get_data_hash()])
285
286
def train_model():
287
print("Training model...")
288
return {'accuracy': 0.95, 'model': 'trained_weights'}
289
290
model = cache.ensure(train_model)
291
292
# Organized caching with app-specific directories
293
user_cache = ub.Cacher('user_prefs', appname='myapp')
294
model_cache = ub.Cacher('models', appname='myapp', dpath='./models')
295
296
# Clear caches when needed
297
if need_fresh_data:
298
cache.clear()
299
300
# Check cache status
301
if cache.exists():
302
print("Cache is valid")
303
data = cache.tryload()
304
else:
305
print("Cache expired or missing")
306
```