0
# Caching and Performance
1
2
Caching decorators and performance optimization tools for efficient data processing and resource management. Streamlit's caching system enables applications to avoid expensive recomputations and resource loading.
3
4
## Capabilities
5
6
### Data Caching
7
8
Cache expensive data computations that can be serialized and shared across sessions.
9
10
```python { .api }
11
def cache_data(func=None, *, ttl=None, max_entries=None, show_spinner=True, persist=None, experimental_allow_widgets=False, hash_funcs=None):
12
"""
13
Decorator to cache functions that return serializable data.
14
15
Args:
16
func (callable, optional): Function to cache (when used as decorator)
17
ttl (float, optional): Time-to-live in seconds
18
max_entries (int, optional): Maximum number of cached entries
19
show_spinner (bool): Whether to show spinner during computation
20
persist (str, optional): Persistence mode ("disk" for persistent storage)
21
experimental_allow_widgets (bool): Allow widgets in cached functions
22
hash_funcs (dict, optional): Custom hash functions for parameter types
24
25
Returns:
26
callable: Decorated function with caching capability
27
"""
28
```
29
30
Example usage:
31
```python
32
@st.cache_data
33
def load_data(file_path):
34
"""Load and process data file."""
35
df = pd.read_csv(file_path)
36
return df.groupby('category').sum()
37
38
# Cache with TTL (expires after 1 hour)
39
@st.cache_data(ttl=3600)
40
def fetch_api_data(endpoint, params):
41
"""Fetch data from API with 1-hour cache."""
42
response = requests.get(endpoint, params=params)
43
return response.json()
44
45
# Cache with persistence (survives app restarts)
46
@st.cache_data(persist="disk")
47
def expensive_computation(data, algorithm):
48
"""Expensive ML computation with disk persistence."""
49
model = train_model(data, algorithm)
50
return model.predictions
51
52
# Cache with a custom spinner message (show_spinner also accepts a string)
@st.cache_data(show_spinner="Fetching user data...")
def get_user_data(user_id):
"""Get user data with a custom loading spinner."""
return database.fetch_user(user_id)
57
58
# Cache with max entries limit
59
@st.cache_data(max_entries=100)
60
def process_query(query, filters):
61
"""Process search query with LRU eviction."""
62
return search_engine.process(query, filters)
63
```
64
65
### Resource Caching
66
67
Cache global resources like database connections, models, and objects that cannot be serialized.
68
69
```python { .api }
70
def cache_resource(func=None, *, ttl=None, max_entries=None, show_spinner=True, validate=None, hash_funcs=None):
71
"""
72
Decorator to cache functions that return non-serializable resources.
73
74
Args:
75
func (callable, optional): Function to cache (when used as decorator)
76
ttl (float, optional): Time-to-live in seconds
77
max_entries (int, optional): Maximum number of cached entries
78
show_spinner (bool): Whether to show spinner during computation
79
validate (callable, optional): Function to validate cached resources
80
hash_funcs (dict, optional): Custom hash functions for parameter types
81
82
Returns:
83
callable: Decorated function with resource caching capability
84
"""
85
```
86
87
Example usage:
88
```python
89
@st.cache_resource
90
def get_database_connection():
91
"""Create database connection (shared across sessions)."""
92
return sqlite3.connect("app.db", check_same_thread=False)
93
94
@st.cache_resource
95
def load_ml_model(model_path):
96
"""Load ML model (expensive, non-serializable)."""
97
import tensorflow as tf
98
return tf.keras.models.load_model(model_path)
99
100
# Resource with TTL (model refreshes daily)
101
@st.cache_resource(ttl=86400)
102
def get_trained_model(training_data_hash):
103
"""Load or train model with daily refresh."""
104
return train_model(training_data_hash)
105
106
# Resource with validation
107
@st.cache_resource(validate=lambda conn: conn.is_connected())
108
def get_api_client(api_key):
109
"""Get API client with connection validation."""
110
return APIClient(api_key)
111
112
# Limited resource cache
113
@st.cache_resource(max_entries=5)
114
def create_processor(config):
115
"""Create data processor (max 5 configurations cached)."""
116
return DataProcessor(config)
117
```
118
119
### Legacy Caching (Deprecated)
120
121
The original caching function, now deprecated in favor of `cache_data` and `cache_resource`.
122
123
```python { .api }
124
def cache(func=None, persist=False, allow_output_mutation=False, show_spinner=True, suppress_st_warning=False, hash_funcs=None, max_entries=None, ttl=None):
125
"""
126
Legacy caching decorator (deprecated).
127
128
Args:
129
func (callable, optional): Function to cache
130
persist (bool): Whether to persist cache to disk
131
allow_output_mutation (bool): Allow mutation of cached return values
132
show_spinner (bool): Whether to show spinner during computation
133
suppress_st_warning (bool): Suppress Streamlit warnings
134
hash_funcs (dict, optional): Custom hash functions
135
max_entries (int, optional): Maximum number of cached entries
136
ttl (float, optional): Time-to-live in seconds
137
138
Returns:
139
callable: Decorated function with caching
140
141
Note:
142
Deprecated. Use st.cache_data or st.cache_resource instead.
143
"""
144
```
145
146
### Performance Optimization Patterns
147
148
#### Data Loading Optimization
149
150
```python
151
# Cache expensive data loading
152
@st.cache_data
153
def load_large_dataset(data_source):
154
"""Load and preprocess large dataset."""
155
df = pd.read_parquet(data_source) # Fast format
156
df = df.fillna(0) # Preprocessing
157
return df
158
159
# Cache with parameters
160
@st.cache_data
161
def filter_data(df, category, date_range):
162
"""Filter dataset based on parameters."""
163
mask = ((df['category'] == category) &
        (df['date'].between(date_range[0], date_range[1])))
165
return df[mask]
166
167
# Usage with cached functions
168
data = load_large_dataset("data.parquet")
169
filtered_data = filter_data(data, selected_category, date_range)
170
```
171
172
#### Model and Resource Management
173
174
```python
175
# Cache ML models
176
@st.cache_resource
177
def load_prediction_model():
178
"""Load trained model for predictions."""
179
return joblib.load("model.pkl")
180
181
@st.cache_resource
182
def get_feature_encoder():
183
"""Load feature preprocessing pipeline."""
184
return joblib.load("encoder.pkl")
185
186
# Cache database connections
187
@st.cache_resource
188
def init_database():
189
"""Initialize database connection pool."""
190
return ConnectionPool(
191
host="localhost",
192
database="myapp",
193
max_connections=10
194
)
195
196
# Usage pattern
197
model = load_prediction_model()
198
encoder = get_feature_encoder()
199
db = init_database()
200
201
# Now use these cached resources
202
features = encoder.transform(user_input)
203
prediction = model.predict(features)
204
```
205
206
#### API and External Service Caching
207
208
```python
209
# Cache API calls with TTL
210
@st.cache_data(ttl=300) # 5 minutes
211
def fetch_stock_prices(symbols):
212
"""Fetch current stock prices (cached for 5 minutes)."""
213
api_key = st.secrets["stock_api_key"]
214
response = requests.get("https://api.stocks.com/prices",
params={"symbols": ",".join(symbols), "key": api_key})
216
return response.json()
217
218
@st.cache_data(ttl=3600) # 1 hour
219
def get_weather_data(location):
220
"""Fetch weather data (cached for 1 hour)."""
221
api_key = st.secrets["weather_api_key"]
222
response = requests.get("https://api.weather.com/current",
params={"location": location, "key": api_key})
224
return response.json()
225
226
# Usage with error handling
227
try:
228
weather = get_weather_data(user_location)
229
st.metric("Temperature", f"{weather['temp']}°F")
230
except Exception as e:
231
st.error(f"Could not fetch weather data: {e}")
232
```
233
234
#### Custom Hash Functions
235
236
```python
237
# Custom hash for complex objects
238
def hash_dataframe(df):
239
"""Custom hash function for pandas DataFrames."""
240
return hash(pd.util.hash_pandas_object(df).sum())
241
242
@st.cache_data(hash_funcs={pd.DataFrame: hash_dataframe})
243
def process_dataframe(df, operations):
244
"""Process DataFrame with custom hashing."""
245
result = df.copy()
246
for op in operations:
247
result = apply_operation(result, op)
248
return result
249
250
# Custom hash for file objects
251
def hash_file(file_obj):
252
"""Hash file based on content."""
253
if hasattr(file_obj, 'name'):
254
return hash((file_obj.name, os.path.getmtime(file_obj.name)))
255
return hash(file_obj.read())
256
257
@st.cache_data(hash_funcs={io.TextIOWrapper: hash_file})  # requires: import io (avoids leaking a file handle via type(open(...)))
258
def process_uploaded_file(file):
259
"""Process uploaded file with content-based hashing."""
260
return pd.read_csv(file)
261
```
262
263
#### Cache Management
264
265
```python
266
# Clear specific cache
267
@st.cache_data
268
def expensive_function(param):
269
return compute_result(param)
270
271
# Clear cache manually
272
if st.button("Clear Cache"):
273
expensive_function.clear()
274
st.success("Cache cleared!")
275
276
# Clear all caches
277
if st.button("Clear All Caches"):
278
st.cache_data.clear()
279
st.cache_resource.clear()
280
st.success("All caches cleared!")
281
282
# Conditional cache clearing
283
if st.checkbox("Force Refresh"):
284
expensive_function.clear()
285
result = expensive_function(user_input)
286
else:
287
result = expensive_function(user_input)
288
```
289
290
#### Performance Monitoring
291
292
```python
293
import time
294
import streamlit as st
295
296
# Monitor cache performance
297
@st.cache_data
298
def monitored_function(data):
299
start_time = time.time()
300
result = expensive_computation(data)
301
end_time = time.time()
302
303
# Log performance metrics
304
st.sidebar.metric("Computation Time", f"{end_time - start_time:.2f}s")
305
return result
306
307
# Cache miss tracking: the function body runs only on a cache miss, so only
# "misses" is incremented. Note this module-level dict is re-created on every
# script rerun; use st.session_state to persist counts across reruns.
cache_stats = {"hits": 0, "misses": 0}
309
310
@st.cache_data
311
def tracked_function(param):
312
cache_stats["misses"] += 1
313
return compute_result(param)
314
315
# Display cache statistics
316
col1, col2 = st.sidebar.columns(2)
317
col1.metric("Cache Hits", cache_stats["hits"])
318
col2.metric("Cache Misses", cache_stats["misses"])
319
```
320
321
### Best Practices
322
323
#### When to Use Each Cache Type
324
325
**Use `@st.cache_data` for:**
326
- Data loading from files, APIs, or databases
327
- Data transformations and computations
328
- Serializable objects (DataFrames, lists, dicts, numbers, strings)
329
- Results that can be safely shared across users
330
331
**Use `@st.cache_resource` for:**
332
- Database connections and connection pools
333
- ML models and trained algorithms
334
- File handles and open resources
335
- Objects with locks or threads
336
- Non-serializable or stateful objects
337
338
#### Cache Configuration Guidelines
339
340
```python
341
# For frequently accessed, stable data
342
@st.cache_data(persist="disk")
343
def load_reference_data():
344
return pd.read_csv("reference.csv")
345
346
# For real-time data with appropriate TTL
347
@st.cache_data(ttl=60) # 1 minute
348
def get_live_metrics():
349
return fetch_current_metrics()
350
351
# For user-specific data with size limits
352
@st.cache_data(max_entries=1000)
353
def get_user_analysis(user_id, analysis_type):
354
return perform_analysis(user_id, analysis_type)
355
356
# For expensive resources with validation
357
@st.cache_resource(validate=lambda x: x.is_healthy())
358
def get_ml_service():
359
return MLService()
360
```