# Data and I/O Utilities

Functions for data formatting, CSV file handling, and interoperability with the Stan ecosystem. These utilities support data preparation, result persistence, and integration with other analysis tools.

## Capabilities

### JSON Data Writing

Convert Python data structures to Stan-compatible JSON format for model input.

```python { .api }
def write_stan_json(path, data):
    """
    Write data to a Stan-compatible JSON file.

    Parameters:
    - path (str): Output file path
    - data (dict): Dictionary mapping variable names to values
      (scalars, sequences, or numpy arrays)

    Returns:
    None

    Raises:
    ValueError: If data contains unsupported types
    """
```

**Usage Examples:**

```python
import cmdstanpy as csp
import numpy as np

# Prepare data for Stan model
data = {
    "N": 100,
    "K": 3,
    "x": np.random.normal(0, 1, 100),
    "y": np.random.normal(0, 1, 100),
    "group": np.random.randint(1, 4, 100)
}

# Write to file
csp.write_stan_json("model_data.json", data)

# Inspect the generated JSON
with open("model_data.json") as f:
    print(f.read()[:100])  # Preview first 100 characters
```
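
Because numpy arrays are serialized as plain JSON arrays, the output can be round-tripped with the standard `json` module. A minimal sketch (the `check.json` filename is arbitrary):

```python
import json

import cmdstanpy as csp
import numpy as np

data = {"N": 3, "y": np.array([0.1, -0.4, 2.0])}
csp.write_stan_json("check.json", data)

# Read the file back: numpy arrays come back as plain Python lists
with open("check.json") as f:
    loaded = json.load(f)

print(loaded["y"])  # [0.1, -0.4, 2.0]
```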

### CSV File Loading

Load Stan CSV output files back into fit objects for analysis and reproducibility.

```python { .api }
def from_csv(path=None, method=None):
    """
    Instantiate a CmdStan fit object from Stan CSV files.

    Parameters:
    - path (str, list, or PathLike): Path(s) to CSV files, a directory, or a glob pattern
    - method (str, optional): Expected method type for validation
      ("sample", "optimize", "variational", "pathfinder", "laplace")

    Returns:
    CmdStanMCMC, CmdStanMLE, CmdStanVB, CmdStanPathfinder, CmdStanLaplace, or None

    Raises:
    ValueError: If files are not found or have an invalid format
    """
```

**Usage Examples:**

```python
import cmdstanpy as csp

# Load from directory
fit = csp.from_csv("./mcmc_output/")

# Load specific files
fit = csp.from_csv([
    "chain_1.csv",
    "chain_2.csv",
    "chain_3.csv",
    "chain_4.csv"
])

# Load with glob pattern
fit = csp.from_csv("results/chain_*.csv")

# Load with method validation
fit = csp.from_csv("./results/", method="sample")

# Access loaded results
print(f"Loaded {fit.chains} chains")
print(fit.summary())
```
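
Once loaded, a fit behaves like one returned directly by the corresponding run method. For an MCMC fit, for example, the draws can be pulled into a pandas DataFrame (a sketch assuming the CSVs under `./results/` came from `sample`):

```python
import cmdstanpy as csp

# Rebuild the fit, then extract posterior draws as a pandas DataFrame
fit = csp.from_csv("./results/", method="sample")
draws = fit.draws_pd()
print(draws["lp__"].describe())  # summary of the log-posterior column
```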

### System Information

Display comprehensive system and dependency information for debugging and reproducibility.

```python { .api }
def show_versions(output=True):
    """
    Display system and dependency information for debugging.

    Parameters:
    - output (bool): Whether to print to console

    Returns:
    str: Formatted version information
    """
```

**Usage Example:**

```python
import cmdstanpy as csp

# Print version information
csp.show_versions()

# Get as string for logging
version_info = csp.show_versions(output=False)
with open("session_info.txt", "w") as f:
    f.write(version_info)
```
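
The same string can also be routed through the standard `logging` module so environment details land in a run's log file. A minimal sketch (the logger name and log filename are arbitrary):

```python
import logging

import cmdstanpy as csp

# Record the environment at the start of a run in the analysis log
logging.basicConfig(filename="analysis.log", level=logging.INFO)
logging.getLogger("analysis").info(
    "Session environment:\n%s", csp.show_versions(output=False)
)
```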

## Data Preparation Patterns

### Complex Data Structures

```python
import cmdstanpy as csp
import numpy as np

# Prepare complex nested data for Stan
data = {
    # Scalars
    "N": 100,
    "K": 5,

    # Vectors
    "y": np.random.normal(0, 1, 100),
    "weights": np.ones(100),

    # Matrices
    "X": np.random.normal(0, 1, (100, 5)),

    # Arrays
    "group_data": np.random.normal(0, 1, (10, 5, 3)),

    # Integer arrays
    "indices": np.arange(1, 101),  # Stan uses 1-based indexing

    # Boolean flags (passed as int)
    "include_intercept": 1
}

# Inspect data types before writing
for key, value in data.items():
    if isinstance(value, np.ndarray):
        print(f"{key}: {value.dtype} shape {value.shape}")
    else:
        print(f"{key}: {type(value)} = {value}")

# Write to JSON
csp.write_stan_json("complex_data.json", data)
```

### Data Validation

```python
import numpy as np

def validate_stan_data(data):
    """Custom function to validate data for Stan compatibility."""
    for key, value in data.items():
        if isinstance(value, np.ndarray):
            # Check for NaN or infinite values
            if np.any(~np.isfinite(value)):
                raise ValueError(f"Non-finite values in {key}")

            # Ensure proper data types
            if value.dtype == np.bool_:
                data[key] = value.astype(int)
                print(f"Converted {key} from bool to int")

            # Check for proper indexing (1-based for Stan)
            if "index" in key.lower() and np.any(value <= 0):
                print(f"Warning: {key} contains non-positive indices")

    return data

# Use validation before fitting
validated_data = validate_stan_data(data)
fit = model.sample(data=validated_data)
```
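
To see the non-finite check fire, feed the validator a deliberately broken array (hypothetical example data):

```python
import numpy as np

# Hypothetical bad input: the validator should reject the NaN
bad_data = {"y": np.array([1.0, np.nan, 2.0])}
try:
    validate_stan_data(bad_data)
except ValueError as err:
    print(err)  # Non-finite values in y
```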

## File Management Patterns

### Organized Output Directories

```python
import os
from datetime import datetime

# Create organized directory structure
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"analysis_{timestamp}"
os.makedirs(output_dir, exist_ok=True)

# Run analysis with organized outputs
fit = model.sample(
    data=data,
    output_dir=output_dir,
    chains=4
)

# Save additional outputs
fit.save_csvfiles(dir=f"{output_dir}/chains")
csp.write_stan_json(f"{output_dir}/data.json", data)

# Save metadata
with open(f"{output_dir}/session_info.txt", "w") as f:
    f.write(csp.show_versions(output=False))

print(f"Analysis saved to {output_dir}")
```

### Batch Processing

```python
import glob
from pathlib import Path

# Process multiple datasets
data_files = glob.glob("datasets/*.json")
results_dir = Path("batch_results")
results_dir.mkdir(exist_ok=True)

for data_file in data_files:
    dataset_name = Path(data_file).stem
    print(f"Processing {dataset_name}...")

    # CmdStanPy accepts a JSON file path directly as the data argument
    fit = model.sample(data=data_file, chains=4)

    # Save results
    output_subdir = results_dir / dataset_name
    output_subdir.mkdir(exist_ok=True)
    fit.save_csvfiles(dir=str(output_subdir))

    # Save summary
    summary = fit.summary()
    summary.to_csv(output_subdir / "summary.csv")

    print(f"Completed {dataset_name}")
```
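
The per-dataset summaries can then be stacked into one comparison table with pandas. A sketch, assuming the loop above has populated `batch_results/`:

```python
from pathlib import Path

import pandas as pd

# Collect each dataset's summary.csv into a single labeled DataFrame
results_dir = Path("batch_results")
frames = []
for subdir in sorted(results_dir.iterdir()):
    summary_file = subdir / "summary.csv"
    if summary_file.exists():
        df = pd.read_csv(summary_file, index_col=0)
        df["dataset"] = subdir.name
        frames.append(df)

combined = pd.concat(frames)
combined.to_csv(results_dir / "combined_summary.csv")
```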

### Archive and Reproducibility

```python
import json
import pickle
import shutil
from datetime import datetime
from pathlib import Path

import cmdstanpy as csp

def save_analysis_archive(fit, data, model_file, output_dir):
    """Save a complete analysis archive for reproducibility."""
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    # Save CSV files
    fit.save_csvfiles(dir=str(output_path / "csvs"))

    # Save data
    csp.write_stan_json(str(output_path / "data.json"), data)

    # Copy Stan model file
    shutil.copy2(model_file, output_path / "model.stan")

    # Save Python objects; fit objects reference their CSV files,
    # so keep the pickle and the csvs directory together
    with open(output_path / "fit.pkl", "wb") as f:
        pickle.dump(fit, f)

    # Save metadata
    metadata = {
        "cmdstanpy_version": csp.__version__,
        "cmdstan_version": csp.cmdstan_version(),
        "timestamp": datetime.now().isoformat(),
        "chains": fit.chains,
        "draws_per_chain": fit.num_draws_sampling
    }
    with open(output_path / "metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    # Save system info
    with open(output_path / "system_info.txt", "w") as f:
        f.write(csp.show_versions(output=False))

    print(f"Analysis archived to {output_path}")

# Use archive function
save_analysis_archive(
    fit=mcmc_fit,
    data=model_data,
    model_file="my_model.stan",
    output_dir="analysis_archive"
)
```

### Loading Archived Results

```python
import json
import pickle
from pathlib import Path

def load_analysis_archive(archive_dir):
    """Load archived analysis results."""
    archive_path = Path(archive_dir)

    # Load fit object
    with open(archive_path / "fit.pkl", "rb") as f:
        fit = pickle.load(f)

    # Load metadata
    with open(archive_path / "metadata.json", "r") as f:
        metadata = json.load(f)

    print(f"Loaded analysis from {metadata['timestamp']}")
    print(f"CmdStanPy version: {metadata['cmdstanpy_version']}")

    return fit, metadata

# Restore archived results
restored_fit, meta = load_analysis_archive("analysis_archive")
print(restored_fit.summary())
```