Tessl Tile for pypi/pymc@5.25.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

data.md distributions.md gp.md index.md math.md model.md ode.md sampling.md stats.md variational.md

data.mddocs/

0
# PyMC Data Handling and Backends
1

2
PyMC provides comprehensive data handling capabilities and flexible backend systems for managing datasets, mini-batching, and storing sampling results. The data system supports dynamic updates, efficient memory usage, and seamless integration with the broader scientific Python ecosystem.
3

4
## Data Containers
5

6
### Data Function
7

8
```python { .api }
9
import pymc as pm
10

11
def Data(name, value, *, dims=None, export_index_as_coords=True, **kwargs):
12
    """
13
    Create a data container for observed or input data.
14
    
15
    Parameters:
16
    - name (str): Name of the data variable
17
    - value (array-like): Initial data values
18
    - dims (tuple, optional): Dimension names for the data
19
    - export_index_as_coords (bool): Export indices as coordinates
20
    - **kwargs: Additional PyTensor shared variable options
21
    
22
    Returns:
23
    - data_var: PyTensor shared variable containing the data
24
    """
25

26
# Basic data container
27
import numpy as np
28

29
X_train = np.random.randn(100, 5)
30
y_train = np.random.randn(100)
31

32
with pm.Model() as model:
33
    # Create data containers
34
    X_data = pm.Data('X_data', X_train)
35
    y_data = pm.Data('y_data', y_train)
36
    
37
    # Model using data containers
38
    beta = pm.Normal('beta', 0, 1, shape=X_train.shape[1])
39
    mu = pm.math.dot(X_data, beta)
40
    sigma = pm.HalfNormal('sigma', 1)
41
    
42
    # Likelihood using data container
43
    likelihood = pm.Normal('likelihood', mu=mu, sigma=sigma, observed=y_data)
44

45
# Data containers with dimensions
46
coords = {'obs': range(100), 'features': ['f1', 'f2', 'f3', 'f4', 'f5']}
47

48
with pm.Model(coords=coords) as dimensional_model:
49
    # Data with explicit dimensions
50
    X_data = pm.Data('X_data', X_train, dims=['obs', 'features'])
51
    y_data = pm.Data('y_data', y_train, dims='obs')
52
    
53
    # Model components with matching dimensions
54
    beta = pm.Normal('beta', 0, 1, dims='features')
55
    mu = pm.math.dot(X_data, beta)
56
    likelihood = pm.Normal('likelihood', mu=mu, sigma=1, observed=y_data)
57
```
58

59
### Data Updates
60

61
```python { .api }
62
def set_data(new_data, *, model=None, coords=None):
63
    """
64
    Update data values in existing data containers.
65
    
66
    Parameters:
67
    - new_data (dict): Dictionary mapping data names to new values
68
    - model (Model, optional): Target model (default: current context)
69
    - coords (dict, optional): New coordinate values
70
    """
71

72
# Update data for out-of-sample prediction
73
X_test = np.random.randn(50, 5)
74

75
# Sample with training data
76
with model:
77
    trace = pm.sample(1000)
78

79
# Update data containers for prediction
80
pm.set_data({'X_data': X_test, 'y_data': None})
81

82
# Generate predictions with new data
83
with model:
84
    posterior_predictive = pm.sample_posterior_predictive(trace, var_names=['likelihood'])
85

86
# Restore original data
87
pm.set_data({'X_data': X_train, 'y_data': y_train})
88
```
89

90
### Minibatch Data
91

92
```python { .api }
93
def Minibatch(name, value, batch_size=None, *, dims=None, random_seed=None, **kwargs):
94
    """
95
    Create a minibatch data container for large datasets.
96
    
97
    Parameters:
98
    - name (str): Name of the minibatch variable
99
    - value (array-like): Full dataset
100
    - batch_size (int, optional): Size of each minibatch
101
    - dims (tuple, optional): Dimension names
102
    - random_seed (int, optional): Seed for batch sampling
103
    - **kwargs: Additional options
104
    
105
    Returns:
106
    - minibatch_var: Minibatch data container
107
    """
108

109
# Large dataset minibatching
110
X_large = np.random.randn(10000, 20)
111
y_large = np.random.randn(10000)
112

113
with pm.Model() as minibatch_model:
114
    # Create minibatch containers
115
    X_mb = pm.Minibatch('X_mb', X_large, batch_size=128)
116
    y_mb = pm.Minibatch('y_mb', y_large, batch_size=128)
117
    
118
    # Model parameters
119
    beta = pm.Normal('beta', 0, 1, shape=20)
120
    sigma = pm.HalfNormal('sigma', 1)
121
    
122
    # Linear predictor with minibatch data
123
    mu = pm.math.dot(X_mb, beta)
124
    
125
    # Likelihood (automatically scaled for minibatching)
126
    likelihood = pm.Normal('likelihood', mu=mu, sigma=sigma, 
127
                          observed=y_mb, total_size=len(X_large))
128
    
129
    # Variational inference works well with minibatches
130
    approx = pm.fit(n=50000, method='advi')
131
    trace = approx.sample(2000)
132
```
133

134
### Data Utilities
135

136
```python { .api }
137
def get_data(dataset_name):
138
    """
139
    Load example datasets for testing and tutorials.
140
    
141
    Parameters:
142
    - dataset_name (str): Name of the dataset to load
143
    
144
    Returns:
145
    - data: Loaded dataset as appropriate data structure
146
    """
147

148
# Load example datasets
149
boston_housing = pm.get_data('boston_housing')
150
iris_data = pm.get_data('iris')
151
radon_data = pm.get_data('radon')
152

153
print("Available datasets:")
154
for name, data in [('boston', boston_housing), ('iris', iris_data)]:
155
    print(f"  {name}: {data.shape if hasattr(data, 'shape') else type(data)}")
156
```
157

158
## Trace Storage Backends
159

160
### NDArray Backend (In-Memory)
161

162
```python { .api }
163
from pymc.backends import NDArray
164

165
class NDArray:
166
    """
167
    In-memory trace storage backend.
168
    
169
    Parameters:
170
    - vars (list, optional): Variables to trace
171
    - model (Model, optional): PyMC model
172
    
173
    Attributes:
174
    - samples (dict): Dictionary of samples by variable name
175
    - sampler_stats (dict): Sampler statistics
176
    """
177
    
178
    def __init__(self, vars=None, model=None):
179
        pass
180
    
181
    def setup(self, draws, chain, sampler_vars=None):
182
        """Set up storage for sampling."""
183
        pass
184
    
185
    def record(self, point, sampler_stats=None):
186
        """Record a sample point."""
187
        pass
188
    
189
    def close(self):
190
        """Finalize the trace."""
191
        pass
192

193
# Basic in-memory sampling
194
with pm.Model() as model:
195
    alpha = pm.Normal('alpha', 0, 1)
196
    beta = pm.Normal('beta', 0, 1)
197
    
198
    # Use in-memory backend (default)
199
    trace = pm.sample(1000, trace=pm.backends.NDArray())
200

201
# Access samples directly
202
print(f"Alpha samples shape: {trace.posterior['alpha'].shape}")
203
print(f"Beta mean: {trace.posterior['beta'].mean():.3f}")
204
```
205

206
### Zarr Backend (Persistent Storage)
207

208
```python { .api }
209
from pymc.backends import ZarrTrace
210

211
class ZarrTrace:
212
    """
213
    Zarr-based persistent trace storage.
214
    
215
    Parameters:
216
    - dir_path (str): Directory path for Zarr storage
217
    - vars (list, optional): Variables to trace
218
    - model (Model, optional): PyMC model
219
    - append (bool): Append to existing trace
220
    """
221
    
222
    def __init__(self, dir_path=None, vars=None, model=None, append=False):
223
        pass
224

225
# Persistent storage with Zarr
226
import tempfile
227
import os
228

229
with tempfile.TemporaryDirectory() as tmp_dir:
230
    zarr_path = os.path.join(tmp_dir, 'trace.zarr')
231
    
232
    with pm.Model() as model:
233
        # Large model...
234
        theta = pm.Normal('theta', 0, 1, shape=100)
235
        
236
        # Store trace persistently
237
        trace = pm.sample(10000, trace=pm.backends.ZarrTrace(zarr_path))
238
    
239
    # Trace is automatically saved to disk
240
    print(f"Trace stored at: {zarr_path}")
241
    
242
    # Load trace later
243
    loaded_trace = pm.backends.ZarrTrace(zarr_path)
244
    print(f"Loaded trace shape: {loaded_trace.posterior['theta'].shape}")
245
```
246

247
### Base Trace Classes
248

249
```python { .api }
250
from pymc.backends.base import BaseTrace, MultiTrace
251

252
class BaseTrace:
253
    """
254
    Base class for all trace backends.
255
    
256
    Methods:
257
    - setup: Initialize trace storage
258
    - record: Record sample point
259
    - close: Finalize trace
260
    - __getitem__: Access samples by variable name
261
    """
262

263
class MultiTrace:
264
    """
265
    Container for multiple chain traces.
266
    
267
    Parameters:
268
    - straces (list): List of single-chain traces
269
    
270
    Methods:
271
    - get_values: Extract values for variables
272
    - get_sampler_stats: Extract sampler statistics
273
    - add_values: Add new samples
274
    """
275
    
276
    def __init__(self, straces):
277
        self.straces = straces
278
    
279
    def get_values(self, varname, burn=0, thin=1, combine=True, chains=None, squeeze=True):
280
        """
281
        Get values for a variable across chains.
282
        
283
        Parameters:
284
        - varname (str): Variable name
285
        - burn (int): Burn-in samples to exclude
286
        - thin (int): Thinning factor
287
        - combine (bool): Combine chains into single array
288
        - chains (list, optional): Specific chains to extract
289
        - squeeze (bool): Squeeze singleton dimensions
290
        
291
        Returns:
292
        - values: Variable samples
293
        """
294
        pass
295

296
# Working with MultiTrace
297
trace = pm.sample(2000, chains=4)  # Returns MultiTrace
298

299
# Extract specific variable values
300
alpha_samples = trace.get_values('alpha', burn=500, thin=2)
301
beta_samples = trace.get_values('beta', chains=[0, 1], combine=False)
302

303
# Get sampler statistics
304
diverging = trace.get_sampler_stats('diverging')
305
energy = trace.get_sampler_stats('energy')
306

307
print(f"Total divergences: {diverging.sum()}")
308
print(f"Mean energy: {energy.mean():.3f}")
309
```
310

311
## ArviZ Integration
312

313
### InferenceData Conversion
314

315
```python { .api }
316
def to_inference_data(trace=None, *, prior=None, posterior_predictive=None,
317
                     predictions=None, log_likelihood=None, coords=None,
318
                     dims=None, model=None, save_warmup=False, **kwargs):
319
    """
320
    Convert PyMC trace to ArviZ InferenceData object.
321
    
322
    Parameters:
323
    - trace: PyMC trace object
324
    - prior: Prior samples
325
    - posterior_predictive: Posterior predictive samples  
326
    - predictions: Out-of-sample predictions
327
    - log_likelihood: Log-likelihood values
328
    - coords (dict): Coordinate specifications
329
    - dims (dict): Dimension specifications
330
    - model: PyMC model
331
    - save_warmup (bool): Include warmup samples
332
    
333
    Returns:
334
    - idata: ArviZ InferenceData object
335
    """
336

337
# Convert to ArviZ InferenceData
338
with pm.Model() as model:
339
    alpha = pm.Normal('alpha', 0, 1)
340
    beta = pm.Normal('beta', 0, 1, shape=3)
341
    sigma = pm.HalfNormal('sigma', 1)
342
    
343
    mu = alpha + pm.math.dot(X_data, beta)
344
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y_data)
345
    
346
    # Sample
347
    trace = pm.sample(1000)
348
    
349
    # Prior samples
350
    prior_samples = pm.sample_prior_predictive(500)
351
    
352
    # Posterior predictive samples
353
    post_pred = pm.sample_posterior_predictive(trace)
354

355
# Convert to comprehensive InferenceData
356
idata = pm.to_inference_data(
357
    trace,
358
    prior=prior_samples,
359
    posterior_predictive=post_pred,
360
    coords={'obs': range(len(y_data)), 'features': ['f1', 'f2', 'f3']},
361
    dims={'beta': ['features'], 'y_obs': ['obs']},
362
    model=model
363
)
364

365
print("InferenceData groups:")
366
for group_name in idata._groups:
367
    print(f"  {group_name}: {list(idata[group_name].data_vars.keys())}")
368
```
369

370
### Predictions Integration
371

372
```python { .api }
373
def predictions_to_inference_data(predictions, *, posterior_trace=None,
374
                                model=None, coords=None, dims=None, **kwargs):
375
    """
376
    Convert predictions to InferenceData format.
377
    
378
    Parameters:
379
    - predictions: Prediction samples
380
    - posterior_trace: Original posterior trace
381
    - model: PyMC model
382
    - coords (dict): Coordinate specifications
383
    - dims (dict): Dimension specifications
384
    
385
    Returns:
386
    - pred_idata: InferenceData with predictions group
387
    """
388

389
# Out-of-sample predictions with proper metadata
390
X_test = np.random.randn(25, 3)
391
pm.set_data({'X_data': X_test, 'y_data': None})
392

393
with model:
394
    # Generate predictions
395
    predictions = pm.sample_posterior_predictive(
396
        trace, 
397
        var_names=['y_obs'],
398
        predictions=True
399
    )
400

401
# Convert predictions to InferenceData
402
pred_idata = pm.predictions_to_inference_data(
403
    predictions,
404
    posterior_trace=trace,
405
    coords={'test_obs': range(25), 'features': ['f1', 'f2', 'f3']},
406
    dims={'y_obs': ['test_obs']},
407
    model=model
408
)
409
```
410

411
## Data Processing Utilities
412

413
### Dictionary-Array Mapping
414

415
```python { .api }
416
from pymc.blocking import DictToArrayBijection
417

418
class DictToArrayBijection:
419
    """
420
    Bijection between dictionary and array representations of parameters.
421
    
422
    Parameters:
423
    - ordering: Variable ordering specification
424
    - vmap: Variable mapping
425
    
426
    Methods:
427
    - map: Convert dictionary to array
428
    - rmap: Convert array to dictionary
429
    """
430
    
431
    def __init__(self, ordering, vmap=None):
432
        pass
433
    
434
    def map(self, point_dict):
435
        """Convert parameter dictionary to array."""
436
        pass
437
    
438
    def rmap(self, point_array):  
439
        """Convert parameter array to dictionary."""
440
        pass
441

442
# Create bijection for efficient array operations
443
with pm.Model() as model:
444
    alpha = pm.Normal('alpha', 0, 1)
445
    beta = pm.Normal('beta', 0, 1, shape=3)
446
    
447
    # Create bijection
448
    bijection = pm.DictToArrayBijection(model.free_RVs)
449
    
450
    # Convert point formats
451
    point_dict = {'alpha': 0.5, 'beta': np.array([1.0, -0.5, 0.2])}
452
    point_array = bijection.map(point_dict)
453
    recovered_dict = bijection.rmap(point_array)
454
    
455
    print(f"Array representation: {point_array}")
456
    print(f"Recovered dict: {recovered_dict}")
457
```
458

459
## Advanced Data Patterns
460

461
### Dynamic Data Loading
462

463
```python { .api }
464
class DataLoader:
465
    """Custom data loader for streaming/dynamic data."""
466
    
467
    def __init__(self, data_source, batch_size=128):
468
        self.data_source = data_source
469
        self.batch_size = batch_size
470
        self.current_batch = 0
471
    
472
    def get_batch(self):
473
        """Get next batch of data."""
474
        start_idx = self.current_batch * self.batch_size
475
        end_idx = start_idx + self.batch_size
476
        
477
        batch_data = self.data_source[start_idx:end_idx]
478
        self.current_batch += 1
479
        
480
        return batch_data
481
    
482
    def reset(self):
483
        """Reset batch counter."""
484
        self.current_batch = 0
485

486
# Use with PyMC for streaming inference
487
data_loader = DataLoader(large_dataset, batch_size=256)
488

489
with pm.Model() as streaming_model:
490
    # Initialize with first batch
491
    initial_batch = data_loader.get_batch()
492
    X_mb = pm.Minibatch('X_mb', initial_batch['X'], batch_size=256)
493
    y_mb = pm.Minibatch('y_mb', initial_batch['y'], batch_size=256)
494
    
495
    # Model definition...
496
    
497
    # Stream through data for variational inference
498
    for epoch in range(10):
499
        data_loader.reset()
500
        
501
        # Process all batches in epoch
502
        while True:
503
            try:
504
                batch = data_loader.get_batch()
505
                pm.set_data({'X_mb': batch['X'], 'y_mb': batch['y']})
506
                
507
                # Update variational approximation
508
                approx = pm.fit(n=1000, method='advi')
509
                
510
            except IndexError:  # End of data
511
                break
512
```
513

514
### Custom Backend Implementation
515

516
```python { .api }
517
class CustomBackend(pm.backends.base.BaseTrace):
518
    """Custom trace backend for specialized storage needs."""
519
    
520
    def __init__(self, custom_storage, vars=None, model=None):
521
        self.custom_storage = custom_storage
522
        super().__init__(vars=vars, model=model)
523
    
524
    def setup(self, draws, chain, sampler_vars=None):
525
        """Initialize custom storage."""
526
        self.chain = chain
527
        self.draws = draws
528
        self.draw_idx = 0
529
        
530
        # Initialize storage structures
531
        self.custom_storage.init_chain(chain, draws, self.varnames)
532
    
533
    def record(self, point, sampler_stats=None):
534
        """Record sample to custom storage."""
535
        self.custom_storage.store_sample(
536
            self.chain, 
537
            self.draw_idx, 
538
            point, 
539
            sampler_stats
540
        )
541
        self.draw_idx += 1
542
    
543
    def close(self):
544
        """Finalize custom storage."""
545
        self.custom_storage.finalize_chain(self.chain)
546
    
547
    def __getitem__(self, key):
548
        """Access stored samples."""
549
        return self.custom_storage.get_samples(self.chain, key)
550

551
# Use custom backend
552
# custom_storage = YourCustomStorage()  # Implement your storage system
553
# trace = pm.sample(1000, trace=CustomBackend(custom_storage))
554
```
555

556
### Memory Management
557

558
```python { .api }
559
# Memory-efficient sampling for large models
560
with pm.Model() as large_model:
561
    # Very large parameter space
562
    theta = pm.Normal('theta', 0, 1, shape=10000)
563
    
564
    # Use memory-mapped storage
565
    with tempfile.TemporaryDirectory() as tmp_dir:
566
        zarr_path = os.path.join(tmp_dir, 'large_trace.zarr')
567
        
568
        # Sample with disk storage
569
        trace = pm.sample(
570
            draws=5000,
571
            trace=pm.backends.ZarrTrace(zarr_path),
572
            compute_convergence_checks=False,  # Save memory
573
            progressbar=True
574
        )
575
        
576
        # Process trace in chunks to avoid memory overflow
577
        chunk_size = 1000
578
        for i in range(0, 5000, chunk_size):
579
            chunk = trace.posterior['theta'][..., i:i+chunk_size, :]
580
            # Process chunk...
581
            processed_chunk = chunk.mean(dim='draw')
582
            print(f"Processed chunk {i//chunk_size + 1}")
583
```
584

585
PyMC's data handling system provides flexible and efficient tools for managing datasets of any size while maintaining seamless integration with the broader PyMC ecosystem and external data sources.

Version

Tile

Files

data.md.css-3qkkll{font-size:var(--chakra-font-sizes-sm);font-weight:var(--chakra-font-weights-normal);color:var(--chakra-colors-gray-300);}docs/

data.mddocs/