0
# Statistics
1
2
Statistical tools specifically designed for astronomical data analysis, including sigma clipping, robust statistics, and confidence intervals.
3
4
## Capabilities
5
6
### Sigma Clipping and Outlier Rejection
7
8
Iterative sigma clipping for robust statistical analysis in the presence of outliers.
9
10
```python { .api }
11
def sigma_clip(data, sigma=3, sigma_lower=None, sigma_upper=None, maxiters=5,
12
cenfunc='median', stdfunc='std', axis=None, masked=True,
13
return_bounds=False, copy=True, invalid='mask'):
14
"""
15
Perform sigma clipping on data.
16
17
Parameters:
18
- data: input data array
19
- sigma: number of standard deviations for clipping
20
- sigma_lower: lower sigma threshold (default: sigma)
21
- sigma_upper: upper sigma threshold (default: sigma)
22
- maxiters: maximum number of clipping iterations
23
- cenfunc: function for computing center ('median', 'mean', or callable)
24
- stdfunc: function for computing spread ('std', 'mad_std', or callable)
25
- axis: axis along which to clip
26
- masked: return masked array
27
- return_bounds: also return clipping bounds
28
- copy: copy input data
29
- invalid: how to handle invalid values
30
31
Returns:
32
MaskedArray or tuple: clipped data, optionally with bounds
33
"""
34
35
class SigmaClip:
36
"""
37
Sigma clipping class for reusable clipping operations.
38
39
Parameters:
40
- sigma: sigma threshold
41
- sigma_lower: lower sigma threshold
42
- sigma_upper: upper sigma threshold
43
- maxiters: maximum iterations
44
- cenfunc: center function
45
- stdfunc: spread function
46
"""
47
def __init__(self, sigma=3, sigma_lower=None, sigma_upper=None, maxiters=5,
48
cenfunc='median', stdfunc='std'): ...
49
50
def __call__(self, data, axis=None, masked=True, return_bounds=False, copy=True):
51
"""Apply sigma clipping to data."""
52
```
53
54
### Robust Statistics
55
56
Robust statistical estimators that are resistant to outliers and non-Gaussian distributions.
57
58
```python { .api }
59
def mad_std(data, axis=None, func=None, ignore_nan=False):
60
"""
61
Calculate median absolute deviation scaled to standard deviation.
62
63
Parameters:
64
- data: input data
65
- axis: axis along which to compute
66
- func: function to apply before MAD calculation
67
- ignore_nan: ignore NaN values
68
69
Returns:
70
ndarray or scalar: MAD-based standard deviation estimate
71
"""
72
73
def biweight_location(data, c=6.0, M=None, axis=None, modify_sample_size=False):
74
"""
75
Compute biweight location (robust estimate of central tendency).
76
77
Parameters:
78
- data: input data
79
- c: tuning constant
80
- M: initial estimate of location (default: median)
81
- axis: axis along which to compute
82
- modify_sample_size: modify sample size for small samples
83
84
Returns:
85
ndarray or scalar: biweight location estimate
86
"""
87
88
def biweight_scale(data, c=9.0, M=None, axis=None, modify_sample_size=False):
89
"""
90
Compute biweight scale (robust estimate of scale).
91
92
Parameters:
93
- data: input data
94
- c: tuning constant
95
- M: location estimate (default: median)
96
- axis: axis along which to compute
97
- modify_sample_size: modify sample size for small samples
98
99
Returns:
100
ndarray or scalar: biweight scale estimate
101
"""
102
103
def median_absolute_deviation(data, axis=None, func=None, ignore_nan=False):
104
"""
105
Calculate median absolute deviation.
106
107
Parameters:
108
- data: input data
109
- axis: axis along which to compute
110
- func: function to apply before MAD calculation
111
- ignore_nan: ignore NaN values
112
113
Returns:
114
ndarray or scalar: median absolute deviation
115
"""
116
```
117
118
### Bootstrap and Jackknife Resampling
119
120
Statistical resampling methods for uncertainty estimation and hypothesis testing.
121
122
```python { .api }
123
def bootstrap(data, bootnum=100, samples=None, bootfunc=None):
124
"""
125
Perform bootstrap resampling.
126
127
Parameters:
128
- data: input data array
129
- bootnum: number of bootstrap samples
130
- samples: sample size for each bootstrap (default: len(data))
131
- bootfunc: function to apply to each bootstrap sample
132
133
Returns:
134
ndarray: bootstrap results
135
"""
136
137
def jackknife_resampling(data):
138
"""
139
Perform jackknife resampling (leave-one-out).
140
141
Parameters:
142
- data: input data array
143
144
Returns:
145
ndarray: jackknife resampled data
146
"""
147
148
def jackknife_stats(data, statistic, confidence_level=0.95):
149
"""
150
Compute jackknife statistics and confidence intervals.
151
152
Parameters:
153
- data: input data
154
- statistic: function to compute statistic
155
- confidence_level: confidence level for intervals
156
157
Returns:
158
tuple: (statistic, standard error, confidence interval)
159
"""
160
161
class JackknifeStat:
162
"""
163
Jackknife statistics calculator.
164
165
Parameters:
166
- statistic: function to compute statistic
167
- confidence_level: confidence level
168
"""
169
def __init__(self, statistic, confidence_level=0.95): ...
170
171
def __call__(self, data):
172
"""Compute jackknife statistics."""
173
```
174
175
### Circular Statistics
176
177
Statistical functions for circular data (angles, phases, directions).
178
179
```python { .api }
180
def circmean(data, high=2*np.pi, low=0, axis=None, nan_policy='propagate'):
181
"""
182
Compute circular mean of angular data.
183
184
Parameters:
185
- data: angular data
186
- high: upper bound of angular range
187
- low: lower bound of angular range
188
- axis: axis along which to compute
189
- nan_policy: how to handle NaN values
190
191
Returns:
192
ndarray or scalar: circular mean
193
"""
194
195
def circstd(data, high=2*np.pi, low=0, axis=None, nan_policy='propagate'):
196
"""
197
Compute circular standard deviation.
198
199
Parameters:
200
- data: angular data
201
- high: upper bound of angular range
202
- low: lower bound of angular range
203
- axis: axis along which to compute
204
- nan_policy: how to handle NaN values
205
206
Returns:
207
ndarray or scalar: circular standard deviation
208
"""
209
210
def circvar(data, high=2*np.pi, low=0, axis=None, nan_policy='propagate'):
211
"""
212
Compute circular variance.
213
214
Parameters:
215
- data: angular data
216
- high: upper bound of angular range
217
- low: lower bound of angular range
218
- axis: axis along which to compute
219
- nan_policy: how to handle NaN values
220
221
Returns:
222
ndarray or scalar: circular variance
223
"""
224
```
225
226
### Confidence Intervals
227
228
Statistical confidence interval calculations for various distributions commonly used in astronomy.
229
230
```python { .api }
231
def poisson_conf_interval(n, interval='root-n', confidence_level=0.6826894921370859,
232
background=0, conflevel=None):
233
"""
234
Confidence interval for Poisson-distributed data.
235
236
Parameters:
237
- n: observed counts
238
- interval: method ('root-n', 'root-n-0', 'pearson', 'sherpagehrels', 'kraft-burrows-nousek')
239
- confidence_level: confidence level
240
- background: background counts
241
- conflevel: deprecated parameter
242
243
Returns:
244
tuple: (lower_bound, upper_bound)
245
"""
246
247
def binom_conf_interval(k, n, confidence_level=0.6826894921370859, interval='wilson'):
248
"""
249
Confidence interval for binomial distribution.
250
251
Parameters:
252
- k: number of successes
253
- n: number of trials
254
- confidence_level: confidence level
255
- interval: method ('wilson', 'jeffreys', 'flat', 'wald')
256
257
Returns:
258
tuple: (lower_bound, upper_bound)
259
"""
260
261
def median_conf_interval(data, confidence_level=0.6826894921370859, axis=None):
262
"""
263
Confidence interval for median.
264
265
Parameters:
266
- data: input data
267
- confidence_level: confidence level
268
- axis: axis along which to compute
269
270
Returns:
271
tuple: (lower_bound, upper_bound)
272
"""
273
```
274
275
### Histogram and Density Estimation
276
277
Advanced histogram functions with automatic binning and density estimation.
278
279
```python { .api }
280
def histogram(a, bins='blocks', range=None, **kwargs):
281
"""
282
Enhanced histogram with automatic binning algorithms.
283
284
Parameters:
285
- a: input data
286
- bins: binning method ('blocks', 'knuth', 'scott', 'freedman', int, or array)
287
- range: range of histogram
288
- **kwargs: additional arguments
289
290
Returns:
291
tuple: (counts, bin_edges)
292
"""
293
294
def knuth_bin_width(data, return_bins=False, quiet=True):
295
"""
296
Optimal histogram bin width using Knuth's rule.
297
298
Parameters:
299
- data: input data
300
- return_bins: return bin edges instead of width
301
- quiet: suppress output
302
303
Returns:
304
float or ndarray: bin width or bin edges
305
"""
306
307
def scott_bin_width(data, return_bins=False):
308
"""
309
Optimal histogram bin width using Scott's rule.
310
311
Parameters:
312
- data: input data
313
- return_bins: return bin edges instead of width
314
315
Returns:
316
float or ndarray: bin width or bin edges
317
"""
318
319
def freedman_bin_width(data, return_bins=False):
320
"""
321
Optimal histogram bin width using Freedman-Diaconis rule.
322
323
Parameters:
324
- data: input data
325
- return_bins: return bin edges instead of width
326
327
Returns:
328
float or ndarray: bin width or bin edges
329
"""
330
331
def bayesian_blocks(t, x=None, sigma=None, fitness='events', **kwargs):
332
"""
333
Bayesian blocks algorithm for optimal binning.
334
335
Parameters:
336
- t: data values or time stamps
337
- x: data values (if t represents time)
338
- sigma: error on data values
339
- fitness: fitness function ('events', 'regular_events', 'measures')
340
341
Returns:
342
ndarray: optimal bin edges
343
"""
344
```
345
346
## Usage Examples
347
348
### Sigma Clipping for Outlier Rejection
349
350
```python
351
from astropy.stats import sigma_clip, SigmaClip
352
import numpy as np
353
354
# Generate data with outliers
355
np.random.seed(42)
356
data = np.random.normal(100, 15, 100)
357
data[::10] = 200 # Add outliers
358
359
# Basic sigma clipping
360
clipped_data = sigma_clip(data, sigma=3, maxiters=5)
361
print(f"Original data: {len(data)} points")
362
print(f"After clipping: {len(clipped_data.compressed())} points")
363
print(f"Mean before: {np.mean(data):.2f}")
364
print(f"Mean after: {np.mean(clipped_data.compressed()):.2f}")
365
366
# Reusable sigma clipper
367
clipper = SigmaClip(sigma=2.5, maxiters=10)
368
clipped_data2 = clipper(data)
369
```
370
371
### Robust Statistics
372
373
```python
374
from astropy.stats import mad_std, biweight_location, biweight_scale
375
376
# Robust statistics on data with outliers
377
robust_center = biweight_location(data)
378
robust_scale = biweight_scale(data)
379
mad_scale = mad_std(data)
380
381
print(f"Standard mean: {np.mean(data):.2f}")
382
print(f"Biweight location: {robust_center:.2f}")
383
print(f"Standard std: {np.std(data):.2f}")
384
print(f"Biweight scale: {robust_scale:.2f}")
385
print(f"MAD std: {mad_scale:.2f}")
386
```
387
388
### Bootstrap Uncertainty Estimation
389
390
```python
391
from astropy.stats import bootstrap
392
393
# Define statistic of interest
394
def ratio_stat(data):
395
return np.mean(data) / np.std(data)
396
397
# Original statistic
398
original_stat = ratio_stat(data)
399
400
# Bootstrap to estimate uncertainty
401
bootstrap_results = bootstrap(data, bootnum=1000, bootfunc=ratio_stat)
402
bootstrap_std = np.std(bootstrap_results)
403
confidence_interval = np.percentile(bootstrap_results, [16, 84])
404
405
print(f"Original statistic: {original_stat:.3f}")
406
print(f"Bootstrap uncertainty: ±{bootstrap_std:.3f}")
407
print(f"68% confidence interval: [{confidence_interval[0]:.3f}, {confidence_interval[1]:.3f}]")
408
```
409
410
### Circular Statistics
411
412
```python
413
from astropy.stats import circmean, circstd
414
import astropy.units as u
415
416
# Angular data in radians
417
angles = np.array([0.1, 0.3, 6.2, 6.1, 0.2, 0.4]) # Mix of small and ~2π angles
418
419
# Circular statistics
420
circular_mean = circmean(angles)
421
circular_std = circstd(angles)
422
423
print(f"Linear mean: {np.mean(angles):.3f} rad")
424
print(f"Circular mean: {circular_mean:.3f} rad")
425
print(f"Linear std: {np.std(angles):.3f} rad")
426
print(f"Circular std: {circular_std:.3f} rad")
427
428
# Convert to degrees
429
print(f"Circular mean: {circular_mean * 180/np.pi:.1f} degrees")
430
```
431
432
### Confidence Intervals
433
434
```python
435
from astropy.stats import poisson_conf_interval, binom_conf_interval
436
437
# Poisson confidence intervals (common in photon counting)
438
observed_counts = 25
439
lower, upper = poisson_conf_interval(observed_counts, interval='root-n')
440
print(f"Observed counts: {observed_counts}")
441
print(f"Root-n interval: [{lower:.1f}, {upper:.1f}]")
442
443
# More sophisticated Poisson interval
444
lower2, upper2 = poisson_conf_interval(observed_counts, interval='kraft-burrows-nousek')
445
print(f"Kraft-Burrows-Nousek interval: [{lower2:.1f}, {upper2:.1f}]")
446
447
# Binomial confidence intervals
448
successes = 15
449
trials = 20
450
lower_b, upper_b = binom_conf_interval(successes, trials, interval='wilson')
451
success_rate = successes / trials
452
print(f"Success rate: {success_rate:.2f}")
453
print(f"Wilson interval: [{lower_b:.3f}, {upper_b:.3f}]")
454
```
455
456
### Optimal Histogram Binning
457
458
```python
459
from astropy.stats import histogram, knuth_bin_width, bayesian_blocks
460
461
# Generate astronomical data (e.g., stellar magnitudes)
462
magnitudes = np.random.normal(18, 2, 1000)
463
464
# Compare different binning methods
465
bins_knuth = knuth_bin_width(magnitudes, return_bins=True)
466
bins_blocks = bayesian_blocks(magnitudes)
467
468
# Create histograms
469
counts_auto, bins_auto = histogram(magnitudes, bins='blocks')
470
counts_knuth = np.histogram(magnitudes, bins=bins_knuth)[0]
471
counts_blocks = np.histogram(magnitudes, bins=bins_blocks)[0]
472
473
print(f"Automatic bins: {len(bins_auto)-1}")
474
print(f"Knuth bins: {len(bins_knuth)-1}")
475
print(f"Bayesian blocks: {len(bins_blocks)-1}")
476
```
477
478
### Combining Multiple Statistical Methods
479
480
```python
481
# Comprehensive statistical analysis pipeline
482
def analyze_data(data, name="Dataset"):
483
\"\"\"Comprehensive statistical analysis.\"\"\"
484
print(f"\\n=== Analysis of {name} ===")
485
486
# Basic statistics
487
print(f"Sample size: {len(data)}")
488
print(f"Mean: {np.mean(data):.3f}")
489
print(f"Median: {np.median(data):.3f}")
490
print(f"Std dev: {np.std(data):.3f}")
491
492
# Robust statistics
493
robust_center = biweight_location(data)
494
robust_scale = biweight_scale(data)
495
print(f"Biweight location: {robust_center:.3f}")
496
print(f"Biweight scale: {robust_scale:.3f}")
497
498
# Sigma clipping
499
clipped = sigma_clip(data, sigma=3)
500
outlier_fraction = 1 - len(clipped.compressed()) / len(data)
501
print(f"Outlier fraction (3σ): {outlier_fraction:.1%}")
502
503
# Bootstrap uncertainty on mean
504
bootstrap_means = bootstrap(data, bootnum=1000, bootfunc=np.mean)
505
bootstrap_uncertainty = np.std(bootstrap_means)
506
print(f"Bootstrap uncertainty on mean: ±{bootstrap_uncertainty:.3f}")
507
508
# Apply to different datasets
509
clean_data = np.random.normal(10, 2, 100)
510
contaminated_data = np.concatenate([clean_data, [50, -20, 30]])
511
512
analyze_data(clean_data, "Clean data")
513
analyze_data(contaminated_data, "Contaminated data")
514
```