or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.mdconstants.mdconvolution.mdcoordinates.mdcosmology.mdfits-io.mdindex.mdmodeling.mdnddata.mdsamp.mdstatistics.mdtables.mdtime.mdtimeseries.mduncertainty.mdunits-quantities.mdutils.mdvisualization.mdwcs.md

statistics.mddocs/

0

# Statistics

1

2

Statistical tools specifically designed for astronomical data analysis, including sigma clipping, robust statistics, and confidence intervals.

3

4

## Capabilities

5

6

### Sigma Clipping and Outlier Rejection

7

8

Iterative sigma clipping for robust statistical analysis in the presence of outliers.

9

10

```python { .api }

11

def sigma_clip(data, sigma=3, sigma_lower=None, sigma_upper=None, maxiters=5,

12

cenfunc='median', stdfunc='std', axis=None, masked=True,

13

return_bounds=False, copy=True, invalid='mask'):

14

"""

15

Perform sigma clipping on data.

16

17

Parameters:

18

- data: input data array

19

- sigma: number of standard deviations for clipping

20

- sigma_lower: lower sigma threshold (default: sigma)

21

- sigma_upper: upper sigma threshold (default: sigma)

22

- maxiters: maximum number of clipping iterations

23

- cenfunc: function for computing center ('median', 'mean', or callable)

24

- stdfunc: function for computing spread ('std', 'mad_std', or callable)

25

- axis: axis along which to clip

26

- masked: return masked array

27

- return_bounds: also return clipping bounds

28

- copy: copy input data

29

- invalid: how to handle invalid values

30

31

Returns:

32

MaskedArray or tuple: clipped data, optionally with bounds

33

"""

34

35

class SigmaClip:

36

"""

37

Sigma clipping class for reusable clipping operations.

38

39

Parameters:

40

- sigma: sigma threshold

41

- sigma_lower: lower sigma threshold

42

- sigma_upper: upper sigma threshold

43

- maxiters: maximum iterations

44

- cenfunc: center function

45

- stdfunc: spread function

46

"""

47

def __init__(self, sigma=3, sigma_lower=None, sigma_upper=None, maxiters=5,

48

cenfunc='median', stdfunc='std'): ...

49

50

def __call__(self, data, axis=None, masked=True, return_bounds=False, copy=True):

51

"""Apply sigma clipping to data."""

52

```

53

54

### Robust Statistics

55

56

Robust statistical estimators that are resistant to outliers and non-Gaussian distributions.

57

58

```python { .api }

59

def mad_std(data, axis=None, func=None, ignore_nan=False):

60

"""

61

Calculate median absolute deviation scaled to standard deviation.

62

63

Parameters:

64

- data: input data

65

- axis: axis along which to compute

66

- func: function to apply before MAD calculation

67

- ignore_nan: ignore NaN values

68

69

Returns:

70

ndarray or scalar: MAD-based standard deviation estimate

71

"""

72

73

def biweight_location(data, c=6.0, M=None, axis=None, modify_sample_size=False):

74

"""

75

Compute biweight location (robust estimate of central tendency).

76

77

Parameters:

78

- data: input data

79

- c: tuning constant

80

- M: initial estimate of location (default: median)

81

- axis: axis along which to compute

82

- modify_sample_size: modify sample size for small samples

83

84

Returns:

85

ndarray or scalar: biweight location estimate

86

"""

87

88

def biweight_scale(data, c=9.0, M=None, axis=None, modify_sample_size=False):

89

"""

90

Compute biweight scale (robust estimate of scale).

91

92

Parameters:

93

- data: input data

94

- c: tuning constant

95

- M: location estimate (default: median)

96

- axis: axis along which to compute

97

- modify_sample_size: modify sample size for small samples

98

99

Returns:

100

ndarray or scalar: biweight scale estimate

101

"""

102

103

def median_absolute_deviation(data, axis=None, func=None, ignore_nan=False):

104

"""

105

Calculate median absolute deviation.

106

107

Parameters:

108

- data: input data

109

- axis: axis along which to compute

110

- func: function to apply before MAD calculation

111

- ignore_nan: ignore NaN values

112

113

Returns:

114

ndarray or scalar: median absolute deviation

115

"""

116

```

117

118

### Bootstrap and Jackknife Resampling

119

120

Statistical resampling methods for uncertainty estimation and hypothesis testing.

121

122

```python { .api }

123

def bootstrap(data, bootnum=100, samples=None, bootfunc=None):

124

"""

125

Perform bootstrap resampling.

126

127

Parameters:

128

- data: input data array

129

- bootnum: number of bootstrap samples

130

- samples: sample size for each bootstrap (default: len(data))

131

- bootfunc: function to apply to each bootstrap sample

132

133

Returns:

134

ndarray: bootstrap results

135

"""

136

137

def jackknife_resampling(data):

138

"""

139

Perform jackknife resampling (leave-one-out).

140

141

Parameters:

142

- data: input data array

143

144

Returns:

145

ndarray: jackknife resampled data

146

"""

147

148

def jackknife_stats(data, statistic, confidence_level=0.95):

149

"""

150

Compute jackknife statistics and confidence intervals.

151

152

Parameters:

153

- data: input data

154

- statistic: function to compute statistic

155

- confidence_level: confidence level for intervals

156

157

Returns:

158

tuple: (statistic, standard error, confidence interval)

159

"""

160

161

class JackknifeStat:

162

"""

163

Jackknife statistics calculator.

164

165

Parameters:

166

- statistic: function to compute statistic

167

- confidence_level: confidence level

168

"""

169

def __init__(self, statistic, confidence_level=0.95): ...

170

171

def __call__(self, data):

172

"""Compute jackknife statistics."""

173

```

174

175

### Circular Statistics

176

177

Statistical functions for circular data (angles, phases, directions).

178

179

```python { .api }

180

def circmean(data, high=2*np.pi, low=0, axis=None, nan_policy='propagate'):

181

"""

182

Compute circular mean of angular data.

183

184

Parameters:

185

- data: angular data

186

- high: upper bound of angular range

187

- low: lower bound of angular range

188

- axis: axis along which to compute

189

- nan_policy: how to handle NaN values

190

191

Returns:

192

ndarray or scalar: circular mean

193

"""

194

195

def circstd(data, high=2*np.pi, low=0, axis=None, nan_policy='propagate'):

196

"""

197

Compute circular standard deviation.

198

199

Parameters:

200

- data: angular data

201

- high: upper bound of angular range

202

- low: lower bound of angular range

203

- axis: axis along which to compute

204

- nan_policy: how to handle NaN values

205

206

Returns:

207

ndarray or scalar: circular standard deviation

208

"""

209

210

def circvar(data, high=2*np.pi, low=0, axis=None, nan_policy='propagate'):

211

"""

212

Compute circular variance.

213

214

Parameters:

215

- data: angular data

216

- high: upper bound of angular range

217

- low: lower bound of angular range

218

- axis: axis along which to compute

219

- nan_policy: how to handle NaN values

220

221

Returns:

222

ndarray or scalar: circular variance

223

"""

224

```

225

226

### Confidence Intervals

227

228

Statistical confidence interval calculations for various distributions commonly used in astronomy.

229

230

```python { .api }

231

def poisson_conf_interval(n, interval='root-n', confidence_level=0.6826894921370859,

232

background=0, conflevel=None):

233

"""

234

Confidence interval for Poisson-distributed data.

235

236

Parameters:

237

- n: observed counts

238

- interval: method ('root-n', 'root-n-0', 'pearson', 'sherpagehrels', 'kraft-burrows-nousek')

239

- confidence_level: confidence level

240

- background: background counts

241

- conflevel: deprecated parameter

242

243

Returns:

244

tuple: (lower_bound, upper_bound)

245

"""

246

247

def binom_conf_interval(k, n, confidence_level=0.6826894921370859, interval='wilson'):

248

"""

249

Confidence interval for binomial distribution.

250

251

Parameters:

252

- k: number of successes

253

- n: number of trials

254

- confidence_level: confidence level

255

- interval: method ('wilson', 'jeffreys', 'flat', 'wald')

256

257

Returns:

258

tuple: (lower_bound, upper_bound)

259

"""

260

261

def median_conf_interval(data, confidence_level=0.6826894921370859, axis=None):

262

"""

263

Confidence interval for median.

264

265

Parameters:

266

- data: input data

267

- confidence_level: confidence level

268

- axis: axis along which to compute

269

270

Returns:

271

tuple: (lower_bound, upper_bound)

272

"""

273

```

274

275

### Histogram and Density Estimation

276

277

Advanced histogram functions with automatic binning and density estimation.

278

279

```python { .api }

280

def histogram(a, bins='blocks', range=None, **kwargs):

281

"""

282

Enhanced histogram with automatic binning algorithms.

283

284

Parameters:

285

- a: input data

286

- bins: binning method ('blocks', 'knuth', 'scott', 'freedman', int, or array)

287

- range: range of histogram

288

- **kwargs: additional arguments

289

290

Returns:

291

tuple: (counts, bin_edges)

292

"""

293

294

def knuth_bin_width(data, return_bins=False, quiet=True):

295

"""

296

Optimal histogram bin width using Knuth's rule.

297

298

Parameters:

299

- data: input data

300

- return_bins: return bin edges instead of width

301

- quiet: suppress output

302

303

Returns:

304

float or ndarray: bin width or bin edges

305

"""

306

307

def scott_bin_width(data, return_bins=False):

308

"""

309

Optimal histogram bin width using Scott's rule.

310

311

Parameters:

312

- data: input data

313

- return_bins: return bin edges instead of width

314

315

Returns:

316

float or ndarray: bin width or bin edges

317

"""

318

319

def freedman_bin_width(data, return_bins=False):

320

"""

321

Optimal histogram bin width using Freedman-Diaconis rule.

322

323

Parameters:

324

- data: input data

325

- return_bins: return bin edges instead of width

326

327

Returns:

328

float or ndarray: bin width or bin edges

329

"""

330

331

def bayesian_blocks(t, x=None, sigma=None, fitness='events', **kwargs):

332

"""

333

Bayesian blocks algorithm for optimal binning.

334

335

Parameters:

336

- t: data values or time stamps

337

- x: data values (if t represents time)

338

- sigma: error on data values

339

- fitness: fitness function ('events', 'regular_events', 'measures')

340

341

Returns:

342

ndarray: optimal bin edges

343

"""

344

```

345

346

## Usage Examples

347

348

### Sigma Clipping for Outlier Rejection

349

350

```python

351

from astropy.stats import sigma_clip, SigmaClip

352

import numpy as np

353

354

# Generate data with outliers

355

np.random.seed(42)

356

data = np.random.normal(100, 15, 100)

357

data[::10] = 200 # Add outliers

358

359

# Basic sigma clipping

360

clipped_data = sigma_clip(data, sigma=3, maxiters=5)

361

print(f"Original data: {len(data)} points")

362

print(f"After clipping: {len(clipped_data.compressed())} points")

363

print(f"Mean before: {np.mean(data):.2f}")

364

print(f"Mean after: {np.mean(clipped_data.compressed()):.2f}")

365

366

# Reusable sigma clipper

367

clipper = SigmaClip(sigma=2.5, maxiters=10)

368

clipped_data2 = clipper(data)

369

```

370

371

### Robust Statistics

372

373

```python

374

from astropy.stats import mad_std, biweight_location, biweight_scale

375

376

# Robust statistics on data with outliers

377

robust_center = biweight_location(data)

378

robust_scale = biweight_scale(data)

379

mad_scale = mad_std(data)

380

381

print(f"Standard mean: {np.mean(data):.2f}")

382

print(f"Biweight location: {robust_center:.2f}")

383

print(f"Standard std: {np.std(data):.2f}")

384

print(f"Biweight scale: {robust_scale:.2f}")

385

print(f"MAD std: {mad_scale:.2f}")

386

```

387

388

### Bootstrap Uncertainty Estimation

389

390

```python

391

from astropy.stats import bootstrap

392

393

# Define statistic of interest

394

def ratio_stat(data):

395

return np.mean(data) / np.std(data)

396

397

# Original statistic

398

original_stat = ratio_stat(data)

399

400

# Bootstrap to estimate uncertainty

401

bootstrap_results = bootstrap(data, bootnum=1000, bootfunc=ratio_stat)

402

bootstrap_std = np.std(bootstrap_results)

403

confidence_interval = np.percentile(bootstrap_results, [16, 84])

404

405

print(f"Original statistic: {original_stat:.3f}")

406

print(f"Bootstrap uncertainty: ±{bootstrap_std:.3f}")

407

print(f"68% confidence interval: [{confidence_interval[0]:.3f}, {confidence_interval[1]:.3f}]")

408

```

409

410

### Circular Statistics

411

412

```python

413

from astropy.stats import circmean, circstd

414

import astropy.units as u

415

416

# Angular data in radians

417

angles = np.array([0.1, 0.3, 6.2, 6.1, 0.2, 0.4]) # Mix of small and ~2π angles

418

419

# Circular statistics

420

circular_mean = circmean(angles)

421

circular_std = circstd(angles)

422

423

print(f"Linear mean: {np.mean(angles):.3f} rad")

424

print(f"Circular mean: {circular_mean:.3f} rad")

425

print(f"Linear std: {np.std(angles):.3f} rad")

426

print(f"Circular std: {circular_std:.3f} rad")

427

428

# Convert to degrees

429

print(f"Circular mean: {circular_mean * 180/np.pi:.1f} degrees")

430

```

431

432

### Confidence Intervals

433

434

```python

435

from astropy.stats import poisson_conf_interval, binom_conf_interval

436

437

# Poisson confidence intervals (common in photon counting)

438

observed_counts = 25

439

lower, upper = poisson_conf_interval(observed_counts, interval='root-n')

440

print(f"Observed counts: {observed_counts}")

441

print(f"Root-n interval: [{lower:.1f}, {upper:.1f}]")

442

443

# More sophisticated Poisson interval

444

lower2, upper2 = poisson_conf_interval(observed_counts, interval='kraft-burrows-nousek')

445

print(f"Kraft-Burrows-Nousek interval: [{lower2:.1f}, {upper2:.1f}]")

446

447

# Binomial confidence intervals

448

successes = 15

449

trials = 20

450

lower_b, upper_b = binom_conf_interval(successes, trials, interval='wilson')

451

success_rate = successes / trials

452

print(f"Success rate: {success_rate:.2f}")

453

print(f"Wilson interval: [{lower_b:.3f}, {upper_b:.3f}]")

454

```

455

456

### Optimal Histogram Binning

457

458

```python

459

from astropy.stats import histogram, knuth_bin_width, bayesian_blocks

460

461

# Generate astronomical data (e.g., stellar magnitudes)

462

magnitudes = np.random.normal(18, 2, 1000)

463

464

# Compare different binning methods

465

bins_knuth = knuth_bin_width(magnitudes, return_bins=True)

466

bins_blocks = bayesian_blocks(magnitudes)

467

468

# Create histograms

469

counts_auto, bins_auto = histogram(magnitudes, bins='blocks')

470

counts_knuth = np.histogram(magnitudes, bins=bins_knuth)[0]

471

counts_blocks = np.histogram(magnitudes, bins=bins_blocks)[0]

472

473

print(f"Automatic bins: {len(bins_auto)-1}")

474

print(f"Knuth bins: {len(bins_knuth)-1}")

475

print(f"Bayesian blocks: {len(bins_blocks)-1}")

476

```

477

478

### Combining Multiple Statistical Methods

479

480

```python

481

# Comprehensive statistical analysis pipeline

482

def analyze_data(data, name="Dataset"):

483

\"\"\"Comprehensive statistical analysis.\"\"\"

484

print(f"\\n=== Analysis of {name} ===")

485

486

# Basic statistics

487

print(f"Sample size: {len(data)}")

488

print(f"Mean: {np.mean(data):.3f}")

489

print(f"Median: {np.median(data):.3f}")

490

print(f"Std dev: {np.std(data):.3f}")

491

492

# Robust statistics

493

robust_center = biweight_location(data)

494

robust_scale = biweight_scale(data)

495

print(f"Biweight location: {robust_center:.3f}")

496

print(f"Biweight scale: {robust_scale:.3f}")

497

498

# Sigma clipping

499

clipped = sigma_clip(data, sigma=3)

500

outlier_fraction = 1 - len(clipped.compressed()) / len(data)

501

print(f"Outlier fraction (3σ): {outlier_fraction:.1%}")

502

503

# Bootstrap uncertainty on mean

504

bootstrap_means = bootstrap(data, bootnum=1000, bootfunc=np.mean)

505

bootstrap_uncertainty = np.std(bootstrap_means)

506

print(f"Bootstrap uncertainty on mean: ±{bootstrap_uncertainty:.3f}")

507

508

# Apply to different datasets

509

clean_data = np.random.normal(10, 2, 100)

510

contaminated_data = np.concatenate([clean_data, [50, -20, 30]])

511

512

analyze_data(clean_data, "Clean data")

513

analyze_data(contaminated_data, "Contaminated data")

514

```