Astronomy and astrophysics core library providing comprehensive tools for astronomical computations and data handling
—
Quality: pending — best-practices conformance has not yet been reviewed.
Impact: pending — no evaluation scenarios have been run.
Statistical tools specifically designed for astronomical data analysis, including sigma clipping, robust statistics, and confidence intervals.
Iterative sigma clipping for robust statistical analysis in the presence of outliers.
def sigma_clip(data, sigma=3, sigma_lower=None, sigma_upper=None, maxiters=5,
               cenfunc='median', stdfunc='std', axis=None, masked=True,
               return_bounds=False, copy=True, invalid='mask'):
    """Iteratively clip values that deviate from a robust center estimate.

    Parameters
    ----------
    data : array_like
        Input data to clip.
    sigma : float
        Clipping threshold in units of the spread estimate; applies to
        both sides unless overridden below.
    sigma_lower, sigma_upper : float or None
        Per-side thresholds; each defaults to ``sigma`` when None.
    maxiters : int
        Maximum number of clipping iterations.
    cenfunc : {'median', 'mean'} or callable
        Estimator of the central value.
    stdfunc : {'std', 'mad_std'} or callable
        Estimator of the spread.
    axis : int or None
        Axis along which to clip; None clips the flattened array.
    masked : bool
        If True, return a masked array.
    return_bounds : bool
        If True, also return the final clipping bounds.
    copy : bool
        If True, operate on a copy of ``data``.
    invalid : str
        Policy for invalid (non-finite) values, e.g. ``'mask'``.

    Returns
    -------
    MaskedArray or tuple
        Clipped data, optionally together with the clipping bounds.
    """
class SigmaClip:
    """Reusable, configurable sigma-clipping operation.

    Construct once with the clipping configuration, then call the
    instance on any number of data arrays.

    Parameters
    ----------
    sigma : float
        Clipping threshold, applied to both sides unless overridden.
    sigma_lower, sigma_upper : float or None
        Per-side thresholds; each defaults to ``sigma`` when None.
    maxiters : int
        Maximum number of clipping iterations.
    cenfunc : {'median', 'mean'} or callable
        Estimator of the central value.
    stdfunc : {'std', 'mad_std'} or callable
        Estimator of the spread.
    """

    def __init__(self, sigma=3, sigma_lower=None, sigma_upper=None, maxiters=5,
                 cenfunc='median', stdfunc='std'): ...

    def __call__(self, data, axis=None, masked=True, return_bounds=False, copy=True):
        """Apply sigma clipping to data."""


# Robust statistical estimators that are resistant to outliers and
# non-Gaussian distributions.
def mad_std(data, axis=None, func=None, ignore_nan=False):
    """Estimate the standard deviation via the median absolute deviation.

    Parameters
    ----------
    data : array_like
        Input data.
    axis : int or None
        Axis along which to compute; None uses the flattened array.
    func : callable or None
        Optional transform applied to the data before the MAD step.
    ignore_nan : bool
        If True, NaN values are excluded from the computation.

    Returns
    -------
    ndarray or scalar
        MAD-based estimate of the standard deviation.
    """
def biweight_location(data, c=6.0, M=None, axis=None, modify_sample_size=False):
    """Robust estimate of central tendency using the biweight statistic.

    Parameters
    ----------
    data : array_like
        Input data.
    c : float
        Tuning constant for the biweight kernel.
    M : float or array_like or None
        Initial location estimate; defaults to the median when None.
    axis : int or None
        Axis along which to compute; None uses the flattened array.
    modify_sample_size : bool
        If True, adjust the effective sample size for small samples.

    Returns
    -------
    ndarray or scalar
        Biweight location estimate.
    """
def biweight_scale(data, c=9.0, M=None, axis=None, modify_sample_size=False):
    """Robust estimate of scale (spread) using the biweight statistic.

    Parameters
    ----------
    data : array_like
        Input data.
    c : float
        Tuning constant for the biweight kernel.
    M : float or array_like or None
        Location estimate; defaults to the median when None.
    axis : int or None
        Axis along which to compute; None uses the flattened array.
    modify_sample_size : bool
        If True, adjust the effective sample size for small samples.

    Returns
    -------
    ndarray or scalar
        Biweight scale estimate.
    """
def median_absolute_deviation(data, axis=None, func=None, ignore_nan=False):
    """Compute the median absolute deviation (MAD) of the data.

    Parameters
    ----------
    data : array_like
        Input data.
    axis : int or None
        Axis along which to compute; None uses the flattened array.
    func : callable or None
        Optional transform applied to the data before the MAD step.
    ignore_nan : bool
        If True, NaN values are excluded.

    Returns
    -------
    ndarray or scalar
        Median absolute deviation.
    """


# Statistical resampling methods for uncertainty estimation and
# hypothesis testing.
def bootstrap(data, bootnum=100, samples=None, bootfunc=None):
    """Draw bootstrap resamples of the data, optionally reducing each one.

    Parameters
    ----------
    data : array_like
        Input data array to resample with replacement.
    bootnum : int
        Number of bootstrap resamples to draw.
    samples : int or None
        Size of each resample; defaults to ``len(data)`` when None.
    bootfunc : callable or None
        Statistic applied to each resample; when None the raw
        resamples themselves are returned.

    Returns
    -------
    ndarray
        Bootstrap resamples, or the statistic evaluated on each one.
    """
def jackknife_resampling(data):
    """Build all leave-one-out (jackknife) resamples of the data.

    Parameters
    ----------
    data : array_like
        Input data array of length n.

    Returns
    -------
    ndarray
        n resamples, each omitting exactly one observation.
    """
def jackknife_stats(data, statistic, confidence_level=0.95):
    """Estimate a statistic, its error, and a confidence interval by jackknife.

    Parameters
    ----------
    data : array_like
        Input data.
    statistic : callable
        Function computing the statistic of interest.
    confidence_level : float
        Confidence level for the reported interval.

    Returns
    -------
    tuple
        ``(statistic, standard_error, confidence_interval)``.
    """
class JackknifeStat:
    """Reusable jackknife statistics calculator.

    Parameters
    ----------
    statistic : callable
        Function computing the statistic of interest.
    confidence_level : float
        Confidence level for the reported interval.
    """

    def __init__(self, statistic, confidence_level=0.95): ...

    def __call__(self, data):
        """Compute jackknife statistics."""


# Statistical functions for circular data (angles, phases, directions).
def circmean(data, high=2*np.pi, low=0, axis=None, nan_policy='propagate'):
    """Compute the mean of angular data on a circular domain.

    Parameters
    ----------
    data : array_like
        Angular samples.
    high, low : float
        Upper and lower bounds of the angular range; the defaults
        cover one full turn in radians.
    axis : int or None
        Axis along which to compute; None uses the flattened array.
    nan_policy : str
        How NaN values are treated, e.g. ``'propagate'``.

    Returns
    -------
    ndarray or scalar
        Circular mean.
    """
def circstd(data, high=2*np.pi, low=0, axis=None, nan_policy='propagate'):
    """Compute the standard deviation of angular data on a circular domain.

    Parameters
    ----------
    data : array_like
        Angular samples.
    high, low : float
        Upper and lower bounds of the angular range; the defaults
        cover one full turn in radians.
    axis : int or None
        Axis along which to compute; None uses the flattened array.
    nan_policy : str
        How NaN values are treated, e.g. ``'propagate'``.

    Returns
    -------
    ndarray or scalar
        Circular standard deviation.
    """
def circvar(data, high=2*np.pi, low=0, axis=None, nan_policy='propagate'):
    """Compute the variance of angular data on a circular domain.

    Parameters
    ----------
    data : array_like
        Angular samples.
    high, low : float
        Upper and lower bounds of the angular range; the defaults
        cover one full turn in radians.
    axis : int or None
        Axis along which to compute; None uses the flattened array.
    nan_policy : str
        How NaN values are treated, e.g. ``'propagate'``.

    Returns
    -------
    ndarray or scalar
        Circular variance.
    """


# Statistical confidence interval calculations for various distributions
# commonly used in astronomy.
def poisson_conf_interval(n, interval='root-n', confidence_level=0.6826894921370859,
                          background=0, conflevel=None):
    """Confidence interval on a Poisson-distributed count.

    Parameters
    ----------
    n : int or array_like
        Observed number of counts.
    interval : str
        Method name: ``'root-n'``, ``'root-n-0'``, ``'pearson'``,
        ``'sherpagehrels'``, or ``'kraft-burrows-nousek'``.
    confidence_level : float
        Confidence level; the default corresponds to 1-sigma Gaussian.
    background : float
        Background counts, used by some methods.
    conflevel : float or None
        Deprecated alias for ``confidence_level``.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.
    """
def binom_conf_interval(k, n, confidence_level=0.6826894921370859, interval='wilson'):
    """Confidence interval on a binomial proportion.

    Parameters
    ----------
    k : int or array_like
        Number of successes.
    n : int or array_like
        Number of trials.
    confidence_level : float
        Confidence level; the default corresponds to 1-sigma Gaussian.
    interval : str
        Method name: ``'wilson'``, ``'jeffreys'``, ``'flat'``, or
        ``'wald'``.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.
    """
def median_conf_interval(data, confidence_level=0.6826894921370859, axis=None):
    """Confidence interval on the median of the data.

    Parameters
    ----------
    data : array_like
        Input data.
    confidence_level : float
        Confidence level; the default corresponds to 1-sigma Gaussian.
    axis : int or None
        Axis along which to compute; None uses the flattened array.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.
    """


# Advanced histogram functions with automatic binning and density
# estimation.
def histogram(a, bins='blocks', range=None, **kwargs):
    """Histogram with support for automatic, data-driven binning rules.

    Parameters
    ----------
    a : array_like
        Input data.
    bins : str, int, or array_like
        Binning rule (``'blocks'``, ``'knuth'``, ``'scott'``,
        ``'freedman'``), a fixed bin count, or explicit bin edges.
    range : tuple or None
        Lower and upper range of the histogram.  (Shadows the builtin
        ``range``; the name is kept for API compatibility.)
    **kwargs
        Additional arguments forwarded to the underlying routine.

    Returns
    -------
    tuple
        ``(counts, bin_edges)``.
    """
def knuth_bin_width(data, return_bins=False, quiet=True):
    """Optimal histogram bin width according to Knuth's rule.

    Parameters
    ----------
    data : array_like
        Input data.
    return_bins : bool
        If True, return the bin edges instead of the width.
    quiet : bool
        If True, suppress diagnostic output.

    Returns
    -------
    float or ndarray
        Bin width, or bin edges when ``return_bins`` is True.
    """
def scott_bin_width(data, return_bins=False):
    """Optimal histogram bin width according to Scott's rule.

    Parameters
    ----------
    data : array_like
        Input data.
    return_bins : bool
        If True, return the bin edges instead of the width.

    Returns
    -------
    float or ndarray
        Bin width, or bin edges when ``return_bins`` is True.
    """
def freedman_bin_width(data, return_bins=False):
    """Optimal histogram bin width according to the Freedman-Diaconis rule.

    Parameters
    ----------
    data : array_like
        Input data.
    return_bins : bool
        If True, return the bin edges instead of the width.

    Returns
    -------
    float or ndarray
        Bin width, or bin edges when ``return_bins`` is True.
    """
def bayesian_blocks(t, x=None, sigma=None, fitness='events', **kwargs):
    """Compute optimal adaptive bin edges via the Bayesian blocks algorithm.

    Parameters
    ----------
    t : array_like
        Data values, or time stamps when ``x`` is given.
    x : array_like or None
        Data values measured at times ``t``.
    sigma : array_like or None
        Errors on the data values.
    fitness : str
        Fitness function: ``'events'``, ``'regular_events'``, or
        ``'measures'``.
    **kwargs
        Additional fitness-function parameters.

    Returns
    -------
    ndarray
        Optimal bin edges.
    """


from astropy.stats import sigma_clip, SigmaClip
# Example: basic and reusable sigma clipping on data with injected outliers.
import numpy as np

# Generate data with outliers
np.random.seed(42)
data = np.random.normal(100, 15, 100)
data[::10] = 200  # Add outliers

# Basic sigma clipping
clipped_data = sigma_clip(data, sigma=3, maxiters=5)
print(f"Original data: {len(data)} points")
print(f"After clipping: {len(clipped_data.compressed())} points")
print(f"Mean before: {np.mean(data):.2f}")
print(f"Mean after: {np.mean(clipped_data.compressed()):.2f}")

# Reusable sigma clipper
clipper = SigmaClip(sigma=2.5, maxiters=10)
clipped_data2 = clipper(data)

from astropy.stats import mad_std, biweight_location, biweight_scale
# Example: robust location/scale estimates on data containing outliers.
robust_center = biweight_location(data)
robust_scale = biweight_scale(data)
mad_scale = mad_std(data)
print(f"Standard mean: {np.mean(data):.2f}")
print(f"Biweight location: {robust_center:.2f}")
print(f"Standard std: {np.std(data):.2f}")
print(f"Biweight scale: {robust_scale:.2f}")
print(f"MAD std: {mad_scale:.2f}")

from astropy.stats import bootstrap
# Example: bootstrap uncertainty estimation for a derived statistic.

# Define statistic of interest
def ratio_stat(data):
    return np.mean(data) / np.std(data)

# Original statistic
original_stat = ratio_stat(data)

# Bootstrap to estimate uncertainty
bootstrap_results = bootstrap(data, bootnum=1000, bootfunc=ratio_stat)
bootstrap_std = np.std(bootstrap_results)
confidence_interval = np.percentile(bootstrap_results, [16, 84])
print(f"Original statistic: {original_stat:.3f}")
print(f"Bootstrap uncertainty: ±{bootstrap_std:.3f}")
print(f"68% confidence interval: [{confidence_interval[0]:.3f}, {confidence_interval[1]:.3f}]")

from astropy.stats import circmean, circstd
# Example: circular statistics for angular data.
import astropy.units as u  # NOTE(review): imported but unused in this snippet

# Angular data in radians
angles = np.array([0.1, 0.3, 6.2, 6.1, 0.2, 0.4])  # Mix of small and ~2π angles

# Circular statistics
circular_mean = circmean(angles)
circular_std = circstd(angles)
print(f"Linear mean: {np.mean(angles):.3f} rad")
print(f"Circular mean: {circular_mean:.3f} rad")
print(f"Linear std: {np.std(angles):.3f} rad")
print(f"Circular std: {circular_std:.3f} rad")

# Convert to degrees
print(f"Circular mean: {circular_mean * 180/np.pi:.1f} degrees")

from astropy.stats import poisson_conf_interval, binom_conf_interval
# Example: Poisson and binomial confidence intervals.

# Poisson confidence intervals (common in photon counting)
observed_counts = 25
lower, upper = poisson_conf_interval(observed_counts, interval='root-n')
print(f"Observed counts: {observed_counts}")
print(f"Root-n interval: [{lower:.1f}, {upper:.1f}]")

# More sophisticated Poisson interval
lower2, upper2 = poisson_conf_interval(observed_counts, interval='kraft-burrows-nousek')
print(f"Kraft-Burrows-Nousek interval: [{lower2:.1f}, {upper2:.1f}]")

# Binomial confidence intervals
successes = 15
trials = 20
lower_b, upper_b = binom_conf_interval(successes, trials, interval='wilson')
success_rate = successes / trials
print(f"Success rate: {success_rate:.2f}")
print(f"Wilson interval: [{lower_b:.3f}, {upper_b:.3f}]")

from astropy.stats import histogram, knuth_bin_width, bayesian_blocks
# Example: automatic histogram binning methods compared.

# Generate astronomical data (e.g., stellar magnitudes)
magnitudes = np.random.normal(18, 2, 1000)

# Compare different binning methods
bins_knuth = knuth_bin_width(magnitudes, return_bins=True)
bins_blocks = bayesian_blocks(magnitudes)

# Create histograms
counts_auto, bins_auto = histogram(magnitudes, bins='blocks')
counts_knuth = np.histogram(magnitudes, bins=bins_knuth)[0]
counts_blocks = np.histogram(magnitudes, bins=bins_blocks)[0]
print(f"Automatic bins: {len(bins_auto)-1}")
print(f"Knuth bins: {len(bins_knuth)-1}")
print(f"Bayesian blocks: {len(bins_blocks)-1}")

# Comprehensive statistical analysis pipeline
def analyze_data(data, name="Dataset"):
    """Run a comprehensive statistical analysis and print a report.

    Combines classical statistics, robust estimators, sigma clipping,
    and a bootstrap uncertainty estimate on the mean.

    Parameters
    ----------
    data : array_like
        One-dimensional numeric sample to analyze.
    name : str
        Label used in the printed report header.
    """
    # NOTE: the original snippet had escaped quotes (\"\"\") and doubled
    # backslashes (\\n) — extraction artifacts fixed here.
    print(f"\n=== Analysis of {name} ===")

    # Basic statistics
    print(f"Sample size: {len(data)}")
    print(f"Mean: {np.mean(data):.3f}")
    print(f"Median: {np.median(data):.3f}")
    print(f"Std dev: {np.std(data):.3f}")

    # Robust statistics
    robust_center = biweight_location(data)
    robust_scale = biweight_scale(data)
    print(f"Biweight location: {robust_center:.3f}")
    print(f"Biweight scale: {robust_scale:.3f}")

    # Sigma clipping: fraction of points rejected at 3 sigma
    clipped = sigma_clip(data, sigma=3)
    outlier_fraction = 1 - len(clipped.compressed()) / len(data)
    print(f"Outlier fraction (3σ): {outlier_fraction:.1%}")

    # Bootstrap uncertainty on mean
    bootstrap_means = bootstrap(data, bootnum=1000, bootfunc=np.mean)
    bootstrap_uncertainty = np.std(bootstrap_means)
    print(f"Bootstrap uncertainty on mean: ±{bootstrap_uncertainty:.3f}")


# Apply to different datasets
clean_data = np.random.normal(10, 2, 100)
contaminated_data = np.concatenate([clean_data, [50, -20, 30]])
analyze_data(clean_data, "Clean data")
analyze_data(contaminated_data, "Contaminated data")