Powerful data structures for data analysis, time series, and statistics
—
Quality: Pending — not yet reviewed against best practices.
Impact: Pending — no eval scenarios have been run.
Built-in statistical functions, mathematical operations, and data analysis utilities including descriptive statistics, correlation analysis, and numerical computations.
from typing import Literal, Union

import pandas as pd
from pandas import cut, qcut, factorize, value_counts

Core statistical functions available on DataFrame and Series objects.
# These are methods available on DataFrame and Series:
# NOTE(review): these stub signatures reflect the pandas 1.x API. In pandas 2.0
# the `level` parameter was removed from all of these reductions (use
# df.groupby(level=...).agg(...) instead) and `numeric_only=None` is no longer
# accepted — confirm which pandas version this capability file targets.
# Central tendency
def mean(axis=None, skipna=True, level=None, numeric_only=None):
"""Return the mean of the values over the requested axis."""
def median(axis=None, skipna=True, level=None, numeric_only=None):
"""Return the median of the values over the requested axis."""
def mode(axis=0, numeric_only=False, dropna=True):
"""Return the mode(s) of each element along the selected axis."""
# Measures of spread
def std(axis=None, skipna=True, level=None, ddof=1, numeric_only=None):
"""Return sample standard deviation over requested axis."""
def var(axis=None, skipna=True, level=None, ddof=1, numeric_only=None):
"""Return unbiased variance over requested axis."""
def sem(axis=None, skipna=True, level=None, ddof=1, numeric_only=None):
"""Return unbiased standard error of the mean over requested axis."""
# NOTE(review): Series/DataFrame.mad was removed in pandas 2.0; the documented
# equivalent is (obj - obj.mean()).abs().mean() — confirm target version.
def mad(axis=None, skipna=True, level=None):
"""Return the mean absolute deviation of the values over the requested axis."""
# Distribution shape
def skew(axis=None, skipna=True, level=None, numeric_only=None):
"""Return unbiased skew over requested axis."""
def kurt(axis=None, skipna=True, level=None, numeric_only=None):
"""Return unbiased kurtosis over requested axis."""
def kurtosis(axis=None, skipna=True, level=None, numeric_only=None):
"""Return unbiased kurtosis over requested axis (alias for kurt)."""
# Extremes
def min(axis=None, skipna=True, level=None, numeric_only=None):
"""Return the minimum of the values over the requested axis."""
def max(axis=None, skipna=True, level=None, numeric_only=None):
"""Return the maximum of the values over the requested axis."""
def idxmin(axis=0, skipna=True):
"""Return index of first occurrence of minimum over requested axis."""
def idxmax(axis=0, skipna=True):
"""Return index of first occurrence of maximum over requested axis."""
# Aggregation
def sum(axis=None, skipna=True, level=None, numeric_only=None, min_count=0):
"""Return the sum of the values over the requested axis."""
def prod(axis=None, skipna=True, level=None, numeric_only=None, min_count=0):
"""Return the product of the values over the requested axis."""
def product(axis=None, skipna=True, level=None, numeric_only=None, min_count=0):
"""Return the product of the values over the requested axis (alias for prod)."""
def count(axis=0, level=None, numeric_only=False):
"""Count non-NA cells for each column or row."""
def nunique(axis=0, dropna=True):
"""Count number of distinct elements in specified axis."""
# Quantiles and percentiles
def quantile(q=0.5, axis=0, numeric_only=True, interpolation='linear', method='single'):
"""Return values at the given quantile over requested axis."""
def describe(percentiles=None, include=None, exclude=None):
"""Generate descriptive statistics."""
# Cumulative operations
def cumsum(axis=None, skipna=True):
"""Return cumulative sum over a DataFrame or Series axis."""
def cumprod(axis=None, skipna=True):
"""Return cumulative product over a DataFrame or Series axis."""
def cummax(axis=None, skipna=True):
"""Return cumulative maximum over a DataFrame or Series axis."""
def cummin(axis=None, skipna=True):
"""Return cumulative minimum over a DataFrame or Series axis."""

Functions to compute relationships between variables.
# These are methods available on DataFrame and Series:
# NOTE(review): in pandas 2.0 the default for `numeric_only` on corr/cov
# changed from True to False — the defaults below document pandas 1.x/2.0
# transition behavior; confirm the target version.
def corr(method='pearson', min_periods=1, numeric_only=True):
"""
Compute pairwise correlation of columns.
Parameters:
- method: str, correlation method ('pearson', 'kendall', 'spearman')
- min_periods: int, minimum number of observations for valid result
- numeric_only: bool, include only numeric columns
Returns:
DataFrame, correlation matrix
"""
def cov(min_periods=None, ddof=1, numeric_only=True):
"""
Compute pairwise covariance of columns.
Parameters:
- min_periods: int, minimum number of observations for valid result
- ddof: int, delta degrees of freedom
- numeric_only: bool, include only numeric columns
Returns:
DataFrame, covariance matrix
"""
# NOTE(review): DataFrame-only method (no Series counterpart); in pandas 2.0
# the `numeric_only` default changed to False — confirm target version.
def corrwith(other, axis=0, drop=False, method='pearson', numeric_only=True):
"""
Compute pairwise correlation.
Parameters:
- other: DataFrame, Series, or array-like
- axis: int, axis to use (0 or 1)
- drop: bool, drop missing indices from result
- method: str, correlation method ('pearson', 'kendall', 'spearman')
- numeric_only: bool, include only numeric columns
Returns:
Series, correlations
"""Element-wise mathematical functions and operations.
# These are methods available on DataFrame and Series:
def abs():
"""Return a Series/DataFrame with absolute numeric value of each element."""
def round(decimals=0):
"""Round each value to the given number of decimals."""
def clip(lower=None, upper=None, axis=None, inplace=False):
"""Trim values at input threshold(s)."""
def rank(axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False):
"""
Compute numerical data ranks along axis.
Parameters:
- axis: int, axis to rank along
- method: str, how to rank ('average', 'min', 'max', 'first', 'dense')
- numeric_only: bool, include only numeric columns
- na_option: str, how to rank NaN values ('keep', 'top', 'bottom')
- ascending: bool, rank in ascending order
- pct: bool, return percentile rank
Returns:
same type as caller, data ranks
"""
# Exponential and logarithmic functions (available via NumPy integration)
# NOTE(review): exp/log/log10/log2/sqrt and the trig functions below are not
# pandas methods — they are NumPy ufuncs applied as np.exp(obj), np.log(obj),
# etc.; only pow (operator **) is an actual Series/DataFrame method. Confirm
# this section is meant to document NumPy-ufunc interoperability.
def exp():
"""Calculate exponential of elements."""
def log():
"""Calculate natural logarithm of elements."""
def log10():
"""Calculate base-10 logarithm of elements."""
def log2():
"""Calculate base-2 logarithm of elements."""
def sqrt():
"""Calculate square root of elements."""
def pow(other):
"""Calculate exponential power of elements."""
# Trigonometric functions (available via NumPy integration)
def sin():
"""Calculate sine of elements."""
def cos():
"""Calculate cosine of elements."""
def tan():
"""Calculate tangent of elements."""
def arcsin():
"""Calculate inverse sine of elements."""
def arccos():
"""Calculate inverse cosine of elements."""
def arctan():
"""Calculate inverse tangent of elements."""

Functions for comparing and ranking data.
# These are methods available on DataFrame and Series:
# Flexible wrappers around the ==, !=, <, <=, >, >= operators that support
# explicit axis/level alignment against `other`.
def eq(other, axis='columns', level=None):
"""Get equal to of dataframe and other, element-wise (binary operator ==)."""
def ne(other, axis='columns', level=None):
"""Get not equal to of dataframe and other, element-wise (binary operator !=)."""
def lt(other, axis='columns', level=None):
"""Get less than of dataframe and other, element-wise (binary operator <)."""
def le(other, axis='columns', level=None):
"""Get less than or equal to of dataframe and other, element-wise (binary operator <=)."""
def gt(other, axis='columns', level=None):
"""Get greater than of dataframe and other, element-wise (binary operator >)."""
def ge(other, axis='columns', level=None):
"""Get greater than or equal to of dataframe and other, element-wise (binary operator >=)."""
# NOTE(review): `between` is Series-only — confirm it should be listed with
# the DataFrame-capable methods above.
def between(left, right, inclusive='both'):
"""
Return boolean Series equivalent to left <= series <= right.
Parameters:
- left: scalar or list-like, left boundary
- right: scalar or list-like, right boundary
- inclusive: str, include boundaries ('both', 'neither', 'left', 'right')
Returns:
Series, boolean values
"""
def isin(values):
"""
Whether each element in the Series/DataFrame is contained in values.
Parameters:
- values: set or list-like, sequence of values to test
Returns:
Series/DataFrame of bools, boolean values
"""Standalone statistical functions that operate on array-like data.
def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise', ordered=True):
"""
Bin values into discrete intervals.
Parameters:
- x: array-like, input array to be binned
- bins: int, sequence of scalars, or IntervalIndex
- right: bool, whether bins include rightmost edge
- labels: array or bool, labels for returned bins
- retbins: bool, return bins
- precision: int, precision for bin labels
- include_lowest: bool, whether first interval is left-inclusive
- duplicates: str, behavior for non-unique bin edges ('raise' or 'drop')
- ordered: bool, whether returned Categorical is ordered
Returns:
Categorical, Series, or array
"""
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
"""
Quantile-based discretization function.
Parameters:
- x: array-like, input array to be binned
- q: int or list-like of float, quantiles to compute
- labels: array or bool, labels for returned bins
- retbins: bool, return (bins, labels)
- precision: int, precision for bin labels
- duplicates: str, behavior for non-unique bin edges ('raise' or 'drop')
Returns:
Categorical, Series, or array
"""
# NOTE(review): the `na_sentinel` parameter was deprecated in pandas 1.5 and
# removed in 2.0 in favor of `use_na_sentinel` — this signature mixes both
# eras; confirm the target pandas version.
def factorize(values, sort=False, na_sentinel=-1, use_na_sentinel=True, size_hint=None):
"""
Encode the object as an enumerated type or categorical variable.
Parameters:
- values: sequence, 1-d array-like
- sort: bool, sort uniques
- na_sentinel: int, value for missing values
- use_na_sentinel: bool, use na_sentinel for missing values
- size_hint: int, hint for hashtable size
Returns:
tuple of (codes, uniques)
"""
def unique(values):
"""
Return unique values based on a hash table.
Parameters:
- values: 1d array-like
Returns:
ndarray or ExtensionArray
"""
# NOTE(review): the top-level pandas.value_counts function was deprecated in
# pandas 2.1 — prefer the Series.value_counts method; confirm target version.
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None, dropna=True):
"""
Compute a histogram of the 1D array values.
Parameters:
- values: 1d array-like
- sort: bool, sort by values
- ascending: bool, sort in ascending order
- normalize: bool, return relative frequencies
- bins: int, group into half-open bins
- dropna: bool, exclude NaN values
Returns:
Series
"""Functions for converting data to numeric types.
# NOTE(review): errors='ignore' was deprecated in pandas 2.2 (removed in 3.0);
# only 'raise' and 'coerce' remain — confirm target version.
def to_numeric(arg, errors='raise', downcast=None):
"""
Convert argument to a numeric type.
Parameters:
- arg: scalar, list, tuple, 1-d array, or Series
- errors: str, error handling ('raise', 'coerce', 'ignore')
- downcast: str, downcast resulting data ('integer', 'signed', 'unsigned', 'float')
Returns:
numeric, converted values
"""Statistical methods available on GroupBy objects.
# Available on DataFrameGroupBy and SeriesGroupBy objects:
# NOTE(review): these stubs reflect the pandas 1.x GroupBy API. In pandas 2.x
# the `axis` parameter on groupby operations was deprecated, and several
# `numeric_only` defaults changed — confirm the target pandas version.
class GroupBy:
"""GroupBy object with statistical methods."""
def mean(self, numeric_only=True, engine=None, engine_kwargs=None):
"""Compute mean of groups."""
def median(self, numeric_only=True):
"""Compute median of groups."""
def sum(self, numeric_only=True, min_count=0, engine=None, engine_kwargs=None):
"""Compute sum of groups."""
def min(self, numeric_only=False, min_count=-1, engine=None, engine_kwargs=None):
"""Compute min of groups."""
def max(self, numeric_only=False, min_count=-1, engine=None, engine_kwargs=None):
"""Compute max of groups."""
def std(self, ddof=1, engine=None, engine_kwargs=None, numeric_only=True):
"""Compute standard deviation of groups."""
def var(self, ddof=1, engine=None, engine_kwargs=None, numeric_only=True):
"""Compute variance of groups."""
def count(self):
"""Compute count of group."""
def size(self):
"""Compute group sizes."""
def nunique(self, dropna=True):
"""Count number of unique values in each group."""
def quantile(self, q=0.5, interpolation='linear', numeric_only=True):
"""Return values at given quantile for each group."""
def describe(self, percentiles=None, include=None, exclude=None):
"""Generate descriptive statistics for each group."""
def sem(self, ddof=1, numeric_only=True):
"""Compute standard error of the mean for each group."""
def rank(self, method='average', ascending=True, na_option='keep', pct=False, axis=0):
"""Provide the rank of values within each group."""
def cumcount(self, ascending=True):
"""Number each item in each group from 0 to the length of that group - 1."""
def cumsum(self, axis=0, **kwargs):
"""Cumulative sum for each group."""
def cumprod(self, axis=0, **kwargs):
"""Cumulative product for each group."""
def cummax(self, axis=0, numeric_only=False, **kwargs):
"""Cumulative max for each group."""
def cummin(self, axis=0, numeric_only=False, **kwargs):
"""Cumulative min for each group."""
def skew(self, axis=0, skipna=True, numeric_only=True, **kwargs):
"""Return unbiased skew within groups."""
def kurt(self, axis=0, skipna=True, numeric_only=True, **kwargs):
"""Return unbiased kurtosis within groups."""
# NOTE(review): GroupBy.mad was removed in pandas 2.0 along with
# Series/DataFrame.mad — confirm target version.
def mad(self, **kwargs):
"""Return mean absolute deviation within groups."""
def prod(self, numeric_only=True, min_count=0):
"""Compute product of group values."""
def ohlc(self):
"""Compute open, high, low and close values of a group."""
def first(self, numeric_only=False, min_count=-1):
"""Return first value within each group."""
def last(self, numeric_only=False, min_count=-1):
"""Return last value within each group."""
def nth(self, n, dropna=None):
"""Take nth value, or subset if n is a list."""
def idxmax(self, axis=0, skipna=True):
"""Return index of maximum value within each group."""
def idxmin(self, axis=0, skipna=True):
"""Return index of minimum value within each group."""

More specialized statistical operations and utilities.
# These functions work with DataFrame/Series or can be called independently:
# NOTE(review): in pandas 2.1+ the `fill_method` default for pct_change and
# the `axis` parameter on rolling/expanding/ewm were deprecated — the
# signatures below document pandas 1.x/2.0 behavior; confirm target version.
def pct_change(periods=1, fill_method='pad', limit=None, freq=None):
"""
Percentage change between current and prior element.
Parameters:
- periods: int, periods to shift for forming percent change
- fill_method: str, how to handle NaNs before computing percent changes
- limit: int, number of consecutive NaNs to fill before stopping
- freq: DateOffset, Timedelta or str, increment to use for time rule
Returns:
Series/DataFrame, percentage changes
"""
def diff(periods=1, axis=0):
"""
First discrete difference of element.
Parameters:
- periods: int, periods to shift for calculating difference
- axis: int, axis to shift along
Returns:
Series/DataFrame, differences
"""
def shift(periods=1, freq=None, axis=0, fill_value=None):
"""
Shift index by desired number of periods.
Parameters:
- periods: int, number of periods to shift
- freq: DateOffset, Timedelta, or str, offset to use from time series API
- axis: int, axis to shift
- fill_value: object, scalar value to use for missing values
Returns:
Series/DataFrame, shifted data
"""
def expanding(min_periods=1, center=None, axis=0, method='single'):
"""
Provide expanding window calculations.
Parameters:
- min_periods: int, minimum number of observations in window
- center: bool, whether result should be centered
- axis: int, axis along which to slide window
- method: str, execution method ('single' thread or 'table')
Returns:
Expanding object
"""
def rolling(window, min_periods=None, center=False, win_type=None, on=None, axis=0, closed=None, method='single'):
"""
Provide rolling window calculations.
Parameters:
- window: int, size of moving window
- min_periods: int, minimum number of observations in window
- center: bool, whether result should be centered
- win_type: str, window type
- on: str, datetime-like column for DatetimeIndex
- axis: int, axis along which to slide window
- closed: str, make interval closed on 'right', 'left', 'both' or 'neither'
- method: str, execution method ('single' or 'table')
Returns:
Rolling object
"""
def ewm(com=None, span=None, halflife=None, alpha=None, min_periods=0, adjust=True, ignore_na=False, axis=0, times=None, method='single'):
"""
Provide exponentially weighted (EW) calculations.
Parameters:
- com: float, center of mass
- span: float, span
- halflife: float, decay in terms of half-life
- alpha: float, smoothing factor
- min_periods: int, minimum number of observations
- adjust: bool, divide by decaying adjustment factor
- ignore_na: bool, ignore missing values
- axis: int, axis along which to calculate
- times: array-like, times corresponding to observations
- method: str, execution method ('single' or 'table')
Returns:
ExponentialMovingWindow object
"""# Statistical method options
# NOTE(review): these aliases require `from typing import Literal, Union` to
# evaluate — confirm the typing names are imported at the top of the file.
StatMethod = Literal['average', 'min', 'max', 'first', 'dense']
CorrelationMethod = Literal['pearson', 'kendall', 'spearman']
InterpolationMethod = Literal['linear', 'lower', 'higher', 'midpoint', 'nearest']
# NOTE(review): QuantileInterpolation duplicates InterpolationMethod, and
# RankMethod duplicates StatMethod — possibly intentional aliases; confirm.
QuantileInterpolation = Literal['linear', 'lower', 'higher', 'midpoint', 'nearest']
# Ranking options
RankMethod = Literal['average', 'min', 'max', 'first', 'dense']
RankNaOption = Literal['keep', 'top', 'bottom']
# Numeric conversion options
NumericErrors = Literal['raise', 'coerce', 'ignore']
DowncastOptions = Literal['integer', 'signed', 'unsigned', 'float']
# Binning options
BinningDuplicates = Literal['raise', 'drop']
IntervalInclusive = Literal['both', 'neither', 'left', 'right']
# Window calculation options
WindowMethod = Literal['single', 'table']
# NOTE(review): 'slepian' was removed from scipy.signal windows (SciPy >= 1.6)
# and is rejected by modern pandas — confirm against the supported SciPy range.
WindowType = Literal[
'boxcar', 'triang', 'blackman', 'hamming', 'bartlett', 'parzen',
'bohman', 'blackmanharris', 'nuttall', 'barthann', 'kaiser',
'gaussian', 'general_gaussian', 'slepian', 'exponential'
]
# Percentile inclusion options
PercentileInclusive = Literal['both', 'neither', 'left', 'right']
AxisOption = Union[int, str, None]

Install with Tessl CLI:
npx tessl i tessl/pypi-pandas