0
# Statistical and Mathematical Operations
1
2
Built-in statistical functions, mathematical operations, and data analysis utilities including descriptive statistics, correlation analysis, and numerical computations.
3
4
## Core Imports
5
6
```python
7
import pandas as pd
8
from pandas import cut, qcut, factorize, unique, value_counts, to_numeric
9
```
10
11
## Capabilities
12
13
### Descriptive Statistics
14
15
Core statistical functions available on DataFrame and Series objects.
16
17
```python { .api }
18
# These are methods available on DataFrame and Series:
19
20
# Central tendency
21
def mean(axis=None, skipna=True, level=None, numeric_only=None):
22
"""Return the mean of the values over the requested axis."""
23
24
def median(axis=None, skipna=True, level=None, numeric_only=None):
25
"""Return the median of the values over the requested axis."""
26
27
def mode(axis=0, numeric_only=False, dropna=True):
28
"""Return the mode(s) of each element along the selected axis."""
29
30
# Measures of spread
31
def std(axis=None, skipna=True, level=None, ddof=1, numeric_only=None):
32
"""Return sample standard deviation over requested axis."""
33
34
def var(axis=None, skipna=True, level=None, ddof=1, numeric_only=None):
35
"""Return unbiased variance over requested axis."""
36
37
def sem(axis=None, skipna=True, level=None, ddof=1, numeric_only=None):
38
"""Return unbiased standard error of the mean over requested axis."""
39
40
def mad(axis=None, skipna=True, level=None):
41
"""Return the mean absolute deviation of the values over the requested axis."""
42
43
# Distribution shape
44
def skew(axis=None, skipna=True, level=None, numeric_only=None):
45
"""Return unbiased skew over requested axis."""
46
47
def kurt(axis=None, skipna=True, level=None, numeric_only=None):
48
"""Return unbiased kurtosis over requested axis."""
49
50
def kurtosis(axis=None, skipna=True, level=None, numeric_only=None):
51
"""Return unbiased kurtosis over requested axis (alias for kurt)."""
52
53
# Extremes
54
def min(axis=None, skipna=True, level=None, numeric_only=None):
55
"""Return the minimum of the values over the requested axis."""
56
57
def max(axis=None, skipna=True, level=None, numeric_only=None):
58
"""Return the maximum of the values over the requested axis."""
59
60
def idxmin(axis=0, skipna=True):
61
"""Return index of first occurrence of minimum over requested axis."""
62
63
def idxmax(axis=0, skipna=True):
64
"""Return index of first occurrence of maximum over requested axis."""
65
66
# Aggregation
67
def sum(axis=None, skipna=True, level=None, numeric_only=None, min_count=0):
68
"""Return the sum of the values over the requested axis."""
69
70
def prod(axis=None, skipna=True, level=None, numeric_only=None, min_count=0):
71
"""Return the product of the values over the requested axis."""
72
73
def product(axis=None, skipna=True, level=None, numeric_only=None, min_count=0):
74
"""Return the product of the values over the requested axis (alias for prod)."""
75
76
def count(axis=0, level=None, numeric_only=False):
77
"""Count non-NA cells for each column or row."""
78
79
def nunique(axis=0, dropna=True):
80
"""Count number of distinct elements in specified axis."""
81
82
# Quantiles and percentiles
83
def quantile(q=0.5, axis=0, numeric_only=True, interpolation='linear', method='single'):
84
"""Return values at the given quantile over requested axis."""
85
86
def describe(percentiles=None, include=None, exclude=None):
87
"""Generate descriptive statistics."""
88
89
# Cumulative operations
90
def cumsum(axis=None, skipna=True):
91
"""Return cumulative sum over a DataFrame or Series axis."""
92
93
def cumprod(axis=None, skipna=True):
94
"""Return cumulative product over a DataFrame or Series axis."""
95
96
def cummax(axis=None, skipna=True):
97
"""Return cumulative maximum over a DataFrame or Series axis."""
98
99
def cummin(axis=None, skipna=True):
100
"""Return cumulative minimum over a DataFrame or Series axis."""
101
```
102
103
### Correlation and Covariance
104
105
Functions to compute relationships between variables.
106
107
```python { .api }
108
# These are methods available on DataFrame and Series:
109
110
def corr(method='pearson', min_periods=1, numeric_only=True):
111
"""
112
Compute pairwise correlation of columns.
113
114
Parameters:
115
- method: str, correlation method ('pearson', 'kendall', 'spearman')
116
- min_periods: int, minimum number of observations for valid result
117
- numeric_only: bool, include only numeric columns
118
119
Returns:
120
DataFrame, correlation matrix
121
"""
122
123
def cov(min_periods=None, ddof=1, numeric_only=True):
124
"""
125
Compute pairwise covariance of columns.
126
127
Parameters:
128
- min_periods: int, minimum number of observations for valid result
129
- ddof: int, delta degrees of freedom
130
- numeric_only: bool, include only numeric columns
131
132
Returns:
133
DataFrame, covariance matrix
134
"""
135
136
def corrwith(other, axis=0, drop=False, method='pearson', numeric_only=True):
137
"""
138
Compute pairwise correlation between the rows or columns of this DataFrame and those of another Series or DataFrame.
139
140
Parameters:
141
- other: DataFrame, Series, or array-like
142
- axis: int, axis to use (0 or 1)
143
- drop: bool, drop missing indices from result
144
- method: str, correlation method ('pearson', 'kendall', 'spearman')
145
- numeric_only: bool, include only numeric columns
146
147
Returns:
148
Series, correlations
149
"""
150
```
151
152
### Mathematical Operations
153
154
Element-wise mathematical functions and operations.
155
156
```python { .api }
157
# These are methods available on DataFrame and Series:
158
159
def abs():
160
"""Return a Series/DataFrame with absolute numeric value of each element."""
161
162
def round(decimals=0):
163
"""Round each value to the given number of decimals."""
164
165
def clip(lower=None, upper=None, axis=None, inplace=False):
166
"""Trim values at input threshold(s)."""
167
168
def rank(axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False):
169
"""
170
Compute numerical data ranks along axis.
171
172
Parameters:
173
- axis: int, axis to rank along
174
- method: str, how to rank ('average', 'min', 'max', 'first', 'dense')
175
- numeric_only: bool, include only numeric columns
176
- na_option: str, how to rank NaN values ('keep', 'top', 'bottom')
177
- ascending: bool, rank in ascending order
178
- pct: bool, return percentile rank
179
180
Returns:
181
same type as caller, data ranks
182
"""
183
184
# Exponential and logarithmic functions (applied as NumPy ufuncs, e.g. np.exp(df); only pow is a DataFrame/Series method)
185
def exp():
186
"""Calculate exponential of elements."""
187
188
def log():
189
"""Calculate natural logarithm of elements."""
190
191
def log10():
192
"""Calculate base-10 logarithm of elements."""
193
194
def log2():
195
"""Calculate base-2 logarithm of elements."""
196
197
def sqrt():
198
"""Calculate square root of elements."""
199
200
def pow(other):
201
"""Calculate exponential power of elements."""
202
203
# Trigonometric functions (applied as NumPy ufuncs, e.g. np.sin(df), not as DataFrame/Series methods)
204
def sin():
205
"""Calculate sine of elements."""
206
207
def cos():
208
"""Calculate cosine of elements."""
209
210
def tan():
211
"""Calculate tangent of elements."""
212
213
def arcsin():
214
"""Calculate inverse sine of elements."""
215
216
def arccos():
217
"""Calculate inverse cosine of elements."""
218
219
def arctan():
220
"""Calculate inverse tangent of elements."""
221
```
222
223
### Comparison Operations
224
225
Methods for element-wise comparison, range checks, and membership tests.
226
227
```python { .api }
228
# These are methods available on DataFrame and Series:
229
230
def eq(other, axis='columns', level=None):
231
"""Get equal to of dataframe and other, element-wise (binary operator ==)."""
232
233
def ne(other, axis='columns', level=None):
234
"""Get not equal to of dataframe and other, element-wise (binary operator !=)."""
235
236
def lt(other, axis='columns', level=None):
237
"""Get less than of dataframe and other, element-wise (binary operator <)."""
238
239
def le(other, axis='columns', level=None):
240
"""Get less than or equal to of dataframe and other, element-wise (binary operator <=)."""
241
242
def gt(other, axis='columns', level=None):
243
"""Get greater than of dataframe and other, element-wise (binary operator >)."""
244
245
def ge(other, axis='columns', level=None):
246
"""Get greater than or equal to of dataframe and other, element-wise (binary operator >=)."""
247
248
def between(left, right, inclusive='both'):
249
"""
250
Return boolean Series equivalent to left <= series <= right.
251
252
Parameters:
253
- left: scalar or list-like, left boundary
254
- right: scalar or list-like, right boundary
255
- inclusive: str, include boundaries ('both', 'neither', 'left', 'right')
256
257
Returns:
258
Series, boolean values
259
"""
260
261
def isin(values):
262
"""
263
Whether each element in the Series/DataFrame is contained in values.
264
265
Parameters:
266
- values: set or list-like, sequence of values to test
267
268
Returns:
269
Series/DataFrame of bools, boolean values
270
"""
271
```
272
273
### Top-Level Statistical Functions
274
275
Standalone statistical functions that operate on array-like data.
276
277
```python { .api }
278
def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise', ordered=True):
279
"""
280
Bin values into discrete intervals.
281
282
Parameters:
283
- x: array-like, input array to be binned
284
- bins: int, sequence of scalars, or IntervalIndex
285
- right: bool, whether bins include rightmost edge
286
- labels: array or bool, labels for returned bins
287
- retbins: bool, return bins
288
- precision: int, precision for bin labels
289
- include_lowest: bool, whether first interval is left-inclusive
290
- duplicates: str, behavior for non-unique bin edges ('raise' or 'drop')
291
- ordered: bool, whether returned Categorical is ordered
292
293
Returns:
294
Categorical, Series, or array
295
"""
296
297
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
298
"""
299
Quantile-based discretization function.
300
301
Parameters:
302
- x: array-like, input array to be binned
303
- q: int or list-like of float, quantiles to compute
304
- labels: array or bool, labels for returned bins
305
- retbins: bool, whether to also return the computed bin edges
306
- precision: int, precision for bin labels
307
- duplicates: str, behavior for non-unique bin edges ('raise' or 'drop')
308
309
Returns:
310
Categorical, Series, or array
311
"""
312
313
def factorize(values, sort=False, na_sentinel=-1, use_na_sentinel=True, size_hint=None):
314
"""
315
Encode the object as an enumerated type or categorical variable.
316
317
Parameters:
318
- values: sequence, 1-d array-like
319
- sort: bool, sort uniques
320
- na_sentinel: int, value for missing values
321
- use_na_sentinel: bool, use na_sentinel for missing values
322
- size_hint: int, hint for hashtable size
323
324
Returns:
325
tuple of (codes, uniques)
326
"""
327
328
def unique(values):
329
"""
330
Return unique values based on a hash table.
331
332
Parameters:
333
- values: 1d array-like
334
335
Returns:
336
ndarray or ExtensionArray
337
"""
338
339
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None, dropna=True):
340
"""
341
Compute a histogram of the 1D array values.
342
343
Parameters:
344
- values: 1d array-like
345
- sort: bool, sort by frequencies
346
- ascending: bool, sort in ascending order
347
- normalize: bool, return relative frequencies
348
- bins: int, group into half-open bins
349
- dropna: bool, exclude NaN values
350
351
Returns:
352
Series
353
"""
354
```
355
356
### Numeric Conversion
357
358
Functions for converting data to numeric types.
359
360
```python { .api }
361
def to_numeric(arg, errors='raise', downcast=None):
362
"""
363
Convert argument to a numeric type.
364
365
Parameters:
366
- arg: scalar, list, tuple, 1-d array, or Series
367
- errors: str, error handling ('raise', 'coerce', 'ignore')
368
- downcast: str, downcast resulting data ('integer', 'signed', 'unsigned', 'float')
369
370
Returns:
371
numeric, converted values
372
"""
373
```
374
375
### Groupby Statistical Operations
376
377
Statistical methods available on GroupBy objects.
378
379
```python { .api }
380
# Available on DataFrameGroupBy and SeriesGroupBy objects:
381
382
class GroupBy:
383
"""GroupBy object with statistical methods."""
384
385
def mean(self, numeric_only=True, engine=None, engine_kwargs=None):
386
"""Compute mean of groups."""
387
388
def median(self, numeric_only=True):
389
"""Compute median of groups."""
390
391
def sum(self, numeric_only=True, min_count=0, engine=None, engine_kwargs=None):
392
"""Compute sum of groups."""
393
394
def min(self, numeric_only=False, min_count=-1, engine=None, engine_kwargs=None):
395
"""Compute min of groups."""
396
397
def max(self, numeric_only=False, min_count=-1, engine=None, engine_kwargs=None):
398
"""Compute max of groups."""
399
400
def std(self, ddof=1, engine=None, engine_kwargs=None, numeric_only=True):
401
"""Compute standard deviation of groups."""
402
403
def var(self, ddof=1, engine=None, engine_kwargs=None, numeric_only=True):
404
"""Compute variance of groups."""
405
406
def count(self):
407
"""Compute count of each group, excluding missing values."""
408
409
def size(self):
410
"""Compute group sizes."""
411
412
def nunique(self, dropna=True):
413
"""Count number of unique values in each group."""
414
415
def quantile(self, q=0.5, interpolation='linear', numeric_only=True):
416
"""Return values at given quantile for each group."""
417
418
def describe(self, percentiles=None, include=None, exclude=None):
419
"""Generate descriptive statistics for each group."""
420
421
def sem(self, ddof=1, numeric_only=True):
422
"""Compute standard error of the mean for each group."""
423
424
def rank(self, method='average', ascending=True, na_option='keep', pct=False, axis=0):
425
"""Provide the rank of values within each group."""
426
427
def cumcount(self, ascending=True):
428
"""Number each item in each group from 0 to the length of that group - 1."""
429
430
def cumsum(self, axis=0, **kwargs):
431
"""Cumulative sum for each group."""
432
433
def cumprod(self, axis=0, **kwargs):
434
"""Cumulative product for each group."""
435
436
def cummax(self, axis=0, numeric_only=False, **kwargs):
437
"""Cumulative max for each group."""
438
439
def cummin(self, axis=0, numeric_only=False, **kwargs):
440
"""Cumulative min for each group."""
441
442
def skew(self, axis=0, skipna=True, numeric_only=True, **kwargs):
443
"""Return unbiased skew within groups."""
444
445
def kurt(self, axis=0, skipna=True, numeric_only=True, **kwargs):
446
"""Return unbiased kurtosis within groups."""
447
448
def mad(self, **kwargs):
449
"""Return mean absolute deviation within groups."""
450
451
def prod(self, numeric_only=True, min_count=0):
452
"""Compute product of group values."""
453
454
def ohlc(self):
455
"""Compute open, high, low and close values of a group."""
456
457
def first(self, numeric_only=False, min_count=-1):
458
"""Return first value within each group."""
459
460
def last(self, numeric_only=False, min_count=-1):
461
"""Return last value within each group."""
462
463
def nth(self, n, dropna=None):
464
"""Take nth value, or subset if n is a list."""
465
466
def idxmax(self, axis=0, skipna=True):
467
"""Return index of maximum value within each group."""
468
469
def idxmin(self, axis=0, skipna=True):
470
"""Return index of minimum value within each group."""
471
```
472
473
### Advanced Statistical Functions
474
475
More specialized statistical operations and utilities.
476
477
```python { .api }
478
# These are methods available on DataFrame and Series:
479
480
def pct_change(periods=1, fill_method='pad', limit=None, freq=None):
481
"""
482
Percentage change between current and prior element.
483
484
Parameters:
485
- periods: int, periods to shift for forming percent change
486
- fill_method: str, how to handle NaNs before computing percent changes
487
- limit: int, number of consecutive NaNs to fill before stopping
488
- freq: DateOffset, Timedelta or str, increment to use for time rule
489
490
Returns:
491
Series/DataFrame, percentage changes
492
"""
493
494
def diff(periods=1, axis=0):
495
"""
496
First discrete difference of element.
497
498
Parameters:
499
- periods: int, periods to shift for calculating difference
500
- axis: int, axis to shift along
501
502
Returns:
503
Series/DataFrame, differences
504
"""
505
506
def shift(periods=1, freq=None, axis=0, fill_value=None):
507
"""
508
Shift index by desired number of periods.
509
510
Parameters:
511
- periods: int, number of periods to shift
512
- freq: DateOffset, Timedelta, or str, offset to use from time series API
513
- axis: int, axis to shift
514
- fill_value: object, scalar value to use for missing values
515
516
Returns:
517
Series/DataFrame, shifted data
518
"""
519
520
def expanding(min_periods=1, center=None, axis=0, method='single'):
521
"""
522
Provide expanding window calculations.
523
524
Parameters:
525
- min_periods: int, minimum number of observations in window
526
- center: bool, whether result should be centered
527
- axis: int, axis along which to slide window
528
- method: str, execution method ('single' or 'table')
529
530
Returns:
531
Expanding object
532
"""
533
534
def rolling(window, min_periods=None, center=False, win_type=None, on=None, axis=0, closed=None, method='single'):
535
"""
536
Provide rolling window calculations.
537
538
Parameters:
539
- window: int, size of moving window
540
- min_periods: int, minimum number of observations in window
541
- center: bool, whether result should be centered
542
- win_type: str, window type
543
- on: str, datetime-like column for DatetimeIndex
544
- axis: int, axis along which to slide window
545
- closed: str, make interval closed on 'right', 'left', 'both' or 'neither'
546
- method: str, execution method ('single' or 'table')
547
548
Returns:
549
Rolling object
550
"""
551
552
def ewm(com=None, span=None, halflife=None, alpha=None, min_periods=0, adjust=True, ignore_na=False, axis=0, times=None, method='single'):
553
"""
554
Provide exponentially weighted (EW) calculations.
555
556
Parameters:
557
- com: float, center of mass
558
- span: float, span
559
- halflife: float, decay in terms of half-life
560
- alpha: float, smoothing factor
561
- min_periods: int, minimum number of observations
562
- adjust: bool, divide by decaying adjustment factor
563
- ignore_na: bool, ignore missing values
564
- axis: int, axis along which to calculate
565
- times: array-like, times corresponding to observations
566
- method: str, execution method ('single' or 'table')
567
568
Returns:
569
ExponentialMovingWindow object
570
"""
571
```
572
573
## Types
574
575
```python { .api }
576
# Statistical method options
577
StatMethod = Literal['average', 'min', 'max', 'first', 'dense']
578
CorrelationMethod = Literal['pearson', 'kendall', 'spearman']
579
InterpolationMethod = Literal['linear', 'lower', 'higher', 'midpoint', 'nearest']
580
QuantileInterpolation = Literal['linear', 'lower', 'higher', 'midpoint', 'nearest']
581
582
# Ranking options
583
RankMethod = Literal['average', 'min', 'max', 'first', 'dense']
584
RankNaOption = Literal['keep', 'top', 'bottom']
585
586
# Numeric conversion options
587
NumericErrors = Literal['raise', 'coerce', 'ignore']
588
DowncastOptions = Literal['integer', 'signed', 'unsigned', 'float']
589
590
# Binning options
591
BinningDuplicates = Literal['raise', 'drop']
592
IntervalInclusive = Literal['both', 'neither', 'left', 'right']
593
594
# Window calculation options
595
WindowMethod = Literal['single', 'table']
596
WindowType = Literal[
597
'boxcar', 'triang', 'blackman', 'hamming', 'bartlett', 'parzen',
598
'bohman', 'blackmanharris', 'nuttall', 'barthann', 'kaiser',
599
'gaussian', 'general_gaussian', 'slepian', 'exponential'
600
]
601
602
# Percentile inclusion options
603
PercentileInclusive = Literal['both', 'neither', 'left', 'right']
604
605
# Axis specification
606
AxisOption = Union[int, str, None]
607
```