Python package for manipulating 2-dimensional tabular data structures, with an emphasis on speed and big-data support.

Statistical and mathematical reduction functions for data analysis and aggregation operations, supporting both full-frame and grouped aggregations.

Fundamental statistical measures for summarizing data distributions.
def sum(expr=None):
    """
    Sum of values, ignoring missing values.

    Parameters:
    - expr: Column expression (all numeric columns if None)

    Returns:
    Sum of values
    """


def mean(expr):
    """
    Arithmetic mean of values, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Mean value
    """


def count(expr=None):
    """
    Count of non-missing values.

    Parameters:
    - expr: Column expression (row count if None)

    Returns:
    Count of non-missing values
    """


def countna(expr):
    """
    Count of missing (NA) values.

    Parameters:
    - expr: Column expression

    Returns:
    Count of missing values
    """


def nunique(expr):
    """
    Count of unique values, excluding missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Number of unique values
    """


# Functions for analyzing data distributions and variability.
def min(expr):
    """
    Minimum value, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Minimum value
    """


def max(expr):
    """
    Maximum value, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Maximum value
    """


def median(expr):
    """
    Median (50th percentile) value.

    Parameters:
    - expr: Column expression

    Returns:
    Median value
    """


def sd(expr):
    """
    Standard deviation with N-1 denominator.

    Parameters:
    - expr: Column expression

    Returns:
    Standard deviation
    """


def prod(expr):
    """
    Product of values, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Product of values
    """


# Functions that return specific positional values from data.
def first(expr):
    """
    First non-missing value in order.

    Parameters:
    - expr: Column expression

    Returns:
    First non-missing value
    """


def last(expr):
    """
    Last non-missing value in order.

    Parameters:
    - expr: Column expression

    Returns:
    Last non-missing value
    """


# Statistical measures of relationships between variables.
def corr(expr1, expr2=None):
    """
    Pearson correlation coefficient between variables.

    Parameters:
    - expr1: First column expression
    - expr2: Second column expression (correlation matrix if None)

    Returns:
    Correlation coefficient or matrix
    """


def cov(expr1, expr2=None):
    """
    Covariance between variables.

    Parameters:
    - expr1: First column expression
    - expr2: Second column expression (covariance matrix if None)

    Returns:
    Covariance value or matrix
    """


# Running aggregations that accumulate values over ordered data.
def cumsum(expr):
    """
    Cumulative sum of values.

    Parameters:
    - expr: Column expression

    Returns:
    Running cumulative sum
    """


def cumcount(expr=None):
    """
    Cumulative count of non-missing values.

    Parameters:
    - expr: Column expression (row numbers if None)

    Returns:
    Running count
    """


def cummax(expr):
    """
    Cumulative maximum values.

    Parameters:
    - expr: Column expression

    Returns:
    Running maximum
    """


def cummin(expr):
    """
    Cumulative minimum values.

    Parameters:
    - expr: Column expression

    Returns:
    Running minimum
    """
def cumprod(expr):
    """
    Cumulative product of values.

    Parameters:
    - expr: Column expression

    Returns:
    Running product
    """


def ngroup():
    """
    Group numbering within by() operations.

    Returns:
    Sequential group numbers starting from 0
    """


# --- Usage examples ---
import datatable as dt
from datatable import f
DT = dt.Frame({
'A': [1, 2, 3, 4, 5],
'B': [1.1, 2.2, 3.3, 4.4, 5.5],
'C': ['x', 'y', 'x', 'y', 'x']
})
# Basic statistics
total = DT[:, dt.sum(f.A)] # Sum of A
average = DT[:, dt.mean(f.B)] # Mean of B
count_rows = DT[:, dt.count()] # Total row count
unique_C = DT[:, dt.nunique(f.C)] # Unique values in C
# Multiple aggregations
stats = DT[:, [
dt.sum(f.A),
dt.mean(f.A),
dt.min(f.A),
dt.max(f.A),
dt.sd(f.A)
]]DT_na = dt.Frame({
'X': [1, None, 3, None, 5],
'Y': [1.1, 2.2, None, 4.4, None]
})
# Aggregations ignore missing values
result = DT_na[:, [
dt.sum(f.X), # Sum of non-missing values
dt.count(f.X), # Count of non-missing values
dt.countna(f.X), # Count of missing values
dt.mean(f.Y) # Mean of non-missing values
]]
# Check for missing values
has_missing = DT_na[:, dt.countna(f.X)] > 0DT = dt.Frame({
'category': ['A', 'B', 'A', 'B', 'A', 'C'],
'value': [10, 15, 12, 18, 14, 20],
'score': [85, 92, 78, 95, 87, 88]
})
# Group by single column
result = DT[:, dt.sum(f.value), dt.by(f.category)]
result = DT[:, dt.mean(f.score), dt.by(f.category)]
# Multiple aggregations per group
result = DT[:, [
dt.sum(f.value),
dt.mean(f.score),
dt.count(),
dt.min(f.score),
dt.max(f.score)
], dt.by(f.category)]DT = dt.Frame({
'region': ['North', 'South', 'North', 'South', 'North', 'South'],
'category': ['A', 'A', 'B', 'B', 'A', 'B'],
'sales': [100, 150, 120, 180, 110, 170],
'units': [10, 12, 8, 15, 9, 14]
})
# Group by multiple columns
result = DT[:, [
dt.sum(f.sales),
dt.sum(f.units)
], dt.by(f.region, f.category)]
# Nested statistics
result = DT[:, [
dt.mean(f.sales),
dt.sd(f.sales),
dt.count()
], dt.by(f.region, f.category)]# Group with expressions
result = DT[:, dt.sum(f.sales), dt.by(f.sales > 140)]
# Group with computed categories
result = DT[:, dt.mean(f.sales), dt.by(
dt.ifelse(f.units > 10, "high_volume", "low_volume")
)]
# Multiple conditions
result = DT[:, dt.sum(f.sales), dt.by(
f.region,
f.sales > dt.mean(f.sales)
)]DT = dt.Frame({
'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
'sales': [100, 150, 120, 180],
'category': ['A', 'A', 'B', 'B']
})
# Running totals
result = DT[:, dt.update(
running_sum=dt.cumsum(f.sales),
running_avg=dt.cumsum(f.sales) / dt.cumcount()
), dt.by(f.category)]
# Running statistics
result = DT[:, dt.update(
running_max=dt.cummax(f.sales),
running_min=dt.cummin(f.sales)
), dt.by(f.category)]# Group identification
result = DT[:, dt.update(
group_id=dt.ngroup(),
group_size=dt.count()
), dt.by(f.category)]
# Ranking within groups
result = DT[:, dt.update(
rank_in_group=dt.cumcount() + 1,
sales_rank=dt.sort(f.sales, reverse=True)[:, dt.cumcount() + 1]
), dt.by(f.category)]# Comprehensive statistics per group
def group_summary(frame, value_col, group_col):
return frame[:, [
dt.count().alias('count'),
dt.sum(value_col).alias('sum'),
dt.mean(value_col).alias('mean'),
dt.median(value_col).alias('median'),
dt.min(value_col).alias('min'),
dt.max(value_col).alias('max'),
dt.sd(value_col).alias('std'),
dt.nunique(value_col).alias('unique_count')
], dt.by(group_col)]
summary = group_summary(DT, f.sales, f.category)DT_corr = dt.Frame({
'x': [1, 2, 3, 4, 5, 6],
'y': [2, 4, 6, 8, 10, 12],
'z': [1, 4, 9, 16, 25, 36],
'group': ['A', 'A', 'A', 'B', 'B', 'B']
})
# Overall correlations
corr_xy = DT_corr[:, dt.corr(f.x, f.y)]
cov_xy = DT_corr[:, dt.cov(f.x, f.y)]
# Correlations by group
result = DT_corr[:, [
dt.corr(f.x, f.y),
dt.corr(f.x, f.z),
dt.cov(f.x, f.y)
], dt.by(f.group)]# Conditional sums
result = DT[:, [
dt.sum(dt.ifelse(f.sales > 140, f.sales, 0)), # Sum of high sales
dt.count(dt.ifelse(f.sales > 140, 1, None)), # Count of high sales
dt.mean(dt.ifelse(f.category == 'A', f.sales, None)) # Mean for category A
], dt.by(f.region)]
# Percentage calculations
result = DT[:, [
dt.sum(f.sales),
(dt.sum(f.sales) / dt.sum(f.sales).sum()) * 100 # Percentage of total
], dt.by(f.category)]# Pre-sort for faster grouping on large datasets
# NOTE(review): dt.sort is normally used inside DT[i, j, ...]; confirm that
# calling it with a Frame as the first argument is supported.
sorted_DT = dt.sort(DT, f.category)
result = sorted_DT[:, dt.sum(f.value), dt.by(f.category)]

# Use appropriate data types for grouping columns
DT_optimized = DT[:, dt.update(
    category=dt.as_type(f.category, dt.str32)  # Smaller strings
)]

# Minimize temporary objects in complex aggregations
result = DT[:, [
    dt.sum(f.sales).alias('total_sales'),
    dt.count().alias('count'),
    (dt.sum(f.sales) / dt.count()).alias('avg_sales')
], dt.by(f.region)]

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-datatable