Python package for manipulating 2-dimensional tabular data structures, with an emphasis on speed and big-data support.

Statistical and mathematical reduction functions for data analysis and aggregation operations, supporting both full-frame and grouped aggregations.

Fundamental statistical measures for summarizing data distributions.
def sum(expr=None):
    """
    Sum of values, ignoring missing values.

    Parameters:
    - expr: Column expression (all numeric columns if None)

    Returns:
    Sum of values
    """


def mean(expr):
    """
    Arithmetic mean of values, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Mean value
    """


def count(expr=None):
    """
    Count of non-missing values.

    Parameters:
    - expr: Column expression (row count if None)

    Returns:
    Count of non-missing values
    """


def countna(expr):
    """
    Count of missing (NA) values.

    Parameters:
    - expr: Column expression

    Returns:
    Count of missing values
    """


def nunique(expr):
    """
    Count of unique values, excluding missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Number of unique values
    """


# Functions for analyzing data distributions and variability.
def min(expr):
    """
    Minimum value, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Minimum value
    """


def max(expr):
    """
    Maximum value, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Maximum value
    """


def median(expr):
    """
    Median (50th percentile) value.

    Parameters:
    - expr: Column expression

    Returns:
    Median value
    """


def sd(expr):
    """
    Standard deviation with N-1 denominator.

    Parameters:
    - expr: Column expression

    Returns:
    Standard deviation
    """


def prod(expr):
    """
    Product of values, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Product of values
    """


# Functions that return specific positional values from data.
def first(expr):
    """
    First non-missing value in order.

    Parameters:
    - expr: Column expression

    Returns:
    First non-missing value
    """


def last(expr):
    """
    Last non-missing value in order.

    Parameters:
    - expr: Column expression

    Returns:
    Last non-missing value
    """


# Statistical measures of relationships between variables.
def corr(expr1, expr2=None):
    """
    Pearson correlation coefficient between variables.

    Parameters:
    - expr1: First column expression
    - expr2: Second column expression (correlation matrix if None)

    Returns:
    Correlation coefficient or matrix
    """


def cov(expr1, expr2=None):
    """
    Covariance between variables.

    Parameters:
    - expr1: First column expression
    - expr2: Second column expression (covariance matrix if None)

    Returns:
    Covariance value or matrix
    """


# Running aggregations that accumulate values over ordered data.
def cumsum(expr):
    """
    Cumulative sum of values.

    Parameters:
    - expr: Column expression

    Returns:
    Running cumulative sum
    """


def cumcount(expr=None):
    """
    Cumulative count of non-missing values.

    Parameters:
    - expr: Column expression (row numbers if None)

    Returns:
    Running count
    """


def cummax(expr):
    """
    Cumulative maximum values.

    Parameters:
    - expr: Column expression

    Returns:
    Running maximum
    """


def cummin(expr):
    """
    Cumulative minimum values.

    Parameters:
    - expr: Column expression

    Returns:
    Running minimum
    """
def cumprod(expr):
    """
    Cumulative product of values.

    Parameters:
    - expr: Column expression

    Returns:
    Running product
    """


def ngroup():
    """
    Group numbering within by() operations.

    Returns:
    Sequential group numbers starting from 0
    """


# --- Usage examples ---
import datatable as dt
from datatable import f
DT = dt.Frame({
'A': [1, 2, 3, 4, 5],
'B': [1.1, 2.2, 3.3, 4.4, 5.5],
'C': ['x', 'y', 'x', 'y', 'x']
})
# Basic statistics
total = DT[:, dt.sum(f.A)] # Sum of A
average = DT[:, dt.mean(f.B)] # Mean of B
count_rows = DT[:, dt.count()] # Total row count
unique_C = DT[:, dt.nunique(f.C)] # Unique values in C
# Multiple aggregations
stats = DT[:, [
dt.sum(f.A),
dt.mean(f.A),
dt.min(f.A),
dt.max(f.A),
dt.sd(f.A)
]]DT_na = dt.Frame({
'X': [1, None, 3, None, 5],
'Y': [1.1, 2.2, None, 4.4, None]
})
# Aggregations ignore missing values
result = DT_na[:, [
dt.sum(f.X), # Sum of non-missing values
dt.count(f.X), # Count of non-missing values
dt.countna(f.X), # Count of missing values
dt.mean(f.Y) # Mean of non-missing values
]]
# Check for missing values
has_missing = DT_na[:, dt.countna(f.X)] > 0DT = dt.Frame({
'category': ['A', 'B', 'A', 'B', 'A', 'C'],
'value': [10, 15, 12, 18, 14, 20],
'score': [85, 92, 78, 95, 87, 88]
})
# Group by single column
result = DT[:, dt.sum(f.value), dt.by(f.category)]
result = DT[:, dt.mean(f.score), dt.by(f.category)]
# Multiple aggregations per group
result = DT[:, [
dt.sum(f.value),
dt.mean(f.score),
dt.count(),
dt.min(f.score),
dt.max(f.score)
], dt.by(f.category)]DT = dt.Frame({
'region': ['North', 'South', 'North', 'South', 'North', 'South'],
'category': ['A', 'A', 'B', 'B', 'A', 'B'],
'sales': [100, 150, 120, 180, 110, 170],
'units': [10, 12, 8, 15, 9, 14]
})
# Group by multiple columns
result = DT[:, [
dt.sum(f.sales),
dt.sum(f.units)
], dt.by(f.region, f.category)]
# Nested statistics
result = DT[:, [
dt.mean(f.sales),
dt.sd(f.sales),
dt.count()
], dt.by(f.region, f.category)]# Group with expressions
result = DT[:, dt.sum(f.sales), dt.by(f.sales > 140)]
# Group with computed categories
result = DT[:, dt.mean(f.sales), dt.by(
dt.ifelse(f.units > 10, "high_volume", "low_volume")
)]
# Multiple conditions
result = DT[:, dt.sum(f.sales), dt.by(
f.region,
f.sales > dt.mean(f.sales)
)]DT = dt.Frame({
'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
'sales': [100, 150, 120, 180],
'category': ['A', 'A', 'B', 'B']
})
# Running totals
result = DT[:, dt.update(
running_sum=dt.cumsum(f.sales),
running_avg=dt.cumsum(f.sales) / dt.cumcount()
), dt.by(f.category)]
# Running statistics
result = DT[:, dt.update(
running_max=dt.cummax(f.sales),
running_min=dt.cummin(f.sales)
), dt.by(f.category)]# Group identification
result = DT[:, dt.update(
group_id=dt.ngroup(),
group_size=dt.count()
), dt.by(f.category)]
# Ranking within groups
result = DT[:, dt.update(
rank_in_group=dt.cumcount() + 1,
sales_rank=dt.sort(f.sales, reverse=True)[:, dt.cumcount() + 1]
), dt.by(f.category)]# Comprehensive statistics per group
def group_summary(frame, value_col, group_col):
return frame[:, [
dt.count().alias('count'),
dt.sum(value_col).alias('sum'),
dt.mean(value_col).alias('mean'),
dt.median(value_col).alias('median'),
dt.min(value_col).alias('min'),
dt.max(value_col).alias('max'),
dt.sd(value_col).alias('std'),
dt.nunique(value_col).alias('unique_count')
], dt.by(group_col)]
summary = group_summary(DT, f.sales, f.category)DT_corr = dt.Frame({
'x': [1, 2, 3, 4, 5, 6],
'y': [2, 4, 6, 8, 10, 12],
'z': [1, 4, 9, 16, 25, 36],
'group': ['A', 'A', 'A', 'B', 'B', 'B']
})
# Overall correlations
corr_xy = DT_corr[:, dt.corr(f.x, f.y)]
cov_xy = DT_corr[:, dt.cov(f.x, f.y)]
# Correlations by group
result = DT_corr[:, [
dt.corr(f.x, f.y),
dt.corr(f.x, f.z),
dt.cov(f.x, f.y)
], dt.by(f.group)]# Conditional sums
result = DT[:, [
dt.sum(dt.ifelse(f.sales > 140, f.sales, 0)), # Sum of high sales
dt.count(dt.ifelse(f.sales > 140, 1, None)), # Count of high sales
dt.mean(dt.ifelse(f.category == 'A', f.sales, None)) # Mean for category A
], dt.by(f.region)]
# Percentage calculations
result = DT[:, [
dt.sum(f.sales),
(dt.sum(f.sales) / dt.sum(f.sales).sum()) * 100 # Percentage of total
], dt.by(f.category)]# Pre-sort for faster grouping on large datasets
# NOTE(review): dt.sort is normally used inside DT[i, j, ...]; confirm that
# calling it with a Frame as the first argument is supported.
sorted_DT = dt.sort(DT, f.category)
result = sorted_DT[:, dt.sum(f.value), dt.by(f.category)]

# Use appropriate data types for grouping columns
DT_optimized = DT[:, dt.update(
    category=dt.as_type(f.category, dt.str32)  # Smaller strings
)]

# Minimize temporary objects in complex aggregations
result = DT[:, [
    dt.sum(f.sales).alias('total_sales'),
    dt.count().alias('count'),
    (dt.sum(f.sales) / dt.count()).alias('avg_sales')
], dt.by(f.region)]

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-datatable