tessl/pypi-datatable

Python package for manipulating 2-dimensional tabular data structures with emphasis on speed and big data support

—

Pending

Overview

Eval results

Files

Data Manipulation

Name: tessl/pypi-datatable
Author: tessl

Comprehensive functions for combining, transforming, and reshaping data frames with high-performance operations.

Capabilities

Frame Binding Operations

Combine multiple frames by columns or rows with flexible options for handling mismatched structures.

def cbind(*frames, force=False) -> Frame:
    """
    Bind frames column-wise (horizontally).
    
    Parameters:
    - *frames: Frame objects to bind
    - force: Force binding even with mismatched row counts
    
    Returns:
    Frame with columns from all input frames
    """

def rbind(*frames, force=False, bynames=True) -> Frame:
    """
    Bind frames row-wise (vertically).
    
    Parameters:
    - *frames: Frame objects to bind
    - force: Force binding even with mismatched column types
    - bynames: Match columns by name (True) or position (False)
    
    Returns:
    Frame with rows from all input frames
    """

Sorting and Uniqueness

Sort frames and extract unique values with flexible column specifications.

def sort(frame, *cols, reverse=False, na_position='first') -> Frame:
    """
    Sort frame by specified columns.
    
    Parameters:
    - frame: Frame to sort
    - *cols: Column expressions or names to sort by
    - reverse: Sort in descending order
    - na_position: Position of NA values ('first' or 'last')
    
    Returns:
    Sorted Frame
    """

def unique(frame, *cols) -> Frame:
    """
    Return unique rows based on specified columns.
    
    Parameters:
    - frame: Frame to process
    - *cols: Columns to consider for uniqueness (all if none specified)
    
    Returns:
    Frame with unique rows
    """

Utility Functions

Helper functions for data transformation and manipulation.

def repeat(frame, n) -> Frame:
    """
    Repeat frame rows n times.
    
    Parameters:
    - frame: Frame to repeat
    - n: Number of repetitions
    
    Returns:
    Frame with repeated rows
    """

def shift(column, n=1) -> FExpr:
    """
    Shift column values by n positions.
    
    Parameters:
    - column: Column expression to shift
    - n: Number of positions to shift (positive=down, negative=up)
    
    Returns:
    Expression with shifted values
    """

def fillna(column, value) -> FExpr:
    """
    Fill missing values in column with specified value.
    
    Parameters:
    - column: Column expression with missing values
    - value: Value to use for filling NAs
    
    Returns:
    Expression with filled values
    """

def ifelse(condition, true_value, false_value) -> FExpr:
    """
    Conditional expression returning different values based on condition.
    
    Parameters:
    - condition: Boolean expression
    - true_value: Value when condition is True
    - false_value: Value when condition is False
    
    Returns:
    Expression with conditional values
    """

Type Conversion

Convert between different data types with explicit control over the conversion process.

def as_type(frame_or_column, new_type) -> Frame:
    """
    Convert frame or column to specified type.
    
    Parameters:
    - frame_or_column: Frame or column expression to convert
    - new_type: Target stype or Type object
    
    Returns:
    Frame or expression with converted types
    """

def update(**kwargs) -> UpdateExpr:
    """
    Create update specification for adding or modifying columns.
    
    Parameters:
    - **kwargs: column_name=expression pairs
    
    Returns:
    Update expression for use in Frame operations
    """

def cut(column, bins, right=True, labels=None) -> FExpr:
    """
    Bin values into discrete intervals.
    
    Parameters:
    - column: Column expression to bin  
    - bins: Number of bins or sequence of bin edges
    - right: Include right edge of intervals
    - labels: Labels for bins
    
    Returns:
    Categorical column with binned values
    """

def qcut(column, q, labels=None) -> FExpr:
    """
    Quantile-based binning of values.
    
    Parameters:
    - column: Column expression to bin
    - q: Number of quantiles or sequence of quantile boundaries
    - labels: Labels for bins
    
    Returns:
    Categorical column with quantile-based bins
    """

def split_into_nhot(frame, delimiter=",") -> Frame:
    """
    One-hot encoding for delimited string values.
    
    Parameters:
    - frame: Frame containing delimited strings
    - delimiter: Character used to separate values
    
    Returns:
    Frame with binary columns for each unique value
    """

Data Binding Examples

Column Binding

import datatable as dt

# Create sample frames
DT1 = dt.Frame({'A': [1, 2, 3], 'B': [4, 5, 6]})
DT2 = dt.Frame({'C': [7, 8, 9], 'D': [10, 11, 12]})
DT3 = dt.Frame({'E': [13, 14, 15]})

# Bind columns
result = dt.cbind(DT1, DT2, DT3)
# Result: Frame with columns A, B, C, D, E

# Force binding with mismatched row counts
DT4 = dt.Frame({'F': [16, 17]})  # Only 2 rows
result = dt.cbind(DT1, DT4, force=True)  # Shorter frame is recycled

Row Binding

# Create compatible frames
DT1 = dt.Frame({'X': [1, 2], 'Y': ['a', 'b']})
DT2 = dt.Frame({'X': [3, 4], 'Y': ['c', 'd']})
DT3 = dt.Frame({'X': [5, 6], 'Y': ['e', 'f']})

# Bind rows
result = dt.rbind(DT1, DT2, DT3)
# Result: Frame with 6 rows and columns X, Y

# Bind with different column orders
DT4 = dt.Frame({'Y': ['g', 'h'], 'X': [7, 8]})
result = dt.rbind(DT1, DT4, bynames=True)  # Matches by column names

# Force binding with type mismatches
DT5 = dt.Frame({'X': [1.1, 2.2], 'Y': ['i', 'j']})  # X is float
result = dt.rbind(DT1, DT5, force=True)  # Forces compatible types

Sorting Examples

Basic Sorting

DT = dt.Frame({
    'A': [3, 1, 4, 1, 5],
    'B': ['c', 'a', 'd', 'a', 'e'],
    'C': [3.3, 1.1, 4.4, 1.2, 5.5]
})

# Sort by single column
sorted_DT = dt.sort(DT, f.A)           # Sort by A ascending
sorted_DT = dt.sort(DT, -f.A)          # Sort by A descending
sorted_DT = dt.sort(DT, f.A, reverse=True)  # Alternative descending

# Sort by multiple columns
sorted_DT = dt.sort(DT, f.B, f.A)      # Sort by B, then A
sorted_DT = dt.sort(DT, f.B, -f.C)     # Sort by B asc, C desc

# Sort with NA handling
DT_na = dt.Frame({'X': [3, None, 1, None, 2]})
sorted_DT = dt.sort(DT_na, f.X, na_position='last')

Sorting in Frame Operations

# Sort as part of selection
result = DT[:, :, dt.sort(f.A)]
result = DT[f.A > 2, :, dt.sort(f.B)]

# Sort within groups
result = DT[:, :, dt.sort(f.C), dt.by(f.B)]

Uniqueness Examples

Basic Unique Operations

DT = dt.Frame({
    'A': [1, 2, 2, 3, 3, 3],
    'B': ['x', 'y', 'y', 'z', 'z', 'w'],
    'C': [1.1, 2.2, 2.2, 3.3, 3.4, 3.5]
})

# Unique rows (all columns)
unique_DT = dt.unique(DT)

# Unique based on specific columns
unique_DT = dt.unique(DT, f.A)         # Unique values of A
unique_DT = dt.unique(DT, f.A, f.B)    # Unique combinations of A and B

# Unique in Frame operations
result = DT[:, :, dt.unique(f.A)]

Transformation Examples

Conditional Logic

DT = dt.Frame({
    'score': [85, 92, 78, 95, 67],
    'category': ['A', 'B', 'A', 'B', 'C']
})

# Simple conditional
result = DT[:, dt.update(
    grade=dt.ifelse(f.score >= 90, "A", "B")
)]

# Nested conditionals
result = DT[:, dt.update(
    grade=dt.ifelse(f.score >= 90, "A",
          dt.ifelse(f.score >= 80, "B",
          dt.ifelse(f.score >= 70, "C", "F")))
)]

# Conditional aggregation  
result = DT[:, dt.sum(dt.ifelse(f.score >= 80, 1, 0)), dt.by(f.category)]

Missing Value Handling

DT = dt.Frame({
    'A': [1, None, 3, None, 5],
    'B': [1.1, 2.2, None, 4.4, None]
})

# Fill missing values
result = DT[:, dt.update(
    A_filled=dt.fillna(f.A, 0),
    B_filled=dt.fillna(f.B, dt.mean(f.B))
)]

# Forward fill
result = DT[:, dt.update(
    A_ffill=dt.fillna(f.A, dt.shift(f.A, 1))
)]

# Conditional filling
result = DT[:, dt.update(
    A_smart=dt.ifelse(dt.isna(f.A), dt.mean(f.A), f.A)
)]

Data Shifting

DT = dt.Frame({
    'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
    'value': [10, 15, 12, 18]
})

# Lag values (shift down)
result = DT[:, dt.update(
    prev_value=dt.shift(f.value, 1),    # Previous value
    prev2_value=dt.shift(f.value, 2)    # Value 2 periods ago
)]

# Lead values (shift up)
result = DT[:, dt.update(
    next_value=dt.shift(f.value, -1),   # Next value
    next2_value=dt.shift(f.value, -2)   # Value 2 periods ahead
)]

# Calculate differences
result = DT[:, dt.update(
    diff=f.value - dt.shift(f.value, 1),
    pct_change=((f.value - dt.shift(f.value, 1)) / dt.shift(f.value, 1)) * 100
)]

Repetition and Expansion

DT = dt.Frame({'A': [1, 2], 'B': ['x', 'y']})

# Repeat entire frame
repeated = dt.repeat(DT, 3)  # 6 rows total

# Repeat with expressions
result = DT[:, dt.repeat(f.A, 2)]  # Each value repeated twice

# Create expanding sequences
base = dt.Frame({'seq': [1]})
expanded = dt.repeat(base, 5)[:, dt.update(seq=range(1, 6))]

Type Conversion Examples

Basic Type Conversion

DT = dt.Frame({
    'A': [1, 2, 3],           # int64 by default
    'B': [1.1, 2.2, 3.3],    # float64 by default
    'C': ['1', '2', '3']      # str64
})

# Convert single column
result = DT[:, dt.update(A_float=dt.as_type(f.A, dt.float32))]

# Convert multiple columns
result = DT[:, dt.update(
    A_str=dt.as_type(f.A, dt.str32),
    C_int=dt.as_type(f.C, dt.int32)
)]

# Convert entire frame
DT_float = dt.as_type(DT, dt.float64)

Advanced Type Operations

# Conditional type conversion
result = DT[:, dt.update(
    A_converted=dt.ifelse(f.A > 2, 
                         dt.as_type(f.A, dt.float32),
                         dt.as_type(f.A, dt.int32))
)]

# Safe conversion with error handling
try:
    result = DT[:, dt.update(C_numeric=dt.as_type(f.C, dt.float64))]
except dt.exceptions.TypeError as e:
    # Handle conversion errors
    result = DT[:, dt.update(C_numeric=dt.fillna(dt.as_type(f.C, dt.float64), 0))]

Install with Tessl CLI