Python package for manipulating 2-dimensional tabular data structures with emphasis on speed and big data support
—
datatable's expression system provides a powerful and flexible way to reference columns and build complex data transformations using f and g objects.
Primary objects for referencing columns in Frame operations and building expressions.
f: object
"""
Primary column reference object for selecting and transforming columns.
Used in most datatable operations for column selection and expressions.
"""
g: object
"""
Secondary column reference object primarily used in join operations
to distinguish between columns from different frames.
"""class FExpr:
"""
Expression object representing column operations and transformations.
Created automatically when using f/g objects with operators.
"""
def __add__(self, other): ...
def __sub__(self, other): ...
def __mul__(self, other): ...
def __truediv__(self, other): ...
def __mod__(self, other): ...
def __pow__(self, other): ...
def __eq__(self, other): ...
def __ne__(self, other): ...
def __lt__(self, other): ...
def __le__(self, other): ...
def __gt__(self, other): ...
def __ge__(self, other): ...
def __and__(self, other): ...
def __or__(self, other): ...
def __invert__(self): ...def by(*args):
"""
Create grouping specification for group-by operations.
Parameters:
- *args: Column expressions or names to group by
Returns:
Grouping object for use in Frame operations
"""
def join(frame, on=None, how='inner'):
"""
Create join specification for joining frames.
Parameters:
- frame: Frame to join with
- on: Columns to join on (auto-detected if None)
- how: Join type ('inner', 'left', 'right', 'outer')
Returns:
Join object for use in Frame operations
"""def update(**kwargs):
"""
Create update specification for adding or modifying columns.
Parameters:
- **kwargs: column_name=expression pairs
Returns:
Update object for use in Frame operations
"""import datatable as dt
from datatable import f, g
DT = dt.Frame({
'A': [1, 2, 3, 4, 5],
'B': [10, 20, 30, 40, 50],
'C': ['x', 'y', 'z', 'x', 'y']
})
# Column selection
DT[:, f.A] # Select column A
DT[:, f[0]] # Select first column by index
DT[:, f['A']] # Select column by name string
DT[:, [f.A, f.B]] # Select multiple columns
DT[:, f.A:f.C] # Select column range
DT[:, f[:]] # Select all columns# Arithmetic operations
DT[:, f.A + f.B] # Add columns
DT[:, f.A * 2] # Multiply by scalar
DT[:, f.A / f.B] # Division
DT[:, f.A ** 2] # Power
DT[:, f.A % 3] # Modulo
# Create new columns with expressions
DT[:, dt.update(D=f.A + f.B, E=f.A * f.B)]# Boolean operations
DT[f.A > 2, :] # Filter rows
DT[f.A > f.B, :] # Compare columns
DT[(f.A > 2) & (f.B < 40), :] # Logical AND
DT[(f.A < 2) | (f.A > 4), :] # Logical OR
DT[~(f.A == 3), :] # Logical NOT
# Conditional assignment
DT[:, dt.update(flag=dt.ifelse(f.A > 3, "high", "low"))]# String operations using dt.str module
DT[:, dt.str.len(f.C)] # String length
DT[:, dt.str.slice(f.C, 0, 1)] # String slicing
# String comparisons
DT[f.C == 'x', :] # String equality
DT[dt.re.match(f.C, '[xy]'), :] # Regex matching# Basic aggregations
DT[:, dt.sum(f.A)] # Sum of column A
DT[:, dt.mean(f.B)] # Mean of column B
DT[:, dt.count()] # Row count
# Group-by operations
DT[:, dt.sum(f.A), dt.by(f.C)] # Sum A grouped by C
DT[:, [dt.sum(f.A), dt.mean(f.B)], dt.by(f.C)] # Multiple aggregations
# Multiple grouping columns
DT[:, dt.sum(f.A), dt.by(f.C, f.A > 2)]DT2 = dt.Frame({
'C': ['x', 'y', 'z'],
'X': [100, 200, 300]
})
# Join using g to reference right frame columns
result = DT[:, [f.A, f.B, g.X], dt.join(DT2)]
# Join with conditions
result = DT[:, :, dt.join(DT2, on=f.C==g.C)]# Time component extraction
DT_time = dt.Frame({
'date': [dt.datetime(2023, 1, 1), dt.datetime(2023, 6, 15)]
})
DT_time[:, dt.time.year(f.date)] # Extract year
DT_time[:, dt.time.month(f.date)] # Extract month
DT_time[:, dt.time.day(f.date)] # Extract day# Mathematical functions
DT[:, dt.abs(f.A - 3)] # Absolute value
DT[:, dt.exp(f.A)] # Exponential
DT[:, dt.log(f.A)] # Natural logarithm
DT[:, dt.sqrt(f.A)] # Square root
DT[:, dt.isna(f.A)] # Check for missing values# Row-wise operations across columns
DT[:, dt.rowsum(f.A, f.B)] # Row-wise sum
DT[:, dt.rowmean(f.A, f.B)] # Row-wise mean
DT[:, dt.rowmax(f.A, f.B)] # Row-wise maximum
DT[:, dt.rowcount(f.A, f.B)] # Row-wise count of non-missing# Chained operations
result = DT[f.A > dt.mean(f.A),
[f.A, f.B, dt.update(ratio=f.A/f.B)]]
# Conditional aggregations
DT[:, dt.sum(dt.ifelse(f.A > 3, f.B, 0)), dt.by(f.C)]
# Window functions with shift
DT[:, dt.update(prev_A=dt.shift(f.A, 1)), dt.by(f.C)]class Expr:
"""Base expression class for column operations"""
passInstall with Tessl CLI
npx tessl i tessl/pypi-datatable