Python package for manipulating 2-dimensional tabular data structures with emphasis on speed and big data support
—
Comprehensive functions for combining, transforming, and reshaping data frames with high-performance operations.
Combine multiple frames by columns or rows with flexible options for handling mismatched structures.
def cbind(*frames, force=False) -> Frame:
"""
Bind frames column-wise (horizontally).
Parameters:
- *frames: Frame objects to bind
- force: Force binding even with mismatched row counts
Returns:
Frame with columns from all input frames
"""
def rbind(*frames, force=False, bynames=True) -> Frame:
"""
Bind frames row-wise (vertically).
Parameters:
- *frames: Frame objects to bind
- force: Force binding even with mismatched column types
- bynames: Match columns by name (True) or position (False)
Returns:
Frame with rows from all input frames
"""Sort frames and extract unique values with flexible column specifications.
def sort(frame, *cols, reverse=False, na_position='first') -> Frame:
"""
Sort frame by specified columns.
Parameters:
- frame: Frame to sort
- *cols: Column expressions or names to sort by
- reverse: Sort in descending order
- na_position: Position of NA values ('first' or 'last')
Returns:
Sorted Frame
"""
def unique(frame, *cols) -> Frame:
"""
Return unique rows based on specified columns.
Parameters:
- frame: Frame to process
- *cols: Columns to consider for uniqueness (all if none specified)
Returns:
Frame with unique rows
"""Helper functions for data transformation and manipulation.
def repeat(frame, n) -> Frame:
"""
Repeat frame rows n times.
Parameters:
- frame: Frame to repeat
- n: Number of repetitions
Returns:
Frame with repeated rows
"""
def shift(column, n=1) -> FExpr:
"""
Shift column values by n positions.
Parameters:
- column: Column expression to shift
- n: Number of positions to shift (positive=down, negative=up)
Returns:
Expression with shifted values
"""
def fillna(column, value) -> FExpr:
"""
Fill missing values in column with specified value.
Parameters:
- column: Column expression with missing values
- value: Value to use for filling NAs
Returns:
Expression with filled values
"""
def ifelse(condition, true_value, false_value) -> FExpr:
"""
Conditional expression returning different values based on condition.
Parameters:
- condition: Boolean expression
- true_value: Value when condition is True
- false_value: Value when condition is False
Returns:
Expression with conditional values
"""Convert between different data types with explicit control over the conversion process.
def as_type(frame_or_column, new_type) -> Frame:
"""
Convert frame or column to specified type.
Parameters:
- frame_or_column: Frame or column expression to convert
- new_type: Target stype or Type object
Returns:
Frame or expression with converted types
"""
def update(**kwargs) -> UpdateExpr:
"""
Create update specification for adding or modifying columns.
Parameters:
- **kwargs: column_name=expression pairs
Returns:
Update expression for use in Frame operations
"""
def cut(column, bins, right=True, labels=None) -> FExpr:
"""
Bin values into discrete intervals.
Parameters:
- column: Column expression to bin
- bins: Number of bins or sequence of bin edges
- right: Include right edge of intervals
- labels: Labels for bins
Returns:
Categorical column with binned values
"""
def qcut(column, q, labels=None) -> FExpr:
"""
Quantile-based binning of values.
Parameters:
- column: Column expression to bin
- q: Number of quantiles or sequence of quantile boundaries
- labels: Labels for bins
Returns:
Categorical column with quantile-based bins
"""
def split_into_nhot(frame, delimiter=",") -> Frame:
"""
One-hot encoding for delimited string values.
Parameters:
- frame: Frame containing delimited strings
- delimiter: Character used to separate values
Returns:
Frame with binary columns for each unique value
"""import datatable as dt
# Create sample frames
DT1 = dt.Frame({'A': [1, 2, 3], 'B': [4, 5, 6]})
DT2 = dt.Frame({'C': [7, 8, 9], 'D': [10, 11, 12]})
DT3 = dt.Frame({'E': [13, 14, 15]})
# Bind columns
result = dt.cbind(DT1, DT2, DT3)
# Result: Frame with columns A, B, C, D, E
# Force binding with mismatched row counts
DT4 = dt.Frame({'F': [16, 17]}) # Only 2 rows
result = dt.cbind(DT1, DT4, force=True) # Shorter frame is recycled# Create compatible frames
DT1 = dt.Frame({'X': [1, 2], 'Y': ['a', 'b']})
DT2 = dt.Frame({'X': [3, 4], 'Y': ['c', 'd']})
DT3 = dt.Frame({'X': [5, 6], 'Y': ['e', 'f']})
# Bind rows
result = dt.rbind(DT1, DT2, DT3)
# Result: Frame with 6 rows and columns X, Y
# Bind with different column orders
DT4 = dt.Frame({'Y': ['g', 'h'], 'X': [7, 8]})
result = dt.rbind(DT1, DT4, bynames=True) # Matches by column names
# Force binding with type mismatches
DT5 = dt.Frame({'X': [1.1, 2.2], 'Y': ['i', 'j']}) # X is float
result = dt.rbind(DT1, DT5, force=True) # Forces compatible typesDT = dt.Frame({
'A': [3, 1, 4, 1, 5],
'B': ['c', 'a', 'd', 'a', 'e'],
'C': [3.3, 1.1, 4.4, 1.2, 5.5]
})
# Sort by single column
sorted_DT = dt.sort(DT, f.A) # Sort by A ascending
sorted_DT = dt.sort(DT, -f.A) # Sort by A descending
sorted_DT = dt.sort(DT, f.A, reverse=True) # Alternative descending
# Sort by multiple columns
sorted_DT = dt.sort(DT, f.B, f.A) # Sort by B, then A
sorted_DT = dt.sort(DT, f.B, -f.C) # Sort by B asc, C desc
# Sort with NA handling
DT_na = dt.Frame({'X': [3, None, 1, None, 2]})
sorted_DT = dt.sort(DT_na, f.X, na_position='last')# Sort as part of selection
result = DT[:, :, dt.sort(f.A)]
result = DT[f.A > 2, :, dt.sort(f.B)]
# Sort within groups
result = DT[:, :, dt.sort(f.C), dt.by(f.B)]DT = dt.Frame({
'A': [1, 2, 2, 3, 3, 3],
'B': ['x', 'y', 'y', 'z', 'z', 'w'],
'C': [1.1, 2.2, 2.2, 3.3, 3.4, 3.5]
})
# Unique rows (all columns)
unique_DT = dt.unique(DT)
# Unique based on specific columns
unique_DT = dt.unique(DT, f.A) # Unique values of A
unique_DT = dt.unique(DT, f.A, f.B) # Unique combinations of A and B
# Unique in Frame operations
result = DT[:, :, dt.unique(f.A)]DT = dt.Frame({
'score': [85, 92, 78, 95, 67],
'category': ['A', 'B', 'A', 'B', 'C']
})
# Simple conditional
result = DT[:, dt.update(
grade=dt.ifelse(f.score >= 90, "A", "B")
)]
# Nested conditionals
result = DT[:, dt.update(
grade=dt.ifelse(f.score >= 90, "A",
dt.ifelse(f.score >= 80, "B",
dt.ifelse(f.score >= 70, "C", "F")))
)]
# Conditional aggregation
result = DT[:, dt.sum(dt.ifelse(f.score >= 80, 1, 0)), dt.by(f.category)]DT = dt.Frame({
'A': [1, None, 3, None, 5],
'B': [1.1, 2.2, None, 4.4, None]
})
# Fill missing values
result = DT[:, dt.update(
A_filled=dt.fillna(f.A, 0),
B_filled=dt.fillna(f.B, dt.mean(f.B))
)]
# Forward fill
result = DT[:, dt.update(
A_ffill=dt.fillna(f.A, dt.shift(f.A, 1))
)]
# Conditional filling
result = DT[:, dt.update(
A_smart=dt.ifelse(dt.isna(f.A), dt.mean(f.A), f.A)
)]DT = dt.Frame({
'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
'value': [10, 15, 12, 18]
})
# Lag values (shift down)
result = DT[:, dt.update(
prev_value=dt.shift(f.value, 1), # Previous value
prev2_value=dt.shift(f.value, 2) # Value 2 periods ago
)]
# Lead values (shift up)
result = DT[:, dt.update(
next_value=dt.shift(f.value, -1), # Next value
next2_value=dt.shift(f.value, -2) # Value 2 periods ahead
)]
# Calculate differences
result = DT[:, dt.update(
diff=f.value - dt.shift(f.value, 1),
pct_change=((f.value - dt.shift(f.value, 1)) / dt.shift(f.value, 1)) * 100
)]DT = dt.Frame({'A': [1, 2], 'B': ['x', 'y']})
# Repeat entire frame
repeated = dt.repeat(DT, 3) # 6 rows total
# Repeat with expressions
result = DT[:, dt.repeat(f.A, 2)] # Each value repeated twice
# Create expanding sequences
base = dt.Frame({'seq': [1]})
expanded = dt.repeat(base, 5)[:, dt.update(seq=range(1, 6))]DT = dt.Frame({
'A': [1, 2, 3], # int64 by default
'B': [1.1, 2.2, 3.3], # float64 by default
'C': ['1', '2', '3'] # str64
})
# Convert single column
result = DT[:, dt.update(A_float=dt.as_type(f.A, dt.float32))]
# Convert multiple columns
result = DT[:, dt.update(
A_str=dt.as_type(f.A, dt.str32),
C_int=dt.as_type(f.C, dt.int32)
)]
# Convert entire frame
DT_float = dt.as_type(DT, dt.float64)# Conditional type conversion
result = DT[:, dt.update(
A_converted=dt.ifelse(f.A > 2,
dt.as_type(f.A, dt.float32),
dt.as_type(f.A, dt.int32))
)]
# Safe conversion with error handling
try:
result = DT[:, dt.update(C_numeric=dt.as_type(f.C, dt.float64))]
except dt.exceptions.TypeError as e:
# Handle conversion errors
result = DT[:, dt.update(C_numeric=dt.fillna(dt.as_type(f.C, dt.float64), 0))]Install with Tessl CLI
npx tessl i tessl/pypi-datatable