Python package for manipulating 2-dimensional tabular data structures with emphasis on speed and big data support
npx @tessl/cli install tessl/pypi-datatable@1.1.0
A high-performance Python library for manipulating 2-dimensional tabular data structures with emphasis on speed and big data support up to 100GB on single-node machines. It features column-oriented data storage with native-C implementation, fast CSV reading, multi-threaded processing, and an expressive query syntax similar to R's data.table.
pip install datatable
import datatable as dt
from datatable import f, g, by, join
Common pattern for data manipulation:
import datatable as dt
from datatable import f, g, by
import datatable as dt
from datatable import f, g, by
# Read data from CSV
DT = dt.fread("data.csv")
# Create a Frame from data
DT = dt.Frame({
'A': [1, 2, 3, 4, 5],
'B': ['a', 'b', 'c', 'd', 'e'],
'C': [1.1, 2.2, 3.3, 4.4, 5.5]
})
# Basic operations
result = DT[:, f.A] # Select column A
result = DT[f.A > 2, :] # Filter rows where A > 2
result = DT[:, dt.sum(f.A)] # Aggregate sum of column A
# Groupby operations
result = DT[:, dt.sum(f.A), by(f.B)] # Sum A grouped by B
# Update operations
DT[:, dt.update(D=f.A * 2)] # Add new column D
# Join operations
DT2 = dt.Frame({'B': ['a', 'b'], 'X': [10, 20]})
result = DT[:, :, dt.join(DT2)] # Join on common columns
datatable follows a columnar storage architecture for performance:
The library is designed specifically for machine learning applications requiring fast feature generation from large datasets, with copy-on-write semantics and rowindex views to minimize data copying.
The Frame class provides the main interface for tabular data manipulation with high-performance columnar storage and comprehensive data type support.
class Frame:
def __init__(self, data=None, *, names=None, stypes=None,
stype=None, types=None, type=None): ...
@property
def shape(self) -> tuple: ...
@property
def names(self) -> tuple: ...
@property
def stypes(self) -> tuple: ...
def __getitem__(self, key): ...
def __setitem__(self, key, value): ...
Column references and expression building using f and g objects for flexible data queries and transformations.
# Column reference objects
f: object # Primary column reference
g: object # Secondary column reference (for joins)
class FExpr:
"""Expression object for column operations"""
pass
class Namespace:
"""Namespace for organizing column references"""
pass
High-performance reading and writing of various file formats with automatic type detection and memory-efficient processing.
def fread(anysource=None, *, file=None, text=None, cmd=None,
url=None, **kwargs) -> Frame: ...
def iread(anysource=None, *, file=None, text=None, cmd=None,
url=None, **kwargs): ... # Iterator version
Comprehensive set of functions for combining, transforming, and reshaping data frames.
def cbind(*frames) -> Frame: ...
def rbind(*frames, force=False, bynames=True) -> Frame: ...
def unique(frame, *cols) -> Frame: ...
def sort(frame, *cols) -> Frame: ...
def update(**kwargs): ... # Update/add columns
def fillna(frame, value): ... # Fill missing values
def repeat(frame, n): ... # Repeat rows n times
def shift(frame, n): ... # Shift values by n positions
Statistical and mathematical reduction functions for data analysis and aggregation operations.
def sum(expr): ...
def mean(expr): ...
def count(expr=None): ...
def min(expr): ...
def max(expr): ...
def median(expr): ...
def sd(expr): ... # Standard deviation
def nunique(expr): ...
Comprehensive mathematical operations including trigonometric, logarithmic, and statistical functions.
def abs(x): ...
def exp(x): ...
def log(x): ...
def log10(x): ...
def sqrt(x): ...
def isna(x): ...
def ifelse(condition, x, y): ... # Conditional selection
Mathematical set operations for combining and comparing data frames.
def union(*frames) -> Frame: ...
def intersect(*frames) -> Frame: ...
def setdiff(frame1, frame2) -> Frame: ...
def symdiff(frame1, frame2) -> Frame: ...
Element-wise operations across columns within rows for complex transformations.
def rowall(*cols): ...
def rowany(*cols): ...
def rowcount(*cols): ...
def rowsum(*cols): ...
def rowmean(*cols): ...
Text processing and manipulation functions for string columns.
# String module functions
def len(x): ... # String length
def slice(x, start, stop=None): ... # String slicing
Date and time manipulation functions for temporal data analysis.
def year(x): ...
def month(x): ...
def day(x): ...
def hour(x): ...
def minute(x): ...
def second(x): ...
Comprehensive type system with storage types (stype) and logical types (ltype) for precise data type control.
class stype(Enum):
    """Storage type (stype) identifiers, as listed in this API summary.

    Each member maps a storage-type name to its integer code. The codes
    are not contiguous (e.g. 8-10 and 13-20 are absent from this listing),
    so always refer to members by name rather than assuming a dense range.
    """

    # Placeholder / untyped storage
    void = 0

    # Boolean and integer storage, by increasing width
    bool8 = 1
    int8 = 2
    int16 = 3
    int32 = 4
    int64 = 5

    # Floating-point storage
    float32 = 6
    float64 = 7

    # String storage (presumably 32- and 64-bit offsets -- confirm upstream)
    str32 = 11
    str64 = 12

    # Arbitrary Python object storage
    obj64 = 21
def as_type(frame, new_type): ...
Functions for data discretization and categorical encoding operations.
def cut(x, bins, right=True, labels=None): ... # Bin values into discrete intervals
def qcut(x, q, labels=None): ... # Quantile-based discretization
def split_into_nhot(frame, delimiter=","): ... # One-hot encoding for delimited strings
# Module alias
dt = datatable # Common alias for the datatable module
# Configuration
options: Config # Global configuration system
# Display initialization
init_styles(): ... # Initialize display styles (auto-run in Jupyter)