# polars-lts-cpu

Blazingly fast DataFrame library for legacy CPUs without AVX2 support.

The fundamental data structures that form the foundation of polars-lts-cpu's data manipulation capabilities. These classes provide different approaches to working with tabular data, from eager evaluation to lazy optimization.

## DataFrame

Two-dimensional data structure representing tabular data with rows and columns, providing eager evaluation for immediate operations.
class DataFrame:
    """Two-dimensional, eagerly evaluated tabular data structure (rows x columns)."""

    def __init__(
        self,
        data: Any = None,
        schema: SchemaDict | None = None,
        schema_overrides: SchemaDict | None = None,
        orient: str | None = None,
        infer_schema_length: int | None = 100,
        nan_to_null: bool = False
    ):
        """
        Create a DataFrame from various data sources.

        Parameters:
        - data: Data source (dict, list, numpy array, pandas DataFrame, etc.)
        - schema: Schema specification as {column: dtype} dict
        - schema_overrides: Override inferred types for specific columns
        - orient: Data orientation ('col' or 'row')
        - infer_schema_length: Number of rows to scan for schema inference
        - nan_to_null: Convert NaN values to null
        """

    # Properties
    @property
    def shape(self) -> tuple[int, int]:
        """Returns (height, width) tuple."""

    @property
    def height(self) -> int:
        """Number of rows."""

    @property
    def width(self) -> int:
        """Number of columns."""

    @property
    def columns(self) -> list[str]:
        """Column names."""

    @property
    def dtypes(self) -> list[type]:
        """Column data types."""

    @property
    def schema(self) -> dict[str, type]:
        """Schema as {column: dtype} dict."""

    # Data Selection and Filtering
    def select(self, *exprs: IntoExpr) -> DataFrame:
        """Select columns using expressions."""

    def filter(self, predicate: IntoExpr) -> DataFrame:
        """Filter rows based on predicate."""

    def with_columns(self, *exprs: IntoExpr, **named_exprs: IntoExpr) -> DataFrame:
        """Add or modify columns."""

    def drop(self, *columns: str) -> DataFrame:
        """Drop columns."""

    def rename(self, mapping: dict[str, str]) -> DataFrame:
        """Rename columns."""

    # Data Access
    def get_column(self, name: str) -> Series:
        """Get column as Series."""

    def get_columns(self) -> list[Series]:
        """Get all columns as list of Series."""

    def row(self, index: int, *, named: bool = False) -> tuple | dict:
        """Get single row."""

    def rows(self, *, named: bool = False) -> list[tuple] | list[dict]:
        """Get all rows."""

    def item(self, row: int | None = None, column: str | int | None = None) -> Any:
        """Get single item."""

    # Transformations
    def sort(
        self,
        by: ColumnNameOrSelector | list[ColumnNameOrSelector],
        *,
        descending: bool | list[bool] = False,
        nulls_last: bool = False
    ) -> DataFrame:
        """Sort DataFrame."""

    def reverse(self) -> DataFrame:
        """Reverse row order."""

    def transpose(
        self,
        *,
        include_header: bool = False,
        header_name: str = "column",
        column_names: str | list[str] | None = None
    ) -> DataFrame:
        """Transpose DataFrame."""

    def cast(self, dtypes: dict[str, type] | type, *, strict: bool = True) -> DataFrame:
        """Cast column types."""

    # Aggregations
    def sum(self, *, axis: int = 0) -> DataFrame | Series:
        """Sum values."""

    def mean(self, *, axis: int = 0) -> DataFrame | Series:
        """Mean of values."""

    def max(self, *, axis: int = 0) -> DataFrame | Series:
        """Maximum values."""

    def min(self, *, axis: int = 0) -> DataFrame | Series:
        """Minimum values."""

    def std(self, *, ddof: int = 1, axis: int = 0) -> DataFrame | Series:
        """Standard deviation."""

    def var(self, *, ddof: int = 1, axis: int = 0) -> DataFrame | Series:
        """Variance."""

    def median(self, *, axis: int = 0) -> DataFrame | Series:
        """Median values."""

    def quantile(self, quantile: float, *, interpolation: str = "nearest", axis: int = 0) -> DataFrame | Series:
        """Quantile values."""

    # Horizontal Operations
    def sum_horizontal(self, *exprs: IntoExpr) -> DataFrame:
        """Sum values horizontally across columns."""

    def mean_horizontal(self, *exprs: IntoExpr) -> DataFrame:
        """Mean values horizontally across columns."""

    def max_horizontal(self, *exprs: IntoExpr) -> DataFrame:
        """Maximum values horizontally across columns."""

    def min_horizontal(self, *exprs: IntoExpr) -> DataFrame:
        """Minimum values horizontally across columns."""

    # Grouping Operations
    def group_by(
        self,
        *by: IntoExpr,
        maintain_order: bool = False,
        **named_by: IntoExpr
    ) -> GroupBy:
        """Group DataFrame by expressions."""

    def rolling(
        self,
        index_column: str,
        *,
        period: str | timedelta,
        offset: str | timedelta | None = None,
        closed: str = "right",
        by: str | list[str] | None = None,
        check_sorted: bool = True
    ) -> RollingGroupBy:
        """Create rolling window groupby."""

    # Joins
    def join(
        self,
        other: DataFrame,
        on: str | list[str] | None = None,
        how: str = "inner",
        *,
        left_on: str | list[str] | None = None,
        right_on: str | list[str] | None = None,
        suffix: str = "_right",
        validate: str = "m:m",
        join_nulls: bool = False
    ) -> DataFrame:
        """Join with another DataFrame."""

    def join_asof(
        self,
        other: DataFrame,
        *,
        left_on: str | None = None,
        right_on: str | None = None,
        on: str | None = None,
        by_left: str | list[str] | None = None,
        by_right: str | list[str] | None = None,
        by: str | list[str] | None = None,
        strategy: str = "backward",
        suffix: str = "_right",
        tolerance: str | int | float | None = None,
        allow_exact_matches: bool = True
    ) -> DataFrame:
        """Perform asof join."""

    # Reshaping
    def pivot(
        self,
        *,
        on: ColumnNameOrSelector,
        index: ColumnNameOrSelector | None = None,
        values: ColumnNameOrSelector | None = None,
        aggregate_function: str | Expr | None = None,
        maintain_order: bool = True,
        sort_columns: bool = False,
        separator: str = "_"
    ) -> DataFrame:
        """Pivot DataFrame."""

    def unpivot(
        self,
        on: ColumnNameOrSelector | None = None,
        *,
        index: ColumnNameOrSelector | None = None,
        variable_name: str | None = None,
        value_name: str | None = None
    ) -> DataFrame:
        """Unpivot DataFrame."""

    def melt(
        self,
        id_vars: ColumnNameOrSelector | None = None,
        value_vars: ColumnNameOrSelector | None = None,
        *,
        variable_name: str | None = None,
        value_name: str | None = None
    ) -> DataFrame:
        """Melt DataFrame from wide to long format."""

    # Utilities
    def head(self, n: int = 5) -> DataFrame:
        """Get first n rows."""

    def tail(self, n: int = 5) -> DataFrame:
        """Get last n rows."""

    def slice(self, offset: int, length: int | None = None) -> DataFrame:
        """Slice DataFrame."""

    def limit(self, n: int) -> DataFrame:
        """Limit to n rows."""

    def sample(
        self,
        n: int | None = None,
        *,
        fraction: float | None = None,
        with_replacement: bool = False,
        shuffle: bool = False,
        seed: int | None = None
    ) -> DataFrame:
        """Sample rows."""

    def unique(
        self,
        subset: ColumnNameOrSelector | None = None,
        *,
        keep: str = "any",
        maintain_order: bool = False
    ) -> DataFrame:
        """Get unique rows."""

    def drop_nulls(self, subset: ColumnNameOrSelector | None = None) -> DataFrame:
        """Drop rows with null values."""

    def fill_null(
        self,
        value: Any = None,
        strategy: str | None = None,
        limit: int | None = None,
        *,
        matches_supertype: bool = True
    ) -> DataFrame:
        """Fill null values."""

    # Conversion
    def lazy(self) -> LazyFrame:
        """Convert to LazyFrame."""

    def to_series(self, index: int = 0) -> Series:
        """Convert to Series."""

    def to_dict(self, *, as_series: bool = True) -> dict:
        """Convert to dictionary."""

    def to_dicts(self) -> list[dict]:
        """Convert to list of dictionaries."""

    def to_numpy(self, *, structured: bool = False, order: str = "c") -> np.ndarray:
        """Convert to numpy array."""

    def to_pandas(self, **kwargs) -> pd.DataFrame:
        """Convert to pandas DataFrame."""

    def to_arrow(self) -> pa.Table:
        """Convert to PyArrow table."""

    # I/O Operations
    def write_csv(
        self,
        file: str | Path | BytesIO,
        *,
        include_bom: bool = False,
        include_header: bool = True,
        separator: str = ",",
        line_terminator: str = "\n",
        quote_char: str = '"',
        batch_size: int = 1024,
        datetime_format: str | None = None,
        date_format: str | None = None,
        time_format: str | None = None,
        float_scientific: bool | None = None,
        float_precision: int | None = None,
        null_value: str = ""
    ) -> None:
        """Write to CSV file."""

    def write_parquet(
        self,
        file: str | Path | BytesIO,
        *,
        compression: str = "zstd",
        compression_level: int | None = None,
        statistics: bool | dict[str, bool] = True,
        row_group_size: int | None = None,
        data_page_size: int | None = None,
        maintain_order: bool = True
    ) -> None:
        """Write to Parquet file."""

    def write_json(self, file: str | Path | BytesIO, *, pretty: bool = False, row_oriented: bool = False) -> None:
        """Write to JSON file."""

    # Analysis
    def describe(self, *, percentiles: Sequence[float] = (0.25, 0.5, 0.75)) -> DataFrame:
        """Generate descriptive statistics."""

    def equals(self, other: DataFrame, *, null_equal: bool = True) -> bool:
        """Check equality with another DataFrame."""

    def is_duplicated(self) -> Series:
        """Check for duplicated rows."""
    def is_unique(self) -> Series:
        """Check for unique rows."""

## LazyFrame

Lazy evaluation version of DataFrame that builds a computation graph for optimized query execution.
class LazyFrame:
    """Lazily evaluated frame: operations build a query plan executed by collect()/fetch()."""

    # Properties
    @property
    def columns(self) -> list[str]:
        """Column names."""

    @property
    def dtypes(self) -> list[type]:
        """Column data types."""

    @property
    def schema(self) -> dict[str, type]:
        """Schema as {column: dtype} dict."""

    @property
    def width(self) -> int:
        """Number of columns."""

    # Query Execution
    def collect(
        self,
        *,
        predicate_pushdown: bool = True,
        projection_pushdown: bool = True,
        simplify_expression: bool = True,
        slice_pushdown: bool = True,
        comm_subplan_elim: bool = True,
        comm_subexpr_elim: bool = True,
        cluster_with_columns: bool = True,
        streaming: bool = False
    ) -> DataFrame:
        """Execute lazy query and return DataFrame."""

    def collect_async(self, *, gevent: bool = False) -> Awaitable[DataFrame]:
        """Execute lazy query asynchronously."""

    def fetch(
        self,
        n_rows: int = 500,
        *,
        type_coercion: bool = True,
        predicate_pushdown: bool = True,
        projection_pushdown: bool = True,
        simplify_expression: bool = True,
        slice_pushdown: bool = True,
        comm_subplan_elim: bool = True,
        comm_subexpr_elim: bool = True,
        streaming: bool = False
    ) -> DataFrame:
        """Execute lazy query for first n rows."""

    def explain(
        self,
        *,
        format: str = "plain",
        optimized: bool = True,
        type_coercion: bool = True,
        predicate_pushdown: bool = True,
        projection_pushdown: bool = True,
        simplify_expression: bool = True,
        slice_pushdown: bool = True,
        comm_subplan_elim: bool = True,
        comm_subexpr_elim: bool = True,
        streaming: bool = False,
        tree_format: bool | None = None
    ) -> str:
        """Show query execution plan."""

    # Transformations (same interface as DataFrame but lazy)
    def select(self, *exprs: IntoExpr) -> LazyFrame: ...
    def filter(self, predicate: IntoExpr) -> LazyFrame: ...
    def with_columns(self, *exprs: IntoExpr, **named_exprs: IntoExpr) -> LazyFrame: ...
    def drop(self, *columns: str) -> LazyFrame: ...
    def rename(self, mapping: dict[str, str]) -> LazyFrame: ...
    def sort(self, by: ColumnNameOrSelector, *, descending: bool = False) -> LazyFrame: ...
    def reverse(self) -> LazyFrame: ...
    def cast(self, dtypes: dict[str, type] | type, *, strict: bool = True) -> LazyFrame: ...

    # Grouping Operations
    def group_by(self, *by: IntoExpr, maintain_order: bool = False) -> LazyGroupBy: ...
    def rolling(self, index_column: str, *, period: str) -> RollingGroupBy: ...

    # Joins
    def join(self, other: LazyFrame, on: str | list[str], how: str = "inner", **kwargs) -> LazyFrame: ...
    def join_asof(self, other: LazyFrame, **kwargs) -> LazyFrame: ...

    # Utilities
    def head(self, n: int = 5) -> LazyFrame: ...
    def tail(self, n: int = 5) -> LazyFrame: ...
    def slice(self, offset: int, length: int | None = None) -> LazyFrame: ...
    def limit(self, n: int) -> LazyFrame: ...

    # Streaming Sinks
    def sink_parquet(
        self,
        path: str | Path,
        *,
        compression: str = "zstd",
        maintain_order: bool = True,
        **kwargs
    ) -> DataFrame:
        """Write to Parquet file using streaming engine."""
    def sink_csv(self, path: str | Path, **kwargs) -> DataFrame:
        """Write to CSV file using streaming engine."""

## Series

One-dimensional data structure representing a single column of data.
class Series:
    """One-dimensional, named, typed column of values."""

    def __init__(
        self,
        name: str | None = None,
        values: Sequence[Any] | None = None,
        dtype: type | None = None,
        *,
        strict: bool = True,
        nan_to_null: bool = False
    ):
        """
        Create a Series.

        Parameters:
        - name: Series name
        - values: Data values
        - dtype: Data type
        - strict: Strict type checking
        - nan_to_null: Convert NaN to null
        """

    # Properties
    @property
    def dtype(self) -> type:
        """Data type."""

    @property
    def name(self) -> str:
        """Series name."""

    @property
    def shape(self) -> tuple[int]:
        """Shape as (length,) tuple."""

    # Arithmetic Operations
    def __add__(self, other: Any) -> Series: ...
    def __sub__(self, other: Any) -> Series: ...
    def __mul__(self, other: Any) -> Series: ...
    def __truediv__(self, other: Any) -> Series: ...
    def __floordiv__(self, other: Any) -> Series: ...
    def __mod__(self, other: Any) -> Series: ...
    def __pow__(self, other: Any) -> Series: ...

    # Comparison Operations
    def eq(self, other: Any) -> Series:
        """Element-wise equality."""

    def ne(self, other: Any) -> Series:
        """Element-wise inequality."""

    def lt(self, other: Any) -> Series:
        """Element-wise less than."""

    def le(self, other: Any) -> Series:
        """Element-wise less than or equal."""

    def gt(self, other: Any) -> Series:
        """Element-wise greater than."""

    def ge(self, other: Any) -> Series:
        """Element-wise greater than or equal."""

    # Aggregations
    def sum(self) -> Any:
        """Sum of values."""

    def mean(self) -> float | None:
        """Mean of values."""

    def max(self) -> Any:
        """Maximum value."""

    def min(self) -> Any:
        """Minimum value."""

    def std(self, ddof: int = 1) -> float | None:
        """Standard deviation."""

    def var(self, ddof: int = 1) -> float | None:
        """Variance."""

    def median(self) -> float | None:
        """Median value."""

    def quantile(self, quantile: float, interpolation: str = "nearest") -> float | None:
        """Quantile value."""

    # Data Access
    def get(self, index: int) -> Any:
        """Get value by index."""

    def item(self, index: int | None = None) -> Any:
        """Get single item."""

    def gather(self, indices: list[int] | Series) -> Series:
        """Gather values by indices."""

    # Transformations
    def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Series:
        """Sort Series."""

    def reverse(self) -> Series:
        """Reverse Series."""

    def cast(self, dtype: type, *, strict: bool = True) -> Series:
        """Cast to different type."""

    def rename(self, name: str) -> Series:
        """Rename Series."""

    # Utilities
    def drop_nulls(self) -> Series:
        """Drop null values."""

    def fill_null(self, value: Any = None, strategy: str | None = None) -> Series:
        """Fill null values."""

    def unique(self, *, maintain_order: bool = False) -> Series:
        """Get unique values."""

    def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFrame:
        """Count unique values."""

    # Conversion
    def to_frame(self, name: str | None = None) -> DataFrame:
        """Convert to DataFrame."""

    def to_list(self) -> list[Any]:
        """Convert to Python list."""

    def to_numpy(self, *, zero_copy_only: bool = False, writable: bool = False) -> np.ndarray:
        """Convert to numpy array."""

    def to_pandas(self, **kwargs) -> pd.Series:
        """Convert to pandas Series."""

    # Namespaces
    @property
    def str(self) -> StringNameSpace:
        """String operations namespace."""

    @property
    def dt(self) -> DateTimeNameSpace:
        """DateTime operations namespace."""

    @property
    def list(self) -> ListNameSpace:
        """List operations namespace."""
    @property
    def struct(self) -> StructNameSpace:
        """Struct operations namespace."""

## Expr

Expression object for building complex lazy computations and transformations.
class Expr:
    """Composable expression describing a (lazy) column computation."""

    # Arithmetic Operations
    def __add__(self, other: Any) -> Expr: ...
    def __sub__(self, other: Any) -> Expr: ...
    def __mul__(self, other: Any) -> Expr: ...
    def __truediv__(self, other: Any) -> Expr: ...

    # Aggregations
    def sum(self) -> Expr:
        """Sum aggregation."""

    def mean(self) -> Expr:
        """Mean aggregation."""

    def max(self) -> Expr:
        """Maximum aggregation."""

    def min(self) -> Expr:
        """Minimum aggregation."""

    def count(self) -> Expr:
        """Count aggregation."""

    def std(self, ddof: int = 1) -> Expr:
        """Standard deviation."""

    def var(self, ddof: int = 1) -> Expr:
        """Variance."""

    # Window Functions
    def over(self, *partition_by: IntoExpr, order_by: IntoExpr | None = None) -> Expr:
        """Window function over partitions."""

    def rolling_sum(self, window_size: int | str, weights: list[float] | None = None) -> Expr:
        """Rolling sum."""

    def rolling_mean(self, window_size: int | str, weights: list[float] | None = None) -> Expr:
        """Rolling mean."""

    # Conditional Logic
    def when(self, predicate: Expr) -> ExprWhenThen:
        """Start conditional expression."""

    def then(self, statement: IntoExpr) -> ExprWhenThen:
        """Then clause in conditional."""

    def otherwise(self, statement: IntoExpr) -> Expr:
        """Else clause in conditional."""

    # Transformations
    def cast(self, dtype: type, *, strict: bool = True) -> Expr:
        """Cast to different type."""

    def alias(self, name: str) -> Expr:
        """Alias expression."""

    def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Expr:
        """Sort expression."""

    def reverse(self) -> Expr:
        """Reverse expression."""

    # Utilities
    def is_null(self) -> Expr:
        """Check for null values."""

    def is_not_null(self) -> Expr:
        """Check for non-null values."""

    def fill_null(self, value: Any = None, strategy: str | None = None) -> Expr:
        """Fill null values."""

    def drop_nulls(self) -> Expr:
        """Drop null values."""

    # Namespaces
    @property
    def str(self) -> ExprStringNameSpace:
        """String operations namespace."""

    @property
    def dt(self) -> ExprDateTimeNameSpace:
        """DateTime operations namespace."""

    @property
    def list(self) -> ExprListNameSpace:
        """List operations namespace."""

    @property
    def arr(self) -> ExprArrayNameSpace:
        """Array operations namespace."""

    @property
    def struct(self) -> ExprStructNameSpace:
        """Struct operations namespace."""

    @property
    def cat(self) -> ExprCategoricalNameSpace:
        """Categorical operations namespace."""

    @property
    def bin(self) -> ExprBinaryNameSpace:
        """Binary operations namespace."""

    @property
    def name(self) -> ExprNameNameSpace:
        """Name operations namespace."""
    @property
    def meta(self) -> ExprMetaNameSpace:
        """Meta operations namespace."""

## Usage examples

import polars as pl
# Create DataFrame
df = pl.DataFrame({
    "id": [1, 2, 3, 4],
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "age": [25, 30, 35, 28],
    "salary": [50000, 60000, 70000, 55000]
})

# Select and transform columns, then keep only rows with age > 25
result = df.select([
    pl.col("name"),
    pl.col("age"),
    (pl.col("salary") * 1.1).alias("new_salary")
]).filter(pl.col("age") > 25)
print(result)

# Create LazyFrame and build query
lazy_df = (
    pl.scan_csv("large_file.csv")
    .filter(pl.col("date") >= "2023-01-01")
    .group_by("category")
    .agg([
        pl.col("amount").sum().alias("total_amount"),
        pl.col("id").count().alias("count")
    ])
    .sort("total_amount", descending=True)
)
# Execute query (nothing runs until collect)
result = lazy_df.collect()

# Create Series
s = pl.Series("values", [1, 2, 3, 4, 5])
# Perform operations
doubled = s * 2
mean_val = s.mean()
unique_vals = s.unique()
# String operations
text_series = pl.Series("text", ["hello", "world", "polars"])
upper_text = text_series.str.upper()

# Complex expressions: chained when/then with a final otherwise fallback
expr = (
    pl.when(pl.col("age") < 30)
    .then(pl.col("salary") * 0.8)
    .when(pl.col("age") < 40)
    .then(pl.col("salary") * 0.9)
    .otherwise(pl.col("salary"))
    .alias("adjusted_salary")
)
# Use in DataFrame
df_with_adjustment = df.with_columns(expr)

## QueryOptFlags

Configuration class for controlling query optimization behavior in LazyFrame operations, allowing fine-grained control over performance optimizations.
class QueryOptFlags:
    """Bundle of boolean flags controlling LazyFrame query-plan optimizations."""

    def __init__(
        self,
        *,
        predicate_pushdown: bool | None = None,
        projection_pushdown: bool | None = None,
        simplify_expression: bool | None = None,
        slice_pushdown: bool | None = None,
        comm_subplan_elim: bool | None = None,
        comm_subexpr_elim: bool | None = None,
        cluster_with_columns: bool | None = None,
        collapse_joins: bool | None = None,
        check_order_observe: bool | None = None,
        fast_projection: bool | None = None,
    ):
        """
        Configure query optimization flags.

        Parameters:
        - predicate_pushdown: Push predicates down in the query tree
        - projection_pushdown: Push projections down in the query tree
        - simplify_expression: Simplify expressions during optimization
        - slice_pushdown: Push slice operations down in the query tree
        - comm_subplan_elim: Eliminate common subplans
        - comm_subexpr_elim: Eliminate common subexpressions
        - cluster_with_columns: Cluster with_columns operations
        - collapse_joins: Collapse consecutive joins
        - check_order_observe: Check if ordering is observed
        - fast_projection: Use fast projection when possible
        """

    @staticmethod
    def none(**kwargs) -> QueryOptFlags:
        """Create QueryOptFlags with all optimizations disabled."""
    def update(self, **kwargs) -> QueryOptFlags:
        """Update optimization flags."""

## GPUEngine

Configuration class for GPU-accelerated processing in LazyFrame operations.
class GPUEngine:
    def __init__(self):
        """
        Configure GPU engine for accelerated processing.

        Note: GPU processing requires compatible hardware and drivers.
        """

## Installation

Install with the Tessl CLI:

    npx tessl i tessl/pypi-polars-lts-cpu