Blazingly fast DataFrame library with 64-bit index support for handling datasets with more than 4.2 billion rows
—
Primary data structures for working with tabular data in Polars: the eager DataFrame and Series for immediate operations, and LazyFrame for optimized query execution. The 64-bit index variant supports datasets exceeding 4.2 billion rows.
Two-dimensional labeled data structure with columns of potentially different types. The primary data structure for eager evaluation where operations are executed immediately.
class DataFrame:
    """Two-dimensional labeled data structure with columns of potentially
    different types.

    This is the primary structure for eager evaluation: every operation
    executes immediately and returns a new DataFrame.
    """

    def __init__(
        self,
        data=None,
        schema=None,
        schema_overrides=None,
        orient=None,
        infer_schema_length=N_INFER_DEFAULT,
        nan_to_null=False,
    ):
        """Build a DataFrame from a variety of data sources.

        Parameters:
        - data: source data (dict, list, numpy array, pandas DataFrame, etc.)
        - schema: column names and types
        - schema_overrides: override the inferred type of specific columns
        - orient: orientation of the input data ("row" or "col")
        - infer_schema_length: number of rows scanned during type inference
        - nan_to_null: convert floating-point NaN values to null
        """

    @property
    def shape(self) -> tuple[int, int]:
        """Return the shape as (rows, columns)."""

    @property
    def height(self) -> int:
        """Return the number of rows."""

    @property
    def width(self) -> int:
        """Return the number of columns."""

    @property
    def columns(self) -> list[str]:
        """Return the column names."""

    @property
    def dtypes(self) -> list[DataType]:
        """Return the data type of every column."""

    @property
    def schema(self) -> Schema:
        """Return the schema (column names and types)."""

    def select(self, *exprs, **named_exprs) -> DataFrame:
        """Project the frame onto the given expressions.

        Parameters:
        - exprs: column expressions to select
        - named_exprs: expressions aliased by their keyword name
        Returns:
            DataFrame containing only the selected columns.
        """

    def filter(self, *predicates, **constraints) -> DataFrame:
        """Keep only the rows matching the given predicates.

        Parameters:
        - predicates: boolean expressions used as row filters
        - constraints: named constraints given by keyword
        Returns:
            The filtered DataFrame.
        """

    def with_columns(self, *exprs, **named_exprs) -> DataFrame:
        """Return a frame with columns added or replaced.

        Parameters:
        - exprs: expressions producing new or replacement columns
        - named_exprs: expressions aliased by their keyword name
        Returns:
            DataFrame with the added/modified columns.
        """

    def drop(self, *columns, strict=True) -> DataFrame:
        """Return a frame without the named columns.

        Parameters:
        - columns: names of the columns to remove
        - strict: raise an error when a named column does not exist
        Returns:
            DataFrame without the dropped columns.
        """

    def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> DataFrame:
        """Return a frame with columns renamed.

        Parameters:
        - mapping: old-name -> new-name dict, or a function applied to each name
        Returns:
            DataFrame with renamed columns.
        """

    def sort(
        self,
        by,
        *,
        descending=False,
        nulls_last=False,
        multithreaded=True,
    ) -> DataFrame:
        """Return the frame sorted by one or more columns.

        Parameters:
        - by: column(s) to sort by
        - descending: sort in descending order
        - nulls_last: place null values at the end
        - multithreaded: allow a multi-threaded sort
        Returns:
            The sorted DataFrame.
        """

    def group_by(self, *by, maintain_order=False, **named_by) -> GroupBy:
        """Start a grouped aggregation.

        Parameters:
        - by: columns to group by
        - maintain_order: keep groups in the order they first appear
        - named_by: grouping expressions aliased by their keyword name
        Returns:
            A GroupBy object on which aggregations can be run.
        """

    def join(
        self,
        other,
        on=None,
        how="inner",
        *,
        left_on=None,
        right_on=None,
        suffix="_right",
        validate="m:m",
        join_nulls=False,
        coalesce=None,
    ) -> DataFrame:
        """Join this frame with another DataFrame.

        Parameters:
        - other: right-hand DataFrame to join with
        - on: join key column(s) shared by both sides
        - how: join strategy ("inner", "left", "outer", "cross", "anti", "semi")
        - left_on / right_on: per-side join keys when the names differ
        - suffix: appended to right-hand columns whose names clash with the left
        - validate: relationship check ("m:m", "1:m", "m:1", "1:1")
        - join_nulls: treat null keys as equal and join on them
        - coalesce: merge the key columns of both sides into one
        Returns:
            The joined DataFrame.
        """

    def concat(self, other, *, how="vertical", ignore_index=False) -> DataFrame:
        """Concatenate this frame with other frame(s).

        Parameters:
        - other: DataFrame(s) to concatenate
        - how: concatenation method ("vertical", "horizontal", "diagonal")
        - ignore_index: reset the index after concatenation
        Returns:
            The concatenated DataFrame.
        """

    def to_pandas(self, **kwargs) -> pd.DataFrame:
        """Convert to a pandas DataFrame."""

    def to_numpy(self, structured=False, order="c") -> np.ndarray:
        """Convert to a NumPy array."""

    def to_arrow(self, *, compat_level=None) -> pa.Table:
        """Convert to a PyArrow Table."""

    def to_dict(self, *, as_series=True) -> dict[str, Series | list[Any]]:
        """Convert to a dict of column name -> Series (or list)."""

    def write_csv(self, file=None, **kwargs) -> str | None:
        """Serialize to CSV; returns the text when no file is given."""

    def write_json(self, file=None, **kwargs) -> str | None:
        """Serialize to JSON; returns the text when no file is given."""

    def write_parquet(self, file, **kwargs) -> None:
        """Write the frame to a Parquet file."""

    def write_ipc(self, file, **kwargs) -> None:
        """Write the frame to an IPC/Arrow file."""

    def lazy(self) -> LazyFrame:
        """Convert to a LazyFrame so operations can be query-optimized."""

    def head(self, n=5) -> DataFrame:
        """Return the first n rows."""

    def tail(self, n=5) -> DataFrame:
        """Return the last n rows."""

    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> DataFrame:
        """Return a random sample of rows."""

    def null_count(self) -> DataFrame:
        """Return the number of null values in each column."""

    def is_empty(self) -> bool:
        """Return True when the frame has no rows."""

    def clone(self) -> DataFrame:
        """Return a copy of the DataFrame."""


# One-dimensional labeled array with homogeneous data type. Similar to a
# column in a DataFrame but can exist independently.
class Series:
    """One-dimensional array of values sharing a single data type.

    Behaves like a column of a DataFrame but can exist independently.
    """

    def __init__(
        self,
        name=None,
        values=None,
        dtype=None,
        strict=True,
        nan_to_null=False,
        dtype_if_empty=Null,
    ):
        """Build a Series.

        Parameters:
        - name: series name
        - values: the data values
        - dtype: explicit data type
        - strict: enforce strict type checking
        - nan_to_null: convert floating-point NaN values to null
        - dtype_if_empty: type to use when `values` is empty
        """

    @property
    def name(self) -> str:
        """Return the series name."""

    @property
    def dtype(self) -> DataType:
        """Return the data type."""

    @property
    def shape(self) -> tuple[int]:
        """Return the shape as (length,)."""

    def len(self) -> int:
        """Return the number of elements."""

    def sum(self) -> Any:
        """Return the sum of all values."""

    def mean(self) -> float | None:
        """Return the arithmetic mean."""

    def max(self) -> Any:
        """Return the largest value."""

    def min(self) -> Any:
        """Return the smallest value."""

    def sort(self, *, descending=False, nulls_last=False) -> Series:
        """Return the values in sorted order."""

    def filter(self, predicate) -> Series:
        """Return only the values for which `predicate` holds."""

    def to_list(self) -> list[Any]:
        """Convert to a Python list."""

    def to_numpy(self) -> np.ndarray:
        """Convert to a NumPy array."""

    def to_pandas(self) -> pd.Series:
        """Convert to a pandas Series."""

    def to_frame(self, name=None) -> DataFrame:
        """Wrap the series in a single-column DataFrame."""


# Lazy evaluation version of DataFrame that builds a query plan without
# executing until .collect() is called. Enables query optimization and
# efficient processing of large datasets.
class LazyFrame:
    """Deferred-execution counterpart of DataFrame.

    Operations only build a query plan; nothing runs until .collect() is
    called, which lets the engine optimize the whole query first.
    """

    def select(self, *exprs, **named_exprs) -> LazyFrame:
        """Add a column projection to the plan (lazy operation)."""

    def filter(self, *predicates, **constraints) -> LazyFrame:
        """Add a row filter to the plan (lazy operation)."""

    def with_columns(self, *exprs, **named_exprs) -> LazyFrame:
        """Add column additions/replacements to the plan (lazy operation)."""

    def drop(self, *columns, strict=True) -> LazyFrame:
        """Add a column drop to the plan (lazy operation)."""

    def sort(self, by, *, descending=False, nulls_last=False, multithreaded=True) -> LazyFrame:
        """Add a sort to the plan (lazy operation)."""

    def group_by(self, *by, maintain_order=False, **named_by) -> LazyGroupBy:
        """Start a deferred grouped aggregation (lazy operation)."""

    def join(
        self,
        other,
        on=None,
        how="inner",
        *,
        left_on=None,
        right_on=None,
        suffix="_right",
        validate="m:m",
        join_nulls=False,
        coalesce=None,
    ) -> LazyFrame:
        """Add a join with another LazyFrame to the plan (lazy operation)."""

    def collect(
        self,
        *,
        type_coercion=True,
        predicate_pushdown=True,
        projection_pushdown=True,
        simplify_expression=True,
        slice_pushdown=True,
        comm_subplan_elim=True,
        comm_subexpr_elim=True,
        cluster_with_columns=True,
        no_optimization=False,
        streaming=False,
        background=False,
        _eager=False,
    ) -> DataFrame:
        """Execute the accumulated query plan and materialize a DataFrame.

        Parameters:
        - type_coercion: apply automatic type coercion
        - predicate_pushdown: push filters down to the scan level
        - projection_pushdown: push column selection down to the scan level
        - simplify_expression: simplify expressions before execution
        - slice_pushdown: push limits/offsets down
        - comm_subplan_elim: eliminate common subplans
        - comm_subexpr_elim: eliminate common subexpressions
        - cluster_with_columns: cluster adjacent with_columns operations
        - no_optimization: disable all optimizations
        - streaming: execute in streaming mode
        - background: execute on a background thread
        Returns:
            The executed DataFrame.
        """

    def explain(self, *, optimized=True, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, streaming=False) -> str:
        """Return a textual rendering of the query execution plan."""

    def schema(self) -> Schema:
        """Return the schema the plan is expected to produce."""

    def dtypes(self) -> list[DataType]:
        """Return the expected column data types."""

    def columns(self) -> list[str]:
        """Return the expected column names."""

    def head(self, n=5) -> LazyFrame:
        """Add a first-n-rows slice to the plan (lazy operation)."""

    def tail(self, n=5) -> LazyFrame:
        """Add a last-n-rows slice to the plan (lazy operation)."""

    def limit(self, n) -> LazyFrame:
        """Add a row-count limit to the plan (lazy operation)."""

    def offset(self, n) -> LazyFrame:
        """Add a skip-first-n-rows step to the plan (lazy operation)."""

    def slice(self, offset, length=None) -> LazyFrame:
        """Add an arbitrary row slice to the plan (lazy operation)."""


# GroupBy objects returned from group_by() operations on DataFrame and
# LazyFrame for aggregation operations.
class GroupBy:
    """Handle for eager grouped aggregations, returned by DataFrame.group_by()."""

    def agg(self, *aggs, **named_aggs) -> DataFrame:
        """Run the given aggregation expressions over each group.

        Parameters:
        - aggs: aggregation expressions
        - named_aggs: aggregation expressions aliased by their keyword name
        Returns:
            DataFrame with the aggregated results, one row per group.
        """

    def sum(self) -> DataFrame:
        """Sum the values within each group."""

    def mean(self) -> DataFrame:
        """Average the values within each group."""

    def max(self) -> DataFrame:
        """Take the maximum value within each group."""

    def min(self) -> DataFrame:
        """Take the minimum value within each group."""

    def count(self) -> DataFrame:
        """Count the rows in each group."""

    def first(self) -> DataFrame:
        """Take the first value within each group."""

    def last(self) -> DataFrame:
        """Take the last value within each group."""
class LazyGroupBy:
    """Handle for deferred grouped aggregations, returned by LazyFrame.group_by()."""

    def agg(self, *aggs, **named_aggs) -> LazyFrame:
        """Add the given aggregations to the plan (lazy operation)."""

    def sum(self) -> LazyFrame:
        """Add a per-group sum to the plan (lazy operation)."""

    def mean(self) -> LazyFrame:
        """Add a per-group mean to the plan (lazy operation)."""

    def max(self) -> LazyFrame:
        """Add a per-group maximum to the plan (lazy operation)."""

    def min(self) -> LazyFrame:
        """Add a per-group minimum to the plan (lazy operation)."""

    def count(self) -> LazyFrame:
        """Add a per-group row count to the plan (lazy operation)."""


import polars as pl
# --- Constructing DataFrames ---

# From a dictionary of column name -> values
df = pl.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "salary": [50000, 60000, 70000],
})

# From a list of row dictionaries
data = [
    {"name": "Alice", "age": 25, "salary": 50000},
    {"name": "Bob", "age": 30, "salary": 60000},
    {"name": "Charlie", "age": 35, "salary": 70000},
]
df = pl.DataFrame(data)

# From a NumPy array (column names supplied via `schema`)
import numpy as np

arr = np.array([[1, 2, 3], [4, 5, 6]])
df = pl.DataFrame(arr, schema=["a", "b", "c"])

# Basic operations
# Filter, project, derive a column, then sort — all executed eagerly.
result = (
    df.filter(pl.col("age") > 28)
    .select([
        pl.col("name"),
        pl.col("age"),
        (pl.col("salary") / 1000).alias("salary_k"),
    ])
    .sort("age", descending=True)
)

# Grouping and aggregation
summary = (
    df.group_by("department")
    .agg([
        pl.col("salary").mean().alias("avg_salary"),
        pl.col("name").count().alias("employee_count"),
        pl.col("age").max().alias("max_age"),
    ])
)

# Build query plan without execution
# scan_csv reads nothing yet — this only assembles a query plan.
lazy_query = (
    pl.scan_csv("large_dataset.csv")
    .filter(pl.col("amount") > 1000)
    .group_by("category")
    .agg([
        pl.col("amount").sum().alias("total"),
        pl.col("id").count().alias("count"),
    ])
    .sort("total", descending=True)
)

# Execute the optimized query
result = lazy_query.collect()

# Inspect the execution plan
print(lazy_query.explain())

# The u64-idx variant handles datasets > 4.2B rows
very_large_df = pl.scan_parquet("huge_dataset.parquet")

# Operations work the same but support more rows
result = (
    very_large_df.filter(pl.col("timestamp") > "2023-01-01")
    .group_by("user_id")
    .agg([
        pl.col("value").sum(),
        pl.col("event").count(),
    ])
    .collect(streaming=True)  # streaming mode keeps memory usage bounded
)

# Install with the Tessl CLI:
#   npx tessl i tessl/pypi-polars-u64-idx