Blazingly fast DataFrame library for Python with lazy and eager evaluation modes.

The fundamental data structures that form the foundation of Polars: DataFrame for eager evaluation, LazyFrame for lazy evaluation with query optimization, Series for one-dimensional data, and Expr for building complex column operations and transformations.
Primary data structure for eager evaluation providing immediate computation with comprehensive data manipulation methods including filtering, selection, aggregation, joining, and reshaping operations.
class DataFrame:
    """Primary eager-evaluation data structure.

    Computation happens immediately; methods cover filtering, selection,
    aggregation, joining, reshaping, I/O, and conversion.
    """

    def __init__(
        self,
        data=None,
        schema=None,
        *,
        schema_overrides=None,
        strict=True,
        orient=None,
        infer_schema_length=None,
        nan_to_null=False,
    ):
        """
        Create a DataFrame from various data sources.

        Parameters:
        - data: Data source (dict, list, arrow table, pandas df, etc.)
        - schema: Column names and types
        - schema_overrides: Override specific column types
        - strict: Strict schema validation
        - orient: Data orientation ('row' or 'col')
        - infer_schema_length: Rows to scan for type inference
        - nan_to_null: Convert NaN to null values
        """

    # Selection and Projection
    def select(self, *exprs, **named_exprs) -> DataFrame: ...
    def with_columns(self, *exprs, **named_exprs) -> DataFrame: ...
    def drop(self, *columns) -> DataFrame: ...
    def rename(self, mapping) -> DataFrame: ...

    # Filtering and Sorting
    def filter(self, *predicates) -> DataFrame: ...
    def sort(self, by, *, descending=False, nulls_last=False) -> DataFrame: ...
    def unique(self, subset=None, *, keep="any", maintain_order=False) -> DataFrame: ...
    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> DataFrame: ...

    # Aggregation and Grouping
    def group_by(self, *by, maintain_order=False) -> GroupBy: ...
    def sum(self) -> DataFrame: ...
    def mean(self) -> DataFrame: ...
    def max(self) -> DataFrame: ...
    def min(self) -> DataFrame: ...
    def std(self, ddof=1) -> DataFrame: ...
    def var(self, ddof=1) -> DataFrame: ...

    # Reshaping and Transformation
    def pivot(self, *, on, index=None, values=None, aggregate_function="first", sort_columns=False) -> DataFrame: ...
    def unpivot(self, *, on=None, index=None, variable_name=None, value_name=None) -> DataFrame: ...
    def transpose(self, *, include_header=False, header_name="column", column_names=None) -> DataFrame: ...
    def explode(self, columns, *, schema_overrides=None) -> DataFrame: ...

    # Joining Operations
    def join(self, other, *, on=None, how="inner", left_on=None, right_on=None, suffix="_right", validate=None, join_nulls=False) -> DataFrame: ...
    def join_asof(self, other, *, left_on=None, right_on=None, on=None, by_left=None, by_right=None, by=None, strategy="backward") -> DataFrame: ...

    # Window Operations
    def with_row_index(self, name="row_nr", offset=0) -> DataFrame: ...
    def rolling(self, index_column, *, period, offset=None, closed="right", by=None, check_sorted=True) -> RollingGroupBy: ...

    # I/O Operations
    def write_csv(self, file=None, **kwargs) -> str | None: ...
    def write_parquet(self, file, **kwargs) -> None: ...
    def write_json(self, file=None, **kwargs) -> str | None: ...
    def write_excel(self, workbook=None, worksheet=None, **kwargs): ...
    def write_database(self, table_name, connection, **kwargs) -> int: ...

    # Conversion Methods
    def to_pandas(self, **kwargs): ...
    def to_numpy(self, structured=False, **kwargs): ...
    def to_arrow(self) -> pa.Table: ...
    def to_dict(self, as_series=True) -> dict: ...
    def to_dicts(self) -> list[dict]: ...

    # Utility Methods
    def head(self, n=5) -> DataFrame: ...
    def tail(self, n=5) -> DataFrame: ...
    def slice(self, offset, length=None) -> DataFrame: ...
    def glimpse(self, *, max_items_per_column=10, max_colname_length=50, return_as_string=False) -> str | None: ...
    def describe(self, *, percentiles=None, interpolation="nearest") -> DataFrame: ...
    def is_empty(self) -> bool: ...
    def lazy(self) -> LazyFrame: ...

    # Properties
    @property
    def columns(self) -> list[str]: ...
    @property
    def dtypes(self) -> list[DataType]: ...
    @property
    def schema(self) -> Schema: ...
    @property
    def shape(self) -> tuple[int, int]: ...
    @property
    def height(self) -> int: ...
    @property
    def width(self) -> int: ...
    @property
    def flags(self) -> dict[str, dict[str, bool]]: ...


# Lazy evaluation data structure that builds a computation graph for query
# optimization, predicate pushdown, and efficient memory usage with automatic
# query planning.
class LazyFrame:
    """Lazy-evaluation data structure that builds a computation graph.

    Operations are recorded and optimized (predicate/projection pushdown,
    expression simplification); nothing executes until `collect`.
    """

    # Selection and Projection
    def select(self, *exprs, **named_exprs) -> LazyFrame: ...
    def with_columns(self, *exprs, **named_exprs) -> LazyFrame: ...
    def drop(self, *columns) -> LazyFrame: ...
    def rename(self, mapping) -> LazyFrame: ...

    # Filtering and Sorting
    def filter(self, *predicates) -> LazyFrame: ...
    def sort(self, by, *, descending=False, nulls_last=False, multithreaded=True, maintain_order=False) -> LazyFrame: ...
    def unique(self, subset=None, *, keep="any", maintain_order=False) -> LazyFrame: ...
    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> LazyFrame: ...

    # Aggregation and Grouping
    def group_by(self, *by, maintain_order=False) -> LazyGroupBy: ...
    def sum(self) -> LazyFrame: ...
    def mean(self) -> LazyFrame: ...
    def max(self) -> LazyFrame: ...
    def min(self) -> LazyFrame: ...
    def std(self, ddof=1) -> LazyFrame: ...
    def var(self, ddof=1) -> LazyFrame: ...

    # Reshaping and Transformation
    def pivot(self, *, on, index=None, values=None, aggregate_function="first", sort_columns=False) -> LazyFrame: ...
    def unpivot(self, *, on=None, index=None, variable_name=None, value_name=None) -> LazyFrame: ...
    def explode(self, columns, *, schema_overrides=None) -> LazyFrame: ...

    # Joining Operations
    def join(self, other, *, on=None, how="inner", left_on=None, right_on=None, suffix="_right", validate=None, join_nulls=False) -> LazyFrame: ...
    def join_asof(self, other, *, left_on=None, right_on=None, on=None, by_left=None, by_right=None, by=None, strategy="backward") -> LazyFrame: ...

    # Window Operations
    def with_row_index(self, name="row_nr", offset=0) -> LazyFrame: ...
    def rolling(self, index_column, *, period, offset=None, closed="right", by=None) -> RollingGroupBy: ...

    # Execution and Optimization
    def collect(self, *, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, streaming=False, background=False, _eager=True) -> DataFrame: ...
    def explain(self, *, optimized=True, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, format="plain") -> str: ...
    def show_graph(self, *, optimized=True, show=True, output_path=None, raw_output=False, figsize=(16, 12), type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True) -> str | None: ...

    # Utility Methods
    def head(self, n=5) -> LazyFrame: ...
    def tail(self, n=5) -> LazyFrame: ...
    def slice(self, offset, length=None) -> LazyFrame: ...
    def first(self) -> LazyFrame: ...
    def last(self) -> LazyFrame: ...
    def cache(self) -> LazyFrame: ...

    # Properties
    @property
    def columns(self) -> list[str]: ...
    @property
    def dtypes(self) -> list[DataType]: ...
    @property
    def schema(self) -> Schema: ...
    @property
    def width(self) -> int: ...


# One-dimensional data structure with vectorized operations, supporting
# element-wise transformations, aggregations, and integration with DataFrame
# operations.
class Series:
    """One-dimensional data structure with vectorized operations.

    Supports element-wise transformations, aggregations, typed namespaces
    (`str`, `dt`, `list`, ...), and integration with DataFrame operations.
    """

    def __init__(self, name=None, values=None, dtype=None, strict=True, nan_to_null=False):
        """
        Create a Series from values.

        Parameters:
        - name: Series name
        - values: Data values (list, array, etc.)
        - dtype: Data type
        - strict: Strict type checking
        - nan_to_null: Convert NaN to null
        """

    # Element Access and Slicing
    def __getitem__(self, item): ...
    def get(self, index, *, default=None): ...
    def slice(self, offset, length=None) -> Series: ...
    def head(self, n=5) -> Series: ...
    def tail(self, n=5) -> Series: ...
    def take(self, indices) -> Series: ...
    def gather(self, indices) -> Series: ...

    # Filtering and Selection
    def filter(self, predicate) -> Series: ...
    def unique(self, *, maintain_order=False) -> Series: ...
    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> Series: ...
    def sort(self, *, descending=False, nulls_last=False) -> Series: ...

    # Transformations
    def map_elements(self, function, return_dtype=None, *, skip_nulls=True) -> Series: ...
    def cast(self, dtype, *, strict=True) -> Series: ...
    def alias(self, name) -> Series: ...
    def rename(self, name) -> Series: ...

    # Aggregations
    def sum(self) -> int | float: ...
    def mean(self) -> float | None: ...
    def median(self) -> float | None: ...
    def max(self) -> Any: ...
    def min(self) -> Any: ...
    def std(self, ddof=1) -> float | None: ...
    def var(self, ddof=1) -> float | None: ...
    def count(self) -> int: ...
    def len(self) -> int: ...

    # String Operations (when dtype is String)
    @property
    def str(self) -> StringNameSpace: ...

    # Datetime Operations (when dtype is temporal)
    @property
    def dt(self) -> DateTimeNameSpace: ...

    # List Operations (when dtype is List)
    @property
    def list(self) -> ListNameSpace: ...

    # Array Operations (when dtype is Array)
    @property
    def arr(self) -> ArrayNameSpace: ...

    # Struct Operations (when dtype is Struct)
    @property
    def struct(self) -> StructNameSpace: ...

    # Categorical Operations (when dtype is Categorical)
    @property
    def cat(self) -> CategoricalNameSpace: ...

    # Binary Operations (when dtype is Binary)
    @property
    def bin(self) -> BinaryNameSpace: ...

    # Conversion Methods
    def to_list(self) -> list: ...
    def to_numpy(self, *, zero_copy_only=False, writable=False) -> np.ndarray: ...
    def to_arrow(self) -> pa.Array: ...
    def to_pandas(self, **kwargs): ...
    def to_frame(self, name=None) -> DataFrame: ...

    # Utility Methods
    def is_null(self) -> Series: ...
    def is_not_null(self) -> Series: ...
    def is_finite(self) -> Series: ...
    def is_infinite(self) -> Series: ...
    def is_nan(self) -> Series: ...
    def is_not_nan(self) -> Series: ...
    def is_empty(self) -> bool: ...
    def describe(self, *, percentiles=None, interpolation="nearest") -> DataFrame: ...

    # Properties
    @property
    def name(self) -> str: ...
    @property
    def dtype(self) -> DataType: ...
    @property
    def shape(self) -> tuple[int]: ...
    @property
    def flags(self) -> dict[str, bool]: ...


# Expression builder for column operations, transformations, and aggregations
# that can be used across DataFrame, LazyFrame, and various contexts for
# building complex data processing pipelines.
class Expr:
    """Expression builder for column operations, transformations, and aggregations.

    Expressions compose lazily and can be used across DataFrame, LazyFrame,
    and other contexts for building complex data processing pipelines.
    """

    # Aliasing and Naming
    def alias(self, name: str) -> Expr: ...
    def name(self) -> ExprNameNameSpace: ...

    # Filtering and Selection
    def filter(self, predicate) -> Expr: ...
    def sort(self, *, descending=False, nulls_last=False) -> Expr: ...
    def sort_by(self, by, *, descending=False, nulls_last=False) -> Expr: ...
    def unique(self, *, maintain_order=False) -> Expr: ...
    def slice(self, offset, length=None) -> Expr: ...
    def head(self, n=5) -> Expr: ...
    def tail(self, n=5) -> Expr: ...
    def first(self) -> Expr: ...
    def last(self) -> Expr: ...
    def take(self, indices) -> Expr: ...
    def gather(self, indices) -> Expr: ...

    # Aggregations
    def sum(self) -> Expr: ...
    def mean(self) -> Expr: ...
    def median(self) -> Expr: ...
    def max(self) -> Expr: ...
    def min(self) -> Expr: ...
    def std(self, ddof=1) -> Expr: ...
    def var(self, ddof=1) -> Expr: ...
    def count(self) -> Expr: ...
    def len(self) -> Expr: ...
    def n_unique(self) -> Expr: ...
    def null_count(self) -> Expr: ...
    def quantile(self, quantile, interpolation="nearest") -> Expr: ...

    # Window Functions
    def over(self, partition_by=None, *, order_by=None, mapping_strategy="group_to_rows") -> Expr: ...
    def rank(self, method="average", *, descending=False, seed=None) -> Expr: ...
    def cum_sum(self, *, reverse=False) -> Expr: ...
    def cum_count(self, *, reverse=False) -> Expr: ...
    def cum_max(self, *, reverse=False) -> Expr: ...
    def cum_min(self, *, reverse=False) -> Expr: ...

    # Mathematical Operations
    def abs(self) -> Expr: ...
    def sqrt(self) -> Expr: ...
    def log(self, base=None) -> Expr: ...
    def log10(self) -> Expr: ...
    def exp(self) -> Expr: ...
    def pow(self, exponent) -> Expr: ...
    def round(self, decimals=0) -> Expr: ...
    def floor(self) -> Expr: ...
    def ceil(self) -> Expr: ...

    # Type Operations
    def cast(self, dtype, *, strict=True) -> Expr: ...
    def is_null(self) -> Expr: ...
    def is_not_null(self) -> Expr: ...
    def is_finite(self) -> Expr: ...
    def is_infinite(self) -> Expr: ...
    def is_nan(self) -> Expr: ...
    def is_not_nan(self) -> Expr: ...
    def is_duplicated(self) -> Expr: ...
    def is_unique(self) -> Expr: ...
    def is_first_distinct(self) -> Expr: ...
    def is_last_distinct(self) -> Expr: ...

    # Conditional Operations
    def is_between(self, lower_bound, upper_bound, closed="both") -> Expr: ...
    def is_in(self, other) -> Expr: ...
    def when(self, condition) -> When: ...

    # String Operations (when expression evaluates to String)
    @property
    def str(self) -> ExprStringNameSpace: ...

    # Datetime Operations (when expression evaluates to temporal type)
    @property
    def dt(self) -> ExprDateTimeNameSpace: ...

    # List Operations (when expression evaluates to List)
    @property
    def list(self) -> ExprListNameSpace: ...

    # Array Operations (when expression evaluates to Array)
    @property
    def arr(self) -> ExprArrayNameSpace: ...

    # Struct Operations (when expression evaluates to Struct)
    @property
    def struct(self) -> ExprStructNameSpace: ...

    # Categorical Operations (when expression evaluates to Categorical)
    @property
    def cat(self) -> ExprCategoricalNameSpace: ...

    # Binary Operations (when expression evaluates to Binary)
    @property
    def bin(self) -> ExprBinaryNameSpace: ...

    # Meta Operations
    @property
    def meta(self) -> ExprMetaNameSpace: ...


import polars as pl
# Create DataFrame
df = pl.DataFrame(
    {
        "product": ["A", "B", "C", "A", "B"],
        "sales": [100, 200, 150, 80, 250],
        "region": ["North", "South", "North", "South", "North"],
    }
)

# Chain operations: filter rows, derive columns, then aggregate per region.
result = (
    df.filter(pl.col("sales") > 100)
    .with_columns(
        pl.col("sales").mul(1.1).alias("sales_with_tax"),
        pl.col("product").str.to_lowercase().alias("product_lower"),
    )
    .group_by("region")
    .agg(
        [
            pl.col("sales").sum().alias("total_sales"),
            pl.col("product").count().alias("product_count"),
        ]
    )
)

# Build lazy computation: nothing runs until collect() below.
lazy_query = (
    pl.scan_csv("large_dataset.csv")
    .filter(pl.col("amount") > 1000)
    .with_columns(
        pl.col("date").str.to_date().alias("parsed_date"),
        pl.col("category").str.to_uppercase(),
    )
    .group_by(["category", pl.col("parsed_date").dt.month()])
    .agg(
        [
            pl.col("amount").sum().alias("monthly_total"),
            pl.col("transaction_id").count().alias("transaction_count"),
        ]
    )
    .sort("monthly_total", descending=True)
)

# Execute optimized query
result = lazy_query.collect()

# View query plan
print(lazy_query.explain(optimized=True))

# Complex expression building: map score thresholds to a letter grade.
complex_expr = (
    pl.when(pl.col("score") >= 90)
    .then(pl.lit("A"))
    .when(pl.col("score") >= 80)
    .then(pl.lit("B"))
    .when(pl.col("score") >= 70)
    .then(pl.lit("C"))
    .otherwise(pl.lit("F"))
    .alias("grade")
)
df = df.with_columns(complex_expr)

# Window functions: per-region rank and running total.
df = df.with_columns(
    [
        pl.col("sales").rank().over("region").alias("sales_rank"),
        pl.col("sales").cum_sum().over("region").alias("running_total"),
    ]
)

# Install with Tessl CLI:
npx tessl i tessl/pypi-polars