Search (Ctrl+K)
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-polars

Blazingly fast DataFrame library for Python with lazy and eager evaluation modes

Pending
Overview
Eval results
Files

docs/core-data-structures.md

Core Data Structures

The fundamental data structures that form the foundation of Polars: DataFrame for eager evaluation, LazyFrame for lazy evaluation with query optimization, Series for one-dimensional data, and Expr for building complex column operations and transformations.

Capabilities

DataFrame

Primary data structure for eager evaluation providing immediate computation with comprehensive data manipulation methods including filtering, selection, aggregation, joining, and reshaping operations.

class DataFrame:
    """Two-dimensional, eagerly evaluated table of heterogeneous columns.

    API stub listing for documentation purposes: method bodies are
    intentionally elided (`...`); only the public signatures are shown.
    """

    def __init__(
        self,
        data=None,
        schema=None,
        *,
        schema_overrides=None,
        strict=True,
        orient=None,
        infer_schema_length=None,
        nan_to_null=False
    ): 
        """
        Create a DataFrame from various data sources.
        
        Parameters:
        - data: Data source (dict, list, arrow table, pandas df, etc.)
        - schema: Column names and types
        - schema_overrides: Override specific column types
        - strict: Strict schema validation
        - orient: Data orientation ('row' or 'col')
        - infer_schema_length: Rows to scan for type inference
        - nan_to_null: Convert NaN to null values
        """

    # Selection and Projection
    def select(self, *exprs, **named_exprs) -> DataFrame: ...
    def with_columns(self, *exprs, **named_exprs) -> DataFrame: ...
    def drop(self, *columns) -> DataFrame: ...
    def rename(self, mapping) -> DataFrame: ...

    # Filtering and Sorting
    def filter(self, *predicates) -> DataFrame: ...
    def sort(self, by, *, descending=False, nulls_last=False) -> DataFrame: ...
    def unique(self, subset=None, *, keep="any", maintain_order=False) -> DataFrame: ...
    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> DataFrame: ...

    # Aggregation and Grouping
    def group_by(self, *by, maintain_order=False) -> GroupBy: ...
    def sum(self) -> DataFrame: ...
    def mean(self) -> DataFrame: ...
    def max(self) -> DataFrame: ...
    def min(self) -> DataFrame: ...
    def std(self, ddof=1) -> DataFrame: ...
    def var(self, ddof=1) -> DataFrame: ...

    # Reshaping and Transformation
    def pivot(self, *, on, index=None, values=None, aggregate_function="first", sort_columns=False) -> DataFrame: ...
    def unpivot(self, *, on=None, index=None, variable_name=None, value_name=None) -> DataFrame: ...
    def transpose(self, *, include_header=False, header_name="column", column_names=None) -> DataFrame: ...
    def explode(self, columns, *, schema_overrides=None) -> DataFrame: ...

    # Joining Operations  
    def join(self, other, *, on=None, how="inner", left_on=None, right_on=None, suffix="_right", validate=None, join_nulls=False) -> DataFrame: ...
    def join_asof(self, other, *, left_on=None, right_on=None, on=None, by_left=None, by_right=None, by=None, strategy="backward") -> DataFrame: ...

    # Window Operations
    # NOTE(review): recent polars releases default the index column name to
    # "index" rather than "row_nr" — verify against the targeted version.
    def with_row_index(self, name="row_nr", offset=0) -> DataFrame: ...
    # NOTE(review): `by` and `check_sorted` were deprecated/removed from
    # `rolling` in later polars releases (replaced by `group_by`) — confirm.
    def rolling(self, index_column, *, period, offset=None, closed="right", by=None, check_sorted=True) -> RollingGroupBy: ...

    # I/O Operations
    def write_csv(self, file=None, **kwargs) -> str | None: ...
    def write_parquet(self, file, **kwargs) -> None: ...
    def write_json(self, file=None, **kwargs) -> str | None: ...
    def write_excel(self, workbook=None, worksheet=None, **kwargs): ...
    def write_database(self, table_name, connection, **kwargs) -> int: ...

    # Conversion Methods
    def to_pandas(self, **kwargs): ...
    def to_numpy(self, structured=False, **kwargs): ...
    def to_arrow(self) -> pa.Table: ...
    def to_dict(self, as_series=True) -> dict: ...
    def to_dicts(self) -> list[dict]: ...

    # Utility Methods
    def head(self, n=5) -> DataFrame: ...
    def tail(self, n=5) -> DataFrame: ...
    def slice(self, offset, length=None) -> DataFrame: ...
    def glimpse(self, *, max_items_per_column=10, max_colname_length=50, return_as_string=False) -> str | None: ...
    def describe(self, *, percentiles=None, interpolation="nearest") -> DataFrame: ...
    def is_empty(self) -> bool: ...
    # Convert to a LazyFrame to opt into deferred, optimized execution.
    def lazy(self) -> LazyFrame: ...

    # Properties
    @property
    def columns(self) -> list[str]: ...
    @property 
    def dtypes(self) -> list[DataType]: ...
    @property
    def schema(self) -> Schema: ...
    @property
    def shape(self) -> tuple[int, int]: ...
    @property
    def height(self) -> int: ...
    @property
    def width(self) -> int: ...
    @property
    def flags(self) -> dict[str, dict[str, bool]]: ...

LazyFrame

Lazy evaluation data structure that builds a computation graph for query optimization, predicate pushdown, and efficient memory usage with automatic query planning.

class LazyFrame:
    """Deferred-execution counterpart of DataFrame.

    Method calls build a logical query plan; no data is read or computed
    until `collect()` (or another sink) is invoked, enabling predicate and
    projection pushdown. API stub listing: method bodies are elided.
    """

    # Selection and Projection
    def select(self, *exprs, **named_exprs) -> LazyFrame: ...
    def with_columns(self, *exprs, **named_exprs) -> LazyFrame: ...
    def drop(self, *columns) -> LazyFrame: ...
    def rename(self, mapping) -> LazyFrame: ...

    # Filtering and Sorting  
    def filter(self, *predicates) -> LazyFrame: ...
    def sort(self, by, *, descending=False, nulls_last=False, multithreaded=True, maintain_order=False) -> LazyFrame: ...
    def unique(self, subset=None, *, keep="any", maintain_order=False) -> LazyFrame: ...
    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> LazyFrame: ...

    # Aggregation and Grouping
    def group_by(self, *by, maintain_order=False) -> LazyGroupBy: ...
    def sum(self) -> LazyFrame: ...
    def mean(self) -> LazyFrame: ...  
    def max(self) -> LazyFrame: ...
    def min(self) -> LazyFrame: ...
    def std(self, ddof=1) -> LazyFrame: ...
    def var(self, ddof=1) -> LazyFrame: ...

    # Reshaping and Transformation
    # NOTE(review): upstream polars exposes `pivot` on DataFrame only (it
    # requires materialized data); LazyFrame has no `pivot` — verify before
    # relying on this entry.
    def pivot(self, *, on, index=None, values=None, aggregate_function="first", sort_columns=False) -> LazyFrame: ...
    def unpivot(self, *, on=None, index=None, variable_name=None, value_name=None) -> LazyFrame: ...
    def explode(self, columns, *, schema_overrides=None) -> LazyFrame: ...

    # Joining Operations
    def join(self, other, *, on=None, how="inner", left_on=None, right_on=None, suffix="_right", validate=None, join_nulls=False) -> LazyFrame: ...
    def join_asof(self, other, *, left_on=None, right_on=None, on=None, by_left=None, by_right=None, by=None, strategy="backward") -> LazyFrame: ...

    # Window Operations
    def with_row_index(self, name="row_nr", offset=0) -> LazyFrame: ...
    def rolling(self, index_column, *, period, offset=None, closed="right", by=None) -> RollingGroupBy: ...

    # Execution and Optimization  
    # `collect` materializes the plan into an eager DataFrame; the keyword
    # flags toggle individual optimizer passes.
    # NOTE(review): upstream signature uses `_eager=False` as the default —
    # `_eager=True` here looks inverted; confirm against the polars docs.
    def collect(self, *, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, streaming=False, background=False, _eager=True) -> DataFrame: ...
    def explain(self, *, optimized=True, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, format="plain") -> str: ...
    def show_graph(self, *, optimized=True, show=True, output_path=None, raw_output=False, figsize=(16, 12), type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True) -> str | None: ...

    # Utility Methods
    def head(self, n=5) -> LazyFrame: ...
    def tail(self, n=5) -> LazyFrame: ...
    def slice(self, offset, length=None) -> LazyFrame: ...
    def first(self) -> LazyFrame: ...
    def last(self) -> LazyFrame: ...
    def cache(self) -> LazyFrame: ...

    # Properties
    @property
    def columns(self) -> list[str]: ...
    @property
    def dtypes(self) -> list[DataType]: ...
    @property
    def schema(self) -> Schema: ...
    @property
    def width(self) -> int: ...

Series

One-dimensional data structure with vectorized operations, supporting element-wise transformations, aggregations, and integration with DataFrame operations.

class Series:
    """One-dimensional, homogeneously typed column of values.

    Supports vectorized element-wise operations, aggregations, and typed
    namespaces (`.str`, `.dt`, `.list`, ...) gated on the Series dtype.
    API stub listing: method bodies are elided.
    """

    # NOTE(review): upstream polars makes `strict` and `nan_to_null`
    # keyword-only (after `*`) — verify against the targeted version.
    def __init__(self, name=None, values=None, dtype=None, strict=True, nan_to_null=False): 
        """
        Create a Series from values.
        
        Parameters:
        - name: Series name
        - values: Data values (list, array, etc.)
        - dtype: Data type
        - strict: Strict type checking
        - nan_to_null: Convert NaN to null
        """

    # Element Access and Slicing
    def __getitem__(self, item): ...
    def get(self, index, *, default=None): ...
    def slice(self, offset, length=None) -> Series: ...
    def head(self, n=5) -> Series: ...
    def tail(self, n=5) -> Series: ...
    # NOTE(review): `take` is presumably the legacy alias of `gather`
    # (deprecated upstream) — confirm which versions still expose it.
    def take(self, indices) -> Series: ...
    def gather(self, indices) -> Series: ...

    # Filtering and Selection
    def filter(self, predicate) -> Series: ...
    def unique(self, *, maintain_order=False) -> Series: ...
    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> Series: ...
    def sort(self, *, descending=False, nulls_last=False) -> Series: ...

    # Transformations
    def map_elements(self, function, return_dtype=None, *, skip_nulls=True) -> Series: ...
    def cast(self, dtype, *, strict=True) -> Series: ...
    def alias(self, name) -> Series: ...
    def rename(self, name) -> Series: ...

    # Aggregations
    def sum(self) -> int | float: ...
    def mean(self) -> float | None: ...
    def median(self) -> float | None: ...
    def max(self) -> Any: ...
    def min(self) -> Any: ...
    def std(self, ddof=1) -> float | None: ...
    def var(self, ddof=1) -> float | None: ...
    def count(self) -> int: ...
    def len(self) -> int: ...

    # String Operations (when dtype is String)
    @property
    def str(self) -> StringNameSpace: ...

    # Datetime Operations (when dtype is temporal)
    @property  
    def dt(self) -> DateTimeNameSpace: ...

    # List Operations (when dtype is List)
    @property
    def list(self) -> ListNameSpace: ...

    # Array Operations (when dtype is Array)
    @property
    def arr(self) -> ArrayNameSpace: ...

    # Struct Operations (when dtype is Struct)
    @property
    def struct(self) -> StructNameSpace: ...

    # Categorical Operations (when dtype is Categorical)
    @property
    def cat(self) -> CategoricalNameSpace: ...

    # Binary Operations (when dtype is Binary)  
    @property
    def bin(self) -> BinaryNameSpace: ...

    # Conversion Methods
    def to_list(self) -> list: ...
    def to_numpy(self, *, zero_copy_only=False, writable=False) -> np.ndarray: ...
    def to_arrow(self) -> pa.Array: ...
    def to_pandas(self, **kwargs): ...
    def to_frame(self, name=None) -> DataFrame: ...

    # Utility Methods
    def is_null(self) -> Series: ...
    def is_not_null(self) -> Series: ...
    def is_finite(self) -> Series: ...
    def is_infinite(self) -> Series: ...
    def is_nan(self) -> Series: ...
    def is_not_nan(self) -> Series: ...
    def is_empty(self) -> bool: ...
    def describe(self, *, percentiles=None, interpolation="nearest") -> DataFrame: ...

    # Properties
    @property
    def name(self) -> str: ...
    @property  
    def dtype(self) -> DataType: ...
    @property
    def shape(self) -> tuple[int]: ...
    @property
    def flags(self) -> dict[str, bool]: ...

Expr

Expression builder for column operations, transformations, and aggregations that can be used across DataFrame, LazyFrame, and various contexts for building complex data processing pipelines.

class Expr:
    """Lazily evaluated column expression.

    Expressions are composable descriptions of column transformations and
    aggregations; they are evaluated only inside a context such as
    `select`, `with_columns`, `filter`, or `agg`. API stub listing:
    method bodies are elided.
    """

    # Aliasing and Naming
    def alias(self, name: str) -> Expr: ...
    # NOTE(review): upstream polars exposes `name` as a *property* returning
    # the naming namespace, not a callable method — verify this entry.
    def name(self) -> ExprNameNameSpace: ...

    # Filtering and Selection
    def filter(self, predicate) -> Expr: ...
    def sort(self, *, descending=False, nulls_last=False) -> Expr: ...
    def sort_by(self, by, *, descending=False, nulls_last=False) -> Expr: ...
    def unique(self, *, maintain_order=False) -> Expr: ...
    def slice(self, offset, length=None) -> Expr: ...
    def head(self, n=5) -> Expr: ...  
    def tail(self, n=5) -> Expr: ...
    def first(self) -> Expr: ...
    def last(self) -> Expr: ...
    # NOTE(review): `take` is presumably the legacy alias of `gather`
    # (deprecated upstream) — confirm which versions still expose it.
    def take(self, indices) -> Expr: ...
    def gather(self, indices) -> Expr: ...

    # Aggregations
    def sum(self) -> Expr: ...
    def mean(self) -> Expr: ...
    def median(self) -> Expr: ...
    def max(self) -> Expr: ...
    def min(self) -> Expr: ... 
    def std(self, ddof=1) -> Expr: ...
    def var(self, ddof=1) -> Expr: ...
    def count(self) -> Expr: ...
    def len(self) -> Expr: ...
    def n_unique(self) -> Expr: ...
    def null_count(self) -> Expr: ...
    def quantile(self, quantile, interpolation="nearest") -> Expr: ...

    # Window Functions
    def over(self, partition_by=None, *, order_by=None, mapping_strategy="group_to_rows") -> Expr: ...
    def rank(self, method="average", *, descending=False, seed=None) -> Expr: ...
    def cum_sum(self, *, reverse=False) -> Expr: ...
    def cum_count(self, *, reverse=False) -> Expr: ...
    def cum_max(self, *, reverse=False) -> Expr: ...
    def cum_min(self, *, reverse=False) -> Expr: ...

    # Mathematical Operations  
    def abs(self) -> Expr: ...
    def sqrt(self) -> Expr: ...
    def log(self, base=None) -> Expr: ...
    def log10(self) -> Expr: ...
    def exp(self) -> Expr: ...
    def pow(self, exponent) -> Expr: ...
    def round(self, decimals=0) -> Expr: ...
    def floor(self) -> Expr: ...
    def ceil(self) -> Expr: ...

    # Type Operations  
    def cast(self, dtype, *, strict=True) -> Expr: ...
    def is_null(self) -> Expr: ...
    def is_not_null(self) -> Expr: ...
    def is_finite(self) -> Expr: ...
    def is_infinite(self) -> Expr: ...
    def is_nan(self) -> Expr: ...
    def is_not_nan(self) -> Expr: ...
    def is_duplicated(self) -> Expr: ...
    def is_unique(self) -> Expr: ...
    def is_first_distinct(self) -> Expr: ...
    def is_last_distinct(self) -> Expr: ...

    # Conditional Operations
    def is_between(self, lower_bound, upper_bound, closed="both") -> Expr: ...
    def is_in(self, other) -> Expr: ...
    def when(self, condition) -> When: ...

    # String Operations (when expression evaluates to String)
    @property
    def str(self) -> ExprStringNameSpace: ...

    # Datetime Operations (when expression evaluates to temporal type)
    @property
    def dt(self) -> ExprDateTimeNameSpace: ...

    # List Operations (when expression evaluates to List)  
    @property
    def list(self) -> ExprListNameSpace: ...

    # Array Operations (when expression evaluates to Array)
    @property
    def arr(self) -> ExprArrayNameSpace: ...

    # Struct Operations (when expression evaluates to Struct)
    @property
    def struct(self) -> ExprStructNameSpace: ...

    # Categorical Operations (when expression evaluates to Categorical)
    @property
    def cat(self) -> ExprCategoricalNameSpace: ...

    # Binary Operations (when expression evaluates to Binary)
    @property
    def bin(self) -> ExprBinaryNameSpace: ...

    # Meta Operations
    @property
    def meta(self) -> ExprMetaNameSpace: ...

Usage Examples

Basic DataFrame Operations

import polars as pl

# Build a small sales table for the examples below.
df = pl.DataFrame(
    {
        "product": ["A", "B", "C", "A", "B"],
        "sales": [100, 200, 150, 80, 250],
        "region": ["North", "South", "North", "South", "North"],
    }
)

# Same pipeline as a single chain, written here as named stages.
filtered = df.filter(pl.col("sales") > 100)
enriched = filtered.with_columns(
    pl.col("sales").mul(1.1).alias("sales_with_tax"),
    pl.col("product").str.to_lowercase().alias("product_lower"),
)
result = enriched.group_by("region").agg(
    [
        pl.col("sales").sum().alias("total_sales"),
        pl.col("product").count().alias("product_count"),
    ]
)

Lazy Evaluation with Query Optimization

# Assemble the lazy plan in stages: scan, filter, derive, aggregate, sort.
scanned = pl.scan_csv("large_dataset.csv").filter(pl.col("amount") > 1000)
prepared = scanned.with_columns(
    pl.col("date").str.to_date().alias("parsed_date"),
    pl.col("category").str.to_uppercase(),
)
lazy_query = (
    prepared.group_by(["category", pl.col("parsed_date").dt.month()])
    .agg(
        [
            pl.col("amount").sum().alias("monthly_total"),
            pl.col("transaction_id").count().alias("transaction_count"),
        ]
    )
    .sort("monthly_total", descending=True)
)

# Materialize the optimized plan into an eager DataFrame.
result = lazy_query.collect()

# Inspect the optimized query plan.
print(lazy_query.explain(optimized=True))

Advanced Expressions

# Grade ladder: chained when/then clauses with a final otherwise fallback.
grade_rule = (
    pl.when(pl.col("score") >= 90).then(pl.lit("A"))
    .when(pl.col("score") >= 80).then(pl.lit("B"))
    .when(pl.col("score") >= 70).then(pl.lit("C"))
    .otherwise(pl.lit("F"))
    .alias("grade")
)

df = df.with_columns(grade_rule)

# Per-region window functions: rank within the region and a running total.
rank_in_region = pl.col("sales").rank().over("region").alias("sales_rank")
region_running_total = pl.col("sales").cum_sum().over("region").alias("running_total")
df = df.with_columns([rank_in_region, region_running_total])

Install with Tessl CLI

npx tessl i tessl/pypi-polars

docs

column-selection.md

configuration.md

core-data-structures.md

data-conversion.md

data-types.md

error-handling.md

functions-expressions.md

index.md

io-operations.md

sql-interface.md

tile.json