Blazingly fast DataFrame library for Python with lazy and eager evaluation modes
npx @tessl/cli install tessl/pypi-polars@1.33.0

A blazingly fast DataFrame library for Python built on the Apache Arrow columnar format, with lazy and eager execution modes. Polars provides comprehensive data manipulation and analysis capabilities with multi-threaded processing, SIMD optimization, query optimization, and a powerful expression API designed for maximum performance in data science workflows.
pip install polars

import polars as pl

For specific components:
from polars import DataFrame, LazyFrame, Series, Expr
from polars import col, lit, when
from polars import read_csv, read_parquet, scan_csv

import polars as pl
# Create a DataFrame from a dictionary
df = pl.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "city": ["New York", "London", "Tokyo"]
})

# Basic operations
result = (
    df
    .filter(pl.col("age") > 27)
    .select([
        pl.col("name"),
        pl.col("age"),
        pl.col("city").alias("location")
    ])
    .sort("age", descending=True)
)
print(result)

# Lazy evaluation for query optimization
lazy_result = (
    pl.scan_csv("data.csv")
    .filter(pl.col("revenue") > 1000)
    .group_by("department")
    .agg([
        pl.col("revenue").sum().alias("total_revenue"),
        pl.col("employee_id").count().alias("employee_count")
    ])
    .collect()
)

Polars provides two main execution paradigms: eager execution, where DataFrame operations run immediately, and lazy execution, where LazyFrame operations build a query plan that is optimized and only executed on collect().
Key architectural components: the core DataFrame, LazyFrame, Series, and Expr structures; a rich data type system; expression and utility functions; eager readers and lazy scanners for I/O; a SQL interface; global configuration; column selectors; interoperability helpers; and a structured exception hierarchy. Each is outlined below.
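A minimal sketch contrasting the two paradigms on toy data (the column names here are invented for illustration):

```python
import polars as pl

df = pl.DataFrame({"dept": ["a", "a", "b"], "revenue": [100, 200, 300]})

# Eager: each call executes immediately and returns a DataFrame
eager = df.group_by("dept").agg(pl.col("revenue").sum())

# Lazy: calls build a query plan; nothing runs until collect()
query = (
    df.lazy()
    .filter(pl.col("revenue") > 100)
    .group_by("dept")
    .agg(pl.col("revenue").sum())
)
print(query.explain())  # inspect the optimized plan before execution
result = query.collect()
```

Because the lazy plan is known up front, the optimizer can push the filter ahead of the aggregation and prune columns that are never used.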
Primary data structures for eager and lazy computation, providing comprehensive data manipulation capabilities with vectorized operations and type safety.
class DataFrame:
    def __init__(self, data=None, schema=None, *, schema_overrides=None, strict=True, orient=None, infer_schema_length=None, nan_to_null=False): ...
    def select(self, *exprs, **named_exprs) -> DataFrame: ...
    def filter(self, *predicates) -> DataFrame: ...
    def with_columns(self, *exprs, **named_exprs) -> DataFrame: ...
    def group_by(self, *by, maintain_order=False) -> GroupBy: ...

class LazyFrame:
    def select(self, *exprs, **named_exprs) -> LazyFrame: ...
    def filter(self, *predicates) -> LazyFrame: ...
    def with_columns(self, *exprs, **named_exprs) -> LazyFrame: ...
    def collect(self, **kwargs) -> DataFrame: ...

class Series:
    def __init__(self, name=None, values=None, dtype=None): ...
    def filter(self, predicate) -> Series: ...
    def map_elements(self, function, return_dtype=None) -> Series: ...

class Expr:
    def alias(self, name: str) -> Expr: ...
    def filter(self, predicate) -> Expr: ...
    def sum(self) -> Expr: ...
Comprehensive type system supporting primitive types, temporal data, nested structures, and schema validation, with automatic type inference and casting.

# Primitive Types
Boolean: DataType
Int8, Int16, Int32, Int64, Int128: DataType
UInt8, UInt16, UInt32, UInt64: DataType
Float32, Float64: DataType
Decimal: DataType
# String and Binary Types
String: DataType
Binary: DataType
Categorical: DataType
Enum: DataType
# Temporal Types
Date: DataType
Datetime: DataType
Time: DataType
Duration: DataType
# Nested Types
List: DataType
Array: DataType
Struct: DataType
class Schema:
    def __init__(self, schema): ...
    def names(self) -> list[str]: ...
    def dtypes(self) -> list[DataType]: ...
90+ utility functions for data construction, aggregation, statistical operations, and expression building, with support for vectorized computations and window functions.

# Construction Functions
def col(name: str) -> Expr: ...
def lit(value) -> Expr: ...
def when(predicate) -> When: ...
def struct(*exprs) -> Expr: ...
# Aggregation Functions
def sum(*exprs) -> Expr: ...
def mean(*exprs) -> Expr: ...
def count(*exprs) -> Expr: ...
def max(*exprs) -> Expr: ...
def min(*exprs) -> Expr: ...
# Range Functions
def arange(start, end, step=1, dtype=None) -> Expr: ...
def date_range(start, end, interval="1d") -> Expr: ...
def int_range(start, end, step=1, dtype=None) -> Expr: ...
# Statistical Functions
def corr(a, b, method="pearson") -> Expr: ...
def std(column, ddof=1) -> Expr: ...
def var(column, ddof=1) -> Expr: ...
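A small sketch combining construction, aggregation, and statistical helpers (toy data):

```python
import polars as pl

df = pl.DataFrame({"x": [1.0, 2.0, 3.0], "y": [2.0, 4.0, 5.0]})

# Aggregations reduce each column to a single row
stats = df.select(
    pl.sum("x").alias("x_total"),
    pl.mean("y").alias("y_mean"),
    pl.corr("x", "y").alias("xy_corr"),
)

# when/then/otherwise builds conditional columns element-wise
labeled = df.with_columns(
    pl.when(pl.col("x") > 1).then(pl.lit("big")).otherwise(pl.lit("small")).alias("size")
)

# Range helpers generate columns without any input data
seq = pl.select(pl.int_range(0, 5).alias("i"))
```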
Comprehensive I/O support for 15+ file formats, including CSV, Parquet, JSON, Excel, databases, and cloud storage, with both eager reading and lazy scanning.

# Read Functions (Eager)
def read_csv(source, **kwargs) -> DataFrame: ...
def read_parquet(source, **kwargs) -> DataFrame: ...
def read_json(source, **kwargs) -> DataFrame: ...
def read_excel(source, **kwargs) -> DataFrame: ...
def read_database(query, connection, **kwargs) -> DataFrame: ...
# Scan Functions (Lazy)
def scan_csv(source, **kwargs) -> LazyFrame: ...
def scan_parquet(source, **kwargs) -> LazyFrame: ...
def scan_ndjson(source, **kwargs) -> LazyFrame: ...
def scan_delta(source, **kwargs) -> LazyFrame: ...
# Cloud Credentials
class CredentialProviderAWS:
    def __init__(self, **kwargs): ...

class CredentialProviderGCP:
    def __init__(self, **kwargs): ...
SQL query execution capabilities, with SQLContext for managing multiple registered DataFrames and native SQL expression support within DataFrame operations.

class SQLContext:
    def __init__(self): ...
    def register(self, name: str, frame) -> None: ...
    def execute(self, query: str, **kwargs) -> DataFrame: ...
    def tables(self) -> list[str]: ...

def sql(query: str, **kwargs) -> DataFrame: ...
def sql_expr(sql: str) -> Expr: ...
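A short sketch of registering a frame and querying it with SQL (table and column names invented for illustration):

```python
import polars as pl

df = pl.DataFrame({"dept": ["a", "b"], "revenue": [100, 200]})

ctx = pl.SQLContext()
ctx.register("sales", df)
result = ctx.execute("SELECT dept, revenue FROM sales WHERE revenue > 150", eager=True)

# SQL expressions can also be mixed into native operations
out = df.with_columns(pl.sql_expr("revenue * 2 AS doubled"))
```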
Global configuration system for controlling formatting, streaming behavior, and optimization settings, with context managers and persistent configuration.

class Config:
    @classmethod
    def set_fmt_str_lengths(cls, n: int) -> type[Config]: ...
    @classmethod
    def set_tbl_rows(cls, n: int) -> type[Config]: ...
    @classmethod
    def set_streaming_chunk_size(cls, size: int) -> type[Config]: ...
    @classmethod
    def restore_defaults(cls) -> type[Config]: ...
class QueryOptFlags:
    def __init__(self, **kwargs): ...

class GPUEngine:
    def __init__(self, **kwargs): ...
Advanced column selection system with 30+ selector functions supporting pattern matching, data type filtering, and logical operations for complex column manipulation.
import polars.selectors as cs
# Data Type Selectors
def by_dtype(dtypes) -> Selector: ...
def numeric() -> Selector: ...
def string() -> Selector: ...
def temporal() -> Selector: ...
def boolean() -> Selector: ...
# Pattern Selectors
def contains(pattern: str) -> Selector: ...
def starts_with(prefix: str) -> Selector: ...
def ends_with(suffix: str) -> Selector: ...
def matches(pattern: str) -> Selector: ...
# Index Selectors
def by_index(indices) -> Selector: ...
def first(n: int = 1) -> Selector: ...
def last(n: int = 1) -> Selector: ...
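A brief sketch of selectors in action, including the set-like operators they support (toy columns):

```python
import polars as pl
import polars.selectors as cs

df = pl.DataFrame({
    "id": [1, 2],
    "score_math": [90.0, 85.0],
    "score_art": [75.0, 80.0],
    "name": ["Ann", "Ben"],
})

numeric_only = df.select(cs.numeric())          # by data type
scores = df.select(cs.starts_with("score_"))    # by name pattern
non_string = df.select(~cs.string())            # complement of a selector
```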
Seamless integration with pandas, NumPy, PyArrow, and PyTorch through conversion functions supporting bidirectional data exchange with automatic schema mapping.

def from_pandas(df, **kwargs) -> DataFrame: ...
def from_numpy(data, schema=None, **kwargs) -> DataFrame: ...
def from_arrow(data, **kwargs) -> DataFrame: ...
def from_dict(data, schema=None) -> DataFrame: ...
def from_dicts(dicts, schema=None) -> DataFrame: ...
def from_torch(tensor, **kwargs) -> DataFrame: ...
def json_normalize(data, **kwargs) -> DataFrame: ...
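A short conversion sketch; it assumes NumPy is installed, and the pandas round trip additionally requires pandas and PyArrow:

```python
import numpy as np
import polars as pl

# From NumPy, supplying column names via schema
arr = np.array([[1, 2], [3, 4]])
df = pl.from_numpy(arr, schema=["a", "b"], orient="row")

# From rows of dicts
df2 = pl.from_dicts([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])

# Back out again
pdf = df.to_pandas()
back = pl.from_pandas(pdf)
```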
Comprehensive exception hierarchy for handling data errors, computation failures, and I/O issues, with specific error types for precise error handling.

# Base Exceptions
class PolarsError(Exception): ...
class ComputeError(PolarsError): ...
# Data Exceptions
class ColumnNotFoundError(PolarsError): ...
class SchemaError(PolarsError): ...
class DuplicateError(PolarsError): ...
class ShapeError(PolarsError): ...
# Additional Row-Related Exceptions
class RowsError(PolarsError): ...
class NoRowsReturnedError(RowsError): ...
class TooManyRowsReturnedError(RowsError): ...
# SQL Exceptions
class SQLInterfaceError(PolarsError): ...
class SQLSyntaxError(PolarsError): ...
# Warning Types
class PerformanceWarning(UserWarning): ...
class CategoricalRemappingWarning(UserWarning): ...
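A minimal sketch of catching specific error types rather than the broad base class:

```python
import polars as pl
from polars.exceptions import ColumnNotFoundError, ComputeError

df = pl.DataFrame({"a": [1, 2, 3]})

try:
    df.select(pl.col("missing"))
except ColumnNotFoundError as exc:
    print(f"unknown column: {exc}")
except ComputeError as exc:
    print(f"computation failed: {exc}")
```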