Blazingly fast DataFrame library with 64-bit index support for handling datasets with more than 4.2 billion rows
npx @tessl/cli install tessl/pypi-polars-u64-idx@1.33.0

Polars is a blazingly fast DataFrame library optimized for performance and memory efficiency. This variant provides 64-bit index support, enabling analysis of datasets with more than 4.2 billion rows. Built in Rust using the Apache Arrow Columnar Format, it features lazy/eager execution, multi-threading, SIMD optimization, query optimization, and hybrid streaming for larger-than-RAM datasets.
pip install polars-u64-idx

import polars as pl

For specific functionality:
# Core data structures
from polars import DataFrame, Series, LazyFrame
# Data types
from polars import Int64, Float64, String, Date, Datetime
# Functions and expressions
from polars import col, lit, when, concat

import polars as pl
# Create a DataFrame
data = {
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "city": ["New York", "London", "Tokyo"],
}
df = pl.DataFrame(data)
# Basic operations: keep rows over 28, rename "city" to "location", order by age
adults = df.filter(pl.col("age") > 28)
result = adults.select([
    pl.col("name"),
    pl.col("age"),
    pl.col("city").alias("location"),
]).sort("age")
print(result)
# Lazy evaluation for larger datasets: build the query plan step by step
scan = pl.scan_csv("large_file.csv")
filtered = scan.filter(pl.col("amount") > 1000)
lazy_df = filtered.group_by("category").agg([
    pl.col("amount").sum().alias("total_amount"),
    pl.col("id").count().alias("count"),
])
# Execute the lazy computation
result = lazy_df.collect()

Polars uses a columnar data model built on Apache Arrow with several key components:
The 64-bit index variant removes the 4.2 billion row limit of standard Polars, making it suitable for very large datasets while maintaining the same API and performance characteristics.
Primary data structures for working with tabular data, including eager DataFrame/Series for immediate operations and LazyFrame for optimized query execution.
# Two-dimensional eager table of named columns; methods return new DataFrames.
class DataFrame:
# Build a frame from in-memory data; schema may be given or inferred
# (infer_schema_length rows sampled); nan_to_null maps float NaN to null.
def __init__(self, data=None, schema=None, schema_overrides=None, orient=None, infer_schema_length=N_INFER_DEFAULT, nan_to_null=False): ...
# Project columns/expressions into a new frame.
def select(self, *exprs, **named_exprs) -> DataFrame: ...
# Keep only rows matching the given predicate expressions/constraints.
def filter(self, *predicates, **constraints) -> DataFrame: ...
# Add or replace columns, keeping all existing ones.
def with_columns(self, *exprs, **named_exprs) -> DataFrame: ...
# Group rows by key(s); returns a GroupBy for subsequent aggregation.
def group_by(self, *by, maintain_order=False, **named_by) -> GroupBy: ...
# Sort by column(s); descending/nulls_last control ordering.
def sort(self, by, *, descending=False, nulls_last=False, multithreaded=True) -> DataFrame: ...
# Relational join with another frame; how defaults to "inner",
# colliding right-hand column names get the given suffix.
def join(self, other, on=None, how="inner", *, left_on=None, right_on=None, suffix="_right", validate="m:m", join_nulls=False, coalesce=None) -> DataFrame: ...
# One-dimensional named column of a single dtype.
class Series:
# Build a series from values; dtype may be given explicitly, strict=True
# rejects values that do not fit the dtype; nan_to_null maps NaN to null.
def __init__(self, name=None, values=None, dtype=None, strict=True, nan_to_null=False, dtype_if_empty=Null): ...
# Deferred query builder: methods record operations and return new
# LazyFrames; nothing executes until collect() is called.
class LazyFrame:
# Project columns/expressions (recorded, not executed).
def select(self, *exprs, **named_exprs) -> LazyFrame: ...
# Record a row filter on the query plan.
def filter(self, *predicates, **constraints) -> LazyFrame: ...
def collect(self, *, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, no_optimization=False, streaming=False, background=False, _eager=False) -> DataFrame: ...

Powerful expression system for column transformations, aggregations, and complex operations that work across DataFrame and LazyFrame.
# Column-level computation node; methods compose and return new Exprs,
# usable in both DataFrame and LazyFrame contexts.
class Expr:
# Rename the expression's output column.
def alias(self, name: str) -> Expr: ...
# Convert to another dtype; strict=True raises on values that do not fit.
def cast(self, dtype: DataType | type[Any], *, strict: bool = True) -> Expr: ...
# Keep only elements where the predicate holds.
def filter(self, predicate: Expr) -> Expr: ...
# Sort the expression's values.
def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Expr: ...
# Aggregations: each reduces the column to a single value.
def sum(self) -> Expr: ...
def mean(self) -> Expr: ...
def max(self) -> Expr: ...
def min(self) -> Expr: ...
def count(self) -> Expr: ...
# Reference a column by name (or select columns by dtype) as an expression.
def col(name: str | DataType) -> Expr: ...
# Wrap a literal value as an expression, optionally with an explicit dtype.
def lit(value: Any, dtype: DataType | None = None) -> Expr: ...
def when(predicate: Expr) -> When: ...

Expressions and Column Operations
Comprehensive type system with numeric, text, temporal, and nested types, plus schema definition and validation capabilities.
# Numeric types — fixed-width signed/unsigned integers, floats, and Decimal
class Int8: ...
class Int16: ...
class Int32: ...
class Int64: ...
class Int128: ...
class UInt8: ...
class UInt16: ...
class UInt32: ...
class UInt64: ...
class Float32: ...
class Float64: ...
class Decimal: ...
# Text types — UTF-8 strings and raw bytes
class String: ...
class Binary: ...
# Temporal types — calendar dates, timestamps, wall-clock times, and spans
class Date: ...
class Datetime: ...
class Time: ...
class Duration: ...
# Special types — booleans, dictionary-encoded/fixed-set strings,
# and nested containers (variable-length List, fixed-size Array, Struct)
class Boolean: ...
class Categorical: ...
class Enum: ...
class List: ...
class Array: ...
class Struct: ...
class Schema:
def __init__(self, schema: Mapping[str, DataType] | Iterable[tuple[str, DataType]] | None = None): ...

Comprehensive I/O capabilities supporting 10+ file formats with both eager reading and lazy scanning for performance optimization.
# CSV — read_* is eager (returns a materialized DataFrame),
# scan_* is lazy (returns a LazyFrame query plan)
def read_csv(source: str | Path | IO[str] | IO[bytes] | bytes, **kwargs) -> DataFrame: ...
def scan_csv(source: str | Path | list[str] | list[Path], **kwargs) -> LazyFrame: ...
# Parquet — same eager/lazy split as CSV
def read_parquet(source: str | Path | IO[bytes] | bytes, **kwargs) -> DataFrame: ...
def scan_parquet(source: str | Path | list[str] | list[Path], **kwargs) -> LazyFrame: ...
# JSON — whole-document JSON and newline-delimited JSON
def read_json(source: str | Path | IO[str] | IO[bytes] | bytes, **kwargs) -> DataFrame: ...
def read_ndjson(source: str | Path | IO[str] | IO[bytes] | bytes, **kwargs) -> DataFrame: ...
# Database — run a SQL query over a connection string or DB-API connection/cursor
def read_database(query: str, connection: str | ConnectionOrCursor, **kwargs) -> DataFrame: ...
def read_excel(source: str | Path | IO[bytes] | bytes, **kwargs) -> DataFrame: ...Built-in functions for aggregation, transformations, date/time operations, string manipulation, and utility functions.
# Aggregation functions — module-level shortcuts that build aggregation Exprs
def sum(*exprs) -> Expr: ...
def mean(*exprs) -> Expr: ...
def max(*exprs) -> Expr: ...
def min(*exprs) -> Expr: ...
def count(*exprs) -> Expr: ...
def all(*exprs) -> Expr: ...
def any(*exprs) -> Expr: ...
# Date/time functions — construct temporal expressions from components
def date(year: int | Expr, month: int | Expr, day: int | Expr) -> Expr: ...
# datetime defaults the sub-day components to 0 and the time unit to "us".
def datetime(year: int | Expr, month: int | Expr, day: int | Expr, hour: int | Expr = 0, minute: int | Expr = 0, second: int | Expr = 0, microsecond: int | Expr = 0, *, time_unit: TimeUnit = "us", time_zone: str | None = None) -> Expr: ...
# date_range defaults to 1-day steps with both endpoints included;
# eager=True yields a Series instead of an Expr.
def date_range(start: date | datetime | IntoExpr, end: date | datetime | IntoExpr, interval: str | timedelta = "1d", *, closed: ClosedInterval = "both", time_unit: TimeUnit | None = None, time_zone: str | None = None, eager: bool = False) -> Expr | Series: ...
# String functions — concatenate string columns with an optional separator
def concat_str(exprs: IntoExpr, *, separator: str = "", ignore_nulls: bool = False) -> Expr: ...

SQL query interface allowing standard SQL operations on DataFrames and integration with existing SQL workflows.
# Registry of named frames that SQL queries can reference as tables.
class SQLContext:
# Frames may be supplied as a dict or as keyword arguments (name=frame).
def __init__(self, frames: dict[str, DataFrame | LazyFrame] | None = None, **named_frames: DataFrame | LazyFrame): ...
# Run a SQL query against the registered frames; eager=True returns a
# DataFrame, otherwise a LazyFrame.
def execute(self, query: str, *, eager: bool = True) -> DataFrame | LazyFrame: ...
# register/unregister return the SQLContext, so calls can be chained.
def register(self, name: str, frame: DataFrame | LazyFrame) -> SQLContext: ...
def unregister(self, name: str) -> SQLContext: ...
def sql(query: str, *, eager: bool = True, **named_frames: DataFrame | LazyFrame) -> DataFrame | LazyFrame: ...

Polars provides a comprehensive exception hierarchy for different error scenarios:
# Core exceptions — all derive from PolarsError, so callers can catch the
# base class to handle any Polars failure.
class PolarsError(Exception): ...
class ColumnNotFoundError(PolarsError): ...
class ComputeError(PolarsError): ...
class DuplicateError(PolarsError): ...
class InvalidOperationError(PolarsError): ...
class NoDataError(PolarsError): ...
class OutOfBoundsError(PolarsError): ...
class PanicException(PolarsError): ...
class SchemaError(PolarsError): ...
class SchemaFieldNotFoundError(PolarsError): ...
class ShapeError(PolarsError): ...
class SQLInterfaceError(PolarsError): ...
class SQLSyntaxError(PolarsError): ...
# Warnings
# NOTE(review): PolarsWarning is declared here as an Exception subclass,
# not a Warning subclass — confirm against upstream polars before relying
# on warnings.catch_warnings() to intercept it.
class PolarsWarning(Exception): ...
class PerformanceWarning(PolarsWarning): ...

All operations can raise these exceptions when encountering invalid data, schema mismatches, or computational errors. Proper exception handling should be used in production code.