CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-polars-u64-idx

Blazingly fast DataFrame library with 64-bit index support for handling datasets with more than 4.2 billion rows

Pending
Overview
Eval results
Files

core-data-structures.mddocs/

Core Data Structures

Primary data structures for working with tabular data in Polars: the eager DataFrame and Series for immediate operations, and LazyFrame for optimized query execution. The 64-bit index variant supports datasets exceeding 4.2 billion rows.

Capabilities

DataFrame

Two-dimensional labeled data structure with columns of potentially different types. The primary data structure for eager evaluation where operations are executed immediately.

class DataFrame:
    """Two-dimensional, column-oriented table with eagerly executed operations."""

    def __init__(
        self, 
        data=None, 
        schema=None, 
        schema_overrides=None, 
        orient=None, 
        infer_schema_length=N_INFER_DEFAULT, 
        nan_to_null=False
    ):
        """
        Create a DataFrame from various data sources.
        
        Parameters:
        - data: Data source (dict, list, numpy array, pandas DataFrame, etc.)
        - schema: Column names and types
        - schema_overrides: Override inferred types for specific columns  
        - orient: Data orientation ("row" or "col")
        - infer_schema_length: Number of rows to scan for type inference
        - nan_to_null: Convert NaN values to null
        """
    
    @property
    def shape(self) -> tuple[int, int]:
        """Get the shape (rows, columns) of the DataFrame."""
    
    @property
    def height(self) -> int:
        """Get the number of rows."""
    
    @property
    def width(self) -> int:
        """Get the number of columns."""
    
    @property
    def columns(self) -> list[str]:
        """Get column names."""
    
    @property
    def dtypes(self) -> list[DataType]:
        """Get data types of all columns."""
    
    @property
    def schema(self) -> Schema:
        """Get the schema (column names and types)."""
    
    def select(self, *exprs, **named_exprs) -> DataFrame:
        """
        Select columns using expressions.
        
        Parameters:
        - exprs: Column expressions to select
        - named_exprs: Named expressions for new columns
        
        Returns:
        DataFrame with selected columns
        """
    
    def filter(self, *predicates, **constraints) -> DataFrame:
        """
        Filter rows based on predicates.
        
        Parameters:
        - predicates: Boolean expressions for filtering
        - constraints: Named constraints
        
        Returns:
        Filtered DataFrame
        """
    
    def with_columns(self, *exprs, **named_exprs) -> DataFrame:
        """
        Add or modify columns.
        
        Parameters:
        - exprs: Column expressions to add/modify
        - named_exprs: Named expressions for new columns
        
        Returns:
        DataFrame with added/modified columns
        """
    
    def drop(self, *columns, strict=True) -> DataFrame:
        """
        Drop columns from DataFrame.
        
        Parameters:
        - columns: Column names to drop
        - strict: Whether to raise error if column doesn't exist
        
        Returns:
        DataFrame without dropped columns
        """
    
    def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> DataFrame:
        """
        Rename columns.
        
        Parameters:
        - mapping: Dictionary mapping old to new names, or function
        
        Returns:
        DataFrame with renamed columns
        """
    
    def sort(
        self, 
        by, 
        *, 
        descending=False, 
        nulls_last=False, 
        multithreaded=True
    ) -> DataFrame:
        """
        Sort DataFrame by columns.
        
        Parameters:
        - by: Column(s) to sort by
        - descending: Sort in descending order
        - nulls_last: Place nulls at end
        - multithreaded: Use multiple threads
        
        Returns:
        Sorted DataFrame
        """
    
    def group_by(self, *by, maintain_order=False, **named_by) -> GroupBy:
        """
        Group DataFrame for aggregation.
        
        Parameters:
        - by: Columns to group by
        - maintain_order: Maintain order of groups
        - named_by: Named grouping expressions
        
        Returns:
        GroupBy object for aggregation
        """
    
    def join(
        self, 
        other, 
        on=None, 
        how="inner", 
        *, 
        left_on=None, 
        right_on=None, 
        suffix="_right", 
        validate="m:m", 
        join_nulls=False, 
        coalesce=None
    ) -> DataFrame:
        """
        Join with another DataFrame.
        
        Parameters:
        - other: DataFrame to join with
        - on: Column(s) to join on
        - how: Join type ("inner", "left", "outer", "cross", "anti", "semi")
          NOTE(review): recent Polars renamed "outer" to "full" and also
          accepts "right" — confirm against the pinned polars version.
        - left_on: Left DataFrame join columns
        - right_on: Right DataFrame join columns  
        - suffix: Suffix for duplicate column names
        - validate: Join validation ("m:m", "1:m", "m:1", "1:1")
        - join_nulls: Join on null values
        - coalesce: Coalesce join columns
        
        Returns:
        Joined DataFrame
        """
    
    def concat(self, other, *, how="vertical", ignore_index=False) -> DataFrame:
        """
        Concatenate with other DataFrame(s).
        
        NOTE(review): polars DataFrames expose vstack/hstack/extend and
        concatenation is normally the top-level pl.concat(); polars also has
        no row index, so `ignore_index` looks pandas-derived — verify this
        method exists in the target API before relying on it.
        
        Parameters:
        - other: DataFrame(s) to concatenate
        - how: Concatenation method ("vertical", "horizontal", "diagonal")
        - ignore_index: Reset index after concatenation
        
        Returns:
        Concatenated DataFrame
        """
    
    def to_pandas(self, **kwargs) -> pd.DataFrame:
        """Convert to pandas DataFrame."""
    
    def to_numpy(self, structured=False, order="c") -> np.ndarray:
        """Convert to NumPy array."""
        # NOTE(review): upstream polars defaults order to "fortran" — confirm "c" here.
    
    def to_arrow(self, *, compat_level=None) -> pa.Table:
        """Convert to PyArrow Table."""
    
    def to_dict(self, *, as_series=True) -> dict[str, Series | list[Any]]:
        """Convert to dictionary."""
    
    def write_csv(self, file=None, **kwargs) -> str | None:
        """Write to CSV file."""
    
    def write_json(self, file=None, **kwargs) -> str | None:
        """Write to JSON file."""
    
    def write_parquet(self, file, **kwargs) -> None:
        """Write to Parquet file."""
    
    def write_ipc(self, file, **kwargs) -> None:
        """Write to IPC/Arrow file."""
    
    def lazy(self) -> LazyFrame:
        """Convert to LazyFrame for optimized operations."""
    
    def head(self, n=5) -> DataFrame:
        """Get first n rows."""
    
    def tail(self, n=5) -> DataFrame:
        """Get last n rows."""
    
    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> DataFrame:
        """Sample rows from DataFrame."""
    
    def null_count(self) -> DataFrame:
        """Count null values per column."""
    
    def is_empty(self) -> bool:
        """Check if DataFrame is empty."""
    
    def clone(self) -> DataFrame:
        """Create a copy of the DataFrame."""

Series

One-dimensional labeled array with homogeneous data type. Similar to a column in a DataFrame but can exist independently.

class Series:
    """One-dimensional homogeneous array; the building block of a DataFrame column."""

    def __init__(
        self, 
        name=None, 
        values=None, 
        dtype=None, 
        strict=True, 
        nan_to_null=False, 
        dtype_if_empty=Null
    ):
        """
        Construct a Series from raw values.

        Parameters:
        - name: Label attached to the Series
        - values: Sequence of data values
        - dtype: Explicit element type (inferred when omitted)
        - strict: Raise instead of coercing on type mismatch
        - nan_to_null: Replace floating-point NaN with null
        - dtype_if_empty: Element type used when values is empty
        """

    @property
    def name(self) -> str:
        """Name (label) of this Series."""

    @property
    def dtype(self) -> DataType:
        """Element data type of this Series."""

    @property
    def shape(self) -> tuple[int]:
        """One-element tuple holding the length."""

    def len(self) -> int:
        """Number of elements."""

    def sum(self) -> Any:
        """Total of the values."""

    def mean(self) -> float | None:
        """Arithmetic mean of the values."""

    def max(self) -> Any:
        """Largest value."""

    def min(self) -> Any:
        """Smallest value."""

    def sort(self, *, descending=False, nulls_last=False) -> Series:
        """Return a sorted copy of this Series."""

    def filter(self, predicate) -> Series:
        """Return only the elements where the predicate holds."""

    def to_list(self) -> list[Any]:
        """Materialize the values as a Python list."""

    def to_numpy(self) -> np.ndarray:
        """Materialize the values as a NumPy array."""

    def to_pandas(self) -> pd.Series:
        """Materialize the values as a pandas Series."""

    def to_frame(self, name=None) -> DataFrame:
        """Wrap this Series as a one-column DataFrame."""

LazyFrame

Lazy evaluation version of DataFrame that builds a query plan without executing until .collect() is called. Enables query optimization and efficient processing of large datasets.

class LazyFrame:
    """Deferred-execution query plan over tabular data; nothing runs until collect()."""

    def select(self, *exprs, **named_exprs) -> LazyFrame:
        """Select columns (lazy operation)."""
    
    def filter(self, *predicates, **constraints) -> LazyFrame:
        """Filter rows (lazy operation)."""
    
    def with_columns(self, *exprs, **named_exprs) -> LazyFrame:
        """Add/modify columns (lazy operation)."""
    
    def drop(self, *columns, strict=True) -> LazyFrame:
        """Drop columns (lazy operation)."""
    
    def sort(self, by, *, descending=False, nulls_last=False, multithreaded=True) -> LazyFrame:
        """Sort by columns (lazy operation)."""
    
    def group_by(self, *by, maintain_order=False, **named_by) -> LazyGroupBy:
        """Group for aggregation (lazy operation)."""
    
    def join(
        self, 
        other, 
        on=None, 
        how="inner", 
        *, 
        left_on=None, 
        right_on=None, 
        suffix="_right",
        validate="m:m", 
        join_nulls=False, 
        coalesce=None
    ) -> LazyFrame:
        """Join with another LazyFrame (lazy operation)."""
    
    def collect(
        self, 
        *, 
        type_coercion=True, 
        predicate_pushdown=True, 
        projection_pushdown=True, 
        simplify_expression=True, 
        slice_pushdown=True, 
        comm_subplan_elim=True, 
        comm_subexpr_elim=True, 
        cluster_with_columns=True, 
        no_optimization=False, 
        streaming=False, 
        background=False, 
        _eager=False
    ) -> DataFrame:
        """
        Execute the lazy query and return DataFrame.
        
        Parameters:
        - type_coercion: Apply automatic type coercion
        - predicate_pushdown: Push filters down to scan level  
        - projection_pushdown: Push column selection down
        - simplify_expression: Simplify expressions
        - slice_pushdown: Push limits/offsets down
        - comm_subplan_elim: Eliminate common subplans
        - comm_subexpr_elim: Eliminate common subexpressions
        - cluster_with_columns: Cluster with_columns operations
        - no_optimization: Disable all optimizations
        - streaming: Execute in streaming mode
        - background: Execute in background thread
        
        Returns:
        Executed DataFrame
        """
    
    def explain(self, *, optimized=True, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, streaming=False) -> str:
        """Get query execution plan."""
    
    def schema(self) -> Schema:
        """Get the expected schema."""
        # NOTE(review): on polars LazyFrame, schema/dtypes/columns are
        # properties (with collect_schema() now preferred), not methods —
        # confirm against the pinned polars version.
    
    def dtypes(self) -> list[DataType]:
        """Get expected column data types."""
    
    def columns(self) -> list[str]:
        """Get expected column names."""
    
    def head(self, n=5) -> LazyFrame:
        """Get first n rows (lazy operation)."""
    
    def tail(self, n=5) -> LazyFrame:
        """Get last n rows (lazy operation)."""
    
    def limit(self, n) -> LazyFrame:
        """Limit number of rows (lazy operation)."""
    
    def offset(self, n) -> LazyFrame:
        """Skip first n rows (lazy operation)."""
        # NOTE(review): polars LazyFrame may not expose offset(); slice()
        # provides the same capability — verify this method exists upstream.
    
    def slice(self, offset, length=None) -> LazyFrame:
        """Slice rows (lazy operation)."""

GroupBy Operations

GroupBy objects returned from group_by() operations on DataFrame and LazyFrame for aggregation operations.

class GroupBy:
    """Grouped view of a DataFrame; call an aggregation to produce a result."""

    def agg(self, *aggs, **named_aggs) -> DataFrame:
        """
        Compute one or more aggregations over each group.

        Parameters:
        - aggs: Aggregation expressions applied per group
        - named_aggs: Aggregations supplied as keyword arguments

        Returns:
        DataFrame with one row per group
        """

    def sum(self) -> DataFrame:
        """Per-group sum."""

    def mean(self) -> DataFrame:
        """Per-group arithmetic mean."""

    def max(self) -> DataFrame:
        """Per-group maximum."""

    def min(self) -> DataFrame:
        """Per-group minimum."""

    def count(self) -> DataFrame:
        """Number of rows in each group."""

    def first(self) -> DataFrame:
        """First value within each group."""

    def last(self) -> DataFrame:
        """Last value within each group."""

class LazyGroupBy:
    """Grouped view of a LazyFrame; each aggregation extends the query plan."""

    def agg(self, *aggs, **named_aggs) -> LazyFrame:
        """Add per-group aggregations to the query plan."""

    def sum(self) -> LazyFrame:
        """Plan a per-group sum."""

    def mean(self) -> LazyFrame:
        """Plan a per-group mean."""

    def max(self) -> LazyFrame:
        """Plan a per-group maximum."""

    def min(self) -> LazyFrame:
        """Plan a per-group minimum."""

    def count(self) -> LazyFrame:
        """Plan a per-group row count."""

Usage Examples

Creating DataFrames

import polars as pl

# Column-oriented construction: a dict of column name -> values
columns = {
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "salary": [50000, 60000, 70000]
}
df = pl.DataFrame(columns)

# Row-oriented construction: a list of per-row dicts
rows = [
    {"name": "Alice", "age": 25, "salary": 50000},
    {"name": "Bob", "age": 30, "salary": 60000},
    {"name": "Charlie", "age": 35, "salary": 70000}
]
df = pl.DataFrame(rows)

# Construction from a NumPy array, naming the columns via schema
import numpy as np
matrix = np.array([[1, 2, 3], [4, 5, 6]])
df = pl.DataFrame(matrix, schema=["a", "b", "c"])

DataFrame Operations

# Basic operations, written as explicit named steps
adults = df.filter(pl.col("age") > 28)
projected = adults.select([
    pl.col("name"),
    pl.col("age"),
    (pl.col("salary") / 1000).alias("salary_k")
])
result = projected.sort("age", descending=True)

# Grouping and aggregation
grouped = df.group_by("department")
summary = grouped.agg([
    pl.col("salary").mean().alias("avg_salary"),
    pl.col("name").count().alias("employee_count"),
    pl.col("age").max().alias("max_age")
])

Lazy Operations

# Compose the query lazily; nothing executes until collect()
scan = pl.scan_csv("large_dataset.csv")
filtered = scan.filter(pl.col("amount") > 1000)
aggregated = filtered.group_by("category").agg([
    pl.col("amount").sum().alias("total"),
    pl.col("id").count().alias("count")
])
lazy_query = aggregated.sort("total", descending=True)

# Run the optimized plan
result = lazy_query.collect()

# Inspect the plan that will be executed
print(lazy_query.explain())

Working with Large Datasets (64-bit Index)

# The u64-idx build supports frames with more than 4.2B rows
very_large_df = pl.scan_parquet("huge_dataset.parquet")

# The API is unchanged; only the row-index width differs
recent = very_large_df.filter(pl.col("timestamp") > "2023-01-01")
per_user = recent.group_by("user_id").agg([
    pl.col("value").sum(),
    pl.col("event").count()
])
result = per_user.collect(streaming=True)  # streaming keeps memory bounded

Install with Tessl CLI

npx tessl i tessl/pypi-polars-u64-idx

docs

config-utilities.md

core-data-structures.md

data-types.md

expressions.md

functions.md

index.md

io-operations.md

selectors.md

sql-interface.md

tile.json