tessl/pypi-polars-lts-cpu

Blazingly fast DataFrame library for legacy CPUs without AVX2 support

—

Pending

Overview

Eval results

Files

Configuration and Utilities

Name: tessl/pypi-polars-lts-cpu
Author: tessl

Configuration options, selectors for column operations, string caching for categorical data, meta information utilities, and testing utilities for DataFrame comparisons. These components provide essential support for customizing Polars behavior and working efficiently with data.

Capabilities

Configuration

Customize Polars display options, performance settings, and behavior through the Config class.

class Config:
    """
    Global configuration manager for Polars.

    Every ``set_*`` option is a classmethod, so it can be called directly on
    the class (``Config.set_tbl_rows(10)``) or on an instance, and returns
    the class so that calls can be chained. An instance can also be used as
    a context manager or as a function decorator to apply options
    temporarily.
    """

    def __init__(self, **options: Any) -> None:
        """
        Create a configuration scope, optionally setting options up front.

        Parameters:
        - options: Option values keyed by setter name (e.g. set_tbl_rows=5);
          each one is applied as if the named setter had been called
        """

    # Table Display Configuration
    @classmethod
    def set_tbl_cols(cls, n: int) -> type[Config]:
        """
        Set maximum number of columns to display.

        Parameters:
        - n: Maximum columns (-1 for unlimited)

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    @classmethod
    def set_tbl_rows(cls, n: int) -> type[Config]:
        """
        Set maximum number of rows to display.

        Parameters:
        - n: Maximum rows (-1 for unlimited)

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    @classmethod
    def set_tbl_width_chars(cls, width: int) -> type[Config]:
        """
        Set maximum table width in characters.

        Parameters:
        - width: Maximum width in characters

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    @classmethod
    def set_tbl_column_data_type_inline(cls, active: bool = True) -> type[Config]:
        """
        Show column data types inline with headers.

        Parameters:
        - active: Enable inline data types

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    @classmethod
    def set_tbl_dataframe_shape_below(cls, active: bool = True) -> type[Config]:
        """
        Display DataFrame shape below the table.

        Parameters:
        - active: Show shape below table

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    @classmethod
    def set_tbl_formatting(
        cls,
        format: str = "UTF8_FULL_CONDENSED",
        rounded_corners: bool = False
    ) -> type[Config]:
        """
        Set table formatting style.

        Parameters:
        - format: Table format style
        - rounded_corners: Use rounded table corners

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    @classmethod
    def set_tbl_hide_column_data_types(cls, active: bool = True) -> type[Config]:
        """
        Hide column data types from display.

        Parameters:
        - active: Hide data types

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    @classmethod
    def set_tbl_hide_column_names(cls, active: bool = True) -> type[Config]:
        """
        Hide column names from display.

        Parameters:
        - active: Hide column names

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    @classmethod
    def set_tbl_hide_dtype_separator(cls, active: bool = True) -> type[Config]:
        """
        Hide separator between column names and types.

        Parameters:
        - active: Hide dtype separator

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    # Performance and Behavior Configuration
    @classmethod
    def set_verbose(cls, active: bool = True) -> type[Config]:
        """
        Enable verbose output for debugging.

        Parameters:
        - active: Enable verbose mode

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    @classmethod
    def set_streaming_chunk_size(cls, size: int) -> type[Config]:
        """
        Set chunk size for streaming operations.

        Parameters:
        - size: Chunk size in rows

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    @classmethod
    def set_auto_structify(cls, active: bool = True) -> type[Config]:
        """
        Automatically convert eligible data to struct format.

        Parameters:
        - active: Enable auto structification

        Returns:
        - type[Config]: The Config class, for method chaining
        """

    # Context Manager Support
    def __enter__(self) -> Config:
        """
        Enter configuration context.

        Returns:
        - Config: This instance, usable as the `as` target of a with-block
        """

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit configuration context, restoring previous settings."""

    # Function Decorator Support
    def __call__(self, func: Callable) -> Callable:
        """
        Use as function decorator to apply config temporarily.

        Parameters:
        - func: Function to run under this configuration

        Returns:
        - Callable: Wrapped function
        """

String Cache

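Optimize memory usage and performance for categorical string data through string interning: Categorical columns created while the string cache is active share one string-to-category mapping, so columns built separately can be compared, concatenated, and joined without re-encoding.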

class StringCache:
    """
    Context manager that enables the string cache for the duration of a
    with-block and disables it again on exit.
    """

    def __init__(self):
        """Context manager for string cache operations."""

    def __enter__(self) -> StringCache:
        """
        Enable string cache.

        Returns:
        - StringCache: The context manager instance
        """

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Disable string cache and clean up."""

def enable_string_cache() -> None:
    """
    Enable global string cache for categorical operations.

    Strings are interned for memory efficiency and faster comparisons.
    The cache stays enabled until disable_string_cache() is called.
    """

def disable_string_cache() -> None:
    """
    Disable global string cache.

    Cleans up interned strings and returns to normal string handling.
    """

def using_string_cache() -> bool:
    """
    Check whether the string cache is currently enabled.

    Returns:
    - bool: True if string cache is active, False otherwise
    """

Meta Information

Access build information, version details, and system configuration.

def build_info() -> dict[str, str]:
    """
    Get Polars build information.

    Returns:
    - dict[str, str]: Build details including version, features, target
      (NOTE(review): some builds may report only the version — confirm
      which keys are present before indexing)
    """

def show_versions() -> None:
    """
    Display version information for Polars and key dependencies.

    Prints version details to stdout for debugging and support; nothing
    is returned.
    """

def thread_pool_size() -> int:
    """
    Get the current thread pool size used for parallel operations.

    Returns:
    - int: Number of threads in the pool
    """

def threadpool_size() -> int:
    """
    Alias for thread_pool_size().

    NOTE(review): upstream Polars appears to deprecate this spelling in
    favour of thread_pool_size() — confirm for the packaged version.

    Returns:
    - int: Number of threads in the pool
    """

def get_index_type() -> type:
    """
    Get the data type used for DataFrame indices.

    Returns:
    - type: Index data type (typically UInt32 or UInt64)
    """

Selectors System

Powerful column selection system for flexible DataFrame operations.

class Selector:
    """
    Base class for column selectors.

    Selector functions return instances of this class, which can be
    combined with `|`, `&`, and negated with `~`.
    """

# Type-based Selectors
def by_dtype(*dtypes: type) -> Selector:
    """
    Select columns by data type.

    Parameters:
    - dtypes: One or more data types to select

    Returns:
    - Selector: Selector for columns of the given data types
    """

def numeric() -> Selector:
    """
    Select numeric columns.

    Returns:
    - Selector: Selector for int, float, and decimal columns
    """

def integer() -> Selector:
    """
    Select integer columns.

    Returns:
    - Selector: Selector for integer columns
    """

def signed_integer() -> Selector:
    """
    Select signed integer columns.

    Returns:
    - Selector: Selector for signed integer columns
    """

def unsigned_integer() -> Selector:
    """
    Select unsigned integer columns.

    Returns:
    - Selector: Selector for unsigned integer columns
    """

def float() -> Selector:
    """
    Select floating-point columns.

    Returns:
    - Selector: Selector for floating-point columns
    """

def string() -> Selector:
    """
    Select string/text columns.

    Returns:
    - Selector: Selector for string columns
    """

def boolean() -> Selector:
    """
    Select boolean columns.

    Returns:
    - Selector: Selector for boolean columns
    """

def temporal() -> Selector:
    """
    Select temporal columns.

    Returns:
    - Selector: Selector for date, datetime, time, and duration columns
    """

def date() -> Selector:
    """
    Select date columns.

    Returns:
    - Selector: Selector for date columns
    """

def datetime(time_unit: str | None = None, time_zone: str | None = None) -> Selector:
    """
    Select datetime columns with optional unit/timezone filtering.

    Parameters:
    - time_unit: Optional time unit to filter by
    - time_zone: Optional time zone to filter by

    Returns:
    - Selector: Selector for datetime columns
    """

def time() -> Selector:
    """
    Select time columns.

    Returns:
    - Selector: Selector for time columns
    """

def duration(time_unit: str | None = None) -> Selector:
    """
    Select duration columns with optional unit filtering.

    Parameters:
    - time_unit: Optional time unit to filter by

    Returns:
    - Selector: Selector for duration columns
    """

def categorical() -> Selector:
    """
    Select categorical columns.

    Returns:
    - Selector: Selector for categorical columns
    """

def enum() -> Selector:
    """
    Select enum columns.

    Returns:
    - Selector: Selector for enum columns
    """

def binary() -> Selector:
    """
    Select binary data columns.

    Returns:
    - Selector: Selector for binary columns
    """

def decimal() -> Selector:
    """
    Select decimal columns.

    Returns:
    - Selector: Selector for decimal columns
    """

# Complex Type Selectors
def list() -> Selector:
    """
    Select list columns.

    Returns:
    - Selector: Selector for list columns
    """

def array() -> Selector:
    """
    Select array columns.

    Returns:
    - Selector: Selector for array columns
    """

def struct() -> Selector:
    """
    Select struct columns.

    Returns:
    - Selector: Selector for struct columns
    """

def nested() -> Selector:
    """
    Select nested columns.

    Returns:
    - Selector: Selector for list, array, and struct columns
    """

# Position-based Selectors
def first() -> Selector:
    """
    Select first column.

    Returns:
    - Selector: Selector for the first column
    """

def last() -> Selector:
    """
    Select last column.

    Returns:
    - Selector: Selector for the last column
    """

def by_index(*indices: int) -> Selector:
    """
    Select columns by index positions.

    Parameters:
    - indices: One or more column index positions

    Returns:
    - Selector: Selector for the columns at the given positions
    """

# Name-based Selectors
def by_name(*names: str | list[str]) -> Selector:
    """
    Select columns by exact names.

    Parameters:
    - names: Column names, given as strings or lists of strings

    Returns:
    - Selector: Selector for the named columns
    """

def matches(pattern: str, *, flags: int = 0) -> Selector:
    """
    Select columns matching regex pattern.

    Parameters:
    - pattern: Regular expression to match column names against
    - flags: Regex flags (NOTE(review): upstream Polars appears to accept
      only `pattern` — confirm `flags` exists in the packaged version)

    Returns:
    - Selector: Selector for columns whose names match the pattern
    """

def contains(substring: str) -> Selector:
    """
    Select columns containing substring.

    Parameters:
    - substring: Text to look for in column names

    Returns:
    - Selector: Selector for columns whose names contain the substring
    """

def starts_with(prefix: str) -> Selector:
    """
    Select columns starting with prefix.

    Parameters:
    - prefix: Leading text of the column names to select

    Returns:
    - Selector: Selector for columns whose names start with the prefix
    """

def ends_with(suffix: str) -> Selector:
    """
    Select columns ending with suffix.

    Parameters:
    - suffix: Trailing text of the column names to select

    Returns:
    - Selector: Selector for columns whose names end with the suffix
    """

# Character Class Selectors
def alpha() -> Selector:
    """
    Select columns with alphabetic names.

    Returns:
    - Selector: Selector for columns with alphabetic names
    """

def alphanumeric() -> Selector:
    """
    Select columns with alphanumeric names.

    Returns:
    - Selector: Selector for columns with alphanumeric names
    """

def digit() -> Selector:
    """
    Select columns with numeric names.

    Returns:
    - Selector: Selector for columns with numeric names
    """

# Utility Selectors
def all() -> Selector:
    """
    Select all columns.

    Returns:
    - Selector: Selector for every column
    """

def exclude(*selectors: Selector | str) -> Selector:
    """
    Exclude specified selectors or column names.

    Parameters:
    - selectors: Selectors or column names to leave out

    Returns:
    - Selector: Selector with the specified columns excluded
    """

# Selector Operations
def expand_selector(
    frame: DataFrame | LazyFrame,
    *selectors: Selector | str
) -> list[str]:
    """
    Expand selectors to column names for given frame.

    NOTE(review): upstream Polars appears to take a single selector
    (combine several with `|`) and to return a tuple of names — confirm
    against the packaged version.

    Parameters:
    - frame: DataFrame or LazyFrame to expand selectors against
    - selectors: Selectors to expand

    Returns:
    - list[str]: Column names matching selectors
    """

def is_selector(obj: Any) -> bool:
    """
    Check whether an object is a selector.

    Parameters:
    - obj: Object to check

    Returns:
    - bool: True if object is a selector, False otherwise
    """

Testing Utilities

Assertion functions for comparing DataFrames and Series in tests.

def assert_frame_equal(
    left: DataFrame | LazyFrame,
    right: DataFrame | LazyFrame,
    *,
    check_dtype: bool = True,
    check_exact: bool = False,
    rtol: float = 1e-5,
    atol: float = 1e-8,
    categorical_as_str: bool = False,
    check_column_order: bool = True,
    check_row_order: bool = True
) -> None:
    """
    Assert that two DataFrames (or LazyFrames) are equal.

    All options after `right` are keyword-only.

    NOTE(review): newer upstream Polars releases appear to spell the
    dtype option `check_dtypes` — confirm for the packaged version.

    Parameters:
    - left: First DataFrame or LazyFrame
    - right: Second DataFrame or LazyFrame
    - check_dtype: Check column data types
    - check_exact: Check exact floating-point equality
    - rtol: Relative tolerance for floating-point comparison
    - atol: Absolute tolerance for floating-point comparison
    - categorical_as_str: Compare categoricals as strings
    - check_column_order: Check column order
    - check_row_order: Check row order

    Raises:
    - AssertionError: If DataFrames are not equal
    """

def assert_frame_not_equal(
    left: DataFrame | LazyFrame,
    right: DataFrame | LazyFrame,
    **kwargs
) -> None:
    """
    Assert that two DataFrames (or LazyFrames) are not equal.

    Parameters:
    - left: First DataFrame or LazyFrame
    - right: Second DataFrame or LazyFrame
    - **kwargs: Same parameters as assert_frame_equal

    Raises:
    - AssertionError: If DataFrames are equal
    """

def assert_series_equal(
    left: Series,
    right: Series,
    *,
    check_dtype: bool = True,
    check_exact: bool = False,
    rtol: float = 1e-5,
    atol: float = 1e-8,
    categorical_as_str: bool = False,
    check_names: bool = True
) -> None:
    """
    Assert that two Series are equal.

    All options after `right` are keyword-only.

    NOTE(review): newer upstream Polars releases appear to spell the
    dtype option `check_dtypes` — confirm for the packaged version.

    Parameters:
    - left: First Series
    - right: Second Series
    - check_dtype: Check data types
    - check_exact: Check exact floating-point equality
    - rtol: Relative tolerance for floating-point comparison
    - atol: Absolute tolerance for floating-point comparison
    - categorical_as_str: Compare categoricals as strings
    - check_names: Check Series names

    Raises:
    - AssertionError: If Series are not equal
    """

def assert_series_not_equal(
    left: Series,
    right: Series,
    **kwargs
) -> None:
    """
    Assert that two Series are not equal.

    Parameters:
    - left: First Series
    - right: Second Series
    - **kwargs: Same parameters as assert_series_equal

    Raises:
    - AssertionError: If Series are equal
    """

Usage Examples

Configuration Usage

import polars as pl

# Global configuration changes
pl.Config.set_tbl_rows(10)
pl.Config.set_tbl_cols(8)
pl.Config.set_verbose(True)

# Sample frame with more rows than the display limits set above
large_df = pl.DataFrame({"value": list(range(100))})

# Context manager for temporary config
with pl.Config() as cfg:
    cfg.set_tbl_rows(20)
    cfg.set_tbl_cols(12)
    # Configuration active only within this block
    print(large_df)  # Uses temporary settings

# Function decorator for config
@pl.Config(set_tbl_rows=5, set_verbose=False)
def analyze_data(df):
    """Describe `df` while the decorator's display options are in effect."""
    return df.describe()

# Streaming configuration
pl.Config.set_streaming_chunk_size(50000)

String Cache Usage

# Context manager approach
with pl.StringCache():
    # The string cache applies to Categorical data, so cast the key columns;
    # Categoricals created inside this block share one string mapping
    df1 = pl.DataFrame({"category": ["A", "B", "A", "C", "B"]}).with_columns(
        pl.col("category").cast(pl.Categorical)
    )
    df2 = pl.DataFrame({"category": ["A", "B", "C"]}).with_columns(
        pl.col("category").cast(pl.Categorical)
    )

    # Categorical columns from both frames can be joined directly
    result = df1.join(df2, on="category")

# Global enable/disable
pl.enable_string_cache()

# Check if enabled
if pl.using_string_cache():
    print("String cache is active")

# Categorical operations benefit from string cache
df = pl.DataFrame({"category": ["A", "B", "A", "C", "B"]})
df_cat = df.with_columns(pl.col("category").cast(pl.Categorical))

pl.disable_string_cache()

Meta Information

# Get build information
build_info = pl.build_info()
print(f"Polars version: {build_info['version']}")
# Builds without detailed build metadata may report only the version,
# so read optional keys with .get() instead of indexing
print(f"Build features: {build_info.get('features', 'unavailable')}")

# Show all version information
pl.show_versions()

# Thread pool information
thread_count = pl.thread_pool_size()
print(f"Using {thread_count} threads")

# Index type information
index_type = pl.get_index_type()
print(f"Index type: {index_type}")

Selectors Usage

from datetime import date

import polars.selectors as cs

df = pl.DataFrame({
    "id": [1, 2, 3],
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "salary": [50000.0, 60000.0, 70000.0],
    "active": [True, False, True],
    # Plain Python dates: pl.date(...) builds an expression, not a value,
    # so it cannot be used as literal column data
    "start_date": [date(2020, 1, 1), date(2019, 5, 15), date(2021, 3, 10)]
})

# Type-based selection
numeric_cols = df.select(cs.numeric())
string_cols = df.select(cs.string())
temporal_cols = df.select(cs.temporal())

# Name-based selection
name_pattern_cols = df.select(cs.matches(r".*a.*"))  # Contains 'a'
prefix_cols = df.select(cs.starts_with("s"))  # Starts with 's'

# Combined selectors
analysis_cols = df.select(cs.numeric() | cs.temporal())
non_id_cols = df.select(cs.all() & ~cs.by_name("id"))

# Complex selector operations: merge selectors with `|` so that no column
# is selected twice (overlapping selectors passed as separate arguments
# would produce duplicate column names)
selected_cols = df.select(
    (cs.numeric() & ~cs.by_name("id"))  # Numeric except id
    | cs.string()                       # All strings
)
non_boolean_cols = df.select(cs.exclude(cs.boolean()))  # Everything except boolean

# Expand selectors to column names (pass one combined selector)
expanded = cs.expand_selector(df, cs.numeric() | cs.string())
print(f"Selected columns: {expanded}")

Testing Utilities

import polars.testing as plt

# Create two test DataFrames with identical columns and values
df1 = pl.DataFrame({
    "a": [1, 2, 3],
    "b": [4.0, 5.0, 6.0],
    "c": ["x", "y", "z"]
})

df2 = pl.DataFrame({
    "a": [1, 2, 3],
    "b": [4.0, 5.0, 6.0],
    "c": ["x", "y", "z"]
})

# Assert DataFrames are equal
plt.assert_frame_equal(df1, df2)

# Assert with tolerance for floating-point: column "b" differs from df1
# by 0.0001 per value, so a looser relative tolerance is passed
df3 = pl.DataFrame({
    "a": [1, 2, 3],
    "b": [4.0001, 5.0001, 6.0001],
    "c": ["x", "y", "z"]
})

plt.assert_frame_equal(df1, df3, rtol=1e-3)

# Assert Series equality (same name and same values)
s1 = pl.Series("values", [1, 2, 3])
s2 = pl.Series("values", [1, 2, 3])
plt.assert_series_equal(s1, s2)

# Assert inequality
df_different = pl.DataFrame({"a": [1, 2, 4]})  # Different values in "a"; also lacks columns "b" and "c"
plt.assert_frame_not_equal(df1, df_different)

# Testing in unit tests
def test_data_processing():
    """Check that doubling column "x" produces the expected frame."""
    source = pl.DataFrame({"x": [1, 2, 3]})
    doubled = source.select(pl.col("x") * 2)

    # Expected output is built last, right where it is compared
    wanted = pl.DataFrame({"x": [2, 4, 6]})
    plt.assert_frame_equal(doubled, wanted)

Advanced Configuration Patterns

# Chained configuration: every set_* call returns a value that accepts
# further set_* calls, so several options fit in one expression
config_result = (
    pl.Config()
    .set_tbl_rows(15)                 # Show at most 15 rows
    .set_tbl_cols(10)                 # Show at most 10 columns
    .set_verbose(True)                # Verbose output for debugging
    .set_streaming_chunk_size(25000)  # Streaming chunk size in rows
)

# Configuration for different environments
def setup_dev_config():
    """Apply development settings: verbose output, no row or column limit."""
    cfg = pl.Config()
    cfg = cfg.set_verbose(True)
    cfg = cfg.set_tbl_rows(-1)  # Show all rows
    return cfg.set_tbl_cols(-1)  # Show all columns

def setup_prod_config():
    """Apply production settings: quiet output, 10 rows, large chunks."""
    cfg = pl.Config()
    cfg = cfg.set_verbose(False)
    cfg = cfg.set_tbl_rows(10)
    return cfg.set_streaming_chunk_size(100000)

# Environment-specific setup
import os  # needed for os.getenv below

# Any value other than "development" (including an unset ENV) selects
# the production settings
if os.getenv("ENV") == "development":
    setup_dev_config()
else:
    setup_prod_config()

String Cache Performance Benefits

# Performance comparison example
import time

# Without string cache
# perf_counter() is the monotonic, high-resolution clock meant for timing
start_time = time.perf_counter()
for _ in range(1000):
    # The string cache only affects Categorical data, so cast the column
    df = pl.DataFrame({"cat": ["A", "B", "C"] * 1000}).with_columns(
        pl.col("cat").cast(pl.Categorical)
    )
    result = df.filter(pl.col("cat") == "A")
no_cache_time = time.perf_counter() - start_time

# With string cache; the context manager disables the cache again on exit,
# even if the loop raises
with pl.StringCache():
    start_time = time.perf_counter()
    for _ in range(1000):
        df = pl.DataFrame({"cat": ["A", "B", "C"] * 1000}).with_columns(
            pl.col("cat").cast(pl.Categorical)
        )
        result = df.filter(pl.col("cat") == "A")
    cache_time = time.perf_counter() - start_time

print(f"Without cache: {no_cache_time:.3f}s")
print(f"With cache: {cache_time:.3f}s")
# Guard against a zero elapsed time; the ratio is measured, not guaranteed
# to exceed 1
if cache_time > 0:
    print(f"Speedup: {no_cache_time/cache_time:.2f}x")

CompatLevel

Compatibility level for the data structures Polars produces when exchanging data with external systems (for example, when converting to or from Arrow), so that the output can be matched to what the receiving system supports.

class CompatLevel:
    """
    Data structure compatibility level for interchange protocols.

    Used to control compatibility when converting to/from external formats
    like Arrow, ensuring data structures are compatible with different
    system requirements.
    """

    @staticmethod
    def newest() -> CompatLevel:
        """
        Get the highest supported compatibility level.

        Warning: The highest compatibility level is considered unstable
        and may change without notice.

        Returns:
        - CompatLevel: The highest supported level
        """

    @staticmethod
    def oldest() -> CompatLevel:
        """
        Get the most compatible level for maximum compatibility.

        Returns:
        - CompatLevel: The most widely compatible level
        """

Install with Tessl CLI