Blazingly fast DataFrame library with 64-bit index support for handling datasets with more than 4.2 billion rows
—
Comprehensive type system with numeric, text, temporal, and nested types, plus schema definition and validation capabilities. Polars provides a rich type system that ensures data integrity and enables optimized operations.
Integer and floating-point data types with various precision levels.
# Signed integers
class Int8:
"""8-bit signed integer (-128 to 127)."""
class Int16:
"""16-bit signed integer (-32,768 to 32,767)."""
class Int32:
"""32-bit signed integer (-2^31 to 2^31-1)."""
class Int64:
"""64-bit signed integer (-2^63 to 2^63-1)."""
class Int128:
"""128-bit signed integer."""
# Unsigned integers
class UInt8:
"""8-bit unsigned integer (0 to 255)."""
class UInt16:
"""16-bit unsigned integer (0 to 65,535)."""
class UInt32:
"""32-bit unsigned integer (0 to 2^32-1)."""
class UInt64:
"""64-bit unsigned integer (0 to 2^64-1)."""
# Floating point
class Float32:
"""32-bit floating point number."""
class Float64:
"""64-bit floating point number."""
class Decimal:
"""Arbitrary precision decimal number."""
def __init__(self, precision: int | None = None, scale: int = 0):
"""
Create decimal type.
Parameters:
- precision: Number of significant digits
- scale: Number of decimal places
"""String and binary data types for text processing.
class String:
"""UTF-8 encoded string data (variable length)."""
class Utf8:
"""UTF-8 encoded string data (alias for String)."""
class Binary:
"""Binary data (bytes)."""Date, time, and duration types for temporal data processing.
class Date:
"""Calendar date (year, month, day)."""
class Datetime:
"""Date and time with optional timezone."""
def __init__(self, time_unit: TimeUnit = "us", time_zone: str | None = None):
"""
Create datetime type.
Parameters:
- time_unit: Time precision ("ns", "us", "ms", "s")
- time_zone: Timezone (e.g., "UTC", "America/New_York")
"""
class Time:
"""Time of day (hour, minute, second, subsecond)."""
class Duration:
"""Time duration/interval."""
def __init__(self, time_unit: TimeUnit = "us"):
"""
Create duration type.
Parameters:
- time_unit: Time precision ("ns", "us", "ms", "s")
"""Boolean values and special data types.
class Boolean:
"""Boolean true/false values."""
class Null:
"""Null type (no data)."""
class Unknown:
"""Unknown type placeholder."""
class Object:
"""Python object type (stores arbitrary Python objects)."""Types for categorical and enumerated data with optimized storage.
class Categorical:
"""Categorical data with string categories."""
def __init__(self, ordering: CategoricalOrdering = "physical"):
"""
Create categorical type.
Parameters:
- ordering: Category ordering ("physical" or "lexical")
"""
class Enum:
"""Enumerated type with fixed set of string values."""
def __init__(self, categories: list[str] | Series):
"""
Create enum type.
Parameters:
- categories: Fixed list of valid string values
"""
class Categories:
"""Categories metadata for categorical types."""Complex nested data structures including lists, arrays, and structs.
class List:
"""Variable-length list of same-typed elements."""
def __init__(self, inner: DataType):
"""
Create list type.
Parameters:
- inner: Element data type
"""
class Array:
"""Fixed-length array of same-typed elements."""
def __init__(self, inner: DataType, shape: int | tuple[int, ...]):
"""
Create array type.
Parameters:
- inner: Element data type
- shape: Array dimensions
"""
class Struct:
"""Struct/record type with named fields."""
def __init__(self, fields: list[Field] | dict[str, DataType]):
"""
Create struct type.
Parameters:
- fields: List of Field objects or dict mapping names to types
"""
class Field:
"""Named field in struct type."""
def __init__(self, name: str, dtype: DataType):
"""
Create field.
Parameters:
- name: Field name
- dtype: Field data type
"""Schema class for defining and validating DataFrame structure.
class Schema:
def __init__(self, schema: Mapping[str, DataType] | Iterable[tuple[str, DataType]] | None = None):
"""
Create schema.
Parameters:
- schema: Mapping of column names to data types
"""
def __getitem__(self, item: str) -> DataType:
"""Get data type for column."""
def __contains__(self, item: str) -> bool:
"""Check if column exists in schema."""
def __iter__(self) -> Iterator[str]:
"""Iterate over column names."""
def __len__(self) -> int:
"""Get number of columns."""
def names(self) -> list[str]:
"""Get all column names."""
def dtypes(self) -> list[DataType]:
"""Get all data types."""
def to_python(self) -> dict[str, type]:
"""Convert to Python type mapping."""Utility functions for working with data types.
def dtype_to_py_type(dtype: DataType) -> type:
"""
Convert Polars data type to Python type.
Parameters:
- dtype: Polars data type
Returns:
Corresponding Python type
"""
def is_polars_dtype(dtype: Any) -> bool:
"""
Check if object is a Polars data type.
Parameters:
- dtype: Object to check
Returns:
True if Polars data type
"""
def py_type_to_constructor(py_type: type) -> DataType:
"""
Get Polars constructor for Python type.
Parameters:
- py_type: Python type
Returns:
Polars data type constructor
"""
def numpy_char_code_to_dtype(char_code: str) -> DataType | None:
"""
Convert NumPy character code to Polars data type.
Parameters:
- char_code: NumPy dtype character code
Returns:
Polars data type or None
"""
def unpack_dtypes(*dtypes: DataType | Iterable[DataType]) -> list[DataType]:
"""
Unpack and flatten data type specifications.
Parameters:
- dtypes: Data type specifications
Returns:
Flattened list of data types
"""Type groups and constants for working with related data types.
class IntegerType:
"""Base class for integer types."""
class TemporalType:
"""Base class for temporal types."""
class DataTypeClass:
"""Metaclass for data type classes."""
# Constants
N_INFER_DEFAULT: int # Default number of rows for type inference
DTYPE_TEMPORAL_UNITS: frozenset[str] # Valid temporal units
import polars as pl
# Creating DataFrames with explicit types
# The `schema` argument pins each column's dtype up front instead of
# relying on type inference from the Python values.
df = pl.DataFrame({
"id": [1, 2, 3],
"name": ["Alice", "Bob", "Charlie"],
"salary": [50000.0, 60000.0, 70000.0],
"is_active": [True, False, True]
}, schema={
"id": pl.Int32,
"name": pl.String,
"salary": pl.Float64,
"is_active": pl.Boolean
})
# Schema inspection: `.schema` maps column names to dtypes, `.dtypes` lists dtypes only
print(df.schema)
print(df.dtypes)
# Creating datetime columns with different precisions
# Parse string columns into temporal dtypes with str.strptime.
# pl.Datetime takes a time-unit argument ("ns", "us", "ms", "s"),
# matching the Datetime constructor documented above.
df = pl.DataFrame({
"timestamp_us": ["2023-01-01 12:00:00"],
"timestamp_ms": ["2023-01-01 12:00:00"],
"date_only": ["2023-01-01"],
"time_only": ["12:00:00"]
}).with_columns([
pl.col("timestamp_us").str.strptime(pl.Datetime("us")),
pl.col("timestamp_ms").str.strptime(pl.Datetime("ms")),
pl.col("date_only").str.strptime(pl.Date),
pl.col("time_only").str.strptime(pl.Time)
])
# Working with timezones
df_tz = pl.DataFrame({
"utc_time": ["2023-01-01 12:00:00"]
}).with_columns([
pl.col("utc_time").str.strptime(pl.Datetime("us", "UTC"))
])
# Categorical data
# Cast a string column to the Categorical dtype
# (repeated values share dictionary-encoded categories).
df = pl.DataFrame({
"category": ["A", "B", "A", "C", "B"]
}).with_columns([
pl.col("category").cast(pl.Categorical)
])
# Enum with fixed categories
df = pl.DataFrame({
"status": ["active", "inactive", "pending"]
}).with_columns([
pl.col("status").cast(pl.Enum(["active", "inactive", "pending"]))
])
# List columns
# List columns: nested Python lists infer a List dtype; the inner
# element type here is inferred as Int64.
df = pl.DataFrame({
"numbers": [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
})
print(df.dtypes) # [List(Int64)]
# Struct columns: dicts become Struct values with one Field per key
df = pl.DataFrame({
"person": [
{"name": "Alice", "age": 25},
{"name": "Bob", "age": 30}
]
})
print(df.dtypes) # [Struct([Field('name', String), Field('age', Int64)])]
# Creating nested types explicitly
schema = pl.Schema({
"id": pl.Int32,
"scores": pl.List(pl.Float64),
"metadata": pl.Struct([
pl.Field("created_at", pl.Datetime),
pl.Field("version", pl.String)
])
})
df = pl.DataFrame({
"text_numbers": ["1", "2", "3"],
"floats": [1.0, 2.0, 3.0]
})
# Cast to different types
result = df.with_columns([
pl.col("text_numbers").cast(pl.Int32).alias("integers"),
pl.col("floats").cast(pl.Int64).alias("rounded")
])
# Safe casting with error handling
result = df.with_columns([
pl.col("text_numbers").cast(pl.Int32, strict=False).alias("safe_cast")
])
# Define expected schema
expected_schema = pl.Schema({
"id": pl.Int32,
"name": pl.String,
"amount": pl.Float64,
"timestamp": pl.Datetime("us")
})
# Read with schema validation
df = pl.read_csv("data.csv", schema=expected_schema)
# Override specific types
df = pl.read_csv("data.csv", schema_overrides={
"id": pl.String, # Read ID as string instead of number
"amount": pl.Decimal(10, 2) # Use decimal for precise amounts
})
# High precision decimal calculations
df = pl.DataFrame({
"price": ["19.99", "29.99", "9.95"]
}).with_columns([
pl.col("price").cast(pl.Decimal(10, 2))
])
# Financial calculations maintaining precision
result = df.with_columns([
(pl.col("price") * pl.lit("1.08")).alias("with_tax"),
(pl.col("price") * pl.lit("0.9")).alias("discounted")
])
# Check data types
df = pl.DataFrame({"mixed": [1, 2.5, "text"]})
print(pl.dtype_to_py_type(df.dtypes[0]))
# Type checking
schema = df.schema
for name, dtype in schema.items():
print(f"{name}: {dtype}")
if isinstance(dtype, pl.List):
print(f" List element type: {dtype.inner}")
elif isinstance(dtype, pl.Struct):
print(f" Struct fields: {dtype.fields}")Install with Tessl CLI
npx tessl i tessl/pypi-polars-u64-idx