Blazingly fast DataFrame library for Python with lazy and eager evaluation modes
—
Comprehensive type system supporting primitive types, temporal data, nested structures, and schema validation with automatic type inference, casting capabilities, and interoperability with Arrow and other data formats.
Fundamental numeric and boolean types with support for various precision levels and null value handling.
# Integer Types
Int8: DataType # 8-bit signed integer
Int16: DataType # 16-bit signed integer
Int32: DataType # 32-bit signed integer
Int64: DataType # 64-bit signed integer
Int128: DataType # 128-bit signed integer
# Unsigned Integer Types
UInt8: DataType # 8-bit unsigned integer
UInt16: DataType # 16-bit unsigned integer
UInt32: DataType # 32-bit unsigned integer
UInt64: DataType # 64-bit unsigned integer
# Floating Point Types
Float32: DataType # 32-bit floating point
Float64: DataType # 64-bit floating point
# Decimal Type
Decimal: DataType # High-precision decimal type
# Boolean Type
Boolean: DataType # Boolean true/false

Text and binary data types with categorical optimization and encoding support.
# String Types
String: DataType # UTF-8 string type
Utf8: DataType # Alias for String (deprecated)
# Binary Type
Binary: DataType # Binary data type
# Categorical Types
Categorical: DataType # Categorical string type for efficiency
Enum: DataType # Enumerated string type with fixed categories
# Special Types
Null: DataType # Null type
Unknown: DataType # Unknown type placeholder
Object: DataType # Python object type

Date, time, and duration types with timezone support and various precision levels.
# Date and Time Types
Date: DataType # Date type (days since epoch)
Time: DataType # Time of day type
Duration: DataType # Time duration type
# DateTime Type with timezone support
Datetime: DataType # DateTime with optional timezone
# DateTime constructor
def Datetime(time_unit="us", time_zone=None) -> DataType:
"""
Create datetime type with specified precision and timezone.
Parameters:
- time_unit: Precision ("ns", "us", "ms")
- time_zone: Timezone string (e.g., "UTC", "America/New_York")
Returns:
Datetime data type
"""
# Duration constructor
def Duration(time_unit="us") -> DataType:
"""
Create duration type with specified precision.
Parameters:
- time_unit: Precision ("ns", "us", "ms")
Returns:
Duration data type
"""Complex nested structures supporting lists, arrays, and structured data.
# List Type (variable length)
List: DataType
def List(inner=None) -> DataType:
"""
Create list type with specified inner type.
Parameters:
- inner: Inner data type for list elements
Returns:
List data type
"""
# Array Type (fixed length)
Array: DataType
def Array(inner=None, width=None) -> DataType:
"""
Create array type with specified inner type and width.
Parameters:
- inner: Inner data type for array elements
- width: Fixed width of array
Returns:
Array data type
"""
# Struct Type
Struct: DataType
def Struct(fields=None) -> DataType:
"""
Create struct type with specified fields.
Parameters:
- fields: List of Field objects or dict mapping names to types
Returns:
Struct data type
"""Schema definition and validation with field specifications and type checking.
class Schema:
def __init__(self, schema=None):
"""
Create schema from various inputs.
Parameters:
- schema: Dict mapping column names to types, list of Field objects, or existing Schema
"""
def names(self) -> list[str]:
"""Get column names in schema order."""
def dtypes(self) -> list[DataType]:
"""Get column data types in schema order."""
def len(self) -> int:
"""Get number of columns in schema."""
def __contains__(self, item) -> bool:
"""Check if column name exists in schema."""
def __getitem__(self, item) -> DataType:
"""Get data type for column name."""
def __iter__(self):
"""Iterate over (name, dtype) pairs."""
class Field:
def __init__(self, name: str, dtype: DataType):
"""
Create field definition.
Parameters:
- name: Field name
- dtype: Field data type
"""
@property
def name(self) -> str:
"""Field name."""
@property
def dtype(self) -> DataType:
"""Field data type."""Functions for type inspection, validation, and conversion operations.
def dtype_of(value) -> DataType:
"""
Get the data type of a value or expression.
Parameters:
- value: Value or expression to inspect
Returns:
Data type of the value
"""
class DataType:
def __eq__(self, other) -> bool:
"""Check type equality."""
def __ne__(self, other) -> bool:
"""Check type inequality."""
def __hash__(self) -> int:
"""Hash for use in sets/dicts."""
def __repr__(self) -> str:
"""String representation."""
def is_numeric(self) -> bool:
"""Check if type is numeric."""
def is_integer(self) -> bool:
"""Check if type is integer."""
def is_float(self) -> bool:
"""Check if type is floating point."""
def is_temporal(self) -> bool:
"""Check if type is temporal."""
def is_nested(self) -> bool:
"""Check if type is nested (List, Array, Struct)."""Categorical and enumerated types for memory-efficient string handling with optional ordering.
def Categorical(ordering=None) -> DataType:
"""
Create categorical type.
Parameters:
- ordering: Ordering type ("physical" or "lexical")
Returns:
Categorical data type
"""
def Enum(categories=None) -> DataType:
"""
Create enum type with fixed categories.
Parameters:
- categories: List of valid category strings
Returns:
Enum data type
"""
class Categories:
def __init__(self, categories=None):
"""
Create categories definition.
Parameters:
- categories: List of category strings
"""High-precision decimal type for financial and scientific calculations requiring exact decimal representation.
def Decimal(precision=None, scale=0) -> DataType:
"""
Create decimal type with specified precision and scale.
Parameters:
- precision: Total number of digits (default: inferred)
- scale: Number of digits after decimal point
Returns:
Decimal data type
"""import polars as pl
# Create DataFrame with explicit types
df = pl.DataFrame({
"id": [1, 2, 3],
"price": [10.5, 20.0, 15.75],
"category": ["A", "B", "A"],
"date": ["2023-01-01", "2023-01-02", "2023-01-03"]
}, schema={
"id": pl.Int32,
"price": pl.Float64,
"category": pl.Categorical,
"date": pl.Date
})
# Check schema
print(df.schema)
print(df.dtypes)

# Create datetime with timezone
dt_type = pl.Datetime("ms", "UTC")
# Create DataFrame with temporal types
df = pl.DataFrame({
"timestamp": ["2023-01-01T10:30:00", "2023-01-01T11:45:00"],
"date": ["2023-01-01", "2023-01-02"],
"duration": ["1h 30m", "2h 15m"]
}, schema={
"timestamp": pl.Datetime("ms", "UTC"),
"date": pl.Date,
"duration": pl.Duration("ms")
})
# Convert and work with temporal data
result = df.with_columns([
pl.col("timestamp").dt.hour().alias("hour"),
pl.col("date").dt.day().alias("day"),
pl.col("duration").dt.total_seconds().alias("duration_seconds")
])

# Working with List types
df = pl.DataFrame({
"id": [1, 2, 3],
"scores": [[85, 90, 88], [92, 87, 95], [78, 82, 85]]
}, schema={
"id": pl.Int32,
"scores": pl.List(pl.Int32)
})
# Operations on lists
result = df.with_columns([
pl.col("scores").list.mean().alias("avg_score"),
pl.col("scores").list.max().alias("max_score"),
pl.col("scores").list.len().alias("num_scores")
])
# Working with Struct types
df = pl.DataFrame({
"person": [
{"name": "Alice", "age": 25, "city": "NYC"},
{"name": "Bob", "age": 30, "city": "LA"},
]
}, schema={
"person": pl.Struct([
pl.Field("name", pl.String),
pl.Field("age", pl.Int32),
pl.Field("city", pl.String)
])
})
# Access struct fields
result = df.with_columns([
pl.col("person").struct.field("name").alias("name"),
pl.col("person").struct.field("age").alias("age")
])

# Type casting
df = pl.DataFrame({
"int_col": [1, 2, 3],
"str_col": ["10", "20", "30"],
"float_col": [1.1, 2.2, 3.3]
})
# Cast between types
result = df.with_columns([
pl.col("int_col").cast(pl.Float64).alias("int_as_float"),
pl.col("str_col").cast(pl.Int32).alias("str_as_int"),
pl.col("float_col").cast(pl.String).alias("float_as_str")
])
# Safe casting with strict=False
result = df.with_columns([
pl.col("str_col").cast(pl.Int32, strict=False).alias("safe_cast")
])

# Define schema with validation
schema = pl.Schema({
"id": pl.Int64,
"name": pl.String,
"score": pl.Float64,
"category": pl.Categorical
})
# Create DataFrame with schema validation
df = pl.DataFrame({
"id": [1, 2, 3],
"name": ["Alice", "Bob", "Charlie"],
"score": [85.5, 92.0, 78.5],
"category": ["A", "B", "A"]
}, schema=schema)
# Schema overrides for specific columns
df = pl.DataFrame({
"values": ["1", "2", "3"]
}, schema_overrides={
"values": pl.Int32 # Override inferred String type
})

# Create categorical for memory efficiency
df = pl.DataFrame({
"id": [1, 2, 3, 4, 5],
"category": ["Small", "Large", "Medium", "Small", "Large"]
}, schema={
"category": pl.Categorical
})
# Enum with fixed categories
df = pl.DataFrame({
"size": ["S", "M", "L", "S", "M"]
}, schema={
"size": pl.Enum(["S", "M", "L", "XL"])
})
# Operations on categorical data
result = df.group_by("category").agg([
pl.col("id").count().alias("count")
])

# Financial calculations with exact precision
df = pl.DataFrame({
"amount": ["123.456789", "987.654321", "555.111222"]
}, schema={
"amount": pl.Decimal(precision=10, scale=6)
})
# Precise calculations
result = df.with_columns([
(pl.col("amount") * pl.lit("1.05")).alias("with_tax"),
pl.col("amount").round(2).alias("rounded")
])

Install with Tessl CLI
npx tessl i tessl/pypi-polars