Blazingly fast DataFrame library for Python with lazy and eager evaluation modes
—
Comprehensive type system supporting primitive types, temporal data, nested structures, and schema validation with automatic type inference, casting capabilities, and interoperability with Arrow and other data formats.
Fundamental numeric and boolean types with support for various precision levels and null value handling.
# Integer Types
Int8: DataType # 8-bit signed integer
Int16: DataType # 16-bit signed integer
Int32: DataType # 32-bit signed integer
Int64: DataType # 64-bit signed integer
Int128: DataType # 128-bit signed integer
# Unsigned Integer Types
UInt8: DataType # 8-bit unsigned integer
UInt16: DataType # 16-bit unsigned integer
UInt32: DataType # 32-bit unsigned integer
UInt64: DataType # 64-bit unsigned integer
# Floating Point Types
Float32: DataType # 32-bit floating point
Float64: DataType # 64-bit floating point
# Decimal Type
Decimal: DataType # High-precision decimal type
# Boolean Type
Boolean: DataType # Boolean true/false

Text and binary data types with categorical optimization and encoding support.
# String Types
String: DataType # UTF-8 string type
Utf8: DataType # Alias for String (deprecated)
# Binary Type
Binary: DataType # Binary data type
# Categorical Types
Categorical: DataType # Categorical string type for efficiency
Enum: DataType # Enumerated string type with fixed categories
# Special Types
Null: DataType # Null type
Unknown: DataType # Unknown type placeholder
Object: DataType # Python object type

Date, time, and duration types with timezone support and various precision levels.
# Date and Time Types
Date: DataType # Date type (days since epoch)
Time: DataType # Time of day type
Duration: DataType # Time duration type
# DateTime Type with timezone support
Datetime: DataType # DateTime with optional timezone
# DateTime constructor
def Datetime(time_unit="us", time_zone=None) -> DataType:
"""
Create datetime type with specified precision and timezone.
Parameters:
- time_unit: Precision ("ns", "us", "ms")
- time_zone: Timezone string (e.g., "UTC", "America/New_York")
Returns:
Datetime data type
"""
# Duration constructor
def Duration(time_unit="us") -> DataType:
"""
Create duration type with specified precision.
Parameters:
- time_unit: Precision ("ns", "us", "ms")
Returns:
Duration data type
"""Complex nested structures supporting lists, arrays, and structured data.
# List Type (variable length)
List: DataType
def List(inner=None) -> DataType:
"""
Create list type with specified inner type.
Parameters:
- inner: Inner data type for list elements
Returns:
List data type
"""
# Array Type (fixed length)
Array: DataType
def Array(inner=None, width=None) -> DataType:
"""
Create array type with specified inner type and width.
Parameters:
- inner: Inner data type for array elements
- width: Fixed width of array
Returns:
Array data type
"""
# Struct Type
Struct: DataType
def Struct(fields=None) -> DataType:
"""
Create struct type with specified fields.
Parameters:
- fields: List of Field objects or dict mapping names to types
Returns:
Struct data type
"""Schema definition and validation with field specifications and type checking.
class Schema:
def __init__(self, schema=None):
"""
Create schema from various inputs.
Parameters:
- schema: Dict mapping column names to types, list of Field objects, or existing Schema
"""
def names(self) -> list[str]:
"""Get column names in schema order."""
def dtypes(self) -> list[DataType]:
"""Get column data types in schema order."""
def len(self) -> int:
"""Get number of columns in schema."""
def __contains__(self, item) -> bool:
"""Check if column name exists in schema."""
def __getitem__(self, item) -> DataType:
"""Get data type for column name."""
def __iter__(self):
"""Iterate over (name, dtype) pairs."""
class Field:
def __init__(self, name: str, dtype: DataType):
"""
Create field definition.
Parameters:
- name: Field name
- dtype: Field data type
"""
@property
def name(self) -> str:
"""Field name."""
@property
def dtype(self) -> DataType:
"""Field data type."""Functions for type inspection, validation, and conversion operations.
def dtype_of(value) -> DataType:
"""
Get the data type of a value or expression.
Parameters:
- value: Value or expression to inspect
Returns:
Data type of the value
"""
class DataType:
def __eq__(self, other) -> bool:
"""Check type equality."""
def __ne__(self, other) -> bool:
"""Check type inequality."""
def __hash__(self) -> int:
"""Hash for use in sets/dicts."""
def __repr__(self) -> str:
"""String representation."""
def is_numeric(self) -> bool:
"""Check if type is numeric."""
def is_integer(self) -> bool:
"""Check if type is integer."""
def is_float(self) -> bool:
"""Check if type is floating point."""
def is_temporal(self) -> bool:
"""Check if type is temporal."""
def is_nested(self) -> bool:
"""Check if type is nested (List, Array, Struct)."""Categorical and enumerated types for memory-efficient string handling with optional ordering.
def Categorical(ordering=None) -> DataType:
"""
Create categorical type.
Parameters:
- ordering: Ordering type ("physical" or "lexical")
Returns:
Categorical data type
"""
def Enum(categories=None) -> DataType:
"""
Create enum type with fixed categories.
Parameters:
- categories: List of valid category strings
Returns:
Enum data type
"""
class Categories:
def __init__(self, categories=None):
"""
Create categories definition.
Parameters:
- categories: List of category strings
"""High-precision decimal type for financial and scientific calculations requiring exact decimal representation.
def Decimal(precision=None, scale=0) -> DataType:
"""
Create decimal type with specified precision and scale.
Parameters:
- precision: Total number of digits (default: inferred)
- scale: Number of digits after decimal point
Returns:
Decimal data type
"""import polars as pl
# Create DataFrame with explicit types
df = pl.DataFrame({
"id": [1, 2, 3],
"price": [10.5, 20.0, 15.75],
"category": ["A", "B", "A"],
"date": ["2023-01-01", "2023-01-02", "2023-01-03"]
}, schema={
"id": pl.Int32,
"price": pl.Float64,
"category": pl.Categorical,
"date": pl.Date
})
# Check schema
print(df.schema)
print(df.dtypes)

# Create datetime with timezone
dt_type = pl.Datetime("ms", "UTC")
# Create DataFrame with temporal types
df = pl.DataFrame({
"timestamp": ["2023-01-01T10:30:00", "2023-01-01T11:45:00"],
"date": ["2023-01-01", "2023-01-02"],
"duration": ["1h 30m", "2h 15m"]
}, schema={
"timestamp": pl.Datetime("ms", "UTC"),
"date": pl.Date,
"duration": pl.Duration("ms")
})
# Convert and work with temporal data
result = df.with_columns([
pl.col("timestamp").dt.hour().alias("hour"),
pl.col("date").dt.day().alias("day"),
pl.col("duration").dt.total_seconds().alias("duration_seconds")
])

# Working with List types
df = pl.DataFrame({
"id": [1, 2, 3],
"scores": [[85, 90, 88], [92, 87, 95], [78, 82, 85]]
}, schema={
"id": pl.Int32,
"scores": pl.List(pl.Int32)
})
# Operations on lists
result = df.with_columns([
pl.col("scores").list.mean().alias("avg_score"),
pl.col("scores").list.max().alias("max_score"),
pl.col("scores").list.len().alias("num_scores")
])
# Working with Struct types
df = pl.DataFrame({
"person": [
{"name": "Alice", "age": 25, "city": "NYC"},
{"name": "Bob", "age": 30, "city": "LA"},
]
}, schema={
"person": pl.Struct([
pl.Field("name", pl.String),
pl.Field("age", pl.Int32),
pl.Field("city", pl.String)
])
})
# Access struct fields
result = df.with_columns([
pl.col("person").struct.field("name").alias("name"),
pl.col("person").struct.field("age").alias("age")
])

# Type casting
df = pl.DataFrame({
"int_col": [1, 2, 3],
"str_col": ["10", "20", "30"],
"float_col": [1.1, 2.2, 3.3]
})
# Cast between types
result = df.with_columns([
pl.col("int_col").cast(pl.Float64).alias("int_as_float"),
pl.col("str_col").cast(pl.Int32).alias("str_as_int"),
pl.col("float_col").cast(pl.String).alias("float_as_str")
])
# Safe casting with strict=False
result = df.with_columns([
pl.col("str_col").cast(pl.Int32, strict=False).alias("safe_cast")
])

# Define schema with validation
schema = pl.Schema({
"id": pl.Int64,
"name": pl.String,
"score": pl.Float64,
"category": pl.Categorical
})
# Create DataFrame with schema validation
df = pl.DataFrame({
"id": [1, 2, 3],
"name": ["Alice", "Bob", "Charlie"],
"score": [85.5, 92.0, 78.5],
"category": ["A", "B", "A"]
}, schema=schema)
# Schema overrides for specific columns
df = pl.DataFrame({
"values": ["1", "2", "3"]
}, schema_overrides={
"values": pl.Int32 # Override inferred String type
})

# Create categorical for memory efficiency
df = pl.DataFrame({
"id": [1, 2, 3, 4, 5],
"category": ["Small", "Large", "Medium", "Small", "Large"]
}, schema={
"category": pl.Categorical
})
# Enum with fixed categories
df = pl.DataFrame({
"size": ["S", "M", "L", "S", "M"]
}, schema={
"size": pl.Enum(["S", "M", "L", "XL"])
})
# Operations on categorical data
result = df.group_by("category").agg([
pl.col("id").count().alias("count")
])

# Financial calculations with exact precision
df = pl.DataFrame({
"amount": ["123.456789", "987.654321", "555.111222"]
}, schema={
"amount": pl.Decimal(precision=10, scale=6)
})
# Precise calculations
result = df.with_columns([
(pl.col("amount") * pl.lit("1.05")).alias("with_tax"),
pl.col("amount").round(2).alias("rounded")
])

Install with Tessl CLI
npx tessl i tessl/pypi-polars