Blazingly fast DataFrame library with 64-bit index support for handling datasets with more than 4.2 billion rows
—
Comprehensive type system with numeric, text, temporal, and nested types, plus schema definition and validation capabilities. Polars provides a rich type system that ensures data integrity and enables optimized operations.
Integer and floating-point data types with various precision levels.
# Signed integers
class Int8:
"""8-bit signed integer (-128 to 127)."""
class Int16:
"""16-bit signed integer (-32,768 to 32,767)."""
class Int32:
"""32-bit signed integer (-2^31 to 2^31-1)."""
class Int64:
"""64-bit signed integer (-2^63 to 2^63-1)."""
class Int128:
"""128-bit signed integer."""
# Unsigned integers
class UInt8:
"""8-bit unsigned integer (0 to 255)."""
class UInt16:
"""16-bit unsigned integer (0 to 65,535)."""
class UInt32:
"""32-bit unsigned integer (0 to 2^32-1)."""
class UInt64:
"""64-bit unsigned integer (0 to 2^64-1)."""
# Floating point
class Float32:
"""32-bit floating point number."""
class Float64:
"""64-bit floating point number."""
class Decimal:
"""Arbitrary precision decimal number."""
def __init__(self, precision: int | None = None, scale: int = 0):
"""
Create decimal type.
Parameters:
- precision: Number of significant digits
- scale: Number of decimal places
"""String and binary data types for text processing.
class String:
"""UTF-8 encoded string data (variable length)."""
class Utf8:
"""UTF-8 encoded string data (alias for String)."""
class Binary:
"""Binary data (bytes)."""Date, time, and duration types for temporal data processing.
class Date:
"""Calendar date (year, month, day)."""
class Datetime:
"""Date and time with optional timezone."""
def __init__(self, time_unit: TimeUnit = "us", time_zone: str | None = None):
"""
Create datetime type.
Parameters:
- time_unit: Time precision ("ns", "us", "ms", "s")
- time_zone: Timezone (e.g., "UTC", "America/New_York")
"""
class Time:
"""Time of day (hour, minute, second, subsecond)."""
class Duration:
"""Time duration/interval."""
def __init__(self, time_unit: TimeUnit = "us"):
"""
Create duration type.
Parameters:
- time_unit: Time precision ("ns", "us", "ms", "s")
"""Boolean values and special data types.
class Boolean:
"""Boolean true/false values."""
class Null:
"""Null type (no data)."""
class Unknown:
"""Unknown type placeholder."""
class Object:
"""Python object type (stores arbitrary Python objects)."""Types for categorical and enumerated data with optimized storage.
class Categorical:
"""Categorical data with string categories."""
def __init__(self, ordering: CategoricalOrdering = "physical"):
"""
Create categorical type.
Parameters:
- ordering: Category ordering ("physical" or "lexical")
"""
class Enum:
"""Enumerated type with fixed set of string values."""
def __init__(self, categories: list[str] | Series):
"""
Create enum type.
Parameters:
- categories: Fixed list of valid string values
"""
class Categories:
"""Categories metadata for categorical types."""Complex nested data structures including lists, arrays, and structs.
class List:
"""Variable-length list of same-typed elements."""
def __init__(self, inner: DataType):
"""
Create list type.
Parameters:
- inner: Element data type
"""
class Array:
"""Fixed-length array of same-typed elements."""
def __init__(self, inner: DataType, shape: int | tuple[int, ...]):
"""
Create array type.
Parameters:
- inner: Element data type
- shape: Array dimensions
"""
class Struct:
"""Struct/record type with named fields."""
def __init__(self, fields: list[Field] | dict[str, DataType]):
"""
Create struct type.
Parameters:
- fields: List of Field objects or dict mapping names to types
"""
class Field:
"""Named field in struct type."""
def __init__(self, name: str, dtype: DataType):
"""
Create field.
Parameters:
- name: Field name
- dtype: Field data type
"""Schema class for defining and validating DataFrame structure.
class Schema:
def __init__(self, schema: Mapping[str, DataType] | Iterable[tuple[str, DataType]] | None = None):
"""
Create schema.
Parameters:
- schema: Mapping of column names to data types
"""
def __getitem__(self, item: str) -> DataType:
"""Get data type for column."""
def __contains__(self, item: str) -> bool:
"""Check if column exists in schema."""
def __iter__(self) -> Iterator[str]:
"""Iterate over column names."""
def __len__(self) -> int:
"""Get number of columns."""
def names(self) -> list[str]:
"""Get all column names."""
def dtypes(self) -> list[DataType]:
"""Get all data types."""
def to_python(self) -> dict[str, type]:
"""Convert to Python type mapping."""Utility functions for working with data types.
def dtype_to_py_type(dtype: DataType) -> type:
"""
Convert Polars data type to Python type.
Parameters:
- dtype: Polars data type
Returns:
Corresponding Python type
"""
def is_polars_dtype(dtype: Any) -> bool:
"""
Check if object is a Polars data type.
Parameters:
- dtype: Object to check
Returns:
True if Polars data type
"""
def py_type_to_constructor(py_type: type) -> DataType:
"""
Get Polars constructor for Python type.
Parameters:
- py_type: Python type
Returns:
Polars data type constructor
"""
def numpy_char_code_to_dtype(char_code: str) -> DataType | None:
"""
Convert NumPy character code to Polars data type.
Parameters:
- char_code: NumPy dtype character code
Returns:
Polars data type or None
"""
def unpack_dtypes(*dtypes: DataType | Iterable[DataType]) -> list[DataType]:
"""
Unpack and flatten data type specifications.
Parameters:
- dtypes: Data type specifications
Returns:
Flattened list of data types
"""Type groups and constants for working with related data types.
class IntegerType:
"""Base class for integer types."""
class TemporalType:
"""Base class for temporal types."""
class DataTypeClass:
"""Metaclass for data type classes."""
# Constants
N_INFER_DEFAULT: int # Default number of rows for type inference
DTYPE_TEMPORAL_UNITS: frozenset[str] # Valid temporal units
import polars as pl
# Creating DataFrames with explicit types
# The `schema` argument pins each column's dtype up front instead of
# relying on type inference from the Python values.
df = pl.DataFrame({
"id": [1, 2, 3],
"name": ["Alice", "Bob", "Charlie"],
"salary": [50000.0, 60000.0, 70000.0],
"is_active": [True, False, True]
}, schema={
"id": pl.Int32,
"name": pl.String,
"salary": pl.Float64,
"is_active": pl.Boolean
})
# Schema inspection: `.schema` maps column names to dtypes, `.dtypes` lists dtypes only
print(df.schema)
print(df.dtypes)
# Creating datetime columns with different precisions
# Parse string columns into temporal dtypes with str.strptime.
# pl.Datetime takes a time-unit argument ("ns", "us", "ms", "s"),
# matching the Datetime constructor documented above.
df = pl.DataFrame({
"timestamp_us": ["2023-01-01 12:00:00"],
"timestamp_ms": ["2023-01-01 12:00:00"],
"date_only": ["2023-01-01"],
"time_only": ["12:00:00"]
}).with_columns([
pl.col("timestamp_us").str.strptime(pl.Datetime("us")),
pl.col("timestamp_ms").str.strptime(pl.Datetime("ms")),
pl.col("date_only").str.strptime(pl.Date),
pl.col("time_only").str.strptime(pl.Time)
])
# Working with timezones
df_tz = pl.DataFrame({
"utc_time": ["2023-01-01 12:00:00"]
}).with_columns([
pl.col("utc_time").str.strptime(pl.Datetime("us", "UTC"))
])
# Categorical data
# Cast a string column to the Categorical dtype
# (repeated values share dictionary-encoded categories).
df = pl.DataFrame({
"category": ["A", "B", "A", "C", "B"]
}).with_columns([
pl.col("category").cast(pl.Categorical)
])
# Enum with fixed categories
df = pl.DataFrame({
"status": ["active", "inactive", "pending"]
}).with_columns([
pl.col("status").cast(pl.Enum(["active", "inactive", "pending"]))
])
# List columns
# List columns: nested Python lists infer a List dtype; the inner
# element type here is inferred as Int64.
df = pl.DataFrame({
"numbers": [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
})
print(df.dtypes) # [List(Int64)]
# Struct columns: dicts become Struct values with one Field per key
df = pl.DataFrame({
"person": [
{"name": "Alice", "age": 25},
{"name": "Bob", "age": 30}
]
})
print(df.dtypes) # [Struct([Field('name', String), Field('age', Int64)])]
# Creating nested types explicitly
schema = pl.Schema({
"id": pl.Int32,
"scores": pl.List(pl.Float64),
"metadata": pl.Struct([
pl.Field("created_at", pl.Datetime),
pl.Field("version", pl.String)
])
})
df = pl.DataFrame({
"text_numbers": ["1", "2", "3"],
"floats": [1.0, 2.0, 3.0]
})
# Cast to different types
result = df.with_columns([
pl.col("text_numbers").cast(pl.Int32).alias("integers"),
pl.col("floats").cast(pl.Int64).alias("rounded")
])
# Safe casting with error handling
result = df.with_columns([
pl.col("text_numbers").cast(pl.Int32, strict=False).alias("safe_cast")
])
# Define expected schema
expected_schema = pl.Schema({
"id": pl.Int32,
"name": pl.String,
"amount": pl.Float64,
"timestamp": pl.Datetime("us")
})
# Read with schema validation
df = pl.read_csv("data.csv", schema=expected_schema)
# Override specific types
df = pl.read_csv("data.csv", schema_overrides={
"id": pl.String, # Read ID as string instead of number
"amount": pl.Decimal(10, 2) # Use decimal for precise amounts
})
# High precision decimal calculations
df = pl.DataFrame({
"price": ["19.99", "29.99", "9.95"]
}).with_columns([
pl.col("price").cast(pl.Decimal(10, 2))
])
# Financial calculations maintaining precision
result = df.with_columns([
(pl.col("price") * pl.lit("1.08")).alias("with_tax"),
(pl.col("price") * pl.lit("0.9")).alias("discounted")
])
# Check data types
df = pl.DataFrame({"mixed": [1, 2.5, "text"]})
print(pl.dtype_to_py_type(df.dtypes[0]))
# Type checking
schema = df.schema
for name, dtype in schema.items():
print(f"{name}: {dtype}")
if isinstance(dtype, pl.List):
print(f" List element type: {dtype.inner}")
elif isinstance(dtype, pl.Struct):
print(f" Struct fields: {dtype.fields}")Install with Tessl CLI
npx tessl i tessl/pypi-polars-u64-idx