Blazingly fast DataFrame library for legacy CPUs without AVX2 support
—
Specialized namespaces for working with different data types including string operations (.str), datetime operations (.dt), list operations (.list), array operations (.arr), struct operations (.struct), categorical operations (.cat), binary operations (.bin), name operations (.name), and metadata operations (.meta).
String operations available on both Expr and Series for text processing and manipulation.
# Available as expr.str.method() and series.str.method()
def contains(
pattern: str | Expr,
*,
literal: bool = False,
strict: bool = True
) -> Expr:
"""Check if string contains pattern."""
def ends_with(suffix: str | Expr) -> Expr:
"""Check if string ends with suffix."""
def starts_with(prefix: str | Expr) -> Expr:
"""Check if string starts with prefix."""
def extract(pattern: str, group_index: int = 1) -> Expr:
"""Extract regex capture group."""
def extract_all(pattern: str) -> Expr:
"""Extract all regex matches."""
def find(pattern: str, *, literal: bool = False) -> Expr:
"""Find first occurrence of pattern."""
def replace(pattern: str, value: str, *, literal: bool = False, n: int = 1) -> Expr:
"""Replace pattern with value."""
def replace_all(pattern: str, value: str, *, literal: bool = False) -> Expr:
"""Replace all occurrences of pattern."""
def slice(offset: int, length: int | None = None) -> Expr:
"""Extract substring by position."""
def head(n: int = 5) -> Expr:
"""Get first n characters."""
def tail(n: int = 5) -> Expr:
"""Get last n characters."""
def to_lowercase() -> Expr:
"""Convert to lowercase."""
def to_uppercase() -> Expr:
"""Convert to uppercase."""
def to_titlecase() -> Expr:
"""Convert to title case."""
def strip_chars(characters: str | None = None) -> Expr:
"""Remove characters from both ends."""
def strip_chars_start(characters: str | None = None) -> Expr:
"""Remove characters from start."""
def strip_chars_end(characters: str | None = None) -> Expr:
"""Remove characters from end."""
def zfill(width: int) -> Expr:
"""Pad with zeros to specified width."""
def pad_start(width: int, fillchar: str = " ") -> Expr:
"""Pad string to width from start."""
def pad_end(width: int, fillchar: str = " ") -> Expr:
"""Pad string to width from end."""
def len_bytes() -> Expr:
"""Get byte length of strings."""
def len_chars() -> Expr:
"""Get character length of strings."""
def n_chars() -> Expr:
"""Alias for len_chars."""
def concat(delimiter: str = "") -> Expr:
"""Concatenate strings in list/array."""
def explode() -> Expr:
"""Split string into characters."""
def split(by: str, *, inclusive: bool = False) -> Expr:
"""Split string by delimiter."""
def split_exact(by: str, n: int, *, inclusive: bool = False) -> Expr:
"""Split string into exactly n parts."""
def splitn(by: str, n: int) -> Expr:
"""Split string into at most n parts."""
def json_decode(dtype: type | None = None, *, infer_schema_length: int | None = None) -> Expr:
"""Parse JSON strings."""
def json_path_match(json_path: str) -> Expr:
"""Extract JSON values using JSONPath."""
def encode(encoding: str = "utf8") -> Expr:
"""Encode strings to bytes."""
def decode(encoding: str = "utf8", *, strict: bool = True) -> Expr:
"""Decode bytes to strings."""
def to_integer(*, base: int = 10, strict: bool = True) -> Expr:
"""Parse strings as integers."""
def to_decimal(*, infer_length: int = 10) -> Expr:
"""Parse strings as decimals."""
def strftime(format: str) -> Expr:
"""Format datetime as string."""
def strptime(
dtype: type,
format: str | None = None,
*,
strict: bool = True,
exact: bool = True,
cache: bool = True
) -> Expr:
"""Parse strings as datetime."""

DateTime operations for temporal data manipulation and extraction.
# Available as expr.dt.method() and series.dt.method()
def year() -> Expr:
"""Extract year."""
def month() -> Expr:
"""Extract month."""
def day() -> Expr:
"""Extract day."""
def hour() -> Expr:
"""Extract hour."""
def minute() -> Expr:
"""Extract minute."""
def second() -> Expr:
"""Extract second."""
def microsecond() -> Expr:
"""Extract microsecond."""
def nanosecond() -> Expr:
"""Extract nanosecond."""
def weekday() -> Expr:
"""Get weekday (Monday=1, Sunday=7)."""
def week() -> Expr:
"""Get ISO week number."""
def ordinal_day() -> Expr:
"""Get day of year (1-366)."""
def quarter() -> Expr:
"""Get quarter (1-4)."""
def date() -> Expr:
"""Extract date part."""
def time() -> Expr:
"""Extract time part."""
def epoch(time_unit: str = "us") -> Expr:
"""Convert to epoch timestamp."""
def timestamp(time_unit: str = "us") -> Expr:
"""Get timestamp."""
def with_time_unit(time_unit: str) -> Expr:
"""Change time unit."""
def cast_time_unit(time_unit: str) -> Expr:
"""Cast to different time unit."""
def convert_time_zone(time_zone: str) -> Expr:
"""Convert to different timezone."""
def replace_time_zone(
time_zone: str | None,
*,
ambiguous: str = "raise",
non_existent: str = "raise"
) -> Expr:
"""Replace timezone without conversion."""
def truncate(every: str) -> Expr:
"""Truncate to time interval."""
def round(every: str) -> Expr:
"""Round to nearest time interval."""
def strftime(format: str) -> Expr:
"""Format as string."""
def to_string(format: str) -> Expr:
"""Convert to string with format."""
def days() -> Expr:
"""Extract days from duration."""
def hours() -> Expr:
"""Extract hours from duration."""
def minutes() -> Expr:
"""Extract minutes from duration."""
def seconds() -> Expr:
"""Extract seconds from duration."""
def milliseconds() -> Expr:
"""Extract milliseconds from duration."""
def microseconds() -> Expr:
"""Extract microseconds from duration."""
def nanoseconds() -> Expr:
"""Extract nanoseconds from duration."""
def total_days() -> Expr:
"""Get total days in duration."""
def total_hours() -> Expr:
"""Get total hours in duration."""
def total_minutes() -> Expr:
"""Get total minutes in duration."""
def total_seconds() -> Expr:
"""Get total seconds in duration."""
def total_milliseconds() -> Expr:
"""Get total milliseconds in duration."""
def total_microseconds() -> Expr:
"""Get total microseconds in duration."""
def total_nanoseconds() -> Expr:
"""Get total nanoseconds in duration."""
def offset_by(by: str) -> Expr:
"""Offset datetime by interval."""
def is_between(
start: datetime | date | str | Expr,
end: datetime | date | str | Expr,
closed: str = "both"
) -> Expr:
"""Check if datetime is in range."""

Operations for working with list-type columns.
# Available as expr.list.method() and series.list.method()
def len() -> Expr:
"""Get length of lists."""
def sum() -> Expr:
"""Sum elements in each list."""
def max() -> Expr:
"""Get maximum element in each list."""
def min() -> Expr:
"""Get minimum element in each list."""
def mean() -> Expr:
"""Get mean of elements in each list."""
def sort(*, descending: bool = False, nulls_last: bool = False) -> Expr:
"""Sort elements in each list."""
def reverse() -> Expr:
"""Reverse order of elements in each list."""
def unique(*, maintain_order: bool = False) -> Expr:
"""Get unique elements in each list."""
def n_unique() -> Expr:
"""Count unique elements in each list."""
def get(index: int | Expr, *, null_on_oob: bool = True) -> Expr:
"""Get element at index."""
def first() -> Expr:
"""Get first element."""
def last() -> Expr:
"""Get last element."""
def head(n: int = 5) -> Expr:
"""Get first n elements."""
def tail(n: int = 5) -> Expr:
"""Get last n elements."""
def slice(offset: int, length: int | None = None) -> Expr:
"""Slice lists."""
def explode() -> Expr:
"""Explode list elements to separate rows."""
def contains(item: Any) -> Expr:
"""Check if lists contain item."""
def join(separator: str, *, ignore_nulls: bool = True) -> Expr:
"""Join list elements into string."""
def arg_min() -> Expr:
"""Get index of minimum element."""
def arg_max() -> Expr:
"""Get index of maximum element."""
def diff(n: int = 1, null_behavior: str = "ignore") -> Expr:
"""Calculate differences between consecutive elements."""
def shift(n: int = 1, *, fill_value: Any = None) -> Expr:
"""Shift elements by n positions."""
def drop_nulls() -> Expr:
"""Remove null values from lists."""
def sample(
n: int | None = None,
*,
fraction: float | None = None,
with_replacement: bool = False,
shuffle: bool = False,
seed: int | None = None
) -> Expr:
"""Sample elements from lists."""
def count_matches(element: Any, *, parallel: bool = False) -> Expr:
"""Count occurrences of element."""
def to_array(width: int) -> Expr:
"""Convert to array with fixed width."""
def to_struct(
n_field_strategy: str = "first_non_null",
fields: Callable[[int], str] | Sequence[str] | None = None
) -> Expr:
"""Convert to struct."""
def eval(expr: Expr, *, parallel: bool = False) -> Expr:
"""Evaluate expression on list elements."""
def all() -> Expr:
"""Check if all elements are true."""
def any() -> Expr:
"""Check if any elements are true."""

Operations for working with fixed-size array columns.
# Available as expr.arr.method() and series.arr.method()
def min() -> Expr:
"""Get minimum element in each array."""
def max() -> Expr:
"""Get maximum element in each array."""
def sum() -> Expr:
"""Sum elements in each array."""
def unique(*, maintain_order: bool = False) -> Expr:
"""Get unique elements in each array."""
def to_list() -> Expr:
"""Convert to list type."""
def get(index: int | Expr, *, null_on_oob: bool = True) -> Expr:
"""Get element at index."""
def first() -> Expr:
"""Get first element."""
def last() -> Expr:
"""Get last element."""
def join(separator: str, *, ignore_nulls: bool = True) -> Expr:
"""Join array elements into string."""
def contains(item: Any) -> Expr:
"""Check if arrays contain item."""
def count_matches(element: Any) -> Expr:
"""Count occurrences of element."""
def reverse() -> Expr:
"""Reverse order of elements."""
def shift(n: int = 1, *, fill_value: Any = None) -> Expr:
"""Shift elements by n positions."""
def slice(offset: int, length: int | None = None) -> Expr:
"""Slice arrays."""
def explode() -> Expr:
"""Explode array elements to separate rows."""
def all() -> Expr:
"""Check if all elements are true."""
def any() -> Expr:
"""Check if any elements are true."""
def sort(*, descending: bool = False, nulls_last: bool = False) -> Expr:
"""Sort elements in each array."""
def arg_min() -> Expr:
"""Get index of minimum element."""
def arg_max() -> Expr:
"""Get index of maximum element."""
def eval(expr: Expr, *, parallel: bool = False) -> Expr:
"""Evaluate expression on array elements."""

Operations for working with structured/nested data.
# Available as expr.struct.method() and series.struct.method()
def field(name: str) -> Expr:
"""Extract field by name."""
def rename_fields(names: list[str]) -> Expr:
"""Rename struct fields."""
def json_encode() -> Expr:
"""Encode struct as JSON string."""
def with_fields(*exprs: Expr) -> Expr:
"""Add or update struct fields."""
def n_fields() -> int:
"""Get number of fields."""
def fields() -> list[str]:
"""Get field names."""
def schema() -> dict[str, type]:
"""Get struct schema."""
def to_frame() -> DataFrame:
"""Convert struct Series to DataFrame."""

Operations for categorical data types.
# Available as expr.cat.method() and series.cat.method()
def get_categories() -> Expr:
"""Get categorical categories."""
def len_bytes() -> Expr:
"""Get byte length of category strings."""
def len_chars() -> Expr:
"""Get character length of category strings."""
def set_ordering(ordering: str) -> Expr:
"""Set categorical ordering ('physical' or 'lexical')."""
def get_ordering() -> str:
"""Get current categorical ordering."""
def to_local() -> Expr:
"""Convert to local categorical."""

Operations for binary data types.
# Available as expr.bin.method() and series.bin.method()
def contains(literal: bytes) -> Expr:
"""Check if binary contains literal bytes."""
def ends_with(suffix: bytes) -> Expr:
"""Check if binary ends with suffix."""
def starts_with(prefix: bytes) -> Expr:
"""Check if binary starts with prefix."""
def decode(encoding: str = "utf8", *, strict: bool = True) -> Expr:
"""Decode binary to string."""
def encode(encoding: str = "utf8") -> Expr:
"""Encode string to binary."""
def size() -> Expr:
"""Get size of binary data in bytes."""

Operations for working with expression and column names.
# Available as expr.name.method()
def keep() -> Expr:
"""Keep original column name."""
def map(function: Callable[[str], str]) -> Expr:
"""Apply function to column name."""
def prefix(prefix: str) -> Expr:
"""Add prefix to column name."""
def suffix(suffix: str) -> Expr:
"""Add suffix to column name."""
def to_lowercase() -> Expr:
"""Convert column name to lowercase."""
def to_uppercase() -> Expr:
"""Convert column name to uppercase."""

Metadata operations for expressions.
# Available as expr.meta.method()
def eq(other: Expr) -> bool:
"""Check expression equality."""
def ne(other: Expr) -> bool:
"""Check expression inequality."""
def has_multiple_outputs() -> bool:
"""Check if expression produces multiple columns."""
def is_column() -> bool:
"""Check if expression is a column reference."""
def is_regex_projection() -> bool:
"""Check if expression is a regex column selection."""
def output_name() -> str | None:
"""Get output column name if determinable."""
def pop() -> list[Expr]:
"""Pop and return child expressions."""
def root_names() -> list[str]:
"""Get root column names used by expression."""
def tree_format(*, return_as_string: bool = False) -> str | None:
"""Display expression tree structure."""
def undo_aliases() -> Expr:
"""Remove aliases from expression."""
def write_json(file: IOBase) -> None:
"""Write expression as JSON."""

import polars as pl
df = pl.DataFrame({
"text": ["Hello World", "POLARS rocks", " data science "],
"emails": ["user@example.com", "admin@test.org", "info@company.net"]
})
result = df.select([
pl.col("text").str.to_lowercase().alias("lower"),
pl.col("text").str.len_chars().alias("length"),
pl.col("text").str.strip_chars().alias("stripped"),
pl.col("emails").str.extract(r"@(.+)").alias("domain"),
pl.col("text").str.contains("data").alias("has_data")
])
# Advanced string operations
processed = df.select([
pl.col("text").str.split(" ").alias("words"),
pl.col("text").str.replace("World", "Universe").alias("replaced"),
pl.col("emails").str.starts_with("admin").alias("is_admin")
])

df_dates = pl.DataFrame({
"timestamp": pl.datetime_range(
pl.datetime(2023, 1, 1),
pl.datetime(2023, 12, 31),
"1mo",
eager=True
)
})
result = df_dates.select([
pl.col("timestamp"),
pl.col("timestamp").dt.year().alias("year"),
pl.col("timestamp").dt.month().alias("month"),
pl.col("timestamp").dt.quarter().alias("quarter"),
pl.col("timestamp").dt.weekday().alias("weekday"),
pl.col("timestamp").dt.strftime("%Y-%m-%d").alias("formatted"),
pl.col("timestamp").dt.truncate("1w").alias("week_start")
])
# Duration operations
# pl.datetime() builds a lazy polars expression rather than a datetime value,
# so it cannot be used as a cell inside a column literal. Plain stdlib datetime
# objects give real Datetime columns that support subtraction and .dt methods.
from datetime import datetime

df_duration = pl.DataFrame({
    "start": [datetime(2023, 1, 1), datetime(2023, 6, 1)],
    "end": [datetime(2023, 1, 15), datetime(2023, 6, 30)],
})
duration_result = df_duration.select([
(pl.col("end") - pl.col("start")).alias("duration"),
(pl.col("end") - pl.col("start")).dt.total_days().alias("total_days")
])

df_lists = pl.DataFrame({
"numbers": [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
"words": [["hello", "world"], ["polars", "rocks"], ["data", "science"]]
})
result = df_lists.select([
pl.col("numbers").list.len().alias("count"),
pl.col("numbers").list.sum().alias("sum"),
pl.col("numbers").list.max().alias("max"),
pl.col("numbers").list.get(0).alias("first"),
pl.col("words").list.join(" ").alias("joined"),
pl.col("numbers").list.contains(5).alias("has_five")
])
# List transformations
transformed = df_lists.select([
pl.col("numbers").list.sort().alias("sorted"),
pl.col("numbers").list.reverse().alias("reversed"),
pl.col("numbers").list.unique().alias("unique"),
pl.col("numbers").list.slice(1, 2).alias("middle")
])

df_struct = pl.DataFrame({
"person": [
{"name": "Alice", "age": 25, "city": "NYC"},
{"name": "Bob", "age": 30, "city": "LA"},
{"name": "Charlie", "age": 35, "city": "Chicago"}
]
})
result = df_struct.select([
pl.col("person").struct.field("name").alias("name"),
pl.col("person").struct.field("age").alias("age"),
pl.col("person").struct.field("city").alias("city")
])
# Struct modifications
modified = df_struct.select([
pl.col("person").struct.with_fields([
pl.col("person").struct.field("age").add(1).alias("age")
]).alias("person_older")
])

df_cat = pl.DataFrame({
"category": ["A", "B", "A", "C", "B", "A"]
}).with_columns(
pl.col("category").cast(pl.Categorical).alias("category")
)
result = df_cat.select([
pl.col("category"),
pl.col("category").cat.get_categories().alias("categories"),
pl.col("category").cat.len_chars().alias("category_length")
])df_binary = pl.DataFrame({
"data": [b"hello", b"world", b"polars"]
})
result = df_binary.select([
pl.col("data"),
pl.col("data").bin.size().alias("size"),
pl.col("data").bin.decode().alias("decoded"),
pl.col("data").bin.starts_with(b"hel").alias("starts_with_hel")
])# Create complex expression
expr = pl.col("value").filter(pl.col("category") == "A").sum().over("group")
# Examine expression metadata
print(f"Output name: {expr.meta.output_name()}")
print(f"Root names: {expr.meta.root_names()}")
print(f"Has multiple outputs: {expr.meta.has_multiple_outputs()}")
print(f"Is column: {expr.meta.is_column()}")
# Display expression tree
print(expr.meta.tree_format(return_as_string=True))

# Complex text processing with multiple namespaces
text_df = pl.DataFrame({
"logs": [
'{"timestamp": "2023-01-01T10:00:00", "level": "INFO", "message": "System started"}',
'{"timestamp": "2023-01-01T10:05:00", "level": "ERROR", "message": "Connection failed"}',
'{"timestamp": "2023-01-01T10:10:00", "level": "INFO", "message": "System recovered"}'
]
})
processed_logs = text_df.select([
pl.col("logs").str.json_path_match("$.timestamp").alias("timestamp_str"),
pl.col("logs").str.json_path_match("$.level").alias("level"),
pl.col("logs").str.json_path_match("$.message").alias("message")
]).with_columns([
pl.col("timestamp_str").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S").alias("timestamp")
]).select([
pl.col("timestamp").dt.hour().alias("hour"),
pl.col("level"),
pl.col("message"),
pl.col("message").str.len_chars().alias("message_length")
])
# Multi-level list and struct operations
nested_df = pl.DataFrame({
"data": [
[{"values": [1, 2, 3], "label": "A"}, {"values": [4, 5], "label": "B"}],
[{"values": [6, 7, 8, 9], "label": "C"}]
]
})
result = nested_df.select([
pl.col("data").list.len().alias("num_items"),
pl.col("data").list.eval(
pl.element().struct.field("values").list.sum()
).alias("sums_per_item"),
pl.col("data").list.eval(
pl.element().struct.field("label")
).alias("labels")
])

Install with Tessl CLI
npx tessl i tessl/pypi-polars-lts-cpu