Distributed DataFrames for multimodal data, with a high-performance query engine and support for complex nested data structures, AI/ML operations, and seamless cloud storage integration.
—
Column operations and computational expressions for data transformation. Expressions provide type-safe, optimizable operations that can be applied to DataFrame columns with support for complex nested operations and function composition.
Create and manipulate column expressions for DataFrame operations.
def col(name: str) -> Expression:
    """
    Build an expression that refers to a DataFrame column by its name.

    Parameters:
    - name: Name of the column to reference

    Returns:
    Expression: Expression standing for the named column
    """
def lit(value: Any) -> Expression:
    """
    Wrap a plain value in a literal expression.

    Parameters:
    - value: The literal value (number, string, boolean, etc.)

    Returns:
    Expression: Expression holding the literal
    """
def coalesce(*exprs: Expression) -> Expression:
    """
    Pick the first non-null value among the given expressions.

    Parameters:
    - exprs: Expressions, evaluated in the order given

    Returns:
    Expression: Coalesced expression
    """


# Create conditional logic with when/otherwise patterns.
class Expression:
    def when(self, predicate: Expression) -> Expression:
        """
        Build a conditional expression from a boolean predicate.

        Parameters:
        - predicate: Boolean condition

        Returns:
        Expression: Conditional expression
        """

    def otherwise(self, expr: Expression) -> Expression:
        """
        Supply the else branch of a conditional expression.

        Parameters:
        - expr: Expression used when the condition is false

        Returns:
        Expression: Complete conditional expression
        """
def when(predicate: Expression) -> Expression:
    """
    Open a chain of conditional expressions.

    Parameters:
    - predicate: Boolean condition

    Returns:
    Expression: Conditional expression builder
    """


# Work with arrays, lists, and nested structures.
def list_(*exprs: Expression) -> Expression:
    """
    Combine several expressions into a single list expression.

    Parameters:
    - exprs: Expressions to combine into the list

    Returns:
    Expression: List expression
    """
def struct(**kwargs: Expression) -> Expression:
    """
    Build a struct expression out of named expressions.

    Parameters:
    - kwargs: Named expressions, one per struct field

    Returns:
    Expression: Struct expression
    """
def element(n: int) -> Expression:
    """
    Pull a single element out of an array/list by position.

    Parameters:
    - n: Index to extract (0-based)

    Returns:
    Expression: Element extraction expression
    """


# String manipulation and text processing functions.
class Expression:
    def str_contains(self, pattern: str, regex: bool = False) -> Expression:
        """
        Test whether the string contains a pattern.

        Parameters:
        - pattern: Pattern to search for
        - regex: Whether the pattern is a regular expression (False by default)

        Returns:
        Expression: Boolean expression
        """

    def str_length(self) -> Expression:
        """
        Compute the length of the string.

        Returns:
        Expression: String length expression
        """

    def str_upper(self) -> Expression:
        """
        Uppercase the string.

        Returns:
        Expression: Uppercase string expression
        """

    def str_lower(self) -> Expression:
        """
        Lowercase the string.

        Returns:
        Expression: Lowercase string expression
        """

    def str_slice(self, start: int, end: Optional[int] = None) -> Expression:
        """
        Take a substring.

        Parameters:
        - start: Start index
        - end: End index; None means up to the end of the string

        Returns:
        Expression: Substring expression
        """


# Arithmetic and mathematical functions.
class Expression:
    def __add__(self, other: Union[Expression, Any]) -> Expression:
        """Add `other` to this expression (the + operator)."""

    def __sub__(self, other: Union[Expression, Any]) -> Expression:
        """Subtract `other` from this expression (the - operator)."""

    def __mul__(self, other: Union[Expression, Any]) -> Expression:
        """Multiply this expression by `other` (the * operator)."""

    def __truediv__(self, other: Union[Expression, Any]) -> Expression:
        """Divide this expression by `other` (the / operator)."""

    def __mod__(self, other: Union[Expression, Any]) -> Expression:
        """Take this expression modulo `other` (the % operator)."""

    def abs(self) -> Expression:
        """Absolute value of the expression."""

    def ceil(self) -> Expression:
        """Ceiling of the expression."""

    def floor(self) -> Expression:
        """Floor of the expression."""

    def round(self, decimals: int = 0) -> Expression:
        """Round to `decimals` decimal places (0 by default)."""

    def sqrt(self) -> Expression:
        """Square root of the expression."""

    def sin(self) -> Expression:
        """Sine of the expression."""

    def cos(self) -> Expression:
        """Cosine of the expression."""

    def tan(self) -> Expression:
        """Tangent of the expression."""


# Comparison and logical operations.
class Expression:
    def __eq__(self, other: Union[Expression, Any]) -> Expression:
        """Compare for equality (the == operator)."""

    def __ne__(self, other: Union[Expression, Any]) -> Expression:
        """Compare for inequality (the != operator)."""

    def __lt__(self, other: Union[Expression, Any]) -> Expression:
        """Compare with less-than (the < operator)."""

    def __le__(self, other: Union[Expression, Any]) -> Expression:
        """Compare with less-than-or-equal (the <= operator)."""

    def __gt__(self, other: Union[Expression, Any]) -> Expression:
        """Compare with greater-than (the > operator)."""

    def __ge__(self, other: Union[Expression, Any]) -> Expression:
        """Compare with greater-than-or-equal (the >= operator)."""

    def __and__(self, other: Expression) -> Expression:
        """Logical AND with `other` (the & operator)."""

    def __or__(self, other: Expression) -> Expression:
        """Logical OR with `other` (the | operator)."""

    def __invert__(self) -> Expression:
        """Logical NOT of this expression (the ~ operator)."""

    def isin(self, values: List[Any]) -> Expression:
        """Test whether the value appears in the list `values`."""

    def is_null(self) -> Expression:
        """Test whether the value is null."""

    def is_not_null(self) -> Expression:
        """Test whether the value is not null."""


# Type casting and validation.
class Expression:
    def cast(self, dtype: DataType) -> Expression:
        """
        Convert the expression to another data type.

        Parameters:
        - dtype: Target data type

        Returns:
        Expression: Cast expression
        """

    def try_cast(self, dtype: DataType) -> Expression:
        """
        Try to convert to another data type, yielding null on failure.

        Parameters:
        - dtype: Target data type

        Returns:
        Expression: Safe cast expression
        """


# Create aggregation expressions for group operations.
class Expression:
    def sum(self) -> Expression:
        """Aggregate by summing."""

    def mean(self) -> Expression:
        """Aggregate by taking the mean."""

    def min(self) -> Expression:
        """Aggregate by taking the minimum."""

    def max(self) -> Expression:
        """Aggregate by taking the maximum."""

    def count(self) -> Expression:
        """Aggregate by counting."""

    def std(self) -> Expression:
        """Aggregate by taking the standard deviation."""

    def first(self) -> Expression:
        """Aggregate by taking the first value."""

    def last(self) -> Expression:
        """Aggregate by taking the last value."""

    def list_agg(self) -> Expression:
        """Aggregate the values into a list."""


# Date and time manipulation functions.
def interval(value: int, unit: str) -> Expression:
    """
    Build a time interval expression.

    Parameters:
    - value: Size of the interval
    - unit: Time unit ('days', 'hours', 'minutes', 'seconds')

    Returns:
    Expression: Interval expression
    """
class Expression:
    def dt_year(self) -> Expression:
        """Year component of a datetime."""

    def dt_month(self) -> Expression:
        """Month component of a datetime."""

    def dt_day(self) -> Expression:
        """Day component of a datetime."""

    def dt_hour(self) -> Expression:
        """Hour component of a datetime."""

    def dt_minute(self) -> Expression:
        """Minute component of a datetime."""

    def dt_second(self) -> Expression:
        """Second component of a datetime."""

    def dt_date(self) -> Expression:
        """Date part of a datetime."""


# Window-based operations and rankings.
class Expression:
    def over(self, window: Window) -> Expression:
        """
        Evaluate this expression over a window.

        Parameters:
        - window: Window specification

        Returns:
        Expression: Windowed expression
        """
def row_number() -> Expression:
    """Number of the row within its partition."""
def rank() -> Expression:
    """Rank of the row within its partition."""
def dense_rank() -> Expression:
    """Dense rank of the row within its partition."""
class Window:
    def __init__(
        self,
        partition_by: Optional[List[Expression]] = None,
        order_by: Optional[List[Expression]] = None,
    ):
        """
        Build a window specification.

        Parameters:
        - partition_by: Columns to partition by
        - order_by: Columns to order by within each partition
        """

    def rows_between(self, start: int, end: int) -> "Window":
        """
        Set row-based frame boundaries.

        Parameters:
        - start: Start row offset (negative for preceding rows)
        - end: End row offset (positive for following rows)

        Returns:
        Window: Window carrying the row frame specification
        """

    def range_between(self, start: Expression, end: Expression) -> "Window":
        """
        Set range-based frame boundaries.

        Parameters:
        - start: Start range value
        - end: End range value

        Returns:
        Window: Window carrying the range frame specification
        """
# Window frame constants
# NOTE(review): indentation was lost in this listing, so it is unclear whether
# these are module-level names or attributes of Window — confirm.
unbounded_preceding: Expression # Unbounded preceding boundary
unbounded_following: Expression # Unbounded following boundary
current_row: Expression # Current row boundary

# Additional utility functions for data processing.
def columns_sum(*cols: ColumnInputType) -> Expression:
    """Sum taken across the given columns."""
def columns_mean(*cols: ColumnInputType) -> Expression:
    """Mean taken across the given columns."""
def columns_min(*cols: ColumnInputType) -> Expression:
    """Minimum taken across the given columns."""
def columns_max(*cols: ColumnInputType) -> Expression:
    """Maximum taken across the given columns."""
def monotonically_increasing_id() -> Expression:
    """Produce monotonically increasing IDs."""
def format(template: str, *args: Expression) -> Expression:
    """
    Fill a format string with expression arguments.

    Parameters:
    - template: Format string template
    - args: Expressions to substitute into the template

    Returns:
    Expression: Formatted string expression
    """


# Vector embedding operations for similarity calculations.
class Expression:
    @property
    def embedding(self) -> "ExpressionEmbeddingNamespace":
        """Namespace exposing the embedding operations."""
class ExpressionEmbeddingNamespace:
    def cosine_distance(self, other: Expression) -> Expression:
        """
        Compute the cosine distance between two embedding vectors.

        Parameters:
        - other: The embedding expression to compare against

        Returns:
        Expression: Cosine distance (0.0 = identical, 2.0 = opposite)
        """


from daft import col, lit, when
# Arithmetic operations
# Each arithmetic expression is parenthesized before .alias() is called: a
# method call binds tighter than * or +, so without the parentheses the alias
# would attach only to the right-hand operand rather than to the whole result.
df.select(
    (col("price") * col("quantity")).alias("total"),
    (col("price") * 1.1).alias("price_with_tax"),
    (col("amount") + lit(100)).alias("adjusted_amount"),
)
# String operations: build each projection first, then select them together
name_upper = col("name").str_upper().alias("name_upper")
is_gmail = col("email").str_contains("@gmail.com").alias("is_gmail")
desc_length = col("description").str_length().alias("desc_length")
df.select(name_upper, is_gmail, desc_length)

# Conditional expressions
# NOTE(review): these chains call .then(), which is not among the Expression
# methods documented in this file — confirm it exists on the when() builder.
age_group = (
    when(col("age") >= 18)
    .then(lit("Adult"))
    .otherwise(lit("Minor"))
    .alias("age_group")
)
grade = (
    when(col("score") >= 90).then(lit("A"))
    .when(col("score") >= 80).then(lit("B"))
    .when(col("score") >= 70).then(lit("C"))
    .otherwise(lit("F"))
    .alias("grade")
)
df.select(age_group, grade)

from daft import list_, struct, element
# Working with arrays and structs
items = list_(col("item1"), col("item2"), col("item3")).alias("items")
person = struct(
    name=col("name"),
    age=col("age"),
    active=col("is_active"),
).alias("person")
first_item = element(0).alias("first_item")  # Extract first element from array
df.select(items, person, first_item)

# Complex aggregations
# Group by category, then compute every aggregate in one agg() call
by_category = df.groupby("category")
by_category.agg(
    col("price").mean().alias("avg_price"),
    col("quantity").sum().alias("total_quantity"),
    (col("price") * col("quantity")).sum().alias("total_revenue"),
    col("name").count().alias("item_count"),
)

from daft import interval
# Date/time operations
created_at = col("created_at")
df.select(
    created_at.dt_year().alias("year"),
    created_at.dt_month().alias("month"),
    (created_at + interval(30, "days")).alias("future_date"),
    col("timestamp").dt_date().alias("date_only"),
)

from daft.window import Window
from daft.functions import row_number, rank
# Window operations
window = Window(
    partition_by=[col("department")],
    order_by=[col("salary").desc()],
)
rank_in_dept = row_number().over(window).alias("rank_in_dept")
dept_total_salary = col("salary").sum().over(window).alias("dept_total_salary")
df.select(
    col("name"),
    col("department"),
    col("salary"),
    rank_in_dept,
    dept_total_salary,
)


class ExpressionVisitor:
    """Visitor for walking expression trees."""

    def visit(self, expr: Expression) -> Any:
        """Visit an expression node."""

    def visit_column(self, expr: Expression) -> Any:
        """Visit a column reference."""

    def visit_literal(self, expr: Expression) -> Any:
        """Visit a literal value."""

    def visit_function(self, expr: Expression) -> Any:
        """Visit a function call."""
class ExpressionsProjection:
    """A collection of expressions used for projection operations."""

    def __init__(self, exprs: List[Expression]): ...

    def to_list(self) -> List[Expression]:
        """Return the expressions as a list."""


ColumnInputType = Union[str, Expression]

# Install with Tessl CLI
npx tessl i tessl/pypi-daft