Column expressions and built-in SQL functions for data transformation. Provides both operator overloading and function-based APIs for creating complex expressions, with support for all SQL operations and advanced analytics functions.
Represents a column in a DataFrame that can be composed into complex expressions.
/**
* A column in a DataFrame that can be composed into complex expressions
*/
class Column {
/** Equality comparison (SQL-safe null handling) */
def ===(other: Any): Column
def =!=(other: Any): Column
/** Numeric comparison */
def >(other: Any): Column
def >=(other: Any): Column
def <(other: Any): Column
def <=(other: Any): Column
/** Arithmetic operations */
def +(other: Any): Column
def -(other: Any): Column
def *(other: Any): Column
def /(other: Any): Column
def %(other: Any): Column
/** Logical operations */
def &&(other: Column): Column
def ||(other: Column): Column
def unary_!(): Column
/** Null handling */
def isNull: Column
def isNotNull: Column
def isNaN: Column
/** String operations */
def startsWith(other: Column): Column
def startsWith(literal: String): Column
def endsWith(other: Column): Column
def endsWith(literal: String): Column
def contains(other: Column): Column
def contains(literal: String): Column
def like(literal: String): Column
def rlike(literal: String): Column
/** Type operations */
def cast(to: DataType): Column
def cast(to: String): Column
/** Alias operations */
def as(alias: String): Column
def as(alias: Symbol): Column
def alias(alias: String): Column
/** Sorting */
def asc: Column
def asc_nulls_first: Column
def asc_nulls_last: Column
def desc: Column
def desc_nulls_first: Column
def desc_nulls_last: Column
/** Window operations */
def over(window: WindowSpec): Column
/** Collection operations */
def getItem(key: Any): Column
def getField(fieldName: String): Column
/** Boolean operations for arrays */
def isin(list: Any*): Column
}Column with type information for input and output types.
/**
* Column with type information
* @tparam T Input type
* @tparam U Output type
*/
class TypedColumn[T, U] extends Column {
/** Apply to Dataset */
def apply(dataset: Dataset[T]): Dataset[U]
}Essential functions for column creation and manipulation.
object functions {
/** Create column from column name */
def col(colName: String): Column
def column(colName: String): Column
/** Create literal column */
def lit(literal: Any): Column
def typedLit[T : TypeTag](literal: T): Column
/** Conditional expressions */
def when(condition: Column, value: Any): Column
def coalesce(e: Column*): Column
def greatest(e: Column*): Column
def least(e: Column*): Column
def isnull(e: Column): Column
def isnan(e: Column): Column
def nanvl(col1: Column, col2: Column): Column
}Usage Examples:
import org.apache.spark.sql.functions._
val df = spark.table("employees")
// Column operations
val filtered = df.filter(col("age") > 25 && col("department") === "Engineering")
val calculated = df.withColumn("bonus", col("salary") * 0.1)
val sorted = df.orderBy(col("salary").desc, col("name").asc)
// Conditional logic
val categorized = df.withColumn("category",
when(col("age") < 30, "Young")
.when(col("age") < 50, "Middle")
.otherwise("Senior")
)
// Null handling
val cleaned = df.filter(col("email").isNotNull && col("phone").isNotNull)
val filled = df.withColumn("phone", coalesce(col("phone"), lit("N/A")))Statistical and aggregate functions for data analysis.
object functions {
/** Count functions */
def count(e: Column): Column
def countDistinct(expr: Column, exprs: Column*): Column
def approx_count_distinct(e: Column): Column
def approx_count_distinct(e: Column, rsd: Double): Column
/** Basic aggregates */
def sum(e: Column): Column
def sumDistinct(e: Column): Column
def avg(e: Column): Column
def mean(e: Column): Column
def min(e: Column): Column
def max(e: Column): Column
def first(e: Column): Column
def first(e: Column, ignoreNulls: Boolean): Column
def last(e: Column): Column
def last(e: Column, ignoreNulls: Boolean): Column
/** Statistical functions */
def stddev(e: Column): Column
def stddev_pop(e: Column): Column
def stddev_samp(e: Column): Column
def variance(e: Column): Column
def var_pop(e: Column): Column
def var_samp(e: Column): Column
def skewness(e: Column): Column
def kurtosis(e: Column): Column
def corr(column1: Column, column2: Column): Column
def covar_pop(column1: Column, column2: Column): Column
def covar_samp(column1: Column, column2: Column): Column
/** Collection aggregates */
def collect_list(e: Column): Column
def collect_set(e: Column): Column
}Mathematical operations and functions.
object functions {
/** Basic math */
def abs(e: Column): Column
def negate(e: Column): Column
def sqrt(e: Column): Column
def cbrt(e: Column): Column
/** Power and logarithmic */
def pow(l: Column, r: Column): Column
def pow(l: Column, r: Double): Column
def exp(e: Column): Column
def expm1(e: Column): Column
def log(e: Column): Column
def log(base: Double, e: Column): Column
def log10(e: Column): Column
def log1p(e: Column): Column
def log2(e: Column): Column
/** Trigonometric */
def sin(e: Column): Column
def cos(e: Column): Column
def tan(e: Column): Column
def asin(e: Column): Column
def acos(e: Column): Column
def atan(e: Column): Column
def atan2(y: Column, x: Column): Column
def sinh(e: Column): Column
def cosh(e: Column): Column
def tanh(e: Column): Column
/** Rounding */
def round(e: Column): Column
def round(e: Column, scale: Int): Column
def bround(e: Column): Column
def bround(e: Column, scale: Int): Column
def ceil(e: Column): Column
def floor(e: Column): Column
/** Random */
def rand(): Column
def rand(seed: Long): Column
def randn(): Column
def randn(seed: Long): Column
}String manipulation and processing functions.
object functions {
/** String operations */
def length(e: Column): Column
def char_length(e: Column): Column
def bit_length(e: Column): Column
def octet_length(e: Column): Column
/** Case conversion */
def upper(e: Column): Column
def lower(e: Column): Column
def initcap(e: Column): Column
/** Trimming */
def trim(e: Column): Column
def ltrim(e: Column): Column
def rtrim(e: Column): Column
/** Padding */
def lpad(str: Column, len: Int, pad: String): Column
def rpad(str: Column, len: Int, pad: String): Column
/** Concatenation */
def concat(exprs: Column*): Column
def concat_ws(sep: String, exprs: Column*): Column
/** Substring operations */
def substring(str: Column, pos: Int, len: Int): Column
def substring_index(str: Column, delim: String, count: Int): Column
def left(str: Column, len: Int): Column
def right(str: Column, len: Int): Column
/** Pattern matching */
def regexp_extract(e: Column, exp: String, groupIdx: Int): Column
def regexp_replace(e: Column, pattern: String, replacement: String): Column
def split(str: Column, pattern: String): Column
def split(str: Column, pattern: String, limit: Int): Column
/** String search */
def instr(str: Column, substring: String): Column
def locate(substr: String, str: Column): Column
def locate(substr: String, str: Column, pos: Int): Column
/** Encoding */
def ascii(e: Column): Column
def base64(e: Column): Column
def unbase64(e: Column): Column
def encode(value: Column, charset: String): Column
def decode(value: Column, charset: String): Column
/** Format */
def format_number(x: Column, d: Int): Column
def format_string(format: String, arguments: Column*): Column
}Date, time, and timestamp manipulation functions.
object functions {
/** Current date/time */
def current_date(): Column
def current_timestamp(): Column
def now(): Column
def unix_timestamp(): Column
def unix_timestamp(s: Column): Column
def unix_timestamp(s: Column, p: String): Column
/** Date/time conversion */
def to_date(e: Column): Column
def to_date(e: Column, fmt: String): Column
def to_timestamp(s: Column): Column
def to_timestamp(s: Column, fmt: String): Column
def from_unixtime(ut: Column): Column
def from_unixtime(ut: Column, f: String): Column
/** Date arithmetic */
def date_add(start: Column, days: Int): Column
def date_sub(start: Column, days: Int): Column
def datediff(end: Column, start: Column): Column
def months_between(end: Column, start: Column): Column
def months_between(end: Column, start: Column, roundOff: Boolean): Column
def add_months(start: Column, months: Int): Column
def next_day(date: Column, dayOfWeek: String): Column
def last_day(e: Column): Column
/** Date/time extraction */
def year(e: Column): Column
def quarter(e: Column): Column
def month(e: Column): Column
def dayofmonth(e: Column): Column
def dayofweek(e: Column): Column
def dayofyear(e: Column): Column
def hour(e: Column): Column
def minute(e: Column): Column
def second(e: Column): Column
def weekofyear(e: Column): Column
/** Date/time formatting */
def date_format(dateExpr: Column, format: String): Column
def from_utc_timestamp(ts: Column, tz: String): Column
def to_utc_timestamp(ts: Column, tz: String): Column
/** Time window */
def window(timeColumn: Column, windowDuration: String): Column
def window(timeColumn: Column, windowDuration: String, slideDuration: String): Column
def window(timeColumn: Column, windowDuration: String, slideDuration: String, startTime: String): Column
}Functions for working with complex data types.
object functions {
/** Array creation */
def array(cols: Column*): Column
def array(colName: String, colNames: String*): Column
def split(str: Column, pattern: String): Column
/** Array operations */
def array_contains(column: Column, value: Any): Column
def array_distinct(e: Column): Column
def array_except(col1: Column, col2: Column): Column
def array_intersect(col1: Column, col2: Column): Column
def array_join(column: Column, delimiter: String): Column
def array_join(column: Column, delimiter: String, nullReplacement: String): Column
def array_max(e: Column): Column
def array_min(e: Column): Column
def array_position(column: Column, value: Any): Column
def array_remove(column: Column, element: Any): Column
def array_repeat(e: Column, count: Int): Column
def array_sort(e: Column): Column
def array_union(col1: Column, col2: Column): Column
def arrays_overlap(a1: Column, a2: Column): Column
def arrays_zip(e: Column*): Column
def concat(exprs: Column*): Column
def element_at(column: Column, extraction: Any): Column
def explode(e: Column): Column
def explode_outer(e: Column): Column
def flatten(e: Column): Column
def posexplode(e: Column): Column
def posexplode_outer(e: Column): Column
def reverse(e: Column): Column
def sequence(start: Column, stop: Column): Column
def sequence(start: Column, stop: Column, step: Column): Column
def shuffle(e: Column): Column
def size(e: Column): Column
def slice(x: Column, start: Int, length: Int): Column
def sort_array(e: Column): Column
def sort_array(e: Column, asc: Boolean): Column
/** Map creation */
def map(cols: Column*): Column
def map_from_arrays(keys: Column, values: Column): Column
def map_from_entries(e: Column): Column
/** Map operations */
def map_concat(cols: Column*): Column
def map_entries(e: Column): Column
def map_filter(expr: Column, f: (Column, Column) => Column): Column
def map_keys(e: Column): Column
def map_values(e: Column): Column
def map_zip_with(left: Column, right: Column, f: (Column, Column, Column) => Column): Column
/** Struct operations */
def struct(cols: Column*): Column
def struct(colName: String, colNames: String*): Column
}Analytics functions for window-based calculations.
object functions {
/** Ranking functions */
def row_number(): Column
def rank(): Column
def dense_rank(): Column
def percent_rank(): Column
def ntile(n: Int): Column
def cume_dist(): Column
/** Analytic functions */
def lag(e: Column, offset: Int): Column
def lag(e: Column, offset: Int, defaultValue: Any): Column
def lead(e: Column, offset: Int): Column
def lead(e: Column, offset: Int, defaultValue: Any): Column
def first_value(e: Column): Column
def first_value(e: Column, ignoreNulls: Boolean): Column
def last_value(e: Column): Column
def last_value(e: Column, ignoreNulls: Boolean): Column
def nth_value(e: Column, n: Int): Column
def nth_value(e: Column, n: Int, ignoreNulls: Boolean): Column
}
/**
* Window specification for window functions
*/
class WindowSpec {
def partitionBy(colNames: String*): WindowSpec
def partitionBy(cols: Column*): WindowSpec
def orderBy(colNames: String*): WindowSpec
def orderBy(cols: Column*): WindowSpec
def rowsBetween(start: Long, end: Long): WindowSpec
def rangeBetween(start: Long, end: Long): WindowSpec
}
object Window {
/** Create unbounded window */
def partitionBy(colNames: String*): WindowSpec
def partitionBy(cols: Column*): WindowSpec
def orderBy(colNames: String*): WindowSpec
def orderBy(cols: Column*): WindowSpec
/** Window frame boundaries */
val unboundedPreceding: Long
val unboundedFollowing: Long
val currentRow: Long
}Usage Examples:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
val sales = spark.table("sales")
// Window functions
val windowSpec = Window.partitionBy("region").orderBy(col("date"))
val salesWithRank = sales.withColumn("rank",
row_number().over(windowSpec)
)
val salesWithLag = sales.withColumn("prev_amount",
lag(col("amount"), 1).over(windowSpec)
)
// Running totals
val runningTotal = sales.withColumn("running_total",
sum(col("amount")).over(windowSpec.rowsBetween(Window.unboundedPreceding, Window.currentRow))
)
// Moving averages
val movingAvg = sales.withColumn("moving_avg_3",
avg(col("amount")).over(windowSpec.rowsBetween(-2, 0))
)