Apache Spark SQL provides a comprehensive library of 764 built-in functions for data manipulation across all major functional categories. These functions enable complex data transformations, analytics, and processing operations within the distributed SQL query engine.
object functions {
// Column creation and literals
def col(colName: String): Column
def column(colName: String): Column
def lit(literal: Any): Column
def typedLit[T : TypeTag](literal: T): Column
def expr(expr: String): Column
// Special values
def broadcast[T](df: Dataset[T]): Dataset[T]
def monotonically_increasing_id(): Column
def spark_partition_id(): Column
def input_file_name(): Column
}// Basic aggregations
def count(e: Column): Column
def count(columnName: String): Column
def countDistinct(expr: Column, exprs: Column*): Column
def countDistinct(columnName: String, columnNames: String*): Column
def approx_count_distinct(e: Column): Column
def approx_count_distinct(e: Column, rsd: Double): Column
def approx_count_distinct(columnName: String): Column
def approx_count_distinct(columnName: String, rsd: Double): Column
def sum(e: Column): Column
def sum(columnName: String): Column
def sumDistinct(e: Column): Column
def sumDistinct(columnName: String): Column
def avg(e: Column): Column
def avg(columnName: String): Column
def mean(e: Column): Column
def mean(columnName: String): Column
def min(e: Column): Column
def min(columnName: String): Column
def max(e: Column): Column
def max(columnName: String): Column
// Statistical aggregations
def stddev(e: Column): Column
def stddev(columnName: String): Column
def stddev_pop(e: Column): Column
def stddev_pop(columnName: String): Column
def stddev_samp(e: Column): Column
def stddev_samp(columnName: String): Column
def variance(e: Column): Column
def variance(columnName: String): Column
def var_pop(e: Column): Column
def var_pop(columnName: String): Column
def var_samp(e: Column): Column
def var_samp(columnName: String): Column
def corr(column1: Column, column2: Column): Column
def corr(columnName1: String, columnName2: String): Column
def covar_pop(column1: Column, column2: Column): Column
def covar_pop(columnName1: String, columnName2: String): Column
def covar_samp(column1: Column, column2: Column): Column
def covar_samp(columnName1: String, columnName2: String): Column
def skewness(e: Column): Column
def skewness(columnName: String): Column
def kurtosis(e: Column): Column
def kurtosis(columnName: String): Column
// Collection aggregations
def collect_list(e: Column): Column
def collect_list(columnName: String): Column
def collect_set(e: Column): Column
def collect_set(columnName: String): Column
def first(e: Column): Column
def first(e: Column, ignoreNulls: Boolean): Column
def first(columnName: String): Column
def first(columnName: String, ignoreNulls: Boolean): Column
def last(e: Column): Column
def last(e: Column, ignoreNulls: Boolean): Column
def last(columnName: String): Column
def last(columnName: String, ignoreNulls: Boolean): Column
// Approximate aggregations
def approx_percentile(e: Column, percentage: Column): Column
def approx_percentile(e: Column, percentage: Column, accuracy: Column): Column
def approx_percentile(columnName: String, percentage: Double): Column
def approx_percentile(columnName: String, percentage: Double, accuracy: Double): Column
// Regression functions
def regr_avgx(y: Column, x: Column): Column
def regr_avgy(y: Column, x: Column): Column
def regr_count(y: Column, x: Column): Column
def regr_intercept(y: Column, x: Column): Column
def regr_r2(y: Column, x: Column): Column
def regr_slope(y: Column, x: Column): Column
def regr_sxx(y: Column, x: Column): Column
def regr_sxy(y: Column, x: Column): Column
def regr_syy(y: Column, x: Column): Column
// Percentile functions
def percentile_approx(col: Column, percentage: Column): Column
def percentile_approx(col: Column, percentage: Column, accuracy: Column): Column
def median(col: Column): Column// Basic arithmetic and rounding
def abs(e: Column): Column
def ceil(e: Column): Column
def ceiling(e: Column): Column
def floor(e: Column): Column
def round(e: Column): Column
def round(e: Column, scale: Int): Column
def rint(e: Column): Column
def sign(e: Column): Column
def signum(e: Column): Column
// Trigonometric functions
def sin(e: Column): Column
def cos(e: Column): Column
def tan(e: Column): Column
def asin(e: Column): Column
def acos(e: Column): Column
def atan(e: Column): Column
def atan2(y: Column, x: Column): Column
def sinh(e: Column): Column
def cosh(e: Column): Column
def tanh(e: Column): Column
def asinh(e: Column): Column
def acosh(e: Column): Column
def atanh(e: Column): Column
// Logarithmic and exponential
def exp(e: Column): Column
def exp2(e: Column): Column
def expm1(e: Column): Column
def log(e: Column): Column
def log(base: Double, e: Column): Column
def log10(e: Column): Column
def log2(e: Column): Column
def log1p(e: Column): Column
def ln(e: Column): Column
// Power and root functions
def pow(l: Column, r: Column): Column
def power(l: Column, r: Column): Column
def sqrt(e: Column): Column
def cbrt(e: Column): Column
// Random number generation
def rand(): Column
def rand(seed: Long): Column
def randn(): Column
def randn(seed: Long): Column
// Number conversion and formatting
def hex(column: Column): Column
def unhex(column: Column): Column
def conv(num: Column, fromBase: Int, toBase: Int): Column
def bin(e: Column): Column
// Advanced mathematical functions
def factorial(e: Column): Column
def degrees(e: Column): Column
def radians(e: Column): Column
def gcd(left: Column, right: Column): Column
def lcm(left: Column, right: Column): Column
def width_bucket(v: Column, min: Column, max: Column, numBucket: Column): Column// Current date/time functions
def current_date(): Column
def current_timestamp(): Column
def now(): Column
def localtimestamp(): Column
def current_timezone(): Column
// Date/time extraction functions
def year(e: Column): Column
def quarter(e: Column): Column
def month(e: Column): Column
def weekofyear(e: Column): Column
def weekday(e: Column): Column
def dayofyear(e: Column): Column
def dayofmonth(e: Column): Column
def dayofweek(e: Column): Column
def hour(e: Column): Column
def minute(e: Column): Column
def second(e: Column): Column
def extract(field: Column, source: Column): Column
// Date arithmetic and manipulation
def date_add(start: Column, days: Int): Column
def date_add(start: Column, days: Column): Column
def date_sub(start: Column, days: Int): Column
def date_sub(start: Column, days: Column): Column
def datediff(end: Column, start: Column): Column
def months_between(end: Column, start: Column): Column
def months_between(end: Column, start: Column, roundOff: Boolean): Column
def add_months(startDate: Column, numMonths: Int): Column
def add_months(startDate: Column, numMonths: Column): Column
// Date formatting and parsing
def date_format(dateExpr: Column, format: String): Column
def from_unixtime(ut: Column): Column
def from_unixtime(ut: Column, f: String): Column
def unix_timestamp(): Column
def unix_timestamp(s: Column): Column
def unix_timestamp(s: Column, p: String): Column
def to_date(e: Column): Column
def to_date(e: Column, fmt: String): Column
def to_timestamp(s: Column): Column
def to_timestamp(s: Column, fmt: String): Column
def make_date(year: Column, month: Column, day: Column): Column
def make_timestamp(
year: Column, month: Column, day: Column,
hour: Column, min: Column, sec: Column
): Column
def make_timestamp(
year: Column, month: Column, day: Column,
hour: Column, min: Column, sec: Column, timezone: Column
): Column
// Date truncation and boundaries
def date_trunc(format: String, timestamp: Column): Column
def trunc(date: Column, format: String): Column
def last_day(e: Column): Column
def next_day(date: Column, dayOfWeek: String): Column
// Timezone operations
def from_utc_timestamp(ts: Column, tz: String): Column
def from_utc_timestamp(ts: Column, tz: Column): Column
def to_utc_timestamp(ts: Column, tz: String): Column
def to_utc_timestamp(ts: Column, tz: Column): Column
// Date validation and testing
def isnan(e: Column): Column
def isnull(e: Column): Column// Array creation and manipulation
def array(cols: Column*): Column
def array_contains(column: Column, value: Any): Column
def array_distinct(e: Column): Column
def array_except(col1: Column, col2: Column): Column
def array_intersect(col1: Column, col2: Column): Column
def array_join(column: Column, delimiter: String): Column
def array_join(column: Column, delimiter: String, nullReplacement: String): Column
def array_max(e: Column): Column
def array_min(e: Column): Column
def array_position(column: Column, value: Any): Column
def array_remove(column: Column, element: Any): Column
def array_repeat(left: Column, right: Column): Column
def array_size(e: Column): Column
def array_sort(e: Column): Column
def array_union(col1: Column, col2: Column): Column
def arrays_overlap(a1: Column, a2: Column): Column
def arrays_zip(cols: Column*): Column
// Array aggregation and transformation
def flatten(e: Column): Column
def reverse(e: Column): Column
def shuffle(e: Column): Column
def slice(x: Column, start: Int, length: Int): Column
def sort_array(e: Column): Column
def sort_array(e: Column, asc: Boolean): Column
// Array element access
def element_at(column: Column, extraction: Any): Column
def get_json_object(e: Column, path: String): Column
// Array generation
def sequence(start: Column, stop: Column): Column
def sequence(start: Column, stop: Column, step: Column): Column// Map creation and manipulation
def map(cols: Column*): Column
def map_contains_key(column: Column, key: Any): Column
def map_entries(e: Column): Column
def map_from_arrays(keys: Column, values: Column): Column
def map_from_entries(e: Column): Column
def map_keys(e: Column): Column
def map_values(e: Column): Column
def map_zip_with(left: Column, right: Column, f: (Column, Column, Column) => Column): Column
// Map transformations
def transform_keys(expr: Column, f: (Column, Column) => Column): Column
def transform_values(expr: Column, f: (Column, Column) => Column): Column// JSON parsing and manipulation
def from_json(e: Column, schema: DataType): Column
def from_json(e: Column, schema: DataType, options: Map[String, String]): Column
def from_json(e: Column, schema: String): Column
def from_json(e: Column, schema: String, options: Map[String, String]): Column
def to_json(e: Column): Column
def to_json(e: Column, options: Map[String, String]): Column
def json_array_length(jsonArray: Column): Column
def json_object_keys(json: Column): Column
def json_tuple(json: Column, fields: String*): Column
def get_json_object(e: Column, path: String): Column
def json_extract(jsonStr: Column, path: Column): Column
// Schema inference
def schema_of_json(json: Column): Column
def schema_of_json(json: String): Column// Ranking functions
def row_number(): Column
def rank(): Column
def dense_rank(): Column
def percent_rank(): Column
def ntile(n: Int): Column
def cume_dist(): Column
// Value access functions
def lag(e: Column, offset: Int): Column
def lag(e: Column, offset: Int, defaultValue: Any): Column
def lead(e: Column, offset: Int): Column
def lead(e: Column, offset: Int, defaultValue: Any): Column
def first_value(e: Column): Column
def first_value(e: Column, ignoreNulls: Boolean): Column
def last_value(e: Column): Column
def last_value(e: Column, ignoreNulls: Boolean): Column
def nth_value(e: Column, offset: Int): Column
def nth_value(e: Column, offset: Int, ignoreNulls: Boolean): Column// Cryptographic hash functions
def md5(e: Column): Column
def sha1(e: Column): Column
def sha2(e: Column, numBits: Int): Column
def hash(cols: Column*): Column
def xxhash64(cols: Column*): Column
def crc32(e: Column): Column
def murmur3_hash(cols: Column*): Column// Conditional logic
def when(condition: Column, value: Any): Column
def coalesce(e: Column*): Column
def isnull(e: Column): Column
def isnan(e: Column): Column
def isnotnull(e: Column): Column
def nvl(col1: Column, col2: Column): Column
def nvl2(col1: Column, col2: Column, col3: Column): Column
def nullif(col1: Column, col2: Column): Column
def if(condition: Column, trueValue: Column, falseValue: Column): Column
def greatest(exprs: Column*): Column
def least(exprs: Column*): Column// String operations
def length(e: Column): Column
def length(columnName: String): Column
def char_length(e: Column): Column
def character_length(e: Column): Column
def bit_length(e: Column): Column
def octet_length(e: Column): Column
def upper(e: Column): Column
def upper(columnName: String): Column
def lower(e: Column): Column
def lower(columnName: String): Column
def initcap(e: Column): Column
def initcap(columnName: String): Column
def trim(e: Column): Column
def trim(columnName: String): Column
def trim(e: Column, trimString: String): Column
def ltrim(e: Column): Column
def ltrim(columnName: String): Column
def ltrim(e: Column, trimString: String): Column
def rtrim(e: Column): Column
def rtrim(columnName: String): Column
def rtrim(e: Column, trimString: String): Column
// String concatenation
def concat(exprs: Column*): Column
def concat_ws(sep: String, exprs: Column*): Column
// Substring operations
def substring(str: Column, pos: Int, len: Int): Column
def substring(columnName: String, pos: Int, len: Int): Column
def substr(str: Column, pos: Column, len: Column): Column
def left(str: Column, len: Column): Column
def right(str: Column, len: Column): Column
// String searching and matching
def instr(str: Column, substring: String): Column
def locate(substr: String, str: Column): Column
def locate(substr: String, str: Column, pos: Int): Column
def find_in_set(str: Column, strArray: Column): Column
// Regular expressions
def regexp_replace(e: Column, pattern: String, replacement: String): Column
def regexp_replace(e: Column, pattern: Column, replacement: Column): Column
def regexp_extract(e: Column, exp: String, groupIdx: Int): Column
def regexp_extract_all(e: Column, exp: String): Column
def regexp_extract_all(e: Column, exp: String, groupIdx: Int): Column
def rlike(str: Column, regexp: String): Column
def regexp_like(str: Column, regexp: String): Column
// String splitting and parsing
def split(str: Column, pattern: String): Column
def split(str: Column, pattern: String, limit: Int): Column
// String formatting
def format_string(format: String, arguments: Column*): Column
def format_number(x: Column, d: Int): Column
def printf(format: Column, arguments: Column*): Column
// String padding and alignment
def lpad(str: Column, len: Int, pad: String): Column
def rpad(str: Column, len: Int, pad: String): Column
// String translation and encoding
def translate(src: Column, matchingString: String, replaceString: String): Column
def encode(value: Column, charset: String): Column
def decode(value: Column, charset: String): Column
def base64(e: Column): Column
def unbase64(e: Column): Column
// String comparison and testing
def soundex(e: Column): Column
def levenshtein(l: Column, r: Column): Column
def levenshtein(l: Column, r: Column, threshold: Int): Column// Current date/time functions
def current_date(): Column
def current_timestamp(): Column
def now(): Column
def localtimestamp(): Column
def current_timezone(): Column
// Date/time extraction
def year(e: Column): Column
def year(columnName: String): Column
def quarter(e: Column): Column
def quarter(columnName: String): Column
def month(e: Column): Column
def month(columnName: String): Column
def dayofyear(e: Column): Column
def dayofyear(columnName: String): Column
def dayofmonth(e: Column): Column
def dayofmonth(columnName: String): Column
def dayofweek(e: Column): Column
def dayofweek(columnName: String): Column
def weekofyear(e: Column): Column
def weekofyear(columnName: String): Column
def hour(e: Column): Column
def hour(columnName: String): Column
def minute(e: Column): Column
def minute(columnName: String): Column
def second(e: Column): Column
def second(columnName: String): Column
// Date arithmetic
def date_add(start: Column, days: Int): Column
def date_add(start: Column, days: Column): Column
def date_add(columnName: String, days: Int): Column
def date_sub(start: Column, days: Int): Column
def date_sub(start: Column, days: Column): Column
def date_sub(columnName: String, days: Int): Column
def datediff(end: Column, start: Column): Column
def datediff(end: String, start: String): Column
def months_between(end: Column, start: Column): Column
def months_between(end: Column, start: Column, roundOff: Boolean): Column
def add_months(startDate: Column, numMonths: Int): Column
def add_months(startDate: Column, numMonths: Column): Column
// Date formatting and parsing
def date_format(dateExpr: Column, format: String): Column
def date_format(columnName: String, format: String): Column
def from_unixtime(ut: Column): Column
def from_unixtime(ut: Column, f: String): Column
def from_unixtime(columnName: String): Column
def from_unixtime(columnName: String, format: String): Column
def unix_timestamp(): Column
def unix_timestamp(s: Column): Column
def unix_timestamp(s: Column, p: String): Column
def unix_timestamp(columnName: String): Column
def unix_timestamp(columnName: String, pattern: String): Column
def to_date(e: Column): Column
def to_date(e: Column, fmt: String): Column
def to_timestamp(s: Column): Column
def to_timestamp(s: Column, fmt: String): Column
// Date truncation and rounding
def date_trunc(format: String, timestamp: Column): Column
def trunc(date: Column, format: String): Column
// Timezone operations
def from_utc_timestamp(ts: Column, tz: String): Column
def from_utc_timestamp(ts: Column, tz: Column): Column
def to_utc_timestamp(ts: Column, tz: String): Column
def to_utc_timestamp(ts: Column, tz: Column): Column
// Date sequence generation
def sequence(start: Column, stop: Column, step: Column): Column
def date_sequence(start: Column, stop: Column, step: Column): Column
// Last day operations
def last_day(e: Column): Column
def last_day(columnName: String): Column
def next_day(date: Column, dayOfWeek: String): Column// Basic arithmetic
def abs(e: Column): Column
def abs(columnName: String): Column
def negate(e: Column): Column
def sqrt(e: Column): Column
def sqrt(columnName: String): Column
def cbrt(e: Column): Column
def pow(l: Column, r: Column): Column
def pow(l: Column, r: Double): Column
def pow(columnName: String, p: Double): Column
def power(l: Column, r: Column): Column
// Rounding functions
def round(e: Column): Column
def round(e: Column, scale: Int): Column
def round(columnName: String): Column
def round(columnName: String, scale: Int): Column
def bround(e: Column): Column
def bround(e: Column, scale: Int): Column
def ceil(e: Column): Column
def ceil(columnName: String): Column
def ceiling(e: Column): Column
def floor(e: Column): Column
def floor(columnName: String): Column
def rint(e: Column): Column
def rint(columnName: String): Column
// Trigonometric functions
def sin(e: Column): Column
def sin(columnName: String): Column
def cos(e: Column): Column
def cos(columnName: String): Column
def tan(e: Column): Column
def tan(columnName: String): Column
def asin(e: Column): Column
def asin(columnName: String): Column
def acos(e: Column): Column
def acos(columnName: String): Column
def atan(e: Column): Column
def atan(columnName: String): Column
def atan2(l: Column, r: Column): Column
def atan2(l: Column, r: Double): Column
def atan2(columnName1: String, columnName2: String): Column
def sinh(e: Column): Column
def cosh(e: Column): Column
def tanh(e: Column): Column
// Logarithmic and exponential
def log(e: Column): Column
def log(columnName: String): Column
def log(base: Double, e: Column): Column
def log(base: Double, columnName: String): Column
def log10(e: Column): Column
def log10(columnName: String): Column
def log1p(e: Column): Column
def log1p(columnName: String): Column
def log2(e: Column): Column
def exp(e: Column): Column
def exp(columnName: String): Column
def expm1(e: Column): Column
def expm1(columnName: String): Column
// Angle conversion
def degrees(e: Column): Column
def degrees(columnName: String): Column
def radians(e: Column): Column
def radians(columnName: String): Column
def toDegrees(e: Column): Column
def toRadians(e: Column): Column
// Special mathematical functions
def factorial(e: Column): Column
def signum(e: Column): Column
def signum(columnName: String): Column
// Bitwise operations
def bitwiseNOT(e: Column): Column
def shiftLeft(e: Column, numBits: Int): Column
def shiftRight(e: Column, numBits: Int): Column
def shiftRightUnsigned(e: Column, numBits: Int): Column
// Random functions
def rand(): Column
def rand(seed: Long): Column
def randn(): Column
def randn(seed: Long): Column// Array creation
def array(cols: Column*): Column
def array_repeat(e: Column, count: Int): Column
def array_repeat(left: Column, right: Column): Column
// Array access and properties
def size(e: Column): Column
def array_min(e: Column): Column
def array_max(e: Column): Column
def array_position(column: Column, value: Any): Column
def element_at(column: Column, extraction: Any): Column
// Array testing
def array_contains(column: Column, value: Any): Column
def arrays_overlap(a1: Column, a2: Column): Column
// Array transformations
def array_distinct(e: Column): Column
def array_remove(column: Column, element: Column): Column
def array_sort(e: Column): Column
def array_union(col1: Column, col2: Column): Column
def array_intersect(col1: Column, col2: Column): Column
def array_except(col1: Column, col2: Column): Column
def array_join(column: Column, delimiter: String): Column
def array_join(column: Column, delimiter: String, nullReplacement: String): Column
def reverse(e: Column): Column
def shuffle(e: Column): Column
def slice(x: Column, start: Int, length: Int): Column
def slice(x: Column, start: Column, length: Column): Column
def sort_array(e: Column): Column
def sort_array(e: Column, asc: Boolean): Column
// Array aggregation
def array_agg(e: Column): Column
def flatten(e: Column): Column
def sequence(start: Column, stop: Column): Column
def sequence(start: Column, stop: Column, step: Column): Column
// Array explosion
def explode(e: Column): Column
def explode_outer(e: Column): Column
def posexplode(e: Column): Column
def posexplode_outer(e: Column): Column
// Array generation
def array_zip(e: Column*): Column
def arrays_zip(e: Column*): Column// Map creation
def map(cols: Column*): Column
def map_from_arrays(keys: Column, values: Column): Column
def map_from_entries(e: Column): Column
// Map access
def map_keys(e: Column): Column
def map_values(e: Column): Column
def map_entries(e: Column): Column
def map_concat(cols: Column*): Column
// Map filtering and transformation
def map_filter(expr: Column, f: (Column, Column) => Column): Column
def transform_keys(expr: Column, f: (Column, Column) => Column): Column
def transform_values(expr: Column, f: (Column, Column) => Column): Column
// Map explosion
def explode(e: Column): Column
def explode_outer(e: Column): Column
def posexplode(e: Column): Column
def posexplode_outer(e: Column): Column// Struct creation
def struct(cols: Column*): Column
def named_struct(cols: Column*): Column
// JSON operations
def from_json(e: Column, schema: StructType): Column
def from_json(e: Column, schema: String): Column
def from_json(e: Column, schema: Column): Column
def from_json(e: Column, schema: DataType): Column
def from_json(e: Column, schema: StructType, options: Map[String, String]): Column
def to_json(e: Column): Column
def to_json(e: Column, options: Map[String, String]): Column
def json_object_keys(json: Column): Column
def json_array_length(jsonArray: Column): Column
def json_tuple(json: Column, fields: String*): Column
def get_json_object(e: Column, path: String): Column
// Schema operations
def schema_of_json(json: Column): Column
def schema_of_json(json: String): Column
def schema_of_csv(csv: Column): Column
def schema_of_csv(csv: String): Column// Conditional expressions
def when(condition: Column, value: Any): Column
def coalesce(e: Column*): Column
def greatest(exprs: Column*): Column
def greatest(columnName: String, columnNames: String*): Column
def least(exprs: Column*): Column
def least(columnName: String, columnNames: String*): Column
// Null handling
def isnull(e: Column): Column
def isnan(e: Column): Column
def nanvl(col1: Column, col2: Column): Column
def nvl(col1: Column, col2: Column): Column
def nvl2(col1: Column, col2: Column, col3: Column): Column
def ifnull(col1: Column, col2: Column): Column
def nullif(col1: Column, col2: Column): Column// Ranking functions
def row_number(): Column
def rank(): Column
def dense_rank(): Column
def percent_rank(): Column
def ntile(n: Int): Column
def cume_dist(): Column
// Offset functions
def lag(e: Column, offset: Int): Column
def lag(e: Column, offset: Int, defaultValue: Any): Column
def lag(columnName: String, offset: Int): Column
def lag(columnName: String, offset: Int, defaultValue: Any): Column
def lead(e: Column, offset: Int): Column
def lead(e: Column, offset: Int, defaultValue: Any): Column
def lead(columnName: String, offset: Int): Column
def lead(columnName: String, offset: Int, defaultValue: Any): Column
// Value functions
def first_value(e: Column): Column
def first_value(e: Column, ignoreNulls: Boolean): Column
def last_value(e: Column): Column
def last_value(e: Column, ignoreNulls: Boolean): Column
def nth_value(e: Column, n: Int): Column
def nth_value(e: Column, n: Int, ignoreNulls: Boolean): Column// Type casting
def cast(e: Column, dataType: DataType): Column
def cast(columnName: String, dataType: String): Column
// Binary operations
def bin(e: Column): Column
def hex(column: Column): Column
def unhex(column: Column): Column
def md5(e: Column): Column
def sha1(e: Column): Column
def sha2(e: Column, numBits: Int): Column
def hash(cols: Column*): Column
def xxhash64(cols: Column*): Column
def crc32(e: Column): Columnimport org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().appName("Functions Demo").getOrCreate()
import spark.implicits._
// Sample data
val df = Seq(
("Alice", 25, 50000.0, "2023-01-15"),
("Bob", 30, 60000.0, "2023-02-20"),
("Charlie", 35, 70000.0, "2023-03-10")
).toDF("name", "age", "salary", "hire_date")
// String functions
val stringOps = df.select(
upper($"name").as("upper_name"),
length($"name").as("name_length"),
concat($"name", lit(" - "), $"age").as("name_age"),
regexp_replace($"name", "a", "@").as("replaced_name")
)
// Mathematical functions
val mathOps = df.select(
$"name",
abs($"age" - 30).as("age_diff_from_30"),
round($"salary" / 1000, 2).as("salary_in_k"),
pow($"age", 2).as("age_squared"),
sqrt($"salary").as("sqrt_salary")
)
// Date functions
val dateOps = df.select(
$"name",
to_date($"hire_date").as("hire_date_parsed"),
year(to_date($"hire_date")).as("hire_year"),
months_between(current_date(), to_date($"hire_date")).as("months_employed"),
date_add(to_date($"hire_date"), 365).as("one_year_later")
)// Basic aggregations
val basicStats = df.agg(
count($"name").as("total_employees"),
avg($"salary").as("avg_salary"),
min($"age").as("min_age"),
max($"salary").as("max_salary"),
stddev($"salary").as("salary_stddev")
)
// Grouped aggregations
val groupedStats = df.groupBy($"age" > 30)
.agg(
count($"name").as("count"),
sum($"salary").as("total_salary"),
avg($"salary").as("avg_salary"),
collect_list($"name").as("names"),
first($"name").as("first_name"),
last($"name").as("last_name")
)
// Statistical functions
val correlationStats = df.select(
corr($"age", $"salary").as("age_salary_correlation"),
covar_samp($"age", $"salary").as("age_salary_covariance")
)// Sample data with arrays
val arrayData = Seq(
("Alice", Array("reading", "swimming", "cooking")),
("Bob", Array("hiking", "reading", "gaming")),
("Charlie", Array("cooking", "gaming"))
).toDF("name", "hobbies")
// Array operations
val arrayOps = arrayData.select(
$"name",
$"hobbies",
size($"hobbies").as("hobby_count"),
array_contains($"hobbies", "reading").as("likes_reading"),
array_sort($"hobbies").as("sorted_hobbies"),
array_distinct($"hobbies").as("unique_hobbies")
)
// Explode arrays
val exploded = arrayData.select(
$"name",
explode($"hobbies").as("hobby")
)
// Map operations
val mapData = Seq(
("Alice", Map("skill1" -> "Java", "skill2" -> "Scala")),
("Bob", Map("skill1" -> "Python", "skill2" -> "SQL"))
).toDF("name", "skills")
val mapOps = mapData.select(
$"name",
map_keys($"skills").as("skill_names"),
map_values($"skills").as("skill_values"),
$"skills".getItem("skill1").as("primary_skill")
)import org.apache.spark.sql.expressions.Window
// Window specifications
val windowSpec = Window.partitionBy($"age" > 30).orderBy($"salary".desc)
val unboundedWindow = Window.partitionBy($"age" > 30)
.orderBy($"salary".desc)
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
// Window function usage
val windowedData = df.select(
$"name",
$"age",
$"salary",
row_number().over(windowSpec).as("salary_rank"),
rank().over(windowSpec).as("salary_rank_with_ties"),
dense_rank().over(windowSpec).as("salary_dense_rank"),
percent_rank().over(windowSpec).as("salary_percentile"),
ntile(3).over(windowSpec).as("salary_tertile"),
lag($"salary", 1).over(windowSpec).as("previous_salary"),
lead($"salary", 1).over(windowSpec).as("next_salary"),
sum($"salary").over(unboundedWindow).as("running_total_salary"),
avg($"salary").over(unboundedWindow).as("running_avg_salary")
)// Sample data with nulls
val dataWithNulls = Seq(
("Alice", Some(25), Some(50000.0)),
("Bob", None, Some(60000.0)),
("Charlie", Some(35), None)
).toDF("name", "age", "salary")
// Conditional operations
val conditionalOps = dataWithNulls.select(
$"name",
when($"age".isNull, "Unknown").otherwise($"age").as("age_with_default"),
when($"salary" > 55000, "High")
.when($"salary" > 45000, "Medium")
.otherwise("Low").as("salary_category"),
coalesce($"salary", lit(40000.0)).as("salary_with_default"),
nvl($"age", lit(0)).as("age_nvl"),
nvl2($"salary", "Has Salary", "No Salary").as("salary_status"),
greatest($"age", lit(30)).as("age_or_30"),
least($"age", lit(30)).as("min_age_30")
)
// Null checking
val nullChecks = dataWithNulls.select(
$"name",
isnull($"age").as("age_is_null"),
$"age".isNull.as("age_is_null_method"),
$"salary".isNotNull.as("salary_not_null")
)// JSON operations
val jsonData = Seq(
"""{"name": "Alice", "details": {"age": 25, "city": "Seattle"}}""",
"""{"name": "Bob", "details": {"age": 30, "city": "Portland"}}"""
).toDF("json_str")
val jsonSchema = new StructType()
.add("name", StringType)
.add("details", new StructType()
.add("age", IntegerType)
.add("city", StringType))
val parsedJson = jsonData.select(
from_json($"json_str", jsonSchema).as("data")
).select(
$"data.name",
$"data.details.age",
$"data.details.city"
)
// Complex transformations with multiple functions
val complexTransforms = df.select(
$"name",
// String manipulation
initcap($"name").as("proper_name"),
lpad($"name", 10, "*").as("padded_name"),
// Mathematical computations
round($"salary" / 12, 0).as("monthly_salary"),
// Conditional logic with math
when($"age" < 30, $"salary" * 0.1)
.when($"age" < 40, $"salary" * 0.15)
.otherwise($"salary" * 0.2).as("bonus"),
// Date calculations
datediff(current_date(), to_date($"hire_date")).as("days_employed"),
// Hash functions for data integrity
md5(concat($"name", $"age", $"salary")).as("record_hash")
)// Combining multiple functions for complex logic
val advancedTransforms = df.select(
$"name",
$"age",
$"salary",
// Performance rating based on multiple criteria
when($"age" > 32 && $"salary" > 65000, "Senior High Performer")
.when($"age" > 28 && $"salary" > 55000, "Mid-Level Performer")
.when($"salary" > 50000, "Junior High Performer")
.otherwise("Developing").as("performance_category"),
// Salary percentile within age group
percent_rank().over(Window.partitionBy(
when($"age" < 30, "Young")
.when($"age" < 40, "Mid")
.otherwise("Senior")
).orderBy($"salary")).as("salary_percentile_in_age_group"),
// Complex string formatting
format_string(
"Employee %s (age %d) earns $%.2f annually",
$"name", $"age", $"salary"
).as("employee_summary"),
// Encoded identifier
base64(
concat($"name", lit("_"), $"age", lit("_"),
date_format(current_date(), "yyyyMM"))
).as("encoded_id")
)