0
# Column Operations and Functions
1
2
Column expressions and built-in SQL functions for data transformation. Provides both operator overloading and function-based APIs for creating complex expressions, with support for all SQL operations and advanced analytics functions.
3
4
## Capabilities
5
6
### Column Class
7
8
Represents a column in a DataFrame that can be composed into complex expressions.
9
10
```scala { .api }
11
/**
12
* A column in a DataFrame that can be composed into complex expressions
13
*/
14
class Column {
15
/** Equality comparison (SQL-safe null handling) */
16
def ===(other: Any): Column
17
def =!=(other: Any): Column
18
19
/** Numeric comparison */
20
def >(other: Any): Column
21
def >=(other: Any): Column
22
def <(other: Any): Column
23
def <=(other: Any): Column
24
25
/** Arithmetic operations */
26
def +(other: Any): Column
27
def -(other: Any): Column
28
def *(other: Any): Column
29
def /(other: Any): Column
30
def %(other: Any): Column
31
32
/** Logical operations */
33
def &&(other: Column): Column
34
def ||(other: Column): Column
35
def unary_!(): Column
36
37
/** Null handling */
38
def isNull: Column
39
def isNotNull: Column
40
def isNaN: Column
41
42
/** String operations */
43
def startsWith(other: Column): Column
44
def startsWith(literal: String): Column
45
def endsWith(other: Column): Column
46
def endsWith(literal: String): Column
47
def contains(other: Column): Column
48
def contains(literal: String): Column
49
def like(literal: String): Column
50
def rlike(literal: String): Column
51
52
/** Type operations */
53
def cast(to: DataType): Column
54
def cast(to: String): Column
55
56
/** Alias operations */
57
def as(alias: String): Column
58
def as(alias: Symbol): Column
59
def alias(alias: String): Column
60
61
/** Sorting */
62
def asc: Column
63
def asc_nulls_first: Column
64
def asc_nulls_last: Column
65
def desc: Column
66
def desc_nulls_first: Column
67
def desc_nulls_last: Column
68
69
/** Window operations */
70
def over(window: WindowSpec): Column
71
72
/** Collection operations */
73
def getItem(key: Any): Column
74
def getField(fieldName: String): Column
75
76
/** Boolean operations for arrays */
77
def isin(list: Any*): Column
78
}
79
```
80
81
### TypedColumn
82
83
Column with type information for input and output types.
84
85
```scala { .api }
86
/**
87
* Column with type information
88
* @tparam T Input type
89
* @tparam U Output type
90
*/
91
class TypedColumn[T, U] extends Column {
92
/** Apply to Dataset */
93
def apply(dataset: Dataset[T]): Dataset[U]
94
}
95
```
96
97
### Core Functions
98
99
Essential functions for column creation and manipulation.
100
101
```scala { .api }
102
object functions {
103
/** Create column from column name */
104
def col(colName: String): Column
105
def column(colName: String): Column
106
107
/** Create literal column */
108
def lit(literal: Any): Column
109
def typedLit[T : TypeTag](literal: T): Column
110
111
/** Conditional expressions */
112
def when(condition: Column, value: Any): Column
113
def coalesce(e: Column*): Column
114
def greatest(e: Column*): Column
115
def least(e: Column*): Column
116
def isnull(e: Column): Column
117
def isnan(e: Column): Column
118
def nanvl(col1: Column, col2: Column): Column
119
}
120
```
121
122
**Usage Examples:**
123
124
```scala
125
import org.apache.spark.sql.functions._
126
127
val df = spark.table("employees")
128
129
// Column operations
130
val filtered = df.filter(col("age") > 25 && col("department") === "Engineering")
131
val calculated = df.withColumn("bonus", col("salary") * 0.1)
132
val sorted = df.orderBy(col("salary").desc, col("name").asc)
133
134
// Conditional logic
135
val categorized = df.withColumn("category",
136
when(col("age") < 30, "Young")
137
.when(col("age") < 50, "Middle")
138
.otherwise("Senior")
139
)
140
141
// Null handling
142
val cleaned = df.filter(col("email").isNotNull && col("phone").isNotNull)
143
val filled = df.withColumn("phone", coalesce(col("phone"), lit("N/A")))
144
```
145
146
### Aggregate Functions
147
148
Statistical and aggregate functions for data analysis.
149
150
```scala { .api }
151
object functions {
152
/** Count functions */
153
def count(e: Column): Column
154
def countDistinct(expr: Column, exprs: Column*): Column
155
def approx_count_distinct(e: Column): Column
156
def approx_count_distinct(e: Column, rsd: Double): Column
157
158
/** Basic aggregates */
159
def sum(e: Column): Column
160
def sumDistinct(e: Column): Column
161
def avg(e: Column): Column
162
def mean(e: Column): Column
163
def min(e: Column): Column
164
def max(e: Column): Column
165
def first(e: Column): Column
166
def first(e: Column, ignoreNulls: Boolean): Column
167
def last(e: Column): Column
168
def last(e: Column, ignoreNulls: Boolean): Column
169
170
/** Statistical functions */
171
def stddev(e: Column): Column
172
def stddev_pop(e: Column): Column
173
def stddev_samp(e: Column): Column
174
def variance(e: Column): Column
175
def var_pop(e: Column): Column
176
def var_samp(e: Column): Column
177
def skewness(e: Column): Column
178
def kurtosis(e: Column): Column
179
def corr(column1: Column, column2: Column): Column
180
def covar_pop(column1: Column, column2: Column): Column
181
def covar_samp(column1: Column, column2: Column): Column
182
183
/** Collection aggregates */
184
def collect_list(e: Column): Column
185
def collect_set(e: Column): Column
186
}
187
```
188
189
### Mathematical Functions
190
191
Mathematical operations and functions.
192
193
```scala { .api }
194
object functions {
195
/** Basic math */
196
def abs(e: Column): Column
197
def negate(e: Column): Column
198
def sqrt(e: Column): Column
199
def cbrt(e: Column): Column
200
201
/** Power and logarithmic */
202
def pow(l: Column, r: Column): Column
203
def pow(l: Column, r: Double): Column
204
def exp(e: Column): Column
205
def expm1(e: Column): Column
206
def log(e: Column): Column
207
def log(base: Double, e: Column): Column
208
def log10(e: Column): Column
209
def log1p(e: Column): Column
210
def log2(e: Column): Column
211
212
/** Trigonometric */
213
def sin(e: Column): Column
214
def cos(e: Column): Column
215
def tan(e: Column): Column
216
def asin(e: Column): Column
217
def acos(e: Column): Column
218
def atan(e: Column): Column
219
def atan2(y: Column, x: Column): Column
220
def sinh(e: Column): Column
221
def cosh(e: Column): Column
222
def tanh(e: Column): Column
223
224
/** Rounding */
225
def round(e: Column): Column
226
def round(e: Column, scale: Int): Column
227
def bround(e: Column): Column
228
def bround(e: Column, scale: Int): Column
229
def ceil(e: Column): Column
230
def floor(e: Column): Column
231
232
/** Random */
233
def rand(): Column
234
def rand(seed: Long): Column
235
def randn(): Column
236
def randn(seed: Long): Column
237
}
238
```
239
240
### String Functions
241
242
String manipulation and processing functions.
243
244
```scala { .api }
245
object functions {
246
/** String operations */
247
def length(e: Column): Column
248
def char_length(e: Column): Column
249
def bit_length(e: Column): Column
250
def octet_length(e: Column): Column
251
252
/** Case conversion */
253
def upper(e: Column): Column
254
def lower(e: Column): Column
255
def initcap(e: Column): Column
256
257
/** Trimming */
258
def trim(e: Column): Column
259
def ltrim(e: Column): Column
260
def rtrim(e: Column): Column
261
262
/** Padding */
263
def lpad(str: Column, len: Int, pad: String): Column
264
def rpad(str: Column, len: Int, pad: String): Column
265
266
/** Concatenation */
267
def concat(exprs: Column*): Column
268
def concat_ws(sep: String, exprs: Column*): Column
269
270
/** Substring operations */
271
def substring(str: Column, pos: Int, len: Int): Column
272
def substring_index(str: Column, delim: String, count: Int): Column
273
def left(str: Column, len: Int): Column
274
def right(str: Column, len: Int): Column
275
276
/** Pattern matching */
277
def regexp_extract(e: Column, exp: String, groupIdx: Int): Column
278
def regexp_replace(e: Column, pattern: String, replacement: String): Column
279
def split(str: Column, pattern: String): Column
280
def split(str: Column, pattern: String, limit: Int): Column
281
282
/** String search */
283
def instr(str: Column, substring: String): Column
284
def locate(substr: String, str: Column): Column
285
def locate(substr: String, str: Column, pos: Int): Column
286
287
/** Encoding */
288
def ascii(e: Column): Column
289
def base64(e: Column): Column
290
def unbase64(e: Column): Column
291
def encode(value: Column, charset: String): Column
292
def decode(value: Column, charset: String): Column
293
294
/** Format */
295
def format_number(x: Column, d: Int): Column
296
def format_string(format: String, arguments: Column*): Column
297
}
298
```
299
300
### Date and Time Functions
301
302
Date, time, and timestamp manipulation functions.
303
304
```scala { .api }
305
object functions {
306
/** Current date/time */
307
def current_date(): Column
308
def current_timestamp(): Column
309
def now(): Column
310
def unix_timestamp(): Column
311
def unix_timestamp(s: Column): Column
312
def unix_timestamp(s: Column, p: String): Column
313
314
/** Date/time conversion */
315
def to_date(e: Column): Column
316
def to_date(e: Column, fmt: String): Column
317
def to_timestamp(s: Column): Column
318
def to_timestamp(s: Column, fmt: String): Column
319
def from_unixtime(ut: Column): Column
320
def from_unixtime(ut: Column, f: String): Column
321
322
/** Date arithmetic */
323
def date_add(start: Column, days: Int): Column
324
def date_sub(start: Column, days: Int): Column
325
def datediff(end: Column, start: Column): Column
326
def months_between(end: Column, start: Column): Column
327
def months_between(end: Column, start: Column, roundOff: Boolean): Column
328
def add_months(start: Column, months: Int): Column
329
def next_day(date: Column, dayOfWeek: String): Column
330
def last_day(e: Column): Column
331
332
/** Date/time extraction */
333
def year(e: Column): Column
334
def quarter(e: Column): Column
335
def month(e: Column): Column
336
def dayofmonth(e: Column): Column
337
def dayofweek(e: Column): Column
338
def dayofyear(e: Column): Column
339
def hour(e: Column): Column
340
def minute(e: Column): Column
341
def second(e: Column): Column
342
def weekofyear(e: Column): Column
343
344
/** Date/time formatting */
345
def date_format(dateExpr: Column, format: String): Column
346
def from_utc_timestamp(ts: Column, tz: String): Column
347
def to_utc_timestamp(ts: Column, tz: String): Column
348
349
/** Time window */
350
def window(timeColumn: Column, windowDuration: String): Column
351
def window(timeColumn: Column, windowDuration: String, slideDuration: String): Column
352
def window(timeColumn: Column, windowDuration: String, slideDuration: String, startTime: String): Column
353
}
354
```
355
356
### Array and Map Functions
357
358
Functions for working with complex data types.
359
360
```scala { .api }
361
object functions {
362
/** Array creation */
363
def array(cols: Column*): Column
364
def array(colName: String, colNames: String*): Column
365
def split(str: Column, pattern: String): Column
366
367
/** Array operations */
368
def array_contains(column: Column, value: Any): Column
369
def array_distinct(e: Column): Column
370
def array_except(col1: Column, col2: Column): Column
371
def array_intersect(col1: Column, col2: Column): Column
372
def array_join(column: Column, delimiter: String): Column
373
def array_join(column: Column, delimiter: String, nullReplacement: String): Column
374
def array_max(e: Column): Column
375
def array_min(e: Column): Column
376
def array_position(column: Column, value: Any): Column
377
def array_remove(column: Column, element: Any): Column
378
def array_repeat(e: Column, count: Int): Column
379
def array_sort(e: Column): Column
380
def array_union(col1: Column, col2: Column): Column
381
def arrays_overlap(a1: Column, a2: Column): Column
382
def arrays_zip(e: Column*): Column
383
def concat(exprs: Column*): Column
384
def element_at(column: Column, extraction: Any): Column
385
def explode(e: Column): Column
386
def explode_outer(e: Column): Column
387
def flatten(e: Column): Column
388
def posexplode(e: Column): Column
389
def posexplode_outer(e: Column): Column
390
def reverse(e: Column): Column
391
def sequence(start: Column, stop: Column): Column
392
def sequence(start: Column, stop: Column, step: Column): Column
393
def shuffle(e: Column): Column
394
def size(e: Column): Column
395
def slice(x: Column, start: Int, length: Int): Column
396
def sort_array(e: Column): Column
397
def sort_array(e: Column, asc: Boolean): Column
398
399
/** Map creation */
400
def map(cols: Column*): Column
401
def map_from_arrays(keys: Column, values: Column): Column
402
def map_from_entries(e: Column): Column
403
404
/** Map operations */
405
def map_concat(cols: Column*): Column
406
def map_entries(e: Column): Column
407
def map_filter(expr: Column, f: (Column, Column) => Column): Column
408
def map_keys(e: Column): Column
409
def map_values(e: Column): Column
410
def map_zip_with(left: Column, right: Column, f: (Column, Column, Column) => Column): Column
411
412
/** Struct operations */
413
def struct(cols: Column*): Column
414
def struct(colName: String, colNames: String*): Column
415
}
416
```
417
418
### Window Functions
419
420
Analytics functions for window-based calculations.
421
422
```scala { .api }
423
object functions {
424
/** Ranking functions */
425
def row_number(): Column
426
def rank(): Column
427
def dense_rank(): Column
428
def percent_rank(): Column
429
def ntile(n: Int): Column
430
def cume_dist(): Column
431
432
/** Analytic functions */
433
def lag(e: Column, offset: Int): Column
434
def lag(e: Column, offset: Int, defaultValue: Any): Column
435
def lead(e: Column, offset: Int): Column
436
def lead(e: Column, offset: Int, defaultValue: Any): Column
437
def first_value(e: Column): Column
438
def first_value(e: Column, ignoreNulls: Boolean): Column
439
def last_value(e: Column): Column
440
def last_value(e: Column, ignoreNulls: Boolean): Column
441
def nth_value(e: Column, n: Int): Column
442
def nth_value(e: Column, n: Int, ignoreNulls: Boolean): Column
443
}
444
445
/**
446
* Window specification for window functions
447
*/
448
class WindowSpec {
449
def partitionBy(colNames: String*): WindowSpec
450
def partitionBy(cols: Column*): WindowSpec
451
def orderBy(colNames: String*): WindowSpec
452
def orderBy(cols: Column*): WindowSpec
453
def rowsBetween(start: Long, end: Long): WindowSpec
454
def rangeBetween(start: Long, end: Long): WindowSpec
455
}
456
457
object Window {
458
/** Create unbounded window */
459
def partitionBy(colNames: String*): WindowSpec
460
def partitionBy(cols: Column*): WindowSpec
461
def orderBy(colNames: String*): WindowSpec
462
def orderBy(cols: Column*): WindowSpec
463
464
/** Window frame boundaries */
465
val unboundedPreceding: Long
466
val unboundedFollowing: Long
467
val currentRow: Long
468
}
469
```
470
471
**Usage Examples:**
472
473
```scala
474
import org.apache.spark.sql.functions._
475
import org.apache.spark.sql.expressions.Window
476
477
val sales = spark.table("sales")
478
479
// Window functions
480
val windowSpec = Window.partitionBy("region").orderBy(col("date"))
481
482
val salesWithRank = sales.withColumn("rank",
483
row_number().over(windowSpec)
484
)
485
486
val salesWithLag = sales.withColumn("prev_amount",
487
lag(col("amount"), 1).over(windowSpec)
488
)
489
490
// Running totals
491
val runningTotal = sales.withColumn("running_total",
492
sum(col("amount")).over(windowSpec.rowsBetween(Window.unboundedPreceding, Window.currentRow))
493
)
494
495
// Moving averages
496
val movingAvg = sales.withColumn("moving_avg_3",
497
avg(col("amount")).over(windowSpec.rowsBetween(-2, 0))
498
)
499
```