# Expressions

Catalyst's expression system provides a tree-based representation for all SQL operations, functions, and computations. The expression framework supports type-safe evaluation, code generation, and optimization transformations essential for query processing.
3
4
## Core Imports
5
6
```scala
7
import org.apache.spark.sql.catalyst.expressions._
8
import org.apache.spark.sql.catalyst.InternalRow
9
import org.apache.spark.sql.types._
10
```
11
12
## Expression Hierarchy
13
14
### Base Expression Class
15
16
```scala { .api }
17
abstract class Expression extends TreeNode[Expression] {
18
def dataType: DataType
19
def nullable: Boolean
20
def eval(input: InternalRow): Any
21
def genCode(ctx: CodegenContext): ExprCode
22
def children: Seq[Expression]
23
def references: AttributeSet
24
def prettyName: String
25
def sql: String
26
def semanticEquals(other: Expression): Boolean
27
def deterministic: Boolean
28
def foldable: Boolean
29
def semanticHash(): Int
30
}
31
```
32
33
The base `Expression` class provides evaluation capabilities, type information, and code generation support for all SQL expressions.
34
35
### Expression Categories by Arity
36
37
#### LeafExpression
38
39
```scala { .api }
40
abstract class LeafExpression extends Expression {
41
final override def children: Seq[Expression] = Nil
42
}
43
```
44
45
Expressions with no child expressions, such as literals and column references.
46
47
#### UnaryExpression
48
49
```scala { .api }
50
abstract class UnaryExpression extends Expression {
51
def child: Expression
52
final override def children: Seq[Expression] = child :: Nil
53
}
54
```
55
56
Expressions with one child expression, such as mathematical functions and type casts.
57
58
#### BinaryExpression
59
60
```scala { .api }
61
abstract class BinaryExpression extends Expression {
62
def left: Expression
63
def right: Expression
64
final override def children: Seq[Expression] = Seq(left, right)
65
}
66
```
67
68
Expressions with two child expressions, such as arithmetic and comparison operators.
69
70
#### TernaryExpression
71
72
```scala { .api }
73
abstract class TernaryExpression extends Expression {
74
def first: Expression
75
def second: Expression
76
def third: Expression
77
final override def children: Seq[Expression] = Seq(first, second, third)
78
}
79
```
80
81
Expressions with three child expressions, such as conditional expressions and substring operations.
82
83
## Core Expression Types
84
85
### Literals and References
86
87
```scala { .api }
88
case class Literal(value: Any, dataType: DataType) extends LeafExpression {
89
def this(v: Any) = this(v, Literal.inferType(v))
90
}
91
92
object Literal {
93
def apply(v: Any): Literal
94
def create(v: Any, dataType: DataType): Literal
95
def default(dataType: DataType): Literal
96
def fromObject(obj: Any): Literal
97
val TrueLiteral: Literal
98
val FalseLiteral: Literal
99
}
100
101
case class AttributeReference(
102
name: String,
103
dataType: DataType,
104
nullable: Boolean = true,
105
metadata: Metadata = Metadata.empty
106
)(val exprId: ExprId = NamedExpression.newExprId,
107
val qualifier: Seq[String] = Seq.empty) extends Attribute
108
```
109
110
**Usage Example:**
111
```scala
112
// Create literals
113
val intLit = Literal(42)
114
val stringLit = Literal("hello")
115
val nullLit = Literal.create(null, StringType)
116
117
// Create attribute references
118
val nameAttr = AttributeReference("name", StringType, nullable = false)()
119
val ageAttr = AttributeReference("age", IntegerType, nullable = true)()
120
```
121
122
### Arithmetic Expressions
123
124
```scala { .api }
125
case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
126
override def inputType: AbstractDataType = NumericType
127
protected override def nullSafeEval(input1: Any, input2: Any): Any
128
}
129
130
case class Subtract(left: Expression, right: Expression) extends BinaryArithmetic
131
case class Multiply(left: Expression, right: Expression) extends BinaryArithmetic
132
case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
133
case class Remainder(left: Expression, right: Expression) extends BinaryArithmetic
134
135
case class UnaryMinus(child: Expression) extends UnaryArithmetic
136
case class UnaryPlus(child: Expression) extends UnaryArithmetic
137
case class Abs(child: Expression) extends UnaryMathExpression
138
```
139
140
**Usage Example:**
141
```scala
142
val col1 = AttributeReference("a", IntegerType, false)()
143
val col2 = AttributeReference("b", IntegerType, false)()
144
145
val addExpr = Add(col1, col2)
146
val subtractExpr = Subtract(col1, Literal(10))
147
val multiplyExpr = Multiply(col1, col2)
148
val absExpr = Abs(UnaryMinus(col1))
149
```
150
151
### Comparison Expressions
152
153
```scala { .api }
154
case class EqualTo(left: Expression, right: Expression) extends BinaryComparison
155
case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComparison
156
case class LessThan(left: Expression, right: Expression) extends BinaryComparison
157
case class LessThanOrEqual(left: Expression, right: Expression) extends BinaryComparison
158
case class GreaterThan(left: Expression, right: Expression) extends BinaryComparison
159
case class GreaterThanOrEqual(left: Expression, right: Expression) extends BinaryComparison
160
161
case class In(value: Expression, list: Seq[Expression]) extends Predicate
162
case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with Predicate
163
```
164
165
**Usage Example:**
166
```scala
167
val nameCol = AttributeReference("name", StringType, false)()
168
val ageCol = AttributeReference("age", IntegerType, true)()
169
170
val equalExpr = EqualTo(nameCol, Literal("Alice"))
171
val rangeExpr = And(GreaterThan(ageCol, Literal(18)), LessThan(ageCol, Literal(65)))
172
val inExpr = In(nameCol, Seq(Literal("Alice"), Literal("Bob"), Literal("Charlie")))
173
```
174
175
### Logical Expressions
176
177
```scala { .api }
178
case class And(left: Expression, right: Expression) extends BinaryExpression with Predicate {
179
override def dataType: DataType = BooleanType
180
}
181
182
case class Or(left: Expression, right: Expression) extends BinaryExpression with Predicate {
183
override def dataType: DataType = BooleanType
184
}
185
186
case class Not(child: Expression) extends UnaryExpression with Predicate {
187
override def dataType: DataType = BooleanType
188
}
189
190
case class IsNull(child: Expression) extends UnaryExpression with Predicate
191
case class IsNotNull(child: Expression) extends UnaryExpression with Predicate
192
```
193
194
**Usage Example:**
195
```scala
196
val nameCol = AttributeReference("name", StringType, true)()
197
val ageCol = AttributeReference("age", IntegerType, true)()
198
199
val notNullName = IsNotNull(nameCol)
200
val validAge = And(GreaterThan(ageCol, Literal(0)), LessThan(ageCol, Literal(150)))
201
val condition = And(notNullName, validAge)
202
```
203
204
### String Expressions
205
206
```scala { .api }
207
case class Upper(child: Expression) extends UnaryExpression with ImplicitCastInputTypes
208
case class Lower(child: Expression) extends UnaryExpression with ImplicitCastInputTypes
209
case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes
210
211
case class Substring(str: Expression, pos: Expression, len: Expression) extends TernaryExpression with ImplicitCastInputTypes
212
213
case class Concat(children: Seq[Expression]) extends Expression with ImplicitCastInputTypes
214
215
case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expression) extends TernaryExpression with ImplicitCastInputTypes
216
217
case class StartsWith(left: Expression, right: Expression) extends BinaryExpression with Predicate
218
case class EndsWith(left: Expression, right: Expression) extends BinaryExpression with Predicate
219
case class Contains(left: Expression, right: Expression) extends BinaryExpression with Predicate
220
```
221
222
**Usage Example:**
223
```scala
224
val nameCol = AttributeReference("name", StringType, false)()
225
val descCol = AttributeReference("description", StringType, true)()
226
227
val upperName = Upper(nameCol)
228
val nameLength = Length(nameCol)
229
val substring = Substring(nameCol, Literal(1), Literal(3))
230
val concat = Concat(Seq(nameCol, Literal(" - "), descCol))
231
val startsWithA = StartsWith(nameCol, Literal("A"))
232
```
233
234
### Date and Time Expressions
235
236
```scala { .api }
237
case class CurrentDate(timeZoneId: Option[String] = None) extends LeafExpression with ImplicitCastInputTypes
238
case class CurrentTimestamp() extends LeafExpression with ImplicitCastInputTypes
239
240
case class Year(child: Expression) extends UnaryExpression with ImplicitCastInputTypes
241
case class Month(child: Expression) extends UnaryExpression with ImplicitCastInputTypes
242
case class DayOfMonth(child: Expression) extends UnaryExpression with ImplicitCastInputTypes
243
case class Hour(child: Expression) extends UnaryExpression with ImplicitCastInputTypes
244
case class Minute(child: Expression) extends UnaryExpression with ImplicitCastInputTypes
245
case class Second(child: Expression) extends UnaryExpression with ImplicitCastInputTypes
246
247
case class DateAdd(startDate: Expression, days: Expression) extends BinaryExpression with ImplicitCastInputTypes
248
case class DateSub(startDate: Expression, days: Expression) extends BinaryExpression with ImplicitCastInputTypes
249
```
250
251
**Usage Example:**
252
```scala
253
val dateCol = AttributeReference("created_at", DateType, false)()
254
val timestampCol = AttributeReference("updated_at", TimestampType, true)()
255
256
val currentDate = CurrentDate()
257
val extractYear = Year(dateCol)
258
val extractMonth = Month(dateCol)
259
val addDays = DateAdd(dateCol, Literal(30))
260
val hourFromTimestamp = Hour(timestampCol)
261
```
262
263
### Conditional Expressions
264
265
```scala { .api }
266
case class If(predicate: Expression, trueValue: Expression, falseValue: Expression) extends TernaryExpression
267
268
case class CaseWhen(branches: Seq[(Expression, Expression)], elseValue: Option[Expression] = None) extends Expression
269
270
case class Coalesce(children: Seq[Expression]) extends Expression
271
272
case class Greatest(children: Seq[Expression]) extends Expression
273
case class Least(children: Seq[Expression]) extends Expression
274
```
275
276
**Usage Example:**
277
```scala
278
val ageCol = AttributeReference("age", IntegerType, true)()
279
val statusCol = AttributeReference("status", StringType, true)()
280
281
val ifExpr = If(GreaterThan(ageCol, Literal(18)), Literal("Adult"), Literal("Minor"))
282
283
val caseWhen = CaseWhen(Seq(
284
(EqualTo(statusCol, Literal("A")), Literal("Active")),
285
(EqualTo(statusCol, Literal("I")), Literal("Inactive"))
286
), Some(Literal("Unknown")))
287
288
val coalesce = Coalesce(Seq(statusCol, Literal("Default")))
289
```
290
291
### Cast and Type Conversion
292
293
```scala { .api }
294
case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String] = None) extends UnaryExpression with TimeZoneAwareExpression
295
296
case class CheckOverflow(child: Expression, dataType: DecimalType, nullOnOverflow: Boolean) extends UnaryExpression
297
298
case class PromotePrecision(child: Expression) extends UnaryExpression
299
```
300
301
**Usage Example:**
302
```scala
303
val stringCol = AttributeReference("value", StringType, false)()
304
val intCol = AttributeReference("count", IntegerType, false)()
305
306
val castToInt = Cast(stringCol, IntegerType)
307
val castToDecimal = Cast(intCol, DecimalType(10, 2))
308
val castToString = Cast(intCol, StringType)
309
```
310
311
### Aggregate Expressions
312
313
```scala { .api }
314
case class Sum(child: Expression) extends DeclarativeAggregate
315
case class Count(children: Seq[Expression]) extends DeclarativeAggregate
316
case class Average(child: Expression) extends DeclarativeAggregate
317
case class Min(child: Expression) extends DeclarativeAggregate
318
case class Max(child: Expression) extends DeclarativeAggregate
319
320
case class First(child: Expression, ignoreNulls: Expression) extends DeclarativeAggregate
321
case class Last(child: Expression, ignoreNulls: Expression) extends DeclarativeAggregate
322
323
case class CollectList(child: Expression) extends TypedImperativeAggregate[mutable.ArrayBuffer[Any]]
324
case class CollectSet(child: Expression) extends TypedImperativeAggregate[mutable.Set[Any]]
325
```
326
327
**Usage Example:**
328
```scala
329
val salaryCol = AttributeReference("salary", DecimalType(10, 2), true)()
330
val nameCol = AttributeReference("name", StringType, false)()
331
332
val sumSalary = Sum(salaryCol)
333
val countNames = Count(Seq(nameCol))
334
val avgSalary = Average(salaryCol)
335
val minSalary = Min(salaryCol)
336
val collectNames = CollectList(nameCol)
337
```
338
339
### Window Functions
340
341
```scala { .api }
342
case class RowNumber() extends RowNumberLike
343
case class Rank(children: Seq[Expression]) extends RankLike
344
case class DenseRank(children: Seq[Expression]) extends RankLike
345
346
case class Lead(input: Expression, offset: Expression, default: Expression) extends OffsetWindowFunction
347
case class Lag(input: Expression, offset: Expression, default: Expression) extends OffsetWindowFunction
348
349
case class FirstValue(child: Expression, ignoreNulls: Expression) extends AggregateWindowFunction
350
case class LastValue(child: Expression, ignoreNulls: Expression) extends AggregateWindowFunction
351
```
352
353
**Usage Example:**
354
```scala
355
val salaryCol = AttributeReference("salary", DecimalType(10, 2), false)()
356
val dateCol = AttributeReference("hire_date", DateType, false)()
357
358
val rowNum = RowNumber()
359
val rankBySalary = Rank(Seq(salaryCol))
360
val denseRankBySalary = DenseRank(Seq(salaryCol))
361
val prevSalary = Lag(salaryCol, Literal(1), Literal(0))
362
val nextSalary = Lead(salaryCol, Literal(1), salaryCol)
363
```
364
365
## Collection Expressions
366
367
### Array Operations
368
369
```scala { .api }
370
case class CreateArray(children: Seq[Expression]) extends Expression
371
372
case class ArrayContains(left: Expression, right: Expression) extends BinaryExpression with Predicate
373
374
case class GetArrayItem(child: Expression, ordinal: Expression) extends BinaryExpression
375
376
case class Size(child: Expression) extends UnaryExpression
377
378
case class ArraySort(child: Expression) extends UnaryExpression
379
case class ArrayMin(child: Expression) extends UnaryExpression
380
case class ArrayMax(child: Expression) extends UnaryExpression
381
```
382
383
**Usage Example:**
384
```scala
385
val arrayCol = AttributeReference("tags", ArrayType(StringType), true)()
386
val indexCol = AttributeReference("index", IntegerType, false)()
387
388
val createArray = CreateArray(Seq(Literal("tag1"), Literal("tag2"), Literal("tag3")))
389
val arrayContains = ArrayContains(arrayCol, Literal("important"))
390
val getItem = GetArrayItem(arrayCol, indexCol)
391
val arraySize = Size(arrayCol)
392
val sortedArray = ArraySort(arrayCol)
393
```
394
395
### Map Operations
396
397
```scala { .api }
398
case class CreateMap(children: Seq[Expression]) extends Expression
399
400
case class GetMapValue(child: Expression, key: Expression) extends BinaryExpression
401
402
case class MapKeys(child: Expression) extends UnaryExpression
403
case class MapValues(child: Expression) extends UnaryExpression
404
```
405
406
**Usage Example:**
407
```scala
408
val mapCol = AttributeReference("properties", MapType(StringType, StringType), true)()
409
val keyCol = AttributeReference("key", StringType, false)()
410
411
val createMap = CreateMap(Seq(
412
Literal("name"), Literal("John"),
413
Literal("age"), Literal("30")
414
))
415
val getValue = GetMapValue(mapCol, keyCol)
416
val getKeys = MapKeys(mapCol)
417
val getValues = MapValues(mapCol)
418
```
419
420
## Expression Utilities
421
422
### Expression Analysis
423
424
```scala { .api }
425
object ExpressionSet {
426
def apply(expressions: Seq[Expression]): ExpressionSet
427
}
428
429
case class AttributeSet(baseSet: Set[Attribute]) extends Traversable[Attribute] {
430
def +(attribute: Attribute): AttributeSet
431
def ++(other: AttributeSet): AttributeSet
432
def -(attribute: Attribute): AttributeSet
433
def --(other: AttributeSet): AttributeSet
434
def contains(attribute: Attribute): Boolean
435
def intersect(other: AttributeSet): AttributeSet
436
def subsetOf(other: AttributeSet): Boolean
437
}
438
```
439
440
### Code Generation
441
442
```scala { .api }
443
case class ExprCode(code: String, isNull: String, value: String)
444
445
class CodegenContext {
446
def freshName(name: String): String
447
def addReferenceObj(objName: String, obj: Any, className: String = null): String
448
def addNewFunction(funcName: String, funcCode: String): String
449
}
450
```
451
452
## Common Expression Patterns
453
454
### Building Complex Expressions
455
```scala
456
val nameCol = AttributeReference("name", StringType, nullable = false)()
val ageCol = AttributeReference("age", IntegerType, nullable = true)()
val salaryCol = AttributeReference("salary", DecimalType(10, 2), nullable = true)()

// Expression trees built by hand do not pass through the analyzer's implicit
// type coercion, so literals that are compared or combined with the decimal
// salary column are cast to the column's type explicitly.
val salaryType = DecimalType(10, 2)

// Complex filtering condition:
// name IS NOT NULL AND age > 21 AND (salary > 50000 OR name starts with "Senior")
val complexFilter = And(
  IsNotNull(nameCol),
  And(
    GreaterThan(ageCol, Literal(21)),
    Or(
      GreaterThan(salaryCol, Cast(Literal(50000), salaryType)),
      StartsWith(nameCol, Literal("Senior"))
    )
  )
)

// Computed column: 0 when salary is null, otherwise salary plus 10%.
// Both branches of the If share the salary column's decimal type.
val computedSalary = If(
  IsNull(salaryCol),
  Cast(Literal(0), salaryType),
  Add(salaryCol, Multiply(salaryCol, Cast(Literal(0.1), salaryType)))
)
478
```
479
480
### Expression Transformation
481
```scala
482
/**
 * Strips casts whose target type already equals the type of their child.
 *
 * The rewrite runs bottom-up (`transformUp`) so that stacked no-op casts such
 * as `Cast(Cast(x, T), T)` collapse completely in one pass. A top-down
 * `transform` would replace the outer cast with the inner one and then only
 * descend into the inner cast's children, leaving that inner cast in place.
 *
 * @param expr root of the expression tree to simplify
 * @return the tree with every redundant cast replaced by its child
 */
def removeRedundantCasts(expr: Expression): Expression =
  expr.transformUp {
    // The rule is a partial function: nodes it does not match are kept as-is,
    // so no catch-all case is needed.
    case Cast(child, dataType, _) if child.dataType == dataType => child
  }
489
490
/**
 * Replaces every foldable sub-expression with a literal holding its value.
 *
 * The rewrite is top-down, so a foldable subtree is evaluated once at its
 * root instead of node by node. Evaluation uses `EmptyRow` as the input row.
 *
 * @param expr root of the expression tree to fold
 * @return the tree with foldable sub-expressions replaced by `Literal`s
 */
def foldConstants(expr: Expression): Expression =
  expr.transform {
    // A literal is already in folded form; return it untouched rather than
    // evaluating it and allocating an equivalent literal.
    case literal: Literal => literal
    case e if e.foldable => Literal.create(e.eval(EmptyRow), e.dataType)
  }
497
```
498
499
The expression system in Catalyst provides a comprehensive framework for representing and evaluating all SQL operations with support for optimization, code generation, and type safety.