# SQL Parsing

This section covers the SQL parsing interfaces and abstract syntax tree representations in Spark Catalyst. The parser converts SQL text into logical plan trees.
## Core Imports

```scala
import org.apache.spark.sql.catalyst.parser._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._
```
## Parser Interface

The main interface for parsing SQL statements, expressions, and data types.

```scala { .api }
abstract class ParserInterface {
  def parsePlan(sqlText: String): LogicalPlan
  def parseExpression(sqlText: String): Expression
  def parseDataType(sqlText: String): DataType
  def parseTableIdentifier(sqlText: String): TableIdentifier
  def parseFunctionIdentifier(sqlText: String): FunctionIdentifier
  def parseTableSchema(sqlText: String): StructType
}
```
### Usage Example

```scala
import org.apache.spark.sql.catalyst.parser._

// CatalystSqlParser is an object (see below), so reference it directly
val parser = CatalystSqlParser

// Parse SQL statement
val logicalPlan = parser.parsePlan("SELECT name, age FROM users WHERE age > 18")

// Parse expression
val expression = parser.parseExpression("age + 1")

// Parse data type
val dataType = parser.parseDataType("STRUCT<name: STRING, age: INT>")
```
## Catalyst SQL Parser

The default SQL parser implementation for Spark SQL.

```scala { .api }
object CatalystSqlParser extends AbstractSqlParser {
  override def astBuilder: AstBuilder = new AstBuilder()

  def parseExpression(sqlText: String): Expression
  def parsePlan(sqlText: String): LogicalPlan
  def parseDataType(sqlText: String): DataType
  def parseTableIdentifier(sqlText: String): TableIdentifier
  def parseMultipartIdentifier(sqlText: String): Seq[String]
}
```
## AST Builder

Converts ANTLR parse trees into Catalyst logical plans and expressions.

```scala { .api }
trait AstBuilder extends SqlBaseBaseVisitor[AnyRef] {
  // Plan visitors
  def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan
  def visitQuery(ctx: QueryContext): LogicalPlan
  def visitQuerySpecification(ctx: QuerySpecificationContext): LogicalPlan
  def visitFromClause(ctx: FromClauseContext): LogicalPlan
  def visitJoinRelation(ctx: JoinRelationContext): LogicalPlan

  // Expression visitors
  def visitSingleExpression(ctx: SingleExpressionContext): Expression
  def visitArithmeticBinary(ctx: ArithmeticBinaryContext): Expression
  def visitComparison(ctx: ComparisonContext): Expression
  def visitLogicalBinary(ctx: LogicalBinaryContext): Expression
  def visitPredicated(ctx: PredicatedContext): Expression

  // Data type visitors
  def visitSingleDataType(ctx: SingleDataTypeContext): DataType
  def visitPrimitiveDataType(ctx: PrimitiveDataTypeContext): DataType
  def visitComplexDataType(ctx: ComplexDataTypeContext): DataType
}
```
## Parse Exceptions

### ParseException

```scala { .api }
class ParseException(
    message: String,
    val line: Int,           // exposed so callers can report error position
    val startPosition: Int,
    cause: Throwable = null
) extends Exception(message, cause) {

  def withCommand(command: String): ParseException
  override def getMessage: String
}

case class TemplateSqlParseException(
    message: String,
    errorClass: String,
    messageParameters: Map[String, String],
    origin: Option[Origin],
    cause: Option[Throwable]
) extends ParseException(message, 0, 0, cause.orNull)
```
## Data Type Parsing

### Type String Conversion

```scala { .api }
object DataType {
  def fromDDL(ddl: String): DataType = CatalystSqlParser.parseDataType(ddl)
  def fromJson(json: String): DataType
}

// Examples of DDL type strings
val intType = DataType.fromDDL("INT")
val arrayType = DataType.fromDDL("ARRAY<STRING>")
val structType = DataType.fromDDL("STRUCT<name: STRING, age: INT>")
val mapType = DataType.fromDDL("MAP<STRING, DOUBLE>")
```
### Schema Parsing

```scala { .api }
object StructType {
  def fromDDL(ddl: String): StructType = CatalystSqlParser.parseTableSchema(ddl)
}

// Example schema parsing
val schema = StructType.fromDDL("name STRING, age INT, scores ARRAY<DOUBLE>")
```
## Expression Parsing

### Expression String Conversion

```scala
// Parse various expression types
val literalExpr = parser.parseExpression("42")
val columnExpr = parser.parseExpression("users.name")
val arithmeticExpr = parser.parseExpression("age + 1")
val functionExpr = parser.parseExpression("UPPER(name)")
val caseExpr = parser.parseExpression("CASE WHEN age > 18 THEN 'adult' ELSE 'minor' END")
```
### Complex Expression Examples

```scala
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser._

// Array operations
val arrayExpr = parseExpression("array(1, 2, 3)[0]")

// Map operations
val mapExpr = parseExpression("map('key1', 'value1')['key1']")

// Struct operations
val structExpr = parseExpression("named_struct('name', 'Alice', 'age', 25).name")

// Window functions
val windowExpr = parseExpression("ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC)")
```
## SQL Statement Parsing

### DDL Statements

```scala { .api }
// Table creation
case class CreateTable(
    tableIdentifier: TableIdentifier,
    tableSchema: StructType,
    partitionColumnNames: Seq[String],
    bucketSpec: Option[BucketSpec],
    properties: Map[String, String],
    provider: Option[String],
    options: Map[String, String],
    location: Option[String],
    comment: Option[String],
    ifNotExists: Boolean
) extends LogicalPlan

// View creation
case class CreateView(
    name: TableIdentifier,
    userSpecifiedColumns: Seq[(String, Option[String])],
    comment: Option[String],
    properties: Map[String, String],
    originalText: Option[String],
    child: LogicalPlan,
    allowExisting: Boolean,
    replace: Boolean,
    isTemporary: Boolean
) extends LogicalPlan
```
### DML Statements

```scala { .api }
// Insert statements
case class InsertIntoTable(
    table: LogicalPlan,
    partition: Map[String, Option[String]],
    child: LogicalPlan,
    overwrite: Boolean,
    ifPartitionNotExists: Boolean
) extends LogicalPlan

// Update statements (Catalyst representation)
case class UpdateTable(
    table: LogicalPlan,
    assignments: Seq[Assignment],
    condition: Option[Expression]
) extends LogicalPlan
```
## Function Parsing

### Function Identifier Parsing

```scala { .api }
case class FunctionIdentifier(funcName: String, database: Option[String] = None) {
  def identifier: String = database.map(_ + ".").getOrElse("") + funcName
  def unquotedString: String = identifier
  def quotedString: String = database.map(quote).map(_ + ".").getOrElse("") + quote(funcName)
}

// Parse function names
val simpleFunc = parser.parseFunctionIdentifier("upper")
val qualifiedFunc = parser.parseFunctionIdentifier("my_db.custom_func")
```
## Parser Configuration

### SQL Configuration Impact

```scala
import org.apache.spark.sql.internal.SQLConf

// Parser behavior affected by configuration
val conf = SQLConf.get
val caseSensitive = conf.caseSensitiveAnalysis
val ansiMode = conf.ansiEnabled
val parser = new SparkSqlParser(conf)
```
## Custom Parser Extensions

### Extending the Parser

```scala
import org.apache.spark.sql.catalyst.parser._

// Custom AST builder with additional rules
class CustomAstBuilder extends AstBuilder {
  override def visitCustomFunction(ctx: CustomFunctionContext): Expression = {
    // Custom parsing logic for domain-specific functions
    super.visitCustomFunction(ctx)
  }
}

// Custom parser with extended functionality
class CustomSqlParser extends AbstractSqlParser {
  override def astBuilder: AstBuilder = new CustomAstBuilder()
}
```
## Usage Examples

### Complete Parsing Workflow

```scala
import org.apache.spark.sql.catalyst.parser._
import org.apache.spark.sql.catalyst.plans.logical._

// Parse complex SQL query
val sqlText = """
  SELECT
    u.name,
    COUNT(o.id) as order_count,
    AVG(o.amount) as avg_amount
  FROM users u
  LEFT JOIN orders o ON u.id = o.user_id
  WHERE u.age > 18
  GROUP BY u.id, u.name
  HAVING COUNT(o.id) > 5
  ORDER BY avg_amount DESC
  LIMIT 10
"""

val parser = CatalystSqlParser
val logicalPlan = parser.parsePlan(sqlText)

// Extract components (include a catch-all so the match is exhaustive)
logicalPlan match {
  case Limit(limitExpr,
       Sort(order, global,
       Filter(havingCondition,
       Aggregate(groupingExprs, aggregateExprs,
       Join(left, right, joinType, joinCondition))))) =>
    println(s"Parsed complex query with joins, aggregation, and ordering")
  case other =>
    println(s"Parsed plan with root node: ${other.nodeName}")
}
```
### Error Handling

```scala
import org.apache.spark.sql.catalyst.parser.ParseException

try {
  val plan = parser.parsePlan("INVALID SQL SYNTAX")
} catch {
  case e: ParseException =>
    println(s"Parse error at line ${e.line}, position ${e.startPosition}: ${e.getMessage}")
}
```

The parsing framework provides a complete SQL-to-AST transformation pipeline that handles the full spectrum of SQL constructs and converts them into Catalyst's internal representation for further processing.