Catalyst query optimization framework and expression evaluation engine for Apache Spark SQL
npx @tessl/cli install tessl/maven-org-apache-spark--spark-catalyst@2.2.0

Catalyst is Apache Spark's query optimization framework and expression evaluation engine for Spark SQL. It provides a comprehensive system for manipulating relational query plans, transforming SQL queries and DataFrame operations into efficient execution plans through rule-based and cost-based optimization techniques.
org.apache.spark:spark-catalyst_2.11:2.2.3

import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.trees._

import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.expressions._
// Create a data type
val stringType = StringType
val intType = IntegerType
// Create a row
val row = Row("Alice", 25, true)
// Access row data by zero-based position, using the accessor that matches each value's type
val name: String = row.getString(0)
val age: Int = row.getInt(1)
val isActive: Boolean = row.getBoolean(2)
// Create expressions
// NOTE(review): UnresolvedAttribute is declared in org.apache.spark.sql.catalyst.analysis,
// which none of the imports shown for this example cover — confirm and add that import.
val col = UnresolvedAttribute("name")
val literal = Literal("Alice")
val equals = EqualTo(col, literal)

Catalyst is built around several key components:
Core data type system including primitive types, complex types (arrays, maps, structs), and Row interface for data access.
// Row interface for data access: every accessor takes the integer position `i` of the value.
trait Row {
// Untyped access: both return the value at position i as Any.
def apply(i: Int): Any
def get(i: Int): Any
// Boolean null check for position i.
def isNullAt(i: Int): Boolean
// Typed accessors: return the value at position i as the named type.
def getInt(i: Int): Int
def getString(i: Int): String
def getBoolean(i: Int): Boolean
// ... additional primitive accessors
}
abstract class DataType {
def typeName: String
def json: String
def prettyJson: String
def simpleString: String
}

Comprehensive expression framework for computations, predicates, aggregations, and data transformations with built-in code generation.
// Base class of the expression framework; an Expression is a TreeNode whose nodes are Expressions.
abstract class Expression extends TreeNode[Expression] {
// DataType associated with this expression.
def dataType: DataType
def nullable: Boolean
// Evaluates against an InternalRow; the result is untyped (Any).
def eval(input: InternalRow): Any
// Code-generation entry point: takes a CodegenContext and returns an ExprCode.
def genCode(ctx: CodegenContext): ExprCode
def prettyName: String
}
// Expression that exposes a single child expression.
abstract class UnaryExpression extends Expression {
def child: Expression
}
abstract class BinaryExpression extends Expression {
def left: Expression
def right: Expression
}

Logical and physical query plan representations with tree-based transformations and optimization support.
// Logical query plan node: a QueryPlan whose plan type is LogicalPlan.
abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
// Attributes of this plan, as an ordered sequence.
def output: Seq[Attribute]
// Attribute sets associated with this plan.
def references: AttributeSet
def inputSet: AttributeSet
// Boolean resolution flag for this plan.
def resolved: Boolean
}
abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanType] {
def output: Seq[Attribute]
def outputSet: AttributeSet
def references: AttributeSet
}

Rule-based analysis system for resolving unresolved references, type checking, and semantic validation.
// Analyzer: a RuleExecutor over LogicalPlan, constructed from a SessionCatalog, a SQLConf
// and a maxIterations value.
class Analyzer(catalog: SessionCatalog, conf: SQLConf, maxIterations: Int)
extends RuleExecutor[LogicalPlan] {
// Takes a LogicalPlan and returns a LogicalPlan.
def execute(plan: LogicalPlan): LogicalPlan
}
abstract class Rule[TreeType <: TreeNode[TreeType]] {
def ruleName: String
def apply(plan: TreeType): TreeType
}

Query optimization engine with rule-based and cost-based optimization techniques.
// Optimizer: a RuleExecutor over LogicalPlan; concrete subclasses provide the batches.
abstract class Optimizer extends RuleExecutor[LogicalPlan] {
// Sequence of Batch values defined by this optimizer.
def batches: Seq[Batch]
}
// A named batch: a Strategy plus a variable-length list of LogicalPlan rules.
case class Batch(name: String, strategy: Strategy, rules: Rule[LogicalPlan]*)
abstract class Strategy {
def maxIterations: Int
}

SQL parsing interfaces and abstract syntax tree representations.
// SQL parsing interface: each method takes SQL text and returns the corresponding parsed form.
abstract class ParserInterface {
// SQL text -> LogicalPlan.
def parsePlan(sqlText: String): LogicalPlan
// SQL text -> Expression.
def parseExpression(sqlText: String): Expression
// SQL text -> DataType.
def parseDataType(sqlText: String): DataType
}
trait AstBuilder extends SqlBaseBaseVisitor[AnyRef] {
def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan
def visitSingleExpression(ctx: SingleExpressionContext): Expression
}

Framework for generating efficient Java code for expression evaluation and query execution.
// Context object passed to Expression.genCode during code generation; every method returns a String.
class CodegenContext {
def freshName(name: String): String
// className is optional and defaults to null.
def addReferenceObj(objName: String, obj: Any, className: String = null): String
// initFunc is optional and defaults to the empty string.
def addMutableState(javaType: String, variableName: String, initFunc: String = ""): String
}
// Result type of Expression.genCode: three String fields.
// NOTE(review): isNull and value presumably name generated variables rather than hold values — confirm.
case class ExprCode(code: String, isNull: String, value: String)
trait CodeGenerator[InType <: AnyRef, OutType <: AnyRef] {
def generate(expressions: InType): OutType
}

Utility classes for date/time operations, string manipulation, and other common operations.
// Date/time conversion helpers between UTF8String/String text and numeric forms.
// Timestamps are carried as Long and dates as Int.
object DateTimeUtils {
// Text -> timestamp; returns an Option, so a result may be absent.
def stringToTimestamp(s: UTF8String): Option[Long]
// NOTE(review): parameter name `us` suggests microseconds — confirm the unit.
def timestampToString(us: Long): String
// Date given as an Int named `days` -> text.
def dateToString(days: Int): String
// Text -> date; returns an Option, so a result may be absent.
def stringToDate(s: UTF8String): Option[Int]
}
// Factory methods that build a UTF8String from a String or from a byte array.
object UTF8String {
def fromString(str: String): UTF8String
def fromBytes(bytes: Array[Byte]): UTF8String
}