Catalyst query optimization framework and expression evaluation engine for Apache Spark SQL
—
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Pending
The risk profile of this skill
Catalyst is Apache Spark's query optimization framework and expression evaluation engine for Spark SQL. It provides a comprehensive system for manipulating relational query plans, transforming SQL queries and DataFrame operations into efficient execution plans through rule-based and cost-based optimization techniques.
org.apache.spark:spark-catalyst_2.11:2.2.3
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.trees._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.expressions._
// Reference built-in data types (brought into scope by the org.apache.spark.sql.types._ import)
val stringType = StringType
val intType = IntegerType
// Build a row from positional values: a name, an age and an active flag
val row = Row("Alice", 25, true)
// Read the fields back by position using the typed accessors
val name: String = row.getString(0)
val age: Int = row.getInt(1)
val isActive: Boolean = row.getBoolean(2)
// Create expressions
// UnresolvedAttribute is defined in the analysis package, which the snippet's
// imports do not cover, so it is imported here to keep the example compilable.
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
val col = UnresolvedAttribute("name")
val literal = Literal("Alice")
val equals = EqualTo(col, literal)

Catalyst is built around several key components:
Core data type system including primitive types, complex types (arrays, maps, structs), and Row interface for data access.
/** Row interface for positional access to the values of a single row (signatures only). */
trait Row {
// Untyped access: both take a position and return the value as Any.
def apply(i: Int): Any
def get(i: Int): Any
// NOTE(review): presumably true when the value at position i is null; confirm against the Spark API docs.
def isNullAt(i: Int): Boolean
// Typed accessors: take a position and return the value as the named type.
def getInt(i: Int): Int
def getString(i: Int): String
def getBoolean(i: Int): Boolean
// ... additional primitive accessors
}
abstract class DataType {
def typeName: String
def json: String
def prettyJson: String
def simpleString: String
}

Comprehensive expression framework for computations, predicates, aggregations, and data transformations with built-in code generation.
/**
 * Base class of the expression framework: a tree node whose node type is Expression.
 * Only member signatures are listed here.
 */
abstract class Expression extends TreeNode[Expression] {
// Data type associated with this expression (presumably the type of the value it produces).
def dataType: DataType
// NOTE(review): presumably whether evaluation may yield null; confirm against the Spark API docs.
def nullable: Boolean
// Evaluates against an input InternalRow and returns an untyped value.
def eval(input: InternalRow): Any
// Produces an ExprCode for this expression using the supplied CodegenContext.
def genCode(ctx: CodegenContext): ExprCode
// Display name of the expression as a string.
def prettyName: String
}
/** An Expression that exposes a single child expression. */
abstract class UnaryExpression extends Expression {
// The one child expression.
def child: Expression
}
abstract class BinaryExpression extends Expression {
def left: Expression
def right: Expression
}

Logical and physical query plan representations with tree-based transformations and optimization support.
/** Logical query plan node: a QueryPlan whose plan type is LogicalPlan (signatures only). */
abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
// Attributes exposed by this plan node, in sequence order.
def output: Seq[Attribute]
// NOTE(review): presumably the attributes referenced by this node's expressions; confirm.
def references: AttributeSet
// NOTE(review): presumably the attributes supplied by this node's inputs; confirm.
def inputSet: AttributeSet
// NOTE(review): presumably true once all references in the plan have been resolved; confirm.
def resolved: Boolean
}
abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanType] {
def output: Seq[Attribute]
def outputSet: AttributeSet
def references: AttributeSet
}

Rule-based analysis system for resolving unresolved references, type checking, and semantic validation.
/**
 * Rule-based analyzer: a RuleExecutor over LogicalPlan, constructed from a session
 * catalog, a SQL configuration and a maximum iteration count.
 */
class Analyzer(catalog: SessionCatalog, conf: SQLConf, maxIterations: Int)
extends RuleExecutor[LogicalPlan] {
// Takes a logical plan and returns a logical plan (presumably the analyzed form; confirm).
def execute(plan: LogicalPlan): LogicalPlan
}
abstract class Rule[TreeType <: TreeNode[TreeType]] {
def ruleName: String
def apply(plan: TreeType): TreeType
}

Query optimization engine with rule-based and cost-based optimization techniques.
/** Query optimizer: a RuleExecutor over LogicalPlan whose rules are organised as a sequence of batches. */
abstract class Optimizer extends RuleExecutor[LogicalPlan] {
// The batches this optimizer is made of.
def batches: Seq[Batch]
}
/** A named group of logical-plan rules (variable-length) paired with a Strategy. */
case class Batch(name: String, strategy: Strategy, rules: Rule[LogicalPlan]*)
abstract class Strategy {
def maxIterations: Int
}

SQL parsing interfaces and abstract syntax tree representations.
/** SQL parsing interface: turns SQL text into a logical plan, an expression, or a data type. */
abstract class ParserInterface {
// Parses SQL text into a LogicalPlan.
def parsePlan(sqlText: String): LogicalPlan
// Parses SQL text into an Expression.
def parseExpression(sqlText: String): Expression
// Parses SQL text into a DataType.
def parseDataType(sqlText: String): DataType
}
trait AstBuilder extends SqlBaseBaseVisitor[AnyRef] {
def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan
def visitSingleExpression(ctx: SingleExpressionContext): Expression
}

Framework for generating efficient Java code for expression evaluation and query execution.
/** Context object used by the code generation framework (signatures only). */
class CodegenContext {
// NOTE(review): presumably returns a unique identifier derived from `name`; confirm.
def freshName(name: String): String
// Registers an object under a name; className is optional and defaults to null.
// NOTE(review): the returned string is presumably how generated code refers to the object; confirm.
def addReferenceObj(objName: String, obj: Any, className: String = null): String
// Declares state of the given Java type and variable name; initFunc defaults to the empty string.
// NOTE(review): the meaning of the returned string is not shown here; confirm.
def addMutableState(javaType: String, variableName: String, initFunc: String = ""): String
}
/**
 * Result of code generation for an expression: a code string plus strings for the
 * null flag and the value (presumably names of generated variables; confirm).
 */
case class ExprCode(code: String, isNull: String, value: String)
trait CodeGenerator[InType <: AnyRef, OutType <: AnyRef] {
def generate(expressions: InType): OutType
}

Utility classes for date/time operations, string manipulation, and other common operations.
/** Date/time utility functions converting between strings and numeric date/timestamp values. */
object DateTimeUtils {
// Converts a UTF8String to an optional Long timestamp (presumably None when it cannot be converted; confirm).
def stringToTimestamp(s: UTF8String): Option[Long]
// Formats a Long timestamp as a string. NOTE(review): `us` presumably means microseconds; confirm.
def timestampToString(us: Long): String
// Formats a day count as a string. NOTE(review): presumably days since the epoch; confirm.
def dateToString(days: Int): String
// Converts a UTF8String to an optional Int date (presumably None when it cannot be converted; confirm).
def stringToDate(s: UTF8String): Option[Int]
}
/** Factory methods that build a UTF8String from a String or from a byte array. */
object UTF8String {
// Builds a UTF8String from a String.
def fromString(str: String): UTF8String
// Builds a UTF8String from bytes. NOTE(review): presumably expects UTF-8 encoded bytes; confirm.
def fromBytes(bytes: Array[Byte]): UTF8String
}