Catalyst is Spark's library for manipulating relational query plans and expressions.

Installation:

npx @tessl/cli install tessl/maven-org-apache-spark--spark-catalyst-2-13@4.0.0

Catalyst is Apache Spark's foundational library for relational query planning and optimization. It provides a comprehensive framework for manipulating and optimizing SQL query plans through a tree-based representation system that supports rule-based optimization, cost-based optimization, and code generation.
Maven coordinates: org.apache.spark:spark-catalyst_2.13:4.0.0

Add the dependency to pom.xml:

<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-catalyst_2.13</artifactId>
  <version>4.0.0</version>
</dependency>

Or to build.sbt:

libraryDependencies += "org.apache.spark" %% "spark-catalyst" % "4.0.0"

Common imports:

import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.connector.catalog._
import org.apache.spark.sql.catalyst.trees._

Basic usage example:

import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.expressions._
// Create data types
val stringType = StringType
val intType = IntegerType
// A two-field schema: a required "name" and an optional "age".
val structType = StructType(Seq(
StructField("name", StringType, nullable = false),
StructField("age", IntegerType, nullable = true)
))
// Create expressions
// The trailing () fills AttributeReference's second argument list with its
// defaults (presumably a fresh expression id — confirm against Spark docs).
val nameCol = AttributeReference("name", StringType, nullable = false)()
val ageCol = AttributeReference("age", IntegerType, nullable = true)()
// Predicate expression: age > 18
val filterExpr = GreaterThan(ageCol, Literal(18))
// Work with logical plans
import org.apache.spark.sql.catalyst.plans.logical._
val plan = Filter(filterExpr, LocalRelation(nameCol, ageCol))

Catalyst is built around several foundational components:
Core type system providing all SQL data types with full type safety and JSON serialization support. Essential for schema definition and type checking.
/** Base class for all Catalyst SQL data types. */
abstract class DataType extends AbstractDataType {
// Default size of a value of this type, used for size estimation
// (units presumably bytes — confirm against Spark docs).
def defaultSize: Int
// Short name identifying the type.
def typeName: String
// JSON representation of the type, for schema serialization.
def json: String
// Compact, human-readable rendering of the type.
def simpleString: String
// SQL-syntax rendering of the type (e.g. for DDL output).
def sql: String
}
// A record type: an ordered collection of named, typed fields.
case class StructType(fields: Array[StructField]) extends DataType
// One field of a StructType: name, type, nullability, and extra metadata.
case class StructField(name: String, dataType: DataType, nullable: Boolean, metadata: Metadata)
// An array type; containsNull records whether elements may be null.
case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataType
case class MapType(keyType: DataType, valueType: DataType, valueContainsNull: Boolean) extends DataType

Tree-based expression framework for representing all SQL operations, functions, and computations, with support for code generation and optimization.
/**
 * Base class for all Catalyst expressions: a tree node that evaluates
 * to a value for a given input row.
 */
abstract class Expression extends TreeNode[Expression] {
// Result type of evaluating this expression.
def dataType: DataType
// Whether evaluation may produce null.
def nullable: Boolean
// Interpreted evaluation of this expression against one input row.
def eval(input: InternalRow): Any
// Child expressions; empty for leaf expressions such as literals.
def children: Seq[Expression]
}
/** An expression with exactly two child expressions (e.g. GreaterThan above). */
abstract class BinaryExpression extends Expression {
def left: Expression
def right: Expression
}
/** An expression with exactly one child expression. */
abstract class UnaryExpression extends Expression {
def child: Expression
}

Logical and physical query plan representations enabling sophisticated query optimization, including predicate pushdown, join reordering, and cost-based optimization.
/** Base class for logical query plan nodes. */
abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
// Attributes produced by this plan node.
def output: Seq[Attribute]
// Child plan nodes; empty for leaf relations.
def children: Seq[LogicalPlan]
// Presumably true once analysis has bound all references in this subtree — confirm.
def resolved: Boolean
}
// Keeps only the rows of `child` for which `condition` evaluates to true (SQL WHERE).
case class Filter(condition: Expression, child: LogicalPlan) extends UnaryNode
// Evaluates `projectList` against each row of `child` (SQL SELECT).
case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) extends UnaryNode
case class Join(left: LogicalPlan, right: LogicalPlan, joinType: JoinType, condition: Option[Expression]) extends BinaryNode

V2 connector APIs providing standardized interfaces for integrating external data sources, with full support for predicate pushdown, column pruning, and streaming.
/** Catalog API for loading, creating, altering, and dropping tables. */
trait TableCatalog {
// Loads the table for the given identifier.
def loadTable(ident: Identifier): Table
// Creates a table with the given schema, partitioning transforms, and properties.
def createTable(ident: Identifier, schema: StructType, partitions: Array[Transform], properties: Map[String, String]): Table
// Applies the given changes to an existing table and returns the updated table.
def alterTable(ident: Identifier, changes: TableChange*): Table
// Drops the table; the Boolean presumably reports whether a table existed and was removed — confirm.
def dropTable(ident: Identifier): Boolean
}
/** A table exposed through a TableCatalog. */
trait Table {
// Display name of the table.
def name(): String
// Schema of the table's rows.
def schema(): StructType
// Partitioning transforms applied to the table.
def partitioning(): Array[Transform]
// Free-form table properties.
def properties(): Map[String, String]
}

Generic tree node framework providing transformation and traversal capabilities, used throughout Catalyst for plans, expressions, and other tree structures.
/** Generic tree node: the common base of Catalyst's plan and expression trees. */
abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
// Direct children of this node.
def children: Seq[BaseType]
// Returns a copy of the tree with `rule` applied at every node where it is defined.
def transform(rule: PartialFunction[BaseType, BaseType]): BaseType
// Applies `f` to every node in the tree.
def foreach(f: BaseType => Unit): Unit
// Collects results of `pf` from every node where it is defined.
def collect[B](pf: PartialFunction[BaseType, B]): Seq[B]
// Returns the first node satisfying `f`, if any.
def find(f: BaseType => Boolean): Option[BaseType]
}

The tree infrastructure enables powerful pattern matching and transformation capabilities essential for query optimization and analysis.