Catalyst is a library for manipulating relational query plans. It serves as the foundation for Spark SQL's query optimizer and execution engine.
Catalyst provides a comprehensive type system for representing data schemas, covering all SQL types: primitives, complex nested types, and user-defined types.
The abstract base class for all Spark SQL data types.
/**
* The base type of all Spark SQL data types.
*/
abstract class DataType extends AbstractDataType {
/**
* The default size of a value of this data type, used internally for size estimation.
*/
def defaultSize: Int
/** Name of the type used in JSON serialization */
def typeName: String
/** The compact JSON representation of this data type */
def json: String
/** The pretty (i.e. indented) JSON representation of this data type */
def prettyJson: String
/** Readable string representation for the type */
def simpleString: String
/**
* Check if `this` and `other` are the same data type when ignoring nullability
* (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`).
*/
def sameType(other: DataType): Boolean
/**
* Returns the same data type but with all nullability fields set to true
* (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`).
*/
def asNullable: DataType
}

Usage Examples:
import org.apache.spark.sql.types._
// Get type information
val intType = IntegerType
val typeName = intType.typeName // "integer"
val json = intType.json // "integer"
val prettyJson = intType.prettyJson // "integer"
val simple = intType.simpleString // "int"
val size = intType.defaultSize // 4
// Compare types
val stringType = StringType
val same = intType.sameType(stringType) // false
val sameInt = intType.sameType(IntegerType) // true
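// Illustrative: sameType also ignores nullability nested inside complex types
val arrA = ArrayType(IntegerType, containsNull = true)
val arrB = ArrayType(IntegerType, containsNull = false)
val nestedSame = arrA.sameType(arrB) // true, containsNull is ignored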
// Create nullable version
val strictArray = ArrayType(IntegerType, containsNull = false)
val nullable = strictArray.asNullable
// ArrayType(IntegerType, containsNull = true)

Basic data types representing primitive values.
/** Boolean data type */
object BooleanType extends DataType {
def defaultSize: Int = 1
}
/** Byte data type */
object ByteType extends DataType {
def defaultSize: Int = 1
}
/** Short integer data type */
object ShortType extends DataType {
def defaultSize: Int = 2
}
/** Integer data type */
object IntegerType extends DataType {
def defaultSize: Int = 4
}
/** Long integer data type */
object LongType extends DataType {
def defaultSize: Int = 8
}
/** Float data type */
object FloatType extends DataType {
def defaultSize: Int = 4
}
/** Double data type */
object DoubleType extends DataType {
def defaultSize: Int = 8
}
/** String data type */
object StringType extends DataType {
def defaultSize: Int = 20
}
/** Binary data type */
object BinaryType extends DataType {
def defaultSize: Int = 100
}
/** Date data type */
object DateType extends DataType {
def defaultSize: Int = 4
}
/** Timestamp data type */
object TimestampType extends DataType {
def defaultSize: Int = 12
}
/** Null data type */
object NullType extends DataType {
def defaultSize: Int = 1
}
/** Calendar interval data type */
object CalendarIntervalType extends DataType {
def defaultSize: Int = 16
}

Usage Examples:
import org.apache.spark.sql.types._
// Use primitive types
val schema = StructType(Seq(
StructField("id", IntegerType, nullable = false),
StructField("name", StringType, nullable = true),
StructField("active", BooleanType, nullable = false),
StructField("score", DoubleType, nullable = true),
StructField("created", TimestampType, nullable = false)
))
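// Illustrative: StructType sums its fields' defaultSize values, so a rough
// per-row size estimate follows from the sizes listed above (4 + 20 + 1 + 8 + 12)
val estimatedRowSize = schema.defaultSize // 45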
// Type checking
def processValue(value: Any, dataType: DataType): String = dataType match {
case IntegerType => s"Integer: $value"
case StringType => s"String: $value"
case BooleanType => s"Boolean: $value"
case DoubleType => s"Double: $value"
case _ => s"Other: $value"
}

Data types for complex nested structures including arrays, maps, and structs.
/**
* Array data type with element type and nullability
*/
case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataType {
def defaultSize: Int = 100
/** Creates a new ArrayType with updated parameters */
def copy(elementType: DataType = elementType, containsNull: Boolean = containsNull): ArrayType
}
/**
* Map data type with key/value types and nullability
*/
case class MapType(
keyType: DataType,
valueType: DataType,
valueContainsNull: Boolean) extends DataType {
def defaultSize: Int = 100
/** Creates a new MapType with updated parameters */
def copy(
keyType: DataType = keyType,
valueType: DataType = valueType,
valueContainsNull: Boolean = valueContainsNull): MapType
}
/**
* Struct data type with named fields
*/
case class StructType(fields: Array[StructField]) extends DataType {
def defaultSize: Int = fields.map(_.dataType.defaultSize).sum
/** Get field by name */
def apply(name: String): StructField
/** Get field names */
def fieldNames: Array[String]
/** Add a new field */
def add(field: StructField): StructType
def add(name: String, dataType: DataType): StructType
def add(name: String, dataType: DataType, nullable: Boolean): StructType
def add(name: String, dataType: DataType, nullable: Boolean, metadata: Metadata): StructType
}
/**
* Individual field in a struct with name, type, nullable flag, and metadata
*/
case class StructField(
name: String,
dataType: DataType,
nullable: Boolean,
metadata: Metadata = Metadata.empty) {
/** Creates a new StructField with updated parameters */
def copy(
name: String = name,
dataType: DataType = dataType,
nullable: Boolean = nullable,
metadata: Metadata = metadata): StructField
}

Usage Examples:
import org.apache.spark.sql.types._
// Create array type
val intArray = ArrayType(IntegerType, containsNull = true)
val stringArray = ArrayType(StringType, containsNull = false)
// Create map type
val stringToInt = MapType(StringType, IntegerType, valueContainsNull = false)
val idToUser = MapType(IntegerType, StringType, valueContainsNull = true)
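// Illustrative: copy() derives a new type with updated nullability
val strictIdToUser = idToUser.copy(valueContainsNull = false)
// MapType(IntegerType, StringType, valueContainsNull = false)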
// Create struct type
val userSchema = StructType(Array(
StructField("id", IntegerType, nullable = false),
StructField("name", StringType, nullable = true),
StructField("emails", ArrayType(StringType, containsNull = false), nullable = true),
StructField("properties", MapType(StringType, StringType, valueContainsNull = true), nullable = true)
))
// Access struct fields
val nameField = userSchema("name")
val fieldNames = userSchema.fieldNames // Array("id", "name", "emails", "properties")
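// Illustrative: the DataType JSON methods shown earlier also work on whole schemas
val schemaJson = userSchema.json           // compact JSON for the struct
val schemaPretty = userSchema.prettyJson   // indented JSON, handy for debugging
val nameType = userSchema("name").dataType // StringType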
// Add fields to struct
val extendedSchema = userSchema
.add("created_at", TimestampType, nullable = false)
.add("active", BooleanType, nullable = false)
// Complex nested types
val nestedSchema = StructType(Array(
StructField("users", ArrayType(userSchema, containsNull = false), nullable = true),
StructField("metadata", MapType(StringType,
StructType(Array(
StructField("value", StringType, nullable = true),
StructField("type", StringType, nullable = false)
)), valueContainsNull = true), nullable = true)
))

High-precision decimal types for financial and scientific calculations.
/**
* Decimal data type with precision and scale
*/
case class DecimalType(precision: Int, scale: Int) extends DataType {
def defaultSize: Int = 8
/** Whether this decimal type is wider than another */
def isWiderThan(other: DataType): Boolean
/** Whether this decimal type is tighter than another */
def isTighterThan(other: DataType): Boolean
}
object DecimalType {
/** System default decimal type */
val SYSTEM_DEFAULT: DecimalType
/** User default decimal type */
val USER_DEFAULT: DecimalType
/** Maximum precision allowed */
val MAX_PRECISION: Int = 38
/** Maximum scale allowed */
val MAX_SCALE: Int = 38
/** Create decimal type with precision and scale */
def apply(precision: Int, scale: Int): DecimalType
/** Create unbounded decimal type */
def UNLIMITED(): DecimalType
}
/**
* Decimal value implementation with high precision arithmetic
*/
case class Decimal(value: java.math.BigDecimal) extends Ordered[Decimal] {
/** Convert to different precision and scale */
def toPrecision(precision: Int, scale: Int): Decimal
/** Basic arithmetic operations */
def +(that: Decimal): Decimal
def -(that: Decimal): Decimal
def *(that: Decimal): Decimal
def /(that: Decimal): Decimal
def %(that: Decimal): Decimal
/** Comparison operations */
def compare(that: Decimal): Int
/** Convert to other numeric types */
def toDouble: Double
def toFloat: Float
def toLong: Long
def toInt: Int
/** String representation */
override def toString: String
}
object Decimal {
/** Create decimal from various sources */
def apply(value: BigDecimal): Decimal
def apply(value: java.math.BigDecimal): Decimal
def apply(value: String): Decimal
def apply(value: Long): Decimal
def apply(value: Double): Decimal
/** Zero and One constants */
val ZERO: Decimal
val ONE: Decimal
}

Usage Examples:
import org.apache.spark.sql.types._
// Create decimal types
val currency = DecimalType(10, 2) // 10 digits, 2 decimal places
val percentage = DecimalType(5, 4) // 5 digits, 4 decimal places
val unlimited = DecimalType.UNLIMITED()
// Create decimal values
val price = Decimal("123.45")
val rate = Decimal(0.0825)
val count = Decimal(1000)
// Arithmetic operations
val total = price * count // Decimal("123450.00")
val withTax = total * (Decimal.ONE + rate) // Apply tax rate
val rounded = withTax.toPrecision(10, 2) // Round to currency precision
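// Illustrative: comparisons and numeric conversions on Decimal values
val cheaper = price.compare(total) < 0 // true, 123.45 < 123450.00
val asDouble = total.toDouble          // 123450.0
val zero = Decimal.ZERO                // constant provided by the companion object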
// Use in schema
val transactionSchema = StructType(Array(
StructField("id", StringType, nullable = false),
StructField("amount", DecimalType(12, 2), nullable = false),
StructField("tax_rate", DecimalType(6, 4), nullable = false),
StructField("total", DecimalType(15, 2), nullable = false)
))

Specialized types for advanced use cases including user-defined types and abstract matching.
/**
* Base class for user-defined types
*/
abstract class UserDefinedType[UserType >: Null] extends DataType {
/** The underlying SQL data type for this UDT */
def sqlType: DataType
/** Serialize the user type to SQL type */
def serialize(obj: UserType): Any
/** Deserialize from SQL type to user type */
def deserialize(datum: Any): UserType
/** User class that this UDT represents */
def userClass: Class[UserType]
}
/**
* Abstract base for type matching in expressions
*/
abstract class AbstractDataType {
/** The default concrete type to use if we need to cast into this type */
def defaultConcreteType: DataType
/** Check if the given data type is an instance of this abstract type */
def acceptsType(other: DataType): Boolean
/** Readable string representation */
def simpleString: String
}
/**
* Type for JVM objects
*/
case class ObjectType(cls: Class[_]) extends DataType {
def defaultSize: Int = 4096
}

Usage Examples:
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
// Define a custom UDT for a Point class
case class Point(x: Double, y: Double)
class PointUDT extends UserDefinedType[Point] {
override def sqlType: DataType = StructType(Array(
StructField("x", DoubleType, nullable = false),
StructField("y", DoubleType, nullable = false)
))
override def serialize(point: Point): Any = {
val row = new GenericInternalRow(2)
row.setDouble(0, point.x)
row.setDouble(1, point.y)
row
}
override def deserialize(datum: Any): Point = {
datum match {
case row: InternalRow =>
Point(row.getDouble(0), row.getDouble(1))
}
}
override def userClass: Class[Point] = classOf[Point]
}
// Use custom UDT in schema
val pointUDT = new PointUDT()
val locationSchema = StructType(Array(
StructField("name", StringType, nullable = false),
StructField("location", pointUDT, nullable = false)
))
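// Illustrative: values round-trip through the UDT's serialize/deserialize pair
val serialized = pointUDT.serialize(Point(1.0, 2.0)) // InternalRow holding (1.0, 2.0)
val restored = pointUDT.deserialize(serialized)      // Point(1.0, 2.0)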
// Object type for arbitrary JVM objects
val stringObjectType = ObjectType(classOf[String])
val listObjectType = ObjectType(classOf[java.util.List[_]])

Utility objects and methods for working with the type system.
object DataType {
/**
* Compares two types, ignoring nullability of ArrayType, MapType, StructType.
*/
def equalsIgnoreNullability(left: DataType, right: DataType): Boolean
/**
* Compares two types, ignoring compatible nullability (the right-hand type may be nullable where the left-hand type is not).
*/
def equalsIgnoreCompatibleNullability(left: DataType, right: DataType): Boolean
/** Parse a data type from its JSON representation */
def fromJson(json: String): DataType
/** Parse a data type from DDL string */
def fromDDL(ddl: String): DataType
}
/** Metadata for struct fields and other schema elements */
case class Metadata(map: Map[String, Any]) {
def contains(key: String): Boolean
def getString(key: String): String
def getLong(key: String): Long
def getDouble(key: String): Double
def getBoolean(key: String): Boolean
def getMetadata(key: String): Metadata
def getStringArray(key: String): Array[String]
def getLongArray(key: String): Array[Long]
def getDoubleArray(key: String): Array[Double]
def getBooleanArray(key: String): Array[Boolean]
def getMetadataArray(key: String): Array[Metadata]
def json: String
}
object Metadata {
/** Empty metadata */
val empty: Metadata
/** Create metadata from JSON */
def fromJson(json: String): Metadata
/** Create metadata builder */
def builder(): MetadataBuilder
}

Usage Examples:
import org.apache.spark.sql.types._
// Compare types ignoring nullability
val nullable = StructType(Array(StructField("x", IntegerType, nullable = true)))
val nonNullable = StructType(Array(StructField("x", IntegerType, nullable = false)))
val equal = DataType.equalsIgnoreNullability(nullable, nonNullable) // true
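// Illustrative: compatible nullability is directional; a non-nullable source
// fits a nullable target, but not the reverse
val writeOk = DataType.equalsIgnoreCompatibleNullability(nonNullable, nullable)    // true
val writeNotOk = DataType.equalsIgnoreCompatibleNullability(nullable, nonNullable) // false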
// Parse types from strings
val parsedType = DataType.fromDDL("struct<id:int,name:string>")
val jsonType = DataType.fromJson("""{"type":"integer"}""")
// Work with metadata
val metadata = Metadata.builder()
.putString("description", "User identifier")
.putLong("version", 1L)
.putBoolean("required", true)
.build()
val fieldWithMetadata = StructField("id", IntegerType, nullable = false, metadata)
// Access metadata
val description = fieldWithMetadata.metadata.getString("description")
val version = fieldWithMetadata.metadata.getLong("version")
val required = fieldWithMetadata.metadata.getBoolean("required")

Install with Tessl CLI
npx tessl i tessl/maven-org-apache-spark--spark-catalyst-2-10