Comprehensive type system for Spark SQL, covering primitives, collections, and complex nested structures. Essential for defining schemas and working with structured data in distributed environments.
Foundation classes for all Spark SQL data types.
/**
 * Base class for all Spark SQL data types.
 *
 * Every concrete type (e.g. IntegerType, StructType) extends this class and
 * supplies the metadata the analyzer and catalog need to reason about values.
 */
abstract class DataType extends AbstractDataType {
// Default size in bytes of a single value of this type (used for size estimation)
def defaultSize: Int
// Canonical name of the type — NOTE(review): exact casing/format not shown here
def typeName: String
// Compact JSON representation of this type (round-trips via DataType.fromJson)
def json: String
// Pretty-printed (indented) JSON representation
def prettyJson: String
// Short human-readable name, e.g. for error messages
def simpleString: String
// Name as stored in the catalog — presumably close to simpleString; verify
def catalogString: String
// SQL/DDL representation of this type
def sql: String
// Structural equality check against another type — assumed to ignore metadata; confirm
def sameType(other: DataType): Boolean
// A copy of this type with all nested nullability flags set to true
def asNullable: DataType
}
/**
 * Base for non-concrete data types.
 *
 * Abstract types resolve to a concrete DataType via defaultConcreteType and
 * decide which concrete types they accept via acceptsType.
 */
abstract class AbstractDataType {
  // The concrete DataType this abstract type resolves to by default
  def defaultConcreteType: DataType
  // Whether `other` is an acceptable concrete instance of this abstract type
  def acceptsType(other: DataType): Boolean
  // Short human-readable name for error messages
  def simpleString: String
}

// Factory and parsing methods:
object DataType {
  // Parse a single type from a DDL string, e.g. "ARRAY<STRING>"
  def fromDDL(ddl: String): DataType
  // Reconstruct a type from its JSON representation (inverse of DataType.json)
  def fromJson(json: String): DataType
  // Try `parser` first; on failure, fall back to `fallbackParser`
  def parseTypeWithFallback(
    schema: String,
    parser: String => DataType,
    fallbackParser: String => DataType
  ): DataType
  // Structural equality that ignores top-level and nested nullability flags
  def equalsIgnoreNullability(left: DataType, right: DataType): Boolean
  // Structural equality, optionally ignoring nullability
  def equalsStructurally(from: DataType, to: DataType, ignoreNullability: Boolean): Boolean
}

// Basic scalar data types for fundamental values.
// String and binary types
// NOTE: each `case object X extends X` pairs a singleton with a same-named
// class defined elsewhere (Spark's standard companion pattern for types).
case object StringType extends StringType
case object BinaryType extends BinaryType
// Numeric types
case object ByteType extends ByteType // 1 byte
case object ShortType extends ShortType // 2 bytes
case object IntegerType extends IntegerType // 4 bytes
case object LongType extends LongType // 8 bytes
case object FloatType extends FloatType // 4 bytes
case object DoubleType extends DoubleType // 8 bytes
// Boolean and null
case object BooleanType extends BooleanType
case object NullType extends NullType

// High-precision decimal numbers with configurable precision and scale.
/**
 * Fixed precision decimal numbers
 * @param precision Total number of digits (max 38)
 * @param scale Number of digits after decimal point (max 38)
 */
case class DecimalType(precision: Int, scale: Int) extends FractionalType {
// True when this type can represent every value of `other` without loss — TODO confirm exact semantics
def isWiderThan(other: DataType): Boolean
// Converse: every value of this type fits in `other` — presumably; verify against Spark source
def isTighterThan(other: DataType): Boolean
}
object DecimalType {
  // Hard upper bounds on precision and scale
  val MAX_PRECISION: Int = 38
  val MAX_SCALE: Int = 38
  // Default for system-derived decimals: maximum precision with scale 18
  val SYSTEM_DEFAULT: DecimalType = DecimalType(38, 18)
  // Default when a user writes DECIMAL with no precision/scale
  val USER_DEFAULT: DecimalType = DecimalType(10, 0)
  // Smallest DecimalType able to hold values of the given numeric type
  def forType(dataType: DataType): DecimalType
  // Clamp a precision/scale pair into legal bounds (e.g. after arithmetic widening)
  def adjustPrecisionScale(precision: Int, scale: Int): DecimalType
  // Storage-class predicates: whether values fit in an Int, a Long, or need a byte array
  def is32BitDecimalType(dt: DataType): Boolean
  def is64BitDecimalType(dt: DataType): Boolean
  def isByteArrayDecimalType(dt: DataType): Boolean
}

// Temporal data types for handling dates, timestamps, and time intervals.
// Date and timestamp types
case object DateType extends DateType // 4 bytes, date only
case object TimestampType extends TimestampType // 8 bytes, with timezone
case object TimestampNTZType extends TimestampNTZType // 8 bytes, no timezone
// Interval types
case object CalendarIntervalType extends CalendarIntervalType // 16 bytes
/**
 * Day-time intervals (days, hours, minutes, seconds)
 * @param startField First (coarsest) unit stored, e.g. DAY
 * @param endField Last (finest) unit stored, e.g. SECOND
 */
case class DayTimeIntervalType(startField: Byte, endField: Byte) extends DataType
/**
 * Year-month intervals (years, months)
 * @param startField First unit stored, e.g. YEAR
 * @param endField Last unit stored, e.g. MONTH
 */
case class YearMonthIntervalType(startField: Byte, endField: Byte) extends DataType

// Fixed and variable-length character string types.
/**
 * Fixed-length character strings
 * @param length Maximum character length
 */
case class CharType(length: Int) extends DataType
/**
 * Variable-length character strings
 * @param length Maximum character length
 */
case class VarcharType(length: Int) extends DataType

// Array and map types for handling collections of data.
/**
 * Array of elements with same type
 * @param elementType Type of array elements
 * @param containsNull Whether array can contain null values
 */
case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataType
object ArrayType {
// Convenience factory: defaults containsNull to true (nullable elements)
def apply(elementType: DataType): ArrayType = ArrayType(elementType, containsNull = true)
}
/**
 * Key-value mappings
 *
 * Note there is no keyContainsNull parameter: map keys are always non-null.
 * @param keyType Type of map keys
 * @param valueType Type of map values
 * @param valueContainsNull Whether values can be null
 */
case class MapType(
  keyType: DataType,
  valueType: DataType,
  valueContainsNull: Boolean
) extends DataType

// Complex nested structures with named fields.
/**
 * Complex nested structures with named fields
 *
 * Also behaves as a Seq[StructField], so collection operations apply directly.
 * @param fields Array of struct fields
 */
case class StructType(fields: Array[StructField]) extends DataType with Seq[StructField] {
// Field access
// All field names, in declaration order
def fieldNames: Array[String]
// Alias for fieldNames — presumably identical; verify
def names: Array[String]
// Look up a single field by name — assumed to throw if absent; confirm
def apply(name: String): StructField
// Project a sub-struct containing only the named fields
def apply(names: Set[String]): StructType
// Zero-based index of a field by name
def fieldIndex(name: String): Int
// Schema building - multiple add() overloads
// Each add() returns a NEW StructType with the field appended (immutable builder style)
def add(field: StructField): StructType
def add(name: String, dataType: DataType): StructType
def add(name: String, dataType: DataType, nullable: Boolean): StructType
def add(name: String, dataType: DataType, nullable: Boolean, metadata: Metadata): StructType
def add(name: String, dataType: DataType, nullable: Boolean, comment: String): StructType
// Schema operations
// Combine two schemas into one — merge conflict semantics not shown here; verify
def merge(that: StructType, caseSensitive: Boolean = true): StructType
// DDL string for the whole schema, e.g. "a INT, b STRING"
def toDDL: String
// Indented tree rendering of the schema (optionally depth-limited)
def treeString: String
def treeString(maxDepth: Int): String
// Prints treeString to stdout
def printTreeString(): Unit
// Advanced field operations
// Locate a (possibly nested) field by its name path; returns the parent path and the field
def findNestedField(
fieldNames: Seq[String],
includeCollections: Boolean = false,
resolver: Resolver = resolver
): Option[(Seq[String], StructField)]
}
object StructType {
  // Build from a Seq instead of an Array
  def apply(fields: Seq[StructField]): StructType
  // Parse a full struct schema from a DDL string, e.g. "a INT, b STRING"
  def fromDDL(ddl: String): StructType
  // Fields missing between the two schemas, or None when nothing is missing
  // NOTE(review): exact direction of the comparison not shown here — verify
  def findMissingFields(
    source: StructType,
    target: StructType,
    resolver: Resolver
  ): Option[StructType]
}

// Individual fields within struct types.
/**
 * Field within a StructType
 * @param name Field name
 * @param dataType Field data type
 * @param nullable Whether field can be null
 * @param metadata Field metadata (defaults to empty)
 */
case class StructField(
  name: String,
  dataType: DataType,
  nullable: Boolean,
  metadata: Metadata = Metadata.empty
) {
  // Metadata operations — with* methods return a copy; the field itself is immutable
  def withComment(comment: String): StructField
  def getComment(): Option[String]
  // Column DEFAULT value management — presumably stored in metadata; verify
  def withCurrentDefaultValue(value: String): StructField
  def getCurrentDefaultValue(): Option[String]
  def clearCurrentDefaultValue(): StructField
  // DDL fragment for this single field, e.g. "name STRING"
  def toDDL: String
}

// Base class for custom data types.
/**
 * Base for custom user-defined data types
 * @tparam UserType The user-facing Scala type; bounded `>: Null` so null values are representable
 */
abstract class UserDefinedType[UserType >: Null] extends DataType with Serializable {
  // The built-in type used for the serialized/storage representation
  def sqlType: DataType
  // Paired Python UDT identifier, if any — presumably a fully-qualified class name; verify
  def pyUDT: String
}
/**
 * Registry for user-defined types
 */
object UDTRegistration {
  // Registration methods for UDTs
}

// Metadata system for attaching additional information to schema fields.
/**
 * Metadata attached to schema fields
 *
 * A typed key-value store. Call contains(key) before a typed getter —
 * behavior on a missing or wrongly-typed key is not shown here (assumed to
 * throw; verify against Spark source).
 */
class Metadata {
// Various getters/setters for metadata values
// Whether a value exists for the given key
def contains(key: String): Boolean
// Scalar getters, one per supported value type
def getLong(key: String): Long
def getDouble(key: String): Double
def getString(key: String): String
def getBoolean(key: String): Boolean
// Metadata values can nest recursively
def getMetadata(key: String): Metadata
// Array getters mirroring the scalar ones
def getLongArray(key: String): Array[Long]
def getDoubleArray(key: String): Array[Double]
def getStringArray(key: String): Array[String]
def getBooleanArray(key: String): Array[Boolean]
def getMetadataArray(key: String): Array[Metadata]
}
object Metadata {
  // Shared empty instance (used as the StructField default)
  val empty: Metadata
  // Parse metadata from its JSON representation
  def fromJson(json: String): Metadata
}

// Creating schemas:
import org.apache.spark.sql.types._

// Simple flat schema: four scalar columns
val userSchema = StructType(Array(
  StructField("id", LongType, nullable = false),
  StructField("name", StringType, nullable = false),
  StructField("email", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true)
))

// Complex nested schema: an array of structs plus a string-to-string map
val orderSchema = StructType(Array(
  StructField("orderId", StringType, false),
  StructField("items", ArrayType(StructType(Array(
    StructField("productId", StringType, false),
    StructField("quantity", IntegerType, false),
    StructField("price", DecimalType(10, 2), false)
  )), containsNull = false), false),
  StructField("metadata", MapType(StringType, StringType, true), true)
))

// Working with decimal precision:
// High precision financial calculations
val moneyType = DecimalType(19, 4) // 19 digits total, 4 after decimal
val percentType = DecimalType(5, 4) // 5 digits total, 4 after decimal (0.9999 max)
// System defaults
val systemDecimal = DecimalType.SYSTEM_DEFAULT // DecimalType(38, 18)
val userDecimal = DecimalType.USER_DEFAULT // DecimalType(10, 0)

// Schema parsing:
// Parse from DDL string
// A full struct schema: column names paired with (possibly nested) type DDL
val parsedSchema = StructType.fromDDL(
"""
id BIGINT,
name STRING,
scores ARRAY<DOUBLE>,
profile MAP<STRING, STRING>
"""
)
// Parse individual types
// DataType.fromDDL handles a single type expression rather than a column list
val arrayType = DataType.fromDDL("ARRAY<STRING>")
val mapType = DataType.fromDDL("MAP<STRING, INT>")