A comprehensive type system for representing data schemas in Catalyst, covering SQL primitive types, complex nested types, and user-defined types.
The abstract base class for all Spark SQL data types.
/**
* The base type of all Spark SQL data types.
*/
abstract class DataType extends AbstractDataType {
/**
* The default size of a value of this data type, used internally for size estimation.
*/
def defaultSize: Int
/** Name of the type used in JSON serialization */
def typeName: String
/** The compact JSON representation of this data type */
def json: String
/** The pretty (i.e. indented) JSON representation of this data type */
def prettyJson: String
/** Readable string representation for the type */
def simpleString: String
/**
* Check if `this` and `other` are the same data type when ignoring nullability
* (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`).
*/
def sameType(other: DataType): Boolean
/**
* Returns the same data type but with all nullability fields set to true
* (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`).
*/
def asNullable: DataType
}
Usage Examples:
import org.apache.spark.sql.types._
// Get type information
val intType = IntegerType
val typeName = intType.typeName // "integer"
val json = intType.json // "\"integer\"" (a primitive type's JSON form is a quoted string)
val prettyJson = intType.prettyJson // "\"integer\""
val simple = intType.simpleString // "int"
val size = intType.defaultSize // 4
// Compare types
val stringType = StringType
val same = intType.sameType(stringType) // false
val sameInt = intType.sameType(IntegerType) // true
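sameType also ignores nullability on nested types, so it is handy for comparing schemas that differ only in their null flags; a small sketch:
val arrA = ArrayType(IntegerType, containsNull = true)
val arrB = ArrayType(IntegerType, containsNull = false)
val sameIgnoringNulls = arrA.sameType(arrB) // true: containsNull is ignored
val strictlyEqual = arrA == arrB // false: case-class equality compares nullability too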
// Create a fully nullable version of a type
val nonNullableArray = ArrayType(IntegerType, containsNull = false)
val nullableArray = nonNullableArray.asNullable
// ArrayType(IntegerType, containsNull = true)
Basic data types representing primitive values.
/** Boolean data type */
object BooleanType extends DataType {
def defaultSize: Int = 1
}
/** Byte data type */
object ByteType extends DataType {
def defaultSize: Int = 1
}
/** Short integer data type */
object ShortType extends DataType {
def defaultSize: Int = 2
}
/** Integer data type */
object IntegerType extends DataType {
def defaultSize: Int = 4
}
/** Long integer data type */
object LongType extends DataType {
def defaultSize: Int = 8
}
/** Float data type */
object FloatType extends DataType {
def defaultSize: Int = 4
}
/** Double data type */
object DoubleType extends DataType {
def defaultSize: Int = 8
}
/** String data type */
object StringType extends DataType {
def defaultSize: Int = 20
}
/** Binary data type */
object BinaryType extends DataType {
def defaultSize: Int = 100
}
/** Date data type */
object DateType extends DataType {
def defaultSize: Int = 4
}
/** Timestamp data type */
object TimestampType extends DataType {
def defaultSize: Int = 12
}
/** Null data type */
object NullType extends DataType {
def defaultSize: Int = 1
}
/** Calendar interval data type */
object CalendarIntervalType extends DataType {
def defaultSize: Int = 16
}
Usage Examples:
import org.apache.spark.sql.types._
// Use primitive types
val schema = StructType(Seq(
StructField("id", IntegerType, nullable = false),
StructField("name", StringType, nullable = true),
StructField("active", BooleanType, nullable = false),
StructField("score", DoubleType, nullable = true),
StructField("created", TimestampType, nullable = false)
))
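Because a struct's defaultSize sums the defaultSize of its fields (see the StructType definition below), it can serve as a rough per-row size estimate; a sketch using the schema above:
val estimatedRowSize = schema.defaultSize // rough per-row width in bytes
val manualEstimate = schema.fields.map(_.dataType.defaultSize).sum // same value, computed by hand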
// Type checking
def processValue(value: Any, dataType: DataType): String = dataType match {
case IntegerType => s"Integer: $value"
case StringType => s"String: $value"
case BooleanType => s"Boolean: $value"
case DoubleType => s"Double: $value"
case _ => s"Other: $value"
}
Data types for complex nested structures including arrays, maps, and structs.
/**
* Array data type with element type and nullability
*/
case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataType {
def defaultSize: Int = 100
/** Creates a copy of this ArrayType, optionally overriding the element type or nullability */
def copy(elementType: DataType = elementType, containsNull: Boolean = containsNull): ArrayType
}
/**
* Map data type with key/value types and nullability
*/
case class MapType(
keyType: DataType,
valueType: DataType,
valueContainsNull: Boolean) extends DataType {
def defaultSize: Int = 100
/** Creates a new MapType with updated parameters */
def copy(
keyType: DataType = keyType,
valueType: DataType = valueType,
valueContainsNull: Boolean = valueContainsNull): MapType
}
/**
* Struct data type with named fields
*/
case class StructType(fields: Array[StructField]) extends DataType {
def defaultSize: Int = fields.map(_.dataType.defaultSize).sum
/** Get field by name */
def apply(name: String): StructField
/** Get field names */
def fieldNames: Array[String]
/** Add a new field */
def add(field: StructField): StructType
def add(name: String, dataType: DataType): StructType
def add(name: String, dataType: DataType, nullable: Boolean): StructType
def add(name: String, dataType: DataType, nullable: Boolean, metadata: Metadata): StructType
}
/**
* Individual field in a struct with name, type, nullable flag, and metadata
*/
case class StructField(
name: String,
dataType: DataType,
nullable: Boolean,
metadata: Metadata = Metadata.empty) {
/** Creates a new StructField with updated parameters */
def copy(
name: String = name,
dataType: DataType = dataType,
nullable: Boolean = nullable,
metadata: Metadata = metadata): StructField
}
Usage Examples:
import org.apache.spark.sql.types._
// Create array type
val intArray = ArrayType(IntegerType, containsNull = true)
val stringArray = ArrayType(StringType, containsNull = false)
// Create map type
val stringToInt = MapType(StringType, IntegerType, valueContainsNull = false)
val idToUser = MapType(IntegerType, StringType, valueContainsNull = true)
// Create struct type
val userSchema = StructType(Array(
StructField("id", IntegerType, nullable = false),
StructField("name", StringType, nullable = true),
StructField("emails", ArrayType(StringType, containsNull = false), nullable = true),
StructField("properties", MapType(StringType, StringType, valueContainsNull = true), nullable = true)
))
// Access struct fields
val nameField = userSchema("name")
val fieldNames = userSchema.fieldNames // Array("id", "name", "emails", "properties")
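Since these are case classes, the copy methods listed above adjust a single attribute without rebuilding the whole type; a sketch reusing the values defined earlier:
val relaxedEmails = stringArray.copy(containsNull = true) // ArrayType(StringType, containsNull = true)
val requiredName = nameField.copy(nullable = false) // same field, but no longer nullable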
// Add fields to struct
val extendedSchema = userSchema
.add("created_at", TimestampType, nullable = false)
.add("active", BooleanType, nullable = false)
// Complex nested types
val nestedSchema = StructType(Array(
StructField("users", ArrayType(userSchema, containsNull = false), nullable = true),
StructField("metadata", MapType(StringType,
StructType(Array(
StructField("value", StringType, nullable = true),
StructField("type", StringType, nullable = false)
)), valueContainsNull = true), nullable = true)
))
High-precision decimal types for financial and scientific calculations.
/**
* Decimal data type with precision and scale
*/
case class DecimalType(precision: Int, scale: Int) extends DataType {
def defaultSize: Int = 8
/** Whether this decimal type is wider than another (the other can be cast into it losslessly) */
def isWiderThan(other: DataType): Boolean
/** Whether this decimal type is tighter than another */
def isTighterThan(other: DataType): Boolean
}
object DecimalType {
/** System default decimal type: DecimalType(38, 18) */
val SYSTEM_DEFAULT: DecimalType
/** User default decimal type: DecimalType(10, 0) */
val USER_DEFAULT: DecimalType
/** Maximum precision allowed */
val MAX_PRECISION: Int = 38
/** Maximum scale allowed */
val MAX_SCALE: Int = 38
/** Create decimal type with precision and scale */
def apply(precision: Int, scale: Int): DecimalType
/** Create unbounded decimal type */
def UNLIMITED(): DecimalType
}
/**
* Decimal value implementation with high precision arithmetic
*/
case class Decimal(value: java.math.BigDecimal) extends Ordered[Decimal] {
/** Convert to different precision and scale */
def toPrecision(precision: Int, scale: Int): Decimal
/** Basic arithmetic operations */
def +(that: Decimal): Decimal
def -(that: Decimal): Decimal
def *(that: Decimal): Decimal
def /(that: Decimal): Decimal
def %(that: Decimal): Decimal
/** Comparison operations */
def compare(that: Decimal): Int
/** Convert to other numeric types */
def toDouble: Double
def toFloat: Float
def toLong: Long
def toInt: Int
/** String representation */
override def toString: String
}
object Decimal {
/** Create decimal from various sources */
def apply(value: BigDecimal): Decimal
def apply(value: java.math.BigDecimal): Decimal
def apply(value: String): Decimal
def apply(value: Long): Decimal
def apply(value: Double): Decimal
/** Zero and One constants */
val ZERO: Decimal
val ONE: Decimal
}
Usage Examples:
import org.apache.spark.sql.types._
// Create decimal types
val currency = DecimalType(10, 2) // 10 digits, 2 decimal places
val percentage = DecimalType(5, 4) // 5 digits, 4 decimal places
val unlimited = DecimalType.UNLIMITED()
// Create decimal values
val price = Decimal("123.45")
val rate = Decimal(0.0825)
val count = Decimal(1000)
// Arithmetic operations
val total = price * count // Decimal("123450.00")
val withTax = total * (Decimal.ONE + rate) // Apply tax rate
val rounded = withTax.toPrecision(10, 2) // Round to currency precision
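Decimal extends Ordered[Decimal], so comparisons and conversions back to JVM numeric types follow the usual patterns; a short sketch with the values above:
val isTaxed = withTax.compare(total) > 0 // true: a positive rate was applied
val asDouble = rounded.toDouble
val asLong = rounded.toLong // drops the fractional part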
// Use in schema
val transactionSchema = StructType(Array(
StructField("id", StringType, nullable = false),
StructField("amount", DecimalType(12, 2), nullable = false),
StructField("tax_rate", DecimalType(6, 4), nullable = false),
StructField("total", DecimalType(15, 2), nullable = false)
))
Specialized types for advanced use cases including user-defined types and abstract type matching.
/**
* Base class for user-defined types
*/
abstract class UserDefinedType[UserType >: Null] extends DataType {
/** The underlying SQL data type for this UDT */
def sqlType: DataType
/** Serialize the user type to SQL type */
def serialize(obj: UserType): Any
/** Deserialize from SQL type to user type */
def deserialize(datum: Any): UserType
/** User class that this UDT represents */
def userClass: Class[UserType]
}
/**
* Abstract base for type matching in expressions
*/
abstract class AbstractDataType {
/** The default concrete type to use if we need to cast into this type */
def defaultConcreteType: DataType
/** Check if the given data type is an instance of this abstract type */
def acceptsType(other: DataType): Boolean
/** Readable string representation */
def simpleString: String
}
/**
* Type for JVM objects
*/
case class ObjectType(cls: Class[_]) extends DataType {
def defaultSize: Int = 4096
}
Usage Examples:
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
// Define a custom UDT for a Point class
case class Point(x: Double, y: Double)
class PointUDT extends UserDefinedType[Point] {
override def sqlType: DataType = StructType(Array(
StructField("x", DoubleType, nullable = false),
StructField("y", DoubleType, nullable = false)
))
override def serialize(point: Point): Any = {
val row = new GenericInternalRow(2)
row.setDouble(0, point.x)
row.setDouble(1, point.y)
row
}
override def deserialize(datum: Any): Point = {
datum match {
case row: InternalRow =>
Point(row.getDouble(0), row.getDouble(1))
}
}
override def userClass: Class[Point] = classOf[Point]
}
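A quick way to sanity-check the UDT is to round-trip a value through serialize and deserialize; an illustrative sketch:
val udt = new PointUDT()
val sqlValue = udt.serialize(Point(1.0, 2.0)) // InternalRow with two doubles
val restored = udt.deserialize(sqlValue) // Point(1.0, 2.0)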
// Use custom UDT in schema
val pointUDT = new PointUDT()
val locationSchema = StructType(Array(
StructField("name", StringType, nullable = false),
StructField("location", pointUDT, nullable = false)
))
// Object type for arbitrary JVM objects
val stringObjectType = ObjectType(classOf[String])
val listObjectType = ObjectType(classOf[java.util.List[_]])
Utility objects and methods for working with the type system.
object DataType {
/**
* Compares two types, ignoring nullability of ArrayType, MapType, StructType.
*/
def equalsIgnoreNullability(left: DataType, right: DataType): Boolean
/**
* Compares two types, ignoring compatible nullability.
*/
def equalsIgnoreCompatibleNullability(left: DataType, right: DataType): Boolean
/** Parse a data type from its JSON representation */
def fromJson(json: String): DataType
/** Parse a data type from DDL string */
def fromDDL(ddl: String): DataType
}
/** Metadata for struct fields and other schema elements */
case class Metadata(map: Map[String, Any]) {
def contains(key: String): Boolean
def getString(key: String): String
def getLong(key: String): Long
def getDouble(key: String): Double
def getBoolean(key: String): Boolean
def getMetadata(key: String): Metadata
def getStringArray(key: String): Array[String]
def getLongArray(key: String): Array[Long]
def getDoubleArray(key: String): Array[Double]
def getBooleanArray(key: String): Array[Boolean]
def getMetadataArray(key: String): Array[Metadata]
def json: String
}
object Metadata {
/** Empty metadata */
val empty: Metadata
/** Create metadata from JSON */
def fromJson(json: String): Metadata
/** Create metadata builder */
def builder(): MetadataBuilder
}
Usage Examples:
import org.apache.spark.sql.types._
// Compare types ignoring nullability
val nullable = StructType(Array(StructField("x", IntegerType, nullable = true)))
val nonNullable = StructType(Array(StructField("x", IntegerType, nullable = false)))
val equal = DataType.equalsIgnoreNullability(nullable, nonNullable) // true
// Parse types from strings
val parsedType = DataType.fromDDL("struct<id:int,name:string>")
val jsonType = DataType.fromJson("\"integer\"") // primitive types are stored as plain JSON strings
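Full schemas round-trip through JSON the same way, since every DataType exposes json; a sketch using the struct defined above:
val roundTripped = DataType.fromJson(nullable.json)
val matchesOriginal = roundTripped.sameType(nullable) // true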
// Work with metadata
val metadata = Metadata.builder()
.putString("description", "User identifier")
.putLong("version", 1L)
.putBoolean("required", true)
.build()
val fieldWithMetadata = StructField("id", IntegerType, nullable = false, metadata)
// Access metadata
val description = fieldWithMetadata.metadata.getString("description")
val version = fieldWithMetadata.metadata.getLong("version")
val required = fieldWithMetadata.metadata.getBoolean("required")
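Metadata is JSON-serializable as well, so it can be persisted alongside a schema and restored later; a short sketch with the metadata built above:
val metadataJson = fieldWithMetadata.metadata.json
val restoredMetadata = Metadata.fromJson(metadataJson)
val hasDescription = restoredMetadata.contains("description") // true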