# Data Types

Catalyst's data type system provides comprehensive support for all SQL data types with full type safety, JSON serialization, and metadata support. The type system is the foundation for schema definition, expression evaluation, and query optimization.

## Core Imports

```scala
import org.apache.spark.sql.types._
```
## Type Hierarchy

### DataType Base Class

```scala { .api }
abstract class DataType extends AbstractDataType {
  def defaultSize: Int
  def typeName: String
  def json: String
  def prettyJson: String
  def simpleString: String
  def catalogString: String
  def sql: String
  private[spark] def sameType(other: DataType): Boolean
  private[spark] def asNullable: DataType
}
```

The base `DataType` class provides common functionality for all Spark SQL types including serialization, string representation, and type comparison.
## Primitive Types

### Boolean and Numeric Types

```scala { .api }
object BooleanType extends DataType
object ByteType extends DataType
object ShortType extends DataType
object IntegerType extends DataType
object LongType extends DataType
object FloatType extends DataType
object DoubleType extends DataType

case class DecimalType(precision: Int, scale: Int) extends FractionalType {
  def this() = this(10, 0)
}

object DecimalType {
  val SYSTEM_DEFAULT: DecimalType
  val USER_DEFAULT: DecimalType
  def apply(): DecimalType
  def bounded(precision: Int, scale: Int): DecimalType
  def unbounded: DecimalType
}
```

**Usage Example:**
```scala
val intCol = StructField("count", IntegerType, nullable = false)
val priceCol = StructField("price", DecimalType(10, 2), nullable = true)
val flagCol = StructField("active", BooleanType, nullable = false)
```
62
63
### String Types
64
65
```scala { .api }
66
class StringType(val collationId: Int) extends AtomicType {
67
def this() = this(0)
68
def getCollationSql: String
69
}
70
71
object StringType extends StringType(0)
72
73
case class CharType(length: Int) extends AtomicType
74
case class VarcharType(length: Int) extends AtomicType
75
```
76
77
**Usage Example:**
78
```scala
79
val nameCol = StructField("name", StringType, nullable = false)
80
val codeCol = StructField("code", CharType(10), nullable = false)
81
val descCol = StructField("description", VarcharType(255), nullable = true)
82
```
83
84
### Binary and Date/Time Types
85
86
```scala { .api }
87
object BinaryType extends DataType
88
object DateType extends DataType
89
object TimestampType extends DataType
90
object TimestampNTZType extends DataType
91
object CalendarIntervalType extends DataType
92
93
case class DayTimeIntervalType(startField: Byte, endField: Byte) extends DataType
94
case class YearMonthIntervalType(startField: Byte, endField: Byte) extends DataType
95
```
96
97
**Usage Example:**
98
```scala
99
val createdCol = StructField("created_at", TimestampType, nullable = false)
100
val birthdateCol = StructField("birthdate", DateType, nullable = true)
101
val dataCol = StructField("data", BinaryType, nullable = true)
102
```
### Special Types

```scala { .api }
object NullType extends DataType
object VariantType extends DataType
case class ObjectType(cls: Class[_]) extends DataType
```
## Complex Types

### StructType and StructField

```scala { .api }
case class StructType(fields: Array[StructField]) extends DataType {
  def this(fields: Seq[StructField]) = this(fields.toArray)
  def this(fields: java.util.List[StructField]) = this(fields.asScala.toArray)

  // Field access
  def apply(name: String): StructField
  def apply(names: Seq[String]): StructField
  def fieldNames: Array[String]
  def names: Seq[String]
  def length: Int
  def iterator: Iterator[StructField]

  // Field manipulation
  def add(field: StructField): StructType
  def add(name: String, dataType: DataType): StructType
  def add(name: String, dataType: DataType, nullable: Boolean): StructType
  def add(name: String, dataType: DataType, nullable: Boolean, metadata: Metadata): StructType
  def add(name: String, dataType: String): StructType
  def add(name: String, dataType: String, nullable: Boolean): StructType
  def add(name: String, dataType: String, nullable: Boolean, metadata: Metadata): StructType

  // Schema operations
  def merge(that: StructType): StructType
  def remove(fieldNames: Set[String]): StructType
  def dropFields(fieldNames: String*): StructType
  def getFieldIndex(name: String): Option[Int]
}

case class StructField(
  name: String,
  dataType: DataType,
  nullable: Boolean = true,
  metadata: Metadata = Metadata.empty
) {
  def getComment(): Option[String]
}
```

**Usage Example:**
```scala
val schema = StructType(Array(
  StructField("id", LongType, nullable = false),
  StructField("name", StringType, nullable = false),
  StructField("email", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true),
  StructField("created_at", TimestampType, nullable = false)
))

// Add fields
val extendedSchema = schema
  .add("last_login", TimestampType, nullable = true)
  .add("is_active", BooleanType, nullable = false)

// Access fields
val nameField = schema("name")
val fieldNames = schema.fieldNames
```
### ArrayType

```scala { .api }
case class ArrayType(elementType: DataType, containsNull: Boolean = true) extends DataType {
  def simpleString: String
}

object ArrayType {
  def apply(elementType: DataType): ArrayType = ArrayType(elementType, containsNull = true)
}
```

**Usage Example:**
```scala
val tagsCol = StructField("tags", ArrayType(StringType), nullable = true)
val scoresCol = StructField("scores", ArrayType(IntegerType, containsNull = false), nullable = false)
val nestedCol = StructField("matrix", ArrayType(ArrayType(DoubleType)), nullable = true)
```
193
194
### MapType
195
196
```scala { .api }
197
case class MapType(
198
keyType: DataType,
199
valueType: DataType,
200
valueContainsNull: Boolean = true
201
) extends DataType
202
203
object MapType {
204
def apply(keyType: DataType, valueType: DataType): MapType =
205
MapType(keyType, valueType, valueContainsNull = true)
206
}
207
```
208
209
**Usage Example:**
210
```scala
211
val propsCol = StructField("properties", MapType(StringType, StringType), nullable = true)
212
val countsCol = StructField("counts", MapType(StringType, LongType, valueContainsNull = false), nullable = false)
213
val nestedCol = StructField("nested", MapType(StringType, ArrayType(IntegerType)), nullable = true)
214
```
215
216
## Metadata
217
218
```scala { .api }
219
class Metadata private (private val map: Map[String, Any]) {
220
def contains(key: String): Boolean
221
def getLong(key: String): Long
222
def getDouble(key: String): Double
223
def getBoolean(key: String): Boolean
224
def getString(key: String): String
225
def getMetadata(key: String): Metadata
226
def getLongArray(key: String): Array[Long]
227
def getDoubleArray(key: String): Array[Double]
228
def getBooleanArray(key: String): Array[Boolean]
229
def getStringArray(key: String): Array[String]
230
def getMetadataArray(key: String): Array[Metadata]
231
def json: String
232
}
233
234
object Metadata {
235
val empty: Metadata
236
def fromJson(json: String): Metadata
237
}
238
239
class MetadataBuilder {
240
def putLong(key: String, value: Long): MetadataBuilder
241
def putDouble(key: String, value: Double): MetadataBuilder
242
def putBoolean(key: String, value: Boolean): MetadataBuilder
243
def putString(key: String, value: String): MetadataBuilder
244
def putMetadata(key: String, value: Metadata): MetadataBuilder
245
def putLongArray(key: String, value: Array[Long]): MetadataBuilder
246
def putDoubleArray(key: String, value: Array[Double]): MetadataBuilder
247
def putBooleanArray(key: String, value: Array[Boolean]): MetadataBuilder
248
def putStringArray(key: String, value: Array[String]): MetadataBuilder
249
def putMetadataArray(key: String, value: Array[Metadata]): MetadataBuilder
250
def remove(key: String): MetadataBuilder
251
def build(): Metadata
252
}
253
```
254
255
**Usage Example:**
256
```scala
257
val metadata = new MetadataBuilder()
258
.putString("comment", "User identifier")
259
.putLong("maxLength", 100)
260
.putBoolean("required", true)
261
.build()
262
263
val fieldWithMetadata = StructField("user_id", StringType, nullable = false, metadata)
264
```
## User-Defined Types

```scala { .api }
abstract class UserDefinedType[UserType >: Null] extends DataType {
  def sqlType: DataType
  def serialize(obj: UserType): Any
  def deserialize(datum: Any): UserType
  def userClass: Class[UserType]
  def equals(o: Any): Boolean
  def hashCode(): Int
  def typeName: String
}

object UDTRegistration {
  def register(udtClass: String, udt: String): Unit
  def register(udtClass: Class[_], udt: Class[_ <: UserDefinedType[_]]): Unit
  def exists(udtClass: String): Boolean
  def getUDTFor(udtClass: String): Option[UserDefinedType[_]]
}
```

**Usage Example:**
```scala
// Define a custom UDT
class PointUDT extends UserDefinedType[Point] {
  override def sqlType: DataType = StructType(Seq(
    StructField("x", DoubleType, false),
    StructField("y", DoubleType, false)
  ))

  override def serialize(point: Point): Any = {
    InternalRow(point.x, point.y)
  }

  override def deserialize(datum: Any): Point = {
    val row = datum.asInstanceOf[InternalRow]
    Point(row.getDouble(0), row.getDouble(1))
  }

  override def userClass: Class[Point] = classOf[Point]
}

// Register the UDT
UDTRegistration.register(classOf[Point].getName, classOf[PointUDT].getName)
```
## Type Conversion and Utilities

```scala { .api }
object DataType {
  def fromJson(json: String): DataType
  def fromDDL(ddl: String): DataType
  def equalsIgnoreNullability(left: DataType, right: DataType): Boolean
  def equalsIgnoreCaseAndNullability(left: DataType, right: DataType): Boolean
}

abstract class AbstractDataType {
  def defaultConcreteType: DataType
  def acceptsType(other: DataType): Boolean
  def simpleString: String
}
```

**Usage Example:**
```scala
// Parse DDL
val schema = DataType.fromDDL("struct<name:string,age:int,scores:array<double>>")

// Type checking: a concrete DataType accepts only the same type
val accepts = IntegerType.acceptsType(IntegerType) // true
val equal = DataType.equalsIgnoreNullability(
  StructType(Seq(StructField("x", IntegerType, true))),
  StructType(Seq(StructField("x", IntegerType, false)))
) // true
```
341
342
## Common Patterns
343
344
### Schema Evolution
345
```scala
346
// Start with base schema
347
val v1Schema = StructType(Array(
348
StructField("id", LongType, false),
349
StructField("name", StringType, false)
350
))
351
352
// Add new fields (backwards compatible)
353
val v2Schema = v1Schema
354
.add("email", StringType, nullable = true)
355
.add("created_at", TimestampType, nullable = true)
356
357
// Merge schemas
358
val mergedSchema = v1Schema.merge(v2Schema)
359
```
360
361
### Dynamic Schema Creation
362
```scala
363
def createSchema(fields: Seq[(String, DataType, Boolean)]): StructType = {
364
StructType(fields.map { case (name, dataType, nullable) =>
365
StructField(name, dataType, nullable)
366
}.toArray)
367
}
368
369
val dynamicSchema = createSchema(Seq(
370
("user_id", LongType, false),
371
("preferences", MapType(StringType, StringType), true),
372
("tags", ArrayType(StringType), true)
373
))
374
```
375
376
The data type system in Catalyst provides comprehensive support for all SQL data types with rich metadata capabilities, making it suitable for complex schema evolution and type-safe query processing.