# Data Types and Structures

This section covers the core data type system of Spark Catalyst, including primitive types, complex types (arrays, maps, structs), Row interface for data access, and encoders for type conversion.

## Core Imports

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.Encoders
```

## Row Interface

The Row trait provides the primary interface for accessing structured data in Spark SQL.

### Row (trait)

```scala { .api }
trait Row {
  def apply(i: Int): Any
  def get(i: Int): Any
  def isNullAt(i: Int): Boolean
  def getInt(i: Int): Int
  def getLong(i: Int): Long
  def getFloat(i: Int): Float
  def getDouble(i: Int): Double
  def getString(i: Int): String
  def getBoolean(i: Int): Boolean
  def getByte(i: Int): Byte
  def getShort(i: Int): Short
  def getDecimal(i: Int): java.math.BigDecimal
  def getDate(i: Int): java.sql.Date
  def getTimestamp(i: Int): java.sql.Timestamp
  def getSeq[T](i: Int): Seq[T]
  def getList[T](i: Int): java.util.List[T]
  def getMap[K, V](i: Int): scala.collection.Map[K, V]
  def getJavaMap[K, V](i: Int): java.util.Map[K, V]
  def getStruct(i: Int): Row
  def getAs[T](i: Int): T
  def getAs[T](fieldName: String): T
  def length: Int
  def size: Int
  def schema: StructType
  def copy(): Row
  def toSeq: Seq[Any]
}
```

### Row (object)

Factory methods for creating Row instances.

```scala { .api }
object Row {
  def unapplySeq(row: Row): Some[Seq[Any]]
  def apply(values: Any*): Row
  def fromSeq(values: Seq[Any]): Row
  def fromTuple(tuple: Product): Row
  def merge(rows: Row*): Row
  def empty: Row
}
```

#### Usage Example

```scala
import org.apache.spark.sql.Row

// Create a row
val row = Row("Alice", 25, true)

// Access data by index
val name: String = row.getString(0)
val age: Int = row.getInt(1)
val isActive: Boolean = row.getBoolean(2)

// Generic access
val nameGeneric: String = row.getAs[String](0)

// Check for null values
if (!row.isNullAt(1)) {
  val age = row.getInt(1)
}
```

## Data Type Hierarchy

### DataType (abstract class)

Base class for all Spark SQL data types.

```scala { .api }
abstract class DataType {
  def defaultSize: Int
  def typeName: String
  def json: String
  def prettyJson: String
  def simpleString: String
  def catalogString: String
  def sql: String
  def sameType(other: DataType): Boolean
  def asNullable: DataType
  def existsRecursively(f: (DataType) => Boolean): Boolean
}
```

### DataType (object)

Factory methods and utilities for DataType instances.

```scala { .api }
object DataType {
  def fromJson(json: String): DataType
  def fromDDL(ddl: String): DataType
  def equalsIgnoreCompatibleNullability(from: DataType, to: DataType): Boolean
  def equalsIgnoreNullability(from: DataType, to: DataType): Boolean
}
```

## Primitive Data Types

### Numeric Types

```scala { .api }
case object ByteType extends IntegralType
case object ShortType extends IntegralType
case object IntegerType extends IntegralType
case object LongType extends IntegralType
case object FloatType extends FractionalType
case object DoubleType extends FractionalType

case class DecimalType(precision: Int, scale: Int) extends FractionalType {
  def this() = this(10, 0)
}

object DecimalType {
  val USER_DEFAULT: DecimalType
  val SYSTEM_DEFAULT: DecimalType
  def apply(): DecimalType
  def bounded(precision: Int, scale: Int): DecimalType
  def unbounded: DecimalType
  def unapply(t: DataType): Option[(Int, Int)]
}
```

### String and Binary Types

```scala { .api }
case object StringType extends AtomicType
case object BinaryType extends AtomicType
```

### Boolean Type

```scala { .api }
case object BooleanType extends AtomicType
```

### Date and Time Types

```scala { .api }
case object DateType extends AtomicType
case object TimestampType extends AtomicType
case object CalendarIntervalType extends DataType
```

### Null Type

```scala { .api }
case object NullType extends DataType
```

## Complex Data Types

### Array Type

```scala { .api }
case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataType {
  def this(elementType: DataType) = this(elementType, containsNull = true)
  def buildFormattedString(prefix: String, buffer: StringBuffer): Unit
}

object ArrayType {
  def apply(elementType: DataType): ArrayType
}
```

### Map Type

```scala { .api }
case class MapType(
  keyType: DataType,
  valueType: DataType,
  valueContainsNull: Boolean
) extends DataType {
  def this(keyType: DataType, valueType: DataType) =
    this(keyType, valueType, valueContainsNull = true)
  def buildFormattedString(prefix: String, buffer: StringBuffer): Unit
}

object MapType {
  def apply(keyType: DataType, valueType: DataType): MapType
}
```

### Struct Type

```scala { .api }
case class StructType(fields: Array[StructField]) extends DataType {
  def this(fields: Seq[StructField]) = this(fields.toArray)
  def this(fields: java.util.List[StructField]) = this(fields.asScala.toArray)

  def apply(name: String): StructField
  def apply(names: Set[String]): StructType
  def fieldNames: Array[String]
  def names: Seq[String]
  def length: Int
  def iterator: Iterator[StructField]
  def getFieldIndex(name: String): Option[Int]
  def indexOf(name: String): Int
  def add(field: StructField): StructType
  def add(name: String, dataType: DataType): StructType
  def add(name: String, dataType: DataType, nullable: Boolean): StructType
  def add(name: String, dataType: DataType, nullable: Boolean, metadata: Metadata): StructType
  def add(name: String, dataType: String): StructType
  def add(name: String, dataType: String, nullable: Boolean): StructType
  def add(name: String, dataType: String, nullable: Boolean, metadata: Metadata): StructType
}

object StructType {
  def apply(fields: Seq[StructField]): StructType
  def apply(fields: java.util.List[StructField]): StructType
  def fromDDL(ddl: String): StructType
}
```

### Struct Field

```scala { .api }
case class StructField(
  name: String,
  dataType: DataType,
  nullable: Boolean,
  metadata: Metadata
) {
  def this(name: String, dataType: DataType, nullable: Boolean) =
    this(name, dataType, nullable, Metadata.empty)
  def this(name: String, dataType: DataType) =
    this(name, dataType, nullable = true, Metadata.empty)

  def getComment(): Option[String]
  def withComment(comment: String): StructField
}

object StructField {
  def apply(name: String, dataType: DataType): StructField
  def apply(name: String, dataType: DataType, nullable: Boolean): StructField
}
```

#### Usage Example

```scala
import org.apache.spark.sql.types._

// Create a struct type
val schema = StructType(Array(
  StructField("name", StringType, nullable = false),
  StructField("age", IntegerType, nullable = true),
  StructField("scores", ArrayType(DoubleType), nullable = true),
  StructField("metadata", MapType(StringType, StringType), nullable = true)
))

// Access field information
val nameField = schema("name")
val fieldNames = schema.fieldNames
val nameIndex = schema.indexOf("name")
```

## Encoders

Type-safe conversion between JVM objects and Spark SQL's internal representation.

### Encoder (trait)

```scala { .api }
trait Encoder[T] {
  def schema: StructType
  def clsTag: ClassTag[T]
}
```

### Encoders (object)

Factory methods for creating encoder instances.

```scala { .api }
object Encoders {
  def BOOLEAN: Encoder[java.lang.Boolean]
  def BYTE: Encoder[java.lang.Byte]
  def SHORT: Encoder[java.lang.Short]
  def INT: Encoder[java.lang.Integer]
  def LONG: Encoder[java.lang.Long]
  def FLOAT: Encoder[java.lang.Float]
  def DOUBLE: Encoder[java.lang.Double]
  def STRING: Encoder[java.lang.String]
  def DECIMAL: Encoder[java.math.BigDecimal]
  def DATE: Encoder[java.sql.Date]
  def TIMESTAMP: Encoder[java.sql.Timestamp]
  def BINARY: Encoder[Array[Byte]]

  def bean[T](beanClass: Class[T]): Encoder[T]
  def kryo[T: ClassTag]: Encoder[T]
  def kryo[T](clazz: Class[T]): Encoder[T]
  def javaSerialization[T: ClassTag]: Encoder[T]
  def javaSerialization[T](clazz: Class[T]): Encoder[T]

  def tuple[T1, T2](e1: Encoder[T1], e2: Encoder[T2]): Encoder[(T1, T2)]
  def tuple[T1, T2, T3](e1: Encoder[T1], e2: Encoder[T2], e3: Encoder[T3]): Encoder[(T1, T2, T3)]
  def tuple[T1, T2, T3, T4](e1: Encoder[T1], e2: Encoder[T2], e3: Encoder[T3], e4: Encoder[T4]): Encoder[(T1, T2, T3, T4)]
  def tuple[T1, T2, T3, T4, T5](e1: Encoder[T1], e2: Encoder[T2], e3: Encoder[T3], e4: Encoder[T4], e5: Encoder[T5]): Encoder[(T1, T2, T3, T4, T5)]

  def product[T <: Product : TypeTag]: Encoder[T]

  def scalaInt: Encoder[Int]
  def scalaLong: Encoder[Long]
  def scalaDouble: Encoder[Double]
  def scalaFloat: Encoder[Float]
  def scalaByte: Encoder[Byte]
  def scalaShort: Encoder[Short]
  def scalaBoolean: Encoder[Boolean]
}
```

#### Usage Example

```scala
import org.apache.spark.sql.Encoders

// Primitive encoders
val stringEncoder = Encoders.STRING
val intEncoder = Encoders.scalaInt

// Product encoder for Scala case classes (use Encoders.bean for Java beans)
case class Person(name: String, age: Int)
val personEncoder = Encoders.product[Person]

// Kryo encoder for complex objects
val kryoEncoder = Encoders.kryo[MyComplexClass]
```

## Metadata

Associated metadata for struct fields.

```scala { .api }
case class Metadata(map: Map[String, Any]) {
  def contains(key: String): Boolean
  def getLong(key: String): Long
  def getDouble(key: String): Double
  def getBoolean(key: String): Boolean
  def getString(key: String): String
  def getMetadata(key: String): Metadata
  def getLongArray(key: String): Array[Long]
  def getDoubleArray(key: String): Array[Double]
  def getBooleanArray(key: String): Array[Boolean]
  def getStringArray(key: String): Array[String]
  def getMetadataArray(key: String): Array[Metadata]
  def json: String
}

object Metadata {
  def empty: Metadata
  def fromJson(json: String): Metadata
}
```

## Exception Types

### AnalysisException

Exception thrown when query analysis fails.

```scala { .api }
class AnalysisException(
  message: String,
  line: Option[Int] = None,
  startPosition: Option[Int] = None,
  plan: Option[LogicalPlan] = None,
  cause: Option[Throwable] = None
) extends Exception {
  def withPosition(line: Option[Int], startPosition: Option[Int]): AnalysisException
  override def getMessage: String
  def getSimpleMessage: String
}
```