or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

connectors.md data-types.md expressions.md index.md query-plans.md

docs/data-types.md

0

# Data Types

1

2

Catalyst's data type system provides comprehensive support for all SQL data types with full type safety, JSON serialization, and metadata support. The type system is the foundation for schema definition, expression evaluation, and query optimization.

3

4

## Core Imports

5

6

```scala

7

import org.apache.spark.sql.types._

8

```

9

10

## Type Hierarchy

11

12

### DataType Base Class

13

14

```scala { .api }

15

abstract class DataType extends AbstractDataType {

16

def defaultSize: Int

17

def typeName: String

18

def json: String

19

def prettyJson: String

20

def simpleString: String

21

def catalogString: String

22

def sql: String

23

private[spark] def sameType(other: DataType): Boolean

24

private[spark] def asNullable: DataType

25

}

26

```

27

28

The base `DataType` class provides common functionality for all Spark SQL types including serialization, string representation, and type comparison.

29

30

## Primitive Types

31

32

### Boolean and Numeric Types

33

34

```scala { .api }

35

object BooleanType extends DataType

36

object ByteType extends DataType

37

object ShortType extends DataType

38

object IntegerType extends DataType

39

object LongType extends DataType

40

object FloatType extends DataType

41

object DoubleType extends DataType

42

43

case class DecimalType(precision: Int, scale: Int) extends FractionalType {

44

def this() = this(10, 0)

45

}

46

47

object DecimalType {

48

val SYSTEM_DEFAULT: DecimalType

49

val USER_DEFAULT: DecimalType

50

def apply(): DecimalType

51

def bounded(precision: Int, scale: Int): DecimalType

52

def unbounded: DecimalType

53

}

54

```

55

56

**Usage Example:**

57

```scala

58

val intCol = StructField("count", IntegerType, nullable = false)

59

val priceCol = StructField("price", DecimalType(10, 2), nullable = true)

60

val flagCol = StructField("active", BooleanType, nullable = false)

61

```

62

63

### String Types

64

65

```scala { .api }

66

class StringType(val collationId: Int) extends AtomicType {

67

def this() = this(0)

68

def getCollationSql: String

69

}

70

71

object StringType extends StringType(0)

72

73

case class CharType(length: Int) extends AtomicType

74

case class VarcharType(length: Int) extends AtomicType

75

```

76

77

**Usage Example:**

78

```scala

79

val nameCol = StructField("name", StringType, nullable = false)

80

val codeCol = StructField("code", CharType(10), nullable = false)

81

val descCol = StructField("description", VarcharType(255), nullable = true)

82

```

83

84

### Binary and Date/Time Types

85

86

```scala { .api }

87

object BinaryType extends DataType

88

object DateType extends DataType

89

object TimestampType extends DataType

90

object TimestampNTZType extends DataType

91

object CalendarIntervalType extends DataType

92

93

case class DayTimeIntervalType(startField: Byte, endField: Byte) extends DataType

94

case class YearMonthIntervalType(startField: Byte, endField: Byte) extends DataType

95

```

96

97

**Usage Example:**

98

```scala

99

val createdCol = StructField("created_at", TimestampType, nullable = false)

100

val birthdateCol = StructField("birthdate", DateType, nullable = true)

101

val dataCol = StructField("data", BinaryType, nullable = true)

102

```

103

104

### Special Types

105

106

```scala { .api }

107

object NullType extends DataType

108

object VariantType extends DataType

109

case class ObjectType(cls: Class[_]) extends DataType

110

```

111

112

## Complex Types

113

114

### StructType and StructField

115

116

```scala { .api }

117

case class StructType(fields: Array[StructField]) extends DataType {

118

def this(fields: Seq[StructField]) = this(fields.toArray)

119

def this(fields: java.util.List[StructField]) = this(fields.asScala.toArray)

120

121

// Field access

122

def apply(name: String): StructField

123

def apply(names: Seq[String]): StructField

124

def fieldNames: Array[String]

125

def names: Seq[String]

126

def length: Int

127

def iterator: Iterator[StructField]

128

129

// Field manipulation

130

def add(field: StructField): StructType

131

def add(name: String, dataType: DataType): StructType

132

def add(name: String, dataType: DataType, nullable: Boolean): StructType

133

def add(name: String, dataType: DataType, nullable: Boolean, metadata: Metadata): StructType

134

def add(name: String, dataType: String): StructType

135

def add(name: String, dataType: String, nullable: Boolean): StructType

136

def add(name: String, dataType: String, nullable: Boolean, metadata: Metadata): StructType

137

138

// Schema operations

139

def merge(that: StructType): StructType

140

def remove(fieldNames: Set[String]): StructType

141

def dropFields(fieldNames: String*): StructType

142

def getFieldIndex(name: String): Option[Int]

143

}

144

145

case class StructField(

146

name: String,

147

dataType: DataType,

148

nullable: Boolean = true,

149

metadata: Metadata = Metadata.empty

150

) {

151

def getComment(): Option[String]

152

}

153

```

154

155

**Usage Example:**

156

```scala

157

val schema = StructType(Array(

158

StructField("id", LongType, nullable = false),

159

StructField("name", StringType, nullable = false),

160

StructField("email", StringType, nullable = true),

161

StructField("age", IntegerType, nullable = true),

162

StructField("created_at", TimestampType, nullable = false)

163

))

164

165

// Add fields

166

val extendedSchema = schema

167

.add("last_login", TimestampType, nullable = true)

168

.add("is_active", BooleanType, nullable = false)

169

170

// Access fields

171

val nameField = schema("name")

172

val fieldNames = schema.fieldNames

173

```

174

175

### ArrayType

176

177

```scala { .api }

178

case class ArrayType(elementType: DataType, containsNull: Boolean = true) extends DataType {

179

def simpleString: String

180

}

181

182

object ArrayType {

183

def apply(elementType: DataType): ArrayType = ArrayType(elementType, containsNull = true)

184

}

185

```

186

187

**Usage Example:**

188

```scala

189

val tagsCol = StructField("tags", ArrayType(StringType), nullable = true)

190

val scoresCol = StructField("scores", ArrayType(IntegerType, containsNull = false), nullable = false)

191

val nestedCol = StructField("matrix", ArrayType(ArrayType(DoubleType)), nullable = true)

192

```

193

194

### MapType

195

196

```scala { .api }

197

case class MapType(

198

keyType: DataType,

199

valueType: DataType,

200

valueContainsNull: Boolean = true

201

) extends DataType

202

203

object MapType {

204

def apply(keyType: DataType, valueType: DataType): MapType =

205

MapType(keyType, valueType, valueContainsNull = true)

206

}

207

```

208

209

**Usage Example:**

210

```scala

211

val propsCol = StructField("properties", MapType(StringType, StringType), nullable = true)

212

val countsCol = StructField("counts", MapType(StringType, LongType, valueContainsNull = false), nullable = false)

213

val nestedCol = StructField("nested", MapType(StringType, ArrayType(IntegerType)), nullable = true)

214

```

215

216

## Metadata

217

218

```scala { .api }

219

class Metadata private (private val map: Map[String, Any]) {

220

def contains(key: String): Boolean

221

def getLong(key: String): Long

222

def getDouble(key: String): Double

223

def getBoolean(key: String): Boolean

224

def getString(key: String): String

225

def getMetadata(key: String): Metadata

226

def getLongArray(key: String): Array[Long]

227

def getDoubleArray(key: String): Array[Double]

228

def getBooleanArray(key: String): Array[Boolean]

229

def getStringArray(key: String): Array[String]

230

def getMetadataArray(key: String): Array[Metadata]

231

def json: String

232

}

233

234

object Metadata {

235

val empty: Metadata

236

def fromJson(json: String): Metadata

237

}

238

239

class MetadataBuilder {

240

def putLong(key: String, value: Long): MetadataBuilder

241

def putDouble(key: String, value: Double): MetadataBuilder

242

def putBoolean(key: String, value: Boolean): MetadataBuilder

243

def putString(key: String, value: String): MetadataBuilder

244

def putMetadata(key: String, value: Metadata): MetadataBuilder

245

def putLongArray(key: String, value: Array[Long]): MetadataBuilder

246

def putDoubleArray(key: String, value: Array[Double]): MetadataBuilder

247

def putBooleanArray(key: String, value: Array[Boolean]): MetadataBuilder

248

def putStringArray(key: String, value: Array[String]): MetadataBuilder

249

def putMetadataArray(key: String, value: Array[Metadata]): MetadataBuilder

250

def remove(key: String): MetadataBuilder

251

def build(): Metadata

252

}

253

```

254

255

**Usage Example:**

256

```scala

257

val metadata = new MetadataBuilder()

258

.putString("comment", "User identifier")

259

.putLong("maxLength", 100)

260

.putBoolean("required", true)

261

.build()

262

263

val fieldWithMetadata = StructField("user_id", StringType, nullable = false, metadata)

264

```

265

266

## User-Defined Types

267

268

```scala { .api }

269

abstract class UserDefinedType[UserType >: Null] extends DataType {

270

def sqlType: DataType

271

def serialize(obj: UserType): Any

272

def deserialize(datum: Any): UserType

273

def userClass: Class[UserType]

274

def equals(o: Any): Boolean

275

def hashCode(): Int

276

def typeName: String

277

}

278

279

object UDTRegistration {

280

def register(udtClass: String, udt: String): Unit

281

def register(udtClass: Class[_], udt: Class[_ <: UserDefinedType[_]]): Unit

282

def exists(udtClass: String): Boolean

283

def getUDTFor(udtClass: String): Option[UserDefinedType[_]]

284

}

285

```

286

287

**Usage Example:**

288

```scala

289

// Define a custom UDT (assumes a user class such as `case class Point(x: Double, y: Double)`)

290

class PointUDT extends UserDefinedType[Point] {

291

override def sqlType: DataType = StructType(Seq(

292

StructField("x", DoubleType, false),

293

StructField("y", DoubleType, false)

294

))

295

296

override def serialize(point: Point): Any = {

297

InternalRow(point.x, point.y)

298

}

299

300

override def deserialize(datum: Any): Point = {

301

val row = datum.asInstanceOf[InternalRow]

302

Point(row.getDouble(0), row.getDouble(1))

303

}

304

305

override def userClass: Class[Point] = classOf[Point]

306

}

307

308

// Register the UDT

309

UDTRegistration.register(classOf[Point].getName, classOf[PointUDT].getName)

310

```

311

312

## Type Conversion and Utilities

313

314

```scala { .api }

315

object DataType {

316

def fromJson(json: String): DataType

317

def fromDDL(ddl: String): DataType

318

def equalsIgnoreNullability(left: DataType, right: DataType): Boolean

319

def equalsIgnoreCaseAndNullability(left: DataType, right: DataType): Boolean

320

}

321

322

abstract class AbstractDataType {

323

def defaultConcreteType: DataType

324

def acceptsType(other: DataType): Boolean

325

def simpleString: String

326

}

327

```

328

329

**Usage Example:**

330

```scala

331

// Parse DDL

332

val schema = DataType.fromDDL("struct<name:string,age:int,scores:array<double>>")

333

334

// Type checking

335

val accepts = IntegerType.acceptsType(ByteType) // false: a concrete DataType accepts only the same type

336

val equal = DataType.equalsIgnoreNullability(

337

StructType(Seq(StructField("x", IntegerType, true))),

338

StructType(Seq(StructField("x", IntegerType, false)))

339

) // true

340

```

341

342

## Common Patterns

343

344

### Schema Evolution

345

```scala

346

// Start with base schema

347

val v1Schema = StructType(Array(

348

StructField("id", LongType, false),

349

StructField("name", StringType, false)

350

))

351

352

// Add new fields (backwards compatible)

353

val v2Schema = v1Schema

354

.add("email", StringType, nullable = true)

355

.add("created_at", TimestampType, nullable = true)

356

357

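// Merge schemas (NOTE: `merge` appears to be package-private (`private[sql]`) in Spark; confirm it is accessible from your code)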

358

val mergedSchema = v1Schema.merge(v2Schema)

359

```

360

361

### Dynamic Schema Creation

362

```scala

363

def createSchema(fields: Seq[(String, DataType, Boolean)]): StructType = {

364

StructType(fields.map { case (name, dataType, nullable) =>

365

StructField(name, dataType, nullable)

366

}.toArray)

367

}

368

369

val dynamicSchema = createSchema(Seq(

370

("user_id", LongType, false),

371

("preferences", MapType(StringType, StringType), true),

372

("tags", ArrayType(StringType), true)

373

))

374

```

375

376

The data type system in Catalyst provides comprehensive support for all SQL data types with rich metadata capabilities, making it suitable for complex schema evolution and type-safe query processing.