# Utilities

This section covers utility classes for date/time operations, string manipulation, and other common operations in Spark Catalyst.

## Core Imports

```scala
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.sql.types._
```

## Date and Time Utilities

### DateTimeUtils

Comprehensive utilities for date and time operations.

```scala { .api }
object DateTimeUtils {
  // String conversion
  def stringToTimestamp(s: UTF8String): Option[Long]
  def timestampToString(us: Long): String
  def dateToString(days: Int): String
  def stringToDate(s: UTF8String): Option[Int]

  // Current time functions
  def currentTimestamp(): Long
  def currentDate(): Int

  // Date arithmetic
  def dateAddMonths(days: Int, months: Int): Int
  def dateAddInterval(days: Int, interval: CalendarInterval): Int
  def timestampAddInterval(timestamp: Long, interval: CalendarInterval): Long

  // Date/time extraction
  def getYear(days: Int): Int
  def getMonth(days: Int): Int
  def getDayOfMonth(days: Int): Int
  def getDayOfYear(days: Int): Int
  def getWeekOfYear(days: Int): Int
  def getHour(timestamp: Long): Int
  def getMinute(timestamp: Long): Int
  def getSecond(timestamp: Long): Int

  // Formatting
  def formatTimestamp(timestamp: Long, format: String): String
  def parseTimestamp(s: String, format: String): Option[Long]

  // Constants
  val MICROS_PER_SECOND: Long = 1000000L
  val MICROS_PER_MILLIS: Long = 1000L
  val MILLIS_PER_SECOND: Long = 1000L
  val SECONDS_PER_DAY: Long = 24 * 60 * 60
  val MICROS_PER_DAY: Long = SECONDS_PER_DAY * MICROS_PER_SECOND
}
```

### Usage Example

```scala
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.unsafe.types.UTF8String

// Current date and time
val currentTs = DateTimeUtils.currentTimestamp()
val currentDate = DateTimeUtils.currentDate()

// String conversions (dates are represented as days since the Unix epoch)
val dateString = UTF8String.fromString("2023-12-25")
val date = DateTimeUtils.stringToDate(dateString) // Some(19716)
val dateStr = DateTimeUtils.dateToString(19716) // "2023-12-25"

// Time extraction
val year = DateTimeUtils.getYear(19716) // 2023
val month = DateTimeUtils.getMonth(19716) // 12
val day = DateTimeUtils.getDayOfMonth(19716) // 25

// Timestamp operations (timestamps are microseconds since the Unix epoch)
val timestampString = UTF8String.fromString("2023-12-25 14:30:00")
val timestamp = DateTimeUtils.stringToTimestamp(timestampString)
val hour = DateTimeUtils.getHour(timestamp.get) // 14
val minute = DateTimeUtils.getMinute(timestamp.get) // 30
```

## String Utilities

### UTF8String

Efficient UTF-8 string representation optimized for Spark SQL.

```scala { .api }
abstract class UTF8String extends Comparable[UTF8String] {
  def numBytes(): Int
  def numChars(): Int
  def toString(): String
  def getBytes(): Array[Byte]

  // String operations
  def contains(substring: UTF8String): Boolean
  def startsWith(prefix: UTF8String): Boolean
  def endsWith(suffix: UTF8String): Boolean
  def indexOf(substring: UTF8String, start: Int): Int
  def substring(start: Int): UTF8String
  def substring(start: Int, until: Int): UTF8String
  def trim(): UTF8String
  def trimLeft(): UTF8String
  def trimRight(): UTF8String
  def reverse(): UTF8String
  def repeat(times: Int): UTF8String

  // Case operations
  def toUpperCase(): UTF8String
  def toLowerCase(): UTF8String

  // Comparison
  def compare(other: UTF8String): Int
  def equals(other: Any): Boolean
  def hashCode(): Int

  // Conversion
  def toLong(): Long
  def toDouble(): Double
  def toInt(): Int
}

object UTF8String {
  def fromString(str: String): UTF8String
  def fromBytes(bytes: Array[Byte]): UTF8String
  def fromBytes(bytes: Array[Byte], offset: Int, numBytes: Int): UTF8String

  val EMPTY_UTF8: UTF8String

  // String manipulation utilities
  def concat(inputs: UTF8String*): UTF8String
  def concatWs(separator: UTF8String, inputs: UTF8String*): UTF8String
}
```

### Usage Example

```scala
import org.apache.spark.unsafe.types.UTF8String

// Create UTF8String instances
val str1 = UTF8String.fromString("Hello World")
val str2 = UTF8String.fromString("hello")

// String operations
val upper = str2.toUpperCase() // "HELLO"
val substring = str1.substring(0, 5) // "Hello"
val contains = str1.contains(UTF8String.fromString("World")) // true

// Concatenation
val concat = UTF8String.concat(str2, UTF8String.fromString(" "), str1)
val concatWs = UTF8String.concatWs(UTF8String.fromString("-"), str1, str2)

// Comparison (byte-wise, hence case sensitive: 'H' (0x48) sorts before 'h' (0x68))
val comparison = str1.compare(str2) // < 0
```

## Array Utilities

### ArrayData

Abstract representation for array data in Catalyst.

```scala { .api }
abstract class ArrayData {
  def numElements(): Int
  def isNullAt(ordinal: Int): Boolean
  def get(ordinal: Int, elementType: DataType): Any
  def getBoolean(ordinal: Int): Boolean
  def getByte(ordinal: Int): Byte
  def getShort(ordinal: Int): Short
  def getInt(ordinal: Int): Int
  def getLong(ordinal: Int): Long
  def getFloat(ordinal: Int): Float
  def getDouble(ordinal: Int): Double
  def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal
  def getUTF8String(ordinal: Int): UTF8String
  def getBinary(ordinal: Int): Array[Byte]
  def getInterval(ordinal: Int): CalendarInterval
  def getStruct(ordinal: Int, numFields: Int): InternalRow
  def getArray(ordinal: Int): ArrayData
  def getMap(ordinal: Int): MapData

  def toArray[T](elementType: DataType): Array[T]
  def toObjectArray(elementType: DataType): Array[AnyRef]
  def copy(): ArrayData
}
```

## Map Utilities

### MapData

Abstract representation for map data in Catalyst.

```scala { .api }
abstract class MapData {
  def numElements(): Int
  def keyArray(): ArrayData
  def valueArray(): ArrayData
  def copy(): MapData

  // Java and Scala map conversions
  def toMap[K, V](keyType: DataType, valueType: DataType): java.util.Map[K, V]
  def toScalaMap[K, V](keyType: DataType, valueType: DataType): scala.collection.Map[K, V]
}
```

## Interval Utilities

### CalendarInterval

Represents a calendar interval (years, months, days, microseconds).

```scala { .api }
case class CalendarInterval(months: Int, days: Int, microseconds: Long) {
  def add(that: CalendarInterval): CalendarInterval
  def subtract(that: CalendarInterval): CalendarInterval
  def negate(): CalendarInterval
  def toString: String
}

object CalendarInterval {
  val EMPTY: CalendarInterval = new CalendarInterval(0, 0, 0)

  def fromString(input: String): CalendarInterval
  def fromSingleUnitString(unit: String, input: String): CalendarInterval

  // Factory methods
  def fromYearMonthString(input: String): CalendarInterval
  def fromDayTimeString(input: String): CalendarInterval
}
```

### Usage Example

```scala
import org.apache.spark.unsafe.types.CalendarInterval

// Create intervals
val interval1 = new CalendarInterval(2, 15, 3600000000L) // 2 months, 15 days, 1 hour
val interval2 = CalendarInterval.fromString("1 year 2 months 3 days")

// Interval arithmetic
val sum = interval1.add(interval2)
val diff = interval1.subtract(interval2)
val negated = interval1.negate()
```

## Mathematical Utilities

### MathUtils

```scala { .api }
object MathUtils {
  def floorDiv(x: Long, y: Long): Long
  def floorMod(x: Long, y: Long): Long
  def addExact(x: Long, y: Long): Long
  def subtractExact(x: Long, y: Long): Long
  def multiplyExact(x: Long, y: Long): Long
  def toIntExact(value: Long): Int

  // Rounding functions
  def round(value: Double, scale: Int): Double
  def roundUp(value: Double, scale: Int): Double
  def roundDown(value: Double, scale: Int): Double
}
```

## Collection Utilities

### AttributeMap

Specialized map for attributes with efficient lookups.

```scala { .api }
class AttributeMap[A](val baseMap: Map[Attribute, A]) {
  def get(k: Attribute): Option[A]
  def apply(k: Attribute): A
  def contains(k: Attribute): Boolean
  def size: Int
  def isEmpty: Boolean
  def nonEmpty: Boolean

  def ++(other: AttributeMap[A]): AttributeMap[A]
  def -(key: Attribute): AttributeMap[A]
  def updated(key: Attribute, value: A): AttributeMap[A]

  def keys: Iterable[Attribute]
  def values: Iterable[A]
  def foreach[U](f: ((Attribute, A)) => U): Unit
  def map[B](f: ((Attribute, A)) => (Attribute, B)): AttributeMap[B]
}

object AttributeMap {
  def empty[A]: AttributeMap[A] = new AttributeMap(Map.empty)
  def apply[A](kvs: (Attribute, A)*): AttributeMap[A] = new AttributeMap(kvs.toMap)
}
```

### AttributeSet

Efficient set implementation for attributes.

```scala { .api }
class AttributeSet private (val baseSet: Set[Attribute]) {
  def contains(a: Attribute): Boolean
  def subsetOf(other: AttributeSet): Boolean
  def intersect(other: AttributeSet): AttributeSet
  def ++(other: AttributeSet): AttributeSet
  def +(attr: Attribute): AttributeSet
  def -(attr: Attribute): AttributeSet
  def filter(f: Attribute => Boolean): AttributeSet
  def map(f: Attribute => Attribute): AttributeSet
  def size: Int
  def isEmpty: Boolean
  def nonEmpty: Boolean
  def toSeq: Seq[Attribute]
}

object AttributeSet {
  def empty: AttributeSet = new AttributeSet(Set.empty)
  def apply(attrs: Attribute*): AttributeSet = new AttributeSet(attrs.toSet)
  def fromAttributeSets(sets: Seq[AttributeSet]): AttributeSet
}
```

## Hashing Utilities

### HashUtils

```scala { .api }
object HashUtils {
  def murmur3Hash(input: Any, dataType: DataType, seed: Int): Int
  def xxHash64(input: Any, dataType: DataType, seed: Long): Long

  // Hash complex types element by element
  def hashArray(array: ArrayData, elementType: DataType, seed: Int): Int
  def hashMap(map: MapData, keyType: DataType, valueType: DataType, seed: Int): Int
  def hashStruct(struct: InternalRow, structType: StructType, seed: Int): Int
}
```

## Complete Usage Example

```scala
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.sql.catalyst.InternalRow

// Working with dates and times
val dateStr = UTF8String.fromString("2023-12-25")
val date = DateTimeUtils.stringToDate(dateStr).get
val year = DateTimeUtils.getYear(date)
val formattedDate = DateTimeUtils.dateToString(date)

// String processing
val name = UTF8String.fromString("John Doe")
val upperName = name.toUpperCase()
val firstName = name.substring(0, 4)

// Interval operations
val interval = new CalendarInterval(1, 0, 0) // 1 month
val futureDate = DateTimeUtils.dateAddInterval(date, interval)

// Working with collections
// (assumes attr1, attr2, attr3 are previously defined Attribute instances)
val attrs = Seq(attr1, attr2, attr3)
val attrSet = AttributeSet(attrs: _*)
val attrMap = AttributeMap(attrs.zip(Seq("value1", "value2", "value3")): _*)

println(s"Date: $formattedDate, Year: $year")
println(s"Name: $upperName, First: $firstName")
println(s"Attribute set size: ${attrSet.size}")
```

These utilities provide the foundation for efficient data processing operations throughout the Catalyst framework, enabling high-performance query execution with proper handling of various data types and operations.