or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

analysis.md · code-generation.md · data-types.md · expressions.md · index.md · optimization.md · parsing.md · query-plans.md · utilities.md

docs/utilities.md

0

# Utilities

1

2

This section covers utility classes for date/time operations, string manipulation, and other common operations in Spark Catalyst.

3

4

## Core Imports

5

6

```scala

7

import org.apache.spark.sql.catalyst.util._

8

import org.apache.spark.unsafe.types.UTF8String

9

import org.apache.spark.sql.types._

10

```

11

12

## Date and Time Utilities

13

14

### DateTimeUtils

15

16

Comprehensive utilities for date and time operations.

17

18

```scala { .api }

19

object DateTimeUtils {

20

// String conversion

21

def stringToTimestamp(s: UTF8String): Option[Long]

22

def timestampToString(us: Long): String

23

def dateToString(days: Int): String

24

def stringToDate(s: UTF8String): Option[Int]

25

26

// Current time functions

27

def currentTimestamp(): Long

28

def currentDate(): Int

29

30

// Date arithmetic

31

def dateAddMonths(days: Int, months: Int): Int

32

def dateAddInterval(days: Int, interval: CalendarInterval): Int

33

def timestampAddInterval(timestamp: Long, interval: CalendarInterval): Long

34

35

// Date/time extraction

36

def getYear(days: Int): Int

37

def getMonth(days: Int): Int

38

def getDayOfMonth(days: Int): Int

39

def getDayOfYear(days: Int): Int

40

def getWeekOfYear(days: Int): Int

41

def getHour(timestamp: Long): Int

42

def getMinute(timestamp: Long): Int

43

def getSecond(timestamp: Long): Int

44

45

// Formatting

46

def formatTimestamp(timestamp: Long, format: String): String

47

def parseTimestamp(s: String, format: String): Option[Long]

48

49

// Constants

50

val MICROS_PER_SECOND: Long = 1000000L

51

val MICROS_PER_MILLIS: Long = 1000L

52

val MILLIS_PER_SECOND: Long = 1000L

53

val SECONDS_PER_DAY: Long = 24 * 60 * 60

54

val MICROS_PER_DAY: Long = SECONDS_PER_DAY * MICROS_PER_SECOND

55

}

56

```

57

58

### Usage Example

59

60

```scala

61

import org.apache.spark.sql.catalyst.util.DateTimeUtils

62

import org.apache.spark.unsafe.types.UTF8String

63

64

// Current date and time

65

val currentTs = DateTimeUtils.currentTimestamp()

66

val currentDate = DateTimeUtils.currentDate()

67

68

// String conversions

69

val dateString = UTF8String.fromString("2023-12-25")

70

val date = DateTimeUtils.stringToDate(dateString) // Some(19716) — days since 1970-01-01

71

val dateStr = DateTimeUtils.dateToString(19716) // "2023-12-25"

72

73

// Time extraction

74

val year = DateTimeUtils.getYear(19716) // 2023

75

val month = DateTimeUtils.getMonth(19716) // 12

76

val day = DateTimeUtils.getDayOfMonth(19716) // 25

77

78

// Timestamp operations

79

val timestampString = UTF8String.fromString("2023-12-25 14:30:00")

80

val timestamp = DateTimeUtils.stringToTimestamp(timestampString)

81

val hour = DateTimeUtils.getHour(timestamp.get) // 14

82

val minute = DateTimeUtils.getMinute(timestamp.get) // 30

83

```

84

85

## String Utilities

86

87

### UTF8String

88

89

Efficient UTF-8 string representation optimized for Spark SQL.

90

91

```scala { .api }

92

abstract class UTF8String extends Comparable[UTF8String] {

93

def numBytes(): Int

94

def numChars(): Int

95

def toString(): String

96

def getBytes(): Array[Byte]

97

98

// String operations

99

def contains(substring: UTF8String): Boolean

100

def startsWith(prefix: UTF8String): Boolean

101

def endsWith(suffix: UTF8String): Boolean

102

def indexOf(substring: UTF8String, start: Int): Int

103

def substring(start: Int): UTF8String

104

def substring(start: Int, until: Int): UTF8String

105

def trim(): UTF8String

106

def trimLeft(): UTF8String

107

def trimRight(): UTF8String

108

def reverse(): UTF8String

109

def repeat(times: Int): UTF8String

110

111

// Case operations

112

def toUpperCase(): UTF8String

113

def toLowerCase(): UTF8String

114

115

// Comparison

116

def compare(other: UTF8String): Int

117

def equals(other: Any): Boolean

118

def hashCode(): Int

119

120

// Conversion

121

def toLong(): Long

122

def toDouble(): Double

123

def toInt(): Int

124

}

125

126

object UTF8String {

127

def fromString(str: String): UTF8String

128

def fromBytes(bytes: Array[Byte]): UTF8String

129

def fromBytes(bytes: Array[Byte], offset: Int, numBytes: Int): UTF8String

130

131

val EMPTY_UTF8: UTF8String

132

133

// String manipulation utilities

134

def concat(inputs: UTF8String*): UTF8String

135

def concatWs(separator: UTF8String, inputs: UTF8String*): UTF8String

136

}

137

```

138

139

### Usage Example

140

141

```scala

142

import org.apache.spark.unsafe.types.UTF8String

143

144

// Create UTF8String instances

145

val str1 = UTF8String.fromString("Hello World")

146

val str2 = UTF8String.fromString("hello")

147

148

// String operations

149

val upper = str2.toUpperCase() // "HELLO"

150

val substring = str1.substring(0, 5) // "Hello"

151

val contains = str1.contains(UTF8String.fromString("World")) // true

152

153

// Concatenation

154

val concat = UTF8String.concat(str2, UTF8String.fromString(" "), str1)

155

val concatWs = UTF8String.concatWs(UTF8String.fromString("-"), str1, str2)

156

157

// Comparison

158

val comparison = str1.compare(str2) // < 0 — byte-wise comparison is case-sensitive: 'H' (0x48) sorts before 'h' (0x68)

159

```

160

161

## Array Utilities

162

163

### ArrayData

164

165

Abstract representation for array data in Catalyst.

166

167

```scala { .api }

168

abstract class ArrayData {

169

def numElements(): Int

170

def isNullAt(ordinal: Int): Boolean

171

def get(ordinal: Int, elementType: DataType): Any

172

def getBoolean(ordinal: Int): Boolean

173

def getByte(ordinal: Int): Byte

174

def getShort(ordinal: Int): Short

175

def getInt(ordinal: Int): Int

176

def getLong(ordinal: Int): Long

177

def getFloat(ordinal: Int): Float

178

def getDouble(ordinal: Int): Double

179

def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal

180

def getUTF8String(ordinal: Int): UTF8String

181

def getBinary(ordinal: Int): Array[Byte]

182

def getInterval(ordinal: Int): CalendarInterval

183

def getStruct(ordinal: Int, numFields: Int): InternalRow

184

def getArray(ordinal: Int): ArrayData

185

def getMap(ordinal: Int): MapData

186

187

def toArray[T](elementType: DataType): Array[T]

188

def toObjectArray(elementType: DataType): Array[AnyRef]

189

def copy(): ArrayData

190

}

191

```

192

193

## Map Utilities

194

195

### MapData

196

197

Abstract representation for map data in Catalyst.

198

199

```scala { .api }

200

abstract class MapData {

201

def numElements(): Int

202

def keyArray(): ArrayData

203

def valueArray(): ArrayData

204

def copy(): MapData

205

206

// Java Map conversion

207

def toMap[K, V](keyType: DataType, valueType: DataType): java.util.Map[K, V]

208

def toScalaMap[K, V](keyType: DataType, valueType: DataType): scala.collection.Map[K, V]

209

}

210

```

211

212

## Interval Utilities

213

214

### CalendarInterval

215

216

Represents a calendar interval (years, months, days, microseconds).

217

218

```scala { .api }

219

case class CalendarInterval(months: Int, days: Int, microseconds: Long) {

220

def add(that: CalendarInterval): CalendarInterval

221

def subtract(that: CalendarInterval): CalendarInterval

222

def negate(): CalendarInterval

223

def toString: String

224

}

225

226

object CalendarInterval {

227

val EMPTY: CalendarInterval = new CalendarInterval(0, 0, 0)

228

229

def fromString(input: String): CalendarInterval

230

def fromSingleUnitString(unit: String, input: String): CalendarInterval

231

232

// Factory methods

233

def fromYearMonthString(input: String): CalendarInterval

234

def fromDayTimeString(input: String): CalendarInterval

235

}

236

```

237

238

### Usage Example

239

240

```scala

241

import org.apache.spark.unsafe.types.CalendarInterval

242

243

// Create intervals

244

val interval1 = new CalendarInterval(2, 15, 3600000000L) // 2 months, 15 days, 1 hour

245

val interval2 = CalendarInterval.fromString("1 year 2 months 3 days")

246

247

// Interval arithmetic

248

val sum = interval1.add(interval2)

249

val diff = interval1.subtract(interval2)

250

val negated = interval1.negate()

251

```

252

253

## Mathematical Utilities

254

255

### MathUtils

256

257

```scala { .api }

258

object MathUtils {

259

def floorDiv(x: Long, y: Long): Long

260

def floorMod(x: Long, y: Long): Long

261

def addExact(x: Long, y: Long): Long

262

def subtractExact(x: Long, y: Long): Long

263

def multiplyExact(x: Long, y: Long): Long

264

def toIntExact(value: Long): Int

265

266

// Rounding functions

267

def round(value: Double, scale: Int): Double

268

def roundUp(value: Double, scale: Int): Double

269

def roundDown(value: Double, scale: Int): Double

270

}

271

```

272

273

## Collection Utilities

274

275

### AttributeMap

276

277

Specialized map for attributes with efficient lookups.

278

279

```scala { .api }

280

class AttributeMap[A](val baseMap: Map[Attribute, A]) {

281

def get(k: Attribute): Option[A]

282

def apply(k: Attribute): A

283

def contains(k: Attribute): Boolean

284

def size: Int

285

def isEmpty: Boolean

286

def nonEmpty: Boolean

287

288

def ++(other: AttributeMap[A]): AttributeMap[A]

289

def -(key: Attribute): AttributeMap[A]

290

def updated(key: Attribute, value: A): AttributeMap[A]

291

292

def keys: Iterable[Attribute]

293

def values: Iterable[A]

294

def foreach[U](f: ((Attribute, A)) => U): Unit

295

def map[B](f: ((Attribute, A)) => (Attribute, B)): AttributeMap[B]

296

}

297

298

object AttributeMap {

299

def empty[A]: AttributeMap[A] = new AttributeMap(Map.empty)

300

def apply[A](kvs: (Attribute, A)*): AttributeMap[A] = new AttributeMap(kvs.toMap)

301

}

302

```

303

304

### AttributeSet

305

306

Efficient set implementation for attributes.

307

308

```scala { .api }

309

class AttributeSet private (val baseSet: Set[Attribute]) {

310

def contains(a: Attribute): Boolean

311

def subsetOf(other: AttributeSet): Boolean

312

def intersect(other: AttributeSet): AttributeSet

313

def ++(other: AttributeSet): AttributeSet

314

def +(attr: Attribute): AttributeSet

315

def -(attr: Attribute): AttributeSet

316

def filter(f: Attribute => Boolean): AttributeSet

317

def map(f: Attribute => Attribute): AttributeSet

318

def size: Int

319

def isEmpty: Boolean

320

def nonEmpty: Boolean

321

def toSeq: Seq[Attribute]

322

}

323

324

object AttributeSet {

325

def empty: AttributeSet = new AttributeSet(Set.empty)

326

def apply(attrs: Attribute*): AttributeSet = new AttributeSet(attrs.toSet)

327

def fromAttributeSets(sets: Seq[AttributeSet]): AttributeSet

328

}

329

```

330

331

## Hashing Utilities

332

333

### HashUtils

334

335

```scala { .api }

336

object HashUtils {

337

def murmur3Hash(input: Any, dataType: DataType, seed: Int): Int

338

def xxHash64(input: Any, dataType: DataType, seed: Long): Long

339

340

// Hash array elements

341

def hashArray(array: ArrayData, elementType: DataType, seed: Int): Int

342

def hashMap(map: MapData, keyType: DataType, valueType: DataType, seed: Int): Int

343

def hashStruct(struct: InternalRow, structType: StructType, seed: Int): Int

344

}

345

```

346

347

## Complete Usage Example

348

349

```scala

350

import org.apache.spark.sql.catalyst.util._

351

import org.apache.spark.unsafe.types.UTF8String

352

import org.apache.spark.sql.catalyst.InternalRow

353

354

// Working with dates and times

355

val dateStr = UTF8String.fromString("2023-12-25")

356

val date = DateTimeUtils.stringToDate(dateStr).get

357

val year = DateTimeUtils.getYear(date)

358

val formattedDate = DateTimeUtils.dateToString(date)

359

360

// String processing

361

val name = UTF8String.fromString("John Doe")

362

val upperName = name.toUpperCase()

363

val firstName = name.substring(0, 4)

364

365

// Interval operations

366

val interval = new CalendarInterval(1, 0, 0) // 1 month

367

val futureDate = DateTimeUtils.dateAddInterval(date, interval)

368

369

// Working with collections

370

val attrs = Seq(attr1, attr2, attr3) // attr1–attr3: Attribute instances defined elsewhere

371

val attrSet = AttributeSet(attrs: _*)

372

val attrMap = AttributeMap(attrs.zip(Seq("value1", "value2", "value3")): _*)

373

374

println(s"Date: $formattedDate, Year: $year")

375

println(s"Name: $upperName, First: $firstName")

376

println(s"Attribute set size: ${attrSet.size}")

377

```

378

379

These utilities provide the foundation for efficient data processing operations throughout the Catalyst framework, enabling high-performance query execution with proper handling of various data types and operations.