or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

data-types.md · encoders.md · error-handling.md · index.md · row-operations.md · streaming-operations.md · utilities.md
tile.json

docs/utilities.md

Utilities

Utility classes and objects for working with Spark SQL data types, conversions, and integrations.

Capabilities

Arrow Format Integration

Utilities for converting between Spark SQL types and Apache Arrow format for high-performance columnar data exchange.

/**
 * Utilities for converting between Spark SQL and Apache Arrow formats
 */
object ArrowUtils {
  /** Root allocator for Arrow memory management */
  val rootAllocator: org.apache.arrow.memory.RootAllocator
  
  /** Convert Spark DataType to Arrow ArrowType */
  def toArrowType(
    dt: DataType, 
    timeZoneId: String, 
    largeVarTypes: Boolean = false
  ): org.apache.arrow.vector.types.pojo.ArrowType
  
  /** Convert Arrow ArrowType to Spark DataType */
  def fromArrowType(dt: org.apache.arrow.vector.types.pojo.ArrowType): DataType
  
  /** Convert Spark field to Arrow Field */
  def toArrowField(
    name: String,
    dt: DataType,
    nullable: Boolean,
    timeZoneId: String,
    largeVarTypes: Boolean = false
  ): org.apache.arrow.vector.types.pojo.Field
  
  /** Convert Arrow Field to Spark DataType */
  def fromArrowField(field: org.apache.arrow.vector.types.pojo.Field): DataType
  
  /** Convert Spark StructType to Arrow Schema */
  def toArrowSchema(
    schema: StructType,
    timeZoneId: String,
    errorOnDuplicatedFieldNames: Boolean,
    largeVarTypes: Boolean = false
  ): org.apache.arrow.vector.types.pojo.Schema
  
  /** Convert Arrow Schema to Spark StructType */
  def fromArrowSchema(schema: org.apache.arrow.vector.types.pojo.Schema): StructType
}

Usage Examples

Converting Spark types to Arrow:

import org.apache.spark.sql.util.ArrowUtils
import org.apache.spark.sql.types._

// Convert basic types
val sparkIntType = IntegerType
val arrowIntType = ArrowUtils.toArrowType(sparkIntType, null)

val sparkStringType = StringType  
val arrowStringType = ArrowUtils.toArrowType(sparkStringType, null)

// Convert timestamp (requires timezone)
val sparkTimestampType = TimestampType
val arrowTimestampType = ArrowUtils.toArrowType(sparkTimestampType, "UTC")

// Convert complex schema
val sparkSchema = StructType(Array(
  StructField("id", LongType, false),
  StructField("name", StringType, false),
  StructField("scores", ArrayType(DoubleType, false), true),
  StructField("metadata", MapType(StringType, StringType, true), true)
))

val arrowSchema = ArrowUtils.toArrowSchema(
  sparkSchema, 
  timeZoneId = "UTC",
  errorOnDuplicatedFieldNames = true
)

Converting Arrow types back to Spark:

// Convert Arrow types back to Spark
val convertedDataType = ArrowUtils.fromArrowType(arrowIntType)
println(s"Converted back: $convertedDataType") // IntegerType

// Convert Arrow schema back to Spark
val convertedSchema = ArrowUtils.fromArrowSchema(arrowSchema)
println(s"Schema fields: ${convertedSchema.fieldNames.mkString(", ")}")

Working with large variable types:

// Use large variable types for very large strings/binary data
val largeStringType = ArrowUtils.toArrowType(
  StringType, 
  timeZoneId = null, 
  largeVarTypes = true
)

val largeBinaryType = ArrowUtils.toArrowType(
  BinaryType,
  timeZoneId = null,
  largeVarTypes = true  
)

Type conversion mappings:

The following conversions are supported:

Spark to Arrow:

  • BooleanType → ArrowType.Bool
  • ByteType → ArrowType.Int(8, signed=true)
  • ShortType → ArrowType.Int(16, signed=true)
  • IntegerType → ArrowType.Int(32, signed=true)
  • LongType → ArrowType.Int(64, signed=true)
  • FloatType → ArrowType.FloatingPoint(SINGLE)
  • DoubleType → ArrowType.FloatingPoint(DOUBLE)
  • StringType → ArrowType.Utf8 or ArrowType.LargeUtf8
  • BinaryType → ArrowType.Binary or ArrowType.LargeBinary
  • DecimalType(p,s) → ArrowType.Decimal(p,s)
  • DateType → ArrowType.Date(DAY)
  • TimestampType → ArrowType.Timestamp(MICROSECOND, timezone)
  • TimestampNTZType → ArrowType.Timestamp(MICROSECOND, null)
  • ArrayType → ArrowType.List
  • MapType → ArrowType.Map
  • StructType → ArrowType.Struct
  • YearMonthIntervalType → ArrowType.Interval(YEAR_MONTH)
  • DayTimeIntervalType → ArrowType.Duration(MICROSECOND)

Error handling:

import org.apache.spark.sql.types.CalendarIntervalType

try {
  // This will throw an exception - CalendarIntervalType not supported
  ArrowUtils.toArrowType(CalendarIntervalType, null)
} catch {
  case e: Exception =>
    println(s"Unsupported type: ${e.getMessage}")
}

try {
  // This will throw - TimestampType requires timezone
  ArrowUtils.toArrowType(TimestampType, null)
} catch {
  case e: IllegalStateException =>
    println("TimestampType requires timeZoneId")
}