Utility classes and objects for working with Spark SQL data types, conversions, and integrations.
Utilities for converting between Spark SQL types and Apache Arrow format for high-performance columnar data exchange.
/**
* Utilities for converting between Spark SQL and Apache Arrow formats
*/
object ArrowUtils {
/** Root allocator for Arrow memory management */
val rootAllocator: org.apache.arrow.memory.RootAllocator
/** Convert Spark DataType to Arrow ArrowType */
def toArrowType(
dt: DataType,
timeZoneId: String,
largeVarTypes: Boolean = false
): org.apache.arrow.vector.types.pojo.ArrowType
/** Convert Arrow ArrowType to Spark DataType */
def fromArrowType(dt: org.apache.arrow.vector.types.pojo.ArrowType): DataType
/** Convert Spark field to Arrow Field */
def toArrowField(
name: String,
dt: DataType,
nullable: Boolean,
timeZoneId: String,
largeVarTypes: Boolean = false
): org.apache.arrow.vector.types.pojo.Field
/** Convert Arrow Field to Spark DataType */
def fromArrowField(field: org.apache.arrow.vector.types.pojo.Field): DataType
/** Convert Spark StructType to Arrow Schema */
def toArrowSchema(
schema: StructType,
timeZoneId: String,
errorOnDuplicatedFieldNames: Boolean,
largeVarTypes: Boolean = false
): org.apache.arrow.vector.types.pojo.Schema
/** Convert Arrow Schema to Spark StructType */
def fromArrowSchema(schema: org.apache.arrow.vector.types.pojo.Schema): StructType
}
Converting Spark types to Arrow:
import org.apache.spark.sql.util.ArrowUtils
import org.apache.spark.sql.types._
// Basic types: this example passes null as the time zone id for non-timestamp types
val sparkIntType = IntegerType
val arrowIntType = ArrowUtils.toArrowType(sparkIntType, timeZoneId = null)
val sparkStringType = StringType
val arrowStringType = ArrowUtils.toArrowType(sparkStringType, timeZoneId = null)
// Timestamps: a time zone id has to be supplied
val sparkTimestampType = TimestampType
val arrowTimestampType = ArrowUtils.toArrowType(sparkTimestampType, timeZoneId = "UTC")
// A schema mixing primitive, array and map columns
val sparkSchema = StructType(Array(
  StructField("id", LongType, nullable = false),
  StructField("name", StringType, nullable = false),
  StructField("scores", ArrayType(DoubleType, containsNull = false), nullable = true),
  StructField(
    "metadata",
    MapType(StringType, StringType, valueContainsNull = true),
    nullable = true
  )
))
val arrowSchema = ArrowUtils.toArrowSchema(
sparkSchema,
timeZoneId = "UTC",
errorOnDuplicatedFieldNames = true
)
Converting Arrow types back to Spark:
// Convert Arrow types back to Spark
val convertedDataType = ArrowUtils.fromArrowType(arrowIntType)
println(s"Converted back: $convertedDataType") // IntegerType
// Convert Arrow schema back to Spark
val convertedSchema = ArrowUtils.fromArrowSchema(arrowSchema)
println(s"Schema fields: ${convertedSchema.fieldNames.mkString(", ")}")
Working with large variable types:
// Use large variable types for very large strings/binary data
val largeStringType = ArrowUtils.toArrowType(
StringType,
timeZoneId = null,
largeVarTypes = true
)
val largeBinaryType = ArrowUtils.toArrowType(
BinaryType,
timeZoneId = null,
largeVarTypes = true
)
Type conversion mappings:
The following conversions are supported:
Spark to Arrow:
- BooleanType → ArrowType.Bool
- ByteType → ArrowType.Int(8, signed=true)
- ShortType → ArrowType.Int(16, signed=true)
- IntegerType → ArrowType.Int(32, signed=true)
- LongType → ArrowType.Int(64, signed=true)
- FloatType → ArrowType.FloatingPoint(SINGLE)
- DoubleType → ArrowType.FloatingPoint(DOUBLE)
- StringType → ArrowType.Utf8, or ArrowType.LargeUtf8 when largeVarTypes = true
- BinaryType → ArrowType.Binary, or ArrowType.LargeBinary when largeVarTypes = true
- DecimalType(p,s) → ArrowType.Decimal(p,s)
- DateType → ArrowType.Date(DAY)
- TimestampType → ArrowType.Timestamp(MICROSECOND, timezone)
- TimestampNTZType → ArrowType.Timestamp(MICROSECOND, null)
- ArrayType → ArrowType.List
- MapType → ArrowType.Map
- StructType → ArrowType.Struct
- YearMonthIntervalType → ArrowType.Interval(YEAR_MONTH)
- DayTimeIntervalType → ArrowType.Duration(MICROSECOND)
Error handling:
import org.apache.spark.sql.types.CalendarIntervalType

// Case 1: a Spark type with no Arrow mapping.
try {
  // This will throw an exception - CalendarIntervalType not supported
  ArrowUtils.toArrowType(CalendarIntervalType, timeZoneId = null)
} catch {
  case e: Exception =>
    println(s"Unsupported type: ${e.getMessage}")
}

// Case 2: a timestamp conversion without a time zone id.
try {
  // This will throw - TimestampType requires timezone
  ArrowUtils.toArrowType(TimestampType, timeZoneId = null)
} catch {
  // The exception value is not used, so it is left unbound.
  case _: IllegalStateException =>
    println("TimestampType requires timeZoneId")
}