Spark SQL API module providing core SQL data types, rows, and foundational APIs for Spark SQL operations
Utility classes and objects for working with Spark SQL data types, conversions, and integrations.
Utilities for converting between Spark SQL types and Apache Arrow format for high-performance columnar data exchange.
/**
 * Utilities for converting between Spark SQL and Apache Arrow formats.
 *
 * These helpers map Spark `DataType`/`StructType` definitions to their
 * Arrow equivalents and back, for columnar data exchange.
 */
object ArrowUtils {

  /** Root allocator for Arrow memory management. */
  val rootAllocator: org.apache.arrow.memory.RootAllocator

  /**
   * Convert a Spark [[DataType]] to an Arrow `ArrowType`.
   *
   * @param dt            the Spark data type to convert
   * @param timeZoneId    time zone ID; required for `TimestampType`, may be
   *                      `null` for types that carry no time zone
   * @param largeVarTypes when `true`, map string/binary data to
   *                      `LargeUtf8`/`LargeBinary` instead of `Utf8`/`Binary`
   */
  def toArrowType(
      dt: DataType,
      timeZoneId: String,
      largeVarTypes: Boolean = false
  ): org.apache.arrow.vector.types.pojo.ArrowType

  /** Convert an Arrow `ArrowType` back to the corresponding Spark [[DataType]]. */
  def fromArrowType(dt: org.apache.arrow.vector.types.pojo.ArrowType): DataType

  /**
   * Convert a single Spark field to an Arrow `Field`.
   *
   * @param name          the field name
   * @param dt            the Spark data type of the field
   * @param nullable      whether the field may contain null values
   * @param timeZoneId    time zone ID; required for timestamp fields
   * @param largeVarTypes when `true`, map string/binary data to
   *                      `LargeUtf8`/`LargeBinary` instead of `Utf8`/`Binary`
   */
  def toArrowField(
      name: String,
      dt: DataType,
      nullable: Boolean,
      timeZoneId: String,
      largeVarTypes: Boolean = false
  ): org.apache.arrow.vector.types.pojo.Field

  /** Convert an Arrow `Field` back to the corresponding Spark [[DataType]]. */
  def fromArrowField(field: org.apache.arrow.vector.types.pojo.Field): DataType

  /**
   * Convert a Spark [[StructType]] schema to an Arrow `Schema`.
   *
   * @param schema                      the Spark schema to convert
   * @param timeZoneId                  time zone ID; required if the schema
   *                                    contains timestamp fields
   * @param errorOnDuplicatedFieldNames whether to raise an error when the
   *                                    schema contains duplicated field names
   * @param largeVarTypes               when `true`, map string/binary data to
   *                                    `LargeUtf8`/`LargeBinary`
   */
  def toArrowSchema(
      schema: StructType,
      timeZoneId: String,
      errorOnDuplicatedFieldNames: Boolean,
      largeVarTypes: Boolean = false
  ): org.apache.arrow.vector.types.pojo.Schema

  /** Convert an Arrow `Schema` back to a Spark [[StructType]]. */
  def fromArrowSchema(schema: org.apache.arrow.vector.types.pojo.Schema): StructType
}

Converting Spark types to Arrow:
import org.apache.spark.sql.util.ArrowUtils
import org.apache.spark.sql.types._

// Convert basic types (timeZoneId may be null for non-timestamp types)
val sparkIntType = IntegerType
val arrowIntType = ArrowUtils.toArrowType(sparkIntType, null)

val sparkStringType = StringType
val arrowStringType = ArrowUtils.toArrowType(sparkStringType, null)

// Convert a timestamp type (requires a time zone)
val sparkTimestampType = TimestampType
val arrowTimestampType = ArrowUtils.toArrowType(sparkTimestampType, "UTC")

// Convert a complex schema with nested array and map fields
val sparkSchema = StructType(Array(
  StructField("id", LongType, false),
  StructField("name", StringType, false),
  StructField("scores", ArrayType(DoubleType, false), true),
  StructField("metadata", MapType(StringType, StringType, true), true)
))
val arrowSchema = ArrowUtils.toArrowSchema(
  sparkSchema,
  timeZoneId = "UTC",
  errorOnDuplicatedFieldNames = true
)

Converting Arrow types back to Spark:
// Convert Arrow types back to Spark
val convertedDataType = ArrowUtils.fromArrowType(arrowIntType)
println(s"Converted back: $convertedDataType") // IntegerType

// Convert an Arrow schema back to a Spark StructType
val convertedSchema = ArrowUtils.fromArrowSchema(arrowSchema)
println(s"Schema fields: ${convertedSchema.fieldNames.mkString(", ")}")

Working with large variable types:
// Use large variable types for very large strings/binary data
val largeStringType = ArrowUtils.toArrowType(
  StringType,
  timeZoneId = null,
  largeVarTypes = true
)
val largeBinaryType = ArrowUtils.toArrowType(
  BinaryType,
  timeZoneId = null,
  largeVarTypes = true
)

Type conversion mappings:
The following conversions are supported:
Spark to Arrow:
- BooleanType → ArrowType.Bool
- ByteType → ArrowType.Int(8, signed=true)
- ShortType → ArrowType.Int(16, signed=true)
- IntegerType → ArrowType.Int(32, signed=true)
- LongType → ArrowType.Int(64, signed=true)
- FloatType → ArrowType.FloatingPoint(SINGLE)
- DoubleType → ArrowType.FloatingPoint(DOUBLE)
- StringType → ArrowType.Utf8 or ArrowType.LargeUtf8
- BinaryType → ArrowType.Binary or ArrowType.LargeBinary
- DecimalType(p,s) → ArrowType.Decimal(p,s)
- DateType → ArrowType.Date(DAY)
- TimestampType → ArrowType.Timestamp(MICROSECOND, timezone)
- TimestampNTZType → ArrowType.Timestamp(MICROSECOND, null)
- ArrayType → ArrowType.List
- MapType → ArrowType.Map
- StructType → ArrowType.Struct
- YearMonthIntervalType → ArrowType.Interval(YEAR_MONTH)
- DayTimeIntervalType → ArrowType.Duration(MICROSECOND)

Error handling:
import org.apache.spark.sql.types.CalendarIntervalType

try {
  // CalendarIntervalType has no Arrow equivalent — this throws
  ArrowUtils.toArrowType(CalendarIntervalType, null)
} catch {
  case e: Exception =>
    println(s"Unsupported type: ${e.getMessage}")
}

try {
  // TimestampType requires a time zone — passing null throws
  ArrowUtils.toArrowType(TimestampType, null)
} catch {
  case e: IllegalStateException =>
    println("TimestampType requires timeZoneId")
}

Install with Tessl CLI
npx tessl i tessl/maven-org-apache-spark--spark-sql-api-2-12