```bash
tessl install tessl/maven-org-apache-spark--spark-hive@1.6.0
```

Apache Spark SQL Hive integration module providing HiveContext, metastore operations, HiveQL parsing, and Hive data format compatibility.
The Spark Hive Type System provides bidirectional conversion between Spark SQL Catalyst types and Hive ObjectInspectors, enabling data exchange between the Spark and Hive ecosystems. It covers Hive's primitive and complex types (arrays, maps, structs, and unions) as well as data read and written through custom SerDes.
```scala
trait HiveInspectors {
  def toInspector(dataType: DataType): ObjectInspector
  def unwrap(data: Any, oi: ObjectInspector): Any
  def wrap(a: Any, oi: ObjectInspector): AnyRef
}
```

- `toInspector` - Converts a Catalyst `DataType` to a Hive `ObjectInspector`
- `unwrap` - Converts a Hive data representation to Catalyst's internal format
- `wrap` - Converts Catalyst data to a Hive-compatible format
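The two directions are inverses for primitive values. A minimal round-trip sketch (assuming, as the usage examples below do, that the trait can be mixed into an anonymous instance):

```scala
import org.apache.spark.sql.hive.HiveInspectors
import org.apache.spark.sql.types.IntegerType

val insp = new HiveInspectors {}
val oi = insp.toInspector(IntegerType)

// wrap to the Hive representation, then unwrap back to Catalyst
val roundTripped = insp.unwrap(insp.wrap(42, oi), oi)
assert(roundTripped == 42)
```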
```scala
// Catalyst to Hive ObjectInspector mappings
val primitiveInspectorMap: Map[DataType, PrimitiveObjectInspector] = Map(
  StringType -> PrimitiveObjectInspectorFactory.javaStringObjectInspector,
  IntegerType -> PrimitiveObjectInspectorFactory.javaIntObjectInspector,
  LongType -> PrimitiveObjectInspectorFactory.javaLongObjectInspector,
  DoubleType -> PrimitiveObjectInspectorFactory.javaDoubleObjectInspector,
  FloatType -> PrimitiveObjectInspectorFactory.javaFloatObjectInspector,
  BooleanType -> PrimitiveObjectInspectorFactory.javaBooleanObjectInspector,
  ByteType -> PrimitiveObjectInspectorFactory.javaByteObjectInspector,
  ShortType -> PrimitiveObjectInspectorFactory.javaShortObjectInspector,
  DateType -> PrimitiveObjectInspectorFactory.javaDateObjectInspector,
  TimestampType -> PrimitiveObjectInspectorFactory.javaTimestampObjectInspector,
  BinaryType -> PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector
)
```

Usage Examples:
```scala
import org.apache.spark.sql.hive.HiveInspectors
import org.apache.spark.sql.types._

val inspectors = new HiveInspectors {}

// Convert Catalyst types to Hive ObjectInspectors
val stringInspector = inspectors.toInspector(StringType)
val intInspector = inspectors.toInspector(IntegerType)
val timestampInspector = inspectors.toInspector(TimestampType)

// Convert data from Hive to Catalyst format
val catalystString = inspectors.unwrap("hello", stringInspector)
val catalystInt = inspectors.unwrap(java.lang.Integer.valueOf(42), intInspector)

// Convert data from Catalyst to Hive format
val hiveString = inspectors.wrap("world", stringInspector)
val hiveInt = inspectors.wrap(123, intInspector)
```

```scala
case class DecimalTypeInspector(precision: Int, scale: Int) extends PrimitiveObjectInspector
```

Usage Example:
```scala
// Handle Hive decimal types with precision and scale
val decimalType = DecimalType(10, 2) // DECIMAL(10,2)
val decimalInspector = inspectors.toInspector(decimalType)

// Convert Hive BigDecimal to Catalyst Decimal
val hiveDecimal = new java.math.BigDecimal("123.45")
val catalystDecimal = inspectors.unwrap(hiveDecimal, decimalInspector)

// Convert Catalyst Decimal to Hive BigDecimal
val catalystDecimalValue = Decimal(BigDecimal("123.45"), 10, 2)
val hiveDecimalValue = inspectors.wrap(catalystDecimalValue, decimalInspector)
```

```scala
case class ArrayTypeInspector(elementInspector: ObjectInspector) extends ListObjectInspector
```

Usage Examples:
```scala
// Array of strings: ARRAY<STRING>
val arrayType = ArrayType(StringType)
val arrayInspector = inspectors.toInspector(arrayType)

// Convert Hive array to Catalyst array
val hiveArray = java.util.Arrays.asList("a", "b", "c")
val catalystArray = inspectors.unwrap(hiveArray, arrayInspector)

// Convert Catalyst array to Hive array
val catalystArrayValue = Array("x", "y", "z")
val hiveArrayValue = inspectors.wrap(catalystArrayValue, arrayInspector)

// Complex nested arrays: ARRAY<ARRAY<INT>>
val nestedArrayType = ArrayType(ArrayType(IntegerType))
val nestedArrayInspector = inspectors.toInspector(nestedArrayType)
```

```scala
case class MapTypeInspector(
  keyInspector: ObjectInspector,
  valueInspector: ObjectInspector
) extends MapObjectInspector
```

Usage Examples:
```scala
// Map type: MAP<STRING, INT>
val mapType = MapType(StringType, IntegerType)
val mapInspector = inspectors.toInspector(mapType)

// Convert Hive map to Catalyst map
val hiveMap = new java.util.HashMap[String, Integer]()
hiveMap.put("key1", 100)
hiveMap.put("key2", 200)
val catalystMap = inspectors.unwrap(hiveMap, mapInspector)

// Convert Catalyst map to Hive map
val catalystMapValue = Map("a" -> 1, "b" -> 2)
val hiveMapValue = inspectors.wrap(catalystMapValue, mapInspector)

// Complex map: MAP<STRING, ARRAY<INT>>
val complexMapType = MapType(StringType, ArrayType(IntegerType))
val complexMapInspector = inspectors.toInspector(complexMapType)
```

```scala
case class StructTypeInspector(
  fields: Seq[StructField],
  fieldInspectors: Seq[ObjectInspector]
) extends StructObjectInspector
```

Usage Examples:
```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.unsafe.types.UTF8String

// Struct type: STRUCT<name:STRING, age:INT, active:BOOLEAN>
val structType = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType),
  StructField("active", BooleanType)
))
val structInspector = inspectors.toInspector(structType)

// Convert Hive struct to Catalyst row
val hiveStruct = Array("John", java.lang.Integer.valueOf(30), java.lang.Boolean.TRUE)
val catalystRow = inspectors.unwrap(hiveStruct, structInspector)

// Convert Catalyst row to Hive struct (strings are UTF8String in Catalyst's internal rows)
val catalystRowValue = InternalRow.fromSeq(Seq(UTF8String.fromString("Jane"), 25, false))
val hiveStructValue = inspectors.wrap(catalystRowValue, structInspector)

// Nested struct: STRUCT<person:STRUCT<name:STRING, age:INT>, score:DOUBLE>
val nestedStructType = StructType(Seq(
  StructField("person", StructType(Seq(
    StructField("name", StringType),
    StructField("age", IntegerType)
  ))),
  StructField("score", DoubleType)
))
```

```scala
trait SerDeSupport extends HiveInspectors {
  def deserialize(writable: Writable, serDe: Deserializer): Any
  def serialize(data: Any, serDe: Serializer): Writable
}
```

Usage Examples:
```scala
import java.util.Properties
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.Text

// JSON SerDe integration
val conf = new Configuration()
val jsonSerDe = new org.apache.hive.hcatalog.data.JsonSerDe()
jsonSerDe.initialize(conf, properties) // "columns" and "columns.types" must be set in properties
val jsonInspector = jsonSerDe.getObjectInspector()

// Deserialize JSON data
val jsonWritable = new Text("""{"name": "John", "age": 30}""")
val catalystData = inspectors.deserialize(jsonWritable, jsonSerDe)

// RegEx SerDe integration
val regexSerDe = new org.apache.hadoop.hive.serde2.RegexSerDe()
regexSerDe.initialize(conf, properties) // also needs "input.regex" in properties
val regexInspector = regexSerDe.getObjectInspector()
```

```scala
class LazySimpleSerDeInspector(
  fieldSeparator: String,
  fieldInspectors: Seq[ObjectInspector]
) extends StructObjectInspector
```

Usage Example:
```scala
// Default Hive text format with tab separation
val lazySerDe = new org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe()
val properties = new Properties()
properties.setProperty("field.delim", "\t")
properties.setProperty("columns", "id,name,amount")
properties.setProperty("columns.types", "bigint:string:double")
lazySerDe.initialize(conf, properties)
val lazyInspector = lazySerDe.getObjectInspector()

// Parse delimited text data
val textData = new Text("123\tJohn Doe\t45.67")
val parsedData = inspectors.unwrap(
  lazySerDe.deserialize(textData),
  lazyInspector
)
```
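`serialize` is the reverse path. A sketch under stated assumptions: `serdeSupport` stands for some `SerDeSupport` implementation (hypothetical wiring), reusing the initialized `lazySerDe` and `lazyInspector` from above:

```scala
// Sketch: write a row back out through the SerDe.
// `serdeSupport` is an assumed SerDeSupport implementation (hypothetical).
val hiveRow = inspectors.wrap(Seq(123L, "John Doe", 45.67), lazyInspector)
val outputWritable = serdeSupport.serialize(hiveRow, lazySerDe)
```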
```scala
trait UDFTypeSupport extends HiveInspectors {
  def invokeUDF(udf: GenericUDF, args: Seq[Any]): Any
  def convertUDFResult(result: Any, outputInspector: ObjectInspector): Any
}
```

Usage Examples:
```scala
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory

// Custom UDF with type conversion
class StringLengthUDF extends GenericUDF {
  override def initialize(arguments: Array[ObjectInspector]): ObjectInspector = {
    // Return integer inspector for string length
    PrimitiveObjectInspectorFactory.javaIntObjectInspector
  }

  override def evaluate(arguments: Array[DeferredObject]): AnyRef = {
    val stringArg = arguments(0).get().asInstanceOf[String]
    java.lang.Integer.valueOf(stringArg.length)
  }

  // Required by GenericUDF: textual form shown in EXPLAIN output
  override def getDisplayString(children: Array[String]): String =
    s"string_length(${children.mkString(", ")})"
}

// Register and use UDF
val lengthUDF = new StringLengthUDF()
val inputInspectors = Array[ObjectInspector](
  PrimitiveObjectInspectorFactory.javaStringObjectInspector
)
val outputInspector = lengthUDF.initialize(inputInspectors)

// Convert arguments and invoke UDF
val catalystString = "Hello World"
val hiveString = inspectors.wrap(catalystString, inputInspectors(0))
val result = lengthUDF.evaluate(Array(new GenericUDF.DeferredJavaObject(hiveString)))
val catalystResult = inspectors.unwrap(result, outputInspector)
```
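The `UDFTypeSupport` entry points wrap this same flow; a sketch assuming an implementing instance, here called `udfSupport` (hypothetical):

```scala
// Sketch: invoke through UDFTypeSupport rather than calling evaluate directly.
// `udfSupport` is an assumed UDFTypeSupport implementation (hypothetical).
val rawResult = udfSupport.invokeUDF(lengthUDF, Seq("Hello World"))
val lengthValue = udfSupport.convertUDFResult(rawResult, outputInspector)
```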
```scala
case class UnionTypeInspector(
  typeInspectors: Seq[ObjectInspector]
) extends UnionObjectInspector
```

Usage Example:
```scala
// Union type: UNIONTYPE<STRING, INT, DOUBLE>
val unionInspectors = Seq(
  PrimitiveObjectInspectorFactory.javaStringObjectInspector,
  PrimitiveObjectInspectorFactory.javaIntObjectInspector,
  PrimitiveObjectInspectorFactory.javaDoubleObjectInspector
)
val unionInspector = new UnionTypeInspector(unionInspectors)

// Handle union data (tag + value format)
val unionData = Array(0.toByte, "string_value") // Tag 0 = string
val catalystUnion = inspectors.unwrap(unionData, unionInspector)
```

```scala
val voidInspector: VoidObjectInspector =
  PrimitiveObjectInspectorFactory.javaVoidObjectInspector
```

Used for NULL values and empty results in Hive operations.
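For example, a NULL passed through the void inspector stays NULL; a minimal check reusing the `inspectors` instance from the earlier examples:

```scala
// NULL handling through the void inspector
val nullValue = inspectors.unwrap(null, voidInspector)
assert(nullValue == null)
```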
```scala
def isCompatible(catalystType: DataType, hiveInspector: ObjectInspector): Boolean
def canConvert(fromType: DataType, toType: DataType): Boolean
```

Usage Examples:
```scala
// Check type compatibility
val catalystType = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType)
))
val hiveInspector: ObjectInspector = ??? // obtained from Hive table metadata
val compatible = inspectors.isCompatible(catalystType, hiveInspector)

if (compatible) {
  // Proceed with data conversion
} else {
  // Handle schema mismatch
  println("Schema incompatible - check field types and names")
}
```
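`canConvert` answers the same question for scalar coercions before one is attempted; a short sketch (the expected outcomes follow from the declared semantics, not from a verified implementation):

```scala
// Probe conversion paths before coercing
val stringToInt = inspectors.canConvert(StringType, IntegerType) // parseable-string path
val intToDouble = inspectors.canConvert(IntegerType, DoubleType) // widening path
```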
```scala
def coerceType(value: Any, fromType: DataType, toType: DataType): Any
```

Usage Example:

```scala
// Automatic type coercion for compatible types
val stringValue = "123"
val intValue = inspectors.coerceType(stringValue, StringType, IntegerType)
// Result: 123 as Integer
val longValue = 123L
val doubleValue = inspectors.coerceType(longValue, LongType, DoubleType)
// Result: 123.0 as Double
```

ObjectInspector construction can be cached so repeated lookups of the same type reuse one instance:

```scala
import java.util.concurrent.ConcurrentHashMap

class CachedObjectInspectorFactory extends HiveInspectors {
  private val cache = new ConcurrentHashMap[DataType, ObjectInspector]()

  def getInspector(dataType: DataType): ObjectInspector =
    cache.computeIfAbsent(dataType, _ => createInspector(dataType))

  // Construction delegates to HiveInspectors.toInspector
  private def createInspector(dataType: DataType): ObjectInspector =
    toInspector(dataType)
}
```
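A quick usage sketch: a second lookup for the same type is served from the cache and returns the identical instance.

```scala
val factory = new CachedObjectInspectorFactory
val first = factory.getInspector(StringType)
val second = factory.getInspector(StringType)
assert(first eq second) // second lookup hits the cache
```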
```scala
def convertBatch(
  data: Seq[Any],
  fromInspector: ObjectInspector,
  toInspector: ObjectInspector
): Seq[Any]
```

Usage Example:
```scala
// Efficient batch conversion for large datasets
val hiveData = Seq(/* large collection of Hive objects */)
val catalystData = inspectors.convertBatch(
  hiveData,
  hiveInspector,
  catalystInspector
)
```

Conversion failures surface as standard exceptions and can be handled explicitly:
```scala
// Handling data conversion errors
try {
  val result = inspectors.unwrap(hiveData, inspector)
} catch {
  case e: ClassCastException =>
    println("Type conversion failed - data type mismatch")
  case e: NumberFormatException =>
    println("Numeric conversion failed - invalid format")
  case e: UnsupportedOperationException =>
    println("Unsupported type conversion")
}

// Handling inspector creation errors
try {
  val inspector = inspectors.toInspector(complexType)
} catch {
  case e: IllegalArgumentException =>
    println("Invalid type definition")
  case e: UnsupportedTypeException =>
    println("Type not supported in Hive")
}
```
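Where a single bad value should not abort a whole batch, the handlers above can be folded into a small helper; the `Either`-based wrapper below is an illustrative pattern, not part of this module's API:

```scala
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector

// Illustrative helper: capture conversion failures per value instead of failing the batch
def safeUnwrap(data: Any, oi: ObjectInspector): Either[String, Any] =
  try {
    Right(inspectors.unwrap(data, oi))
  } catch {
    case e: ClassCastException            => Left(s"type mismatch: ${e.getMessage}")
    case e: NumberFormatException         => Left(s"bad numeric format: ${e.getMessage}")
    case e: UnsupportedOperationException => Left(s"unsupported conversion: ${e.getMessage}")
  }
```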