Describes: mavenpkg:maven/org.apache.spark/spark-hive_2.11@1.6.x

docs/
  client-interface.md
  execution-engine.md
  hive-context.md
  hiveql-parser.md
  index.md
  orc-support.md
  type-system.md
  udf-support.md
tile.json

tessl/maven-org-apache-spark--spark-hive

tessl install tessl/maven-org-apache-spark--spark-hive@1.6.0

Apache Spark SQL Hive integration module providing HiveContext, metastore operations, HiveQL parsing, and Hive data format compatibility

docs/type-system.md

Type System

The Spark Hive Type System provides bidirectional conversion between Spark SQL Catalyst types and Hive ObjectInspectors, enabling seamless data exchange between Spark and Hive ecosystems. This system handles all Hive data types including primitives, complex types, and custom SerDe formats.

Core Type Conversion

HiveInspectors Trait

trait HiveInspectors {
  def toInspector(dataType: DataType): ObjectInspector
  def unwrap(data: Any, oi: ObjectInspector): Any
  def wrap(a: Any, oi: ObjectInspector): AnyRef
}

toInspector - Converts Catalyst DataType to Hive ObjectInspector

unwrap - Converts Hive data representation to Catalyst internal format

wrap - Converts Catalyst data to Hive-compatible format

Primitive Type Conversions

Basic Data Types

// Catalyst to Hive ObjectInspector mappings
val primitiveInspectorMap: Map[DataType, PrimitiveObjectInspector] = Map(
  StringType -> PrimitiveObjectInspectorFactory.javaStringObjectInspector,
  IntegerType -> PrimitiveObjectInspectorFactory.javaIntObjectInspector,
  LongType -> PrimitiveObjectInspectorFactory.javaLongObjectInspector,
  DoubleType -> PrimitiveObjectInspectorFactory.javaDoubleObjectInspector,
  FloatType -> PrimitiveObjectInspectorFactory.javaFloatObjectInspector,
  BooleanType -> PrimitiveObjectInspectorFactory.javaBooleanObjectInspector,
  ByteType -> PrimitiveObjectInspectorFactory.javaByteObjectInspector,
  ShortType -> PrimitiveObjectInspectorFactory.javaShortObjectInspector,
  DateType -> PrimitiveObjectInspectorFactory.javaDateObjectInspector,
  TimestampType -> PrimitiveObjectInspectorFactory.javaTimestampObjectInspector,
  BinaryType -> PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector
)

Usage Examples:

import org.apache.spark.sql.hive.HiveInspectors
import org.apache.spark.sql.types._

// HiveInspectors is a trait with concrete members, so an anonymous
// instance is enough to access the conversion methods
val inspectors = new HiveInspectors {}

// Convert Catalyst types to Hive ObjectInspectors
val stringInspector = inspectors.toInspector(StringType)
val intInspector = inspectors.toInspector(IntegerType)
val timestampInspector = inspectors.toInspector(TimestampType)

// Convert data from Hive to Catalyst format
val catalystString = inspectors.unwrap("hello", stringInspector)
val catalystInt = inspectors.unwrap(java.lang.Integer.valueOf(42), intInspector)

// Convert data from Catalyst to Hive format
val hiveString = inspectors.wrap("world", stringInspector)
val hiveInt = inspectors.wrap(123, intInspector)

Decimal Type Handling

case class DecimalTypeInspector(precision: Int, scale: Int) extends PrimitiveObjectInspector

Usage Example:

// Handle Hive decimal types with precision and scale
val decimalType = DecimalType(10, 2)  // DECIMAL(10,2)
val decimalInspector = inspectors.toInspector(decimalType)

// Convert Hive BigDecimal to Catalyst Decimal
val hiveDecimal = new java.math.BigDecimal("123.45")
val catalystDecimal = inspectors.unwrap(hiveDecimal, decimalInspector)

// Convert Catalyst Decimal to Hive BigDecimal
val catalystDecimalValue = Decimal(BigDecimal("123.45"), 10, 2)
val hiveDecimalValue = inspectors.wrap(catalystDecimalValue, decimalInspector)

Complex Type Conversions

Array Types

case class ArrayTypeInspector(elementInspector: ObjectInspector) extends ListObjectInspector

Usage Examples:

// Array of strings: ARRAY<STRING>
val arrayType = ArrayType(StringType)
val arrayInspector = inspectors.toInspector(arrayType)

// Convert Hive array to Catalyst array
val hiveArray = java.util.Arrays.asList("a", "b", "c")
val catalystArray = inspectors.unwrap(hiveArray, arrayInspector)

// Convert Catalyst array to Hive array
val catalystArrayValue = Array("x", "y", "z")
val hiveArrayValue = inspectors.wrap(catalystArrayValue, arrayInspector)

// Complex nested arrays: ARRAY<ARRAY<INT>>
val nestedArrayType = ArrayType(ArrayType(IntegerType))
val nestedArrayInspector = inspectors.toInspector(nestedArrayType)
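
Nested arrays convert recursively, one level per inspector. A minimal sketch continuing the example above, reusing the `inspectors` instance from the earlier examples:

// Unwrap a Java list of lists into the Catalyst representation
val hiveNestedArray = java.util.Arrays.asList(
  java.util.Arrays.asList(1: Integer, 2: Integer),
  java.util.Arrays.asList(3: Integer)
)
val catalystNestedArray = inspectors.unwrap(hiveNestedArray, nestedArrayInspector)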

Map Types

case class MapTypeInspector(
  keyInspector: ObjectInspector,
  valueInspector: ObjectInspector
) extends MapObjectInspector

Usage Examples:

// Map type: MAP<STRING, INT>
val mapType = MapType(StringType, IntegerType)
val mapInspector = inspectors.toInspector(mapType)

// Convert Hive map to Catalyst map
val hiveMap = new java.util.HashMap[String, Integer]()
hiveMap.put("key1", 100)
hiveMap.put("key2", 200)
val catalystMap = inspectors.unwrap(hiveMap, mapInspector)

// Convert Catalyst map to Hive map
val catalystMapValue = Map("a" -> 1, "b" -> 2)
val hiveMapValue = inspectors.wrap(catalystMapValue, mapInspector)

// Complex map: MAP<STRING, ARRAY<INT>>
val complexMapType = MapType(StringType, ArrayType(IntegerType))
val complexMapInspector = inspectors.toInspector(complexMapType)

Struct Types

case class StructTypeInspector(
  fields: Seq[StructField],
  fieldInspectors: Seq[ObjectInspector]
) extends StructObjectInspector

Usage Examples:

// Struct type: STRUCT<name:STRING, age:INT, active:BOOLEAN>
val structType = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType),
  StructField("active", BooleanType)
))
val structInspector = inspectors.toInspector(structType)

// Convert Hive struct to Catalyst row
val hiveStruct = Array("John", java.lang.Integer.valueOf(30), java.lang.Boolean.TRUE)
val catalystRow = inspectors.unwrap(hiveStruct, structInspector)

// Convert Catalyst row to Hive struct
import org.apache.spark.sql.catalyst.InternalRow
val catalystRowValue = InternalRow.fromSeq(Seq("Jane", 25, false))
val hiveStructValue = inspectors.wrap(catalystRowValue, structInspector)

// Nested struct: STRUCT<person:STRUCT<name:STRING, age:INT>, score:DOUBLE>
val nestedStructType = StructType(Seq(
  StructField("person", StructType(Seq(
    StructField("name", StringType),
    StructField("age", IntegerType)
  ))),
  StructField("score", DoubleType)
))
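
Nested structs convert field by field. A minimal sketch continuing the example above, reusing the `inspectors` instance; the inner struct is itself represented as a nested InternalRow:

val nestedStructInspector = inspectors.toInspector(nestedStructType)

// The person field is a nested row inside the outer row
val nestedRowValue = InternalRow.fromSeq(Seq(
  InternalRow.fromSeq(Seq("Ann", 41)),
  98.5
))
val hiveNestedStruct = inspectors.wrap(nestedRowValue, nestedStructInspector)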

SerDe Integration

Custom SerDe Support

trait SerDeSupport extends HiveInspectors {
  def deserialize(writable: Writable, serDe: Deserializer): Any
  def serialize(data: Any, serDe: Serializer): Writable
}

Usage Examples:

import java.util.Properties
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.Text
val conf = new Configuration()

// JSON SerDe integration (the SerDe must be initialized with a schema first)
val jsonSerDe = new org.apache.hive.hcatalog.data.JsonSerDe()
val jsonProps = new Properties()
jsonProps.setProperty("columns", "name,age")
jsonProps.setProperty("columns.types", "string:int")
jsonSerDe.initialize(conf, jsonProps)
val jsonInspector = jsonSerDe.getObjectInspector()

// Deserialize JSON data
val jsonWritable = new Text("""{"name": "John", "age": 30}""")
val catalystData = inspectors.deserialize(jsonWritable, jsonSerDe)

// RegEx SerDe integration (requires input.regex; only string columns)
val regexSerDe = new org.apache.hadoop.hive.serde2.RegexSerDe()
val regexProps = new Properties()
regexProps.setProperty("columns", "host,request")
regexProps.setProperty("columns.types", "string:string")
regexProps.setProperty("input.regex", "(\\S+) (.+)")
regexSerDe.initialize(conf, regexProps)
val regexInspector = regexSerDe.getObjectInspector()

LazySimpleSerDe

class LazySimpleSerDeInspector(
  fieldSeparator: String,
  fieldInspectors: Seq[ObjectInspector]
) extends StructObjectInspector

Usage Example:

// Default Hive text format with tab separation
val lazySerDe = new org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe()
val properties = new Properties()
properties.setProperty("field.delim", "\t")
properties.setProperty("columns", "id,name,amount")
properties.setProperty("columns.types", "bigint:string:double")

lazySerDe.initialize(conf, properties)
val lazyInspector = lazySerDe.getObjectInspector()

// Parse delimited text data
val textData = new Text("123\tJohn Doe\t45.67")
val parsedData = inspectors.unwrap(
  lazySerDe.deserialize(textData), 
  lazyInspector
)

UDF Type Integration

User-Defined Function Support

trait UDFTypeSupport extends HiveInspectors {
  def invokeUDF(udf: GenericUDF, args: Seq[Any]): Any
  def convertUDFResult(result: Any, outputInspector: ObjectInspector): Any
}

Usage Examples:

// Custom UDF with type conversion
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory

class StringLengthUDF extends GenericUDF {
  override def initialize(arguments: Array[ObjectInspector]): ObjectInspector = {
    // Return integer inspector for string length
    PrimitiveObjectInspectorFactory.javaIntObjectInspector
  }

  override def evaluate(arguments: Array[DeferredObject]): AnyRef = {
    val stringArg = arguments(0).get().asInstanceOf[String]
    java.lang.Integer.valueOf(stringArg.length)
  }

  // GenericUDF also requires a display string for EXPLAIN output
  override def getDisplayString(children: Array[String]): String =
    s"string_length(${children.mkString(", ")})"
}

// Register and use UDF
val lengthUDF = new StringLengthUDF()
val inputInspectors = Array[ObjectInspector](
  PrimitiveObjectInspectorFactory.javaStringObjectInspector
)
val outputInspector = lengthUDF.initialize(inputInspectors)

// Convert arguments and invoke the UDF. DeferredObjectAdapter is Spark's
// internal DeferredObject wrapper, shown here for illustration.
val catalystString = "Hello World"
val hiveString = inspectors.wrap(catalystString, inputInspectors(0))
val result = lengthUDF.evaluate(Array(new DeferredObjectAdapter(hiveString)))
val catalystResult = inspectors.unwrap(result, outputInspector)

Advanced Type Features

Union Types

case class UnionTypeInspector(
  typeInspectors: Seq[ObjectInspector]
) extends UnionObjectInspector

Usage Example:

// Union type: UNIONTYPE<STRING, INT, DOUBLE>
val unionInspectors = Seq(
  PrimitiveObjectInspectorFactory.javaStringObjectInspector,
  PrimitiveObjectInspectorFactory.javaIntObjectInspector,
  PrimitiveObjectInspectorFactory.javaDoubleObjectInspector
)
val unionInspector = new UnionTypeInspector(unionInspectors)

// Handle union data (tag + value format)
val unionData = Array(0.toByte, "string_value")  // Tag 0 = string
val catalystUnion = inspectors.unwrap(unionData, unionInspector)

Void Types

val voidInspector: VoidObjectInspector = 
  PrimitiveObjectInspectorFactory.javaVoidObjectInspector

Used for NULL values and empty results in Hive operations.
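
A minimal sketch, reusing the `inspectors` instance from the earlier examples:

// The void inspector round-trips NULL: both directions yield null
val nullFromHive = inspectors.unwrap(null, voidInspector)   // null
val nullToHive = inspectors.wrap(null, voidInspector)       // null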

Type Validation and Conversion

Schema Compatibility

def isCompatible(catalystType: DataType, hiveInspector: ObjectInspector): Boolean
def canConvert(fromType: DataType, toType: DataType): Boolean

Usage Examples:

// Check type compatibility
val catalystType = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType)
))

val hiveInspector: ObjectInspector = ??? // obtained from Hive table metadata
val compatible = inspectors.isCompatible(catalystType, hiveInspector)

if (compatible) {
  // Proceed with data conversion
} else {
  // Handle schema mismatch
  println("Schema incompatible - check field types and names")
}

Type Coercion

def coerceType(value: Any, fromType: DataType, toType: DataType): Any

Usage Example:

// Automatic type coercion for compatible types
val stringValue = "123"
val intValue = inspectors.coerceType(stringValue, StringType, IntegerType)
// Result: 123 as Integer

val longValue = 123L
val doubleValue = inspectors.coerceType(longValue, LongType, DoubleType)
// Result: 123.0 as Double

Performance Optimizations

ObjectInspector Caching

import java.util.concurrent.ConcurrentHashMap
import java.util.function.{Function => JFunction}

class CachedObjectInspectorFactory extends HiveInspectors {
  private val cache = new ConcurrentHashMap[DataType, ObjectInspector]()

  def getInspector(dataType: DataType): ObjectInspector = {
    // Delegate to HiveInspectors.toInspector on a cache miss
    cache.computeIfAbsent(dataType, new JFunction[DataType, ObjectInspector] {
      override def apply(dt: DataType): ObjectInspector = toInspector(dt)
    })
  }
}
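
A minimal usage sketch, assuming a single factory instance is shared across conversions:

val inspectorFactory = new CachedObjectInspectorFactory

// Repeated lookups for the same DataType return the cached instance
val first = inspectorFactory.getInspector(StringType)
val second = inspectorFactory.getInspector(StringType)
assert(first eq second)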

Bulk Conversion

def convertBatch(
  data: Seq[Any], 
  fromInspector: ObjectInspector,
  toInspector: ObjectInspector
): Seq[Any]

Usage Example:

// Efficient batch conversion for large datasets
val hiveData = Seq(/* large collection of Hive objects */)
val catalystData = inspectors.convertBatch(
  hiveData,
  hiveInspector,
  catalystInspector
)

Error Handling

Type Conversion Errors

try {
  val result = inspectors.unwrap(hiveData, inspector)
} catch {
  case e: ClassCastException =>
    println("Type conversion failed - data type mismatch")
  case e: NumberFormatException =>
    println("Numeric conversion failed - invalid format")
  case e: UnsupportedOperationException =>
    println("Unsupported type conversion")
}

Schema Validation Errors

try {
  val inspector = inspectors.toInspector(complexType)
} catch {
  case e: IllegalArgumentException =>
    println("Invalid type definition")
  case e: UnsupportedOperationException =>
    println("Type not supported in Hive")
}

Best Practices

Type System Usage

  1. Cache ObjectInspectors for frequently used types
  2. Validate compatibility before conversion
  3. Handle NULL values explicitly (see the sketch after this list)
  4. Use appropriate precision for decimal types
  5. Consider SerDe overhead for complex types
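
A minimal sketch for item 3, reusing the `inspectors` instance and `stringInspector` from the earlier examples; `safeUnwrap` is a hypothetical helper:

// Guard against NULLs before unwrapping so downstream code always sees
// an explicit Option rather than a silent null
def safeUnwrap(data: Any, oi: ObjectInspector): Option[Any] =
  Option(data).map(d => inspectors.unwrap(d, oi))

safeUnwrap(null, stringInspector)      // None
safeUnwrap("hello", stringInspector)   // Some(...)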

Performance Tips

  1. Batch conversions for large datasets
  2. Reuse inspectors across operations
  3. Minimize type conversions in hot paths
  4. Profile conversion overhead for critical queries
  5. Use native types when possible

Error Prevention

  1. Validate schemas before processing
  2. Handle type coercion gracefully
  3. Test with edge cases (NULL, empty collections)
  4. Monitor conversion failures in production
  5. Provide fallback mechanisms for type mismatches (see the sketch below)
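
For item 5, one possible fallback shape using scala.util.Try; `unwrapOrString` is a hypothetical helper, not part of the module's API:

import scala.util.Try

// Fall back to the raw value's string form when conversion fails,
// instead of failing the whole task
def unwrapOrString(data: Any, oi: ObjectInspector): Any =
  Try(inspectors.unwrap(data, oi)).getOrElse(String.valueOf(data))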