```bash
tessl install tessl/maven-org-apache-spark--spark-protobuf_2-13@3.5.0
```

Apache Spark connector for Protocol Buffer (protobuf) data format support, providing SQL functions to convert between binary protobuf data and Catalyst data structures for processing structured data in distributed big data analytics pipelines.
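
For orientation, a minimal sketch of the SQL-function round trip mentioned above; the DataFrame, the binary column name, the message name, and the descriptor path are placeholder assumptions, and the schema utilities documented below support this conversion path.

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.protobuf.functions.{from_protobuf, to_protobuf}

// Sketch: `df` is assumed to have a BinaryType column "value" holding serialized
// PersonMessage records; the message name and the .desc path are placeholders.
def roundTrip(df: DataFrame): DataFrame = {
  val parsed = df.select(
    from_protobuf(col("value"), "PersonMessage", "/path/to/person.desc").as("person"))
  parsed.select(
    to_protobuf(col("person"), "PersonMessage", "/path/to/person.desc").as("value"))
}
```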
Tools for converting between Protobuf descriptors and Spark SQL schemas with advanced type mapping and schema inference capabilities.
Main utility object providing schema conversion functionality between Protobuf and Spark SQL.

```scala
/**
* Utility object for converting Protobuf schemas to Spark SQL schemas
* @since 3.4.0
*/
@DeveloperApi
object SchemaConverters {
/**
* Converts a Protobuf schema to corresponding Spark SQL schema
* @param descriptor - The Protobuf message descriptor
* @param protobufOptions - Configuration options for schema conversion
* @return SchemaType containing the converted DataType and nullability
* @since 3.4.0
*/
def toSqlType(
descriptor: Descriptor,
protobufOptions: ProtobufOptions = ProtobufOptions(Map.empty)
): SchemaType
}
```

Internal wrapper for SQL data type and nullability information.

```scala
/**
* Internal wrapper for SQL data type and nullability
* @param dataType - The Spark SQL DataType
* @param nullable - Whether the type can contain null values
* @since 3.4.0
*/
case class SchemaType(dataType: DataType, nullable: Boolean)
```

Helper class for performing field lookup and matching between Protobuf and Catalyst schemas.

```scala
/**
* Helper class for field lookup/matching on Protobuf schemas
* @param descriptor - The Protobuf descriptor to search for fields
* @param catalystSchema - The Catalyst schema to use for matching
* @param protoPath - Sequence of parent field names leading to protoSchema
* @param catalystPath - Sequence of parent field names leading to catalystSchema
*/
class ProtoSchemaHelper(
descriptor: Descriptor,
catalystSchema: StructType,
protoPath: Seq[String],
catalystPath: Seq[String]
) {
/** The fields which have matching equivalents in both Protobuf and Catalyst schemas */
val matchedFields: Seq[ProtoMatchedField]
/** Validate that there are no extra Catalyst fields without Protobuf matches */
def validateNoExtraCatalystFields(ignoreNullable: Boolean): Unit
/** Validate that there are no extra required Protobuf fields without Catalyst matches */
def validateNoExtraRequiredProtoFields(): Unit
/** Extract a field from the Protobuf schema by name with case sensitivity handling */
def getFieldByName(name: String): Option[FieldDescriptor]
}
```

Wrapper for paired Catalyst and Protobuf fields.

```scala
/**
* Wrapper for a pair of matched fields, one Catalyst and one corresponding Protobuf field
* @param catalystField - The Catalyst StructField
* @param catalystPosition - Position of the field in the Catalyst schema
* @param fieldDescriptor - The corresponding Protobuf FieldDescriptor
*/
case class ProtoMatchedField(
catalystField: StructField,
catalystPosition: Int,
fieldDescriptor: FieldDescriptor
)
```

Core utility functions for building descriptors and working with protobuf schemas.

```scala
/**
* Utility object providing protobuf descriptor building and management functions
*/
object ProtobufUtils {
/**
* Builds Protobuf message descriptor from Java class or serialized descriptor
* @param messageName - Protobuf message name or Java class name (when binaryFileDescriptorSet is None)
* @param binaryFileDescriptorSet - When provided, descriptor and dependencies are read from it
* @return The built Protobuf Descriptor
*/
def buildDescriptor(
messageName: String,
binaryFileDescriptorSet: Option[Array[Byte]]
): Descriptor
/**
* Builds descriptor from binary FileDescriptorSet and message name
* @param binaryFileDescriptorSet - Serialized FileDescriptorSet bytes
* @param messageName - The protobuf message name to find
* @return The built Protobuf Descriptor
*/
def buildDescriptor(
binaryFileDescriptorSet: Array[Byte],
messageName: String
): Descriptor
/**
* Builds descriptor from Java class using reflection
* @param protobufClassName - Full Java class name (e.g., com.example.MyMessage)
* @return The built Protobuf Descriptor
*/
def buildDescriptorFromJavaClass(protobufClassName: String): Descriptor
/**
* Reads protobuf descriptor file content from filesystem
* @param filePath - Path to the descriptor file
* @return Byte array containing the descriptor file content
*/
def readDescriptorFileContent(filePath: String): Array[Byte]
/**
* Builds TypeRegistry from descriptor bytes for Any field support
* @param descriptorBytes - Serialized descriptor bytes
* @return TypeRegistry for protobuf Any type resolution
*/
def buildTypeRegistry(descriptorBytes: Array[Byte]): TypeRegistry
/**
* Builds TypeRegistry from descriptor for Any field support
* @param descriptor - Protobuf message descriptor
* @return TypeRegistry for protobuf Any type resolution
*/
def buildTypeRegistry(descriptor: Descriptor): TypeRegistry
}
```

Basic usage: build a descriptor from a compiled descriptor file and convert it to a Spark SQL schema.

```scala
import org.apache.spark.sql.protobuf.utils.{SchemaConverters, ProtobufUtils, ProtobufOptions}
import com.google.protobuf.{Descriptors, TypeRegistry}
import com.google.protobuf.Descriptors.Descriptor
// Build descriptor from file
val descriptor: Descriptor = ProtobufUtils.buildDescriptor(
"PersonMessage",
Some(ProtobufUtils.readDescriptorFileContent("/path/to/person.desc"))
)
// Convert to Spark SQL schema
val schemaType = SchemaConverters.toSqlType(descriptor)
val sparkSchema = schemaType.dataType
println(s"Converted schema: ${sparkSchema.prettyJson}")
println(s"Is nullable: ${schemaType.nullable}")import scala.collection.JavaConverters._
// Configure schema conversion options
val options = ProtobufOptions(Map(
"recursive.fields.max.depth" -> "3",
"enums.as.ints" -> "true",
"convert.any.fields.to.json" -> "true"
))
val schemaWithOptions = SchemaConverters.toSqlType(descriptor, options)
// The resulting schema will have:
// - Recursive fields limited to depth 3
// - Enums as IntegerType instead of StringType
// - Any fields as StringType for JSON conversion
```
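
The enum and recursion options are illustrated later; for the Any option, a hedged sketch (the AnyMessage definition and anyMessageDescriptor are hypothetical):

```scala
// Hypothetical message:
// message AnyMessage {
//   string id = 1;
//   google.protobuf.Any payload = 2;
// }
val anyOptions = ProtobufOptions(Map("convert.any.fields.to.json" -> "true"))
val anySchema = SchemaConverters.toSqlType(anyMessageDescriptor, anyOptions)
// With the option enabled, the payload field maps to StringType and holds the
// packed message rendered as JSON (see the type mapping table below)
```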

Use ProtoSchemaHelper to match and validate fields between a Protobuf descriptor and an expected Catalyst schema:

```scala
import org.apache.spark.sql.types._
import org.apache.spark.sql.protobuf.utils.ProtobufUtils.ProtoSchemaHelper
// Define expected Catalyst schema
val catalystSchema = StructType(Array(
StructField("name", StringType, nullable = false),
StructField("age", IntegerType, nullable = false),
StructField("email", StringType, nullable = true)
))
// Create schema helper for validation
val schemaHelper = new ProtoSchemaHelper(
descriptor = descriptor,
catalystSchema = catalystSchema,
protoPath = Seq.empty,
catalystPath = Seq.empty
)
// Get matched fields
val matchedFields = schemaHelper.matchedFields
matchedFields.foreach { matched =>
println(s"Matched: ${matched.catalystField.name} -> ${matched.fieldDescriptor.getName}")
}
// Validate schema compatibility
try {
schemaHelper.validateNoExtraCatalystFields(ignoreNullable = false)
schemaHelper.validateNoExtraRequiredProtoFields()
println("Schema validation passed")
} catch {
case e: Exception =>
println(s"Schema validation failed: ${e.getMessage}")
}
```

Complex Protobuf types such as repeated fields, maps, and well-known types map to corresponding Spark SQL types:

```scala
// Example protobuf message:
// message ComplexMessage {
// string name = 1;
// repeated int32 numbers = 2;
// map<string, string> metadata = 3;
// google.protobuf.Timestamp created_at = 4;
// google.protobuf.Duration timeout = 5;
// NestedMessage nested = 6;
// optional string description = 7;
// }
val complexDescriptor = ProtobufUtils.buildDescriptor("ComplexMessage", Some(descriptorBytes))
val complexSchema = SchemaConverters.toSqlType(complexDescriptor)
// Resulting Spark schema structure:
// StructType(Array(
// StructField("name", StringType, false),
// StructField("numbers", ArrayType(IntegerType), false),
// StructField("metadata", MapType(StringType, StringType), false),
// StructField("created_at", TimestampType, false),
// StructField("timeout", DayTimeIntervalType.defaultConcreteType, false),
// StructField("nested", StructType(...), false),
// StructField("description", StringType, true) // optional field
// ))
```

Recursive messages require recursive.fields.max.depth; the option bounds how deeply the recursion is expanded:

```scala
// Example with recursive message:
// message Person {
// string name = 1;
// Person friend = 2; // recursive field
// }
val recursiveOptions = ProtobufOptions(Map(
"recursive.fields.max.depth" -> "2"
))
val recursiveSchema = SchemaConverters.toSqlType(personDescriptor, recursiveOptions)
// With depth 2, the schema becomes:
// StructType(Array(
// StructField("name", StringType, false),
// StructField("friend", StructType(Array(
// StructField("name", StringType, false),
// StructField("friend", StructType(Array(
// StructField("name", StringType, false)
// // friend field truncated at depth 2
// )), true)
// )), true)
// ))
```

Enum fields convert to StringType by default, or to IntegerType when enums.as.ints is enabled:

```scala
// Example protobuf with enum:
// enum Status {
// UNKNOWN = 0;
// ACTIVE = 1;
// INACTIVE = 2;
// }
// message StatusMessage {
// Status status = 1;
// }
// Default behavior (enums as strings)
val enumAsStringSchema = SchemaConverters.toSqlType(statusDescriptor)
// Results in: StructField("status", StringType, false)
// Enums as integers
val enumAsIntOptions = ProtobufOptions(Map("enums.as.ints" -> "true"))
val enumAsIntSchema = SchemaConverters.toSqlType(statusDescriptor, enumAsIntOptions)
// Results in: StructField("status", IntegerType, false)import org.apache.spark.sql.types._
def inspectSchema(schemaType: SchemaType): Unit = {
def printSchema(dataType: DataType, indent: String = ""): Unit = {
dataType match {
case struct: StructType =>
println(s"${indent}StructType:")
struct.fields.foreach { field =>
println(s"${indent} ${field.name}: ${field.dataType.typeName} (nullable: ${field.nullable})")
if (field.dataType.isInstanceOf[StructType] || field.dataType.isInstanceOf[ArrayType]) {
printSchema(field.dataType, indent + " ")
}
}
case array: ArrayType =>
println(s"${indent}ArrayType of ${array.elementType.typeName}")
printSchema(array.elementType, indent + " ")
case map: MapType =>
println(s"${indent}MapType: ${map.keyType.typeName} -> ${map.valueType.typeName}")
printSchema(map.valueType, indent + " ")
case other =>
println(s"${indent}${other.typeName}")
}
}
printSchema(schemaType.dataType)
}
// Usage
val schema = SchemaConverters.toSqlType(descriptor)
inspectSchema(schema)
```

Descriptors and TypeRegistry instances can be built from descriptor files, raw bytes, or compiled Java classes:

```scala
import org.apache.spark.sql.protobuf.utils.ProtobufUtils
import com.google.protobuf.{Descriptors, TypeRegistry}
// Build descriptor from descriptor file
val descriptorBytes = ProtobufUtils.readDescriptorFileContent("/path/to/messages.desc")
val descriptor = ProtobufUtils.buildDescriptor("PersonMessage", Some(descriptorBytes))
// Build descriptor directly from bytes and message name
val descriptor2 = ProtobufUtils.buildDescriptor(descriptorBytes, "PersonMessage")
// Build descriptor from Java class (requires shaded protobuf classes)
val javaDescriptor = ProtobufUtils.buildDescriptorFromJavaClass(
"com.example.protos.PersonMessage"
)
// Build TypeRegistry for Any field support
val typeRegistry = ProtobufUtils.buildTypeRegistry(descriptorBytes)
val typeRegistryFromDescriptor = ProtobufUtils.buildTypeRegistry(descriptor)
```
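
As a hedged aside on what the TypeRegistry is for (this assumes protobuf-java-util is on the classpath, which this module's documentation does not state): the registry lets JSON printing resolve google.protobuf.Any payloads.

```scala
import com.google.protobuf.MessageOrBuilder
import com.google.protobuf.util.JsonFormat

// Sketch only: a TypeRegistry lets JsonFormat resolve google.protobuf.Any
// payloads when printing a message as JSON (requires protobuf-java-util).
val anyAwarePrinter = JsonFormat.printer().usingTypeRegistry(typeRegistry)

def toJson(message: MessageOrBuilder): String = anyAwarePrinter.print(message)
```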

Working with multiple descriptor files:

```scala
import org.apache.spark.sql.protobuf.utils.{ProtobufUtils, SchemaConverters}

// Reading multiple descriptor files
val descriptorFiles = List(
"/path/to/person.desc",
"/path/to/address.desc",
"/path/to/common.desc"
)
val allDescriptors = descriptorFiles.map { filePath =>
val bytes = ProtobufUtils.readDescriptorFileContent(filePath)
// Note: In practice, you'd combine these into a single FileDescriptorSet
// (see the sketch after this block)
bytes
}
// Build descriptors for different message types
val personDescriptor = ProtobufUtils.buildDescriptor("Person", Some(allDescriptors.head))
val addressDescriptor = ProtobufUtils.buildDescriptor("Address", Some(allDescriptors(1)))
// Create schemas for each
val personSchema = SchemaConverters.toSqlType(personDescriptor)
val addressSchema = SchemaConverters.toSqlType(addressDescriptor)
```
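
The note above about combining descriptor files, sketched under the assumption that protobuf-java's DescriptorProtos classes are available: protobuf merge semantics concatenate the repeated file field, so several serialized sets fold into one byte array.

```scala
import com.google.protobuf.DescriptorProtos.FileDescriptorSet

// Sketch: fold several serialized FileDescriptorSets into one, so a single
// byte array carries every message definition and its dependencies.
val mergedDescriptorBytes: Array[Byte] = descriptorFiles
  .foldLeft(FileDescriptorSet.newBuilder()) { (builder, path) =>
    builder.mergeFrom(ProtobufUtils.readDescriptorFileContent(path))
  }
  .build()
  .toByteArray

val mergedPersonDescriptor =
  ProtobufUtils.buildDescriptor("Person", Some(mergedDescriptorBytes))
```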

Wrap descriptor building in Try to handle missing files or unknown message names gracefully:

```scala
import scala.util.{Try, Success, Failure}
import com.google.protobuf.Descriptors.Descriptor
import org.apache.spark.sql.protobuf.utils.{ProtobufUtils, SchemaConverters}

def safelyBuildDescriptor(messageName: String, filePath: String): Option[Descriptor] = {
Try {
val bytes = ProtobufUtils.readDescriptorFileContent(filePath)
ProtobufUtils.buildDescriptor(messageName, Some(bytes))
} match {
case Success(descriptor) => Some(descriptor)
case Failure(exception) =>
println(s"Failed to build descriptor for $messageName: ${exception.getMessage}")
None
}
}
// Usage
safelyBuildDescriptor("PersonMessage", "/path/to/person.desc") match {
case Some(descriptor) =>
val schema = SchemaConverters.toSqlType(descriptor)
println(s"Successfully built schema: ${schema.dataType.prettyJson}")
case None =>
println("Failed to build descriptor")
}
```

Complete mapping between Protobuf and Spark SQL types:

| Protobuf Type | Spark SQL Type | Notes |
|---|---|---|
| int32, sint32, sfixed32 | IntegerType | 32-bit signed integer |
| int64, sint64, sfixed64 | LongType | 64-bit signed integer |
| uint32, fixed32 | LongType | Unsigned 32-bit as signed 64-bit |
| uint64, fixed64 | DecimalType | Unsigned 64-bit as decimal |
| float | FloatType | 32-bit floating point |
| double | DoubleType | 64-bit floating point |
| bool | BooleanType | Boolean value |
| string | StringType | UTF-8 string |
| bytes | BinaryType | Byte array |
| enum | StringType / IntegerType | Configurable via enums.as.ints |
| message | StructType | Nested structure |
| repeated T | ArrayType(T) | Array of type T |
| map<K,V> | MapType(K,V) | Map with key type K, value type V |
| google.protobuf.Timestamp | TimestampType | Protobuf nanoseconds truncated to Spark's microsecond precision |
| google.protobuf.Duration | DayTimeIntervalType | Time interval |
| google.protobuf.Any | StringType | JSON string with convert.any.fields.to.json |
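
A quick way to spot-check a few of these mappings, reusing the ComplexMessage descriptor from the earlier example (field names come from that sketch):

```scala
import org.apache.spark.sql.types._

// Spot-check a few rows of the mapping table against the converted schema.
val complexStruct =
  SchemaConverters.toSqlType(complexDescriptor).dataType.asInstanceOf[StructType]

assert(complexStruct("name").dataType == StringType)              // string    -> StringType
assert(complexStruct("numbers").dataType.isInstanceOf[ArrayType]) // repeated  -> ArrayType
assert(complexStruct("metadata").dataType.isInstanceOf[MapType])  // map       -> MapType
assert(complexStruct("created_at").dataType == TimestampType)     // Timestamp -> TimestampType
```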
Common schema conversion errors and solutions:

```scala
// Handle schema conversion errors
try {
val schema = SchemaConverters.toSqlType(descriptor, options)
// Use schema
} catch {
case e: IllegalArgumentException if e.getMessage.contains("recursive") =>
println("Recursive field depth exceeded, increase recursive.fields.max.depth")
case e: IllegalStateException if e.getMessage.contains("descriptor") =>
println("Invalid protobuf descriptor")
case e: Exception =>
println(s"Schema conversion error: ${e.getMessage}")
}
```