or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md · from-protobuf.md · index.md · schema-utilities.md · to-protobuf.md

docs/from-protobuf.md

# Protobuf to Catalyst Conversion

Functions for converting binary Protocol Buffer data to Spark SQL structured types with comprehensive configuration options and multiple descriptor source support.

## Capabilities

### from_protobuf with Descriptor File Path

Convert binary protobuf data using a descriptor file path on the filesystem.

```scala { .api }
/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value.
 * @param data the binary column containing protobuf data
 * @param messageName the protobuf message name to look for in descriptor file
 * @param descFilePath the protobuf descriptor file path (created with protoc --descriptor_set_out)
 * @param options configuration options for deserialization behavior
 * @return Column with deserialized Spark SQL data
 */
def from_protobuf(
  data: Column,
  messageName: String,
  descFilePath: String,
  options: java.util.Map[String, String]
): Column

/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value.
 * @param data the binary column containing protobuf data
 * @param messageName the protobuf message name to look for in descriptor file
 * @param descFilePath the protobuf descriptor file path (created with protoc --descriptor_set_out)
 * @return Column with deserialized Spark SQL data
 */
def from_protobuf(
  data: Column,
  messageName: String,
  descFilePath: String
): Column
```

### from_protobuf with Binary Descriptor Set

Convert binary protobuf data using a pre-loaded binary FileDescriptorSet.

```scala { .api }
/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value.
 * @param data the binary column containing protobuf data
 * @param messageName the protobuf MessageName to look for in the descriptor set
 * @param binaryFileDescriptorSet serialized protobuf descriptor (FileDescriptorSet)
 * @param options configuration options for deserialization behavior
 * @return Column with deserialized Spark SQL data
 */
def from_protobuf(
  data: Column,
  messageName: String,
  binaryFileDescriptorSet: Array[Byte],
  options: java.util.Map[String, String]
): Column

/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value.
 * @param data the binary column containing protobuf data
 * @param messageName the protobuf MessageName to look for in the descriptor set
 * @param binaryFileDescriptorSet serialized protobuf descriptor (FileDescriptorSet)
 * @return Column with deserialized Spark SQL data
 */
def from_protobuf(
  data: Column,
  messageName: String,
  binaryFileDescriptorSet: Array[Byte]
): Column
```

### from_protobuf with Java Class Name

Convert binary protobuf data using a Java class name (requires shaded protobuf classes).

```scala { .api }
/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value.
 * The jar containing Java class should be shaded with com.google.protobuf.*
 * relocated to org.sparkproject.spark_protobuf.protobuf.*
 * @param data the binary column containing protobuf data
 * @param messageClassName the full name for Protobuf Java class
 * @param options configuration options for deserialization behavior
 * @return Column with deserialized Spark SQL data
 */
def from_protobuf(
  data: Column,
  messageClassName: String,
  options: java.util.Map[String, String]
): Column

/**
 * Converts a binary column of Protobuf format into its corresponding catalyst value.
 * The jar containing Java class should be shaded with com.google.protobuf.*
 * relocated to org.sparkproject.spark_protobuf.protobuf.*
 * @param data the binary column containing protobuf data
 * @param messageClassName the full name for Protobuf Java class
 * @return Column with deserialized Spark SQL data
 */
def from_protobuf(
  data: Column,
  messageClassName: String
): Column
```

## Usage Examples

### Basic Deserialization

```scala
import org.apache.spark.sql.protobuf.functions._
import org.apache.spark.sql.functions.col

// Read binary protobuf data
val binaryDF = spark.read.format("binaryFile").load("/path/to/protobuf/files")

// Deserialize using descriptor file
val deserializedDF = binaryDF.select(
  from_protobuf(col("content"), "PersonMessage", "/path/to/person.desc") as "person_data"
)

deserializedDF.show(false)
```

### Deserialization with Options

```scala
import scala.jdk.CollectionConverters._

val options = Map(
  "mode" -> "PERMISSIVE",
  "recursive.fields.max.depth" -> "3",
  "emit.default.values" -> "true",
  "enums.as.ints" -> "false"
).asJava

val deserializedDF = binaryDF.select(
  from_protobuf(col("content"), "PersonMessage", descriptorBytes, options) as "person_data"
)
```

### Using Java Class Names

```scala
// Requires shaded protobuf JAR on classpath
val deserializedDF = binaryDF.select(
  from_protobuf(col("content"), "com.example.protos.PersonMessage") as "person_data"
)
```

### Error Handling with Parse Modes

```scala
val options = Map("mode" -> "PERMISSIVE").asJava

// PERMISSIVE mode: malformed records become null
val tolerantDF = binaryDF.select(
  from_protobuf(col("content"), "PersonMessage", descriptorFile, options) as "person_data"
).filter(col("person_data").isNotNull)

// FAILFAST mode: malformed records cause job failure
val strictOptions = Map("mode" -> "FAILFAST").asJava
val strictDF = binaryDF.select(
  from_protobuf(col("content"), "PersonMessage", descriptorFile, strictOptions) as "person_data"
)
```

### Handling Any Fields as JSON

```scala
val options = Map("convert.any.fields.to.json" -> "true").asJava

val deserializedDF = binaryDF.select(
  from_protobuf(col("content"), "MessageWithAny", descriptorBytes, options) as "data"
)

// Any fields will be converted to JSON strings instead of binary
```