
# Partition Management and Utilities

Advanced partition handling utilities for efficient data access, partition pruning, and metadata management in partitioned Hive tables. These utilities provide comprehensive support for partition discovery, serialization, and manipulation in both batch and streaming scenarios.
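
For orientation, a partition spec is a set of partition-column/value pairs that, in the conventional Hive layout, corresponds to a directory suffix under the table location. A minimal sketch of that mapping (illustrative values only, not tied to any real warehouse):

```java
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.stream.Collectors;

// A partition spec such as {year=2024, month=01} conventionally maps to a
// directory suffix under the table location, e.g. ".../events/year=2024/month=01".
Map<String, String> spec = new LinkedHashMap<>();
spec.put("year", "2024");
spec.put("month", "01");

String pathSuffix = spec.entrySet().stream()
        .map(e -> e.getKey() + "=" + e.getValue())
        .collect(Collectors.joining("/"));

System.out.println(pathSuffix); // year=2024/month=01
```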

## Capabilities

### HiveTablePartition

Core data structure representing a Hive table partition, or the whole table for non-partitioned tables.

```java { .api }
/**
 * Represents a Hive table partition or whole table
 * Encapsulates storage descriptor and partition specification
 * Used for partition-aware data processing and metadata operations
 */
@PublicEvolving
public class HiveTablePartition implements Serializable {

    /**
     * Create partition representation for non-partitioned table
     * @param storageDescriptor Hive storage descriptor with location and format info
     * @param tableParameters Table-level parameters and properties
     * @return HiveTablePartition representing the whole table
     */
    public static HiveTablePartition ofTable(StorageDescriptor storageDescriptor,
                                             Map<String, String> tableParameters);

    /**
     * Create partition representation for partitioned table
     * @param storageDescriptor Partition storage descriptor with location and format info
     * @param partitionSpec Partition key-value specification (e.g., {year=2024, month=01})
     * @param tableParameters Table-level parameters and properties
     * @return HiveTablePartition representing the specific partition
     */
    public static HiveTablePartition ofPartition(StorageDescriptor storageDescriptor,
                                                 Map<String, String> partitionSpec,
                                                 Map<String, String> tableParameters);

    /**
     * Get storage descriptor containing location, input/output formats, and SerDe info
     * @return StorageDescriptor with partition storage details
     */
    public StorageDescriptor getStorageDescriptor();

    /**
     * Get partition specification as key-value pairs
     * @return Map of partition keys to values, empty for non-partitioned tables
     */
    public Map<String, String> getPartitionSpec();

    /**
     * Get table-level parameters and properties
     * @return Map of table parameters
     */
    public Map<String, String> getTableParameters();

    /**
     * Check if this represents a partitioned table entry
     * @return true if the partition has a non-empty partition specification
     */
    public boolean isPartitioned();

    /**
     * Get partition location path
     * @return String path to the partition data location
     */
    public String getLocation();

    /**
     * Get input format class name
     * @return Input format class for reading partition data
     */
    public String getInputFormat();

    /**
     * Get output format class name
     * @return Output format class for writing partition data
     */
    public String getOutputFormat();

    /**
     * Get SerDe (Serializer/Deserializer) class name
     * @return SerDe class for data serialization
     */
    public String getSerDe();
}
```

**Usage Examples:**

```java
import org.apache.flink.connectors.hive.HiveTablePartition;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;

import java.util.Map;

// Create partition for a partitioned table
StorageDescriptor partitionSd = new StorageDescriptor();
partitionSd.setLocation("hdfs://namenode:9000/warehouse/events/year=2024/month=01");
partitionSd.setInputFormat("org.apache.hadoop.mapred.TextInputFormat");
partitionSd.setOutputFormat("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat");

Map<String, String> partitionSpec = Map.of(
    "year", "2024",
    "month", "01"
);

Map<String, String> tableParams = Map.of(
    "table.type", "EXTERNAL_TABLE",
    "transient_lastDdlTime", "1640995200"
);

HiveTablePartition partition = HiveTablePartition.ofPartition(
    partitionSd,
    partitionSpec,
    tableParams
);

System.out.println("Partition location: " + partition.getLocation());
System.out.println("Partition spec: " + partition.getPartitionSpec());
System.out.println("Is partitioned: " + partition.isPartitioned()); // true

// Create a representation for a non-partitioned table
StorageDescriptor tableSd = new StorageDescriptor();
tableSd.setLocation("hdfs://namenode:9000/warehouse/users");
tableSd.setInputFormat("org.apache.hadoop.mapred.TextInputFormat");

HiveTablePartition wholeTable = HiveTablePartition.ofTable(tableSd, tableParams);
System.out.println("Table location: " + wholeTable.getLocation());
System.out.println("Is partitioned: " + wholeTable.isPartitioned()); // false
```
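
Because `HiveTablePartition` implements `Serializable`, a single instance can also be round-tripped with plain Java serialization, for example when shipping partition metadata to downstream tasks without going through `HivePartitionUtils`. A minimal sketch, reusing the `partition` object from the example above:

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

// Write the partition to a byte array with standard Java serialization
ByteArrayOutputStream bos = new ByteArrayOutputStream();
try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
    oos.writeObject(partition);
}

// Read it back and verify the partition spec survived the round trip
try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray()))) {
    HiveTablePartition restored = (HiveTablePartition) ois.readObject();
    System.out.println("Restored spec: " + restored.getPartitionSpec());
}
```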

### HivePartitionUtils

Utility class providing comprehensive partition management operations including discovery, serialization, and metadata handling.

```java { .api }
/**
 * Utility class for Hive partition operations
 * Provides methods for partition discovery, serialization, and metadata management
 */
public class HivePartitionUtils {

    /**
     * Get all partitions for a table, including metadata and storage descriptors
     * @param jobConf Hadoop JobConf with Hive and HDFS configuration
     * @param hiveVersion Hive version for compatibility
     * @param tablePath Table path (database.table)
     * @param partitionColNames List of partition column names for validation
     * @param remainingPartitions Partition specifications to restrict the result to;
     *                            null fetches all partitions
     * @return List of all table partitions with complete metadata
     */
    public static List<HiveTablePartition> getAllPartitions(JobConf jobConf,
                                                            String hiveVersion,
                                                            ObjectPath tablePath,
                                                            List<String> partitionColNames,
                                                            List<Map<String, String>> remainingPartitions);

    /**
     * Serialize a list of HiveTablePartition objects to a list of byte arrays
     * Used for efficient partition metadata transfer and caching
     * @param partitions List of partitions to serialize
     * @return List of serialized byte arrays, one per partition
     */
    public static List<byte[]> serializeHiveTablePartition(List<HiveTablePartition> partitions);

    /**
     * Deserialize a list of byte arrays back to a list of HiveTablePartition objects
     * @param partitionBytes List of serialized partition data
     * @return List of deserialized partitions
     */
    public static List<HiveTablePartition> deserializeHiveTablePartition(List<byte[]> partitionBytes);
}
```

**Usage Examples:**

```java
import org.apache.flink.connectors.hive.HiveTablePartition;
import org.apache.flink.connectors.hive.util.HivePartitionUtils;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.hadoop.mapred.JobConf;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

// Configure the Hadoop JobConf
JobConf jobConf = new JobConf();
jobConf.set("hive.metastore.uris", "thrift://metastore:9083");
jobConf.set("fs.defaultFS", "hdfs://namenode:9000");

// Get all partitions for a table
ObjectPath tablePath = new ObjectPath("sales", "transactions");
List<String> partitionColumns = Arrays.asList("year", "month", "day");

List<HiveTablePartition> allPartitions = HivePartitionUtils.getAllPartitions(
    jobConf,
    "2.3.9", // hiveVersion
    tablePath,
    partitionColumns,
    null // remainingPartitions: null fetches all partitions
);

System.out.println("Found " + allPartitions.size() + " partitions");

// Filter partitions for a specific date range (Q1 2024)
List<HiveTablePartition> filteredPartitions = allPartitions.stream()
    .filter(p -> {
        Map<String, String> spec = p.getPartitionSpec();
        return "2024".equals(spec.get("year")) &&
               Integer.parseInt(spec.get("month")) >= 1 &&
               Integer.parseInt(spec.get("month")) <= 3;
    })
    .collect(Collectors.toList());

System.out.println("Q1 2024 partitions: " + filteredPartitions.size());

// Serialize partitions for caching
List<byte[]> serializedPartitions = HivePartitionUtils.serializeHiveTablePartition(filteredPartitions);
System.out.println("Serialized " + serializedPartitions.size() + " partitions");

// Deserialize partitions
List<HiveTablePartition> deserializedPartitions = HivePartitionUtils.deserializeHiveTablePartition(
    serializedPartitions
);

System.out.println("Deserialized " + deserializedPartitions.size() + " partitions");
```
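
When the partitions of interest are known up front, the `remainingPartitions` argument can push that selection into the metastore lookup instead of filtering the full list afterwards, as the stream filter above does. A sketch, assuming the `jobConf`, `tablePath`, and `partitionColumns` from the example above (the partition specs are illustrative):

```java
import java.util.Arrays;
import java.util.List;
import java.util.Map;

// Fetch only two specific partitions instead of listing everything
List<Map<String, String>> remaining = Arrays.asList(
    Map.of("year", "2024", "month", "01", "day", "15"),
    Map.of("year", "2024", "month", "01", "day", "16")
);

List<HiveTablePartition> selected = HivePartitionUtils.getAllPartitions(
    jobConf, "2.3.9", tablePath, partitionColumns, remaining);

System.out.println("Selected " + selected.size() + " partitions");
```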
