Apache Hudi Hadoop common utilities and components that provide core functionality for integrating Apache Hudi with Hadoop ecosystem including file system operations, configuration management, and Hadoop-specific utilities for managing data lakehouse operations
---
Specialized utilities for working with Parquet, ORC, and HFile formats, including metadata reading, schema conversions, and format-specific optimizations. Provides comprehensive support for common big data file formats in Hadoop ecosystems.
Comprehensive utilities for working with Parquet files, including metadata access and row key filtering.
/**
* Parquet format utilities extending FileFormatUtils
* Provides Parquet-specific operations and optimizations
*/
public class ParquetUtils extends FileFormatUtils {
/**
* Read Parquet file metadata
* @param storage - HoodieStorage instance for file access
* @param parquetFilePath - Path to Parquet file
* @return ParquetMetadata containing file metadata
*/
public static ParquetMetadata readMetadata(HoodieStorage storage, StoragePath parquetFilePath);
/**
* Filter row keys from Parquet file
* Efficiently reads only row keys that match the filter set
* @param storage - HoodieStorage instance for file access
* @param filePath - Path to Parquet file
* @param filter - Set of keys to filter for
* @return Set of Pair<String, Long> containing matching keys and row numbers
*/
public Set<Pair<String, Long>> filterRowKeys(HoodieStorage storage, StoragePath filePath,
Set<String> filter);
/**
* Get compression codec name from string
* @param codecName - String name of codec
* @return CompressionCodecName enum value
*/
public static CompressionCodecName getCompressionCodecName(String codecName);
/**
* Fetch record keys with their positions
* @param storage - HoodieStorage instance for file access
* @param filePath - Path to Parquet file
* @return ClosableIterator of Pair<HoodieKey, Long> containing keys and positions
*/
public ClosableIterator<Pair<HoodieKey, Long>> fetchRecordKeysWithPositions(HoodieStorage storage,
StoragePath filePath);
/**
* Get HoodieKey iterator for records in file
* @param storage - HoodieStorage instance for file access
* @param filePath - Path to Parquet file
* @return ClosableIterator of HoodieKey records
*/
public ClosableIterator<HoodieKey> getHoodieKeyIterator(HoodieStorage storage, StoragePath filePath);
/**
* Read Parquet schema as MessageType
* @param storage - HoodieStorage instance for file access
* @param parquetFilePath - Path to Parquet file
* @return MessageType schema
*/
public MessageType readSchema(HoodieStorage storage, StoragePath parquetFilePath);
/**
* Read Avro schema from Parquet file
* @param storage - HoodieStorage instance for file access
* @param filePath - Path to Parquet file
* @return Avro Schema
*/
public Schema readAvroSchema(HoodieStorage storage, StoragePath filePath);
/**
* Read column statistics from Parquet metadata
* @param storage - HoodieStorage instance for file access
* @param filePath - Path to Parquet file
* @param columnNames - List of column names to read stats for
* @return List of HoodieColumnRangeMetadata with statistics
*/
public List<HoodieColumnRangeMetadata<Comparable>> readColumnStatsFromMetadata(HoodieStorage storage,
StoragePath filePath,
List<String> columnNames);
/**
* Read all Avro records from Parquet file
* @param storage - HoodieStorage instance for file access
* @param filePath - Path to Parquet file
* @return List of GenericRecord objects
*/
public List<GenericRecord> readAvroRecords(HoodieStorage storage, StoragePath filePath);
/**
* Read Avro records from Parquet file with specific schema
* @param storage - HoodieStorage instance for file access
* @param filePath - Path to Parquet file
* @param schema - Avro schema to use for reading
* @return List of GenericRecord objects
*/
public List<GenericRecord> readAvroRecords(HoodieStorage storage, StoragePath filePath, Schema schema);
/**
* Get row count from Parquet file
* @param storage - HoodieStorage instance for file access
* @param filePath - Path to Parquet file
* @return Number of rows in the file
*/
public long getRowCount(HoodieStorage storage, StoragePath filePath);
/**
* Get file format
* @return HoodieFileFormat.PARQUET
*/
public HoodieFileFormat getFormat();
}

Utilities for working with ORC (Optimized Row Columnar) format files with Hadoop integration.
/**
* ORC format utilities extending FileFormatUtils
* Provides ORC-specific operations and optimizations
*/
public class OrcUtils extends FileFormatUtils {
// ORC-specific file format utility methods
// Includes metadata reading, schema extraction, and optimization hints
// for working with ORC files in Hadoop environments
}

Utilities for working with HBase HFile format, providing integration with HBase storage systems.
/**
* HFile format utilities extending FileFormatUtils
* Provides HFile-specific operations for HBase integration
*/
public class HFileUtils extends FileFormatUtils {
// HFile-specific file format utility methods
// Includes HBase integration, key-value operations, and
// specialized access patterns for HFile format
}

Comprehensive utilities for converting between Avro and ORC schemas and data formats.
/**
* Utilities for Avro-ORC conversions
* Handles schema conversion and type mapping between formats
*/
public class AvroOrcUtils {
/**
* Create ORC schema from Avro schema
* @param avroSchema - Avro Schema to convert
* @return ORC TypeDescription equivalent
*/
public static TypeDescription createOrcSchema(Schema avroSchema);
/**
* Create Avro schema from ORC TypeDescription
* @param orcSchema - ORC TypeDescription to convert
* @return Avro Schema equivalent
*/
public static Schema createAvroSchema(TypeDescription orcSchema);
/**
* Create Avro schema with default values from ORC schema
* @param orcSchema - ORC TypeDescription to convert
* @param recordName - Name for the record
* @param namespace - Namespace for the schema
* @param nullable - Whether fields should be nullable
* @return Avro Schema with default values
*/
public static Schema createAvroSchemaWithDefaultValue(TypeDescription orcSchema, String recordName,
String namespace, boolean nullable);
/**
* Add value to ORC column vector
* @param type - ORC type description
* @param colVector - Column vector to add to
* @param avroSchema - Avro schema for the value
* @param value - Value to add
* @param vectorPos - Position in vector
*/
public static void addToVector(TypeDescription type, ColumnVector colVector, Schema avroSchema,
Object value, int vectorPos);
/**
* Read value from ORC column vector
* @param type - ORC type description
* @param colVector - Column vector to read from
* @param avroSchema - Avro schema for the value
* @param vectorPos - Position in vector
* @return Object value read from vector
*/
public static Object readFromVector(TypeDescription type, ColumnVector colVector, Schema avroSchema,
int vectorPos);
/**
* Get column names from ORC schema
* @param orcSchema - ORC TypeDescription
* @return List of column names in order
*/
public static List<String> getOrcColumnNames(TypeDescription orcSchema);
/**
* Get field mapping from ORC schema
* @param orcSchema - ORC TypeDescription
* @return Map of field names to TypeDescription
*/
public static Map<String, TypeDescription> getOrcFields(TypeDescription orcSchema);
}

Utilities for optimizing Hadoop configuration for different file formats and operations.
/**
* Hadoop configuration utilities
* Provides optimized configurations for different scenarios
*/
public class HadoopConfigUtils {
/**
* Get optimized configuration for reading operations
* @param conf - Base Hadoop configuration
* @return Configuration optimized for reading
*/
public static Configuration getReaderConf(Configuration conf);
/**
* Add shutdown hook for configuration cleanup
* @param conf - Hadoop configuration
* @return Configuration with shutdown hook added
*/
public static Configuration addShutdownHook(Configuration conf);
}

Common patterns for converting between different schema formats:
// Example Avro schema
{
"type": "record",
"name": "User",
"fields": [
{"name": "id", "type": "long"},
{"name": "name", "type": "string"},
{"name": "email", "type": ["null", "string"], "default": null},
{"name": "scores", "type": {"type": "array", "items": "int"}},
{"name": "metadata", "type": {"type": "map", "values": "string"}}
]
}
// Converts to ORC TypeDescription:
// struct<id:bigint,name:string,email:string,scores:array<int>,metadata:map<string,string>>

// ORC TypeDescription: struct<product_id:bigint,product_name:string,price:decimal(10,2)>
//
// Converts to Avro schema:
{
"type": "record",
"name": "Product",
"fields": [
{"name": "product_id", "type": "long"},
{"name": "product_name", "type": "string"},
{"name": "price", "type": {"type": "bytes", "logicalType": "decimal", "precision": 10, "scale": 2}}
]
}

Common type mappings between Avro and ORC formats:
| Avro Type | ORC Type | Notes |
|---|---|---|
| boolean | boolean | Direct mapping |
| int | int | Direct mapping |
| long | bigint | Direct mapping |
| float | float | Direct mapping |
| double | double | Direct mapping |
| string | string | Direct mapping |
| bytes | binary | Direct mapping |
| array<T> | array<T> | Recursive type mapping |
| map<string,T> | map<string,T> | Recursive type mapping |
| record | struct | Field-by-field mapping |
| union | Complex | Requires special handling |
| enum | string | Converted to string representation |
| fixed | binary | Fixed-length binary |
Integration utilities for HFile-based bootstrap indexing with HBase compatibility.
/**
* HFile-based bootstrap index utilities
* Provides integration with HBase for efficient indexing
*/
public class HFileBootstrapIndex {
/**
* Key-value comparator for HBase integration
* Provides proper ordering for HFile operations
*/
public static class HoodieKVComparator extends CellComparatorImpl {
// Specialized comparator for Hudi key-value pairs
// Ensures proper ordering in HFile structures
}
}

Specialized classes for reading and writing HFile-based bootstrap indexes.
/**
* HBase HFile bootstrap index reader
* Provides efficient reading of bootstrap index data
*/
public class HBaseHFileBootstrapIndexReader {
// Methods for reading bootstrap index data from HFiles
// Optimized for HBase storage patterns
}
/**
* HBase HFile bootstrap index writer
* Provides efficient writing of bootstrap index data
*/
public class HBaseHFileBootstrapIndexWriter {
// Methods for writing bootstrap index data to HFiles
// Optimized for HBase storage patterns and bulk loading
}

Usage Examples:
import org.apache.hudi.common.util.*;
import org.apache.avro.Schema;
import org.apache.orc.TypeDescription;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
// Working with Parquet files
HoodieStorage storage = new HoodieHadoopStorage(storagePath, storageConf);
StoragePath parquetFile = new StoragePath("/data/table/partition/file.parquet");
// Read Parquet metadata
ParquetMetadata metadata = ParquetUtils.readMetadata(storage, parquetFile);
System.out.println("Number of rows: " + metadata.getBlocks().get(0).getRowCount());
System.out.println("Number of columns: " + metadata.getFileMetaData().getSchema().getColumns().size());
// Filter row keys from Parquet file
Set<String> keysToFind = Set.of("key1", "key2", "key3");
// filterRowKeys is an instance method (not static), so an instance is required
ParquetUtils parquetUtils = new ParquetUtils();
Set<Pair<String, Long>> foundKeys = parquetUtils.filterRowKeys(storage, parquetFile, keysToFind);
for (Pair<String, Long> keyRow : foundKeys) {
System.out.println("Found key: " + keyRow.getLeft() + " at row: " + keyRow.getRight());
}
// Schema conversion between Avro and ORC
String avroSchemaJson = """
{
"type": "record",
"name": "Customer",
"fields": [
{"name": "customer_id", "type": "long"},
{"name": "name", "type": "string"},
{"name": "email", "type": ["null", "string"], "default": null},
{"name": "orders", "type": {"type": "array", "items": "string"}},
{"name": "preferences", "type": {"type": "map", "values": "string"}}
]
}
""";
Schema avroSchema = new Schema.Parser().parse(avroSchemaJson);
// Convert Avro to ORC
TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(avroSchema);
System.out.println("ORC Schema: " + orcSchema.toString());
// Output: struct<customer_id:bigint,name:string,email:string,orders:array<string>,preferences:map<string,string>>
// Convert back to Avro
Schema convertedBackSchema = AvroOrcUtils.createAvroSchema(orcSchema);
System.out.println("Converted back to Avro: " + convertedBackSchema.toString(true));
// Get ORC column information
List<String> columnNames = AvroOrcUtils.getOrcColumnNames(orcSchema);
System.out.println("Column names: " + columnNames);
// Output: [customer_id, name, email, orders, preferences]
Map<String, TypeDescription> fields = AvroOrcUtils.getOrcFields(orcSchema);
for (Map.Entry<String, TypeDescription> field : fields.entrySet()) {
System.out.println("Field: " + field.getKey() + ", Type: " + field.getValue().getCategory());
}
// Optimized Hadoop configuration for reading
Configuration baseConf = new Configuration();
Configuration readerConf = HadoopConfigUtils.getReaderConf(baseConf);
// Add shutdown hook for cleanup
Configuration confWithHook = HadoopConfigUtils.addShutdownHook(readerConf);
// Working with different file formats
StoragePath orcFile = new StoragePath("/data/table/partition/file.orc");
StoragePath hfile = new StoragePath("/data/index/region/family/hfile");
// ORC operations using OrcUtils
// (Specific methods depend on OrcUtils implementation)
// HFile operations using HFileUtils
// (Specific methods depend on HFileUtils implementation)
// Complex schema conversion example
String complexAvroSchema = """
{
"type": "record",
"name": "Transaction",
"fields": [
{"name": "id", "type": "string"},
{"name": "amount", "type": {"type": "bytes", "logicalType": "decimal", "precision": 10, "scale": 2}},
{"name": "timestamp", "type": {"type": "long", "logicalType": "timestamp-millis"}},
{"name": "metadata", "type": {
"type": "record",
"name": "TransactionMetadata",
"fields": [
{"name": "source", "type": "string"},
{"name": "tags", "type": {"type": "array", "items": "string"}}
]
}}
]
}
""";
Schema complexSchema = new Schema.Parser().parse(complexAvroSchema);
TypeDescription complexOrcSchema = AvroOrcUtils.createOrcSchema(complexSchema);
System.out.println("Complex ORC Schema: " + complexOrcSchema.toString());

Install with Tessl CLI
npx tessl i tessl/maven-org-apache-hudi--hudi-hadoop-common