Apache Hudi Hadoop common utilities and components that provide core functionality for integrating Apache Hudi with the Hadoop ecosystem, including file system operations, configuration management, and Hadoop-specific utilities for managing data lakehouse operations.
—
Factory pattern for creating format-specific file readers and writers with support for Avro, Parquet, and ORC formats in Hadoop environments. Provides comprehensive I/O capabilities for reading and writing structured data files.
Primary I/O factory for creating Hadoop-based file readers and writers with format-specific optimizations.
/**
* Factory for creating Hadoop-based file readers and writers
* Supports multiple record types and file formats
*/
public class HoodieHadoopIOFactory implements HoodieIOFactory {
/** Create I/O factory with storage backend */
public HoodieHadoopIOFactory(HoodieStorage storage);
/** Get reader factory for specific record type */
public HoodieFileReaderFactory getReaderFactory(HoodieRecord.HoodieRecordType recordType);
/** Get writer factory for specific record type */
public HoodieFileWriterFactory getWriterFactory(HoodieRecord.HoodieRecordType recordType);
/** Get format utilities for specific file format */
public FileFormatUtils getFileFormatUtils(HoodieFileFormat fileFormat);
/** Get storage instance for path */
public HoodieStorage getStorage(StoragePath storagePath);
/** Get storage instance with retry configuration */
public HoodieStorage getStorage(StoragePath path, boolean enableRetry,
long maxRetryIntervalMs, int maxRetryNumbers,
long initialRetryIntervalMs, String retryExceptions,
ConsistencyGuard consistencyGuard);
}

Factory for creating Avro-based file readers supporting Parquet format with Avro serialization.
/**
* Factory for creating Avro file readers
* Specialized for Parquet files with Avro schema
*/
public class HoodieAvroFileReaderFactory implements HoodieFileReaderFactory {
/** Create reader factory with storage backend */
public HoodieAvroFileReaderFactory(HoodieStorage storage);
/** Create Parquet file reader for Avro records */
public HoodieAvroFileReader newParquetFileReader(HoodieStorage storage, StoragePath path);
}

Factory for creating Avro-based file writers supporting multiple output formats.
/**
* Factory for creating Avro file writers
* Supports Parquet and ORC output formats
*/
public class HoodieAvroFileWriterFactory implements HoodieFileWriterFactory {
/** Create writer factory with storage backend */
public HoodieAvroFileWriterFactory(HoodieStorage storage);
}

Avro-based Parquet file reader providing schema evolution and efficient columnar access.
/**
* Avro-based Parquet file reader
* Supports schema evolution and columnar data access
*/
public class HoodieAvroParquetReader implements HoodieFileReader {
/** Create reader for Parquet file with Avro schema */
public HoodieAvroParquetReader(HoodieStorage storage, StoragePath filePath);
/** Create reader with explicit writer schema */
public HoodieAvroParquetReader(HoodieStorage storage, StoragePath filePath,
Option<Schema> writerSchemaOpt);
/** Get the schema of the file */
public Schema getSchema();
/** Get iterator for records with custom reader schema */
public ClosableIterator<IndexedRecord> getRecordIterator(Schema readerSchema);
/** Get iterator for records with file schema */
public ClosableIterator<IndexedRecord> getRecordIterator();
/** Close the reader and release resources */
public void close();
}

Avro-based Parquet file writer with bloom filter integration and metadata support.
/**
* Avro-based Parquet file writer
* Supports bloom filters and custom metadata
*/
public class HoodieAvroParquetWriter implements HoodieFileWriter {
/** Create writer with configuration and schema */
public HoodieAvroParquetWriter(StoragePath file, HoodieConfig config, Schema schema,
Task task, Option<BloomFilter> bloomFilterOpt,
boolean populateMetaFields);
/** Check if writer can accept more data */
public boolean canWrite();
/** Write Avro record with Hudi metadata */
public void writeAvroWithMetadata(HoodieKey key, IndexedRecord avroRecord);
/** Write Avro record with record key */
public void writeAvro(String recordKey, IndexedRecord record);
/** Close writer and return status */
public WriteStatus close();
/** Get current write status */
public WriteStatus getWriteStatus();
/** Get number of bytes written */
public long getBytesWritten();
}

Avro-based ORC file reader supporting schema evolution and efficient columnar access.
/**
* Avro-based ORC file reader
* Supports schema evolution and columnar data access
*/
public class HoodieAvroOrcReader implements HoodieFileReader {
/** Create reader with explicit writer schema */
public HoodieAvroOrcReader(HoodieStorage storage, StoragePath filePath,
Option<Schema> writerSchemaOpt);
/** Get iterator for records with custom reader schema */
public ClosableIterator<IndexedRecord> getRecordIterator(Schema readerSchema);
/** Get iterator for records with file schema */
public ClosableIterator<IndexedRecord> getRecordIterator();
/** Close the reader and release resources */
public void close();
/** Get the schema of the file */
public Schema getSchema();
}

Avro-based ORC file writer with bloom filter integration and metadata support.
/**
* Avro-based ORC file writer
* Supports bloom filters and custom metadata
*/
public class HoodieAvroOrcWriter implements HoodieFileWriter {
/** Create writer with configuration and schema */
public HoodieAvroOrcWriter(StoragePath filePath, HoodieConfig config, Schema schema,
Task task, boolean populateMetaFields,
Option<BloomFilter> bloomFilterOpt);
/** Check if writer can accept more data */
public boolean canWrite();
/** Write Avro record with Hudi metadata */
public void writeAvroWithMetadata(HoodieKey key, IndexedRecord avroRecord);
/** Write Avro record with record key */
public void writeAvro(String recordKey, IndexedRecord record);
/** Close writer and return status */
public WriteStatus close();
/** Get current write status */
public WriteStatus getWriteStatus();
/** Get number of bytes written */
public long getBytesWritten();
}

Utilities for working with the HBase HFile format in Hadoop environments.
/**
* Utilities for working with HFile format
* Provides HBase integration capabilities
*/
public class HoodieHFileUtils {
/** Create HFile reader with configuration */
public static HFile.Reader createHFileReader(FileSystem fs, Path path,
CacheConfig cacheConf, Configuration conf);
/** Get optimized configuration for HFile reading */
public static Configuration getHFileReaderConfiguration(Configuration conf);
/** Check if path points to an HFile */
public static boolean isHFile(StoragePath path);
}

Parquet write support for Avro records with bloom filter integration.
/**
* Parquet write support for Avro with bloom filter integration
* Extends standard Parquet writing with Hudi-specific features
*/
public class HoodieAvroWriteSupport extends AvroWriteSupport<IndexedRecord> {
/** Create write support with schema and bloom filter */
public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema,
Option<BloomFilter> bloomFilterOpt, Properties properties);
/** Finalize write context with metadata */
public WriteSupport.FinalizedWriteContext finalizeWrite();
/** Add record key to bloom filter */
public void add(String recordKey);
/** Add custom footer metadata */
public void addFooterMetadata(String key, String value);
}

Builder pattern for creating Hoodie-specific Avro Parquet readers.
/**
* Builder for Hoodie Avro Parquet readers
* Provides configuration and customization options
*/
public class HoodieAvroParquetReaderBuilder<T> extends ParquetReaderBuilder<T> {
/** Create builder with file path */
public HoodieAvroParquetReaderBuilder(Path path);
/** Create builder with input file */
public HoodieAvroParquetReaderBuilder(InputFile file);
/** Set Hadoop configuration */
public HoodieAvroParquetReaderBuilder<T> withConf(Configuration conf);
/** Build configured ParquetReader */
public ParquetReader<T> build();
}

Parquet read support for Avro records with Hudi-specific optimizations.
/**
* Parquet read support for Avro records
* Extends standard AvroReadSupport with Hudi optimizations
*/
public class HoodieAvroReadSupport extends AvroReadSupport {
// Extends AvroReadSupport with Hoodie-specific functionality
// for reading Parquet files with Avro schema
}

Usage Examples:
import org.apache.hudi.io.hadoop.*;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
// Set up I/O factory
HoodieHadoopStorage storage = new HoodieHadoopStorage(storagePath, storageConf);
HoodieHadoopIOFactory ioFactory = new HoodieHadoopIOFactory(storage);
// Reading Parquet files
StoragePath parquetFile = new StoragePath("/data/table1/file.parquet");
// Create Avro Parquet reader
HoodieAvroParquetReader reader = new HoodieAvroParquetReader(storage, parquetFile);
Schema schema = reader.getSchema();
// Read records
try (ClosableIterator<IndexedRecord> iterator = reader.getRecordIterator()) {
while (iterator.hasNext()) {
IndexedRecord record = iterator.next();
// Process record
}
}
reader.close();
// Writing Parquet files
Schema writeSchema = new Schema.Parser().parse(schemaString);
StoragePath outputFile = new StoragePath("/data/table1/output.parquet");
HoodieAvroParquetWriter writer = new HoodieAvroParquetWriter(
outputFile,
hoodieConfig,
writeSchema,
task,
Option.empty(), // No bloom filter
true // Populate meta fields
);
// Write records
for (IndexedRecord record : records) {
writer.writeAvro("key", record);
}
long bytesWritten = writer.getBytesWritten();
WriteStatus status = writer.close();
System.out.println("Bytes written: " + bytesWritten);
// Working with ORC files
HoodieAvroOrcReader orcReader = new HoodieAvroOrcReader(storage, orcFilePath, Option.empty());
try (ClosableIterator<IndexedRecord> iterator = orcReader.getRecordIterator()) {
while (iterator.hasNext()) {
IndexedRecord record = iterator.next();
// Process ORC record
}
}
orcReader.close();
// HFile operations
boolean isHFile = HoodieHFileUtils.isHFile(somePath);
if (isHFile) {
Configuration hfileConf = HoodieHFileUtils.getHFileReaderConfiguration(hadoopConf);
// Work with HFile using optimized configuration
}

Install with Tessl CLI
npx tessl i tessl/maven-org-apache-hudi--hudi-hadoop-common