CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-org-apache-hudi--hudi-hadoop-common

Apache Hudi Hadoop common utilities and components that provide core functionality for integrating Apache Hudi with the Hadoop ecosystem, including file system operations, configuration management, and Hadoop-specific utilities for managing data lakehouse operations.

Pending
Overview
Eval results
Files

docs/io-operations.md

I/O Operations

Factory pattern for creating format-specific file readers and writers with support for Avro, Parquet, and ORC formats in Hadoop environments. Provides comprehensive I/O capabilities for reading and writing structured data files.

Capabilities

HoodieHadoopIOFactory

Primary I/O factory for creating Hadoop-based file readers and writers with format-specific optimizations.

/**
 * Factory for creating Hadoop-based file readers and writers
 * Supports multiple record types and file formats
 */
public class HoodieHadoopIOFactory implements HoodieIOFactory {
    
    /** Create I/O factory with storage backend */
    public HoodieHadoopIOFactory(HoodieStorage storage);
    
    /** Get reader factory for specific record type */
    public HoodieFileReaderFactory getReaderFactory(HoodieRecord.HoodieRecordType recordType);
    
    /** Get writer factory for specific record type */
    public HoodieFileWriterFactory getWriterFactory(HoodieRecord.HoodieRecordType recordType);
    
    /** Get format utilities for specific file format */
    public FileFormatUtils getFileFormatUtils(HoodieFileFormat fileFormat);
    
    /** Get storage instance for path */
    public HoodieStorage getStorage(StoragePath storagePath);
    
    /** Get storage instance with retry configuration */
    public HoodieStorage getStorage(StoragePath path, boolean enableRetry, 
                                   long maxRetryIntervalMs, int maxRetryNumbers, 
                                   long initialRetryIntervalMs, String retryExceptions, 
                                   ConsistencyGuard consistencyGuard);
}

Avro File Reader Factory

Factory for creating Avro-based file readers supporting Parquet format with Avro serialization.

/**
 * Factory for creating Avro file readers
 * Specialized for Parquet files with Avro schema
 */
public class HoodieAvroFileReaderFactory implements HoodieFileReaderFactory {
    
    /** Create reader factory with storage backend */
    public HoodieAvroFileReaderFactory(HoodieStorage storage);
    
    /** Create Parquet file reader for Avro records */
    public HoodieAvroFileReader newParquetFileReader(HoodieStorage storage, StoragePath path);
}

Avro File Writer Factory

Factory for creating Avro-based file writers supporting multiple output formats.

/**
 * Factory for creating Avro file writers
 * Supports Parquet and ORC output formats
 */
public class HoodieAvroFileWriterFactory implements HoodieFileWriterFactory {
    
    /** Create writer factory with storage backend */
    public HoodieAvroFileWriterFactory(HoodieStorage storage);
}

Avro Parquet Reader

Avro-based Parquet file reader providing schema evolution and efficient columnar access.

/**
 * Avro-based Parquet file reader
 * Supports schema evolution and columnar data access
 */
public class HoodieAvroParquetReader implements HoodieFileReader {
    
    /** Create reader for Parquet file with Avro schema */
    public HoodieAvroParquetReader(HoodieStorage storage, StoragePath filePath);
    
    /** Create reader with explicit writer schema */
    public HoodieAvroParquetReader(HoodieStorage storage, StoragePath filePath, 
                                  Option<Schema> writerSchemaOpt);
    
    /** Get the schema of the file */
    public Schema getSchema();
    
    /** Get iterator for records with custom reader schema */
    public ClosableIterator<IndexedRecord> getRecordIterator(Schema readerSchema);
    
    /** Get iterator for records with file schema */
    public ClosableIterator<IndexedRecord> getRecordIterator();
    
    /** Close the reader and release resources */
    public void close();
}

Avro Parquet Writer

Avro-based Parquet file writer with bloom filter integration and metadata support.

/**
 * Avro-based Parquet file writer
 * Supports bloom filters and custom metadata
 */
public class HoodieAvroParquetWriter implements HoodieFileWriter {
    
    /** Create writer with configuration and schema */
    public HoodieAvroParquetWriter(StoragePath file, HoodieConfig config, Schema schema, 
                                  Task task, Option<BloomFilter> bloomFilterOpt, 
                                  boolean populateMetaFields);
    
    /** Check if writer can accept more data */
    public boolean canWrite();
    
    /** Write Avro record with Hudi metadata */
    public void writeAvroWithMetadata(HoodieKey key, IndexedRecord avroRecord);
    
    /** Write Avro record with record key */
    public void writeAvro(String recordKey, IndexedRecord record);
    
    /** Close writer and return status */
    public WriteStatus close();
    
    /** Get current write status */
    public WriteStatus getWriteStatus();
    
    /** Get number of bytes written */
    public long getBytesWritten();
}

Avro ORC Reader

Avro-based ORC file reader supporting schema evolution and efficient columnar access.

/**
 * Avro-based ORC file reader
 * Supports schema evolution and columnar data access
 */
public class HoodieAvroOrcReader implements HoodieFileReader {
    
    /** Create reader with explicit writer schema */
    public HoodieAvroOrcReader(HoodieStorage storage, StoragePath filePath, 
                              Option<Schema> writerSchemaOpt);
    
    /** Get iterator for records with custom reader schema */
    public ClosableIterator<IndexedRecord> getRecordIterator(Schema readerSchema);
    
    /** Get iterator for records with file schema */
    public ClosableIterator<IndexedRecord> getRecordIterator();
    
    /** Close the reader and release resources */
    public void close();
    
    /** Get the schema of the file */
    public Schema getSchema();
}

Avro ORC Writer

Avro-based ORC file writer with bloom filter integration and metadata support.

/**
 * Avro-based ORC file writer
 * Supports bloom filters and custom metadata
 */
public class HoodieAvroOrcWriter implements HoodieFileWriter {
    
    /** Create writer with configuration and schema */
    public HoodieAvroOrcWriter(StoragePath filePath, HoodieConfig config, Schema schema, 
                              Task task, boolean populateMetaFields, 
                              Option<BloomFilter> bloomFilterOpt);
    
    /** Check if writer can accept more data */
    public boolean canWrite();
    
    /** Write Avro record with Hudi metadata */
    public void writeAvroWithMetadata(HoodieKey key, IndexedRecord avroRecord);
    
    /** Write Avro record with record key */
    public void writeAvro(String recordKey, IndexedRecord record);
    
    /** Close writer and return status */
    public WriteStatus close();
    
    /** Get current write status */
    public WriteStatus getWriteStatus();
    
    /** Get number of bytes written */
    public long getBytesWritten();
}

HFile Utilities

Utilities for working with HBase HFile format in Hadoop environments.

/**
 * Utilities for working with HFile format
 * Provides HBase integration capabilities
 */
public class HoodieHFileUtils {
    
    /** Create HFile reader with configuration */
    public static HFile.Reader createHFileReader(FileSystem fs, Path path, 
                                                CacheConfig cacheConf, Configuration conf);
    
    /** Get optimized configuration for HFile reading */
    public static Configuration getHFileReaderConfiguration(Configuration conf);
    
    /** Check if path points to an HFile */
    public static boolean isHFile(StoragePath path);
}

Parquet Write Support

Parquet write support for Avro records with bloom filter integration.

/**
 * Parquet write support for Avro with bloom filter integration
 * Extends standard Parquet writing with Hudi-specific features
 */
public class HoodieAvroWriteSupport extends AvroWriteSupport<IndexedRecord> {
    
    /** Create write support with schema and bloom filter */
    public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema, 
                                 Option<BloomFilter> bloomFilterOpt, Properties properties);
    
    /** Finalize write context with metadata */
    public WriteSupport.FinalizedWriteContext finalizeWrite();
    
    /** Add record key to bloom filter */
    public void add(String recordKey);
    
    /** Add custom footer metadata */
    public void addFooterMetadata(String key, String value);
}

Parquet Reader Builder

Builder pattern for creating Hoodie-specific Avro Parquet readers.

/**
 * Builder for Hoodie Avro Parquet readers
 * Provides configuration and customization options
 */
public class HoodieAvroParquetReaderBuilder<T> extends ParquetReaderBuilder<T> {
    
    /** Create builder with file path */
    public HoodieAvroParquetReaderBuilder(Path path);
    
    /** Create builder with input file */
    public HoodieAvroParquetReaderBuilder(InputFile file);
    
    /** Set Hadoop configuration */
    public HoodieAvroParquetReaderBuilder<T> withConf(Configuration conf);
    
    /** Build configured ParquetReader */
    public ParquetReader<T> build();
}

Parquet Read Support

Parquet read support for Avro records with Hudi-specific optimizations.

/**
 * Parquet read support for Avro records
 * Extends standard AvroReadSupport with Hudi optimizations
 */
public class HoodieAvroReadSupport extends AvroReadSupport {
    // Extends AvroReadSupport with Hoodie-specific functionality
    // for reading Parquet files with Avro schema
}

Usage Examples:

import org.apache.hudi.io.hadoop.*;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;

// Set up I/O factory
HoodieHadoopStorage storage = new HoodieHadoopStorage(storagePath, storageConf);
HoodieHadoopIOFactory ioFactory = new HoodieHadoopIOFactory(storage);

// Reading Parquet files
StoragePath parquetFile = new StoragePath("/data/table1/file.parquet");

// Create Avro Parquet reader
HoodieAvroParquetReader reader = new HoodieAvroParquetReader(storage, parquetFile);
Schema schema = reader.getSchema();

// Read records
try (ClosableIterator<IndexedRecord> iterator = reader.getRecordIterator()) {
    while (iterator.hasNext()) {
        IndexedRecord record = iterator.next();
        // Process record
    }
}
reader.close();

// Writing Parquet files
Schema writeSchema = new Schema.Parser().parse(schemaString);
StoragePath outputFile = new StoragePath("/data/table1/output.parquet");

HoodieAvroParquetWriter writer = new HoodieAvroParquetWriter(
    outputFile, 
    hoodieConfig, 
    writeSchema, 
    task, 
    Option.empty(), // No bloom filter
    true  // Populate meta fields
);

// Write records
for (IndexedRecord record : records) {
    writer.writeAvro("key", record);
}

WriteStatus status = writer.close();
System.out.println("Bytes written: " + writer.getBytesWritten());

// Working with ORC files
HoodieAvroOrcReader orcReader = new HoodieAvroOrcReader(storage, orcFilePath, Option.empty());
try (ClosableIterator<IndexedRecord> iterator = orcReader.getRecordIterator()) {
    while (iterator.hasNext()) {
        IndexedRecord record = iterator.next();
        // Process ORC record
    }
}
orcReader.close();

// HFile operations
boolean isHFile = HoodieHFileUtils.isHFile(somePath);
if (isHFile) {
    Configuration hfileConf = HoodieHFileUtils.getHFileReaderConfiguration(hadoopConf);
    // Work with HFile using optimized configuration
}

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-hudi--hudi-hadoop-common

docs

configuration-management.md

filesystem-utilities.md

format-utilities.md

index.md

io-operations.md

storage-operations.md

tile.json