Apache Hudi Hadoop Common: utilities and components that provide core functionality for integrating Apache Hudi with the Hadoop ecosystem, including file system operations, configuration management, and Hadoop-specific utilities for managing data lakehouse operations.
npx @tessl/cli install tessl/maven-org-apache-hudi--hudi-hadoop-common@1.0.0

Apache Hudi Hadoop Common provides essential Hadoop integration components for Apache Hudi, an open data lakehouse platform. This library contains core utilities and abstractions that enable Hudi to work seamlessly with the Hadoop ecosystem, including HDFS file system operations, configuration management through DFSPropertiesConfiguration, and format-specific utilities for the Parquet, ORC, and HFile formats.

<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-common</artifactId>
<version>1.0.2</version>
</dependency>

import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration;
import org.apache.hudi.io.hadoop.HoodieHadoopIOFactory;
import org.apache.hudi.common.config.DFSPropertiesConfiguration;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ParquetUtils;
import org.apache.hudi.common.util.AvroOrcUtils;
import org.apache.hudi.common.util.collection.ClosableIterator;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.orc.TypeDescription;
import org.apache.orc.storage.ql.exec.vector.ColumnVector;

import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.storage.StoragePath;
// Create Hadoop storage configuration
Configuration hadoopConf = new Configuration();
HadoopStorageConfiguration storageConf = new HadoopStorageConfiguration(hadoopConf);
// Initialize Hadoop storage
StoragePath path = new StoragePath("hdfs://example.com:8020/data");
HoodieHadoopStorage storage = new HoodieHadoopStorage(path, storageConf);
// Perform basic file operations
boolean exists = storage.exists(path);
InputStream inputStream = storage.open(path);
OutputStream outputStream = storage.create(path, true);

Apache Hudi Hadoop Common is organized around several key architectural components:

- HoodieHadoopStorage provides a unified interface for Hadoop FileSystem operations with consistency guarantees and retry mechanisms.
- HoodieHadoopIOFactory creates format-specific readers and writers for Parquet, ORC, and HFile formats.
- DFSPropertiesConfiguration and HadoopStorageConfiguration handle Hadoop-specific configuration patterns.
- HadoopFSUtils provides conversion utilities between Hudi and Hadoop path/configuration abstractions.

Core Hadoop FileSystem abstraction with consistency guarantees, retry mechanisms, and a unified interface for distributed storage operations.

public class HoodieHadoopStorage implements HoodieStorage {
public HoodieHadoopStorage(StoragePath path, StorageConfiguration<?> conf);
public InputStream open(StoragePath path);
public OutputStream create(StoragePath path, boolean overwrite);
public boolean exists(StoragePath path);
public List<StoragePathInfo> listDirectEntries(StoragePath path);
}

Factory pattern for creating format-specific file readers and writers with support for Avro, Parquet, and ORC formats in Hadoop environments.

public class HoodieHadoopIOFactory implements HoodieIOFactory {
public HoodieHadoopIOFactory(HoodieStorage storage);
public HoodieFileReaderFactory getReaderFactory(HoodieRecord.HoodieRecordType recordType);
public HoodieFileWriterFactory getWriterFactory(HoodieRecord.HoodieRecordType recordType);
}

Comprehensive utilities for Hadoop FileSystem operations, path conversions, and integration between Hudi and Hadoop abstractions.

public class HadoopFSUtils {
public static <T> FileSystem getFs(String pathStr, StorageConfiguration<T> storageConf);
public static StoragePath convertToStoragePath(Path path);
public static Path convertToHadoopPath(StoragePath path);
public static StorageConfiguration<Configuration> getStorageConf(Configuration conf);
}

DFS-based configuration management with support for global properties, environment-specific settings, and Hadoop configuration integration.

public class DFSPropertiesConfiguration extends PropertiesConfig {
public DFSPropertiesConfiguration(Configuration hadoopConf, StoragePath filePath);
public static TypedProperties getGlobalProps();
public static DFSPropertiesConfiguration getGlobalDFSPropsConfiguration();
}

Specialized utilities for working with Parquet, ORC, and HFile formats, including metadata reading, schema conversions, and format-specific optimizations.

public class ParquetUtils extends FileFormatUtils {
public static ParquetMetadata readMetadata(HoodieStorage storage, StoragePath parquetFilePath);
public static Set<Pair<String, Long>> filterRowKeys(HoodieStorage storage, StoragePath filePath, Set<String> filter);
}
public class AvroOrcUtils {
public static TypeDescription createOrcSchema(Schema avroSchema);
public static Schema createAvroSchema(TypeDescription orcSchema);
}

// Storage path abstraction
class StoragePath {
public StoragePath(String path);
public String toString();
}
// Storage path information
class StoragePathInfo {
public StoragePath getPath();
public long getLength();
public boolean isDirectory();
public long getModificationTime();
}
// Storage configuration wrapper
interface StorageConfiguration<T> {
public T unwrap();
public String get(String key);
public void set(String key, String value);
}
// Consistency guard for file system operations
interface ConsistencyGuard {
public void waitTillFileAppears(StoragePath filePath);
public void waitTillFileDisappears(StoragePath filePath);
}