Apache Hudi Hadoop common utilities and components that provide core functionality for integrating Apache Hudi with Hadoop ecosystem including file system operations, configuration management, and Hadoop-specific utilities for managing data lakehouse operations
—
Core Hadoop FileSystem abstraction providing unified interface for distributed storage operations with consistency guarantees, retry mechanisms, and comprehensive file system capabilities.
Primary storage implementation providing Hadoop FileSystem-based operations with built-in retry logic and consistency guarantees.
/**
* Hadoop FileSystem-based implementation of HoodieStorage interface
* Provides distributed storage operations with consistency and retry support
*/
public class HoodieHadoopStorage implements HoodieStorage {
/** Create storage instance with path and configuration */
public HoodieHadoopStorage(StoragePath path, StorageConfiguration<?> conf);
/** Create storage instance with Hadoop path and configuration */
public HoodieHadoopStorage(Path path, Configuration conf);
/** Create storage instance with string path and configuration */
public HoodieHadoopStorage(String path, Configuration conf);
/** Create storage instance with string path and storage configuration */
public HoodieHadoopStorage(String path, StorageConfiguration<?> conf);
/** Create storage instance with retry configuration */
public HoodieHadoopStorage(StoragePath path, StorageConfiguration<?> conf,
boolean enableRetry, long maxRetryIntervalMs,
int maxRetryNumbers, long initialRetryIntervalMs,
String retryExceptions, ConsistencyGuard consistencyGuard);
/** Create storage instance from existing FileSystem */
public HoodieHadoopStorage(FileSystem fs);
}

Core file system operations for reading, writing, and managing files in distributed storage.
/**
* Opens an input stream for reading from the specified path
* @param path - Storage path to read from
* @return InputStream for reading file contents
*/
public InputStream open(StoragePath path);
/**
* Opens a seekable data input stream with buffering options
* @param path - Storage path to read from
* @param bufferSize - Buffer size for read operations
* @param wrapStream - Whether to wrap the stream
* @return SeekableDataInputStream for random access reading
*/
public SeekableDataInputStream openSeekable(StoragePath path, int bufferSize, boolean wrapStream);
/**
* Creates an output stream for writing to the specified path
* @param path - Storage path to write to
* @param overwrite - Whether to overwrite existing file
* @return OutputStream for writing file contents
*/
public OutputStream create(StoragePath path, boolean overwrite);
/**
* Creates an output stream with advanced options
* @param path - Storage path to write to
* @param overwrite - Whether to overwrite existing file
* @param bufferSize - Buffer size for write operations (nullable)
* @param replication - Replication factor (nullable)
* @param sizeThreshold - Size threshold for optimization (nullable)
* @return OutputStream for writing file contents
*/
public OutputStream create(StoragePath path, boolean overwrite,
Integer bufferSize, Short replication, Long sizeThreshold);
/**
* Appends to an existing file
* @param path - Storage path to append to
* @return OutputStream for appending to file
*/
public OutputStream append(StoragePath path);
/**
* Creates a new empty file if it doesn't exist
* @param path - Storage path for new file
* @return true if file was created, false if it already existed
*/
public boolean createNewFile(StoragePath path);

Operations for managing directories and retrieving file metadata information.
/**
* Checks if a path exists in storage
* @param path - Storage path to check
* @return true if path exists, false otherwise
*/
public boolean exists(StoragePath path);
/**
* Gets detailed information about a path
* @param path - Storage path to inspect
* @return StoragePathInfo with metadata
*/
public StoragePathInfo getPathInfo(StoragePath path);
/**
* Creates a directory and any necessary parent directories
* @param path - Storage path for directory to create
* @return true if directory was created or already exists
*/
public boolean createDirectory(StoragePath path);
/**
* Lists direct entries in a directory
* @param path - Directory path to list
* @return List of StoragePathInfo for direct children
*/
public List<StoragePathInfo> listDirectEntries(StoragePath path);
/**
* Lists all files recursively in a directory
* @param path - Directory path to list
* @return List of StoragePathInfo for all files
*/
public List<StoragePathInfo> listFiles(StoragePath path);
/**
* Lists direct entries with filtering
* @param path - Directory path to list
* @param filter - StoragePathFilter to apply
* @return Filtered list of StoragePathInfo
*/
public List<StoragePathInfo> listDirectEntries(StoragePath path, StoragePathFilter filter);
/**
* Lists direct entries for multiple paths
* @param pathList - List of directory paths to list
* @return Combined list of StoragePathInfo
*/
public List<StoragePathInfo> listDirectEntries(List<StoragePath> pathList);
/**
* Finds entries matching a glob pattern
* @param pathPattern - Glob pattern to match
* @return List of StoragePathInfo matching pattern
*/
public List<StoragePathInfo> globEntries(StoragePath pathPattern);

Operations for moving, deleting, and modifying files and directories.
/**
* Renames or moves a file/directory
* @param oldPath - Current path
* @param newPath - New path
* @return true if rename succeeded
*/
public boolean rename(StoragePath oldPath, StoragePath newPath);
/**
* Deletes a directory and all its contents
* @param path - Directory path to delete
* @return true if deletion succeeded
*/
public boolean deleteDirectory(StoragePath path);
/**
* Deletes a single file
* @param path - File path to delete
* @return true if deletion succeeded
*/
public boolean deleteFile(StoragePath path);
/**
* Sets modification time for a path
* @param path - Storage path to modify
* @param modificationTimeInMillisEpoch - New modification time in milliseconds since epoch
*/
public void setModificationTime(StoragePath path, long modificationTimeInMillisEpoch);

Storage configuration methods and property access.
/**
* Gets the URI scheme for this storage
* @return Scheme string (e.g., "hdfs", "file")
*/
public String getScheme();
/**
* Gets the URI for this storage
* @return URI representing the storage location
*/
public URI getUri();
/**
* Gets default block size for a path
* @param path - Storage path to check
* @return Default block size in bytes
*/
public int getDefaultBlockSize(StoragePath path);
/**
* Gets default buffer size for I/O operations
* @return Default buffer size in bytes
*/
public int getDefaultBufferSize();
/**
* Gets default replication factor for a path
* @param path - Storage path to check
* @return Default replication factor
*/
public short getDefaultReplication(StoragePath path);
/**
* Gets the underlying FileSystem object
* @return Hadoop FileSystem instance
*/
public Object getFileSystem();
/**
* Gets the raw storage implementation
* @return Raw HoodieStorage instance
*/
public HoodieStorage getRawStorage();
/**
* Creates a new storage instance with different path
* @param path - New storage path
* @param storageConf - Storage configuration
* @return New HoodieStorage instance
*/
public HoodieStorage newInstance(StoragePath path, StorageConfiguration<?> storageConf);
/**
* Closes the storage and releases resources
*/
public void close();

Hadoop Configuration wrapper providing storage configuration abstraction.
/**
* Hadoop Configuration wrapper for storage configuration
*/
public class HadoopStorageConfiguration extends StorageConfiguration<Configuration> {
/** Create configuration wrapper with defaults flag */
public HadoopStorageConfiguration(Boolean loadDefaults);
/** Create configuration wrapper */
public HadoopStorageConfiguration(Configuration configuration);
/** Create configuration wrapper with copy option */
public HadoopStorageConfiguration(Configuration configuration, boolean copy);
/** Create configuration wrapper from existing instance */
public HadoopStorageConfiguration(HadoopStorageConfiguration configuration);
/** Create new instance of this configuration */
public StorageConfiguration<Configuration> newInstance();
/** Get underlying Hadoop Configuration */
public Configuration unwrap();
/** Get copy of underlying configuration */
public Configuration unwrapCopy();
/** Set configuration property */
public void set(String key, String value);
/** Get configuration property as Option */
public Option<String> getString(String key);
/** Get inline configuration for InLineFileSystem */
public StorageConfiguration<Configuration> getInline();
}

Usage Examples:
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration;
import org.apache.hadoop.conf.Configuration;
// Basic storage setup
Configuration hadoopConf = new Configuration();
hadoopConf.set("fs.defaultFS", "hdfs://namenode:8020");
HadoopStorageConfiguration storageConf = new HadoopStorageConfiguration(hadoopConf);
StoragePath basePath = new StoragePath("hdfs://namenode:8020/data/hudi");
HoodieHadoopStorage storage = new HoodieHadoopStorage(basePath, storageConf);
// File operations
StoragePath filePath = new StoragePath(basePath, "table1/partition1/file.parquet");
// Check existence and read
if (storage.exists(filePath)) {
try (InputStream input = storage.open(filePath)) {
// Process file content
}
}
// Write new file
StoragePath outputPath = new StoragePath(basePath, "table1/partition2/output.parquet");
try (OutputStream output = storage.create(outputPath, true)) {
// Write data
}
// List directory contents
List<StoragePathInfo> entries = storage.listDirectEntries(basePath);
for (StoragePathInfo entry : entries) {
System.out.println("Path: " + entry.getPath() +
", Size: " + entry.getLength() +
", IsDir: " + entry.isDirectory());
}

Install with Tessl CLI
npx tessl i tessl/maven-org-apache-hudi--hudi-hadoop-common