CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-org-apache-hudi--hudi-hadoop-common

Apache Hudi Hadoop common utilities and components that provide core functionality for integrating Apache Hudi with the Hadoop ecosystem, including file system operations, configuration management, and Hadoop-specific utilities for managing data lakehouse operations.

Pending
Overview
Eval results
Files

docs/storage-operations.md

Storage Operations

Core Hadoop FileSystem abstraction providing unified interface for distributed storage operations with consistency guarantees, retry mechanisms, and comprehensive file system capabilities.

Capabilities

HoodieHadoopStorage

Primary storage implementation providing Hadoop FileSystem-based operations with built-in retry logic and consistency guarantees.

/**
 * Hadoop FileSystem-based implementation of HoodieStorage interface
 * Provides distributed storage operations with consistency and retry support
 */
public class HoodieHadoopStorage implements HoodieStorage {
    
    /** Create storage instance with path and configuration */
    public HoodieHadoopStorage(StoragePath path, StorageConfiguration<?> conf);
    
    /** Create storage instance with Hadoop path and configuration */
    public HoodieHadoopStorage(Path path, Configuration conf);
    
    /** Create storage instance with string path and configuration */
    public HoodieHadoopStorage(String path, Configuration conf);
    
    /** Create storage instance with string path and storage configuration */
    public HoodieHadoopStorage(String path, StorageConfiguration<?> conf);
    
    /** Create storage instance with retry configuration */
    public HoodieHadoopStorage(StoragePath path, StorageConfiguration<?> conf, 
                               boolean enableRetry, long maxRetryIntervalMs, 
                               int maxRetryNumbers, long initialRetryIntervalMs, 
                               String retryExceptions, ConsistencyGuard consistencyGuard);
    
    /** Create storage instance from existing FileSystem */
    public HoodieHadoopStorage(FileSystem fs);
}

File Operations

Core file system operations for reading, writing, and managing files in distributed storage.

/**
 * Opens an input stream for reading from the specified path
 * @param path - Storage path to read from
 * @return InputStream for reading file contents
 */
public InputStream open(StoragePath path);

/**
 * Opens a seekable data input stream with buffering options
 * @param path - Storage path to read from
 * @param bufferSize - Buffer size for read operations
 * @param wrapStream - Whether to wrap the stream
 * @return SeekableDataInputStream for random access reading
 */
public SeekableDataInputStream openSeekable(StoragePath path, int bufferSize, boolean wrapStream);

/**
 * Creates an output stream for writing to the specified path
 * @param path - Storage path to write to
 * @param overwrite - Whether to overwrite existing file
 * @return OutputStream for writing file contents
 */
public OutputStream create(StoragePath path, boolean overwrite);

/**
 * Creates an output stream with advanced options
 * @param path - Storage path to write to
 * @param overwrite - Whether to overwrite existing file
 * @param bufferSize - Buffer size for write operations (nullable)
 * @param replication - Replication factor (nullable)
 * @param sizeThreshold - Size threshold for optimization (nullable)
 * @return OutputStream for writing file contents
 */
public OutputStream create(StoragePath path, boolean overwrite, 
                          Integer bufferSize, Short replication, Long sizeThreshold);

/**
 * Appends to an existing file
 * @param path - Storage path to append to
 * @return OutputStream for appending to file
 */
public OutputStream append(StoragePath path);

/**
 * Creates a new empty file if it doesn't exist
 * @param path - Storage path for new file
 * @return true if file was created, false if it already existed
 */
public boolean createNewFile(StoragePath path);

Directory and Metadata Operations

Operations for managing directories and retrieving file metadata information.

/**
 * Checks if a path exists in storage
 * @param path - Storage path to check
 * @return true if path exists, false otherwise
 */
public boolean exists(StoragePath path);

/**
 * Gets detailed information about a path
 * @param path - Storage path to inspect
 * @return StoragePathInfo with metadata
 */
public StoragePathInfo getPathInfo(StoragePath path);

/**
 * Creates a directory and any necessary parent directories
 * @param path - Storage path for directory to create
 * @return true if directory was created or already exists
 */
public boolean createDirectory(StoragePath path);

/**
 * Lists direct entries in a directory
 * @param path - Directory path to list
 * @return List of StoragePathInfo for direct children
 */
public List<StoragePathInfo> listDirectEntries(StoragePath path);

/**
 * Lists all files recursively in a directory
 * @param path - Directory path to list
 * @return List of StoragePathInfo for all files
 */
public List<StoragePathInfo> listFiles(StoragePath path);

/**
 * Lists direct entries with filtering
 * @param path - Directory path to list
 * @param filter - StoragePathFilter to apply
 * @return Filtered list of StoragePathInfo
 */
public List<StoragePathInfo> listDirectEntries(StoragePath path, StoragePathFilter filter);

/**
 * Lists direct entries for multiple paths
 * @param pathList - List of directory paths to list
 * @return Combined list of StoragePathInfo
 */
public List<StoragePathInfo> listDirectEntries(List<StoragePath> pathList);

/**
 * Finds entries matching a glob pattern
 * @param pathPattern - Glob pattern to match
 * @return List of StoragePathInfo matching pattern
 */
public List<StoragePathInfo> globEntries(StoragePath pathPattern);

File Management Operations

Operations for moving, deleting, and modifying files and directories.

/**
 * Renames or moves a file/directory
 * @param oldPath - Current path
 * @param newPath - New path
 * @return true if rename succeeded
 */
public boolean rename(StoragePath oldPath, StoragePath newPath);

/**
 * Deletes a directory and all its contents
 * @param path - Directory path to delete
 * @return true if deletion succeeded
 */
public boolean deleteDirectory(StoragePath path);

/**
 * Deletes a single file
 * @param path - File path to delete
 * @return true if deletion succeeded
 */
public boolean deleteFile(StoragePath path);

/**
 * Sets modification time for a path
 * @param path - Storage path to modify
 * @param modificationTimeInMillisEpoch - New modification time in milliseconds since epoch
 */
public void setModificationTime(StoragePath path, long modificationTimeInMillisEpoch);

Storage Configuration and Properties

Storage configuration methods and property access.

/**
 * Gets the URI scheme for this storage
 * @return Scheme string (e.g., "hdfs", "file")
 */
public String getScheme();

/**
 * Gets the URI for this storage
 * @return URI representing the storage location
 */
public URI getUri();

/**
 * Gets default block size for a path
 * @param path - Storage path to check
 * @return Default block size in bytes
 */
public int getDefaultBlockSize(StoragePath path);

/**
 * Gets default buffer size for I/O operations
 * @return Default buffer size in bytes
 */
public int getDefaultBufferSize();

/**
 * Gets default replication factor for a path
 * @param path - Storage path to check
 * @return Default replication factor
 */
public short getDefaultReplication(StoragePath path);

/**
 * Gets the underlying FileSystem object
 * @return Hadoop FileSystem instance
 */
public Object getFileSystem();

/**
 * Gets the raw storage implementation
 * @return Raw HoodieStorage instance
 */
public HoodieStorage getRawStorage();

/**
 * Creates a new storage instance with different path
 * @param path - New storage path
 * @param storageConf - Storage configuration
 * @return New HoodieStorage instance
 */
public HoodieStorage newInstance(StoragePath path, StorageConfiguration<?> storageConf);

/**
 * Closes the storage and releases resources
 */
public void close();

HadoopStorageConfiguration

Hadoop Configuration wrapper providing storage configuration abstraction.

/**
 * Hadoop Configuration wrapper for storage configuration
 */
public class HadoopStorageConfiguration extends StorageConfiguration<Configuration> {
    
    /** Create configuration wrapper with defaults flag */
    public HadoopStorageConfiguration(Boolean loadDefaults);
    
    /** Create configuration wrapper */
    public HadoopStorageConfiguration(Configuration configuration);
    
    /** Create configuration wrapper with copy option */
    public HadoopStorageConfiguration(Configuration configuration, boolean copy);
    
    /** Create configuration wrapper from existing instance */
    public HadoopStorageConfiguration(HadoopStorageConfiguration configuration);
    
    /** Create new instance of this configuration */
    public StorageConfiguration<Configuration> newInstance();
    
    /** Get underlying Hadoop Configuration */
    public Configuration unwrap();
    
    /** Get copy of underlying configuration */
    public Configuration unwrapCopy();
    
    /** Set configuration property */
    public void set(String key, String value);
    
    /** Get configuration property as Option */
    public Option<String> getString(String key);
    
    /** Get inline configuration for InLineFileSystem */
    public StorageConfiguration<Configuration> getInline();
}

Usage Examples:

import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration;
import org.apache.hadoop.conf.Configuration;

// Basic storage setup
Configuration hadoopConf = new Configuration();
hadoopConf.set("fs.defaultFS", "hdfs://namenode:8020");

HadoopStorageConfiguration storageConf = new HadoopStorageConfiguration(hadoopConf);
StoragePath basePath = new StoragePath("hdfs://namenode:8020/data/hudi");

HoodieHadoopStorage storage = new HoodieHadoopStorage(basePath, storageConf);

// File operations
StoragePath filePath = new StoragePath(basePath, "table1/partition1/file.parquet");

// Check existence and read
if (storage.exists(filePath)) {
    try (InputStream input = storage.open(filePath)) {
        // Process file content
    }
}

// Write new file
StoragePath outputPath = new StoragePath(basePath, "table1/partition2/output.parquet");
try (OutputStream output = storage.create(outputPath, true)) {
    // Write data
}

// List directory contents
List<StoragePathInfo> entries = storage.listDirectEntries(basePath);
for (StoragePathInfo entry : entries) {
    System.out.println("Path: " + entry.getPath() + 
                      ", Size: " + entry.getLength() + 
                      ", IsDir: " + entry.isDirectory());
}

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-hudi--hudi-hadoop-common

docs

configuration-management.md

filesystem-utilities.md

format-utilities.md

index.md

io-operations.md

storage-operations.md

tile.json