CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-org-apache-hudi--hudi-hadoop-common

Apache Hudi Hadoop common utilities and components that provide core functionality for integrating Apache Hudi with the Hadoop ecosystem, including file system operations, configuration management, and Hadoop-specific utilities for managing data lakehouse operations

Pending
Overview
Eval results
Files

docs/filesystem-utilities.md

File System Utilities

Comprehensive utilities for Hadoop FileSystem operations, path conversions, and integration between Hudi and Hadoop abstractions. Provides wrapper functionality, consistency guarantees, and retry mechanisms for reliable distributed file system operations.

Capabilities

HadoopFSUtils

Core utility class providing conversion functions and FileSystem management for Hadoop integration.

/**
 * Hadoop FileSystem utility functions
 * Provides conversion utilities and FileSystem management
 */
public class HadoopFSUtils {
    
    /** Prepare Hadoop configuration with Hudi-specific settings */
    public static Configuration prepareHadoopConf(Configuration conf);
    
    /** Get storage configuration from Hadoop configuration */
    public static StorageConfiguration<Configuration> getStorageConf(Configuration conf);
    
    /** Get default storage configuration */
    public static StorageConfiguration<Configuration> getStorageConf();
    
    /** Get storage configuration with copy of Hadoop configuration */
    public static StorageConfiguration<Configuration> getStorageConfWithCopy(Configuration conf);
    
    /** Get FileSystem for path string with storage configuration */
    public static <T> FileSystem getFs(String pathStr, StorageConfiguration<T> storageConf);
    
    /** Get FileSystem for Hadoop Path with storage configuration */
    public static <T> FileSystem getFs(Path path, StorageConfiguration<T> storageConf);
    
    /** Get FileSystem for path string with Hadoop configuration */
    public static FileSystem getFs(String pathStr, Configuration conf);
    
    /** Get FileSystem for Hadoop Path with configuration */
    public static FileSystem getFs(Path path, Configuration conf);
    
    /** Get FileSystem for StoragePath with configuration */
    public static FileSystem getFs(StoragePath path, Configuration conf);
    
    /** Get FileSystem with newCopy option */
    public static <T> FileSystem getFs(String pathStr, StorageConfiguration<T> storageConf, boolean newCopy);
    
    /** Get FileSystem for Path with newCopy option */
    public static <T> FileSystem getFs(Path path, StorageConfiguration<T> storageConf, boolean newCopy);
    
    /** Get FileSystem with local default option */
    public static FileSystem getFs(String pathStr, Configuration conf, boolean localByDefault);
    
    /** Add scheme to local path if missing */
    public static Path addSchemeIfLocalPath(String path);
    
    /** Convert StoragePath to Hadoop Path */
    public static Path convertToHadoopPath(StoragePath path);
    
    /** Convert Hadoop Path to StoragePath */
    public static StoragePath convertToStoragePath(Path path);
    
    /** Convert FileStatus to StoragePathInfo */
    public static StoragePathInfo convertToStoragePathInfo(FileStatus fileStatus);
    
    /** Convert FileStatus to StoragePathInfo with locations */
    public static StoragePathInfo convertToStoragePathInfo(FileStatus fileStatus, String[] locations);
    
    /** Convert StoragePathInfo to Hadoop FileStatus */
    public static FileStatus convertToHadoopFileStatus(StoragePathInfo pathInfo);
    
    /** Get FSDataInputStream with buffer settings */
    public static FSDataInputStream getFSDataInputStream(FileSystem fs, StoragePath path, int bufferSize, boolean wrapStream);
    
    /** Check if FileSystem is Google Cloud Storage */
    public static boolean isGCSFileSystem(FileSystem fs);
    
    /** Check if FileSystem is Cloudera Hadoop Distribution */
    public static boolean isCHDFileSystem(FileSystem fs);
    
    /** Register file system for path */
    public static Configuration registerFileSystem(StoragePath file, Configuration conf);
    
    /** Get file size from FileSystem */
    public static long getFileSize(FileSystem fs, Path path);
    
    /** Get relative partition path */
    public static String getRelativePartitionPath(Path basePath, Path fullPartitionPath);
    
    /** Get file ID from log path */
    public static String getFileIdFromLogPath(Path path);
    
    /** Get delta commit time from log path */
    public static String getDeltaCommitTimeFromLogPath(Path path);
    
    /** Get file ID from file path */
    public static String getFileIdFromFilePath(Path filePath);
    
    /** Check if path is base file */
    public static boolean isBaseFile(Path path);
    
    /** Check if path is log file */
    public static boolean isLogFile(Path logPath);
    
    /** Check if path is data file */
    public static boolean isDataFile(Path path);
    
    /** Get all data files in partition */
    public static FileStatus[] getAllDataFilesInPartition(FileSystem fs, Path partitionPath);
    
    /** Construct absolute path in Hadoop */
    public static Path constructAbsolutePathInHadoopPath(String basePath, String relativePartitionPath);
    
    /** Get DFS full partition path */
    public static String getDFSFullPartitionPath(FileSystem fs, Path fullPartitionPath);
    
    /** Parallelize files processing */
    public static <T> Map<String, T> parallelizeFilesProcess(Configuration configuration, List<String> filePathList, 
                                                            SerializableFunction<String, T> func, int parallelism);
    
    /** Get file status at level */
    public static List<FileStatus> getFileStatusAtLevel(Configuration configuration, Path path, int level, 
                                                       int parallelism, String[] subPathFilters, List<Path> subPaths);
    
    /** Delete files in parallel */
    public static Map<String, Boolean> deleteFilesParallelize(Configuration configuration, List<String> filePathList, 
                                                              int parallelism);
}

Path Conversion Utilities

Utilities for converting between Hudi and Hadoop path representations.

/**
 * Add scheme to local path if missing
 * @param path - Path string that may need scheme
 * @return Hadoop Path with proper scheme
 */
public static Path addSchemeIfLocalPath(String path);

/**
 * Convert StoragePath to Hadoop Path
 * @param path - Hudi StoragePath
 * @return Equivalent Hadoop Path
 */
public static Path convertToHadoopPath(StoragePath path);

/**
 * Convert Hadoop Path to StoragePath
 * @param path - Hadoop Path
 * @return Equivalent Hudi StoragePath
 */
public static StoragePath convertToStoragePath(Path path);

/**
 * Convert Hadoop FileStatus to StoragePathInfo
 * @param fileStatus - Hadoop FileStatus
 * @return Equivalent StoragePathInfo
 */
public static StoragePathInfo convertToStoragePathInfo(FileStatus fileStatus);

/**
 * Convert StoragePathInfo to Hadoop FileStatus
 * @param pathInfo - Hudi StoragePathInfo
 * @return Equivalent Hadoop FileStatus
 */
public static FileStatus convertToHadoopFileStatus(StoragePathInfo pathInfo);

/**
 * Convert HoodiePath to Hadoop Path (legacy)
 * @param path - Legacy HoodiePath
 * @return Hadoop Path
 */
public static Path toPath(HoodiePath path);

/**
 * Convert Hadoop Path to HoodiePath (legacy)
 * @param path - Hadoop Path
 * @return Legacy HoodiePath
 */
public static HoodiePath fromPath(Path path);

Permission and Status Conversions

Utilities for converting file permissions and status objects between Hudi and Hadoop representations.

/**
 * Convert HoodieFSPermission to Hadoop FsPermission
 * @param fsPermission - Hudi file system permission
 * @return Hadoop FsPermission
 */
public static FsPermission toFSPermission(HoodieFSPermission fsPermission);

/**
 * Convert Hadoop FsPermission to HoodieFSPermission
 * @param fsPermission - Hadoop file system permission
 * @return Hudi HoodieFSPermission
 */
public static HoodieFSPermission fromFSPermission(FsPermission fsPermission);

/**
 * Convert Hadoop FileStatus to HoodieFileStatus
 * @param fileStatus - Hadoop FileStatus
 * @return Hudi HoodieFileStatus
 */
public static HoodieFileStatus fromFileStatus(FileStatus fileStatus);

File System Operations

Advanced file system operations and metadata utilities.

/**
 * Get FSDataInputStream with buffering options
 * @param fs - Hadoop FileSystem
 * @param filePath - Storage path to read
 * @param bufferSize - Buffer size for reading
 * @param wrapStream - Whether to wrap the stream
 * @return FSDataInputStream for reading
 */
public static FSDataInputStream getFSDataInputStream(FileSystem fs, StoragePath filePath, 
                                                    int bufferSize, boolean wrapStream);

/**
 * Get file size from FileSystem
 * @param fs - Hadoop FileSystem
 * @param path - Hadoop Path to check
 * @return File size in bytes
 */
public static long getFileSize(FileSystem fs, Path path);

/**
 * Get relative partition path
 * @param basePath - Base path of the table
 * @param fullPartitionPath - Full path to the partition
 * @return Relative partition path string
 */
public static String getRelativePartitionPath(Path basePath, Path fullPartitionPath);

/**
 * Register FileSystem for the given file
 * @param file - Storage path for file
 * @param conf - Hadoop configuration
 * @return Updated configuration with registered FileSystem
 */
public static Configuration registerFileSystem(StoragePath file, Configuration conf);

File System Type Detection

Methods for detecting specific FileSystem implementations.

/**
 * Check if FileSystem is Google Cloud Storage
 * @param fs - FileSystem to check
 * @return true if GCS FileSystem
 */
public static boolean isGCSFileSystem(FileSystem fs);

/**
 * Check if FileSystem is a Cloudera Hadoop Distribution (CHD) FileSystem
 * @param fs - FileSystem to check
 * @return true if CHD FileSystem
 */
public static boolean isCHDFileSystem(FileSystem fs);

File Identification Utilities

Utilities for extracting information from file paths and identifying file types.

/**
 * Extract file ID from log file path
 * @param path - Hadoop Path to log file
 * @return File ID string
 */
public static String getFileIdFromLogPath(Path path);

/**
 * Extract delta commit time from log file path
 * @param path - Hadoop Path to log file
 * @return Delta commit time string
 */
public static String getDeltaCommitTimeFromLogPath(Path path);

/**
 * Extract file ID from any file path
 * @param filePath - Hadoop Path to file
 * @return File ID string
 */
public static String getFileIdFromFilePath(Path filePath);

/**
 * Check if path points to a base file
 * @param path - Hadoop Path to check
 * @return true if base file
 */
public static boolean isBaseFile(Path path);

/**
 * Check if path points to a log file
 * @param logPath - Hadoop Path to check
 * @return true if log file
 */
public static boolean isLogFile(Path logPath);

/**
 * Check if path points to a data file (base or log)
 * @param path - Hadoop Path to check
 * @return true if data file
 */
public static boolean isDataFile(Path path);

/**
 * Get all data files in a partition
 * @param fs - Hadoop FileSystem
 * @param partitionPath - Path to partition directory
 * @return Array of FileStatus for data files
 */
public static FileStatus[] getAllDataFilesInPartition(FileSystem fs, Path partitionPath);

Distributed FileSystem Operations

Specialized operations for distributed file systems like HDFS.

/**
 * Recover DFS file lease for write operations
 * @param dfs - DistributedFileSystem instance
 * @param p - Path to file with lease issues
 * @return true if lease recovery succeeded
 */
public static boolean recoverDFSFileLease(DistributedFileSystem dfs, Path p);

HoodieWrapperFileSystem

FileSystem wrapper providing consistency guarantees through ConsistencyGuard integration.

/**
 * Wrapper FileSystem with consistency guarantees
 * Ensures file operations respect consistency requirements
 */
public class HoodieWrapperFileSystem extends FileSystem {
    
    /** Create wrapper with FileSystem and consistency guard */
    public HoodieWrapperFileSystem(FileSystem fs, ConsistencyGuard consistencyGuard);
    
    // Extends all FileSystem interface methods with consistency checks
    // All standard FileSystem operations are available with consistency guarantees
}

HoodieRetryWrapperFileSystem

FileSystem wrapper providing retry capabilities for unreliable file system operations.

/**
 * FileSystem wrapper with retry capabilities
 * Automatically retries failed operations with configurable parameters
 */
public class HoodieRetryWrapperFileSystem extends FileSystem {
    
    /** Create retry wrapper with configuration */
    public HoodieRetryWrapperFileSystem(FileSystem fs, long maxRetryIntervalMs, 
                                       int maxRetryNumbers, long initialRetryIntervalMs, 
                                       String retryExceptions);
    
    // Extends all FileSystem interface methods with retry logic
    // Operations are automatically retried on transient failures
}

HadoopSeekableDataInputStream

Seekable data input stream implementation for Hadoop FSDataInputStream.

/**
 * Seekable data input stream for Hadoop
 * Provides random access capabilities for file reading
 */
public class HadoopSeekableDataInputStream implements SeekableDataInputStream {
    
    /** Create seekable stream from FSDataInputStream */
    public HadoopSeekableDataInputStream(FSDataInputStream fsDataInputStream);
    
    /** Read single byte */
    public int read();
    
    /** Read bytes into buffer */
    public int read(byte[] b, int off, int len);
    
    /** Seek to specific position */
    public void seek(long pos);
    
    /** Get current position */
    public long getPos();
    
    /** Close the stream */
    public void close();
}

Additional Utility Classes

Specialized utility classes for file system operations.

/**
 * Serializable wrapper for FileStatus
 * Allows FileStatus to be serialized for distributed operations
 */
public class HoodieSerializableFileStatus implements Serializable {
    // Standard FileStatus interface methods with Serializable support
    public Path getPath();
    public long getLen();
    public boolean isDirectory();
    public long getModificationTime();
    public FsPermission getPermission();
}

/**
 * Path implementation with caching capabilities
 * Optimizes path operations through intelligent caching
 */
public class CachingPath extends Path {
    // Extended Path functionality with caching optimizations
    public CachingPath(String pathString);
    public CachingPath(Path parent, String child);
}

Usage Examples:

import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;

// Basic FileSystem operations
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://namenode:8020");

// Get FileSystem instance
FileSystem fs = HadoopFSUtils.getFs("hdfs://namenode:8020/data", conf);

// Path conversions
StoragePath storagePath = new StoragePath("hdfs://namenode:8020/data/table");
Path hadoopPath = HadoopFSUtils.convertToHadoopPath(storagePath);
StoragePath backToStorage = HadoopFSUtils.convertToStoragePath(hadoopPath);

// File identification
Path logFile = new Path("/data/table/.hoodie/20231201120000.deltacommit.log");
boolean isLog = HadoopFSUtils.isLogFile(logFile);
String fileId = HadoopFSUtils.getFileIdFromLogPath(logFile);
String commitTime = HadoopFSUtils.getDeltaCommitTimeFromLogPath(logFile);

// Get all data files in partition
Path partitionPath = new Path("/data/table/year=2023/month=12");
FileStatus[] dataFiles = HadoopFSUtils.getAllDataFilesInPartition(fs, partitionPath);

// Using wrapper FileSystem with consistency
ConsistencyGuard guard = new OptimisticConsistencyGuard(fs, conf);
HoodieWrapperFileSystem wrapperFs = new HoodieWrapperFileSystem(fs, guard);

// Using retry wrapper
HoodieRetryWrapperFileSystem retryFs = new HoodieRetryWrapperFileSystem(
    fs, 
    5000L, // maxRetryIntervalMs
    3,     // maxRetryNumbers  
    1000L, // initialRetryIntervalMs
    "java.io.IOException" // retryExceptions
);

// Seekable stream operations
Path dataFile = new Path("/data/table/file.parquet");
FSDataInputStream fsInput = fs.open(dataFile);
HadoopSeekableDataInputStream seekableInput = new HadoopSeekableDataInputStream(fsInput);

// Random access reading
seekableInput.seek(1024); // Seek to position 1024
int data = seekableInput.read();
long currentPos = seekableInput.getPos();
seekableInput.close();

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-hudi--hudi-hadoop-common

docs

configuration-management.md

filesystem-utilities.md

format-utilities.md

index.md

io-operations.md

storage-operations.md

tile.json