High-performance hashing implementations and utility classes, including Murmur3 hashing, bitset operations, date/time constants, and Hive-compatible hashing, for data distribution and cross-system compatibility requirements.
Fast 32-bit Murmur3 hash implementation supporting multiple data types and both on-heap and off-heap memory layouts.
/**
* 32-bit Murmur3 hasher implementation
*/
final class Murmur3_x86_32 {
/**
* Create hasher with specific seed
* @param seed Seed value for hashing
*/
public Murmur3_x86_32(int seed);
/**
* Get string representation of hasher
* @return String representation
*/
public String toString();
// Instance methods using hasher's seed
/**
* Hash integer value using instance seed
* @param input Integer to hash
* @return Hash value
*/
public int hashInt(int input);
/**
* Hash long value using instance seed
* @param input Long to hash
* @return Hash value
*/
public int hashLong(long input);
/**
* Hash word-aligned bytes using instance seed
* @param base Base object (null for off-heap)
* @param offset Offset within object or address
* @param lengthInBytes Number of bytes to hash (must be word-aligned)
* @return Hash value
*/
public int hashUnsafeWords(Object base, long offset, int lengthInBytes);
// Static methods with explicit seed
/**
* Hash integer with provided seed
* @param input Integer to hash
* @param seed Seed value
* @return Hash value
*/
public static int hashInt(int input, int seed);
/**
* Hash long with provided seed
* @param input Long to hash
* @param seed Seed value
* @return Hash value
*/
public static int hashLong(long input, int seed);
/**
* Hash word-aligned bytes with provided seed
* @param base Base object (null for off-heap)
* @param offset Offset within object or address
* @param lengthInBytes Number of bytes to hash (must be word-aligned)
* @param seed Seed value
* @return Hash value
*/
public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);
/**
* Hash arbitrary bytes with provided seed (legacy method)
* @param base Base object (null for off-heap)
* @param offset Offset within object or address
* @param lengthInBytes Number of bytes to hash
* @param seed Seed value
* @return Hash value
*/
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
/**
* Hash arbitrary bytes with provided seed (compatible method)
* @param base Base object (null for off-heap)
* @param offset Offset within object or address
* @param lengthInBytes Number of bytes to hash
* @param seed Seed value
* @return Hash value
*/
public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);
}

Usage Examples:
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.Platform;
// Create hasher with seed
Murmur3_x86_32 hasher = new Murmur3_x86_32(42);
// Hash different data types using instance methods
int intHash = hasher.hashInt(12345);
int longHash = hasher.hashLong(123456789L);
// Hash word-aligned byte arrays (lengthInBytes must be a multiple of 8)
byte[] alignedData = new byte[16];
int arrayHash = hasher.hashUnsafeWords(
alignedData,
Platform.BYTE_ARRAY_OFFSET,
alignedData.length
);
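For input whose length is not a multiple of 8 bytes (most strings, for example), the word-oriented method does not apply; the static byte-oriented methods accept arbitrary lengths. A minimal sketch, assuming UTF-8 encoding is acceptable:
// "Hello World" is 11 bytes, so it is not word-aligned
byte[] utf8 = "Hello World".getBytes(java.nio.charset.StandardCharsets.UTF_8);
int stringHash = Murmur3_x86_32.hashUnsafeBytes2(utf8, Platform.BYTE_ARRAY_OFFSET, utf8.length, 42);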
// Use static methods with explicit seed
int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);
int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);
// Hash memory regions
byte[] buffer = new byte[1024];
// ... fill buffer ...
int bufferHash = Murmur3_x86_32.hashUnsafeBytes(
buffer,
Platform.BYTE_ARRAY_OFFSET,
buffer.length,
42
);
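The legacy and compatible byte-hashing variants are not guaranteed to produce the same value when the input length is not word-aligned, so components that must agree on hash values should pin one variant. An illustration:
byte[] tail = {1, 2, 3}; // 3 bytes: not word-aligned
int legacyHash = Murmur3_x86_32.hashUnsafeBytes(tail, Platform.BYTE_ARRAY_OFFSET, tail.length, 42);
int compatHash = Murmur3_x86_32.hashUnsafeBytes2(tail, Platform.BYTE_ARRAY_OFFSET, tail.length, 42);
// legacyHash and compatHash may differ; do not mix the two variants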
// Hash off-heap memory
long address = Platform.allocateMemory(100);
try {
Platform.setMemory(address, (byte) 0xFF, 100);
int offHeapHash = Murmur3_x86_32.hashUnsafeBytes(
null, address, 100, 42
);
} finally {
Platform.freeMemory(address);
}

Methods for working with fixed-size uncompressed bitsets stored in memory, providing efficient bit manipulation operations.
/**
* Methods for working with fixed-size uncompressed bitsets
*/
final class BitSetMethods {
/**
* Set bit at specified index
* @param baseObject Base object (null for off-heap)
* @param baseOffset Base offset or address
* @param index Bit index to set
*/
public static void set(Object baseObject, long baseOffset, int index);
/**
* Unset (clear) bit at specified index
* @param baseObject Base object (null for off-heap)
* @param baseOffset Base offset or address
* @param index Bit index to unset
*/
public static void unset(Object baseObject, long baseOffset, int index);
/**
* Check if bit is set at specified index
* @param baseObject Base object (null for off-heap)
* @param baseOffset Base offset or address
* @param index Bit index to check
* @return true if bit is set
*/
public static boolean isSet(Object baseObject, long baseOffset, int index);
/**
* Check if any bit is set in the bitset
* @param baseObject Base object (null for off-heap)
* @param baseOffset Base offset or address
* @param bitSetWidthInWords Bitset width in 64-bit words
* @return true if any bit is set
*/
public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);
/**
* Find next set bit starting from index
* @param baseObject Base object (null for off-heap)
* @param baseOffset Base offset or address
* @param fromIndex Starting index for search
* @param bitsetSizeInWords Bitset size in 64-bit words
* @return Index of next set bit or -1 if not found
*/
public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);
}

Usage Examples:
import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.Platform;
// Create bitset using long array (8 words = 512 bits)
long[] bitsetData = new long[8];
Object baseObj = bitsetData;
long baseOffset = Platform.LONG_ARRAY_OFFSET;
// Set some bits
BitSetMethods.set(baseObj, baseOffset, 10);
BitSetMethods.set(baseObj, baseOffset, 25);
BitSetMethods.set(baseObj, baseOffset, 100);
// Check if bits are set
boolean bit10Set = BitSetMethods.isSet(baseObj, baseOffset, 10); // true
boolean bit15Set = BitSetMethods.isSet(baseObj, baseOffset, 15); // false
// Check if any bits are set
boolean anySet = BitSetMethods.anySet(baseObj, baseOffset, 8); // true
// Find next set bit
int nextBit = BitSetMethods.nextSetBit(baseObj, baseOffset, 0, 8); // 10
int afterTen = BitSetMethods.nextSetBit(baseObj, baseOffset, 11, 8); // 25
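nextSetBit also supports scanning every set bit in a loop, analogous to java.util.BitSet iteration:
// Iterate over all set bits
for (int i = BitSetMethods.nextSetBit(baseObj, baseOffset, 0, 8);
     i >= 0;
     i = BitSetMethods.nextSetBit(baseObj, baseOffset, i + 1, 8)) {
System.out.println("bit " + i + " is set");
}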
// Clear a bit
BitSetMethods.unset(baseObj, baseOffset, 25);
boolean bit25Set = BitSetMethods.isSet(baseObj, baseOffset, 25); // false
// Using off-heap bitset
long address = Platform.allocateMemory(64); // 8 words * 8 bytes
try {
Platform.setMemory(address, (byte) 0, 64); // Clear all bits
BitSetMethods.set(null, address, 42);
boolean isSet = BitSetMethods.isSet(null, address, 42);
} finally {
Platform.freeMemory(address);
}

Comprehensive constants for date and time calculations and conversions, providing the common time unit relationships.
/**
* Constants for date/time calculations and conversions
*/
class DateTimeConstants {
// Basic time units
public static final int MONTHS_PER_YEAR = 12;
public static final byte DAYS_PER_WEEK = 7;
public static final long HOURS_PER_DAY = 24L;
public static final long MINUTES_PER_HOUR = 60L;
public static final long SECONDS_PER_MINUTE = 60L;
// Computed time constants
public static final long SECONDS_PER_HOUR; // 3600
public static final long SECONDS_PER_DAY; // 86400
// Millisecond conversions
public static final long MILLIS_PER_SECOND = 1000L;
public static final long MILLIS_PER_MINUTE; // 60000
public static final long MILLIS_PER_HOUR; // 3600000
public static final long MILLIS_PER_DAY; // 86400000
// Microsecond conversions
public static final long MICROS_PER_MILLIS = 1000L;
public static final long MICROS_PER_SECOND; // 1000000
public static final long MICROS_PER_MINUTE; // 60000000
public static final long MICROS_PER_HOUR; // 3600000000
public static final long MICROS_PER_DAY; // 86400000000
// Nanosecond conversions
public static final long NANOS_PER_MICROS = 1000L;
public static final long NANOS_PER_MILLIS; // 1000000
public static final long NANOS_PER_SECOND; // 1000000000
}

Usage Examples:
import org.apache.spark.sql.catalyst.util.DateTimeConstants;
// Time calculations using constants
long currentTimeMillis = System.currentTimeMillis();
// Convert to different units
long currentTimeSeconds = currentTimeMillis / DateTimeConstants.MILLIS_PER_SECOND;
long currentTimeMicros = currentTimeMillis * DateTimeConstants.MICROS_PER_MILLIS;
long currentTimeNanos = currentTimeMillis * DateTimeConstants.NANOS_PER_MILLIS;
// Calculate time spans
long hoursInWeek = DateTimeConstants.DAYS_PER_WEEK * DateTimeConstants.HOURS_PER_DAY;
long secondsInWeek = hoursInWeek * DateTimeConstants.SECONDS_PER_HOUR;
// Duration calculations
long durationDays = 5;
long durationMillis = durationDays * DateTimeConstants.MILLIS_PER_DAY;
long durationMicros = durationDays * DateTimeConstants.MICROS_PER_DAY;
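These multiplications wrap silently if the operands are large enough to overflow a long; Math.multiplyExact raises ArithmeticException instead, which is safer for user-supplied durations. A defensive sketch:
long checkedMicros = Math.multiplyExact(durationDays, DateTimeConstants.MICROS_PER_DAY); // throws on overflow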
// Conversion helpers
public static long millisToMicros(long millis) {
return millis * DateTimeConstants.MICROS_PER_MILLIS;
}
public static long microsToNanos(long micros) {
return micros * DateTimeConstants.NANOS_PER_MICROS;
}
public static long secondsToMillis(long seconds) {
return seconds * DateTimeConstants.MILLIS_PER_SECOND;
}

Hive-compatible hashing functions for maintaining compatibility with Hive v1.2.1 hashing behavior.
/**
* Simulates Hive's hashing function from Hive v1.2.1 for compatibility
*/
class HiveHasher {
/**
* Hash integer using Hive-compatible algorithm
* @param input Integer to hash
* @return Hive-compatible hash value
*/
public static int hashInt(int input);
/**
* Hash long using Hive-compatible algorithm
* @param input Long to hash
* @return Hive-compatible hash value
*/
public static int hashLong(long input);
/**
* Hash byte array using Hive-compatible algorithm with unsafe access
* @param base Base object (null for off-heap)
* @param offset Offset within object or address
* @param lengthInBytes Number of bytes to hash
* @return Hive-compatible hash value
*/
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes);
/**
* Get string representation
* @return String representation of hasher
*/
public String toString();
}

Usage Examples:
import org.apache.spark.sql.catalyst.expressions.HiveHasher;
import org.apache.spark.unsafe.Platform;
// Hash integers for Hive compatibility
int intValue = 12345;
int hiveIntHash = HiveHasher.hashInt(intValue);
// Hash longs for Hive compatibility
long longValue = 123456789L;
int hiveLongHash = HiveHasher.hashLong(longValue);
// Hash byte arrays with Hive-compatible algorithm
byte[] data = "test data".getBytes(java.nio.charset.StandardCharsets.UTF_8); // explicit charset keeps hashes stable across platforms
int hiveArrayHash = HiveHasher.hashUnsafeBytes(
data,
Platform.BYTE_ARRAY_OFFSET,
data.length
);
// Use for partitioning compatibility with Hive tables
public int getHivePartition(Object value, int numPartitions) {
int hash;
if (value instanceof Integer) {
hash = HiveHasher.hashInt((Integer) value);
} else if (value instanceof Long) {
hash = HiveHasher.hashLong((Long) value);
} else {
byte[] bytes = value.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
hash = HiveHasher.hashUnsafeBytes(
bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length
);
}
// Math.abs(Integer.MIN_VALUE) is still negative; mask the sign bit instead
return (hash & Integer.MAX_VALUE) % numPartitions;
}

Platform-specific alignment handling for record length offsets, ensuring proper memory alignment across different architectures.
/**
* Handles platform-specific alignment for record length offsets
*/
class UnsafeAlignedOffset {
/**
* Set UAO size for testing purposes
* @param size UAO size to set
*/
public static void setUaoSize(int size);
/**
* Get current UAO size
* @return Current UAO size
*/
public static int getUaoSize();
/**
* Get size value considering platform alignment
* @param object Base object
* @param offset Offset within object
* @return Size value with proper alignment
*/
public static int getSize(Object object, long offset);
/**
* Put size value considering platform alignment
* @param object Base object
* @param offset Offset within object
* @param value Size value to store
*/
public static void putSize(Object object, long offset, int value);
}

Usage Examples:
import org.apache.spark.unsafe.UnsafeAlignedOffset;
import org.apache.spark.unsafe.Platform;
// Working with aligned record sizes
byte[] recordBuffer = new byte[1024];
long recordOffset = Platform.BYTE_ARRAY_OFFSET;
// Store record size with proper alignment
int recordSize = 256;
UnsafeAlignedOffset.putSize(recordBuffer, recordOffset, recordSize);
// Read record size with proper alignment
int storedSize = UnsafeAlignedOffset.getSize(recordBuffer, recordOffset);
// Check current alignment requirements
int uaoSize = UnsafeAlignedOffset.getUaoSize();
System.out.println("Current UAO size: " + uaoSize);
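A typical use is a size-prefixed record layout, where the payload begins getUaoSize() bytes past the length field. A hypothetical sketch (the record layout is illustrative, not a Spark API):
// Hypothetical layout: [length][payload...]
long payloadOffset = recordOffset + UnsafeAlignedOffset.getUaoSize();
Platform.putInt(recordBuffer, payloadOffset, 0xCAFE); // write first payload word
int length = UnsafeAlignedOffset.getSize(recordBuffer, recordOffset); // read the length field back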
// For testing different alignment scenarios
UnsafeAlignedOffset.setUaoSize(8); // Set 8-byte alignment for testing
// ... run tests ...
UnsafeAlignedOffset.setUaoSize(4); // Reset to 4-byte alignment

Specialized data type for representing calendar intervals with separate components for months, days, and microseconds.
/**
* Represents calendar intervals with months, days, and microseconds
* @Unstable - API may change in future versions
*/
final class CalendarInterval implements Serializable {
// Public fields for interval components
public final int months; // Number of months
public final int days; // Number of days
public final long microseconds; // Number of microseconds
/**
* Create calendar interval with specified components
* @param months Number of months
* @param days Number of days
* @param microseconds Number of microseconds
*/
public CalendarInterval(int months, int days, long microseconds);
/**
* Check equality with another object
* @param o Object to compare with
* @return true if equal
*/
public boolean equals(Object o);
/**
* Calculate hash code
* @return Hash code value
*/
public int hashCode();
/**
* Get string representation
* @return String representation of interval
*/
public String toString();
/**
* Extract interval as Java Period (months and days only)
* @return Java Period representation
*/
public Period extractAsPeriod();
/**
* Extract interval as Java Duration (microseconds only)
* @return Java Duration representation
*/
public Duration extractAsDuration();
}

Usage Examples:
import org.apache.spark.unsafe.types.CalendarInterval;
import java.time.Period;
import java.time.Duration;
// Create calendar intervals
CalendarInterval interval1 = new CalendarInterval(2, 15, 3600000000L); // 2 months, 15 days, 1 hour
CalendarInterval interval2 = new CalendarInterval(0, 0, 1500000L); // 1.5 seconds
CalendarInterval interval3 = new CalendarInterval(12, 0, 0L); // 1 year
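Because the three components are stored separately (a month or day has no fixed length in microseconds), equality is field-wise; a 1-day interval is not equal to a 24-hour interval:
CalendarInterval day1 = new CalendarInterval(0, 1, 0L); // 1 day
CalendarInterval hours24 = new CalendarInterval(0, 0, 86400000000L); // 24 hours
boolean sameInterval = day1.equals(hours24); // false: components differ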
// Working with interval components
int months = interval1.months;
int days = interval1.days;
long microseconds = interval1.microseconds;
// Convert to Java time types
Period period = interval1.extractAsPeriod(); // 2 months, 15 days
Duration duration = interval1.extractAsDuration(); // 1 hour
// Comparison and equality
boolean areEqual = interval1.equals(interval2);
int hashCode = interval1.hashCode();
String description = interval1.toString();
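CalendarInterval is immutable, so interval arithmetic means constructing a new instance. A hypothetical helper (not part of the class) that adds two intervals component-wise:
public static CalendarInterval add(CalendarInterval a, CalendarInterval b) {
return new CalendarInterval(
a.months + b.months,
a.days + b.days,
a.microseconds + b.microseconds);
}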
// Common use cases
public static CalendarInterval oneHour() {
return new CalendarInterval(0, 0, DateTimeConstants.MICROS_PER_HOUR);
}
public static CalendarInterval oneDay() {
return new CalendarInterval(0, 1, 0L);
}
public static CalendarInterval oneMonth() {
return new CalendarInterval(1, 0, 0L);
}

Additional usage notes:
// For general-purpose hashing (faster)
int hash1 = Murmur3_x86_32.hashInt(value, seed);
// For Hive compatibility (when interfacing with Hive)
int hash2 = HiveHasher.hashInt(value);
// For consistent partitioning across Spark and Hive
// Mask the sign bit: Math.abs(Integer.MIN_VALUE) is still negative
int partition = (HiveHasher.hashInt(key) & Integer.MAX_VALUE) % numPartitions;

// BitSet requires word-aligned memory
int numBits = 1000;
int numWords = (numBits + 63) / 64; // Round up to words
long[] bitsetStorage = new long[numWords];
// Always specify correct word count
boolean anySet = BitSetMethods.anySet(
bitsetStorage,
Platform.LONG_ARRAY_OFFSET,
numWords // Important: use actual word count
);

// Pre-calculate commonly used values
private static final long MICROS_PER_WEEK =
DateTimeConstants.DAYS_PER_WEEK * DateTimeConstants.MICROS_PER_DAY;
// Use constants for efficient conversions
public long convertDaysToMicros(int days) {
return days * DateTimeConstants.MICROS_PER_DAY;
}