Fast hash function implementations and bitset manipulation methods for efficient data processing and boolean operations. Together they provide the high-performance hashing and bitset primitives that underpin data structures and algorithms in distributed computing.
32-bit Murmur3 hash function implementation providing fast, high-quality hashing for data processing applications.
public final class Murmur3_x86_32 {
public Murmur3_x86_32(int seed);
// Instance methods
public String toString();
public int hashInt(int input);
public int hashUnsafeWords(Object base, long offset, int lengthInBytes);
public int hashLong(long input);
// Static methods
public static int hashInt(int input, int seed);
public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);
public static int hashLong(long input, int seed);
}

Hash function implementation that simulates Hive's hash function for compatibility with Hive v1.2.1, ensuring consistent hashing across Spark and Hive environments.
public class HiveHasher {
public String toString();
public static int hashInt(int input);
public static int hashLong(long input);
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes);
}

Methods for working with fixed-size uncompressed bitsets stored in memory, providing efficient boolean array operations.
public final class BitSetMethods {
public static void set(Object baseObject, long baseOffset, int index);
public static void unset(Object baseObject, long baseOffset, int index);
public static boolean isSet(Object baseObject, long baseOffset, int index);
public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);
public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);
}

import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.Platform;
// Create hasher with specific seed
Murmur3_x86_32 hasher = new Murmur3_x86_32(42);
// Hash different data types
int intHash = hasher.hashInt(12345);
int longHash = hasher.hashLong(123456789L); // hashLong returns a 32-bit hash
// Hash word-aligned memory regions (lengthInBytes must be a multiple of 8 bytes)
byte[] wordAlignedData = new byte[16];
int memoryHash = hasher.hashUnsafeWords(
    wordAlignedData,
    Platform.BYTE_ARRAY_OFFSET,
    wordAlignedData.length
);
// Static methods with custom seed
int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);
int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);
// Hash arbitrary byte sequences (no alignment requirement)
byte[] data = "Hello World".getBytes();
int byteHash = Murmur3_x86_32.hashUnsafeBytes2(
    data,
    Platform.BYTE_ARRAY_OFFSET,
    data.length,
    42
);
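
The instance methods use the seed supplied at construction, so they are expected to agree with the static overloads when the same seed is passed explicitly. A minimal sanity-check sketch (variable names are illustrative, not part of the API):

import org.apache.spark.unsafe.hash.Murmur3_x86_32;
// Instance calls with a stored seed should match static calls with that seed.
Murmur3_x86_32 seededHasher = new Murmur3_x86_32(42);
boolean sameIntHash = seededHasher.hashInt(12345) == Murmur3_x86_32.hashInt(12345, 42);
boolean sameLongHash = seededHasher.hashLong(123456789L) == Murmur3_x86_32.hashLong(123456789L, 42);
// Changing the seed changes the hash for most inputs.
boolean seedMatters = Murmur3_x86_32.hashInt(12345, 42) != Murmur3_x86_32.hashInt(12345, 7);
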
import org.apache.spark.sql.catalyst.expressions.HiveHasher;
import org.apache.spark.unsafe.Platform;
// Hash integers (returns the input unchanged, for Hive compatibility)
int intHash = HiveHasher.hashInt(12345); // Returns 12345
// Hash longs using XOR of upper and lower 32 bits
long longValue = 0x123456789ABCDEFL;
int longHash = HiveHasher.hashLong(longValue);
// Hash byte sequences using rolling hash algorithm
byte[] data = "test data".getBytes();
int byteHash = HiveHasher.hashUnsafeBytes(
    data,
    Platform.BYTE_ARRAY_OFFSET,
    data.length
);
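
Since HiveHasher mirrors Hive's hashing, one common use is assigning rows to Hive-style buckets. The sketch below assumes the usual non-negative-remainder bucketing scheme; numBuckets is a hypothetical value:

import org.apache.spark.sql.catalyst.expressions.HiveHasher;
// Illustrative bucketing sketch: map a hashed key to one of numBuckets
// buckets using a non-negative remainder.
int numBuckets = 32;                       // hypothetical bucket count
int keyHash = HiveHasher.hashInt(12345);
int bucketId = (keyHash & Integer.MAX_VALUE) % numBuckets;
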
import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.Platform;
// Allocate memory for bitset (e.g., 64 bits = 1 word)
long bitsetAddress = Platform.allocateMemory(8);
Platform.setMemory(bitsetAddress, (byte) 0, 8); // Initialize to zeros
// Set individual bits
BitSetMethods.set(null, bitsetAddress, 5); // Set bit 5
BitSetMethods.set(null, bitsetAddress, 10); // Set bit 10
BitSetMethods.set(null, bitsetAddress, 15); // Set bit 15
// Check if bits are set
boolean bit5Set = BitSetMethods.isSet(null, bitsetAddress, 5); // true
boolean bit7Set = BitSetMethods.isSet(null, bitsetAddress, 7); // false
// Unset bits
BitSetMethods.unset(null, bitsetAddress, 10); // Clear bit 10
boolean bit10Set = BitSetMethods.isSet(null, bitsetAddress, 10); // false
// Check if any bits are set in the bitset
boolean anyBitsSet = BitSetMethods.anySet(null, bitsetAddress, 1); // true (bits 5 and 15 are set)
// Find next set bit starting from a given index
int nextBit = BitSetMethods.nextSetBit(null, bitsetAddress, 0, 1); // Returns 5
int nextBit2 = BitSetMethods.nextSetBit(null, bitsetAddress, 6, 1); // Returns 15
// Clean up
Platform.freeMemory(bitsetAddress);
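
Off-heap memory obtained from Platform.allocateMemory is not garbage collected, so a defensive pattern (a sketch, not required by the API) is to release it in a finally block:

import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.Platform;
// Always pair allocateMemory with freeMemory, even if an exception is thrown.
long address = Platform.allocateMemory(8);
try {
    Platform.setMemory(address, (byte) 0, 8);   // zero the word before use
    BitSetMethods.set(null, address, 3);
    // ... use the bitset ...
} finally {
    Platform.freeMemory(address);
}
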
import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.Platform;
// Use long array as bitset storage
long[] bitsetArray = new long[2]; // 128 bits
// Set bits using array as backing store
BitSetMethods.set(bitsetArray, Platform.LONG_ARRAY_OFFSET, 65); // Set bit 65 (in second long)
BitSetMethods.set(bitsetArray, Platform.LONG_ARRAY_OFFSET, 120); // Set bit 120
// Check bits
boolean bit65Set = BitSetMethods.isSet(bitsetArray, Platform.LONG_ARRAY_OFFSET, 65); // true
// Check if any bits are set in the entire bitset
boolean anySet = BitSetMethods.anySet(bitsetArray, Platform.LONG_ARRAY_OFFSET, 2); // true
// Find next set bit
int nextSetBit = BitSetMethods.nextSetBit(bitsetArray, Platform.LONG_ARRAY_OFFSET, 0, 2); // 65
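
nextSetBit returns -1 once no set bit remains at or after fromIndex, which makes it easy to iterate over every set bit. A small sketch over the usual long[] backing store:

import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.Platform;
// Iterate over all set bits; nextSetBit returns -1 when none remain.
long[] words = new long[1];                 // 64-bit bitset
BitSetMethods.set(words, Platform.LONG_ARRAY_OFFSET, 3);
BitSetMethods.set(words, Platform.LONG_ARRAY_OFFSET, 42);
for (int i = BitSetMethods.nextSetBit(words, Platform.LONG_ARRAY_OFFSET, 0, 1);
     i >= 0;
     i = BitSetMethods.nextSetBit(words, Platform.LONG_ARRAY_OFFSET, i + 1, 1)) {
    System.out.println("bit " + i + " is set");  // prints 3, then 42
}
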
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.Platform;
// Create a bloom filter-like structure
long[] bitset = new long[16]; // 1024 bits
// Two different seeds give two independent hash functions
int seed1 = 42;
int seed2 = 123;
// Insert element
String element = "example";
byte[] elementBytes = element.getBytes();
// hashUnsafeBytes2 accepts arbitrary lengths; hashUnsafeWords would require a multiple of 8 bytes
int hash1 = Murmur3_x86_32.hashUnsafeBytes2(elementBytes, Platform.BYTE_ARRAY_OFFSET, elementBytes.length, seed1);
int hash2 = Murmur3_x86_32.hashUnsafeBytes2(elementBytes, Platform.BYTE_ARRAY_OFFSET, elementBytes.length, seed2);
int bit1 = Math.floorMod(hash1, 1024);    // non-negative index into the 1024-bit bitset
int bit2 = Math.floorMod(hash2, 1024);
BitSetMethods.set(bitset, Platform.LONG_ARRAY_OFFSET, bit1);
BitSetMethods.set(bitset, Platform.LONG_ARRAY_OFFSET, bit2);
// Check if the element might be present (false positives are possible; false negatives are not)
boolean mightContain = BitSetMethods.isSet(bitset, Platform.LONG_ARRAY_OFFSET, bit1) &&
    BitSetMethods.isSet(bitset, Platform.LONG_ARRAY_OFFSET, bit2);
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.Platform;
// For word-aligned data, use hashUnsafeWords for better performance
byte[] alignedData = new byte[16]; // Multiple of 8 bytes (word-aligned)
int wordHash = Murmur3_x86_32.hashUnsafeWords(
    alignedData,
    Platform.BYTE_ARRAY_OFFSET,
    alignedData.length,
    42
);
// For arbitrary byte sequences, use hashUnsafeBytes2
byte[] arbitraryData = new byte[15]; // Not word-aligned
int byteHash = Murmur3_x86_32.hashUnsafeBytes2(
    arbitraryData,
    Platform.BYTE_ARRAY_OFFSET,
    arbitraryData.length,
    42
);
// Reuse hasher instances when hashing multiple values with the same seed
Murmur3_x86_32 reusableHasher = new Murmur3_x86_32(42);
int[] values = {1, 2, 3, 4, 5};
for (int value : values) {
    int hash = reusableHasher.hashInt(value);
    // Process hash
}
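
As a closing illustration of the distributed-computing use case mentioned at the top of this section, a hashed key can be mapped to a partition with a non-negative remainder; numPartitions below is a hypothetical value:

import org.apache.spark.unsafe.hash.Murmur3_x86_32;
// Hash partitioning sketch: derive a stable, non-negative partition id from a key hash.
int numPartitions = 200;                    // hypothetical partition count
int keyHash = Murmur3_x86_32.hashLong(123456789L, 42);
int partitionId = Math.floorMod(keyHash, numPartitions);
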