Spark Unsafe provides high-performance hashing functions and bitset manipulation utilities optimized for data processing workloads. The hashing functionality includes Murmur3 implementation, while the bitset operations provide efficient manipulation of packed bit arrays.
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.bitset.BitSetMethods;// Create hasher with specific seed
Murmur3_x86_32 hasher = new Murmur3_x86_32(42);
// Hash integers
int intHash = hasher.hashInt(12345);
int longHash = hasher.hashLong(123456789L);
// Static hashing methods (convenient for one-off operations)
int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);
int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);
// Hash word-aligned memory regions
byte[] data = "Hello, World!".getBytes(StandardCharsets.UTF_8);
int wordsHash = Murmur3_x86_32.hashUnsafeWords(
data, Platform.BYTE_ARRAY_OFFSET, data.length, 42
);
// Hash arbitrary byte sequences (improved method)
int bytesHash = Murmur3_x86_32.hashUnsafeBytes2(
data, Platform.BYTE_ARRAY_OFFSET, data.length, 42
);// Allocate memory for bitset (assume 64 bits = 8 bytes = 1 word)
MemoryAllocator allocator = MemoryAllocator.HEAP;
MemoryBlock bitsetMemory = allocator.allocate(8);
Object baseObject = bitsetMemory.getBaseObject();
long baseOffset = bitsetMemory.getBaseOffset();
try {
// Clear the bitset initially
Platform.setMemory(baseObject, baseOffset, 8, (byte) 0);
// Set individual bits
BitSetMethods.set(baseObject, baseOffset, 5); // Set bit 5
BitSetMethods.set(baseObject, baseOffset, 10); // Set bit 10
BitSetMethods.set(baseObject, baseOffset, 15); // Set bit 15
// Check if bits are set
boolean bit5Set = BitSetMethods.isSet(baseObject, baseOffset, 5); // true
boolean bit7Set = BitSetMethods.isSet(baseObject, baseOffset, 7); // false
// Check if any bits are set in the bitset
boolean anySet = BitSetMethods.anySet(baseObject, baseOffset, 1); // true (1 word)
// Find next set bit from a given position
int nextSet = BitSetMethods.nextSetBit(baseObject, baseOffset, 6, 1); // Returns 10
int noMore = BitSetMethods.nextSetBit(baseObject, baseOffset, 16, 1); // Returns -1
// Unset bits
BitSetMethods.unset(baseObject, baseOffset, 10); // Clear bit 10
boolean bit10Set = BitSetMethods.isSet(baseObject, baseOffset, 10); // false
} finally {
allocator.free(bitsetMemory);
}// Working with larger bitsets (multiple words)
int numWords = 4; // 4 * 64 = 256 bits
MemoryBlock largeBitset = allocator.allocate(numWords * 8);
Object base = largeBitset.getBaseObject();
long offset = largeBitset.getBaseOffset();
try {
// Initialize bitset to all zeros
Platform.setMemory(base, offset, numWords * 8, (byte) 0);
// Set bits across different words
BitSetMethods.set(base, offset, 63); // Last bit of first word
BitSetMethods.set(base, offset, 64); // First bit of second word
BitSetMethods.set(base, offset, 200); // Bit in fourth word
// Check if any bits are set across all words
boolean anyBitsSet = BitSetMethods.anySet(base, offset, numWords);
// Find all set bits
int currentBit = -1;
while ((currentBit = BitSetMethods.nextSetBit(base, offset, currentBit + 1, numWords)) != -1) {
System.out.println("Bit " + currentBit + " is set");
}
} finally {
allocator.free(largeBitset);
}public final class Murmur3_x86_32 {
/**
* Creates Murmur3 hasher with specified seed value.
*/
public Murmur3_x86_32(int seed);
/**
* Returns string representation of this hasher.
*/
public String toString();
/**
* Computes Murmur3 hash of an integer value.
*/
public int hashInt(int input);
/**
* Computes Murmur3 hash of word-aligned data.
* Data length must be multiple of 4 bytes.
*/
public int hashUnsafeWords(Object base, long offset, int lengthInBytes);
/**
* Computes Murmur3 hash of a long value.
*/
public int hashLong(long input);
}/**
* Static version of integer hashing with explicit seed.
*/
public static int hashInt(int input, int seed);
/**
* Static version of word-aligned hashing with explicit seed.
*/
public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);
/**
* Legacy byte-level hashing method (deprecated, use hashUnsafeBytes2).
*/
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
/**
* Improved byte-level hashing method for arbitrary data.
*/
public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);
/**
* Static version of long hashing with explicit seed.
*/
public static int hashLong(long input, int seed);public final class BitSetMethods {
/**
* Sets the bit at the specified index to true.
*
* @param baseObject Base object containing the bitset
* @param baseOffset Offset to the start of the bitset
* @param index Bit index to set (0-based)
*/
public static void set(Object baseObject, long baseOffset, int index);
/**
* Sets the bit at the specified index to false.
*
* @param baseObject Base object containing the bitset
* @param baseOffset Offset to the start of the bitset
* @param index Bit index to unset (0-based)
*/
public static void unset(Object baseObject, long baseOffset, int index);
/**
* Returns true if the bit at the specified index is set.
*
* @param baseObject Base object containing the bitset
* @param baseOffset Offset to the start of the bitset
* @param index Bit index to check (0-based)
* @return true if bit is set, false otherwise
*/
public static boolean isSet(Object baseObject, long baseOffset, int index);
/**
* Returns true if any bit is set in the bitset.
*
* @param baseObject Base object containing the bitset
* @param baseOffset Offset to the start of the bitset
* @param bitSetWidthInWords Width of bitset in 64-bit words
* @return true if any bit is set, false if all bits are clear
*/
public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);
/**
* Finds the next set bit starting from the specified index.
*
* @param baseObject Base object containing the bitset
* @param baseOffset Offset to the start of the bitset
* @param fromIndex Starting bit index for search (inclusive)
* @param bitsetSizeInWords Size of bitset in 64-bit words
* @return Index of next set bit, or -1 if no set bit found
*/
public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);
}hashUnsafeWords is optimized for 4-byte aligned data.Memory Alignment: Bitsets should be properly aligned for optimal performance.
Thread Safety: None of these classes provide thread safety guarantees.
Memory Management: Bitset operations work on raw memory - ensure proper lifecycle management.
Bit Indexing: All bit operations use 0-based indexing.
Word Boundaries: Bitset operations are optimized for word (64-bit) boundaries.
Hash Collision: While Murmur3 has good distribution, consider collision handling in hash tables.
Seed Selection: Choose hash seeds carefully to avoid predictable hash patterns.
// Use Murmur3 for hash table operations
public class HashTable<K, V> {
private static final int DEFAULT_SEED = 42;
private final Murmur3_x86_32 hasher = new Murmur3_x86_32(DEFAULT_SEED);
private int hash(K key) {
if (key instanceof Integer) {
return hasher.hashInt((Integer) key);
} else if (key instanceof Long) {
return hasher.hashLong((Long) key);
} else {
byte[] bytes = key.toString().getBytes(StandardCharsets.UTF_8);
return Murmur3_x86_32.hashUnsafeBytes2(
bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length, DEFAULT_SEED
);
}
}
}// Implement Bloom filter using bitset operations
public class BloomFilter {
private final MemoryBlock bitset;
private final Object baseObject;
private final long baseOffset;
private final int bitsetSizeInWords;
private final int[] hashSeeds;
public BloomFilter(int sizeInBits, int numHashFunctions) {
int sizeInBytes = (sizeInBits + 7) / 8; // Round up to byte boundary
this.bitsetSizeInWords = (sizeInBytes + 7) / 8; // Round up to word boundary
MemoryAllocator allocator = MemoryAllocator.HEAP;
this.bitset = allocator.allocate(bitsetSizeInWords * 8);
this.baseObject = bitset.getBaseObject();
this.baseOffset = bitset.getBaseOffset();
// Clear bitset
Platform.setMemory(baseObject, baseOffset, bitsetSizeInWords * 8, (byte) 0);
// Initialize hash seeds
this.hashSeeds = new int[numHashFunctions];
for (int i = 0; i < numHashFunctions; i++) {
this.hashSeeds[i] = i * 97; // Different seeds for each hash function
}
}
public void add(int value) {
for (int seed : hashSeeds) {
int hash = Murmur3_x86_32.hashInt(value, seed);
int bitIndex = Math.abs(hash) % (bitsetSizeInWords * 64);
BitSetMethods.set(baseObject, baseOffset, bitIndex);
}
}
public boolean mightContain(int value) {
for (int seed : hashSeeds) {
int hash = Murmur3_x86_32.hashInt(value, seed);
int bitIndex = Math.abs(hash) % (bitsetSizeInWords * 64);
if (!BitSetMethods.isSet(baseObject, baseOffset, bitIndex)) {
return false;
}
}
return true;
}
}