or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

arrays.mdhashing-bitsets.mdindex.mdintervals.mdmemory.mdplatform.mdutf8-strings.md
tile.json

hashing-bitsets.mddocs/

Hashing and Bitsets

Spark Unsafe provides high-performance hashing functions and bitset manipulation utilities optimized for data processing workloads. The hashing functionality includes Murmur3 implementation, while the bitset operations provide efficient manipulation of packed bit arrays.

Core Imports

import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.bitset.BitSetMethods;

Usage Examples

Murmur3 Hashing

// Create hasher with specific seed
Murmur3_x86_32 hasher = new Murmur3_x86_32(42);

// Hash integers
int intHash = hasher.hashInt(12345);
int longHash = hasher.hashLong(123456789L);

// Static hashing methods (convenient for one-off operations)
int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);
int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);

// Hash word-aligned memory regions
byte[] data = "Hello, World!".getBytes(StandardCharsets.UTF_8);
int wordsHash = Murmur3_x86_32.hashUnsafeWords(
    data, Platform.BYTE_ARRAY_OFFSET, data.length, 42
);

// Hash arbitrary byte sequences (improved method)
int bytesHash = Murmur3_x86_32.hashUnsafeBytes2(
    data, Platform.BYTE_ARRAY_OFFSET, data.length, 42
);

Bitset Operations

// Allocate memory for bitset (assume 64 bits = 8 bytes = 1 word)
MemoryAllocator allocator = MemoryAllocator.HEAP;
MemoryBlock bitsetMemory = allocator.allocate(8);
Object baseObject = bitsetMemory.getBaseObject();
long baseOffset = bitsetMemory.getBaseOffset();

try {
    // Clear the bitset initially
    Platform.setMemory(baseObject, baseOffset, 8, (byte) 0);
    
    // Set individual bits
    BitSetMethods.set(baseObject, baseOffset, 5);   // Set bit 5
    BitSetMethods.set(baseObject, baseOffset, 10);  // Set bit 10
    BitSetMethods.set(baseObject, baseOffset, 15);  // Set bit 15
    
    // Check if bits are set
    boolean bit5Set = BitSetMethods.isSet(baseObject, baseOffset, 5);   // true
    boolean bit7Set = BitSetMethods.isSet(baseObject, baseOffset, 7);   // false
    
    // Check if any bits are set in the bitset
    boolean anySet = BitSetMethods.anySet(baseObject, baseOffset, 1);   // true (1 word)
    
    // Find next set bit from a given position
    int nextSet = BitSetMethods.nextSetBit(baseObject, baseOffset, 6, 1); // Returns 10
    int noMore = BitSetMethods.nextSetBit(baseObject, baseOffset, 16, 1); // Returns -1
    
    // Unset bits
    BitSetMethods.unset(baseObject, baseOffset, 10); // Clear bit 10
    boolean bit10Set = BitSetMethods.isSet(baseObject, baseOffset, 10); // false
    
} finally {
    allocator.free(bitsetMemory);
}

Advanced Bitset Usage

// Working with larger bitsets (multiple words)
int numWords = 4; // 4 * 64 = 256 bits
MemoryBlock largeBitset = allocator.allocate(numWords * 8);
Object base = largeBitset.getBaseObject();
long offset = largeBitset.getBaseOffset();

try {
    // Initialize bitset to all zeros
    Platform.setMemory(base, offset, numWords * 8, (byte) 0);
    
    // Set bits across different words
    BitSetMethods.set(base, offset, 63);   // Last bit of first word
    BitSetMethods.set(base, offset, 64);   // First bit of second word
    BitSetMethods.set(base, offset, 200);  // Bit in fourth word
    
    // Check if any bits are set across all words
    boolean anyBitsSet = BitSetMethods.anySet(base, offset, numWords);
    
    // Find all set bits
    int currentBit = -1;
    while ((currentBit = BitSetMethods.nextSetBit(base, offset, currentBit + 1, numWords)) != -1) {
        System.out.println("Bit " + currentBit + " is set");
    }
    
} finally {
    allocator.free(largeBitset);
}

API Reference

Murmur3_x86_32 Class

public final class Murmur3_x86_32 {
    /**
     * Creates Murmur3 hasher with specified seed value.
     */
    public Murmur3_x86_32(int seed);
    
    /**
     * Returns string representation of this hasher.
     */
    public String toString();
    
    /**
     * Computes Murmur3 hash of an integer value.
     */
    public int hashInt(int input);
    
    /**
     * Computes Murmur3 hash of word-aligned data.
     * Data length must be multiple of 4 bytes.
     */
    public int hashUnsafeWords(Object base, long offset, int lengthInBytes);
    
    /**
     * Computes Murmur3 hash of a long value.
     */
    public int hashLong(long input);
}

Murmur3_x86_32 Static Methods

/**
 * Static version of integer hashing with explicit seed.
 */
public static int hashInt(int input, int seed);

/**
 * Static version of word-aligned hashing with explicit seed.
 */
public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);

/**
 * Legacy byte-level hashing method (deprecated, use hashUnsafeBytes2).
 */
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);

/**
 * Improved byte-level hashing method for arbitrary data.
 */
public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);

/**
 * Static version of long hashing with explicit seed.
 */
public static int hashLong(long input, int seed);

BitSetMethods Class

public final class BitSetMethods {
    /**
     * Sets the bit at the specified index to true.
     * 
     * @param baseObject Base object containing the bitset
     * @param baseOffset Offset to the start of the bitset
     * @param index Bit index to set (0-based)
     */
    public static void set(Object baseObject, long baseOffset, int index);
    
    /**
     * Sets the bit at the specified index to false.
     * 
     * @param baseObject Base object containing the bitset
     * @param baseOffset Offset to the start of the bitset
     * @param index Bit index to unset (0-based)
     */
    public static void unset(Object baseObject, long baseOffset, int index);
    
    /**
     * Returns true if the bit at the specified index is set.
     * 
     * @param baseObject Base object containing the bitset
     * @param baseOffset Offset to the start of the bitset
     * @param index Bit index to check (0-based)
     * @return true if bit is set, false otherwise
     */
    public static boolean isSet(Object baseObject, long baseOffset, int index);
    
    /**
     * Returns true if any bit is set in the bitset.
     * 
     * @param baseObject Base object containing the bitset
     * @param baseOffset Offset to the start of the bitset
     * @param bitSetWidthInWords Width of bitset in 64-bit words
     * @return true if any bit is set, false if all bits are clear
     */
    public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);
    
    /**
     * Finds the next set bit starting from the specified index.
     * 
     * @param baseObject Base object containing the bitset
     * @param baseOffset Offset to the start of the bitset
     * @param fromIndex Starting bit index for search (inclusive)
     * @param bitsetSizeInWords Size of bitset in 64-bit words
     * @return Index of next set bit, or -1 if no set bit found
     */
    public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);
}

Performance Characteristics

Murmur3 Hashing

  1. Fast Distribution: Murmur3 provides good hash distribution with excellent performance.
  2. Word-Aligned Optimization: hashUnsafeWords is optimized for 4-byte aligned data.
  3. Seed Support: Allows different hash families with different seed values.
  4. Platform Optimized: x86_32 implementation optimized for 32-bit hash output.

Hive Compatibility

  1. Legacy Support: Provides Hive-compatible hashing for data migration scenarios.
  2. Simple Algorithms: Uses simple XOR-based hashing for predictable results.
  3. SQL Integration: Designed for Spark SQL compatibility with Hive.

Bitset Operations

  1. Word-Level Operations: Operates on 64-bit words for maximum efficiency.
  2. Cache Friendly: Sequential bit access patterns optimize cache usage.
  3. Minimal Overhead: Direct memory operations with no object allocation.

Usage Notes

  1. Memory Alignment: Bitsets should be properly aligned for optimal performance.

  2. Thread Safety: None of these classes provide thread safety guarantees.

  3. Memory Management: Bitset operations work on raw memory - ensure proper lifecycle management.

  4. Bit Indexing: All bit operations use 0-based indexing.

  5. Word Boundaries: Bitset operations are optimized for word (64-bit) boundaries.

  6. Hash Collision: While Murmur3 has good distribution, consider collision handling in hash tables.

  7. Seed Selection: Choose hash seeds carefully to avoid predictable hash patterns.

Common Patterns

Hash Table Implementation

// Use Murmur3 for hash table operations
public class HashTable<K, V> {
    private static final int DEFAULT_SEED = 42;
    private final Murmur3_x86_32 hasher = new Murmur3_x86_32(DEFAULT_SEED);
    
    private int hash(K key) {
        if (key instanceof Integer) {
            return hasher.hashInt((Integer) key);
        } else if (key instanceof Long) {
            return hasher.hashLong((Long) key);
        } else {
            byte[] bytes = key.toString().getBytes(StandardCharsets.UTF_8);
            return Murmur3_x86_32.hashUnsafeBytes2(
                bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length, DEFAULT_SEED
            );
        }
    }
}

Efficient Bloom Filter

// Implement Bloom filter using bitset operations
public class BloomFilter {
    private final MemoryBlock bitset;
    private final Object baseObject;
    private final long baseOffset;
    private final int bitsetSizeInWords;
    private final int[] hashSeeds;
    
    public BloomFilter(int sizeInBits, int numHashFunctions) {
        int sizeInBytes = (sizeInBits + 7) / 8; // Round up to byte boundary
        this.bitsetSizeInWords = (sizeInBytes + 7) / 8; // Round up to word boundary
        
        MemoryAllocator allocator = MemoryAllocator.HEAP;
        this.bitset = allocator.allocate(bitsetSizeInWords * 8);
        this.baseObject = bitset.getBaseObject();
        this.baseOffset = bitset.getBaseOffset();
        
        // Clear bitset
        Platform.setMemory(baseObject, baseOffset, bitsetSizeInWords * 8, (byte) 0);
        
        // Initialize hash seeds
        this.hashSeeds = new int[numHashFunctions];
        for (int i = 0; i < numHashFunctions; i++) {
            this.hashSeeds[i] = i * 97; // Different seeds for each hash function
        }
    }
    
    public void add(int value) {
        for (int seed : hashSeeds) {
            int hash = Murmur3_x86_32.hashInt(value, seed);
            int bitIndex = Math.abs(hash) % (bitsetSizeInWords * 64);
            BitSetMethods.set(baseObject, baseOffset, bitIndex);
        }
    }
    
    public boolean mightContain(int value) {
        for (int seed : hashSeeds) {
            int hash = Murmur3_x86_32.hashInt(value, seed);
            int bitIndex = Math.abs(hash) % (bitsetSizeInWords * 64);
            if (!BitSetMethods.isSet(baseObject, baseOffset, bitIndex)) {
                return false;
            }
        }
        return true;
    }
}