or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

array-operations.mddata-types-utilities.mdhash-bitset-operations.mdindex.mdmemory-management.mdplatform-operations.mdutf8-string-processing.md
tile.json

hash-bitset-operations.mddocs/

Hash Functions and Bitset Operations

Fast hash function implementations and bitset manipulation methods for efficient data processing and boolean operations. These operations provide high-performance hashing and bitset manipulation capabilities essential for data structures and algorithms in distributed computing.

Capabilities

Murmur3 Hash Function

32-bit Murmur3 hash function implementation providing fast, high-quality hashing for data processing applications.

public final class Murmur3_x86_32 {
    public Murmur3_x86_32(int seed);
    
    // Instance methods
    public String toString();
    public int hashInt(int input);
    public int hashUnsafeWords(Object base, long offset, int lengthInBytes);
    public int hashLong(long input);
    
    // Static methods
    public static int hashInt(int input, int seed);
    public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);
    public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
    public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);
    public static int hashLong(long input, int seed);
}

Hive-Compatible Hash Function

Hash function implementation that simulates Hive's hash function for compatibility with Hive v1.2.1, ensuring consistent hashing across Spark and Hive environments.

public class HiveHasher {
    public String toString();
    
    public static int hashInt(int input);
    public static int hashLong(long input);
    public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes);
}

Bitset Operations

Methods for working with fixed-size uncompressed bitsets stored in memory, providing efficient boolean array operations.

public final class BitSetMethods {
    public static void set(Object baseObject, long baseOffset, int index);
    public static void unset(Object baseObject, long baseOffset, int index);
    public static boolean isSet(Object baseObject, long baseOffset, int index);
    public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);
    public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);
}

Usage Examples

Murmur3 Hashing

import org.apache.spark.unsafe.hash.Murmur3_x86_32;

// Create hasher with specific seed
Murmur3_x86_32 hasher = new Murmur3_x86_32(42);

// Hash different data types
int intHash = hasher.hashInt(12345);
long longHash = hasher.hashLong(123456789L);

// Hash memory regions (word-aligned for optimal performance)
byte[] data = "Hello World".getBytes();
int memoryHash = hasher.hashUnsafeWords(
    data, 
    Platform.BYTE_ARRAY_OFFSET, 
    data.length
);

// Static methods with custom seed
int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);
int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);

// Hash arbitrary byte sequences
int byteHash = Murmur3_x86_32.hashUnsafeBytes2(
    data,
    Platform.BYTE_ARRAY_OFFSET,
    data.length,
    42
);

Hive-Compatible Hashing

import org.apache.spark.sql.catalyst.expressions.HiveHasher;

// Hash integers (returns input unchanged for compatibility)
int intHash = HiveHasher.hashInt(12345);  // Returns 12345

// Hash longs using XOR of upper and lower 32 bits
long longValue = 0x123456789ABCDEFL;
int longHash = HiveHasher.hashLong(longValue);

// Hash byte sequences using rolling hash algorithm
byte[] data = "test data".getBytes();
int byteHash = HiveHasher.hashUnsafeBytes(
    data,
    Platform.BYTE_ARRAY_OFFSET,
    data.length
);

Bitset Operations

import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.Platform;

// Allocate memory for bitset (e.g., 64 bits = 1 word)
long bitsetAddress = Platform.allocateMemory(8);
Platform.setMemory(bitsetAddress, (byte) 0, 8);  // Initialize to zeros

// Set individual bits
BitSetMethods.set(null, bitsetAddress, 5);   // Set bit 5
BitSetMethods.set(null, bitsetAddress, 10);  // Set bit 10
BitSetMethods.set(null, bitsetAddress, 15);  // Set bit 15

// Check if bits are set
boolean bit5Set = BitSetMethods.isSet(null, bitsetAddress, 5);    // true
boolean bit7Set = BitSetMethods.isSet(null, bitsetAddress, 7);    // false

// Unset bits
BitSetMethods.unset(null, bitsetAddress, 10);  // Clear bit 10
boolean bit10Set = BitSetMethods.isSet(null, bitsetAddress, 10);  // false

// Check if any bits are set in the bitset
boolean anyBitsSet = BitSetMethods.anySet(null, bitsetAddress, 1);  // true (bits 5 and 15 are set)

// Find next set bit starting from a given index
int nextBit = BitSetMethods.nextSetBit(null, bitsetAddress, 0, 1);   // Returns 5
int nextBit2 = BitSetMethods.nextSetBit(null, bitsetAddress, 6, 1);  // Returns 15

// Clean up
Platform.freeMemory(bitsetAddress);

Array-Based Bitset Operations

import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.Platform;

// Use long array as bitset storage
long[] bitsetArray = new long[2];  // 128 bits

// Set bits using array as backing store
BitSetMethods.set(bitsetArray, Platform.LONG_ARRAY_OFFSET, 65);  // Set bit 65 (in second long)
BitSetMethods.set(bitsetArray, Platform.LONG_ARRAY_OFFSET, 120); // Set bit 120

// Check bits
boolean bit65Set = BitSetMethods.isSet(bitsetArray, Platform.LONG_ARRAY_OFFSET, 65);  // true

// Check if any bits are set in the entire bitset
boolean anySet = BitSetMethods.anySet(bitsetArray, Platform.LONG_ARRAY_OFFSET, 2);  // true

// Find next set bit
int nextSetBit = BitSetMethods.nextSetBit(bitsetArray, Platform.LONG_ARRAY_OFFSET, 0, 2);  // 65

Combined Hash and Bitset Usage

import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.Platform;

// Create a bloom filter-like structure
long[] bitset = new long[16];  // 1024 bits
Murmur3_x86_32 hasher1 = new Murmur3_x86_32(42);
Murmur3_x86_32 hasher2 = new Murmur3_x86_32(123);

// Insert element
String element = "example";
byte[] elementBytes = element.getBytes();
int hash1 = Math.abs(hasher1.hashUnsafeWords(elementBytes, Platform.BYTE_ARRAY_OFFSET, elementBytes.length));
int hash2 = Math.abs(hasher2.hashUnsafeWords(elementBytes, Platform.BYTE_ARRAY_OFFSET, elementBytes.length));

int bit1 = hash1 % 1024;
int bit2 = hash2 % 1024;

BitSetMethods.set(bitset, Platform.LONG_ARRAY_OFFSET, bit1);
BitSetMethods.set(bitset, Platform.LONG_ARRAY_OFFSET, bit2);

// Check if element might be present
boolean mightContain = BitSetMethods.isSet(bitset, Platform.LONG_ARRAY_OFFSET, bit1) &&
                      BitSetMethods.isSet(bitset, Platform.LONG_ARRAY_OFFSET, bit2);

Performance Considerations

import org.apache.spark.unsafe.hash.Murmur3_x86_32;

// For word-aligned data, use hashUnsafeWords for better performance
byte[] alignedData = new byte[16];  // Multiple of 4 bytes
int wordHash = Murmur3_x86_32.hashUnsafeWords(
    alignedData, 
    Platform.BYTE_ARRAY_OFFSET, 
    alignedData.length, 
    42
);

// For arbitrary byte sequences, use hashUnsafeBytes2
byte[] arbitraryData = new byte[15];  // Not word-aligned
int byteHash = Murmur3_x86_32.hashUnsafeBytes2(
    arbitraryData,
    Platform.BYTE_ARRAY_OFFSET,
    arbitraryData.length,
    42
);

// Reuse hasher instances when hashing multiple values with the same seed
Murmur3_x86_32 reusableHasher = new Murmur3_x86_32(42);
int[] values = {1, 2, 3, 4, 5};
for (int value : values) {
    int hash = reusableHasher.hashInt(value);
    // Process hash
}