or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

array-operations.md, hashing-utilities.md, index.md, memory-management.md, platform-operations.md, utf8-string-operations.md
tile.json

docs/hashing-utilities.md

Hashing and Utilities

High-performance hashing implementations and utility classes including Murmur3 hashing, bitset operations, date/time constants, and Hive-compatible hashing for data distribution and compatibility requirements.

Capabilities

Murmur3 Hashing

Fast 32-bit Murmur3 hash implementation optimized for performance with support for different data types and memory layouts.

/**
 * 32-bit Murmur3 hasher implementation
 */
final class Murmur3_x86_32 {
    /**
     * Create hasher with specific seed
     * @param seed Seed value for hashing
     */
    public Murmur3_x86_32(int seed);
    
    /**
     * Get string representation of hasher
     * @return String representation
     */
    public String toString();
    
    // Instance methods using hasher's seed
    /**
     * Hash integer value using instance seed
     * @param input Integer to hash
     * @return Hash value
     */
    public int hashInt(int input);
    
    /**
     * Hash long value using instance seed
     * @param input Long to hash
     * @return Hash value
     */
    public int hashLong(long input);
    
    /**
     * Hash word-aligned bytes using instance seed
     * @param base Base object (null for off-heap)
     * @param offset Offset within object or address
     * @param lengthInBytes Number of bytes to hash (must be word-aligned)
     * @return Hash value
     */
    public int hashUnsafeWords(Object base, long offset, int lengthInBytes);
    
    // Static methods with explicit seed
    /**
     * Hash integer with provided seed
     * @param input Integer to hash
     * @param seed Seed value
     * @return Hash value
     */
    public static int hashInt(int input, int seed);
    
    /**
     * Hash long with provided seed
     * @param input Long to hash
     * @param seed Seed value
     * @return Hash value
     */
    public static int hashLong(long input, int seed);
    
    /**
     * Hash word-aligned bytes with provided seed
     * @param base Base object (null for off-heap)
     * @param offset Offset within object or address
     * @param lengthInBytes Number of bytes to hash (must be word-aligned)
     * @param seed Seed value
     * @return Hash value
     */
    public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);
    
    /**
     * Hash arbitrary bytes with provided seed (legacy method)
     * @param base Base object (null for off-heap)
     * @param offset Offset within object or address
     * @param lengthInBytes Number of bytes to hash
     * @param seed Seed value
     * @return Hash value
     */
    public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
    
    /**
     * Hash arbitrary bytes with provided seed (compatible method)
     * @param base Base object (null for off-heap)
     * @param offset Offset within object or address
     * @param lengthInBytes Number of bytes to hash
     * @param seed Seed value
     * @return Hash value
     */
    public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);
}

Usage Examples:

import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.Platform;

// Create hasher with seed
Murmur3_x86_32 hasher = new Murmur3_x86_32(42);

// Hash different data types using instance methods
int intHash = hasher.hashInt(12345);
int longHash = hasher.hashLong(123456789L);

// Hash byte arrays — hashUnsafeWords requires a word-aligned length
// (a multiple of 8 bytes); use hashUnsafeBytes for arbitrary lengths
byte[] data = new byte[16];
// ... fill data ...
int arrayHash = hasher.hashUnsafeWords(
    data, 
    Platform.BYTE_ARRAY_OFFSET, 
    data.length // must be a multiple of 8
);

// Use static methods with explicit seed
int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);
int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);

// Hash memory regions
byte[] buffer = new byte[1024];
// ... fill buffer ...
int bufferHash = Murmur3_x86_32.hashUnsafeBytes(
    buffer,
    Platform.BYTE_ARRAY_OFFSET,
    buffer.length,
    42
);

// Hash off-heap memory
long address = Platform.allocateMemory(100);
try {
    Platform.setMemory(address, (byte) 0xFF, 100);
    int offHeapHash = Murmur3_x86_32.hashUnsafeBytes(
        null, address, 100, 42
    );
} finally {
    Platform.freeMemory(address);
}

BitSet Operations

Methods for working with fixed-size uncompressed bitsets stored in memory, providing efficient bit manipulation operations.

/**
 * Methods for working with fixed-size uncompressed bitsets
 */
final class BitSetMethods {
    /**
     * Set bit at specified index
     * @param baseObject Base object (null for off-heap)
     * @param baseOffset Base offset or address
     * @param index Bit index to set
     */
    public static void set(Object baseObject, long baseOffset, int index);
    
    /**
     * Unset (clear) bit at specified index
     * @param baseObject Base object (null for off-heap)
     * @param baseOffset Base offset or address
     * @param index Bit index to unset
     */
    public static void unset(Object baseObject, long baseOffset, int index);
    
    /**
     * Check if bit is set at specified index
     * @param baseObject Base object (null for off-heap)
     * @param baseOffset Base offset or address
     * @param index Bit index to check
     * @return true if bit is set
     */
    public static boolean isSet(Object baseObject, long baseOffset, int index);
    
    /**
     * Check if any bit is set in the bitset
     * @param baseObject Base object (null for off-heap)
     * @param baseOffset Base offset or address
     * @param bitSetWidthInWords Bitset width in 64-bit words
     * @return true if any bit is set
     */
    public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);
    
    /**
     * Find next set bit starting from index
     * @param baseObject Base object (null for off-heap)
     * @param baseOffset Base offset or address
     * @param fromIndex Starting index for search
     * @param bitsetSizeInWords Bitset size in 64-bit words
     * @return Index of next set bit or -1 if not found
     */
    public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);
}

Usage Examples:

import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.Platform;

// Create bitset using long array (8 words = 512 bits)
long[] bitsetData = new long[8];
Object baseObj = bitsetData;
long baseOffset = Platform.LONG_ARRAY_OFFSET;

// Set some bits
BitSetMethods.set(baseObj, baseOffset, 10);
BitSetMethods.set(baseObj, baseOffset, 25);
BitSetMethods.set(baseObj, baseOffset, 100);

// Check if bits are set
boolean bit10Set = BitSetMethods.isSet(baseObj, baseOffset, 10); // true
boolean bit15Set = BitSetMethods.isSet(baseObj, baseOffset, 15); // false

// Check if any bits are set
boolean anySet = BitSetMethods.anySet(baseObj, baseOffset, 8); // true

// Find next set bit
int nextBit = BitSetMethods.nextSetBit(baseObj, baseOffset, 0, 8);  // 10
int afterTen = BitSetMethods.nextSetBit(baseObj, baseOffset, 11, 8); // 25

// Clear a bit
BitSetMethods.unset(baseObj, baseOffset, 25);
boolean bit25Set = BitSetMethods.isSet(baseObj, baseOffset, 25); // false

// Using off-heap bitset
long address = Platform.allocateMemory(64); // 8 words * 8 bytes
try {
    Platform.setMemory(address, (byte) 0, 64); // Clear all bits
    
    BitSetMethods.set(null, address, 42);
    boolean isSet = BitSetMethods.isSet(null, address, 42);
    
} finally {
    Platform.freeMemory(address);
}

Date/Time Constants

Comprehensive constants for date and time calculations and conversions, providing all common time unit relationships.

/**
 * Constants for date/time calculations and conversions
 */
class DateTimeConstants {
    // Basic time units
    public static final int MONTHS_PER_YEAR = 12;
    public static final byte DAYS_PER_WEEK = 7;
    public static final long HOURS_PER_DAY = 24L;
    public static final long MINUTES_PER_HOUR = 60L;
    public static final long SECONDS_PER_MINUTE = 60L;
    
    // Computed time constants
    public static final long SECONDS_PER_HOUR;      // 3600
    public static final long SECONDS_PER_DAY;       // 86400
    
    // Millisecond conversions
    public static final long MILLIS_PER_SECOND = 1000L;
    public static final long MILLIS_PER_MINUTE;     // 60000
    public static final long MILLIS_PER_HOUR;       // 3600000
    public static final long MILLIS_PER_DAY;        // 86400000
    
    // Microsecond conversions
    public static final long MICROS_PER_MILLIS = 1000L;
    public static final long MICROS_PER_SECOND;     // 1000000
    public static final long MICROS_PER_MINUTE;     // 60000000
    public static final long MICROS_PER_HOUR;       // 3600000000
    public static final long MICROS_PER_DAY;        // 86400000000
    
    // Nanosecond conversions
    public static final long NANOS_PER_MICROS = 1000L;
    public static final long NANOS_PER_MILLIS;      // 1000000
    public static final long NANOS_PER_SECOND;      // 1000000000
}

Usage Examples:

import org.apache.spark.sql.catalyst.util.DateTimeConstants;

// Time calculations using constants
long currentTimeMillis = System.currentTimeMillis();

// Convert to different units
long currentTimeSeconds = currentTimeMillis / DateTimeConstants.MILLIS_PER_SECOND;
long currentTimeMicros = currentTimeMillis * DateTimeConstants.MICROS_PER_MILLIS;
long currentTimeNanos = currentTimeMillis * DateTimeConstants.NANOS_PER_MILLIS;

// Calculate time spans
long hoursInWeek = DateTimeConstants.DAYS_PER_WEEK * DateTimeConstants.HOURS_PER_DAY;
long secondsInWeek = hoursInWeek * DateTimeConstants.SECONDS_PER_HOUR;

// Duration calculations
long durationDays = 5;
long durationMillis = durationDays * DateTimeConstants.MILLIS_PER_DAY;
long durationMicros = durationDays * DateTimeConstants.MICROS_PER_DAY;

// Conversion helpers
public static long millisToMicros(long millis) {
    return millis * DateTimeConstants.MICROS_PER_MILLIS;
}

public static long microsToNanos(long micros) {
    return micros * DateTimeConstants.NANOS_PER_MICROS;
}

public static long secondsToMillis(long seconds) {
    return seconds * DateTimeConstants.MILLIS_PER_SECOND;
}

Hive Hasher Compatibility

Hive-compatible hashing functions for maintaining compatibility with Hive v1.2.1 hashing behavior.

/**
 * Simulates Hive's hashing function from Hive v1.2.1 for compatibility
 */
class HiveHasher {
    /**
     * Hash integer using Hive-compatible algorithm
     * @param input Integer to hash
     * @return Hive-compatible hash value
     */
    public static int hashInt(int input);
    
    /**
     * Hash long using Hive-compatible algorithm
     * @param input Long to hash
     * @return Hive-compatible hash value
     */
    public static int hashLong(long input);
    
    /**
     * Hash byte array using Hive-compatible algorithm with unsafe access
     * @param base Base object (null for off-heap)
     * @param offset Offset within object or address
     * @param lengthInBytes Number of bytes to hash
     * @return Hive-compatible hash value
     */
    public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes);
    
    /**
     * Get string representation
     * @return String representation of hasher
     */
    public String toString();
}

Usage Examples:

import org.apache.spark.sql.catalyst.expressions.HiveHasher;
import org.apache.spark.unsafe.Platform;

// Hash integers for Hive compatibility
int intValue = 12345;
int hiveIntHash = HiveHasher.hashInt(intValue);

// Hash longs for Hive compatibility
long longValue = 123456789L;
int hiveLongHash = HiveHasher.hashLong(longValue);

// Hash byte arrays with Hive-compatible algorithm
byte[] data = "test data".getBytes();
int hiveArrayHash = HiveHasher.hashUnsafeBytes(
    data,
    Platform.BYTE_ARRAY_OFFSET,
    data.length
);

// Use for partitioning compatibility with Hive tables
public int getHivePartition(Object value, int numPartitions) {
    int hash;
    if (value instanceof Integer) {
        hash = HiveHasher.hashInt((Integer) value);
    } else if (value instanceof Long) {
        hash = HiveHasher.hashLong((Long) value);
    } else {
        byte[] bytes = value.toString().getBytes();
        hash = HiveHasher.hashUnsafeBytes(
            bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length
        );
    }
    // floorMod avoids the Math.abs(Integer.MIN_VALUE) pitfall, which
    // would yield a negative partition index
    return Math.floorMod(hash, numPartitions);
}

Unsafe Aligned Offset Utilities

Platform-specific alignment handling for record length offsets, ensuring proper memory alignment across different architectures.

/**
 * Handles platform-specific alignment for record length offsets
 */
class UnsafeAlignedOffset {
    /**
     * Set UAO size for testing purposes
     * @param size UAO size to set
     */
    public static void setUaoSize(int size);
    
    /**
     * Get current UAO size
     * @return Current UAO size
     */
    public static int getUaoSize();
    
    /**
     * Get size value considering platform alignment
     * @param object Base object
     * @param offset Offset within object
     * @return Size value with proper alignment
     */
    public static int getSize(Object object, long offset);
    
    /**
     * Put size value considering platform alignment
     * @param object Base object
     * @param offset Offset within object
     * @param value Size value to store
     */
    public static void putSize(Object object, long offset, int value);
}

Usage Examples:

import org.apache.spark.unsafe.UnsafeAlignedOffset;
import org.apache.spark.unsafe.Platform;

// Working with aligned record sizes
byte[] recordBuffer = new byte[1024];
long recordOffset = Platform.BYTE_ARRAY_OFFSET;

// Store record size with proper alignment
int recordSize = 256;
UnsafeAlignedOffset.putSize(recordBuffer, recordOffset, recordSize);

// Read record size with proper alignment
int storedSize = UnsafeAlignedOffset.getSize(recordBuffer, recordOffset);

// Check current alignment requirements
int uaoSize = UnsafeAlignedOffset.getUaoSize();
System.out.println("Current UAO size: " + uaoSize);

// For testing different alignment scenarios
UnsafeAlignedOffset.setUaoSize(8); // Set 8-byte alignment for testing
// ... run tests ...
UnsafeAlignedOffset.setUaoSize(4); // Reset to 4-byte alignment

Calendar Interval Type

Specialized data type for representing calendar intervals with separate components for months, days, and microseconds.

/**
 * Represents calendar intervals with months, days, and microseconds
 * @Unstable - API may change in future versions
 */
final class CalendarInterval implements Serializable {
    // Public fields for interval components
    public final int months;        // Number of months
    public final int days;          // Number of days
    public final long microseconds; // Number of microseconds
    
    /**
     * Create calendar interval with specified components
     * @param months Number of months
     * @param days Number of days
     * @param microseconds Number of microseconds
     */
    public CalendarInterval(int months, int days, long microseconds);
    
    /**
     * Check equality with another object
     * @param o Object to compare with
     * @return true if equal
     */
    public boolean equals(Object o);
    
    /**
     * Calculate hash code
     * @return Hash code value
     */
    public int hashCode();
    
    /**
     * Get string representation
     * @return String representation of interval
     */
    public String toString();
    
    /**
     * Extract interval as Java Period (months and days only)
     * @return Java Period representation
     */
    public Period extractAsPeriod();
    
    /**
     * Extract interval as Java Duration (microseconds only)
     * @return Java Duration representation
     */
    public Duration extractAsDuration();
}

Usage Examples:

import org.apache.spark.unsafe.types.CalendarInterval;
import java.time.Period;
import java.time.Duration;

// Create calendar intervals
CalendarInterval interval1 = new CalendarInterval(2, 15, 3600000000L); // 2 months, 15 days, 1 hour
CalendarInterval interval2 = new CalendarInterval(0, 0, 1500000L);     // 1.5 seconds
CalendarInterval interval3 = new CalendarInterval(12, 0, 0L);          // 1 year

// Working with interval components
int months = interval1.months;
int days = interval1.days;
long microseconds = interval1.microseconds;

// Convert to Java time types
Period period = interval1.extractAsPeriod();    // 2 months, 15 days
Duration duration = interval1.extractAsDuration(); // 1 hour

// Comparison and equality
boolean areEqual = interval1.equals(interval2);
int hashCode = interval1.hashCode();
String description = interval1.toString();

// Common use cases
public static CalendarInterval oneHour() {
    return new CalendarInterval(0, 0, DateTimeConstants.MICROS_PER_HOUR);
}

public static CalendarInterval oneDay() {
    return new CalendarInterval(0, 1, 0L);
}

public static CalendarInterval oneMonth() {
    return new CalendarInterval(1, 0, 0L);
}

Performance and Usage Guidelines

Choosing Hash Functions

// For general-purpose hashing (faster)
int hash1 = Murmur3_x86_32.hashInt(value, seed);

// For Hive compatibility (when interfacing with Hive)
int hash2 = HiveHasher.hashInt(value);

// For consistent partitioning across Spark and Hive
// (Math.floorMod rather than Math.abs: Math.abs(Integer.MIN_VALUE) is negative)
int partition = Math.floorMod(HiveHasher.hashInt(key), numPartitions);

BitSet Memory Layout

// BitSet requires word-aligned memory
int numBits = 1000;
int numWords = (numBits + 63) / 64; // Round up to words
long[] bitsetStorage = new long[numWords];

// Always specify correct word count
boolean anySet = BitSetMethods.anySet(
    bitsetStorage, 
    Platform.LONG_ARRAY_OFFSET, 
    numWords  // Important: use actual word count
);

Time Calculation Optimization

// Pre-calculate commonly used values
private static final long MICROS_PER_WEEK = 
    DateTimeConstants.DAYS_PER_WEEK * DateTimeConstants.MICROS_PER_DAY;

// Use constants for efficient conversions
public long convertDaysToMicros(int days) {
    return days * DateTimeConstants.MICROS_PER_DAY;
}