tessl install tessl/maven-org-apache-spark--spark-unsafe_2-13@3.5.0

Low-level unsafe operations and optimized data structures for Apache Spark's internal memory management and performance-critical operations.
High-performance hash function implementations, including a 32-bit Murmur3 hasher and a Hive-compatible hasher, for data compatibility across systems. These hash functions are optimized for distributed computing and provide consistent hashing behavior for data partitioning and integrity verification.
Fast, high-quality 32-bit Murmur3 hash function implementation based on Google Guava, providing excellent distribution properties and performance for hash tables, data partitioning, and checksums.
public final class Murmur3_x86_32 {
    public Murmur3_x86_32(int seed);
    public String toString();
    public int hashInt(int input);
    public static int hashInt(int input, int seed);
    public int hashLong(long input);
    public static int hashLong(long input, int seed);
    public int hashUnsafeWords(Object base, long offset, int lengthInBytes);
    public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);
    public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
    public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);
}

Usage Examples:
// Create hasher with seed
Murmur3_x86_32 hasher = new Murmur3_x86_32(42);
// Hash primitive types with instance
int intHash = hasher.hashInt(12345);
long longHash = hasher.hashLong(1234567890L);
// Hash primitive types with static methods
int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);
int staticLongHash = Murmur3_x86_32.hashLong(1234567890L, 42);
// Results should be identical
assert intHash == staticIntHash;
assert longHash == staticLongHash;
System.out.println("Hasher info: " + hasher.toString());Direct hashing of memory regions supporting both on-heap and off-heap memory, with specialized optimizations for word-aligned data and arbitrary byte sequences.
// Word-aligned memory hashing (optimized)
public int hashUnsafeWords(Object base, long offset, int lengthInBytes);
public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);
// Arbitrary byte hashing (legacy and compatible versions)
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);

Usage Examples:
Murmur3_x86_32 hasher = new Murmur3_x86_32(0);
// Hash word-aligned data (most efficient)
long[] alignedData = {1L, 2L, 3L, 4L, 5L};
int alignedHash = hasher.hashUnsafeWords(
    alignedData,
    Platform.LONG_ARRAY_OFFSET,
    alignedData.length * 8  // 8 bytes per long
);
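hashUnsafeWords asserts that lengthInBytes is a multiple of 8 (word-aligned), so arbitrary-length data should go through the byte-oriented variants instead. A minimal guard sketch; the helper name hashWordsOrBytes is hypothetical, not part of this library:
// Hypothetical helper: take the word-optimized path only for 8-byte-aligned lengths
static int hashWordsOrBytes(Object base, long offset, int lengthInBytes, int seed) {
    if (lengthInBytes % 8 == 0) {
        return Murmur3_x86_32.hashUnsafeWords(base, offset, lengthInBytes, seed);
    }
    // Fall back to the general byte-oriented variant for unaligned lengths
    return Murmur3_x86_32.hashUnsafeBytes2(base, offset, lengthInBytes, seed);
}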
// Hash arbitrary byte array
byte[] byteData = "Hello World".getBytes(StandardCharsets.UTF_8);  // explicit charset keeps hashes machine-independent
int byteHash = Murmur3_x86_32.hashUnsafeBytes2(
    byteData,
    Platform.BYTE_ARRAY_OFFSET,
    byteData.length,
    0  // seed
);
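The two byte-oriented variants are not interchangeable: hashUnsafeBytes is retained for backward compatibility with data hashed by older components, while hashUnsafeBytes2 follows the reference Murmur3 tail handling. A quick sketch of the divergence; expect differing results whenever the length is not a multiple of 4:
byte[] tail = {1, 2, 3, 4, 5};  // 5 bytes: each variant mixes the trailing byte differently
int legacy = Murmur3_x86_32.hashUnsafeBytes(tail, Platform.BYTE_ARRAY_OFFSET, tail.length, 0);
int modern = Murmur3_x86_32.hashUnsafeBytes2(tail, Platform.BYTE_ARRAY_OFFSET, tail.length, 0);
System.out.println("legacy=" + legacy + ", modern=" + modern);  // generally unequal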
// Hash off-heap memory (freed in finally so it is released even if hashing throws)
long offHeapAddr = Platform.allocateMemory(64);
try {
    Platform.putLong(null, offHeapAddr, 12345L);
    int offHeapHash = hasher.hashUnsafeWords(null, offHeapAddr, 8);
} finally {
    Platform.freeMemory(offHeapAddr);
}

Hash function implementation that simulates Hive's hashing behavior from Hive v1.2.1, ensuring compatibility with existing Hive-based data processing systems and consistent partitioning behavior.
public class HiveHasher {
    public String toString();
    public static int hashInt(int input);
    public static int hashLong(long input);
    public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes);
}

Usage Examples:
// Hash integers (identity function in Hive)
int intValue = 12345;
int hiveIntHash = HiveHasher.hashInt(intValue); // returns 12345
// Hash longs (XOR of upper and lower 32 bits)
long longValue = 0x123456789ABCDEFL;
int hiveLongHash = HiveHasher.hashLong(longValue); // XOR of 0x01234567 and 0x89ABCDEF
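Assuming the XOR-of-halves definition noted above, the long hash can be sanity-checked directly:
// Self-check: fold the two 32-bit halves together with XOR (run with -ea to enable asserts)
assert HiveHasher.hashLong(longValue) == (int) ((longValue >>> 32) ^ longValue);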
// Hash byte arrays with Hive algorithm
String text = "Hello Hive World";
byte[] textBytes = text.getBytes(StandardCharsets.UTF_8);  // Hive-compatible hashing needs UTF-8 bytes
int hiveStringHash = HiveHasher.hashUnsafeBytes(
    textBytes,
    Platform.BYTE_ARRAY_OFFSET,
    textBytes.length
);
System.out.println("Hive hasher: " + new HiveHasher().toString());Common patterns for using hash functions in data partitioning scenarios, ensuring consistent distribution of data across multiple partitions or nodes.
Consistent Partitioning Example:
// Partition data consistently using Murmur3
Murmur3_x86_32 partitioner = new Murmur3_x86_32(12345);
int numPartitions = 16;
// Partition string keys
String[] keys = {"user123", "user456", "user789", "user101112"};
for (String key : keys) {
    // UTF-8 explicitly, so the same key hashes identically on every machine
    byte[] keyBytes = key.getBytes(StandardCharsets.UTF_8);
    // hashUnsafeBytes2 is static; the seed is passed explicitly
    int hash = Murmur3_x86_32.hashUnsafeBytes2(
        keyBytes,
        Platform.BYTE_ARRAY_OFFSET,
        keyBytes.length,
        12345
    );
    // Mask the sign bit instead of Math.abs(): Math.abs(Integer.MIN_VALUE) is still negative
    int partition = (hash & Integer.MAX_VALUE) % numPartitions;
    System.out.println(key + " -> partition " + partition);
}
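One operational caveat, sketched below: assignments are only stable when every process hashes with the same seed, so the seed must be fixed cluster-wide.
// The same key can land in different partitions under different seeds
byte[] kb = "user123".getBytes(StandardCharsets.UTF_8);
int hashA = Murmur3_x86_32.hashUnsafeBytes2(kb, Platform.BYTE_ARRAY_OFFSET, kb.length, 12345);
int hashB = Murmur3_x86_32.hashUnsafeBytes2(kb, Platform.BYTE_ARRAY_OFFSET, kb.length, 54321);
System.out.println((hashA & Integer.MAX_VALUE) % numPartitions);
System.out.println((hashB & Integer.MAX_VALUE) % numPartitions);  // usually a different partition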
// Partition numeric keys
long[] numericKeys = {1001L, 1002L, 1003L, 1004L};
for (long key : numericKeys) {
    int hash = partitioner.hashLong(key);
    int partition = (hash & Integer.MAX_VALUE) % numPartitions;
    System.out.println(key + " -> partition " + partition);
}

Using hash functions for data integrity verification and deduplication in distributed storage scenarios.
Data Integrity Example:
// Create checksums for data integrity
// hashUnsafeBytes2 is static, so a fixed seed constant replaces a hasher instance here
final int CHECKSUM_SEED = 0;
// Compute checksum for a data block
byte[] dataBlock = new byte[1024];
// ... fill with actual data ...
int checksum = Murmur3_x86_32.hashUnsafeBytes2(
    dataBlock,
    Platform.BYTE_ARRAY_OFFSET,
    dataBlock.length,
    CHECKSUM_SEED
);
// Store checksum with data (DataBlock is an application-defined container, not part of this library)
DataBlock block = new DataBlock(dataBlock, checksum);
// Later, verify integrity
int verifyChecksum = Murmur3_x86_32.hashUnsafeBytes2(
    block.getData(),
    Platform.BYTE_ARRAY_OFFSET,
    block.getData().length,
    CHECKSUM_SEED
);
boolean isValid = (verifyChecksum == block.getChecksum());
System.out.println("Data integrity check: " + (isValid ? "PASS" : "FAIL"));

A 32-bit hash catches accidental corruption well, but collisions become likely past a few tens of thousands of blocks, and it offers no protection against deliberate tampering.

Ensuring hash compatibility with Hive-based systems for seamless data exchange and consistent partitioning behavior across different processing engines.
Hive Partitioning Compatibility:
// Replicate Hive's partitioning behavior exactly
public class HiveCompatiblePartitioner {
    private final int numPartitions;

    public HiveCompatiblePartitioner(int numPartitions) {
        this.numPartitions = numPartitions;
    }

    public int getPartition(String key) {
        if (key == null) return 0;
        // Hive operates on UTF-8 text, so hash UTF-8 bytes rather than the platform default charset
        byte[] keyBytes = key.getBytes(StandardCharsets.UTF_8);
        int hash = HiveHasher.hashUnsafeBytes(
            keyBytes,
            Platform.BYTE_ARRAY_OFFSET,
            keyBytes.length
        );
        // Hive's partitioning logic: clear the sign bit, then take the remainder
        return (hash & Integer.MAX_VALUE) % numPartitions;
    }

    public int getPartition(long key) {
        int hash = HiveHasher.hashLong(key);
        return (hash & Integer.MAX_VALUE) % numPartitions;
    }
}
// Usage for consistent partitioning with Hive
HiveCompatiblePartitioner partitioner = new HiveCompatiblePartitioner(32);
// These will produce the same partition assignments as Hive
int partition1 = partitioner.getPartition("customer_12345");
int partition2 = partitioner.getPartition(98765L);

Guidelines for choosing appropriate hash functions and optimizing hash performance for different data types and access patterns.
Hash Function Selection:
// For maximum performance with word-aligned data
public int fastHashLongArray(long[] data) {
    return Murmur3_x86_32.hashUnsafeWords(
        data,
        Platform.LONG_ARRAY_OFFSET,
        data.length * 8,  // always a multiple of 8, as hashUnsafeWords requires
        0
    );
}
// For compatibility with existing Hive systems
public int compatibleHashString(String str) {
    // Hash UTF-8 bytes explicitly; the platform default charset can differ across systems
    byte[] bytes = str.getBytes(StandardCharsets.UTF_8);
    return HiveHasher.hashUnsafeBytes(
        bytes,
        Platform.BYTE_ARRAY_OFFSET,
        bytes.length
    );
}
// For general-purpose hashing with good distribution
public int generalHashBytes(byte[] data, int seed) {
    return Murmur3_x86_32.hashUnsafeBytes2(
        data,
        Platform.BYTE_ARRAY_OFFSET,
        data.length,
        seed
    );
}

Bulk Hashing Optimization:
// Efficient bulk hashing of multiple values
Murmur3_x86_32 bulkHasher = new Murmur3_x86_32(42);
int[] results = new int[1000];
// Hash array of longs efficiently
long[] values = new long[1000];
for (int i = 0; i < values.length; i++) {
    results[i] = bulkHasher.hashLong(values[i]);
}
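If one digest of the whole array is needed rather than a hash per element, the word-aligned entry point can cover the region in a single call:
// One call over the entire long[] region; the length is 8-byte aligned by construction
int arrayDigest = Murmur3_x86_32.hashUnsafeWords(
    values,
    Platform.LONG_ARRAY_OFFSET,
    values.length * 8,
    42
);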
// Hash large memory region once instead of small pieces
byte[] largeBuffer = new byte[8192];
int singleHash = Murmur3_x86_32.hashUnsafeBytes2(
    largeBuffer,
    Platform.BYTE_ARRAY_OFFSET,
    largeBuffer.length,
    42
);
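For composite keys, one common pattern (similar in spirit to how Spark's own hash expressions combine column hashes) is seed chaining: each field's hash seeds the next. The helper below is a hypothetical illustration, not part of this library:
// Hypothetical composite-key hash: the running hash seeds the next field, so field order matters
static int hashCompositeKey(long userId, byte[] nameUtf8, int seed) {
    int h = Murmur3_x86_32.hashLong(userId, seed);
    h = Murmur3_x86_32.hashUnsafeBytes2(nameUtf8, Platform.BYTE_ARRAY_OFFSET, nameUtf8.length, h);
    return h;
}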