0
# Hash Functions and Bitset Operations
1
2
Fast hash function implementations and bitset manipulation methods for efficient data processing and boolean operations. These operations provide high-performance hashing and bitset manipulation capabilities essential for data structures and algorithms in distributed computing.
3
4
## Capabilities
5
6
### Murmur3 Hash Function
7
8
32-bit Murmur3 hash function implementation providing fast, high-quality hashing for data processing applications.
9
10
```java { .api }
11
public final class Murmur3_x86_32 {
12
public Murmur3_x86_32(int seed);
13
14
// Instance methods
15
public String toString();
16
public int hashInt(int input);
17
public int hashUnsafeWords(Object base, long offset, int lengthInBytes);
18
public int hashLong(long input);
19
20
// Static methods
21
public static int hashInt(int input, int seed);
22
public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);
23
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
24
public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);
25
public static int hashLong(long input, int seed);
26
}
27
```
28
29
### Hive-Compatible Hash Function
30
31
Hash function implementation that simulates Hive's hash function for compatibility with Hive v1.2.1, ensuring consistent hashing across Spark and Hive environments.
32
33
```java { .api }
34
public class HiveHasher {
35
public String toString();
36
37
public static int hashInt(int input);
38
public static int hashLong(long input);
39
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes);
40
}
41
```
42
43
### Bitset Operations
44
45
Methods for working with fixed-size uncompressed bitsets stored in memory, providing efficient boolean array operations.
46
47
```java { .api }
48
public final class BitSetMethods {
49
public static void set(Object baseObject, long baseOffset, int index);
50
public static void unset(Object baseObject, long baseOffset, int index);
51
public static boolean isSet(Object baseObject, long baseOffset, int index);
52
public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);
53
public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);
54
}
55
```
56
57
## Usage Examples
58
59
### Murmur3 Hashing
60
61
```java
62
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
63
64
// Create hasher with specific seed
65
Murmur3_x86_32 hasher = new Murmur3_x86_32(42);
66
67
// Hash different data types
68
int intHash = hasher.hashInt(12345);
69
long longHash = hasher.hashLong(123456789L);
70
71
// Hash memory regions (word-aligned for optimal performance)
72
byte[] data = "Hello World".getBytes();
73
int memoryHash = hasher.hashUnsafeWords(
74
data,
75
Platform.BYTE_ARRAY_OFFSET,
76
data.length
77
);
78
79
// Static methods with custom seed
80
int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);
81
int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);
82
83
// Hash arbitrary byte sequences
84
int byteHash = Murmur3_x86_32.hashUnsafeBytes2(
85
data,
86
Platform.BYTE_ARRAY_OFFSET,
87
data.length,
88
42
89
);
90
```
91
92
### Hive-Compatible Hashing
93
94
```java
95
import org.apache.spark.sql.catalyst.expressions.HiveHasher;
96
97
// Hash integers (returns input unchanged for compatibility)
98
int intHash = HiveHasher.hashInt(12345); // Returns 12345
99
100
// Hash longs using XOR of upper and lower 32 bits
101
long longValue = 0x123456789ABCDEFL;
102
int longHash = HiveHasher.hashLong(longValue);
103
104
// Hash byte sequences using rolling hash algorithm
105
byte[] data = "test data".getBytes();
106
int byteHash = HiveHasher.hashUnsafeBytes(
107
data,
108
Platform.BYTE_ARRAY_OFFSET,
109
data.length
110
);
111
```
112
113
### Bitset Operations
114
115
```java
116
import org.apache.spark.unsafe.bitset.BitSetMethods;
117
import org.apache.spark.unsafe.Platform;
118
119
// Allocate memory for bitset (e.g., 64 bits = 1 word)
120
long bitsetAddress = Platform.allocateMemory(8);
121
Platform.setMemory(bitsetAddress, (byte) 0, 8); // Initialize to zeros
122
123
// Set individual bits
124
BitSetMethods.set(null, bitsetAddress, 5); // Set bit 5
125
BitSetMethods.set(null, bitsetAddress, 10); // Set bit 10
126
BitSetMethods.set(null, bitsetAddress, 15); // Set bit 15
127
128
// Check if bits are set
129
boolean bit5Set = BitSetMethods.isSet(null, bitsetAddress, 5); // true
130
boolean bit7Set = BitSetMethods.isSet(null, bitsetAddress, 7); // false
131
132
// Unset bits
133
BitSetMethods.unset(null, bitsetAddress, 10); // Clear bit 10
134
boolean bit10Set = BitSetMethods.isSet(null, bitsetAddress, 10); // false
135
136
// Check if any bits are set in the bitset
137
boolean anyBitsSet = BitSetMethods.anySet(null, bitsetAddress, 1); // true (bits 5 and 15 are set)
138
139
// Find next set bit starting from a given index
140
int nextBit = BitSetMethods.nextSetBit(null, bitsetAddress, 0, 1); // Returns 5
141
int nextBit2 = BitSetMethods.nextSetBit(null, bitsetAddress, 6, 1); // Returns 15
142
143
// Clean up
144
Platform.freeMemory(bitsetAddress);
145
```
146
147
### Array-Based Bitset Operations
148
149
```java
150
import org.apache.spark.unsafe.bitset.BitSetMethods;
151
import org.apache.spark.unsafe.Platform;
152
153
// Use long array as bitset storage
154
long[] bitsetArray = new long[2]; // 128 bits
155
156
// Set bits using array as backing store
157
BitSetMethods.set(bitsetArray, Platform.LONG_ARRAY_OFFSET, 65); // Set bit 65 (in second long)
158
BitSetMethods.set(bitsetArray, Platform.LONG_ARRAY_OFFSET, 120); // Set bit 120
159
160
// Check bits
161
boolean bit65Set = BitSetMethods.isSet(bitsetArray, Platform.LONG_ARRAY_OFFSET, 65); // true
162
163
// Check if any bits are set in the entire bitset
164
boolean anySet = BitSetMethods.anySet(bitsetArray, Platform.LONG_ARRAY_OFFSET, 2); // true
165
166
// Find next set bit
167
int nextSetBit = BitSetMethods.nextSetBit(bitsetArray, Platform.LONG_ARRAY_OFFSET, 0, 2); // 65
168
```
169
170
### Combined Hash and Bitset Usage
171
172
```java
173
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
174
import org.apache.spark.unsafe.bitset.BitSetMethods;
175
import org.apache.spark.unsafe.Platform;
176
177
// Create a bloom filter-like structure
178
long[] bitset = new long[16]; // 1024 bits
179
Murmur3_x86_32 hasher1 = new Murmur3_x86_32(42);
180
Murmur3_x86_32 hasher2 = new Murmur3_x86_32(123);
181
182
// Insert element
183
String element = "example";
184
byte[] elementBytes = element.getBytes();
185
int hash1 = Math.abs(hasher1.hashUnsafeWords(elementBytes, Platform.BYTE_ARRAY_OFFSET, elementBytes.length));
186
int hash2 = Math.abs(hasher2.hashUnsafeWords(elementBytes, Platform.BYTE_ARRAY_OFFSET, elementBytes.length));
187
188
int bit1 = hash1 % 1024;
189
int bit2 = hash2 % 1024;
190
191
BitSetMethods.set(bitset, Platform.LONG_ARRAY_OFFSET, bit1);
192
BitSetMethods.set(bitset, Platform.LONG_ARRAY_OFFSET, bit2);
193
194
// Check if element might be present
195
boolean mightContain = BitSetMethods.isSet(bitset, Platform.LONG_ARRAY_OFFSET, bit1) &&
196
BitSetMethods.isSet(bitset, Platform.LONG_ARRAY_OFFSET, bit2);
197
```
198
199
### Performance Considerations
200
201
```java
202
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
203
204
// For word-aligned data, use hashUnsafeWords for better performance
205
byte[] alignedData = new byte[16]; // Multiple of 4 bytes
206
int wordHash = Murmur3_x86_32.hashUnsafeWords(
207
alignedData,
208
Platform.BYTE_ARRAY_OFFSET,
209
alignedData.length,
210
42
211
);
212
213
// For arbitrary byte sequences, use hashUnsafeBytes2
214
byte[] arbitraryData = new byte[15]; // Not word-aligned
215
int byteHash = Murmur3_x86_32.hashUnsafeBytes2(
216
arbitraryData,
217
Platform.BYTE_ARRAY_OFFSET,
218
arbitraryData.length,
219
42
220
);
221
222
// Reuse hasher instances when hashing multiple values with the same seed
223
Murmur3_x86_32 reusableHasher = new Murmur3_x86_32(42);
224
int[] values = {1, 2, 3, 4, 5};
225
for (int value : values) {
226
int hash = reusableHasher.hashInt(value);
227
// Process hash
228
}
229
```