0
# Hashing and Bitsets
1
2
Spark Unsafe provides high-performance hashing functions and bitset manipulation utilities optimized for data processing workloads. The hashing functionality includes Murmur3 implementation, while the bitset operations provide efficient manipulation of packed bit arrays.
3
4
## Core Imports
5
6
```java
7
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
8
import org.apache.spark.unsafe.bitset.BitSetMethods;
9
```
10
11
## Usage Examples
12
13
### Murmur3 Hashing
14
15
```java
16
// Create hasher with specific seed
17
Murmur3_x86_32 hasher = new Murmur3_x86_32(42);
18
19
// Hash integers
20
int intHash = hasher.hashInt(12345);
21
int longHash = hasher.hashLong(123456789L);
22
23
// Static hashing methods (convenient for one-off operations)
24
int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);
25
int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);
26
27
// Hash word-aligned memory regions
28
byte[] data = "Hello, World!".getBytes(StandardCharsets.UTF_8);
29
int wordsHash = Murmur3_x86_32.hashUnsafeWords(
30
data, Platform.BYTE_ARRAY_OFFSET, data.length, 42
31
);
32
33
// Hash arbitrary byte sequences (improved method)
34
int bytesHash = Murmur3_x86_32.hashUnsafeBytes2(
35
data, Platform.BYTE_ARRAY_OFFSET, data.length, 42
36
);
37
```
38
39
40
### Bitset Operations
41
42
```java
43
// Allocate memory for bitset (assume 64 bits = 8 bytes = 1 word)
44
MemoryAllocator allocator = MemoryAllocator.HEAP;
45
MemoryBlock bitsetMemory = allocator.allocate(8);
46
Object baseObject = bitsetMemory.getBaseObject();
47
long baseOffset = bitsetMemory.getBaseOffset();
48
49
try {
50
// Clear the bitset initially
51
Platform.setMemory(baseObject, baseOffset, 8, (byte) 0);
52
53
// Set individual bits
54
BitSetMethods.set(baseObject, baseOffset, 5); // Set bit 5
55
BitSetMethods.set(baseObject, baseOffset, 10); // Set bit 10
56
BitSetMethods.set(baseObject, baseOffset, 15); // Set bit 15
57
58
// Check if bits are set
59
boolean bit5Set = BitSetMethods.isSet(baseObject, baseOffset, 5); // true
60
boolean bit7Set = BitSetMethods.isSet(baseObject, baseOffset, 7); // false
61
62
// Check if any bits are set in the bitset
63
boolean anySet = BitSetMethods.anySet(baseObject, baseOffset, 1); // true (1 word)
64
65
// Find next set bit from a given position
66
int nextSet = BitSetMethods.nextSetBit(baseObject, baseOffset, 6, 1); // Returns 10
67
int noMore = BitSetMethods.nextSetBit(baseObject, baseOffset, 16, 1); // Returns -1
68
69
// Unset bits
70
BitSetMethods.unset(baseObject, baseOffset, 10); // Clear bit 10
71
boolean bit10Set = BitSetMethods.isSet(baseObject, baseOffset, 10); // false
72
73
} finally {
74
allocator.free(bitsetMemory);
75
}
76
```
77
78
### Advanced Bitset Usage
79
80
```java
81
// Working with larger bitsets (multiple words)
82
int numWords = 4; // 4 * 64 = 256 bits
83
MemoryBlock largeBitset = allocator.allocate(numWords * 8);
84
Object base = largeBitset.getBaseObject();
85
long offset = largeBitset.getBaseOffset();
86
87
try {
88
// Initialize bitset to all zeros
89
Platform.setMemory(base, offset, numWords * 8, (byte) 0);
90
91
// Set bits across different words
92
BitSetMethods.set(base, offset, 63); // Last bit of first word
93
BitSetMethods.set(base, offset, 64); // First bit of second word
94
BitSetMethods.set(base, offset, 200); // Bit in fourth word
95
96
// Check if any bits are set across all words
97
boolean anyBitsSet = BitSetMethods.anySet(base, offset, numWords);
98
99
// Find all set bits
100
int currentBit = -1;
101
while ((currentBit = BitSetMethods.nextSetBit(base, offset, currentBit + 1, numWords)) != -1) {
102
System.out.println("Bit " + currentBit + " is set");
103
}
104
105
} finally {
106
allocator.free(largeBitset);
107
}
108
```
109
110
## API Reference
111
112
### Murmur3_x86_32 Class
113
114
```java { .api }
115
public final class Murmur3_x86_32 {
116
/**
117
* Creates Murmur3 hasher with specified seed value.
118
*/
119
public Murmur3_x86_32(int seed);
120
121
/**
122
* Returns string representation of this hasher.
123
*/
124
public String toString();
125
126
/**
127
* Computes Murmur3 hash of an integer value.
128
*/
129
public int hashInt(int input);
130
131
/**
132
* Computes Murmur3 hash of word-aligned data.
133
* Data length must be multiple of 4 bytes.
134
*/
135
public int hashUnsafeWords(Object base, long offset, int lengthInBytes);
136
137
/**
138
* Computes Murmur3 hash of a long value.
139
*/
140
public int hashLong(long input);
141
}
142
```
143
144
### Murmur3_x86_32 Static Methods
145
146
```java { .api }
147
/**
148
* Static version of integer hashing with explicit seed.
149
*/
150
public static int hashInt(int input, int seed);
151
152
/**
153
* Static version of word-aligned hashing with explicit seed.
154
*/
155
public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);
156
157
/**
158
* Legacy byte-level hashing method (deprecated, use hashUnsafeBytes2).
159
*/
160
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
161
162
/**
163
* Improved byte-level hashing method for arbitrary data.
164
*/
165
public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);
166
167
/**
168
* Static version of long hashing with explicit seed.
169
*/
170
public static int hashLong(long input, int seed);
171
```
172
173
174
### BitSetMethods Class
175
176
```java { .api }
177
public final class BitSetMethods {
178
/**
179
* Sets the bit at the specified index to true.
180
*
181
* @param baseObject Base object containing the bitset
182
* @param baseOffset Offset to the start of the bitset
183
* @param index Bit index to set (0-based)
184
*/
185
public static void set(Object baseObject, long baseOffset, int index);
186
187
/**
188
* Sets the bit at the specified index to false.
189
*
190
* @param baseObject Base object containing the bitset
191
* @param baseOffset Offset to the start of the bitset
192
* @param index Bit index to unset (0-based)
193
*/
194
public static void unset(Object baseObject, long baseOffset, int index);
195
196
/**
197
* Returns true if the bit at the specified index is set.
198
*
199
* @param baseObject Base object containing the bitset
200
* @param baseOffset Offset to the start of the bitset
201
* @param index Bit index to check (0-based)
202
* @return true if bit is set, false otherwise
203
*/
204
public static boolean isSet(Object baseObject, long baseOffset, int index);
205
206
/**
207
* Returns true if any bit is set in the bitset.
208
*
209
* @param baseObject Base object containing the bitset
210
* @param baseOffset Offset to the start of the bitset
211
* @param bitSetWidthInWords Width of bitset in 64-bit words
212
* @return true if any bit is set, false if all bits are clear
213
*/
214
public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);
215
216
/**
217
* Finds the next set bit starting from the specified index.
218
*
219
* @param baseObject Base object containing the bitset
220
* @param baseOffset Offset to the start of the bitset
221
* @param fromIndex Starting bit index for search (inclusive)
222
* @param bitsetSizeInWords Size of bitset in 64-bit words
223
* @return Index of next set bit, or -1 if no set bit found
224
*/
225
public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);
226
}
227
```
228
229
## Performance Characteristics
230
231
### Murmur3 Hashing
232
233
1. **Fast Distribution**: Murmur3 provides good hash distribution with excellent performance.
234
2. **Word-Aligned Optimization**: `hashUnsafeWords` is optimized for 4-byte aligned data.
235
3. **Seed Support**: Allows different hash families with different seed values.
236
4. **Platform Optimized**: x86_32 implementation optimized for 32-bit hash output.
237
238
### Hive Compatibility
239
240
1. **Legacy Support**: Provides Hive-compatible hashing for data migration scenarios.
241
2. **Simple Algorithms**: Uses simple XOR-based hashing for predictable results.
242
3. **SQL Integration**: Designed for Spark SQL compatibility with Hive.
243
244
### Bitset Operations
245
246
1. **Word-Level Operations**: Operates on 64-bit words for maximum efficiency.
247
2. **Cache Friendly**: Sequential bit access patterns optimize cache usage.
248
3. **Minimal Overhead**: Direct memory operations with no object allocation.
249
250
## Usage Notes
251
252
1. **Memory Alignment**: Bitsets should be properly aligned for optimal performance.
253
254
2. **Thread Safety**: None of these classes provide thread safety guarantees.
255
256
3. **Memory Management**: Bitset operations work on raw memory - ensure proper lifecycle management.
257
258
4. **Bit Indexing**: All bit operations use 0-based indexing.
259
260
5. **Word Boundaries**: Bitset operations are optimized for word (64-bit) boundaries.
261
262
6. **Hash Collision**: While Murmur3 has good distribution, consider collision handling in hash tables.
263
264
7. **Seed Selection**: Choose hash seeds carefully to avoid predictable hash patterns.
265
266
## Common Patterns
267
268
### Hash Table Implementation
269
270
```java
271
// Use Murmur3 for hash table operations
272
public class HashTable<K, V> {
273
private static final int DEFAULT_SEED = 42;
274
private final Murmur3_x86_32 hasher = new Murmur3_x86_32(DEFAULT_SEED);
275
276
private int hash(K key) {
277
if (key instanceof Integer) {
278
return hasher.hashInt((Integer) key);
279
} else if (key instanceof Long) {
280
return hasher.hashLong((Long) key);
281
} else {
282
byte[] bytes = key.toString().getBytes(StandardCharsets.UTF_8);
283
return Murmur3_x86_32.hashUnsafeBytes2(
284
bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length, DEFAULT_SEED
285
);
286
}
287
}
288
}
289
```
290
291
### Efficient Bloom Filter
292
293
```java
294
// Implement Bloom filter using bitset operations
295
public class BloomFilter {
296
private final MemoryBlock bitset;
297
private final Object baseObject;
298
private final long baseOffset;
299
private final int bitsetSizeInWords;
300
private final int[] hashSeeds;
301
302
public BloomFilter(int sizeInBits, int numHashFunctions) {
303
int sizeInBytes = (sizeInBits + 7) / 8; // Round up to byte boundary
304
this.bitsetSizeInWords = (sizeInBytes + 7) / 8; // Round up to word boundary
305
306
MemoryAllocator allocator = MemoryAllocator.HEAP;
307
this.bitset = allocator.allocate(bitsetSizeInWords * 8);
308
this.baseObject = bitset.getBaseObject();
309
this.baseOffset = bitset.getBaseOffset();
310
311
// Clear bitset
312
Platform.setMemory(baseObject, baseOffset, bitsetSizeInWords * 8, (byte) 0);
313
314
// Initialize hash seeds
315
this.hashSeeds = new int[numHashFunctions];
316
for (int i = 0; i < numHashFunctions; i++) {
317
this.hashSeeds[i] = i * 97; // Different seeds for each hash function
318
}
319
}
320
321
public void add(int value) {
322
for (int seed : hashSeeds) {
323
int hash = Murmur3_x86_32.hashInt(value, seed);
324
int bitIndex = Math.abs(hash) % (bitsetSizeInWords * 64);
325
BitSetMethods.set(baseObject, baseOffset, bitIndex);
326
}
327
}
328
329
public boolean mightContain(int value) {
330
for (int seed : hashSeeds) {
331
int hash = Murmur3_x86_32.hashInt(value, seed);
332
int bitIndex = Math.abs(hash) % (bitsetSizeInWords * 64);
333
if (!BitSetMethods.isSet(baseObject, baseOffset, bitIndex)) {
334
return false;
335
}
336
}
337
return true;
338
}
339
}
340
```