or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

arrays.mdhashing-bitsets.mdindex.mdintervals.mdmemory.mdplatform.mdutf8-strings.md

hashing-bitsets.mddocs/

0

# Hashing and Bitsets

1

2

Spark Unsafe provides high-performance hashing functions and bitset manipulation utilities optimized for data processing workloads. The hashing functionality includes Murmur3 implementation, while the bitset operations provide efficient manipulation of packed bit arrays.

3

4

## Core Imports

5

6

```java

7

import org.apache.spark.unsafe.hash.Murmur3_x86_32;

8

import org.apache.spark.unsafe.bitset.BitSetMethods;

9

```

10

11

## Usage Examples

12

13

### Murmur3 Hashing

14

15

```java

16

// Create hasher with specific seed

17

Murmur3_x86_32 hasher = new Murmur3_x86_32(42);

18

19

// Hash integers

20

int intHash = hasher.hashInt(12345);

21

int longHash = hasher.hashLong(123456789L);

22

23

// Static hashing methods (convenient for one-off operations)

24

int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);

25

int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);

26

27

// Hash word-aligned memory regions

28

byte[] data = "Hello, World!".getBytes(StandardCharsets.UTF_8);

29

int wordsHash = Murmur3_x86_32.hashUnsafeWords(

30

data, Platform.BYTE_ARRAY_OFFSET, data.length, 42

31

);

32

33

// Hash arbitrary byte sequences (improved method)

34

int bytesHash = Murmur3_x86_32.hashUnsafeBytes2(

35

data, Platform.BYTE_ARRAY_OFFSET, data.length, 42

36

);

37

```

38

39

40

### Bitset Operations

41

42

```java

43

// Allocate memory for bitset (assume 64 bits = 8 bytes = 1 word)

44

MemoryAllocator allocator = MemoryAllocator.HEAP;

45

MemoryBlock bitsetMemory = allocator.allocate(8);

46

Object baseObject = bitsetMemory.getBaseObject();

47

long baseOffset = bitsetMemory.getBaseOffset();

48

49

try {

50

// Clear the bitset initially

51

Platform.setMemory(baseObject, baseOffset, 8, (byte) 0);

52

53

// Set individual bits

54

BitSetMethods.set(baseObject, baseOffset, 5); // Set bit 5

55

BitSetMethods.set(baseObject, baseOffset, 10); // Set bit 10

56

BitSetMethods.set(baseObject, baseOffset, 15); // Set bit 15

57

58

// Check if bits are set

59

boolean bit5Set = BitSetMethods.isSet(baseObject, baseOffset, 5); // true

60

boolean bit7Set = BitSetMethods.isSet(baseObject, baseOffset, 7); // false

61

62

// Check if any bits are set in the bitset

63

boolean anySet = BitSetMethods.anySet(baseObject, baseOffset, 1); // true (1 word)

64

65

// Find next set bit from a given position

66

int nextSet = BitSetMethods.nextSetBit(baseObject, baseOffset, 6, 1); // Returns 10

67

int noMore = BitSetMethods.nextSetBit(baseObject, baseOffset, 16, 1); // Returns -1

68

69

// Unset bits

70

BitSetMethods.unset(baseObject, baseOffset, 10); // Clear bit 10

71

boolean bit10Set = BitSetMethods.isSet(baseObject, baseOffset, 10); // false

72

73

} finally {

74

allocator.free(bitsetMemory);

75

}

76

```

77

78

### Advanced Bitset Usage

79

80

```java

81

// Working with larger bitsets (multiple words)

82

int numWords = 4; // 4 * 64 = 256 bits

83

MemoryBlock largeBitset = allocator.allocate(numWords * 8);

84

Object base = largeBitset.getBaseObject();

85

long offset = largeBitset.getBaseOffset();

86

87

try {

88

// Initialize bitset to all zeros

89

Platform.setMemory(base, offset, numWords * 8, (byte) 0);

90

91

// Set bits across different words

92

BitSetMethods.set(base, offset, 63); // Last bit of first word

93

BitSetMethods.set(base, offset, 64); // First bit of second word

94

BitSetMethods.set(base, offset, 200); // Bit in fourth word

95

96

// Check if any bits are set across all words

97

boolean anyBitsSet = BitSetMethods.anySet(base, offset, numWords);

98

99

// Find all set bits

100

int currentBit = -1;

101

while ((currentBit = BitSetMethods.nextSetBit(base, offset, currentBit + 1, numWords)) != -1) {

102

System.out.println("Bit " + currentBit + " is set");

103

}

104

105

} finally {

106

allocator.free(largeBitset);

107

}

108

```

109

110

## API Reference

111

112

### Murmur3_x86_32 Class

113

114

```java { .api }

115

public final class Murmur3_x86_32 {

116

/**

117

* Creates Murmur3 hasher with specified seed value.

118

*/

119

public Murmur3_x86_32(int seed);

120

121

/**

122

* Returns string representation of this hasher.

123

*/

124

public String toString();

125

126

/**

127

* Computes Murmur3 hash of an integer value.

128

*/

129

public int hashInt(int input);

130

131

/**

132

* Computes Murmur3 hash of word-aligned data.

133

* Data length must be multiple of 4 bytes.

134

*/

135

public int hashUnsafeWords(Object base, long offset, int lengthInBytes);

136

137

/**

138

* Computes Murmur3 hash of a long value.

139

*/

140

public int hashLong(long input);

141

}

142

```

143

144

### Murmur3_x86_32 Static Methods

145

146

```java { .api }

147

/**

148

* Static version of integer hashing with explicit seed.

149

*/

150

public static int hashInt(int input, int seed);

151

152

/**

153

* Static version of word-aligned hashing with explicit seed.

154

*/

155

public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);

156

157

/**

158

* Legacy byte-level hashing method (deprecated, use hashUnsafeBytes2).

159

*/

160

public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);

161

162

/**

163

* Improved byte-level hashing method for arbitrary data.

164

*/

165

public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);

166

167

/**

168

* Static version of long hashing with explicit seed.

169

*/

170

public static int hashLong(long input, int seed);

171

```

172

173

174

### BitSetMethods Class

175

176

```java { .api }

177

public final class BitSetMethods {

178

/**

179

* Sets the bit at the specified index to true.

180

*

181

* @param baseObject Base object containing the bitset

182

* @param baseOffset Offset to the start of the bitset

183

* @param index Bit index to set (0-based)

184

*/

185

public static void set(Object baseObject, long baseOffset, int index);

186

187

/**

188

* Sets the bit at the specified index to false.

189

*

190

* @param baseObject Base object containing the bitset

191

* @param baseOffset Offset to the start of the bitset

192

* @param index Bit index to unset (0-based)

193

*/

194

public static void unset(Object baseObject, long baseOffset, int index);

195

196

/**

197

* Returns true if the bit at the specified index is set.

198

*

199

* @param baseObject Base object containing the bitset

200

* @param baseOffset Offset to the start of the bitset

201

* @param index Bit index to check (0-based)

202

* @return true if bit is set, false otherwise

203

*/

204

public static boolean isSet(Object baseObject, long baseOffset, int index);

205

206

/**

207

* Returns true if any bit is set in the bitset.

208

*

209

* @param baseObject Base object containing the bitset

210

* @param baseOffset Offset to the start of the bitset

211

* @param bitSetWidthInWords Width of bitset in 64-bit words

212

* @return true if any bit is set, false if all bits are clear

213

*/

214

public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);

215

216

/**

217

* Finds the next set bit starting from the specified index.

218

*

219

* @param baseObject Base object containing the bitset

220

* @param baseOffset Offset to the start of the bitset

221

* @param fromIndex Starting bit index for search (inclusive)

222

* @param bitsetSizeInWords Size of bitset in 64-bit words

223

* @return Index of next set bit, or -1 if no set bit found

224

*/

225

public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);

226

}

227

```

228

229

## Performance Characteristics

230

231

### Murmur3 Hashing

232

233

1. **Fast Distribution**: Murmur3 provides good hash distribution with excellent performance.

234

2. **Word-Aligned Optimization**: `hashUnsafeWords` is optimized for 4-byte aligned data.

235

3. **Seed Support**: Allows different hash families with different seed values.

236

4. **Platform Optimized**: x86_32 implementation optimized for 32-bit hash output.

237

238

### Hive Compatibility

239

240

1. **Legacy Support**: Provides Hive-compatible hashing for data migration scenarios.

241

2. **Simple Algorithms**: Uses simple XOR-based hashing for predictable results.

242

3. **SQL Integration**: Designed for Spark SQL compatibility with Hive.

243

244

### Bitset Operations

245

246

1. **Word-Level Operations**: Operates on 64-bit words for maximum efficiency.

247

2. **Cache Friendly**: Sequential bit access patterns optimize cache usage.

248

3. **Minimal Overhead**: Direct memory operations with no object allocation.

249

250

## Usage Notes

251

252

1. **Memory Alignment**: Bitsets should be properly aligned for optimal performance.

253

254

2. **Thread Safety**: None of these classes provide thread safety guarantees.

255

256

3. **Memory Management**: Bitset operations work on raw memory - ensure proper lifecycle management.

257

258

4. **Bit Indexing**: All bit operations use 0-based indexing.

259

260

5. **Word Boundaries**: Bitset operations are optimized for word (64-bit) boundaries.

261

262

6. **Hash Collision**: While Murmur3 has good distribution, consider collision handling in hash tables.

263

264

7. **Seed Selection**: Choose hash seeds carefully to avoid predictable hash patterns.

265

266

## Common Patterns

267

268

### Hash Table Implementation

269

270

```java

271

// Use Murmur3 for hash table operations

272

public class HashTable<K, V> {

273

private static final int DEFAULT_SEED = 42;

274

private final Murmur3_x86_32 hasher = new Murmur3_x86_32(DEFAULT_SEED);

275

276

private int hash(K key) {

277

if (key instanceof Integer) {

278

return hasher.hashInt((Integer) key);

279

} else if (key instanceof Long) {

280

return hasher.hashLong((Long) key);

281

} else {

282

byte[] bytes = key.toString().getBytes(StandardCharsets.UTF_8);

283

return Murmur3_x86_32.hashUnsafeBytes2(

284

bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length, DEFAULT_SEED

285

);

286

}

287

}

288

}

289

```

290

291

### Efficient Bloom Filter

292

293

```java

294

// Implement Bloom filter using bitset operations

295

public class BloomFilter {

296

private final MemoryBlock bitset;

297

private final Object baseObject;

298

private final long baseOffset;

299

private final int bitsetSizeInWords;

300

private final int[] hashSeeds;

301

302

public BloomFilter(int sizeInBits, int numHashFunctions) {

303

int sizeInBytes = (sizeInBits + 7) / 8; // Round up to byte boundary

304

this.bitsetSizeInWords = (sizeInBytes + 7) / 8; // Round up to word boundary

305

306

MemoryAllocator allocator = MemoryAllocator.HEAP;

307

this.bitset = allocator.allocate(bitsetSizeInWords * 8);

308

this.baseObject = bitset.getBaseObject();

309

this.baseOffset = bitset.getBaseOffset();

310

311

// Clear bitset

312

Platform.setMemory(baseObject, baseOffset, bitsetSizeInWords * 8, (byte) 0);

313

314

// Initialize hash seeds

315

this.hashSeeds = new int[numHashFunctions];

316

for (int i = 0; i < numHashFunctions; i++) {

317

this.hashSeeds[i] = i * 97; // Different seeds for each hash function

318

}

319

}

320

321

public void add(int value) {

322

for (int seed : hashSeeds) {

323

int hash = Murmur3_x86_32.hashInt(value, seed);

324

int bitIndex = Math.abs(hash) % (bitsetSizeInWords * 64);

325

BitSetMethods.set(baseObject, baseOffset, bitIndex);

326

}

327

}

328

329

public boolean mightContain(int value) {

330

for (int seed : hashSeeds) {

331

int hash = Murmur3_x86_32.hashInt(value, seed);

332

int bitIndex = Math.abs(hash) % (bitsetSizeInWords * 64);

333

if (!BitSetMethods.isSet(baseObject, baseOffset, bitIndex)) {

334

return false;

335

}

336

}

337

return true;

338

}

339

}

340

```