or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

array-operations.mddata-types-utilities.mdhash-bitset-operations.mdindex.mdmemory-management.mdplatform-operations.mdutf8-string-processing.md

hash-bitset-operations.mddocs/

0

# Hash Functions and Bitset Operations

1

2

Fast hash function implementations and bitset manipulation methods for efficient data processing and boolean operations. These operations provide high-performance hashing and bitset manipulation capabilities essential for data structures and algorithms in distributed computing.

3

4

## Capabilities

5

6

### Murmur3 Hash Function

7

8

32-bit Murmur3 hash function implementation providing fast, high-quality hashing for data processing applications.

9

10

```java { .api }

11

public final class Murmur3_x86_32 {

12

public Murmur3_x86_32(int seed);

13

14

// Instance methods

15

public String toString();

16

public int hashInt(int input);

17

public int hashUnsafeWords(Object base, long offset, int lengthInBytes);

18

public int hashLong(long input);

19

20

// Static methods

21

public static int hashInt(int input, int seed);

22

public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);

23

public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);

24

public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);

25

public static int hashLong(long input, int seed);

26

}

27

```

28

29

### Hive-Compatible Hash Function

30

31

Hash function implementation that simulates Hive's hash function for compatibility with Hive v1.2.1, ensuring consistent hashing across Spark and Hive environments.

32

33

```java { .api }

34

public class HiveHasher {

35

public String toString();

36

37

public static int hashInt(int input);

38

public static int hashLong(long input);

39

public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes);

40

}

41

```

42

43

### Bitset Operations

44

45

Methods for working with fixed-size uncompressed bitsets stored in memory, providing efficient boolean array operations.

46

47

```java { .api }

48

public final class BitSetMethods {

49

public static void set(Object baseObject, long baseOffset, int index);

50

public static void unset(Object baseObject, long baseOffset, int index);

51

public static boolean isSet(Object baseObject, long baseOffset, int index);

52

public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);

53

public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);

54

}

55

```

56

57

## Usage Examples

58

59

### Murmur3 Hashing

60

61

```java

62

import org.apache.spark.unsafe.hash.Murmur3_x86_32;

63

64

// Create hasher with specific seed

65

Murmur3_x86_32 hasher = new Murmur3_x86_32(42);

66

67

// Hash different data types

68

int intHash = hasher.hashInt(12345);

69

long longHash = hasher.hashLong(123456789L);

70

71

// Hash memory regions (word-aligned for optimal performance)

72

byte[] data = "Hello World".getBytes();

73

int memoryHash = hasher.hashUnsafeWords(

74

data,

75

Platform.BYTE_ARRAY_OFFSET,

76

data.length

77

);

78

79

// Static methods with custom seed

80

int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);

81

int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);

82

83

// Hash arbitrary byte sequences

84

int byteHash = Murmur3_x86_32.hashUnsafeBytes2(

85

data,

86

Platform.BYTE_ARRAY_OFFSET,

87

data.length,

88

42

89

);

90

```

91

92

### Hive-Compatible Hashing

93

94

```java

95

import org.apache.spark.sql.catalyst.expressions.HiveHasher;

96

97

// Hash integers (returns input unchanged for compatibility)

98

int intHash = HiveHasher.hashInt(12345); // Returns 12345

99

100

// Hash longs using XOR of upper and lower 32 bits

101

long longValue = 0x123456789ABCDEFL;

102

int longHash = HiveHasher.hashLong(longValue);

103

104

// Hash byte sequences using rolling hash algorithm

105

byte[] data = "test data".getBytes();

106

int byteHash = HiveHasher.hashUnsafeBytes(

107

data,

108

Platform.BYTE_ARRAY_OFFSET,

109

data.length

110

);

111

```

112

113

### Bitset Operations

114

115

```java

116

import org.apache.spark.unsafe.bitset.BitSetMethods;

117

import org.apache.spark.unsafe.Platform;

118

119

// Allocate memory for bitset (e.g., 64 bits = 1 word)

120

long bitsetAddress = Platform.allocateMemory(8);

121

Platform.setMemory(bitsetAddress, (byte) 0, 8); // Initialize to zeros

122

123

// Set individual bits

124

BitSetMethods.set(null, bitsetAddress, 5); // Set bit 5

125

BitSetMethods.set(null, bitsetAddress, 10); // Set bit 10

126

BitSetMethods.set(null, bitsetAddress, 15); // Set bit 15

127

128

// Check if bits are set

129

boolean bit5Set = BitSetMethods.isSet(null, bitsetAddress, 5); // true

130

boolean bit7Set = BitSetMethods.isSet(null, bitsetAddress, 7); // false

131

132

// Unset bits

133

BitSetMethods.unset(null, bitsetAddress, 10); // Clear bit 10

134

boolean bit10Set = BitSetMethods.isSet(null, bitsetAddress, 10); // false

135

136

// Check if any bits are set in the bitset

137

boolean anyBitsSet = BitSetMethods.anySet(null, bitsetAddress, 1); // true (bits 5 and 15 are set)

138

139

// Find next set bit starting from a given index

140

int nextBit = BitSetMethods.nextSetBit(null, bitsetAddress, 0, 1); // Returns 5

141

int nextBit2 = BitSetMethods.nextSetBit(null, bitsetAddress, 6, 1); // Returns 15

142

143

// Clean up

144

Platform.freeMemory(bitsetAddress);

145

```

146

147

### Array-Based Bitset Operations

148

149

```java

150

import org.apache.spark.unsafe.bitset.BitSetMethods;

151

import org.apache.spark.unsafe.Platform;

152

153

// Use long array as bitset storage

154

long[] bitsetArray = new long[2]; // 128 bits

155

156

// Set bits using array as backing store

157

BitSetMethods.set(bitsetArray, Platform.LONG_ARRAY_OFFSET, 65); // Set bit 65 (in second long)

158

BitSetMethods.set(bitsetArray, Platform.LONG_ARRAY_OFFSET, 120); // Set bit 120

159

160

// Check bits

161

boolean bit65Set = BitSetMethods.isSet(bitsetArray, Platform.LONG_ARRAY_OFFSET, 65); // true

162

163

// Check if any bits are set in the entire bitset

164

boolean anySet = BitSetMethods.anySet(bitsetArray, Platform.LONG_ARRAY_OFFSET, 2); // true

165

166

// Find next set bit

167

int nextSetBit = BitSetMethods.nextSetBit(bitsetArray, Platform.LONG_ARRAY_OFFSET, 0, 2); // 65

168

```

169

170

### Combined Hash and Bitset Usage

171

172

```java

173

import org.apache.spark.unsafe.hash.Murmur3_x86_32;

174

import org.apache.spark.unsafe.bitset.BitSetMethods;

175

import org.apache.spark.unsafe.Platform;

176

177

// Create a bloom filter-like structure

178

long[] bitset = new long[16]; // 1024 bits

179

Murmur3_x86_32 hasher1 = new Murmur3_x86_32(42);

180

Murmur3_x86_32 hasher2 = new Murmur3_x86_32(123);

181

182

// Insert element

183

String element = "example";

184

byte[] elementBytes = element.getBytes();

185

int hash1 = Math.abs(hasher1.hashUnsafeWords(elementBytes, Platform.BYTE_ARRAY_OFFSET, elementBytes.length));

186

int hash2 = Math.abs(hasher2.hashUnsafeWords(elementBytes, Platform.BYTE_ARRAY_OFFSET, elementBytes.length));

187

188

int bit1 = hash1 % 1024;

189

int bit2 = hash2 % 1024;

190

191

BitSetMethods.set(bitset, Platform.LONG_ARRAY_OFFSET, bit1);

192

BitSetMethods.set(bitset, Platform.LONG_ARRAY_OFFSET, bit2);

193

194

// Check if element might be present

195

boolean mightContain = BitSetMethods.isSet(bitset, Platform.LONG_ARRAY_OFFSET, bit1) &&

196

BitSetMethods.isSet(bitset, Platform.LONG_ARRAY_OFFSET, bit2);

197

```

198

199

### Performance Considerations

200

201

```java

202

import org.apache.spark.unsafe.hash.Murmur3_x86_32;

203

204

// For word-aligned data, use hashUnsafeWords for better performance

205

byte[] alignedData = new byte[16]; // Multiple of 4 bytes

206

int wordHash = Murmur3_x86_32.hashUnsafeWords(

207

alignedData,

208

Platform.BYTE_ARRAY_OFFSET,

209

alignedData.length,

210

42

211

);

212

213

// For arbitrary byte sequences, use hashUnsafeBytes2

214

byte[] arbitraryData = new byte[15]; // Not word-aligned

215

int byteHash = Murmur3_x86_32.hashUnsafeBytes2(

216

arbitraryData,

217

Platform.BYTE_ARRAY_OFFSET,

218

arbitraryData.length,

219

42

220

);

221

222

// Reuse hasher instances when hashing multiple values with the same seed

223

Murmur3_x86_32 reusableHasher = new Murmur3_x86_32(42);

224

int[] values = {1, 2, 3, 4, 5};

225

for (int value : values) {

226

int hash = reusableHasher.hashInt(value);

227

// Process hash

228

}

229

```