or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

array-operations.md · hashing-utilities.md · index.md · memory-management.md · platform-operations.md · utf8-string-operations.md

docs/hashing-utilities.md

0

# Hashing and Utilities

1

2

High-performance hashing implementations and utility classes: Murmur3 hashing and Hive-compatible hashing for data distribution and compatibility requirements, plus bitset operations, date/time constants, platform-aligned record-size offsets, and the calendar interval type.

3

4

## Capabilities

5

6

### Murmur3 Hashing

7

8

Fast 32-bit Murmur3 hash implementation optimized for performance with support for different data types and memory layouts.

9

10

```java { .api }

11

/**

12

* 32-bit Murmur3 hasher implementation

13

*/

14

final class Murmur3_x86_32 {

15

/**

16

* Create hasher with specific seed

17

* @param seed Seed value for hashing

18

*/

19

public Murmur3_x86_32(int seed);

20

21

/**

22

* Get string representation of hasher

23

* @return String representation

24

*/

25

public String toString();

26

27

// Instance methods using hasher's seed

28

/**

29

* Hash integer value using instance seed

30

* @param input Integer to hash

31

* @return Hash value

32

*/

33

public int hashInt(int input);

34

35

/**

36

* Hash long value using instance seed

37

* @param input Long to hash

38

* @return Hash value

39

*/

40

public int hashLong(long input);

41

42

/**

43

* Hash word-aligned bytes using instance seed

44

* @param base Base object (null for off-heap)

45

* @param offset Offset within object or address

46

* @param lengthInBytes Number of bytes to hash (must be word-aligned)

47

* @return Hash value

48

*/

49

public int hashUnsafeWords(Object base, long offset, int lengthInBytes);

50

51

// Static methods with explicit seed

52

/**

53

* Hash integer with provided seed

54

* @param input Integer to hash

55

* @param seed Seed value

56

* @return Hash value

57

*/

58

public static int hashInt(int input, int seed);

59

60

/**

61

* Hash long with provided seed

62

* @param input Long to hash

63

* @param seed Seed value

64

* @return Hash value

65

*/

66

public static int hashLong(long input, int seed);

67

68

/**

69

* Hash word-aligned bytes with provided seed

70

* @param base Base object (null for off-heap)

71

* @param offset Offset within object or address

72

* @param lengthInBytes Number of bytes to hash (must be word-aligned)

73

* @param seed Seed value

74

* @return Hash value

75

*/

76

public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed);

77

78

/**

79

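* Hash arbitrary bytes with provided seed (legacy method: kept for backward compatibility; its result for trailing bytes differs from standard Murmur3 implementations)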

80

* @param base Base object (null for off-heap)

81

* @param offset Offset within object or address

82

* @param lengthInBytes Number of bytes to hash

83

* @param seed Seed value

84

* @return Hash value

85

*/

86

public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);

87

88

/**

89

* Hash arbitrary bytes with provided seed (compatible method: produces the same result as standard Murmur3 x86_32 implementations; prefer it for new code)

90

* @param base Base object (null for off-heap)

91

* @param offset Offset within object or address

92

* @param lengthInBytes Number of bytes to hash

93

* @param seed Seed value

94

* @return Hash value

95

*/

96

public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed);

97

}

98

```

99

100

**Usage Examples:**

101

102

```java

103

import org.apache.spark.unsafe.hash.Murmur3_x86_32;

104

import org.apache.spark.unsafe.Platform;

105

106

// Create hasher with seed

107

Murmur3_x86_32 hasher = new Murmur3_x86_32(42);

108

109

// Hash different data types using instance methods

110

int intHash = hasher.hashInt(12345);

111

int longHash = hasher.hashLong(123456789L);

112

113

// Hash byte arrays

114

byte[] data = "Hello, Murmur3!!".getBytes(); // 16 bytes: hashUnsafeWords requires a length that is a multiple of 8

115

int arrayHash = hasher.hashUnsafeWords(

116

data,

117

Platform.BYTE_ARRAY_OFFSET,

118

data.length

119

);

120

121

// Use static methods with explicit seed

122

int staticIntHash = Murmur3_x86_32.hashInt(12345, 42);

123

int staticLongHash = Murmur3_x86_32.hashLong(123456789L, 42);

124

125

// Hash memory regions

126

byte[] buffer = new byte[1024];

127

// ... fill buffer ...

128

int bufferHash = Murmur3_x86_32.hashUnsafeBytes(

129

buffer,

130

Platform.BYTE_ARRAY_OFFSET,

131

buffer.length,

132

42

133

);

134

135

// Hash off-heap memory

136

long address = Platform.allocateMemory(100);

137

try {

138

Platform.setMemory(address, (byte) 0xFF, 100);

139

int offHeapHash = Murmur3_x86_32.hashUnsafeBytes(

140

null, address, 100, 42

141

);

142

} finally {

143

Platform.freeMemory(address);

144

}

145

```

146

147

### BitSet Operations

148

149

Methods for working with fixed-size uncompressed bitsets stored in memory, providing efficient bit manipulation operations.

150

151

```java { .api }

152

/**

153

* Methods for working with fixed-size uncompressed bitsets

154

*/

155

final class BitSetMethods {

156

/**

157

* Set bit at specified index

158

* @param baseObject Base object (null for off-heap)

159

* @param baseOffset Base offset or address

160

* @param index Bit index to set

161

*/

162

public static void set(Object baseObject, long baseOffset, int index);

163

164

/**

165

* Unset (clear) bit at specified index

166

* @param baseObject Base object (null for off-heap)

167

* @param baseOffset Base offset or address

168

* @param index Bit index to unset

169

*/

170

public static void unset(Object baseObject, long baseOffset, int index);

171

172

/**

173

* Check if bit is set at specified index

174

* @param baseObject Base object (null for off-heap)

175

* @param baseOffset Base offset or address

176

* @param index Bit index to check

177

* @return true if bit is set

178

*/

179

public static boolean isSet(Object baseObject, long baseOffset, int index);

180

181

/**

182

* Check if any bit is set in the bitset

183

* @param baseObject Base object (null for off-heap)

184

* @param baseOffset Base offset or address

185

* @param bitSetWidthInWords Bitset width in 64-bit words

186

* @return true if any bit is set

187

*/

188

public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);

189

190

/**

191

* Find next set bit starting from index

192

* @param baseObject Base object (null for off-heap)

193

* @param baseOffset Base offset or address

194

* @param fromIndex Starting index for search

195

* @param bitsetSizeInWords Bitset size in 64-bit words

196

* @return Index of next set bit or -1 if not found

197

*/

198

public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);

199

}

200

```

201

202

**Usage Examples:**

203

204

```java

205

import org.apache.spark.unsafe.bitset.BitSetMethods;

206

import org.apache.spark.unsafe.Platform;

207

208

// Create bitset using long array (8 words = 512 bits)

209

long[] bitsetData = new long[8];

210

Object baseObj = bitsetData;

211

long baseOffset = Platform.LONG_ARRAY_OFFSET;

212

213

// Set some bits

214

BitSetMethods.set(baseObj, baseOffset, 10);

215

BitSetMethods.set(baseObj, baseOffset, 25);

216

BitSetMethods.set(baseObj, baseOffset, 100);

217

218

// Check if bits are set

219

boolean bit10Set = BitSetMethods.isSet(baseObj, baseOffset, 10); // true

220

boolean bit15Set = BitSetMethods.isSet(baseObj, baseOffset, 15); // false

221

222

// Check if any bits are set

223

boolean anySet = BitSetMethods.anySet(baseObj, baseOffset, 8); // true

224

225

// Find next set bit

226

int nextBit = BitSetMethods.nextSetBit(baseObj, baseOffset, 0, 8); // 10

227

int afterTen = BitSetMethods.nextSetBit(baseObj, baseOffset, 11, 8); // 25

228

229

// Clear a bit

230

BitSetMethods.unset(baseObj, baseOffset, 25);

231

boolean bit25Set = BitSetMethods.isSet(baseObj, baseOffset, 25); // false

232

233

// Using off-heap bitset

234

long address = Platform.allocateMemory(64); // 8 words * 8 bytes

235

try {

236

Platform.setMemory(address, (byte) 0, 64); // Clear all bits

237

238

BitSetMethods.set(null, address, 42);

239

boolean isSet = BitSetMethods.isSet(null, address, 42);

240

241

} finally {

242

Platform.freeMemory(address);

243

}

244

```

245

246

### Date/Time Constants

247

248

Comprehensive constants for date and time calculations and conversions, providing all common time unit relationships.

249

250

```java { .api }

251

/**

252

* Constants for date/time calculations and conversions

253

*/

254

class DateTimeConstants {

255

// Basic time units

256

public static final int MONTHS_PER_YEAR = 12;

257

public static final byte DAYS_PER_WEEK = 7;

258

public static final long HOURS_PER_DAY = 24L;

259

public static final long MINUTES_PER_HOUR = 60L;

260

public static final long SECONDS_PER_MINUTE = 60L;

261

262

// Computed time constants

263

public static final long SECONDS_PER_HOUR; // 3600

264

public static final long SECONDS_PER_DAY; // 86400

265

266

// Millisecond conversions

267

public static final long MILLIS_PER_SECOND = 1000L;

268

public static final long MILLIS_PER_MINUTE; // 60000

269

public static final long MILLIS_PER_HOUR; // 3600000

270

public static final long MILLIS_PER_DAY; // 86400000

271

272

// Microsecond conversions

273

public static final long MICROS_PER_MILLIS = 1000L;

274

public static final long MICROS_PER_SECOND; // 1000000

275

public static final long MICROS_PER_MINUTE; // 60000000

276

public static final long MICROS_PER_HOUR; // 3600000000

277

public static final long MICROS_PER_DAY; // 86400000000

278

279

// Nanosecond conversions

280

public static final long NANOS_PER_MICROS = 1000L;

281

public static final long NANOS_PER_MILLIS; // 1000000

282

public static final long NANOS_PER_SECOND; // 1000000000

283

}

284

```

285

286

**Usage Examples:**

287

288

```java

289

import org.apache.spark.sql.catalyst.util.DateTimeConstants;

290

291

// Time calculations using constants

292

long currentTimeMillis = System.currentTimeMillis();

293

294

// Convert to different units

295

long currentTimeSeconds = currentTimeMillis / DateTimeConstants.MILLIS_PER_SECOND;

296

long currentTimeMicros = currentTimeMillis * DateTimeConstants.MICROS_PER_MILLIS;

297

long currentTimeNanos = currentTimeMillis * DateTimeConstants.NANOS_PER_MILLIS;

298

299

// Calculate time spans

300

long hoursInWeek = DateTimeConstants.DAYS_PER_WEEK * DateTimeConstants.HOURS_PER_DAY;

301

long secondsInWeek = hoursInWeek * DateTimeConstants.SECONDS_PER_HOUR;

302

303

// Duration calculations

304

long durationDays = 5;

305

long durationMillis = durationDays * DateTimeConstants.MILLIS_PER_DAY;

306

long durationMicros = durationDays * DateTimeConstants.MICROS_PER_DAY;

307

308

// Conversion helpers

309

public static long millisToMicros(long millis) {

310

return millis * DateTimeConstants.MICROS_PER_MILLIS;

311

}

312

313

public static long microsToNanos(long micros) {

314

return micros * DateTimeConstants.NANOS_PER_MICROS;

315

}

316

317

public static long secondsToMillis(long seconds) {

318

return seconds * DateTimeConstants.MILLIS_PER_SECOND;

319

}

320

```

321

322

### Hive Hasher Compatibility

323

324

Hive-compatible hashing functions for maintaining compatibility with Hive v1.2.1 hashing behavior.

325

326

```java { .api }

327

/**

328

* Simulates Hive's hashing function from Hive v1.2.1 for compatibility

329

*/

330

class HiveHasher {

331

/**

332

* Hash integer using Hive-compatible algorithm

333

* @param input Integer to hash

334

* @return Hive-compatible hash value

335

*/

336

public static int hashInt(int input);

337

338

/**

339

* Hash long using Hive-compatible algorithm

340

* @param input Long to hash

341

* @return Hive-compatible hash value

342

*/

343

public static int hashLong(long input);

344

345

/**

346

* Hash byte array using Hive-compatible algorithm with unsafe access

347

* @param base Base object (null for off-heap)

348

* @param offset Offset within object or address

349

* @param lengthInBytes Number of bytes to hash

350

* @return Hive-compatible hash value

351

*/

352

public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes);

353

354

/**

355

* Get string representation

356

* @return String representation of hasher

357

*/

358

public String toString();

359

}

360

```

361

362

**Usage Examples:**

363

364

```java

365

import org.apache.spark.sql.catalyst.expressions.HiveHasher;

366

import org.apache.spark.unsafe.Platform;

367

368

// Hash integers for Hive compatibility

369

int intValue = 12345;

370

int hiveIntHash = HiveHasher.hashInt(intValue);

371

372

// Hash longs for Hive compatibility

373

long longValue = 123456789L;

374

int hiveLongHash = HiveHasher.hashLong(longValue);

375

376

// Hash byte arrays with Hive-compatible algorithm

377

byte[] data = "test data".getBytes();

378

int hiveArrayHash = HiveHasher.hashUnsafeBytes(

379

data,

380

Platform.BYTE_ARRAY_OFFSET,

381

data.length

382

);

383

384

// Use for partitioning compatibility with Hive tables

385

public int getHivePartition(Object value, int numPartitions) {

386

int hash;

387

if (value instanceof Integer) {

388

hash = HiveHasher.hashInt((Integer) value);

389

} else if (value instanceof Long) {

390

hash = HiveHasher.hashLong((Long) value);

391

} else {

392

byte[] bytes = value.toString().getBytes();

393

hash = HiveHasher.hashUnsafeBytes(

394

bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length

395

);

396

}

397

return (hash & Integer.MAX_VALUE) % numPartitions; // mask the sign bit: Math.abs(Integer.MIN_VALUE) stays negative

398

}

399

```

400

401

### Unsafe Aligned Offset Utilities

402

403

Platform-specific alignment handling for record length offsets, ensuring proper memory alignment across different architectures.

404

405

```java { .api }

406

/**

407

* Handles platform-specific alignment for record length offsets

408

*/

409

class UnsafeAlignedOffset {

410

/**

411

* Set UAO size for testing purposes

412

* @param size UAO size to set

413

*/

414

public static void setUaoSize(int size);

415

416

/**

417

* Get current UAO size

418

* @return Current UAO size

419

*/

420

public static int getUaoSize();

421

422

/**

423

* Get size value considering platform alignment

424

* @param object Base object

425

* @param offset Offset within object

426

* @return Size value with proper alignment

427

*/

428

public static int getSize(Object object, long offset);

429

430

/**

431

* Put size value considering platform alignment

432

* @param object Base object

433

* @param offset Offset within object

434

* @param value Size value to store

435

*/

436

public static void putSize(Object object, long offset, int value);

437

}

438

```

439

440

**Usage Examples:**

441

442

```java

443

import org.apache.spark.unsafe.UnsafeAlignedOffset;

444

import org.apache.spark.unsafe.Platform;

445

446

// Working with aligned record sizes

447

byte[] recordBuffer = new byte[1024];

448

long recordOffset = Platform.BYTE_ARRAY_OFFSET;

449

450

// Store record size with proper alignment

451

int recordSize = 256;

452

UnsafeAlignedOffset.putSize(recordBuffer, recordOffset, recordSize);

453

454

// Read record size with proper alignment

455

int storedSize = UnsafeAlignedOffset.getSize(recordBuffer, recordOffset);

456

457

// Check current alignment requirements

458

int uaoSize = UnsafeAlignedOffset.getUaoSize();

459

System.out.println("Current UAO size: " + uaoSize);

460

461

// For testing different alignment scenarios

462

UnsafeAlignedOffset.setUaoSize(8); // Set 8-byte alignment for testing

463

// ... run tests ...

464

UnsafeAlignedOffset.setUaoSize(0); // Reset to the platform default (0 clears the test override)

465

```

466

467

## Calendar Interval Type

468

469

Specialized data type for representing calendar intervals with separate components for months, days, and microseconds.

470

471

```java { .api }

472

/**

473

* Represents calendar intervals with months, days, and microseconds

474

* @Unstable - API may change in future versions

475

*/

476

final class CalendarInterval implements Serializable {

477

// Public fields for interval components

478

public final int months; // Number of months

479

public final int days; // Number of days

480

public final long microseconds; // Number of microseconds

481

482

/**

483

* Create calendar interval with specified components

484

* @param months Number of months

485

* @param days Number of days

486

* @param microseconds Number of microseconds

487

*/

488

public CalendarInterval(int months, int days, long microseconds);

489

490

/**

491

* Check equality with another object

492

* @param o Object to compare with

493

* @return true if equal

494

*/

495

public boolean equals(Object o);

496

497

/**

498

* Calculate hash code

499

* @return Hash code value

500

*/

501

public int hashCode();

502

503

/**

504

* Get string representation

505

* @return String representation of interval

506

*/

507

public String toString();

508

509

/**

510

* Extract interval as Java Period (months and days only)

511

* @return Java Period representation

512

*/

513

public Period extractAsPeriod();

514

515

/**

516

* Extract interval as Java Duration (microseconds only)

517

* @return Java Duration representation

518

*/

519

public Duration extractAsDuration();

520

}

521

```

522

523

**Usage Examples:**

524

525

```java

526

import org.apache.spark.unsafe.types.CalendarInterval;

527

import java.time.Period;

528

import java.time.Duration;

529

530

// Create calendar intervals

531

CalendarInterval interval1 = new CalendarInterval(2, 15, 3600000000L); // 2 months, 15 days, 1 hour

532

CalendarInterval interval2 = new CalendarInterval(0, 0, 1500000L); // 1.5 seconds

533

CalendarInterval interval3 = new CalendarInterval(12, 0, 0L); // 1 year

534

535

// Working with interval components

536

int months = interval1.months;

537

int days = interval1.days;

538

long microseconds = interval1.microseconds;

539

540

// Convert to Java time types

541

Period period = interval1.extractAsPeriod(); // 2 months, 15 days

542

Duration duration = interval1.extractAsDuration(); // 1 hour

543

544

// Comparison and equality

545

boolean areEqual = interval1.equals(interval2);

546

int hashCode = interval1.hashCode();

547

String description = interval1.toString();

548

549

// Common use cases

550

public static CalendarInterval oneHour() {

551

return new CalendarInterval(0, 0, DateTimeConstants.MICROS_PER_HOUR);

552

}

553

554

public static CalendarInterval oneDay() {

555

return new CalendarInterval(0, 1, 0L);

556

}

557

558

public static CalendarInterval oneMonth() {

559

return new CalendarInterval(1, 0, 0L);

560

}

561

```

562

563

## Performance and Usage Guidelines

564

565

### Choosing Hash Functions

566

567

```java

568

// For general-purpose hashing (faster)

569

int hash1 = Murmur3_x86_32.hashInt(value, seed);

570

571

// For Hive compatibility (when interfacing with Hive)

572

int hash2 = HiveHasher.hashInt(value);

573

574

// For consistent partitioning across Spark and Hive

575

int partition = (HiveHasher.hashInt(key) & Integer.MAX_VALUE) % numPartitions; // Math.abs would stay negative for Integer.MIN_VALUE

576

```

577

578

### BitSet Memory Layout

579

580

```java

581

// BitSet requires word-aligned memory

582

int numBits = 1000;

583

int numWords = (numBits + 63) / 64; // Round up to words

584

long[] bitsetStorage = new long[numWords];

585

586

// Always specify correct word count

587

boolean anySet = BitSetMethods.anySet(

588

bitsetStorage,

589

Platform.LONG_ARRAY_OFFSET,

590

numWords // Important: use actual word count

591

);

592

```

593

594

### Time Calculation Optimization

595

596

```java

597

// Pre-calculate commonly used values

598

private static final long MICROS_PER_WEEK =

599

DateTimeConstants.DAYS_PER_WEEK * DateTimeConstants.MICROS_PER_DAY;

600

601

// Use constants for efficient conversions

602

public long convertDaysToMicros(int days) {

603

return days * DateTimeConstants.MICROS_PER_DAY;

604

}

605

```