or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

array-operations.md · hashing-utilities.md · index.md · memory-management.md · platform-operations.md · utf8-string-operations.md

docs/utf8-string-operations.md

0

# UTF8 String Operations

1

2

Memory-efficient UTF-8 string implementation with comprehensive string manipulation, parsing, and comparison operations optimized for Spark's internal use, providing zero-copy operations and direct memory access.

3

4

## Capabilities

5

6

### UTF8String Creation

7

8

Factory methods for creating UTF8String instances from various sources with memory-efficient operations.

9

10

```java { .api }

11

/**

12

* UTF-8 encoded string for internal Spark use with memory-efficient operations

13

*/

14

final class UTF8String implements Comparable<UTF8String>, Externalizable, KryoSerializable, Cloneable {

15

16

// Factory methods

17

/**

18

* Create UTF8String from Java String

19

* @param str Java String to convert

20

* @return UTF8String instance

21

*/

22

public static UTF8String fromString(String str);

23

24

/**

25

* Create UTF8String from byte array

26

* @param bytes UTF-8 encoded byte array

27

* @return UTF8String instance

28

*/

29

public static UTF8String fromBytes(byte[] bytes);

30

31

/**

32

* Create UTF8String from byte array slice

33

* @param bytes UTF-8 encoded byte array

34

* @param offset Starting offset in array

35

* @param numBytes Number of bytes to use

36

* @return UTF8String instance

37

*/

38

public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes);

39

40

/**

41

* Create UTF8String from memory address

42

* @param base Base object (null for off-heap)

43

* @param offset Offset within object or address

44

* @param numBytes Number of bytes

45

* @return UTF8String instance

46

*/

47

public static UTF8String fromAddress(Object base, long offset, int numBytes);

48

49

/**

50

* Create string of spaces

51

* @param length Number of spaces

52

* @return UTF8String containing spaces

53

*/

54

public static UTF8String blankString(int length);

55

56

// Constants

57

public static final UTF8String EMPTY_UTF8; // Empty UTF8String instance

58

}

59

```

60

61

**Usage Examples:**

62

63

```java

64

import org.apache.spark.unsafe.types.UTF8String;

65

66

// Create from Java String

67

UTF8String str1 = UTF8String.fromString("Hello World");

68

69

// Create from byte array

70

byte[] data = "Hello".getBytes(java.nio.charset.StandardCharsets.UTF_8);

71

UTF8String str2 = UTF8String.fromBytes(data);

72

73

// Create blank string

74

UTF8String spaces = UTF8String.blankString(10);

75

76

// Use empty constant

77

UTF8String empty = UTF8String.EMPTY_UTF8;

78

```

79

80

### String Concatenation

81

82

Efficient string concatenation operations supporting multiple input strings and custom separators.

83

84

```java { .api }

85

/**

86

* Concatenate multiple UTF8String instances

87

* @param inputs UTF8String instances to concatenate

88

* @return Concatenated UTF8String

89

*/

90

public static UTF8String concat(UTF8String... inputs);

91

92

/**

93

* Concatenate UTF8String instances with separator

94

* @param separator Separator string

95

* @param inputs UTF8String instances to concatenate

96

* @return Concatenated UTF8String with separators

97

*/

98

public static UTF8String concatWs(UTF8String separator, UTF8String... inputs);

99

```

100

101

### Memory Access Operations

102

103

Direct memory access methods for efficient I/O and serialization operations.

104

105

```java { .api }

106

/**

107

* Get base object for memory access (null for off-heap)

108

* @return Base object or null

109

*/

110

public Object getBaseObject();

111

112

/**

113

* Get offset within base object or direct address

114

* @return Offset or address

115

*/

116

public long getBaseOffset();

117

118

/**

119

* Get number of bytes in UTF-8 encoding

120

* @return Byte count

121

*/

122

public int numBytes();

123

124

/**

125

* Get underlying byte array (creates copy if needed)

126

* @return UTF-8 encoded byte array

127

*/

128

public byte[] getBytes();

129

130

/**

131

* Write string data to memory location

132

* @param target Target object (null for off-heap)

133

* @param targetOffset Target offset or address

134

*/

135

public void writeToMemory(Object target, long targetOffset);

136

137

/**

138

* Write string data to ByteBuffer

139

* @param buffer Target ByteBuffer

140

*/

141

public void writeTo(ByteBuffer buffer);

142

143

/**

144

* Get string as ByteBuffer view

145

* @return ByteBuffer view of string data

146

*/

147

public ByteBuffer getByteBuffer();

148

149

/**

150

* Write string data to OutputStream

151

* @param out Target OutputStream

152

* @throws IOException if I/O error occurs

153

*/

154

public void writeTo(OutputStream out) throws IOException;

155

```

156

157

### String Properties and Analysis

158

159

Methods for analyzing string properties and extracting metadata.

160

161

```java { .api }

162

/**

163

* Get number of Unicode characters

164

* @return Character count

165

*/

166

public int numChars();

167

168

/**

169

* Get sorting prefix for efficient comparisons

170

* @return Long value for prefix sorting

171

*/

172

public long getPrefix();

173

```

174

175

### Substring Operations

176

177

Efficient substring extraction with both index-based and SQL-style positioning.

178

179

```java { .api }

180

/**

181

* Extract substring using start and end indices

182

* @param start Starting character index (inclusive)

183

* @param until Ending character index (exclusive)

184

* @return Substring as UTF8String

185

*/

186

public UTF8String substring(int start, int until);

187

188

/**

189

* Extract substring using SQL-style 1-based positioning

190

* @param pos Starting position (1-based)

191

* @param length Number of characters

192

* @return Substring as UTF8String

193

*/

194

public UTF8String substringSQL(int pos, int length);

195

```

196

197

### String Search Operations

198

199

Methods for searching within strings including substring matching and position finding.

200

201

```java { .api }

202

/**

203

* Check if string contains substring

204

* @param substring Substring to search for

205

* @return true if substring is found

206

*/

207

public boolean contains(UTF8String substring);

208

209

/**

210

* Check if string starts with prefix

211

* @param prefix Prefix to check

212

* @return true if string starts with prefix

213

*/

214

public boolean startsWith(UTF8String prefix);

215

216

/**

217

* Check if string ends with suffix

218

* @param suffix Suffix to check

219

* @return true if string ends with suffix

220

*/

221

public boolean endsWith(UTF8String suffix);

222

223

/**

224

* Find index of substring starting from position

225

* @param v Substring to find

226

* @param start Starting position for search

227

* @return Index of substring or -1 if not found

228

*/

229

public int indexOf(UTF8String v, int start);

230

231

/**

232

* Find position in comma-separated list

233

* @param match String to find in list

234

* @return 1-based position or 0 if not found

235

*/

236

public int findInSet(UTF8String match);

237

238

/**

239

* Check if string matches at specific position

240

* @param s String to match

241

* @param pos Position to check match

242

* @return true if strings match at position

243

*/

244

public boolean matchAt(UTF8String s, int pos);

245

```

246

247

### Case Conversion

248

249

Case conversion operations preserving UTF-8 encoding and supporting Unicode.

250

251

```java { .api }

252

/**

253

* Convert to uppercase

254

* @return Uppercase UTF8String

255

*/

256

public UTF8String toUpperCase();

257

258

/**

259

* Convert to lowercase

260

* @return Lowercase UTF8String

261

*/

262

public UTF8String toLowerCase();

263

264

/**

265

* Convert to title case

266

* @return Title case UTF8String

267

*/

268

public UTF8String toTitleCase();

269

```

270

271

### String Trimming Operations

272

273

Comprehensive trimming operations for whitespace and custom character removal.

274

275

```java { .api }

276

/**

277

* Trim leading and trailing spaces

278

* @return Trimmed UTF8String

279

*/

280

public UTF8String trim();

281

282

/**

283

* Trim all types of whitespace characters

284

* @return Trimmed UTF8String

285

*/

286

public UTF8String trimAll();

287

288

/**

289

* Trim specific characters from both ends

290

* @param trimString Characters to trim

291

* @return Trimmed UTF8String

292

*/

293

public UTF8String trim(UTF8String trimString);

294

295

/**

296

* Trim leading spaces

297

* @return Left-trimmed UTF8String

298

*/

299

public UTF8String trimLeft();

300

301

/**

302

* Trim specific characters from start

303

* @param trimString Characters to trim

304

* @return Left-trimmed UTF8String

305

*/

306

public UTF8String trimLeft(UTF8String trimString);

307

308

/**

309

* Trim trailing spaces

310

* @return Right-trimmed UTF8String

311

*/

312

public UTF8String trimRight();

313

314

/**

315

* Trim specific number of trailing spaces

316

* @param numSpaces Number of spaces to trim

317

* @return Right-trimmed UTF8String

318

*/

319

public UTF8String trimTrailingSpaces(int numSpaces);

320

321

/**

322

* Trim specific characters from end

323

* @param trimString Characters to trim

324

* @return Right-trimmed UTF8String

325

*/

326

public UTF8String trimRight(UTF8String trimString);

327

```

328

329

### String Manipulation

330

331

Advanced string manipulation including reversal, repetition, padding, and character replacement.

332

333

```java { .api }

334

/**

335

* Reverse the string

336

* @return Reversed UTF8String

337

*/

338

public UTF8String reverse();

339

340

/**

341

* Repeat string multiple times

342

* @param times Number of repetitions

343

* @return Repeated UTF8String

344

*/

345

public UTF8String repeat(int times);

346

347

/**

348

* Right pad string to specified length

349

* @param len Target length

350

* @param pad Padding string

351

* @return Right-padded UTF8String

352

*/

353

public UTF8String rpad(int len, UTF8String pad);

354

355

/**

356

* Left pad string to specified length

357

* @param len Target length

358

* @param pad Padding string

359

* @return Left-padded UTF8String

360

*/

361

public UTF8String lpad(int len, UTF8String pad);

362

363

/**

364

* Replace all occurrences of search string

365

* @param search String to search for

366

* @param replace Replacement string

367

* @return String with replacements

368

*/

369

public UTF8String replace(UTF8String search, UTF8String replace);

370

371

/**

372

* Translate characters using mapping dictionary

373

* @param dict Character translation dictionary

374

* @return Translated UTF8String

375

*/

376

public UTF8String translate(Map<String, String> dict);

377

378

/**

379

* Generate soundex encoding

380

* @return Soundex encoded UTF8String

381

*/

382

public UTF8String soundex();

383

```

384

385

### String Splitting

386

387

String splitting operations with pattern matching and SQL-style delimiters.

388

389

```java { .api }

390

/**

391

* Split string using regex pattern

392

* @param pattern Regex pattern for splitting

393

* @param limit Maximum number of elements in the returned array (-1 for no limit)

394

* @return Array of split UTF8String parts

395

*/

396

public UTF8String[] split(UTF8String pattern, int limit);

397

398

/**

399

* Split string using SQL-style delimiter

400

* @param delimiter Delimiter string

401

* @param limit Maximum number of elements in the returned array (-1 for no limit)

402

* @return Array of split UTF8String parts

403

*/

404

public UTF8String[] splitSQL(UTF8String delimiter, int limit);

405

406

/**

407

* Extract substring by delimiter occurrence count

408

* @param delim Delimiter string

409

* @param count Occurrence count (positive from start, negative from end)

410

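* @return Substring before the count-th delimiter counted from the start (count > 0), or after the count-th delimiter counted from the end (count < 0)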

411

*/

412

public UTF8String subStringIndex(UTF8String delim, int count);

413

```

414

415

### Numeric Parsing

416

417

Safe and exact numeric parsing methods with wrapper classes for result handling.

418

419

```java { .api }

420

/**

421

* Wrapper class for long parsing results

422

*/

423

public static class LongWrapper implements Serializable {

424

public transient long value; // Parsed long value

425

}

426

427

/**

428

* Wrapper class for int parsing results

429

*/

430

public static class IntWrapper implements Serializable {

431

public transient int value; // Parsed int value

432

}

433

434

// Safe parsing methods (return false on failure)

435

/**

436

* Parse string to long with error handling

437

* @param toLongResult Wrapper to store result

438

* @return true if parsing succeeded

439

*/

440

public boolean toLong(LongWrapper toLongResult);

441

442

/**

443

* Parse string to int with error handling

444

* @param intWrapper Wrapper to store result

445

* @return true if parsing succeeded

446

*/

447

public boolean toInt(IntWrapper intWrapper);

448

449

/**

450

* Parse string to short with error handling

451

* @param intWrapper Wrapper to store result

452

* @return true if parsing succeeded

453

*/

454

public boolean toShort(IntWrapper intWrapper);

455

456

/**

457

* Parse string to byte with error handling

458

* @param intWrapper Wrapper to store result

459

* @return true if parsing succeeded

460

*/

461

public boolean toByte(IntWrapper intWrapper);

462

463

// Exact parsing methods (throw exceptions on failure)

464

/**

465

* Parse string to long (throws exception on failure)

466

* @return Parsed long value

467

* @throws NumberFormatException if parsing fails

468

*/

469

public long toLongExact();

470

471

/**

472

* Parse string to int (throws exception on failure)

473

* @return Parsed int value

474

* @throws NumberFormatException if parsing fails

475

*/

476

public int toIntExact();

477

478

/**

479

* Parse string to short (throws exception on failure)

480

* @return Parsed short value

481

* @throws NumberFormatException if parsing fails

482

*/

483

public short toShortExact();

484

485

/**

486

* Parse string to byte (throws exception on failure)

487

* @return Parsed byte value

488

* @throws NumberFormatException if parsing fails

489

*/

490

public byte toByteExact();

491

```

492

493

**Usage Examples:**

494

495

```java

496

import org.apache.spark.unsafe.types.UTF8String;

497

498

// Safe parsing with error handling

499

UTF8String numStr = UTF8String.fromString("123");

500

UTF8String.LongWrapper longResult = new UTF8String.LongWrapper();

501

502

if (numStr.toLong(longResult)) {

503

long value = longResult.value;

504

System.out.println("Parsed: " + value);

505

} else {

506

System.out.println("Failed to parse as long");

507

}

508

509

// Exact parsing with exceptions

510

try {

511

int value = UTF8String.fromString("456").toIntExact();

512

System.out.println("Parsed: " + value);

513

} catch (NumberFormatException e) {

514

System.out.println("Invalid number format");

515

}

516

```

517

518

### String Comparison

519

520

Comprehensive comparison operations including lexicographic ordering and distance calculations.

521

522

```java { .api }

523

/**

524

* Compare strings lexicographically

525

* @param other String to compare with

526

* @return Negative, zero, or positive value

527

*/

528

public int compareTo(UTF8String other);

529

530

/**

531

* Compare strings (alias for compareTo)

532

* @param other String to compare with

533

* @return Negative, zero, or positive value

534

*/

535

public int compare(UTF8String other);

536

537

/**

538

* Check equality with another object

539

* @param other Object to compare with

540

* @return true if equal

541

*/

542

public boolean equals(Object other);

543

544

/**

545

* Calculate hash code

546

* @return Hash code value

547

*/

548

public int hashCode();

549

550

/**

551

* Calculate Levenshtein distance

552

* @param other String to compare with

553

* @return Edit distance

554

*/

555

public int levenshteinDistance(UTF8String other);

556

557

/**

558

* Calculate Levenshtein distance with threshold

559

* @param other String to compare with

560

* @param threshold Maximum distance to calculate

561

* @return Edit distance or -1 if exceeds threshold

562

*/

563

public int levenshteinDistance(UTF8String other, int threshold);

564

```

565

566

### Utility Operations

567

568

General utility methods for string conversion, copying, and serialization.

569

570

```java { .api }

571

/**

572

* Convert to Java String

573

* @return Java String representation

574

*/

575

public String toString();

576

577

/**

578

* Clone the UTF8String

579

* @return Cloned UTF8String

580

*/

581

public UTF8String clone();

582

583

/**

584

* Create a copy of the UTF8String

585

* @return Copied UTF8String

586

*/

587

public UTF8String copy();

588

589

// Serialization support methods

590

/**

591

* Write object to ObjectOutput (Externalizable)

592

* @param out ObjectOutput stream

593

* @throws IOException if I/O error occurs

594

*/

595

public void writeExternal(ObjectOutput out) throws IOException;

596

597

/**

598

* Read object from ObjectInput (Externalizable)

599

* @param in ObjectInput stream

600

* @throws IOException if I/O error occurs

601

* @throws ClassNotFoundException if class not found

602

*/

603

public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException;

604

605

/**

606

* Write using Kryo serialization

607

* @param kryo Kryo instance

608

* @param out Output stream

609

*/

610

public void write(Kryo kryo, Output out);

611

612

/**

613

* Read using Kryo serialization

614

* @param kryo Kryo instance

615

* @param in Input stream

616

*/

617

public void read(Kryo kryo, Input in);

618

```

619

620

## UTF8StringBuilder

621

622

Efficient builder for constructing UTF8String objects with automatic memory management.

623

624

```java { .api }

625

/**

626

* Builder for constructing UTF8String objects efficiently

627

*/

628

class UTF8StringBuilder {

629

/**

630

* Create builder with default initial size (16 bytes)

631

*/

632

public UTF8StringBuilder();

633

634

/**

635

* Create builder with custom initial size

636

* @param initialSize Initial buffer size in bytes

637

*/

638

public UTF8StringBuilder(int initialSize);

639

640

/**

641

* Append UTF8String to builder

642

* @param value UTF8String to append

643

*/

644

public void append(UTF8String value);

645

646

/**

647

* Append Java String to builder

648

* @param value Java String to append

649

*/

650

public void append(String value);

651

652

/**

653

* Append raw bytes to builder

654

* @param base Base object (null for off-heap)

655

* @param offset Offset within object or address

656

* @param length Number of bytes to append

657

*/

658

public void appendBytes(Object base, long offset, int length);

659

660

/**

661

* Build final UTF8String from accumulated data

662

* @return Constructed UTF8String

663

*/

664

public UTF8String build();

665

}

666

```

667

668

**Usage Examples:**

669

670

```java

671

import org.apache.spark.unsafe.types.*;

672

673

// Basic string operations

674

UTF8String str = UTF8String.fromString("Hello World");

675

UTF8String upper = str.toUpperCase();

676

UTF8String trimmed = str.trim();

677

678

// String searching and manipulation

679

boolean hasHello = str.contains(UTF8String.fromString("Hello"));

680

UTF8String substr = str.substring(0, 5);

681

UTF8String[] parts = str.split(UTF8String.fromString(" "), -1);

682

683

// Using UTF8StringBuilder

684

UTF8StringBuilder builder = new UTF8StringBuilder();

685

builder.append(UTF8String.fromString("Hello"));

686

builder.append(" ");

687

builder.append(UTF8String.fromString("World"));

688

UTF8String result = builder.build();

689

690

// Comparison and sorting

691

UTF8String str1 = UTF8String.fromString("apple");

692

UTF8String str2 = UTF8String.fromString("banana");

693

int comparison = str1.compareTo(str2); // negative value

694

695

// Numeric parsing

696

UTF8String numStr = UTF8String.fromString("123");

697

int value = numStr.toIntExact();

698

```

699

700

## Performance Characteristics

701

702

- **Zero-Copy Operations**: Many operations avoid memory copying by using views

703

- **Memory Efficiency**: Direct UTF-8 storage without Java String overhead

704

- **Fast Comparisons**: Optimized comparison using prefix sorting

705

- **Lazy Evaluation**: Some operations defer computation until needed

706

- **Native Operations**: Core operations compile to efficient native code