or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

arrays.md hashing-bitsets.md index.md intervals.md memory.md platform.md utf8-strings.md

utf8-strings.mddocs/

0

# UTF8 String Operations

1

2

The `UTF8String` class provides a high-performance UTF-8 string implementation specifically optimized for Spark SQL operations. It stores strings as UTF-8 encoded byte arrays with direct memory access for maximum performance in data processing workloads.

3

4

**Important**: This class is designed for internal Spark SQL use and should not be used in general applications outside of SQL contexts.

5

6

## Core Imports

7

8

```java

9

import org.apache.spark.unsafe.types.UTF8String;

10

```

11

12

## Usage Examples

13

14

### Basic String Creation and Conversion

15

16

```java

17

// Create UTF8String from Java String

18

UTF8String utf8 = UTF8String.fromString("Hello, World!");

19

20

// Create from byte array

21

byte[] bytes = "Hello".getBytes(StandardCharsets.UTF_8);

22

UTF8String fromBytes = UTF8String.fromBytes(bytes);

23

24

// Convert back to Java String

25

String javaString = utf8.toString();

26

27

// Get underlying bytes

28

byte[] underlyingBytes = utf8.getBytes();

29

```

30

31

### String Operations

32

33

```java

34

UTF8String original = UTF8String.fromString("Hello, World!");

35

36

// Basic properties

37

int numBytes = original.numBytes(); // Number of UTF-8 bytes

38

int numChars = original.numChars(); // Number of Unicode characters

39

40

// Case operations

41

UTF8String upper = original.toUpperCase();

42

UTF8String lower = original.toLowerCase();

43

UTF8String title = original.toTitleCase();

44

45

// Substring operations

46

UTF8String sub1 = original.substring(0, 5); // "Hello"

47

UTF8String sub2 = original.substringSQL(1, 5); // SQL-style substring

48

49

// Search operations

50

boolean contains = original.contains(UTF8String.fromString("World"));

51

boolean starts = original.startsWith(UTF8String.fromString("Hello"));

52

boolean ends = original.endsWith(UTF8String.fromString("!"));

53

```

54

55

### String Concatenation

56

57

```java

58

UTF8String str1 = UTF8String.fromString("Hello");

59

UTF8String str2 = UTF8String.fromString("World");

60

UTF8String separator = UTF8String.fromString(", ");

61

62

// Concatenate multiple strings

63

UTF8String result1 = UTF8String.concat(str1, separator, str2);

64

65

// Concatenate with separator

66

UTF8String result2 = UTF8String.concatWs(separator, str1, str2);

67

```

68

69

### String Manipulation

70

71

```java

72

UTF8String text = UTF8String.fromString(" Hello, World! ");

73

74

// Trimming operations

75

UTF8String trimmed = text.trim(); // Remove whitespace

76

UTF8String leftTrim = text.trimLeft(); // Remove left whitespace

77

UTF8String rightTrim = text.trimRight(); // Remove right whitespace

78

79

// Custom character trimming

80

UTF8String customTrim = text.trim(UTF8String.fromString(" !"));

81

82

// Other operations

83

UTF8String reversed = text.reverse();

84

UTF8String repeated = UTF8String.fromString("Hi").repeat(3); // "HiHiHi"

85

```

86

87

### Advanced String Operations

88

89

```java

90

UTF8String data = UTF8String.fromString("apple,banana,cherry");

91

UTF8String pattern = UTF8String.fromString(",");

92

93

// Split string

94

UTF8String[] parts = data.split(pattern, -1);

95

96

// Find and replace

97

UTF8String search = UTF8String.fromString("banana");

98

UTF8String replace = UTF8String.fromString("orange");

99

UTF8String replaced = data.replace(search, replace);

100

101

// Padding operations

102

UTF8String padded = UTF8String.fromString("Hi").rpad(10, UTF8String.fromString("*"));

103

UTF8String leftPadded = UTF8String.fromString("Hi").lpad(10, UTF8String.fromString("*"));

104

```

105

106

### Numeric Parsing

107

108

```java

109

UTF8String numberStr = UTF8String.fromString("12345");

110

111

// Parse as different numeric types

112

UTF8String.LongWrapper longResult = new UTF8String.LongWrapper();

113

boolean isValidLong = numberStr.toLong(longResult);

114

if (isValidLong) {

115

long value = longResult.value;

116

}

117

118

UTF8String.IntWrapper intResult = new UTF8String.IntWrapper();

119

boolean isValidInt = numberStr.toInt(intResult);

120

if (isValidInt) {

121

int value = intResult.value;

122

}

123

```

124

125

## API Reference

126

127

### Construction and Conversion

128

129

```java { .api }

130

public final class UTF8String implements Comparable<UTF8String>,

131

java.io.Externalizable, com.esotericsoftware.kryo.KryoSerializable, Cloneable {

132

133

// Constants

134

public static final UTF8String EMPTY_UTF8;

135

136

// Constructor for serialization

137

public UTF8String();

138

139

// Creation methods

140

public static UTF8String fromString(String str);

141

public static UTF8String fromBytes(byte[] bytes);

142

public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes);

143

public static UTF8String fromAddress(Object base, long offset, int numBytes);

144

public static UTF8String blankString(int length);

145

}

146

```

147

148

### Memory Access

149

150

```java { .api }

151

/**

152

* Returns base object for memory access.

153

*/

154

public Object getBaseObject();

155

156

/**

157

* Returns base offset for memory access.

158

*/

159

public long getBaseOffset();

160

161

/**

162

* Writes string content to specified memory location.

163

*/

164

public void writeToMemory(Object target, long targetOffset);

165

166

/**

167

* Writes string content to ByteBuffer.

168

*/

169

public void writeTo(java.nio.ByteBuffer buffer);

170

171

/**

172

* Returns ByteBuffer wrapping the string data.

173

*/

174

public java.nio.ByteBuffer getByteBuffer();

175

176

/**

177

* Writes string content to OutputStream.

178

*/

179

public void writeTo(java.io.OutputStream out);

180

```

181

182

### String Properties

183

184

```java { .api }

185

/**

186

* Returns number of bytes in UTF-8 encoding.

187

*/

188

public int numBytes();

189

190

/**

191

* Returns number of Unicode characters.

192

*/

193

public int numChars();

194

195

/**

196

* Returns 64-bit prefix for sorting operations.

197

*/

198

public long getPrefix();

199

200

/**

201

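* Returns the string's UTF-8 bytes; this may be the backing array itself or a copy (a copy is made when the string is a slice of a larger buffer).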

202

*/

203

public byte[] getBytes();

204

```

205

206

### Substring Operations

207

208

```java { .api }

209

/**

210

* Returns substring by character positions (0-based, exclusive end).

211

*/

212

public UTF8String substring(int start, int until);

213

214

/**

215

* Returns substring with SQL semantics (1-based start position, length counted in characters).

216

*/

217

public UTF8String substringSQL(int pos, int length);

218

```

219

220

### Search Operations

221

222

```java { .api }

223

/**

224

* Checks if string contains the specified substring.

225

*/

226

public boolean contains(UTF8String substring);

227

228

/**

229

* Checks if string starts with the specified prefix.

230

*/

231

public boolean startsWith(UTF8String prefix);

232

233

/**

234

* Checks if string ends with the specified suffix.

235

*/

236

public boolean endsWith(UTF8String suffix);

237

238

/**

239

* Finds index of substring starting from specified position.

240

*/

241

public int indexOf(UTF8String v, int start);

242

243

/**

244

* Treats this string as a comma-separated list and returns the 1-based position of match within it, or 0 if not found.

245

*/

246

public int findInSet(UTF8String match);

247

```

248

249

### Case Operations

250

251

```java { .api }

252

/**

253

* Returns uppercase version of the string.

254

*/

255

public UTF8String toUpperCase();

256

257

/**

258

* Returns lowercase version of the string.

259

*/

260

public UTF8String toLowerCase();

261

262

/**

263

* Returns title case version of the string.

264

*/

265

public UTF8String toTitleCase();

266

```

267

268

### Trimming Operations

269

270

```java { .api }

271

/**

272

* Trims whitespace from both ends.

273

*/

274

public UTF8String trim();

275

276

/**

277

* Trims specified characters from both ends.

278

*/

279

public UTF8String trim(UTF8String trimString);

280

281

/**

282

* Trims whitespace from left end.

283

*/

284

public UTF8String trimLeft();

285

286

/**

287

* Trims specified characters from left end.

288

*/

289

public UTF8String trimLeft(UTF8String trimString);

290

291

/**

292

* Trims whitespace from right end.

293

*/

294

public UTF8String trimRight();

295

296

/**

297

* Trims specified characters from right end.

298

*/

299

public UTF8String trimRight(UTF8String trimString);

300

```

301

302

### String Manipulation

303

304

```java { .api }

305

/**

306

* Returns reversed string.

307

*/

308

public UTF8String reverse();

309

310

/**

311

* Returns string repeated specified number of times.

312

*/

313

public UTF8String repeat(int times);

314

315

/**

316

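* Returns the substring before the count-th occurrence of delim when count is positive (counting from the left), or after it when count is negative (counting from the right).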

317

*/

318

public UTF8String subStringIndex(UTF8String delim, int count);

319

320

/**

321

* Right-pads string to specified length with pad string.

322

*/

323

public UTF8String rpad(int len, UTF8String pad);

324

325

/**

326

* Left-pads string to specified length with pad string.

327

*/

328

public UTF8String lpad(int len, UTF8String pad);

329

```

330

331

### Split and Replace Operations

332

333

```java { .api }

334

/**

335

* Splits string around matches of the regex pattern; limit caps the number of resulting parts (-1 means no limit).

336

*/

337

public UTF8String[] split(UTF8String pattern, int limit);

338

339

/**

340

* Replaces all occurrences of search string with replacement.

341

*/

342

public UTF8String replace(UTF8String search, UTF8String replace);

343

344

/**

345

* Translates characters using the provided dictionary.

346

*/

347

public UTF8String translate(java.util.Map<Character, Character> dict);

348

```

349

350

### Concatenation Operations

351

352

```java { .api }

353

/**

354

* Concatenates multiple UTF8Strings.

355

*/

356

public static UTF8String concat(UTF8String... inputs);

357

358

/**

359

* Concatenates UTF8Strings with separator.

360

*/

361

public static UTF8String concatWs(UTF8String separator, UTF8String... inputs);

362

```

363

364

### Numeric Parsing

365

366

```java { .api }

367

/**

368

* Parses string as long, returns success status.

369

*/

370

public boolean toLong(LongWrapper toLongResult);

371

372

/**

373

* Parses string as int, returns success status.

374

*/

375

public boolean toInt(IntWrapper intWrapper);

376

377

/**

378

* Parses string as short, returns success status.

379

*/

380

public boolean toShort(IntWrapper intWrapper);

381

382

/**

383

* Parses string as byte, returns success status.

384

*/

385

public boolean toByte(IntWrapper intWrapper);

386

```

387

388

### Comparison and Hashing

389

390

```java { .api }

391

/**

392

* Compares strings lexicographically.

393

*/

394

public int compareTo(UTF8String other);

395

396

/**

397

* Alias for compareTo.

398

*/

399

public int compare(UTF8String other);

400

401

/**

402

* Compares strings for equality.

403

*/

404

public boolean equals(Object other);

405

406

/**

407

* Computes Levenshtein distance between strings.

408

*/

409

public int levenshteinDistance(UTF8String other);

410

411

/**

412

* Returns Murmur3 hash code.

413

*/

414

public int hashCode();

415

```

416

417

### Object Operations

418

419

```java { .api }

420

/**

421

* Converts to Java String.

422

*/

423

public String toString();

424

425

/**

426

* Creates shallow copy sharing underlying data.

427

*/

428

public UTF8String clone();

429

430

/**

431

* Creates deep copy with new byte array.

432

*/

433

public UTF8String copy();

434

```

435

436

### Specialized Operations

437

438

```java { .api }

439

/**

440

* Computes Soundex phonetic encoding.

441

*/

442

public UTF8String soundex();

443

```

444

445

### Nested Classes

446

447

```java { .api }

448

/**

449

* Wrapper for long parsing results.

450

*/

451

public static final class LongWrapper {

452

public long value;

453

}

454

455

/**

456

* Wrapper for int parsing results.

457

*/

458

public static final class IntWrapper {

459

public int value;

460

}

461

```

462

463

## Performance Characteristics

464

465

1. **Memory Efficiency**: Stores strings as UTF-8 bytes, more compact than Java's UTF-16 strings

466

2. **Direct Access**: Provides direct memory access for high-performance operations

467

3. **Lazy Conversion**: Avoids unnecessary conversions to Java String objects

468

4. **Optimized Operations**: Many operations work directly on UTF-8 bytes without decoding

469

470

## Usage Notes

471

472

1. **SQL Context**: Designed specifically for Spark SQL operations, not general string processing

473

2. **Immutability**: UTF8String objects are immutable; operations return new instances

474

3. **Memory Management**: When created from memory addresses, ensure the underlying memory remains valid

475

4. **Character vs Byte Indexing**: Be aware of the difference between character positions and byte positions

476

5. **Thread Safety**: UTF8String instances are immutable and thread-safe