or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

array-operations.md
data-types-utilities.md
hash-bitset-operations.md
index.md
memory-management.md
platform-operations.md
utf8-string-processing.md
tile.json

docs/utf8-string-processing.md

UTF-8 String Processing

Comprehensive UTF-8 string manipulation with an extensive set of string operations, collation support, and optimized storage for internal Spark use. UTF8String is a full-featured alternative to Java's String class that is optimized for big data processing and supports direct memory access and collation-aware operations.

Capabilities

UTF8String Core Operations

UTF-8 encoded string class with comprehensive string manipulation capabilities, implementing Comparable, Externalizable, KryoSerializable, and Cloneable interfaces.

public final class UTF8String implements Comparable<UTF8String>, Externalizable, KryoSerializable, Cloneable {
    // Construction and factory methods
    public UTF8String();
    public static UTF8String fromBytes(byte[] bytes);
    public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes);
    public static UTF8String fromAddress(Object base, long offset, int numBytes);
    public static UTF8String fromString(String str);
    public static UTF8String blankString(int length);
    public static boolean isWhitespaceOrISOControl(int codePoint);
    public static int numBytesForFirstByte(byte b);
    
    // Constants
    public static final UTF8String EMPTY_UTF8;
    public static final UTF8String ZERO_UTF8;
    public static final UTF8String SPACE_UTF8;
    
    // Core access methods
    public Object getBaseObject();
    public long getBaseOffset();
    public int numBytes();
    public int numChars();
    public long getPrefix();
    public byte[] getBytes();
    public ByteBuffer getByteBuffer();
}

String Access and Validation

Methods for accessing individual characters and bytes, validating UTF-8 encoding, and converting between character and byte positions.

// Character and byte access
public byte getByte(int byteIndex);
public int getChar(int charIndex);
public int codePointFrom(int byteIndex);

// Validation methods
public UTF8String makeValid();
public boolean isValid();
public boolean isFullAscii();

// Position conversion
public int charPosToByte(int charPos);
public int bytePosToChar(int bytePos);

Substring Operations

Various substring extraction methods with different indexing strategies.

public UTF8String substring(int start, int until);
public UTF8String substringSQL(int pos, int length);
public UTF8String copyUTF8String(int start, int end);

Search Operations

Comprehensive string search and pattern matching capabilities.

public boolean contains(UTF8String substring);
public boolean matchAt(UTF8String s, int pos);
public boolean startsWith(UTF8String prefix);
public boolean endsWith(UTF8String suffix);
public int indexOf(UTF8String v, int start);
public int indexOfEmpty(int start);
public int find(UTF8String str, int start);
public int rfind(UTF8String str, int start);
public int findInSet(UTF8String match);

Case Conversion

Case conversion methods with both ASCII-only and full Unicode support.

public UTF8String toUpperCase();
public UTF8String toUpperCaseAscii();
public UTF8String toLowerCase();
public UTF8String toLowerCaseAscii();
public UTF8String toTitleCase();
public UTF8String toTitleCaseICU();

Trimming Operations

Various trimming methods for whitespace and custom character removal.

public UTF8String trim();
public UTF8String trimAll();
public UTF8String trim(UTF8String trimString);
public UTF8String trimLeft();
public UTF8String trimLeft(UTF8String trimString);
public UTF8String trimRight();
public UTF8String trimTrailingSpaces(int numSpaces);
public UTF8String trimRight(UTF8String trimString);

String Manipulation

Methods for string transformation, padding, and manipulation.

public UTF8String reverse();
public UTF8String repeat(int times);
public UTF8String rpad(int len, UTF8String pad);
public UTF8String lpad(int len, UTF8String pad);
public UTF8String subStringIndex(UTF8String delim, int count);
public UTF8String replace(UTF8String search, UTF8String replace);
public UTF8String translate(Map<String, String> dict);

Splitting Operations

Methods for splitting a string either by a regular-expression pattern (split) or by an SQL-style delimiter (splitSQL).

public UTF8String[] split(UTF8String pattern, int limit);
public UTF8String[] splitSQL(UTF8String delimiter, int limit);

Concatenation

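Static methods for efficient string concatenation, plus a static helper (toBinaryString) that converts a long value to its binary string representation.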

public static UTF8String concat(UTF8String... inputs);
public static UTF8String concatWs(UTF8String separator, UTF8String... inputs);
public static UTF8String toBinaryString(long val);

Numeric Conversion

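Methods for parsing the string as a numeric value. The wrapper-based variants (toLong, toInt, toShort, toByte) return a boolean indicating whether parsing succeeded and store the parsed value in the supplied wrapper object; the Exact variants return the value directly and throw NumberFormatException on failure, as shown in the "Splitting and Parsing" usage example.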

public boolean toLong(LongWrapper toLongResult);
public boolean toInt(IntWrapper intWrapper);
public boolean toShort(IntWrapper intWrapper);
public boolean toByte(IntWrapper intWrapper);
public long toLongExact();
public int toIntExact();
public short toShortExact();
public byte toByteExact();

Comparison Operations

Various comparison methods including binary and collation-aware comparisons.

public int compareTo(UTF8String other);
public int binaryCompare(UTF8String other);
public int semanticCompare(UTF8String other, int collationId);
public boolean equals(Object other);
public boolean binaryEquals(UTF8String other);
public boolean semanticEquals(UTF8String other, int collationId);

I/O Operations

Methods for serialization and I/O operations.

public void writeToMemory(Object target, long targetOffset);
public void writeTo(ByteBuffer buffer);
public void writeTo(OutputStream out) throws IOException;
public void writeExternal(ObjectOutput out) throws IOException;
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException;
public void write(Kryo kryo, Output out);
public void read(Kryo kryo, Input in);

Iterator Support

Iterator methods for code point traversal.

public Iterator<Integer> codePointIterator();
public Iterator<Integer> codePointIterator(CodePointIteratorType iteratorMode);
public Iterator<Integer> reverseCodePointIterator();
public Iterator<Integer> reverseCodePointIterator(CodePointIteratorType iteratorMode);

Utility Methods

Utility methods for converting to a Java String, cloning and copying, hashing, computing Levenshtein distance, and Soundex encoding.

public String toString();
public String toValidString();
public UTF8String clone();
public UTF8String copy();
public int hashCode();
public int levenshteinDistance(UTF8String other);
public int levenshteinDistance(UTF8String other, int threshold);
public UTF8String soundex();

UTF8StringBuilder

Helper class for building UTF8String objects by concatenating multiple UTF-8 encoded strings.

public class UTF8StringBuilder {
    public UTF8StringBuilder();
    public UTF8StringBuilder(int initialSize);
    public void append(UTF8String value);
    public void append(String value);
    public void appendBytes(Object base, long offset, int length);
    public UTF8String build();
    public void appendCodePoint(int codePoint);
}

Usage Examples

Basic String Operations

import org.apache.spark.unsafe.types.UTF8String;

// Create UTF8String instances
UTF8String str1 = UTF8String.fromString("Hello");
UTF8String str2 = UTF8String.fromString("World");
UTF8String empty = UTF8String.EMPTY_UTF8;

// Basic properties
int bytes = str1.numBytes();     // Number of UTF-8 bytes
int chars = str1.numChars();     // Number of Unicode characters
boolean isAscii = str1.isFullAscii();

// UTF-8 validation and utility
boolean isWhitespace = UTF8String.isWhitespaceOrISOControl(0x0020); // Space character
byte firstByte = "Hello".getBytes()[0];
int bytesForChar = UTF8String.numBytesForFirstByte(firstByte); // Number of bytes for UTF-8 character

// Concatenation
UTF8String result = UTF8String.concat(str1, UTF8String.fromString(" "), str2);
UTF8String joined = UTF8String.concatWs(UTF8String.fromString(","), str1, str2);

// Conversion back to Java String
String javaString = result.toString();

Substring and Search Operations

import org.apache.spark.unsafe.types.UTF8String;

UTF8String text = UTF8String.fromString("Hello World Example");

// Substring operations
UTF8String sub1 = text.substring(0, 5);          // "Hello" (character-based)
UTF8String sub2 = text.substringSQL(1, 5);       // "Hello" (SQL 1-based indexing)

// Search operations
boolean contains = text.contains(UTF8String.fromString("World"));
int index = text.indexOf(UTF8String.fromString("World"), 0);
boolean startsWith = text.startsWith(UTF8String.fromString("Hello"));
boolean endsWith = text.endsWith(UTF8String.fromString("Example"));

// Pattern matching at specific position
boolean matches = text.matchAt(UTF8String.fromString("World"), 6);

Case Conversion and Trimming

import org.apache.spark.unsafe.types.UTF8String;

UTF8String text = UTF8String.fromString("  Hello World  ");

// Case conversion
UTF8String upper = text.toUpperCase();
UTF8String lower = text.toLowerCase();
UTF8String title = text.toTitleCase();

// ASCII-only conversion (faster for ASCII strings)
UTF8String upperAscii = text.toUpperCaseAscii();
UTF8String lowerAscii = text.toLowerCaseAscii();

// Trimming operations
UTF8String trimmed = text.trim();                    // Remove whitespace
UTF8String leftTrim = text.trimLeft();              // Remove leading whitespace
UTF8String rightTrim = text.trimRight();            // Remove trailing whitespace

// Custom character trimming
UTF8String customTrim = text.trim(UTF8String.fromString(" H"));

String Manipulation

import org.apache.spark.unsafe.types.UTF8String;

UTF8String text = UTF8String.fromString("Hello");

// String manipulation
UTF8String reversed = text.reverse();               // "olleH"
UTF8String repeated = text.repeat(3);               // "HelloHelloHello"
UTF8String padded = text.rpad(10, UTF8String.fromString("*")); // "Hello*****"
UTF8String leftPadded = text.lpad(10, UTF8String.fromString("*")); // "*****Hello"

// Replace operations
UTF8String replaced = text.replace(
    UTF8String.fromString("ll"), 
    UTF8String.fromString("XX")
); // "HeXXo"

// Translation using character mapping
Map<String, String> dict = new HashMap<>();
dict.put("l", "1");
dict.put("o", "0");
UTF8String translated = text.translate(dict); // "He110"

Splitting and Parsing

import org.apache.spark.unsafe.types.UTF8String;

UTF8String csv = UTF8String.fromString("apple,banana,cherry");

// Split operations
UTF8String[] parts = csv.splitSQL(UTF8String.fromString(","), -1);
// Results in: ["apple", "banana", "cherry"]

// Numeric parsing
UTF8String number = UTF8String.fromString("12345");
try {
    long value = number.toLongExact();    // 12345L
    int intValue = number.toIntExact();   // 12345
} catch (NumberFormatException e) {
    // Handle parsing error
}

// Safe parsing with wrapper objects
UTF8String.LongWrapper longWrapper = new UTF8String.LongWrapper();  // LongWrapper is a nested class of UTF8String
if (number.toLong(longWrapper)) {
    long value = longWrapper.value;  // Parsing succeeded
}

String Building

import org.apache.spark.unsafe.UTF8StringBuilder;
import org.apache.spark.unsafe.types.UTF8String;

// Efficient string building
UTF8StringBuilder builder = new UTF8StringBuilder();
builder.append(UTF8String.fromString("Hello"));
builder.append(" ");  // Java string automatically converted
builder.append(UTF8String.fromString("World"));
builder.appendCodePoint(0x1F600); // Unicode emoji

UTF8String result = builder.build();

Memory-Based String Operations

import org.apache.spark.unsafe.types.UTF8String;
import org.apache.spark.unsafe.Platform;

// Create string from memory address
byte[] data = "Hello World".getBytes("UTF-8");
long address = Platform.allocateMemory(data.length);
Platform.copyMemory(data, Platform.BYTE_ARRAY_OFFSET, null, address, data.length);

UTF8String str = UTF8String.fromAddress(null, address, data.length);

// Write string to memory
long targetAddress = Platform.allocateMemory(str.numBytes());
str.writeToMemory(null, targetAddress);

// Clean up
Platform.freeMemory(address);
Platform.freeMemory(targetAddress);

Collation-Aware Operations

import org.apache.spark.unsafe.types.UTF8String;

UTF8String str1 = UTF8String.fromString("Hello");
UTF8String str2 = UTF8String.fromString("HELLO");

// Binary comparison (case-sensitive)
int binaryCompare = str1.binaryCompare(str2);    // != 0

// Semantic comparison with collation ID
int collationId = 1; // Case-insensitive collation
int semanticCompare = str1.semanticCompare(str2, collationId);  // == 0

// Semantic equality
boolean equal = str1.semanticEquals(str2, collationId);  // true