UTF8String provides comprehensive UTF-8 string manipulation for internal Spark use, including extensive string operations, collation support, and optimized storage. It is a complete alternative to Java's String class, optimized for big data processing, with support for direct memory access and collation-aware operations.
UTF8String is a UTF-8 encoded string class with comprehensive string manipulation capabilities; it implements the Comparable, Externalizable, KryoSerializable, and Cloneable interfaces.
public final class UTF8String implements Comparable<UTF8String>, Externalizable, KryoSerializable, Cloneable {
// Construction and factory methods
public UTF8String();
public static UTF8String fromBytes(byte[] bytes);
public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes);
public static UTF8String fromAddress(Object base, long offset, int numBytes);
public static UTF8String fromString(String str);
public static UTF8String blankString(int length);
public static boolean isWhitespaceOrISOControl(int codePoint);
public static int numBytesForFirstByte(byte b);
// Constants
public static final UTF8String EMPTY_UTF8;
public static final UTF8String ZERO_UTF8;
public static final UTF8String SPACE_UTF8;
// Core access methods
public Object getBaseObject();
public long getBaseOffset();
public int numBytes();
public int numChars();
public long getPrefix();
public byte[] getBytes();
public ByteBuffer getByteBuffer();
}

Methods for accessing individual characters and bytes, and for validating UTF-8 encoding.
// Character and byte access
public byte getByte(int byteIndex);
public int getChar(int charIndex);
public int codePointFrom(int byteIndex);
// Validation methods
public UTF8String makeValid();
public boolean isValid();
public boolean isFullAscii();
// Position conversion
public int charPosToByte(int charPos);
public int bytePosToChar(int bytePos);

Various substring extraction methods with different indexing strategies.
public UTF8String substring(int start, int until);
public UTF8String substringSQL(int pos, int length);
public UTF8String copyUTF8String(int start, int end);

Comprehensive string search and pattern matching capabilities.
public boolean contains(UTF8String substring);
public boolean matchAt(UTF8String s, int pos);
public boolean startsWith(UTF8String prefix);
public boolean endsWith(UTF8String suffix);
public int indexOf(UTF8String v, int start);
public int indexOfEmpty(int start);
public int find(UTF8String str, int start);
public int rfind(UTF8String str, int start);
public int findInSet(UTF8String match);

Case conversion methods with both ASCII-only and full Unicode support.
public UTF8String toUpperCase();
public UTF8String toUpperCaseAscii();
public UTF8String toLowerCase();
public UTF8String toLowerCaseAscii();
public UTF8String toTitleCase();
public UTF8String toTitleCaseICU();

Various trimming methods for whitespace and custom character removal.
public UTF8String trim();
public UTF8String trimAll();
public UTF8String trim(UTF8String trimString);
public UTF8String trimLeft();
public UTF8String trimLeft(UTF8String trimString);
public UTF8String trimRight();
public UTF8String trimTrailingSpaces(int numSpaces);
public UTF8String trimRight(UTF8String trimString);

Methods for string transformation, padding, and manipulation.
public UTF8String reverse();
public UTF8String repeat(int times);
public UTF8String rpad(int len, UTF8String pad);
public UTF8String lpad(int len, UTF8String pad);
public UTF8String subStringIndex(UTF8String delim, int count);
public UTF8String replace(UTF8String search, UTF8String replace);
public UTF8String translate(Map<String, String> dict);

String splitting with regex and SQL-style delimiters.
public UTF8String[] split(UTF8String pattern, int limit);
public UTF8String[] splitSQL(UTF8String delimiter, int limit);

Static methods for efficient string concatenation.
public static UTF8String concat(UTF8String... inputs);
public static UTF8String concatWs(UTF8String separator, UTF8String... inputs);
public static UTF8String toBinaryString(long val);

Methods for parsing strings as numeric values with error handling.
public boolean toLong(LongWrapper toLongResult);
public boolean toInt(IntWrapper intWrapper);
public boolean toShort(IntWrapper intWrapper);
public boolean toByte(IntWrapper intWrapper);
public long toLongExact();
public int toIntExact();
public short toShortExact();
public byte toByteExact();

Various comparison methods including binary and collation-aware comparisons.
public int compareTo(UTF8String other);
public int binaryCompare(UTF8String other);
public int semanticCompare(UTF8String other, int collationId);
public boolean equals(Object other);
public boolean binaryEquals(UTF8String other);
public boolean semanticEquals(UTF8String other, int collationId);

Methods for serialization and I/O operations.
public void writeToMemory(Object target, long targetOffset);
public void writeTo(ByteBuffer buffer);
public void writeTo(OutputStream out) throws IOException;
public void writeExternal(ObjectOutput out) throws IOException;
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException;
public void write(Kryo kryo, Output out);
public void read(Kryo kryo, Input in);

Iterator methods for code point traversal.
public Iterator<Integer> codePointIterator();
public Iterator<Integer> codePointIterator(CodePointIteratorType iteratorMode);
public Iterator<Integer> reverseCodePointIterator();
public Iterator<Integer> reverseCodePointIterator(CodePointIteratorType iteratorMode);

Utility methods for string conversion, cloning, hashing, distance calculations, and Soundex encoding.
public String toString();
public String toValidString();
public UTF8String clone();
public UTF8String copy();
public int hashCode();
public int levenshteinDistance(UTF8String other);
public int levenshteinDistance(UTF8String other, int threshold);
public UTF8String soundex();

Helper class for building UTF8String objects by concatenating multiple UTF-8 encoded strings.
public class UTF8StringBuilder {
public UTF8StringBuilder();
public UTF8StringBuilder(int initialSize);
public void append(UTF8String value);
public void append(String value);
public void appendBytes(Object base, long offset, int length);
public UTF8String build();
public void appendCodePoint(int codePoint);
}

import org.apache.spark.unsafe.types.UTF8String;
// Create UTF8String instances
UTF8String str1 = UTF8String.fromString("Hello");
UTF8String str2 = UTF8String.fromString("World");
UTF8String empty = UTF8String.EMPTY_UTF8;
// Basic properties
int bytes = str1.numBytes(); // Number of UTF-8 bytes
int chars = str1.numChars(); // Number of Unicode characters
boolean isAscii = str1.isFullAscii();
// UTF-8 validation and utility
boolean isWhitespace = UTF8String.isWhitespaceOrISOControl(0x0020); // Space character
byte firstByte = "Hello".getBytes()[0];
int bytesForChar = UTF8String.numBytesForFirstByte(firstByte); // Number of bytes for UTF-8 character
// Concatenation
UTF8String result = UTF8String.concat(str1, UTF8String.fromString(" "), str2);
UTF8String joined = UTF8String.concatWs(UTF8String.fromString(","), str1, str2);
// Conversion back to Java String
String javaString = result.toString();

import org.apache.spark.unsafe.types.UTF8String;
UTF8String text = UTF8String.fromString("Hello World Example");
// Substring operations
UTF8String sub1 = text.substring(0, 5); // "Hello" (character-based)
UTF8String sub2 = text.substringSQL(1, 5); // "Hello" (SQL 1-based indexing)
// Search operations
boolean contains = text.contains(UTF8String.fromString("World"));
int index = text.indexOf(UTF8String.fromString("World"), 0);
boolean startsWith = text.startsWith(UTF8String.fromString("Hello"));
boolean endsWith = text.endsWith(UTF8String.fromString("Example"));
// Pattern matching at specific position
boolean matches = text.matchAt(UTF8String.fromString("World"), 6);

import org.apache.spark.unsafe.types.UTF8String;
UTF8String text = UTF8String.fromString(" Hello World ");
// Case conversion
UTF8String upper = text.toUpperCase();
UTF8String lower = text.toLowerCase();
UTF8String title = text.toTitleCase();
// ASCII-only conversion (faster for ASCII strings)
UTF8String upperAscii = text.toUpperCaseAscii();
UTF8String lowerAscii = text.toLowerCaseAscii();
// Trimming operations
UTF8String trimmed = text.trim(); // Remove whitespace
UTF8String leftTrim = text.trimLeft(); // Remove leading whitespace
UTF8String rightTrim = text.trimRight(); // Remove trailing whitespace
// Custom character trimming
UTF8String customTrim = text.trim(UTF8String.fromString(" H"));

import org.apache.spark.unsafe.types.UTF8String;
UTF8String text = UTF8String.fromString("Hello");
// String manipulation
UTF8String reversed = text.reverse(); // "olleH"
UTF8String repeated = text.repeat(3); // "HelloHelloHello"
UTF8String padded = text.rpad(10, UTF8String.fromString("*")); // "Hello*****"
UTF8String leftPadded = text.lpad(10, UTF8String.fromString("*")); // "*****Hello"
// Replace operations
UTF8String replaced = text.replace(
UTF8String.fromString("ll"),
UTF8String.fromString("XX")
); // "HeXXo"
// Translation using character mapping
Map<String, String> dict = new HashMap<>();
dict.put("l", "1");
dict.put("o", "0");
UTF8String translated = text.translate(dict); // "He110"

import org.apache.spark.unsafe.types.UTF8String;
UTF8String csv = UTF8String.fromString("apple,banana,cherry");
// Split operations
UTF8String[] parts = csv.splitSQL(UTF8String.fromString(","), -1);
// Results in: ["apple", "banana", "cherry"]
// Numeric parsing
UTF8String number = UTF8String.fromString("12345");
try {
long value = number.toLongExact(); // 12345L
int intValue = number.toIntExact(); // 12345
} catch (NumberFormatException e) {
// Handle parsing error
}
// Safe parsing with wrapper objects
LongWrapper longWrapper = new LongWrapper();
if (number.toLong(longWrapper)) {
long value = longWrapper.value; // Parsing succeeded
}

import org.apache.spark.unsafe.UTF8StringBuilder;
import org.apache.spark.unsafe.types.UTF8String;
// Efficient string building
UTF8StringBuilder builder = new UTF8StringBuilder();
builder.append(UTF8String.fromString("Hello"));
builder.append(" "); // Java string automatically converted
builder.append(UTF8String.fromString("World"));
builder.appendCodePoint(0x1F600); // Unicode emoji
UTF8String result = builder.build();

import org.apache.spark.unsafe.types.UTF8String;
import org.apache.spark.unsafe.Platform;
// Create string from memory address
byte[] data = "Hello World".getBytes("UTF-8");
long address = Platform.allocateMemory(data.length);
Platform.copyMemory(data, Platform.BYTE_ARRAY_OFFSET, null, address, data.length);
UTF8String str = UTF8String.fromAddress(null, address, data.length);
// Write string to memory
long targetAddress = Platform.allocateMemory(str.numBytes());
str.writeToMemory(null, targetAddress);
// Clean up
Platform.freeMemory(address);
Platform.freeMemory(targetAddress);

import org.apache.spark.unsafe.types.UTF8String;
UTF8String str1 = UTF8String.fromString("Hello");
UTF8String str2 = UTF8String.fromString("HELLO");
// Binary comparison (case-sensitive)
int binaryCompare = str1.binaryCompare(str2); // != 0
// Semantic comparison with collation ID
int collationId = 1; // Case-insensitive collation
int semanticCompare = str1.semanticCompare(str2, collationId); // == 0
// Semantic equality
boolean equal = str1.semanticEquals(str2, collationId); // true