Low-level memory operations and data structures for high-performance computation in Apache Spark
The UTF8String class provides a high-performance UTF-8 string implementation specifically optimized for Spark SQL operations. It stores strings as UTF-8 encoded byte arrays with direct memory access for maximum performance in data processing workloads.
Important: This class is designed for internal Spark SQL use and should not be used in general applications outside of SQL contexts.
import java.nio.charset.StandardCharsets;
import org.apache.spark.unsafe.types.UTF8String;

// Create UTF8String from Java String
UTF8String utf8 = UTF8String.fromString("Hello, World!");
// Create from byte array
byte[] bytes = "Hello".getBytes(StandardCharsets.UTF_8);
UTF8String fromBytes = UTF8String.fromBytes(bytes);
// Convert back to Java String
String javaString = utf8.toString();
// Get underlying bytes
byte[] underlyingBytes = utf8.getBytes();

UTF8String original = UTF8String.fromString("Hello, World!");
// Basic properties
int numBytes = original.numBytes(); // Number of UTF-8 bytes
int numChars = original.numChars(); // Number of Unicode characters
// Case operations
UTF8String upper = original.toUpperCase();
UTF8String lower = original.toLowerCase();
UTF8String title = original.toTitleCase();
// Substring operations
UTF8String sub1 = original.substring(0, 5); // "Hello"
UTF8String sub2 = original.substringSQL(1, 5); // "Hello" (SQL-style: 1-based start position, length 5)
// Search operations
boolean contains = original.contains(UTF8String.fromString("World"));
boolean starts = original.startsWith(UTF8String.fromString("Hello"));
boolean ends = original.endsWith(UTF8String.fromString("!"));

UTF8String str1 = UTF8String.fromString("Hello");
UTF8String str2 = UTF8String.fromString("World");
UTF8String separator = UTF8String.fromString(", ");
// Concatenate multiple strings
UTF8String result1 = UTF8String.concat(str1, separator, str2);
// Concatenate with separator
UTF8String result2 = UTF8String.concatWs(separator, str1, str2);

UTF8String text = UTF8String.fromString(" Hello, World! ");
// Trimming operations
UTF8String trimmed = text.trim(); // Remove whitespace
UTF8String leftTrim = text.trimLeft(); // Remove left whitespace
UTF8String rightTrim = text.trimRight(); // Remove right whitespace
// Custom character trimming
UTF8String customTrim = text.trim(UTF8String.fromString(" !"));
// Other operations
UTF8String reversed = text.reverse();
UTF8String repeated = UTF8String.fromString("Hi").repeat(3); // "HiHiHi"

UTF8String data = UTF8String.fromString("apple,banana,cherry");
UTF8String pattern = UTF8String.fromString(",");
// Split string
UTF8String[] parts = data.split(pattern, -1);
// Find and replace
UTF8String search = UTF8String.fromString("banana");
UTF8String replace = UTF8String.fromString("orange");
UTF8String replaced = data.replace(search, replace);
// Padding operations
UTF8String padded = UTF8String.fromString("Hi").rpad(10, UTF8String.fromString("*"));
UTF8String leftPadded = UTF8String.fromString("Hi").lpad(10, UTF8String.fromString("*"));

UTF8String numberStr = UTF8String.fromString("12345");
// Parse as different numeric types
UTF8String.LongWrapper longResult = new UTF8String.LongWrapper();
boolean isValidLong = numberStr.toLong(longResult);
if (isValidLong) {
long value = longResult.value;
}
UTF8String.IntWrapper intResult = new UTF8String.IntWrapper();
boolean isValidInt = numberStr.toInt(intResult);
if (isValidInt) {
int value = intResult.value;
}

public final class UTF8String implements Comparable<UTF8String>,
java.io.Externalizable, com.esotericsoftware.kryo.KryoSerializable, Cloneable {
// Constants
public static final UTF8String EMPTY_UTF8;
// Constructor for serialization
public UTF8String();
// Creation methods
public static UTF8String fromString(String str);
public static UTF8String fromBytes(byte[] bytes);
public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes);
public static UTF8String fromAddress(Object base, long offset, int numBytes);
public static UTF8String blankString(int length);
}

/**
* Returns base object for memory access.
*/
public Object getBaseObject();
/**
* Returns base offset for memory access.
*/
public long getBaseOffset();
/**
* Writes string content to specified memory location.
*/
public void writeToMemory(Object target, long targetOffset);
/**
* Writes string content to ByteBuffer.
*/
public void writeTo(java.nio.ByteBuffer buffer);
/**
* Returns ByteBuffer wrapping the string data.
*/
public java.nio.ByteBuffer getByteBuffer();
/**
* Writes string content to OutputStream.
*/
public void writeTo(java.io.OutputStream out);

/**
* Returns number of bytes in UTF-8 encoding.
*/
public int numBytes();
/**
* Returns number of Unicode characters.
*/
public int numChars();
/**
* Returns 64-bit prefix for sorting operations.
*/
public long getPrefix();
/**
* Returns the underlying bytes; a copy is made only when the string is part of a larger array.
*/
public byte[] getBytes();

/**
* Returns substring by character positions (0-based, exclusive end).
*/
public UTF8String substring(int start, int until);
/**
* Returns substring with SQL semantics (1-based start position, followed by the number of characters to take).
*/
public UTF8String substringSQL(int pos, int length);

/**
* Checks if string contains the specified substring.
*/
public boolean contains(UTF8String substring);
/**
* Checks if string starts with the specified prefix.
*/
public boolean startsWith(UTF8String prefix);
/**
* Checks if string ends with the specified suffix.
*/
public boolean endsWith(UTF8String suffix);
/**
* Finds index of substring starting from specified position.
*/
public int indexOf(UTF8String v, int start);
/**
* Finds position in comma-separated value list (1-based).
*/
public int findInSet(UTF8String match);

/**
* Returns uppercase version of the string.
*/
public UTF8String toUpperCase();
/**
* Returns lowercase version of the string.
*/
public UTF8String toLowerCase();
/**
* Returns title case version of the string.
*/
public UTF8String toTitleCase();

/**
* Trims whitespace from both ends.
*/
public UTF8String trim();
/**
* Trims specified characters from both ends.
*/
public UTF8String trim(UTF8String trimString);
/**
* Trims whitespace from left end.
*/
public UTF8String trimLeft();
/**
* Trims specified characters from left end.
*/
public UTF8String trimLeft(UTF8String trimString);
/**
* Trims whitespace from right end.
*/
public UTF8String trimRight();
/**
* Trims specified characters from right end.
*/
public UTF8String trimRight(UTF8String trimString);

/**
* Returns reversed string.
*/
public UTF8String reverse();
/**
* Returns string repeated specified number of times.
*/
public UTF8String repeat(int times);
/**
* Returns substring before/after nth occurrence of delimiter.
*/
public UTF8String subStringIndex(UTF8String delim, int count);
/**
* Right-pads string to specified length with pad string.
*/
public UTF8String rpad(int len, UTF8String pad);
/**
* Left-pads string to specified length with pad string.
*/
public UTF8String lpad(int len, UTF8String pad);

/**
* Splits string using regex pattern; limit bounds the number of resulting parts (-1 for no limit).
*/
public UTF8String[] split(UTF8String pattern, int limit);
/**
* Replaces all occurrences of search string with replacement.
*/
public UTF8String replace(UTF8String search, UTF8String replace);
/**
* Translates characters using the provided dictionary.
*/
public UTF8String translate(java.util.Map<Character, Character> dict);

/**
* Concatenates multiple UTF8Strings.
*/
public static UTF8String concat(UTF8String... inputs);
/**
* Concatenates UTF8Strings with separator.
*/
public static UTF8String concatWs(UTF8String separator, UTF8String... inputs);

/**
* Parses string as long, returns success status.
*/
public boolean toLong(LongWrapper toLongResult);
/**
* Parses string as int, returns success status.
*/
public boolean toInt(IntWrapper intWrapper);
/**
* Parses string as short, returns success status.
*/
public boolean toShort(IntWrapper intWrapper);
/**
* Parses string as byte, returns success status.
*/
public boolean toByte(IntWrapper intWrapper);

/**
* Compares strings lexicographically.
*/
public int compareTo(UTF8String other);
/**
* Alias for compareTo.
*/
public int compare(UTF8String other);
/**
* Compares strings for equality.
*/
public boolean equals(Object other);
/**
* Computes Levenshtein distance between strings.
*/
public int levenshteinDistance(UTF8String other);
/**
* Returns Murmur3 hash code.
*/
public int hashCode();

/**
* Converts to Java String.
*/
public String toString();
/**
* Creates shallow copy sharing underlying data.
*/
public UTF8String clone();
/**
* Creates deep copy with new byte array.
*/
public UTF8String copy();

/**
* Computes Soundex phonetic encoding.
*/
public UTF8String soundex();

/**
* Wrapper for long parsing results.
*/
public static final class LongWrapper {
public long value;
}
/**
* Wrapper for int parsing results.
*/
public static final class IntWrapper {
public int value;
}

Install with Tessl CLI
npx tessl i tessl/maven-org-apache-spark--spark-unsafe-2-11