tessl install tessl/maven-org-apache-spark--spark-unsafe_2-13@3.5.0

Low-level unsafe operations and optimized data structures for Apache Spark's internal memory management and performance-critical operations.
High-performance UTF-8 encoded string implementation with extensive string manipulation capabilities, optimized for internal Spark operations and zero-copy scenarios. UTF8String provides memory-efficient string operations that work directly with byte arrays and support both on-heap and off-heap memory locations.
Multiple ways to create UTF8String instances from various data sources, supporting both copying and zero-copy scenarios for maximum flexibility and performance.
public static UTF8String fromBytes(byte[] bytes);
public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes);
public static UTF8String fromAddress(Object base, long offset, int numBytes);
public static UTF8String fromString(String str);
public static UTF8String blankString(int length);
public static UTF8String concat(UTF8String... inputs);
public static UTF8String concatWs(UTF8String separator, UTF8String... inputs);

Usage Examples:
// Create from Java String
UTF8String str1 = UTF8String.fromString("Hello World");
// Create from byte array
byte[] bytes = "Hello".getBytes(StandardCharsets.UTF_8);
UTF8String str2 = UTF8String.fromBytes(bytes);
// Create from byte array segment
UTF8String str3 = UTF8String.fromBytes(bytes, 1, 3); // "ell"
// Create from memory location (zero-copy)
UTF8String str4 = UTF8String.fromAddress(byteArray, Platform.BYTE_ARRAY_OFFSET + 5, 5);
// Create blank string of spaces
UTF8String spaces = UTF8String.blankString(10); // "          " (10 spaces)
// Concatenation
UTF8String result = UTF8String.concat(str1, str2, str3);
UTF8String csv = UTF8String.concatWs(UTF8String.fromString(","), str1, str2, str3);

Access string properties including byte length, character count, and sorting prefixes for efficient string processing and comparison operations.
public int numBytes();
public int numChars();
public long getPrefix();
public Object getBaseObject();
public long getBaseOffset();

Usage Examples:
UTF8String str = UTF8String.fromString("Hello 世界");
int byteCount = str.numBytes(); // 12 bytes (6 ASCII, including the space, + 6 for 2 Chinese chars)
int charCount = str.numChars(); // 8 characters
long prefix = str.getPrefix(); // 64-bit prefix for fast sorting
// Memory location access
Object baseObj = str.getBaseObject(); // underlying object or null for off-heap
long offset = str.getBaseOffset(); // offset within object or absolute address

Multiple ways to output and serialize UTF8String data for integration with different systems, I/O operations, and data exchange formats.
public void writeToMemory(Object target, long targetOffset);
public void writeTo(ByteBuffer buffer);
public ByteBuffer getByteBuffer();
public void writeTo(OutputStream out) throws IOException;
public byte[] getBytes();

Usage Examples:
UTF8String str = UTF8String.fromString("Hello World");
// Write to memory location
byte[] buffer = new byte[str.numBytes()];
str.writeToMemory(buffer, Platform.BYTE_ARRAY_OFFSET);
// Write to ByteBuffer
ByteBuffer bb = ByteBuffer.allocate(str.numBytes());
str.writeTo(bb);
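// Get as ByteBuffer (wraps the backing byte array without copying when on-heap; treat it as read-only, since writes would modify the string's data)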
ByteBuffer readOnlyBB = str.getByteBuffer();
// Write to OutputStream
ByteArrayOutputStream baos = new ByteArrayOutputStream();
str.writeTo(baos);
// Get underlying bytes (copy if needed)
byte[] bytes = str.getBytes();

Extract substrings using character-based or SQL-compatible indexing, supporting both Unicode-aware character counting and byte-level operations.
public UTF8String substring(int start, int until);
public UTF8String substringSQL(int pos, int length);

Usage Examples:
UTF8String str = UTF8String.fromString("Hello World");
// Character-based substring (0-indexed, until exclusive)
UTF8String hello = str.substring(0, 5); // "Hello"
UTF8String world = str.substring(6, 11); // "World"
// SQL-style substring (1-indexed, length-based)
UTF8String hel = str.substringSQL(1, 3); // "Hel" (start at pos 1, length 3)
UTF8String orld = str.substringSQL(8, 4); // "orld" (start at pos 8, length 4)
// Handle Unicode characters
UTF8String unicode = UTF8String.fromString("Hello 世界");
UTF8String chars = unicode.substring(6, 8); // "世界" (2 Unicode characters)

Comprehensive pattern matching capabilities including containment checks, position matching, prefix/suffix testing, and substring searching.
public boolean contains(UTF8String substring);
public boolean matchAt(UTF8String s, int pos);
public boolean startsWith(UTF8String prefix);
public boolean endsWith(UTF8String suffix);
public int indexOf(UTF8String v, int start);
public int findInSet(UTF8String match);

Usage Examples:
UTF8String text = UTF8String.fromString("The quick brown fox jumps");
UTF8String pattern = UTF8String.fromString("quick");
// Contains check
boolean hasQuick = text.contains(pattern); // true
// Match at specific position
boolean matchesAt1 = text.matchAt(pattern, 4); // true (0-indexed)
boolean matchesAt0 = text.matchAt(pattern, 0); // false
// Prefix and suffix checks
boolean startsWithThe = text.startsWith(UTF8String.fromString("The")); // true
boolean endsWithJumps = text.endsWith(UTF8String.fromString("jumps")); // true
// Find index
int quickIndex = text.indexOf(pattern, 0); // 4
// Find in comma-separated set
UTF8String set = UTF8String.fromString("apple,banana,cherry");
UTF8String item = UTF8String.fromString("banana");
int position = set.findInSet(item); // 2 (1-indexed position in set)

Unicode-aware case conversion supporting standard uppercase, lowercase, and title case transformations for international text processing.
public UTF8String toUpperCase();
public UTF8String toLowerCase();
public UTF8String toTitleCase();

Usage Examples:
UTF8String text = UTF8String.fromString("Hello World 世界");
UTF8String upper = text.toUpperCase(); // "HELLO WORLD 世界"
UTF8String lower = text.toLowerCase(); // "hello world 世界"
UTF8String title = text.toTitleCase(); // "Hello World 世界"
// Unicode case conversion
UTF8String german = UTF8String.fromString("straße");
UTF8String upperGerman = german.toUpperCase(); // "STRASSE" (ß -> SS)

Flexible trimming operations supporting ASCII spaces, Unicode whitespace, and custom character sets with left, right, and both-sides trimming.
public UTF8String trim();
public UTF8String trimAll();
public UTF8String trim(UTF8String trimString);
public UTF8String trimLeft();
public UTF8String trimLeft(UTF8String trimString);
public UTF8String trimRight();
public UTF8String trimRight(UTF8String trimString);
public UTF8String trimTrailingSpaces(int numSpaces);

Usage Examples:
// Basic trimming (ASCII space character 32)
UTF8String padded = UTF8String.fromString(" Hello World ");
UTF8String trimmed = padded.trim(); // "Hello World"
UTF8String leftTrimmed = padded.trimLeft(); // "Hello World "
UTF8String rightTrimmed = padded.trimRight(); // " Hello World"
// Unicode whitespace trimming
UTF8String unicodeSpaces = UTF8String.fromString("\t\n Hello \r\n");
UTF8String allTrimmed = unicodeSpaces.trimAll(); // "Hello"
// Custom character trimming
UTF8String bracketed = UTF8String.fromString("[[Hello World]]");
UTF8String brackets = UTF8String.fromString("[]");
UTF8String content = bracketed.trim(brackets); // "Hello World"
// Trim specific number of trailing spaces
UTF8String withSpaces = UTF8String.fromString("Hello     "); // 5 trailing spaces
UTF8String trimmedSpaces = withSpaces.trimTrailingSpaces(3); // "Hello  " (2 trailing spaces remain)

Advanced string transformation operations including reversal, repetition, padding, and splitting with support for both literal and pattern-based operations.
public UTF8String reverse();
public UTF8String repeat(int times);
public UTF8String rpad(int len, UTF8String pad);
public UTF8String lpad(int len, UTF8String pad);
public UTF8String[] split(UTF8String pattern, int limit);
public UTF8String[] splitSQL(UTF8String delimiter, int limit);
public UTF8String replace(UTF8String search, UTF8String replace);
public UTF8String translate(Map<String, String> dict);
public UTF8String subStringIndex(UTF8String delim, int count);

Usage Examples:
UTF8String text = UTF8String.fromString("Hello");
// Reverse string
UTF8String reversed = text.reverse(); // "olleH"
// Repeat string
UTF8String repeated = text.repeat(3); // "HelloHelloHello"
// Padding operations
UTF8String rightPadded = text.rpad(10, UTF8String.fromString("*")); // "Hello*****"
UTF8String leftPadded = text.lpad(10, UTF8String.fromString("0")); // "00000Hello"
// Splitting
UTF8String csv = UTF8String.fromString("apple,banana,cherry");
UTF8String[] parts = csv.splitSQL(UTF8String.fromString(","), -1); // ["apple", "banana", "cherry"]
// Replace operations
UTF8String sentence = UTF8String.fromString("Hello World Hello");
UTF8String replaced = sentence.replace(UTF8String.fromString("Hello"),
UTF8String.fromString("Hi")); // "Hi World Hi"
// Character translation
Map<String, String> transMap = new HashMap<>();
transMap.put("e", "3");
transMap.put("o", "0");
UTF8String translated = text.translate(transMap); // "H3ll0"
// Substring by delimiter
UTF8String path = UTF8String.fromString("a/b/c/d");
UTF8String firstTwo = path.subStringIndex(UTF8String.fromString("/"), 2); // "a/b"

Safe numeric parsing with wrapper classes for error handling, supporting both safe parsing with result wrappers and throwing parsers for different error handling strategies.
public boolean toLong(LongWrapper toLongResult);
public boolean toInt(IntWrapper intWrapper);
public boolean toShort(IntWrapper intWrapper);
public boolean toByte(IntWrapper intWrapper);
public long toLongExact();
public int toIntExact();
public short toShortExact();
public byte toByteExact();

Wrapper Classes:
public static class LongWrapper implements Serializable {
public transient long value = 0;
}
public static class IntWrapper implements Serializable {
public transient int value = 0;
}

Usage Examples:
// Safe parsing with wrappers
UTF8String numberStr = UTF8String.fromString("12345");
UTF8String.LongWrapper longWrapper = new UTF8String.LongWrapper();
if (numberStr.toLong(longWrapper)) {
long value = longWrapper.value; // 12345
System.out.println("Parsed: " + value);
} else {
System.out.println("Parse failed");
}
// Throwing parsers
try {
UTF8String validNumber = UTF8String.fromString("42");
int intValue = validNumber.toIntExact(); // 42
UTF8String invalidNumber = UTF8String.fromString("not_a_number");
int failValue = invalidNumber.toIntExact(); // throws NumberFormatException
} catch (NumberFormatException e) {
System.out.println("Invalid number format");
}

Comprehensive comparison operations including lexicographic ordering, equality testing, and edit distance calculations for fuzzy matching scenarios.
public int compareTo(@Nonnull UTF8String other);
public int compare(UTF8String other);
public boolean equals(Object other);
public int hashCode();
public int levenshteinDistance(UTF8String other);
public int levenshteinDistance(UTF8String other, int threshold);

Usage Examples:
UTF8String str1 = UTF8String.fromString("apple");
UTF8String str2 = UTF8String.fromString("banana");
UTF8String str3 = UTF8String.fromString("apple");
// Comparison operations
int cmp = str1.compareTo(str2); // negative (apple < banana)
boolean equal = str1.equals(str3); // true
int hash = str1.hashCode(); // Murmur3 hash
// Edit distance for fuzzy matching
UTF8String word1 = UTF8String.fromString("kitten");
UTF8String word2 = UTF8String.fromString("sitting");
int distance = word1.levenshteinDistance(word2); // 3
// Edit distance with threshold for early termination
int fastDistance = word1.levenshteinDistance(word2, 5); // 3 (within threshold)
int exceedsThreshold = word1.levenshteinDistance(UTF8String.fromString("completely_different"), 3); // -1

Soundex algorithm implementation for phonetic string matching and similarity detection based on pronunciation rather than spelling.
public UTF8String soundex();

Usage Example:
UTF8String name1 = UTF8String.fromString("Smith");
UTF8String name2 = UTF8String.fromString("Smyth");
UTF8String soundex1 = name1.soundex(); // "S530"
UTF8String soundex2 = name2.soundex(); // "S530"
boolean phoneticMatch = soundex1.equals(soundex2); // true

Conversion methods for integration with Java's standard String class and serialization frameworks, supporting cloning and copying operations.
public String toString();
public UTF8String clone();
public UTF8String copy();

Usage Examples:
UTF8String utf8Str = UTF8String.fromString("Hello World");
// Convert to Java String
String javaStr = utf8Str.toString(); // "Hello World"
// Clone (shallow copy sharing underlying data)
UTF8String cloned = utf8Str.clone();
// Copy (deep copy with new underlying data)
UTF8String copied = utf8Str.copy();
// All three should be equal but may have different underlying storage
boolean equal1 = utf8Str.equals(cloned); // true
boolean equal2 = utf8Str.equals(copied); // true

Builder pattern support for efficient construction of concatenated UTF8String instances with internal buffer management and automatic resizing.
public class UTF8StringBuilder {
public UTF8StringBuilder();
public UTF8StringBuilder(int initialSize);
public void append(UTF8String value);
public void append(String value);
public void appendBytes(Object base, long offset, int length);
public UTF8String build();
}

Usage Example:
// Create builder with default size (16 bytes)
UTF8StringBuilder builder = new UTF8StringBuilder();
// Create builder with specific initial size
UTF8StringBuilder largeBuilder = new UTF8StringBuilder(1024);
// Append different types of content
builder.append(UTF8String.fromString("Hello"));
builder.append(" ");
builder.append("World"); // Java String automatically converted
// Append raw bytes from memory
byte[] extraBytes = "!".getBytes(StandardCharsets.UTF_8);
builder.appendBytes(extraBytes, Platform.BYTE_ARRAY_OFFSET, extraBytes.length);
// Build final UTF8String
UTF8String result = builder.build(); // "Hello World!"