Memory-efficient UTF-8 string implementation with comprehensive string manipulation, parsing, and comparison operations optimized for Spark's internal use, providing zero-copy operations and direct memory access.
Factory methods for creating UTF8String instances from various sources with memory-efficient operations.
/**
* UTF-8 encoded string for internal Spark use with memory-efficient operations
*/
final class UTF8String implements Comparable<UTF8String>, Externalizable, KryoSerializable, Cloneable {
// Factory methods
/**
* Create UTF8String from Java String
* @param str Java String to convert
* @return UTF8String instance
*/
public static UTF8String fromString(String str);
/**
* Create UTF8String from byte array
* @param bytes UTF-8 encoded byte array
* @return UTF8String instance
*/
public static UTF8String fromBytes(byte[] bytes);
/**
* Create UTF8String from byte array slice
* @param bytes UTF-8 encoded byte array
* @param offset Starting offset in array
* @param numBytes Number of bytes to use
* @return UTF8String instance
*/
public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes);
/**
* Create UTF8String from memory address
* @param base Base object (null for off-heap)
* @param offset Offset within object or address
* @param numBytes Number of bytes
* @return UTF8String instance
*/
public static UTF8String fromAddress(Object base, long offset, int numBytes);
/**
* Create string of spaces
* @param length Number of spaces
* @return UTF8String containing spaces
*/
public static UTF8String blankString(int length);
// Constants
public static final UTF8String EMPTY_UTF8; // Empty UTF8String instance
}

Usage Examples:
import org.apache.spark.unsafe.types.UTF8String;
// Create from Java String
UTF8String str1 = UTF8String.fromString("Hello World");
// Create from byte array
byte[] data = "Hello".getBytes("UTF-8");
UTF8String str2 = UTF8String.fromBytes(data);
// Create blank string
UTF8String spaces = UTF8String.blankString(10);
// Use empty constant
UTF8String empty = UTF8String.EMPTY_UTF8;

Efficient string concatenation operations supporting multiple input strings and custom separators.
/**
* Concatenate multiple UTF8String instances
* @param inputs UTF8String instances to concatenate
* @return Concatenated UTF8String
*/
public static UTF8String concat(UTF8String... inputs);
/**
* Concatenate UTF8String instances with separator
* @param separator Separator string
* @param inputs UTF8String instances to concatenate
* @return Concatenated UTF8String with separators
*/
public static UTF8String concatWs(UTF8String separator, UTF8String... inputs);

Direct memory access methods for efficient I/O and serialization operations.
/**
* Get base object for memory access (null for off-heap)
* @return Base object or null
*/
public Object getBaseObject();
/**
* Get offset within base object or direct address
* @return Offset or address
*/
public long getBaseOffset();
/**
* Get number of bytes in UTF-8 encoding
* @return Byte count
*/
public int numBytes();
/**
* Get underlying byte array (creates copy if needed)
* @return UTF-8 encoded byte array
*/
public byte[] getBytes();
/**
* Write string data to memory location
* @param target Target object (null for off-heap)
* @param targetOffset Target offset or address
*/
public void writeToMemory(Object target, long targetOffset);
/**
* Write string data to ByteBuffer
* @param buffer Target ByteBuffer
*/
public void writeTo(ByteBuffer buffer);
/**
* Get string as ByteBuffer view
* @return ByteBuffer view of string data
*/
public ByteBuffer getByteBuffer();
/**
* Write string data to OutputStream
* @param out Target OutputStream
* @throws IOException if I/O error occurs
*/
public void writeTo(OutputStream out) throws IOException;

Methods for analyzing string properties and extracting metadata.
/**
* Get number of Unicode characters
* @return Character count
*/
public int numChars();
/**
* Get sorting prefix for efficient comparisons
* @return Long value for prefix sorting
*/
public long getPrefix();

Efficient substring extraction with both index-based and SQL-style positioning.
/**
* Extract substring using start and end indices
* @param start Starting character index (inclusive)
* @param until Ending character index (exclusive)
* @return Substring as UTF8String
*/
public UTF8String substring(int start, int until);
/**
* Extract substring using SQL-style 1-based positioning
* @param pos Starting position (1-based)
* @param length Number of characters
* @return Substring as UTF8String
*/
public UTF8String substringSQL(int pos, int length);

Methods for searching within strings, including substring matching and position finding.
/**
* Check if string contains substring
* @param substring Substring to search for
* @return true if substring is found
*/
public boolean contains(UTF8String substring);
/**
* Check if string starts with prefix
* @param prefix Prefix to check
* @return true if string starts with prefix
*/
public boolean startsWith(UTF8String prefix);
/**
* Check if string ends with suffix
* @param suffix Suffix to check
* @return true if string ends with suffix
*/
public boolean endsWith(UTF8String suffix);
/**
* Find index of substring starting from position
* @param v Substring to find
* @param start Starting position for search
* @return Index of substring or -1 if not found
*/
public int indexOf(UTF8String v, int start);
/**
* Find position in comma-separated list
* @param match String to find in list
* @return 1-based position or 0 if not found
*/
public int findInSet(UTF8String match);
/**
* Check if string matches at specific position
* @param s String to match
* @param pos Position to check match
* @return true if strings match at position
*/
public boolean matchAt(UTF8String s, int pos);

Case conversion operations preserving UTF-8 encoding and supporting Unicode.
/**
* Convert to uppercase
* @return Uppercase UTF8String
*/
public UTF8String toUpperCase();
/**
* Convert to lowercase
* @return Lowercase UTF8String
*/
public UTF8String toLowerCase();
/**
* Convert to title case
* @return Title case UTF8String
*/
public UTF8String toTitleCase();

Comprehensive trimming operations for whitespace and custom character removal.
/**
* Trim leading and trailing spaces
* @return Trimmed UTF8String
*/
public UTF8String trim();
/**
* Trim all types of whitespace characters
* @return Trimmed UTF8String
*/
public UTF8String trimAll();
/**
* Trim specific characters from both ends
* @param trimString Characters to trim
* @return Trimmed UTF8String
*/
public UTF8String trim(UTF8String trimString);
/**
* Trim leading spaces
* @return Left-trimmed UTF8String
*/
public UTF8String trimLeft();
/**
* Trim specific characters from start
* @param trimString Characters to trim
* @return Left-trimmed UTF8String
*/
public UTF8String trimLeft(UTF8String trimString);
/**
* Trim trailing spaces
* @return Right-trimmed UTF8String
*/
public UTF8String trimRight();
/**
* Trim specific number of trailing spaces
* @param numSpaces Number of spaces to trim
* @return Right-trimmed UTF8String
*/
public UTF8String trimTrailingSpaces(int numSpaces);
/**
* Trim specific characters from end
* @param trimString Characters to trim
* @return Right-trimmed UTF8String
*/
public UTF8String trimRight(UTF8String trimString);

Advanced string manipulation, including reversal, repetition, padding, and character replacement.
/**
* Reverse the string
* @return Reversed UTF8String
*/
public UTF8String reverse();
/**
* Repeat string multiple times
* @param times Number of repetitions
* @return Repeated UTF8String
*/
public UTF8String repeat(int times);
/**
* Right pad string to specified length
* @param len Target length
* @param pad Padding string
* @return Right-padded UTF8String
*/
public UTF8String rpad(int len, UTF8String pad);
/**
* Left pad string to specified length
* @param len Target length
* @param pad Padding string
* @return Left-padded UTF8String
*/
public UTF8String lpad(int len, UTF8String pad);
/**
* Replace all occurrences of search string
* @param search String to search for
* @param replace Replacement string
* @return String with replacements
*/
public UTF8String replace(UTF8String search, UTF8String replace);
/**
* Translate characters using mapping dictionary
* @param dict Character translation dictionary
* @return Translated UTF8String
*/
public UTF8String translate(Map<String, String> dict);
/**
* Generate soundex encoding
* @return Soundex encoded UTF8String
*/
public UTF8String soundex();

String splitting operations with pattern matching and SQL-style delimiters.
/**
* Split string using regex pattern
* @param pattern Regex pattern for splitting
* @param limit Maximum number of splits (-1 for no limit)
* @return Array of split UTF8String parts
*/
public UTF8String[] split(UTF8String pattern, int limit);
/**
* Split string using SQL-style delimiter
* @param delimiter Delimiter string
* @param limit Maximum number of splits (-1 for no limit)
* @return Array of split UTF8String parts
*/
public UTF8String[] splitSQL(UTF8String delimiter, int limit);
/**
* Extract substring by delimiter occurrence count
* @param delim Delimiter string
* @param count Occurrence count (positive from start, negative from end)
* @return Substring before/after delimiter
*/
public UTF8String subStringIndex(UTF8String delim, int count);

Safe and exact numeric parsing methods with wrapper classes for result handling.
/**
* Wrapper class for long parsing results
*/
public static class LongWrapper implements Serializable {
public transient long value; // Parsed long value
}
/**
* Wrapper class for int parsing results
*/
public static class IntWrapper implements Serializable {
public transient int value; // Parsed int value
}
// Safe parsing methods (return false on failure)
/**
* Parse string to long with error handling
* @param toLongResult Wrapper to store result
* @return true if parsing succeeded
*/
public boolean toLong(LongWrapper toLongResult);
/**
* Parse string to int with error handling
* @param intWrapper Wrapper to store result
* @return true if parsing succeeded
*/
public boolean toInt(IntWrapper intWrapper);
/**
* Parse string to short with error handling
* @param intWrapper Wrapper to store result
* @return true if parsing succeeded
*/
public boolean toShort(IntWrapper intWrapper);
/**
* Parse string to byte with error handling
* @param intWrapper Wrapper to store result
* @return true if parsing succeeded
*/
public boolean toByte(IntWrapper intWrapper);
// Exact parsing methods (throw exceptions on failure)
/**
* Parse string to long (throws exception on failure)
* @return Parsed long value
* @throws NumberFormatException if parsing fails
*/
public long toLongExact();
/**
* Parse string to int (throws exception on failure)
* @return Parsed int value
* @throws NumberFormatException if parsing fails
*/
public int toIntExact();
/**
* Parse string to short (throws exception on failure)
* @return Parsed short value
* @throws NumberFormatException if parsing fails
*/
public short toShortExact();
/**
* Parse string to byte (throws exception on failure)
* @return Parsed byte value
* @throws NumberFormatException if parsing fails
*/
public byte toByteExact();

Usage Examples:
import org.apache.spark.unsafe.types.UTF8String;
// Safe parsing with error handling
UTF8String numStr = UTF8String.fromString("123");
UTF8String.LongWrapper longResult = new UTF8String.LongWrapper();
if (numStr.toLong(longResult)) {
long value = longResult.value;
System.out.println("Parsed: " + value);
} else {
System.out.println("Failed to parse as long");
}
// Exact parsing with exceptions
try {
int value = UTF8String.fromString("456").toIntExact();
System.out.println("Parsed: " + value);
} catch (NumberFormatException e) {
System.out.println("Invalid number format");
}

Comprehensive comparison operations, including lexicographic ordering and distance calculations.
/**
* Compare strings lexicographically
* @param other String to compare with
* @return Negative, zero, or positive value
*/
public int compareTo(UTF8String other);
/**
* Compare strings (alias for compareTo)
* @param other String to compare with
* @return Negative, zero, or positive value
*/
public int compare(UTF8String other);
/**
* Check equality with another object
* @param other Object to compare with
* @return true if equal
*/
public boolean equals(Object other);
/**
* Calculate hash code
* @return Hash code value
*/
public int hashCode();
/**
* Calculate Levenshtein distance
* @param other String to compare with
* @return Edit distance
*/
public int levenshteinDistance(UTF8String other);
/**
* Calculate Levenshtein distance with threshold
* @param other String to compare with
* @param threshold Maximum distance to calculate
* @return Edit distance or -1 if exceeds threshold
*/
public int levenshteinDistance(UTF8String other, int threshold);

General utility methods for string conversion, copying, and serialization.
/**
* Convert to Java String
* @return Java String representation
*/
public String toString();
/**
* Clone the UTF8String
* @return Cloned UTF8String
*/
public UTF8String clone();
/**
* Create a copy of the UTF8String
* @return Copied UTF8String
*/
public UTF8String copy();
// Serialization support methods
/**
* Write object to ObjectOutput (Externalizable)
* @param out ObjectOutput stream
* @throws IOException if I/O error occurs
*/
public void writeExternal(ObjectOutput out) throws IOException;
/**
* Read object from ObjectInput (Externalizable)
* @param in ObjectInput stream
* @throws IOException if I/O error occurs
* @throws ClassNotFoundException if class not found
*/
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException;
/**
* Write using Kryo serialization
* @param kryo Kryo instance
* @param out Output stream
*/
public void write(Kryo kryo, Output out);
/**
* Read using Kryo serialization
* @param kryo Kryo instance
* @param in Input stream
*/
public void read(Kryo kryo, Input in);

Efficient builder for constructing UTF8String objects with automatic memory management.
/**
* Builder for constructing UTF8String objects efficiently
*/
class UTF8StringBuilder {
/**
* Create builder with default initial size (16 bytes)
*/
public UTF8StringBuilder();
/**
* Create builder with custom initial size
* @param initialSize Initial buffer size in bytes
*/
public UTF8StringBuilder(int initialSize);
/**
* Append UTF8String to builder
* @param value UTF8String to append
*/
public void append(UTF8String value);
/**
* Append Java String to builder
* @param value Java String to append
*/
public void append(String value);
/**
* Append raw bytes to builder
* @param base Base object (null for off-heap)
* @param offset Offset within object or address
* @param length Number of bytes to append
*/
public void appendBytes(Object base, long offset, int length);
/**
* Build final UTF8String from accumulated data
* @return Constructed UTF8String
*/
public UTF8String build();
}

Usage Examples:
import org.apache.spark.unsafe.types.*;
// Basic string operations
UTF8String str = UTF8String.fromString("Hello World");
UTF8String upper = str.toUpperCase();
UTF8String trimmed = str.trim();
// String searching and manipulation
boolean hasHello = str.contains(UTF8String.fromString("Hello"));
UTF8String substr = str.substring(0, 5);
UTF8String[] parts = str.split(UTF8String.fromString(" "), -1);
// Using UTF8StringBuilder
UTF8StringBuilder builder = new UTF8StringBuilder();
builder.append(UTF8String.fromString("Hello"));
builder.append(" ");
builder.append(UTF8String.fromString("World"));
UTF8String result = builder.build();
// Comparison and sorting
UTF8String str1 = UTF8String.fromString("apple");
UTF8String str2 = UTF8String.fromString("banana");
int comparison = str1.compareTo(str2); // negative value
// Numeric parsing
UTF8String numStr = UTF8String.fromString("123");
int value = numStr.toIntExact();