or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

array-operations.md · hashing-utilities.md · index.md · memory-management.md · platform-operations.md · utf8-string-operations.md
tile.json

docs/utf8-string-operations.md

UTF8 String Operations

Memory-efficient UTF-8 string implementation with comprehensive string manipulation, parsing, and comparison operations optimized for Spark's internal use, providing zero-copy operations and direct memory access.

Capabilities

UTF8String Creation

Factory methods for creating UTF8String instances from various sources with memory-efficient operations.

/**
 * UTF-8 encoded string for internal Spark use with memory-efficient operations
 */
final class UTF8String implements Comparable<UTF8String>, Externalizable, KryoSerializable, Cloneable {
    
    // Factory methods
    /**
     * Create UTF8String from Java String
     * @param str Java String to convert
     * @return UTF8String instance
     */
    public static UTF8String fromString(String str);
    
    /**
     * Create UTF8String from byte array
     * @param bytes UTF-8 encoded byte array
     * @return UTF8String instance
     */
    public static UTF8String fromBytes(byte[] bytes);
    
    /**
     * Create UTF8String from byte array slice
     * @param bytes UTF-8 encoded byte array
     * @param offset Starting offset in array
     * @param numBytes Number of bytes to use
     * @return UTF8String instance
     */
    public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes);
    
    /**
     * Create UTF8String from memory address
     * @param base Base object (null for off-heap)
     * @param offset Offset within object or address
     * @param numBytes Number of bytes
     * @return UTF8String instance
     */
    public static UTF8String fromAddress(Object base, long offset, int numBytes);
    
    /**
     * Create string of spaces
     * @param length Number of spaces
     * @return UTF8String containing spaces
     */
    public static UTF8String blankString(int length);
    
    // Constants
    public static final UTF8String EMPTY_UTF8;  // Empty UTF8String instance
}

Usage Examples:

import java.nio.charset.StandardCharsets;

import org.apache.spark.unsafe.types.UTF8String;

// Create from Java String
UTF8String str1 = UTF8String.fromString("Hello World");

// Create from byte array.
// Pass a Charset constant instead of the charset name "UTF-8": the
// String-name overload declares the checked UnsupportedEncodingException,
// which would force a try/catch or throws clause around this snippet.
byte[] data = "Hello".getBytes(StandardCharsets.UTF_8);
UTF8String str2 = UTF8String.fromBytes(data);

// Create blank string (10 space characters)
UTF8String spaces = UTF8String.blankString(10);

// Use empty constant
UTF8String empty = UTF8String.EMPTY_UTF8;

String Concatenation

Efficient string concatenation operations supporting multiple input strings and custom separators.

/**
 * Concatenate multiple UTF8String instances
 * @param inputs UTF8String instances to concatenate
 * @return Concatenated UTF8String
 */
public static UTF8String concat(UTF8String... inputs);

/**
 * Concatenate UTF8String instances with separator
 * @param separator Separator string
 * @param inputs UTF8String instances to concatenate
 * @return Concatenated UTF8String with separators
 */
public static UTF8String concatWs(UTF8String separator, UTF8String... inputs);

Memory Access Operations

Direct memory access methods for efficient I/O and serialization operations.

/**
 * Get base object for memory access (null for off-heap)
 * @return Base object or null
 */
public Object getBaseObject();

/**
 * Get offset within base object or direct address
 * @return Offset or address
 */
public long getBaseOffset();

/**
 * Get number of bytes in UTF-8 encoding
 * @return Byte count
 */
public int numBytes();

/**
 * Get underlying byte array (creates copy if needed)
 * @return UTF-8 encoded byte array
 */
public byte[] getBytes();

/**
 * Write string data to memory location
 * @param target Target object (null for off-heap)
 * @param targetOffset Target offset or address
 */
public void writeToMemory(Object target, long targetOffset);

/**
 * Write string data to ByteBuffer
 * @param buffer Target ByteBuffer
 */
public void writeTo(ByteBuffer buffer);

/**
 * Get string as ByteBuffer view
 * @return ByteBuffer view of string data
 */
public ByteBuffer getByteBuffer();

/**
 * Write string data to OutputStream
 * @param out Target OutputStream
 * @throws IOException if I/O error occurs
 */
public void writeTo(OutputStream out) throws IOException;

String Properties and Analysis

Methods for analyzing string properties and extracting metadata.

/**
 * Get number of Unicode characters
 * @return Character count
 */
public int numChars();

/**
 * Get sorting prefix for efficient comparisons
 * @return Long value for prefix sorting
 */
public long getPrefix();

Substring Operations

Efficient substring extraction with both index-based and SQL-style positioning.

/**
 * Extract substring using start and end indices
 * @param start Starting character index (inclusive)
 * @param until Ending character index (exclusive)
 * @return Substring as UTF8String
 */
public UTF8String substring(int start, int until);

/**
 * Extract substring using SQL-style 1-based positioning
 * @param pos Starting position (1-based)
 * @param length Number of characters
 * @return Substring as UTF8String
 */
public UTF8String substringSQL(int pos, int length);

String Search Operations

Methods for searching within strings including substring matching and position finding.

/**
 * Check if string contains substring
 * @param substring Substring to search for
 * @return true if substring is found
 */
public boolean contains(UTF8String substring);

/**
 * Check if string starts with prefix
 * @param prefix Prefix to check
 * @return true if string starts with prefix
 */
public boolean startsWith(UTF8String prefix);

/**
 * Check if string ends with suffix
 * @param suffix Suffix to check
 * @return true if string ends with suffix
 */
public boolean endsWith(UTF8String suffix);

/**
 * Find index of substring starting from position
 * @param v Substring to find
 * @param start Starting position for search
 * @return Index of substring or -1 if not found
 */
public int indexOf(UTF8String v, int start);

/**
 * Find position in comma-separated list
 * @param match String to find in list
 * @return 1-based position or 0 if not found
 */
public int findInSet(UTF8String match);

/**
 * Check if string matches at specific position
 * @param s String to match
 * @param pos Position to check match
 * @return true if strings match at position
 */
public boolean matchAt(UTF8String s, int pos);

Case Conversion

Case conversion operations preserving UTF-8 encoding and supporting Unicode.

/**
 * Convert to uppercase
 * @return Uppercase UTF8String
 */
public UTF8String toUpperCase();

/**
 * Convert to lowercase  
 * @return Lowercase UTF8String
 */
public UTF8String toLowerCase();

/**
 * Convert to title case
 * @return Title case UTF8String
 */
public UTF8String toTitleCase();

String Trimming Operations

Comprehensive trimming operations for whitespace and custom character removal.

/**
 * Trim leading and trailing spaces
 * @return Trimmed UTF8String
 */
public UTF8String trim();

/**
 * Trim all types of whitespace characters
 * @return Trimmed UTF8String
 */
public UTF8String trimAll();

/**
 * Trim specific characters from both ends
 * @param trimString Characters to trim
 * @return Trimmed UTF8String
 */
public UTF8String trim(UTF8String trimString);

/**
 * Trim leading spaces
 * @return Left-trimmed UTF8String
 */
public UTF8String trimLeft();

/**
 * Trim specific characters from start
 * @param trimString Characters to trim
 * @return Left-trimmed UTF8String
 */
public UTF8String trimLeft(UTF8String trimString);

/**
 * Trim trailing spaces
 * @return Right-trimmed UTF8String
 */
public UTF8String trimRight();

/**
 * Trim specific number of trailing spaces
 * @param numSpaces Number of spaces to trim
 * @return Right-trimmed UTF8String
 */
public UTF8String trimTrailingSpaces(int numSpaces);

/**
 * Trim specific characters from end
 * @param trimString Characters to trim
 * @return Right-trimmed UTF8String
 */
public UTF8String trimRight(UTF8String trimString);

String Manipulation

Advanced string manipulation including reversal, repetition, padding, and character replacement.

/**
 * Reverse the string
 * @return Reversed UTF8String
 */
public UTF8String reverse();

/**
 * Repeat string multiple times
 * @param times Number of repetitions
 * @return Repeated UTF8String
 */
public UTF8String repeat(int times);

/**
 * Right pad string to specified length
 * @param len Target length
 * @param pad Padding string
 * @return Right-padded UTF8String
 */
public UTF8String rpad(int len, UTF8String pad);

/**
 * Left pad string to specified length
 * @param len Target length
 * @param pad Padding string
 * @return Left-padded UTF8String
 */
public UTF8String lpad(int len, UTF8String pad);

/**
 * Replace all occurrences of search string
 * @param search String to search for
 * @param replace Replacement string
 * @return String with replacements
 */
public UTF8String replace(UTF8String search, UTF8String replace);

/**
 * Translate characters using mapping dictionary
 * @param dict Character translation dictionary
 * @return Translated UTF8String
 */
public UTF8String translate(Map<String, String> dict);

/**
 * Generate soundex encoding
 * @return Soundex encoded UTF8String
 */
public UTF8String soundex();

String Splitting

String splitting operations with pattern matching and SQL-style delimiters.

/**
 * Split string using regex pattern
 * @param pattern Regex pattern for splitting
 * @param limit Maximum number of splits (-1 for no limit)
 * @return Array of split UTF8String parts
 */
public UTF8String[] split(UTF8String pattern, int limit);

/**
 * Split string using SQL-style delimiter
 * @param delimiter Delimiter string
 * @param limit Maximum number of splits (-1 for no limit)  
 * @return Array of split UTF8String parts
 */
public UTF8String[] splitSQL(UTF8String delimiter, int limit);

/**
 * Extract substring by delimiter occurrence count
 * @param delim Delimiter string
 * @param count Occurrence count (positive from start, negative from end)
 * @return Substring before/after delimiter
 */
public UTF8String subStringIndex(UTF8String delim, int count);

Numeric Parsing

Safe and exact numeric parsing methods with wrapper classes for result handling.

/**
 * Wrapper class for long parsing results
 */
public static class LongWrapper implements Serializable {
    public transient long value;  // Parsed long value
}

/**
 * Wrapper class for int parsing results  
 */
public static class IntWrapper implements Serializable {
    public transient int value;   // Parsed int value
}

// Safe parsing methods (return false on failure)
/**
 * Parse string to long with error handling
 * @param toLongResult Wrapper to store result
 * @return true if parsing succeeded
 */
public boolean toLong(LongWrapper toLongResult);

/**
 * Parse string to int with error handling
 * @param intWrapper Wrapper to store result
 * @return true if parsing succeeded
 */
public boolean toInt(IntWrapper intWrapper);

/**
 * Parse string to short with error handling
 * @param intWrapper Wrapper to store result
 * @return true if parsing succeeded
 */
public boolean toShort(IntWrapper intWrapper);

/**
 * Parse string to byte with error handling
 * @param intWrapper Wrapper to store result
 * @return true if parsing succeeded
 */
public boolean toByte(IntWrapper intWrapper);

// Exact parsing methods (throw exceptions on failure)
/**
 * Parse string to long (throws exception on failure)
 * @return Parsed long value
 * @throws NumberFormatException if parsing fails
 */
public long toLongExact();

/**
 * Parse string to int (throws exception on failure)
 * @return Parsed int value
 * @throws NumberFormatException if parsing fails
 */
public int toIntExact();

/**
 * Parse string to short (throws exception on failure)
 * @return Parsed short value
 * @throws NumberFormatException if parsing fails
 */
public short toShortExact();

/**
 * Parse string to byte (throws exception on failure)
 * @return Parsed byte value
 * @throws NumberFormatException if parsing fails
 */
public byte toByteExact();

Usage Examples:

import org.apache.spark.unsafe.types.UTF8String;

// Safe parsing: the boolean return value reports success, and the parsed
// number is delivered through the wrapper object
UTF8String.LongWrapper longResult = new UTF8String.LongWrapper();
UTF8String numStr = UTF8String.fromString("123");

boolean parsedOk = numStr.toLong(longResult);
if (!parsedOk) {
    System.out.println("Failed to parse as long");
} else {
    System.out.println("Parsed: " + longResult.value);
}

// Exact parsing: a malformed number surfaces as NumberFormatException
try {
    UTF8String digits = UTF8String.fromString("456");
    System.out.println("Parsed: " + digits.toIntExact());
} catch (NumberFormatException e) {
    System.out.println("Invalid number format");
}

String Comparison

Comprehensive comparison operations including lexicographic ordering and distance calculations.

/**
 * Compare strings lexicographically
 * @param other String to compare with
 * @return Negative, zero, or positive value
 */
public int compareTo(UTF8String other);

/**
 * Compare strings (alias for compareTo)
 * @param other String to compare with
 * @return Negative, zero, or positive value
 */
public int compare(UTF8String other);

/**
 * Check equality with another object
 * @param other Object to compare with
 * @return true if equal
 */
public boolean equals(Object other);

/**
 * Calculate hash code
 * @return Hash code value
 */
public int hashCode();

/**
 * Calculate Levenshtein distance
 * @param other String to compare with
 * @return Edit distance
 */
public int levenshteinDistance(UTF8String other);

/**
 * Calculate Levenshtein distance with threshold
 * @param other String to compare with
 * @param threshold Maximum distance to calculate
 * @return Edit distance or -1 if exceeds threshold
 */
public int levenshteinDistance(UTF8String other, int threshold);

Utility Operations

General utility methods for string conversion, copying, and serialization.

/**
 * Convert to Java String
 * @return Java String representation
 */
public String toString();

/**
 * Clone the UTF8String
 * @return Cloned UTF8String
 */
public UTF8String clone();

/**
 * Create a copy of the UTF8String
 * @return Copied UTF8String
 */
public UTF8String copy();

// Serialization support methods
/**
 * Write object to ObjectOutput (Externalizable)
 * @param out ObjectOutput stream
 * @throws IOException if I/O error occurs
 */
public void writeExternal(ObjectOutput out) throws IOException;

/**
 * Read object from ObjectInput (Externalizable)
 * @param in ObjectInput stream
 * @throws IOException if I/O error occurs
 * @throws ClassNotFoundException if class not found
 */
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException;

/**
 * Write using Kryo serialization
 * @param kryo Kryo instance
 * @param out Output stream
 */
public void write(Kryo kryo, Output out);

/**
 * Read using Kryo serialization
 * @param kryo Kryo instance
 * @param in Input stream
 */
public void read(Kryo kryo, Input in);

UTF8StringBuilder

Efficient builder for constructing UTF8String objects with automatic memory management.

/**
 * Builder for constructing UTF8String objects efficiently
 */
class UTF8StringBuilder {
    /**
     * Create builder with default initial size (16 bytes)
     */
    public UTF8StringBuilder();
    
    /**
     * Create builder with custom initial size
     * @param initialSize Initial buffer size in bytes
     */
    public UTF8StringBuilder(int initialSize);
    
    /**
     * Append UTF8String to builder
     * @param value UTF8String to append
     */
    public void append(UTF8String value);
    
    /**
     * Append Java String to builder
     * @param value Java String to append
     */
    public void append(String value);
    
    /**
     * Append raw bytes to builder
     * @param base Base object (null for off-heap)
     * @param offset Offset within object or address
     * @param length Number of bytes to append
     */
    public void appendBytes(Object base, long offset, int length);
    
    /**
     * Build final UTF8String from accumulated data
     * @return Constructed UTF8String
     */
    public UTF8String build();
}

Usage Examples:

import org.apache.spark.unsafe.UTF8StringBuilder;
import org.apache.spark.unsafe.types.*;

// Basic string operations
UTF8String str = UTF8String.fromString("Hello World");
UTF8String upper = str.toUpperCase();
UTF8String trimmed = str.trim();

// String searching and manipulation
boolean hasHello = str.contains(UTF8String.fromString("Hello"));
UTF8String substr = str.substring(0, 5);
UTF8String[] parts = str.split(UTF8String.fromString(" "), -1);

// Using UTF8StringBuilder. It lives in org.apache.spark.unsafe, not in the
// types package, so the wildcard import alone does not bring it into scope.
UTF8StringBuilder builder = new UTF8StringBuilder();
builder.append(UTF8String.fromString("Hello"));
builder.append(" ");
builder.append(UTF8String.fromString("World"));
UTF8String result = builder.build();

// Comparison and sorting
UTF8String str1 = UTF8String.fromString("apple");
UTF8String str2 = UTF8String.fromString("banana");
int comparison = str1.compareTo(str2); // negative value

// Numeric parsing (toIntExact throws NumberFormatException on invalid input)
UTF8String numStr = UTF8String.fromString("123");
int value = numStr.toIntExact();

Performance Characteristics

  • Zero-Copy Operations: Many operations avoid memory copying by using views
  • Memory Efficiency: Direct UTF-8 storage without Java String overhead
  • Fast Comparisons: Optimized comparison using prefix sorting
  • Lazy Evaluation: Some operations defer computation until needed
  • Low-Level Operations: Core operations read and write raw memory directly (on-heap or off-heap), so the JVM's JIT can compile them to efficient native code