or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

array-operations.md data-types-utilities.md hash-bitset-operations.md index.md memory-management.md platform-operations.md utf8-string-processing.md
tile.json

docs/data-types-utilities.md

Data Types and Utilities

Specialized data types including calendar intervals, variant values, and utility classes for date/time operations and collation support. These components provide essential data structures and utilities for handling complex data types in distributed computing environments.

Capabilities

Calendar Interval

Represents calendar intervals with months, days, and microseconds components, implementing Serializable and Comparable interfaces.

/**
 * API summary of a calendar interval split into months, days, and microseconds
 * components. The class is declared Serializable and Comparable to itself.
 * Signatures only; method bodies are not shown on this page.
 */
public final class CalendarInterval implements Serializable, Comparable<CalendarInterval> {
    // Fields
    public final int months;        // months component
    public final int days;          // days component
    public final long microseconds; // microseconds component
    
    // Constructor
    // Takes the three components in the same order as the fields above.
    public CalendarInterval(int months, int days, long microseconds);
    
    // Methods
    public boolean equals(Object o);
    public int hashCode();
    public String toString();
    // Date part of the interval as a java.time.Period (per the usage example on this page).
    public Period extractAsPeriod();
    // Time part of the interval as a java.time.Duration (per the usage example on this page).
    public Duration extractAsDuration();
    // NOTE(review): the usage example describes this as "comparison for aggregation";
    // confirm whether it is meant to define a general ordering.
    public int compareTo(CalendarInterval o);
}

Variant Value

Physical representation of Variant type for semi-structured data, supporting JSON conversion and serialization.

/**
 * API summary of the physical representation of a Variant value: a pair of byte
 * arrays (value and metadata). Signatures only; bodies are not shown on this page.
 *
 * NOTE(review): the byte arrays are presumably the Variant binary encoding rather
 * than raw JSON text — confirm before constructing instances by hand.
 */
public class VariantVal implements Serializable {
    // Constructor
    public VariantVal(byte[] value, byte[] metadata);
    
    // Instance methods
    public byte[] getValue();
    public byte[] getMetadata();
    public String debugString();
    // JSON rendering; takes the time zone to use as an argument.
    public String toJson(ZoneId zoneId);
    public String toString();
    public boolean equals(Object other);
    public int hashCode();
    
    // Static methods
    // offsetAndSize packs an offset and a size into one long (see the usage example);
    // baseObject/baseOffset identify the memory the row lives in.
    public static VariantVal readFromUnsafeRow(long offsetAndSize, Object baseObject, long baseOffset);
}

Byte Array Utilities

Utility methods for byte array operations including comparison, padding, and concatenation.

/**
 * API summary of static helpers for byte arrays: comparison, SQL-style substring,
 * concatenation, and padding. Signatures only; bodies are not shown on this page.
 */
public final class ByteArray {
    // Constants
    public static final byte[] EMPTY_BYTE;
    
    // Static methods
    // Writes src to the location given by (target, targetOffset).
    public static void writeToMemory(byte[] src, Object target, long targetOffset);
    // Sorting prefix; the usage example describes it as the first 8 bytes packed into a long.
    public static long getPrefix(byte[] bytes);
    public static int compareBinary(byte[] leftBase, byte[] rightBase);
    // pos is 1-based, as shown in the usage example.
    public static byte[] subStringSQL(byte[] bytes, int pos, int len);
    public static byte[] concat(byte[]... inputs);
    // Concatenation with a delimiter placed between inputs.
    public static byte[] concatWS(byte[] delimiter, byte[]... inputs);
    // Pads on the left / right with pad up to len bytes (see the usage example).
    public static byte[] lpad(byte[] bytes, int len, byte[] pad);
    public static byte[] rpad(byte[] bytes, int len, byte[] pad);
}

Collation Factory

Factory for collation-aware string operations, with support for multiple collation providers (Spark's built-in collations and ICU).

/**
 * API summary of the static factory for collation lookup and collation-aware
 * string helpers. Collations are addressed either by name or by an int id.
 * Signatures only; bodies are not shown on this page.
 *
 * Methods declared with "throws SparkException" require the caller to handle
 * or propagate that exception.
 */
public final class CollationFactory {
    // Constants
    public static final List<String> SUPPORTED_PROVIDERS;
    
    // Collation management
    // Name -> id lookup; the id is what the other methods on this page accept.
    public static int collationNameToId(String collationName) throws SparkException;
    public static String resolveFullyQualifiedName(String[] collationName) throws SparkException;
    public static String fullyQualifiedName(int collationId);
    public static Collation fetchCollation(int collationId);
    public static Collation fetchCollation(String collationName) throws SparkException;
    public static List<CollationIdentifier> listCollations();
    public static CollationMeta loadCollationMeta(CollationIdentifier collationIdentifier);
    
    // Collation properties
    public static boolean isCaseInsensitive(int collationId);
    public static boolean isAccentInsensitive(int collationId);
    public static void assertValidProvider(String provider) throws SparkException;
    
    // String operations
    public static UTF8String applyTrimmingPolicy(UTF8String input, int collationId);
    public static boolean ignoresSpacesInTrimFunctions(String provider);
    // Collation key for sorting (per the usage example), as a UTF8String or as raw bytes.
    public static UTF8String getCollationKey(UTF8String input, int collationId);
    public static byte[] getCollationKeyBytes(UTF8String input, int collationId);
    
    // ICU support
    public static String[] getICULocaleNames();
    // NOTE(review): StringSearch is an ICU type; presumably collationId must refer to an
    // ICU-backed collation for these overloads — confirm.
    public static StringSearch getStringSearch(UTF8String target, UTF8String pattern, int collationId);
    public static StringSearch getStringSearch(String target, String pattern, int collationId);
    public static StringSearch getStringSearch(String target, UTF8String pattern, int collationId);
    
    // Utilities
    // Returns (does not throw) the exception describing an invalid collation name.
    public static SparkException collationInvalidNameException(String collationName);
    public static String getClosestSuggestionsOnInvalidName(String invalidCollationName);
}

Collation Support

Entry point for collation-aware string expressions with comprehensive string operation support.

/**
 * API summary of the entry point for collation-aware string expressions.
 *
 * Each expression is a nested static class exposing its own exec(...) method(s).
 * They cannot be flat overloads on CollationSupport itself, because several of them
 * share the parameter list (UTF8String, UTF8String, int) and differ only in return
 * type, which Java does not allow. Call them as, for example,
 * CollationSupport.Contains.exec(l, r, collationId).
 * Signatures only; bodies are not shown on this page.
 */
public final class CollationSupport {
    // String operations with collation support (one nested class per expression)

    // Split
    public static class StringSplitSQL {
        public static UTF8String[] exec(UTF8String s, UTF8String d, int collationId);
    }

    // Contains / StartsWith / EndsWith
    public static class Contains {
        public static boolean exec(UTF8String l, UTF8String r, int collationId);
    }
    public static class StartsWith {
        public static boolean exec(UTF8String l, UTF8String r, int collationId);
    }
    public static class EndsWith {
        public static boolean exec(UTF8String l, UTF8String r, int collationId);
    }

    // Case conversion
    public static class Upper {
        public static UTF8String exec(UTF8String v, int collationId, boolean useICU);
    }
    public static class Lower {
        public static UTF8String exec(UTF8String v, int collationId, boolean useICU);
    }
    public static class InitCap {
        public static UTF8String exec(UTF8String v, int collationId, boolean useICU);
    }

    // FindInSet
    public static class FindInSet {
        public static int exec(UTF8String word, UTF8String set, int collationId);
    }

    // StringInstr
    public static class StringInstr {
        public static int exec(UTF8String string, UTF8String substring, int collationId);
    }

    // Replace
    public static class StringReplace {
        public static UTF8String exec(UTF8String src, UTF8String search, UTF8String replace, int collationId);
    }

    // SubstringIndex
    public static class SubstringIndex {
        public static UTF8String exec(UTF8String string, UTF8String delimiter, int count, int collationId);
    }

    // Translate
    public static class StringTranslate {
        public static UTF8String exec(UTF8String source, Map<String, String> dict, int collationId);
    }

    // Trim (without and with a collation-aware trim string)
    public static class StringTrim {
        public static UTF8String exec(UTF8String srcString);
        public static UTF8String exec(UTF8String srcString, UTF8String trimString, int collationId);
    }

    // Regex support
    public static boolean supportsLowercaseRegex(int collationId);
    public static int collationAwareRegexFlags(int collationId);
    public static UTF8String lowercaseRegex(UTF8String regex);
    public static UTF8String collationAwareRegex(UTF8String regex, int collationId);
}

Date Time Constants

Constants for date/time calculations used throughout temporal operations.

/**
 * Constants for date/time unit conversions.
 *
 * Base ratios are written as literals; every other constant is derived from them,
 * so the whole table stays consistent and each value is visible to the reader.
 */
public class DateTimeConstants {
    // Calendar units
    public static final int MONTHS_PER_YEAR = 12;
    public static final byte DAYS_PER_WEEK = 7;

    // Base clock ratios
    public static final long HOURS_PER_DAY = 24L;
    public static final long MINUTES_PER_HOUR = 60L;
    public static final long SECONDS_PER_MINUTE = 60L;

    // Seconds
    public static final long SECONDS_PER_HOUR = MINUTES_PER_HOUR * SECONDS_PER_MINUTE; // 3,600
    public static final long SECONDS_PER_DAY = HOURS_PER_DAY * SECONDS_PER_HOUR;       // 86,400

    // Milliseconds
    public static final long MILLIS_PER_SECOND = 1000L;
    public static final long MILLIS_PER_MINUTE = SECONDS_PER_MINUTE * MILLIS_PER_SECOND; // 60,000
    public static final long MILLIS_PER_HOUR = MINUTES_PER_HOUR * MILLIS_PER_MINUTE;     // 3,600,000
    public static final long MILLIS_PER_DAY = HOURS_PER_DAY * MILLIS_PER_HOUR;           // 86,400,000

    // Microseconds
    public static final long MICROS_PER_MILLIS = 1000L;
    public static final long MICROS_PER_SECOND = MILLIS_PER_SECOND * MICROS_PER_MILLIS;  // 1,000,000
    public static final long MICROS_PER_MINUTE = SECONDS_PER_MINUTE * MICROS_PER_SECOND; // 60,000,000
    public static final long MICROS_PER_HOUR = MINUTES_PER_HOUR * MICROS_PER_MINUTE;     // 3,600,000,000
    public static final long MICROS_PER_DAY = HOURS_PER_DAY * MICROS_PER_HOUR;           // 86,400,000,000

    // Nanoseconds
    public static final long NANOS_PER_MICROS = 1000L;
    public static final long NANOS_PER_MILLIS = MICROS_PER_MILLIS * NANOS_PER_MICROS;    // 1,000,000
    public static final long NANOS_PER_SECOND = MILLIS_PER_SECOND * NANOS_PER_MILLIS;    // 1,000,000,000
}

Collation-Aware UTF8String Operations

Enhanced UTF8String operations with comprehensive collation support for case-insensitive and accent-insensitive string operations.

/**
 * API summary of static, collation-aware counterparts of common UTF8String
 * operations. Every method takes the collation id as its last argument.
 * Signatures only; bodies are not shown on this page.
 *
 * NOTE(review): verify these signatures against the Spark version in use before
 * relying on them; this page does not state which release it was generated from.
 */
public final class CollationAwareUTF8String {
    // Case conversion
    public static UTF8String toLowerCase(UTF8String input, int collationId);
    public static UTF8String toUpperCase(UTF8String input, int collationId);
    public static UTF8String toTitleCase(UTF8String input, int collationId);
    // Search; start is the position in target to begin searching from.
    public static int indexOf(UTF8String target, UTF8String pattern, int start, int collationId);
    public static boolean startsWith(UTF8String string, UTF8String prefix, int collationId);
    public static boolean endsWith(UTF8String string, UTF8String suffix, int collationId);
    public static boolean contains(UTF8String string, UTF8String pattern, int collationId);
    // Split and replace
    public static UTF8String[] split(UTF8String input, UTF8String pattern, int limit, int collationId);
    public static UTF8String replace(UTF8String input, UTF8String search, UTF8String replacement, int collationId);
    // Comparison under the given collation
    public static int compareTo(UTF8String left, UTF8String right, int collationId);
    public static boolean equals(UTF8String left, UTF8String right, int collationId);
}

Collation Names and Constants

String constants and special Unicode code points for collation operations.

/**
 * Names of the predefined collations, as plain string constants.
 * Each constant's value is identical to its identifier.
 */
public final class CollationNames {
    /** Collation name "UTF8_BINARY". */
    public static final String UTF8_BINARY = "UTF8_BINARY";

    /** Collation name "UTF8_LCASE". */
    public static final String UTF8_LCASE = "UTF8_LCASE";

    /** Collation name "UNICODE". */
    public static final String UNICODE = "UNICODE";

    /** Collation name "UNICODE_CI". */
    public static final String UNICODE_CI = "UNICODE_CI";
}

/**
 * Unicode code points that need special handling in collation-aware
 * string operations, expressed as int code point values.
 */
public final class SpecialCodePointConstants {
    /** U+0307 COMBINING DOT ABOVE. */
    public static final int COMBINING_DOT = 0x0307;

    /** U+0069 LATIN SMALL LETTER I. */
    public static final int ASCII_SMALL_I = 0x0069;

    /** U+0020 SPACE. */
    public static final int ASCII_SPACE = 0x0020;

    /** U+03A3 GREEK CAPITAL LETTER SIGMA. */
    public static final int GREEK_CAPITAL_SIGMA = 0x03A3;

    /** U+03C3 GREEK SMALL LETTER SIGMA. */
    public static final int GREEK_SMALL_SIGMA = 0x03C3;

    /** U+03C2 GREEK SMALL LETTER FINAL SIGMA. */
    public static final int GREEK_FINAL_SIGMA = 0x03C2;

    /** U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE. */
    public static final int CAPITAL_I_WITH_DOT_ABOVE = 0x0130;
}

Usage Examples

Calendar Interval Operations

import org.apache.spark.unsafe.types.CalendarInterval;
import java.time.Period;
import java.time.Duration;

// Build two intervals; constructor arguments are (months, days, microseconds)
CalendarInterval longInterval = new CalendarInterval(2, 15, 3600_000_000L); // 2 months, 15 days, 1 hour
CalendarInterval halfHour = new CalendarInterval(0, 0, 1800_000_000L);      // 30 minutes

// Read the three components straight from the public final fields
int monthPart = longInterval.months;        // 2
int dayPart = longInterval.days;            // 15
long microPart = longInterval.microseconds; // 3600000000

// Human-readable rendering
String rendered = longInterval.toString();

// Convert to java.time types
Period datePart = longInterval.extractAsPeriod();     // Date part (2 months, 15 days)
Duration timePart = longInterval.extractAsDuration(); // Time part (1 hour)

// Comparison (used for aggregation)
int order = longInterval.compareTo(halfHour);

// Equality and hashing
boolean same = longInterval.equals(halfHour);
int hashValue = longInterval.hashCode();

Variant Value Operations

import org.apache.spark.types.variant.Variant;
import org.apache.spark.types.variant.VariantBuilder;
import org.apache.spark.unsafe.types.VariantVal;
import java.time.ZoneId;

// Create variant value.
// VariantVal wraps the Variant *binary* encoding (value bytes + metadata dictionary),
// not JSON text, so the JSON has to be encoded first. Passing raw JSON bytes to the
// constructor would produce a value that toJson() cannot decode.
// NOTE(review): parseJson(String, boolean allowDuplicateKeys) throws IOException — confirm
// the signature against the Spark version in use.
Variant parsed = VariantBuilder.parseJson("{\"name\":\"John\",\"age\":30}", false);
byte[] value = parsed.getValue();
byte[] metadata = parsed.getMetadata(); // Key dictionary produced by the builder
VariantVal variant = new VariantVal(value, metadata);

// Access components
byte[] valueBytes = variant.getValue();
byte[] metadataBytes = variant.getMetadata();

// JSON conversion
String json = variant.toJson(ZoneId.systemDefault());
// toString() takes no zone argument.
// NOTE(review): presumably renders with a fixed zone (e.g. UTC) rather than the system default — confirm.
String defaultJson = variant.toString();

// Debug representation
String debug = variant.debugString();

// Equality and hashing
VariantVal other = new VariantVal(value, metadata);
boolean equal = variant.equals(other); // Byte-level comparison
int hash = variant.hashCode();

// Read from UnsafeRow storage.
// offsetAndSize packs the field's offset into the upper 32 bits and its size into the
// lower 32 bits, so it must be built with a shift rather than a plain 32-bit literal.
long offsetAndSize = (16L << 32) | 32L; // offset = 16, size = 32
Object baseObject = someUnsafeRowObject; // placeholder: the row's backing object
long baseOffset = someBaseOffset;        // placeholder: the row's base offset
VariantVal fromRow = VariantVal.readFromUnsafeRow(offsetAndSize, baseObject, baseOffset);

Byte Array Utilities

import java.nio.charset.StandardCharsets;

import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.types.ByteArray;

// All strings are encoded explicitly as UTF-8 so the byte values do not depend on
// the platform default charset.

// Concatenation operations
byte[] part1 = "Hello".getBytes(StandardCharsets.UTF_8);
byte[] part2 = " ".getBytes(StandardCharsets.UTF_8);
byte[] part3 = "World".getBytes(StandardCharsets.UTF_8);

byte[] concatenated = ByteArray.concat(part1, part2, part3); // "Hello World"

// Concatenation with delimiter
byte[] delimiter = ",".getBytes(StandardCharsets.UTF_8);
byte[] csvData = ByteArray.concatWS(delimiter, part1, part3); // "Hello,World"

// Padding operations
byte[] data = "Hi".getBytes(StandardCharsets.UTF_8);
byte[] pad = "*".getBytes(StandardCharsets.UTF_8);

byte[] leftPadded = ByteArray.lpad(data, 5, pad);  // "***Hi"
byte[] rightPadded = ByteArray.rpad(data, 5, pad); // "Hi***"

// SQL-style substring
byte[] text = "Hello World".getBytes(StandardCharsets.UTF_8);
byte[] substring = ByteArray.subStringSQL(text, 1, 5); // "Hello" (1-based indexing)

// Binary comparison
byte[] array1 = "abc".getBytes(StandardCharsets.UTF_8);
byte[] array2 = "abd".getBytes(StandardCharsets.UTF_8);
int comparison = ByteArray.compareBinary(array1, array2); // < 0

// Sorting prefix (first 8 bytes as long for sorting)
byte[] sortData = "example data for sorting".getBytes(StandardCharsets.UTF_8);
long prefix = ByteArray.getPrefix(sortData);

// Memory operations: copy the bytes into freshly allocated off-heap memory.
// A null target object means targetOffset is interpreted as an absolute address.
long targetAddress = Platform.allocateMemory(data.length);
try {
    ByteArray.writeToMemory(data, null, targetAddress);
} finally {
    // Off-heap memory is not garbage collected; release it even if the write fails.
    Platform.freeMemory(targetAddress);
}

Collation Operations

import java.util.List;

import com.ibm.icu.text.StringSearch;
import org.apache.spark.sql.catalyst.util.CollationFactory;
import org.apache.spark.unsafe.types.UTF8String;

// Get collation ID from name.
// collationNameToId is declared "throws SparkException"; the enclosing method must
// catch or declare it.
int collationId = CollationFactory.collationNameToId("UTF8_LCASE");

// Check collation properties
boolean caseInsensitive = CollationFactory.isCaseInsensitive(collationId);
boolean accentInsensitive = CollationFactory.isAccentInsensitive(collationId);

// Get collation key for sorting
UTF8String text = UTF8String.fromString("Hello World");
UTF8String collationKey = CollationFactory.getCollationKey(text, collationId);
byte[] keyBytes = CollationFactory.getCollationKeyBytes(text, collationId);

// List all available collations.
// NOTE(review): CollationIdentifier is presumably nested inside CollationFactory — confirm.
List<CollationFactory.CollationIdentifier> collations = CollationFactory.listCollations();

// Get ICU locale names
String[] locales = CollationFactory.getICULocaleNames();

// String search with collation.
// StringSearch is an ICU class, so use an ICU-backed collation here rather than
// UTF8_LCASE. NOTE(review): presumably UTF8_LCASE has no ICU collator — confirm.
int icuCollationId = CollationFactory.collationNameToId("UNICODE_CI");
StringSearch search = CollationFactory.getStringSearch(
    UTF8String.fromString("Hello World"),
    UTF8String.fromString("world"),
    icuCollationId
);

Date Time Constants Usage

import org.apache.spark.sql.catalyst.util.DateTimeConstants;

// Walk one quantity down the unit ladder: hours -> seconds -> millis -> micros -> nanos
long hours = 5;
long seconds = DateTimeConstants.SECONDS_PER_HOUR * hours;
long millis = DateTimeConstants.MILLIS_PER_SECOND * seconds;
long micros = DateTimeConstants.MICROS_PER_MILLIS * millis;
long nanos = DateTimeConstants.NANOS_PER_MICROS * micros;

// Calendar arithmetic
int totalMonths = 6 + 2 * DateTimeConstants.MONTHS_PER_YEAR; // 2.5 years
long weekInMicros = DateTimeConstants.DAYS_PER_WEEK * DateTimeConstants.MICROS_PER_DAY;

// Compose an interval of 3 h 30 min 45 s, in microseconds
long hourPart = 3 * DateTimeConstants.MICROS_PER_HOUR;
long minutePart = 30 * DateTimeConstants.MICROS_PER_MINUTE;
long secondPart = 45 * DateTimeConstants.MICROS_PER_SECOND;
long intervalMicros = hourPart + minutePart + secondPart;

Collation-Aware String Operations

import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
import org.apache.spark.sql.catalyst.util.CollationFactory;
import org.apache.spark.sql.catalyst.util.CollationSupport;
import org.apache.spark.unsafe.types.UTF8String;

UTF8String text = UTF8String.fromString("Hello World");
UTF8String pattern = UTF8String.fromString("WORLD");
// collationNameToId is declared "throws SparkException"; the enclosing method must
// catch or declare it.
int collationId = CollationFactory.collationNameToId("UTF8_LCASE");

// Case-insensitive contains check using CollationSupport.
// Each expression lives in its own nested class; a bare CollationSupport.exec(...)
// cannot distinguish Contains from StartsWith/EndsWith, which share the same arguments.
boolean contains = CollationSupport.Contains.exec(text, pattern, collationId);

// Direct collation-aware operations using CollationAwareUTF8String
UTF8String lower = CollationAwareUTF8String.toLowerCase(text, collationId);
UTF8String upper = CollationAwareUTF8String.toUpperCase(text, collationId);
UTF8String title = CollationAwareUTF8String.toTitleCase(text, collationId);

// Collation-aware search operations
int index = CollationAwareUTF8String.indexOf(text, pattern, 0, collationId);
boolean startsWith = CollationAwareUTF8String.startsWith(text, UTF8String.fromString("hello"), collationId);
boolean endsWith = CollationAwareUTF8String.endsWith(text, UTF8String.fromString("WORLD"), collationId);

// String comparison with collation
int comparison = CollationAwareUTF8String.compareTo(text, pattern, collationId);
boolean equal = CollationAwareUTF8String.equals(text, pattern, collationId);

// Replace and split with collation
UTF8String replaced = CollationAwareUTF8String.replace(
    text, 
    UTF8String.fromString("HELLO"), 
    UTF8String.fromString("Hi"), 
    collationId
);

UTF8String[] parts = CollationAwareUTF8String.split(
    UTF8String.fromString("apple,BANANA,cherry"),
    UTF8String.fromString(","),
    -1,
    collationId
);