Specialized data types including calendar intervals, variant values, and utility classes for date/time operations and collation support. These components provide essential data structures and utilities for handling complex data types in distributed computing environments.
Represents calendar intervals with months, days, and microseconds components, implementing Serializable and Comparable interfaces.
/**
 * Interval value stored as three separate components: months, days and microseconds.
 * The class is declared final, all three fields are public and final, and it
 * implements both Serializable and Comparable.
 */
public final class CalendarInterval implements Serializable, Comparable<CalendarInterval> {
// Fields
/** Months component of the interval. */
public final int months;
/** Days component of the interval. */
public final int days;
/** Microseconds component of the interval. */
public final long microseconds;
// Constructor
/** Creates an interval from its months, days and microseconds components. */
public CalendarInterval(int months, int days, long microseconds);
// Methods
/** Compares this interval with another object for equality. */
public boolean equals(Object o);
/** Hash code companion to {@link #equals(Object)}. */
public int hashCode();
/** Returns a human-readable representation of the interval. */
public String toString();
/** Returns the date part of the interval (months and days) as a java.time.Period. */
public Period extractAsPeriod();
/** Returns the time part of the interval (microseconds) as a java.time.Duration. */
public Duration extractAsDuration();
/**
 * Orders this interval relative to {@code o}.
 * NOTE(review): the rule used to rank the three components against each other is not
 * visible in this listing - confirm against the implementation before relying on it.
 */
public int compareTo(CalendarInterval o);
}

Physical representation of the Variant type for semi-structured data, supporting JSON conversion and serialization.
/** Serializable holder for the two byte arrays that make up a Variant: its value and its metadata. */
public class VariantVal implements Serializable {
// Constructor
/** Wraps the given value and metadata byte arrays. */
public VariantVal(byte[] value, byte[] metadata);
// Instance methods
/** Returns the value bytes. */
public byte[] getValue();
/** Returns the metadata bytes. */
public byte[] getMetadata();
/** Returns a debug-oriented string representation. */
public String debugString();
/** Converts the variant to a JSON string, using {@code zoneId} for the conversion. */
public String toJson(ZoneId zoneId);
/** Returns a string representation of the variant. */
public String toString();
/** Compares this variant with another object for equality. */
public boolean equals(Object other);
/** Hash code companion to {@link #equals(Object)}. */
public int hashCode();
// Static methods
/**
 * Reads a variant out of UnsafeRow storage.
 * NOTE(review): {@code offsetAndSize} is a single long that encodes both an offset and a
 * size; the exact bit layout is not shown in this listing - confirm before constructing one.
 */
public static VariantVal readFromUnsafeRow(long offsetAndSize, Object baseObject, long baseOffset);
}

Utility methods for byte array operations including comparison, padding, and concatenation.
/** Static helpers for working with raw byte arrays: comparison, substring, concatenation and padding. */
public final class ByteArray {
// Constants
/** Shared byte-array constant; presumably a zero-length array, judging by the name - confirm. */
public static final byte[] EMPTY_BYTE;
// Static methods
/** Writes {@code src} to the location described by {@code target} and {@code targetOffset}. */
public static void writeToMemory(byte[] src, Object target, long targetOffset);
/** Returns a sorting prefix for {@code bytes}: its leading bytes (up to 8) packed into a long. */
public static long getPrefix(byte[] bytes);
/** Compares two byte arrays; the result is negative, zero or positive like a comparator. */
public static int compareBinary(byte[] leftBase, byte[] rightBase);
/** SQL-style substring: {@code pos} is 1-based and {@code len} is the number of bytes to take. */
public static byte[] subStringSQL(byte[] bytes, int pos, int len);
/** Concatenates the inputs in order. */
public static byte[] concat(byte[]... inputs);
/** Concatenates the inputs in order, placing {@code delimiter} between consecutive inputs. */
public static byte[] concatWS(byte[] delimiter, byte[]... inputs);
/** Left-pads {@code bytes} with {@code pad} up to {@code len} bytes (e.g. "Hi", 5, "*" gives "***Hi"). */
public static byte[] lpad(byte[] bytes, int len, byte[] pad);
/** Right-pads {@code bytes} with {@code pad} up to {@code len} bytes (e.g. "Hi", 5, "*" gives "Hi***"). */
public static byte[] rpad(byte[] bytes, int len, byte[] pad);
}

Factory for collation-aware string operations with support for different collation providers (the built-in Spark provider and ICU).
/**
 * Static entry point for looking up collations by name or numeric id and for deriving
 * collation-dependent artifacts (keys, ICU string searches, provider checks).
 */
public final class CollationFactory {
// Constants
/** Names of the collation providers this factory accepts. */
public static final List<String> SUPPORTED_PROVIDERS;
// Collation management
/** Resolves a collation name to its numeric id; throws SparkException for a name it cannot resolve. */
public static int collationNameToId(String collationName) throws SparkException;
/** Resolves a multi-part collation name to its fully qualified form. */
public static String resolveFullyQualifiedName(String[] collationName) throws SparkException;
/** Returns the fully qualified name for a collation id. */
public static String fullyQualifiedName(int collationId);
/** Fetches the collation registered under the given id. */
public static Collation fetchCollation(int collationId);
/** Fetches the collation with the given name; throws SparkException for a name it cannot resolve. */
public static Collation fetchCollation(String collationName) throws SparkException;
/** Lists the identifiers of all available collations. */
public static List<CollationIdentifier> listCollations();
/** Loads the metadata record for one collation identifier. */
public static CollationMeta loadCollationMeta(CollationIdentifier collationIdentifier);
// Collation properties
/** Whether the collation with this id ignores case. */
public static boolean isCaseInsensitive(int collationId);
/** Whether the collation with this id ignores accents. */
public static boolean isAccentInsensitive(int collationId);
/** Validates a provider name; throws SparkException when it is not acceptable. */
public static void assertValidProvider(String provider) throws SparkException;
// String operations
/** Applies the collation's trimming policy to {@code input}. */
public static UTF8String applyTrimmingPolicy(UTF8String input, int collationId);
/** Whether trim functions ignore spaces for the given provider. */
public static boolean ignoresSpacesInTrimFunctions(String provider);
/** Returns the collation key of {@code input}, usable for sorting under the collation. */
public static UTF8String getCollationKey(UTF8String input, int collationId);
/** Byte-array form of {@link #getCollationKey(UTF8String, int)}. */
public static byte[] getCollationKeyBytes(UTF8String input, int collationId);
// ICU support
/** Returns the ICU locale names known to the factory. */
public static String[] getICULocaleNames();
/** Builds an ICU StringSearch that looks for {@code pattern} inside {@code target} under the collation. */
public static StringSearch getStringSearch(UTF8String target, UTF8String pattern, int collationId);
/** Same search, taking both arguments as java.lang.String. */
public static StringSearch getStringSearch(String target, String pattern, int collationId);
/** NOTE(review): mixed String/UTF8String overload - confirm it exists in the targeted release. */
public static StringSearch getStringSearch(String target, UTF8String pattern, int collationId);
// Utilities
/** Builds (does not throw) the exception used to report an invalid collation name. */
public static SparkException collationInvalidNameException(String collationName);
/** Returns suggestion text naming the closest valid collation names for an invalid one. */
public static String getClosestSuggestionsOnInvalidName(String invalidCollationName);
}

Entry point for collation-aware string expressions with comprehensive string operation support.
public final class CollationSupport {
  // String operations with collation support.
  //
  // Every expression is exposed through its own nested class with a static exec(...)
  // entry point. They cannot be overloads of one class: several of them take exactly
  // (UTF8String, UTF8String, int) and differ only in return type, which Java rejects.
  // Call them as CollationSupport.<Expression>.exec(...).

  /** Split: splits {@code s} around the delimiter {@code d}. */
  public static class StringSplitSQL {
    public static UTF8String[] exec(UTF8String s, UTF8String d, int collationId);
  }

  /** Contains: whether {@code l} contains {@code r} under the collation. */
  public static class Contains {
    public static boolean exec(UTF8String l, UTF8String r, int collationId);
  }

  /** StartsWith: whether {@code l} starts with {@code r} under the collation. */
  public static class StartsWith {
    public static boolean exec(UTF8String l, UTF8String r, int collationId);
  }

  /** EndsWith: whether {@code l} ends with {@code r} under the collation. */
  public static class EndsWith {
    public static boolean exec(UTF8String l, UTF8String r, int collationId);
  }

  /** Case conversion to upper case. */
  public static class Upper {
    public static UTF8String exec(UTF8String v, int collationId, boolean useICU);
  }

  /** Case conversion to lower case. */
  public static class Lower {
    public static UTF8String exec(UTF8String v, int collationId, boolean useICU);
  }

  /** Case conversion to title case (initial capitals). */
  public static class InitCap {
    public static UTF8String exec(UTF8String v, int collationId, boolean useICU);
  }

  /** FindInSet: position of {@code word} inside the delimited {@code set}. */
  public static class FindInSet {
    public static int exec(UTF8String word, UTF8String set, int collationId);
  }

  /** StringInstr: position of {@code substring} inside {@code string}. */
  public static class StringInstr {
    public static int exec(UTF8String string, UTF8String substring, int collationId);
  }

  /** Replace: replaces occurrences of {@code search} in {@code src} with {@code replace}. */
  public static class StringReplace {
    public static UTF8String exec(UTF8String src, UTF8String search, UTF8String replace, int collationId);
  }

  /** SubstringIndex: substring of {@code string} relative to the {@code count}-th {@code delimiter}. */
  public static class SubstringIndex {
    public static UTF8String exec(UTF8String string, UTF8String delimiter, int count, int collationId);
  }

  /** Translate: maps characters of {@code source} through {@code dict}. */
  public static class StringTranslate {
    public static UTF8String exec(UTF8String source, Map<String, String> dict, int collationId);
  }

  /** Trim: without arguments beyond the source, or with an explicit trim string and collation. */
  public static class StringTrim {
    public static UTF8String exec(UTF8String srcString);
    public static UTF8String exec(UTF8String srcString, UTF8String trimString, int collationId);
  }

  // Regex support
  /** Whether regex matching for this collation can be done by lowercasing the pattern. */
  public static boolean supportsLowercaseRegex(int collationId);
  /** Regex flags to use for this collation. */
  public static int collationAwareRegexFlags(int collationId);
  /** Returns the lowercase-matching form of {@code regex}. */
  public static UTF8String lowercaseRegex(UTF8String regex);
  /** Returns the form of {@code regex} to use under the given collation. */
  public static UTF8String collationAwareRegex(UTF8String regex, int collationId);
}

Constants for date/time calculations used throughout temporal operations.
/**
 * Unit-conversion constants for date/time arithmetic.
 *
 * <p>Only the base ratios are written as literals; every larger ratio is derived from
 * them so the table cannot drift out of sync. All derived values are {@code long} to
 * keep multiplications such as {@code hours * MICROS_PER_HOUR} out of int overflow.
 */
public class DateTimeConstants {
  public static final int MONTHS_PER_YEAR = 12;

  public static final byte DAYS_PER_WEEK = 7;

  public static final long HOURS_PER_DAY = 24L;

  public static final long MINUTES_PER_HOUR = 60L;

  public static final long SECONDS_PER_MINUTE = 60L;
  /** 3,600 seconds. */
  public static final long SECONDS_PER_HOUR = MINUTES_PER_HOUR * SECONDS_PER_MINUTE;
  /** 86,400 seconds. */
  public static final long SECONDS_PER_DAY = HOURS_PER_DAY * SECONDS_PER_HOUR;

  public static final long MILLIS_PER_SECOND = 1000L;
  /** 60,000 milliseconds. */
  public static final long MILLIS_PER_MINUTE = SECONDS_PER_MINUTE * MILLIS_PER_SECOND;
  /** 3,600,000 milliseconds. */
  public static final long MILLIS_PER_HOUR = MINUTES_PER_HOUR * MILLIS_PER_MINUTE;
  /** 86,400,000 milliseconds. */
  public static final long MILLIS_PER_DAY = HOURS_PER_DAY * MILLIS_PER_HOUR;

  public static final long MICROS_PER_MILLIS = 1000L;
  /** 1,000,000 microseconds. */
  public static final long MICROS_PER_SECOND = MILLIS_PER_SECOND * MICROS_PER_MILLIS;
  /** 60,000,000 microseconds. */
  public static final long MICROS_PER_MINUTE = SECONDS_PER_MINUTE * MICROS_PER_SECOND;
  /** 3,600,000,000 microseconds (does not fit in an int). */
  public static final long MICROS_PER_HOUR = MINUTES_PER_HOUR * MICROS_PER_MINUTE;
  /** 86,400,000,000 microseconds. */
  public static final long MICROS_PER_DAY = HOURS_PER_DAY * MICROS_PER_HOUR;

  public static final long NANOS_PER_MICROS = 1000L;
  /** 1,000,000 nanoseconds. */
  public static final long NANOS_PER_MILLIS = MICROS_PER_MILLIS * NANOS_PER_MICROS;
  /** 1,000,000,000 nanoseconds. */
  public static final long NANOS_PER_SECOND = MILLIS_PER_SECOND * NANOS_PER_MILLIS;
}

/** Enhanced UTF8String operations with comprehensive collation support for case-insensitive and accent-insensitive string operations. */
public final class CollationAwareUTF8String {
// Every method takes the numeric collation id as its last argument and applies that
// collation's rules to the operation.

// Case conversion
public static UTF8String toLowerCase(UTF8String input, int collationId);
public static UTF8String toUpperCase(UTF8String input, int collationId);
public static UTF8String toTitleCase(UTF8String input, int collationId);
// Search: indexOf starts looking for pattern at position start within target.
// NOTE(review): whether start counts bytes or code points is not shown here - confirm.
public static int indexOf(UTF8String target, UTF8String pattern, int start, int collationId);
public static boolean startsWith(UTF8String string, UTF8String prefix, int collationId);
public static boolean endsWith(UTF8String string, UTF8String suffix, int collationId);
public static boolean contains(UTF8String string, UTF8String pattern, int collationId);
// Split input around pattern; the usage example passes -1 as limit.
// NOTE(review): presumably -1 means "no limit", as with String.split - confirm.
public static UTF8String[] split(UTF8String input, UTF8String pattern, int limit, int collationId);
// Replace occurrences of search in input with replacement.
public static UTF8String replace(UTF8String input, UTF8String search, UTF8String replacement, int collationId);
// Comparison and equality under the collation.
public static int compareTo(UTF8String left, UTF8String right, int collationId);
public static boolean equals(UTF8String left, UTF8String right, int collationId);
}

String constants and special Unicode code points for collation operations.
/**
 * Names of the predefined collations, as string constants.
 *
 * <p>Each constant's value is identical to its identifier, so the constants can be
 * passed wherever a collation name is expected.
 */
public final class CollationNames {
  /** Binary UTF-8 collation. */
  public static final String UTF8_BINARY = "UTF8_BINARY";

  /** Lowercase-based UTF-8 collation. */
  public static final String UTF8_LCASE = "UTF8_LCASE";

  /** Unicode collation. */
  public static final String UNICODE = "UNICODE";

  /** Unicode collation with the {@code _CI} suffix. */
  public static final String UNICODE_CI = "UNICODE_CI";
}
/** Unicode code points that need special handling, exposed as int constants. */
public final class SpecialCodePointConstants {
/** U+0307 COMBINING DOT ABOVE. */
public static final int COMBINING_DOT = 0x0307;
/** U+0069 LATIN SMALL LETTER I. */
public static final int ASCII_SMALL_I = 0x0069;
/** U+0020 SPACE. */
public static final int ASCII_SPACE = 0x0020;
/** U+03A3 GREEK CAPITAL LETTER SIGMA. */
public static final int GREEK_CAPITAL_SIGMA = 0x03A3;
/** U+03C3 GREEK SMALL LETTER SIGMA. */
public static final int GREEK_SMALL_SIGMA = 0x03C3;
/** U+03C2 GREEK SMALL LETTER FINAL SIGMA (the word-final form of small sigma). */
public static final int GREEK_FINAL_SIGMA = 0x03C2;
/** U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE. */
public static final int CAPITAL_I_WITH_DOT_ABOVE = 0x0130;
}

import org.apache.spark.unsafe.types.CalendarInterval;
import java.time.Period;
import java.time.Duration;
// Build two intervals from explicit (months, days, microseconds) components
CalendarInterval interval1 = new CalendarInterval(2, 15, 3_600_000_000L); // 2 months, 15 days, 1 hour
CalendarInterval interval2 = new CalendarInterval(0, 0, 1_800_000_000L); // 30 minutes

// The components are plain public fields
long micros = interval1.microseconds; // 3600000000
int days = interval1.days; // 15
int months = interval1.months; // 2

// Split the interval into java.time values
Duration duration = interval1.extractAsDuration(); // Time part (1 hour)
Period period = interval1.extractAsPeriod(); // Date part (2 months, 15 days)

// Human-readable rendering
String str = interval1.toString();

// Ordering between intervals, e.g. for aggregation
int comparison = interval1.compareTo(interval2);

// Equality and hashing
boolean equal = interval1.equals(interval2);
int hash = interval1.hashCode();

import org.apache.spark.unsafe.types.VariantVal;
import java.time.ZoneId;
// Create variant value.
// VariantVal wraps the *binary* Variant encoding, not JSON text: raw JSON bytes with
// made-up metadata are not a valid variant and fail as soon as the value is decoded.
// Build the two arrays with VariantBuilder instead (parseJson declares IOException).
org.apache.spark.types.variant.Variant parsed =
    org.apache.spark.types.variant.VariantBuilder.parseJson("{\"name\":\"John\",\"age\":30}", false);
byte[] value = parsed.getValue();       // Variant-encoded value
byte[] metadata = parsed.getMetadata(); // Variant metadata
VariantVal variant = new VariantVal(value, metadata);
// Access components
byte[] valueBytes = variant.getValue();
byte[] metadataBytes = variant.getMetadata();
// JSON conversion
String json = variant.toJson(ZoneId.systemDefault());
String defaultJson = variant.toString(); // JSON with a fixed zone (UTC upstream), not the system default
// Debug representation
String debug = variant.debugString();
// Equality and hashing
VariantVal other = new VariantVal(value, metadata);
boolean equal = variant.equals(other); // Byte-level comparison
int hash = variant.hashCode();
// Read from UnsafeRow storage.
// The field word packs the offset into the upper 32 bits and the total size into the
// lower 32 bits, so it has to be assembled with a shift; the values below are placeholders.
int fieldOffset = 16;
int fieldSize = 32;
long offsetAndSize = ((long) fieldOffset << 32) | fieldSize;
Object baseObject = someUnsafeRowObject;
long baseOffset = someBaseOffset;
VariantVal fromRow = VariantVal.readFromUnsafeRow(offsetAndSize, baseObject, baseOffset);

import org.apache.spark.unsafe.types.ByteArray;
// Encode text with an explicit charset: String.getBytes() without one uses the platform
// default before Java 18, so the resulting bytes could differ between machines.
final java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;
// Concatenation operations
byte[] part1 = "Hello".getBytes(utf8);
byte[] part2 = " ".getBytes(utf8);
byte[] part3 = "World".getBytes(utf8);
byte[] concatenated = ByteArray.concat(part1, part2, part3); // "Hello World"
// Concatenation with delimiter
byte[] delimiter = ",".getBytes(utf8);
byte[] csvData = ByteArray.concatWS(delimiter, part1, part3); // "Hello,World"
// Padding operations
byte[] data = "Hi".getBytes(utf8);
byte[] pad = "*".getBytes(utf8);
byte[] leftPadded = ByteArray.lpad(data, 5, pad); // "***Hi"
byte[] rightPadded = ByteArray.rpad(data, 5, pad); // "Hi***"
// SQL-style substring
byte[] text = "Hello World".getBytes(utf8);
byte[] substring = ByteArray.subStringSQL(text, 1, 5); // "Hello" (1-based indexing)
// Binary comparison
byte[] array1 = "abc".getBytes(utf8);
byte[] array2 = "abd".getBytes(utf8);
int comparison = ByteArray.compareBinary(array1, array2); // < 0
// Sorting prefix (first 8 bytes as long for sorting)
byte[] sortData = "example data for sorting".getBytes(utf8);
long prefix = ByteArray.getPrefix(sortData);
// Memory operations.
// Platform is org.apache.spark.unsafe.Platform and must be imported alongside ByteArray.
// The allocation is off-heap, so the target object is null and the offset is the raw
// address; the block must be released with Platform.freeMemory once it is no longer used.
long targetAddress = Platform.allocateMemory(data.length);
ByteArray.writeToMemory(data, null, targetAddress);
Platform.freeMemory(targetAddress);

import org.apache.spark.sql.catalyst.util.CollationFactory;
import org.apache.spark.unsafe.types.UTF8String;
// Get collation ID from name (collationNameToId declares SparkException)
int collationId = CollationFactory.collationNameToId("UTF8_LCASE");
// Check collation properties
boolean caseInsensitive = CollationFactory.isCaseInsensitive(collationId);
boolean accentInsensitive = CollationFactory.isAccentInsensitive(collationId);
// Get collation key for sorting
UTF8String text = UTF8String.fromString("Hello World");
UTF8String collationKey = CollationFactory.getCollationKey(text, collationId);
byte[] keyBytes = CollationFactory.getCollationKeyBytes(text, collationId);
// List all available collations
List<CollationIdentifier> collations = CollationFactory.listCollations();
// Get ICU locale names
String[] locales = CollationFactory.getICULocaleNames();
// String search with collation.
// StringSearch is an ICU facility and needs an ICU collator. UTF8_LCASE is not an ICU
// collation and has no collator to hand over, so the search uses an ICU collation id.
int icuCollationId = CollationFactory.collationNameToId("UNICODE_CI");
StringSearch search = CollationFactory.getStringSearch(
    UTF8String.fromString("Hello World"),
    UTF8String.fromString("world"),
    icuCollationId
);

import org.apache.spark.sql.catalyst.util.DateTimeConstants;
// Convert between time units: each step multiplies by the ratio to the next finer unit
// (hours -> seconds -> milliseconds -> microseconds -> nanoseconds)
long hours = 5;
long seconds = hours * DateTimeConstants.SECONDS_PER_HOUR;
long millis = seconds * DateTimeConstants.MILLIS_PER_SECOND;
long micros = millis * DateTimeConstants.MICROS_PER_MILLIS;
long nanos = micros * DateTimeConstants.NANOS_PER_MICROS;
// Calendar calculations
int totalMonths = 2 * DateTimeConstants.MONTHS_PER_YEAR + 6; // 2.5 years
// DAYS_PER_WEEK is a byte; multiplying by the long constant widens the result to long
long weekInMicros = DateTimeConstants.DAYS_PER_WEEK * DateTimeConstants.MICROS_PER_DAY;
// Time interval calculations: 3 hours + 30 minutes + 45 seconds, in microseconds.
// The int literals are widened to long by the long constants, so nothing overflows.
long intervalMicros = 3 * DateTimeConstants.MICROS_PER_HOUR +
30 * DateTimeConstants.MICROS_PER_MINUTE +
45 * DateTimeConstants.MICROS_PER_SECOND;

import org.apache.spark.sql.catalyst.util.CollationSupport;
import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
import org.apache.spark.sql.catalyst.util.CollationFactory;
import org.apache.spark.unsafe.types.UTF8String;
UTF8String text = UTF8String.fromString("Hello World");
UTF8String pattern = UTF8String.fromString("WORLD");
// collationNameToId declares SparkException
int collationId = CollationFactory.collationNameToId("UTF8_LCASE");
// Case-insensitive contains check using CollationSupport.
// Each expression has its own nested class (Contains, StartsWith, EndsWith, ...); a bare
// CollationSupport.exec(text, pattern, collationId) could not say which one is meant.
boolean contains = CollationSupport.Contains.exec(text, pattern, collationId);
// Direct collation-aware operations using CollationAwareUTF8String
UTF8String lower = CollationAwareUTF8String.toLowerCase(text, collationId);
UTF8String upper = CollationAwareUTF8String.toUpperCase(text, collationId);
UTF8String title = CollationAwareUTF8String.toTitleCase(text, collationId);
// Collation-aware search operations
int index = CollationAwareUTF8String.indexOf(text, pattern, 0, collationId);
boolean startsWith = CollationAwareUTF8String.startsWith(text, UTF8String.fromString("hello"), collationId);
boolean endsWith = CollationAwareUTF8String.endsWith(text, UTF8String.fromString("WORLD"), collationId);
// String comparison with collation
int comparison = CollationAwareUTF8String.compareTo(text, pattern, collationId);
boolean equal = CollationAwareUTF8String.equals(text, pattern, collationId);
// Replace and split with collation
UTF8String replaced = CollationAwareUTF8String.replace(
    text,
    UTF8String.fromString("HELLO"),
    UTF8String.fromString("Hi"),
    collationId
);
UTF8String[] parts = CollationAwareUTF8String.split(
    UTF8String.fromString("apple,BANANA,cherry"),
    UTF8String.fromString(","),
    -1,
    collationId
);