Low-level memory operations and data structures for high-performance computation in Apache Spark
npx @tessl/cli install tessl/maven-org-apache-spark--spark-unsafe_2_11@2.4.0
Apache Spark Unsafe provides low-level memory operations and high-performance data structures for Apache Spark's internal operations. It includes unsafe memory operations, specialized data types for efficient string and interval handling, memory allocators (heap and off-heap), array manipulation utilities, hashing functions, and bitset operations.
Important: This library is designed for internal Spark use to achieve maximum performance by bypassing Java's safety mechanisms for direct memory access. It is not suitable for general application development but is critical for Spark's core engine performance.
pom.xml: <dependency><groupId>org.apache.spark</groupId><artifactId>spark-unsafe_2.11</artifactId><version>2.4.8</version></dependency>
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.UnsafeAlignedOffset;
import org.apache.spark.unsafe.KVIterator;
import org.apache.spark.unsafe.types.UTF8String;
import org.apache.spark.unsafe.types.CalendarInterval;
import org.apache.spark.unsafe.types.ByteArray;
import org.apache.spark.unsafe.memory.MemoryAllocator;
import org.apache.spark.unsafe.memory.MemoryBlock;
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.UnsafeAlignedOffset;
import org.apache.spark.unsafe.types.UTF8String;
import org.apache.spark.unsafe.types.ByteArray;
import org.apache.spark.unsafe.memory.HeapMemoryAllocator;
import org.apache.spark.unsafe.memory.MemoryBlock;
// Basic unsafe memory operations
long address = Platform.allocateMemory(1024);
Platform.putLong(null, address, 42L);
long value = Platform.getLong(null, address);
Platform.freeMemory(address);
// UTF8String operations
UTF8String str = UTF8String.fromString("Hello, World!");
UTF8String upper = str.toUpperCase();
boolean contains = str.contains(UTF8String.fromString("World"));
// Memory allocation with aligned offset
HeapMemoryAllocator allocator = new HeapMemoryAllocator();
int headerSize = UnsafeAlignedOffset.getUaoSize();
MemoryBlock block = allocator.allocate(headerSize + 1024);
UnsafeAlignedOffset.putSize(block.getBaseObject(), block.getBaseOffset(), 1024);
allocator.free(block);
// ByteArray operations
byte[] data1 = "Hello".getBytes();
byte[] data2 = "World".getBytes();
byte[] combined = ByteArray.concat(data1, " ".getBytes(), data2);
The Spark Unsafe module is organized into several key functional areas:
Platform class
The core unsafe platform operations provide direct memory access capabilities that bypass Java's safety mechanisms for maximum performance.
public final class Platform {
// Array base offsets
public static final int BOOLEAN_ARRAY_OFFSET;
public static final int BYTE_ARRAY_OFFSET;
public static final int SHORT_ARRAY_OFFSET;
public static final int INT_ARRAY_OFFSET;
public static final int LONG_ARRAY_OFFSET;
public static final int FLOAT_ARRAY_OFFSET;
public static final int DOUBLE_ARRAY_OFFSET;
// Platform capabilities
public static boolean unaligned();
// Memory operations
public static long allocateMemory(long size);
public static void freeMemory(long address);
public static long reallocateMemory(long address, long oldSize, long newSize);
public static java.nio.ByteBuffer allocateDirectBuffer(int size);
}
Memory management capabilities include both heap and off-heap allocation strategies with debug support.
public interface MemoryAllocator {
public static final MemoryAllocator UNSAFE;
public static final MemoryAllocator HEAP;
MemoryBlock allocate(long size);
void free(MemoryBlock memory);
}
public class MemoryBlock {
public int pageNumber;
public MemoryBlock(Object obj, long offset, long length);
public long size();
public void fill(byte value);
}
High-performance UTF-8 string implementation optimized for Spark SQL operations with comprehensive string manipulation capabilities.
public final class UTF8String implements Comparable<UTF8String> {
public static final UTF8String EMPTY_UTF8;
// Creation methods
public static UTF8String fromString(String str);
public static UTF8String fromBytes(byte[] bytes);
public static UTF8String concat(UTF8String... inputs);
// Core operations
public int numBytes();
public int numChars();
public UTF8String substring(int start, int until);
public boolean contains(UTF8String substring);
public UTF8String toUpperCase();
public UTF8String toLowerCase();
}
Calendar interval representation for handling time periods with month and microsecond precision.
public final class CalendarInterval {
// Time constants
public static final long MICROS_PER_SECOND = 1000000L;
public static final long MICROS_PER_MINUTE = 60000000L;
public static final long MICROS_PER_HOUR = 3600000000L;
public static final long MICROS_PER_DAY = 86400000000L;
public int months;
public long microseconds;
public CalendarInterval(int months, long microseconds);
public static CalendarInterval fromString(String s);
public CalendarInterval add(CalendarInterval that);
}
High-performance utilities for byte arrays and memory-backed long arrays with optimized equality checking and alignment operations.
public class ByteArrayMethods {
public static final int MAX_ROUNDED_ARRAY_LENGTH;
public static long nextPowerOf2(long num);
public static int roundNumberOfBytesToNearestWord(int numBytes);
public static boolean arrayEquals(Object leftBase, long leftOffset,
Object rightBase, long rightOffset, long length);
}
public final class LongArray {
public LongArray(MemoryBlock memory);
public long size();
public void set(int index, long value);
public long get(int index);
}
Murmur3 hash implementation and bitset manipulation utilities for high-performance data processing.
public final class Murmur3_x86_32 {
public Murmur3_x86_32(int seed);
public int hashInt(int input);
public int hashLong(long input);
public static int hashInt(int input, int seed);
public static int hashLong(long input, int seed);
}
public final class BitSetMethods {
public static void set(Object baseObject, long baseOffset, int index);
public static boolean isSet(Object baseObject, long baseOffset, int index);
public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords);
}
public class MemoryLocation {
public MemoryLocation(Object obj, long offset);
public Object getBaseObject();
public long getBaseOffset();
public void setObjAndOffset(Object newObj, long newOffset);
}
public abstract class KVIterator<K, V> {
public abstract boolean next();
public abstract K getKey();
public abstract V getValue();
public abstract void close();
}
public class HeapMemoryAllocator implements MemoryAllocator {
public MemoryBlock allocate(long size);
public void free(MemoryBlock memory);
}
public class UnsafeMemoryAllocator implements MemoryAllocator {
public MemoryBlock allocate(long size);
public void free(MemoryBlock memory);
}
public class UnsafeAlignedOffset {
public static int getUaoSize();
public static int getSize(Object object, long offset);
public static void putSize(Object object, long offset, int value);
}