Apache Spark Unsafe is a low-level memory management and unsafe operations library that provides direct memory access, optimized data structures, and platform-specific functionality for high-performance data processing within Apache Spark. This library enables bypassing standard Java memory safety mechanisms when necessary, offering direct access to off-heap memory, efficient serialization operations, and optimized data structures for large-scale distributed computing workloads.
Add the dependency to pom.xml:
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-unsafe_2.12</artifactId>
<version>3.5.6</version>
</dependency>
// Memory operations and platform utilities
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.memory.MemoryAllocator;
import org.apache.spark.unsafe.memory.MemoryBlock;
import org.apache.spark.unsafe.memory.MemoryLocation;
import org.apache.spark.unsafe.memory.HeapMemoryAllocator;
import org.apache.spark.unsafe.memory.UnsafeMemoryAllocator;
// Data structures and arrays
import org.apache.spark.unsafe.array.LongArray;
import org.apache.spark.unsafe.array.ByteArrayMethods;
import org.apache.spark.unsafe.types.UTF8String;
import org.apache.spark.unsafe.types.UTF8StringBuilder;
import org.apache.spark.unsafe.types.CalendarInterval;
import org.apache.spark.unsafe.types.ByteArray;
// Hashing and utilities
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.UnsafeAlignedOffset;
import org.apache.spark.sql.catalyst.expressions.HiveHasher;
import org.apache.spark.sql.catalyst.util.DateTimeConstants;
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.memory.*;
import org.apache.spark.unsafe.types.UTF8String;
// Memory allocation example
MemoryAllocator allocator = MemoryAllocator.HEAP;
MemoryBlock block = allocator.allocate(1024);
// Direct memory operations
Platform.putLong(block.getBaseObject(), block.getBaseOffset(), 42L);
long value = Platform.getLong(block.getBaseObject(), block.getBaseOffset());
// UTF8String operations
UTF8String str = UTF8String.fromString("Hello Spark");
UTF8String upper = str.toUpperCase();
int length = str.numChars();
// Clean up
allocator.free(block);
Apache Spark Unsafe is built around several key components:
Core memory allocation and management functionality providing both heap and off-heap memory strategies with automatic pooling and debug support.
interface MemoryAllocator {
MemoryBlock allocate(long size) throws OutOfMemoryError;
void free(MemoryBlock memory);
static final MemoryAllocator HEAP = new HeapMemoryAllocator();
static final MemoryAllocator UNSAFE = new UnsafeMemoryAllocator();
}
class MemoryBlock extends MemoryLocation {
public MemoryBlock(Object obj, long offset, long length);
public long size();
public void fill(byte value);
public static MemoryBlock fromLongArray(long[] array);
}
Low-level unsafe memory operations and platform-specific functionality for direct memory access, copying, and platform feature detection.
final class Platform {
// Memory access methods for all primitive types
public static int getInt(Object object, long offset);
public static void putInt(Object object, long offset, int value);
public static long getLong(Object object, long offset);
public static void putLong(Object object, long offset, long value);
public static byte getByte(Object object, long offset);
public static void putByte(Object object, long offset, byte value);
public static short getShort(Object object, long offset);
public static void putShort(Object object, long offset, short value);
public static float getFloat(Object object, long offset);
public static void putFloat(Object object, long offset, float value);
public static double getDouble(Object object, long offset);
public static void putDouble(Object object, long offset, double value);
public static boolean getBoolean(Object object, long offset);
public static void putBoolean(Object object, long offset, boolean value);
// Volatile operations
public static Object getObjectVolatile(Object object, long offset);
public static void putObjectVolatile(Object object, long offset, Object value);
// Memory allocation and management
public static long allocateMemory(long size);
public static void freeMemory(long address);
public static long reallocateMemory(long address, long oldSize, long newSize);
public static void copyMemory(Object src, long srcOffset, Object dst, long dstOffset, long length);
public static void setMemory(Object object, long offset, long size, byte value);
public static void setMemory(long address, byte value, long size);
// Platform detection and utilities
public static boolean unaligned();
public static boolean cleanerCreateMethodIsDefined();
public static void throwException(Throwable t);
public static ByteBuffer allocateDirectBuffer(int size);
}
Memory-efficient UTF-8 string implementation with comprehensive string manipulation, parsing, and comparison operations optimized for Spark's internal use.
final class UTF8String implements Comparable<UTF8String>, Externalizable, KryoSerializable, Cloneable {
// Factory methods
public static UTF8String fromString(String str);
public static UTF8String fromBytes(byte[] bytes);
public static UTF8String concat(UTF8String... inputs);
// String operations
public UTF8String substring(int start, int until);
public UTF8String toUpperCase();
public UTF8String trim();
public boolean contains(UTF8String substring);
// Parsing
public long toLongExact();
public int toIntExact();
}
Efficient array implementations and utility methods for byte arrays and long arrays with support for both heap and off-heap memory storage.
final class LongArray {
public LongArray(MemoryBlock memory);
public long get(int index);
public void set(int index, long value);
public long size();
public void zeroOut();
}
class ByteArrayMethods {
public static boolean arrayEquals(Object leftBase, long leftOffset, Object rightBase, long rightOffset, long length);
public static boolean contains(byte[] arr, byte[] sub);
public static long nextPowerOf2(long num);
}
High-performance hashing implementations and utility classes including Murmur3 hashing, bitset operations, date/time constants, and platform alignment utilities.
final class Murmur3_x86_32 {
public Murmur3_x86_32(int seed);
public static int hashInt(int input, int seed);
public static int hashLong(long input, int seed);
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
}
final class BitSetMethods {
public static void set(Object baseObject, long baseOffset, int index);
public static boolean isSet(Object baseObject, long baseOffset, int index);
public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);
}
class HiveHasher {
public static int hashInt(int input);
public static int hashLong(long input);
public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes);
}
class DateTimeConstants {
public static final int MONTHS_PER_YEAR = 12;
public static final long HOURS_PER_DAY = 24L;
public static final long SECONDS_PER_MINUTE = 60L;
public static final long MILLIS_PER_SECOND = 1000L;
public static final long MICROS_PER_MILLIS = 1000L;
public static final long NANOS_PER_MICROS = 1000L;
}
class UnsafeAlignedOffset {
public static int getUaoSize();
public static int getSize(Object object, long offset);
public static void putSize(Object object, long offset, int value);
}
class MemoryLocation {
public MemoryLocation(Object obj, long offset);
public final Object getBaseObject();
public final long getBaseOffset();
}
abstract class KVIterator<K, V> {
public abstract boolean next() throws IOException;
public abstract K getKey();
public abstract V getValue();
public abstract void close();
}
final class CalendarInterval implements Serializable {
public final int months;
public final int days;
public final long microseconds;
public CalendarInterval(int months, int days, long microseconds);
}
class UTF8StringBuilder {
public UTF8StringBuilder();
public UTF8StringBuilder(int initialSize);
public void append(UTF8String value);
public UTF8String build();
}
final class ByteArray {
public static final byte[] EMPTY_BYTE;
public static void writeToMemory(byte[] src, Object target, long targetOffset);
public static long getPrefix(byte[] bytes);
public static int compareBinary(byte[] leftBase, byte[] rightBase);
public static byte[] subStringSQL(byte[] bytes, int pos, int len);
public static byte[] concat(byte[]... inputs);
}
The library throws standard Java exceptions:
- OutOfMemoryError - When memory allocation fails
- IOException - During iterator operations
- NumberFormatException - During string parsing operations
- IndexOutOfBoundsException - For array access violations

Most operations are designed for performance and do not perform bounds checking, so an invalid offset, index, or length passed to an unsafe operation is not guaranteed to raise an exception. Users should validate inputs before calling unsafe operations.