or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

array-operations.md  hashing-utilities.md  index.md  memory-management.md  platform-operations.md  utf8-string-operations.md
tile.json

docs/index.md

Apache Spark Unsafe

Apache Spark Unsafe is a low-level memory management and unsafe operations library that provides direct memory access, optimized data structures, and platform-specific functionality for high-performance data processing within Apache Spark. This library enables bypassing standard Java memory safety mechanisms when necessary, offering direct access to off-heap memory, efficient serialization operations, and optimized data structures for large-scale distributed computing workloads.

Package Information

  • Package Name: spark-unsafe_2.12
  • Package Type: Maven
  • Language: Java
  • Group ID: org.apache.spark
  • Artifact ID: spark-unsafe_2.12
  • Installation: Add to Maven pom.xml:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-unsafe_2.12</artifactId>
    <version>3.5.6</version>
</dependency>

Core Imports

// Memory operations and platform utilities
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.memory.MemoryAllocator;
import org.apache.spark.unsafe.memory.MemoryBlock;
import org.apache.spark.unsafe.memory.MemoryLocation;
import org.apache.spark.unsafe.memory.HeapMemoryAllocator;
import org.apache.spark.unsafe.memory.UnsafeMemoryAllocator;

// Data structures and arrays
import org.apache.spark.unsafe.array.LongArray;
import org.apache.spark.unsafe.array.ByteArrayMethods;
import org.apache.spark.unsafe.types.UTF8String;
import org.apache.spark.unsafe.types.UTF8StringBuilder;
import org.apache.spark.unsafe.types.CalendarInterval;
import org.apache.spark.unsafe.types.ByteArray;

// Hashing and utilities
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.bitset.BitSetMethods;
import org.apache.spark.unsafe.UnsafeAlignedOffset;
import org.apache.spark.sql.catalyst.expressions.HiveHasher;
import org.apache.spark.sql.catalyst.util.DateTimeConstants;

Basic Usage

import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.memory.*;
import org.apache.spark.unsafe.types.UTF8String;

// Memory allocation example
MemoryAllocator allocator = MemoryAllocator.HEAP;
MemoryBlock block = allocator.allocate(1024);

// Direct memory operations
Platform.putLong(block.getBaseObject(), block.getBaseOffset(), 42L);
long value = Platform.getLong(block.getBaseObject(), block.getBaseOffset());

// UTF8String operations
UTF8String str = UTF8String.fromString("Hello Spark");
UTF8String upper = str.toUpperCase();
int length = str.numChars();

// Clean up
allocator.free(block);

Architecture

Apache Spark Unsafe is built around several key components:

  • Memory Management: Allocation strategies for heap and off-heap memory with automatic pooling
  • Platform Layer: Low-level unsafe operations that provide direct memory access across different platforms
  • Data Structures: Memory-efficient arrays, strings, and bitsets optimized for Spark's use cases
  • Type System: Specialized types like UTF8String and CalendarInterval designed for internal Spark operations
  • Hashing: Fast hashing implementations for data distribution and partitioning
  • Safety Features: Debug modes and alignment handling for platform-specific memory requirements

Capabilities

Memory Management

Core memory allocation and management functionality providing both heap and off-heap memory strategies with automatic pooling and debug support.

interface MemoryAllocator {
    MemoryBlock allocate(long size) throws OutOfMemoryError;
    void free(MemoryBlock memory);
    
    static final MemoryAllocator HEAP = new HeapMemoryAllocator();
    static final MemoryAllocator UNSAFE = new UnsafeMemoryAllocator();
}

class MemoryBlock extends MemoryLocation {
    public MemoryBlock(Object obj, long offset, long length);
    public long size();
    public void fill(byte value);
    public static MemoryBlock fromLongArray(long[] array);
}

See Memory Management (memory-management.md) for the full API.

Platform Operations

Low-level unsafe memory operations and platform-specific functionality for direct memory access, copying, and platform feature detection.

final class Platform {
    // Memory access methods for all primitive types
    public static int getInt(Object object, long offset);
    public static void putInt(Object object, long offset, int value);
    public static long getLong(Object object, long offset);
    public static void putLong(Object object, long offset, long value);
    public static byte getByte(Object object, long offset);
    public static void putByte(Object object, long offset, byte value);
    public static short getShort(Object object, long offset);
    public static void putShort(Object object, long offset, short value);
    public static float getFloat(Object object, long offset);
    public static void putFloat(Object object, long offset, float value);
    public static double getDouble(Object object, long offset);
    public static void putDouble(Object object, long offset, double value);
    public static boolean getBoolean(Object object, long offset);
    public static void putBoolean(Object object, long offset, boolean value);
    
    // Volatile operations
    public static Object getObjectVolatile(Object object, long offset);
    public static void putObjectVolatile(Object object, long offset, Object value);
    
    // Memory allocation and management
    public static long allocateMemory(long size);
    public static void freeMemory(long address);
    public static long reallocateMemory(long address, long oldSize, long newSize);
    public static void copyMemory(Object src, long srcOffset, Object dst, long dstOffset, long length);
    public static void setMemory(Object object, long offset, long size, byte value);
    public static void setMemory(long address, byte value, long size);
    
    // Platform detection and utilities
    public static boolean unaligned();
    public static boolean cleanerCreateMethodIsDefined();
    public static void throwException(Throwable t);
    public static ByteBuffer allocateDirectBuffer(int size);
}

See Platform Operations (platform-operations.md) for the full API.

UTF8 String Operations

Memory-efficient UTF-8 string implementation with comprehensive string manipulation, parsing, and comparison operations optimized for Spark's internal use.

final class UTF8String implements Comparable<UTF8String>, Externalizable, KryoSerializable, Cloneable {
    // Factory methods
    public static UTF8String fromString(String str);
    public static UTF8String fromBytes(byte[] bytes);
    public static UTF8String concat(UTF8String... inputs);
    
    // String operations
    public UTF8String substring(int start, int until);
    public UTF8String toUpperCase();
    public UTF8String trim();
    public boolean contains(UTF8String substring);
    
    // Parsing
    public long toLongExact();
    public int toIntExact();
}

See UTF8 String Operations (utf8-string-operations.md) for the full API.

Array Operations

Efficient array implementations and utility methods for byte arrays and long arrays with support for both heap and off-heap memory storage.

final class LongArray {
    public LongArray(MemoryBlock memory);
    public long get(int index);
    public void set(int index, long value);
    public long size();
    public void zeroOut();
}

class ByteArrayMethods {
    public static boolean arrayEquals(Object leftBase, long leftOffset, Object rightBase, long rightOffset, long length);
    public static boolean contains(byte[] arr, byte[] sub);
    public static long nextPowerOf2(long num);
}

See Array Operations (array-operations.md) for the full API.

Hashing and Utilities

High-performance hashing implementations and utility classes including Murmur3 hashing, bitset operations, date/time constants, and platform alignment utilities.

final class Murmur3_x86_32 {
    public Murmur3_x86_32(int seed);
    public static int hashInt(int input, int seed);
    public static int hashLong(long input, int seed);
    public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed);
}

final class BitSetMethods {
    public static void set(Object baseObject, long baseOffset, int index);
    public static boolean isSet(Object baseObject, long baseOffset, int index);
    public static int nextSetBit(Object baseObject, long baseOffset, int fromIndex, int bitsetSizeInWords);
}

class HiveHasher {
    public static int hashInt(int input);
    public static int hashLong(long input);
    public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes);
}

class DateTimeConstants {
    public static final int MONTHS_PER_YEAR = 12;
    public static final long HOURS_PER_DAY = 24L;
    public static final long SECONDS_PER_MINUTE = 60L;
    public static final long MILLIS_PER_SECOND = 1000L;
    public static final long MICROS_PER_MILLIS = 1000L;
    public static final long NANOS_PER_MICROS = 1000L;
}

class UnsafeAlignedOffset {
    public static int getUaoSize();
    public static int getSize(Object object, long offset);
    public static void putSize(Object object, long offset, int value);
}

See Hashing and Utilities (hashing-utilities.md) for the full API.

Types

Core Memory Types

class MemoryLocation {
    public MemoryLocation(Object obj, long offset);
    public final Object getBaseObject();
    public final long getBaseOffset();
}

abstract class KVIterator<K, V> {
    public abstract boolean next() throws IOException;
    public abstract K getKey();
    public abstract V getValue();
    public abstract void close();
}

Specialized Data Types

final class CalendarInterval implements Serializable {
    public final int months;
    public final int days;
    public final long microseconds;
    
    public CalendarInterval(int months, int days, long microseconds);
}

class UTF8StringBuilder {
    public UTF8StringBuilder();
    public UTF8StringBuilder(int initialSize);
    public void append(UTF8String value);
    public UTF8String build();
}

final class ByteArray {
    public static final byte[] EMPTY_BYTE;
    public static void writeToMemory(byte[] src, Object target, long targetOffset);
    public static long getPrefix(byte[] bytes);
    public static int compareBinary(byte[] leftBase, byte[] rightBase);
    public static byte[] subStringSQL(byte[] bytes, int pos, int len);
    public static byte[] concat(byte[]... inputs);
}

Error Handling

The library throws standard Java exceptions:

  • OutOfMemoryError - When memory allocation fails
  • IOException - During iterator operations
  • NumberFormatException - During string parsing operations
  • IndexOutOfBoundsException - For array access violations

Most low-level operations deliberately skip bounds checking for performance reasons, so an invalid offset or length can corrupt memory or crash the JVM instead of throwing an exception. Users should validate inputs before calling unsafe operations.

Performance Considerations

  • Zero-Copy Operations: Many string and array operations avoid memory copying
  • Direct Memory Access: Bypasses JVM safety checks for maximum performance
  • Memory Pooling: Automatic pooling for large heap allocations
  • Platform Optimization: Platform-specific optimizations for memory alignment
  • Lazy Evaluation: Some operations use lazy evaluation patterns for efficiency