Apache Spark - Unified analytics engine for large-scale data processing
```bash
npx @tessl/cli install tessl/maven-org-apache-spark--spark-parent-2-12@3.5.0
```

Apache Spark is a unified analytics engine for large-scale data processing that provides high-level programming APIs in Scala, Java, Python, and R, backed by an optimized execution engine that supports general directed acyclic graphs (DAGs) of computation for data analysis.
Maven (Scala/Java):

```
org.apache.spark:spark-core_2.12:3.5.6
```

Python:

```bash
pip install pyspark==3.5.6
```

R:

```r
install.packages("SparkR")
```

For Scala:

```scala
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.rdd.RDD
```

For Java:

```java
import org.apache.spark.SparkContext;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
```

For Python:

```python
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, DataFrame, Column
from pyspark.rdd import RDD
```

Common PySpark imports:

```python
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
```

Scala example:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
// Create SparkSession (modern approach)
val spark = SparkSession.builder()
  .appName("MyApp")
  .master("local[*]")
  .getOrCreate()

// Needed for the $"col" column syntax below
import spark.implicits._

// Working with DataFrames
val df = spark.read
  .option("header", "true")
  .csv("data.csv")

df.select("name", "age")
  .filter($"age" > 21)
  .show()

// Working with RDDs (low-level API)
val sc = spark.sparkContext
val data = sc.parallelize(1 to 1000)
val result = data
  .map(_ * 2)
  .filter(_ > 100)
  .collect()

spark.stop()
```

Python equivalent:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Create SparkSession
spark = SparkSession.builder \
    .appName("MyApp") \
    .master("local[*]") \
    .getOrCreate()

# Working with DataFrames
df = spark.read \
    .option("header", "true") \
    .csv("data.csv")

df.select("name", "age") \
    .filter(F.col("age") > 21) \
    .show()

# Working with RDDs
sc = spark.sparkContext
data = sc.parallelize(range(1, 1001))
result = data \
    .map(lambda x: x * 2) \
    .filter(lambda x: x > 100) \
    .collect()

spark.stop()
```

Apache Spark is built around several key components:
Spark Core: distributed data processing built on Resilient Distributed Datasets (RDDs) and the fundamental Spark execution engine. It provides fault-tolerant, parallel data structures and transformations.
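As a quick sketch of the core API (assuming a local pyspark installation; reduceByKey is a standard RDD method not listed in the summary below):

```python
from pyspark.sql import SparkSession

# Local SparkSession; the low-level SparkContext is reached via spark.sparkContext
spark = SparkSession.builder.appName("core-sketch").master("local[*]").getOrCreate()
sc = spark.sparkContext

# Broadcast a small lookup table to all executors
lookup = sc.broadcast({"a": 1, "b": 2, "c": 3})

# Build an RDD, transform it, and bring the result back to the driver
rdd = sc.parallelize(["a", "b", "a", "c"])
pairs = rdd.map(lambda k: (k, lookup.value[k]))
totals = pairs.reduceByKey(lambda x, y: x + y).collect()
print(sorted(totals))  # [('a', 2), ('b', 2), ('c', 3)]

spark.stop()
```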
```scala
class SparkContext(config: SparkConf) {
  def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T]
  def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String]
  def broadcast[T: ClassTag](value: T): Broadcast[T]
  def stop(): Unit
}

abstract class RDD[T: ClassTag] {
  def map[U: ClassTag](f: T => U): RDD[U]
  def filter(f: T => Boolean): RDD[T]
  def collect(): Array[T]
  def count(): Long
  def cache(): RDD[T]
}
```

Python API:

```python
class SparkContext:
    def __init__(self, conf: SparkConf = None)
    def parallelize(self, c: Iterable, numSlices: int = None) -> RDD
    def textFile(self, name: str, minPartitions: int = None) -> RDD
    def broadcast(self, value: Any) -> Broadcast
    def stop(self) -> None

class RDD:
    def map(self, f: Callable) -> RDD
    def filter(self, f: Callable) -> RDD
    def collect(self) -> List
    def count(self) -> int
    def cache(self) -> RDD
```

Spark SQL provides high-level APIs for working with structured data using DataFrames and Datasets. Its Catalyst optimizer performs query optimization and code generation.
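A minimal PySpark sketch of the DataFrame and SQL APIs (the view, table, and column names below are made up for illustration):

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("sql-sketch").master("local[*]").getOrCreate()

# Build a small DataFrame in memory
df = spark.createDataFrame(
    [("alice", 34), ("bob", 45), ("carol", 29)],
    ["name", "age"],
)

# Register it as a temporary view and query it with SQL
df.createOrReplaceTempView("people")
spark.sql("SELECT name, age FROM people WHERE age > 30").show()

# Equivalent DataFrame-API query with an aggregation
df.filter(F.col("age") > 30).groupBy().agg(F.avg("age").alias("avg_age")).show()

spark.stop()
```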
```scala
class SparkSession {
  def read: DataFrameReader
  def sql(sqlText: String): DataFrame
  def table(tableName: String): DataFrame
  def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame
  def stop(): Unit
}

class Dataset[T] {
  def select(cols: Column*): DataFrame
  def filter(condition: Column): Dataset[T]
  def groupBy(cols: Column*): RelationalGroupedDataset
  def join(right: Dataset[_]): DataFrame
  def write: DataFrameWriter[T]
  def collect(): Array[T]
  def show(numRows: Int = 20): Unit
}

type DataFrame = Dataset[Row]
```

Python API:

```python
class SparkSession:
    @property
    def read(self) -> DataFrameReader
    def sql(self, sqlQuery: str) -> DataFrame
    def table(self, tableName: str) -> DataFrame
    def createDataFrame(self, data: List, schema: Optional[Union[List, StructType]] = None) -> DataFrame
    def stop(self) -> None

class DataFrame:
    def select(self, *cols: Union[str, Column]) -> DataFrame
    def filter(self, condition: Union[str, Column]) -> DataFrame
    def where(self, condition: Union[str, Column]) -> DataFrame
    def groupBy(self, *cols: Union[str, Column]) -> GroupedData
    def join(self, other: DataFrame, on: Optional[Union[str, List[str], Column]] = None) -> DataFrame
    def collect(self) -> List[Row]
    def show(self, n: int = 20, truncate: bool = True) -> None
```

MLlib is Spark's machine learning library, with algorithms for classification, regression, clustering, and collaborative filtering. It provides both the high-level, DataFrame-based Pipeline API and the older low-level RDD-based API.
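A small PySpark sketch of the Pipeline API (the toy data and column names are made up for illustration):

```python
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.appName("ml-sketch").master("local[*]").getOrCreate()

# Toy training data: two numeric features and a binary label
train = spark.createDataFrame(
    [(0.0, 1.1, 0.0), (2.0, 1.0, 1.0), (2.0, 1.3, 1.0), (0.0, 1.2, 0.0)],
    ["f1", "f2", "label"],
)

# Feature assembly (Transformer) followed by a classifier (Estimator)
assembler = VectorAssembler(inputCols=["f1", "f2"], outputCol="features")
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# fit() returns a PipelineModel; transform() adds prediction columns
model = Pipeline(stages=[assembler, lr]).fit(train)
model.transform(train).select("label", "prediction").show()

spark.stop()
```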
```scala
class Pipeline extends Estimator[PipelineModel] {
  def setStages(value: Array[PipelineStage]): Pipeline
  def fit(dataset: Dataset[_]): PipelineModel
}

abstract class Estimator[M <: Model[M]] extends PipelineStage {
  def fit(dataset: Dataset[_]): M
}

abstract class Transformer extends PipelineStage {
  def transform(dataset: Dataset[_]): DataFrame
}
```

Python API:

```python
class Pipeline(Estimator):
    def setStages(self, value: List[PipelineStage]) -> Pipeline
    def fit(self, dataset: DataFrame) -> PipelineModel

class Estimator(PipelineStage):
    def fit(self, dataset: DataFrame) -> Model

class Transformer(PipelineStage):
    def transform(self, dataset: DataFrame) -> DataFrame

class PipelineModel(Model):
    def transform(self, dataset: DataFrame) -> DataFrame
```

GraphX provides APIs for graphs and graph-parallel computation, with fundamental operators such as subgraph, joinVertices, and aggregateMessages, plus optimized variants of common graph algorithms. Note: GraphX is only available in Scala and Java; there is no Python API.

```scala
abstract class Graph[VD: ClassTag, ED: ClassTag] {
  val vertices: VertexRDD[VD]
  val edges: EdgeRDD[ED]
  def mapVertices[VD2: ClassTag](map: (VertexId, VD) => VD2): Graph[VD2, ED]
  def aggregateMessages[A: ClassTag](
      sendMsg: EdgeContext[VD, ED, A] => Unit,
      mergeMsg: (A, A) => A
  ): VertexRDD[A]
}

type VertexId = Long
```

Python Alternative: use the GraphFrames library (pip install graphframes) for graph processing in Python with Spark DataFrames.
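A hedged sketch of that GraphFrames alternative. Note that pip install graphframes only provides the Python wrapper; the matching JVM package must also be available to Spark (for example via --packages), and the package coordinate in the comment below is an assumption that should be matched to your Spark build:

```python
# Launch with the GraphFrames JVM package, e.g. (coordinate is an assumption):
#   pyspark --packages graphframes:graphframes:0.8.3-spark3.5-s_2.12
from pyspark.sql import SparkSession
from graphframes import GraphFrame

spark = SparkSession.builder.appName("graph-sketch").master("local[*]").getOrCreate()

# Vertices need an "id" column; edges need "src" and "dst" columns
vertices = spark.createDataFrame(
    [("a", "Alice"), ("b", "Bob"), ("c", "Carol")], ["id", "name"])
edges = spark.createDataFrame(
    [("a", "b", "follows"), ("b", "c", "follows"), ("c", "a", "follows")],
    ["src", "dst", "relationship"])

g = GraphFrame(vertices, edges)
g.inDegrees.show()

# PageRank returns a new GraphFrame with a "pagerank" column on the vertices
ranks = g.pageRank(resetProbability=0.15, maxIter=5)
ranks.vertices.select("id", "pagerank").show()

spark.stop()
```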
Spark supports stream processing through two APIs. Structured Streaming, built on the Spark SQL engine, provides real-time stream processing with exactly-once fault-tolerance guarantees and integrates seamlessly with batch processing. The classic DStream-based Spark Streaming API, shown below, processes data as micro-batches of RDDs.

```scala
class StreamingContext(conf: SparkConf, batchDuration: Duration) {
  def socketTextStream(hostname: String, port: Int): ReceiverInputDStream[String]
  def textFileStream(directory: String): DStream[String]
  def start(): Unit
  def stop(): Unit
  def awaitTermination(): Unit
}

abstract class DStream[T: ClassTag] {
  def map[U: ClassTag](mapFunc: T => U): DStream[U]
  def filter(filterFunc: T => Boolean): DStream[T]
  def window(windowDuration: Duration, slideDuration: Duration): DStream[T]
  def foreachRDD(foreachFunc: RDD[T] => Unit): Unit
}
```

Python API:

```python
class StreamingContext:
    def __init__(self, sparkContext: SparkContext, batchDuration: float)
    def socketTextStream(self, hostname: str, port: int) -> DStream
    def textFileStream(self, directory: str) -> DStream
    def start(self) -> None
    def stop(self, stopSparkContext: bool = True, stopGracefully: bool = False) -> None
    def awaitTermination(self, timeout: Optional[float] = None) -> None

class DStream:
    def map(self, f: Callable) -> DStream
    def filter(self, f: Callable) -> DStream
    def window(self, windowDuration: float, slideDuration: Optional[float] = None) -> DStream
    def foreachRDD(self, func: Callable[[RDD], None]) -> None
```

For modern streaming applications, prefer Structured Streaming via SparkSession.readStream, which provides better fault tolerance and performance.
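A self-contained Structured Streaming sketch using the built-in rate source (the trigger interval and timeout are arbitrary choices):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("stream-sketch").master("local[*]").getOrCreate()

# The "rate" source generates (timestamp, value) rows for testing
stream = spark.readStream.format("rate").option("rowsPerSecond", 5).load()
doubled = stream.selectExpr("timestamp", "value * 2 AS doubled")

# Write each micro-batch to the console
query = (doubled.writeStream
         .format("console")
         .outputMode("append")
         .trigger(processingTime="5 seconds")
         .start())

query.awaitTermination(20)  # run for roughly 20 seconds, then shut down
query.stop()
spark.stop()
```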
Launcher and configuration: programmatic interfaces for configuring, launching, monitoring, and managing Spark applications across different cluster managers and deployment modes.

```scala
class SparkConf(loadDefaults: Boolean = true) {
  def set(key: String, value: String): SparkConf
  def setMaster(master: String): SparkConf
  def setAppName(name: String): SparkConf
  def get(key: String): String
}
```

Java API:

```java
public class SparkLauncher {
    public SparkLauncher setAppName(String appName);
    public SparkLauncher setMaster(String master);
    public SparkLauncher setMainClass(String mainClass);
    public Process launch() throws IOException;
    public SparkAppHandle startApplication() throws IOException;
}
```

Python API:

```python
class SparkConf:
    def __init__(self, loadDefaults: bool = True)
    def set(self, key: str, value: str) -> SparkConf
    def setMaster(self, value: str) -> SparkConf
    def setAppName(self, value: str) -> SparkConf
    def get(self, key: str, defaultValue: Optional[str] = None) -> str
    def getAll(self) -> List[Tuple[str, str]]
```

Note: Python does not have a direct equivalent to SparkLauncher. Use the spark-submit command-line tool or SparkSession.builder for application management.
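A small sketch of configuring an application from Python with SparkConf and SparkSession.builder (the configuration key shown is just an example):

```python
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build a configuration object and hand it to the session builder
conf = (SparkConf()
        .setAppName("conf-sketch")
        .setMaster("local[2]")
        .set("spark.sql.shuffle.partitions", "4"))

spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(spark.sparkContext.getConf().get("spark.sql.shuffle.partitions"))
spark.stop()
```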
Python-specific Spark capabilities including pandas API compatibility, type hints, and Python-optimized operations.

```python
# Pandas API on Spark (pyspark.pandas)
import pyspark.pandas as ps
class DataFrame: # pandas-compatible DataFrame
    def head(self, n: int = 5) -> DataFrame
    def describe(self) -> DataFrame
    def groupby(self, by: Union[str, List[str]]) -> GroupBy
    def merge(self, right: DataFrame, on: str = None) -> DataFrame
# SQL Functions
from pyspark.sql import functions as F
# Common functions (473+ available)
def col(colName: str) -> Column
def lit(col: Any) -> Column
def when(condition: Column, value: Any) -> Column
def coalesce(*cols: Column) -> Column
def concat(*cols: Column) -> Column
def regexp_replace(str: Column, pattern: str, replacement: str) -> Column
```

Key Python Features:
- pyspark.pandas provides pandas-compatible DataFrame operations
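A brief sketch of the pandas API on Spark (it assumes pandas and PyArrow are installed alongside pyspark; DataFrame.pandas_api() requires Spark 3.2+):

```python
import pyspark.pandas as ps

# pandas-style construction and indexing, executed on Spark under the hood
psdf = ps.DataFrame({"name": ["alice", "bob", "carol"], "age": [34, 45, 29]})
print(psdf.head(2))
print(psdf[psdf["age"] > 30])

# Convert to and from a regular Spark DataFrame
sdf = psdf.to_spark()
psdf_again = sdf.pandas_api()
```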