Apache Spark is a unified analytics engine for large-scale data processing with high-level APIs in Scala, Java, Python, and R.
npx @tessl/cli install tessl/maven-org-apache-spark--spark-parent-2-13@4.0.0

Apache Spark is a unified analytics engine for large-scale data processing. It provides high-level APIs in Scala, Java, Python, and R (deprecated), and an optimized engine that supports general computation graphs for data analysis. Spark includes specialized tools for SQL and DataFrames, machine learning (MLlib), graph processing (GraphX), and stream processing.
Apache Spark is a unified analytics engine for large-scale data processing with high-level APIs in Scala, Java, Python, and R (deprecated). Key features include general distributed computation with RDDs, structured data processing with Spark SQL and DataFrames, machine learning with MLlib, graph processing with GraphX, and stream processing with Structured Streaming.
org.apache.spark:spark-core_2.13:4.0.0

For Scala applications:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.rdd.RDD

For Java applications:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

RDD example (Scala):

import org.apache.spark.{SparkConf, SparkContext}
// Create Spark configuration and context
val conf = new SparkConf().setAppName("MyApp").setMaster("local[*]")
val sc = new SparkContext(conf)
// Create RDD from collection
val data = Array(1, 2, 3, 4, 5)
val rdd = sc.parallelize(data)
// Transform and collect results
val result = rdd.map(_ * 2).filter(_ > 5).collect()
sc.stop()

DataFrame and SQL example (Scala):

import org.apache.spark.sql.SparkSession
// Create Spark session
val spark = SparkSession.builder()
.appName("MyApp")
.master("local[*]")
.getOrCreate()
// Read data into DataFrame
val df = spark.read.json("path/to/data.json")
// SQL operations
df.select("name", "age")
.filter(df("age") > 21)
.show()
// SQL queries
df.createOrReplaceTempView("people")
val adults = spark.sql("SELECT name FROM people WHERE age >= 18")
spark.stop()

Apache Spark follows a driver-executor architecture: a driver program creates a SparkContext (or SparkSession) and coordinates work that runs in executor processes scheduled by a cluster manager.
Key architectural components:
Spark Core provides the fundamental distributed computing capabilities with RDDs, transformations, actions, and distributed variables (broadcast variables and accumulators).
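A minimal sketch of combining an RDD transformation with a broadcast variable (the app name, data, and lookup table are illustrative; the relevant interfaces are listed below):

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setAppName("CoreSketch").setMaster("local[*]")
val sc = new SparkContext(conf)

// Ship a small lookup table to every executor once instead of with each task
val countryNames = sc.broadcast(Map("DE" -> "Germany", "FR" -> "France"))

val codes = sc.parallelize(Seq("DE", "FR", "DE"))
val resolved = codes.map(code => countryNames.value.getOrElse(code, "unknown")).collect()
// resolved: Array(Germany, France, Germany)

sc.stop()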
class SparkContext(config: SparkConf) {
def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T]
def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String]
def broadcast[T](value: T): Broadcast[T]
def longAccumulator(): LongAccumulator
def stop(): Unit
}
abstract class RDD[T: ClassTag] {
def map[U: ClassTag](f: T => U): RDD[U]
def filter(f: T => Boolean): RDD[T]
def collect(): Array[T]
def count(): Long
def reduce(f: (T, T) => T): T
def cache(): RDD[T]
}

Spark SQL provides structured data processing with SQL queries, DataFrames, and type-safe Datasets, and includes data source connectors and streaming capabilities.
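A short sketch of building a DataFrame from local case-class data and aggregating it (the Person class and column names are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

case class Person(name: String, age: Int, city: String)

val spark = SparkSession.builder().appName("SqlSketch").master("local[*]").getOrCreate()
import spark.implicits._

// Build a DataFrame from a local collection of case classes
val people = spark.createDataFrame(Seq(
  Person("Alice", 29, "Berlin"),
  Person("Bob", 35, "Paris")
))

// Untyped DataFrame aggregation
people.groupBy($"city").agg(avg($"age").alias("avg_age")).show()

spark.stop()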
object SparkSession {
def builder(): Builder
}
class SparkSession {
def read: DataFrameReader
def sql(sqlText: String): DataFrame
def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame
def stop(): Unit
}
abstract class Dataset[T] {
def select(cols: Column*): DataFrame
def filter(condition: Column): Dataset[T]
def groupBy(cols: Column*): RelationalGroupedDataset
def join(right: Dataset[_]): DataFrame
def collect(): Array[T]
def show(numRows: Int = 20): Unit
}

MLlib provides scalable machine learning algorithms and utilities, including both RDD-based (spark.mllib) and DataFrame-based (spark.ml) APIs.
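A sketch of the DataFrame-based Pipeline API, chaining feature transformers and an estimator; training and test are assumed to be existing DataFrames with text and label columns:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

// training and test are assumed DataFrames with "text" and "label" columns
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
val lr = new LogisticRegression().setMaxIter(10)

// Pipeline is an Estimator: fit() produces a PipelineModel, which is a Transformer
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
val model = pipeline.fit(training)
val predictions = model.transform(test)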
// DataFrame-based ML Pipeline API
abstract class Estimator[M <: Model[M]] extends PipelineStage {
def fit(dataset: Dataset[_]): M
}
abstract class Transformer extends PipelineStage {
def transform(dataset: Dataset[_]): DataFrame
}
class Pipeline(stages: Array[PipelineStage]) extends Estimator[PipelineModel] {
def fit(dataset: Dataset[_]): PipelineModel
}

GraphX provides large-scale graph processing, including graph algorithms and graph-parallel computation.
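A sketch of loading a graph from an edge list file and running a built-in algorithm (the path is illustrative; sc is an existing SparkContext):

import org.apache.spark.graphx.GraphLoader

// Each line of the input file is expected to be "srcId dstId"
val graph = GraphLoader.edgeListFile(sc, "data/graphx/followers.txt")

// Run PageRank until it converges to the given tolerance
val ranks = graph.pageRank(0.0001).vertices

// Show the five highest-ranked vertices
ranks.sortBy(_._2, ascending = false).take(5).foreach(println)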
abstract class Graph[VD: ClassTag, ED: ClassTag] {
def vertices: VertexRDD[VD]
def edges: EdgeRDD[ED]
def mapVertices[VD2: ClassTag](map: (VertexId, VD) => VD2): Graph[VD2, ED]
def subgraph(epred: EdgeTriplet[VD, ED] => Boolean = _ => true,
vpred: (VertexId, VD) => Boolean = (_, _) => true): Graph[VD, ED]
}
object GraphLoader {
def edgeListFile(sc: SparkContext, path: String): Graph[Int, Int]
}

Spark supports real-time data processing with both the legacy DStream API and the modern Structured Streaming API.
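A minimal Structured Streaming sketch using the built-in rate source and the console sink (the option values are illustrative):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("StreamSketch").master("local[*]").getOrCreate()

// The rate source generates rows with timestamp and value columns
val stream = spark.readStream.format("rate").option("rowsPerSecond", "5").load()

// start() on the writer returns a StreamingQuery handle
val query = stream.writeStream.format("console").start()
query.awaitTermination()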
class DataStreamReader {
def format(source: String): DataStreamReader
def option(key: String, value: String): DataStreamReader
def load(): DataFrame
}
class DataStreamWriter[T] {
def format(source: String): DataStreamWriter[T]
def option(key: String, value: String): DataStreamWriter[T]
def start(): StreamingQuery
}
abstract class StreamingQuery {
def stop(): Unit
def awaitTermination(): Unit
def isActive: Boolean
}

class SparkConf(loadDefaults: Boolean = true) {
def set(key: String, value: String): SparkConf
def setAppName(name: String): SparkConf
def setMaster(master: String): SparkConf
def get(key: String): String
}
abstract class Broadcast[T] extends Serializable {
def value: T
def unpersist(): Unit
def destroy(): Unit
}
abstract class AccumulatorV2[IN, OUT] extends Serializable {
def add(v: IN): Unit
def value: OUT
def reset(): Unit
}
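A sketch of counting malformed records with a LongAccumulator while an RDD is processed (sc is an existing SparkContext; the data is illustrative):

val badRecords = sc.longAccumulator

val lines = sc.parallelize(Seq("1", "2", "oops", "4"))
val numbers = lines.flatMap { s =>
  try {
    Some(s.toInt)
  } catch {
    case _: NumberFormatException =>
      // Updates made in transformations may be re-applied if a task is retried,
      // so treat this count as a best-effort metric
      badRecords.add(1)
      None
  }
}

numbers.count()           // action triggers the computation
println(badRecords.value) // 1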
object StorageLevel {
val MEMORY_ONLY: StorageLevel
val MEMORY_AND_DISK: StorageLevel
val DISK_ONLY: StorageLevel
}

trait Row extends Serializable {
def get(i: Int): Any
def getString(i: Int): String
def getInt(i: Int): Int
def getDouble(i: Int): Double
def isNullAt(i: Int): Boolean
def size: Int
}
class Column(expr: Expression) {
def ===(other: Any): Column
def =!=(other: Any): Column
def >(other: Any): Column
def <(other: Any): Column
def isNull: Column
def cast(to: DataType): Column
def alias(alias: String): Column
}
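A sketch of composing Column expressions with the operators above; df is assumed to be an existing DataFrame with name and age columns:

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType

val adults = df
  .filter(col("age") > 17 && !col("age").isNull)
  .select(col("name").alias("adult_name"), col("age").cast(StringType).alias("age_text"))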
class StructType(fields: Array[StructField]) extends DataType {
def add(field: StructField): StructType
def add(name: String, dataType: DataType): StructType
def fieldNames: Array[String]
}

type VertexId = Long
case class Edge[ED](srcId: VertexId, dstId: VertexId, attr: ED)
class EdgeTriplet[VD, ED] extends Edge[ED] {
def srcAttr: VD
def dstAttr: VD
}
abstract class VertexRDD[VD] extends RDD[(VertexId, VD)]
abstract class EdgeRDD[ED] extends RDD[Edge[ED]]
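A sketch of assembling a property graph directly from vertex and edge RDDs using the types above (sc is an existing SparkContext; the attribute values are illustrative):

import org.apache.spark.graphx.{Edge, Graph}

val users = sc.parallelize(Seq((1L, "alice"), (2L, "bob")))
val follows = sc.parallelize(Seq(Edge(1L, 2L, "follows")))

val graph = Graph(users, follows)

// Triplets expose source and destination attributes together with the edge attribute
graph.triplets.collect().foreach(t => println(s"${t.srcAttr} ${t.attr} ${t.dstAttr}"))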