Apache Spark is a unified analytics engine for large-scale data processing
npx @tessl/cli install tessl/maven-org-apache-spark--spark-parent-212@3.5.0

Apache Spark is a unified analytics engine for large-scale data processing that provides high-level APIs in Scala, Java, Python, and R, along with an optimized engine supporting general computation graphs. Spark includes specialized components for SQL and DataFrame processing, machine learning, graph processing, and real-time stream processing.
For Scala applications:
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.rdd.RDD

For Java applications:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

Maven dependency:
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_2.13</artifactId>
  <version>3.5.6</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.13</artifactId>
  <version>3.5.6</version>
</dependency>

Quick start:

import org.apache.spark.sql.SparkSession
// Create SparkSession (entry point for DataFrame and SQL APIs)
val spark = SparkSession.builder()
  .appName("MySparkApp")
  .master("local[*]")
  .getOrCreate()
// Enable the $"column" syntax used below
import spark.implicits._

// Create DataFrame from data
val df = spark.createDataFrame(Seq(
  ("Alice", 25),
  ("Bob", 30),
  ("Charlie", 35)
)).toDF("name", "age")

// Run SQL queries
df.createOrReplaceTempView("people")
val adults = spark.sql("SELECT * FROM people WHERE age >= 30")
adults.show()

// DataFrame transformations
val result = df.filter($"age" > 25)
  .select($"name", $"age")
  .orderBy($"age".desc)
result.collect()

spark.stop()

Apache Spark consists of several key components:
Spark Core provides fundamental distributed computing capabilities with Resilient Distributed Datasets (RDDs).
class SparkContext(config: SparkConf) {
  def parallelize[T](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T]
  def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String]
  def stop(): Unit
}

abstract class RDD[T] {
  def map[U](f: T => U): RDD[U]
  def filter(f: T => Boolean): RDD[T]
  def collect(): Array[T]
  def count(): Long
  def cache(): this.type
}
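A minimal RDD sketch (word count over an in-memory collection); the app name and master URL are placeholders:

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setAppName("RDDExample").setMaster("local[*]")
val sc = new SparkContext(conf)

val lines = sc.parallelize(Seq("spark makes clusters easy", "spark scales out"))
val counts = lines
  .flatMap(_.split(" "))   // split each line into words
  .map(word => (word, 1))  // pair each word with a count of 1
  .reduceByKey(_ + _)      // sum counts per word
counts.collect().foreach(println)

sc.stop()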
Spark SQL provides high-level APIs for working with structured data, including DataFrames, Datasets, and SQL.

class SparkSession {
  def sql(sqlText: String): DataFrame
  def read: DataFrameReader
  def createDataFrame[A <: Product](rdd: RDD[A]): DataFrame
}

class Dataset[T] {
  def select(cols: Column*): DataFrame
  def filter(condition: Column): Dataset[T]
  def groupBy(cols: Column*): RelationalGroupedDataset
  def join(right: Dataset[_], joinExprs: Column): DataFrame
  def show(): Unit
  def collect(): Array[T]
}
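A hedged sketch of the Dataset operations outlined above (groupBy/agg and join on in-memory data; all column and app names are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().appName("SQLOpsExample").master("local[*]").getOrCreate()
import spark.implicits._

val people = Seq(("Alice", "NYC", 25), ("Bob", "SF", 30), ("Carol", "NYC", 35))
  .toDF("name", "city", "age")
val cities = Seq(("NYC", "New York"), ("SF", "San Francisco")).toDF("city", "full_name")

// Average age per city, joined with the city lookup table
val avgAge = people.groupBy($"city").agg(avg($"age").as("avg_age"))
avgAge.join(cities, avgAge("city") === cities("city")).show()

spark.stop()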
MLlib is a comprehensive machine learning library with algorithms, feature engineering, and model evaluation.

class Pipeline extends Estimator[PipelineModel] {
  def setStages(value: Array[PipelineStage]): Pipeline
  def fit(dataset: Dataset[_]): PipelineModel
}

abstract class Estimator[M <: Model[M]] {
  def fit(dataset: Dataset[_]): M
}

abstract class Transformer {
  def transform(dataset: Dataset[_]): DataFrame
}
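A minimal Pipeline sketch (tokenize, hash features, fit a logistic regression); the toy training data is purely illustrative:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("MLExample").master("local[*]").getOrCreate()

val training = spark.createDataFrame(Seq(
  (0L, "spark is great", 1.0),
  (1L, "hadoop mapreduce", 0.0)
)).toDF("id", "text", "label")

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
val lr = new LogisticRegression().setMaxIter(10)

val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
val model = pipeline.fit(training)          // Estimator.fit produces a PipelineModel
val predictions = model.transform(training) // Transformer.transform adds prediction columns
predictions.select("text", "prediction").show()

spark.stop()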
GraphX is a graph-parallel computation framework for processing property graphs.

abstract class Graph[VD, ED] {
  def vertices: VertexRDD[VD]
  def edges: EdgeRDD[ED]
  def mapVertices[VD2](map: (VertexId, VD) => VD2): Graph[VD2, ED]
  def aggregateMessages[A](sendMsg: EdgeContext[VD, ED, A] => Unit,
                           mergeMsg: (A, A) => A): VertexRDD[A]
  def pageRank(tol: Double, resetProb: Double = 0.15): Graph[Double, Double]
}
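A small sketch that builds a property graph and runs PageRank; vertex names and edge labels are placeholders:

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("GraphXExample").setMaster("local[*]"))

val vertices = sc.parallelize(Seq((1L, "Alice"), (2L, "Bob"), (3L, "Charlie")))
val edges = sc.parallelize(Seq(Edge(1L, 2L, "follows"), Edge(2L, 3L, "follows"), Edge(3L, 1L, "follows")))

val graph = Graph(vertices, edges)
val ranks = graph.pageRank(tol = 0.001).vertices  // iterate until ranks converge within tol
ranks.collect().foreach { case (id, rank) => println(s"$id -> $rank") }

sc.stop()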
Spark Streaming provides real-time stream processing capabilities for continuous data processing.

class StreamingContext(sparkContext: SparkContext, batchDuration: Duration) {
  def socketTextStream(hostname: String, port: Int): ReceiverInputDStream[String]
  def textFileStream(directory: String): DStream[String]
  def start(): Unit
  def awaitTermination(): Unit
}

abstract class DStream[T] {
  def map[U](mapFunc: T => U): DStream[U]
  def filter(filterFunc: T => Boolean): DStream[T]
  def window(windowDuration: Duration, slideDuration: Duration): DStream[T]
  def print(): Unit
}
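A hedged streaming sketch: windowed word counts over a socket stream. It assumes a text server on localhost:9999 and uses local[2] so a thread is free for the receiver.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setAppName("StreamingExample").setMaster("local[2]")
val ssc = new StreamingContext(conf, Seconds(1))

val lines = ssc.socketTextStream("localhost", 9999)
val counts = lines
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .window(Seconds(30), Seconds(10))  // 30s window, sliding every 10s
  .reduceByKey(_ + _)
counts.print()

ssc.start()
ssc.awaitTermination()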
The Spark Launcher library provides programmatic launching of Spark applications with monitoring capabilities.

public class SparkLauncher {
  public SparkLauncher setAppName(String appName);
  public SparkLauncher setMaster(String master);
  public SparkLauncher setMainClass(String mainClass);
  public SparkAppHandle startApplication();
}

public interface SparkAppHandle {
  State getState();
  String getAppId();
  void kill();
}
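A sketch of launching a packaged application and polling its handle (shown from Scala; the jar path and main class are hypothetical placeholders):

import org.apache.spark.launcher.{SparkAppHandle, SparkLauncher}

val handle: SparkAppHandle = new SparkLauncher()
  .setAppResource("/path/to/my-app.jar")  // hypothetical application jar
  .setMainClass("com.example.MyApp")      // hypothetical main class
  .setMaster("local[*]")
  .setAppName("LaunchedApp")
  .startApplication()

// Poll the handle until the application reaches a terminal state
while (!handle.getState.isFinal) {
  Thread.sleep(1000)
}
println(s"Final state: ${handle.getState}")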
Common configuration, broadcast variables, and storage levels:

class SparkConf {
  def set(key: String, value: String): SparkConf
  def setMaster(master: String): SparkConf
  def setAppName(name: String): SparkConf
}

class Broadcast[T] {
  def value: T
  def destroy(): Unit
}

object StorageLevel {
  val MEMORY_ONLY: StorageLevel
  val MEMORY_AND_DISK: StorageLevel
  val MEMORY_ONLY_SER: StorageLevel
  val DISK_ONLY: StorageLevel
}
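A sketch combining configuration, a broadcast variable, and explicit persistence; the config value and lookup table are illustrative:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

val conf = new SparkConf()
  .setAppName("ConfigExample")
  .setMaster("local[*]")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
val sc = new SparkContext(conf)

// Broadcast a small lookup table once instead of shipping it with every task
val countryNames = sc.broadcast(Map("US" -> "United States", "DE" -> "Germany"))

val codes = sc.parallelize(Seq("US", "DE", "US"))
val names = codes.map(code => countryNames.value.getOrElse(code, "unknown"))
names.persist(StorageLevel.MEMORY_AND_DISK)  // keep the result cached across actions
println(names.count())
names.collect().foreach(println)

countryNames.destroy()
sc.stop()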
Spark SQL type and expression primitives:

import org.apache.spark.sql.types._

case class StructType(fields: Array[StructField])
case class StructField(name: String, dataType: DataType, nullable: Boolean = true)

abstract class DataType

object DataTypes {
  val StringType: DataType
  val IntegerType: DataType
  val DoubleType: DataType
  val BooleanType: DataType
  val TimestampType: DataType
}

trait Row {
  def getString(i: Int): String
  def getInt(i: Int): Int
  def getDouble(i: Int): Double
  def getBoolean(i: Int): Boolean
}

class Column {
  def ===(other: Any): Column
  def &&(other: Column): Column
  def ||(other: Column): Column
  def isNull: Column
  def isNotNull: Column
}
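A sketch of building a DataFrame from Rows with an explicit schema and filtering with Column expressions; names and values are illustrative:

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val spark = SparkSession.builder().appName("SchemaExample").master("local[*]").getOrCreate()
import spark.implicits._

val schema = StructType(Array(
  StructField("name", StringType, nullable = false),
  StructField("age", IntegerType, nullable = true)
))
val rows = spark.sparkContext.parallelize(Seq(Row("Alice", 25), Row("Bob", null)))
val df = spark.createDataFrame(rows, schema)

// Column expressions: equality, boolean combinators, and null checks
df.filter($"age".isNotNull && $"name" =!= "Bob").show()
df.filter($"age".isNull || $"name" === "Bob").show()

spark.stop()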
GraphX type primitives:

type VertexId = Long
case class Edge[ED](srcId: VertexId, dstId: VertexId, attr: ED)

class EdgeTriplet[VD, ED] extends Edge[ED] {
  def srcAttr: VD
  def dstAttr: VD
}

abstract class VertexRDD[VD] extends RDD[(VertexId, VD)]
abstract class EdgeRDD[ED] extends RDD[Edge[ED]]
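A small sketch of these types in use: constructing a graph from an edge list and inspecting its triplets (edge labels and the default vertex attribute are placeholders):

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("TripletExample").setMaster("local[*]"))

val edges = sc.parallelize(Seq(
  Edge(1L, 2L, "follows"),
  Edge(2L, 3L, "likes")
))
// Graph.fromEdges assigns the given default attribute to every vertex referenced by an edge
val graph = Graph.fromEdges(edges, defaultValue = "user")

graph.triplets.collect().foreach { t =>
  println(s"${t.srcId} (${t.srcAttr}) -[${t.attr}]-> ${t.dstId} (${t.dstAttr})")
}

sc.stop()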