Apache Spark YARN resource manager integration module that enables Spark applications to run on YARN clusters
npx @tessl/cli install tessl/maven-org-apache-spark--spark-yarn-2-12@3.5.0

Apache Spark YARN Resource Manager provides integration between Apache Spark and YARN (Yet Another Resource Negotiator) for running Spark applications on Hadoop clusters. This module enables Spark to leverage YARN's resource management and scheduling capabilities, supporting both client and cluster deployment modes with comprehensive resource allocation, security, and monitoring features.
pom.xml or include in Spark distribution

import org.apache.spark.deploy.yarn.{Client, ApplicationMaster}
import org.apache.spark.scheduler.cluster.{YarnClusterManager, YarnSchedulerBackend}
import org.apache.spark.SparkConf

import org.apache.spark.{SparkConf, SparkContext}
// Build a configuration that targets YARN: application name, master, queue,
// and the memory/core sizing for the ApplicationMaster and the executors.
val conf = new SparkConf()
  .setAppName("MySparkApp")
  .setMaster("yarn")
  .set("spark.yarn.queue", "default")
  .set("spark.yarn.am.memory", "1g")
  .set("spark.executor.memory", "2g")
  .set("spark.executor.cores", "2")

// Creating the SparkContext is all that is required here;
// the YARN integration is handled automatically.
val sc = new SparkContext(conf)

// Your Spark application code here
val numbers = sc.parallelize(1 to 100)
val result = numbers.map(n => n * 2).collect()
sc.stop()

The Apache Spark YARN integration consists of several key components:
Core components for submitting and managing Spark applications on YARN clusters. Handles application submission, monitoring, and lifecycle management.
/**
 * Client class for Spark-on-YARN application submission; listed among the core
 * components for submitting and managing Spark applications on YARN clusters.
 *
 * Signature listing only: the class body is not shown here.
 *
 * @param args      client arguments, as a `ClientArguments` instance
 * @param sparkConf the Spark configuration
 * @param rpcEnv    an `RpcEnv` instance (NOTE(review): its role is not described here — confirm)
 */
class Client(
args: ClientArguments,
sparkConf: SparkConf,
rpcEnv: RpcEnv
)
class ApplicationMaster(
args: ApplicationMasterArguments,
sparkConf: SparkConf,
yarnConf: YarnConfiguration
)

Integration components that connect Spark's task scheduling system with YARN's resource management. Provides cluster manager and scheduler backends for both client and cluster modes.
/**
 * Cluster manager for YARN, implemented as an `ExternalClusterManager`.
 * Signature listing only: members are not shown here.
 */
class YarnClusterManager extends ExternalClusterManager
/**
 * Abstract base for the YARN scheduler backends; extends
 * `CoarseGrainedSchedulerBackend`. Signature listing only.
 *
 * @param scheduler the `TaskSchedulerImpl` this backend is constructed with
 * @param sc        the `SparkContext` this backend is constructed with
 */
abstract class YarnSchedulerBackend(
scheduler: TaskSchedulerImpl,
sc: SparkContext
) extends CoarseGrainedSchedulerBackend
/**
 * Concrete `YarnSchedulerBackend`. Signature listing only.
 * NOTE(review): by its name this is presumably the backend used in client
 * deploy mode — confirm.
 *
 * @param scheduler the `TaskSchedulerImpl` this backend is constructed with
 * @param sc        the `SparkContext` this backend is constructed with
 */
class YarnClientSchedulerBackend(
scheduler: TaskSchedulerImpl,
sc: SparkContext
) extends YarnSchedulerBackend
class YarnClusterSchedulerBackend(
scheduler: TaskSchedulerImpl,
sc: SparkContext
) extends YarnSchedulerBackend

Components responsible for allocating and managing YARN containers for Spark executors. Includes allocation strategies, placement policies, and resource request management.
// Resource-management types, listed by name only (no constructor parameters
// or members are shown in this listing).
// NOTE(review): presumably YarnAllocator handles container allocation — confirm.
class YarnAllocator
// NOTE(review): presumably a client for the YARN ResourceManager — confirm.
class YarnRMClient
// NOTE(review): presumably helpers for building resource requests — confirm.
object ResourceRequestHelper
class LocalityPreferredContainerPlacementStrategy

Comprehensive configuration system for YARN-specific settings including resource allocation, security, and deployment options.
/**
 * Typed configuration entries for the YARN module. Only a subset is listed;
 * the entries are declared here without their definitions.
 */
package object config {
// Entry holding a set of strings.
val APPLICATION_TAGS: ConfigEntry[Set[String]]
// Entry holding a string.
val QUEUE_NAME: ConfigEntry[String]
// Entry holding a Long (NOTE(review): unit is not stated here — confirm).
val AM_MEMORY: ConfigEntry[Long]
// Entry holding an Int.
val AM_CORES: ConfigEntry[Int]
// Optional string entry (NOTE(review): presumably may be unset — confirm).
val EXECUTOR_NODE_LABEL_EXPRESSION: OptionalConfigEntry[String]
// ... and many more configuration options
}
/**
 * Argument holder constructed from a raw array of strings; this is the type
 * of the `args` parameter taken by `Client`. Signature listing only.
 *
 * @param args the raw argument strings
 */
class ClientArguments(args: Array[String])
class ApplicationMasterArguments(args: Array[String])

case class YarnAppReport(
appState: YarnApplicationState,
finalState: FinalApplicationStatus,
diagnostics: Option[String]
)
class YarnClusterApplication extends SparkApplication {
def start(args: Array[String], conf: SparkConf): Unit
}

class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl
class YarnClusterScheduler(sc: SparkContext) extends YarnScheduler

class YarnCoarseGrainedExecutorBackend extends CoarseGrainedExecutorBackend {
def getUserClassPath: Seq[URL]
def extractLogUrls: Map[String, String]
def extractAttributes: Map[String, String]
}
class ExecutorRunnable {
def run(): Unit
def launchContextDebugInfo(): String
}

Client class for custom application submission
--master yarn
ApplicationMaster.main() - Entry point for cluster mode ApplicationMaster
YarnCoarseGrainedExecutorBackend.main() - Entry point for executor processes
YarnClusterApplication.start() - Entry point for programmatic cluster mode submission
ExecutorLauncher.main() - Entry point for client mode executor launcher