YARN integration support for Apache Spark cluster computing, enabling Spark applications to run on Hadoop YARN clusters
npx @tessl/cli install tessl/maven-org-apache-spark--yarn-parent-2-10@1.2.0

Apache Spark YARN Integration (org.apache.spark:yarn-parent_2.10) provides YARN (Yet Another Resource Negotiator) integration capabilities for Apache Spark, enabling Spark applications to run on Hadoop YARN clusters. This package serves as a comprehensive solution for deploying, managing, and monitoring Spark applications in Hadoop ecosystems.
org.apache.spark:yarn-parent_2.10:1.2.2

Packages: org.apache.spark.deploy.yarn, org.apache.spark.scheduler.cluster

import org.apache.spark.deploy.yarn.{Client, ClientArguments, ApplicationMaster, ApplicationMasterArguments}
import org.apache.spark.deploy.yarn.{YarnRMClient, YarnAllocator, AllocationType}
import org.apache.spark.scheduler.cluster.{YarnClientSchedulerBackend, YarnClusterSchedulerBackend}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.yarn.api.records._

import org.apache.spark.deploy.yarn.{Client, ClientArguments}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.conf.Configuration
// Example: programmatically submitting a Spark application to a YARN cluster.

// Create Spark configuration (application name and per-executor resources)
val sparkConf = new SparkConf()
.setAppName("My Spark App")
.set("spark.executor.memory", "2g")
.set("spark.executor.cores", "2")
// Configure YARN client: the Hadoop Configuration carries the cluster settings,
// and ClientArguments parses the submission options (user jar + main class).
val hadoopConf = new Configuration()
val args = Array("--jar", "my-app.jar", "--class", "MyMainClass")
val clientArgs = new ClientArguments(args, sparkConf)
// Submit application to YARN
val client = new Client(clientArgs, hadoopConf, sparkConf)
// Application submission handled by Spark runtime

The Spark YARN integration is built around several key architectural components:
Core client functionality for submitting and managing Spark applications on YARN clusters. Handles application lifecycle, resource negotiation, and monitoring.
/**
 * YARN client used to submit and manage a Spark application on a YARN cluster.
 *
 * @param args parsed command-line submission arguments (user jar, main class, resources)
 * @param hadoopConf Hadoop/YARN cluster configuration
 * @param sparkConf Spark application configuration
 */
class Client(
args: ClientArguments,
hadoopConf: Configuration,
sparkConf: SparkConf
) extends ClientBase {
// Stops the client and releases any resources it holds.
def stop(): Unit
}
/**
 * Parses and holds the command-line arguments passed to the YARN [[Client]].
 * Fields are populated from `args`; defaults are drawn from `sparkConf`.
 */
private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) {
var addJars: String        // comma-separated extra jars to make available to the application
var files: String          // comma-separated files to distribute with the application
var archives: String       // comma-separated archives to distribute with the application
var userJar: String        // jar containing the user's application code
var userClass: String      // fully-qualified name of the user's main class
var userArgs: Seq[String]  // arguments forwarded to the user's main method
var executorMemory: Int    // memory requested per executor
var executorCores: Int     // CPU cores requested per executor
var numExecutors: Int      // number of executors to request
var amQueue: String        // YARN queue the ApplicationMaster is submitted to
var amMemory: Int          // memory requested for the ApplicationMaster
var appName: String        // application name shown in the YARN UI
var priority: Int          // YARN application priority
val amMemoryOverhead: Int        // additional memory reserved for the AM container
val executorMemoryOverhead: Int  // additional memory reserved per executor container
}

ApplicationMaster functionality for managing Spark applications running on YARN. Handles resource negotiation with the ResourceManager and executor lifecycle management.
/**
 * Runs inside the YARN cluster as the application's master process.
 * Drives the application lifecycle, negotiates resources with the
 * ResourceManager via `client`, and manages executor containers.
 *
 * @param args parsed ApplicationMaster command-line arguments
 * @param client client used to communicate with the YARN ResourceManager
 */
class ApplicationMaster(
args: ApplicationMasterArguments,
client: YarnRMClient
) {
// Application lifecycle management
// Resource negotiation with YARN ResourceManager
// Executor management and monitoring
}
/** Parses and holds the command-line arguments passed to [[ApplicationMaster]]. */
class ApplicationMasterArguments(val args: Array[String]) {
var userJar: String        // jar containing the user's application code
var userClass: String      // fully-qualified name of the user's main class
var userArgs: Seq[String]  // arguments forwarded to the user's main method
var executorMemory: Int    // memory requested per executor
var executorCores: Int     // CPU cores requested per executor
var numExecutors: Int      // number of executors to request
// Prints usage information and exits with `exitCode`; `unknownParam`,
// when non-null, identifies the argument that triggered the error.
def printUsageAndExit(exitCode: Int, unknownParam: Any = null): Unit
}

Scheduler backend implementations for integrating Spark's TaskScheduler with YARN resource management, supporting both client and cluster deployment modes.
/**
 * Scheduler backend for YARN client deploy mode: the driver runs in the
 * submitting process while executors run in YARN containers.
 *
 * @param scheduler the driver-side task scheduler this backend serves
 * @param sc the active SparkContext
 */
class YarnClientSchedulerBackend(
scheduler: TaskSchedulerImpl,
sc: SparkContext
) extends YarnSchedulerBackend {
// Starts the backend (submits the application to YARN in client mode).
def start(): Unit
}
// Scheduler backend for YARN cluster deploy mode (driver runs inside the ApplicationMaster).
class YarnClusterSchedulerBackend extends YarnSchedulerBackend
// TaskScheduler variant used when running in YARN client mode.
class YarnClientClusterScheduler(sc: SparkContext) extends TaskSchedulerImpl
class YarnClusterScheduler(sc: SparkContext) extends TaskSchedulerImpl

Resource allocation and management components for negotiating and monitoring YARN cluster resources for Spark executors.
// Interface for communicating with the YARN ResourceManager.
trait YarnRMClient {
// ResourceManager client interface
}
// Base class for logic that requests and releases executor containers.
abstract class YarnAllocator {
// Abstract base class for YARN resource allocation logic
}
// Locality preference levels used when requesting containers (HOST, RACK, ANY).
object AllocationType extends Enumeration {
// Enumeration for YARN allocation types
}

Utility classes and configuration management for YARN-specific operations, distributed cache management, and executor container handling.
// YARN-specific variant of SparkHadoopUtil.
class YarnSparkHadoopUtil extends SparkHadoopUtil {
// YARN-specific Hadoop utilities
}
/** Configures and tracks resources placed in the YARN distributed cache. */
class ClientDistributedCacheManager {
// Manages distributed cache for YARN applications
}
// Shared helpers for preparing and managing executor containers.
trait ExecutorRunnableUtil {
// Utility trait for executor container management
}

// Core argument and configuration types
private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf)
class ApplicationMasterArguments(val args: Array[String])
// Resource management interfaces
trait YarnRMClient
class YarnRMClientImpl(args: ApplicationMasterArguments) extends YarnRMClient
// Requests and releases executor containers; concrete subclasses supply the
// protocol for the YARN API version selected at build time.
private[yarn] abstract class YarnAllocator(
conf: Configuration,
sparkConf: SparkConf,
appAttemptId: ApplicationAttemptId,
args: ApplicationMasterArguments,
preferredNodes: collection.Map[String, collection.Set[SplitInfo]],
securityMgr: SecurityManager
) extends Logging
// Allocation strategy enumeration
object AllocationType extends Enumeration {
type AllocationType = Value
val HOST, RACK, ANY = Value  // container locality preference: specific node, rack, or anywhere
}
// Scheduler and backend types
private[spark] abstract class YarnSchedulerBackend extends CoarseGrainedSchedulerBackend
private[spark] class TaskSchedulerImpl extends TaskScheduler
// Client and utility traits
private[spark] trait ClientBase
trait ExecutorRunnableUtil

The YARN integration supports two primary deployment modes: client mode (the driver runs in the submitting process, via YarnClientSchedulerBackend) and cluster mode (the driver runs inside the YARN ApplicationMaster, via YarnClusterSchedulerBackend).
Both modes are handled transparently through the appropriate scheduler backend selection.
This package provides support for multiple Hadoop YARN API versions.
The build system automatically selects the appropriate implementation based on Maven profiles.