Split Brain Resolution (SBR) in Akka Cluster provides strategies for handling network partitions by automatically downing unreachable members to maintain cluster consistency and availability.
/**
 * Signature listing of the downing extension point that `NoDowning`,
 * `SplitBrainResolverProvider` and custom providers in this document extend.
 *
 * NOTE(review): only the signatures are shown here; the member descriptions
 * below are assumptions to be confirmed against the Akka `DowningProvider` Scaladoc.
 */
abstract class DowningProvider {
// Presumably the margin to wait after a member is downed before it is treated as removed - TODO confirm.
def downRemovalMargin: FiniteDuration
// Props of the actor that takes downing decisions, if any; `NoDowning` returns `None` here.
def downingActorProps: Option[Props]
}
/**
 * Companion of `DowningProvider` (signature listing only, no body shown).
 *
 * `load` takes a fully-qualified class name and an actor system and returns a
 * provider; presumably it instantiates the class configured under
 * `akka.cluster.downing-provider-class` - confirm against the Akka Scaladoc.
 */
object DowningProvider {
  def load(fqcn: String, system: ActorSystem): DowningProvider
}

/**
 * Provider that performs no automatic downing: it supplies no downing actor
 * and a removal margin of zero.
 */
class NoDowning extends DowningProvider {
  override def downRemovalMargin: FiniteDuration = Duration.Zero
  override def downingActorProps: Option[Props] = None
}

/**
 * Provider selected by the `downing-provider-class` setting in the
 * configuration that follows (signature listing only; the bodies are not shown).
 *
 * @param system the actor system the provider is created for
 */
class SplitBrainResolverProvider(system: ActorSystem) extends DowningProvider {
  override def downRemovalMargin: FiniteDuration
  override def downingActorProps: Option[Props]
}

akka.cluster {
downing-provider-class = "akka.cluster.sbr.SplitBrainResolverProvider"
split-brain-resolver {
# Select strategy: keep-majority, lease-majority, static-quorum, keep-oldest, down-all
active-strategy = "keep-majority"
# Time margin after which unreachable nodes will be downed
stable-after = 20s
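# If on, down all members when the cluster stays unstable for too long, i.e. when no
# downing decision could be made within stable-after plus an additional margin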
down-all-when-unstable = "on"
}
}Keeps the partition with the majority of nodes, downs the minority.
akka.cluster.split-brain-resolver {
active-strategy = "keep-majority"
keep-majority {
# If set, the majority is counted only among members that have this role
role = ""
}
}Behavior:
Usage Example:
// 5-node cluster splits into 3+2
// 3-node partition survives, 2-node partition is downed
// 4-node cluster splits into 2+2
// The partition containing the member with the lowest address survives; the other one is downed

Uses a distributed lease to determine which partition can continue.
akka.cluster.split-brain-resolver {
active-strategy = "lease-majority"
lease-majority {
lease-implementation = "akka.coordination.lease.kubernetes"
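# How long the minority side waits before trying to acquire the lease,
# so that the majority side gets the first chance to take it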
acquire-lease-delay-for-minority = 2s
# Release lease after
release-after = 40s
}
}Behavior:
Downs minority partitions based on configured quorum size.
akka.cluster.split-brain-resolver {
active-strategy = "static-quorum"
static-quorum {
# Minimum cluster size to maintain
quorum-size = 3
# Specific role that must meet quorum
role = ""
}
}Behavior:
Partitions with fewer than quorum-size nodes are downed.

Keeps the partition containing the oldest member (by cluster join time).
akka.cluster.split-brain-resolver {
active-strategy = "keep-oldest"
keep-oldest {
# Prioritize members with this role
role = ""
# Down oldest member if singleton partition
down-if-alone = on
}
}Behavior:
Downs all members of the cluster, on every side of the partition, not only the unreachable ones (primarily for testing).
akka.cluster.split-brain-resolver {
active-strategy = "down-all"
}Behavior:
/**
 * Signature listing of the settings object built from a `Config`.
 *
 * The member names mirror the `active-strategy`, `stable-after` and
 * `down-all-when-unstable` keys of the `akka.cluster.split-brain-resolver`
 * section shown in this document.
 *
 * NOTE(review): the bodies are not shown, and the member names and types in the
 * actual Akka class may differ from this listing - confirm before relying on it.
 */
class SplitBrainResolverSettings(config: Config) {
// Presumably the value of `active-strategy` - TODO confirm.
def activeStrategy: String
// Presumably the value of `stable-after` - TODO confirm.
def stableAfter: FiniteDuration
// Presumably derived from `down-all-when-unstable` - TODO confirm.
def downAllWhenUnstable: DownAllWhenUnstable
}
/**
 * Closed set of values returned by `SplitBrainResolverSettings.downAllWhenUnstable`.
 *
 * NOTE(review): presumably these correspond to `down-all-when-unstable = on` and
 * `off` in the configuration - confirm against the Akka sources.
 */
sealed trait DownAllWhenUnstable

/** The "on" value of [[DownAllWhenUnstable]]. */
case object DownAllWhenUnstableOn extends DownAllWhenUnstable

/** The "off" value of [[DownAllWhenUnstable]]. */
case object DownAllWhenUnstableOff extends DownAllWhenUnstable

akka.cluster.split-brain-resolver {
# Strategy to use
active-strategy = "keep-majority"
# Time to wait before taking downing decision
stable-after = 20s
# Down all when cluster becomes unstable
down-all-when-unstable = "on"
# Additional settings per strategy
keep-majority {
# If set, the majority is counted only among members that have this role
role = "core"
}
static-quorum {
quorum-size = 3
role = "important"
}
keep-oldest {
role = "seed"
down-if-alone = off
}
lease-majority {
lease-implementation = "akka.coordination.lease.kubernetes"
acquire-lease-delay-for-minority = 2s
release-after = 40s
}
}import akka.cluster.DowningProvider
import akka.actor.{ActorSystem, Props}
import scala.concurrent.duration._
/**
 * Downing provider that plugs `CustomDowningActor` into the cluster and uses a
 * removal margin of ten seconds.
 *
 * @param system the actor system the provider is created for; it is not used by
 *               this implementation (presumably the constructor shape is what
 *               the provider loader expects - confirm)
 */
class CustomDowningProvider(system: ActorSystem) extends DowningProvider {

  /** Fixed margin of ten seconds. */
  override def downRemovalMargin: FiniteDuration = FiniteDuration(10, SECONDS)

  /** Always supplies the props of a `CustomDowningActor`. */
  override def downingActorProps: Option[Props] = {
    val downingActor = Props(classOf[CustomDowningActor])
    Some(downingActor)
  }
}
/**
 * Example downing actor: listens for unreachable members and downs every member
 * that `shouldDownMember` accepts.
 *
 * NOTE(review): `shouldDownMember` currently accepts every member, so each
 * unreachable member is downed as soon as it is reported. If both sides of a
 * partition run this logic they can down each other; replace the predicate
 * with a real strategy before using this outside of an example.
 */
class CustomDowningActor extends Actor with ActorLogging {
  import akka.cluster.ClusterEvent.{InitialStateAsEvents, UnreachableMember}

  val cluster = Cluster(context.system)

  // Subscribe to unreachability events.
  // The current state is replayed as events: with the default snapshot mode the
  // actor would receive a CurrentClusterState message it does not handle, and
  // members that were already unreachable at start-up would never be evaluated.
  override def preStart(): Unit =
    cluster.subscribe(self, initialStateMode = InitialStateAsEvents, classOf[UnreachableMember])

  override def postStop(): Unit =
    cluster.unsubscribe(self)

  def receive: Receive = {
    case UnreachableMember(member) =>
      log.info("Member {} is unreachable", member)
      // Custom downing logic
      if (shouldDownMember(member)) {
        log.warning("Downing unreachable member {}", member)
        cluster.down(member.address)
      }
  }

  /**
   * Decides whether an unreachable member should be downed.
   *
   * @param member the member reported as unreachable
   * @return always `true` in this example
   */
  private def shouldDownMember(member: Member): Boolean = {
    // Custom logic - example: down after 30 seconds unreachable
    // In practice, you'd track unreachable time
    true
  }
}
akka.cluster.downing-provider-class = "com.example.CustomDowningProvider"
// SBR logs decisions at INFO level
// Example log messages:
// "SBR is downing [Member(akka://sys@host1:2551, Up)] in partition [...]"
// "SBR is keeping partition [Member(akka://sys@host2:2551, Up), ...]"import akka.cluster.ClusterEvent._
/**
 * Monitoring actor that logs when a member is downed and when a previously
 * downed member is removed from the cluster.
 *
 * Hook alerting or metrics into the two marked places in `receive`.
 */
class SBRMonitor extends Actor with ActorLogging {
  val cluster = Cluster(context.system)

  override def preStart(): Unit =
    cluster.subscribe(self, classOf[MemberDowned], classOf[MemberRemoved])

  // Same clean-up as CustomDowningActor: drop the subscription when the actor stops.
  override def postStop(): Unit =
    cluster.unsubscribe(self)

  def receive: Receive = {
    case _: CurrentClusterState =>
      // Snapshot delivered once on subscribe (default initial state mode);
      // handled explicitly so it does not end up as an unhandled message.
      ()
    case MemberDowned(member) =>
      log.warning("Member downed by SBR: {}", member)
      // Send alert/metric
    case MemberRemoved(member, previousStatus) =>
      if (previousStatus == MemberStatus.Down) {
        log.info("Previously downed member removed: {}", member)
        // Update monitoring dashboard
      }
  }
}
import akka.http.scaladsl.server.Route
import akka.http.scaladsl.server.Directives._
/**
 * `GET /health` endpoint that reports whether the local cluster view contains
 * unreachable members.
 *
 * @return a route answering `200 healthy` when no member is unreachable and
 *         `503 unhealthy: N unreachable members` otherwise, so that load
 *         balancers and probes that only look at the status code see the failure
 */
def healthRoute: Route = {
  import akka.http.scaladsl.model.StatusCodes

  path("health") {
    get {
      val cluster = Cluster(system)
      val unreachableCount = cluster.state.unreachable.size
      if (unreachableCount == 0) {
        complete("healthy")
      } else {
        // Previously answered 200 with an "unhealthy" body; signal it in the status code.
        complete(StatusCodes.ServiceUnavailable, s"unhealthy: $unreachableCount unreachable members")
      }
    }
  }
}

Keep Majority:
Lease Majority:
Static Quorum:
Keep Oldest:
# Production configuration
akka.cluster.split-brain-resolver {
active-strategy = "keep-majority"
stable-after = 30s # Allow time for transient network issues
down-all-when-unstable = "on" # Prevent brain-dead cluster states
keep-majority {
# Use role-based majority for heterogeneous clusters
role = "core"
}
}// Monitor cluster health
val cluster = Cluster(system)

// Warn about every member the cluster state currently lists as unreachable
val unreachableMembers = cluster.state.unreachable
if (!unreachableMembers.isEmpty) {
  val unreachableAddresses = unreachableMembers.map(member => member.address).mkString(", ")
  log.warning("Unreachable members detected: {}", unreachableAddresses)
}

// Compare the number of members in Up status with the application's minimum
val memberCount = cluster.state.members.count(member => member.status == MemberStatus.Up)
val minimumRequired = 3 // Your application's minimum
if (minimumRequired > memberCount) {
  log.error("Cluster size {} below minimum required {}", memberCount, minimumRequired)
  // Consider alerting or graceful degradation
}
// Use MultiNodeSpec for testing split brain scenarios
/**
 * Multi-node test sketch: a five-node cluster is split 3+2 and the keep-majority
 * strategy is expected to keep the three-node side and down the two-node side.
 *
 * NOTE(review): the role names (`first` .. `fifth`), `awaitClusterUp` and
 * `cluster` are not defined in this snippet; they are assumed to come from
 * `SplitBrainConfig` and from the test base traits of the project.
 */
class SplitBrainResolverSpec extends MultiNodeSpec(SplitBrainConfig) {
  "Split Brain Resolver" should {
    "down minority partition in keep-majority strategy" in {
      // Create 5-node cluster
      awaitClusterUp(first, second, third, fourth, fifth)

      // Partition cluster into 3+2.
      // Blackholing is a test-conductor command: issue it from one node only and
      // wait for each command to complete instead of firing it from every node.
      // NOTE(review): assumes `first` is the node hosting the conductor's controller.
      runOn(first) {
        for {
          majorityNode <- List(first, second, third)
          minorityNode <- List(fourth, fifth)
        } testConductor.blackhole(majorityNode, minorityNode, Direction.Both).await
      }
      // Make sure the partition is in place on all nodes before asserting.
      enterBarrier("partition-created")

      // Verify majority partition (first, second, third) survives
      runOn(first, second, third) {
        within(30.seconds) {
          awaitAssert {
            cluster.state.members.size should be(3)
            cluster.state.unreachable should be(empty)
          }
        }
      }

      // Verify minority partition (fourth, fifth) is downed
      runOn(fourth, fifth) {
        within(30.seconds) {
          awaitAssert {
            cluster.isTerminated should be(true)
          }
        }
      }
    }
  }
}