Fault-tolerant decentralized peer-to-peer cluster membership service with no single point of failure for Akka distributed systems
Split Brain Resolution (SBR) in Akka Cluster provides strategies for handling network partitions by automatically downing unreachable members to maintain cluster consistency and availability.
/**
 * Extension point through which a downing strategy is plugged into the cluster.
 * Implementations in this document: NoDowning, SplitBrainResolverProvider and the
 * CustomDowningProvider example.
 */
abstract class DowningProvider {
// Margin related to the removal of downed members (NoDowning uses Duration.Zero).
// NOTE(review): the exact semantics are not shown here — confirm against the Akka reference documentation.
def downRemovalMargin: FiniteDuration
// Props for the actor that carries out downing, or None when no such actor is supplied (as in NoDowning).
def downingActorProps: Option[Props]
}
// Companion exposing the factory that returns a DowningProvider for a class name (signature only).
object DowningProvider {
// fqcn is presumably the fully qualified class name of the provider, as used by
// downing-provider-class in the configuration — confirm against the implementation.
def load(fqcn: String, system: ActorSystem): DowningProvider
}
// Provider with a zero removal margin and no downing actor.
class NoDowning extends DowningProvider {
override def downRemovalMargin: FiniteDuration = Duration.Zero
override def downingActorProps: Option[Props] = None
}
// Provider referenced by downing-provider-class in the configuration that follows; members are shown as signatures only.
class SplitBrainResolverProvider(system: ActorSystem) extends DowningProvider {
override def downRemovalMargin: FiniteDuration
override def downingActorProps: Option[Props]
}
akka.cluster {
downing-provider-class = "akka.cluster.sbr.SplitBrainResolverProvider"
split-brain-resolver {
# Select strategy: keep-majority, lease-majority, static-quorum, keep-oldest, down-all
active-strategy = "keep-majority"
# How long membership and reachability must remain unchanged before the strategy takes its downing decision
stable-after = 20s
# Safety net: down all members if ongoing reachability changes prevent a decision for too long (on, off, or a duration)
down-all-when-unstable = "on"
}
}
Keeps the partition with the majority of nodes, downs the minority.
akka.cluster.split-brain-resolver {
active-strategy = "keep-majority"
keep-majority {
# If a role is set, only members with that role are counted when determining the majority
role = ""
}
}Behavior:
Usage Example:
// 5-node cluster splits into 3+2
// 3-node partition survives, 2-node partition is downed
// 4-node cluster splits into 2+2
// Tie: the partition containing the member with the lowest address survives, the other is downed
Uses a distributed lease to determine which partition can continue.
akka.cluster.split-brain-resolver {
active-strategy = "lease-majority"
lease-majority {
lease-implementation = "akka.coordination.lease.kubernetes"
# Delay applied on the minority side before it tries to acquire the lease, giving the majority side a head start
acquire-lease-delay-for-minority = 2s
# Release lease after
release-after = 40s
}
}Behavior:
Downs minority partitions based on configured quorum size.
akka.cluster.split-brain-resolver {
active-strategy = "static-quorum"
static-quorum {
# Minimum cluster size to maintain
quorum-size = 3
# Specific role that must meet quorum
role = ""
}
}
Behavior:
Partitions with fewer than quorum-size nodes are downed.
Keeps the partition containing the oldest member (by cluster join time).
akka.cluster.split-brain-resolver {
active-strategy = "keep-oldest"
keep-oldest {
# If a role is set, only members with that role are considered when determining the oldest member
role = ""
# Down oldest member if singleton partition
down-if-alone = on
}
}Behavior:
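Downs all members of the cluster, on every side of the partition and not only the unreachable ones, so the cluster must be started again afterwards; a safe fallback when the network is too unstable for the other strategies to be trusted.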
akka.cluster.split-brain-resolver {
active-strategy = "down-all"
}Behavior:
// Settings read from a Config; members are shown as signatures only.
// Presumably backed by the akka.cluster.split-brain-resolver section — confirm against the implementation.
class SplitBrainResolverSettings(config: Config) {
// Presumably the active-strategy value, e.g. "keep-majority" — TODO confirm.
def activeStrategy: String
// Presumably the stable-after duration — TODO confirm.
def stableAfter: FiniteDuration
// Presumably derived from down-all-when-unstable — TODO confirm.
def downAllWhenUnstable: DownAllWhenUnstable
}
// Closed set of values for the down-all-when-unstable switch: On or Off.
sealed trait DownAllWhenUnstable
case object DownAllWhenUnstableOn extends DownAllWhenUnstable
case object DownAllWhenUnstableOff extends DownAllWhenUnstable
akka.cluster.split-brain-resolver {
# Strategy to use
active-strategy = "keep-majority"
# Time to wait before taking downing decision
stable-after = 20s
# Down all when cluster becomes unstable
down-all-when-unstable = "on"
# Additional settings per strategy
keep-majority {
# If a role is set, only members with that role are counted when determining the majority
role = "core"
}
static-quorum {
quorum-size = 3
role = "important"
}
keep-oldest {
role = "seed"
down-if-alone = off
}
lease-majority {
lease-implementation = "akka.coordination.lease.kubernetes"
acquire-lease-delay-for-minority = 2s
release-after = 40s
}
}
import akka.actor.{Actor, ActorLogging, ActorSystem, Props}
import akka.cluster.{Cluster, DowningProvider, Member}
import akka.cluster.ClusterEvent.UnreachableMember
import scala.concurrent.duration._
/**
 * Example DowningProvider that delegates downing decisions to CustomDowningActor.
 * The `system` constructor parameter is not used by the code shown here.
 */
class CustomDowningProvider(system: ActorSystem) extends DowningProvider {
// Fixed removal margin of ten seconds.
override def downRemovalMargin: FiniteDuration = 10.seconds
// Props that instantiate CustomDowningActor by class, i.e. through its no-argument constructor.
override def downingActorProps: Option[Props] =
Some(Props(classOf[CustomDowningActor]))
}
/**
 * Example downing actor: listens for unreachable members and downs each one that
 * `shouldDownMember` approves.
 */
class CustomDowningActor extends Actor with ActorLogging {
  val cluster = Cluster(context.system)

  // Subscribe to unreachability events
  override def preStart(): Unit = {
    // Local import so the snippet does not depend on a file-level import.
    import akka.cluster.ClusterEvent.InitialStateAsEvents
    // Replay the current state as events. The default snapshot mode delivers a single
    // CurrentClusterState message that `receive` does not handle, so members that were
    // already unreachable when this actor started would never be evaluated.
    cluster.subscribe(self, initialStateMode = InitialStateAsEvents, classOf[UnreachableMember])
  }

  override def postStop(): Unit =
    cluster.unsubscribe(self)

  override def receive: Receive = {
    case UnreachableMember(member) =>
      log.info("Member {} is unreachable", member)
      // Custom downing logic
      if (shouldDownMember(member)) {
        log.warning("Downing unreachable member {}", member)
        cluster.down(member.address)
      }
  }

  /**
   * Decides whether an unreachable member should be downed.
   * Placeholder: always returns true. A real implementation would track how long the
   * member has been unreachable (for example, down it only after 30 seconds).
   */
  private def shouldDownMember(member: Member): Boolean = {
    // Custom logic - example: down after 30 seconds unreachable
    // In practice, you'd track unreachable time
    true
  }
}
akka.cluster.downing-provider-class = "com.example.CustomDowningProvider"
// SBR logs decisions at INFO level
// Example log messages:
// "SBR is downing [Member(akka://sys@host1:2551, Up)] in partition [...]"
// "SBR is keeping partition [Member(akka://sys@host2:2551, Up), ...]"import akka.cluster.ClusterEvent._
/**
 * Example monitoring actor: logs members that are downed and members that are removed
 * after having been downed, as hooks for alerts and dashboards.
 */
class SBRMonitor extends Actor with ActorLogging {
  val cluster = Cluster(context.system)

  override def preStart(): Unit = {
    cluster.subscribe(self, classOf[MemberDowned], classOf[MemberRemoved])
  }

  // Mirror the subscription made in preStart, consistent with CustomDowningActor.
  override def postStop(): Unit = {
    cluster.unsubscribe(self)
  }

  override def receive: Receive = {
    case MemberDowned(member) =>
      log.warning("Member downed by SBR: {}", member)
      // Send alert/metric
    case MemberRemoved(member, previousStatus) =>
      // Only removals that follow a Down status are reported; other removals are ignored.
      if (previousStatus == MemberStatus.Down) {
        log.info("Previously downed member removed: {}", member)
        // Update monitoring dashboard
      }
  }
}
import akka.http.scaladsl.server.Route
import akka.http.scaladsl.server.Directives._
/**
 * Health endpoint at GET /health.
 *
 * Responds 200 "healthy" when the cluster state lists no unreachable members, and
 * 503 Service Unavailable with the unreachable count otherwise, so that probes which
 * only look at the status code detect the problem.
 */
def healthRoute: Route = {
  // Local import so the snippet does not depend on a file-level import.
  import akka.http.scaladsl.model.StatusCodes

  path("health") {
    get {
      val cluster = Cluster(system)
      val unreachableCount = cluster.state.unreachable.size
      if (unreachableCount == 0) {
        complete("healthy")
      } else {
        // A plain complete(String) would answer 200 OK even though the check failed.
        complete(StatusCodes.ServiceUnavailable, s"unhealthy: $unreachableCount unreachable members")
      }
    }
  }
}
Keep Majority:
Lease Majority:
Static Quorum:
Keep Oldest:
# Production configuration
akka.cluster.split-brain-resolver {
active-strategy = "keep-majority"
stable-after = 30s # Allow time for transient network issues
down-all-when-unstable = "on" # Down all nodes if instability prevents a decision for too long
keep-majority {
# Use role-based majority for heterogeneous clusters
role = "core"
}
}// Monitor cluster health
// Cluster extension for the running actor system.
val cluster = Cluster(system)

// Warn with the addresses of all members currently listed as unreachable.
val unreachableMembers = cluster.state.unreachable
if (unreachableMembers.nonEmpty) {
  val unreachableAddresses = unreachableMembers.map(_.address).mkString(", ")
  log.warning("Unreachable members detected: {}", unreachableAddresses)
}

// Compare the number of members in Up status with the application's minimum.
val minimumRequired = 3 // Your application's minimum
val memberCount = cluster.state.members.count(member => member.status == MemberStatus.Up)
if (memberCount < minimumRequired) {
  log.error("Cluster size {} below minimum required {}", memberCount, minimumRequired)
  // Consider alerting or graceful degradation
}
// Use MultiNodeSpec for testing split brain scenarios
/**
 * Multi-node test for the keep-majority strategy: a 5-node cluster is split 3+2 and the
 * majority side is expected to survive while the minority side is downed.
 */
class SplitBrainResolverSpec extends MultiNodeSpec(SplitBrainConfig) {
  "Split Brain Resolver" should {
    "down minority partition in keep-majority strategy" in {
      // Create 5-node cluster
      awaitClusterUp(first, second, third, fourth, fifth)

      // Partition cluster into 3+2.
      // Failure injection is issued from a single node and each Future is awaited so the
      // partition is in place before anyone asserts on it. `first` is assumed to be the
      // test conductor's controller node (the first declared role) — confirm against
      // SplitBrainConfig.
      runOn(first) {
        for {
          majorityNode <- Seq(first, second, third)
          minorityNode <- Seq(fourth, fifth)
        } testConductor.blackhole(majorityNode, minorityNode, Direction.Both).await
      }
      // All nodes wait here until the partition has been established.
      enterBarrier("partitioned")

      // Verify majority partition (first, second, third) survives
      runOn(first, second, third) {
        within(30.seconds) {
          awaitAssert {
            cluster.state.members.size should be(3)
            cluster.state.unreachable should be(empty)
          }
        }
      }

      // Verify minority partition (fourth, fifth) is downed
      runOn(fourth, fifth) {
        within(30.seconds) {
          awaitAssert {
            cluster.isTerminated should be(true)
          }
        }
      }
    }
  }
}
Install with Tessl CLI
npx tessl i tessl/maven-com-typesafe-akka--akka-cluster-2-12