or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

cluster-management.mdcluster-routing.mdconfiguration.mdevent-system.mdindex.mdmember-management.mdsplit-brain-resolution.md
tile.json

split-brain-resolution.mddocs/

Split Brain Resolution

Split Brain Resolution (SBR) in Akka Cluster provides strategies for handling network partitions by automatically downing unreachable members to maintain cluster consistency and availability.

DowningProvider API

Base DowningProvider

/**
 * Extension point through which a downing strategy is plugged into the cluster.
 *
 * A provider exposes a removal margin and, optionally, the `Props` of an actor
 * that carries out the downing decisions.
 */
abstract class DowningProvider {
  /**
   * Margin, as a finite duration, associated with the removal of downed members.
   * NOTE(review): the exact semantics are not shown on this page — confirm against the Akka API docs.
   */
  def downRemovalMargin: FiniteDuration
  /** `Props` of the downing actor, or `None` when the provider supplies no such actor. */
  def downingActorProps: Option[Props]
}

/** Factory for [[DowningProvider]] instances. */
object DowningProvider {
  /**
   * Creates the provider identified by its fully qualified class name for the given actor system.
   *
   * @param fqcn   fully qualified class name of the provider (presumably the value of
   *               `akka.cluster.downing-provider-class`; verify against the caller)
   * @param system actor system the provider is created for
   */
  def load(fqcn: String, system: ActorSystem): DowningProvider
}

NoDowning (Default)

/** Provider that supplies no downing actor and a zero removal margin. */
class NoDowning extends DowningProvider {
  // Zero margin: no additional waiting time is configured by this provider.
  override def downRemovalMargin: FiniteDuration = Duration.Zero
  // No downing actor is supplied.
  override def downingActorProps: Option[Props] = None
}

Split Brain Resolver Provider

SplitBrainResolverProvider

/**
 * Downing provider of the Split Brain Resolver. It is selected by setting
 * `akka.cluster.downing-provider-class` to this class's fully qualified name.
 *
 * @param system actor system the provider is created for
 */
class SplitBrainResolverProvider(system: ActorSystem) extends DowningProvider {
  // Signature listing only; the implementation is not shown on this page.
  override def downRemovalMargin: FiniteDuration
  override def downingActorProps: Option[Props]
}

Configuration

akka.cluster {
  downing-provider-class = "akka.cluster.sbr.SplitBrainResolverProvider"
  
  split-brain-resolver {
    # Select strategy: keep-majority, lease-majority, static-quorum, keep-oldest, down-all
    active-strategy = "keep-majority"
    
    # A downing decision is taken only after membership and reachability
    # have been unchanged (i.e. the cluster state has been stable) for this long
    stable-after = 20s
    
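    # Down all members if the cluster stays unstable (reachability keeps changing)
    # for too long. Accepts on, off, or a duration; "on" derives the timeout from
    # stable-after (3/4 of it)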
    down-all-when-unstable = "on"
  }
}

SBR Strategies

Keep Majority Strategy

Keeps the partition with the majority of nodes, downs the minority.

akka.cluster.split-brain-resolver {
  active-strategy = "keep-majority"
  
  keep-majority {
    # If set, only members with this role are counted when determining the majority
    role = ""
  }
}

Behavior:

  • Partition with >50% of nodes survives
  • Minority partitions are downed
  • Equal-sized partitions: the partition containing the member with the lowest address survives; the other is downed

Usage Example:

// 5-node cluster splits into 3+2
// 3-node partition survives, 2-node partition is downed

// 4-node cluster splits into 2+2  
// The side containing the node with the lowest address survives; the other side is downed

Lease Majority Strategy

Uses a distributed lease to determine which partition can continue.

akka.cluster.split-brain-resolver {
  active-strategy = "lease-majority"
  
  lease-majority {
    lease-implementation = "akka.coordination.lease.kubernetes"
    # Delay before the minority side tries to acquire the lease,
    # giving the majority side the first chance to get it
    acquire-lease-delay-for-minority = 2s
    # Release lease after
    release-after = 40s
  }
}

Behavior:

  • Majority partition acquires lease and survives
  • Minority waits then attempts lease acquisition
  • Only one partition can hold lease at a time

Static Quorum Strategy

Downs minority partitions based on configured quorum size.

akka.cluster.split-brain-resolver {
  active-strategy = "static-quorum"
  
  static-quorum {
    # Minimum number of reachable members a partition needs in order to survive
    quorum-size = 3
    
    # Specific role that must meet quorum
    role = ""
  }
}

Behavior:

  • Partitions with fewer than quorum-size nodes are downed
  • Multiple partitions can survive if each meets the quorum, so the cluster must never grow beyond quorum-size * 2 - 1 members
  • Useful for clusters with known minimum size requirements

Keep Oldest Strategy

Keeps the partition containing the oldest member (by cluster join time).

akka.cluster.split-brain-resolver {
  active-strategy = "keep-oldest"
  
  keep-oldest {
    # If set, only members with this role are considered when determining the oldest member
    role = ""
    
    # Down oldest member if singleton partition
    down-if-alone = on
  }
}

Behavior:

  • Partition with oldest member survives
  • Other partitions are downed
  • Deterministic: always same result for same partition scenario

Down All Strategy

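Downs all members, reachable and unreachable alike, when a partition is detected. This is a safe fallback for highly unstable networks where the whole cluster is restarted afterwards.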

akka.cluster.split-brain-resolver {
  active-strategy = "down-all"
}

Behavior:

  • All members on every side of the partition are downed
  • The cluster shuts down completely and must be restarted (e.g. by an orchestrator)
  • Use with caution in production

SBR Settings

SplitBrainResolverSettings

/**
 * Typed view of the Split Brain Resolver settings read from a `Config`.
 *
 * NOTE(review): this is a simplified signature listing; verify member names and
 * types against the Akka version in use before relying on them.
 *
 * @param config configuration the settings are read from
 */
class SplitBrainResolverSettings(config: Config) {
  /** Name of the selected strategy (presumably the `active-strategy` key — confirm). */
  def activeStrategy: String
  /** Stability period (presumably the `stable-after` key — confirm). */
  def stableAfter: FiniteDuration
  /** Down-all-when-unstable mode (presumably the `down-all-when-unstable` key — confirm). */
  def downAllWhenUnstable: DownAllWhenUnstable
}

/** Closed set of values for the down-all-when-unstable mode. */
sealed trait DownAllWhenUnstable
/** The mode is enabled. */
case object DownAllWhenUnstableOn extends DownAllWhenUnstable
/** The mode is disabled. */
case object DownAllWhenUnstableOff extends DownAllWhenUnstable

Global SBR Configuration

akka.cluster.split-brain-resolver {
  # Strategy to use
  active-strategy = "keep-majority"
  
  # Time to wait before taking downing decision  
  stable-after = 20s
  
  # Down all when cluster becomes unstable
  down-all-when-unstable = "on"
  
  # Additional settings per strategy
  keep-majority {
    # Only members with this role are counted when determining the majority
    role = "core"
  }
  
  static-quorum {
    quorum-size = 3
    role = "important"
  }
  
  keep-oldest {
    role = "seed"
    down-if-alone = off
  }
  
  lease-majority {
    lease-implementation = "akka.coordination.lease.kubernetes"
    acquire-lease-delay-for-minority = 2s
    release-after = 40s
  }
}

Custom Downing Provider

Creating Custom Provider

import akka.actor.{Actor, ActorLogging, ActorSystem, Props}
import akka.cluster.{Cluster, DowningProvider, Member, MemberStatus}
import akka.cluster.ClusterEvent.{InitialStateAsEvents, UnreachableMember}
import scala.concurrent.duration._

/**
 * Example provider that plugs [[CustomDowningActor]] into the cluster.
 *
 * @param system actor system the provider is created for; not used by this
 *               example beyond being part of the constructor signature
 */
class CustomDowningProvider(system: ActorSystem) extends DowningProvider {

  /** The downing actor to start: always a [[CustomDowningActor]]. */
  override def downingActorProps: Option[Props] = {
    val actorProps = Props(classOf[CustomDowningActor])
    Some(actorProps)
  }

  /** Fixed removal margin of ten seconds. */
  override def downRemovalMargin: FiniteDuration = 10.seconds
}

/**
 * Example downing actor: listens for `UnreachableMember` events and downs the
 * member when `shouldDownMember` approves.
 *
 * The subscription uses `InitialStateAsEvents` so that members which are
 * already unreachable when this actor starts are replayed as
 * `UnreachableMember` events. With the default snapshot mode the first message
 * would be a `CurrentClusterState`, which this actor does not handle, and
 * those members would never be evaluated.
 */
class CustomDowningActor extends Actor with ActorLogging {
  val cluster = Cluster(context.system)

  // Subscribe to unreachability events, replaying the current state as events
  override def preStart(): Unit =
    cluster.subscribe(self, InitialStateAsEvents, classOf[UnreachableMember])

  // Drop the subscription when the actor stops
  override def postStop(): Unit =
    cluster.unsubscribe(self)

  def receive: Receive = {
    case UnreachableMember(member) =>
      log.info("Member {} is unreachable", member)

      // Custom downing logic
      if (shouldDownMember(member)) {
        log.warning("Downing unreachable member {}", member)
        cluster.down(member.address)
      }
  }

  /**
   * Decides whether an unreachable member should be downed.
   *
   * Placeholder: always returns `true`. A real implementation would track how
   * long the member has been unreachable (e.g. down after 30 seconds).
   */
  private def shouldDownMember(member: Member): Boolean = {
    true
  }
}

Registering Custom Provider

akka.cluster.downing-provider-class = "com.example.CustomDowningProvider"

SBR Monitoring and Observability

SBR Decision Logging

// SBR logs decisions at INFO level
// Example log messages:
// "SBR is downing [Member(akka://sys@host1:2551, Up)] in partition [...]"
// "SBR is keeping partition [Member(akka://sys@host2:2551, Up), ...]"

Monitoring SBR Events

import akka.cluster.ClusterEvent._

/**
 * Example monitoring actor that logs `MemberDowned` and `MemberRemoved`
 * cluster events so they can be forwarded to alerting or dashboards.
 */
class SBRMonitor extends Actor with ActorLogging {
  val cluster = Cluster(context.system)

  override def preStart(): Unit =
    cluster.subscribe(self, classOf[MemberDowned], classOf[MemberRemoved])

  // Drop the subscription when the actor stops
  override def postStop(): Unit =
    cluster.unsubscribe(self)

  def receive: Receive = {
    case _: CurrentClusterState =>
      // The default subscription mode delivers a state snapshot first;
      // this monitor only cares about subsequent events.
      ()

    case MemberDowned(member) =>
      log.warning("Member downed by SBR: {}", member)
      // Send alert/metric

    case MemberRemoved(member, previousStatus) =>
      if (previousStatus == MemberStatus.Down) {
        log.info("Previously downed member removed: {}", member)
        // Update monitoring dashboard
      }
  }
}

Health Check Integration

import akka.http.scaladsl.server.Route
import akka.http.scaladsl.server.Directives._

/**
 * Health-check route for `GET /health`.
 *
 * Responds `200 OK` with "healthy" when this node sees no unreachable members,
 * and `503 Service Unavailable` with a short description otherwise. Probes
 * that only inspect the status code (load balancers, orchestrators) can then
 * detect the unhealthy state.
 */
def healthRoute: Route = {
  // Local import keeps the snippet self-contained.
  import akka.http.scaladsl.model.StatusCodes

  path("health") {
    get {
      val cluster = Cluster(system)
      val unreachableCount = cluster.state.unreachable.size

      if (unreachableCount == 0) {
        complete("healthy")
      } else {
        // Signal failure through the status code, not only the body text.
        complete(StatusCodes.ServiceUnavailable, s"unhealthy: $unreachableCount unreachable members")
      }
    }
  }
}

Production Best Practices

Strategy Selection Guidelines

Keep Majority:

  • Best for most scenarios
  • Good balance of availability and consistency
  • Works well with odd number of nodes

Lease Majority:

  • Use with external coordination systems (Kubernetes, etcd)
  • Provides strongest consistency guarantees
  • Requires reliable lease implementation

Static Quorum:

  • Use when minimum cluster size is known
  • Good for clusters with well-defined capacity requirements
  • May result in multiple surviving partitions

Keep Oldest:

  • Use when one node has special significance
  • Deterministic but potentially less available
  • Good for master/worker patterns

Configuration Recommendations

# Production configuration
akka.cluster.split-brain-resolver {
  active-strategy = "keep-majority"
  stable-after = 30s  # Allow time for transient network issues
  down-all-when-unstable = "on"  # Prevent brain-dead cluster states
  
  keep-majority {
    # Use role-based majority for heterogeneous clusters
    role = "core"
  }
}

Operational Considerations

// Cluster extension for this actor system
val cluster = Cluster(system)

// Warn when this node currently sees unreachable members
val unreachableMembers = cluster.state.unreachable
if (!unreachableMembers.isEmpty) {
  val unreachableAddresses = unreachableMembers.map(_.address).mkString(", ")
  log.warning("Unreachable members detected: {}", unreachableAddresses)
}

// Compare the number of Up members with the application's minimum
val minimumRequired = 3 // Your application's minimum
val memberCount = cluster.state.members.count(member => member.status == MemberStatus.Up)

if (minimumRequired > memberCount) {
  log.error("Cluster size {} below minimum required {}", memberCount, minimumRequired)
  // Consider alerting or graceful degradation
}

Testing SBR Strategies

// Use MultiNodeSpec for testing split brain scenarios
/**
 * Multi-node test verifying that the keep-majority strategy keeps the larger
 * side of a 3+2 partition and downs the smaller side.
 */
class SplitBrainResolverSpec extends MultiNodeSpec(SplitBrainConfig) {

  "Split Brain Resolver" should {
    "down minority partition in keep-majority strategy" in {
      // Create 5-node cluster
      awaitClusterUp(first, second, third, fourth, fifth)

      val majoritySide = List(first, second, third)
      val minoritySide = List(fourth, fifth)

      // Partition cluster into 3+2.
      // Conductor commands must be issued from the controller node only
      // (assumes `first` is the controller, i.e. the first declared role — TODO confirm),
      // and each returned future must be awaited so the link is really cut.
      runOn(first) {
        for {
          kept <- majoritySide
          cut <- minoritySide
        } testConductor.blackhole(kept, cut, Direction.Both).await
      }
      // No node starts asserting before the partition is fully in place.
      enterBarrier("partitioned")

      // Verify majority partition (first, second, third) survives
      runOn(majoritySide: _*) {
        within(30.seconds) {
          awaitAssert {
            cluster.state.members.size should be(3)
            cluster.state.unreachable should be(empty)
          }
        }
      }

      // Verify minority partition (fourth, fifth) is downed
      runOn(minoritySide: _*) {
        within(30.seconds) {
          awaitAssert {
            cluster.isTerminated should be(true)
          }
        }
      }
    }
  }
}