State management in Spark Streaming allows applications to maintain state information across streaming batches. This is essential for use cases like session tracking, running aggregations, and maintaining counters that persist beyond individual batch boundaries.
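The examples throughout this section assume a running StreamingContext, referred to as ssc. A minimal sketch of that setup, assuming a socket text source and illustrative host, port, and checkpoint path:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setAppName("StatefulStreamingApp").setMaster("local[2]")
val ssc = new StreamingContext(conf, Seconds(10))
ssc.checkpoint("hdfs://namenode:8020/checkpoint-dir") // required for stateful operations

val lines = ssc.socketTextStream("localhost", 9999)
// ... stateful operations defined below ...
ssc.start()
ssc.awaitTermination()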
updateStateByKey is the traditional stateful operation: it maintains state across batches by applying a user-supplied update function to each key.
/**
* Update state by key across batches using an update function
* @param updateFunc - Function that takes new values and current state, returns updated state
* @returns DStream of (key, state) pairs
*/
def updateStateByKey[S: ClassTag](
updateFunc: (Seq[V], Option[S]) => Option[S]
): DStream[(K, S)]
/**
* Update state by key with custom number of partitions
* @param updateFunc - Function that takes new values and current state, returns updated state
* @param numPartitions - Number of partitions for state storage
* @returns DStream of (key, state) pairs
*/
def updateStateByKey[S: ClassTag](
updateFunc: (Seq[V], Option[S]) => Option[S],
numPartitions: Int
): DStream[(K, S)]
/**
* Update state by key with custom partitioner
* @param updateFunc - Function that takes new values and current state, returns updated state
* @param partitioner - Custom partitioner for state distribution
* @returns DStream of (key, state) pairs
*/
def updateStateByKey[S: ClassTag](
updateFunc: (Seq[V], Option[S]) => Option[S],
partitioner: Partitioner
): DStream[(K, S)]
/**
* Update state by key with initial state RDD
* @param updateFunc - State update function
* @param partitioner - Partitioner for state distribution
* @param initialRDD - RDD containing initial state for keys
* @returns DStream of (key, state) pairs
*/
def updateStateByKey[S: ClassTag](
updateFunc: (Seq[V], Option[S]) => Option[S],
partitioner: Partitioner,
initialRDD: RDD[(K, S)]
): DStream[(K, S)]
Usage Examples:
val wordPairs = lines.flatMap(_.split(" ")).map((_, 1))
// Running word count
val runningCounts = wordPairs.updateStateByKey[Int] { (values, state) =>
val currentCount = state.getOrElse(0)
val newCount = currentCount + values.sum
if (newCount == 0) None else Some(newCount)
}
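DStream transformations are lazy; an output operation is needed to trigger execution. For example:
// Print the first ten counts of each batch to the driver log
runningCounts.print()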
// Session tracking with timeout
val sessionStream = events.map(event => (event.userId, event.timestamp))
val sessions = sessionStream.updateStateByKey[Long] { (timestamps, lastSeen) =>
  val now = System.currentTimeMillis()
  // The update function also runs for keys with no new values in a batch,
  // so fall back to the previous last-seen time when timestamps is empty
  val latest = if (timestamps.nonEmpty) timestamps.max else lastSeen.getOrElse(now)
  // Timeout sessions after 30 minutes of inactivity
  if (now - latest > 30 * 60 * 1000) {
    None // Remove inactive session
  } else {
    Some(latest) // Update last seen time
  }
}
mapWithState is a more advanced stateful operation, providing more efficient state management and additional features such as per-key timeouts and state snapshots.
/**
* Transform stream with state using StateSpec configuration
* @param spec - StateSpec defining state mapping behavior
* @returns MapWithStateDStream for advanced state operations
*/
def mapWithState[StateType: ClassTag, MappedType: ClassTag](
spec: StateSpec[K, V, StateType, MappedType]
): MapWithStateDStream[K, V, StateType, MappedType]
StateSpec is the configuration object for mapWithState operations, providing fine-grained control over state behavior.
/**
* StateSpec factory methods and configuration
*/
object StateSpec {
/**
* Create StateSpec with mapping function
* @param mappingFunction - Function to map (key, value, state) to output
* @returns StateSpec for use with mapWithState
*/
def function[KeyType, ValueType, StateType, MappedType](
mappingFunction: (KeyType, Option[ValueType], State[StateType]) => Option[MappedType]
): StateSpec[KeyType, ValueType, StateType, MappedType]
}
abstract class StateSpec[KeyType, ValueType, StateType, MappedType] {
/**
* Set initial state from RDD
* @param rdd - RDD containing initial (key, state) pairs
* @returns This StateSpec for method chaining
*/
def initialState(rdd: RDD[(KeyType, StateType)]): this.type
/**
* Set initial state from Java PairRDD
* @param javaPairRDD - Java PairRDD containing initial state
* @returns This StateSpec for method chaining
*/
def initialState(javaPairRDD: JavaPairRDD[KeyType, StateType]): this.type
/**
* Set number of partitions for state storage
* @param numPartitions - Number of partitions
* @returns This StateSpec for method chaining
*/
def numPartitions(numPartitions: Int): this.type
/**
* Set custom partitioner for state distribution
* @param partitioner - Custom partitioner
* @returns This StateSpec for method chaining
*/
def partitioner(partitioner: Partitioner): this.type
/**
* Set timeout for inactive keys
* @param idleDuration - Duration after which inactive keys are timed out
* @returns This StateSpec for method chaining
*/
def timeout(idleDuration: Duration): this.type
}
State is the interface for accessing and modifying per-key state within mapWithState operations.
/**
* State access object for mapWithState operations
*/
abstract class State[S] {
/**
* Check if state exists for current key
* @returns true if state exists, false otherwise
*/
def exists(): Boolean
/**
* Get current state value
* @returns Current state value (throws exception if state doesn't exist)
*/
def get(): S
/**
* Get current state as Option
* @returns Some(state) if exists, None otherwise
*/
def getOption(): Option[S]
/**
* Update state with new value
* @param newState - New state value to set
*/
def update(newState: S): Unit
/**
* Remove state for current key
*/
def remove(): Unit
/**
* Check if this key is timing out in current batch
* @returns true if key is timing out, false otherwise
*/
def isTimingOut(): Boolean
}
Usage Examples:
// Advanced word counting with mapWithState
val wordCounts = wordPairs.mapWithState(
StateSpec.function((word: String, count: Option[Int], state: State[Int]) => {
val currentCount = state.getOption().getOrElse(0)
val newCount = currentCount + count.getOrElse(0)
state.update(newCount)
Some((word, newCount)) // Output current word and count
})
)
// Session tracking with timeout
val userSessions = userEvents.mapWithState(
StateSpec
.function((userId: String, event: Option[UserEvent], state: State[SessionInfo]) => {
if (state.isTimingOut()) {
// Session is timing out, emit final session info.
// Spark removes timed-out state automatically; calling remove() here would throw.
val sessionInfo = state.get()
Some(SessionEnded(userId, sessionInfo))
} else {
event match {
case Some(evt) =>
val sessionInfo = state.getOption().getOrElse(SessionInfo.empty)
val updatedSession = sessionInfo.addEvent(evt)
state.update(updatedSession)
Some(SessionUpdate(userId, updatedSession))
case None => None
}
}
})
.timeout(Minutes(30)) // Timeout inactive sessions after 30 minutes
)
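UserEvent, SessionInfo, SessionUpdate, and SessionEnded above are application-defined types, not part of Spark. A minimal sketch of what they might look like (names and fields are illustrative):
// Hypothetical domain types backing the session-tracking example
case class UserEvent(userId: String, action: String, timestamp: Long)

case class SessionInfo(events: List[UserEvent], lastSeen: Long) {
  def addEvent(e: UserEvent): SessionInfo =
    SessionInfo(e :: events, math.max(lastSeen, e.timestamp))
}
object SessionInfo {
  val empty: SessionInfo = SessionInfo(Nil, 0L)
}

// Common supertype so both branches of the mapping function share a result type
sealed trait SessionOutput
case class SessionUpdate(userId: String, session: SessionInfo) extends SessionOutput
case class SessionEnded(userId: String, session: SessionInfo) extends SessionOutput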
// Complex aggregation state
case class AggregateState(count: Long, sum: Double, max: Double, min: Double) {
  def avg: Double = if (count == 0) 0.0 else sum / count
}
val aggregates = measurements.mapWithState(
StateSpec
.function((sensorId: String, value: Option[Double], state: State[AggregateState]) => {
value match {
case Some(v) =>
val current = state.getOption().getOrElse(AggregateState(0, 0.0, Double.MinValue, Double.MaxValue))
val updated = AggregateState(
count = current.count + 1,
sum = current.sum + v,
max = math.max(current.max, v),
min = math.min(current.min, v)
)
state.update(updated)
Some((sensorId, updated, updated.avg)) // emit updated state plus running average
case None => None
}
})
.initialState(ssc.sparkContext.parallelize(initialAggregates)) // initialAggregates: Seq[(String, AggregateState)]
.numPartitions(4)
)
MapWithStateDStream, the stream returned by mapWithState, provides additional operations.
abstract class MapWithStateDStream[K, V, S, E] extends DStream[E] {
/**
* Get stream of state snapshots (all current key-state pairs)
* @returns DStream of (key, state) pairs representing current state
*/
def stateSnapshots(): DStream[(K, S)]
}
Usage Examples:
val statefulStream = wordPairs.mapWithState(/* StateSpec */)
// Get periodic snapshots of all state
val stateSnapshots = statefulStream.stateSnapshots()
// Save state snapshots periodically
stateSnapshots.foreachRDD { rdd =>
rdd.saveAsTextFile(s"hdfs://state-backup/${System.currentTimeMillis()}")
}
// Monitor state size
stateSnapshots.foreachRDD { rdd =>
val stateSize = rdd.count()
println(s"Current state contains $stateSize keys")
}
State operations require checkpointing for fault tolerance:
// State operations require checkpointing
ssc.checkpoint("hdfs://checkpoint-dir")
val statefulStream = keyValueStream.updateStateByKey(updateFunction)
// This will fail at ssc.start() without a checkpoint directory
State operations can consume significant memory:
// Efficient state cleanup
val cleanupState = keyValueStream.updateStateByKey[MyState] { (values, state) =>
val current = state.getOrElse(MyState.empty)
val updated = current.update(values)
// Remove state for inactive keys to save memory
if (updated.shouldRemove()) {
None
} else {
Some(updated)
}
}
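MyState above is a stand-in for an application-defined type, not a Spark class. A minimal sketch of what it might look like (the fields and the one-hour expiry rule are illustrative):
// Hypothetical state type used in the cleanup example, assuming Int values
case class MyState(values: Seq[Int], lastUpdated: Long) {
  def update(newValues: Seq[Int]): MyState =
    MyState(values ++ newValues, System.currentTimeMillis())
  // e.g. remove keys that have not been updated for an hour
  def shouldRemove(): Boolean =
    System.currentTimeMillis() - lastUpdated > 60 * 60 * 1000
}
object MyState {
  val empty: MyState = MyState(Seq.empty, System.currentTimeMillis())
}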
// Use timeout with mapWithState for automatic cleanup
val timeoutState = keyValueStream.mapWithState(
StateSpec
.function(mappingFunction)
.timeout(Minutes(24 * 60)) // Automatically remove state after 24 hours (Spark Streaming has no Hours helper)
)
// Optimize partitioning for state operations
val optimizedState = keyValueStream.updateStateByKey(
updateFunction,
new HashPartitioner(ssc.sparkContext.defaultParallelism * 2) // HashPartitioner takes the partition count directly
)
// Use mapWithState for better performance
val efficientState = keyValueStream.mapWithState(
StateSpec
.function(mappingFunction)
.numPartitions(ssc.sparkContext.defaultParallelism * 2)
)
// Set up initial state from historical data
val historicalData: RDD[(String, Int)] = ssc.sparkContext.textFile("hdfs://historical")
.map(line => {
val parts = line.split(",")
(parts(0), parts(1).toInt)
})
val statefulStream = currentStream.updateStateByKey(
updateFunction,
new HashPartitioner(4),
historicalData
)
// With mapWithState
val mapWithStateStream = currentStream.mapWithState(
StateSpec
.function(mappingFunction)
.initialState(historicalData)
.numPartitions(4)
)
// updateStateByKey - traditional approach
val updateStateApproach = stream.updateStateByKey[Int] { (values, state) =>
val sum = values.sum + state.getOrElse(0)
if (sum == 0) None else Some(sum)
}
// mapWithState - more efficient and flexible
val mapWithStateApproach = stream.mapWithState(
StateSpec.function((key: String, value: Option[Int], state: State[Int]) => {
  if (state.isTimingOut()) {
    // Timed-out state is removed automatically and cannot be updated here
    None
  } else {
    val currentSum = state.getOption().getOrElse(0)
    val newSum = currentSum + value.getOrElse(0)
    if (newSum == 0) {
      state.remove()
      None
    } else {
      state.update(newSum)
      Some((key, newSum))
    }
  }
}).timeout(Minutes(30))
)
Key Differences:
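- updateStateByKey invokes the update function for every key with existing state on every batch, so its cost grows with total state size; mapWithState invokes the mapping function only for keys with new data (plus keys that are timing out), so its cost scales with batch size.
- updateStateByKey emits the full (key, state) stream each batch; mapWithState emits mapped records, with the full state available separately via stateSnapshots().
- mapWithState has built-in timeout support; with updateStateByKey, expiry must be hand-coded in the update function, as in the session example above.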