feat(code/engine): Buffer consensus messages during startup and recovery (#860)

romac · web-flow · commit a8be71757b35 · 2025-02-19T15:33:25.000Z
* feat(engine): Buffer consensus messages during startup and recovery

This commit introduces message buffering for the consensus engine during startup and
recovery phases. This is an important improvement to handle messages that arrive
before the consensus engine is fully started or while it is recovering state via the WAL.

- Add a `VecDeque` buffer to store messages that arrive during `Unstarted`/`Recovering` phases
- Implement a message filtering function to determine which messages should be buffered
- Add processing of buffered messages once consensus starts
- Set a maximum buffer size of 1024 messages to prevent unbounded memory growth

Messages that should be buffered include votes, proposals, proposal parts, and elapsed timeouts.
Control messages like `StartHeight` and `GetStatus`, as well as network
connection events, are processed immediately.

The buffered messages are replayed in order, oldest first, once:
1. The WAL has been checked and replayed
2. The consensus engine is about to transition to the `Running` phase

The buffer size is capped at 1024 messages to prevent memory exhaustion attacks.
Messages beyond this limit are dropped with a warning log.

* Buffer elapsed timeouts as well

This does not really matter, since no timeout can actually expire while we are replaying the WAL,
but buffering them as well is less misleading.
diff --git a/code/Cargo.lock b/code/Cargo.lock
diff --git a/code/crates/engine/Cargo.toml b/code/crates/engine/Cargo.toml
@@ -26,6 +26,7 @@ malachitebft-sync.workspace = true
 malachitebft-wal.workspace = true
 
 async-trait = { workspace = true }
+async-recursion = { workspace = true }
 bytes = { workspace = true, features = ["serde"] }
 byteorder = { workspace = true }
 derive-where = { workspace = true }
diff --git a/code/crates/engine/src/consensus.rs b/code/crates/engine/src/consensus.rs
@@ -2,11 +2,13 @@ use std::collections::BTreeSet;
 use std::sync::Arc;
 use std::time::Duration;
 
+use async_recursion::async_recursion;
 use async_trait::async_trait;
+use derive_where::derive_where;
 use eyre::eyre;
 use ractor::{Actor, ActorProcessingErr, ActorRef, RpcReplyPort};
 use tokio::time::Instant;
-use tracing::{debug, error, info, warn};
+use tracing::{debug, error, error_span, info, warn};
 
 use malachitebft_codec as codec;
 use malachitebft_config::TimeoutConfig;
@@ -27,6 +29,7 @@ use crate::network::{NetworkEvent, NetworkMsg, NetworkRef, Status};
 use crate::sync::Msg as SyncMsg;
 use crate::sync::SyncRef;
 use crate::util::events::{Event, TxEvent};
+use crate::util::msg_buffer::MessageBuffer;
 use crate::util::streaming::StreamMessage;
 use crate::util::timers::{TimeoutElapsed, TimerScheduler};
 use crate::wal::{Msg as WalMsg, WalEntry, WalRef};
@@ -80,6 +83,7 @@ where
 
 pub type ConsensusMsg<Ctx> = Msg<Ctx>;
 
+#[derive_where(Debug)]
 pub enum Msg<Ctx: Context> {
     /// Start consensus for the given height with the given validator set
     StartHeight(Ctx::Height, Ctx::ValidatorSet),
@@ -164,6 +168,10 @@ enum Phase {
     Recovering,
 }
 
+/// Maximum number of messages to buffer while consensus is
+/// in the `Unstarted` or `Recovering` phase
+const MAX_BUFFER_SIZE: usize = 1024; // passed to `MessageBuffer::new`; once the buffer holds this many messages, new ones are dropped
+
 pub struct State<Ctx: Context> {
     /// Scheduler for timers
     timers: Timers,
@@ -179,6 +187,10 @@ pub struct State<Ctx: Context> {
 
     /// The current phase
     phase: Phase,
+
+    /// A buffer of messages that were received while
+    /// consensus was `Unstarted` or in the `Recovering` phase
+    msg_buffer: MessageBuffer<Ctx>,
 }
 
 impl<Ctx> State<Ctx>
@@ -251,6 +263,23 @@ where
         )
     }
 
+    #[async_recursion] // mutual recursion: `handle_msg`, awaited below, itself calls this fn
+    async fn process_buffered_msgs(&self, myself: &ActorRef<Msg<Ctx>>, state: &mut State<Ctx>) { // replays `state.msg_buffer`, oldest message first
+        if state.msg_buffer.is_empty() { // nothing buffered: return without logging
+            return;
+        }
+
+        info!(count = %state.msg_buffer.len(), "Replaying buffered messages");
+
+        while let Some(msg) = state.msg_buffer.pop() { // `pop` takes from the front of the queue, so the buffer is empty when the loop ends
+            info!("Replaying buffered message: {msg:?}");
+
+            if let Err(e) = self.handle_msg(myself.clone(), state, msg).await { // goes straight to `handle_msg`, skipping the buffering check done in `handle`
+                error!("Error when handling buffered message: {e:?}"); // a failing message is only logged; the loop continues with the next one
+            }
+        }
+    }
+
     async fn handle_msg(
         &self,
         myself: ActorRef<Msg<Ctx>>,
@@ -259,8 +288,6 @@ where
     ) -> Result<(), ActorProcessingErr> {
         match msg {
             Msg::StartHeight(height, validator_set) => {
-                state.phase = Phase::Running;
-
                 let result = self
                     .process_input(
                         &myself,
@@ -286,6 +313,10 @@ where
                     error!(%height, "Error when checking and replaying WAL: {e}");
                 }
 
+                self.process_buffered_msgs(&myself, state).await;
+
+                state.phase = Phase::Running;
+
                 Ok(())
             }
 
@@ -607,8 +638,6 @@ where
                     error!(%height, "Failed to replay WAL entries: {e}");
                     self.tx_event.send(|| Event::WalReplayError(Arc::new(e)));
                 }
-
-                state.phase = Phase::Running;
             }
             Err(e) => {
                 error!(%height, "Error when notifying WAL of started height: {e}");
@@ -1103,6 +1132,7 @@ where
             consensus: ConsensusState::new(self.ctx.clone(), self.params.clone()),
             connected_peers: BTreeSet::new(),
             phase: Phase::Unstarted,
+            msg_buffer: MessageBuffer::new(MAX_BUFFER_SIZE),
         })
     }
 
@@ -1129,7 +1159,13 @@ where
         msg: Msg<Ctx>,
         state: &mut State<Ctx>,
     ) -> Result<(), ActorProcessingErr> {
-        if let Err(e) = self.handle_msg(myself, state, msg).await {
+        if state.phase != Phase::Running && should_buffer(&msg) {
+            let _span = error_span!("buffer", phase = ?state.phase).entered();
+            state.msg_buffer.buffer(msg);
+            return Ok(());
+        }
+
+        if let Err(e) = self.handle_msg(myself.clone(), state, msg).await {
             error!("Error when handling message: {e:?}");
         }
 
@@ -1142,9 +1178,18 @@ where
         state: &mut State<Ctx>,
     ) -> Result<(), ActorProcessingErr> {
         info!("Stopping...");
-
         state.timers.cancel_all();
-
         Ok(())
     }
 }
+
+fn should_buffer<Ctx: Context>(msg: &Msg<Ctx>) -> bool { // whether `msg` is held in the buffer while the phase is not `Running`
+    !matches!( // negated match: the variants below are handled immediately, every other message is buffered
+        msg,
+        Msg::StartHeight(..) // NOTE(review): its handler appears to be what replays the buffer — confirm against `handle_msg`
+            | Msg::GetStatus(..)
+            | Msg::NetworkEvent(NetworkEvent::Listening(..)) // only these three network events bypass the buffer
+            | Msg::NetworkEvent(NetworkEvent::PeerConnected(..))
+            | Msg::NetworkEvent(NetworkEvent::PeerDisconnected(..))
+    )
+}
diff --git a/code/crates/engine/src/util/mod.rs b/code/crates/engine/src/util/mod.rs
@@ -1,4 +1,5 @@
 pub mod events;
+pub mod msg_buffer;
 pub mod streaming;
 pub mod ticker;
 pub mod timers;
diff --git a/code/crates/engine/src/util/msg_buffer.rs b/code/crates/engine/src/util/msg_buffer.rs
@@ -0,0 +1,43 @@
+use std::collections::VecDeque;
+
+use malachitebft_core_types::Context;
+use tracing::{info, warn};
+
+use crate::consensus::ConsensusMsg;
+
+pub struct MessageBuffer<Ctx: Context> { // bounded first-in, first-out queue of consensus messages
+    messages: VecDeque<ConsensusMsg<Ctx>>, // pending messages; `buffer` pushes at the back, `pop` takes from the front
+    max_size: usize, // upper bound on `messages.len()`, enforced by `buffer`
+}
+
+impl<Ctx: Context> MessageBuffer<Ctx> {
+    pub fn new(max_size: usize) -> Self { // creates an empty buffer that will hold at most `max_size` messages
+        Self {
+            messages: VecDeque::new(),
+            max_size,
+        }
+    }
+
+    pub fn buffer(&mut self, msg: ConsensusMsg<Ctx>) -> bool { // enqueues `msg`; returns `true` if stored, `false` if it was dropped
+        if self.messages.len() < self.max_size { // there is still room
+            info!("Buffering message: {msg:?}");
+            self.messages.push_back(msg);
+            true
+        } else { // at capacity: the incoming message is dropped, already-buffered ones are kept
+            warn!("Buffer is full, dropping message: {msg:?}");
+            false
+        }
+    }
+
+    pub fn pop(&mut self) -> Option<ConsensusMsg<Ctx>> { // removes and returns the oldest message, or `None` when empty
+        self.messages.pop_front()
+    }
+
+    pub fn is_empty(&self) -> bool { // `true` when no message is currently buffered
+        self.messages.is_empty()
+    }
+
+    pub fn len(&self) -> usize { // number of messages currently buffered
+        self.messages.len()
+    }
+}