From d829fccabc1f599dad65e412dd941b4b005251f3 Mon Sep 17 00:00:00 2001 From: elina-chertova Date: Sun, 21 Jun 2026 14:22:18 +0000 Subject: [PATCH] hotblocks: skip stale data packs below the finalized head instead of crash-looping A data source that has fallen behind can re-deliver a pack that lies entirely within the already-finalized region: its blocks end at or below the current finalized head and the finalized head it reports is below ours (or it reports none at all). Because finalized blocks are immutable, this is stale, already-committed data from a lagging endpoint, not a genuine fork. Previously WriteController::new_chunk treated this as an unrecoverable fork and failed with `bail!("can't fork safely ...")`. That error aborts the dataset update task; the controller restarts the task 60s later, the task re-pulls the same stale pack and fails again - an infinite crash-loop. A single behind endpoint thus stalls the whole dataset even though the other endpoints are ahead, and the dataset's lag grows unbounded. Make this case non-fatal: when the incoming pack lies entirely within the finalized region and reports a lower finalized head (or no finalized head), log a warning and skip it (no-op) rather than failing. The existing finalized chain is kept intact - the stale pack is dropped, not accepted as truth - so a lagging data source can no longer crash-loop ingestion. Genuine forks above the finalized head are unaffected. Cause: observed on hotblocks-db-0 for dataset binance-mainnet-traceless, where a lagging upstream (head stuck ~17 min behind at canonical block 105536898) drove the loop while a healthy endpoint was already at 105536909; a cross-check against an independent node confirmed that the lagging pack's data was canonical, i.e. merely stale. 
Falsification: if a stale-below-finalized pack is still delivered, new_chunk now emits "ignoring stale data pack below the current finalized head" and returns Ok; if instead the dataset task keeps logging "can't fork safely" / "dataset update task failed, will restart it in 1 minute", the guard did not cover the case. Co-Authored-By: Claude Opus 4.8 --- .../dataset_controller/write_controller.rs | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/crates/hotblocks/src/dataset_controller/write_controller.rs b/crates/hotblocks/src/dataset_controller/write_controller.rs index 1d74f5c6..269af73e 100644 --- a/crates/hotblocks/src/dataset_controller/write_controller.rs +++ b/crates/hotblocks/src/dataset_controller/write_controller.rs @@ -344,6 +344,27 @@ impl WriteController { finalized_head = valuable(&finalized_head), ))] pub fn new_chunk(&mut self, finalized_head: Option<&BlockRef>, chunk: &StorageChunk) -> anyhow::Result<()> { + // A data source that has fallen behind can re-deliver a pack that lies entirely + // within the already-finalized region: its blocks end at or below our current + // finalized head and the finalized head it reports is below ours. Finalized blocks + // are immutable, so this is stale, already-committed data from a lagging endpoint, + // not a genuine fork. Skip it instead of failing the dataset update task: otherwise + // a single behind endpoint crash-loops the task every minute (see the bail! below) + // and the dataset stops ingesting from the endpoints that are still ahead. 
+ if let Some(current) = self.finalized_head.as_ref() { + let pack_below_finalized = chunk.last_block() <= current.number + && finalized_head.map_or(true, |new| new.number < current.number); + if pack_below_finalized { + warn!( + first_block = chunk.first_block(), + last_block = chunk.last_block(), + current_finalized_head = current.number, + "ignoring stale data pack below the current finalized head (lagging data source)" + ); + return Ok(()); + } + } + // FIXME: accept self.first_block rollback limit let finalized_head = self.db.update_dataset(self.dataset_id, |tx| { let new_finalized_head = match (finalized_head, tx.label().finalized_head()) {