Skip to content

Commit 0f6818f

Browse files
auricomclaude
andcommitted
fix(raft): address coderabbitai feedback — ShutdownTimeout clamp, transfer error propagation, deterministic test
ShutdownTimeout zero-value panic (critical): NewNode now clamps ShutdownTimeout to 5*SendTimeout when the caller passes zero, preventing a panic in time.NewTicker inside waitForMsgsLanded. The normal path through initRaftNode already sets it explicitly; this guard protects direct callers (e.g. tests) that omit the field. Leadership transfer error propagation (major): When store-lag abdication calls leadershipTransfer() and it fails, the error is now returned instead of being logged and silently continuing. Continuing after a failed transfer left the node as leader-without-worker, stalling the cluster. Deterministic abdication test (major): Replace time.Sleep(10ms) + t.Fatal-in-goroutine with channel-based synchronization: leader runFn signals leaderStarted; the test goroutine waits up to 50ms for that signal and calls t.Error (safe from goroutines) if it arrives, then cancels the context either way. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 7e09507 commit 0f6818f

3 files changed

Lines changed: 26 additions & 4 deletions

File tree

pkg/raft/election.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ func (d *DynamicLeaderElection) Run(ctx context.Context) error {
146146
Msg("became leader but store is significantly behind raft state; abdicating to prevent stalled block production")
147147
if tErr := d.node.leadershipTransfer(); tErr != nil {
148148
d.logger.Error().Err(tErr).Msg("leadership transfer failed after store-lag abdication")
149+
return fmt.Errorf("leadership transfer failed after store-lag abdication: %w", tErr)
149150
}
150151
continue
151152
}

pkg/raft/election_test.go

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,15 @@ func TestDynamicLeaderElectionRun(t *testing.T) {
241241
startedCh: fStarted,
242242
isSyncedFn: func(*RaftBlockState) (int, error) { return -5, nil },
243243
}
244-
// Leader must never start
244+
// Signal if leader ever starts — it must not.
245+
leaderStarted := make(chan struct{}, 1)
245246
leader := &testRunnable{runFn: func(ctx context.Context) error {
246-
t.Fatal("leader should not start when store is significantly behind raft")
247-
return nil
247+
select {
248+
case leaderStarted <- struct{}{}:
249+
default:
250+
}
251+
<-ctx.Done()
252+
return ctx.Err()
248253
}}
249254

250255
logger := zerolog.Nop()
@@ -257,7 +262,14 @@ func TestDynamicLeaderElectionRun(t *testing.T) {
257262
leaderCh <- false
258263
<-fStarted
259264
leaderCh <- true
260-
time.Sleep(10 * time.Millisecond)
265+
// Wait for abdication to complete (transfer + continue) then verify
266+
// the leader was never started before cancelling.
267+
select {
268+
case <-leaderStarted:
269+
t.Error("leader should not start when store is significantly behind raft")
270+
case <-time.After(50 * time.Millisecond):
271+
// leadership transferred without starting leader — expected
272+
}
261273
cancel()
262274
}()
263275
return d, ctx, cancel

pkg/raft/node.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,15 @@ func buildRaftConfig(cfg *Config, logger zerolog.Logger) *raft.Config {
8282
}
8383

8484
func NewNode(cfg *Config, logger zerolog.Logger) (*Node, error) {
85+
// Clamp ShutdownTimeout so waitForMsgsLanded never receives a zero or
86+
// negative interval (which would panic in time.NewTicker). Callers such as
87+
// initRaftNode already set this, but direct callers in tests may not.
88+
if cfg.ShutdownTimeout <= 0 {
89+
cfgCopy := *cfg
90+
cfgCopy.ShutdownTimeout = 5 * cfg.SendTimeout
91+
cfg = &cfgCopy
92+
}
93+
8594
if err := os.MkdirAll(cfg.RaftDir, 0755); err != nil {
8695
return nil, fmt.Errorf("create raft dir: %w", err)
8796
}

0 commit comments

Comments
 (0)