Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 31 additions & 5 deletions internal/cli/resume.go
Original file line number Diff line number Diff line change
Expand Up @@ -752,10 +752,30 @@ func (g *ghOpsAdapter) MergePR(repoDir string, prNumber int) error {
return vxdgit.MergePR(repoDir, prNumber)
}

// recoverOrphanedStories finds stories stuck in "in_progress" with no live
// tmux session and a worktree containing committed work. It returns ActiveAgent
// entries that the monitor will immediately detect as terminated, routing them
// through postExecutionPipeline (review → QA → merge).
// isRecoverableStalledStatus tells the resume path whether a story sitting in
// the given status should be pushed back through the post-execution pipeline
// when it has committed work in its worktree but no live agent session.
//
// Three statuses qualify:
//
//   - "in_progress": the agent died part-way through implementation.
//   - "review" / "qa": the agent finished (STORY_COMPLETED projects to
//     "review"), but the monitor was killed — e.g. by a session-limit pause —
//     before review → QA → merge could run.
//
// Stories left in the latter two states are not dispatchable (they are not
// "draft") and never merge on their own, so every dependent story stays
// blocked. Treating them as recoverable lets an interrupted build resume
// instead of stalling with "no stories ready for dispatch".
//
// Matching is exact and case-sensitive; any other status returns false.
func isRecoverableStalledStatus(status string) bool {
	return status == "in_progress" || status == "review" || status == "qa"
}

// recoverOrphanedStories finds stories stalled mid-pipeline (agent finished or
// died) with no live tmux session and a worktree containing committed work. It
// returns ActiveAgent entries that the monitor will immediately detect as
// terminated, routing them through postExecutionPipeline (review → QA → merge).
func recoverOrphanedStories(stories []state.Story, proj *state.SQLiteStore, cfg config.Config) []engine.ActiveAgent {
worktreeBase := filepath.Join(expandHome(cfg.Workspace.StateDir), "worktrees")

Expand All @@ -775,7 +795,7 @@ func recoverOrphanedStories(stories []state.Story, proj *state.SQLiteStore, cfg

var orphans []engine.ActiveAgent
for _, story := range stories {
if story.Status != "in_progress" {
if !isRecoverableStalledStatus(story.Status) {
continue
}

Expand All @@ -792,6 +812,12 @@ func recoverOrphanedStories(stories []state.Story, proj *state.SQLiteStore, cfg

if ag, ok := agentByID[story.AgentID]; ok {
if ag.SessionName != "" {
// If the real agent session is still alive, the story is
// genuinely in flight — leave it to the live monitor rather
// than yanking it into post-execution.
if tmux.SessionExists(ag.SessionName) {
continue
}
sessionName = ag.SessionName
}
if ag.Runtime != "" {
Expand Down
22 changes: 21 additions & 1 deletion internal/state/sqlite.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,14 @@ func (s *SQLiteStore) Project(evt Event) error {
case EventStoryRejected:
return s.updateStoryStatus(evt.StoryID, "draft")
case EventStoryReset:
return s.updateStoryStatus(evt.StoryID, "draft")
// A reset returns the story to draft AND zeroes the cached escalation
// tier so the dispatcher routes it fresh (the event-sourced CurrentTier
// already scopes escalations to the latest reset; this keeps the
// denormalized column consistent so routeStory doesn't see a stale tier).
if err := s.updateStoryStatus(evt.StoryID, "draft"); err != nil {
return err
}
return s.updateStoryEscalationTier(evt.StoryID, 0)

case EventStoryEscalated:
return s.projectStoryEscalated(evt, payload)
Expand Down Expand Up @@ -732,6 +739,19 @@ func (s *SQLiteStore) ArchiveStoriesByReq(reqID string) error {
return err
}

// updateStoryEscalationTier writes tier into the denormalized escalation_tier
// column of the stories row identified by storyID, and bumps updated_at to the
// database's current timestamp in the same statement.
//
// STORY_RESET uses it with tier 0 so the dispatcher routes the story fresh
// after a transient-failure recovery. Any error from the database is wrapped
// with the context "update story escalation_tier".
func (s *SQLiteStore) updateStoryEscalationTier(storyID string, tier int) error {
	const query = `UPDATE stories SET escalation_tier = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?`

	// Placeholder order matters: tier binds to escalation_tier, storyID to id.
	_, err := s.db.Exec(query, tier, storyID)
	if err != nil {
		return fmt.Errorf("update story escalation_tier: %w", err)
	}
	return nil
}

func (s *SQLiteStore) projectStoryEscalated(evt Event, payload map[string]any) error {
fromTier := payloadInt(payload, "from_tier")
toTier := payloadInt(payload, "to_tier")
Expand Down
Loading