Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions harness/drive_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"strings"
"testing"
"time"

"github.com/mike-diff/sesh/agent"
)
Expand Down Expand Up @@ -345,6 +346,89 @@ func TestInterruptsQueue(t *testing.T) {
}
}

// TestInterruptsCancelInGap: an Escape that lands between iterations, after one
// turn's context is detached and before the next is minted, still aborts the
// next iteration. Without the sticky-abort window fix the cancel hits a nil
// context and vanishes, and the freshly minted context comes back live.
// Breaker: drop the aborted flag (or its consume in turnContext) and the next
// context is not cancelled.
func TestInterruptsCancelInGap(t *testing.T) {
in := &interrupts{}
_, done := in.turnContext() // a turn runs
done() // the worker detaches its context: in.cancel is now nil
in.cancelCurrent() // Escape in the gap: no live context, but the abort must stick

ctx, ndone := in.turnContext() // the next iteration mints a fresh context
defer ndone()
select {
case <-ctx.Done():
// good: the gap-Escape aborted the next iteration before it could run
default:
t.Fatal("an Escape between iterations must cancel the next context, not vanish")
}

// The abort is consumed once: a later request after a reset starts live again.
in.resetAbort()
ctx2, ndone2 := in.turnContext()
defer ndone2()
select {
case <-ctx2.Done():
t.Fatal("a fresh request after reset must not inherit a stale abort")
default:
}
}

// TestInterruptsResetClearsStaleAbort: an Escape that arrives as a drive is
// already stopping must not cancel the next request the user types. resetAbort
// at the top-level boundary clears it. Breaker: drop the resetAbort call and a
// steer-cancel's aborted flag leaks into the next turn's context.
func TestInterruptsResetClearsStaleAbort(t *testing.T) {
in := &interrupts{}
in.cancelCurrent() // an abort with no live turn (the gap, or a steer that just cancelled)
in.resetAbort() // the turn is fully over; the abort is stale
ctx, done := in.turnContext()
defer done()
select {
case <-ctx.Done():
t.Fatal("a reset must clear the abort so the next context starts live")
default:
}
}

// TestInterruptsCtrlCDoublePress: one Ctrl-C warns and does not quit; a second
// within the window force-quits via cleanup. The spy cleanup is the only
// observable effect, so os.Exit stays out of the unit (the signal goroutine and
// the editor hook own the exit at the call boundary). Breaker: drop the
// double-press check (quit on the first press) and the single-press case sees
// cleanup fire; drop the double branch and the second press never fires it.
func TestInterruptsCtrlCDoublePress(t *testing.T) {
var cleaned int
in := newInterruptsTest(func() { cleaned++ })

if forced := in.ctrlC(); forced || cleaned != 0 {
t.Fatalf("first Ctrl-C must warn, not quit: forced=%v cleaned=%d", forced, cleaned)
}
if forced := in.ctrlC(); !forced || cleaned != 1 {
t.Fatalf("second Ctrl-C within the window must quit via cleanup: forced=%v cleaned=%d", forced, cleaned)
}

// A press after the window lapses is a fresh first press: it warns, not quits.
in.last = time.Now().Add(-2 * doublePressWindow)
if forced := in.ctrlC(); forced {
t.Fatal("a Ctrl-C after the window lapses must warn, not quit")
}
if cleaned != 1 {
t.Fatalf("a lapsed press must not re-fire cleanup: cleaned=%d", cleaned)
}
}

// newInterruptsTest builds an interrupts with the given cleanup spy but without
// the signal.Notify wiring of the real constructor, so the unit observes ctrlC
// in isolation (no live OS-signal goroutine, no os.Exit).
func newInterruptsTest(cleanup func()) *interrupts {
return &interrupts{cleanup: cleanup}
}

// TestDriveJudgeUnavailable: no verdict means no mandate to keep spending.
func TestDriveJudgeUnavailable(t *testing.T) {
p := &seqChat{} // judge call errors immediately
Expand Down
83 changes: 71 additions & 12 deletions harness/harness.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,14 @@ var (
yellow string
red string
green string
cyan string
reset string
)

func initColor() {
if useColor() {
dim, yellow, red, green, reset = "\033[2m", "\033[33m", "\033[31m", "\033[32m", "\033[0m"
cyan = "\033[36m"
}
}

Expand Down Expand Up @@ -417,12 +419,31 @@ func Main() {
}()
}

// Escape cancels the running turn; Ctrl-C quits (twice within two seconds,
// after restoring the terminal).
// Escape cancels the running turn; Ctrl-C quits (twice within two seconds).
// The second press is a force quit: restore the terminal first so raw mode
// never leaks, then signal owned process groups best-effort with no grace
// window, then release the lock. The OS reaps any group that has not died
// yet, so a stubborn background process cannot delay the exit. The graceful
// reapAll stays on the clean single-quit and normal-exit paths.
intr := newInterrupts(func() {
con.Close()
r.goodbye()
if r.procs != nil {
r.procs.killAllNow()
}
releaseLock(r.sess.ID)
})
// Extended-keys terminals deliver Ctrl-C as a keystroke, not an OS signal, so
// the editor needs a console-level hook to reach the same quit semantics. It
// is console-level (not per-turn) because Ctrl-C must work at the idle prompt
// and during a turn alike; os.Exit stays here at the boundary, never inside
// ctrlC, so the editor goroutine that calls this never holds its own mutex.
if t, ok := con.(*tuiConsole); ok {
t.onCtrlC = func() {
if intr.ctrlC() {
os.Exit(130)
}
}
}

say := func(f string, a ...any) {
emit("%s "+f+"%s\n", append(append([]any{dim}, a...), reset)...)
Expand Down Expand Up @@ -490,6 +511,7 @@ func Main() {
tc.attendTurn(turnAttend{done: doneCh, cancel: intr.cancelCurrent, queue: intr.enqueue})
<-doneCh // the worker is finished (attendTurn can also return on EOF); never overlap turns
stopSpin()
intr.resetAbort() // the turn is over; an Escape now is for the next prompt, not a stale carry-over
if q := intr.drain(); len(q) > 0 { // typed after the last boundary: run next
pending = strings.Join(q, "\n")
}
Expand All @@ -514,6 +536,7 @@ func Main() {
type interrupts struct {
mu sync.Mutex
cancel context.CancelFunc // non-nil while a turn is running
aborted bool // an Escape landed between iterations; the next context starts cancelled
last time.Time
cleanup func()
queued []string // messages typed during a turn, drained at the next boundary
Expand All @@ -529,25 +552,45 @@ func newInterrupts(cleanup func()) *interrupts {
sigc := make(chan os.Signal, 1)
signal.Notify(sigc, os.Interrupt)
go func() {
// Some terminals deliver Ctrl-C as an OS signal; others (extended-keys
// mode) deliver it as a keystroke that the editor routes to ctrlC. Both
// triggers share this one body, and os.Exit lives at the call boundary so
// ctrlC itself stays unit-testable around the spy-observable cleanup.
for range sigc {
in.mu.Lock()
double := time.Since(in.last) < doublePressWindow
in.last = time.Now()
in.mu.Unlock()
if double {
in.cleanup()
if in.ctrlC() {
os.Exit(130)
}
emit("\n%s (ctrl-c again to quit; esc cancels the turn)%s\n", yellow, reset)
}
}()
return in
}

// cancelCurrent aborts the turn in flight, if any (the live editor's Escape).
// ctrlC applies one Ctrl-C press: a stray press warns, a second within the
// window force-quits via cleanup. It reports whether this press force-quit so
// the caller can os.Exit(130) at the boundary, keeping os.Exit out of the unit
// under test. Both the signal goroutine and the live editor call this.
func (in *interrupts) ctrlC() bool {
in.mu.Lock()
double := time.Since(in.last) < doublePressWindow
in.last = time.Now()
in.mu.Unlock()
if double {
in.cleanup()
return true
}
emit("\n%s (ctrl-c again to quit; esc cancels the turn)%s\n", yellow, reset)
return false
}

// cancelCurrent aborts the turn in flight (the live editor's Escape). It also
// records the abort even when no context is live: between iterations the worker
// detaches one context before the next is minted, and an Escape in that window
// would otherwise vanish. The sticky flag makes the next turnContext start
// already cancelled, so the abort reaches the iteration it was meant for.
func (in *interrupts) cancelCurrent() {
in.mu.Lock()
c := in.cancel
in.aborted = true
in.mu.Unlock()
if c != nil {
c()
Expand All @@ -571,16 +614,32 @@ func (in *interrupts) drain() []string {
}

// turnContext hands out a cancellable context for one turn and the cleanup
// that detaches it from the watcher.
// that detaches it from the watcher. An Escape that arrived between iterations
// (the sticky aborted flag) cancels the fresh context at once, so the abort is
// honored by the iteration it was aimed at instead of being dropped in the gap.
func (in *interrupts) turnContext() (context.Context, func()) {
ctx, cancel := context.WithCancel(context.Background())
in.mu.Lock()
in.cancel = cancel
wasAborted := in.aborted
in.aborted = false
in.mu.Unlock()
if wasAborted {
cancel()
}
return ctx, func() {
in.mu.Lock()
in.cancel = nil
in.mu.Unlock()
cancel()
}
}

// resetAbort clears a stale abort once a turn has fully ended, so an Escape that
// landed as the drive was already stopping cannot cancel the next request the
// user types. Called at the top-level boundary, not inside a drive cycle.
func (in *interrupts) resetAbort() {
in.mu.Lock()
in.aborted = false
in.mu.Unlock()
}
33 changes: 32 additions & 1 deletion harness/proc.go
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,32 @@ func (m *procManager) reapAll() {
os.RemoveAll(runDir(m.sessionID))
}

// killAllNow is the force-quit reaper: SIGTERM then SIGKILL every owned group
// back to back, with no grace window, so a second Ctrl-C exits at once instead
// of blocking on a stubborn process. The signals are sent in the same loop so
// the kernel queues SIGKILL right behind SIGTERM; the caller exits immediately
// after and the OS reaps any group that has not died yet. Returns the number of
// groups it signalled, so a test can prove it acted without sleeping.
func (m *procManager) killAllNow() int {
m.mu.Lock()
procs := append([]*Proc(nil), m.procs...)
m.mu.Unlock()
signalled := 0
for _, p := range procs {
p.mu.Lock()
pgid := p.pgid
dead := p.ended || pgid <= 0
p.mu.Unlock()
if dead {
continue
}
syscall.Kill(-pgid, syscall.SIGTERM)
syscall.Kill(-pgid, syscall.SIGKILL)
signalled++
}
return signalled
}

func (m *procManager) byID(id string) *Proc {
m.mu.Lock()
defer m.mu.Unlock()
Expand Down Expand Up @@ -531,7 +557,12 @@ type procArgs struct {
Filter string `json:"filter"`
}

func (m *procManager) runTool(raw json.RawMessage) (string, bool) {
// runTool dispatches the proc tool. ctx carries the turn's cancellation; every
// action here is a fast bookkeeping call (start spawns and returns, the rest
// only read or signal), so none blocks on it. The parameter keeps the proc
// tool's Run on the same cancellable contract as bash, so a future blocking
// action inherits cancellation instead of having to retrofit the signature.
func (m *procManager) runTool(ctx context.Context, raw json.RawMessage) (string, bool) {
var a procArgs
if err := json.Unmarshal(raw, &a); err != nil {
return "invalid proc input: " + err.Error(), true
Expand Down
41 changes: 35 additions & 6 deletions harness/proc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,34 @@ func TestProcStopKillsGroup(t *testing.T) {
}
}

// TestKillAllNowReturnsPromptly: the force-quit reaper signals every owned group
// and returns without the killGrace sleep, so a second Ctrl-C exits at once even
// with a stubborn process running. Breaker: put a time.Sleep(killGrace) back on
// this path and the elapsed time exceeds the grace window.
func TestKillAllNowReturnsPromptly(t *testing.T) {
t.Setenv("HOME", t.TempDir())
m := newProcManager("scale-force")
// A process that ignores SIGTERM and keeps running, so only SIGKILL ends it:
// the helper must not wait for it to die before returning.
p, _, err := m.start("trap '' TERM; sleep 30", "")
if err != nil {
t.Fatal(err)
}
if !waitFor(2*time.Second, func() bool { return p.pgid > 0 }) {
t.Fatal("process group never came up")
}
start := time.Now()
n := m.killAllNow()
elapsed := time.Since(start)
if n != 1 {
t.Fatalf("must signal the one owned group, got %d", n)
}
if elapsed >= killGrace {
t.Fatalf("force reaper blocked for %v; it must not sleep killGrace (%v)", elapsed, killGrace)
}
m.reapAll() // clean up the run dir and make sure nothing leaks past the test
}

// TestAutoPromote: a command that outlives the promote window becomes a tracked
// background process (handle returned, kept in the registry); a quick command
// returns synchronously and leaves nothing behind. Breaker: revert doBash to
Expand Down Expand Up @@ -311,24 +339,25 @@ func TestForegroundNoTrailingNewline(t *testing.T) {
func TestProcToolDispatch(t *testing.T) {
t.Setenv("HOME", t.TempDir())
m := newProcManager("scale-tool")
out, isErr := m.runTool([]byte(`{"action":"start","command":"echo hello; sleep 30","name":"svc"}`))
ctx := context.Background()
out, isErr := m.runTool(ctx, []byte(`{"action":"start","command":"echo hello; sleep 30","name":"svc"}`))
if isErr || !strings.Contains(out, "started proc-1") {
t.Fatalf("start: %q err=%v", out, isErr)
}
if out, _ := m.runTool([]byte(`{"action":"list"}`)); !strings.Contains(out, "proc-1") || !strings.Contains(out, "svc") {
if out, _ := m.runTool(ctx, []byte(`{"action":"list"}`)); !strings.Contains(out, "proc-1") || !strings.Contains(out, "svc") {
t.Fatalf("list must show the process: %q", out)
}
waitFor(2*time.Second, func() bool {
o, _ := m.runTool([]byte(`{"action":"logs","id":"proc-1"}`))
o, _ := m.runTool(ctx, []byte(`{"action":"logs","id":"proc-1"}`))
return strings.Contains(o, "hello")
})
if out, _ := m.runTool([]byte(`{"action":"logs","id":"proc-1"}`)); strings.Contains(out, "hello") {
if out, _ := m.runTool(ctx, []byte(`{"action":"logs","id":"proc-1"}`)); strings.Contains(out, "hello") {
t.Fatalf("a second logs read must not repeat consumed output: %q", out)
}
if out, isErr := m.runTool([]byte(`{"action":"stop","id":"proc-1"}`)); isErr || !strings.Contains(out, "stopped proc-1") {
if out, isErr := m.runTool(ctx, []byte(`{"action":"stop","id":"proc-1"}`)); isErr || !strings.Contains(out, "stopped proc-1") {
t.Fatalf("stop: %q err=%v", out, isErr)
}
if out, _ := m.runTool([]byte(`{"action":"bogus"}`)); !strings.Contains(out, "unknown proc action") {
if out, _ := m.runTool(ctx, []byte(`{"action":"bogus"}`)); !strings.Contains(out, "unknown proc action") {
t.Fatalf("unknown action must be reported: %q", out)
}
m.reapAll()
Expand Down
Loading
Loading