diff --git a/.gitignore b/.gitignore index 9234779..6bf0a62 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,6 @@ # local agent session state (worktrees, settings) .claude/ .pi/ -.cursor/ \ No newline at end of file +.cursor/ +# Claude Code agent context (ephemeral) +.context/ diff --git a/README.md b/README.md index 6e899e6..242ddd6 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ Then just talk to it: -> Hey sesh, tell me about yourself, how you work, what you can do and what are mods. ``` -Submit `/help` for the key reference. When stdin or stdout is a pipe it falls back to plain line input. Type `exit` to quit; sessions autosave to `~/.sesh/sessions/`. +Submit `/help` for the key reference. Ctrl-V (or Alt-V where a terminal swallows Ctrl-V) pastes a clipboard image, shown inline as `[image-N]` and sent to a vision-capable model. When stdin or stdout is a pipe it falls back to plain line input. Type `exit` to quit; sessions autosave to `~/.sesh/sessions/`. ## The tools diff --git a/agent/agent.go b/agent/agent.go index 82e0733..a12fe36 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -35,10 +35,23 @@ type ToolResult struct { type Turn struct { Role string `json:"role"` Text string `json:"text,omitempty"` + Images []Image `json:"images,omitempty"` Calls []ToolCall `json:"calls,omitempty"` Results []ToolResult `json:"results,omitempty"` } +// Image is one image carried on a user Turn. Hash and metadata persist with the +// session; the bytes do not. Data is tagged json:"-" so history stays lean on +// disk: the harness keeps the bytes out of line and repopulates Data before the +// provider call that needs them. 
+type Image struct { + Hash string `json:"hash"` // sha256 of the (downscaled) bytes + MediaType string `json:"media_type"` // "image/png" | "image/jpeg" + Width int `json:"width,omitempty"` + Height int `json:"height,omitempty"` + Data []byte `json:"-"` // in-memory for the wire call; never persisted +} + type ToolDef struct { Name string Description string diff --git a/harness/blob.go b/harness/blob.go new file mode 100644 index 0000000..0d7a43f --- /dev/null +++ b/harness/blob.go @@ -0,0 +1,157 @@ +// Out-of-line image storage: a content-addressed sidecar so session JSON stays +// lean. Image bytes live under ~/.sesh/blobs keyed by their sha256, and the +// session records only the hash and metadata. Identical pastes share one blob. +package harness + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/mike-diff/sesh/agent" +) + +func blobsDir() string { + home, err := os.UserHomeDir() + if err != nil { + return ".sesh-blobs" + } + return filepath.Join(home, ".sesh", "blobs") +} + +// blobExt maps a media type to the on-disk extension. Unknown types fall back +// to .bin so a pass-through (undecodable) image still has a stable home. +func blobExt(mediaType string) string { + switch mediaType { + case "image/png": + return "png" + case "image/jpeg": + return "jpg" + default: + return "bin" + } +} + +func blobPath(hash, mediaType string) string { + return filepath.Join(blobsDir(), hash+"."+blobExt(mediaType)) +} + +// storeBlob writes data under its sha256 and returns the hash. An existing blob +// is left untouched (content addressing makes a rewrite redundant), so repeated +// pastes of the same image cost one file. The write is atomic write-then-rename +// so a crash mid-store cannot leave a truncated blob. 
+func storeBlob(data []byte, mediaType string) (string, error) { + sum := sha256.Sum256(data) + hash := hex.EncodeToString(sum[:]) + path := blobPath(hash, mediaType) + if _, err := os.Stat(path); err == nil { + return hash, nil // dedupe: identical bytes already stored + } + if err := os.MkdirAll(blobsDir(), 0o755); err != nil { + return "", err + } + tmp := path + ".tmp" // NOTE(review): fixed temp name; two instances storing the same new image at once share it, so confirm a losing rename error is acceptable + if err := os.WriteFile(tmp, data, 0o644); err != nil { + return "", err + } + if err := os.Rename(tmp, path); err != nil { + return "", err + } + return hash, nil +} + +// loadBlob reads the bytes stored under hash. The media type only selects the +// extension; every extension blobExt can produce (png, jpg, bin) is tried in turn, so a caller need not remember it. +func loadBlob(hash string) ([]byte, error) { + for _, ext := range []string{"png", "jpg", "bin"} { + if b, err := os.ReadFile(filepath.Join(blobsDir(), hash+"."+ext)); err == nil { + return b, nil + } + } + return nil, fmt.Errorf("blob %s not found", hash) +} + +// blobGCMinAge is how old an unreferenced blob must be before gcBlobs will +// delete it. The floor exists to avoid racing a blob another live instance just +// pasted and stored but has not yet written a session for; an hour is far longer +// than any save latency, so a blob whose session save is still in flight is not mistaken for trash. +const blobGCMinAge = time.Hour + +// gcBlobs deletes orphaned image blobs: those referenced by no session and older +// than blobGCMinAge. It scans every session (sealed ones included, so a blob a +// sealed transcript still references is never collected), builds the set of live +// hashes, and removes only blobs that are both unreferenced and past the age +// floor. It is deliberately conservative: deleting a live blob (a missing image +// the user can no longer resolve) is worse than leaving a small orphan behind, so +// every error is skipped rather than fatal and recent files are always kept.
+func gcBlobs() { + referenced := map[string]bool{} + for _, s := range allSessions() { + for _, t := range s.Turns { + for _, im := range t.Images { + if im.Hash != "" { + referenced[im.Hash] = true + } + } + } + } + entries, err := os.ReadDir(blobsDir()) + if err != nil { + return // no blobs dir yet, or unreadable: nothing to collect + } + cutoff := time.Now().Add(-blobGCMinAge) + for _, e := range entries { + if e.IsDir() { + continue + } + name := e.Name() + hash := strings.TrimSuffix(name, filepath.Ext(name)) + if referenced[hash] { + continue // a session still points at this blob + } + info, err := e.Info() + if err != nil || info.ModTime().After(cutoff) { + continue // unreadable, or freshly written: leave it, a save may be in flight + } + os.Remove(filepath.Join(blobsDir(), name)) // best-effort; an error just leaves the orphan + } +} + +// rehydrateImages repopulates the in-memory Data of any image whose bytes were +// dropped on save (Data is json:"-"), so a resumed or handed-off turn can be +// re-sent to the model. It walks history in place, modifying the shared slice: +// an image already holding Data is left alone, so it is cheap on live turns and +// safe to call repeatedly. An image whose blob cannot be loaded is dropped from +// its turn rather than left with empty Data, which would send zero bytes to the +// model; a dim note tells the user the image could not be restored. 
+func rehydrateImages(history []agent.Turn) { + for i := range history { + t := &history[i] + if len(t.Images) == 0 { + continue + } + kept := t.Images[:0] + for _, im := range t.Images { + if len(im.Data) > 0 { + kept = append(kept, im) + continue + } + data, err := loadBlob(im.Hash) + if err != nil { + emit("%s could not restore a pasted image (blob %s missing); continuing without it%s\n", dim, im.Hash, reset) + continue + } + im.Data = data + kept = append(kept, im) + } + if len(kept) == 0 { + t.Images = nil + } else { + t.Images = kept + } + } +} diff --git a/harness/blob_test.go b/harness/blob_test.go new file mode 100644 index 0000000..546d2bb --- /dev/null +++ b/harness/blob_test.go @@ -0,0 +1,236 @@ +package harness + +import ( + "os" + "path/filepath" + "testing" + "time" + + "github.com/mike-diff/sesh/agent" +) + +// TestBlobRoundTripAndDedupe: a stored blob loads back byte-for-byte, and a +// second store of identical bytes reuses the same blob rather than writing a new +// one. Breaker: drop the os.Stat dedupe guard in storeBlob and the second store +// rewrites, so the directory holds a duplicate (or a different hash is returned). 
+func TestBlobRoundTripAndDedupe(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + + data := []byte("not really a png, but content-addressing does not care") + hash, err := storeBlob(data, "image/png") + if err != nil { + t.Fatal(err) + } + + got, err := loadBlob(hash) + if err != nil { + t.Fatal(err) + } + if string(got) != string(data) { + t.Fatalf("round-trip mismatch: %q", got) + } + + hash2, err := storeBlob(data, "image/png") + if err != nil { + t.Fatal(err) + } + if hash2 != hash { + t.Fatalf("identical bytes must dedupe to one hash: %s vs %s", hash, hash2) + } + + entries, err := os.ReadDir(blobsDir()) + if err != nil { + t.Fatal(err) + } + if len(entries) != 1 { + t.Fatalf("dedupe must leave one blob, got %d: %v", len(entries), entries) + } +} + +// TestBlobContentAddressed: the on-disk name is the sha256 of the bytes, so a +// caller that knows only the hash can find it and different bytes never collide. +// Breaker: key the file on anything but the content hash and either the path is +// unpredictable or two distinct images overwrite each other. +func TestBlobContentAddressed(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + + hashA, err := storeBlob([]byte("alpha"), "image/jpeg") + if err != nil { + t.Fatal(err) + } + hashB, err := storeBlob([]byte("beta"), "image/jpeg") + if err != nil { + t.Fatal(err) + } + if hashA == hashB { + t.Fatal("different bytes must hash differently") + } + if _, err := os.Stat(filepath.Join(blobsDir(), hashA+".jpg")); err != nil { + t.Fatalf("jpeg blob must land at .jpg: %v", err) + } +} + +// TestLoadBlobMissing: a hash with no blob is an error, not empty success, so a +// dangling reference is loud rather than a silent empty image. 
+func TestLoadBlobMissing(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + if _, err := loadBlob("deadbeef"); err == nil { + t.Fatal("loading an absent blob must error") + } +} + +// TestRehydrateRestoresData: a resumed turn carries an image with only a hash +// (Data is json:"-" on disk); rehydrateImages must load the bytes back from the +// blob store so the image can be re-sent. Breaker: make rehydrateImages a no-op +// and Data stays nil. +func TestRehydrateRestoresData(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + want := []byte("the original downscaled bytes") + hash, err := storeBlob(want, "image/png") + if err != nil { + t.Fatal(err) + } + + history := []agent.Turn{{Role: "user", Text: "what is this?", + Images: []agent.Image{{Hash: hash, MediaType: "image/png"}}}} + rehydrateImages(history) + + if len(history[0].Images) != 1 { + t.Fatalf("the image must survive rehydration: %d images", len(history[0].Images)) + } + if string(history[0].Images[0].Data) != string(want) { + t.Fatalf("Data not restored from the blob: got %q", history[0].Images[0].Data) + } +} + +// TestRehydrateSkipsLiveData: an image that already holds Data (a fresh capture, +// not a resume) must be left untouched, so rehydration is cheap and idempotent. +// Breaker: always reload from disk and a live image with no stored blob loses +// its bytes (or errors). 
+func TestRehydrateSkipsLiveData(t *testing.T) { + t.Setenv("HOME", t.TempDir()) // empty blob store on purpose: nothing to load + live := []byte("freshly captured, never stored") + history := []agent.Turn{{Role: "user", + Images: []agent.Image{{Hash: "no-such-blob", MediaType: "image/png", Data: live}}}} + + rehydrateImages(history) + + if len(history[0].Images) != 1 || string(history[0].Images[0].Data) != string(live) { + t.Fatalf("a live image must keep its in-memory Data: %+v", history[0].Images) + } +} + +// TestRehydrateDropsMissingBlob: an image whose blob is gone must be removed from +// the turn, never left with empty Data (which would send zero bytes to the +// model). Breaker: leave the image in place and the turn keeps an empty-Data +// image instead of dropping it. +func TestRehydrateDropsMissingBlob(t *testing.T) { + t.Setenv("HOME", t.TempDir()) // no blobs stored + history := []agent.Turn{{Role: "user", Text: "see attached", + Images: []agent.Image{{Hash: "missing-hash", MediaType: "image/png"}}}} + + rehydrateImages(history) + + if len(history[0].Images) != 0 { + t.Fatalf("a missing blob must drop the image, not keep an empty one: %+v", history[0].Images) + } +} + +// TestRehydrateDropsOnlyMissing: with two images on one turn, only the one whose +// blob is gone is dropped; the recoverable one is rehydrated and kept in place. +// Breaker: drop the whole turn's images on any miss and the good one is lost too. 
+func TestRehydrateDropsOnlyMissing(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + good := []byte("recoverable bytes") + hash, err := storeBlob(good, "image/png") + if err != nil { + t.Fatal(err) + } + history := []agent.Turn{{Role: "user", + Images: []agent.Image{ + {Hash: "gone", MediaType: "image/png"}, + {Hash: hash, MediaType: "image/png"}, + }}} + + rehydrateImages(history) + + if len(history[0].Images) != 1 { + t.Fatalf("only the missing image should drop: %d remain", len(history[0].Images)) + } + if history[0].Images[0].Hash != hash || string(history[0].Images[0].Data) != string(good) { + t.Fatalf("the recoverable image must survive with its bytes: %+v", history[0].Images[0]) + } +} + +// TestGCBlobsDeletesOnlyOldOrphans: gcBlobs must remove only blobs that are both +// unreferenced by any session AND older than the age floor. A referenced blob +// (even from a sealed session) and a recently-written unreferenced blob both +// survive. Breakers: drop the referenced-hash check and the referenced blob is +// deleted; drop the age floor and the fresh orphan is deleted. +func TestGCBlobsDeletesOnlyOldOrphans(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + + // A live session and a sealed one, each referencing one blob. Scanning sealed + // sessions matters: a handed-off transcript still resolves its images. 
+ liveHash, err := storeBlob([]byte("live-session image bytes"), "image/png") + if err != nil { + t.Fatal(err) + } + sealedHash, err := storeBlob([]byte("sealed-session image bytes"), "image/png") + if err != nil { + t.Fatal(err) + } + live := &Session{ID: "live-1", Created: time.Now(), + Turns: []agent.Turn{{Role: "user", Text: "see this", + Images: []agent.Image{{Hash: liveHash, MediaType: "image/png"}}}}} + sealed := &Session{ID: "sealed-1", Child: "live-1", Created: time.Now(), + Turns: []agent.Turn{{Role: "user", Text: "and this", + Images: []agent.Image{{Hash: sealedHash, MediaType: "image/png"}}}}} + if err := live.save(); err != nil { + t.Fatal(err) + } + if err := sealed.save(); err != nil { + t.Fatal(err) + } + // Age the referenced blobs past the floor too, so only the referenced-hash + // check (not the age floor) can save them: that makes the check load-bearing. + aged := time.Now().Add(-2 * blobGCMinAge) + for _, h := range []string{liveHash, sealedHash} { + if err := os.Chtimes(blobPath(h, "image/png"), aged, aged); err != nil { + t.Fatal(err) + } + } + + // An old orphan (no session points at it, aged past the floor): collectible. + oldHash, err := storeBlob([]byte("nobody references these bytes anymore"), "image/png") + if err != nil { + t.Fatal(err) + } + oldPath := blobPath(oldHash, "image/png") + if err := os.Chtimes(oldPath, aged, aged); err != nil { + t.Fatal(err) + } + + // A fresh orphan (unreferenced but just written): kept, the floor protects a + // blob whose session may not be saved yet. 
+ freshHash, err := storeBlob([]byte("just pasted, no session saved yet"), "image/png") + if err != nil { + t.Fatal(err) + } + freshPath := blobPath(freshHash, "image/png") + + gcBlobs() + + if _, err := os.Stat(oldPath); !os.IsNotExist(err) { + t.Fatalf("old unreferenced blob must be deleted (err=%v)", err) + } + for name, path := range map[string]string{ + "live-referenced": blobPath(liveHash, "image/png"), + "sealed-referenced": blobPath(sealedHash, "image/png"), + "fresh-orphan": freshPath, + } { + if _, err := os.Stat(path); err != nil { + t.Fatalf("%s blob must survive gc: %v", name, err) + } + } +} diff --git a/harness/clipboard.go b/harness/clipboard.go index 512dfa8..4ec14f4 100644 --- a/harness/clipboard.go +++ b/harness/clipboard.go @@ -1,7 +1,8 @@ -// Clipboard support for /copy: put text on the system clipboard from a terminal -// app, with no third-party dependency. Two independent paths are used so the -// text lands whether sesh runs locally or over SSH, and whether or not the -// terminal allows programmatic clipboard writes. +// Clipboard support, with no third-party dependency. Writing text (for /copy) +// uses two independent paths so it lands whether sesh runs locally or over SSH, +// and whether or not the terminal allows programmatic clipboard writes. Reading +// an image (for Ctrl-V paste) shells out to a platform tool, the read-direction +// twin of the write path. package harness import ( @@ -12,6 +13,76 @@ import ( "strings" ) +// readClipboardImage reads an image off the system clipboard by shelling out to +// the first per-platform tool found on PATH, the read-direction twin of +// localCopy. It returns the raw image bytes and a media type. A tool that runs +// but yields nothing means no image is on the clipboard; no tool on PATH names +// what to install. It never returns nil error with empty data, so the caller +// always has something honest to show the user. 
+func readClipboardImage() (data []byte, mediaType string, err error) { + tools := imageReadTools() + found := false + for _, tool := range tools { + if _, lerr := exec.LookPath(tool.cmd[0]); lerr != nil { + continue + } + found = true + out, rerr := exec.Command(tool.cmd[0], tool.cmd[1:]...).Output() + if rerr != nil || len(out) == 0 { + continue // wrong selection type, or nothing of this type on the clipboard + } + return out, tool.mediaType, nil + } + if !found { + return nil, "", fmt.Errorf("%s", missingImageToolHint()) + } + return nil, "", fmt.Errorf("no image on the clipboard") +} + +// imageTool is one clipboard image reader: the command to run and the media +// type its output carries. +type imageTool struct { + cmd []string + mediaType string +} + +// imageReadTools is the per-platform list of clipboard image readers, in +// preference order. Each reads the clipboard image to stdout. +func imageReadTools() []imageTool { + switch runtime.GOOS { + case "darwin": + // AppleScript is used because there is no pbpaste flag for image data; the first statement also rewrites the clipboard to its PNG form. + // NOTE(review): osascript normally prints a data value as «data PNGf<hex>» text, not raw bytes; confirm the output is hex-decoded before it reaches the image decoder.
+ return []imageTool{{ + cmd: []string{"osascript", "-e", "set the clipboard to (the clipboard as «class PNGf»)", "-e", "get the clipboard as «class PNGf»"}, + mediaType: "image/png", + }} + case "windows": + return []imageTool{{ + cmd: []string{"powershell", "-NoProfile", "-Command", "$img = Get-Clipboard -Format Image; if ($img) { $ms = New-Object System.IO.MemoryStream; $img.Save($ms, [System.Drawing.Imaging.ImageFormat]::Png); [Console]::OpenStandardOutput().Write($ms.ToArray(), 0, $ms.Length) }"}, + mediaType: "image/png", + }} + default: // linux, the BSDs + return []imageTool{ + {cmd: []string{"wl-paste", "--type", "image/png"}, mediaType: "image/png"}, // Wayland + {cmd: []string{"xclip", "-selection", "clipboard", "-t", "image/png", "-o"}, mediaType: "image/png"}, // X11 + } + } +} + +// missingImageToolHint names the tool a user should install to paste images, +// per platform, so a failed read points at the fix rather than a dead end. +func missingImageToolHint() string { + switch runtime.GOOS { + case "darwin": + return "osascript not found; it ships with macOS" + case "windows": + return "powershell not found to read the clipboard image" + default: + return "install wl-clipboard or xclip to paste images" + } +} + // osc52 is the terminal "set clipboard" escape sequence: the terminal itself // base64-decodes the payload and stores it, so it reaches the clipboard even // across an SSH hop. The "c" selection is the system clipboard. diff --git a/harness/continuity.go b/harness/continuity.go index 59eca55..26ed10c 100644 --- a/harness/continuity.go +++ b/harness/continuity.go @@ -40,6 +40,12 @@ func approxTokens(turns []agent.Turn) int { for _, r := range t.Results { n += len(r.Content) } + // Images cost tokens too, but by patch grid, not characters. Pre-multiply + // by 4 so each image's estimate survives the final chars/4 division and the + // verbatim-tail budget reflects what an image turn will actually send. 
+ for _, im := range t.Images { + n += estimateImageTokens(im.Width, im.Height) * 4 + } } return n / 4 } @@ -59,7 +65,14 @@ func renderTranscript(turns []agent.Turn, maxResult int) string { for _, t := range turns { switch t.Role { case "user": - fmt.Fprintf(&b, "USER: %s\n\n", t.Text) + fmt.Fprintf(&b, "USER: %s\n", t.Text) + // The brief writer must know an image existed without carrying its + // bytes: a one-line note keeps the visual context discoverable while + // the transcript stays text. + if len(t.Images) > 0 { + fmt.Fprintf(&b, "%s\n", imageNote(t.Images)) + } + b.WriteByte('\n') case "assistant": if t.Text != "" { fmt.Fprintf(&b, "ASSISTANT: %s\n", t.Text) @@ -87,6 +100,17 @@ func renderTranscript(turns []agent.Turn, maxResult int) string { return s } +// imageNote renders a one-line, byte-free summary of a user turn's images for +// the transcript: enough for the brief writer to record that images existed and +// their shape, never the data itself. +func imageNote(images []agent.Image) string { + parts := make([]string, len(images)) + for i, im := range images { + parts[i] = fmt.Sprintf("%s %dx%d", im.MediaType, im.Width, im.Height) + } + return fmt.Sprintf("[%d image(s): %s]", len(images), strings.Join(parts, ", ")) +} + // briefInstructions is structured role-first. 
Negative constraints and failed // approaches get their own numbered section because they are what default // summaries reliably drop, and re-walking a known dead end is the most diff --git a/harness/continuity_test.go b/harness/continuity_test.go index a46a401..60081d6 100644 --- a/harness/continuity_test.go +++ b/harness/continuity_test.go @@ -2,6 +2,7 @@ package harness import ( "context" + "encoding/base64" "encoding/json" "fmt" "os" @@ -154,6 +155,122 @@ func TestRecallAcrossChain(t *testing.T) { } } +// TestSessionJSONHasNoImageBytes: an image turn saved the way session.save +// marshals it carries the hash and metadata but never the base64 of Data, so the +// session file stays lean. Breaker: drop the json:"-" tag on Image.Data and the +// marshalled JSON contains the encoded bytes. +func TestSessionJSONHasNoImageBytes(t *testing.T) { + data := []byte("PRETEND-IMAGE-BYTES-THAT-MUST-NOT-LEAK") + sess := &Session{ + ID: "img-sess", + Turns: []agent.Turn{{Role: "user", Text: "look", Images: []agent.Image{{Hash: "abc123", MediaType: "image/png", Width: 100, Height: 80, Data: data}}}}, + } + + b, err := json.MarshalIndent(sess, "", " ") // exactly how session.save marshals + if err != nil { + t.Fatal(err) + } + out := string(b) + + if !strings.Contains(out, "abc123") { + t.Fatal("the session JSON must keep the image hash as the reference") + } + if strings.Contains(out, base64.StdEncoding.EncodeToString(data)) { + t.Fatal("the session JSON must not contain the base64 of the image bytes") + } + if strings.Contains(out, string(data)) { + t.Fatal("the raw image bytes must not appear in the session JSON") + } +} + +// TestHandoffCarriesImageRef: the verbatim tail copied into a successor session +// carries the image reference (hash/metadata) but not the bytes, and the +// successor resolves the bytes from the shared blob store via rehydrateImages. +// Breaker: exclude Images from the carried turn and the successor has no image +// ref to resolve. 
+func TestHandoffCarriesImageRef(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + bytesOnDisk := []byte("the shared blob the successor will resolve") + hash, err := storeBlob(bytesOnDisk, "image/png") + if err != nil { + t.Fatal(err) + } + + // A dying session whose tail is a user turn bearing an image with no Data + // (the on-disk form: bytes live in the blob store, not the turn). + dying := []agent.Turn{ + {Role: "user", Text: "older"}, {Role: "assistant", Text: "ok"}, + {Role: "user", Text: "what is in this screenshot?", + Images: []agent.Image{{Hash: hash, MediaType: "image/png", Width: 100, Height: 80}}}, + {Role: "assistant", Text: "a chart"}, + } + + tail := verbatimTail(dying, 1_000_000) + old := &Session{ID: "dying-1", Cwd: "/w"} + next := seedChain(old, "BRIEF", "entry", "branch: main", tail) + + // the carried turn keeps the reference, not the bytes + var imgTurn *agent.Turn + for i := range next.Turns { + if len(next.Turns[i].Images) > 0 { + imgTurn = &next.Turns[i] + break + } + } + if imgTurn == nil { + t.Fatal("the successor must carry the image-bearing turn from the tail") + } + if imgTurn.Images[0].Hash != hash { + t.Fatalf("the carried image must keep its hash ref: %q", imgTurn.Images[0].Hash) + } + if len(imgTurn.Images[0].Data) != 0 { + t.Fatal("the seed must carry a reference, not the bytes") + } + + // the successor resolves the bytes from the shared blob store + rehydrateImages(next.Turns) + if string(imgTurn.Images[0].Data) != string(bytesOnDisk) { + t.Fatalf("the successor must resolve Data from the shared blob store: got %q", imgTurn.Images[0].Data) + } +} + +// TestApproxTokensCountsImages: a turn carrying an image reports more tokens than +// the same turn without it, so the verbatim-tail budget accounts for image cost. +// Breaker: omit the image term in approxTokens and the two counts are equal. 
+func TestApproxTokensCountsImages(t *testing.T) { + withText := []agent.Turn{{Role: "user", Text: "describe this"}} + withImage := []agent.Turn{{Role: "user", Text: "describe this", + Images: []agent.Image{{Hash: "h", MediaType: "image/png", Width: 1456, Height: 819}}}} + + plain := approxTokens(withText) + withImg := approxTokens(withImage) + if withImg <= plain { + t.Fatalf("an image must add to the token estimate: text=%d image=%d", plain, withImg) + } + if want := estimateImageTokens(1456, 819); withImg-plain != want { + t.Fatalf("the image term must equal its patch estimate: delta=%d want=%d", withImg-plain, want) + } +} + +// TestRenderTranscriptNotesImages: a user turn with images gets a one-line, +// byte-free note in the brief transcript so the brief writer knows an image +// existed. Breaker: skip the note in the user arm and the dimensions/media type +// never reach the transcript. +func TestRenderTranscriptNotesImages(t *testing.T) { + h := []agent.Turn{{Role: "user", Text: "compare these", + Images: []agent.Image{ + {Hash: "a", MediaType: "image/png", Width: 1456, Height: 819}, + {Hash: "b", MediaType: "image/jpeg", Width: 800, Height: 600}, + }}} + got := renderTranscript(h, 100) + + for _, want := range []string{"USER: compare these", "2 image(s)", "image/png 1456x819", "image/jpeg 800x600"} { + if !strings.Contains(got, want) { + t.Fatalf("transcript missing %q:\n%s", want, got) + } + } +} + // TestReplHandoff: the full product flow: brief written, old session archived, // new session seeded and live, ledger grown. 
func TestReplHandoff(t *testing.T) { diff --git a/harness/harness.go b/harness/harness.go index 30a00d4..f111e61 100644 --- a/harness/harness.go +++ b/harness/harness.go @@ -191,6 +191,7 @@ func Main() { sweepDeadProcs(sess.ID) // reap processes a previously-crashed sesh left behind pm := newProcManager(sess.ID) os.Setenv("SESH_SESSION", sess.ID) // tool/gate/statusline mods can find this session's run dir + go gcBlobs() // sweep orphaned image blobs off the hot path; best-effort, never blocks startup tools := builtinTools(*unsafePaths, pm) // The engines (skill, mcp) join only when their user-space content exists: // an empty mount costs zero tokens. They are built-ins, so they claim @@ -205,6 +206,13 @@ func Main() { } modTools, modNotes := loadToolMods(taken) tools = append(tools, modTools...) + // A tools-less model (e.g. a local vision model) rejects any tools array, so + // the no_tools dial drops every tool: the built-ins and engines/mods here, and + // the task/recall pair each mode wires in below. + noTools := pcfg.Providers[spec.name].NoTools + if noTools { + tools = nil + } for _, n := range append(engNotes, modNotes...) 
{ fmt.Fprintf(os.Stderr, "%s%s%s\n", yellow, n, reset) } @@ -255,14 +263,17 @@ func Main() { } pg := budgetGate(*maxTools, counted) // first turn's budget; drive refreshes per iteration sessOf := func() *Session { return r.sess } - tools = append(tools, - taskTool(func() agent.Provider { return r.p }, sessOf, 1, *unsafePaths, pg, nil), - recallTool(sessOf)) + if !noTools { + tools = append(tools, + taskTool(func() agent.Provider { return r.p }, sessOf, 1, *unsafePaths, pg, nil), + recallTool(sessOf)) + } if r.preflight(*printMode) { os.Exit(1) // the message can never fit; nothing was sent } mark := len(r.history) r.history = append(r.history, agent.Turn{Role: "user", Text: *printMode}) + rehydrateImages(r.history) // a resumed session's prior images carry only a hash; load the bytes back before the wire call out, spent, err := agent.Run(context.Background(), r.p, r.system, r.history, tools, agent.Hooks{Gate: pg}) if err != nil { @@ -357,9 +368,11 @@ func Main() { // applies to later spawns, and child token usage lands in the totals // (accountChild shares acctMu because parallel children report concurrently). sessOf := func() *Session { return r.sess } - tools = append(tools, - taskTool(func() agent.Provider { return r.p }, sessOf, 1, *unsafePaths, g, r.accountChild), - recallTool(sessOf)) + if !noTools { + tools = append(tools, + taskTool(func() agent.Provider { return r.p }, sessOf, 1, *unsafePaths, g, r.accountChild), + recallTool(sessOf)) + } // The TUI completes commands, provider names, and model ids on tab, and // reaps owned processes if a signal tears the session down. 
if t, ok := con.(*tuiConsole); ok { diff --git a/harness/help.go b/harness/help.go index 4eec8aa..c08aed1 100644 --- a/harness/help.go +++ b/harness/help.go @@ -45,6 +45,13 @@ CONTINUITY (infinite sessions) FLAGS ` + flagDefaults() + ` +INPUT KEYS (interactive footer TUI) + Ctrl-V (Alt-V fallback) paste a clipboard image, shown inline as [image-N] + and sent to a vision-capable model (Alt-V for + terminals that swallow Ctrl-V, like Windows Terminal) + Shift-Enter, Ctrl-J, \+Enter newline; Enter submits + Esc cancel the running turn (type to steer at the next step) + SESSION COMMANDS (interactive; tab completes) /provider [add|remove|name] pick, add (wizard), remove, or switch providers /model [id|#|substring] pick/switch models, or add a custom one; window retunes @@ -77,7 +84,10 @@ MODEL TOOLS (what the agent inside can do) reaped when the session exits. FILES AND MODS (project .sesh/ overrides global ~/.sesh/) - providers.json named provider profiles (managed by /provider) + providers.json named provider profiles (managed by /provider); + "vision": true|false overrides image support (default + by name); "no_tools": true sends no tools (tools-less + models, e.g. local vision models) credentials.json, key AES-256-GCM encrypted API keys (0600) SYSTEM.md / APPEND_SYSTEM.md replace / extend the system prompt prompts/.md override model-facing templates: brief, judge, diff --git a/harness/image.go b/harness/image.go new file mode 100644 index 0000000..7d22fd7 --- /dev/null +++ b/harness/image.go @@ -0,0 +1,119 @@ +// Image ingest: decode a pasted PNG or JPEG, downscale it for token hygiene, +// and re-encode it, using only the standard library so the single binary keeps +// its zero-dependency promise. Formats the stdlib cannot decode (for example +// WEBP) pass through untouched, carrying their detected media type and unknown +// dimensions. The token estimate is the patch-grid heuristic vision models use. 
+package harness + +import ( + "bytes" + "errors" + "image" + "image/jpeg" + "image/png" +) + +// maxEdge is the longest-edge ceiling images are downscaled to: the size above +// which a vision model gains no detail but the request keeps paying tokens. +const maxEdge = 1568 + +// decodeAndDownscale decodes raw image bytes, scales them so the longest edge is +// at most maxEdge, and re-encodes them: JPEG for a JPEG source, PNG for any other +// decoded format, so the returned media type always names the bytes written. It +// also returns the (possibly reduced) dimensions. An image already within the cap +// is re-encoded at its original size. A format the stdlib cannot decode but can still identify (e.g. WEBP) passes through verbatim with 0x0 dimensions for the API +// to handle; bytes that cannot even be identified return an error. +func decodeAndDownscale(raw []byte) (out []byte, mediaType string, w, h int, err error) { + src, format, derr := image.Decode(bytes.NewReader(raw)) + if derr != nil { + mt := detectMediaType(raw) + if mt == "application/octet-stream" { + return nil, "", 0, 0, errors.New("unsupported image format (PNG or JPEG)") + } + return raw, mt, 0, 0, nil + } + mediaType = "image/png" // encode writes PNG for every non-JPEG format, whatever decoder matched + if format == "jpeg" { + mediaType = "image/jpeg" + } + + b := src.Bounds() + sw, sh := b.Dx(), b.Dy() + dst := src + dw, dh := sw, sh + if longest := max(sw, sh); longest > maxEdge { + dw = sw * maxEdge / longest + dh = sh * maxEdge / longest + // A very long, thin image rounds the short edge to zero; keep at least + // one pixel so the re-encode produces a real image, not an empty one. + dw, dh = max(dw, 1), max(dh, 1) + dst = scale(src, dw, dh) + } + + encoded, eerr := encode(dst, format) + if eerr != nil { + return nil, "", 0, 0, eerr + } + return encoded, mediaType, dw, dh, nil +} + +// scale resizes src to dw x dh by nearest-neighbor sampling: each destination +// pixel copies one source pixel, with none of the averaging a box or other resampling filter would add, which is acceptable for +// shrinking screenshots and keeps the dependency footprint at zero.
+func scale(src image.Image, dw, dh int) image.Image { + b := src.Bounds() + sw, sh := b.Dx(), b.Dy() + dst := image.NewRGBA(image.Rect(0, 0, dw, dh)) + for y := 0; y < dh; y++ { + sy := b.Min.Y + y*sh/dh + for x := 0; x < dw; x++ { + sx := b.Min.X + x*sw/dw + dst.Set(x, y, src.At(sx, sy)) + } + } + return dst +} + +// encode writes img back to its source format. JPEG keeps a high quality so a +// re-encode of an already-lossy paste does not visibly degrade it. +func encode(img image.Image, format string) ([]byte, error) { + var buf bytes.Buffer + if format == "jpeg" { + if err := jpeg.Encode(&buf, img, &jpeg.Options{Quality: 90}); err != nil { + return nil, err + } + return buf.Bytes(), nil + } + if err := png.Encode(&buf, img); err != nil { + return nil, err + } + return buf.Bytes(), nil +} + +// detectMediaType sniffs the media type for bytes the stdlib could not decode, +// so a pass-through image still carries a type for the wire call. +func detectMediaType(raw []byte) string { + switch { + case bytes.HasPrefix(raw, []byte("\x89PNG\r\n\x1a\n")): + return "image/png" + case bytes.HasPrefix(raw, []byte{0xff, 0xd8, 0xff}): + return "image/jpeg" + case len(raw) >= 12 && bytes.Equal(raw[0:4], []byte("RIFF")) && bytes.Equal(raw[8:12], []byte("WEBP")): + return "image/webp" + default: + return "application/octet-stream" + } +} + +// estimateImageTokens approximates an image's input-token cost from its pixel +// dimensions: one token per 28x28 patch, capped at maxEdge. It is for display, +// not billing. 
+func estimateImageTokens(w, h int) int { + tokens := ceilDiv(w, 28) * ceilDiv(h, 28) + if tokens > maxEdge { + return maxEdge + } + return tokens +} + +func ceilDiv(a, b int) int { return (a + b - 1) / b } diff --git a/harness/image_test.go b/harness/image_test.go new file mode 100644 index 0000000..8aa025b --- /dev/null +++ b/harness/image_test.go @@ -0,0 +1,128 @@ +package harness + +import ( + "bytes" + "image" + "image/color" + "image/png" + "testing" +) + +// pngBytes encodes a w x h opaque PNG for the tests to feed to the decoder. +func pngBytes(t *testing.T, w, h int) []byte { + t.Helper() + img := image.NewRGBA(image.Rect(0, 0, w, h)) + for y := 0; y < h; y++ { + for x := 0; x < w; x++ { + img.Set(x, y, color.RGBA{R: uint8(x), G: uint8(y), B: 128, A: 255}) + } + } + var buf bytes.Buffer + if err := png.Encode(&buf, img); err != nil { + t.Fatal(err) + } + return buf.Bytes() +} + +// TestDownscaleCapsLongestEdge: a 4000x3000 image comes back with its longest +// edge clamped to maxEdge, aspect preserved, and still decodable. Breaker: drop +// the > maxEdge scale branch and the output keeps the original 4000px width. +func TestDownscaleCapsLongestEdge(t *testing.T) { + out, mt, w, h, err := decodeAndDownscale(pngBytes(t, 4000, 3000)) + if err != nil { + t.Fatal(err) + } + if w != maxEdge { + t.Fatalf("longest edge must cap at %d, got width %d", maxEdge, w) + } + if h != 3000*maxEdge/4000 { + t.Fatalf("aspect ratio not preserved: %dx%d", w, h) + } + if mt != "image/png" { + t.Fatalf("media type: %q", mt) + } + cfg, err := png.DecodeConfig(bytes.NewReader(out)) + if err != nil { + t.Fatalf("downscaled output must stay a decodable png: %v", err) + } + if cfg.Width != w || cfg.Height != h { + t.Fatalf("encoded dims %dx%d disagree with reported %dx%d", cfg.Width, cfg.Height, w, h) + } +} + +// TestDownscaleLeavesSmallImage: an image already within the cap keeps its +// dimensions (no needless upscale or crop). 
Breaker: scale unconditionally and a +// small image's dimensions change. +func TestDownscaleLeavesSmallImage(t *testing.T) { + _, _, w, h, err := decodeAndDownscale(pngBytes(t, 800, 600)) + if err != nil { + t.Fatal(err) + } + if w != 800 || h != 600 { + t.Fatalf("within-cap image must be left at its size, got %dx%d", w, h) + } +} + +// TestDecodePassThrough: bytes the stdlib cannot decode (here a WEBP header) +// come back unchanged with a detected media type and zero dimensions, instead of +// erroring. Breaker: return the decode error and an undecodable paste is lost. +func TestDecodePassThrough(t *testing.T) { + webp := append([]byte("RIFF\x00\x00\x00\x00WEBP"), []byte("vp8 garbage")...) + out, mt, w, h, err := decodeAndDownscale(webp) + if err != nil { + t.Fatalf("undecodable bytes must pass through, not error: %v", err) + } + if !bytes.Equal(out, webp) { + t.Fatal("pass-through must return the original bytes unchanged") + } + if mt != "image/webp" { + t.Fatalf("media type should be sniffed from the header, got %q", mt) + } + if w != 0 || h != 0 { + t.Fatalf("undecodable image has unknown dimensions, got %dx%d", w, h) + } +} + +// TestEstimateImageTokens: the patch-grid estimate is ceil(w/28)*ceil(h/28), +// capped at maxEdge. Breaker: use floor division, or drop the cap, and one of +// these cases is wrong. +func TestEstimateImageTokens(t *testing.T) { + // 56x56 = 2x2 patches exactly. + if got := estimateImageTokens(56, 56); got != 4 { + t.Fatalf("56x56 -> %d, want 4", got) + } + // 57x29 rounds up to 3x2 = 6 patches (proves ceiling, not floor). + if got := estimateImageTokens(57, 29); got != 6 { + t.Fatalf("57x29 -> %d, want 6 (ceil)", got) + } + // A large image is clamped to the cap. 
+ if got := estimateImageTokens(maxEdge, maxEdge); got != maxEdge { + t.Fatalf("oversized estimate must clamp to %d, got %d", maxEdge, got) + } +} + +// TestDownscaleClampsTinyEdge: a long, thin image whose short edge would round to +// zero on downscale keeps at least one pixel and re-encodes to a real image. +// Breaker: drop the max(.,1) floor and the short edge is 0, yielding an empty image. +func TestDownscaleClampsTinyEdge(t *testing.T) { + out, _, w, h, err := decodeAndDownscale(pngBytes(t, 2000, 1)) + if err != nil { + t.Fatal(err) + } + if w != maxEdge || h != 1 { + t.Fatalf("long-thin image must clamp the short edge to 1, got %dx%d", w, h) + } + if cfg, err := png.DecodeConfig(bytes.NewReader(out)); err != nil || cfg.Height != 1 { + t.Fatalf("clamped output must be a real 1px-tall png: err=%v cfg=%+v", err, cfg) + } +} + +// TestDecodeRejectsUnknown: bytes that are neither decodable nor identifiable as a +// known image format are rejected with an error, so an octet-stream blob is never +// stored or sent to the API. Breaker: pass unidentifiable bytes through with an +// octet-stream type instead of erroring, and an unsupported blob reaches the wire. +func TestDecodeRejectsUnknown(t *testing.T) { + if _, _, _, _, err := decodeAndDownscale([]byte("this is plainly not an image")); err == nil { + t.Fatal("unidentifiable bytes must be rejected, not passed through") + } +} diff --git a/harness/providers.go b/harness/providers.go index 10380df..384f563 100644 --- a/harness/providers.go +++ b/harness/providers.go @@ -33,6 +33,14 @@ type Profile struct { // CustomModel is one user-added model the endpoint did not list, remembered // per provider so it persists and stays in /model. CustomModel string `json:"custom_model,omitempty"` + // Vision is the tri-state override for whether this profile's model can see + // images: nil leaves it to the model-name heuristic, true or false forces it. + // It is the escape hatch named in the paste-blocked guidance. 
+ Vision *bool `json:"vision,omitempty"` + // NoTools, when true, sends no tool definitions to this profile's model. A + // tools-less model (such as a local vision model used only to read images) + // rejects any tools array, so the agent runs as plain conversation. + NoTools bool `json:"no_tools,omitempty"` } // ProvidersConfig is a parsed providers.json: a set of named profiles and the diff --git a/harness/repl.go b/harness/repl.go index 8fd2c8c..98ca2fa 100644 --- a/harness/repl.go +++ b/harness/repl.go @@ -179,6 +179,40 @@ func (r *repl) refreshProcLine() { // invalidates the provider's prompt cache, so the rebuild is cache-neutral. func (r *repl) refreshSystem() { r.system = systemPrompt() + identityBlock(r.current, r.model, r.protocol, r.switched) + // The TUI's image-paste gate reads the live brain, so wire it here where the + // brain is first set and on every switch. The closure resolves dynamically, + // so the gate tracks /model and /provider changes without re-wiring. + if t, ok := r.con.(*tuiConsole); ok && t.visionOK == nil { + t.visionOK = r.visionCapable + } +} + +// visionCapable reports whether the active model can see images. An explicit +// Vision dial on the active profile wins; otherwise the model name is matched +// against a heuristic, which treats an unknown model as text-only so an image is +// never silently dropped on a model that cannot read it. +func (r *repl) visionCapable() bool { + if prof, ok := r.pcfg.Providers[r.current]; ok && prof.Vision != nil { + return *prof.Vision + } + return modelSupportsVision(r.model) +} + +// modelSupportsVision is the name heuristic behind visionCapable: a model whose +// id contains one of these markers is assumed to accept images. Unknown models +// are not assumed vision-capable. 
+func modelSupportsVision(name string) bool { + name = strings.ToLower(name) + for _, marker := range []string{ + "claude", "gpt-4o", "gpt-4.1", "gpt-5", "o3", "o4", + "gemini", "llava", "pixtral", "vl", "vision", "moondream", + "minicpm-v", "glm-4.5v", "glm-4.6v", + } { + if strings.Contains(name, marker) { + return true + } + } + return false } // goodbye releases the live-instance lock and prints how to pick the @@ -1155,7 +1189,16 @@ func ask(c console, prompt, def string) string { // on the next call. func (r *repl) runTurn(ctx context.Context, line string, tools []agent.Tool, hooks agent.Hooks) ([]agent.Turn, bool) { mark := len(r.history) - r.history = append(r.history, agent.Turn{Role: "user", Text: line}) + // Images the user pasted into this message ride along on the user turn. Only + // the interactive TUI captures them; the plain console has none. Pulling them + // here (the single first-turn chokepoint) keeps them off driven steers, which + // by design carry text only. + var images []agent.Image + if t, ok := r.con.(*tuiConsole); ok { + images = t.takeImages() + } + r.history = append(r.history, agent.Turn{Role: "user", Text: line, Images: images}) + rehydrateImages(r.history) // a resumed session's prior images carry only a hash; load the bytes back before the wire call out, spent, err := agent.Run(ctx, r.p, r.system, r.history, tools, hooks) r.history = out r.md.flush() // emit the message's trailing partial line before the summary diff --git a/harness/repl_test.go b/harness/repl_test.go index ddeb223..2a5ca01 100644 --- a/harness/repl_test.go +++ b/harness/repl_test.go @@ -608,3 +608,48 @@ func TestStatusTextNoProvider(t *testing.T) { t.Fatalf("status with no provider must say so: %q", got) } } + +// TestModelSupportsVision: the name heuristic recognizes known vision families +// (case-insensitively) and treats an unknown model as text-only, so an image is +// never silently sent to a model that cannot read it. 
Breaker: default the +// unknown case to true and "qwen2.5-coder" is wrongly called vision-capable. +func TestModelSupportsVision(t *testing.T) { + for _, name := range []string{"claude-opus-4", "gpt-4o", "GPT-4.1", "gemini-1.5-pro", "pixtral-12b", + "qwen2-vl", "qwen2.5vl:3b", "llama3.2-vision", "moondream", "minicpm-v", "glm-4.6v"} { + if !modelSupportsVision(name) { + t.Fatalf("%q should be vision-capable", name) + } + } + for _, name := range []string{"qwen2.5-coder", "llama3", "deepseek-chat", ""} { + if modelSupportsVision(name) { + t.Fatalf("%q should not be assumed vision-capable", name) + } + } +} + +// TestVisionCapableDialOverrides: an explicit Vision dial on the active profile +// wins over the name heuristic in both directions, the escape hatch named in the +// paste-blocked guidance. Breaker: consult the heuristic before the dial and the +// forced-on text-model case returns false. +func TestVisionCapableDialOverrides(t *testing.T) { + tru, fls := true, false + cases := []struct { + name string + model string + vision *bool + want bool + }{ + {"dial forces a text model on", "qwen2.5-coder", &tru, true}, + {"dial forces a vision model off", "claude-opus-4", &fls, false}, + {"unset dial falls back to the heuristic", "claude-opus-4", nil, true}, + {"no profile falls back to the heuristic", "qwen2.5-coder", nil, false}, + } + for _, c := range cases { + r := &repl{model: c.model, current: "p", pcfg: ProvidersConfig{Providers: map[string]Profile{ + "p": {Vision: c.vision}, + }}} + if got := r.visionCapable(); got != c.want { + t.Fatalf("%s: visionCapable = %v, want %v", c.name, got, c.want) + } + } +} diff --git a/harness/tui.go b/harness/tui.go index c18ea72..c8e83e4 100644 --- a/harness/tui.go +++ b/harness/tui.go @@ -24,6 +24,8 @@ import ( "sync" "syscall" "time" + + "github.com/mike-diff/sesh/agent" ) // console abstracts where user input comes from and how the footer is drawn, @@ -158,6 +160,12 @@ func (c *plainConsole) Select(title string, items 
[]string, current int) (int, e // rune per snippet, so the cursor treats a snippet as a single character. const snippetBase rune = 0xE000 +// imageBase marks pasted-image tokens in the buffer, in a separate private-use +// block from snippetBase. A token's rune value is its absolute index into the +// images slice (stable for byte lookup); its displayed [image-N] number is by +// order of appearance, so a deletion renumbers the rest. +const imageBase rune = 0xF000 + type tuiConsole struct { mu sync.Mutex out *os.File @@ -185,6 +193,16 @@ type tuiConsole struct { // pastes large enough to collapse; index i renders as [snippet #i+1] snippets []string + // images is the append-only list of pasted images for the message being + // composed; an imageBase token's rune value indexes into it. It is never + // compacted on delete (that would invalidate live token indices). submitImages + // holds the images of the last submitted message, drained by takeImages. + // visionOK reports whether the active model can see images; nil disables the + // gate (the plain console and tests leave it unset). + images []agent.Image + submitImages []agent.Image + visionOK func() bool + // history and completion hist []string histIdx int // == len(hist) means the live draft @@ -596,6 +614,7 @@ func lineEnd(buf []rune, pos int) int { func (t *tuiConsole) segments() []string { hi := t.mentionMask() segs := make([]string, len(t.buf)) + imgSeen := 0 // running count of image tokens, so display numbering is by appearance for i, r := range t.buf { switch { case t.mask: @@ -604,9 +623,12 @@ func (t *tuiConsole) segments() []string { // A real break: layout starts a new row here, and the submit echo // shows the message on multiple lines. 
segs[i] = "\n" - case r >= snippetBase && int(r-snippetBase) < len(t.snippets): + case r >= snippetBase && r < imageBase && int(r-snippetBase) < len(t.snippets): n := int(r - snippetBase) segs[i] = fmt.Sprintf("[snippet #%d: %d lines]", n+1, 1+strings.Count(t.snippets[n], "\n")) + case r >= imageBase && int(r-imageBase) < len(t.images): + imgSeen++ + segs[i] = fmt.Sprintf("[image-%d]", imgSeen) default: segs[i] = string(r) } @@ -749,6 +771,7 @@ func (t *tuiConsole) beginInput(prompt string, mask bool) { t.mu.Lock() t.prompt, t.buf, t.pos, t.mask = prompt, nil, 0, mask t.snippets = nil + t.images = nil t.histIdx = len(t.hist) t.draft = nil t.winTop = 0 @@ -892,10 +915,19 @@ func (t *tuiConsole) readLineMode(prompt string, mask bool, turn *turnAttend) (s t.mu.Unlock() return "", io.EOF case r == '\r' && turn != nil: // Enter while working: queue the message, keep editing - if line := strings.TrimSpace(t.expandSnippets()); line != "" { + // A queued steer carries text only: images pasted into it are not sent + // (per the feature's non-goal), but their labels still render so no raw + // token rune leaks into the steer text. 
+ text, imgs := t.composeMessage() + if line := strings.TrimSpace(text); line != "" { turn.queue(line) t.noteQueuedLocked(line) - t.buf, t.pos, t.snippets = nil, 0, nil + if len(imgs) > 0 { + t.removeFooterLocked() + t.writeLocked(fmt.Sprintf("%s images are not carried on a steer; resend them in a fresh message%s\n", dim, reset)) + t.drawFooterLocked() + } + t.buf, t.pos, t.snippets, t.images = nil, 0, nil, nil } case r == '\r': // Enter submits; Shift+Enter, Ctrl-J, and \+Enter newline if !mask && t.pos > 0 && t.buf[t.pos-1] == '\\' { @@ -911,7 +943,9 @@ func (t *tuiConsole) readLineMode(prompt string, mask bool, turn *turnAttend) (s for _, s := range segs { shown.WriteString(s) } - line := strings.TrimSpace(t.expandSnippets()) + text, imgs := t.composeMessage() + line := strings.TrimSpace(text) + t.submitImages = imgs // drained by takeImages after ReadLine returns t.endInput(t.prompt + shown.String()) if !mask { t.hist = appendHistory(t.histPath, t.hist, line) @@ -939,6 +973,12 @@ func (t *tuiConsole) readLineMode(prompt string, mask bool, turn *turnAttend) (s t.pos = lineStart(t.buf, t.pos) case r == 0x05: // Ctrl-E: end of the current logical line t.pos = lineEnd(t.buf, t.pos) + case r == 0x16 && !mask: // Ctrl-V: paste an image off the clipboard + // The capture shells out and writes to the transcript, so it must run + // without the mutex (Print relocks it); mirror the Escape handler. 
+ t.mu.Unlock() + t.captureImage() + t.mu.Lock() case r == '\t': if !mask { t.completeLocked() @@ -974,7 +1014,7 @@ func (t *tuiConsole) insertLocked(r rune) { func (t *tuiConsole) expandSnippets() string { var b strings.Builder for _, r := range t.buf { - if r >= snippetBase && int(r-snippetBase) < len(t.snippets) { + if r >= snippetBase && r < imageBase && int(r-snippetBase) < len(t.snippets) { b.WriteString(t.snippets[int(r-snippetBase)]) continue } @@ -983,18 +1023,140 @@ func (t *tuiConsole) expandSnippets() string { return b.String() } +// captureImage runs the Ctrl-V pipeline: read the clipboard image, gate it on +// the active model's vision support, downscale and store it, then insert an +// atomic [image-N] token and print an honest note. It takes the mutex itself +// (the caller released it so the transcript writes can relock); every path that +// declines to insert states why, so a paste is never silently dropped. +func (t *tuiConsole) captureImage() { + raw, _, err := readClipboardImage() + if err != nil { + t.note("can't paste image: " + err.Error()) + return + } + t.captureRaw(raw) +} + +// captureImageQuiet is the Cmd+V empty-paste path: it captures an image only +// when one is actually on the clipboard and emits nothing otherwise, so a plain +// empty paste (no image) produces no output. Once an image is found it runs the +// same gating, store, and honest-note path as Ctrl+V. +func (t *tuiConsole) captureImageQuiet() { + raw, _, err := readClipboardImage() + if err != nil { + return // no image (or no tool): stay silent, this was an ordinary empty paste + } + t.captureRaw(raw) +} + +// captureRaw is the shared tail of both capture paths once raw clipboard bytes +// are in hand: vision gating, downscale, blob store, the atomic [image-N] token, +// and the honest note. It takes the mutex itself for the buffer edit (the caller +// released it so the transcript writes can relock). 
+func (t *tuiConsole) captureRaw(raw []byte) { + if t.visionOK != nil && !t.visionOK() { + t.note("the current model can't see images; switch with /model, or set \"vision\": true on the provider profile if it does") + return + } + data, mediaType, w, h, err := decodeAndDownscale(raw) + if err != nil { + t.note("can't paste image: " + err.Error()) + return + } + hash, err := storeBlob(data, mediaType) + if err != nil { + t.note("can't paste image: " + err.Error()) + return + } + im := agent.Image{Hash: hash, MediaType: mediaType, Width: w, Height: h, Data: data} + t.mu.Lock() + t.images = append(t.images, im) + t.insertLocked(imageBase + rune(len(t.images)-1)) + if t.footer { + t.refreshFooterLocked() + } + t.mu.Unlock() + dims := "unknown size" + if w > 0 && h > 0 { + dims = fmt.Sprintf("%dx%d", w, h) + } + t.note(fmt.Sprintf("image captured: %s %s, ~%d tokens", + mediaType, dims, estimateImageTokens(w, h))) +} + +// note drops a dim transcript line above the footer, the capture feedback +// counterpart to the snippet note. It takes the mutex itself. +func (t *tuiConsole) note(msg string) { + t.mu.Lock() + defer t.mu.Unlock() + t.removeFooterLocked() + t.writeLocked(fmt.Sprintf("%s %s%s\n", dim, msg, reset)) + t.drawFooterLocked() +} + +// composeMessage walks the buffer into the text to send and the ordered images +// it carries: snippet tokens expand to their pasted content, image tokens append +// their image and write a [image-K] label (numbered by appearance, matching what +// segments() drew), and every other rune passes through. An image token never +// leaks into the text as a raw private-use rune. 
+func (t *tuiConsole) composeMessage() (string, []agent.Image) { + var b strings.Builder + var imgs []agent.Image + for _, r := range t.buf { + switch { + case r >= snippetBase && r < imageBase && int(r-snippetBase) < len(t.snippets): + b.WriteString(t.snippets[int(r-snippetBase)]) + case r >= imageBase && int(r-imageBase) < len(t.images): + imgs = append(imgs, t.images[int(r-imageBase)]) + fmt.Fprintf(&b, "[image-%d]", len(imgs)) + default: + b.WriteRune(r) + } + } + return b.String(), imgs +} + +// takeImages returns the images of the last submitted message and clears them, +// so the next submit starts empty. The repl pulls them after ReadLine to attach +// to the user Turn. +func (t *tuiConsole) takeImages() []agent.Image { + imgs := t.submitImages + t.submitImages = nil + return imgs +} + // handleEscape reads one escape sequence and applies its editing action: // arrows move the cursor or walk history, home/end/delete edit, Shift+Enter // inserts a newline (CSI 13;2u from the Kitty disambiguation flag, or CSI // 27;2;13~ from terminals/tmux in extended-keys mode), and a bracketed-paste -// begin marker pulls the whole paste into the buffer. +// begin marker pulls the whole paste into the buffer. Alt+V (ESC then a plain +// v) is the Ctrl+V fallback for terminals like Windows Terminal that swallow +// Ctrl+V for their own paste; it runs the same image-capture pipeline. func (t *tuiConsole) handleEscape() { + // Peek the byte after ESC: a plain v is Alt+V, not a CSI/SS3 introducer, so + // route it to the capture pipeline before the sequence decoder. The capture + // writes to the transcript (Print relocks), so it must run unlocked, exactly + // like the Ctrl+V case; the caller already released the mutex for handleEscape. 
+ if r, more := t.nextRune(); more { + if r == 'v' || r == 'V' { + t.captureImage() + return + } + t.unget(r) // not Alt+V: hand the introducer back to the CSI/SS3 decoder + } params, final, ok := t.readCSI() if !ok { return } if final == '~' && params == "200" { // bracketed paste content := t.readPaste() + if len(content) == 0 { + // macOS Cmd+V often delivers an image paste as an empty bracketed + // paste. Try a quiet image capture: it acts only when an image is + // actually on the clipboard, so an ordinary empty paste stays silent. + t.captureImageQuiet() + return + } t.mu.Lock() t.insertPasteLocked(content) t.mu.Unlock() diff --git a/harness/tui_test.go b/harness/tui_test.go index 5ed1f5e..ff253ab 100644 --- a/harness/tui_test.go +++ b/harness/tui_test.go @@ -5,8 +5,60 @@ import ( "os" "strings" "testing" + + "github.com/mike-diff/sesh/agent" ) +// TestSegmentsImageRenumbering: image tokens display as [image-K] by order of +// appearance, independent of their absolute rune index, so deleting an earlier +// image renumbers the rest. The token rune stays the absolute images-slice index +// for byte lookup. Breaker: number the label off (r-imageBase)+1 instead of the +// running appearance count and the second image reads [image-3] not [image-2]. +func TestSegmentsImageRenumbering(t *testing.T) { + tc := &tuiConsole{images: make([]agent.Image, 3)} + // Buffer holds tokens for absolute indices 0 and 2 (index 1 was "deleted": + // its token is gone from the buffer but the slice is not compacted). + tc.buf = []rune{'a', imageBase + 0, 'b', imageBase + 2} + if got := strings.Join(tc.segments(), ""); got != "a[image-1]b[image-2]" { + t.Fatalf("renumber by appearance: %q, want %q", got, "a[image-1]b[image-2]") + } +} + +// TestComposeMessageOrdersImagesAndLabels: composeMessage returns the ordered +// images a buffer carries and writes a [image-K] label for each (never the raw +// private-use rune), with snippets expanded inline. 
The image order follows the +// buffer, not the absolute index. Breaker: append t.images by absolute index +// rather than in buffer order and the returned images come back reversed. +func TestComposeMessageOrdersImagesAndLabels(t *testing.T) { + imgs := []agent.Image{{Hash: "h0"}, {Hash: "h1"}} + tc := &tuiConsole{images: imgs, snippets: []string{"BIG"}} + // Appearance order is index 1 then index 0; a snippet token sits between them. + tc.buf = []rune{'s', 'e', 'e', ' ', imageBase + 1, ' ', snippetBase + 0, ' ', imageBase + 0} + text, got := tc.composeMessage() + if text != "see [image-1] BIG [image-2]" { + t.Fatalf("text = %q, want %q", text, "see [image-1] BIG [image-2]") + } + if strings.ContainsRune(text, imageBase) { + t.Fatalf("text leaked a raw image token rune: %q", text) + } + if len(got) != 2 || got[0].Hash != "h1" || got[1].Hash != "h0" { + t.Fatalf("images must follow buffer order, got %+v", got) + } +} + +// TestTakeImagesDrains: takeImages returns the last submit's images once and +// then nothing, so a message's images are attached exactly once. Breaker: drop +// the clear in takeImages and a second turn re-attaches the prior images. +func TestTakeImagesDrains(t *testing.T) { + tc := &tuiConsole{submitImages: []agent.Image{{Hash: "h0"}}} + if got := tc.takeImages(); len(got) != 1 { + t.Fatalf("first take must return the submit's images, got %d", len(got)) + } + if got := tc.takeImages(); got != nil { + t.Fatalf("second take must be empty, got %d", len(got)) + } +} + // TestSegWidthIgnoresANSI: a segment carrying highlight SGR measures by its // visible width, so the input window math stays aligned. Breaker: stop // stripping ANSI in segWidth and the escape bytes inflate the width. 
diff --git a/provider/provider.go b/provider/provider.go index be0c98c..01ebc41 100644 --- a/provider/provider.go +++ b/provider/provider.go @@ -13,6 +13,7 @@ import ( "bufio" "bytes" "context" + "encoding/base64" "encoding/json" "fmt" "io" @@ -252,6 +253,24 @@ func (p Anthropic) Chat(ctx context.Context, system string, history []agent.Turn for _, t := range history { switch t.Role { case "user": + if len(t.Images) > 0 { + var blocks []map[string]any + if t.Text != "" { + blocks = append(blocks, map[string]any{"type": "text", "text": t.Text}) + } + for _, im := range t.Images { + blocks = append(blocks, map[string]any{ + "type": "image", + "source": map[string]any{ + "type": "base64", + "media_type": im.MediaType, + "data": base64.StdEncoding.EncodeToString(im.Data), + }, + }) + } + msgs = append(msgs, map[string]any{"role": "user", "content": blocks}) + break + } msgs = append(msgs, map[string]any{"role": "user", "content": t.Text}) case "assistant": var blocks []map[string]any @@ -388,6 +407,21 @@ func (p OpenAI) Chat(ctx context.Context, system string, history []agent.Turn, t for _, t := range history { switch t.Role { case "user": + if len(t.Images) > 0 { + var parts []map[string]any + if t.Text != "" { + parts = append(parts, map[string]any{"type": "text", "text": t.Text}) + } + for _, im := range t.Images { + url := "data:" + im.MediaType + ";base64," + base64.StdEncoding.EncodeToString(im.Data) + parts = append(parts, map[string]any{ + "type": "image_url", + "image_url": map[string]any{"url": url}, + }) + } + msgs = append(msgs, map[string]any{"role": "user", "content": parts}) + break + } msgs = append(msgs, map[string]any{"role": "user", "content": t.Text}) case "assistant": m := map[string]any{"role": "assistant", "content": t.Text} diff --git a/provider/provider_test.go b/provider/provider_test.go index 10b06b7..85398fb 100644 --- a/provider/provider_test.go +++ b/provider/provider_test.go @@ -2,6 +2,7 @@ package provider import ( "context" + 
"encoding/base64" "encoding/json" "errors" "fmt" @@ -198,6 +199,135 @@ func TestOpenAICachedTokens(t *testing.T) { } } +// TestAnthropicImageContent: a user Turn carrying an Image serializes its +// content as a block array (a text block when there is text, then a bare-base64 +// image block), while a text-only turn keeps the plain-string content exactly as +// before. Breaker: remove the len(t.Images)>0 branch and the image turn falls +// through to a string content with no image block, failing the block assertions. +func TestAnthropicImageContent(t *testing.T) { + var body map[string]any + var hdr http.Header + srv := sseServer(t, &body, &hdr, + `{"type":"message_start","message":{"usage":{"input_tokens":1}}}`, + ) + defer srv.Close() + + p := Anthropic{BaseURL: srv.URL, Key: "k", Model: "m"} + _, err := p.Chat(context.Background(), "SYS", + []agent.Turn{ + {Role: "user", Text: "plain"}, + {Role: "user", Text: "look", Images: []agent.Image{ + {MediaType: "image/png", Data: []byte("PNGBYTES")}, + }}, + }, nil, func(string) {}, func(string) {}) + if err != nil { + t.Fatal(err) + } + + msgs := body["messages"].([]any) + if len(msgs) != 2 { + t.Fatalf("got %d messages", len(msgs)) + } + + // text-only turn: content stays a bare string (no regression) + if c := msgs[0].(map[string]any)["content"]; c != "plain" { + t.Fatalf("text-only content must stay the bare string, got %T %v", c, c) + } + + // image turn: content is a block array, text block first, then image block + blocks := msgs[1].(map[string]any)["content"].([]any) + if len(blocks) != 2 { + t.Fatalf("image turn should have a text block and an image block, got %v", blocks) + } + if tb := blocks[0].(map[string]any); tb["type"] != "text" || tb["text"] != "look" { + t.Fatalf("first block must carry the text: %v", tb) + } + im := blocks[1].(map[string]any) + if im["type"] != "image" { + t.Fatalf("second block must be an image: %v", im) + } + src := im["source"].(map[string]any) + if src["type"] != "base64" || 
src["media_type"] != "image/png" { + t.Fatalf("image source shape: %v", src) + } + // data is BARE base64, no data: URI prefix + want := base64.StdEncoding.EncodeToString([]byte("PNGBYTES")) + if src["data"] != want { + t.Fatalf("image data must be bare base64 %q, got %q", want, src["data"]) + } +} + +// TestOpenAIImageContent: the OpenAI adapter emits a parts array with a data-URI +// image_url part for an image turn, and keeps plain-string content for a +// text-only turn. Breaker: remove the len(t.Images)>0 branch and the image turn +// serializes as a bare string with no image_url part. +func TestOpenAIImageContent(t *testing.T) { + var body map[string]any + var hdr http.Header + srv := sseServer(t, &body, &hdr, + `{"choices":[{"delta":{"content":"ok"}}]}`, + `[DONE]`, + ) + defer srv.Close() + + p := OpenAI{BaseURL: srv.URL, Key: "k", Model: "m"} + _, err := p.Chat(context.Background(), "SYS", + []agent.Turn{ + {Role: "user", Text: "plain"}, + {Role: "user", Text: "look", Images: []agent.Image{ + {MediaType: "image/jpeg", Data: []byte("JPGBYTES")}, + }}, + }, nil, func(string) {}, func(string) {}) + if err != nil { + t.Fatal(err) + } + + msgs := body["messages"].([]any) + // msgs[0] is the system message; the two user turns follow + if len(msgs) != 3 { + t.Fatalf("got %d messages", len(msgs)) + } + if c := msgs[1].(map[string]any)["content"]; c != "plain" { + t.Fatalf("text-only content must stay the bare string, got %T %v", c, c) + } + + parts := msgs[2].(map[string]any)["content"].([]any) + if len(parts) != 2 { + t.Fatalf("image turn should have a text part and an image_url part, got %v", parts) + } + if tp := parts[0].(map[string]any); tp["type"] != "text" || tp["text"] != "look" { + t.Fatalf("first part must carry the text: %v", tp) + } + ip := parts[1].(map[string]any) + if ip["type"] != "image_url" { + t.Fatalf("second part must be image_url: %v", ip) + } + wantURL := "data:image/jpeg;base64," + base64.StdEncoding.EncodeToString([]byte("JPGBYTES")) + if 
got := ip["image_url"].(map[string]any)["url"]; got != wantURL { + t.Fatalf("image_url must be a data URI %q, got %q", wantURL, got) + } +} + +// TestOpenAIOmitsToolsWhenNone: with no tools the request body carries no "tools" +// field, so a tools-less model (a local vision model via the no_tools profile +// dial) is not rejected. Breaker: drop the len(toolsParam)>0 guard and an empty +// "tools" appears in the body. +func TestOpenAIOmitsToolsWhenNone(t *testing.T) { + var body map[string]any + var hdr http.Header + srv := sseServer(t, &body, &hdr, `{"choices":[{"delta":{"content":"ok"}}]}`, `[DONE]`) + defer srv.Close() + + p := OpenAI{BaseURL: srv.URL, Key: "k", Model: "m"} + if _, err := p.Chat(context.Background(), "SYS", + []agent.Turn{{Role: "user", Text: "hi"}}, nil, func(string) {}, func(string) {}); err != nil { + t.Fatal(err) + } + if _, ok := body["tools"]; ok { + t.Fatalf("with no tools the body must omit the tools field, got %v", body["tools"]) + } +} + // TestListModelInfos: discovery parses the context-length field names the // wild uses, and models without one report 0. func TestListModelInfos(t *testing.T) {