Skip to content

Commit ac2f515

Browse files
committed
feat: vision describe supports --file-id to skip base64 encoding
- Add a --file-id option as a mutually exclusive alternative to --image.
- When fileId is provided, the request body sends {prompt, file_id} directly (no base64).
- When image is provided, fall back to the existing base64 toDataUri path.
- The TTY interactive prompt accepts a path, URL, or fileId, using heuristic detection.
- Update the required-field markers and the export-schema schema.
1 parent 1aace14 commit ac2f515

1 file changed

Lines changed: 44 additions & 37 deletions

File tree

src/commands/vision/describe.ts

Lines changed: 44 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,11 @@ const MIME_TYPES: Record<string, string> = {
2323
};
2424

2525
async function toDataUri(image: string): Promise<string> {
26-
if (image.startsWith('data:')) {
27-
return image;
28-
}
26+
if (image.startsWith('data:')) return image;
2927

3028
if (image.startsWith('http://') || image.startsWith('https://')) {
3129
const res = await fetch(image);
32-
if (!res.ok) {
33-
throw new CLIError(
34-
`Failed to download image: HTTP ${res.status}`,
35-
ExitCode.GENERAL,
36-
);
37-
}
30+
if (!res.ok) throw new CLIError(`Failed to download image: HTTP ${res.status}`, ExitCode.GENERAL);
3831
const contentType = res.headers.get('content-type') || 'image/jpeg';
3932
const mime = contentType.split(';')[0]!.trim();
4033
const buf = await res.arrayBuffer();
@@ -43,79 +36,93 @@ async function toDataUri(image: string): Promise<string> {
4336
}
4437

4538
// Local file
46-
if (!existsSync(image)) {
47-
throw new CLIError(
48-
`File not found: ${image}`,
49-
ExitCode.USAGE,
50-
);
51-
}
52-
39+
if (!existsSync(image)) throw new CLIError(`File not found: ${image}`, ExitCode.USAGE);
5340
const ext = extname(image).toLowerCase();
5441
const mime = MIME_TYPES[ext];
55-
if (!mime) {
56-
throw new CLIError(
57-
`Unsupported image format "${ext}". Supported: jpg, jpeg, png, webp`,
58-
ExitCode.USAGE,
59-
);
60-
}
61-
42+
if (!mime) throw new CLIError(`Unsupported image format "${ext}". Supported: jpg, jpeg, png, webp`, ExitCode.USAGE);
6243
const buf = readFileSync(image);
63-
const b64 = buf.toString('base64');
64-
return `data:${mime};base64,${b64}`;
44+
return `data:${mime};base64,${buf.toString('base64')}`;
6545
}
6646

6747
export default defineCommand({
6848
name: 'vision describe',
6949
description: 'Describe an image using MiniMax VLM',
70-
usage: 'minimax vision describe --image <path-or-url> [--prompt <text>]',
50+
usage: 'minimax vision describe (--image <path-or-url> | --file-id <id>) [--prompt <text>]',
7151
options: [
72-
{ flag: '--image <path-or-url>', description: 'Image file path or URL', required: true },
52+
{ flag: '--image <path-or-url>', description: 'Local image path or URL (base64 encoded automatically)' },
53+
{ flag: '--file-id <id>', description: 'Pre-uploaded file ID (skips base64 conversion)' },
7354
{ flag: '--prompt <text>', description: 'Question about the image (default: "Describe the image.")' },
7455
],
7556
examples: [
7657
'minimax vision describe --image photo.jpg',
7758
'minimax vision describe --image https://example.com/photo.jpg --prompt "What breed is this dog?"',
78-
'minimax vision describe --image screenshot.png --prompt "Extract the text" --output json',
59+
'minimax vision describe --file-id file-123456789 --prompt "Extract the text"',
7960
],
8061
async run(config: Config, flags: GlobalFlags) {
8162
let image = flags.image as string | undefined;
63+
let fileId = flags.fileId as string | undefined;
8264
const prompt = (flags.prompt as string) || 'Describe the image.';
8365

84-
if (!image) {
66+
// Mutually exclusive: must provide one, cannot provide both
67+
if (!image && !fileId) {
8568
if (isInteractive({ nonInteractive: config.nonInteractive })) {
8669
const hint = await promptText({
87-
message: 'Enter image path or URL:',
70+
message: 'Enter image path, URL, or File ID:',
8871
});
8972
if (!hint) {
9073
process.stderr.write('Vision describe cancelled.\n');
9174
process.exit(1);
9275
}
93-
image = hint;
76+
// Simple heuristic: if no extension and not http(s), treat as fileId
77+
if (!hint.includes('.') && !hint.startsWith('http')) {
78+
fileId = hint;
79+
} else {
80+
image = hint;
81+
}
9482
} else {
95-
failIfMissing('image', 'minimax vision describe --image <path-or-url>');
83+
throw new CLIError(
84+
'Missing required argument. Must provide either --image or --file-id.',
85+
ExitCode.USAGE,
86+
'minimax vision describe --image <path> OR --file-id <id>',
87+
);
9688
}
89+
} else if (image && fileId) {
90+
throw new CLIError(
91+
'Conflicting arguments: cannot provide both --image and --file-id.',
92+
ExitCode.USAGE,
93+
);
9794
}
9895

9996
const format = detectOutputFormat(config.output);
10097

10198
if (config.dryRun) {
102-
console.log(formatOutput({ request: { prompt, image } }, format));
99+
process.stdout.write(formatOutput({ request: { prompt, image, fileId } }, format) + '\n');
103100
return;
104101
}
105102

106-
const imageUrl = await toDataUri(image);
107103
const url = vlmEndpoint(config.baseUrl);
104+
let body: Record<string, unknown> = { prompt };
105+
106+
if (fileId) {
107+
// Skip base64: pass fileId directly to the API
108+
body.file_id = fileId;
109+
} else if (image) {
110+
// Fallback to base64 encoding for local/HTTP images
111+
const imageUrl = await toDataUri(image);
112+
body.image_url = imageUrl;
113+
}
114+
108115
const response = await requestJson<VlmResponse>(config, {
109116
url,
110117
method: 'POST',
111-
body: { prompt, image_url: imageUrl },
118+
body,
112119
});
113120

114121
if (format !== 'text') {
115-
console.log(formatOutput(response, format));
122+
process.stdout.write(formatOutput(response, format) + '\n');
116123
return;
117124
}
118125

119-
console.log(response.content);
126+
process.stdout.write(response.content + '\n');
120127
},
121128
});

0 commit comments

Comments
 (0)