Improve AI metadata generation for videos

richiemcilroy · richiemcilroy · commit 3ce3a409c036 · 2026-01-14T11:39:51.000Z
diff --git a/.gitignore b/.gitignore
@@ -58,3 +58,6 @@ tauri.windows.conf.json
 scripts/backfill-releases.sh
 scripts/update-github-releases.sh
 scripts/releases-backfill-data.txt
+
+# SEO agent state (machine-local, contains sensitive ranking data)
+seo/
diff --git a/apps/web/actions/videos/generate-ai-metadata.ts b/apps/web/actions/videos/generate-ai-metadata.ts
@@ -11,6 +11,137 @@ import { Effect, Option } from "effect";
 import { GROQ_MODEL, getGroqClient } from "@/lib/groq-client";
 import { runPromise } from "@/lib/server";
 
+const MAX_CHARS_PER_CHUNK = 24000; // Character budget a transcript chunk may accumulate before a new chunk is started.
+
+interface VttSegment { // One caption text line paired with the start time of the cue it belongs to.
+	start: number; // Cue start in whole seconds; the parser discards the millisecond part.
+	text: string; // The caption line with surrounding whitespace trimmed.
+}
+
+/**
+ * Extracts caption lines from WebVTT text, tagging each with its cue's start time
+ * in whole seconds (milliseconds dropped). Blank lines, the "WEBVTT" header and
+ * digit-only cue identifiers are skipped. WebVTT lets timestamps omit the hours
+ * ("mm:ss.ttt"), so "00:01:05.000" and "01:05.000" both yield 65. A timing line
+ * that cannot be parsed leaves the previous start time in effect.
+ */
+function parseVttWithTimestamps(vttContent: string): VttSegment[] {
+	const segments: VttSegment[] = [];
+	let currentStart = 0;
+
+	for (const rawLine of vttContent.split("\n")) {
+		const line = rawLine.trim();
+		if (line.includes("-->")) {
+			const time = line.match(/(?:(\d{2,}):)?(\d{2}):(\d{2})[.,]\d{3}/);
+			if (time) {
+				const [, hours = "0", minutes = "0", seconds = "0"] = time;
+				currentStart =
+					parseInt(hours, 10) * 3600 +
+					parseInt(minutes, 10) * 60 +
+					parseInt(seconds, 10);
+			}
+		} else if (line && line !== "WEBVTT" && !/^\d+$/.test(line)) {
+			segments.push({ start: currentStart, text: line });
+		}
+	}
+	return segments;
+}
+
+/**
+ * Groups consecutive segments into size-bounded chunks. A chunk is closed just
+ * before the segment that would push its running length (each text plus one
+ * separator character) past MAX_CHARS_PER_CHUNK; a single oversized segment still
+ * forms its own chunk. A chunk's endTime is the start time of its last segment.
+ */
+function chunkTranscriptWithTimestamps(
+	segments: VttSegment[],
+): { text: string; startTime: number; endTime: number }[] {
+	const result: { text: string; startTime: number; endTime: number }[] = [];
+	let pending: VttSegment[] = [];
+	let pendingChars = 0;
+
+	// Emits the buffered segments as one chunk; does nothing for an empty buffer.
+	const flush = (): void => {
+		if (pending.length === 0) return;
+		result.push({
+			text: pending.map((s) => s.text).join(" "),
+			startTime: pending[0]?.start ?? 0,
+			endTime: pending[pending.length - 1]?.start ?? 0,
+		});
+		pending = [];
+		pendingChars = 0;
+	};
+
+	segments.forEach((segment) => {
+		if (pendingChars + segment.text.length > MAX_CHARS_PER_CHUNK) flush();
+		pending.push(segment);
+		pendingChars += segment.text.length + 1;
+	});
+
+	flush();
+	return result;
+}
+
+/**
+ * Sends `prompt` as a single user message and resolves with the reply text.
+ *
+ * With a Groq client, Groq (GROQ_MODEL) is tried first; if that call throws, the
+ * error is logged and the prompt is retried against OpenAI ("gpt-4o-mini") when
+ * OPENAI_API_KEY is set, otherwise the Groq error is rethrown. Without a Groq
+ * client, OpenAI is called directly when its key is set.
+ *
+ * @returns The first choice's message content, or "{}" when that content is
+ *   empty or missing, or when neither provider is available.
+ * @throws Error on a non-OK OpenAI response; the original Groq error when Groq
+ *   fails and no OpenAI key is configured.
+ */
+async function callAiApi(
+	prompt: string,
+	groqClient: ReturnType<typeof getGroqClient>,
+): Promise<string> {
+	// One OpenAI request path, shared by the Groq fallback and the Groq-less branch.
+	const askOpenAi = async (): Promise<string> => {
+		const response = await fetch("https://api.openai.com/v1/chat/completions", {
+			method: "POST",
+			headers: {
+				"Content-Type": "application/json",
+				Authorization: `Bearer ${serverEnv().OPENAI_API_KEY}`,
+			},
+			body: JSON.stringify({
+				model: "gpt-4o-mini",
+				messages: [{ role: "user", content: prompt }],
+			}),
+		});
+		if (!response.ok) {
+			const errorText = await response.text();
+			throw new Error(`OpenAI API error: ${response.status} ${errorText}`);
+		}
+		const payload = await response.json();
+		return payload.choices?.[0]?.message?.content || "{}";
+	};
+
+	if (!groqClient) {
+		// No Groq client: OpenAI is the only option, and only when a key exists.
+		return serverEnv().OPENAI_API_KEY ? askOpenAi() : "{}";
+	}
+
+	try {
+		const completion = await groqClient.chat.completions.create({
+			messages: [{ role: "user", content: prompt }],
+			model: GROQ_MODEL,
+		});
+		return completion.choices?.[0]?.message?.content || "{}";
+	} catch (groqError) {
+		console.error(
+			`[generateAiMetadata] Groq API error: ${groqError}, falling back to OpenAI`,
+		);
+		// Without an OpenAI key there is nothing to fall back to: surface the Groq failure.
+		if (!serverEnv().OPENAI_API_KEY) {
+			throw groqError;
+		}
+		return askOpenAi();
+	}
+}
+
 export async function generateAiMetadata(
 	videoId: Video.VideoId,
 	userId: string,
@@ -160,15 +291,9 @@ export async function generateAiMetadata(
 			return;
 		}
 
-		const transcriptText = vtt.value
-			.split("\n")
-			.filter(
-				(l) =>
-					l.trim() &&
-					l !== "WEBVTT" &&
-					!/^\d+$/.test(l.trim()) &&
-					!l.includes("-->"),
-			)
+		const segments = parseVttWithTimestamps(vtt.value);
+		const transcriptText = segments
+			.map((s) => s.text)
 			.join(" ")
 			.trim();
 
@@ -189,80 +314,154 @@ export async function generateAiMetadata(
 			return;
 		}
 
-		const prompt = `You are Cap AI. Summarize the transcript and provide JSON in the following format:
+		const chunks = chunkTranscriptWithTimestamps(segments);
+		console.log(
+			`[generateAiMetadata] Processing ${videoId}: ${transcriptText.length} chars, ${chunks.length} chunk(s)`,
+		);
+
+		let content = "{}";
+
+		if (chunks.length === 1) {
+			const prompt = `You are Cap AI, an expert at analyzing video content and creating comprehensive summaries.
+
+Analyze this transcript thoroughly and provide a detailed JSON response:
 {
-  "title": "string",
-  "summary": "string (write from 1st person perspective if appropriate, e.g. 'In this video, I demonstrate...' to make it feel personable)",
-  "chapters": [{"title": "string", "start": number}]
+  "title": "string (concise but descriptive title that captures the main topic)",
+  "summary": "string (detailed summary that covers ALL key points discussed. For meetings: include decisions made, action items, and key discussion points. For tutorials: cover all steps and concepts explained. For presentations: summarize all main arguments and supporting points. Write from 1st person perspective if the speaker is teaching/presenting, e.g. 'In this video, I walk through...'. Make it comprehensive enough that someone could understand the full content without watching.)",
+  "chapters": [{"title": "string (descriptive chapter title)", "start": number (seconds from start)}]
 }
+
+Guidelines:
+- The summary should be detailed and comprehensive, not a brief overview
+- Capture ALL important topics, not just the main theme
+- For longer content, organize the summary by topic or chronologically
+- Include specific details, names, numbers, and conclusions mentioned
+- Chapters should mark distinct topic changes or sections
+
 Return ONLY valid JSON without any markdown formatting or code blocks.
 Transcript:
 ${transcriptText}`;
+			content = await callAiApi(prompt, groqClient);
+		} else {
+			const chunkSummaries: {
+				summary: string;
+				keyPoints: string[];
+				chapters: { title: string; start: number }[];
+				startTime: number;
+				endTime: number;
+			}[] = [];
 
-		let content = "{}";
-
-		if (groqClient) {
-			try {
-				const completion = await groqClient.chat.completions.create({
-					messages: [{ role: "user", content: prompt }],
-					model: GROQ_MODEL,
-				});
-				content = completion.choices?.[0]?.message?.content || "{}";
-			} catch (groqError) {
-				console.error(
-					`[generateAiMetadata] Groq API error: ${groqError}, falling back to OpenAI`,
+			for (let i = 0; i < chunks.length; i++) {
+				const chunk = chunks[i];
+				if (!chunk) continue;
+				console.log(
+					`[generateAiMetadata] Processing chunk ${i + 1}/${chunks.length} for ${videoId} (${chunk.startTime}s - ${chunk.endTime}s)`,
 				);
-				// Fallback to OpenAI if Groq fails and OpenAI key exists
-				if (serverEnv().OPENAI_API_KEY) {
-					const aiRes = await fetch(
-						"https://api.openai.com/v1/chat/completions",
-						{
-							method: "POST",
-							headers: {
-								"Content-Type": "application/json",
-								Authorization: `Bearer ${serverEnv().OPENAI_API_KEY}`,
-							},
-							body: JSON.stringify({
-								model: "gpt-4o-mini",
-								messages: [{ role: "user", content: prompt }],
-							}),
-						},
-					);
-					if (!aiRes.ok) {
-						const errorText = await aiRes.text();
-						console.error(
-							`[generateAiMetadata] OpenAI API error: ${aiRes.status} ${errorText}`,
-						);
-						throw new Error(`OpenAI API error: ${aiRes.status} ${errorText}`);
+
+				const chunkPrompt = `You are Cap AI, an expert at analyzing video content. This is section ${i + 1} of ${chunks.length} from a longer video (timestamp ${Math.floor(chunk.startTime / 60)}:${String(chunk.startTime % 60).padStart(2, "0")} to ${Math.floor(chunk.endTime / 60)}:${String(chunk.endTime % 60).padStart(2, "0")}).
+
+Analyze this section thoroughly and provide JSON:
+{
+  "summary": "string (detailed summary of this section - capture ALL key points, topics discussed, decisions made, or concepts explained. Include specific details like names, numbers, action items, and conclusions. This should be 3-6 sentences minimum.)",
+  "keyPoints": ["string (specific key point or takeaway)", ...],
+  "chapters": [{"title": "string (descriptive title for this topic/section)", "start": number (seconds from video start)}]
+}
+
+Be thorough - this summary will be combined with other sections to create a comprehensive overview.
+Return ONLY valid JSON without any markdown formatting or code blocks.
+Transcript section:
+${chunk.text}`;
+
+				const chunkContent = await callAiApi(chunkPrompt, groqClient);
+				try {
+					let cleanContent = chunkContent;
+					if (chunkContent.includes("```json")) {
+						cleanContent = chunkContent
+							.replace(/```json\s*/g, "")
+							.replace(/```\s*/g, "");
+					} else if (chunkContent.includes("```")) {
+						cleanContent = chunkContent.replace(/```\s*/g, "");
 					}
-					const aiJson = await aiRes.json();
-					content = aiJson.choices?.[0]?.message?.content || "{}";
-				} else {
-					throw groqError;
+					const parsed = JSON.parse(cleanContent.trim());
+					chunkSummaries.push({
+						summary: parsed.summary || "",
+						keyPoints: parsed.keyPoints || [],
+						chapters: parsed.chapters || [],
+						startTime: chunk.startTime,
+						endTime: chunk.endTime,
+					});
+				} catch {
+					console.error(
+						`[generateAiMetadata] Failed to parse chunk ${i + 1} response for ${videoId}`,
+					);
 				}
 			}
-		} else if (serverEnv().OPENAI_API_KEY) {
-			// Use OpenAI if Groq client is not available
-			const aiRes = await fetch("https://api.openai.com/v1/chat/completions", {
-				method: "POST",
-				headers: {
-					"Content-Type": "application/json",
-					Authorization: `Bearer ${serverEnv().OPENAI_API_KEY}`,
-				},
-				body: JSON.stringify({
-					model: "gpt-4o-mini",
-					messages: [{ role: "user", content: prompt }],
-				}),
-			});
-			if (!aiRes.ok) {
-				const errorText = await aiRes.text();
+
+			const allChapters = chunkSummaries.flatMap((c) => c.chapters);
+			const allKeyPoints = chunkSummaries.flatMap((c) => c.keyPoints);
+
+			const sectionDetails = chunkSummaries
+				.map((c, i) => {
+					const timeRange = `${Math.floor(c.startTime / 60)}:${String(c.startTime % 60).padStart(2, "0")} - ${Math.floor(c.endTime / 60)}:${String(c.endTime % 60).padStart(2, "0")}`;
+					const keyPointsList =
+						c.keyPoints.length > 0
+							? `\nKey points: ${c.keyPoints.join("; ")}`
+							: "";
+					return `Section ${i + 1} (${timeRange}):\n${c.summary}${keyPointsList}`;
+				})
+				.join("\n\n");
+
+			const finalPrompt = `You are Cap AI, an expert at synthesizing information into comprehensive, well-organized summaries.
+
+Based on these detailed section analyses of a video, create a thorough final summary that captures EVERYTHING important.
+
+Section analyses:
+${sectionDetails}
+
+${allKeyPoints.length > 0 ? `All key points identified:\n${allKeyPoints.map((p, i) => `${i + 1}. ${p}`).join("\n")}\n` : ""}
+
+Provide JSON in the following format:
+{
+  "title": "string (concise but descriptive title that captures the main topic/purpose)",
+  "summary": "string (COMPREHENSIVE summary that covers the entire video thoroughly. This should be detailed enough that someone could understand all the important content without watching. Include: main topics covered, key decisions or conclusions, important details mentioned, action items if any. Organize it logically - for meetings use topics/agenda items, for tutorials use steps/concepts, for presentations use main arguments. Write from 1st person perspective if appropriate. This should be several paragraphs for longer content.)"
+}
+
+The summary must be detailed and comprehensive - not a brief overview. Capture all the important information from every section.
+Return ONLY valid JSON without any markdown formatting or code blocks.`;
+
+			const finalContent = await callAiApi(finalPrompt, groqClient);
+			try {
+				let cleanContent = finalContent;
+				if (finalContent.includes("```json")) {
+					cleanContent = finalContent
+						.replace(/```json\s*/g, "")
+						.replace(/```\s*/g, "");
+				} else if (finalContent.includes("```")) {
+					cleanContent = finalContent.replace(/```\s*/g, "");
+				}
+				const parsed = JSON.parse(cleanContent.trim());
+				content = JSON.stringify({
+					title: parsed.title,
+					summary: parsed.summary,
+					chapters: allChapters,
+				});
+			} catch {
 				console.error(
-					`[generateAiMetadata] OpenAI API error: ${aiRes.status} ${errorText}`,
+					`[generateAiMetadata] Failed to parse final summary for ${videoId}`,
 				);
-				throw new Error(`OpenAI API error: ${aiRes.status} ${errorText}`);
+				const fallbackSummary = chunkSummaries
+					.map((c, i) => `**Part ${i + 1}:** ${c.summary}`)
+					.join("\n\n");
+				const keyPointsSummary =
+					allKeyPoints.length > 0
+						? `\n\n**Key Points:**\n${allKeyPoints.map((p) => `- ${p}`).join("\n")}`
+						: "";
+				content = JSON.stringify({
+					title: "Video Summary",
+					summary: fallbackSummary + keyPointsSummary,
+					chapters: allChapters,
+				});
 			}
-			const aiJson = await aiRes.json();
-			content = aiJson.choices?.[0]?.message?.content || "{}";
 		}
 
 		let data: {