feat(media): add image/video description and local whisper priority

- Add describe-image.ts: OpenAI Vision API (gpt-4o-mini) image description
- Add describe-video.ts: ffmpeg frame extraction + Vision API description
- Rewrite transcribe.ts: local whisper/whisper-cli → OpenAI API → null
- Update manager.ts routeMedia(): all media converted to text before agent
- Image: describeImage() → text (was: raw ImageContent via writeWithImages)
- Video: describeVideo() → text (was: file path info only)
- Audio: unchanged (but underlying transcribeAudio now tries local first)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 11:03:31 +08:00 · 2026-02-09 11:03:31 +08:00 · db214b25ca
commit db214b25ca
parent 4e5780692e
4 changed files with 258 additions and 23 deletions
--- a/src/channels/manager.ts
+++ b/src/channels/manager.ts
@ -6,9 +6,12 @@
 * - Outgoing: agent reply → check lastRoute → forward to originating channel
 *
 * Uses "last route" pattern: whoever sent the last message gets the reply.
+ *
+ * @see docs/channels/README.md — Channel system overview
+ * @see docs/channels/media-handling.md — Media processing pipeline
+ * @see docs/message-paths.md — All three message paths (Desktop / Web / Channel)
 */

-import { readFile } from "node:fs/promises";
 import type { Hub } from "../hub/hub.js";
 import type {
  ChannelPlugin,
@ -21,6 +24,8 @@ import { loadChannelsConfig } from "./config.js";
 import { MessageAggregator, DEFAULT_CHUNKER_CONFIG } from "../hub/message-aggregator.js";
 import type { AsyncAgent } from "../agent/async-agent.js";
 import { transcribeAudio } from "../media/transcribe.js";
+import { describeImage } from "../media/describe-image.js";
+import { describeVideo } from "../media/describe-video.js";

 interface AccountHandle {
  channelId: string;
@ -289,12 +294,18 @@ export class ChannelManager {
      const filePath = await plugin.downloadMedia!(media.fileId, accountId);

      if (media.type === "image") {
-        // Images: pass directly to LLM as ImageContent
-        const buffer = await readFile(filePath);
-        const base64 = buffer.toString("base64");
-        const mimeType = media.mimeType ?? "image/jpeg";
-        const caption = media.caption || "User sent an image.";
-        agent.writeWithImages(caption, [{ type: "image", data: base64, mimeType }]);
+        // Images: describe via Vision API before reaching agent
+        const description = await describeImage(filePath);
+        if (description) {
+          const parts = ["[Image]", `Description: ${description}`];
+          if (media.caption) parts.push(`Caption: ${media.caption}`);
+          agent.write(parts.join("\n"));
+        } else {
+          // No API key — fall back to file path
+          const parts = ["[image message received]", `File: ${filePath}`];
+          if (media.caption) parts.push(`Caption: ${media.caption}`);
+          agent.write(parts.join("\n"));
+        }
      } else if (media.type === "audio") {
        // Audio: transcribe via Whisper API before reaching agent
        const transcript = await transcribeAudio(filePath);
@ -310,13 +321,28 @@ export class ChannelManager {
          if (media.caption) parts.push(`Caption: ${media.caption}`);
          agent.write(parts.join("\n"));
        }
+      } else if (media.type === "video") {
+        // Video: extract frame + describe via Vision API
+        const description = await describeVideo(filePath);
+        if (description) {
+          const parts = ["[Video]", `Description: ${description}`];
+          if (media.duration) parts.push(`Duration: ${media.duration}s`);
+          if (media.caption) parts.push(`Caption: ${media.caption}`);
+          agent.write(parts.join("\n"));
+        } else {
+          // ffmpeg unavailable or no API key — fall back to file path
+          const parts = ["[video message received]", `File: ${filePath}`];
+          if (media.mimeType) parts.push(`Type: ${media.mimeType}`);
+          if (media.duration) parts.push(`Duration: ${media.duration}s`);
+          if (media.caption) parts.push(`Caption: ${media.caption}`);
+          agent.write(parts.join("\n"));
+        }
      } else {
-        // Video/document: tell agent the file path
+        // Document: tell agent the file path
        const parts: string[] = [];
-        parts.push(`[${media.type} message received]`);
+        parts.push(`[document message received]`);
        parts.push(`File: ${filePath}`);
        if (media.mimeType) parts.push(`Type: ${media.mimeType}`);
-        if (media.duration) parts.push(`Duration: ${media.duration}s`);
        if (media.caption) parts.push(`Caption: ${media.caption}`);
        agent.write(parts.join("\n"));
      }
--- a/src/media/describe-image.ts
+++ b/src/media/describe-image.ts
@ -0,0 +1,77 @@
+/**
+ * Image description via OpenAI Vision API.
+ *
+ * Called by ChannelManager before the message reaches the Agent,
+ * so the Agent only ever sees a text description of the image.
+ *
+ * @see docs/channels/media-handling.md — Media processing pipeline
+ */
+
+import { readFile } from "node:fs/promises";
+import { extname } from "node:path";
+import { credentialManager } from "../agent/credentials.js";
+
+/** Map a file path's extension (case-insensitive) to an image MIME type; unrecognised extensions map to "image/jpeg". */
+function mimeFromExt(filePath: string): string {
+  const ext = extname(filePath).toLowerCase();
+  switch (ext) {
+    case ".png": return "image/png";
+    case ".gif": return "image/gif";
+    case ".webp": return "image/webp";
+    default: return "image/jpeg"; // covers .jpg/.jpeg, but also any unknown or missing extension
+  }
+}
+
+/**
+ * Describe an image by sending it inline (base64 data URL) to the OpenAI chat completions API (gpt-4o-mini).
+ * Rejects if the file cannot be read or the API responds with a non-OK status.
+ * @param filePath - Local path to the image file; its extension selects the MIME type that is sent
+ * @returns Text description, or null if no API key is configured or the response has no first choice
+ */
+export async function describeImage(filePath: string): Promise<string | null> {
+  const config = credentialManager.getLlmProviderConfig("openai");
+  const apiKey = config?.apiKey;
+  if (!apiKey) return null; // no key: report "unavailable" instead of throwing
+
+  const buffer = await readFile(filePath);
+  const base64 = buffer.toString("base64");
+  const mimeType = mimeFromExt(filePath);
+  const dataUrl = `data:${mimeType};base64,${base64}`;
+
+  const res = await fetch("https://api.openai.com/v1/chat/completions", {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${apiKey}`,
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      model: "gpt-4o-mini",
+      messages: [
+        {
+          role: "user",
+          content: [
+            {
+              type: "text",
+              text: "Describe this image concisely. Focus on the main content and any text visible in the image.",
+            },
+            {
+              type: "image_url",
+              image_url: { url: dataUrl },
+            },
+          ],
+        },
+      ],
+      max_tokens: 500,
+    }),
+  });
+
+  if (!res.ok) {
+    const errText = await res.text().catch(() => ""); // best-effort: the body only enriches the error message
+    throw new Error(`Vision API error: HTTP ${res.status} ${errText}`);
+  }
+
+  const result = (await res.json()) as { // shape is asserted here, not validated at runtime
+    choices: Array<{ message: { content: string } }>;
+  };
+  return result.choices[0]?.message.content ?? null;
+}
--- a/src/media/describe-video.ts
+++ b/src/media/describe-video.ts
@ -0,0 +1,49 @@
+/**
+ * Video description via frame extraction + Vision API.
+ *
+ * Extracts the first frame using ffmpeg, then describes it
+ * with the same Vision API used for images.
+ *
+ * @see docs/channels/media-handling.md — Media processing pipeline
+ */
+
+import { join } from "node:path";
+import { execFile } from "node:child_process";
+import { unlink } from "node:fs/promises";
+import { v7 as uuidv7 } from "uuid";
+import { MEDIA_CACHE_DIR } from "../shared/paths.js";
+import { describeImage } from "./describe-image.js";
+
+/**
+ * Describe a video by extracting one frame with ffmpeg into MEDIA_CACHE_DIR and passing it to describeImage().
+ * The temporary frame file is removed (best-effort) on both the success and the failure path.
+ * @param filePath - Local path to the video file
+ * @returns Text description, or null if describeImage() returns null or any step of the pipeline throws
+ */
+export async function describeVideo(filePath: string): Promise<string | null> {
+  const framePath = join(MEDIA_CACHE_DIR, `${uuidv7()}.jpg`);
+
+  try {
+    // Extract first frame with ffmpeg
+    await new Promise<void>((resolve, reject) => {
+      execFile(
+        "ffmpeg",
+        ["-i", filePath, "-vframes", "1", "-f", "image2", "-y", framePath],
+        { timeout: 10000 }, // milliseconds
+        (err) => (err ? reject(err) : resolve()),
+      );
+    });
+
+    // Describe the extracted frame
+    const description = await describeImage(framePath);
+
+    // Clean up the frame file
+    await unlink(framePath).catch(() => {});
+
+    return description;
+  } catch {
+    // Any failure lands here — ffmpeg missing/failing/timing out, or describeImage() throwing — and becomes null
+    await unlink(framePath).catch(() => {}); // best-effort: the frame may never have been written
+    return null;
+  }
+}
--- a/src/media/transcribe.ts
+++ b/src/media/transcribe.ts
@ -1,25 +1,77 @@
 /**
- * Audio transcription via OpenAI Whisper API.
+ * Audio transcription — local whisper first, OpenAI API fallback.
+ *
+ * Priority:
+ * 1. Local whisper/whisper-cli binary (free, no network round-trip, offline)
+ * 2. OpenAI Whisper API (requires API key)
+ * 3. null (no provider available — placeholder stays for Agent)
 *
 * Called by ChannelManager before the message reaches the Agent,
 * so the Agent only ever sees text.
+ *
+ * @see docs/channels/media-handling.md — Media processing pipeline and provider priority
 */

-import { readFile } from "node:fs/promises";
-import { basename } from "node:path";
+import { readFile, unlink } from "node:fs/promises";
+import { basename, join } from "node:path";
+import { execFile, execFileSync } from "node:child_process";
+import { tmpdir } from "node:os";
 import { credentialManager } from "../agent/credentials.js";

-/**
- * Transcribe an audio file using OpenAI Whisper API.
- *
- * @param filePath - Local path to the audio file
- * @returns Transcribed text, or null if no API key configured
- */
-export async function transcribeAudio(filePath: string): Promise<string | null> {
-  const config = credentialManager.getLlmProviderConfig("openai");
-  const apiKey = config?.apiKey;
-  if (!apiKey) return null;
+/** Cached path to local whisper binary, or false if not found */
+let cachedWhisperBin: string | false | undefined;

+/** Find a local whisper binary on PATH ("whisper" first, then "whisper-cli"); the outcome, including a miss, is memoised in cachedWhisperBin. */
+function findWhisperBin(): string | false {
+  if (cachedWhisperBin !== undefined) return cachedWhisperBin; // undefined = not probed yet; false = probed, not found
+
+  for (const bin of ["whisper", "whisper-cli"]) {
+    try {
+      execFileSync("which", [bin], { stdio: "pipe" }); // NOTE(review): relies on a `which` executable — confirm behaviour on Windows
+      cachedWhisperBin = bin; // the bare command name is cached, not the resolved path
+      return bin;
+    } catch {
+      // execFileSync threw (non-zero exit or spawn failure) — try next candidate
+    }
+  }
+
+  cachedWhisperBin = false;
+  return false;
+}
+
+/**
+ * Transcribe audio using local whisper CLI.
+ * Rejects if the CLI errors or exceeds the 120 s timeout, or if the expected .txt file cannot be read.
+ * Runs: whisper "<file>" --model base --output_format txt --output_dir <tmpdir>
+ * Reads the generated .txt file, deletes it, and returns its trimmed content.
+ */
+async function transcribeLocal(whisperBin: string, filePath: string): Promise<string> {
+  const outDir = tmpdir(); // NOTE(review): shared OS temp dir — same-named inputs transcribed concurrently would collide; confirm
+
+  await new Promise<void>((resolve, reject) => {
+    execFile(
+      whisperBin,
+      [filePath, "--model", "base", "--output_format", "txt", "--output_dir", outDir],
+      { timeout: 120000 }, // milliseconds
+      (err) => (err ? reject(err) : resolve()),
+    );
+  });
+
+  // whisper outputs <basename_without_ext>.txt
+  const name = basename(filePath).replace(/\.[^.]+$/, "");
+  const txtPath = join(outDir, `${name}.txt`);
+  const text = (await readFile(txtPath, "utf-8")).trim();
+
+  // Clean up the txt file (best-effort; not reached if the read above threw)
+  await unlink(txtPath).catch(() => {});
+
+  return text;
+}
+
+/**
+ * Transcribe audio using OpenAI Whisper API.
+ */
+async function transcribeApi(apiKey: string, filePath: string): Promise<string> {
  const fileBuffer = await readFile(filePath);
  const fileName = basename(filePath);

@ -61,3 +113,34 @@ export async function transcribeAudio(filePath: string): Promise<string | null>
  const result = (await res.json()) as { text: string };
  return result.text;
 }
+
+/**
+ * Transcribe an audio file.
+ *
+ * Priority: local whisper → OpenAI API → null.
+ * A local failure is logged and falls through to the API; errors from the API path propagate to the caller.
+ * @param filePath - Local path to the audio file
+ * @returns Transcribed text (possibly empty when local whisper yields nothing), or null if no provider available
+ */
+export async function transcribeAudio(filePath: string): Promise<string | null> {
+  // 1. Try local whisper
+  const whisperBin = findWhisperBin();
+  if (whisperBin) {
+    try {
+      return await transcribeLocal(whisperBin, filePath); // `await` keeps a rejection inside this try so the fallback can run
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      console.error(`[Transcribe] Local whisper failed: ${msg}, trying API...`);
+    }
+  }
+
+  // 2. Try OpenAI API (only reached when local whisper is absent or threw)
+  const config = credentialManager.getLlmProviderConfig("openai");
+  const apiKey = config?.apiKey;
+  if (apiKey) {
+    return await transcribeApi(apiKey, filePath);
+  }
+
+  // 3. No provider available
+  return null;
+}