From db214b25ca9e12c210ce25d3668e5af5c3b478f3 Mon Sep 17 00:00:00 2001
From: Naiyuan Qing <145280634+NevilleQingNY@users.noreply.github.com>
Date: Mon, 9 Feb 2026 11:03:31 +0800
Subject: [PATCH] feat(media): add image/video description and local whisper
 priority
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add describe-image.ts: OpenAI Vision API (gpt-4o-mini) image description
- Add describe-video.ts: ffmpeg frame extraction + Vision API description
- Rewrite transcribe.ts: local whisper/whisper-cli → OpenAI API → null
- Update manager.ts routeMedia(): all media converted to text before agent
  - Image: describeImage() → text (was: raw ImageContent via writeWithImages)
  - Video: describeVideo() → text (was: file path info only)
  - Audio: unchanged (but underlying transcribeAudio now tries local first)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/channels/manager.ts     |  46 +++++++++++----
 src/media/describe-image.ts |  77 +++++++++++++++++++++++++
 src/media/describe-video.ts |  49 ++++++++++++++++
 src/media/transcribe.ts     | 109 +++++++++++++++++++++++++++++++-----
 4 files changed, 258 insertions(+), 23 deletions(-)
 create mode 100644 src/media/describe-image.ts
 create mode 100644 src/media/describe-video.ts

diff --git a/src/channels/manager.ts b/src/channels/manager.ts
index 4250b024..3c3dd92f 100644
--- a/src/channels/manager.ts
+++ b/src/channels/manager.ts
@@ -6,9 +6,12 @@
  * - Outgoing: agent reply → check lastRoute → forward to originating channel
  *
  * Uses "last route" pattern: whoever sent the last message gets the reply.
+ *
+ * @see docs/channels/README.md — Channel system overview
+ * @see docs/channels/media-handling.md — Media processing pipeline
+ * @see docs/message-paths.md — All three message paths (Desktop / Web / Channel)
  */
 
-import { readFile } from "node:fs/promises";
 import type { Hub } from "../hub/hub.js";
 import type {
   ChannelPlugin,
@@ -21,6 +24,8 @@ import { loadChannelsConfig } from "./config.js";
 import { MessageAggregator, DEFAULT_CHUNKER_CONFIG } from "../hub/message-aggregator.js";
 import type { AsyncAgent } from "../agent/async-agent.js";
 import { transcribeAudio } from "../media/transcribe.js";
+import { describeImage } from "../media/describe-image.js";
+import { describeVideo } from "../media/describe-video.js";
 
 interface AccountHandle {
   channelId: string;
@@ -289,12 +294,18 @@ export class ChannelManager {
       const filePath = await plugin.downloadMedia!(media.fileId, accountId);
 
       if (media.type === "image") {
-        // Images: pass directly to LLM as ImageContent
-        const buffer = await readFile(filePath);
-        const base64 = buffer.toString("base64");
-        const mimeType = media.mimeType ?? "image/jpeg";
-        const caption = media.caption || "User sent an image.";
-        agent.writeWithImages(caption, [{ type: "image", data: base64, mimeType }]);
+        // Images: describe via Vision API before reaching agent
+        const description = await describeImage(filePath);
+        if (description) {
+          const parts = ["[Image]", `Description: ${description}`];
+          if (media.caption) parts.push(`Caption: ${media.caption}`);
+          agent.write(parts.join("\n"));
+        } else {
+          // No API key — fall back to file path
+          const parts = ["[image message received]", `File: ${filePath}`];
+          if (media.caption) parts.push(`Caption: ${media.caption}`);
+          agent.write(parts.join("\n"));
+        }
       } else if (media.type === "audio") {
         // Audio: transcribe via Whisper API before reaching agent
         const transcript = await transcribeAudio(filePath);
@@ -310,13 +321,28 @@ export class ChannelManager {
           if (media.caption) parts.push(`Caption: ${media.caption}`);
           agent.write(parts.join("\n"));
         }
+      } else if (media.type === "video") {
+        // Video: extract frame + describe via Vision API
+        const description = await describeVideo(filePath);
+        if (description) {
+          const parts = ["[Video]", `Description: ${description}`];
+          if (media.duration) parts.push(`Duration: ${media.duration}s`);
+          if (media.caption) parts.push(`Caption: ${media.caption}`);
+          agent.write(parts.join("\n"));
+        } else {
+          // ffmpeg unavailable or no API key — fall back to file path
+          const parts = ["[video message received]", `File: ${filePath}`];
+          if (media.mimeType) parts.push(`Type: ${media.mimeType}`);
+          if (media.duration) parts.push(`Duration: ${media.duration}s`);
+          if (media.caption) parts.push(`Caption: ${media.caption}`);
+          agent.write(parts.join("\n"));
+        }
       } else {
-        // Video/document: tell agent the file path
+        // Document: tell agent the file path
         const parts: string[] = [];
-        parts.push(`[${media.type} message received]`);
+        parts.push(`[document message received]`);
         parts.push(`File: ${filePath}`);
         if (media.mimeType) parts.push(`Type: ${media.mimeType}`);
-        if (media.duration) parts.push(`Duration: ${media.duration}s`);
         if (media.caption) parts.push(`Caption: ${media.caption}`);
         agent.write(parts.join("\n"));
       }
diff --git a/src/media/describe-image.ts b/src/media/describe-image.ts
new file mode 100644
index 00000000..700f5ca2
--- /dev/null
+++ b/src/media/describe-image.ts
@@ -0,0 +1,77 @@
+/**
+ * Image description via OpenAI Vision API.
+ *
+ * Called by ChannelManager before the message reaches the Agent,
+ * so the Agent only ever sees a text description of the image.
+ *
+ * @see docs/channels/media-handling.md — Media processing pipeline
+ */
+
+import { readFile } from "node:fs/promises";
+import { extname } from "node:path";
+import { credentialManager } from "../agent/credentials.js";
+
+/** Map a file extension to an image MIME type; anything unrecognized is treated as JPEG. */
+function mimeFromExt(filePath: string): string {
+  const ext = extname(filePath).toLowerCase(); // match case-insensitively
+  switch (ext) {
+    case ".png": return "image/png";
+    case ".gif": return "image/gif";
+    case ".webp": return "image/webp";
+    default: return "image/jpeg"; // also covers .jpg/.jpeg and paths with no extension
+  }
+}
+
+/**
+ * Describe an image using OpenAI Vision API (gpt-4o-mini).
+ *
+ * @param filePath - Local path to the image file
+ * @returns Text description, or null if no API key is configured or the
+ *   response contains no message content
+ * @throws Error if the file read, the request, or the API (non-OK status) fails
+ */
+export async function describeImage(filePath: string): Promise<string | null> {
+  const apiKey = credentialManager.getLlmProviderConfig("openai")?.apiKey;
+  if (!apiKey) return null;
+
+  // Inline the image as a base64 data URL; the MIME type is guessed from the extension
+  const base64 = (await readFile(filePath)).toString("base64");
+  const dataUrl = `data:${mimeFromExt(filePath)};base64,${base64}`;
+
+  const res = await fetch("https://api.openai.com/v1/chat/completions", {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${apiKey}`,
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      model: "gpt-4o-mini",
+      messages: [
+        {
+          role: "user",
+          content: [
+            {
+              type: "text",
+              text: "Describe this image concisely. Focus on the main content and any text visible in the image.",
+            },
+            {
+              type: "image_url",
+              image_url: { url: dataUrl },
+            },
+          ],
+        },
+      ],
+      max_tokens: 500,
+    }),
+  });
+
+  if (!res.ok) {
+    const errText = await res.text().catch(() => "");
+    throw new Error(`Vision API error: HTTP ${res.status} ${errText}`);
+  }
+  // Tolerate a missing/empty `choices` array or null content instead of throwing a TypeError
+  const result = (await res.json()) as {
+    choices?: Array<{ message?: { content?: string | null } }>;
+  };
+  return result.choices?.[0]?.message?.content ?? null;
+}
diff --git a/src/media/describe-video.ts b/src/media/describe-video.ts
new file mode 100644
index 00000000..5ea55f18
--- /dev/null
+++ b/src/media/describe-video.ts
@@ -0,0 +1,49 @@
+/**
+ * Video description via frame extraction + Vision API.
+ *
+ * Extracts the first frame using ffmpeg, then describes it
+ * with the same Vision API used for images.
+ *
+ * @see docs/channels/media-handling.md — Media processing pipeline
+ */
+
+import { join } from "node:path";
+import { execFile } from "node:child_process";
+import { unlink } from "node:fs/promises";
+import { v7 as uuidv7 } from "uuid";
+import { MEDIA_CACHE_DIR } from "../shared/paths.js";
+import { describeImage } from "./describe-image.js";
+
+/**
+ * Describe a video by extracting the first frame and passing it to Vision API.
+ *
+ * @param filePath - Local path to the video file
+ * @returns Text description, or null if extraction/description fails or no API key
+ */
+export async function describeVideo(filePath: string): Promise<string | null> {
+  const framePath = join(MEDIA_CACHE_DIR, `${uuidv7()}.jpg`);
+
+  try {
+    // Extract first frame with ffmpeg ("-y" overwrites an existing output file)
+    await new Promise<void>((resolve, reject) => {
+      execFile(
+        "ffmpeg",
+        ["-i", filePath, "-vframes", "1", "-f", "image2", "-y", framePath],
+        { timeout: 10000 },
+        (err) => (err ? reject(err) : resolve()),
+      );
+    });
+
+    // Describe the extracted frame (null when describeImage has no description)
+    return await describeImage(framePath);
+  } catch (err) {
+    // Best-effort: ffmpeg missing, extraction failed, or the Vision call threw.
+    // Log the cause so the silent fallback to the file path stays diagnosable.
+    const msg = err instanceof Error ? err.message : String(err);
+    console.error(`[DescribeVideo] Failed to describe video: ${msg}`);
+    return null;
+  } finally {
+    // Always remove the temporary frame; ignore errors (it may not exist)
+    await unlink(framePath).catch(() => {});
+  }
+}
diff --git a/src/media/transcribe.ts b/src/media/transcribe.ts
index d301a4e5..5f09c9a8 100644
--- a/src/media/transcribe.ts
+++ b/src/media/transcribe.ts
@@ -1,25 +1,77 @@
 /**
- * Audio transcription via OpenAI Whisper API.
+ * Audio transcription — local whisper first, OpenAI API fallback.
+ *
+ * Priority:
+ * 1. Local whisper/whisper-cli binary (free, no network latency, offline)
+ * 2. OpenAI Whisper API (requires API key)
+ * 3. null (no provider available — placeholder stays for Agent)
  *
  * Called by ChannelManager before the message reaches the Agent,
  * so the Agent only ever sees text.
+ *
+ * @see docs/channels/media-handling.md — Media processing pipeline and provider priority
  */
 
-import { readFile } from "node:fs/promises";
-import { basename } from "node:path";
+import { readFile, unlink } from "node:fs/promises";
+import { basename, join } from "node:path";
+import { execFile, execFileSync } from "node:child_process";
+import { tmpdir } from "node:os";
 import { credentialManager } from "../agent/credentials.js";
 
-/**
- * Transcribe an audio file using OpenAI Whisper API.
- *
- * @param filePath - Local path to the audio file
- * @returns Transcribed text, or null if no API key configured
- */
-export async function transcribeAudio(filePath: string): Promise<string | null> {
-  const config = credentialManager.getLlmProviderConfig("openai");
-  const apiKey = config?.apiKey;
-  if (!apiKey) return null;
+/** Cached whisper binary name: undefined = not probed yet, false = none found */
+let cachedWhisperBin: string | false | undefined;
 
+/** Probe for a local whisper binary with `which`; the outcome is cached in cachedWhisperBin. */
+function findWhisperBin(): string | false {
+  if (cachedWhisperBin !== undefined) return cachedWhisperBin; // already probed
+
+  for (const bin of ["whisper", "whisper-cli"]) {
+    try {
+      execFileSync("which", [bin], { stdio: "pipe" });
+      cachedWhisperBin = bin;
+      return bin;
+    } catch {
+      // lookup failed (execFileSync threw) — try the next candidate name
+    }
+  }
+
+  cachedWhisperBin = false; // remember the miss so later calls skip the probe
+  return false;
+}
+
+/**
+ * Transcribe audio with a local whisper CLI (rejects on failure or the 120 s timeout).
+ * Runs: <whisperBin> "<file>" --model base --output_format txt --output_dir <tmpdir>
+ * then reads, trims and deletes the generated .txt file.
+ * NOTE(review): these flags look like openai-whisper's; confirm whisper-cli accepts them.
+ */
+async function transcribeLocal(whisperBin: string, filePath: string): Promise<string> {
+  const outDir = tmpdir();
+
+  await new Promise<void>((resolve, reject) => {
+    execFile(
+      whisperBin,
+      [filePath, "--model", "base", "--output_format", "txt", "--output_dir", outDir],
+      { timeout: 120000 },
+      (err) => (err ? reject(err) : resolve()),
+    );
+  });
+
+  // The CLI is expected to write <basename_without_ext>.txt into outDir
+  const name = basename(filePath).replace(/\.[^.]+$/, "");
+  const txtPath = join(outDir, `${name}.txt`);
+  const text = (await readFile(txtPath, "utf-8")).trim();
+
+  // Best-effort cleanup of the transcript file (errors are ignored)
+  await unlink(txtPath).catch(() => {});
+
+  return text;
+}
+
+/**
+ * Transcribe audio using OpenAI Whisper API.
+ */
+async function transcribeApi(apiKey: string, filePath: string): Promise<string> {
   const fileBuffer = await readFile(filePath);
   const fileName = basename(filePath);
 
@@ -61,3 +113,34 @@ export async function transcribeAudio(filePath: string): Promise<string | null>
   const result = (await res.json()) as { text: string };
   return result.text;
 }
+
+/**
+ * Transcribe an audio file.
+ *
+ * Priority: local whisper → OpenAI API → null. A local whisper failure is
+ * logged and falls through to the API; API errors propagate to the caller.
+ * @param filePath - Local path to the audio file
+ * @returns Transcribed text, or null if no provider available
+ */
+export async function transcribeAudio(filePath: string): Promise<string | null> {
+  // 1. Try local whisper (skipped when no binary was found on PATH)
+  const whisperBin = findWhisperBin();
+  if (whisperBin) {
+    try {
+      return await transcribeLocal(whisperBin, filePath);
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      console.error(`[Transcribe] Local whisper failed: ${msg}, trying API...`);
+    }
+  }
+
+  // 2. Try OpenAI API (only when an API key is configured)
+  const config = credentialManager.getLlmProviderConfig("openai");
+  const apiKey = config?.apiKey;
+  if (apiKey) {
+    return await transcribeApi(apiKey, filePath);
+  }
+
+  // 3. No provider available
+  return null;
+}