From db214b25ca9e12c210ce25d3668e5af5c3b478f3 Mon Sep 17 00:00:00 2001 From: Naiyuan Qing <145280634+NevilleQingNY@users.noreply.github.com> Date: Mon, 9 Feb 2026 11:03:31 +0800 Subject: [PATCH] feat(media): add image/video description and local whisper priority MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add describe-image.ts: OpenAI Vision API (gpt-4o-mini) image description - Add describe-video.ts: ffmpeg frame extraction + Vision API description - Rewrite transcribe.ts: local whisper/whisper-cli → OpenAI API → null - Update manager.ts routeMedia(): all media converted to text before agent - Image: describeImage() → text (was: raw ImageContent via writeWithImages) - Video: describeVideo() → text (was: file path info only) - Audio: unchanged (but underlying transcribeAudio now tries local first) Co-Authored-By: Claude Opus 4.6 --- src/channels/manager.ts | 46 +++++++++++---- src/media/describe-image.ts | 77 +++++++++++++++++++++++++ src/media/describe-video.ts | 49 ++++++++++++++++ src/media/transcribe.ts | 109 +++++++++++++++++++++++++++++++----- 4 files changed, 258 insertions(+), 23 deletions(-) create mode 100644 src/media/describe-image.ts create mode 100644 src/media/describe-video.ts diff --git a/src/channels/manager.ts b/src/channels/manager.ts index 4250b024..3c3dd92f 100644 --- a/src/channels/manager.ts +++ b/src/channels/manager.ts @@ -6,9 +6,12 @@ * - Outgoing: agent reply → check lastRoute → forward to originating channel * * Uses "last route" pattern: whoever sent the last message gets the reply. + * + * @see docs/channels/README.md — Channel system overview + * @see docs/channels/media-handling.md — Media processing pipeline + * @see docs/message-paths.md — All three message paths (Desktop / Web / Channel) */ -import { readFile } from "node:fs/promises"; import type { Hub } from "../hub/hub.js"; import type { ChannelPlugin, @@ -21,6 +24,8 @@ import { loadChannelsConfig } from "./config.js"; import { MessageAggregator, DEFAULT_CHUNKER_CONFIG } from "../hub/message-aggregator.js"; import type { AsyncAgent } from "../agent/async-agent.js"; import { transcribeAudio } from "../media/transcribe.js"; +import { describeImage } from "../media/describe-image.js"; +import { describeVideo } from "../media/describe-video.js"; interface AccountHandle { channelId: string; @@ -289,12 +294,18 @@ export class ChannelManager { const filePath = await plugin.downloadMedia!(media.fileId, accountId); if (media.type === "image") { - // Images: pass directly to LLM as ImageContent - const buffer = await readFile(filePath); - const base64 = buffer.toString("base64"); - const mimeType = media.mimeType ?? "image/jpeg"; - const caption = media.caption || "User sent an image."; - agent.writeWithImages(caption, [{ type: "image", data: base64, mimeType }]); + // Images: describe via Vision API before reaching agent + const description = await describeImage(filePath); + if (description) { + const parts = ["[Image]", `Description: ${description}`]; + if (media.caption) parts.push(`Caption: ${media.caption}`); + agent.write(parts.join("\n")); + } else { + // No API key — fall back to file path + const parts = ["[image message received]", `File: ${filePath}`]; + if (media.caption) parts.push(`Caption: ${media.caption}`); + agent.write(parts.join("\n")); + } } else if (media.type === "audio") { // Audio: transcribe via Whisper API before reaching agent const transcript = await transcribeAudio(filePath); @@ -310,13 +321,28 @@ export class ChannelManager { if (media.caption) parts.push(`Caption: ${media.caption}`); agent.write(parts.join("\n")); } + } else if (media.type === "video") { + // Video: extract frame + describe via Vision API + const description = await describeVideo(filePath); + if (description) { + const parts = ["[Video]", `Description: ${description}`]; + if (media.duration) parts.push(`Duration: ${media.duration}s`); + if (media.caption) parts.push(`Caption: ${media.caption}`); + agent.write(parts.join("\n")); + } else { + // ffmpeg unavailable or no API key — fall back to file path + const parts = ["[video message received]", `File: ${filePath}`]; + if (media.mimeType) parts.push(`Type: ${media.mimeType}`); + if (media.duration) parts.push(`Duration: ${media.duration}s`); + if (media.caption) parts.push(`Caption: ${media.caption}`); + agent.write(parts.join("\n")); + } } else { - // Video/document: tell agent the file path + // Document: tell agent the file path const parts: string[] = []; - parts.push(`[${media.type} message received]`); + parts.push(`[document message received]`); parts.push(`File: ${filePath}`); if (media.mimeType) parts.push(`Type: ${media.mimeType}`); - if (media.duration) parts.push(`Duration: ${media.duration}s`); if (media.caption) parts.push(`Caption: ${media.caption}`); agent.write(parts.join("\n")); } diff --git a/src/media/describe-image.ts b/src/media/describe-image.ts new file mode 100644 index 00000000..700f5ca2 --- /dev/null +++ b/src/media/describe-image.ts @@ -0,0 +1,77 @@ +/** + * Image description via OpenAI Vision API. + * + * Called by ChannelManager before the message reaches the Agent, + * so the Agent only ever sees a text description of the image. + * + * @see docs/channels/media-handling.md — Media processing pipeline + */ + +import { readFile } from "node:fs/promises"; +import { extname } from "node:path"; +import { credentialManager } from "../agent/credentials.js"; + +/** Map file extension to MIME type for common image formats */ +function mimeFromExt(filePath: string): string { + const ext = extname(filePath).toLowerCase(); + switch (ext) { + case ".png": return "image/png"; + case ".gif": return "image/gif"; + case ".webp": return "image/webp"; + default: return "image/jpeg"; + } +} + +/** + * Describe an image using OpenAI Vision API (gpt-4o-mini). + * + * @param filePath - Local path to the image file + * @returns Text description, or null if no API key configured + */ +export async function describeImage(filePath: string): Promise { + const config = credentialManager.getLlmProviderConfig("openai"); + const apiKey = config?.apiKey; + if (!apiKey) return null; + + const buffer = await readFile(filePath); + const base64 = buffer.toString("base64"); + const mimeType = mimeFromExt(filePath); + const dataUrl = `data:${mimeType};base64,${base64}`; + + const res = await fetch("https://api.openai.com/v1/chat/completions", { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model: "gpt-4o-mini", + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: "Describe this image concisely. Focus on the main content and any text visible in the image.", + }, + { + type: "image_url", + image_url: { url: dataUrl }, + }, + ], + }, + ], + max_tokens: 500, + }), + }); + + if (!res.ok) { + const errText = await res.text().catch(() => ""); + throw new Error(`Vision API error: HTTP ${res.status} ${errText}`); + } + + const result = (await res.json()) as { + choices: Array<{ message: { content: string } }>; + }; + return result.choices[0]?.message.content ?? null; +} diff --git a/src/media/describe-video.ts b/src/media/describe-video.ts new file mode 100644 index 00000000..5ea55f18 --- /dev/null +++ b/src/media/describe-video.ts @@ -0,0 +1,49 @@ +/** + * Video description via frame extraction + Vision API. + * + * Extracts the first frame using ffmpeg, then describes it + * with the same Vision API used for images. + * + * @see docs/channels/media-handling.md — Media processing pipeline + */ + +import { join } from "node:path"; +import { execFile } from "node:child_process"; +import { unlink } from "node:fs/promises"; +import { v7 as uuidv7 } from "uuid"; +import { MEDIA_CACHE_DIR } from "../shared/paths.js"; +import { describeImage } from "./describe-image.js"; + +/** + * Describe a video by extracting the first frame and passing it to Vision API. + * + * @param filePath - Local path to the video file + * @returns Text description, or null if ffmpeg unavailable or no API key + */ +export async function describeVideo(filePath: string): Promise { + const framePath = join(MEDIA_CACHE_DIR, `${uuidv7()}.jpg`); + + try { + // Extract first frame with ffmpeg + await new Promise((resolve, reject) => { + execFile( + "ffmpeg", + ["-i", filePath, "-vframes", "1", "-f", "image2", "-y", framePath], + { timeout: 10000 }, + (err) => (err ? reject(err) : resolve()), + ); + }); + + // Describe the extracted frame + const description = await describeImage(framePath); + + // Clean up the frame file + await unlink(framePath).catch(() => {}); + + return description; + } catch { + // ffmpeg not available or extraction failed + await unlink(framePath).catch(() => {}); + return null; + } +} diff --git a/src/media/transcribe.ts b/src/media/transcribe.ts index d301a4e5..5f09c9a8 100644 --- a/src/media/transcribe.ts +++ b/src/media/transcribe.ts @@ -1,25 +1,77 @@ /** - * Audio transcription via OpenAI Whisper API. + * Audio transcription — local whisper first, OpenAI API fallback. + * + * Priority: + * 1. Local whisper/whisper-cli binary (free, no latency, offline) + * 2. OpenAI Whisper API (requires API key) + * 3. null (no provider available — placeholder stays for Agent) * * Called by ChannelManager before the message reaches the Agent, * so the Agent only ever sees text. + * + * @see docs/channels/media-handling.md — Media processing pipeline and provider priority */ -import { readFile } from "node:fs/promises"; -import { basename } from "node:path"; +import { readFile, unlink } from "node:fs/promises"; +import { basename, join } from "node:path"; +import { execFile, execFileSync } from "node:child_process"; +import { tmpdir } from "node:os"; import { credentialManager } from "../agent/credentials.js"; -/** - * Transcribe an audio file using OpenAI Whisper API. - * - * @param filePath - Local path to the audio file - * @returns Transcribed text, or null if no API key configured - */ -export async function transcribeAudio(filePath: string): Promise { - const config = credentialManager.getLlmProviderConfig("openai"); - const apiKey = config?.apiKey; - if (!apiKey) return null; +/** Cached path to local whisper binary, or false if not found */ +let cachedWhisperBin: string | false | undefined; +/** Find local whisper binary in PATH */ +function findWhisperBin(): string | false { + if (cachedWhisperBin !== undefined) return cachedWhisperBin; + + for (const bin of ["whisper", "whisper-cli"]) { + try { + execFileSync("which", [bin], { stdio: "pipe" }); + cachedWhisperBin = bin; + return bin; + } catch { + // not found, try next + } + } + + cachedWhisperBin = false; + return false; +} + +/** + * Transcribe audio using local whisper CLI. + * + * Runs: whisper "" --model base --output_format txt --output_dir + * Reads the generated .txt file and returns its content. + */ +async function transcribeLocal(whisperBin: string, filePath: string): Promise { + const outDir = tmpdir(); + + await new Promise((resolve, reject) => { + execFile( + whisperBin, + [filePath, "--model", "base", "--output_format", "txt", "--output_dir", outDir], + { timeout: 120000 }, + (err) => (err ? reject(err) : resolve()), + ); + }); + + // whisper outputs .txt + const name = basename(filePath).replace(/\.[^.]+$/, ""); + const txtPath = join(outDir, `${name}.txt`); + const text = (await readFile(txtPath, "utf-8")).trim(); + + // Clean up the txt file + await unlink(txtPath).catch(() => {}); + + return text; +} + +/** + * Transcribe audio using OpenAI Whisper API. + */ +async function transcribeApi(apiKey: string, filePath: string): Promise { const fileBuffer = await readFile(filePath); const fileName = basename(filePath); @@ -61,3 +113,34 @@ export async function transcribeAudio(filePath: string): Promise const result = (await res.json()) as { text: string }; return result.text; } + +/** + * Transcribe an audio file. + * + * Priority: local whisper → OpenAI API → null. + * + * @param filePath - Local path to the audio file + * @returns Transcribed text, or null if no provider available + */ +export async function transcribeAudio(filePath: string): Promise { + // 1. Try local whisper + const whisperBin = findWhisperBin(); + if (whisperBin) { + try { + return await transcribeLocal(whisperBin, filePath); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error(`[Transcribe] Local whisper failed: ${msg}, trying API...`); + } + } + + // 2. Try OpenAI API + const config = credentialManager.getLlmProviderConfig("openai"); + const apiKey = config?.apiKey; + if (apiKey) { + return await transcribeApi(apiKey, filePath); + } + + // 3. No provider available + return null; +}