feat(media): add image/video description and local whisper priority

- Add describe-image.ts: OpenAI Vision API (gpt-4o-mini) image description
- Add describe-video.ts: ffmpeg frame extraction + Vision API description
- Rewrite transcribe.ts: local whisper/whisper-cli → OpenAI API → null
- Update manager.ts routeMedia(): all media converted to text before agent
  - Image: describeImage() → text (was: raw ImageContent via writeWithImages)
  - Video: describeVideo() → text (was: file path info only)
  - Audio: unchanged (but underlying transcribeAudio now tries local first)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Naiyuan Qing 2026-02-09 11:03:31 +08:00
parent 4e5780692e
commit db214b25ca
4 changed files with 258 additions and 23 deletions

View file

@ -6,9 +6,12 @@
* - Outgoing: agent reply → check lastRoute → forward to originating channel
*
* Uses "last route" pattern: whoever sent the last message gets the reply.
*
* @see docs/channels/README.md Channel system overview
* @see docs/channels/media-handling.md Media processing pipeline
* @see docs/message-paths.md All three message paths (Desktop / Web / Channel)
*/
import { readFile } from "node:fs/promises";
import type { Hub } from "../hub/hub.js";
import type {
ChannelPlugin,
@ -21,6 +24,8 @@ import { loadChannelsConfig } from "./config.js";
import { MessageAggregator, DEFAULT_CHUNKER_CONFIG } from "../hub/message-aggregator.js";
import type { AsyncAgent } from "../agent/async-agent.js";
import { transcribeAudio } from "../media/transcribe.js";
import { describeImage } from "../media/describe-image.js";
import { describeVideo } from "../media/describe-video.js";
interface AccountHandle {
channelId: string;
@ -289,12 +294,18 @@ export class ChannelManager {
const filePath = await plugin.downloadMedia!(media.fileId, accountId);
if (media.type === "image") {
// Images: pass directly to LLM as ImageContent
const buffer = await readFile(filePath);
const base64 = buffer.toString("base64");
const mimeType = media.mimeType ?? "image/jpeg";
const caption = media.caption || "User sent an image.";
agent.writeWithImages(caption, [{ type: "image", data: base64, mimeType }]);
// Images: describe via Vision API before reaching agent
const description = await describeImage(filePath);
if (description) {
const parts = ["[Image]", `Description: ${description}`];
if (media.caption) parts.push(`Caption: ${media.caption}`);
agent.write(parts.join("\n"));
} else {
// No API key — fall back to file path
const parts = ["[image message received]", `File: ${filePath}`];
if (media.caption) parts.push(`Caption: ${media.caption}`);
agent.write(parts.join("\n"));
}
} else if (media.type === "audio") {
// Audio: transcribe via Whisper API before reaching agent
const transcript = await transcribeAudio(filePath);
@ -310,13 +321,28 @@ export class ChannelManager {
if (media.caption) parts.push(`Caption: ${media.caption}`);
agent.write(parts.join("\n"));
}
} else if (media.type === "video") {
// Video: extract frame + describe via Vision API
const description = await describeVideo(filePath);
if (description) {
const parts = ["[Video]", `Description: ${description}`];
if (media.duration) parts.push(`Duration: ${media.duration}s`);
if (media.caption) parts.push(`Caption: ${media.caption}`);
agent.write(parts.join("\n"));
} else {
// ffmpeg unavailable or no API key — fall back to file path
const parts = ["[video message received]", `File: ${filePath}`];
if (media.mimeType) parts.push(`Type: ${media.mimeType}`);
if (media.duration) parts.push(`Duration: ${media.duration}s`);
if (media.caption) parts.push(`Caption: ${media.caption}`);
agent.write(parts.join("\n"));
}
} else {
// Video/document: tell agent the file path
// Document: tell agent the file path
const parts: string[] = [];
parts.push(`[${media.type} message received]`);
parts.push(`[document message received]`);
parts.push(`File: ${filePath}`);
if (media.mimeType) parts.push(`Type: ${media.mimeType}`);
if (media.duration) parts.push(`Duration: ${media.duration}s`);
if (media.caption) parts.push(`Caption: ${media.caption}`);
agent.write(parts.join("\n"));
}

View file

@ -0,0 +1,77 @@
/**
* Image description via OpenAI Vision API.
*
* Called by ChannelManager before the message reaches the Agent,
* so the Agent only ever sees a text description of the image.
*
* @see docs/channels/media-handling.md Media processing pipeline
*/
import { readFile } from "node:fs/promises";
import { extname } from "node:path";
import { credentialManager } from "../agent/credentials.js";
/**
 * Resolve an image MIME type from the extension of a file path.
 *
 * The extension is compared case-insensitively. Only `.png`, `.gif` and
 * `.webp` are recognised; any other path (including one with no extension)
 * is reported as `image/jpeg`.
 *
 * @param filePath - Path whose extension is inspected; the file is not read
 * @returns The MIME type string for the image
 */
function mimeFromExt(filePath: string): string {
  const suffix = extname(filePath).toLowerCase();
  if (suffix === ".png") return "image/png";
  if (suffix === ".gif") return "image/gif";
  if (suffix === ".webp") return "image/webp";
  // Everything else is treated as JPEG.
  return "image/jpeg";
}
/**
 * Describe an image using the OpenAI Chat Completions API (model `gpt-4o-mini`).
 *
 * The file is read from disk, base64-encoded into a `data:` URL (MIME type
 * guessed from the file extension) and sent together with a fixed prompt.
 *
 * @param filePath - Local path to the image file
 * @returns Text description, or null if no OpenAI API key is configured or
 *   the response carries no message content
 * @throws Error if the API answers with a non-2xx status
 */
export async function describeImage(filePath: string): Promise<string | null> {
  const apiKey = credentialManager.getLlmProviderConfig("openai")?.apiKey;
  if (!apiKey) return null;

  // Inline the image as a data URL so no separate upload step is needed.
  const base64 = (await readFile(filePath)).toString("base64");
  const dataUrl = `data:${mimeFromExt(filePath)};base64,${base64}`;

  const res = await fetch("https://api.openai.com/v1/chat/completions", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: "gpt-4o-mini",
      messages: [
        {
          role: "user",
          content: [
            {
              type: "text",
              text: "Describe this image concisely. Focus on the main content and any text visible in the image.",
            },
            {
              type: "image_url",
              image_url: { url: dataUrl },
            },
          ],
        },
      ],
      max_tokens: 500,
    }),
  });

  if (!res.ok) {
    // Best effort: include the response body in the error when it is readable.
    const errText = await res.text().catch(() => "");
    throw new Error(`Vision API error: HTTP ${res.status} ${errText}`);
  }

  // Every level is typed as optional: a 2xx body without `choices` or
  // `message` must yield null instead of a TypeError on property access.
  const result = (await res.json()) as {
    choices?: Array<{ message?: { content?: string | null } }>;
  } | null;
  return result?.choices?.[0]?.message?.content ?? null;
}

View file

@ -0,0 +1,49 @@
/**
* Video description via frame extraction + Vision API.
*
* Extracts the first frame using ffmpeg, then describes it
* with the same Vision API used for images.
*
* @see docs/channels/media-handling.md Media processing pipeline
*/
import { join } from "node:path";
import { execFile } from "node:child_process";
import { unlink } from "node:fs/promises";
import { v7 as uuidv7 } from "uuid";
import { MEDIA_CACHE_DIR } from "../shared/paths.js";
import { describeImage } from "./describe-image.js";
/**
 * Describe a video by extracting a single frame with ffmpeg and passing that
 * frame to describeImage().
 *
 * The frame is written to a uniquely named `.jpg` under MEDIA_CACHE_DIR and
 * is removed again whether or not the description succeeds.
 *
 * @param filePath - Local path to the video file
 * @returns Text description, or null if frame extraction fails, the image
 *   description fails, or describeImage() itself returns null
 */
export async function describeVideo(filePath: string): Promise<string | null> {
  const framePath = join(MEDIA_CACHE_DIR, `${uuidv7()}.jpg`);
  try {
    // Extract one frame; `-y` overwrites without prompting, and the 10 s
    // timeout keeps a stuck ffmpeg from blocking the caller indefinitely.
    await new Promise<void>((resolve, reject) => {
      execFile(
        "ffmpeg",
        ["-i", filePath, "-vframes", "1", "-f", "image2", "-y", framePath],
        { timeout: 10000 },
        (err) => (err ? reject(err) : resolve()),
      );
    });
    return await describeImage(framePath);
  } catch (err) {
    // Still best-effort (caller gets null), but record why: this branch is
    // reached for ffmpeg failures and for errors thrown by describeImage().
    const msg = err instanceof Error ? err.message : String(err);
    console.error(`[DescribeVideo] Failed to describe ${filePath}: ${msg}`);
    return null;
  } finally {
    // Single cleanup point for both outcomes; a missing frame file is ignored.
    await unlink(framePath).catch(() => {});
  }
}

View file

@ -1,25 +1,77 @@
/**
* Audio transcription via OpenAI Whisper API.
* Audio transcription — local whisper first, OpenAI API fallback.
*
* Priority:
* 1. Local whisper/whisper-cli binary (free, no latency, offline)
* 2. OpenAI Whisper API (requires API key)
* 3. null (no provider available — placeholder stays for Agent)
*
* Called by ChannelManager before the message reaches the Agent,
* so the Agent only ever sees text.
*
* @see docs/channels/media-handling.md Media processing pipeline and provider priority
*/
import { readFile } from "node:fs/promises";
import { basename } from "node:path";
import { readFile, unlink } from "node:fs/promises";
import { basename, join } from "node:path";
import { execFile, execFileSync } from "node:child_process";
import { tmpdir } from "node:os";
import { credentialManager } from "../agent/credentials.js";
/**
* Transcribe an audio file using OpenAI Whisper API.
*
* @param filePath - Local path to the audio file
* @returns Transcribed text, or null if no API key configured
*/
export async function transcribeAudio(filePath: string): Promise<string | null> {
const config = credentialManager.getLlmProviderConfig("openai");
const apiKey = config?.apiKey;
if (!apiKey) return null;
/**
 * Cached lookup result: the binary name once found, false once the search
 * has failed, undefined before the first search.
 */
let cachedWhisperBin: string | false | undefined;

/**
 * Find a local whisper binary on PATH.
 *
 * Probes `whisper` and then `whisper-cli` with the platform's lookup command
 * (`where` on Windows, `which` elsewhere). The result, including a negative
 * one, is cached in cachedWhisperBin, so the synchronous probe runs at most
 * once per process.
 *
 * @returns The name of the first binary found, or false if neither exists
 */
function findWhisperBin(): string | false {
  if (cachedWhisperBin !== undefined) return cachedWhisperBin;

  // `which` is not available on Windows; `where` plays the same role there.
  const locator = process.platform === "win32" ? "where" : "which";
  const found = ["whisper", "whisper-cli"].find((bin) => {
    try {
      execFileSync(locator, [bin], { stdio: "pipe" });
      return true;
    } catch {
      // Non-zero exit (or a missing locator) means "not found"; try the next.
      return false;
    }
  });

  cachedWhisperBin = found ?? false;
  return cachedWhisperBin;
}
/**
 * Transcribe audio using a local whisper CLI.
 *
 * Runs: <whisperBin> "<file>" --model base --output_format txt --output_dir <tmpdir>
 * then reads `<basename without extension>.txt` from the OS temp directory
 * and returns its trimmed content. The transcript file is removed afterwards,
 * including when the CLI fails after writing it or when reading it fails.
 *
 * NOTE(review): these flags follow the openai-whisper CLI; confirm that
 * `whisper-cli` accepts the same arguments.
 *
 * @param whisperBin - Name or path of the whisper executable
 * @param filePath - Local path to the audio file
 * @returns The transcript text with surrounding whitespace removed
 * @throws If the process fails, exceeds the 120 s timeout, or the expected
 *   transcript file cannot be read
 */
async function transcribeLocal(whisperBin: string, filePath: string): Promise<string> {
  const outDir = tmpdir();
  // whisper outputs <basename_without_ext>.txt
  const stem = basename(filePath).replace(/\.[^.]+$/, "");
  const txtPath = join(outDir, `${stem}.txt`);

  try {
    await new Promise<void>((resolve, reject) => {
      execFile(
        whisperBin,
        [filePath, "--model", "base", "--output_format", "txt", "--output_dir", outDir],
        { timeout: 120000 },
        (err) => (err ? reject(err) : resolve()),
      );
    });
    return (await readFile(txtPath, "utf-8")).trim();
  } finally {
    // Clean up on every path so a failed run cannot leave a transcript
    // behind in the shared temp directory; a missing file is ignored.
    await unlink(txtPath).catch(() => {});
  }
}
/**
* Transcribe audio using OpenAI Whisper API.
*/
async function transcribeApi(apiKey: string, filePath: string): Promise<string> {
const fileBuffer = await readFile(filePath);
const fileName = basename(filePath);
@ -61,3 +113,34 @@ export async function transcribeAudio(filePath: string): Promise<string | null>
const result = (await res.json()) as { text: string };
return result.text;
}
/**
 * Transcribe an audio file with the first provider that is available.
 *
 * Priority: local whisper → OpenAI API → null.
 *
 * A failure of the local binary is logged and the OpenAI API is tried next
 * (when a key is configured); an error from the API call itself propagates
 * to the caller.
 *
 * @param filePath - Local path to the audio file
 * @returns Transcribed text, or null if no provider is available
 */
export async function transcribeAudio(filePath: string): Promise<string | null> {
  // Provider 1: local whisper binary, when one was found on PATH.
  const localBin = findWhisperBin();
  if (localBin) {
    try {
      return await transcribeLocal(localBin, filePath);
    } catch (cause) {
      const reason = cause instanceof Error ? cause.message : String(cause);
      console.error(`[Transcribe] Local whisper failed: ${reason}, trying API...`);
    }
  }

  // Provider 2: OpenAI API, when a key is configured. Otherwise nothing is
  // available and the caller gets null.
  const openAiKey = credentialManager.getLlmProviderConfig("openai")?.apiKey;
  return openAiKey ? await transcribeApi(openAiKey, filePath) : null;
}