feat(media): add image/video description and local whisper priority
- Add describe-image.ts: OpenAI Vision API (gpt-4o-mini) image description - Add describe-video.ts: ffmpeg frame extraction + Vision API description - Rewrite transcribe.ts: local whisper/whisper-cli → OpenAI API → null - Update manager.ts routeMedia(): all media converted to text before agent - Image: describeImage() → text (was: raw ImageContent via writeWithImages) - Video: describeVideo() → text (was: file path info only) - Audio: unchanged (but underlying transcribeAudio now tries local first) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4e5780692e
commit
db214b25ca
4 changed files with 258 additions and 23 deletions
|
|
@ -6,9 +6,12 @@
|
|||
* - Outgoing: agent reply → check lastRoute → forward to originating channel
|
||||
*
|
||||
* Uses "last route" pattern: whoever sent the last message gets the reply.
|
||||
*
|
||||
* @see docs/channels/README.md — Channel system overview
|
||||
* @see docs/channels/media-handling.md — Media processing pipeline
|
||||
* @see docs/message-paths.md — All three message paths (Desktop / Web / Channel)
|
||||
*/
|
||||
|
||||
import { readFile } from "node:fs/promises";
|
||||
import type { Hub } from "../hub/hub.js";
|
||||
import type {
|
||||
ChannelPlugin,
|
||||
|
|
@ -21,6 +24,8 @@ import { loadChannelsConfig } from "./config.js";
|
|||
import { MessageAggregator, DEFAULT_CHUNKER_CONFIG } from "../hub/message-aggregator.js";
|
||||
import type { AsyncAgent } from "../agent/async-agent.js";
|
||||
import { transcribeAudio } from "../media/transcribe.js";
|
||||
import { describeImage } from "../media/describe-image.js";
|
||||
import { describeVideo } from "../media/describe-video.js";
|
||||
|
||||
interface AccountHandle {
|
||||
channelId: string;
|
||||
|
|
@ -289,12 +294,18 @@ export class ChannelManager {
|
|||
const filePath = await plugin.downloadMedia!(media.fileId, accountId);
|
||||
|
||||
if (media.type === "image") {
|
||||
// Images: pass directly to LLM as ImageContent
|
||||
const buffer = await readFile(filePath);
|
||||
const base64 = buffer.toString("base64");
|
||||
const mimeType = media.mimeType ?? "image/jpeg";
|
||||
const caption = media.caption || "User sent an image.";
|
||||
agent.writeWithImages(caption, [{ type: "image", data: base64, mimeType }]);
|
||||
// Images: describe via Vision API before reaching agent
|
||||
const description = await describeImage(filePath);
|
||||
if (description) {
|
||||
const parts = ["[Image]", `Description: ${description}`];
|
||||
if (media.caption) parts.push(`Caption: ${media.caption}`);
|
||||
agent.write(parts.join("\n"));
|
||||
} else {
|
||||
// No API key — fall back to file path
|
||||
const parts = ["[image message received]", `File: ${filePath}`];
|
||||
if (media.caption) parts.push(`Caption: ${media.caption}`);
|
||||
agent.write(parts.join("\n"));
|
||||
}
|
||||
} else if (media.type === "audio") {
|
||||
// Audio: transcribe (local whisper first, OpenAI Whisper API fallback) before reaching agent
|
||||
const transcript = await transcribeAudio(filePath);
|
||||
|
|
@ -310,13 +321,28 @@ export class ChannelManager {
|
|||
if (media.caption) parts.push(`Caption: ${media.caption}`);
|
||||
agent.write(parts.join("\n"));
|
||||
}
|
||||
} else if (media.type === "video") {
|
||||
// Video: extract frame + describe via Vision API
|
||||
const description = await describeVideo(filePath);
|
||||
if (description) {
|
||||
const parts = ["[Video]", `Description: ${description}`];
|
||||
if (media.duration) parts.push(`Duration: ${media.duration}s`);
|
||||
if (media.caption) parts.push(`Caption: ${media.caption}`);
|
||||
agent.write(parts.join("\n"));
|
||||
} else {
|
||||
// ffmpeg unavailable or no API key — fall back to file path
|
||||
const parts = ["[video message received]", `File: ${filePath}`];
|
||||
if (media.mimeType) parts.push(`Type: ${media.mimeType}`);
|
||||
if (media.duration) parts.push(`Duration: ${media.duration}s`);
|
||||
if (media.caption) parts.push(`Caption: ${media.caption}`);
|
||||
agent.write(parts.join("\n"));
|
||||
}
|
||||
} else {
|
||||
// Video/document: tell agent the file path
|
||||
// Document: tell agent the file path
|
||||
const parts: string[] = [];
|
||||
parts.push(`[${media.type} message received]`);
|
||||
parts.push(`[document message received]`);
|
||||
parts.push(`File: ${filePath}`);
|
||||
if (media.mimeType) parts.push(`Type: ${media.mimeType}`);
|
||||
if (media.duration) parts.push(`Duration: ${media.duration}s`);
|
||||
if (media.caption) parts.push(`Caption: ${media.caption}`);
|
||||
agent.write(parts.join("\n"));
|
||||
}
|
||||
|
|
|
|||
77
src/media/describe-image.ts
Normal file
77
src/media/describe-image.ts
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
/**
|
||||
* Image description via OpenAI Vision API.
|
||||
*
|
||||
* Called by ChannelManager before the message reaches the Agent,
|
||||
* so the Agent only ever sees a text description of the image.
|
||||
*
|
||||
* @see docs/channels/media-handling.md — Media processing pipeline
|
||||
*/
|
||||
|
||||
import { readFile } from "node:fs/promises";
|
||||
import { extname } from "node:path";
|
||||
import { credentialManager } from "../agent/credentials.js";
|
||||
|
||||
/**
 * Resolve the MIME type of a common image format from a file's extension.
 *
 * The extension is compared case-insensitively. Anything that is not
 * .png, .gif or .webp (including .jpg/.jpeg and files with no extension)
 * is reported as JPEG.
 */
function mimeFromExt(filePath: string): string {
  const extension = extname(filePath).toLowerCase();

  if (extension === ".png") return "image/png";
  if (extension === ".gif") return "image/gif";
  if (extension === ".webp") return "image/webp";

  // Fallback for every other extension.
  return "image/jpeg";
}
|
||||
|
||||
/**
 * Describe an image using OpenAI Vision API (gpt-4o-mini).
 *
 * The file is read from disk, inlined as a base64 data URL and sent to the
 * chat completions endpoint together with a short instruction prompt.
 *
 * @param filePath - Local path to the image file
 * @returns Text description, or null if no API key is configured or the
 *          response contains no message content
 * @throws Error if the API answers with a non-2xx status; file read and
 *         network failures are propagated unchanged
 */
export async function describeImage(filePath: string): Promise<string | null> {
  const config = credentialManager.getLlmProviderConfig("openai");
  const apiKey = config?.apiKey;
  if (!apiKey) return null;

  // Inline the image as a data URL so no separate upload is needed.
  const buffer = await readFile(filePath);
  const base64 = buffer.toString("base64");
  const mimeType = mimeFromExt(filePath);
  const dataUrl = `data:${mimeType};base64,${base64}`;

  const res = await fetch("https://api.openai.com/v1/chat/completions", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: "gpt-4o-mini",
      messages: [
        {
          role: "user",
          content: [
            {
              type: "text",
              text: "Describe this image concisely. Focus on the main content and any text visible in the image.",
            },
            {
              type: "image_url",
              image_url: { url: dataUrl },
            },
          ],
        },
      ],
      max_tokens: 500,
    }),
  });

  if (!res.ok) {
    const errText = await res.text().catch(() => "");
    throw new Error(`Vision API error: HTTP ${res.status} ${errText}`);
  }

  // The body is untrusted JSON: every level is optional so that a response
  // without choices/message yields null instead of a TypeError.
  const result = (await res.json()) as {
    choices?: Array<{ message?: { content?: string | null } }>;
  };
  return result.choices?.[0]?.message?.content ?? null;
}
|
||||
49
src/media/describe-video.ts
Normal file
49
src/media/describe-video.ts
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Video description via frame extraction + Vision API.
|
||||
*
|
||||
* Extracts the first frame using ffmpeg, then describes it
|
||||
* with the same Vision API used for images.
|
||||
*
|
||||
* @see docs/channels/media-handling.md — Media processing pipeline
|
||||
*/
|
||||
|
||||
import { join } from "node:path";
|
||||
import { execFile } from "node:child_process";
|
||||
import { unlink } from "node:fs/promises";
|
||||
import { v7 as uuidv7 } from "uuid";
|
||||
import { MEDIA_CACHE_DIR } from "../shared/paths.js";
|
||||
import { describeImage } from "./describe-image.js";
|
||||
|
||||
/**
 * Describe a video by extracting the first frame and passing it to Vision API.
 *
 * The frame is written to a uniquely named .jpg inside MEDIA_CACHE_DIR and is
 * removed again (best effort) whether or not the description succeeds.
 *
 * @param filePath - Local path to the video file
 * @returns Text description, or null if frame extraction or the image
 *          description failed, or no API key is configured
 */
export async function describeVideo(filePath: string): Promise<string | null> {
  const framePath = join(MEDIA_CACHE_DIR, `${uuidv7()}.jpg`);

  try {
    // Extract first frame with ffmpeg (killed after 10s)
    await new Promise<void>((resolve, reject) => {
      execFile(
        "ffmpeg",
        ["-i", filePath, "-vframes", "1", "-f", "image2", "-y", framePath],
        { timeout: 10000 },
        (err) => (err ? reject(err) : resolve()),
      );
    });

    // Describe the extracted frame
    return await describeImage(framePath);
  } catch (err) {
    // ffmpeg missing, extraction failed, or the Vision API call threw.
    // Callers fall back to the file path on null, so log instead of rethrowing.
    const msg = err instanceof Error ? err.message : String(err);
    console.error(`[DescribeVideo] Failed to describe video: ${msg}`);
    return null;
  } finally {
    // Single cleanup point for the temporary frame; ignore "file not found".
    await unlink(framePath).catch(() => {});
  }
}
|
||||
|
|
@ -1,25 +1,77 @@
|
|||
/**
|
||||
* Audio transcription via OpenAI Whisper API.
|
||||
* Audio transcription — local whisper first, OpenAI API fallback.
|
||||
*
|
||||
* Priority:
|
||||
* 1. Local whisper/whisper-cli binary (free, no network round-trip, works offline)
|
||||
* 2. OpenAI Whisper API (requires API key)
|
||||
* 3. null (no provider available — placeholder stays for Agent)
|
||||
*
|
||||
* Called by ChannelManager before the message reaches the Agent,
|
||||
* so the Agent only ever sees text.
|
||||
*
|
||||
* @see docs/channels/media-handling.md — Media processing pipeline and provider priority
|
||||
*/
|
||||
|
||||
import { readFile } from "node:fs/promises";
|
||||
import { basename } from "node:path";
|
||||
import { readFile, unlink } from "node:fs/promises";
|
||||
import { basename, join } from "node:path";
|
||||
import { execFile, execFileSync } from "node:child_process";
|
||||
import { tmpdir } from "node:os";
|
||||
import { credentialManager } from "../agent/credentials.js";
|
||||
|
||||
/**
|
||||
* Transcribe an audio file using OpenAI Whisper API.
|
||||
*
|
||||
* @param filePath - Local path to the audio file
|
||||
* @returns Transcribed text, or null if no API key configured
|
||||
*/
|
||||
export async function transcribeAudio(filePath: string): Promise<string | null> {
|
||||
const config = credentialManager.getLlmProviderConfig("openai");
|
||||
const apiKey = config?.apiKey;
|
||||
if (!apiKey) return null;
|
||||
/**
 * Cached name of the local whisper binary, or false if none was found.
 * Stays undefined until the first lookup has run.
 */
let cachedWhisperBin: string | false | undefined;

/**
 * Find a local whisper binary in PATH.
 *
 * Probes "whisper" first, then "whisper-cli", using the platform's lookup
 * command. The result (including a negative one) is cached, so the lookup
 * runs at most once.
 *
 * @returns The binary name that was found, or false if neither is available
 */
function findWhisperBin(): string | false {
  if (cachedWhisperBin !== undefined) return cachedWhisperBin;

  // `which` does not exist on Windows; `where` is the equivalent lookup there.
  const locator = process.platform === "win32" ? "where" : "which";

  const found = ["whisper", "whisper-cli"].find((bin) => {
    try {
      // A non-zero exit status (or a missing locator) makes execFileSync throw.
      execFileSync(locator, [bin], { stdio: "pipe" });
      return true;
    } catch {
      return false;
    }
  });

  cachedWhisperBin = found ?? false;
  return cachedWhisperBin;
}
|
||||
|
||||
/**
 * Run the local whisper CLI on an audio file and return the transcript.
 *
 * Invokes: <whisperBin> <file> --model base --output_format txt --output_dir <tmpdir>
 * with a 120s timeout, then reads <tmpdir>/<file name without extension>.txt,
 * deletes that file (best effort) and returns its trimmed content.
 *
 * @param whisperBin - Name of the whisper executable to run
 * @param filePath - Local path to the audio file
 * @returns Trimmed transcript text
 */
async function transcribeLocal(whisperBin: string, filePath: string): Promise<string> {
  const outputDir = tmpdir();
  const cliArgs = [filePath, "--model", "base", "--output_format", "txt", "--output_dir", outputDir];

  await new Promise<void>((done, fail) => {
    execFile(whisperBin, cliArgs, { timeout: 120000 }, (error) => {
      if (error) {
        fail(error);
      } else {
        done();
      }
    });
  });

  // whisper names its output after the input file, minus the last extension
  const stem = basename(filePath).replace(/\.[^.]+$/, "");
  const transcriptPath = join(outputDir, `${stem}.txt`);
  const raw = await readFile(transcriptPath, "utf-8");

  // Remove the generated transcript; a failed delete is not an error
  await unlink(transcriptPath).catch(() => {});

  return raw.trim();
}
|
||||
|
||||
/**
|
||||
* Transcribe audio using OpenAI Whisper API.
|
||||
*/
|
||||
async function transcribeApi(apiKey: string, filePath: string): Promise<string> {
|
||||
const fileBuffer = await readFile(filePath);
|
||||
const fileName = basename(filePath);
|
||||
|
||||
|
|
@ -61,3 +113,34 @@ export async function transcribeAudio(filePath: string): Promise<string | null>
|
|||
const result = (await res.json()) as { text: string };
|
||||
return result.text;
|
||||
}
|
||||
|
||||
/**
 * Turn an audio file into text.
 *
 * Providers are tried in order: local whisper binary, then the OpenAI API,
 * and finally null when neither is available. A failure of the local binary
 * is logged and falls through to the API; an API failure propagates.
 *
 * @param filePath - Local path to the audio file
 * @returns Transcribed text, or null if no provider available
 */
export async function transcribeAudio(filePath: string): Promise<string | null> {
  // Step 1: local whisper, when one was found in PATH
  const localBin = findWhisperBin();
  if (localBin) {
    try {
      return await transcribeLocal(localBin, filePath);
    } catch (err) {
      let msg: string;
      if (err instanceof Error) {
        msg = err.message;
      } else {
        msg = String(err);
      }
      console.error(`[Transcribe] Local whisper failed: ${msg}, trying API...`);
    }
  }

  // Step 2: OpenAI API when a key is configured; step 3: otherwise no provider
  const apiKey = credentialManager.getLlmProviderConfig("openai")?.apiKey;
  return apiKey ? await transcribeApi(apiKey, filePath) : null;
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue