feat: migrate from smart-whisper to custom binding + add cuda support
This commit is contained in:
parent
696193eb44
commit
048915da61
52 changed files with 1490 additions and 4353 deletions
24
.github/workflows/release.yml
vendored
24
.github/workflows/release.yml
vendored
|
|
@ -4,6 +4,7 @@ on:
|
|||
push:
|
||||
branches:
|
||||
- feat.windows.support
|
||||
- feat/whisper.migration
|
||||
tags:
|
||||
- 'v*'
|
||||
workflow_dispatch:
|
||||
|
|
@ -70,6 +71,13 @@ jobs:
|
|||
node-version: '24.1.0'
|
||||
cache: 'pnpm'
|
||||
|
||||
- name: Install CUDA Toolkit
|
||||
if: matrix.os == 'windows'
|
||||
uses: Jimver/cuda-toolkit@v0.2.15
|
||||
with:
|
||||
cuda: '12.4.1'
|
||||
method: 'network'
|
||||
|
||||
- name: Log Node.js architecture and platform
|
||||
run: |
|
||||
echo "=== Node.js Process Information ==="
|
||||
|
|
@ -78,8 +86,24 @@ jobs:
|
|||
echo ""
|
||||
|
||||
- name: Install dependencies
|
||||
env:
|
||||
GGML_NATIVE: OFF # ensure postinstall builds avoid i8mm on CI runners
|
||||
run: pnpm install --frozen-lockfile
|
||||
|
||||
- name: Build whisper wrapper JS
|
||||
run: pnpm --filter @amical/whisper-wrapper build
|
||||
|
||||
- name: Build whisper native binaries
|
||||
env:
|
||||
GGML_NATIVE: OFF # CI mac runners lack i8mm support; keep CPU features conservative here
|
||||
run: pnpm --filter @amical/whisper-wrapper build:native
|
||||
|
||||
- name: Build whisper native binaries (cuda)
|
||||
if: matrix.os == 'windows'
|
||||
env:
|
||||
GGML_NATIVE: OFF
|
||||
run: pnpm --filter @amical/whisper-wrapper build:native:cuda
|
||||
|
||||
- name: Download Node.js binaries
|
||||
working-directory: apps/desktop
|
||||
run: pnpm download-node
|
||||
|
|
|
|||
10
.gitignore
vendored
10
.gitignore
vendored
|
|
@ -21,11 +21,20 @@ coverage
|
|||
# Vercel
|
||||
.vercel
|
||||
|
||||
# CMake-js cache
|
||||
.cmake-js/
|
||||
**/.cmake-js/
|
||||
|
||||
# Tool helpers
|
||||
.home/
|
||||
**/.home/
|
||||
|
||||
# Build Outputs
|
||||
.next/
|
||||
out/
|
||||
build
|
||||
dist
|
||||
packages/whisper-wrapper/native/
|
||||
|
||||
|
||||
# Debug
|
||||
|
|
@ -41,6 +50,7 @@ CLAUDE.md
|
|||
.local
|
||||
.claude
|
||||
amical.db
|
||||
AGENTS.md
|
||||
|
||||
# Temp files
|
||||
/tmp
|
||||
|
|
|
|||
4
.gitmodules
vendored
4
.gitmodules
vendored
|
|
@ -1,3 +1,3 @@
|
|||
[submodule "packages/smart-whisper/whisper.cpp"]
|
||||
path = packages/smart-whisper/whisper.cpp
|
||||
[submodule "packages/whisper-wrapper/whisper.cpp"]
|
||||
path = packages/whisper-wrapper/whisper.cpp
|
||||
url = https://github.com/ggerganov/whisper.cpp.git
|
||||
|
|
@ -40,7 +40,7 @@ export const EXTERNAL_DEPENDENCIES = [
|
|||
"libsql",
|
||||
"onnxruntime-node",
|
||||
"workerpool",
|
||||
"@amical/smart-whisper",
|
||||
"@amical/whisper-wrapper",
|
||||
// Add any other native modules you need here
|
||||
];
|
||||
|
||||
|
|
@ -160,6 +160,24 @@ const config: ForgeConfig = {
|
|||
}
|
||||
}
|
||||
|
||||
// Prune heavy native sources that trigger MAX_PATH on Windows packages
|
||||
const whisperWrapperPath = join(
|
||||
localNodeModules,
|
||||
"@amical",
|
||||
"whisper-wrapper",
|
||||
);
|
||||
const whisperPruneTargets = [
|
||||
join(whisperWrapperPath, "whisper.cpp"),
|
||||
join(whisperWrapperPath, "build"),
|
||||
join(whisperWrapperPath, ".cmake-js"),
|
||||
];
|
||||
for (const target of whisperPruneTargets) {
|
||||
if (existsSync(target)) {
|
||||
console.log(`Pruning ${target} from packaged output`);
|
||||
rmSync(target, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
// Second pass: Replace any symlinks with dereferenced copies
|
||||
console.log("Checking for symlinks in copied dependencies...");
|
||||
for (const dep of nativeModuleDependenciesToPackage) {
|
||||
|
|
@ -318,7 +336,7 @@ const config: ForgeConfig = {
|
|||
packagerConfig: {
|
||||
asar: {
|
||||
unpack:
|
||||
"{*.node,*.dylib,*.so,*.dll,*.metal,**/node_modules/@amical/smart-whisper/**,**/whisper.cpp/**,**/.vite/build/whisper-worker-fork.js,**/node_modules/jest-worker/**,**/onnxruntime-node/bin/**}",
|
||||
"{*.node,*.dylib,*.so,*.dll,*.metal,**/node_modules/@amical/whisper-wrapper/**,**/whisper.cpp/**,**/.vite/build/whisper-worker-fork.js,**/node_modules/jest-worker/**,**/onnxruntime-node/bin/**}",
|
||||
},
|
||||
name: "Amical",
|
||||
executableName: "Amical",
|
||||
|
|
|
|||
|
|
@ -81,7 +81,7 @@
|
|||
"dependencies": {
|
||||
"@ai-sdk/openai": "^1.3.22",
|
||||
"@amical/eslint-config": "workspace:*",
|
||||
"@amical/smart-whisper": "workspace:*",
|
||||
"@amical/whisper-wrapper": "workspace:*",
|
||||
"@amical/types": "workspace:*",
|
||||
"@amical/y-libsql": "workspace:*",
|
||||
"@dnd-kit/core": "^6.3.1",
|
||||
|
|
|
|||
|
|
@ -2,21 +2,7 @@ import dotenv from "dotenv";
|
|||
dotenv.config();
|
||||
|
||||
import { app } from "electron";
|
||||
import * as path from "path";
|
||||
|
||||
// Set GGML_METAL_PATH_RESOURCES before any other imports
|
||||
// This ensures @amical/smart-whisper can find its resources when unpacked from asar
|
||||
if (app.isPackaged) {
|
||||
// Point to the unpacked whisper.cpp directory
|
||||
process.env.GGML_METAL_PATH_RESOURCES = path.join(
|
||||
process.resourcesPath,
|
||||
"app.asar.unpacked",
|
||||
"node_modules",
|
||||
"@amical",
|
||||
"smart-whisper",
|
||||
"whisper.cpp",
|
||||
);
|
||||
}
|
||||
import started from "electron-squirrel-startup";
|
||||
import { AppManager } from "./core/app-manager";
|
||||
import { updateElectronApp } from "update-electron-app";
|
||||
|
|
|
|||
|
|
@ -44,7 +44,6 @@ export class SimpleForkWrapper {
|
|||
const workerEnv: any = {
|
||||
...process.env,
|
||||
ELECTRON_RUN_AS_NODE: "1",
|
||||
GGML_METAL_PATH_RESOURCES: process.env.GGML_METAL_PATH_RESOURCES,
|
||||
NODE_OPTIONS: "--max-old-space-size=8192",
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -57,6 +57,21 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
await this.initializeWhisper();
|
||||
}
|
||||
|
||||
/**
 * Asks the worker process which native binding it loaded.
 *
 * @returns The value returned by the worker's "getBindingInfo" method, or
 *   null when no worker wrapper exists or the call fails (the failure is
 *   logged as a warning rather than rethrown).
 */
async getBindingInfo(): Promise<{ path: string; type: string } | null> {
  const worker = this.workerWrapper;
  if (worker) {
    try {
      const info = await worker.exec<{
        path: string;
        type: string;
      } | null>("getBindingInfo", []);
      return info;
    } catch (error) {
      logger.transcription.warn("Failed to get binding info:", error);
    }
  }
  return null;
}
|
||||
|
||||
async transcribe(
|
||||
params: TranscribeParams & { flush?: boolean },
|
||||
): Promise<string> {
|
||||
|
|
@ -119,7 +134,7 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
`Starting transcription of ${aggregatedAudio.length} samples (${((aggregatedAudio.length / this.SAMPLE_RATE) * 1000).toFixed(0)}ms)`,
|
||||
);
|
||||
|
||||
// Transcribe using smart-whisper
|
||||
// Transcribe using the local Whisper wrapper
|
||||
if (!this.workerWrapper) {
|
||||
throw new Error("Worker wrapper is not initialized");
|
||||
}
|
||||
|
|
@ -137,7 +152,7 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
initial_prompt: initialPrompt,
|
||||
suppress_blank: true,
|
||||
suppress_non_speech_tokens: true,
|
||||
no_timestamps: true,
|
||||
no_timestamps: false,
|
||||
},
|
||||
]);
|
||||
|
||||
|
|
@ -302,7 +317,7 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
await this.workerWrapper.exec("initializeModel", [modelPath]);
|
||||
} catch (error) {
|
||||
logger.transcription.error(`Failed to initialize:`, error);
|
||||
throw new Error(`Failed to initialize smart-whisper: ${error}`);
|
||||
throw new Error(`Failed to initialize whisper wrapper: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,19 @@
|
|||
// Worker process entry point for fork
|
||||
import { Whisper } from "@amical/smart-whisper";
|
||||
import { Whisper, getLoadedBindingInfo } from "@amical/whisper-wrapper";
|
||||
|
||||
// Type definitions for IPC communication
|
||||
/** Request envelope received from the parent process over IPC. */
interface WorkerMessage {
  /** Identifier echoed back in the reply sent for this request. */
  id: number;
  /** Name of the worker method to invoke. */
  method: string;
  /** Positional arguments for the method, as received over IPC. */
  args: unknown[];
}

/** Tagged plain-object form of a Float32Array as it arrives over IPC. */
interface SerializedFloat32Array {
  __type: "Float32Array";
  data: number[];
}

/**
 * A single method argument as received over IPC. It may be a
 * SerializedFloat32Array or any other value, so it must be narrowed before
 * use. (A union of a specific type with `unknown` collapses to `unknown`,
 * so the type is declared as `unknown` directly.)
 */
type MethodArg = unknown;
|
||||
|
||||
// Simple console-based logging for worker process
|
||||
const logger = {
|
||||
|
|
@ -29,7 +43,6 @@ const methods = {
|
|||
whisperInstance = null;
|
||||
}
|
||||
|
||||
const { Whisper } = await import("@amical/smart-whisper");
|
||||
whisperInstance = new Whisper(modelPath, { gpu: true });
|
||||
try {
|
||||
await whisperInstance.load();
|
||||
|
|
@ -71,8 +84,17 @@ const methods = {
|
|||
);
|
||||
const transcription = await result;
|
||||
|
||||
logger.transcription.debug(
|
||||
`Transcription segments: ${Array.isArray(transcription) ? transcription.length : "?"}`,
|
||||
);
|
||||
if (Array.isArray(transcription)) {
|
||||
logger.transcription.debug(
|
||||
`First segment preview: ${transcription[0]?.text ?? "<none>"}`,
|
||||
);
|
||||
}
|
||||
|
||||
return transcription
|
||||
.map((segment) => segment.text)
|
||||
.map((segment: { text: string }) => segment.text)
|
||||
.join(" ")
|
||||
.trim();
|
||||
},
|
||||
|
|
@ -84,23 +106,39 @@ const methods = {
|
|||
currentModelPath = null;
|
||||
}
|
||||
},
|
||||
|
||||
getBindingInfo(): { path: string; type: string } | null {
|
||||
return getLoadedBindingInfo();
|
||||
},
|
||||
};
|
||||
|
||||
// Handle messages from parent process
|
||||
process.on("message", async (message: any) => {
|
||||
process.on("message", async (message: WorkerMessage) => {
|
||||
const { id, method, args } = message;
|
||||
|
||||
try {
|
||||
// Deserialize Float32Array from IPC
|
||||
const deserializedArgs = args.map((arg: any) => {
|
||||
if (arg && arg.__type === "Float32Array" && Array.isArray(arg.data)) {
|
||||
return new Float32Array(arg.data);
|
||||
const deserializedArgs = args.map((arg: MethodArg) => {
|
||||
if (
|
||||
arg &&
|
||||
typeof arg === "object" &&
|
||||
"__type" in arg &&
|
||||
arg.__type === "Float32Array"
|
||||
) {
|
||||
const serialized = arg as SerializedFloat32Array;
|
||||
if (Array.isArray(serialized.data)) {
|
||||
return new Float32Array(serialized.data);
|
||||
}
|
||||
}
|
||||
return arg;
|
||||
});
|
||||
|
||||
if (method in methods) {
|
||||
const result = await (methods as any)[method](...deserializedArgs);
|
||||
const methodName = method as keyof typeof methods;
|
||||
const fn = methods[methodName] as (
|
||||
...args: unknown[]
|
||||
) => Promise<unknown>;
|
||||
const result = await fn(...deserializedArgs);
|
||||
process.send!({ id, result });
|
||||
} else {
|
||||
process.send!({ id, error: `Unknown method: ${method}` });
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
// This file contains just the Whisper-specific operations that need to run in a separate process
|
||||
import { Whisper } from "@amical/smart-whisper";
|
||||
import { Whisper } from "@amical/whisper-wrapper";
|
||||
|
||||
// Simple console-based logging for worker process
|
||||
const logger = {
|
||||
|
|
@ -27,7 +27,6 @@ export async function initializeModel(modelPath: string): Promise<void> {
|
|||
whisperInstance = null;
|
||||
}
|
||||
|
||||
const { Whisper } = await import("@amical/smart-whisper");
|
||||
whisperInstance = new Whisper(modelPath, { gpu: true });
|
||||
try {
|
||||
await whisperInstance.load();
|
||||
|
|
@ -57,7 +56,7 @@ export async function transcribeAudio(
|
|||
const transcription = await result;
|
||||
|
||||
return transcription
|
||||
.map((segment) => segment.text)
|
||||
.map((segment: { text: string }) => segment.text)
|
||||
.join(" ")
|
||||
.trim();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ export interface TranscriptionMetrics {
|
|||
session_id?: string;
|
||||
model_id: string;
|
||||
model_preloaded?: boolean;
|
||||
whisper_native_binding?: string;
|
||||
total_duration_ms?: number;
|
||||
recording_duration_ms?: number;
|
||||
processing_duration_ms?: number;
|
||||
|
|
|
|||
|
|
@ -417,10 +417,22 @@ export class TranscriptionService {
|
|||
const audioDurationSeconds =
|
||||
session.context.sharedData.audioMetadata?.duration;
|
||||
|
||||
// Get native binding info if using local whisper
|
||||
let whisperNativeBinding: string | undefined;
|
||||
if (this.whisperProvider && "getBindingInfo" in this.whisperProvider) {
|
||||
const bindingInfo = await this.whisperProvider.getBindingInfo();
|
||||
whisperNativeBinding = bindingInfo?.type;
|
||||
logger.transcription.info(
|
||||
"whisper native binding used",
|
||||
whisperNativeBinding,
|
||||
);
|
||||
}
|
||||
|
||||
this.telemetryService.trackTranscriptionCompleted({
|
||||
session_id: sessionId,
|
||||
model_id: selectedModel!,
|
||||
model_preloaded: this.modelWasPreloaded,
|
||||
whisper_native_binding: whisperNativeBinding,
|
||||
total_duration_ms: totalDuration || 0,
|
||||
recording_duration_ms: recordingDuration,
|
||||
processing_duration_ms: processingDuration,
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ export default defineConfig({
|
|||
entryFileNames: "[name].js",
|
||||
},
|
||||
external: [
|
||||
"@amical/smart-whisper",
|
||||
"@amical/whisper-wrapper",
|
||||
"@libsql/client",
|
||||
"@libsql/darwin-arm64",
|
||||
"@libsql/darwin-x64",
|
||||
|
|
@ -51,7 +51,7 @@ export default defineConfig({
|
|||
optimizeDeps: {
|
||||
exclude: [
|
||||
"better-sqlite3",
|
||||
"smart-whisper",
|
||||
"@amical/whisper-wrapper",
|
||||
"drizzle-orm",
|
||||
"@libsql/client",
|
||||
],
|
||||
|
|
|
|||
|
|
@ -32,13 +32,13 @@
|
|||
"keytar",
|
||||
"protobufjs",
|
||||
"sharp",
|
||||
"smart-whisper",
|
||||
"@amical/whisper-wrapper",
|
||||
"drizzle-orm/libsql"
|
||||
],
|
||||
"onlyBuiltDependencies": [
|
||||
"electron",
|
||||
"electron-winstaller",
|
||||
"smart-whisper",
|
||||
"@amical/whisper-wrapper",
|
||||
"drizzle-orm/libsql",
|
||||
"@libsql",
|
||||
"macos-alias",
|
||||
|
|
|
|||
64
packages/smart-whisper/.gitignore
vendored
64
packages/smart-whisper/.gitignore
vendored
|
|
@ -1,64 +0,0 @@
|
|||
# Dependencies
|
||||
node_modules/
|
||||
|
||||
# Build outputs
|
||||
build/
|
||||
*.node
|
||||
bin/
|
||||
|
||||
# TypeScript outputs
|
||||
dist/
|
||||
# Keep the build configuration file
|
||||
!dist/build.js
|
||||
*.tsbuildinfo
|
||||
|
||||
# Native compilation artifacts
|
||||
*.o
|
||||
*.a
|
||||
*.so
|
||||
*.dylib
|
||||
*.dll
|
||||
*.lib
|
||||
*.exp
|
||||
*.ilk
|
||||
*.pdb
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
lerna-debug.log*
|
||||
.pnpm-debug.log*
|
||||
|
||||
# OS files
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
desktop.ini
|
||||
|
||||
# IDE files
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# Environment variables
|
||||
.env
|
||||
.env.local
|
||||
.env.*.local
|
||||
|
||||
# Test coverage
|
||||
coverage/
|
||||
*.lcov
|
||||
.nyc_output/
|
||||
|
||||
# Temporary files
|
||||
tmp/
|
||||
temp/
|
||||
*.tmp
|
||||
|
||||
# whisper.cpp build artifacts (if any get generated)
|
||||
whisper.cpp/build/
|
||||
whisper.cpp/*.o
|
||||
whisper.cpp/**/*.o
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
# Whisper.cpp Version Information
|
||||
|
||||
Repository: https://github.com/ggerganov/whisper.cpp
|
||||
Commit: 2ef717b293fe93872cc3a03ca77942936a281959
|
||||
Date: November 2024
|
||||
Description: whisper : add large-v3-turbo (#2440)
|
||||
|
||||
This file tracks the exact version of whisper.cpp used in this package.
|
||||
To update whisper.cpp, replace the contents of the whisper.cpp directory
|
||||
with a new version and update this file accordingly.
|
||||
|
|
@ -1,37 +0,0 @@
|
|||
{
|
||||
'variables' : {
|
||||
'openssl_fips': '',
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"target_name": "smart-whisper",
|
||||
"sources": [
|
||||
"src/binding/binding.cc",
|
||||
"src/binding/common.cc",
|
||||
"src/binding/model.cc",
|
||||
"src/binding/transcribe.cc",
|
||||
"<!@(node -p \"require('./dist/build.js').sources\")"
|
||||
],
|
||||
"libraries": [ "<!@(node -p \"require('./dist/build.js').libraries\")" ],
|
||||
'defines': [ "<!@(node -p \"require('./dist/build.js').defines\")" ],
|
||||
'include_dirs': ["<!@(node -p \"require('node-addon-api').include\")", "whisper.cpp/include", "whisper.cpp/ggml/include", "whisper.cpp/examples"],
|
||||
'dependencies': ["<!(node -p \"require('node-addon-api').gyp\")"],
|
||||
'cflags!': [ '-fno-exceptions' ],
|
||||
'cflags_cc!': [ '-fno-exceptions' ],
|
||||
'xcode_settings': {
|
||||
'GCC_ENABLE_CPP_EXCEPTIONS': 'YES',
|
||||
'CLANG_CXX_LIBRARY': 'libc++',
|
||||
},
|
||||
'msvs_settings': {
|
||||
'VCCLCompilerTool': { 'ExceptionHandling': 1 },
|
||||
},
|
||||
'conditions': [
|
||||
['OS=="mac"', {
|
||||
'xcode_settings': {
|
||||
'GCC_SYMBOLS_PRIVATE_EXTERN': 'YES', # -fvisibility=hidden
|
||||
}
|
||||
}]
|
||||
]
|
||||
}
|
||||
],
|
||||
}
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
{
|
||||
"name": "@amical/smart-whisper",
|
||||
"version": "0.1.0",
|
||||
"description": "Whisper.cpp Node.js binding with auto model offloading strategy.",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
"keywords": [
|
||||
"whisper",
|
||||
"whisper.cpp",
|
||||
"native",
|
||||
"binding",
|
||||
"addon"
|
||||
],
|
||||
"gypfile": true,
|
||||
"files": [
|
||||
"dist",
|
||||
"src",
|
||||
"scripts",
|
||||
"binding.gyp",
|
||||
"whisper.cpp/**/*.{c,h,cpp,hpp,m,cu,metal}",
|
||||
"whisper.cpp/Makefile",
|
||||
"whisper.cpp/LICENSE"
|
||||
],
|
||||
"scripts": {
|
||||
"install": "tsup",
|
||||
"postinstall": "node-gyp rebuild",
|
||||
"build": "tsup && node-gyp rebuild",
|
||||
"build:ts": "tsup",
|
||||
"build:native": "node-gyp rebuild"
|
||||
},
|
||||
"dependencies": {
|
||||
"node-addon-api": "^8.5.0",
|
||||
"minimatch": "10.0.3"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@amical/typescript-config": "workspace:*",
|
||||
"@types/node": "^24.3.0",
|
||||
"tsup": "^8.5.0",
|
||||
"typescript": "^5.8.2"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
"use strict";
|
||||
|
||||
// Prints, on a single line, the extra linker flags for every acceleration
// backend enabled through a WHISPER_* environment variable. A backend is
// enabled when its variable is set to any non-empty value.
const BACKEND_LINK_FLAGS = [
  ["WHISPER_OPENBLAS", `-lopenblas`],
  [
    "WHISPER_CUBLAS",
    `-lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64`,
  ],
  [
    "WHISPER_HIPBLAS",
    // The first flag previously lacked its leading "-", which made the linker
    // treat "lhipblas" as an input file instead of a library to link.
    `-lhipblas -lamdhip64 -lrocblas -L/opt/rocm/lib -L/opt/rocm/hipblas/lib -Wl,-rpath=/opt/rocm/lib`,
  ],
  ["WHISPER_CLBLAST", `-lclblast -lOpenCL`],
];

var libs = BACKEND_LINK_FLAGS.filter(([envVar]) => process.env[envVar]).map(
  ([, flags]) => flags,
);

console.log(libs.join(" "));
|
||||
|
|
@ -1,78 +0,0 @@
|
|||
process.env.GGML_METAL_PATH_RESOURCES =
|
||||
process.env.GGML_METAL_PATH_RESOURCES ||
|
||||
path.join(__dirname, "../whisper.cpp/ggml/src");
|
||||
|
||||
import path from "node:path";
|
||||
import { TranscribeFormat, TranscribeParams, TranscribeResult } from "./types";
|
||||
const module = require(path.join(__dirname, "../build/Release/smart-whisper"));
|
||||
|
||||
/**
|
||||
* An external handle to a model.
|
||||
*/
|
||||
export type Handle = {
|
||||
readonly "": unique symbol;
|
||||
};
|
||||
|
||||
export namespace Binding {
|
||||
/**
|
||||
* Load a model from a whisper weights file.
|
||||
* @param file The path to the whisper weights file.
|
||||
* @param gpu Whether to use the GPU or not.
|
||||
* @param callback A callback that will be called with the handle to the model.
|
||||
*/
|
||||
export declare function load(
|
||||
file: string,
|
||||
gpu: boolean,
|
||||
callback: (handle: Handle) => void,
|
||||
): void;
|
||||
|
||||
/**
|
||||
* Release the memory of the model, it will be unusable after this.
|
||||
* @param handle The handle to the model.
|
||||
* @param callback A callback that will be called when the model is freed.
|
||||
*/
|
||||
export declare function free(handle: Handle, callback: () => void): void;
|
||||
|
||||
/**
|
||||
* Transcribe a PCM buffer.
|
||||
* @param handle The handle to the model.
|
||||
* @param pcm The PCM buffer.
|
||||
* @param params The parameters to use for transcription.
|
||||
* @param finish A callback that will be called when the transcription is finished.
|
||||
* @param progress A callback that will be called when a new result is available.
|
||||
*/
|
||||
export declare function transcribe<
|
||||
Format extends TranscribeFormat,
|
||||
TokenTimestamp extends boolean,
|
||||
>(
|
||||
handle: Handle,
|
||||
pcm: Float32Array,
|
||||
params: Partial<TranscribeParams<Format, TokenTimestamp>>,
|
||||
finish: (results: TranscribeResult<Format, TokenTimestamp>[]) => void,
|
||||
progress: (result: TranscribeResult<Format, TokenTimestamp>) => void,
|
||||
): void;
|
||||
|
||||
export declare class WhisperModel {
|
||||
private _ctx;
|
||||
constructor(handle: Handle);
|
||||
get handle(): Handle | null;
|
||||
get freed(): boolean;
|
||||
/**
|
||||
* Release the memory of the model, it will be unusable after this.
|
||||
* It's safe to call this multiple times, but it will only free the model once.
|
||||
*/
|
||||
free(): Promise<void>;
|
||||
/**
|
||||
* Load a model from a whisper weights file.
|
||||
* @param file The path to the whisper weights file.
|
||||
* @param gpu Whether to use the GPU or not.
|
||||
* @returns A promise that resolves to a {@link WhisperModel}.
|
||||
*/
|
||||
static load(file: string, gpu?: boolean): Promise<WhisperModel>;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The native binding for the underlying C++ addon.
|
||||
*/
|
||||
export const binding: typeof Binding = module;
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
#include <napi.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "model.h"
|
||||
#include "transcribe.h"
|
||||
|
||||
// Addon entry point: registers the `transcribe` function and the
// WhisperModel class on `exports`. When IsProduction() reports true, whisper
// logging is routed to a callback that discards every message.
Napi::Object Init(Napi::Env env, Napi::Object exports) {
    exports.Set("transcribe", Napi::Function::New(env, Transcribe));
    WhisperModel::Init(env, exports);

    const bool production = IsProduction(env.Global());
    if (production) {
        auto discard_log = [](ggml_log_level, const char *, void *) {};
        whisper_log_set(discard_log, nullptr);
    }

    return exports;
}
|
||||
|
||||
NODE_API_MODULE(whisper, Init)
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
#include "common.h"
|
||||
|
||||
Napi::Promise PromiseWorker::Promise() { return promise.Promise(); }
|
||||
|
||||
// Returns true only when `process.env.NODE_ENV`, looked up through the given
// global object, is a string equal to "production".
bool IsProduction(const Napi::Object global_env) {
    const Napi::Object process_obj = global_env.Get("process").As<Napi::Object>();
    const Napi::Object env_obj = process_obj.Get("env").As<Napi::Object>();
    const Napi::Value node_env = env_obj.Get("NODE_ENV");

    return node_env.IsString() && node_env.As<Napi::String>().Utf8Value() == "production";
}
|
||||
|
|
@ -1,22 +0,0 @@
|
|||
#ifndef _GUARD_SW_COMMON_H
|
||||
#define _GUARD_SW_COMMON_H
|
||||
|
||||
#ifndef NAPI_VERSION
|
||||
// Support Node.js 16+
|
||||
#define NAPI_VERSION 8
|
||||
#endif
|
||||
#include <napi.h>
|
||||
|
||||
class PromiseWorker : public Napi::AsyncWorker {
|
||||
public:
|
||||
PromiseWorker(Napi::Env &env) : AsyncWorker(env), promise(Napi::Promise::Deferred::New(env)) {}
|
||||
|
||||
Napi::Promise Promise();
|
||||
|
||||
protected:
|
||||
Napi::Promise::Deferred promise;
|
||||
};
|
||||
|
||||
bool IsProduction(const Napi::Object global_env);
|
||||
|
||||
#endif
|
||||
|
|
@ -1,145 +0,0 @@
|
|||
#include "model.h"
|
||||
|
||||
class LoadModelWorker : public PromiseWorker {
|
||||
public:
|
||||
LoadModelWorker(Napi::Env &env, const std::string &model_path,
|
||||
struct whisper_context_params params)
|
||||
: PromiseWorker(env), model_path(model_path), params(params) {}
|
||||
|
||||
void Execute() override {
|
||||
context = whisper_init_from_file_with_params_no_state(model_path.c_str(), params);
|
||||
if (context == nullptr) {
|
||||
SetError("Failed to initialize whisper context");
|
||||
}
|
||||
whisper_print_timings(context);
|
||||
}
|
||||
|
||||
void OnOK() override {
|
||||
Napi::HandleScope scope(Env());
|
||||
auto handle = Napi::External<whisper_context>::New(Env(), context);
|
||||
auto constructor = Env().GetInstanceData<Napi::FunctionReference>();
|
||||
auto model = constructor->New({handle});
|
||||
|
||||
promise.Resolve(model);
|
||||
}
|
||||
|
||||
private:
|
||||
std::string model_path;
|
||||
struct whisper_context_params params;
|
||||
whisper_context *context;
|
||||
};
|
||||
|
||||
class FreeModelWorker : public PromiseWorker {
|
||||
public:
|
||||
FreeModelWorker(Napi::Env &env, whisper_context *context)
|
||||
: PromiseWorker(env), context(context) {}
|
||||
|
||||
void Execute() override { whisper_free(context); }
|
||||
|
||||
void OnOK() override {
|
||||
Napi::HandleScope scope(Env());
|
||||
promise.Resolve(Env().Undefined());
|
||||
}
|
||||
|
||||
private:
|
||||
whisper_context *context;
|
||||
};
|
||||
|
||||
// Defines the JavaScript class "WhisperModel" (static `load`, instance `free`,
// read-only accessors `freed` and `handle`), stores a persistent reference to
// its constructor as the environment's instance data, and exports the class.
Napi::Object WhisperModel::Init(Napi::Env env, Napi::Object exports) {
    const auto method_attrs =
        static_cast<napi_property_attributes>(napi_writable | napi_configurable);
    const auto accessor_attrs =
        static_cast<napi_property_attributes>(napi_enumerable | napi_configurable);

    Napi::Function func = DefineClass(
        env, "WhisperModel",
        {
            StaticMethod<&WhisperModel::Load>("load", method_attrs),
            InstanceMethod<&WhisperModel::Free>("free", method_attrs),
            InstanceAccessor("freed", &WhisperModel::GetFreed, nullptr, accessor_attrs),
            InstanceAccessor("handle", &WhisperModel::GetHandle, nullptr, accessor_attrs),
        });

    auto ctor_ref = new Napi::FunctionReference();
    *ctor_ref = Napi::Persistent(func);
    env.SetInstanceData<Napi::FunctionReference>(ctor_ref);

    exports.Set("WhisperModel", func);
    return exports;
}
|
||||
|
||||
// Constructs the wrapper around an existing whisper context.
// Expects exactly one argument: an External holding the whisper_context.
// On invalid arguments a TypeError is raised and the wrapper holds no context.
WhisperModel::WhisperModel(const Napi::CallbackInfo &info)
    : Napi::ObjectWrap<WhisperModel>(info), context(nullptr) {
    // `context` starts as nullptr so that the early returns below never leave
    // an indeterminate pointer for Finalize() to hand to whisper_free().
    Napi::Env env = info.Env();
    Napi::HandleScope scope(env);

    if (info.Length() != 1) {
        Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
        return;
    }

    if (!info[0].IsExternal()) {
        Napi::TypeError::New(env, "Expected an external whisper context handle")
            .ThrowAsJavaScriptException();
        return;
    }

    this->context = info[0].As<Napi::External<whisper_context>>().Data();
}
|
||||
|
||||
// Releases the wrapped context if one is still held, then clears the pointer.
void WhisperModel::Finalize(Napi::Env env) {
    if (context == nullptr) {
        return;
    }

    whisper_free(context);
    context = nullptr;
}
|
||||
|
||||
// Static `load(file, gpu?)`: queues a LoadModelWorker and returns its promise.
//   info[0]  path of the whisper weights file (string)
//   info[1]  optional boolean selecting GPU use; defaults to true when it is
//            omitted or passed as `undefined`
// Raises a TypeError and returns null when the argument count is not 1 or 2.
Napi::Value WhisperModel::Load(const Napi::CallbackInfo &info) {
    Napi::Env env = info.Env();

    if (info.Length() < 1 || info.Length() > 2) {
        Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
        return env.Null();
    }

    std::string model_path = info[0].As<Napi::String>();

    bool use_gpu = true;
    if (info.Length() == 2 && !info[1].IsUndefined()) {
        use_gpu = info[1].As<Napi::Boolean>().Value();
    }

    // Start from the library defaults: a default-constructed struct would leave
    // every field other than use_gpu with an indeterminate value.
    whisper_context_params params = whisper_context_default_params();
    params.use_gpu = use_gpu;

    auto worker = new LoadModelWorker(env, model_path, params);
    worker->Queue();

    return worker->Promise();
}
|
||||
|
||||
// Instance `free()`: returns a promise that resolves once the context has been
// released. When no context is held, the returned promise is already resolved.
// Raises a TypeError and returns null if any argument is passed.
Napi::Value WhisperModel::Free(const Napi::CallbackInfo &info) {
    Napi::Env env = info.Env();

    if (info.Length() != 0) {
        Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
        return env.Null();
    }

    if (context != nullptr) {
        // Detach the pointer from the wrapper before the worker releases it.
        whisper_context *to_release = context;
        context = nullptr;

        auto worker = new FreeModelWorker(env, to_release);
        worker->Queue();
        return worker->Promise();
    }

    auto settled = Napi::Promise::Deferred::New(env);
    settled.Resolve(env.Undefined());
    return settled.Promise();
}
|
||||
|
||||
// Accessor `freed`: true when the wrapper no longer holds a context.
Napi::Value WhisperModel::GetFreed(const Napi::CallbackInfo &info) {
    const bool is_freed = (context == nullptr);
    return Napi::Boolean::New(info.Env(), is_freed);
}
|
||||
|
||||
// Accessor `handle`: a new External wrapping the held context, or null when
// no context is held.
Napi::Value WhisperModel::GetHandle(const Napi::CallbackInfo &info) {
    Napi::Env env = info.Env();

    if (context != nullptr) {
        return Napi::External<whisper_context>::New(env, context);
    }

    return env.Null();
}
|
||||
|
|
@ -1,22 +0,0 @@
|
|||
#ifndef _GUARD_SW_MODEL_H
|
||||
#define _GUARD_SW_MODEL_H
|
||||
|
||||
#include "common.h"
|
||||
#include "whisper.h"
|
||||
|
||||
// JavaScript-visible wrapper that owns a whisper_context pointer.
class WhisperModel : public Napi::ObjectWrap<WhisperModel> {
   public:
    // Registers the class on `exports`.
    static Napi::Object Init(Napi::Env env, Napi::Object exports);

    WhisperModel(const Napi::CallbackInfo &info);
    void Finalize(Napi::Env env);

   private:
    // Held context; nullptr means "nothing to release". Initialized here so the
    // pointer is never indeterminate, whichever path the constructor takes.
    whisper_context *context = nullptr;

    static Napi::Value Load(const Napi::CallbackInfo &info);
    Napi::Value Free(const Napi::CallbackInfo &info);
    Napi::Value GetFreed(const Napi::CallbackInfo &info);
    Napi::Value GetHandle(const Napi::CallbackInfo &info);
};
|
||||
|
||||
#endif
|
||||
|
|
@ -1,358 +0,0 @@
|
|||
#include "transcribe.h"
|
||||
|
||||
struct smart_whisper_transcribe_params {
|
||||
const char* format;
|
||||
};
|
||||
|
||||
// Builds a whisper_full_params from a JavaScript options object.
//
// Starts from the beam-search defaults and overrides each field whose key is
// present on `o`. `initial_prompt` and `language` are strdup'ed copies (the
// language falls back to a copy of "auto", the prompt to nullptr), so the
// caller is responsible for free()-ing them.
struct whisper_full_params whisper_full_params_from_js(Napi::Object o) {
    struct whisper_full_params params =
        whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH);

    // Copies a numeric option into `field` when the key is present.
    auto number = [&o](const char *key, auto &field) {
        if (o.Has(key)) {
            field = o.Get(key).As<Napi::Number>();
        }
    };
    // Copies a boolean option into `field` when the key is present.
    auto flag = [&o](const char *key, bool &field) {
        if (o.Has(key)) {
            field = o.Get(key).As<Napi::Boolean>();
        }
    };
    // Copies a boolean option only when the key is present AND holds a boolean.
    auto strict_flag = [&o](const char *key, bool &field) {
        if (o.Has(key) && o.Get(key).IsBoolean()) {
            field = o.Get(key).As<Napi::Boolean>();
        }
    };
    // True when the key is present and holds a string.
    auto has_string = [&o](const char *key) { return o.Has(key) && o.Get(key).IsString(); };

    if (o.Has("strategy")) {
        const int32_t strategy = o.Get("strategy").As<Napi::Number>().Int32Value();
        params.strategy = static_cast<whisper_sampling_strategy>(strategy);
    }
    number("n_threads", params.n_threads);
    number("n_max_text_ctx", params.n_max_text_ctx);
    number("offset_ms", params.offset_ms);
    number("duration_ms", params.duration_ms);

    flag("translate", params.translate);
    flag("no_context", params.no_context);
    flag("no_timestamps", params.no_timestamps);
    flag("single_segment", params.single_segment);
    flag("print_special", params.print_special);
    flag("print_progress", params.print_progress);
    flag("print_realtime", params.print_realtime);
    flag("print_timestamps", params.print_timestamps);

    flag("token_timestamps", params.token_timestamps);
    number("thold_pt", params.thold_pt);
    number("thold_ptsum", params.thold_ptsum);
    number("max_len", params.max_len);
    flag("split_on_word", params.split_on_word);
    number("max_tokens", params.max_tokens);

    flag("debug_mode", params.debug_mode);
    number("audio_ctx", params.audio_ctx);

    flag("tdrz_enable", params.tdrz_enable);

    if (has_string("initial_prompt")) {
        const std::string prompt_text = o.Get("initial_prompt").As<Napi::String>().Utf8Value();
        params.initial_prompt = strdup(prompt_text.c_str());
    } else {
        params.initial_prompt = nullptr;
    }

    if (has_string("language")) {
        const std::string language_code = o.Get("language").As<Napi::String>().Utf8Value();
        params.language = strdup(language_code.c_str());
    } else {
        params.language = strdup("auto");
    }

    strict_flag("suppress_blank", params.suppress_blank);
    strict_flag("suppress_non_speech_tokens", params.suppress_non_speech_tokens);

    number("temperature", params.temperature);
    number("max_initial_ts", params.max_initial_ts);
    number("length_penalty", params.length_penalty);

    number("temperature_inc", params.temperature_inc);
    number("entropy_thold", params.entropy_thold);
    number("logprob_thold", params.logprob_thold);
    number("no_speech_thold", params.no_speech_thold);

    number("best_of", params.greedy.best_of);

    number("beam_size", params.beam_search.beam_size);

    return params;
}
|
||||
|
||||
/**
 * Extracts the addon-specific (non-whisper) transcription options from the JS
 * params object.
 *
 * Only "format" is read. When it is absent or not a string the format falls
 * back to "simple". The returned `format` pointer is allocated with strdup()
 * and must be released with free() by whoever ends up owning the struct.
 */
struct smart_whisper_transcribe_params smart_whisper_transcribe_params_from_js(Napi::Object o) {
    // Start from the default and overwrite it only for a valid string value.
    std::string chosen = "simple";
    if (o.Has("format")) {
        Napi::Value raw = o.Get("format");
        if (raw.IsString()) {
            chosen = raw.As<Napi::String>().Utf8Value();
        }
    }

    struct smart_whisper_transcribe_params result;
    result.format = strdup(chosen.c_str());
    return result;
}
|
||||
|
||||
/**
 * Async worker that runs one whisper transcription off the JS thread.
 *
 * - Execute() runs on a worker thread, creates a private whisper_state and
 *   calls whisper_full_with_state(); every newly decoded segment index is
 *   queued as a progress event.
 * - OnProgress() runs on the JS thread and forwards one segment object to the
 *   progress callback.
 * - OnOK() runs on the JS thread after a successful run and passes the full
 *   segment array to the finish callback. On failure the base class reports
 *   the error set via SetError().
 *
 * Ownership: the worker takes ownership of `samples` (delete[]), of the
 * strdup()'d `params.initial_prompt` / `params.language`, and of
 * `smart_params.format`; all are released in the destructor. `context` is
 * borrowed and must outlive the worker.
 */
class TranscribeWorker : public Napi::AsyncProgressQueueWorker<int> {
   public:
    TranscribeWorker(whisper_context* context, const float* samples, int n_samples,
                     struct whisper_full_params params,
                     struct smart_whisper_transcribe_params smart_params,
                     Napi::Function& finish_callback, Napi::Function& progress_callback)
        : AsyncProgressQueueWorker(finish_callback),
          context(context),
          state(nullptr),
          samples(samples),
          n_samples(n_samples),
          params(params),
          smart_params(smart_params) {
        this->progress_callback.Reset(progress_callback, 1);
    }

    ~TranscribeWorker() {
        delete[] samples;
        // whisper_free_params(&params); will lead to a double free
        if (params.initial_prompt != nullptr) {
            free((void*)params.initial_prompt);
        }
        if (params.language != nullptr) {
            free((void*)params.language);
        }
        if (state != nullptr) {
            whisper_free_state(state);
        }

        free((void*)smart_params.format);
    }

    void Execute(const ExecutionProgress& progress) override {
        state = whisper_init_state(context);
        if (state == nullptr) {
            // Without a state whisper_full_with_state() would dereference null.
            SetError("whisper_init_state failed");
            return;
        }

        params.new_segment_callback = [](struct whisper_context* /*ctx*/,
                                         struct whisper_state* st, int n_new, void* user_data) {
            const ExecutionProgress& sink = *static_cast<const ExecutionProgress*>(user_data);

            // whisper may append several segments per callback; report each of
            // the `n_new` trailing segments instead of only the last one.
            const int total = whisper_full_n_segments_from_state(st);
            int first = total - n_new;
            if (first < 0) {
                first = 0;
            }
            for (int i = first; i < total; i++) {
                sink.Send(&i, 1);
            }
        };
        params.new_segment_callback_user_data =
            const_cast<void*>(static_cast<const void*>(&progress));

        int err = whisper_full_with_state(context, state, params, samples, n_samples);
        if (err != 0) {
            SetError("whisper_full operation failed");
        }
    }

    // NOTE(review): this runs on the JS thread while Execute() may still be
    // appending segments to `state` on the worker thread — confirm whisper's
    // state accessors are safe to call concurrently.
    void OnProgress(const int* data, size_t _count) override {
        Napi::HandleScope scope(Env());

        if (this->progress_callback.IsEmpty() || data == nullptr) {
            return;
        }

        // Progress events never carried per-token timestamps; keep that shape.
        Napi::Object segment = BuildSegment(*data, false);
        this->progress_callback.Call({segment});
    }

    void OnOK() override {
        Napi::HandleScope scope(Env());

        int n_segments = whisper_full_n_segments_from_state(state);
        Napi::Array segments = Napi::Array::New(Env(), n_segments);
        for (int i = 0; i < n_segments; i++) {
            Napi::Object segment = BuildSegment(i, params.token_timestamps);
            segments.Set(i, segment);
        }

        Callback().Call({segments});
    }

   private:
    /**
     * Builds the JS object for segment `i`.
     *
     * Always sets "from", "to" (segment timestamps scaled by 10) and "text".
     * When the requested format is "detail" it additionally sets "lang",
     * "tokens" and "confidence". Confidence is the mean probability of the
     * tokens whose id is <= the end-of-text token; when more than two such
     * tokens exist the smallest and largest probabilities are dropped first.
     * With no countable tokens the confidence is 0 rather than NaN.
     *
     * @param i                     segment index inside `state`
     * @param with_token_timestamps also emit "from"/"to" on every token
     */
    Napi::Object BuildSegment(int i, bool with_token_timestamps) {
        Napi::Env env = Env();

        Napi::Object segment = Napi::Object::New(env);
        segment.Set("from",
                    Napi::Number::New(env, whisper_full_get_segment_t0_from_state(state, i) * 10));
        segment.Set("to",
                    Napi::Number::New(env, whisper_full_get_segment_t1_from_state(state, i) * 10));
        segment.Set("text",
                    Napi::String::New(env, whisper_full_get_segment_text_from_state(state, i)));

        if (strcmp(smart_params.format, "detail") != 0) {
            return segment;
        }

        float confidence = 0, min_p = 1, max_p = 0;
        int skips = 0;
        int tokens = whisper_full_n_tokens_from_state(state, i);
        Napi::Array tokens_array = Napi::Array::New(env, tokens);
        for (int j = 0; j < tokens; j++) {
            auto token = whisper_full_get_token_data_from_state(state, i, j);
            Napi::Object token_object = Napi::Object::New(env);
            token_object.Set("text",
                             Napi::String::New(env, whisper_full_get_token_text_from_state(
                                                        context, state, i, j)));
            token_object.Set("id", Napi::Number::New(env, token.id));
            token_object.Set("p", Napi::Number::New(env, token.p));
            if (with_token_timestamps) {
                token_object.Set("from", Napi::Number::New(env, token.t0 * 10));
                token_object.Set("to", Napi::Number::New(env, token.t1 * 10));
            }
            tokens_array.Set(j, token_object);

            // Ids above end-of-text are special tokens; exclude them from the
            // confidence statistics.
            if (token.id > whisper_token_eot(context)) {
                skips++;
                continue;
            }
            confidence += token.p;
            min_p = std::min(min_p, token.p);
            max_p = std::max(max_p, token.p);
        }

        // Guard every division: `counted` can be 0, 1 or 2.
        const int counted = tokens - skips;
        if (counted > 2) {
            confidence = (confidence - min_p - max_p) / (counted - 2);
        } else if (counted > 0) {
            confidence = confidence / counted;
        }

        segment.Set("lang", Napi::String::New(
                                env, whisper_lang_str(whisper_full_lang_id_from_state(state))));
        segment.Set("confidence", Napi::Number::New(env, confidence));
        segment.Set("tokens", tokens_array);
        return segment;
    }

    whisper_context* context;
    whisper_state* state;
    const float* samples;
    int n_samples;
    struct whisper_full_params params;
    struct smart_whisper_transcribe_params smart_params;
    Napi::FunctionReference progress_callback;
};
|
||||
|
||||
/**
 * JS entry point: transcribe(context, pcm, params, finish_callback, progress_callback).
 *
 * Copies the Float32Array samples, converts `params` into whisper and addon
 * parameter structs, and queues a TranscribeWorker. Returns `undefined`;
 * results are delivered through the two callbacks.
 *
 * Throws a JS TypeError when the argument count or argument types are wrong,
 * an Error when the context handle is null, and a RangeError when the sample
 * count does not fit in an int.
 */
Napi::Value Transcribe(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();

    if (info.Length() != 5) {
        Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
        return env.Null();
    }

    // As<T>() does not verify the underlying JS type, so check explicitly
    // before casting to avoid undefined behaviour on bad input.
    const bool pcm_is_f32 =
        info[1].IsTypedArray() &&
        info[1].As<Napi::TypedArray>().TypedArrayType() == napi_float32_array;
    if (!info[0].IsExternal() || !pcm_is_f32 || !info[2].IsObject() || !info[3].IsFunction() ||
        !info[4].IsFunction()) {
        Napi::TypeError::New(env, "Wrong argument types").ThrowAsJavaScriptException();
        return env.Null();
    }

    whisper_context* context = info[0].As<Napi::External<whisper_context>>().Data();
    if (context == nullptr) {
        Napi::Error::New(env, "Invalid whisper context").ThrowAsJavaScriptException();
        return env.Null();
    }

    Napi::Float32Array pcm = info[1].As<Napi::Float32Array>();
    const size_t length = pcm.ElementLength();
    const int n_samples = static_cast<int>(length);
    // Round-trip check: whisper takes an int sample count.
    if (n_samples < 0 || static_cast<size_t>(n_samples) != length) {
        Napi::RangeError::New(env, "Too many samples").ThrowAsJavaScriptException();
        return env.Null();
    }

    Napi::Object params = info[2].As<Napi::Object>();
    Napi::Function finish_callback = info[3].As<Napi::Function>();
    Napi::Function progress_callback = info[4].As<Napi::Function>();

    auto whisper_params = whisper_full_params_from_js(params);
    auto smart_params = smart_whisper_transcribe_params_from_js(params);

    // Allocate the private copy last so nothing above can leak it. The worker
    // takes ownership and releases it with delete[].
    float* samples = new float[length];
    if (length > 0) {
        memcpy(samples, pcm.Data(), pcm.ByteLength());
    }

    auto worker = new TranscribeWorker(context, samples, n_samples, whisper_params, smart_params,
                                       finish_callback, progress_callback);
    worker->Queue();

    return env.Undefined();
}
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
#ifndef _GUARD_SW_TRANSCRIBE_H
|
||||
#define _GUARD_SW_TRANSCRIBE_H
|
||||
|
||||
#include "common.h"
|
||||
#include "whisper.h"
|
||||
|
||||
Napi::Value Transcribe(const Napi::CallbackInfo& info);
|
||||
|
||||
#endif
|
||||
|
|
@ -1,97 +0,0 @@
|
|||
import os from "node:os";
|
||||
import { execSync } from "node:child_process";
|
||||
|
||||
type ComputeBackend = "cpu" | "accelerate" | "metal" | "clblast" | "openblas";
|
||||
|
||||
const cfg = config();
|
||||
|
||||
export const sources = cfg.sources.join(" ");
|
||||
export const defines = cfg.defines.join(" ");
|
||||
export const libraries = cfg.libraries.join(" ");
|
||||
|
||||
/**
 * Computes the native build inputs (source files, preprocessor defines and
 * linker arguments).
 *
 * - When `BYOL` is set, no sources are compiled and its value is the only
 *   library.
 * - Otherwise the backend comes from `COMPUTE_BACKEND`, or from
 *   `infer_backend()` when that variable is unset, and the backend-specific
 *   defines/libraries are added on top of the common whisper/ggml sources.
 */
function config(): {
  sources: string[];
  defines: string[];
  libraries: string[];
} {
  const prebuilt = process.env.BYOL;
  if (prebuilt) {
    return { sources: [], defines: [], libraries: [prebuilt] };
  }

  const requested = process.env.COMPUTE_BACKEND as ComputeBackend | undefined;
  const backend: ComputeBackend = requested ?? infer_backend();

  const sources: string[] = [
    "whisper.cpp/src/whisper.cpp",
    "whisper.cpp/ggml/src/ggml.c",
    "whisper.cpp/ggml/src/ggml-alloc.c",
    "whisper.cpp/ggml/src/ggml-backend.c",
    "whisper.cpp/ggml/src/ggml-quants.c",
    "whisper.cpp/ggml/src/ggml-aarch64.c",
  ];
  const defines: string[] = [];
  const libraries: string[] = [];

  if (backend === "accelerate") {
    defines.push("GGML_USE_ACCELERATE");
    libraries.push('"-framework Foundation"', '"-framework Accelerate"');
  } else if (backend === "metal") {
    sources.push("whisper.cpp/ggml/src/ggml-metal.m");
    defines.push("GGML_USE_ACCELERATE", "GGML_USE_METAL");
    libraries.push(
      '"-framework Foundation"',
      '"-framework Accelerate"',
      '"-framework Metal"',
      '"-framework MetalKit"',
    );
  } else if (backend === "openblas") {
    defines.push("GGML_USE_OPENBLAS");
    libraries.push("-lopenblas");
  }
  // Any other backend value builds the plain CPU configuration.

  return { sources, defines, libraries };
}
|
||||
|
||||
/**
 * Guesses the compute backend for the current machine.
 *
 * - macOS: "metal" on arm64, otherwise "accelerate".
 * - Linux: "openblas" when `ldconfig -p | grep libopenblas` prints anything.
 * - Everything else, or any failure while probing: "cpu".
 */
function infer_backend(): ComputeBackend {
  try {
    const platform = os.platform();

    if (platform === "darwin") {
      return os.arch() === "arm64" ? "metal" : "accelerate";
    }

    if (platform === "linux") {
      // grep exits non-zero when nothing matches, which makes execSync throw;
      // the catch below turns that into the cpu default.
      const listing = execSync("ldconfig -p | grep libopenblas")
        .toString()
        .trim();
      if (listing) {
        return "openblas";
      }
    }
  } catch {
    // if anything goes wrong, just use the default cpu backend
  }

  return "cpu";
}
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
export * from "./binding";
|
||||
export * from "./model";
|
||||
export * from "./transcribe";
|
||||
export * from "./types";
|
||||
export * from "./whisper";
|
||||
|
||||
export * as manager from "./model-manager";
|
||||
|
|
@ -1,125 +0,0 @@
|
|||
import path from "node:path";
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import { Readable } from "node:stream";
|
||||
import type { ReadableStream } from "node:stream/web";
|
||||
|
||||
const root = path.join(os.homedir(), ".smart-whisper");
|
||||
const models = path.join(root, "models");
|
||||
const ext = ".bin";
|
||||
|
||||
fs.mkdirSync(models, { recursive: true });
|
||||
|
||||
const BASE_MODELS_URL =
|
||||
"https://huggingface.co/ggerganov/whisper.cpp/resolve/main";
|
||||
|
||||
/**
|
||||
* MODELS is an object that contains the URLs of different ggml whisper models.
|
||||
* Each model is represented by a key-value pair, where the key is the model name
|
||||
* and the value is the URL of the model.
|
||||
*/
|
||||
export const MODELS = {
|
||||
tiny: `${BASE_MODELS_URL}/ggml-tiny.bin`,
|
||||
"tiny.en": `${BASE_MODELS_URL}/ggml-tiny.en.bin`,
|
||||
small: `${BASE_MODELS_URL}/ggml-small.bin`,
|
||||
"small.en": `${BASE_MODELS_URL}/ggml-small.en.bin`,
|
||||
base: `${BASE_MODELS_URL}/ggml-base.bin`,
|
||||
"base.en": `${BASE_MODELS_URL}/ggml-base.en.bin`,
|
||||
medium: `${BASE_MODELS_URL}/ggml-medium.bin`,
|
||||
"medium.en": `${BASE_MODELS_URL}/ggml-medium.en.bin`,
|
||||
"large-v1": `${BASE_MODELS_URL}/ggml-large-v1.bin`,
|
||||
"large-v2": `${BASE_MODELS_URL}/ggml-large-v2.bin`,
|
||||
"large-v3": `${BASE_MODELS_URL}/ggml-large-v3.bin`,
|
||||
"large-v3-turbo": `${BASE_MODELS_URL}/ggml-large-v3-turbo.bin`,
|
||||
} as const;
|
||||
|
||||
export type ModelName = keyof typeof MODELS | (string & {});
|
||||
|
||||
/**
 * Downloads a ggml whisper model from a specified URL or shorthand.
 *
 * The returned name never includes the `.bin` extension, so it can be passed
 * straight to {@link check}, {@link resolve} and {@link remove}. If the model
 * is already present locally, nothing is downloaded.
 *
 * The file is streamed to a temporary `.part` path and renamed into place only
 * after the transfer succeeds, so an interrupted download never looks like a
 * usable model.
 *
 * @param model - The model to download, specified either as a key of the {@link MODELS} object or as a URL.
 * @returns A promise that resolves to the name of the downloaded model.
 * @throws An error if the model URL or shorthand is invalid, or if the model fails to download.
 */
export async function download(model: ModelName): Promise<string> {
  let url = "";
  let name = "";

  // Own-property lookup: `in` would also match inherited keys like "toString".
  if (Object.prototype.hasOwnProperty.call(MODELS, model)) {
    url = MODELS[model as keyof typeof MODELS];
    name = model;
  } else {
    try {
      const parsed = new URL(model);
      url = parsed.href;
      name = parsed.pathname.split("/").pop() ?? "";
    } catch {
      // Not a URL either; reported below as an invalid model.
    }
  }

  if (!url) {
    throw new Error(`Invalid model URL or shorthand: ${model}`);
  }

  // Model names are stored without the extension (see `list`), so strip it
  // from URL-derived file names before checking the cache.
  if (name.endsWith(ext)) {
    name = name.slice(0, -ext.length);
  }

  if (!name) {
    throw new Error(`Failed to parse model name: ${url}`);
  }

  if (check(name)) {
    return name;
  }

  const res = await fetch(url);
  if (!res.ok || !res.body) {
    throw new Error(`Failed to download model: ${res.statusText}`);
  }

  const target = path.join(models, name + ext);
  const partial = `${target}.part`;

  // pipeline() rejects on read or write errors and destroys both streams,
  // unlike a bare pipe() which would leave this promise pending forever.
  const { pipeline } = await import("node:stream/promises");
  try {
    await pipeline(
      Readable.fromWeb(res.body as ReadableStream<Uint8Array>),
      fs.createWriteStream(partial),
    );
    await fs.promises.rename(partial, target);
  } catch (error) {
    await fs.promises.rm(partial, { force: true });
    throw error;
  }

  return name;
}
|
||||
|
||||
/**
 * Deletes a locally stored model file. Does nothing when the model is not
 * present.
 * @param model - The name of the model to remove (without extension).
 */
export function remove(model: ModelName): void {
  if (!check(model)) {
    return;
  }
  const target = path.join(models, model + ext);
  fs.unlinkSync(target);
}
|
||||
|
||||
/**
 * Lists the models stored in the local models directory.
 * @returns The names of all files ending in the model extension, with that
 * extension removed.
 */
export function list(): ModelName[] {
  const names: ModelName[] = [];
  for (const entry of fs.readdirSync(models)) {
    if (entry.endsWith(ext)) {
      names.push(entry.slice(0, -ext.length));
    }
  }
  return names;
}
|
||||
|
||||
/**
 * Tells whether a model file is present in the local models directory.
 * @param model - The name of the model (without extension).
 * @returns True if the model exists, false otherwise.
 */
export function check(model: ModelName): boolean {
  const candidate = path.join(models, `${model}${ext}`);
  return fs.existsSync(candidate);
}
|
||||
|
||||
/**
 * Returns the path of a locally stored model inside the models directory.
 * @param model - The name of the model (without extension).
 * @returns The path of the model file.
 * @throws Error if the model is not found.
 */
export function resolve(model: ModelName): string {
  if (!check(model)) {
    throw new Error(`Model not found: ${model}`);
  }
  return path.join(models, model + ext);
}
|
||||
|
||||
export const dir = { root, models };
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
import { binding } from "./binding";
|
||||
|
||||
/**
 * Public name for the native binding's `WhisperModel`; the subclass adds no
 * members of its own.
 */
export class WhisperModel extends binding.WhisperModel {}
|
||||
|
|
@ -1,114 +0,0 @@
|
|||
import EventEmitter from "node:events";
|
||||
import type { WhisperModel } from "./model";
|
||||
import { TranscribeFormat, TranscribeParams, TranscribeResult } from "./types";
|
||||
import { binding } from "./binding";
|
||||
|
||||
/**
 * A single transcription job running on a {@link WhisperModel}.
 *
 * Events:
 * - `"transcribed"` — emitted with each segment as soon as it is decoded.
 * - `"finish"` — emitted once with the complete result array.
 *
 * The {@link TranscribeTask.result} promise resolves with the same array that
 * `"finish"` carries, and rejects when the native layer reports a failure.
 */
export class TranscribeTask<
  Format extends TranscribeFormat,
  TokenTimestamp extends boolean,
> extends EventEmitter {
  private _model: WhisperModel;
  private _result: Promise<TranscribeResult<Format, TokenTimestamp>[]> | null =
    null;

  /**
   * You should not construct this class directly, use {@link TranscribeTask.run} instead.
   */
  constructor(model: WhisperModel) {
    super();
    this._model = model;
  }

  /** The model this task runs on. */
  get model(): WhisperModel {
    return this._model;
  }

  /**
   * A promise that resolves to the result of the transcription task.
   * @throws Error if the task has not been started.
   */
  get result(): Promise<TranscribeResult<Format, TokenTimestamp>[]> {
    if (this._result === null) {
      throw new Error("Task has not been started");
    }
    return this._result;
  }

  private async _run(
    pcm: Float32Array,
    params: Partial<TranscribeParams<Format, TokenTimestamp>>,
  ): Promise<TranscribeResult<Format, TokenTimestamp>[]> {
    return new Promise((resolve, reject) => {
      const handle = this.model.handle;
      if (!handle) {
        reject(new Error("Model has been freed"));
        return;
      }

      binding.transcribe(
        handle,
        pcm,
        params,
        (results) => {
          // On failure the native worker invokes this callback with the Error
          // as its first argument instead of a result array.
          if (results instanceof Error) {
            reject(results);
            return;
          }
          // Pass the results along: the typed "finish" listeners expect them.
          this.emit("finish", results);
          resolve(results);
        },
        (result) => {
          this.emit("transcribed", result);
        },
      );
    });
  }

  /**
   * Creates a task and starts transcribing immediately.
   *
   * @param model - A loaded model; must not have been freed.
   * @param pcm - The PCM samples to transcribe.
   * @param params - Transcription parameters.
   * @returns The started task; await {@link TranscribeTask.result} for the output.
   * @throws Error if the model has been freed.
   */
  static async run<
    Format extends TranscribeFormat,
    TokenTimestamp extends boolean,
  >(
    model: WhisperModel,
    pcm: Float32Array,
    params: Partial<TranscribeParams<Format, TokenTimestamp>>,
  ): Promise<TranscribeTask<Format, TokenTimestamp>> {
    if (model.freed) {
      throw new Error("Model has been freed");
    }

    const task = new TranscribeTask<Format, TokenTimestamp>(model);
    task._result = task._run(pcm, params);

    return task;
  }

  on(
    event: "finish",
    listener: (results: TranscribeResult<Format, TokenTimestamp>[]) => void,
  ): this;
  on(
    event: "transcribed",
    listener: (result: TranscribeResult<Format, TokenTimestamp>) => void,
  ): this;
  on(event: string, listener: (...args: any[]) => void): this {
    return super.on(event, listener);
  }

  once(
    event: "finish",
    listener: (results: TranscribeResult<Format, TokenTimestamp>[]) => void,
  ): this;
  once(
    event: "transcribed",
    listener: (result: TranscribeResult<Format, TokenTimestamp>) => void,
  ): this;
  once(event: string, listener: (...args: any[]) => void): this {
    return super.once(event, listener);
  }

  off(
    event: "finish",
    listener: (results: TranscribeResult<Format, TokenTimestamp>[]) => void,
  ): this;
  off(
    event: "transcribed",
    listener: (result: TranscribeResult<Format, TokenTimestamp>) => void,
  ): this;
  off(event: string, listener: (...args: any[]) => void): this {
    return super.off(event, listener);
  }
}
|
||||
|
|
@ -1,104 +0,0 @@
|
|||
export enum WhisperSamplingStrategy {
|
||||
WHISPER_SAMPLING_GREEDY,
|
||||
WHISPER_SAMPLING_BEAM_SEARCH,
|
||||
}
|
||||
|
||||
export type TranscribeFormat = "simple" | "detail";
|
||||
|
||||
/**
|
||||
* See {@link https://github.com/ggerganov/whisper.cpp/blob/00b7a4be02ca82d53ac69dd2dd438c16e2af7658/whisper.h#L433C19-L433C19} for details.
|
||||
*/
|
||||
export interface TranscribeParams<
|
||||
Format extends TranscribeFormat = TranscribeFormat,
|
||||
TokenTimestamp extends boolean = false,
|
||||
> {
|
||||
strategy: WhisperSamplingStrategy;
|
||||
n_threads: number;
|
||||
n_max_text_ctx: number;
|
||||
offset_ms: number;
|
||||
duration_ms: number;
|
||||
|
||||
translate: boolean;
|
||||
no_context: boolean;
|
||||
no_timestamps: boolean;
|
||||
single_segment: boolean;
|
||||
print_special: boolean;
|
||||
print_progress: boolean;
|
||||
print_realtime: boolean;
|
||||
print_timestamps: boolean;
|
||||
|
||||
token_timestamps: TokenTimestamp;
|
||||
thold_pt: number;
|
||||
thold_ptsum: number;
|
||||
max_len: number;
|
||||
split_on_word: boolean;
|
||||
max_tokens: number;
|
||||
|
||||
speed_up: boolean;
|
||||
debug_mode: boolean;
|
||||
audio_ctx: number;
|
||||
|
||||
tdrz_enable: boolean;
|
||||
|
||||
initial_prompt: string;
|
||||
|
||||
/**
|
||||
* Language code, e.g. "en", "de", "fr", "es", "it", "nl", "pt", "ru", "tr", "uk", "pl", "sv", "cs", "zh", "ja", "ko"
|
||||
*/
|
||||
language: string;
|
||||
|
||||
suppress_blank: boolean;
|
||||
suppress_non_speech_tokens: boolean;
|
||||
|
||||
temperature: number;
|
||||
max_initial_ts: number;
|
||||
length_penalty: number;
|
||||
|
||||
temperature_inc: number;
|
||||
entropy_thold: number;
|
||||
logprob_thold: number;
|
||||
no_speech_thold: number;
|
||||
|
||||
best_of: number;
|
||||
|
||||
beam_size: number;
|
||||
|
||||
format: Format;
|
||||
}
|
||||
|
||||
export interface TranscribeSimpleResult {
|
||||
from: number;
|
||||
to: number;
|
||||
text: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents a detailed result of transcription.
|
||||
*/
|
||||
export interface TranscribeDetailedResult<TokenTimestamp extends boolean>
|
||||
extends TranscribeSimpleResult {
|
||||
/** The detected spoken language. */
|
||||
lang: string;
|
||||
/** The confidence level of the transcription, calculated by the average probability of the tokens. */
|
||||
confidence: number;
|
||||
/** The tokens generated during the transcription process. */
|
||||
tokens: {
|
||||
/** The text of the token, for CJK languages, due to the BPE encoding, the token text may not be readable. */
|
||||
text: string;
|
||||
/** The ID of the token. */
|
||||
id: number;
|
||||
/** The probability of the token. */
|
||||
p: number;
|
||||
/** The start timestamp of the token, in milliseconds. Only available when `token_timestamps` of {@link TranscribeParams} is `true`. */
|
||||
from: TokenTimestamp extends true ? number : undefined;
|
||||
/** The end timestamp of the token, in milliseconds. Only available when `token_timestamps` of {@link TranscribeParams} is `true`. */
|
||||
to: TokenTimestamp extends true ? number : undefined;
|
||||
}[];
|
||||
}
|
||||
|
||||
export type TranscribeResult<
|
||||
Format extends TranscribeFormat = TranscribeFormat,
|
||||
TokenTimestamp extends boolean = boolean,
|
||||
> = Format extends "simple"
|
||||
? TranscribeSimpleResult
|
||||
: TranscribeDetailedResult<TokenTimestamp>;
|
||||
|
|
@ -1,148 +0,0 @@
|
|||
import type {
|
||||
TranscribeFormat,
|
||||
TranscribeParams,
|
||||
TranscribeResult,
|
||||
} from "./types";
|
||||
import { WhisperModel } from "./model";
|
||||
import { TranscribeTask } from "./transcribe";
|
||||
|
||||
export interface WhisperConfig {
|
||||
/**
|
||||
* Time in seconds to wait before offloading the model if it's not being used.
|
||||
*/
|
||||
offload: number;
|
||||
|
||||
/**
|
||||
* Whether to use the GPU or not.
|
||||
*/
|
||||
gpu: boolean;
|
||||
}
|
||||
|
||||
/**
 * The Whisper class is responsible for managing the lifecycle and operations of whisper model.
 * It handles the loading and offloading of the model, managing transcription tasks, and configuring model parameters.
 *
 * The model is loaded lazily on first use and freed automatically after
 * `config.offload` seconds without activity.
 */
export class Whisper {
  private _file: string;
  private _available: WhisperModel | null = null;
  private _loading: Promise<WhisperModel> | null = null;
  private _tasks: Promise<TranscribeResult[]>[] = [];
  private _config: WhisperConfig;
  private _offload_timer: ReturnType<typeof setTimeout> | null = null;

  /**
   * Constructs a new Whisper instance with a specified model file and configuration.
   * @param file - The path to the Whisper model file.
   * @param config - Optional configuration for the Whisper instance.
   */
  constructor(file: string, config: Partial<WhisperConfig> = {}) {
    this._file = file;
    this._config = {
      offload: 300,
      gpu: true,
      ...config,
    };
  }

  get file(): string {
    return this._file;
  }

  set file(file: string) {
    this._file = file;
  }

  get config(): WhisperConfig {
    return this._config;
  }

  /** Result promises of the transcription tasks that have not settled yet. */
  get tasks(): Promise<TranscribeResult[]>[] {
    return this._tasks;
  }

  /** Restarts the idle countdown after which the model is freed. */
  reset_offload_timer(): void {
    this.clear_offload_timer();
    this._offload_timer = setTimeout(() => {
      this._offload_timer = null;
      // Nobody awaits a timer callback, so a rejection here would surface as
      // an unhandled rejection; report it instead.
      this.free().catch((error: unknown) => {
        console.error("Failed to offload whisper model", error);
      });
    }, this.config.offload * 1000);
  }

  private clear_offload_timer(): void {
    if (this._offload_timer !== null) {
      clearTimeout(this._offload_timer);
      this._offload_timer = null;
    }
  }

  /**
   * Returns the loaded model, loading it first if necessary, and restarts the
   * idle countdown.
   */
  async model(): Promise<WhisperModel> {
    if (this._available === null) {
      return this.load();
    }
    this.reset_offload_timer();
    return this._available;
  }

  /**
   * Loads the whisper model asynchronously.
   * If the model is already being loaded, returns the existing one.
   *
   * You don't need to call this method directly, it's called automatically if necessary when you call {@link Whisper.transcribe}.
   *
   * @returns A Promise that resolves to the loaded model.
   * @throws Whatever the model loader rejects with; a later call retries.
   */
  async load(): Promise<WhisperModel> {
    if (this._loading !== null) {
      return this._loading;
    }

    const pending = WhisperModel.load(this.file, this.config.gpu);
    this._loading = pending;

    let loaded: WhisperModel;
    try {
      loaded = await pending;
    } finally {
      // Always drop the in-flight marker; otherwise a failed load would be
      // cached and every later call would get the same rejection.
      this._loading = null;
    }

    this._available = loaded;
    this.reset_offload_timer();
    return loaded;
  }

  /**
   * Transcribes the given PCM audio data using the Whisper model.
   * @param pcm - The mono 16k PCM audio data to transcribe.
   * @param params - Optional parameters for transcription.
   * @returns A promise that resolves to the result of the transcription task.
   */
  async transcribe<
    Format extends TranscribeFormat,
    TokenTimestamp extends boolean,
  >(
    pcm: Float32Array,
    params: Partial<TranscribeParams<Format, TokenTimestamp>> = {},
  ): Promise<TranscribeTask<Format, TokenTimestamp>> {
    const model = await this.model();
    const task = await TranscribeTask.run<Format, TokenTimestamp>(
      model,
      pcm,
      params,
    );

    const pending = task.result;
    this._tasks.push(pending);

    // Forget the task once it settles so the list does not grow without bound.
    const forget = (): void => {
      const index = this._tasks.indexOf(pending);
      if (index !== -1) {
        this._tasks.splice(index, 1);
      }
    };
    pending.then(forget, forget);

    return task;
  }

  /**
   * Frees the loaded model after every in-flight task has settled.
   * Does nothing when no model is loaded.
   */
  async free(): Promise<void> {
    if (this._available === null) {
      return;
    }
    const model = this._available;
    this._available = null;
    this.clear_offload_timer();

    // allSettled: a failed transcription must not prevent releasing the model.
    await Promise.allSettled(this._tasks);
    await model.free();
  }
}
|
||||
|
||||
/**
|
||||
* Here's a life cycle diagram of a model:
|
||||
* | Method | (0) Not Available | (1) Loading | (2) Available | (3) Freeing | (0) Not Available |
|
||||
* |------------|-------------------|-------------|---------------|-------------|-------------------|
|
||||
* | load | V | - | - | - | V |
|
||||
* | free | - | - | wait tasks, V | - | - |
|
||||
* | transcribe | load | load | V | load | load |
|
||||
*/
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
{
|
||||
"extends": "@amical/typescript-config/base.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "dist"
|
||||
},
|
||||
"include": ["src"],
|
||||
"exclude": ["node_modules", "dist"],
|
||||
"types": ["node"]
|
||||
}
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
import { defineConfig } from "tsup";
|
||||
import { readFileSync, writeFileSync } from "node:fs";
|
||||
|
||||
// Bundles the library entry point and the build helper, then patches the Metal
// shader so it no longer depends on a separate header at runtime.
export default defineConfig({
  entry: ["src/index.ts", "src/build.ts"],
  outDir: "dist",
  dts: true,
  async onSuccess() {
    // replace `#include "ggml-common.h"` in whisper.cpp/ggml/src/ggml-metal.metal with full content
    const metalPath = "whisper.cpp/ggml/src/ggml-metal.metal";
    const metal = readFileSync(metalPath, "utf-8");
    const common = readFileSync("whisper.cpp/ggml/src/ggml-common.h", "utf-8");

    // Use a replacer function: with a plain string, `$&`, `$1`, `$$` etc.
    // inside the header would be interpreted as replacement patterns.
    const replaced = metal.replace(/#include "ggml-common\.h"/, () => common);

    // Skip the write when the include was already inlined by an earlier build.
    if (replaced !== metal) {
      writeFileSync(metalPath, replaced);
    }
  },
});
|
||||
|
|
@ -1 +0,0 @@
|
|||
Subproject commit 2ef717b293fe93872cc3a03ca77942936a281959
|
||||
91
packages/whisper-wrapper/README.md
Normal file
91
packages/whisper-wrapper/README.md
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
# @amical/whisper-wrapper
|
||||
|
||||
This package wraps the `whisper.cpp` Node addon so the desktop app can call into
|
||||
Whisper from a forked worker process. The build and runtime layers are tuned for
|
||||
the desktop pipeline; the notes below capture the important knobs and the
|
||||
reasoning behind them.
|
||||
|
||||
## Build workflow
|
||||
|
||||
- `pnpm install` (postinstall) runs `bin/build-addon.js` via CMake.js and drops
|
||||
the resulting `whisper.node` into `native/<platform-arch(-tag)>/`.
|
||||
- `pnpm --filter @amical/whisper-wrapper build:native` rebuilds the default
|
||||
variants for this platform (Metal + CPU on macOS, CPU elsewhere).
|
||||
- `pnpm --filter @amical/whisper-wrapper build:native:cuda` builds an extra
|
||||
`win32-x64-cuda` binary alongside the regular `win32-x64` fallback. Install
|
||||
the CUDA toolkit (12.x tested) before running it.
|
||||
- Every macOS build is ad-hoc signed (`codesign -s -`) so Electron/Node can load
|
||||
it without crashing.
|
||||
- Each variant is produced as a _single_ `.node` binary. We force static
|
||||
libraries (`GGML_STATIC=ON`, `BUILD_SHARED_LIBS=OFF`) so all ggml/whisper
|
||||
code is linked directly into the addon—no sidecar `.dylib/.dll` files ship
|
||||
at runtime.
|
||||
- The full CMake build directory is deleted after each variant so Electron
|
||||
Forge/Squirrel never sees the long `CMakeFiles/...` paths that blew past
|
||||
Windows’ MAX_PATH limit during packaging.
|
||||
|
||||
## GPU/CPU fallback
|
||||
|
||||
`resolveBinding()` in `src/loader.ts` no longer throws if the first candidate
|
||||
fails. `loadBinding()` walks the list:
|
||||
|
||||
1. `platform-arch-metal`
|
||||
2. `platform-arch-openblas`
|
||||
3. `platform-arch-cuda`
|
||||
4. `platform-arch`
|
||||
5. `cpu-fallback`
|
||||
|
||||
If `require()` raises `ERR_DLOPEN_FAILED` (missing runtime, wrong driver, etc.)
|
||||
it logs a warning and tries the next candidate. That lets us ship CUDA/Metal
|
||||
binaries alongside CPU ones without breaking installs that lack the GPU stack.
|
||||
|
||||
## GGML_NATIVE on macOS arm64
|
||||
|
||||
GitHub’s hosted macOS runners expose `i8mm` but clang refuses to emit the
|
||||
`vmmlaq_s32` intrinsic when `-mcpu=native` is passed, so the build dies in
|
||||
`ggml-cpu/arch/arm/quants.c`. CI therefore exports `GGML_NATIVE=OFF` before
|
||||
calling the build scripts. Locally you can flip it back on if your toolchain
|
||||
supports those instructions:
|
||||
|
||||
```bash
|
||||
GGML_NATIVE=ON pnpm --filter @amical/whisper-wrapper build:native
|
||||
```
|
||||
|
||||
Leave it off in CI unless you control the runner.
|
||||
|
||||
## Custom targets
|
||||
|
||||
`WHISPER_TARGETS` lets you override which variants to build. The value is a
|
||||
comma-separated list of directory names that should map to `native/<name>`.
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
WHISPER_TARGETS="linux-x64-gnu" pnpm --filter @amical/whisper-wrapper build:native
|
||||
WHISPER_TARGETS="win32-x64-cuda,win32-x64" pnpm --filter @amical/whisper-wrapper build:native
|
||||
```
|
||||
|
||||
Without overrides, the script builds the Metal variant (on macOS) followed by the
|
||||
plain CPU build.
|
||||
|
||||
## Runtime API
|
||||
|
||||
`src/index.ts` exposes a minimal class that mirrors the desktop worker protocol:
|
||||
|
||||
- `new Whisper(modelPath, { gpu?: boolean })`
|
||||
- `await whisper.load()` (no-op placeholder)
|
||||
- `await whisper.transcribe(audioOrNull, options)`
|
||||
- `await whisper.free()`
|
||||
|
||||
If you pass `null` (and a `fname_inp` in `options`) the addon reads the audio
|
||||
file directly, matching the CLI smoke tests.
|
||||
|
||||
## Local expectations
|
||||
|
||||
- `whisper.cpp` is tracked as a submodule under `packages/whisper-wrapper/`.
|
||||
- `cmake-js` / `node` / `pnpm` must be installed (the workspace root sets the
|
||||
required versions).
|
||||
- The build creates `.cmake-js/` and `.home/` caches inside the package; they’re
|
||||
ignored in git.
|
||||
|
||||
For any tweaks (new build targets, additional fallbacks, etc.) update this file
|
||||
so the CI configuration stays discoverable.
|
||||
1
packages/whisper-wrapper/WHISPER_CPP_VERSION
Normal file
1
packages/whisper-wrapper/WHISPER_CPP_VERSION
Normal file
|
|
@ -0,0 +1 @@
|
|||
v1.7.6
|
||||
84
packages/whisper-wrapper/addon/CMakeLists.txt
Normal file
84
packages/whisper-wrapper/addon/CMakeLists.txt
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
# Builds the whisper.cpp Node-API addon (`whisper.node`) through cmake-js.
cmake_minimum_required(VERSION 3.20)
project(whisper_node LANGUAGES C CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

add_definitions(-DNAPI_VERSION=8)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../whisper.cpp/cmake")

set(WHISPER_CPP_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../whisper.cpp")

# Only the core library is needed: turn off the optional whisper.cpp extras and
# link whisper/ggml statically into the addon.
set(WHISPER_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(WHISPER_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(WHISPER_BUILD_SERVER OFF CACHE BOOL "" FORCE)
set(WHISPER_CURL OFF CACHE BOOL "" FORCE)
set(WHISPER_SDL2 OFF CACHE BOOL "" FORCE)
set(WHISPER_FFMPEG OFF CACHE BOOL "" FORCE)
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
set(GGML_STATIC ON CACHE BOOL "" FORCE)
set(GGML_SHARED OFF CACHE BOOL "" FORCE)

add_subdirectory(${WHISPER_CPP_DIR} whispercpp EXCLUDE_FROM_ALL)

find_package(Threads REQUIRED)

# The addon reuses the helper sources from whisper.cpp/examples (audio loading etc.).
set(ADDON_SOURCES
  ${CMAKE_CURRENT_SOURCE_DIR}/addon.cpp
  ${WHISPER_CPP_DIR}/examples/common.cpp
  ${WHISPER_CPP_DIR}/examples/common-ggml.cpp
  ${WHISPER_CPP_DIR}/examples/common-whisper.cpp
  ${WHISPER_CPP_DIR}/examples/grammar-parser.cpp
)

add_library(whisper_node SHARED ${ADDON_SOURCES})
set_target_properties(whisper_node PROPERTIES PREFIX "" SUFFIX ".node" OUTPUT_NAME "whisper")

if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  target_compile_options(whisper_node PRIVATE -Wall -Wextra -Wno-unused-parameter)
endif()

set_target_properties(whisper_node PROPERTIES
  CXX_VISIBILITY_PRESET hidden
  VISIBILITY_INLINES_HIDDEN ON)

# CMake-js variables
# CMAKE_JS_INC is consumed as the CMake list it already is. Flattening it to a
# space-separated string and re-splitting would cut paths that contain spaces.
if (CMAKE_JS_INC)
  target_include_directories(whisper_node PRIVATE ${CMAKE_JS_INC})
endif()

# Include directories
target_include_directories(whisper_node PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${WHISPER_CPP_DIR}/include
  ${WHISPER_CPP_DIR}/ggml/include
  ${WHISPER_CPP_DIR}/examples
)

# Link libraries
# Same reasoning as for CMAKE_JS_INC: pass the list through untouched.
if (CMAKE_JS_LIB)
  target_link_libraries(whisper_node PRIVATE ${CMAKE_JS_LIB})
endif()

target_link_libraries(whisper_node PRIVATE whisper Threads::Threads)

# On macOS we need to allow undefined symbols for node addon
if (APPLE)
  target_link_options(whisper_node PRIVATE "-undefined" "dynamic_lookup")
endif()
|
||||
455
packages/whisper-wrapper/addon/addon.cpp
Normal file
455
packages/whisper-wrapper/addon/addon.cpp
Normal file
|
|
@ -0,0 +1,455 @@
|
|||
#include "napi.h"
|
||||
|
||||
#include "whisper.h"
|
||||
#include "common.h"
|
||||
#include "common-whisper.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
namespace {
|
||||
|
||||
struct WhisperHandle {
|
||||
std::mutex mutex;
|
||||
whisper_context* ctx = nullptr;
|
||||
bool freed = false;
|
||||
};
|
||||
|
||||
// Plain snapshot of one decoded token.
struct TokenData {
    std::string text;
    int id{0};
    float p{0.0f};    // token probability as reported by whisper
    int from_ms{-1};  // stays -1 unless token timestamps were requested
    int to_ms{-1};    // stays -1 unless token timestamps were requested
};
|
||||
|
||||
struct SegmentData {
|
||||
int from_ms = 0;
|
||||
int to_ms = 0;
|
||||
std::string text;
|
||||
float confidence = 0.0f;
|
||||
std::string language;
|
||||
std::vector<TokenData> tokens;
|
||||
};
|
||||
|
||||
struct FullParamConfig {
|
||||
whisper_full_params params;
|
||||
std::string initial_prompt;
|
||||
std::string language;
|
||||
bool detailed = false;
|
||||
bool token_timestamps = false;
|
||||
};
|
||||
|
||||
FullParamConfig parse_full_params(const Napi::Env env, const Napi::Object& options) {
|
||||
FullParamConfig cfg;
|
||||
cfg.params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
if (options.Has("strategy")) {
|
||||
cfg.params.strategy = static_cast<whisper_sampling_strategy>(
|
||||
options.Get("strategy").As<Napi::Number>().Int32Value());
|
||||
}
|
||||
if (options.Has("n_threads")) {
|
||||
cfg.params.n_threads = options.Get("n_threads").As<Napi::Number>().Int32Value();
|
||||
}
|
||||
if (options.Has("n_max_text_ctx")) {
|
||||
cfg.params.n_max_text_ctx = options.Get("n_max_text_ctx").As<Napi::Number>().Int32Value();
|
||||
}
|
||||
if (options.Has("offset_ms")) {
|
||||
cfg.params.offset_ms = options.Get("offset_ms").As<Napi::Number>().Int32Value();
|
||||
}
|
||||
if (options.Has("duration_ms")) {
|
||||
cfg.params.duration_ms = options.Get("duration_ms").As<Napi::Number>().Int32Value();
|
||||
}
|
||||
|
||||
if (options.Has("translate")) {
|
||||
cfg.params.translate = options.Get("translate").As<Napi::Boolean>().Value();
|
||||
}
|
||||
if (options.Has("no_context")) {
|
||||
cfg.params.no_context = options.Get("no_context").As<Napi::Boolean>().Value();
|
||||
}
|
||||
if (options.Has("no_timestamps")) {
|
||||
cfg.params.no_timestamps = options.Get("no_timestamps").As<Napi::Boolean>().Value();
|
||||
}
|
||||
if (options.Has("single_segment")) {
|
||||
cfg.params.single_segment = options.Get("single_segment").As<Napi::Boolean>().Value();
|
||||
}
|
||||
if (options.Has("print_special")) {
|
||||
cfg.params.print_special = options.Get("print_special").As<Napi::Boolean>().Value();
|
||||
}
|
||||
if (options.Has("print_progress")) {
|
||||
cfg.params.print_progress = options.Get("print_progress").As<Napi::Boolean>().Value();
|
||||
} else {
|
||||
cfg.params.print_progress = false;
|
||||
}
|
||||
if (options.Has("print_realtime")) {
|
||||
cfg.params.print_realtime = options.Get("print_realtime").As<Napi::Boolean>().Value();
|
||||
}
|
||||
if (options.Has("print_timestamps")) {
|
||||
cfg.params.print_timestamps = options.Get("print_timestamps").As<Napi::Boolean>().Value();
|
||||
}
|
||||
|
||||
if (options.Has("token_timestamps")) {
|
||||
cfg.params.token_timestamps = options.Get("token_timestamps").As<Napi::Boolean>().Value();
|
||||
}
|
||||
cfg.token_timestamps = cfg.params.token_timestamps;
|
||||
|
||||
if (options.Has("thold_pt")) {
|
||||
cfg.params.thold_pt = options.Get("thold_pt").As<Napi::Number>();
|
||||
}
|
||||
if (options.Has("thold_ptsum")) {
|
||||
cfg.params.thold_ptsum = options.Get("thold_ptsum").As<Napi::Number>();
|
||||
}
|
||||
if (options.Has("max_len")) {
|
||||
cfg.params.max_len = options.Get("max_len").As<Napi::Number>().Int32Value();
|
||||
}
|
||||
if (options.Has("split_on_word")) {
|
||||
cfg.params.split_on_word = options.Get("split_on_word").As<Napi::Boolean>().Value();
|
||||
}
|
||||
if (options.Has("max_tokens")) {
|
||||
cfg.params.max_tokens = options.Get("max_tokens").As<Napi::Number>().Int32Value();
|
||||
}
|
||||
|
||||
if (options.Has("debug_mode")) {
|
||||
cfg.params.debug_mode = options.Get("debug_mode").As<Napi::Boolean>().Value();
|
||||
}
|
||||
if (options.Has("audio_ctx")) {
|
||||
cfg.params.audio_ctx = options.Get("audio_ctx").As<Napi::Number>().Int32Value();
|
||||
}
|
||||
|
||||
if (options.Has("tdrz_enable")) {
|
||||
cfg.params.tdrz_enable = options.Get("tdrz_enable").As<Napi::Boolean>().Value();
|
||||
}
|
||||
|
||||
if (options.Has("initial_prompt") && options.Get("initial_prompt").IsString()) {
|
||||
cfg.initial_prompt = options.Get("initial_prompt").As<Napi::String>();
|
||||
}
|
||||
|
||||
if (options.Has("language") && options.Get("language").IsString()) {
|
||||
cfg.language = options.Get("language").As<Napi::String>();
|
||||
} else {
|
||||
cfg.language = "auto";
|
||||
}
|
||||
|
||||
if (options.Has("suppress_blank")) {
|
||||
cfg.params.suppress_blank = options.Get("suppress_blank").As<Napi::Boolean>().Value();
|
||||
}
|
||||
if (options.Has("suppress_non_speech_tokens")) {
|
||||
cfg.params.suppress_nst = options.Get("suppress_non_speech_tokens").As<Napi::Boolean>().Value();
|
||||
}
|
||||
|
||||
if (options.Has("temperature")) {
|
||||
cfg.params.temperature = options.Get("temperature").As<Napi::Number>();
|
||||
}
|
||||
if (options.Has("max_initial_ts")) {
|
||||
cfg.params.max_initial_ts = options.Get("max_initial_ts").As<Napi::Number>().Int32Value();
|
||||
}
|
||||
if (options.Has("length_penalty")) {
|
||||
cfg.params.length_penalty = options.Get("length_penalty").As<Napi::Number>();
|
||||
}
|
||||
|
||||
if (options.Has("temperature_inc")) {
|
||||
cfg.params.temperature_inc = options.Get("temperature_inc").As<Napi::Number>();
|
||||
}
|
||||
if (options.Has("entropy_thold")) {
|
||||
cfg.params.entropy_thold = options.Get("entropy_thold").As<Napi::Number>();
|
||||
}
|
||||
if (options.Has("logprob_thold")) {
|
||||
cfg.params.logprob_thold = options.Get("logprob_thold").As<Napi::Number>();
|
||||
}
|
||||
if (options.Has("no_speech_thold")) {
|
||||
cfg.params.no_speech_thold = options.Get("no_speech_thold").As<Napi::Number>();
|
||||
}
|
||||
|
||||
if (options.Has("best_of")) {
|
||||
cfg.params.greedy.best_of = options.Get("best_of").As<Napi::Number>().Int32Value();
|
||||
}
|
||||
if (options.Has("beam_size")) {
|
||||
cfg.params.beam_search.beam_size = options.Get("beam_size").As<Napi::Number>().Int32Value();
|
||||
if (cfg.params.beam_search.beam_size > 1) {
|
||||
cfg.params.strategy = WHISPER_SAMPLING_BEAM_SEARCH;
|
||||
}
|
||||
}
|
||||
|
||||
if (options.Has("prompt") && options.Get("prompt").IsString() && cfg.initial_prompt.empty()) {
|
||||
cfg.initial_prompt = options.Get("prompt").As<Napi::String>();
|
||||
}
|
||||
|
||||
if (options.Has("format") && options.Get("format").IsString()) {
|
||||
std::string format = options.Get("format").As<Napi::String>();
|
||||
std::transform(format.begin(), format.end(), format.begin(), ::tolower);
|
||||
cfg.detailed = (format == "detail");
|
||||
}
|
||||
|
||||
if (options.Has("detect_language")) {
|
||||
cfg.params.detect_language = options.Get("detect_language").As<Napi::Boolean>().Value();
|
||||
}
|
||||
|
||||
if (cfg.language.empty()) {
|
||||
cfg.language = "auto";
|
||||
}
|
||||
|
||||
return cfg;
|
||||
}
|
||||
|
||||
// Wraps `handle` in a Napi::External whose finalizer releases the whisper
// context (unless free_model already did) and then deletes the handle.
Napi::External<WhisperHandle> wrap_handle(Napi::Env env, WhisperHandle* handle) {
    return Napi::External<WhisperHandle>::New(
        env,
        handle,
        [](Napi::Env /*env*/, WhisperHandle* ptr) {
            if (!ptr) return;
            {
                // The guard must be gone before `delete ptr`: otherwise the
                // mutex would be destroyed while locked and then unlocked
                // after its memory was freed.
                std::lock_guard<std::mutex> guard(ptr->mutex);
                if (!ptr->freed && ptr->ctx) {
                    whisper_free(ptr->ctx);
                    ptr->ctx = nullptr;
                    ptr->freed = true;
                }
            }
            delete ptr;
        });
}
|
||||
|
||||
// Returns the WhisperHandle stored in argument `index`, or throws a TypeError
// when that argument is missing or is not an External.
WhisperHandle* unwrap_handle(const Napi::CallbackInfo& info, size_t index) {
    const bool present = index < info.Length();
    if (present && info[index].IsExternal()) {
        return info[index].As<Napi::External<WhisperHandle>>().Data();
    }
    throw Napi::TypeError::New(info.Env(), "Invalid context handle");
}
|
||||
|
||||
// Copies the PCM samples passed as `options.audio` into a vector.
//
// Returns an empty vector when `audio` is absent or not a typed array, so the
// caller can fall back to reading a file. Throws a TypeError when `audio` is a
// typed array of any element type other than float32: reinterpreting such a
// buffer as floats would read past its end.
std::vector<float> extract_audio(const Napi::Env env, const Napi::Object& options) {
    std::vector<float> pcmf32;
    if (!options.Has("audio")) {
        return pcmf32;
    }

    const Napi::Value value = options.Get("audio");
    if (!value.IsTypedArray()) {
        return pcmf32;
    }
    if (value.As<Napi::TypedArray>().TypedArrayType() != napi_float32_array) {
        throw Napi::TypeError::New(env, "'audio' must be a Float32Array");
    }

    const Napi::Float32Array array = value.As<Napi::Float32Array>();
    pcmf32.assign(array.Data(), array.Data() + array.ElementLength());
    return pcmf32;
}
|
||||
|
||||
// Returns the input file named by `options.fname_inp` as a one-element list,
// or an empty list when the key is absent or not a string.
std::vector<std::string> extract_files(const Napi::Object& options) {
    std::vector<std::string> files;
    if (!options.Has("fname_inp")) {
        return files;
    }

    const auto fname = options.Get("fname_inp");
    if (fname.IsString()) {
        files.push_back(fname.As<Napi::String>().Utf8Value());
    }
    return files;
}
|
||||
|
||||
// JS entry point `init(options)`: loads the model named by `options.model` and
// returns an External wrapping the new WhisperHandle.
//
// Recognised options: `model` (string, required), `gpu` or its alias `use_gpu`
// (boolean, default true; `gpu` wins when both are present) and `flash_attn`
// (boolean, default false). Throws TypeError for a malformed options object
// and Error when whisper cannot create the context.
Napi::Value init_model(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();

    const bool has_options = info.Length() >= 1 && info[0].IsObject();
    if (!has_options) {
        throw Napi::TypeError::New(env, "Expected init options object");
    }

    const Napi::Object opts = info[0].As<Napi::Object>();
    const bool has_model = opts.Has("model") && opts.Get("model").IsString();
    if (!has_model) {
        throw Napi::TypeError::New(env, "Missing 'model' path");
    }
    const std::string model_path = opts.Get("model").As<Napi::String>();

    bool want_gpu = true;
    if (opts.Has("gpu")) {
        want_gpu = opts.Get("gpu").As<Napi::Boolean>().Value();
    } else if (opts.Has("use_gpu")) {
        want_gpu = opts.Get("use_gpu").As<Napi::Boolean>().Value();
    }

    const bool want_flash_attn =
        opts.Has("flash_attn") ? opts.Get("flash_attn").As<Napi::Boolean>().Value() : false;

    whisper_context_params ctx_params = whisper_context_default_params();
    ctx_params.use_gpu = want_gpu;
    ctx_params.flash_attn = want_flash_attn;

    whisper_context* loaded = whisper_init_from_file_with_params(model_path.c_str(), ctx_params);
    if (!loaded) {
        throw Napi::Error::New(env, "Failed to initialize whisper context");
    }

    WhisperHandle* handle = new WhisperHandle();
    handle->ctx = loaded;
    return wrap_handle(env, handle);
}
|
||||
|
||||
// JS entry point `free(handle)`: releases the whisper context held by the
// handle. Calling it again on the same handle does nothing. Returns undefined.
Napi::Value free_model(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();
    WhisperHandle* const handle = unwrap_handle(info, 0);

    {
        std::lock_guard<std::mutex> lock(handle->mutex);
        const bool still_loaded = !handle->freed && handle->ctx != nullptr;
        if (still_loaded) {
            whisper_free(handle->ctx);
            handle->ctx = nullptr;
            handle->freed = true;
        }
    }

    return env.Undefined();
}
|
||||
|
||||
Napi::Array build_segments(const Napi::Env env,
|
||||
whisper_context* ctx,
|
||||
const FullParamConfig& cfg,
|
||||
const std::vector<float>& pcmf32,
|
||||
const std::vector<std::vector<float>>& pcmf32s) {
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
Napi::Array segments = Napi::Array::New(env, n_segments);
|
||||
|
||||
const std::string detected_language = whisper_lang_str(whisper_full_lang_id(ctx));
|
||||
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
SegmentData segment;
|
||||
segment.from_ms = whisper_full_get_segment_t0(ctx, i) * 10;
|
||||
segment.to_ms = whisper_full_get_segment_t1(ctx, i) * 10;
|
||||
segment.text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
if (cfg.detailed) {
|
||||
const int n_tokens = whisper_full_n_tokens(ctx, i);
|
||||
segment.tokens.reserve(n_tokens);
|
||||
|
||||
float confidence_sum = 0.0f;
|
||||
float min_p = 1.0f;
|
||||
float max_p = 0.0f;
|
||||
int valid_tokens = 0;
|
||||
|
||||
for (int j = 0; j < n_tokens; ++j) {
|
||||
whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
|
||||
|
||||
TokenData token_data;
|
||||
token_data.text = whisper_full_get_token_text(ctx, i, j);
|
||||
token_data.id = token.id;
|
||||
token_data.p = token.p;
|
||||
if (cfg.token_timestamps) {
|
||||
token_data.from_ms = token.t0 * 10;
|
||||
token_data.to_ms = token.t1 * 10;
|
||||
}
|
||||
|
||||
segment.tokens.push_back(std::move(token_data));
|
||||
|
||||
if (token.id > whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
confidence_sum += token.p;
|
||||
min_p = std::min(min_p, token.p);
|
||||
max_p = std::max(max_p, token.p);
|
||||
++valid_tokens;
|
||||
}
|
||||
|
||||
if (valid_tokens > 2) {
|
||||
segment.confidence =
|
||||
(confidence_sum - min_p - max_p) / static_cast<float>(valid_tokens - 2);
|
||||
} else if (valid_tokens > 0) {
|
||||
segment.confidence = confidence_sum / static_cast<float>(valid_tokens);
|
||||
} else {
|
||||
segment.confidence = 0.0f;
|
||||
}
|
||||
|
||||
segment.language = detected_language;
|
||||
}
|
||||
|
||||
Napi::Object jsSegment = Napi::Object::New(env);
|
||||
jsSegment.Set("from", Napi::Number::New(env, segment.from_ms));
|
||||
jsSegment.Set("to", Napi::Number::New(env, segment.to_ms));
|
||||
jsSegment.Set("text", Napi::String::New(env, segment.text));
|
||||
|
||||
if (cfg.detailed) {
|
||||
jsSegment.Set("lang", Napi::String::New(env, segment.language));
|
||||
jsSegment.Set("confidence", Napi::Number::New(env, segment.confidence));
|
||||
|
||||
Napi::Array jsTokens = Napi::Array::New(env, segment.tokens.size());
|
||||
for (size_t t = 0; t < segment.tokens.size(); ++t) {
|
||||
const TokenData& token = segment.tokens[t];
|
||||
Napi::Object jsToken = Napi::Object::New(env);
|
||||
jsToken.Set("text", Napi::String::New(env, token.text));
|
||||
jsToken.Set("id", Napi::Number::New(env, token.id));
|
||||
jsToken.Set("p", Napi::Number::New(env, token.p));
|
||||
if (cfg.token_timestamps) {
|
||||
jsToken.Set("from", Napi::Number::New(env, token.from_ms));
|
||||
jsToken.Set("to", Napi::Number::New(env, token.to_ms));
|
||||
}
|
||||
jsTokens.Set(t, jsToken);
|
||||
}
|
||||
|
||||
jsSegment.Set("tokens", jsTokens);
|
||||
}
|
||||
|
||||
segments.Set(i, jsSegment);
|
||||
}
|
||||
|
||||
return segments;
|
||||
}
|
||||
|
||||
// JS entry point `full(handle, options)`: runs a blocking transcription and
// returns the array of segments.
//
// Audio comes from `options.audio` when that yields samples, otherwise from the
// file named by `options.fname_inp`. Throws TypeError for malformed arguments
// and Error when the model was freed, no audio is available, the file cannot
// be read, or whisper reports a failure.
Napi::Value full_transcribe(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();
    if (info.Length() < 2 || !info[1].IsObject()) {
        throw Napi::TypeError::New(env, "Expected arguments (handle, options)");
    }

    WhisperHandle* handle = unwrap_handle(info, 0);

    // Take the lock before looking at freed/ctx so they are only read under
    // the mutex that free_model and the finalizer hold when writing them.
    std::lock_guard<std::mutex> guard(handle->mutex);
    if (handle->freed || handle->ctx == nullptr) {
        throw Napi::Error::New(env, "Model has been freed");
    }

    auto options = info[1].As<Napi::Object>();

    std::vector<float> pcmf32 = extract_audio(env, options);
    std::vector<std::vector<float>> pcmf32s;
    std::vector<std::string> files = extract_files(options);

    if (pcmf32.empty()) {
        if (files.empty()) {
            throw Napi::Error::New(env, "No audio provided (audio buffer or fname_inp required)");
        }
        if (!::read_audio_data(files[0], pcmf32, pcmf32s, false)) {
            throw Napi::Error::New(env, "Failed to read input audio file");
        }
    }

    // parse_full_params always returns a non-empty language ("auto" by default).
    FullParamConfig cfg = parse_full_params(env, options);

    // The C strings point into cfg, which outlives the whisper call below.
    cfg.params.language = cfg.language.c_str();
    cfg.params.initial_prompt = cfg.initial_prompt.empty() ? nullptr : cfg.initial_prompt.c_str();

    int n_processors = 1;
    if (options.Has("n_processors")) {
        n_processors = std::max(1, options.Get("n_processors").As<Napi::Number>().Int32Value());
    }

    const int result = whisper_full_parallel(
        handle->ctx,
        cfg.params,
        pcmf32.data(),
        static_cast<int>(pcmf32.size()),
        n_processors);

    if (result != 0) {
        throw Napi::Error::New(env, "whisper_full_parallel failed");
    }

    return build_segments(env, handle->ctx, cfg, pcmf32, pcmf32s);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Napi::Object InitAll(Napi::Env env, Napi::Object exports) {
|
||||
exports.Set("init", Napi::Function::New(env, init_model));
|
||||
exports.Set("full", Napi::Function::New(env, full_transcribe));
|
||||
exports.Set("free", Napi::Function::New(env, free_model));
|
||||
return exports;
|
||||
}
|
||||
|
||||
NODE_API_MODULE(whisper, InitAll)
|
||||
9
packages/whisper-wrapper/addon/package.json
Normal file
9
packages/whisper-wrapper/addon/package.json
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
{
|
||||
"name": "@amical/whisper-node-addon",
|
||||
"private": true,
|
||||
"binary": {
|
||||
"napi_versions": [
|
||||
8
|
||||
]
|
||||
}
|
||||
}
|
||||
296
packages/whisper-wrapper/bin/build-addon.js
Normal file
296
packages/whisper-wrapper/bin/build-addon.js
Normal file
|
|
@ -0,0 +1,296 @@
|
|||
#!/usr/bin/env node
|
||||
/*
|
||||
* build-addon.js
|
||||
* --------------------------------------------------
|
||||
* Compiles the whisper.cpp Node addon (sources in ./addon) for the current
|
||||
* platform/arch with acceleration flags, then places the resulting
|
||||
* `whisper.node` binary in native/<target>/.
|
||||
*
|
||||
* NOTE: This is an initial scaffold. It expects the whisper.cpp sources to be
|
||||
* vendored at `./whisper.cpp` (git submodule or manual copy). You can refine
|
||||
* the build flags as needed.
|
||||
*/
|
||||
|
||||
const { execSync } = require("child_process");
|
||||
const path = require("path");
|
||||
const fs = require("fs");
|
||||
|
||||
/**
 * Logs a shell command with the script prefix, then runs it synchronously.
 * Output is inherited from this process unless `opts` overrides `stdio`.
 */
function run(cmd, opts = {}) {
  console.log(`[build-addon] ${cmd}`);
  const execOptions = Object.assign({ stdio: "inherit" }, opts);
  execSync(cmd, execOptions);
}
|
||||
|
||||
// Package layout: the addon glue lives in ./addon, upstream sources in ./whisper.cpp.
const pkgDir = path.resolve(__dirname, "..");
const addonDir = path.join(pkgDir, "addon");
const whisperDir = path.join(pkgDir, "whisper.cpp");

const sourcesPresent = fs.existsSync(addonDir) && fs.existsSync(whisperDir);
if (!sourcesPresent) {
  console.error(
    "whisper.cpp sources not found. Please add them to packages/whisper-wrapper/whisper.cpp",
  );
  process.exit(1);
}

// Scratch directories, all inside the package: build output, the cmake-js
// cache, and the directory later used as HOME for the build.
const buildDir = path.join(pkgDir, "build");
const cacheDir = path.join(pkgDir, ".cmake-js");
const homeDir = path.join(pkgDir, ".home");

for (const dir of [buildDir, cacheDir, homeDir]) {
  if (!fs.existsSync(dir)) fs.mkdirSync(dir);
}
|
||||
|
||||
/**
 * Finds MSVC's `lib.exe` for the requested target architecture.
 *
 * Candidates are collected in priority order: whatever `where lib.exe` finds on
 * PATH, then the install locations named by `VCToolsInstallDir`,
 * `VCINSTALLDIR` and `VSINSTALLDIR`, then the default Visual Studio 2022
 * install directories.
 *
 * @param env  environment used for the PATH lookup and the VS variables
 * @param arch Node architecture name ("x64", "ia32", "arm64")
 * @returns the first existing candidate, or null when none was found
 */
function resolveLibExecutable(env, arch) {
  let archDir = "x64";
  if (arch === "ia32") {
    archDir = "x86";
  } else if (arch === "arm64") {
    archDir = "arm64";
  }
  const hostDir = arch === "ia32" ? "Hostx86" : "Hostx64";
  const found = [];

  // Record a path once, and only if it exists on disk.
  const remember = (file) => {
    if (!file || !fs.existsSync(file) || found.includes(file)) return;
    found.push(file);
  };

  try {
    const stdout = execSync("where lib.exe", {
      env,
      stdio: ["ignore", "pipe", "ignore"],
    }).toString();
    for (const rawLine of stdout.split(/\r?\n/)) {
      const line = rawLine.trim();
      if (line) remember(line);
    }
  } catch (err) {
    // ignore when lib.exe is not on PATH; fall back to manual probing
  }

  // `root` holds one sub-directory per toolset version; take lib.exe from the
  // highest version that ships one.
  const scanVersionedRoot = (root) => {
    if (!root || !fs.existsSync(root) || !fs.statSync(root).isDirectory()) return;
    const newestFirst = fs
      .readdirSync(root, { withFileTypes: true })
      .filter((entry) => entry.isDirectory())
      .map((entry) => entry.name)
      .sort((a, b) => b.localeCompare(a, undefined, { numeric: true, sensitivity: "base" }));
    const hit = newestFirst
      .map((version) => path.join(root, version, "bin", hostDir, archDir, "lib.exe"))
      .find((file) => fs.existsSync(file));
    if (hit) remember(hit);
  };

  // `location` may be lib.exe itself, a toolset directory, or a VC directory.
  const scanInstallLocation = (location) => {
    if (!location) return;
    if (fs.existsSync(location) && fs.statSync(location).isFile()) {
      remember(location);
      return;
    }
    remember(path.join(location, "bin", hostDir, archDir, "lib.exe"));
    scanVersionedRoot(path.join(location, "Tools", "MSVC"));
  };

  scanInstallLocation(env.VCToolsInstallDir);
  scanInstallLocation(env.VCINSTALLDIR);
  scanInstallLocation(env.VSINSTALLDIR && path.join(env.VSINSTALLDIR, "VC"));

  const defaultRoots = [
    "C:/Program Files/Microsoft Visual Studio/2022/Enterprise/VC/Tools/MSVC",
    "C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC",
    "C:/Program Files/Microsoft Visual Studio/2022/Professional/VC/Tools/MSVC",
    "C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC",
  ];
  for (const root of defaultRoots) {
    scanVersionedRoot(root);
  }

  return found.length > 0 ? found[0] : null;
}
|
||||
|
||||
/**
 * On Windows, makes sure `<buildVariantDir>/node.lib` exists by generating it
 * from the `node_api.def` file shipped in `node-api-headers`, using `lib.exe`.
 * Does nothing on other platforms or when the file is already there.
 *
 * @throws Error when node-api-headers, node_api.def or lib.exe cannot be
 *         found, or when lib.exe fails.
 */
function ensureWindowsNodeImportLib(buildVariantDir, arch, env) {
  if (process.platform !== "win32") return;

  const nodeImportLib = path.join(buildVariantDir, "node.lib");
  if (fs.existsSync(nodeImportLib)) return;

  const locateHeadersManifest = () => {
    try {
      return require.resolve("node-api-headers/package.json", {
        paths: [pkgDir],
      });
    } catch (err) {
      throw new Error(
        "node-api-headers package not found; cannot generate node.lib on Windows",
      );
    }
  };
  const headersRoot = path.dirname(locateHeadersManifest());

  const defPath = path.join(headersRoot, "def", "node_api.def");
  if (!fs.existsSync(defPath)) {
    throw new Error(`node_api.def not found at ${defPath}`);
  }

  // Value for lib.exe's /machine switch; unknown architectures fall back to X64.
  const machineByArch = { x64: "X64", ia32: "X86", arm64: "ARM64" };
  const machine = machineByArch[arch] || "X64";

  const libExecutable = resolveLibExecutable(env, arch);
  if (!libExecutable) {
    throw new Error(
      "Unable to locate lib.exe. Ensure the Visual Studio Build Tools are installed and vcvarsall has been applied.",
    );
  }

  console.log(
    `[build-addon] Generating node import library using ${libExecutable} for ${machine} into ${nodeImportLib}`,
  );

  const command = `"${libExecutable}" /def:"${defPath}" /machine:${machine} /out:"${nodeImportLib}"`;
  try {
    run(command, { env });
  } catch (error) {
    const message =
      "Failed to generate node import library. Ensure Visual Studio build tools are installed.";
    if (!(error instanceof Error)) {
      throw new Error(message);
    }
    // Keep the original error object, with the hint placed before its message.
    error.message = `${message}\n${error.message}`;
    throw error;
  }
}
|
||||
|
||||
/**
 * Turns a variant name into `{ name, env }`, where `env` holds the build flags
 * implied by the name.
 *
 * - "cpu-fallback" is returned unchanged with no flags.
 * - A name without "-" (e.g. "metal") is expanded to `<platform>-<arch>-<name>`.
 * - A full name for another platform logs a warning and yields null.
 */
function variantFromName(name, platform, arch) {
  const envOverrides = {};
  if (name === "cpu-fallback") {
    return { name, env: envOverrides };
  }

  const isShorthand = !name.includes("-");
  if (isShorthand) {
    // expand shorthand like "metal" to full name
    name = [platform, arch, name].join("-");
  } else if (!name.startsWith(platform)) {
    console.warn(
      `[build-addon] Warning: variant '${name}' does not match current platform (${platform}), skipping.`,
    );
    return null;
  }

  // Each marker found in the name contributes its flags, in this order.
  const flagsByMarker = [
    ["-metal", { GGML_METAL: "1", GGML_USE_ACCELERATE: "1" }],
    ["-openblas", { GGML_OPENBLAS: "1", GGML_BLAS: "1" }],
    ["-cuda", { GGML_CUDA: "1" }],
  ];
  for (const [marker, flags] of flagsByMarker) {
    if (name.includes(marker)) {
      Object.assign(envOverrides, flags);
    }
  }

  // Every macOS variant gets the Accelerate flag, even without "-metal".
  if (name.startsWith("darwin-") && !("GGML_USE_ACCELERATE" in envOverrides)) {
    Object.assign(envOverrides, { GGML_USE_ACCELERATE: "1" });
  }

  return { name, env: envOverrides };
}
|
||||
|
||||
/**
 * Decides which variants to build.
 *
 * When `WHISPER_TARGETS` names any targets (comma-separated), exactly those are
 * used, minus the ones `variantFromName` rejects. Otherwise the default is the
 * Metal variant on macOS followed by the plain `<platform>-<arch>` variant.
 */
function computeVariants(platform, arch) {
  const requested = (process.env.WHISPER_TARGETS || "")
    .split(",")
    .map((entry) => entry.trim())
    .filter(Boolean);

  const plainName = `${platform}-${arch}`;
  let names = requested;
  if (names.length === 0) {
    names = platform === "darwin" ? [`${plainName}-metal`, plainName] : [plainName];
  }

  return names
    .map((variantName) => variantFromName(variantName, platform, arch))
    .filter(Boolean);
}
|
||||
|
||||
const { platform, arch } = process;
const variants = computeVariants(platform, arch);

if (variants.length === 0) {
  console.warn("[build-addon] No variants requested, building default cpu-fallback.");
  const fallback = variantFromName("cpu-fallback", platform, arch);
  if (fallback) variants.push(fallback);
}

// Build each variant in its own scratch directory, copy the resulting
// whisper.node into native/<variant>/, then remove the scratch directory.
for (const variant of variants) {
  const buildVariantDir = path.join(buildDir, variant.name.replace(/[\\/]/g, "_"));
  fs.rmSync(buildVariantDir, { recursive: true, force: true });
  fs.mkdirSync(buildVariantDir, { recursive: true });

  // Child environment: caches and HOME are redirected into the package, and
  // the variant's flags are layered on top of the inherited environment.
  const env = {
    ...process.env,
    CMAKE_JS_CACHE: cacheDir,
    HOME: homeDir,
    CMAKE_JS_NODE_DIR: path.resolve(process.execPath, "..", ".."),
    ...variant.env,
  };

  console.log(`[build-addon] Building variant ${variant.name}`);

  ensureWindowsNodeImportLib(buildVariantDir, arch, env);

  const cmakeParts = [
    "npx cmake-js compile",
    `-O "${buildVariantDir}"`,
    "-B Release",
    `-d "${addonDir}"`,
    "-T whisper_node",
    "--CD node_runtime=node",
  ];

  // Append `--CD<key>=<value>` when the child environment has a value for `key`.
  const propagateCMakeBool = (key) => {
    const value = env[key];
    if (typeof value === "string" && value.length > 0) {
      cmakeParts.push(`--CD${key}=${value}`);
    }
  };

  // CMake options are not read from the process environment, so the variant's
  // flags (GGML_CUDA, GGML_METAL, ...) must be passed as explicit definitions.
  // Without this, an accelerated variant is configured like the plain build.
  const cmakeDefineKeys = new Set(["GGML_NATIVE", ...Object.keys(variant.env)]);
  for (const key of cmakeDefineKeys) {
    propagateCMakeBool(key);
  }

  run(cmakeParts.join(" "), {
    cwd: addonDir,
    env,
  });

  const builtBinary = path.join(buildVariantDir, "Release", "whisper.node");
  if (!fs.existsSync(builtBinary)) {
    throw new Error(`Build succeeded but whisper.node not found for variant ${variant.name}`);
  }

  const targetDir = path.join(pkgDir, "native", variant.name);
  fs.mkdirSync(targetDir, { recursive: true });
  fs.copyFileSync(builtBinary, path.join(targetDir, "whisper.node"));
  console.log(`[build-addon] copied to native/${variant.name}/whisper.node`);

  if (platform === "darwin") {
    // Ad-hoc sign the copied binary; a failure is reported but does not abort.
    const targetBinary = path.join(targetDir, "whisper.node");
    try {
      run(`codesign --force --sign - "${targetBinary}"`);
      console.log("[build-addon] codesigned", targetBinary);
    } catch (err) {
      console.warn(
        `[build-addon] warning: codesign failed for ${targetBinary}: ${err.message}`,
      );
    }
  }

  // Remove intermediate build artifacts to keep the package footprint small and avoid
  // extremely long CMake-generated paths that break Windows packaging tools.
  fs.rmSync(buildVariantDir, { recursive: true, force: true });
}
|
||||
33
packages/whisper-wrapper/package.json
Normal file
33
packages/whisper-wrapper/package.json
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"name": "@amical/whisper-wrapper",
|
||||
"version": "0.0.0",
|
||||
"private": true,
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
"files": [
|
||||
"dist",
|
||||
"native",
|
||||
"src",
|
||||
"addon"
|
||||
],
|
||||
"binary": {
|
||||
"napi_versions": [
|
||||
8
|
||||
]
|
||||
},
|
||||
"scripts": {
|
||||
"build": "tsc -p tsconfig.json",
|
||||
"postinstall": "node ./bin/build-addon.js",
|
||||
"build:native": "node ./scripts/build-native.js",
|
||||
"build:native:cuda": "node ./scripts/build-native.js --cuda"
|
||||
},
|
||||
"dependencies": {
|
||||
"cmake-js": "^7.3.1",
|
||||
"minimatch": "10.0.3",
|
||||
"node-api-headers": "^1.5.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@amical/typescript-config": "workspace:*",
|
||||
"typescript": "^5.8.3"
|
||||
}
|
||||
}
|
||||
19
packages/whisper-wrapper/scripts/build-native.js
Executable file
19
packages/whisper-wrapper/scripts/build-native.js
Executable file
|
|
@ -0,0 +1,19 @@
|
|||
#!/usr/bin/env node
|
||||
const { execSync } = require("node:child_process");
|
||||
const path = require("node:path");
|
||||
|
||||
/**
 * Runs `bin/build-addon.js` from the package root with the requested targets.
 *
 * The target names are joined with commas and exposed to the child process
 * through the `WHISPER_TARGETS` environment variable (an empty list yields an
 * empty string). The child's output is streamed straight to this process.
 *
 * @param {string[]} targets Target names to pass on to the addon build script.
 */
function build(targets) {
  const packageRoot = path.join(__dirname, "..");
  const env = { ...process.env, WHISPER_TARGETS: targets.join(",") };

  execSync("node ./bin/build-addon.js", {
    cwd: packageRoot,
    stdio: "inherit",
    env,
  });
}
|
||||
|
||||
// CLI entry: `--cuda` requests the Windows CUDA build plus the plain Windows
// build; without the flag an empty target list is handed to the build script.
const cudaRequested = process.argv.includes("--cuda");
build(cudaRequested ? ["win32-x64-cuda", "win32-x64"] : []);
|
||||
160
packages/whisper-wrapper/scripts/test-addon.js
Normal file
160
packages/whisper-wrapper/scripts/test-addon.js
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
#!/usr/bin/env node
|
||||
// Quick smoke-test runner for the whisper.cpp Node addon build.
|
||||
//
|
||||
// Usage:
|
||||
// node scripts/test-addon.js [--model=/path/to/model.bin] [--audio=/path/to/audio.wav]
|
||||
//
|
||||
// If no flags are provided the script will grab the largest *.bin model from
|
||||
// "~/Library/Application Support/amical/models" and the bundled jfk sample.
|
||||
|
||||
const fs = require("node:fs");
|
||||
const os = require("node:os");
|
||||
const path = require("node:path");
|
||||
|
||||
function resolveBinding() {
|
||||
const nativeRoot = path.resolve(__dirname, "..", "native");
|
||||
const { platform, arch } = process;
|
||||
const candidates = [
|
||||
`${platform}-${arch}-metal`,
|
||||
`${platform}-${arch}-openblas`,
|
||||
`${platform}-${arch}-cuda`,
|
||||
`${platform}-${arch}`,
|
||||
"cpu-fallback",
|
||||
];
|
||||
|
||||
for (const dir of candidates) {
|
||||
const bindingPath = path.join(nativeRoot, dir, "whisper.node");
|
||||
if (fs.existsSync(bindingPath)) {
|
||||
return bindingPath;
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error(
|
||||
`Unable to locate a whisper.node binary for ${platform}-${arch}. ` +
|
||||
`Expected one of: ${candidates.join(", ")}`,
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Picks a default model from `~/Library/Application Support/amical/models`.
 *
 * Every entry whose name ends in `.bin` (case-insensitive) is considered and
 * the one with the largest file size wins.
 *
 * @returns {string} Full path of the largest `.bin` file in the models directory.
 * @throws {Error} When the directory is missing or contains no `.bin` file.
 */
function defaultModelPath() {
  const modelsDir = path.join(
    os.homedir(),
    "Library",
    "Application Support",
    "amical",
    "models",
  );

  if (!fs.existsSync(modelsDir)) {
    throw new Error(
      `Model directory not found at ${modelsDir}. Pass --model to override.`,
    );
  }

  const models = [];
  for (const name of fs.readdirSync(modelsDir)) {
    if (!name.toLowerCase().endsWith(".bin")) {
      continue;
    }
    const fullPath = path.join(modelsDir, name);
    models.push({ name, fullPath, size: fs.statSync(fullPath).size });
  }

  // Largest model first.
  models.sort((left, right) => right.size - left.size);

  if (models.length === 0) {
    throw new Error(
      `No .bin model files found in ${modelsDir}. Pass --model to override.`,
    );
  }

  return models[0].fullPath;
}
|
||||
|
||||
function defaultAudioPath() {
|
||||
const audio = path.resolve(
|
||||
__dirname,
|
||||
"..",
|
||||
"whisper.cpp",
|
||||
"samples",
|
||||
"jfk.wav",
|
||||
);
|
||||
|
||||
if (!fs.existsSync(audio)) {
|
||||
throw new Error(
|
||||
`Sample audio not found at ${audio}. Pass --audio to override.`,
|
||||
);
|
||||
}
|
||||
|
||||
return audio;
|
||||
}
|
||||
|
||||
/**
 * Parses the command-line flags in `process.argv` into a key/value map.
 *
 * Two spellings are accepted: `--key=value` and `--key value`. Only the first
 * `=` separates the key from the value, so values that themselves contain `=`
 * (for example file paths) are kept intact. Arguments that do not start with
 * `--` and are not consumed as a flag's value are ignored.
 *
 * @returns {Object<string, string>} Map from flag name (without `--`) to value.
 * @throws {Error} When a flag has no value or an empty value.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const options = {};

  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index];
    if (!arg.startsWith("--")) continue;

    const flag = arg.slice(2);
    const separator = flag.indexOf("=");
    let key = flag;
    let value;

    if (separator !== -1) {
      // Split on the first "=" only; the remainder belongs to the value.
      key = flag.slice(0, separator);
      value = flag.slice(separator + 1);
    } else {
      // Space-separated form: take the next argument unless it is another flag.
      const next = args[index + 1];
      if (next !== undefined && !next.startsWith("--")) {
        value = next;
        index += 1;
      }
    }

    if (!value) {
      throw new Error(`Flag '${arg}' must be provided as --${key}=<value>`);
    }
    options[key] = value;
  }

  return options;
}
|
||||
|
||||
/**
 * Smoke test: resolves the model, audio file and native addon, runs one
 * transcription through the addon and prints every returned segment.
 *
 * The addon context created by `init` is always released with `free`, even
 * when the transcription call throws.
 *
 * @throws {Error} When the model or audio file is missing, or when the addon
 *   does not export `init`, `full` and `free` functions.
 */
async function main() {
  const opts = parseArgs();
  const modelPath = path.resolve(opts.model || defaultModelPath());
  const audioPath = path.resolve(opts.audio || defaultAudioPath());

  const modelMissing = !fs.existsSync(modelPath);
  if (modelMissing) {
    throw new Error(`Model file not found at ${modelPath}`);
  }
  const audioMissing = !fs.existsSync(audioPath);
  if (audioMissing) {
    throw new Error(`Audio file not found at ${audioPath}`);
  }

  const bindingPath = resolveBinding();
  console.log(`> Using addon: ${bindingPath}`);
  console.log(`> Using model: ${modelPath}`);
  console.log(`> Using audio: ${audioPath}`);

  // eslint-disable-next-line @typescript-eslint/no-var-requires
  const binding = require(bindingPath);

  // The addon must export all three entry points used below.
  const requiredApis = ["init", "full", "free"];
  const apiIncomplete = requiredApis.some(
    (api) => typeof binding[api] !== "function",
  );
  if (apiIncomplete) {
    throw new Error(`Addon at ${bindingPath} does not expose init/full/free APIs.`);
  }

  // Segment bounds are printed as-is when numeric, otherwise as "?".
  const stamp = (value) => (typeof value === "number" ? value : "?");

  const handle = binding.init({ model: modelPath, gpu: true });
  try {
    const params = {
      fname_inp: audioPath,
      language: "en",
      no_timestamps: false,
      suppress_blank: true,
      suppress_non_speech_tokens: true,
    };
    const segments = binding.full(handle, params);

    console.log("Transcription segments:\n");
    for (const segment of segments) {
      const from = stamp(segment.from);
      const to = stamp(segment.to);
      console.log(`  [${from} -> ${to}] ${segment.text}`);
    }

    console.log("\nDone.");
  } finally {
    binding.free(handle);
  }
}
|
||||
|
||||
// Run the smoke test; any failure is logged and reflected in the exit code.
main().catch((failure) => {
  process.exitCode = 1;
  console.error("Test run failed:", failure);
});
|
||||
43
packages/whisper-wrapper/src/index.ts
Normal file
43
packages/whisper-wrapper/src/index.ts
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
/* eslint-disable @typescript-eslint/no-var-requires */
|
||||
import { loadBinding, getLoadedBindingInfo } from "./loader";
|
||||
|
||||
const binding = loadBinding();
|
||||
|
||||
/** Options accepted as the second argument of the `Whisper` constructor. */
export interface WhisperOptions {
|
||||
/** Optional GPU toggle. NOTE(review): confirm how the native addon interprets it. */
gpu?: boolean;
|
||||
}
|
||||
|
||||
export { getLoadedBindingInfo } from "./loader";
|
||||
|
||||
/**
 * Object wrapper around the native whisper addon loaded at module scope.
 *
 * The native context is created in the constructor and must be released with
 * `free()` when the instance is no longer needed.
 */
export class Whisper {
  /** Opaque context handle returned by the addon's `init` call. */
  private ctx: unknown;

  /** True once `free()` has handed the context back to the addon. */
  private released = false;

  /**
   * @param modelPath Path handed to the addon as the `model` init field.
   * @param opts Optional settings. `gpu` is forwarded to the addon's `init`
   *   only when the caller set it, so the addon's own default applies otherwise.
   */
  constructor(
    private modelPath: string,
    opts?: WhisperOptions,
  ) {
    const initParams: { model: string; gpu?: boolean } = { model: modelPath };
    if (opts?.gpu !== undefined) {
      initParams.gpu = opts.gpu;
    }
    this.ctx = binding.init(initParams);
  }

  /** No-op: the native context is already created by the constructor. */
  async load(): Promise<void> {
    return;
  }

  /**
   * Runs the addon's `full` call on this context.
   *
   * @param audio Samples merged into the payload as `audio`; when `null`,
   *   `options` is passed to the addon unchanged.
   * @param options Extra fields for the addon call. Keys in `options` win over
   *   the `audio` field because they are spread after it.
   * @returns An object whose `result` promise resolves to the segments
   *   returned by the addon.
   * @throws {Error} When called after `free()`.
   */
  async transcribe(
    audio: Float32Array | null,
    options: Record<string, unknown>,
  ): Promise<{ result: Promise<Array<{ text: string }>> }> {
    if (this.released) {
      // Never hand a released handle back to native code.
      throw new Error("Whisper context has already been freed");
    }
    const payload =
      audio instanceof Float32Array ? { audio, ...options } : options;
    const segments = binding.full(this.ctx, payload);
    return { result: Promise.resolve(segments) };
  }

  /**
   * Releases the native context. Calling it again is a no-op, so the same
   * handle is never passed to the addon's `free` twice.
   */
  async free(): Promise<void> {
    if (this.released) {
      return;
    }
    // Mark first: even if the addon throws, the handle must not be reused.
    this.released = true;
    binding.free(this.ctx);
    this.ctx = undefined;
  }

  /** Returns the path and backend label of the addon binary that was loaded, or `null`. */
  static getBindingInfo(): { path: string; type: string } | null {
    return getLoadedBindingInfo();
  }
}
|
||||
106
packages/whisper-wrapper/src/loader.ts
Normal file
106
packages/whisper-wrapper/src/loader.ts
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
import path from "node:path";
|
||||
import fs from "node:fs";
|
||||
|
||||
/** Backend suffixes probed before the plain build, in priority order. */
const GPU_FIRST_CANDIDATES = ["metal", "openblas", "cuda"] as const;

/**
 * Lists the `native/` sub-directory names to probe, most preferred first:
 * one `<platform>-<arch>-<backend>` entry per backend suffix, then the plain
 * `<platform>-<arch>` build, then `cpu-fallback`.
 */
function candidateDirs(platform: string, arch: string): string[] {
  const base = `${platform}-${arch}`;
  const dirs: string[] = [];

  for (const tag of GPU_FIRST_CANDIDATES) {
    dirs.push(`${base}-${tag}`);
  }
  dirs.push(base, "cpu-fallback");

  return dirs;
}
|
||||
|
||||
/** Builds the path of `whisper.node` inside the given `native/` sub-directory. */
function bindingPathFor(dir: string): string {
  const nativeRoot = path.join(__dirname, "..", "native");
  return path.join(nativeRoot, dir, "whisper.node");
}
|
||||
|
||||
/**
 * Reports whether `error` is an object whose `code` property equals
 * `"ERR_DLOPEN_FAILED"`. `loadBinding` treats such errors as a signal to try
 * the next candidate binary.
 */
function isLoadableError(error: unknown): boolean {
  if (!error || typeof error !== "object") {
    return false;
  }
  if (!("code" in error)) {
    return false;
  }
  return (error as NodeJS.ErrnoException).code === "ERR_DLOPEN_FAILED";
}
|
||||
|
||||
/**
 * Returns the path of the first candidate `whisper.node` that exists on disk
 * for the current platform and architecture. The binary is not loaded.
 *
 * @throws {Error} When no candidate binary exists.
 */
export function resolveBinding(): string {
  const { platform, arch } = process;

  const found = candidateDirs(platform, arch)
    .map((dir) => bindingPathFor(dir))
    .find((candidate) => fs.existsSync(candidate));

  if (found === undefined) {
    throw new Error(
      `No suitable whisper.node binary found for ${platform}-${arch}`,
    );
  }
  return found;
}
|
||||
|
||||
// Path and backend label of the binary that loadBinding() loaded; stays null
// until a load succeeds.
let loadedBindingInfo: { path: string; type: string } | null = null;
|
||||
|
||||
/** Returns the recorded binding info, or `null` when no binary has been loaded yet. */
export function getLoadedBindingInfo(): { path: string; type: string } | null {
|
||||
return loadedBindingInfo;
|
||||
}
|
||||
|
||||
/** Maps a candidate directory name to the backend label stored in the binding info. */
function bindingTypeForDir(dir: string): string {
  for (const backend of ["cuda", "metal", "openblas"]) {
    if (dir.includes(`-${backend}`)) {
      return backend;
    }
  }
  return dir === "cpu-fallback" ? "cpu-fallback" : "cpu";
}

/**
 * Loads the first usable `whisper.node` for the current platform/architecture.
 *
 * Candidates that exist on disk are `require`d in priority order. A candidate
 * that fails with a dlopen error is logged and skipped; any other error is
 * rethrown immediately. On success the binary's path and backend label are
 * recorded for `getLoadedBindingInfo()`.
 *
 * @returns The loaded native module.
 * @throws {Error} When no candidate exists, or when every existing candidate
 *   failed to load (the last load error is attached as `cause`).
 */
export function loadBinding(): any {
  const { platform, arch } = process;
  const attempted: string[] = [];
  let lastLoadError: unknown = null;

  for (const dir of candidateDirs(platform, arch)) {
    const candidate = bindingPathFor(dir);
    if (!fs.existsSync(candidate)) {
      continue;
    }
    attempted.push(candidate);

    try {
      const mod = require(candidate);

      // More than one attempt means a higher-priority binary failed to load.
      const usedFallback = attempted.length > 1;
      if (usedFallback) {
        console.warn(
          `[whisper-wrapper] loaded fallback binary: ${candidate} (attempted ${attempted.length} candidates)`,
        );
      }

      loadedBindingInfo = { path: candidate, type: bindingTypeForDir(dir) };
      return mod;
    } catch (error) {
      if (!isLoadableError(error)) {
        throw error;
      }
      console.warn(
        `[whisper-wrapper] failed to load ${candidate}: ${(error as Error).message}. Trying next candidate...`,
      );
      lastLoadError = error;
    }
  }

  if (!lastLoadError) {
    throw new Error(
      `No suitable whisper.node binary found for ${platform}-${arch}`,
    );
  }

  throw new Error(
    `Unable to load whisper.node for ${platform}-${arch}. Attempted: ${attempted.join(", ")}`,
    { cause: lastLoadError },
  );
}
|
||||
8
packages/whisper-wrapper/tsconfig.json
Normal file
8
packages/whisper-wrapper/tsconfig.json
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
{
|
||||
"extends": "../typescript-config/base.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "dist",
|
||||
"rootDir": "src"
|
||||
},
|
||||
"include": ["src"]
|
||||
}
|
||||
1
packages/whisper-wrapper/whisper.cpp
Submodule
1
packages/whisper-wrapper/whisper.cpp
Submodule
|
|
@ -0,0 +1 @@
|
|||
Subproject commit a8d002cfd879315632a579e73f0148d06959de36
|
||||
2887
pnpm-lock.yaml
generated
2887
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load diff
|
|
@ -2,3 +2,4 @@ packages:
|
|||
- "apps/*"
|
||||
- "packages/*"
|
||||
- "packages/**"
|
||||
- "!packages/**/whisper.cpp/**"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue