feat: migrate from smart-whisper to custom binding + add cuda support

This commit is contained in:
nchopra 2025-09-19 16:09:08 +05:30
parent 696193eb44
commit 048915da61
52 changed files with 1490 additions and 4353 deletions

View file

@ -4,6 +4,7 @@ on:
push:
branches:
- feat.windows.support
- feat/whisper.migration
tags:
- 'v*'
workflow_dispatch:
@ -70,6 +71,13 @@ jobs:
node-version: '24.1.0'
cache: 'pnpm'
- name: Install CUDA Toolkit
if: matrix.os == 'windows'
uses: Jimver/cuda-toolkit@v0.2.15
with:
cuda: '12.4.1'
method: 'network'
- name: Log Node.js architecture and platform
run: |
echo "=== Node.js Process Information ==="
@ -78,8 +86,24 @@ jobs:
echo ""
- name: Install dependencies
env:
GGML_NATIVE: OFF # ensure postinstall builds avoid i8mm on CI runners
run: pnpm install --frozen-lockfile
- name: Build whisper wrapper JS
run: pnpm --filter @amical/whisper-wrapper build
- name: Build whisper native binaries
env:
GGML_NATIVE: OFF # CI mac runners lack i8mm support; keep CPU features conservative here
run: pnpm --filter @amical/whisper-wrapper build:native
- name: Build whisper native binaries (cuda)
if: matrix.os == 'windows'
env:
GGML_NATIVE: OFF
run: pnpm --filter @amical/whisper-wrapper build:native:cuda
- name: Download Node.js binaries
working-directory: apps/desktop
run: pnpm download-node

10
.gitignore vendored
View file

@ -21,11 +21,20 @@ coverage
# Vercel
.vercel
# CMake-js cache
.cmake-js/
**/.cmake-js/
# Tool helpers
.home/
**/.home/
# Build Outputs
.next/
out/
build
dist
packages/whisper-wrapper/native/
# Debug
@ -41,6 +50,7 @@ CLAUDE.md
.local
.claude
amical.db
AGENTS.md
# Temp files
/tmp

4
.gitmodules vendored
View file

@ -1,3 +1,3 @@
[submodule "packages/smart-whisper/whisper.cpp"]
path = packages/smart-whisper/whisper.cpp
[submodule "packages/whisper-wrapper/whisper.cpp"]
path = packages/whisper-wrapper/whisper.cpp
url = https://github.com/ggerganov/whisper.cpp.git

View file

@ -40,7 +40,7 @@ export const EXTERNAL_DEPENDENCIES = [
"libsql",
"onnxruntime-node",
"workerpool",
"@amical/smart-whisper",
"@amical/whisper-wrapper",
// Add any other native modules you need here
];
@ -160,6 +160,24 @@ const config: ForgeConfig = {
}
}
// Prune heavy native sources that trigger MAX_PATH on Windows packages
const whisperWrapperPath = join(
localNodeModules,
"@amical",
"whisper-wrapper",
);
const whisperPruneTargets = [
join(whisperWrapperPath, "whisper.cpp"),
join(whisperWrapperPath, "build"),
join(whisperWrapperPath, ".cmake-js"),
];
for (const target of whisperPruneTargets) {
if (existsSync(target)) {
console.log(`Pruning ${target} from packaged output`);
rmSync(target, { recursive: true, force: true });
}
}
// Second pass: Replace any symlinks with dereferenced copies
console.log("Checking for symlinks in copied dependencies...");
for (const dep of nativeModuleDependenciesToPackage) {
@ -318,7 +336,7 @@ const config: ForgeConfig = {
packagerConfig: {
asar: {
unpack:
"{*.node,*.dylib,*.so,*.dll,*.metal,**/node_modules/@amical/smart-whisper/**,**/whisper.cpp/**,**/.vite/build/whisper-worker-fork.js,**/node_modules/jest-worker/**,**/onnxruntime-node/bin/**}",
"{*.node,*.dylib,*.so,*.dll,*.metal,**/node_modules/@amical/whisper-wrapper/**,**/whisper.cpp/**,**/.vite/build/whisper-worker-fork.js,**/node_modules/jest-worker/**,**/onnxruntime-node/bin/**}",
},
name: "Amical",
executableName: "Amical",

View file

@ -81,7 +81,7 @@
"dependencies": {
"@ai-sdk/openai": "^1.3.22",
"@amical/eslint-config": "workspace:*",
"@amical/smart-whisper": "workspace:*",
"@amical/whisper-wrapper": "workspace:*",
"@amical/types": "workspace:*",
"@amical/y-libsql": "workspace:*",
"@dnd-kit/core": "^6.3.1",

View file

@ -2,21 +2,7 @@ import dotenv from "dotenv";
dotenv.config();
import { app } from "electron";
import * as path from "path";
// Set GGML_METAL_PATH_RESOURCES before any other imports
// This ensures @amical/smart-whisper can find its resources when unpacked from asar
if (app.isPackaged) {
// Point to the unpacked whisper.cpp directory
process.env.GGML_METAL_PATH_RESOURCES = path.join(
process.resourcesPath,
"app.asar.unpacked",
"node_modules",
"@amical",
"smart-whisper",
"whisper.cpp",
);
}
import started from "electron-squirrel-startup";
import { AppManager } from "./core/app-manager";
import { updateElectronApp } from "update-electron-app";

View file

@ -44,7 +44,6 @@ export class SimpleForkWrapper {
const workerEnv: any = {
...process.env,
ELECTRON_RUN_AS_NODE: "1",
GGML_METAL_PATH_RESOURCES: process.env.GGML_METAL_PATH_RESOURCES,
NODE_OPTIONS: "--max-old-space-size=8192",
};

View file

@ -57,6 +57,21 @@ export class WhisperProvider implements TranscriptionProvider {
await this.initializeWhisper();
}
/**
 * Asks the worker process which native binding it loaded.
 *
 * @returns The `{ path, type }` pair reported by the worker's
 *   `getBindingInfo` method, or `null` when no worker wrapper exists or the
 *   call fails (the failure is logged as a warning, never thrown).
 */
async getBindingInfo(): Promise<{ path: string; type: string } | null> {
  const wrapper = this.workerWrapper;
  if (!wrapper) return null;

  type BindingInfo = { path: string; type: string } | null;
  try {
    const info = await wrapper.exec<BindingInfo>("getBindingInfo", []);
    return info;
  } catch (error) {
    // Binding info is diagnostic only, so a failure degrades to "unknown".
    logger.transcription.warn("Failed to get binding info:", error);
    return null;
  }
}
async transcribe(
params: TranscribeParams & { flush?: boolean },
): Promise<string> {
@ -119,7 +134,7 @@ export class WhisperProvider implements TranscriptionProvider {
`Starting transcription of ${aggregatedAudio.length} samples (${((aggregatedAudio.length / this.SAMPLE_RATE) * 1000).toFixed(0)}ms)`,
);
// Transcribe using smart-whisper
// Transcribe using the local Whisper wrapper
if (!this.workerWrapper) {
throw new Error("Worker wrapper is not initialized");
}
@ -137,7 +152,7 @@ export class WhisperProvider implements TranscriptionProvider {
initial_prompt: initialPrompt,
suppress_blank: true,
suppress_non_speech_tokens: true,
no_timestamps: true,
no_timestamps: false,
},
]);
@ -302,7 +317,7 @@ export class WhisperProvider implements TranscriptionProvider {
await this.workerWrapper.exec("initializeModel", [modelPath]);
} catch (error) {
logger.transcription.error(`Failed to initialize:`, error);
throw new Error(`Failed to initialize smart-whisper: ${error}`);
throw new Error(`Failed to initialize whisper wrapper: ${error}`);
}
}

View file

@ -1,5 +1,19 @@
// Worker process entry point for fork
import { Whisper } from "@amical/smart-whisper";
import { Whisper, getLoadedBindingInfo } from "@amical/whisper-wrapper";
// Type definitions for IPC communication
interface WorkerMessage {
id: number;
method: string;
args: unknown[];
}
interface SerializedFloat32Array {
__type: "Float32Array";
data: number[];
}
type MethodArg = SerializedFloat32Array | unknown;
// Simple console-based logging for worker process
const logger = {
@ -29,7 +43,6 @@ const methods = {
whisperInstance = null;
}
const { Whisper } = await import("@amical/smart-whisper");
whisperInstance = new Whisper(modelPath, { gpu: true });
try {
await whisperInstance.load();
@ -71,8 +84,17 @@ const methods = {
);
const transcription = await result;
logger.transcription.debug(
`Transcription segments: ${Array.isArray(transcription) ? transcription.length : "?"}`,
);
if (Array.isArray(transcription)) {
logger.transcription.debug(
`First segment preview: ${transcription[0]?.text ?? "<none>"}`,
);
}
return transcription
.map((segment) => segment.text)
.map((segment: { text: string }) => segment.text)
.join(" ")
.trim();
},
@ -84,23 +106,39 @@ const methods = {
currentModelPath = null;
}
},
getBindingInfo(): { path: string; type: string } | null {
return getLoadedBindingInfo();
},
};
// Handle messages from parent process
process.on("message", async (message: any) => {
process.on("message", async (message: WorkerMessage) => {
const { id, method, args } = message;
try {
// Deserialize Float32Array from IPC
const deserializedArgs = args.map((arg: any) => {
if (arg && arg.__type === "Float32Array" && Array.isArray(arg.data)) {
return new Float32Array(arg.data);
const deserializedArgs = args.map((arg: MethodArg) => {
if (
arg &&
typeof arg === "object" &&
"__type" in arg &&
arg.__type === "Float32Array"
) {
const serialized = arg as SerializedFloat32Array;
if (Array.isArray(serialized.data)) {
return new Float32Array(serialized.data);
}
}
return arg;
});
if (method in methods) {
const result = await (methods as any)[method](...deserializedArgs);
const methodName = method as keyof typeof methods;
const fn = methods[methodName] as (
...args: unknown[]
) => Promise<unknown>;
const result = await fn(...deserializedArgs);
process.send!({ id, result });
} else {
process.send!({ id, error: `Unknown method: ${method}` });

View file

@ -1,5 +1,5 @@
// This file contains just the Whisper-specific operations that need to run in a separate process
import { Whisper } from "@amical/smart-whisper";
import { Whisper } from "@amical/whisper-wrapper";
// Simple console-based logging for worker process
const logger = {
@ -27,7 +27,6 @@ export async function initializeModel(modelPath: string): Promise<void> {
whisperInstance = null;
}
const { Whisper } = await import("@amical/smart-whisper");
whisperInstance = new Whisper(modelPath, { gpu: true });
try {
await whisperInstance.load();
@ -57,7 +56,7 @@ export async function transcribeAudio(
const transcription = await result;
return transcription
.map((segment) => segment.text)
.map((segment: { text: string }) => segment.text)
.join(" ")
.trim();
}

View file

@ -9,6 +9,7 @@ export interface TranscriptionMetrics {
session_id?: string;
model_id: string;
model_preloaded?: boolean;
whisper_native_binding?: string;
total_duration_ms?: number;
recording_duration_ms?: number;
processing_duration_ms?: number;

View file

@ -417,10 +417,22 @@ export class TranscriptionService {
const audioDurationSeconds =
session.context.sharedData.audioMetadata?.duration;
// Get native binding info if using local whisper
let whisperNativeBinding: string | undefined;
if (this.whisperProvider && "getBindingInfo" in this.whisperProvider) {
const bindingInfo = await this.whisperProvider.getBindingInfo();
whisperNativeBinding = bindingInfo?.type;
logger.transcription.info(
"whisper native binding used",
whisperNativeBinding,
);
}
this.telemetryService.trackTranscriptionCompleted({
session_id: sessionId,
model_id: selectedModel!,
model_preloaded: this.modelWasPreloaded,
whisper_native_binding: whisperNativeBinding,
total_duration_ms: totalDuration || 0,
recording_duration_ms: recordingDuration,
processing_duration_ms: processingDuration,

View file

@ -29,7 +29,7 @@ export default defineConfig({
entryFileNames: "[name].js",
},
external: [
"@amical/smart-whisper",
"@amical/whisper-wrapper",
"@libsql/client",
"@libsql/darwin-arm64",
"@libsql/darwin-x64",
@ -51,7 +51,7 @@ export default defineConfig({
optimizeDeps: {
exclude: [
"better-sqlite3",
"smart-whisper",
"@amical/whisper-wrapper",
"drizzle-orm",
"@libsql/client",
],

View file

@ -32,13 +32,13 @@
"keytar",
"protobufjs",
"sharp",
"smart-whisper",
"@amical/whisper-wrapper",
"drizzle-orm/libsql"
],
"onlyBuiltDependencies": [
"electron",
"electron-winstaller",
"smart-whisper",
"@amical/whisper-wrapper",
"drizzle-orm/libsql",
"@libsql",
"macos-alias",

View file

@ -1,64 +0,0 @@
# Dependencies
node_modules/
# Build outputs
build/
*.node
bin/
# TypeScript outputs
dist/
# Keep the build configuration file
!dist/build.js
*.tsbuildinfo
# Native compilation artifacts
*.o
*.a
*.so
*.dylib
*.dll
*.lib
*.exp
*.ilk
*.pdb
# Logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# OS files
.DS_Store
Thumbs.db
desktop.ini
# IDE files
.vscode/
.idea/
*.swp
*.swo
*~
# Environment variables
.env
.env.local
.env.*.local
# Test coverage
coverage/
*.lcov
.nyc_output/
# Temporary files
tmp/
temp/
*.tmp
# whisper.cpp build artifacts (if any get generated)
whisper.cpp/build/
whisper.cpp/*.o
whisper.cpp/**/*.o

View file

@ -1,10 +0,0 @@
# Whisper.cpp Version Information
Repository: https://github.com/ggerganov/whisper.cpp
Commit: 2ef717b293fe93872cc3a03ca77942936a281959
Date: November 2024
Description: whisper : add large-v3-turbo (#2440)
This file tracks the exact version of whisper.cpp used in this package.
To update whisper.cpp, replace the contents of the whisper.cpp directory
with a new version and update this file accordingly.

View file

@ -1,37 +0,0 @@
{
'variables' : {
'openssl_fips': '',
},
"targets": [
{
"target_name": "smart-whisper",
"sources": [
"src/binding/binding.cc",
"src/binding/common.cc",
"src/binding/model.cc",
"src/binding/transcribe.cc",
"<!@(node -p \"require('./dist/build.js').sources\")"
],
"libraries": [ "<!@(node -p \"require('./dist/build.js').libraries\")" ],
'defines': [ "<!@(node -p \"require('./dist/build.js').defines\")" ],
'include_dirs': ["<!@(node -p \"require('node-addon-api').include\")", "whisper.cpp/include", "whisper.cpp/ggml/include", "whisper.cpp/examples"],
'dependencies': ["<!(node -p \"require('node-addon-api').gyp\")"],
'cflags!': [ '-fno-exceptions' ],
'cflags_cc!': [ '-fno-exceptions' ],
'xcode_settings': {
'GCC_ENABLE_CPP_EXCEPTIONS': 'YES',
'CLANG_CXX_LIBRARY': 'libc++',
},
'msvs_settings': {
'VCCLCompilerTool': { 'ExceptionHandling': 1 },
},
'conditions': [
['OS=="mac"', {
'xcode_settings': {
'GCC_SYMBOLS_PRIVATE_EXTERN': 'YES', # -fvisibility=hidden
}
}]
]
}
],
}

View file

@ -1,41 +0,0 @@
{
"name": "@amical/smart-whisper",
"version": "0.1.0",
"description": "Whisper.cpp Node.js binding with auto model offloading strategy.",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"keywords": [
"whisper",
"whisper.cpp",
"native",
"binding",
"addon"
],
"gypfile": true,
"files": [
"dist",
"src",
"scripts",
"binding.gyp",
"whisper.cpp/**/*.{c,h,cpp,hpp,m,cu,metal}",
"whisper.cpp/Makefile",
"whisper.cpp/LICENSE"
],
"scripts": {
"install": "tsup",
"postinstall": "node-gyp rebuild",
"build": "tsup && node-gyp rebuild",
"build:ts": "tsup",
"build:native": "node-gyp rebuild"
},
"dependencies": {
"node-addon-api": "^8.5.0",
"minimatch": "10.0.3"
},
"devDependencies": {
"@amical/typescript-config": "workspace:*",
"@types/node": "^24.3.0",
"tsup": "^8.5.0",
"typescript": "^5.8.2"
}
}

View file

@ -1,21 +0,0 @@
"use strict";
// Prints, on a single space-separated line, the extra linker flags selected by
// the WHISPER_* environment variables. Each variable is treated as a simple
// on/off switch: any non-empty value enables the corresponding flag group.
var libs = [];

if (process.env.WHISPER_OPENBLAS) {
  libs.push(`-lopenblas`);
}

if (process.env.WHISPER_CUBLAS) {
  libs.push(
    `-lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64`,
  );
}

if (process.env.WHISPER_HIPBLAS) {
  // Every entry must be a real linker option: the first one previously lacked
  // its leading dash ("lhipblas"), which a linker would read as an input file.
  libs.push(
    `-lhipblas -lamdhip64 -lrocblas -L/opt/rocm/lib -L/opt/rocm/hipblas/lib -Wl,-rpath=/opt/rocm/lib`,
  );
}

if (process.env.WHISPER_CLBLAST) {
  libs.push(`-lclblast -lOpenCL`);
}

console.log(libs.join(" "));

View file

@ -1,78 +0,0 @@
process.env.GGML_METAL_PATH_RESOURCES =
process.env.GGML_METAL_PATH_RESOURCES ||
path.join(__dirname, "../whisper.cpp/ggml/src");
import path from "node:path";
import { TranscribeFormat, TranscribeParams, TranscribeResult } from "./types";
const module = require(path.join(__dirname, "../build/Release/smart-whisper"));
/**
* An external handle to a model.
*/
export type Handle = {
readonly "": unique symbol;
};
export namespace Binding {
/**
* Load a model from a whisper weights file.
* @param file The path to the whisper weights file.
* @param gpu Whether to use the GPU or not.
* @param callback A callback that will be called with the handle to the model.
*/
export declare function load(
file: string,
gpu: boolean,
callback: (handle: Handle) => void,
): void;
/**
* Release the memory of the model, it will be unusable after this.
* @param handle The handle to the model.
* @param callback A callback that will be called when the model is freed.
*/
export declare function free(handle: Handle, callback: () => void): void;
/**
* Transcribe a PCM buffer.
* @param handle The handle to the model.
* @param pcm The PCM buffer.
* @param params The parameters to use for transcription.
* @param finish A callback that will be called when the transcription is finished.
* @param progress A callback that will be called when a new result is available.
*/
export declare function transcribe<
Format extends TranscribeFormat,
TokenTimestamp extends boolean,
>(
handle: Handle,
pcm: Float32Array,
params: Partial<TranscribeParams<Format, TokenTimestamp>>,
finish: (results: TranscribeResult<Format, TokenTimestamp>[]) => void,
progress: (result: TranscribeResult<Format, TokenTimestamp>) => void,
): void;
export declare class WhisperModel {
private _ctx;
constructor(handle: Handle);
get handle(): Handle | null;
get freed(): boolean;
/**
* Release the memory of the model, it will be unusable after this.
* It's safe to call this multiple times, but it will only free the model once.
*/
free(): Promise<void>;
/**
* Load a model from a whisper weights file.
* @param file The path to the whisper weights file.
* @param gpu Whether to use the GPU or not.
* @returns A promise that resolves to a {@link WhisperModel}.
*/
static load(file: string, gpu?: boolean): Promise<WhisperModel>;
}
}
/**
* The native binding for the underlying C++ addon.
*/
export const binding: typeof Binding = module;

View file

@ -1,18 +0,0 @@
#include <napi.h>
#include "common.h"
#include "model.h"
#include "transcribe.h"
Napi::Object Init(Napi::Env env, Napi::Object exports) {
exports.Set("transcribe", Napi::Function::New(env, Transcribe));
WhisperModel::Init(env, exports);
if (IsProduction(env.Global())) {
whisper_log_set([](ggml_log_level level, const char *text, void *user_data) {}, nullptr);
}
return exports;
}
NODE_API_MODULE(whisper, Init)

View file

@ -1,16 +0,0 @@
#include "common.h"
// Returns the JavaScript promise associated with this worker's deferred.
Napi::Promise PromiseWorker::Promise() { return promise.Promise(); }
// Reports whether `process.env.NODE_ENV`, looked up on the given global
// object, is the string "production". A missing or non-string NODE_ENV
// yields false.
bool IsProduction(const Napi::Object global_env) {
    const Napi::Object process_obj = global_env.Get("process").As<Napi::Object>();
    const Napi::Object env_vars = process_obj.Get("env").As<Napi::Object>();
    const Napi::Value mode = env_vars.Get("NODE_ENV");

    return mode.IsString() && mode.As<Napi::String>().Utf8Value() == "production";
}

View file

@ -1,22 +0,0 @@
#ifndef _GUARD_SW_COMMON_H
#define _GUARD_SW_COMMON_H
#ifndef NAPI_VERSION
// Support Node.js 16+
#define NAPI_VERSION 8
#endif
#include <napi.h>
// Base class for async workers that report their outcome through a JavaScript
// promise instead of a completion callback. Subclasses resolve `promise` from
// OnOK(); failures signalled with SetError() reject it via OnError() below.
class PromiseWorker : public Napi::AsyncWorker {
   public:
    // Creates the worker together with the deferred that backs Promise().
    PromiseWorker(Napi::Env &env) : AsyncWorker(env), promise(Napi::Promise::Deferred::New(env)) {}

    // Returns the promise that settles when the worker finishes.
    Napi::Promise Promise();

   protected:
    // The worker is constructed without a completion callback, so the stock
    // error handler has nothing to invoke and the promise would stay pending
    // forever. Reject it explicitly so callers observe the failure.
    void OnError(const Napi::Error &e) override { promise.Reject(e.Value()); }

    Napi::Promise::Deferred promise;
};
bool IsProduction(const Napi::Object global_env);
#endif

View file

@ -1,145 +0,0 @@
#include "model.h"
// Async worker that creates a whisper context from a model file off the
// JavaScript thread and resolves its promise with a new WhisperModel instance
// wrapping that context.
class LoadModelWorker : public PromiseWorker {
   public:
    // `model_path` is the weights file to open; `params` is forwarded
    // unchanged to whisper.cpp.
    LoadModelWorker(Napi::Env &env, const std::string &model_path,
                    struct whisper_context_params params)
        : PromiseWorker(env), model_path(model_path), params(params), context(nullptr) {}

    // Runs on the worker thread: loads the model (without allocating a
    // decoding state) and records an error when loading fails.
    void Execute() override {
        context = whisper_init_from_file_with_params_no_state(model_path.c_str(), params);
        if (context == nullptr) {
            SetError("Failed to initialize whisper context");
            // There is no context to report timings for; handing a null
            // pointer on to whisper would be invalid.
            return;
        }

        whisper_print_timings(context);
    }

    // Runs on the JavaScript thread after a successful load: wraps the raw
    // context in an External and constructs the WhisperModel object through
    // the constructor reference stored as instance data.
    void OnOK() override {
        Napi::HandleScope scope(Env());
        auto handle = Napi::External<whisper_context>::New(Env(), context);
        auto constructor = Env().GetInstanceData<Napi::FunctionReference>();
        auto model = constructor->New({handle});
        promise.Resolve(model);
    }

   private:
    std::string model_path;
    struct whisper_context_params params;
    // Result of the load; stays null until Execute() succeeds.
    whisper_context *context;
};
class FreeModelWorker : public PromiseWorker {
public:
FreeModelWorker(Napi::Env &env, whisper_context *context)
: PromiseWorker(env), context(context) {}
void Execute() override { whisper_free(context); }
void OnOK() override {
Napi::HandleScope scope(Env());
promise.Resolve(Env().Undefined());
}
private:
whisper_context *context;
};
// Registers the WhisperModel class on `exports` and remembers its constructor
// so that native code can create instances later.
//
// Exposed members: static `load`, instance `free`, and the read-only
// accessors `freed` and `handle` (both are registered with a null setter).
Napi::Object WhisperModel::Init(Napi::Env env, Napi::Object exports) {
Napi::Function func = DefineClass(
env, "WhisperModel",
{
StaticMethod<&WhisperModel::Load>(
"load", static_cast<napi_property_attributes>(napi_writable | napi_configurable)),
InstanceMethod<&WhisperModel::Free>(
"free", static_cast<napi_property_attributes>(napi_writable | napi_configurable)),
InstanceAccessor(
"freed", &WhisperModel::GetFreed, nullptr,
static_cast<napi_property_attributes>(napi_enumerable | napi_configurable)),
InstanceAccessor(
"handle", &WhisperModel::GetHandle, nullptr,
static_cast<napi_property_attributes>(napi_enumerable | napi_configurable)),
});
// Keep a persistent reference to the constructor as per-environment instance
// data; LoadModelWorker::OnOK fetches it via GetInstanceData to build models.
auto constructor = new Napi::FunctionReference();
*constructor = Napi::Persistent(func);
env.SetInstanceData<Napi::FunctionReference>(constructor);
exports.Set("WhisperModel", func);
return exports;
}
// Wraps an existing whisper context. Expects exactly one argument: an External
// holding the `whisper_context*` to take over.
//
// `context` is initialised to null up front so that, if argument validation
// fails and a JavaScript exception is raised, Finalize() never sees an
// indeterminate pointer and tries to free it.
WhisperModel::WhisperModel(const Napi::CallbackInfo &info)
    : Napi::ObjectWrap<WhisperModel>(info), context(nullptr) {
    Napi::Env env = info.Env();
    Napi::HandleScope scope(env);

    if (info.Length() != 1) {
        Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
        return;
    }

    // As<>() performs no type check, so reject anything that is not an
    // External before reading a pointer out of it.
    if (!info[0].IsExternal()) {
        Napi::TypeError::New(env, "Expected an external whisper context handle")
            .ThrowAsJavaScriptException();
        return;
    }

    this->context = info[0].As<Napi::External<whisper_context>>().Data();
}
// Finalizer hook: releases the context if it has not been freed already.
void WhisperModel::Finalize(Napi::Env env) {
    if (context == nullptr) {
        return;  // already released through free()
    }

    whisper_free(context);
    context = nullptr;
}
// Static `load(file, gpu?)`: starts loading a model on a worker thread.
//
// Arguments: the model file path (string) and an optional boolean selecting
// GPU use (defaults to true). Returns the promise of the queued
// LoadModelWorker; on invalid arguments a TypeError is raised and null is
// returned.
Napi::Value WhisperModel::Load(const Napi::CallbackInfo &info) {
    Napi::Env env = info.Env();

    if (info.Length() < 1 || info.Length() > 2) {
        Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
        return env.Null();
    }
    if (!info[0].IsString()) {
        Napi::TypeError::New(env, "Model path must be a string").ThrowAsJavaScriptException();
        return env.Null();
    }

    std::string model_path = info[0].As<Napi::String>();

    // Start from the library defaults: a bare `whisper_context_params` local
    // would leave every field other than `use_gpu` indeterminate.
    whisper_context_params params = whisper_context_default_params();
    params.use_gpu = info.Length() == 2 ? info[1].As<Napi::Boolean>().Value() : true;

    auto worker = new LoadModelWorker(env, model_path, params);
    worker->Queue();
    return worker->Promise();
}
// Instance `free()`: releases the model asynchronously and returns a promise
// that resolves with `undefined`. Calling it on an already-freed model
// resolves immediately. Takes no arguments.
Napi::Value WhisperModel::Free(const Napi::CallbackInfo &info) {
    Napi::Env env = info.Env();

    if (info.Length() != 0) {
        Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
        return env.Null();
    }

    if (context != nullptr) {
        // Hand the pointer to the worker and forget it here, so a second
        // free() or the finalizer cannot release it again.
        whisper_context *to_release = context;
        context = nullptr;

        auto worker = new FreeModelWorker(env, to_release);
        worker->Queue();
        return worker->Promise();
    }

    // Nothing left to free: return an already-resolved promise.
    auto deferred = Napi::Promise::Deferred::New(env);
    deferred.Resolve(env.Undefined());
    return deferred.Promise();
}
// Accessor `freed`: true once the model no longer holds a context.
Napi::Value WhisperModel::GetFreed(const Napi::CallbackInfo &info) {
    const bool is_freed = (context == nullptr);
    return Napi::Boolean::New(info.Env(), is_freed);
}
// Accessor `handle`: a fresh External around the context, or null once the
// model has been freed.
Napi::Value WhisperModel::GetHandle(const Napi::CallbackInfo &info) {
    Napi::Env env = info.Env();

    if (context != nullptr) {
        return Napi::External<whisper_context>::New(env, context);
    }
    return env.Null();
}

View file

@ -1,22 +0,0 @@
#ifndef _GUARD_SW_MODEL_H
#define _GUARD_SW_MODEL_H
#include "common.h"
#include "whisper.h"
// JavaScript-visible wrapper around a `whisper_context`.
class WhisperModel : public Napi::ObjectWrap<WhisperModel> {
public:
// Registers the class on `exports`.
static Napi::Object Init(Napi::Env env, Napi::Object exports);
// Constructs a wrapper from the JavaScript constructor call described by `info`.
WhisperModel(const Napi::CallbackInfo &info);
// Finalizer hook for the wrapped object.
void Finalize(Napi::Env env);
private:
// The wrapped whisper context.
whisper_context *context;
// Handlers backing the JavaScript-facing `load`, `free`, `freed` and `handle` members.
static Napi::Value Load(const Napi::CallbackInfo &info);
Napi::Value Free(const Napi::CallbackInfo &info);
Napi::Value GetFreed(const Napi::CallbackInfo &info);
Napi::Value GetHandle(const Napi::CallbackInfo &info);
};
#endif

View file

@ -1,358 +0,0 @@
#include "transcribe.h"
// Options interpreted by this binding itself rather than passed to whisper.
struct smart_whisper_transcribe_params {
// Output format name; segment building compares it against "detail".
// Heap-allocated with strdup() by the parser and freed by TranscribeWorker.
const char* format;
};
// Builds a `whisper_full_params` from a JavaScript options object.
//
// Starts from whisper's defaults for the beam-search strategy and overrides
// each field whose same-named property is present on `o`. The string fields
// (`initial_prompt`, `language`) are duplicated with strdup(), so the caller
// owns them and must free() them (TranscribeWorker's destructor does so).
struct whisper_full_params whisper_full_params_from_js(Napi::Object o) {
struct whisper_full_params params =
whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH);
// --- sampling strategy, threading and audio window ---
if (o.Has("strategy")) {
params.strategy = static_cast<whisper_sampling_strategy>(
o.Get("strategy").As<Napi::Number>().Int32Value());
}
if (o.Has("n_threads")) {
params.n_threads = o.Get("n_threads").As<Napi::Number>();
}
if (o.Has("n_max_text_ctx")) {
params.n_max_text_ctx = o.Get("n_max_text_ctx").As<Napi::Number>();
}
if (o.Has("offset_ms")) {
params.offset_ms = o.Get("offset_ms").As<Napi::Number>();
}
if (o.Has("duration_ms")) {
params.duration_ms = o.Get("duration_ms").As<Napi::Number>();
}
// --- boolean behaviour switches ---
if (o.Has("translate")) {
params.translate = o.Get("translate").As<Napi::Boolean>();
}
if (o.Has("no_context")) {
params.no_context = o.Get("no_context").As<Napi::Boolean>();
}
if (o.Has("no_timestamps")) {
params.no_timestamps = o.Get("no_timestamps").As<Napi::Boolean>();
}
if (o.Has("single_segment")) {
params.single_segment = o.Get("single_segment").As<Napi::Boolean>();
}
if (o.Has("print_special")) {
params.print_special = o.Get("print_special").As<Napi::Boolean>();
}
if (o.Has("print_progress")) {
params.print_progress = o.Get("print_progress").As<Napi::Boolean>();
}
if (o.Has("print_realtime")) {
params.print_realtime = o.Get("print_realtime").As<Napi::Boolean>();
}
if (o.Has("print_timestamps")) {
params.print_timestamps = o.Get("print_timestamps").As<Napi::Boolean>();
}
// --- token-level timestamps and segment length limits ---
if (o.Has("token_timestamps")) {
params.token_timestamps = o.Get("token_timestamps").As<Napi::Boolean>();
}
if (o.Has("thold_pt")) {
params.thold_pt = o.Get("thold_pt").As<Napi::Number>();
}
if (o.Has("thold_ptsum")) {
params.thold_ptsum = o.Get("thold_ptsum").As<Napi::Number>();
}
if (o.Has("max_len")) {
params.max_len = o.Get("max_len").As<Napi::Number>();
}
if (o.Has("split_on_word")) {
params.split_on_word = o.Get("split_on_word").As<Napi::Boolean>();
}
if (o.Has("max_tokens")) {
params.max_tokens = o.Get("max_tokens").As<Napi::Number>();
}
if (o.Has("debug_mode")) {
params.debug_mode = o.Get("debug_mode").As<Napi::Boolean>();
}
if (o.Has("audio_ctx")) {
params.audio_ctx = o.Get("audio_ctx").As<Napi::Number>();
}
if (o.Has("tdrz_enable")) {
params.tdrz_enable = o.Get("tdrz_enable").As<Napi::Boolean>();
}
// --- owned strings: only string values are accepted, anything else falls
// back to no prompt / "auto" language ---
if (o.Has("initial_prompt") && o.Get("initial_prompt").IsString()) {
std::string initial_prompt = o.Get("initial_prompt").As<Napi::String>().Utf8Value();
params.initial_prompt = strdup(initial_prompt.c_str());
} else {
params.initial_prompt = nullptr;
}
if (o.Has("language") && o.Get("language").IsString()) {
std::string language = o.Get("language").As<Napi::String>().Utf8Value();
params.language = strdup(language.c_str());
} else {
params.language = strdup("auto");
}
// --- token suppression: applied only when the value is really a boolean ---
if (o.Has("suppress_blank") && o.Get("suppress_blank").IsBoolean()) {
params.suppress_blank = o.Get("suppress_blank").As<Napi::Boolean>();
}
if (o.Has("suppress_non_speech_tokens") && o.Get("suppress_non_speech_tokens").IsBoolean()) {
params.suppress_non_speech_tokens = o.Get("suppress_non_speech_tokens").As<Napi::Boolean>();
}
// --- decoding temperature and fallback thresholds ---
if (o.Has("temperature")) {
params.temperature = o.Get("temperature").As<Napi::Number>();
}
if (o.Has("max_initial_ts")) {
params.max_initial_ts = o.Get("max_initial_ts").As<Napi::Number>();
}
if (o.Has("length_penalty")) {
params.length_penalty = o.Get("length_penalty").As<Napi::Number>();
}
if (o.Has("temperature_inc")) {
params.temperature_inc = o.Get("temperature_inc").As<Napi::Number>();
}
if (o.Has("entropy_thold")) {
params.entropy_thold = o.Get("entropy_thold").As<Napi::Number>();
}
if (o.Has("logprob_thold")) {
params.logprob_thold = o.Get("logprob_thold").As<Napi::Number>();
}
if (o.Has("no_speech_thold")) {
params.no_speech_thold = o.Get("no_speech_thold").As<Napi::Number>();
}
// --- strategy-specific settings ---
if (o.Has("best_of")) {
params.greedy.best_of = o.Get("best_of").As<Napi::Number>();
}
if (o.Has("beam_size")) {
params.beam_search.beam_size = o.Get("beam_size").As<Napi::Number>();
}
return params;
}
// Extracts the binding-level options from a JavaScript options object.
//
// `format` is taken from `o.format` when that property is a string and
// defaults to "simple" otherwise. The returned string is allocated with
// strdup(); the caller is responsible for free()-ing it.
struct smart_whisper_transcribe_params smart_whisper_transcribe_params_from_js(Napi::Object o) {
    std::string chosen_format = "simple";
    if (o.Has("format")) {
        Napi::Value requested = o.Get("format");
        if (requested.IsString()) {
            chosen_format = requested.As<Napi::String>().Utf8Value();
        }
    }

    struct smart_whisper_transcribe_params result;
    result.format = strdup(chosen_format.c_str());
    return result;
}
// Async worker that runs a whisper transcription on a worker thread.
//
// While decoding, every new segment is forwarded to the progress callback;
// when decoding finishes, the complete list of segments is passed to the
// finish callback. The worker owns the sample buffer, the strdup()'d strings
// inside `params` / `smart_params`, and the whisper state it creates.
class TranscribeWorker : public Napi::AsyncProgressQueueWorker<int> {
   public:
    TranscribeWorker(whisper_context* context, const float* samples, int n_samples,
                     struct whisper_full_params params,
                     struct smart_whisper_transcribe_params smart_params,
                     Napi::Function& finish_callback, Napi::Function& progress_callback)
        : AsyncProgressQueueWorker(finish_callback),
          context(context),
          state(nullptr),
          samples(samples),
          n_samples(n_samples),
          params(params),
          smart_params(smart_params) {
        this->progress_callback.Reset(progress_callback, 1);
    }

    ~TranscribeWorker() {
        delete[] samples;
        // whisper_free_params(&params); will lead to a double free
        if (params.initial_prompt != nullptr) {
            free((void*)params.initial_prompt);
        }
        if (params.language != nullptr) {
            free((void*)params.language);
        }
        if (state != nullptr) {
            whisper_free_state(state);
        }
        free((void*)smart_params.format);
    }

    // Worker thread: allocates a decoding state and runs the transcription,
    // sending the index of each newly produced segment as progress.
    void Execute(const ExecutionProgress& progress) override {
        state = whisper_init_state(context);
        if (state == nullptr) {
            // Without a state there is nothing to decode into.
            SetError("Failed to initialize whisper state");
            return;
        }

        params.new_segment_callback = [](struct whisper_context* ctx, struct whisper_state* state,
                                         int n_new, void* user_data) {
            const ExecutionProgress& progress = *(ExecutionProgress*)user_data;
            const int i = whisper_full_n_segments_from_state(state) - 1;
            progress.Send(&i, 1);
        };
        params.new_segment_callback_user_data = (void*)&progress;

        int err = whisper_full_with_state(context, state, params, samples, n_samples);
        if (err != 0) {
            SetError("whisper_full operation failed");
        }
    }

    // JavaScript thread: converts the reported segment and hands it to the
    // progress callback, if one was supplied.
    void OnProgress(const int* data, size_t _count) override {
        Napi::HandleScope scope(Env());

        if (this->progress_callback.IsEmpty()) {
            return;
        }
        if (data == nullptr || _count == 0) {
            return;  // nothing was sent with this notification
        }

        // Token timings are only attached to the final results, as before.
        Napi::Object segment = BuildSegment(*data, false);
        this->progress_callback.Call({segment});
    }

    // JavaScript thread: converts every segment and passes the array to the
    // finish callback.
    void OnOK() override {
        Napi::HandleScope scope(Env());

        int n_segments = whisper_full_n_segments_from_state(state);
        Napi::Array segments = Napi::Array::New(Env(), n_segments);
        for (int i = 0; i < n_segments; i++) {
            segments.Set(i, BuildSegment(i, params.token_timestamps));
        }

        Callback().Call({segments});
    }

   private:
    // Builds the JavaScript object for segment `i`: `from`, `to`, `text`, and,
    // for the "detail" format, `lang`, `confidence` and `tokens`. Token
    // `from`/`to` fields are added only when `with_token_times` is set.
    // Must be called with an active HandleScope.
    Napi::Object BuildSegment(int i, bool with_token_times) {
        Napi::Object segment = Napi::Object::New(Env());
        segment.Set("from", Napi::Number::New(
                                Env(), whisper_full_get_segment_t0_from_state(state, i) * 10));
        segment.Set("to", Napi::Number::New(
                              Env(), whisper_full_get_segment_t1_from_state(state, i) * 10));
        segment.Set("text", Napi::String::New(
                                Env(), whisper_full_get_segment_text_from_state(state, i)));

        if (strcmp(smart_params.format, "detail") != 0) {
            return segment;
        }

        float confidence = 0, min_p = 1, max_p = 0;
        int skips = 0;
        int tokens = whisper_full_n_tokens_from_state(state, i);
        Napi::Array tokens_array = Napi::Array::New(Env(), tokens);

        for (int j = 0; j < tokens; j++) {
            auto token = whisper_full_get_token_data_from_state(state, i, j);
            Napi::Object token_object = Napi::Object::New(Env());
            token_object.Set(
                "text", Napi::String::New(Env(), whisper_full_get_token_text_from_state(
                                                     context, state, i, j)));
            token_object.Set("id", Napi::Number::New(Env(), token.id));
            token_object.Set("p", Napi::Number::New(Env(), token.p));
            if (with_token_times) {
                token_object.Set("from", Napi::Number::New(Env(), token.t0 * 10));
                token_object.Set("to", Napi::Number::New(Env(), token.t1 * 10));
            }
            tokens_array.Set(j, token_object);

            // Tokens with ids above end-of-text are excluded from the
            // confidence statistics.
            if (token.id > whisper_token_eot(context)) {
                skips++;
                continue;
            }

            confidence += token.p;
            min_p = std::min(min_p, token.p);
            max_p = std::max(max_p, token.p);
        }

        // Average the counted probabilities, dropping the lowest and highest
        // when more than two remain. The divisor is checked so a segment with
        // few or no counted tokens can never divide by zero or a negative
        // number; with nothing counted, confidence stays 0.
        const int counted = tokens - skips;
        if (counted > 2) {
            confidence = (confidence - min_p - max_p) / (counted - 2);
        } else if (counted > 0) {
            confidence = confidence / counted;
        }

        segment.Set("lang",
                    Napi::String::New(
                        Env(), whisper_lang_str(whisper_full_lang_id_from_state(state))));
        segment.Set("confidence", Napi::Number::New(Env(), confidence));
        segment.Set("tokens", tokens_array);
        return segment;
    }

    whisper_context* context;
    whisper_state* state;
    const float* samples;
    int n_samples;
    struct whisper_full_params params;
    struct smart_whisper_transcribe_params smart_params;
    Napi::FunctionReference progress_callback;
};
// N-API entry point: transcribe(context, pcm, params, onFinish, onProgress).
// Copies the PCM samples, parses the option object twice (whisper parameters
// and wrapper-specific parameters) and queues a TranscribeWorker. Nothing is
// returned to JavaScript synchronously; results arrive through the callbacks.
Napi::Value Transcribe(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();

  // Exactly five arguments are required; their types are not validated here.
  if (info.Length() != 5) {
    Napi::TypeError::New(env, "Wrong number of arguments").ThrowAsJavaScriptException();
    return env.Null();
  }

  whisper_context* ctx = info[0].As<Napi::External<whisper_context>>().Data();

  // Take a private copy of the samples so the worker does not read from the
  // JavaScript-owned buffer.
  Napi::Float32Array input = info[1].As<Napi::Float32Array>();
  float* pcm_copy = new float[input.ElementLength()];
  memcpy(pcm_copy, input.Data(), input.ByteLength());
  int sample_count = static_cast<int>(input.ElementLength());

  Napi::Object options = info[2].As<Napi::Object>();
  auto full_params = whisper_full_params_from_js(options);
  auto extra_params = smart_whisper_transcribe_params_from_js(options);

  Napi::Function on_finish = info[3].As<Napi::Function>();
  Napi::Function on_progress = info[4].As<Napi::Function>();

  // NOTE(review): the worker presumably takes ownership of pcm_copy and
  // deletes itself once finished — confirm in TranscribeWorker.
  auto* job = new TranscribeWorker(ctx, pcm_copy, sample_count, full_params, extra_params,
                                   on_finish, on_progress);
  job->Queue();
  return env.Undefined();
}

View file

@ -1,9 +0,0 @@
// Declarations for the transcription binding, protected by an include guard.
#ifndef _GUARD_SW_TRANSCRIBE_H
#define _GUARD_SW_TRANSCRIBE_H
#include "common.h"
#include "whisper.h"
// N-API callback that starts a transcription. It takes five JavaScript
// arguments (context, PCM samples, params, finish callback, progress callback).
Napi::Value Transcribe(const Napi::CallbackInfo& info);
#endif

View file

@ -1,97 +0,0 @@
import os from "node:os";
import { execSync } from "node:child_process";
/** Names of the compute backends this build script recognises. */
type ComputeBackend = "cpu" | "accelerate" | "metal" | "clblast" | "openblas";
// Resolved once, when the module is first evaluated.
const cfg = config();
/** Space-separated list of source files to compile. */
export const sources = cfg.sources.join(" ");
/** Space-separated list of preprocessor defines. */
export const defines = cfg.defines.join(" ");
/** Space-separated list of linker arguments. */
export const libraries = cfg.libraries.join(" ");
/**
 * Works out which sources, defines and linker arguments the native build needs.
 *
 * When the `BYOL` environment variable is set, nothing is compiled and its value
 * is passed through as the only library. Otherwise the backend comes from
 * `COMPUTE_BACKEND`, falling back to {@link infer_backend}. Backends without a
 * dedicated branch (for example "cpu" and "clblast") get the base source list only.
 */
function config(): {
  sources: string[];
  defines: string[];
  libraries: string[];
} {
  if (process.env.BYOL) {
    return { sources: [], defines: [], libraries: [process.env.BYOL] };
  }

  const backend: ComputeBackend =
    (process.env.COMPUTE_BACKEND as ComputeBackend | undefined) ??
    infer_backend();

  const sourceList: string[] = [
    "whisper.cpp/src/whisper.cpp",
    "whisper.cpp/ggml/src/ggml.c",
    "whisper.cpp/ggml/src/ggml-alloc.c",
    "whisper.cpp/ggml/src/ggml-backend.c",
    "whisper.cpp/ggml/src/ggml-quants.c",
    "whisper.cpp/ggml/src/ggml-aarch64.c",
  ];
  const defineList: string[] = [];
  const libraryList: string[] = [];

  if (backend === "accelerate") {
    defineList.push("GGML_USE_ACCELERATE");
    libraryList.push('"-framework Foundation"', '"-framework Accelerate"');
  } else if (backend === "metal") {
    sourceList.push("whisper.cpp/ggml/src/ggml-metal.m");
    defineList.push("GGML_USE_ACCELERATE", "GGML_USE_METAL");
    libraryList.push(
      '"-framework Foundation"',
      '"-framework Accelerate"',
      '"-framework Metal"',
      '"-framework MetalKit"',
    );
  } else if (backend === "openblas") {
    defineList.push("GGML_USE_OPENBLAS");
    libraryList.push("-lopenblas");
  }

  return { sources: sourceList, defines: defineList, libraries: libraryList };
}
/**
 * Guesses the best compute backend for the current machine.
 *
 * macOS selects "metal" on arm64 and "accelerate" otherwise. Linux selects
 * "openblas" when `ldconfig -p` lists libopenblas. Everything else, and any
 * failure while probing, yields "cpu".
 */
function infer_backend(): ComputeBackend {
  try {
    const platform = os.platform();
    if (platform === "darwin") {
      return os.arch() === "arm64" ? "metal" : "accelerate";
    }
    if (platform === "linux") {
      // grep exits non-zero when nothing matches, which makes execSync throw
      // and lands in the catch below.
      const probe = execSync("ldconfig -p | grep libopenblas")
        .toString()
        .trim();
      if (probe) {
        return "openblas";
      }
    }
  } catch {
    // Probing failed; use the default cpu backend.
  }
  return "cpu";
}

View file

@ -1,7 +0,0 @@
export * from "./binding";
export * from "./model";
export * from "./transcribe";
export * from "./types";
export * from "./whisper";
export * as manager from "./model-manager";

View file

@ -1,125 +0,0 @@
import path from "node:path";
import fs from "node:fs";
import os from "node:os";
import { Readable } from "node:stream";
import type { ReadableStream } from "node:stream/web";
// All state lives under ~/.smart-whisper; model files go in its "models" subdirectory.
const root = path.join(os.homedir(), ".smart-whisper");
const models = path.join(root, "models");
// File extension appended to every model name on disk.
const ext = ".bin";
// Side effect at import time: make sure the models directory exists.
fs.mkdirSync(models, { recursive: true });
// Base URL the shorthand entries in MODELS are built from.
const BASE_MODELS_URL =
"https://huggingface.co/ggerganov/whisper.cpp/resolve/main";
/**
 * Download URLs of the known ggml whisper models, keyed by shorthand name.
 * Every URL is BASE_MODELS_URL followed by `/ggml-<name>.bin`.
 */
export const MODELS = {
tiny: `${BASE_MODELS_URL}/ggml-tiny.bin`,
"tiny.en": `${BASE_MODELS_URL}/ggml-tiny.en.bin`,
small: `${BASE_MODELS_URL}/ggml-small.bin`,
"small.en": `${BASE_MODELS_URL}/ggml-small.en.bin`,
base: `${BASE_MODELS_URL}/ggml-base.bin`,
"base.en": `${BASE_MODELS_URL}/ggml-base.en.bin`,
medium: `${BASE_MODELS_URL}/ggml-medium.bin`,
"medium.en": `${BASE_MODELS_URL}/ggml-medium.en.bin`,
"large-v1": `${BASE_MODELS_URL}/ggml-large-v1.bin`,
"large-v2": `${BASE_MODELS_URL}/ggml-large-v2.bin`,
"large-v3": `${BASE_MODELS_URL}/ggml-large-v3.bin`,
"large-v3-turbo": `${BASE_MODELS_URL}/ggml-large-v3-turbo.bin`,
} as const;
/**
 * A model identifier: one of the MODELS shorthands or any other string.
 * The `(string & {})` member keeps the union from collapsing to plain `string`.
 */
export type ModelName = keyof typeof MODELS | (string & {});
/**
 * Downloads a ggml whisper model from a specified URL or shorthand.
 *
 * The returned name never carries the `.bin` extension, so it can be passed
 * straight to `check`, `resolve` and `remove`. Data is streamed into a
 * temporary `.part` file and renamed into place only once the transfer has
 * completed, so an interrupted download is never mistaken for a usable model.
 *
 * @param model - The model to download, specified either as a key of the {@link MODELS} object or as a URL.
 * @returns A promise that resolves to the name of the downloaded model.
 * @throws An error if the model URL or shorthand is invalid, or if the model fails to download or to be written to disk.
 */
export async function download(model: ModelName): Promise<string> {
  let url = "";
  let name = "";
  // Own-property check: `in` would also match inherited keys such as "toString".
  if (Object.prototype.hasOwnProperty.call(MODELS, model)) {
    url = MODELS[model as keyof typeof MODELS];
    name = model;
  } else {
    try {
      const parsed = new URL(model);
      url = parsed.href;
      name = parsed.pathname.split("/").pop() ?? "";
    } catch {
      // Not a valid URL; reported through the empty `url` check below.
    }
  }
  if (!url) {
    throw new Error(`Invalid model URL or shorthand: ${model}`);
  }
  // Normalise "foo.bin" to "foo" so the name agrees with check() and resolve(),
  // which always append the extension themselves.
  if (name.endsWith(ext)) {
    name = name.slice(0, -ext.length);
  }
  if (!name) {
    throw new Error(`Failed to parse model name: ${url}`);
  }
  if (check(name)) {
    return name;
  }

  const res = await fetch(url);
  if (!res.ok || !res.body) {
    throw new Error(`Failed to download model: ${res.statusText}`);
  }

  const target = path.join(models, name + ext);
  const partial = `${target}.part`;
  const sink = fs.createWriteStream(partial);
  const source = Readable.fromWeb(res.body as ReadableStream<Uint8Array>);

  return new Promise<string>((resolve, reject) => {
    let settled = false;
    let flushed = false;

    const fail = (err: Error): void => {
      if (settled) {
        return;
      }
      settled = true;
      source.destroy();
      sink.destroy();
      // Best-effort cleanup of the partial file before reporting the failure.
      fs.rm(partial, { force: true }, () => reject(err));
    };

    // pipe() does not forward errors, so both ends are watched explicitly.
    source.on("error", fail);
    sink.on("error", fail);
    sink.on("finish", () => {
      flushed = true;
    });
    // Rename only after the descriptor is closed; renaming an open file can
    // fail on Windows.
    sink.on("close", () => {
      if (settled || !flushed) {
        return;
      }
      try {
        fs.renameSync(partial, target);
      } catch (err) {
        fail(err instanceof Error ? err : new Error(String(err)));
        return;
      }
      settled = true;
      resolve(name);
    });

    source.pipe(sink);
  });
}
/**
 * Deletes a locally stored model file; does nothing when the model is absent.
 * @param model - The name of the model to remove.
 */
export function remove(model: ModelName): void {
  if (!check(model)) {
    return;
  }
  const file = path.join(models, model + ext);
  fs.unlinkSync(file);
}
/**
 * Lists the models present in the local models directory.
 * @returns The file names ending in the model extension, with that extension removed.
 */
export function list(): ModelName[] {
  const names: ModelName[] = [];
  for (const entry of fs.readdirSync(models)) {
    if (entry.endsWith(ext)) {
      names.push(entry.slice(0, entry.length - ext.length));
    }
  }
  return names;
}
/**
 * Tells whether a model file is present in the local models directory.
 * @param model - The name of the model, without the file extension.
 * @returns True if the model file exists, false otherwise.
 */
export function check(model: ModelName): boolean {
  const candidate = path.join(models, `${model}${ext}`);
  return fs.existsSync(candidate);
}
/**
 * Returns the on-disk path of a locally available model.
 * @param model - The name of the model, without the file extension.
 * @returns The path of the model file inside the models directory.
 * @throws Error if the model is not found.
 */
export function resolve(model: ModelName): string {
  if (!check(model)) {
    throw new Error(`Model not found: ${model}`);
  }
  return path.join(models, model + ext);
}
/** Directories used by the model manager: the root folder and its models subfolder. */
export const dir = { root, models };

View file

@ -1,3 +0,0 @@
import { binding } from "./binding";
/** Re-exports the native binding's WhisperModel class under this package's own name; adds no members. */
export class WhisperModel extends binding.WhisperModel {}

View file

@ -1,114 +0,0 @@
import EventEmitter from "node:events";
import type { WhisperModel } from "./model";
import { TranscribeFormat, TranscribeParams, TranscribeResult } from "./types";
import { binding } from "./binding";
/**
 * A single transcription job running against a {@link WhisperModel}.
 *
 * Events:
 * - "transcribed": emitted with each result the native binding reports while running.
 * - "finish": emitted once with the complete list of results.
 */
export class TranscribeTask<
  Format extends TranscribeFormat,
  TokenTimestamp extends boolean,
> extends EventEmitter {
  private _model: WhisperModel;
  private _result: Promise<TranscribeResult<Format, TokenTimestamp>[]> | null =
    null;

  /**
   * You should not construct this class directly, use {@link TranscribeTask.run} instead.
   */
  constructor(model: WhisperModel) {
    super();
    this._model = model;
  }

  get model(): WhisperModel {
    return this._model;
  }

  /**
   * A promise that resolves to the result of the transcription task.
   * @throws Error if the task has not been started yet.
   */
  get result(): Promise<TranscribeResult<Format, TokenTimestamp>[]> {
    if (this._result === null) {
      throw new Error("Task has not been started");
    }
    return this._result;
  }

  /**
   * Hands the samples to the native binding and resolves with its final results.
   * Rejects when the model no longer has a native handle.
   */
  private async _run(
    pcm: Float32Array,
    params: Partial<TranscribeParams<Format, TokenTimestamp>>,
  ): Promise<TranscribeResult<Format, TokenTimestamp>[]> {
    return new Promise((resolve) => {
      const handle = this.model.handle;
      if (!handle) {
        // Thrown inside the executor, so it surfaces as a rejection.
        throw new Error("Model has been freed");
      }
      binding.transcribe(
        handle,
        pcm,
        params,
        (results) => {
          // Pass the results along: "finish" listeners are typed to receive them.
          this.emit("finish", results);
          resolve(results);
        },
        (result) => {
          this.emit("transcribed", result);
        },
      );
    });
  }

  /**
   * Creates a task and starts it immediately.
   * @param model - The model to transcribe with; must not have been freed.
   * @param pcm - The audio samples.
   * @param params - Transcription parameters.
   * @returns The started task; await its `result` for the transcription.
   * @throws Error if the model has been freed.
   */
  static async run<
    Format extends TranscribeFormat,
    TokenTimestamp extends boolean,
  >(
    model: WhisperModel,
    pcm: Float32Array,
    params: Partial<TranscribeParams<Format, TokenTimestamp>>,
  ): Promise<TranscribeTask<Format, TokenTimestamp>> {
    if (model.freed) {
      throw new Error("Model has been freed");
    }
    const task = new TranscribeTask(model);
    task._result = task._run(pcm, params);
    return task;
  }

  on(
    event: "finish",
    listener: (results: TranscribeResult<Format, TokenTimestamp>[]) => void,
  ): this;
  on(
    event: "transcribed",
    listener: (result: TranscribeResult<Format, TokenTimestamp>) => void,
  ): this;
  on(event: string, listener: (...args: any[]) => void): this {
    return super.on(event, listener);
  }

  once(
    event: "finish",
    listener: (results: TranscribeResult<Format, TokenTimestamp>[]) => void,
  ): this;
  once(
    event: "transcribed",
    listener: (result: TranscribeResult<Format, TokenTimestamp>) => void,
  ): this;
  once(event: string, listener: (...args: any[]) => void): this {
    return super.once(event, listener);
  }

  off(
    event: "finish",
    listener: (results: TranscribeResult<Format, TokenTimestamp>[]) => void,
  ): this;
  off(
    event: "transcribed",
    listener: (result: TranscribeResult<Format, TokenTimestamp>) => void,
  ): this;
  off(event: string, listener: (...args: any[]) => void): this {
    return super.off(event, listener);
  }
}

View file

@ -1,104 +0,0 @@
/**
 * Decoding strategy selector. This is a numeric enum, so the members take the
 * implicit values 0 and 1 in declaration order.
 * NOTE(review): presumably these mirror whisper.cpp's `whisper_sampling_strategy` — confirm.
 */
export enum WhisperSamplingStrategy {
WHISPER_SAMPLING_GREEDY,
WHISPER_SAMPLING_BEAM_SEARCH,
}
/** Result shape requested from a transcription: plain segments or segments with token detail. */
export type TranscribeFormat = "simple" | "detail";
/**
 * Options accepted by a transcription call.
 * See {@link https://github.com/ggerganov/whisper.cpp/blob/00b7a4be02ca82d53ac69dd2dd438c16e2af7658/whisper.h#L433C19-L433C19} for details.
 *
 * The two type parameters tie `format` and `token_timestamps` to the shape of
 * the results (see {@link TranscribeResult}).
 */
export interface TranscribeParams<
Format extends TranscribeFormat = TranscribeFormat,
TokenTimestamp extends boolean = false,
> {
strategy: WhisperSamplingStrategy;
n_threads: number;
n_max_text_ctx: number;
offset_ms: number;
duration_ms: number;
translate: boolean;
no_context: boolean;
no_timestamps: boolean;
single_segment: boolean;
print_special: boolean;
print_progress: boolean;
print_realtime: boolean;
print_timestamps: boolean;
/** When true, each token in a "detail" result carries `from` and `to` timestamps. */
token_timestamps: TokenTimestamp;
thold_pt: number;
thold_ptsum: number;
max_len: number;
split_on_word: boolean;
max_tokens: number;
speed_up: boolean;
debug_mode: boolean;
audio_ctx: number;
tdrz_enable: boolean;
initial_prompt: string;
/**
 * Language code, e.g. "en", "de", "fr", "es", "it", "nl", "pt", "ru", "tr", "uk", "pl", "sv", "cs", "zh", "ja", "ko"
 */
language: string;
suppress_blank: boolean;
suppress_non_speech_tokens: boolean;
temperature: number;
max_initial_ts: number;
length_penalty: number;
temperature_inc: number;
entropy_thold: number;
logprob_thold: number;
no_speech_thold: number;
best_of: number;
beam_size: number;
/** Result shape: "detail" adds language, confidence and per-token data to every segment. */
format: Format;
}
/** One transcribed segment in the "simple" format. */
export interface TranscribeSimpleResult {
/** Segment start; the native layer reports whisper's t0 multiplied by 10. */
from: number;
/** Segment end; the native layer reports whisper's t1 multiplied by 10. */
to: number;
/** The text of the segment. */
text: string;
}
/**
 * Represents a detailed result of transcription: a simple segment plus the
 * detected language, a confidence score and the individual tokens.
 */
export interface TranscribeDetailedResult<TokenTimestamp extends boolean>
extends TranscribeSimpleResult {
/** The detected spoken language. */
lang: string;
/**
 * The confidence level of the transcription, derived from token probabilities.
 * Tokens whose id is above the end-of-text token id are ignored. When more than
 * two tokens remain, the lowest and highest probability are dropped and the rest
 * averaged; with one or two remaining tokens a plain average is used.
 */
confidence: number;
/** The tokens generated during the transcription process. */
tokens: {
/** The text of the token, for CJK languages, due to the BPE encoding, the token text may not be readable. */
text: string;
/** The ID of the token. */
id: number;
/** The probability of the token. */
p: number;
/** The start timestamp of the token, in milliseconds. Only available when `token_timestamps` of {@link TranscribeParams} is `true`. */
from: TokenTimestamp extends true ? number : undefined;
/** The end timestamp of the token, in milliseconds. Only available when `token_timestamps` of {@link TranscribeParams} is `true`. */
to: TokenTimestamp extends true ? number : undefined;
}[];
}
/**
 * Result type selected by the requested format: "simple" yields
 * {@link TranscribeSimpleResult}, anything else {@link TranscribeDetailedResult}.
 */
export type TranscribeResult<
Format extends TranscribeFormat = TranscribeFormat,
TokenTimestamp extends boolean = boolean,
> = Format extends "simple"
? TranscribeSimpleResult
: TranscribeDetailedResult<TokenTimestamp>;

View file

@ -1,148 +0,0 @@
import type {
TranscribeFormat,
TranscribeParams,
TranscribeResult,
} from "./types";
import { WhisperModel } from "./model";
import { TranscribeTask } from "./transcribe";
/** Settings of a {@link Whisper} instance. */
export interface WhisperConfig {
/**
 * Time in seconds to wait before offloading the model if it's not being used.
 * The Whisper constructor defaults this to 300.
 */
offload: number;
/**
 * Whether to use the GPU or not.
 * The Whisper constructor defaults this to true.
 */
gpu: boolean;
}
/**
 * The Whisper class is responsible for managing the lifecycle and operations of whisper model.
 * It handles the loading and offloading of the model, managing transcription tasks, and configuring model parameters.
 */
export class Whisper {
  private _file: string;
  private _available: WhisperModel | null = null;
  private _loading: Promise<WhisperModel> | null = null;
  private _tasks: Promise<TranscribeResult[]>[] = [];
  private _config: WhisperConfig;
  private _offload_timer: NodeJS.Timeout | null = null;

  /**
   * Constructs a new Whisper instance with a specified model file and configuration.
   * @param file - The path to the Whisper model file.
   * @param config - Optional configuration for the Whisper instance.
   */
  constructor(file: string, config: Partial<WhisperConfig> = {}) {
    this._file = file;
    this._config = {
      offload: 300,
      gpu: true,
      ...config,
    };
  }

  get file(): string {
    return this._file;
  }

  set file(file: string) {
    this._file = file;
  }

  get config(): WhisperConfig {
    return this._config;
  }

  /** Result promises of the transcriptions that have not settled yet. */
  get tasks(): Promise<TranscribeResult[]>[] {
    return this._tasks;
  }

  /** Restarts the idle countdown after which the model is freed automatically. */
  reset_offload_timer(): void {
    this.clear_offload_timer();
    this._offload_timer = setTimeout(() => {
      // free() is asynchronous and nobody awaits it here; report failures
      // instead of letting them become unhandled rejections.
      this.free().catch((err: unknown) => {
        console.error("Failed to offload whisper model:", err);
      });
    }, this.config.offload * 1000);
  }

  private clear_offload_timer(): void {
    if (this._offload_timer !== null) {
      clearTimeout(this._offload_timer);
      this._offload_timer = null;
    }
  }

  /** Returns the loaded model, loading it first when necessary. */
  async model(): Promise<WhisperModel> {
    if (this._available === null) {
      return this.load();
    }
    this.reset_offload_timer();
    return this._available;
  }

  /**
   * Loads the whisper model asynchronously.
   * If the model is already being loaded, returns the existing one.
   *
   * You don't need to call this method directly, it's called automatically if necessary when you call {@link Whisper.transcribe}.
   *
   * @returns A Promise that resolves to the loaded model.
   */
  async load(): Promise<WhisperModel> {
    if (this._loading !== null) {
      return this._loading;
    }
    const pending = WhisperModel.load(this.file, this.config.gpu);
    this._loading = pending;
    let model: WhisperModel;
    try {
      model = await pending;
    } finally {
      // Always clear the in-flight marker; otherwise a failed load would make
      // every later call return the same rejected promise.
      this._loading = null;
    }
    this._available = model;
    this.reset_offload_timer();
    return model;
  }

  /**
   * Transcribes the given PCM audio data using the Whisper model.
   * @param pcm - The mono 16k PCM audio data to transcribe.
   * @param params - Optional parameters for transcription.
   * @returns A promise that resolves to the result of the transcription task.
   */
  async transcribe<
    Format extends TranscribeFormat,
    TokenTimestamp extends boolean,
  >(
    pcm: Float32Array,
    params: Partial<TranscribeParams<Format, TokenTimestamp>> = {},
  ): Promise<TranscribeTask<Format, TokenTimestamp>> {
    const model = await this.model();
    const task = await TranscribeTask.run<Format, TokenTimestamp>(
      model,
      pcm,
      params,
    );
    const result: Promise<TranscribeResult[]> = task.result;
    this._tasks.push(result);
    // Drop the promise once it settles so the list (and the results it keeps
    // alive) does not grow for the lifetime of the instance.
    const forget = (): void => {
      const index = this._tasks.indexOf(result);
      if (index !== -1) {
        this._tasks.splice(index, 1);
      }
    };
    result.then(forget, forget);
    return task;
  }

  /**
   * Releases the loaded model after every pending transcription has settled.
   * Does nothing when no model is loaded.
   */
  async free(): Promise<void> {
    if (this._available === null) {
      return;
    }
    const model = this._available;
    this._available = null;
    this.clear_offload_timer();
    // allSettled: a failed transcription must not prevent the model from being released.
    await Promise.allSettled(this._tasks);
    await model.free();
  }
}
/**
* Here's a life cycle diagram of a model:
* | Method | (0) Not Available | (1) Loading | (2) Available | (3) Freeing | (0) Not Available |
* |------------|-------------------|-------------|---------------|-------------|-------------------|
* | load | V | - | - | - | V |
* | free | - | - | wait tasks, V | - | - |
* | transcribe | load | load | V | load | load |
*/

View file

@ -1,9 +0,0 @@
{
"extends": "@amical/typescript-config/base.json",
"compilerOptions": {
"outDir": "dist"
},
"include": ["src"],
"exclude": ["node_modules", "dist"],
"types": ["node"]
}

View file

@ -1,18 +0,0 @@
import { defineConfig } from "tsup";
import { readFileSync, writeFileSync } from "node:fs";
/**
 * tsup build configuration. After a successful build the Metal shader source
 * in the whisper.cpp checkout is rewritten so that its
 * `#include "ggml-common.h"` line is replaced by the header's full content.
 */
export default defineConfig({
  entry: ["src/index.ts", "src/build.ts"],
  outDir: "dist",
  dts: true,
  async onSuccess() {
    const metalPath = "whisper.cpp/ggml/src/ggml-metal.metal";
    const metal = readFileSync(metalPath, "utf-8");
    const common = readFileSync("whisper.cpp/ggml/src/ggml-common.h", "utf-8");
    // A replacer function inserts the header verbatim; a replacement *string*
    // would have `$&`, `$1`, `$$` and similar sequences expanded.
    const replaced = metal.replace(/#include "ggml-common\.h"/, () => common);
    // Nothing to write when the include was already inlined by an earlier run.
    if (replaced !== metal) {
      writeFileSync(metalPath, replaced);
    }
  },
});

@ -1 +0,0 @@
Subproject commit 2ef717b293fe93872cc3a03ca77942936a281959

View file

@ -0,0 +1,91 @@
# @amical/whisper-wrapper
This package wraps the `whisper.cpp` Node addon so the desktop app can call into
Whisper from a forked worker process. The build and runtime layers are tuned for
the desktop pipeline; the notes below capture the important knobs and the
reasoning behind them.
## Build workflow
- `pnpm install` (postinstall) runs `bin/build-addon.js` via CMake.js and drops
the resulting `whisper.node` into `native/<platform-arch(-tag)>/`.
- `pnpm --filter @amical/whisper-wrapper build:native` rebuilds the default
variants for this platform (Metal + CPU on macOS, CPU elsewhere).
- `pnpm --filter @amical/whisper-wrapper build:native:cuda` builds an extra
`win32-x64-cuda` binary alongside the regular `win32-x64` fallback. Install
the CUDA toolkit (12.x tested) before running it.
- Every macOS build is ad-hoc signed (`codesign -s -`) so Electron/Node can load
it without crashing.
- Each variant is produced as a _single_ `.node` binary. We force static
libraries (`GGML_STATIC=ON`, `BUILD_SHARED_LIBS=OFF`) so all ggml/whisper
code is linked directly into the addon—no sidecar `.dylib/.dll` files ship
at runtime.
- The full CMake build directory is deleted after each variant so Electron
Forge/Squirrel never sees the long `CMakeFiles/...` paths that blew past
the Windows `MAX_PATH` limit during packaging.
## GPU/CPU fallback
`resolveBinding()` in `src/loader.ts` no longer throws if the first candidate
fails. `loadBinding()` walks the list:
1. `platform-arch-metal`
2. `platform-arch-openblas`
3. `platform-arch-cuda`
4. `platform-arch`
5. `cpu-fallback`
If `require()` raises `ERR_DLOPEN_FAILED` (missing runtime, wrong driver, etc.)
it logs a warning and tries the next candidate. That lets us ship CUDA/Metal
binaries alongside CPU ones without breaking installs that lack the GPU stack.
## GGML_NATIVE on macOS arm64
GitHub's hosted macOS runners expose `i8mm` but clang refuses to emit the
`vmmlaq_s32` intrinsic when `-mcpu=native` is passed, so the build dies in
`ggml-cpu/arch/arm/quants.c`. CI therefore exports `GGML_NATIVE=OFF` before
calling the build scripts. Locally you can flip it back on if your toolchain
supports those instructions:
```bash
GGML_NATIVE=ON pnpm --filter @amical/whisper-wrapper build:native
```
Leave it off in CI unless you control the runner.
## Custom targets
`WHISPER_TARGETS` lets you override which variants to build. The value is a
comma-separated list of directory names that should map to `native/<name>`.
Examples:
```bash
WHISPER_TARGETS="linux-x64-gnu" pnpm --filter @amical/whisper-wrapper build:native
WHISPER_TARGETS="win32-x64-cuda,win32-x64" pnpm --filter @amical/whisper-wrapper build:native
```
Absent overrides, the script builds the Metal variant (on macOS) followed by the
plain CPU build.
## Runtime API
`src/index.ts` exposes a minimal class that mirrors the desktop worker protocol:
- `new Whisper(modelPath, { gpu?: boolean })`
- `await whisper.load()` (no-op placeholder)
- `await whisper.transcribe(audioOrNull, options)`
- `await whisper.free()`
If you pass `null` (and a `fname_inp` in `options`) the addon reads the audio
file directly, matching the CLI smoke tests.
## Local expectations
- `whisper.cpp` is tracked as a submodule under `packages/whisper-wrapper/`.
- `cmake-js` / `node` / `pnpm` must be installed (the workspace root sets the
required versions).
- The build creates `.cmake-js/` and `.home/` caches inside the package; they're
ignored in git.
For any tweaks (new build targets, additional fallbacks, etc.) update this file
so the CI configuration stays discoverable.

View file

@ -0,0 +1 @@
v1.7.6

View file

@ -0,0 +1,84 @@
# Builds the whisper.cpp Node addon (whisper.node) as a single shared module
# with whisper/ggml linked in statically.
cmake_minimum_required(VERSION 3.20)
project(whisper_node LANGUAGES C CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_definitions(-DNAPI_VERSION=8)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../whisper.cpp/cmake")
set(WHISPER_CPP_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../whisper.cpp")

# Only the library itself is needed from whisper.cpp.
set(WHISPER_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(WHISPER_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(WHISPER_BUILD_SERVER OFF CACHE BOOL "" FORCE)
set(WHISPER_CURL OFF CACHE BOOL "" FORCE)
set(WHISPER_SDL2 OFF CACHE BOOL "" FORCE)
set(WHISPER_FFMPEG OFF CACHE BOOL "" FORCE)

# Force static libraries so no sidecar shared libraries are produced.
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
set(GGML_STATIC ON CACHE BOOL "" FORCE)
set(GGML_SHARED OFF CACHE BOOL "" FORCE)

add_subdirectory(${WHISPER_CPP_DIR} whispercpp EXCLUDE_FROM_ALL)
find_package(Threads REQUIRED)

set(ADDON_SOURCES
  ${CMAKE_CURRENT_SOURCE_DIR}/addon.cpp
  ${WHISPER_CPP_DIR}/examples/common.cpp
  ${WHISPER_CPP_DIR}/examples/common-ggml.cpp
  ${WHISPER_CPP_DIR}/examples/common-whisper.cpp
  ${WHISPER_CPP_DIR}/examples/grammar-parser.cpp
)

add_library(whisper_node SHARED ${ADDON_SOURCES})
set_target_properties(whisper_node PROPERTIES PREFIX "" SUFFIX ".node" OUTPUT_NAME "whisper")

if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  target_compile_options(whisper_node PRIVATE -Wall -Wextra -Wno-unused-parameter)
endif()

set_target_properties(whisper_node PROPERTIES
  CXX_VISIBILITY_PRESET hidden
  VISIBILITY_INLINES_HIDDEN ON)

# CMake-js variables.
# CMAKE_JS_INC is already a CMake list, so it is passed through unchanged.
# Joining it with spaces and splitting it again would break every path that
# contains a space (for example "C:/Program Files/..." or a home directory
# with a space in the user name).
if (CMAKE_JS_INC)
  target_include_directories(whisper_node PRIVATE ${CMAKE_JS_INC})
endif()

# Include directories
target_include_directories(whisper_node PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${WHISPER_CPP_DIR}/include
  ${WHISPER_CPP_DIR}/ggml/include
  ${WHISPER_CPP_DIR}/examples
)

# Link libraries (CMAKE_JS_LIB is a list as well; see the note above).
if (CMAKE_JS_LIB)
  target_link_libraries(whisper_node PRIVATE ${CMAKE_JS_LIB})
endif()
target_link_libraries(whisper_node PRIVATE whisper Threads::Threads)

# On macOS we need to allow undefined symbols for node addon
if (APPLE)
  target_link_options(whisper_node PRIVATE "-undefined" "dynamic_lookup")
endif()

View file

@ -0,0 +1,455 @@
#include "napi.h"
#include "whisper.h"
#include "common.h"
#include "common-whisper.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
namespace {
// Native state behind the JavaScript-visible model handle.
struct WhisperHandle {
// Held while ctx/freed are inspected or changed (see free_model and the finalizer).
std::mutex mutex;
// The whisper context; reset to nullptr once released.
whisper_context* ctx = nullptr;
// Set after whisper_free() has been called, so the context is released only once.
bool freed = false;
};
// Plain-data copy of one decoded token.
struct TokenData {
std::string text;
int id = 0;
// Token probability as reported by whisper.
float p = 0.0f;
// Token start/end, filled from whisper's t0/t1 multiplied by 10 when token
// timestamps are enabled; otherwise they keep the default of -1.
int from_ms = -1;
int to_ms = -1;
};
// Plain-data copy of one transcribed segment.
struct SegmentData {
// Segment start/end, filled from whisper's t0/t1 multiplied by 10.
int from_ms = 0;
int to_ms = 0;
std::string text;
float confidence = 0.0f;
std::string language;
// Tokens of this segment; filled only for the detailed output format.
std::vector<TokenData> tokens;
};
// Result of parsing the JavaScript options object (see parse_full_params).
struct FullParamConfig {
whisper_full_params params;
// Owned copies of the string options.
// NOTE(review): presumably the caller points params.initial_prompt and
// params.language at these strings — parse_full_params does not; confirm.
std::string initial_prompt;
std::string language;
// True when the "format" option equals "detail" (case-insensitive).
bool detailed = false;
// Mirrors params.token_timestamps.
bool token_timestamps = false;
};
// Translates the JavaScript options object into a FullParamConfig.
//
// Parsing starts from whisper_full_default_params(WHISPER_SAMPLING_GREEDY) and
// overrides every field for which a value is supplied. A key that is missing,
// or explicitly set to undefined/null, leaves the default in place. A key that
// is present with a value of the wrong type raises a JavaScript exception
// from the underlying N-API conversion.
//
// Special cases:
//  - print_progress defaults to false instead of whisper's default.
//  - language falls back to "auto" when it is absent, not a string, or empty.
//  - "prompt" is accepted as an alias of "initial_prompt" when the latter is
//    absent or empty.
//  - beam_size > 1 switches the strategy to beam search.
//
// The string options are only stored in the returned struct; this function
// does not touch params.initial_prompt or params.language.
FullParamConfig parse_full_params(const Napi::Env env, const Napi::Object& options) {
  (void)env;  // kept for interface compatibility

  FullParamConfig cfg;
  cfg.params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

  // True when the key carries an actual value. Option objects built from
  // partial JavaScript objects often contain keys set to undefined.
  const auto present = [&options](const char* key) {
    if (!options.Has(key)) {
      return false;
    }
    const Napi::Value value = options.Get(key);
    return !value.IsUndefined() && !value.IsNull();
  };
  const auto read_int = [&](const char* key, auto& out) {
    if (present(key)) {
      out = options.Get(key).As<Napi::Number>().Int32Value();
    }
  };
  const auto read_float = [&](const char* key, auto& out) {
    if (present(key)) {
      out = options.Get(key).As<Napi::Number>().FloatValue();
    }
  };
  const auto read_bool = [&](const char* key, auto& out) {
    if (present(key)) {
      out = options.Get(key).As<Napi::Boolean>().Value();
    }
  };
  // Strings are taken only when the value really is a string; anything else
  // is ignored. Returns whether a value was stored.
  const auto read_string = [&options](const char* key, std::string& out) {
    if (!options.Has(key)) {
      return false;
    }
    const Napi::Value value = options.Get(key);
    if (!value.IsString()) {
      return false;
    }
    out = value.As<Napi::String>().Utf8Value();
    return true;
  };

  if (present("strategy")) {
    cfg.params.strategy = static_cast<whisper_sampling_strategy>(
        options.Get("strategy").As<Napi::Number>().Int32Value());
  }
  read_int("n_threads", cfg.params.n_threads);
  read_int("n_max_text_ctx", cfg.params.n_max_text_ctx);
  read_int("offset_ms", cfg.params.offset_ms);
  read_int("duration_ms", cfg.params.duration_ms);

  read_bool("translate", cfg.params.translate);
  read_bool("no_context", cfg.params.no_context);
  read_bool("no_timestamps", cfg.params.no_timestamps);
  read_bool("single_segment", cfg.params.single_segment);
  read_bool("print_special", cfg.params.print_special);

  // Progress printing is off unless explicitly requested.
  cfg.params.print_progress = false;
  read_bool("print_progress", cfg.params.print_progress);

  read_bool("print_realtime", cfg.params.print_realtime);
  read_bool("print_timestamps", cfg.params.print_timestamps);
  read_bool("token_timestamps", cfg.params.token_timestamps);
  cfg.token_timestamps = cfg.params.token_timestamps;

  read_float("thold_pt", cfg.params.thold_pt);
  read_float("thold_ptsum", cfg.params.thold_ptsum);
  read_int("max_len", cfg.params.max_len);
  read_bool("split_on_word", cfg.params.split_on_word);
  read_int("max_tokens", cfg.params.max_tokens);
  read_bool("debug_mode", cfg.params.debug_mode);
  read_int("audio_ctx", cfg.params.audio_ctx);
  read_bool("tdrz_enable", cfg.params.tdrz_enable);

  read_string("initial_prompt", cfg.initial_prompt);
  if (!read_string("language", cfg.language)) {
    cfg.language = "auto";
  }

  read_bool("suppress_blank", cfg.params.suppress_blank);
  read_bool("suppress_non_speech_tokens", cfg.params.suppress_nst);

  read_float("temperature", cfg.params.temperature);
  // max_initial_ts is a float field; reading it as an int32 would truncate
  // fractional values such as 0.5 to 0.
  read_float("max_initial_ts", cfg.params.max_initial_ts);
  read_float("length_penalty", cfg.params.length_penalty);
  read_float("temperature_inc", cfg.params.temperature_inc);
  read_float("entropy_thold", cfg.params.entropy_thold);
  read_float("logprob_thold", cfg.params.logprob_thold);
  read_float("no_speech_thold", cfg.params.no_speech_thold);

  read_int("best_of", cfg.params.greedy.best_of);
  if (present("beam_size")) {
    cfg.params.beam_search.beam_size =
        options.Get("beam_size").As<Napi::Number>().Int32Value();
    // A beam width above one only makes sense with beam search.
    if (cfg.params.beam_search.beam_size > 1) {
      cfg.params.strategy = WHISPER_SAMPLING_BEAM_SEARCH;
    }
  }

  // "prompt" is an alias that never overrides a non-empty "initial_prompt".
  if (cfg.initial_prompt.empty()) {
    read_string("prompt", cfg.initial_prompt);
  }

  std::string format;
  if (read_string("format", format)) {
    // Go through unsigned char: tolower() is undefined for negative values.
    std::transform(format.begin(), format.end(), format.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
    cfg.detailed = (format == "detail");
  }

  read_bool("detect_language", cfg.params.detect_language);

  if (cfg.language.empty()) {
    cfg.language = "auto";
  }
  return cfg;
}
// Wraps a heap-allocated WhisperHandle in a Napi::External. The attached
// finalizer releases the whisper context, unless free_model already did, and
// then deletes the handle when the External is garbage collected.
Napi::External<WhisperHandle> wrap_handle(Napi::Env env, WhisperHandle* handle) {
  return Napi::External<WhisperHandle>::New(
      env,
      handle,
      [](Napi::Env /*env*/, WhisperHandle* ptr) {
        if (!ptr) return;
        {
          // The guard must be released before `delete ptr`: the mutex is a
          // member of *ptr, and unlocking it after the handle is destroyed
          // would touch freed memory.
          std::lock_guard<std::mutex> guard(ptr->mutex);
          if (!ptr->freed && ptr->ctx) {
            whisper_free(ptr->ctx);
            ptr->ctx = nullptr;
            ptr->freed = true;
          }
        }
        delete ptr;
      });
}
// Returns the WhisperHandle stored in argument `index`.
// Throws a TypeError when the argument is missing or is not an External.
WhisperHandle* unwrap_handle(const Napi::CallbackInfo& info, size_t index) {
  const bool in_range = index < info.Length();
  if (in_range && info[index].IsExternal()) {
    return info[index].As<Napi::External<WhisperHandle>>().Data();
  }
  throw Napi::TypeError::New(info.Env(), "Invalid context handle");
}
// Copies the samples of the "audio" option into a vector.
//
// Returns an empty vector when "audio" is absent or is not a typed array.
// Throws a TypeError when it is a typed array of any element type other than
// float32. Reinterpreting such a buffer as floats would produce garbage and,
// for narrower element types, read past the end of the buffer.
std::vector<float> extract_audio(const Napi::Env env, const Napi::Object& options) {
  std::vector<float> pcmf32;
  if (!options.Has("audio")) {
    return pcmf32;
  }
  const Napi::Value value = options.Get("audio");
  if (!value.IsTypedArray()) {
    return pcmf32;
  }
  if (value.As<Napi::TypedArray>().TypedArrayType() != napi_float32_array) {
    throw Napi::TypeError::New(env, "'audio' must be a Float32Array");
  }
  const Napi::Float32Array array = value.As<Napi::Float32Array>();
  pcmf32.assign(array.Data(), array.Data() + array.ElementLength());
  return pcmf32;
}
// Collects the input file named by the "fname_inp" option. The result holds
// that single path when the option is a string and is empty otherwise.
std::vector<std::string> extract_files(const Napi::Object& options) {
  std::vector<std::string> files;
  if (!options.Has("fname_inp")) {
    return files;
  }
  const auto candidate = options.Get("fname_inp");
  if (candidate.IsString()) {
    files.push_back(candidate.As<Napi::String>().Utf8Value());
  }
  return files;
}
// init(options): loads a whisper model and returns an opaque handle.
//
// options.model       path of the model file (required string)
// options.gpu         enable GPU use; "use_gpu" is consulted when "gpu" is
//                     absent; defaults to true
// options.flash_attn  enable flash attention; defaults to false
//
// Throws a TypeError for a missing options object or model path, and an Error
// when whisper fails to create the context.
Napi::Value init_model(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();

  const bool has_options = info.Length() >= 1 && info[0].IsObject();
  if (!has_options) {
    throw Napi::TypeError::New(env, "Expected init options object");
  }
  Napi::Object opts = info[0].As<Napi::Object>();

  const bool has_model = opts.Has("model") && opts.Get("model").IsString();
  if (!has_model) {
    throw Napi::TypeError::New(env, "Missing 'model' path");
  }
  const std::string model_path = opts.Get("model").As<Napi::String>();

  // "gpu" takes precedence over its alias "use_gpu".
  const char* gpu_key = nullptr;
  if (opts.Has("gpu")) {
    gpu_key = "gpu";
  } else if (opts.Has("use_gpu")) {
    gpu_key = "use_gpu";
  }
  const bool gpu_enabled =
      gpu_key != nullptr ? opts.Get(gpu_key).As<Napi::Boolean>().Value() : true;

  const bool flash_attention =
      opts.Has("flash_attn") ? opts.Get("flash_attn").As<Napi::Boolean>().Value() : false;

  whisper_context_params context_params = whisper_context_default_params();
  context_params.use_gpu = gpu_enabled;
  context_params.flash_attn = flash_attention;

  whisper_context* created =
      whisper_init_from_file_with_params(model_path.c_str(), context_params);
  if (created == nullptr) {
    throw Napi::Error::New(env, "Failed to initialize whisper context");
  }

  WhisperHandle* wrapper = new WhisperHandle();
  wrapper->ctx = created;
  return wrap_handle(env, wrapper);
}
/**
 * JS entry point `free(handle)`: releases the whisper context owned by the
 * handle. Calling it again on the same handle does nothing.
 *
 * @return undefined
 */
Napi::Value free_model(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();
  WhisperHandle* target = unwrap_handle(info, 0);
  {
    std::lock_guard<std::mutex> lock(target->mutex);
    const bool still_owned = !target->freed && target->ctx != nullptr;
    if (still_owned) {
      whisper_free(target->ctx);
      target->ctx = nullptr;
      target->freed = true;
    }
  }
  return env.Undefined();
}
/**
 * Converts the segments currently stored in `ctx` into a JavaScript array.
 *
 * Every element has `from`, `to` (whisper timestamps multiplied by 10) and
 * `text`. When `cfg.detailed` is set each element additionally carries
 * `lang`, `confidence` and `tokens`; token objects get `from`/`to` only when
 * `cfg.token_timestamps` is set.
 *
 * `confidence` is the mean probability of the tokens whose id is not above
 * the EOT token id. With more than two such tokens the single lowest and
 * highest probabilities are dropped before averaging; with none it is 0.
 *
 * @param env     Environment the JS values are created in.
 * @param ctx     Whisper context holding the results of a completed run.
 * @param cfg     Parsed transcription parameters.
 * @param pcmf32  Unused; kept so the signature stays stable.
 * @param pcmf32s Unused; kept so the signature stays stable.
 * @return Array with one object per segment.
 */
Napi::Array build_segments(const Napi::Env env,
                           whisper_context* ctx,
                           const FullParamConfig& cfg,
                           const std::vector<float>& pcmf32,
                           const std::vector<std::vector<float>>& pcmf32s) {
  (void)pcmf32;
  (void)pcmf32s;

  const int n_segments = whisper_full_n_segments(ctx);
  Napi::Array segments = Napi::Array::New(env, n_segments);

  // whisper_lang_str() can return nullptr for an id it does not recognise;
  // constructing std::string from nullptr is undefined behavior, so fall back
  // to an empty language name in that case.
  const char* lang_name = whisper_lang_str(whisper_full_lang_id(ctx));
  const std::string detected_language = lang_name != nullptr ? lang_name : "";

  // Loop-invariant: ids above this one are excluded from the confidence.
  const auto eot_token = whisper_token_eot(ctx);

  for (int i = 0; i < n_segments; ++i) {
    SegmentData segment;
    segment.from_ms = whisper_full_get_segment_t0(ctx, i) * 10;
    segment.to_ms = whisper_full_get_segment_t1(ctx, i) * 10;
    segment.text = whisper_full_get_segment_text(ctx, i);

    if (cfg.detailed) {
      const int n_tokens = whisper_full_n_tokens(ctx, i);
      segment.tokens.reserve(n_tokens);

      float confidence_sum = 0.0f;
      float min_p = 1.0f;
      float max_p = 0.0f;
      int valid_tokens = 0;

      for (int j = 0; j < n_tokens; ++j) {
        whisper_token_data token = whisper_full_get_token_data(ctx, i, j);

        TokenData token_data;
        token_data.text = whisper_full_get_token_text(ctx, i, j);
        token_data.id = token.id;
        token_data.p = token.p;
        if (cfg.token_timestamps) {
          token_data.from_ms = token.t0 * 10;
          token_data.to_ms = token.t1 * 10;
        }
        segment.tokens.push_back(std::move(token_data));

        // Every token is reported, but ids above EOT do not count towards
        // the confidence statistics.
        if (token.id > eot_token) {
          continue;
        }
        confidence_sum += token.p;
        min_p = std::min(min_p, token.p);
        max_p = std::max(max_p, token.p);
        ++valid_tokens;
      }

      if (valid_tokens > 2) {
        // Trimmed mean: discard the lowest and the highest probability.
        segment.confidence =
            (confidence_sum - min_p - max_p) / static_cast<float>(valid_tokens - 2);
      } else if (valid_tokens > 0) {
        segment.confidence = confidence_sum / static_cast<float>(valid_tokens);
      } else {
        segment.confidence = 0.0f;
      }
      segment.language = detected_language;
    }

    Napi::Object jsSegment = Napi::Object::New(env);
    jsSegment.Set("from", Napi::Number::New(env, segment.from_ms));
    jsSegment.Set("to", Napi::Number::New(env, segment.to_ms));
    jsSegment.Set("text", Napi::String::New(env, segment.text));

    if (cfg.detailed) {
      jsSegment.Set("lang", Napi::String::New(env, segment.language));
      jsSegment.Set("confidence", Napi::Number::New(env, segment.confidence));

      Napi::Array jsTokens = Napi::Array::New(env, segment.tokens.size());
      for (size_t t = 0; t < segment.tokens.size(); ++t) {
        const TokenData& token = segment.tokens[t];
        Napi::Object jsToken = Napi::Object::New(env);
        jsToken.Set("text", Napi::String::New(env, token.text));
        jsToken.Set("id", Napi::Number::New(env, token.id));
        jsToken.Set("p", Napi::Number::New(env, token.p));
        if (cfg.token_timestamps) {
          jsToken.Set("from", Napi::Number::New(env, token.from_ms));
          jsToken.Set("to", Napi::Number::New(env, token.to_ms));
        }
        jsTokens.Set(t, jsToken);
      }
      jsSegment.Set("tokens", jsTokens);
    }

    segments.Set(i, jsSegment);
  }
  return segments;
}
/**
 * JS entry point `full(handle, options)`: transcribes audio with the model
 * behind `handle` and returns the resulting segments.
 *
 * Audio comes from `options.audio`; when that yields no samples, the file
 * named by `options.fname_inp` is read instead. `options.n_processors`
 * (clamped to at least 1) is forwarded to whisper_full_parallel.
 *
 * @throws Napi::TypeError on malformed arguments or an invalid handle.
 * @throws Napi::Error when the model was already freed, no audio is
 *         available, the audio file cannot be read, or inference fails.
 */
Napi::Value full_transcribe(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();
  if (info.Length() < 2 || !info[1].IsObject()) {
    throw Napi::TypeError::New(env, "Expected arguments (handle, options)");
  }
  WhisperHandle* handle = unwrap_handle(info, 0);

  // Take the lock before inspecting `freed`/`ctx`, so the liveness check and
  // the later use of the context happen under the same critical section.
  std::lock_guard<std::mutex> guard(handle->mutex);
  if (handle->freed || handle->ctx == nullptr) {
    throw Napi::Error::New(env, "Model has been freed");
  }

  auto options = info[1].As<Napi::Object>();
  std::vector<float> pcmf32 = extract_audio(env, options);
  std::vector<std::vector<float>> pcmf32s;
  std::vector<std::string> files = extract_files(options);
  if (pcmf32.empty()) {
    if (files.empty()) {
      throw Napi::Error::New(env, "No audio provided (audio buffer or fname_inp required)");
    }
    if (!::read_audio_data(files[0], pcmf32, pcmf32s, false)) {
      throw Napi::Error::New(env, "Failed to read input audio file");
    }
  }

  FullParamConfig cfg = parse_full_params(env, options);
  if (cfg.language.empty()) {
    cfg.language = "auto";
  }
  // The params struct stores raw pointers into strings owned by `cfg`, which
  // stays alive until this function returns.
  cfg.params.language = cfg.language.c_str();
  cfg.params.initial_prompt = cfg.initial_prompt.empty() ? nullptr : cfg.initial_prompt.c_str();

  int n_processors = 1;
  if (options.Has("n_processors")) {
    n_processors = std::max(1, options.Get("n_processors").As<Napi::Number>().Int32Value());
  }

  const int result = whisper_full_parallel(
      handle->ctx,
      cfg.params,
      pcmf32.data(),
      static_cast<int>(pcmf32.size()),
      n_processors);
  if (result != 0) {
    throw Napi::Error::New(env, "whisper_full_parallel failed");
  }
  return build_segments(env, handle->ctx, cfg, pcmf32, pcmf32s);
}
} // namespace
/**
 * Module initialiser: publishes the addon's three entry points on `exports`
 * under the names `init`, `full` and `free`.
 */
Napi::Object InitAll(Napi::Env env, Napi::Object exports) {
  exports["init"] = Napi::Function::New(env, init_model);
  exports["full"] = Napi::Function::New(env, full_transcribe);
  exports["free"] = Napi::Function::New(env, free_model);
  return exports;
}

NODE_API_MODULE(whisper, InitAll)

View file

@ -0,0 +1,9 @@
{
"name": "@amical/whisper-node-addon",
"private": true,
"binary": {
"napi_versions": [
8
]
}
}

View file

@ -0,0 +1,296 @@
#!/usr/bin/env node
/*
* build-addon.js
* --------------------------------------------------
* Compiles the whisper.cpp Node addon (examples/addon.node) for the current
* platform/arch with acceleration flags, then places the resulting
* `whisper.node` binary in native/<target>/.
*
* NOTE: This is an initial scaffold. It expects the whisper.cpp sources to be
* vendored at `./whisper.cpp` (git submodule or manual copy). You can refine
* the build flags as needed.
*/
const { execSync } = require("child_process");
const path = require("path");
const fs = require("fs");
/**
 * Logs a shell command with the `[build-addon]` prefix and runs it
 * synchronously.
 *
 * @param {string} cmd  Command line to execute.
 * @param {object} opts Extra `execSync` options; they override the default
 *                      `stdio: "inherit"`.
 */
function run(cmd, opts = {}) {
  const execOptions = Object.assign({ stdio: "inherit" }, opts);
  console.log("[build-addon] " + cmd);
  execSync(cmd, execOptions);
}
// Layout of the package this script lives in (one level above this file).
const pkgDir = path.resolve(__dirname, "..");
const addonDir = path.join(pkgDir, "addon");
const whisperDir = path.join(pkgDir, "whisper.cpp");

// Both the addon sources and the vendored whisper.cpp checkout are required.
// Report exactly which directory is missing so the failure is actionable.
const missingSources = [addonDir, whisperDir].filter((dir) => !fs.existsSync(dir));
if (missingSources.length > 0) {
  console.error(
    "whisper.cpp sources not found. Please add them to packages/whisper-wrapper/whisper.cpp",
  );
  console.error(`[build-addon] Missing: ${missingSources.join(", ")}`);
  process.exit(1);
}

// Package-local scratch directories: build output, cmake-js cache, and the
// directory later exported as HOME for the build.
const buildDir = path.join(pkgDir, "build");
const cacheDir = path.join(pkgDir, ".cmake-js");
const homeDir = path.join(pkgDir, ".home");
for (const dir of [buildDir, cacheDir, homeDir]) {
  if (!fs.existsSync(dir)) fs.mkdirSync(dir);
}
/**
 * Locates `lib.exe` for the given target architecture.
 *
 * Candidates are collected in this order and the first one wins:
 *   1. every path printed by `where lib.exe` (run with `env`),
 *   2. locations derived from the VCToolsInstallDir, VCINSTALLDIR and
 *      VSINSTALLDIR variables of `env`,
 *   3. hard-coded Visual Studio 2022 install locations.
 *
 * @param {object} env  Environment used for the `where` lookup and for the
 *                      install-dir variables.
 * @param {string} arch Node-style architecture ("ia32", "arm64", anything
 *                      else is treated as x64).
 * @returns {string|null} Path of an existing lib.exe, or null when none was found.
 */
function resolveLibExecutable(env, arch) {
// Directory names used below: bin/<hostDir>/<archDir>/lib.exe
const archDir = arch === "ia32" ? "x86" : arch === "arm64" ? "arm64" : "x64";
const hostDir = arch === "ia32" ? "Hostx86" : "Hostx64";
const candidates = [];
// Records a path only if it exists on disk and was not recorded before.
const addIfExists = (candidate) => {
if (candidate && fs.existsSync(candidate) && !candidates.includes(candidate)) {
candidates.push(candidate);
}
};
try {
const whereOutput = execSync("where lib.exe", {
env,
stdio: ["ignore", "pipe", "ignore"],
})
.toString()
.split(/\r?\n/)
.map((line) => line.trim())
.filter(Boolean);
for (const line of whereOutput) {
addIfExists(line);
}
} catch (err) {
// ignore when lib.exe is not on PATH; fall back to manual probing
}
// Scans the sub-directories of `dir` in descending, numeric-aware name order
// and records the first one that contains bin/<hostDir>/<archDir>/lib.exe.
const probeVersionedDir = (dir) => {
if (!dir || !fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) return;
const entries = fs
.readdirSync(dir, { withFileTypes: true })
.filter((entry) => entry.isDirectory())
.map((entry) => entry.name)
.sort((a, b) => b.localeCompare(a, undefined, { numeric: true, sensitivity: "base" }));
for (const entry of entries) {
const candidate = path.join(dir, entry, "bin", hostDir, archDir, "lib.exe");
if (fs.existsSync(candidate)) {
addIfExists(candidate);
break;
}
}
};
// Accepts either a direct path to a file (recorded as-is) or an install
// directory, which is probed both directly and via its Tools/MSVC sub-tree.
const probeInstallDir = (installDir) => {
if (!installDir) return;
if (fs.existsSync(installDir) && fs.statSync(installDir).isFile()) {
addIfExists(installDir);
return;
}
const directCandidate = path.join(installDir, "bin", hostDir, archDir, "lib.exe");
addIfExists(directCandidate);
const toolsDir = path.join(installDir, "Tools", "MSVC");
probeVersionedDir(toolsDir);
};
probeInstallDir(env.VCToolsInstallDir);
probeInstallDir(env.VCINSTALLDIR);
probeInstallDir(env.VSINSTALLDIR && path.join(env.VSINSTALLDIR, "VC"));
// Last resort: fixed Visual Studio 2022 install paths.
probeVersionedDir("C:/Program Files/Microsoft Visual Studio/2022/Enterprise/VC/Tools/MSVC");
probeVersionedDir("C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC");
probeVersionedDir("C:/Program Files/Microsoft Visual Studio/2022/Professional/VC/Tools/MSVC");
probeVersionedDir("C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC");
return candidates[0] || null;
}
/**
 * On Windows, makes sure `<buildVariantDir>/node.lib` exists by generating it
 * with `lib.exe` from the `node_api.def` file of the `node-api-headers`
 * package. Does nothing on other platforms or when the file already exists.
 *
 * @param {string} buildVariantDir Directory that should contain node.lib.
 * @param {string} arch            Node-style architecture; unknown values map to X64.
 * @param {object} env             Environment used to locate and run lib.exe.
 * @throws {Error} When node-api-headers or its .def file is missing, lib.exe
 *                 cannot be located, or lib.exe fails.
 */
function ensureWindowsNodeImportLib(buildVariantDir, arch, env) {
if (process.platform !== "win32") return;
const nodeImportLib = path.join(buildVariantDir, "node.lib");
if (fs.existsSync(nodeImportLib)) return;
let headersPackageJson;
try {
// Resolve relative to this package rather than to the script's location.
headersPackageJson = require.resolve("node-api-headers/package.json", {
paths: [pkgDir],
});
} catch (err) {
throw new Error(
"node-api-headers package not found; cannot generate node.lib on Windows",
);
}
const defPath = path.join(path.dirname(headersPackageJson), "def", "node_api.def");
if (!fs.existsSync(defPath)) {
throw new Error(`node_api.def not found at ${defPath}`);
}
// Map Node's arch names to the values passed to lib.exe's /machine switch.
const machineMap = { x64: "X64", ia32: "X86", arm64: "ARM64" };
const machine = machineMap[arch] || "X64";
const libExecutable = resolveLibExecutable(env, arch);
if (!libExecutable) {
throw new Error(
"Unable to locate lib.exe. Ensure the Visual Studio Build Tools are installed and vcvarsall has been applied.",
);
}
console.log(
`[build-addon] Generating node import library using ${libExecutable} for ${machine} into ${nodeImportLib}`,
);
try {
run(`"${libExecutable}" /def:"${defPath}" /machine:${machine} /out:"${nodeImportLib}"`, {
env,
});
} catch (error) {
// Prefix the original error with a hint while keeping its type and stack.
const message =
"Failed to generate node import library. Ensure Visual Studio build tools are installed.";
if (error instanceof Error) {
error.message = `${message}\n${error.message}`;
throw error;
}
throw new Error(message);
}
}
/**
 * Turns a variant name into `{ name, env }`, where `env` holds the GGML_*
 * variables implied by the name.
 *
 * - "cpu-fallback" is returned untouched with an empty env.
 * - A name without "-" (e.g. "metal") is expanded to `<platform>-<arch>-<name>`.
 * - A full name that does not start with `platform` is skipped with a warning.
 *
 * @param {string} name     Variant name or shorthand.
 * @param {string} platform Current platform (e.g. "darwin").
 * @param {string} arch     Current architecture (e.g. "arm64").
 * @returns {{name: string, env: object}|null} null when the variant is skipped.
 */
function variantFromName(name, platform, arch) {
  if (name === "cpu-fallback") {
    return { name, env: {} };
  }

  let fullName = name;
  if (!name.includes("-")) {
    // shorthand such as "metal"
    fullName = [platform, arch, name].join("-");
  } else if (!name.startsWith(platform)) {
    console.warn(
      `[build-addon] Warning: variant '${name}' does not match current platform (${platform}), skipping.`,
    );
    return null;
  }

  // Name fragment -> environment variables it switches on.
  const flagsByFragment = [
    ["-metal", { GGML_METAL: "1", GGML_USE_ACCELERATE: "1" }],
    ["-openblas", { GGML_OPENBLAS: "1", GGML_BLAS: "1" }],
    ["-cuda", { GGML_CUDA: "1" }],
  ];
  const env = {};
  for (const [fragment, flags] of flagsByFragment) {
    if (fullName.includes(fragment)) {
      Object.assign(env, flags);
    }
  }

  // Every darwin variant gets GGML_USE_ACCELERATE.
  if (fullName.startsWith("darwin-") && !env.GGML_USE_ACCELERATE) {
    env.GGML_USE_ACCELERATE = "1";
  }
  return { name: fullName, env };
}
/**
 * Decides which variants to build.
 *
 * When the WHISPER_TARGETS environment variable lists names (comma
 * separated), exactly those are used. Otherwise the defaults are
 * `<platform>-<arch>-metal` (darwin only) followed by `<platform>-<arch>`.
 * Names rejected by variantFromName are dropped.
 *
 * @param {string} platform Current platform.
 * @param {string} arch     Current architecture.
 * @returns {Array<{name: string, env: object}>}
 */
function computeVariants(platform, arch) {
  const requested = (process.env.WHISPER_TARGETS || "")
    .split(",")
    .map((entry) => entry.trim())
    .filter(Boolean);

  let names = requested;
  if (requested.length === 0) {
    const base = `${platform}-${arch}`;
    names = platform === "darwin" ? [`${base}-metal`, base] : [base];
  }

  return names
    .map((variantName) => variantFromName(variantName, platform, arch))
    .filter(Boolean);
}
// ---------------------------------------------------------------------------
// Script body: build every selected variant and copy the resulting binary to
// native/<variant>/whisper.node.
// ---------------------------------------------------------------------------
const { platform, arch } = process;
const variants = computeVariants(platform, arch);
// computeVariants() returns an empty list when every requested variant was
// rejected; build the cpu-fallback variant in that case.
if (variants.length === 0) {
console.warn("[build-addon] No variants requested, building default cpu-fallback.");
const fallback = variantFromName("cpu-fallback", platform, arch);
if (fallback) variants.push(fallback);
}
for (const variant of variants) {
// Start each variant from an empty build directory; path separators in the
// variant name are replaced so it stays a single directory level.
const buildVariantDir = path.join(buildDir, variant.name.replace(/[\\/]/g, "_"));
fs.rmSync(buildVariantDir, { recursive: true, force: true });
fs.mkdirSync(buildVariantDir, { recursive: true });
// Environment for the build: the cache and HOME point at package-local
// directories, CMAKE_JS_NODE_DIR is two levels above the node executable,
// and the variant's own flags are spread last so they take precedence.
const env = {
...process.env,
CMAKE_JS_CACHE: cacheDir,
HOME: homeDir,
CMAKE_JS_NODE_DIR: path.resolve(process.execPath, "..", ".."),
...variant.env,
};
console.log(`[build-addon] Building variant ${variant.name}`);
ensureWindowsNodeImportLib(buildVariantDir, arch, env);
const cmakeParts = [
"npx cmake-js compile",
`-O "${buildVariantDir}"`,
"-B Release",
`-d "${addonDir}"`,
"-T whisper_node",
"--CD node_runtime=node",
];
// Forwards an environment variable to CMake as a `--CD<key>=<value>` define
// when it is set to a non-empty string.
const propagateCMakeBool = (key) => {
const value = env[key];
if (typeof value === "string" && value.length > 0) {
cmakeParts.push(`--CD${key}=${value}`);
}
};
// NOTE(review): only GGML_NATIVE is forwarded as a define; the variant flags
// (GGML_METAL, GGML_CUDA, ...) are passed via the environment only — confirm
// the addon's CMake configuration reads them from there.
propagateCMakeBool("GGML_NATIVE");
run(cmakeParts.join(" "), {
cwd: addonDir,
env,
});
const builtBinary = path.join(buildVariantDir, "Release", "whisper.node");
if (!fs.existsSync(builtBinary)) {
throw new Error(`Build succeeded but whisper.node not found for variant ${variant.name}`);
}
const targetDir = path.join(pkgDir, "native", variant.name);
fs.mkdirSync(targetDir, { recursive: true });
fs.copyFileSync(builtBinary, path.join(targetDir, "whisper.node"));
console.log(`[build-addon] copied to native/${variant.name}/whisper.node`);
if (platform === "darwin") {
// Re-sign the copied binary (`--sign -`); a signing failure only produces a
// warning and does not abort the build.
const targetBinary = path.join(targetDir, "whisper.node");
try {
run(`codesign --force --sign - "${targetBinary}"`);
console.log("[build-addon] codesigned", targetBinary);
} catch (err) {
console.warn(
`[build-addon] warning: codesign failed for ${targetBinary}: ${err.message}`,
);
}
}
// Remove intermediate build artifacts to keep the package footprint small and avoid
// extremely long CMake-generated paths that break Windows packaging tools.
fs.rmSync(buildVariantDir, { recursive: true, force: true });
}

View file

@ -0,0 +1,33 @@
{
"name": "@amical/whisper-wrapper",
"version": "0.0.0",
"private": true,
"main": "dist/index.js",
"types": "dist/index.d.ts",
"files": [
"dist",
"native",
"src",
"addon"
],
"binary": {
"napi_versions": [
8
]
},
"scripts": {
"build": "tsc -p tsconfig.json",
"postinstall": "node ./bin/build-addon.js",
"build:native": "node ./scripts/build-native.js",
"build:native:cuda": "node ./scripts/build-native.js --cuda"
},
"dependencies": {
"cmake-js": "^7.3.1",
"minimatch": "10.0.3",
"node-api-headers": "^1.5.0"
},
"devDependencies": {
"@amical/typescript-config": "workspace:*",
"typescript": "^5.8.3"
}
}

View file

@ -0,0 +1,19 @@
#!/usr/bin/env node
const { execSync } = require("node:child_process");
const path = require("node:path");
/**
 * Runs `bin/build-addon.js` from the package root with WHISPER_TARGETS set to
 * the comma-joined `targets` (an empty list yields an empty variable).
 *
 * @param {string[]} targets Variant names to request.
 */
function build(targets) {
  const packageRoot = path.join(__dirname, "..");
  const childEnv = Object.assign({}, process.env, {
    WHISPER_TARGETS: targets.join(","),
  });
  execSync("node ./bin/build-addon.js", {
    cwd: packageRoot,
    stdio: "inherit",
    env: childEnv,
  });
}
// `--cuda` requests the Windows x64 CUDA build plus the plain Windows x64
// build; without the flag no explicit targets are requested.
build(process.argv.includes("--cuda") ? ["win32-x64-cuda", "win32-x64"] : []);

View file

@ -0,0 +1,160 @@
#!/usr/bin/env node
// Quick smoke-test runner for the whisper.cpp Node addon build.
//
// Usage:
// node scripts/test-addon.js [--model=/path/to/model.bin] [--audio=/path/to/audio.wav]
//
// If no flags are provided the script picks the largest *.bin model from
// "~/Library/Application Support/amical/models" and the jfk.wav sample from
// the whisper.cpp checkout.
const fs = require("node:fs");
const os = require("node:os");
const path = require("node:path");
/**
 * Finds the first existing `native/<variant>/whisper.node` for the current
 * platform/arch, trying the metal, openblas and cuda variants before the
 * plain build and finally "cpu-fallback".
 *
 * @returns {string} Path of the binary.
 * @throws {Error} When none of the candidate directories holds a binary.
 */
function resolveBinding() {
  const { platform, arch } = process;
  const base = `${platform}-${arch}`;
  const candidates = [
    `${base}-metal`,
    `${base}-openblas`,
    `${base}-cuda`,
    base,
    "cpu-fallback",
  ];
  const nativeRoot = path.resolve(__dirname, "..", "native");

  const located = candidates
    .map((dir) => path.join(nativeRoot, dir, "whisper.node"))
    .find((bindingPath) => fs.existsSync(bindingPath));
  if (located) {
    return located;
  }

  throw new Error(
    `Unable to locate a whisper.node binary for ${platform}-${arch}. ` +
      `Expected one of: ${candidates.join(", ")}`,
  );
}
/**
 * Picks the default model: the largest `*.bin` file (extension matched
 * case-insensitively) in `~/Library/Application Support/amical/models`.
 *
 * @returns {string} Full path of the chosen model file.
 * @throws {Error} When the directory is missing or has no .bin files.
 */
function defaultModelPath() {
  const modelsDir = path.join(
    os.homedir(),
    "Library",
    "Application Support",
    "amical",
    "models",
  );
  if (!fs.existsSync(modelsDir)) {
    throw new Error(
      `Model directory not found at ${modelsDir}. Pass --model to override.`,
    );
  }

  // Single pass keeping the biggest file; on equal sizes the earlier
  // directory entry is kept.
  let largest = null;
  for (const name of fs.readdirSync(modelsDir)) {
    if (!name.toLowerCase().endsWith(".bin")) continue;
    const fullPath = path.join(modelsDir, name);
    const { size } = fs.statSync(fullPath);
    if (largest === null || size > largest.size) {
      largest = { fullPath, size };
    }
  }

  if (largest === null) {
    throw new Error(
      `No .bin model files found in ${modelsDir}. Pass --model to override.`,
    );
  }
  return largest.fullPath;
}
/**
 * Returns the path of the `jfk.wav` sample inside the whisper.cpp checkout
 * next to this script's parent directory.
 *
 * @returns {string} Absolute path of the sample.
 * @throws {Error} When the sample file does not exist.
 */
function defaultAudioPath() {
  const samplePath = path.resolve(__dirname, "..", "whisper.cpp", "samples", "jfk.wav");
  if (fs.existsSync(samplePath)) {
    return samplePath;
  }
  throw new Error(
    `Sample audio not found at ${samplePath}. Pass --audio to override.`,
  );
}
/**
 * Parses `--key=value` and `--key value` flags from `process.argv`.
 *
 * Only the first "=" separates key from value, so values may themselves
 * contain "=" (e.g. paths). Arguments that do not start with "--" and are not
 * consumed as a flag's value are ignored.
 *
 * @returns {Object<string, string>} Map from flag name to its value.
 * @throws {Error} When a flag has no (or an empty) value.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const options = {};
  for (let i = 0; i < args.length; i += 1) {
    const arg = args[i];
    if (!arg.startsWith("--")) continue;

    const body = arg.slice(2);
    const eq = body.indexOf("=");
    let key = body;
    let value;
    if (eq >= 0) {
      // Split on the first "=" only; the remainder is the value verbatim.
      key = body.slice(0, eq);
      value = body.slice(eq + 1);
    } else {
      // Space-separated form: take the next argument unless it is a flag.
      const next = args[i + 1];
      if (next !== undefined && !next.startsWith("--")) {
        value = next;
        i += 1;
      }
    }

    if (!value) {
      throw new Error(`Flag '${arg}' must be provided as --${key}=<value>`);
    }
    options[key] = value;
  }
  return options;
}
/**
 * Smoke test: loads the addon, initialises the model, transcribes one audio
 * file and prints the resulting segments. The model handle is always released
 * again, even when transcription throws.
 */
async function main() {
  const opts = parseArgs();
  const modelPath = path.resolve(opts.model || defaultModelPath());
  const audioPath = path.resolve(opts.audio || defaultAudioPath());

  const requiredFiles = [
    ["Model", modelPath],
    ["Audio", audioPath],
  ];
  for (const [label, file] of requiredFiles) {
    if (!fs.existsSync(file)) {
      throw new Error(`${label} file not found at ${file}`);
    }
  }

  const bindingPath = resolveBinding();
  console.log(`> Using addon: ${bindingPath}`);
  console.log(`> Using model: ${modelPath}`);
  console.log(`> Using audio: ${audioPath}`);

  // eslint-disable-next-line @typescript-eslint/no-var-requires
  const binding = require(bindingPath);
  const exposesApi = ["init", "full", "free"].every(
    (fn) => typeof binding[fn] === "function",
  );
  if (!exposesApi) {
    throw new Error(`Addon at ${bindingPath} does not expose init/full/free APIs.`);
  }

  // Non-numeric timestamps are printed as "?".
  const stamp = (value) => (typeof value === "number" ? value : "?");

  const handle = binding.init({ model: modelPath, gpu: true });
  try {
    const segments = binding.full(handle, {
      fname_inp: audioPath,
      language: "en",
      no_timestamps: false,
      suppress_blank: true,
      suppress_non_speech_tokens: true,
    });
    console.log("Transcription segments:\n");
    for (const segment of segments) {
      console.log(` [${stamp(segment.from)} -> ${stamp(segment.to)}] ${segment.text}`);
    }
    console.log("\nDone.");
  } finally {
    binding.free(handle);
  }
}
// Entry point: run the smoke test; on failure log the error and set exit
// code 1 without terminating the process abruptly.
main().catch((err) => {
console.error("Test run failed:", err);
process.exitCode = 1;
});

View file

@ -0,0 +1,43 @@
/* eslint-disable @typescript-eslint/no-var-requires */
import { loadBinding, getLoadedBindingInfo } from "./loader";
const binding = loadBinding();
/** Options accepted as the second argument of the `Whisper` constructor. */
export interface WhisperOptions {
/** Optional GPU preference flag. */
gpu?: boolean;
}
export { getLoadedBindingInfo } from "./loader";
/**
 * Thin object wrapper around the native whisper binding: owns one model
 * context created by `binding.init` and forwards transcription requests to
 * `binding.full`.
 */
export class Whisper {
  /** Opaque handle returned by the native `init` call. */
  private ctx: unknown;

  /**
   * Loads the model synchronously.
   *
   * @param modelPath Path of the model file passed to the native binding.
   * @param opts Optional settings. `gpu` is forwarded to the binding only
   *   when it is an actual boolean, so omitting it keeps the binding's own
   *   default.
   */
  constructor(
    private modelPath: string,
    opts?: WhisperOptions,
  ) {
    const initOptions: { model: string; gpu?: boolean } = { model: modelPath };
    if (typeof opts?.gpu === "boolean") {
      initOptions.gpu = opts.gpu;
    }
    this.ctx = binding.init(initOptions);
  }

  /** No-op: the model is already loaded by the constructor. */
  async load(): Promise<void> {
    return;
  }

  /**
   * Transcribes audio.
   *
   * @param audio Samples to transcribe; when it is not a Float32Array only
   *   `options` is passed to the binding.
   * @param options Extra parameters for the native `full` call. Keys in
   *   `options` override the `audio` argument on conflict.
   * @returns An object whose `result` promise resolves to the segments.
   */
  async transcribe(
    audio: Float32Array | null,
    options: Record<string, unknown>,
  ): Promise<{ result: Promise<Array<{ text: string }>> }> {
    const payload =
      audio instanceof Float32Array ? { audio, ...options } : options;
    const segments = binding.full(this.ctx, payload);
    return { result: Promise.resolve(segments) };
  }

  /** Releases the native model context. */
  async free(): Promise<void> {
    binding.free(this.ctx);
  }

  /** Path and type of the native binary that was loaded, if any. */
  static getBindingInfo(): { path: string; type: string } | null {
    return getLoadedBindingInfo();
  }
}

View file

@ -0,0 +1,106 @@
import path from "node:path";
import fs from "node:fs";
/** Accelerated variant tags, in the order they are tried. */
const GPU_FIRST_CANDIDATES = ["metal", "openblas", "cuda"] as const;

/**
 * Lists the `native/` sub-directory names to try for a platform/arch pair:
 * the tagged variants first, then the plain build, then "cpu-fallback".
 */
function candidateDirs(platform: string, arch: string): string[] {
  const base = `${platform}-${arch}`;
  const dirs: string[] = [];
  for (const tag of GPU_FIRST_CANDIDATES) {
    dirs.push(`${base}-${tag}`);
  }
  dirs.push(base, "cpu-fallback");
  return dirs;
}
/** Builds the path of `whisper.node` inside `../native/<dir>` relative to this module. */
function bindingPathFor(dir: string): string {
  const segments = [__dirname, "..", "native", dir, "whisper.node"];
  return path.join(...segments);
}
/**
 * Tells whether `error` is an object whose `code` property equals
 * "ERR_DLOPEN_FAILED".
 */
function isLoadableError(error: unknown): boolean {
  if (typeof error !== "object" || error === null) {
    return false;
  }
  if (!("code" in error)) {
    return false;
  }
  return (error as { code?: unknown }).code === "ERR_DLOPEN_FAILED";
}
/**
 * Returns the path of the first candidate `whisper.node` that exists on disk
 * for the current platform and architecture.
 *
 * @throws Error when no candidate exists.
 */
export function resolveBinding(): string {
  const { platform, arch } = process;
  const existing = candidateDirs(platform, arch)
    .map((dir) => bindingPathFor(dir))
    .find((candidate) => fs.existsSync(candidate));
  if (existing !== undefined) {
    return existing;
  }
  throw new Error(
    `No suitable whisper.node binary found for ${platform}-${arch}`,
  );
}
// Path and variant type of the binary that loadBinding() loaded; null until a load succeeds.
let loadedBindingInfo: { path: string; type: string } | null = null;
/** Returns the recorded info about the loaded native binary, or null if none was recorded. */
export function getLoadedBindingInfo(): { path: string; type: string } | null {
return loadedBindingInfo;
}
/**
 * Loads the native whisper binding.
 *
 * Candidates are tried in `candidateDirs` order; missing files are skipped.
 * A candidate that fails with ERR_DLOPEN_FAILED is logged and the next one
 * is tried; any other load error is rethrown immediately. On success the
 * binary's path and variant type are recorded for `getLoadedBindingInfo`.
 *
 * @throws Error when no candidate exists, or when every existing candidate
 *   failed to load (the last failure is attached as `cause`).
 */
export function loadBinding(): any {
  const { platform, arch } = process;
  const attempted: string[] = [];
  let lastLoadError: unknown = null;

  for (const dir of candidateDirs(platform, arch)) {
    const candidate = bindingPathFor(dir);
    if (!fs.existsSync(candidate)) {
      continue;
    }
    attempted.push(candidate);

    let mod: any;
    try {
      mod = require(candidate);
    } catch (error) {
      if (!isLoadableError(error)) {
        throw error;
      }
      console.warn(
        `[whisper-wrapper] failed to load ${candidate}: ${(error as Error).message}. Trying next candidate...`,
      );
      lastLoadError = error;
      continue;
    }

    if (attempted.length > 1) {
      console.warn(
        `[whisper-wrapper] loaded fallback binary: ${candidate} (attempted ${attempted.length} candidates)`,
      );
    }

    // Derive the variant type from the directory name.
    const acceleratedTag = ["cuda", "metal", "openblas"].find((tag) =>
      dir.includes(`-${tag}`),
    );
    const bindingType =
      acceleratedTag ?? (dir === "cpu-fallback" ? "cpu-fallback" : "cpu");
    loadedBindingInfo = { path: candidate, type: bindingType };
    return mod;
  }

  if (lastLoadError) {
    throw new Error(
      `Unable to load whisper.node for ${platform}-${arch}. Attempted: ${attempted.join(", ")}`,
      { cause: lastLoadError },
    );
  }
  throw new Error(
    `No suitable whisper.node binary found for ${platform}-${arch}`,
  );
}

View file

@ -0,0 +1,8 @@
{
"extends": "../typescript-config/base.json",
"compilerOptions": {
"outDir": "dist",
"rootDir": "src"
},
"include": ["src"]
}

@ -0,0 +1 @@
Subproject commit a8d002cfd879315632a579e73f0148d06959de36

2887
pnpm-lock.yaml generated

File diff suppressed because it is too large Load diff

View file

@ -2,3 +2,4 @@ packages:
- "apps/*"
- "packages/*"
- "packages/**"
- "!packages/**/whisper.cpp/**"