Transcription on mac

This commit is contained in:
2026-03-11 03:08:27 +01:00
parent 0da98bfd96
commit f0e2b60f43
10 changed files with 320 additions and 487 deletions

57
server/dist/index.js vendored
View File

@@ -1,7 +1,5 @@
import crypto from 'node:crypto';
import fs from 'node:fs';
import fsPromises from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { TextEncoder } from 'node:util';
@@ -15,7 +13,7 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe
import Fastify from 'fastify';
import { Redis } from 'ioredis';
import { z } from 'zod';
import { WhisperTranscriber } from './whisper-transcriber.js';
import { SpeechTranscriber } from './speech-transcriber.js';
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
const registerSchema = z.object({
@@ -100,11 +98,9 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'ws://192.168.1.19:8080';
const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -115,12 +111,10 @@ const webAuthnRpName = process.env.WEBAUTHN_RP_NAME ?? 'PrivateChat';
const webAuthnUserVerification = resolveWebAuthnUserVerification(process.env.WEBAUTHN_USER_VERIFICATION);
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
const whisperTranscriber = new WhisperTranscriber({
pythonExecutable: whisperPythonExecutable,
scriptPath: whisperScriptPath,
model: whisperModel,
device: whisperDevice,
computeType: whisperComputeType,
const speechTranscriber = new SpeechTranscriber({
serviceUrl: speechTranscriptionServiceUrl,
language: speechTranscriptionLanguage,
requestTimeoutMs: speechTranscriptionTimeoutMs,
}, app.log);
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
@@ -750,7 +744,7 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) {
});
}
catch (error) {
app.log.warn({ err: error, userId }, 'Whisper transcription failed');
app.log.warn({ err: error, userId }, 'Speech transcription failed');
send(socket, {
type: 'speech-transcription-error',
requestId: parsed.requestId,
@@ -1150,38 +1144,7 @@ function parseClientMessage(rawMessage) {
};
}
async function transcribeAudioPayload(requestId, audioBase64, mimeType) {
const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
const extension = audioExtensionForMimeType(mimeType);
const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
try {
await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
return await whisperTranscriber.transcribe(requestId, audioPath);
}
finally {
await fsPromises.rm(tempDirectory, { recursive: true, force: true });
}
}
function audioExtensionForMimeType(mimeType) {
switch (mimeType.toLowerCase()) {
case 'audio/webm':
case 'audio/webm;codecs=opus':
return 'webm';
case 'audio/ogg':
case 'audio/ogg;codecs=opus':
return 'ogg';
case 'audio/mp4':
case 'audio/m4a':
return 'm4a';
case 'audio/mpeg':
case 'audio/mp3':
return 'mp3';
case 'audio/wav':
case 'audio/wave':
case 'audio/x-wav':
return 'wav';
default:
return 'webm';
}
return await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
}
async function generateImageFromPrompt(prompt) {
const abortController = new AbortController();

124
server/dist/speech-transcriber.js vendored Normal file
View File

@@ -0,0 +1,124 @@
import WebSocket from 'ws';
export class SpeechTranscriber {
options;
logger;
constructor(options, logger) {
this.options = options;
this.logger = logger;
}
async transcribe(requestId, audioBase64, mimeType) {
const audio = this.normalizeAudioPayload(audioBase64, mimeType);
return await new Promise((resolve, reject) => {
let settled = false;
const socket = new WebSocket(this.options.serviceUrl);
const finish = (handler) => {
if (settled) {
return;
}
settled = true;
clearTimeout(timeout);
socket.removeAllListeners();
if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
socket.close();
}
handler();
};
const timeout = setTimeout(() => {
finish(() => {
reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`));
});
}, this.options.requestTimeoutMs);
socket.on('open', () => {
try {
socket.send(JSON.stringify({
type: 'transcribe',
id: requestId,
language: this.options.language,
audio,
}));
}
catch (error) {
finish(() => {
reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
});
}
});
socket.on('message', (payload) => {
const event = this.parseEvent(payload);
if (!event) {
return;
}
if (event.id && event.id !== requestId) {
this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
return;
}
if (event.type === 'start') {
this.logger.info({ requestId, model: event.model, language: event.language }, 'Speech transcription started');
return;
}
if (event.type === 'delta') {
return;
}
if (event.type === 'done') {
finish(() => {
resolve(event.text.trim());
});
return;
}
finish(() => {
reject(new Error(event.message));
});
});
socket.on('error', (error) => {
finish(() => {
reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
});
});
socket.on('close', (code, reasonBuffer) => {
if (settled) {
return;
}
const reason = reasonBuffer.toString().trim();
const detail = reason
? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
: `The transcription service closed the connection unexpectedly (code=${code}).`;
finish(() => {
reject(new Error(detail));
});
});
});
}
normalizeAudioPayload(audioBase64, mimeType) {
const trimmedAudio = audioBase64.trim();
if (trimmedAudio.startsWith('data:')) {
return trimmedAudio;
}
const normalizedMimeType = mimeType.trim() || 'audio/webm';
return `data:${normalizedMimeType};base64,${trimmedAudio}`;
}
parseEvent(payload) {
const message = this.rawDataToString(payload).trim();
if (!message) {
return null;
}
try {
return JSON.parse(message);
}
catch {
this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
return null;
}
}
rawDataToString(payload) {
if (typeof payload === 'string') {
return payload;
}
if (payload instanceof ArrayBuffer) {
return Buffer.from(payload).toString('utf8');
}
if (Array.isArray(payload)) {
return Buffer.concat(payload).toString('utf8');
}
return payload.toString('utf8');
}
}

View File

@@ -1,121 +0,0 @@
import { spawn } from 'node:child_process';
import { createInterface } from 'node:readline';
export class WhisperTranscriber {
options;
logger;
worker = null;
readyPromise = null;
resolveReady = null;
rejectReady = null;
pendingRequests = new Map();
constructor(options, logger) {
this.options = options;
this.logger = logger;
}
async transcribe(requestId, audioPath) {
await this.ensureWorker();
if (!this.worker || this.worker.stdin.destroyed) {
throw new Error('The Whisper worker is not available.');
}
return new Promise((resolve, reject) => {
this.pendingRequests.set(requestId, { resolve, reject });
try {
this.worker?.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`);
}
catch (error) {
this.pendingRequests.delete(requestId);
reject(error);
}
});
}
async ensureWorker() {
if (this.readyPromise) {
return this.readyPromise;
}
this.worker = spawn(this.options.pythonExecutable, [
this.options.scriptPath,
'--model',
this.options.model,
'--device',
this.options.device,
'--compute-type',
this.options.computeType,
], { stdio: ['pipe', 'pipe', 'pipe'] });
this.readyPromise = new Promise((resolve, reject) => {
this.resolveReady = resolve;
this.rejectReady = reject;
});
const stdout = createInterface({ input: this.worker.stdout });
stdout.on('line', (line) => {
this.handleWorkerLine(line);
});
this.worker.stderr.on('data', (chunk) => {
const message = chunk.toString().trim();
if (message) {
this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
}
});
this.worker.on('error', (error) => {
this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.'));
});
this.worker.on('exit', (code, signal) => {
this.failWorker(new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`));
});
return this.readyPromise;
}
handleWorkerLine(line) {
let payload;
try {
payload = JSON.parse(line);
}
catch {
this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
return;
}
if (payload.type === 'ready') {
this.logger.info({ model: payload.model }, 'Whisper worker ready');
this.resolveReady?.();
this.resolveReady = null;
this.rejectReady = null;
return;
}
if (payload.type === 'fatal') {
this.failWorker(new Error(payload.message));
return;
}
if (payload.type === 'error') {
if (!payload.requestId) {
this.failWorker(new Error(payload.message));
return;
}
const pendingRequest = this.pendingRequests.get(payload.requestId);
if (!pendingRequest) {
return;
}
this.pendingRequests.delete(payload.requestId);
pendingRequest.reject(new Error(payload.message));
return;
}
const pendingRequest = this.pendingRequests.get(payload.requestId);
if (!pendingRequest) {
return;
}
this.pendingRequests.delete(payload.requestId);
pendingRequest.resolve(payload.text.trim());
}
failWorker(error) {
if (this.worker) {
this.worker.removeAllListeners();
this.worker = null;
}
this.rejectReady?.(error);
this.resolveReady = null;
this.rejectReady = null;
this.readyPromise = null;
for (const { reject } of this.pendingRequests.values()) {
reject(error);
}
this.pendingRequests.clear();
this.logger.error({ err: error }, 'Whisper worker failed');
}
}

View File

@@ -16,6 +16,7 @@
"dotenv": "^17.3.1",
"fastify": "^5.8.2",
"ioredis": "^5.10.0",
"ws": "^8.19.0",
"zod": "^4.3.6"
},
"devDependencies": {

View File

@@ -6,8 +6,7 @@
"scripts": {
"dev": "node node_modules/tsx/dist/cli.mjs watch src/index.ts",
"build": "node node_modules/typescript/bin/tsc -p tsconfig.json",
"start": "node dist/index.js",
"setup-whisper": "python3 -m pip install -r requirements-whisper.txt"
"start": "node dist/index.js"
},
"dependencies": {
"@fastify/cors": "^11.2.0",
@@ -18,6 +17,7 @@
"dotenv": "^17.3.1",
"fastify": "^5.8.2",
"ioredis": "^5.10.0",
"ws": "^8.19.0",
"zod": "^4.3.6"
},
"devDependencies": {

View File

@@ -1 +0,0 @@
faster-whisper>=1.0.0

View File

@@ -1,92 +0,0 @@
#!/usr/bin/env python3
import argparse
import json
import sys
def emit(payload):
print(json.dumps(payload), flush=True)
def load_model(model_name, device, compute_type):
try:
from faster_whisper import WhisperModel
except Exception as exc:
emit(
{
"type": "fatal",
"message": "faster-whisper is not installed. Run `python3 -m pip install -r server/requirements-whisper.txt`.",
}
)
raise SystemExit(1) from exc
try:
return WhisperModel(model_name, device=device, compute_type=compute_type)
except Exception as exc:
emit(
{
"type": "fatal",
"message": f"Could not load the faster-whisper model '{model_name}': {exc}",
}
)
raise SystemExit(1) from exc
def transcribe(model, request_id, audio_path):
try:
segments, _ = model.transcribe(audio_path, vad_filter=True, beam_size=5)
text = "".join(segment.text for segment in segments).strip()
emit({"type": "result", "requestId": request_id, "text": text})
except Exception as exc:
emit(
{
"type": "error",
"requestId": request_id,
"message": f"Whisper transcription failed: {exc}",
}
)
def main():
parser = argparse.ArgumentParser(description="Persistent faster-whisper transcription worker")
parser.add_argument("--model", default="small")
parser.add_argument("--device", default="cpu")
parser.add_argument("--compute-type", default="int8")
args = parser.parse_args()
model = load_model(args.model, args.device, args.compute_type)
emit({"type": "ready", "model": args.model})
for raw_line in sys.stdin:
line = raw_line.strip()
if not line:
continue
try:
payload = json.loads(line)
except Exception as exc:
emit({"type": "error", "message": f"Invalid request JSON: {exc}"})
continue
request_id = payload.get("requestId")
audio_path = payload.get("audioPath")
if not request_id or not audio_path:
emit(
{
"type": "error",
"requestId": request_id,
"message": "Missing requestId or audioPath.",
}
)
continue
transcribe(model, request_id, audio_path)
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -1,7 +1,5 @@
import crypto from 'node:crypto';
import fs from 'node:fs';
import fsPromises from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { TextEncoder } from 'node:util';
@@ -25,7 +23,7 @@ import { Redis } from 'ioredis';
import type WebSocket from 'ws';
import { z } from 'zod';
import { WhisperTranscriber } from './whisper-transcriber.js';
import { SpeechTranscriber } from './speech-transcriber.js';
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
@@ -333,11 +331,9 @@ const frontendDistPath = resolveProjectPath(
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'ws://192.168.1.19:8080';
const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -351,13 +347,11 @@ const webAuthnUserVerification = resolveWebAuthnUserVerification(
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
const whisperTranscriber = new WhisperTranscriber(
const speechTranscriber = new SpeechTranscriber(
{
pythonExecutable: whisperPythonExecutable,
scriptPath: whisperScriptPath,
model: whisperModel,
device: whisperDevice,
computeType: whisperComputeType,
serviceUrl: speechTranscriptionServiceUrl,
language: speechTranscriptionLanguage,
requestTimeoutMs: speechTranscriptionTimeoutMs,
},
app.log,
);
@@ -1179,7 +1173,7 @@ async function handleSocketMessage(
text,
});
} catch (error) {
app.log.warn({ err: error, userId }, 'Whisper transcription failed');
app.log.warn({ err: error, userId }, 'Speech transcription failed');
send(socket, {
type: 'speech-transcription-error',
requestId: parsed.requestId,
@@ -1748,39 +1742,7 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
}
async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
const extension = audioExtensionForMimeType(mimeType);
const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
try {
await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
return await whisperTranscriber.transcribe(requestId, audioPath);
} finally {
await fsPromises.rm(tempDirectory, { recursive: true, force: true });
}
}
function audioExtensionForMimeType(mimeType: string): string {
switch (mimeType.toLowerCase()) {
case 'audio/webm':
case 'audio/webm;codecs=opus':
return 'webm';
case 'audio/ogg':
case 'audio/ogg;codecs=opus':
return 'ogg';
case 'audio/mp4':
case 'audio/m4a':
return 'm4a';
case 'audio/mpeg':
case 'audio/mp3':
return 'mp3';
case 'audio/wav':
case 'audio/wave':
case 'audio/x-wav':
return 'wav';
default:
return 'webm';
}
return await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
}
async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {

View File

@@ -0,0 +1,173 @@
import WebSocket, { type RawData } from 'ws';
type LoggerLike = {
info: (payload: unknown, message?: string) => void;
warn: (payload: unknown, message?: string) => void;
error: (payload: unknown, message?: string) => void;
};
type SpeechTranscriberOptions = {
serviceUrl: string;
language: string;
requestTimeoutMs: number;
};
type ServiceEvent =
| { type: 'start'; id: string; model: string; language: string }
| { type: 'delta'; id: string; text: string; fullText: string }
| { type: 'done'; id: string; text: string }
| { type: 'error'; id?: string; message: string };
export class SpeechTranscriber {
constructor(
private readonly options: SpeechTranscriberOptions,
private readonly logger: LoggerLike,
) {}
async transcribe(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
const audio = this.normalizeAudioPayload(audioBase64, mimeType);
return await new Promise<string>((resolve, reject) => {
let settled = false;
const socket = new WebSocket(this.options.serviceUrl);
const finish = (handler: () => void): void => {
if (settled) {
return;
}
settled = true;
clearTimeout(timeout);
socket.removeAllListeners();
if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
socket.close();
}
handler();
};
const timeout = setTimeout(() => {
finish(() => {
reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`));
});
}, this.options.requestTimeoutMs);
socket.on('open', () => {
try {
socket.send(
JSON.stringify({
type: 'transcribe',
id: requestId,
language: this.options.language,
audio,
}),
);
} catch (error) {
finish(() => {
reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
});
}
});
socket.on('message', (payload) => {
const event = this.parseEvent(payload);
if (!event) {
return;
}
if (event.id && event.id !== requestId) {
this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
return;
}
if (event.type === 'start') {
this.logger.info(
{ requestId, model: event.model, language: event.language },
'Speech transcription started',
);
return;
}
if (event.type === 'delta') {
return;
}
if (event.type === 'done') {
finish(() => {
resolve(event.text.trim());
});
return;
}
finish(() => {
reject(new Error(event.message));
});
});
socket.on('error', (error) => {
finish(() => {
reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
});
});
socket.on('close', (code, reasonBuffer) => {
if (settled) {
return;
}
const reason = reasonBuffer.toString().trim();
const detail = reason
? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
: `The transcription service closed the connection unexpectedly (code=${code}).`;
finish(() => {
reject(new Error(detail));
});
});
});
}
private normalizeAudioPayload(audioBase64: string, mimeType: string): string {
const trimmedAudio = audioBase64.trim();
if (trimmedAudio.startsWith('data:')) {
return trimmedAudio;
}
const normalizedMimeType = mimeType.trim() || 'audio/webm';
return `data:${normalizedMimeType};base64,${trimmedAudio}`;
}
private parseEvent(payload: RawData): ServiceEvent | null {
const message = this.rawDataToString(payload).trim();
if (!message) {
return null;
}
try {
return JSON.parse(message) as ServiceEvent;
} catch {
this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
return null;
}
}
private rawDataToString(payload: RawData): string {
if (typeof payload === 'string') {
return payload;
}
if (payload instanceof ArrayBuffer) {
return Buffer.from(payload).toString('utf8');
}
if (Array.isArray(payload)) {
return Buffer.concat(payload).toString('utf8');
}
return payload.toString('utf8');
}
}

View File

@@ -1,176 +0,0 @@
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
import { createInterface } from 'node:readline';
type LoggerLike = {
info: (payload: unknown, message?: string) => void;
warn: (payload: unknown, message?: string) => void;
error: (payload: unknown, message?: string) => void;
};
type WhisperTranscriberOptions = {
pythonExecutable: string;
scriptPath: string;
model: string;
device: string;
computeType: string;
};
type WorkerEvent =
| { type: 'ready'; model: string }
| { type: 'result'; requestId: string; text: string }
| { type: 'error'; requestId?: string; message: string }
| { type: 'fatal'; message: string };
export class WhisperTranscriber {
private worker: ChildProcessWithoutNullStreams | null = null;
private readyPromise: Promise<void> | null = null;
private resolveReady: (() => void) | null = null;
private rejectReady: ((reason?: unknown) => void) | null = null;
private readonly pendingRequests = new Map<
string,
{ resolve: (text: string) => void; reject: (reason?: unknown) => void }
>();
constructor(
private readonly options: WhisperTranscriberOptions,
private readonly logger: LoggerLike,
) {}
async transcribe(requestId: string, audioPath: string): Promise<string> {
await this.ensureWorker();
if (!this.worker || this.worker.stdin.destroyed) {
throw new Error('The Whisper worker is not available.');
}
return new Promise<string>((resolve, reject) => {
this.pendingRequests.set(requestId, { resolve, reject });
try {
this.worker?.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`);
} catch (error) {
this.pendingRequests.delete(requestId);
reject(error);
}
});
}
private async ensureWorker(): Promise<void> {
if (this.readyPromise) {
return this.readyPromise;
}
this.worker = spawn(
this.options.pythonExecutable,
[
this.options.scriptPath,
'--model',
this.options.model,
'--device',
this.options.device,
'--compute-type',
this.options.computeType,
],
{ stdio: ['pipe', 'pipe', 'pipe'] },
);
this.readyPromise = new Promise<void>((resolve, reject) => {
this.resolveReady = resolve;
this.rejectReady = reject;
});
const stdout = createInterface({ input: this.worker.stdout });
stdout.on('line', (line) => {
this.handleWorkerLine(line);
});
this.worker.stderr.on('data', (chunk) => {
const message = chunk.toString().trim();
if (message) {
this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
}
});
this.worker.on('error', (error) => {
this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.'));
});
this.worker.on('exit', (code, signal) => {
this.failWorker(
new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`),
);
});
return this.readyPromise;
}
private handleWorkerLine(line: string): void {
let payload: WorkerEvent;
try {
payload = JSON.parse(line) as WorkerEvent;
} catch {
this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
return;
}
if (payload.type === 'ready') {
this.logger.info({ model: payload.model }, 'Whisper worker ready');
this.resolveReady?.();
this.resolveReady = null;
this.rejectReady = null;
return;
}
if (payload.type === 'fatal') {
this.failWorker(new Error(payload.message));
return;
}
if (payload.type === 'error') {
if (!payload.requestId) {
this.failWorker(new Error(payload.message));
return;
}
const pendingRequest = this.pendingRequests.get(payload.requestId);
if (!pendingRequest) {
return;
}
this.pendingRequests.delete(payload.requestId);
pendingRequest.reject(new Error(payload.message));
return;
}
const pendingRequest = this.pendingRequests.get(payload.requestId);
if (!pendingRequest) {
return;
}
this.pendingRequests.delete(payload.requestId);
pendingRequest.resolve(payload.text.trim());
}
private failWorker(error: Error): void {
if (this.worker) {
this.worker.removeAllListeners();
this.worker = null;
}
this.rejectReady?.(error);
this.resolveReady = null;
this.rejectReady = null;
this.readyPromise = null;
for (const { reject } of this.pendingRequests.values()) {
reject(error);
}
this.pendingRequests.clear();
this.logger.error({ err: error }, 'Whisper worker failed');
}
}