From f0e2b60f43852637745f43b1fd13540ec9f484af Mon Sep 17 00:00:00 2001 From: Laurent Dubertrand Date: Wed, 11 Mar 2026 03:08:27 +0100 Subject: [PATCH] Transcription on mac --- server/dist/index.js | 57 ++------- server/dist/speech-transcriber.js | 124 +++++++++++++++++++ server/dist/whisper-transcriber.js | 121 ------------------ server/package-lock.json | 1 + server/package.json | 4 +- server/requirements-whisper.txt | 1 - server/scripts/transcribe_whisper.py | 92 -------------- server/src/index.ts | 58 ++------- server/src/speech-transcriber.ts | 173 ++++++++++++++++++++++++++ server/src/whisper-transcriber.ts | 176 --------------------------- 10 files changed, 320 insertions(+), 487 deletions(-) create mode 100644 server/dist/speech-transcriber.js delete mode 100644 server/dist/whisper-transcriber.js delete mode 100644 server/requirements-whisper.txt delete mode 100644 server/scripts/transcribe_whisper.py create mode 100644 server/src/speech-transcriber.ts delete mode 100644 server/src/whisper-transcriber.ts diff --git a/server/dist/index.js b/server/dist/index.js index cd13c26..11b9a12 100644 --- a/server/dist/index.js +++ b/server/dist/index.js @@ -1,7 +1,5 @@ import crypto from 'node:crypto'; import fs from 'node:fs'; -import fsPromises from 'node:fs/promises'; -import os from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { TextEncoder } from 'node:util'; @@ -15,7 +13,7 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe import Fastify from 'fastify'; import { Redis } from 'ioredis'; import { z } from 'zod'; -import { WhisperTranscriber } from './whisper-transcriber.js'; +import { SpeechTranscriber } from './speech-transcriber.js'; dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) }); const projectRootPath = fileURLToPath(new URL('../../', import.meta.url)); const registerSchema = z.object({ @@ -100,11 +98,9 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, ''); const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest'; const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024'; -const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3'; -const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small'; -const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu'; -const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8'; -const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py'); +const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'ws://192.168.1.19:8080'; +const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto'; +const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000); const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12); const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60); const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN); @@ -115,12 +111,10 @@ const webAuthnRpName = process.env.WEBAUTHN_RP_NAME ?? 'PrivateChat'; const webAuthnUserVerification = resolveWebAuthnUserVerification(process.env.WEBAUTHN_USER_VERIFICATION); const frontendIndexPath = path.join(frontendDistPath, 'index.html'); const hasFrontendBuild = fs.existsSync(frontendIndexPath); -const whisperTranscriber = new WhisperTranscriber({ - pythonExecutable: whisperPythonExecutable, - scriptPath: whisperScriptPath, - model: whisperModel, - device: whisperDevice, - computeType: whisperComputeType, +const speechTranscriber = new SpeechTranscriber({ + serviceUrl: speechTranscriptionServiceUrl, + language: speechTranscriptionLanguage, + requestTimeoutMs: speechTranscriptionTimeoutMs, }, app.log); fs.mkdirSync(path.dirname(sqlitePath), { recursive: true }); fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true }); @@ -750,7 +744,7 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) { }); } catch (error) { - app.log.warn({ err: error, userId }, 'Whisper transcription failed'); + app.log.warn({ err: error, userId }, 'Speech transcription failed'); send(socket, { type: 'speech-transcription-error', requestId: parsed.requestId, @@ -1150,38 +1144,7 @@ function parseClientMessage(rawMessage) { }; } async function transcribeAudioPayload(requestId, audioBase64, mimeType) { - const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-')); - const extension = audioExtensionForMimeType(mimeType); - const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`); - try { - await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64')); - return await whisperTranscriber.transcribe(requestId, audioPath); - } - finally { - await fsPromises.rm(tempDirectory, { recursive: true, force: true }); - } -} -function audioExtensionForMimeType(mimeType) { - switch (mimeType.toLowerCase()) { - case 'audio/webm': - case 'audio/webm;codecs=opus': - return 'webm'; - case 'audio/ogg': - case 'audio/ogg;codecs=opus': - return 'ogg'; - case 'audio/mp4': - case 'audio/m4a': - return 'm4a'; - case 'audio/mpeg': - case 'audio/mp3': - return 'mp3'; - case 'audio/wav': - case 'audio/wave': - case 'audio/x-wav': - return 'wav'; - default: - return 'webm'; - } + return await speechTranscriber.transcribe(requestId, audioBase64, mimeType); } async function generateImageFromPrompt(prompt) { const abortController = new AbortController(); diff --git a/server/dist/speech-transcriber.js b/server/dist/speech-transcriber.js new file mode 100644 index 0000000..b75fe0b --- /dev/null +++ b/server/dist/speech-transcriber.js @@ -0,0 +1,124 @@ +import WebSocket from 'ws'; +export class SpeechTranscriber { + options; + logger; + constructor(options, logger) { + this.options = options; + this.logger = logger; + } + async transcribe(requestId, audioBase64, mimeType) { + const audio = this.normalizeAudioPayload(audioBase64, mimeType); + return await new Promise((resolve, reject) => { + let settled = false; + const socket = new WebSocket(this.options.serviceUrl); + const finish = (handler) => { + if (settled) { + return; + } + settled = true; + clearTimeout(timeout); + socket.removeAllListeners(); + if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) { + socket.close(); + } + handler(); + }; + const timeout = setTimeout(() => { + finish(() => { + reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`)); + }); + }, this.options.requestTimeoutMs); + socket.on('open', () => { + try { + socket.send(JSON.stringify({ + type: 'transcribe', + id: requestId, + language: this.options.language, + audio, + })); + } + catch (error) { + finish(() => { + reject(error instanceof Error ? error : new Error('Could not send transcription request.')); + }); + } + }); + socket.on('message', (payload) => { + const event = this.parseEvent(payload); + if (!event) { + return; + } + if (event.id && event.id !== requestId) { + this.logger.warn({ requestId, event }, 'Ignored transcription event for another request'); + return; + } + if (event.type === 'start') { + this.logger.info({ requestId, model: event.model, language: event.language }, 'Speech transcription started'); + return; + } + if (event.type === 'delta') { + return; + } + if (event.type === 'done') { + finish(() => { + resolve(event.text.trim()); + }); + return; + } + finish(() => { + reject(new Error(event.message)); + }); + }); + socket.on('error', (error) => { + finish(() => { + reject(error instanceof Error ? error : new Error('The transcription service connection failed.')); + }); + }); + socket.on('close', (code, reasonBuffer) => { + if (settled) { + return; + } + const reason = reasonBuffer.toString().trim(); + const detail = reason + ? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).` + : `The transcription service closed the connection unexpectedly (code=${code}).`; + finish(() => { + reject(new Error(detail)); + }); + }); + }); + } + normalizeAudioPayload(audioBase64, mimeType) { + const trimmedAudio = audioBase64.trim(); + if (trimmedAudio.startsWith('data:')) { + return trimmedAudio; + } + const normalizedMimeType = mimeType.trim() || 'audio/webm'; + return `data:${normalizedMimeType};base64,${trimmedAudio}`; + } + parseEvent(payload) { + const message = this.rawDataToString(payload).trim(); + if (!message) { + return null; + } + try { + return JSON.parse(message); + } + catch { + this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event'); + return null; + } + } + rawDataToString(payload) { + if (typeof payload === 'string') { + return payload; + } + if (payload instanceof ArrayBuffer) { + return Buffer.from(payload).toString('utf8'); + } + if (Array.isArray(payload)) { + return Buffer.concat(payload).toString('utf8'); + } + return payload.toString('utf8'); + } +} diff --git a/server/dist/whisper-transcriber.js b/server/dist/whisper-transcriber.js deleted file mode 100644 index 6f145e6..0000000 --- a/server/dist/whisper-transcriber.js +++ /dev/null @@ -1,121 +0,0 @@ -import { spawn } from 'node:child_process'; -import { createInterface } from 'node:readline'; -export class WhisperTranscriber { - options; - logger; - worker = null; - readyPromise = null; - resolveReady = null; - rejectReady = null; - pendingRequests = new Map(); - constructor(options, logger) { - this.options = options; - this.logger = logger; - } - async transcribe(requestId, audioPath) { - await this.ensureWorker(); - if (!this.worker || this.worker.stdin.destroyed) { - throw new Error('The Whisper worker is not available.'); - } - return new Promise((resolve, reject) => { - this.pendingRequests.set(requestId, { resolve, reject }); - try { - this.worker?.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`); - } - catch (error) { - this.pendingRequests.delete(requestId); - reject(error); - } - }); - } - async ensureWorker() { - if (this.readyPromise) { - return this.readyPromise; - } - this.worker = spawn(this.options.pythonExecutable, [ - this.options.scriptPath, - '--model', - this.options.model, - '--device', - this.options.device, - '--compute-type', - this.options.computeType, - ], { stdio: ['pipe', 'pipe', 'pipe'] }); - this.readyPromise = new Promise((resolve, reject) => { - this.resolveReady = resolve; - this.rejectReady = reject; - }); - const stdout = createInterface({ input: this.worker.stdout }); - stdout.on('line', (line) => { - this.handleWorkerLine(line); - }); - this.worker.stderr.on('data', (chunk) => { - const message = chunk.toString().trim(); - if (message) { - this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr'); - } - }); - this.worker.on('error', (error) => { - this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.')); - }); - this.worker.on('exit', (code, signal) => { - this.failWorker(new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`)); - }); - return this.readyPromise; - } - handleWorkerLine(line) { - let payload; - try { - payload = JSON.parse(line); - } - catch { - this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output'); - return; - } - if (payload.type === 'ready') { - this.logger.info({ model: payload.model }, 'Whisper worker ready'); - this.resolveReady?.(); - this.resolveReady = null; - this.rejectReady = null; - return; - } - if (payload.type === 'fatal') { - this.failWorker(new Error(payload.message)); - return; - } - if (payload.type === 'error') { - if (!payload.requestId) { - this.failWorker(new Error(payload.message)); - return; - } - const pendingRequest = this.pendingRequests.get(payload.requestId); - if (!pendingRequest) { - return; - } - this.pendingRequests.delete(payload.requestId); - pendingRequest.reject(new Error(payload.message)); - return; - } - const pendingRequest = this.pendingRequests.get(payload.requestId); - if (!pendingRequest) { - return; - } - this.pendingRequests.delete(payload.requestId); - pendingRequest.resolve(payload.text.trim()); - } - failWorker(error) { - if (this.worker) { - this.worker.removeAllListeners(); - this.worker = null; - } - this.rejectReady?.(error); - this.resolveReady = null; - this.rejectReady = null; - this.readyPromise = null; - for (const { reject } of this.pendingRequests.values()) { - reject(error); - } - this.pendingRequests.clear(); - this.logger.error({ err: error }, 'Whisper worker failed'); - } -} diff --git a/server/package-lock.json b/server/package-lock.json index d9af003..06af055 100644 --- a/server/package-lock.json +++ b/server/package-lock.json @@ -16,6 +16,7 @@ "dotenv": "^17.3.1", "fastify": "^5.8.2", "ioredis": "^5.10.0", + "ws": "^8.19.0", "zod": "^4.3.6" }, "devDependencies": { diff --git a/server/package.json b/server/package.json index 7e513de..3d8df28 100644 --- a/server/package.json +++ b/server/package.json @@ -6,8 +6,7 @@ "scripts": { "dev": "node node_modules/tsx/dist/cli.mjs watch src/index.ts", "build": "node node_modules/typescript/bin/tsc -p tsconfig.json", - "start": "node dist/index.js", - "setup-whisper": "python3 -m pip install -r requirements-whisper.txt" + "start": "node dist/index.js" }, "dependencies": { "@fastify/cors": "^11.2.0", @@ -18,6 +17,7 @@ "dotenv": "^17.3.1", "fastify": "^5.8.2", "ioredis": "^5.10.0", + "ws": "^8.19.0", "zod": "^4.3.6" }, "devDependencies": { diff --git a/server/requirements-whisper.txt b/server/requirements-whisper.txt deleted file mode 100644 index 144536a..0000000 --- a/server/requirements-whisper.txt +++ /dev/null @@ -1 +0,0 @@ -faster-whisper>=1.0.0 diff --git a/server/scripts/transcribe_whisper.py b/server/scripts/transcribe_whisper.py deleted file mode 100644 index a1c8db8..0000000 --- a/server/scripts/transcribe_whisper.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import json -import sys - - -def emit(payload): - print(json.dumps(payload), flush=True) - - -def load_model(model_name, device, compute_type): - try: - from faster_whisper import WhisperModel - except Exception as exc: - emit( - { - "type": "fatal", - "message": "faster-whisper is not installed. Run `python3 -m pip install -r server/requirements-whisper.txt`.", - } - ) - raise SystemExit(1) from exc - - try: - return WhisperModel(model_name, device=device, compute_type=compute_type) - except Exception as exc: - emit( - { - "type": "fatal", - "message": f"Could not load the faster-whisper model '{model_name}': {exc}", - } - ) - raise SystemExit(1) from exc - - -def transcribe(model, request_id, audio_path): - try: - segments, _ = model.transcribe(audio_path, vad_filter=True, beam_size=5) - text = "".join(segment.text for segment in segments).strip() - emit({"type": "result", "requestId": request_id, "text": text}) - except Exception as exc: - emit( - { - "type": "error", - "requestId": request_id, - "message": f"Whisper transcription failed: {exc}", - } - ) - - -def main(): - parser = argparse.ArgumentParser(description="Persistent faster-whisper transcription worker") - parser.add_argument("--model", default="small") - parser.add_argument("--device", default="cpu") - parser.add_argument("--compute-type", default="int8") - args = parser.parse_args() - - model = load_model(args.model, args.device, args.compute_type) - emit({"type": "ready", "model": args.model}) - - for raw_line in sys.stdin: - line = raw_line.strip() - - if not line: - continue - - try: - payload = json.loads(line) - except Exception as exc: - emit({"type": "error", "message": f"Invalid request JSON: {exc}"}) - continue - - request_id = payload.get("requestId") - audio_path = payload.get("audioPath") - - if not request_id or not audio_path: - emit( - { - "type": "error", - "requestId": request_id, - "message": "Missing requestId or audioPath.", - } - ) - continue - - transcribe(model, request_id, audio_path) - - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/server/src/index.ts b/server/src/index.ts index 4e19e56..208343f 100644 --- a/server/src/index.ts +++ b/server/src/index.ts @@ -1,7 +1,5 @@ import crypto from 'node:crypto'; import fs from 'node:fs'; -import fsPromises from 'node:fs/promises'; -import os from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { TextEncoder } from 'node:util'; @@ -25,7 +23,7 @@ import { Redis } from 'ioredis'; import type WebSocket from 'ws'; import { z } from 'zod'; -import { WhisperTranscriber } from './whisper-transcriber.js'; +import { SpeechTranscriber } from './speech-transcriber.js'; dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) }); @@ -333,11 +331,9 @@ const frontendDistPath = resolveProjectPath( const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, ''); const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest'; const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024'; -const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3'; -const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small'; -const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu'; -const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8'; -const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py'); +const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'ws://192.168.1.19:8080'; +const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto'; +const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000); const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12); const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60); const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN); @@ -351,13 +347,11 @@ const webAuthnUserVerification = resolveWebAuthnUserVerification( const frontendIndexPath = path.join(frontendDistPath, 'index.html'); const hasFrontendBuild = fs.existsSync(frontendIndexPath); -const whisperTranscriber = new WhisperTranscriber( +const speechTranscriber = new SpeechTranscriber( { - pythonExecutable: whisperPythonExecutable, - scriptPath: whisperScriptPath, - model: whisperModel, - device: whisperDevice, - computeType: whisperComputeType, + serviceUrl: speechTranscriptionServiceUrl, + language: speechTranscriptionLanguage, + requestTimeoutMs: speechTranscriptionTimeoutMs, }, app.log, ); @@ -1179,7 +1173,7 @@ async function handleSocketMessage( text, }); } catch (error) { - app.log.warn({ err: error, userId }, 'Whisper transcription failed'); + app.log.warn({ err: error, userId }, 'Speech transcription failed'); send(socket, { type: 'speech-transcription-error', requestId: parsed.requestId, @@ -1748,39 +1742,7 @@ function parseClientMessage(rawMessage: string): ClientMessage | null { } async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise { - const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-')); - const extension = audioExtensionForMimeType(mimeType); - const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`); - - try { - await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64')); - return await whisperTranscriber.transcribe(requestId, audioPath); - } finally { - await fsPromises.rm(tempDirectory, { recursive: true, force: true }); - } -} - -function audioExtensionForMimeType(mimeType: string): string { - switch (mimeType.toLowerCase()) { - case 'audio/webm': - case 'audio/webm;codecs=opus': - return 'webm'; - case 'audio/ogg': - case 'audio/ogg;codecs=opus': - return 'ogg'; - case 'audio/mp4': - case 'audio/m4a': - return 'm4a'; - case 'audio/mpeg': - case 'audio/mp3': - return 'mp3'; - case 'audio/wav': - case 'audio/wave': - case 'audio/x-wav': - return 'wav'; - default: - return 'webm'; - } + return await speechTranscriber.transcribe(requestId, audioBase64, mimeType); } async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> { diff --git a/server/src/speech-transcriber.ts b/server/src/speech-transcriber.ts new file mode 100644 index 0000000..6b8148f --- /dev/null +++ b/server/src/speech-transcriber.ts @@ -0,0 +1,173 @@ +import WebSocket, { type RawData } from 'ws'; + +type LoggerLike = { + info: (payload: unknown, message?: string) => void; + warn: (payload: unknown, message?: string) => void; + error: (payload: unknown, message?: string) => void; +}; + +type SpeechTranscriberOptions = { + serviceUrl: string; + language: string; + requestTimeoutMs: number; +}; + +type ServiceEvent = + | { type: 'start'; id: string; model: string; language: string } + | { type: 'delta'; id: string; text: string; fullText: string } + | { type: 'done'; id: string; text: string } + | { type: 'error'; id?: string; message: string }; + +export class SpeechTranscriber { + constructor( + private readonly options: SpeechTranscriberOptions, + private readonly logger: LoggerLike, + ) {} + + async transcribe(requestId: string, audioBase64: string, mimeType: string): Promise { + const audio = this.normalizeAudioPayload(audioBase64, mimeType); + + return await new Promise((resolve, reject) => { + let settled = false; + const socket = new WebSocket(this.options.serviceUrl); + + const finish = (handler: () => void): void => { + if (settled) { + return; + } + + settled = true; + clearTimeout(timeout); + socket.removeAllListeners(); + + if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) { + socket.close(); + } + + handler(); + }; + + const timeout = setTimeout(() => { + finish(() => { + reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`)); + }); + }, this.options.requestTimeoutMs); + + socket.on('open', () => { + try { + socket.send( + JSON.stringify({ + type: 'transcribe', + id: requestId, + language: this.options.language, + audio, + }), + ); + } catch (error) { + finish(() => { + reject(error instanceof Error ? error : new Error('Could not send transcription request.')); + }); + } + }); + + socket.on('message', (payload) => { + const event = this.parseEvent(payload); + + if (!event) { + return; + } + + if (event.id && event.id !== requestId) { + this.logger.warn({ requestId, event }, 'Ignored transcription event for another request'); + return; + } + + if (event.type === 'start') { + this.logger.info( + { requestId, model: event.model, language: event.language }, + 'Speech transcription started', + ); + return; + } + + if (event.type === 'delta') { + return; + } + + if (event.type === 'done') { + finish(() => { + resolve(event.text.trim()); + }); + return; + } + + finish(() => { + reject(new Error(event.message)); + }); + }); + + socket.on('error', (error) => { + finish(() => { + reject(error instanceof Error ? error : new Error('The transcription service connection failed.')); + }); + }); + + socket.on('close', (code, reasonBuffer) => { + if (settled) { + return; + } + + const reason = reasonBuffer.toString().trim(); + const detail = reason + ? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).` + : `The transcription service closed the connection unexpectedly (code=${code}).`; + + finish(() => { + reject(new Error(detail)); + }); + }); + }); + } + + private normalizeAudioPayload(audioBase64: string, mimeType: string): string { + const trimmedAudio = audioBase64.trim(); + + if (trimmedAudio.startsWith('data:')) { + return trimmedAudio; + } + + const normalizedMimeType = mimeType.trim() || 'audio/webm'; + return `data:${normalizedMimeType};base64,${trimmedAudio}`; + } + + private parseEvent(payload: RawData): ServiceEvent | null { + const message = this.rawDataToString(payload).trim(); + + if (!message) { + return null; + } + + try { + return JSON.parse(message) as ServiceEvent; + } catch { + this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event'); + return null; + } + } + + private rawDataToString(payload: RawData): string { + if (typeof payload === 'string') { + return payload; + } + + if (payload instanceof ArrayBuffer) { + return Buffer.from(payload).toString('utf8'); + } + + if (Array.isArray(payload)) { + return Buffer.concat(payload).toString('utf8'); + } + + return payload.toString('utf8'); + } +} diff --git a/server/src/whisper-transcriber.ts b/server/src/whisper-transcriber.ts deleted file mode 100644 index ce39a7d..0000000 --- a/server/src/whisper-transcriber.ts +++ /dev/null @@ -1,176 +0,0 @@ -import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process'; -import { createInterface } from 'node:readline'; - -type LoggerLike = { - info: (payload: unknown, message?: string) => void; - warn: (payload: unknown, message?: string) => void; - error: (payload: unknown, message?: string) => void; -}; - -type WhisperTranscriberOptions = { - pythonExecutable: string; - scriptPath: string; - model: string; - device: string; - computeType: string; -}; - -type WorkerEvent = - | { type: 'ready'; model: string } - | { type: 'result'; requestId: string; text: string } - | { type: 'error'; requestId?: string; message: string } - | { type: 'fatal'; message: string }; - -export class WhisperTranscriber { - private worker: ChildProcessWithoutNullStreams | null = null; - private readyPromise: Promise | null = null; - private resolveReady: (() => void) | null = null; - private rejectReady: ((reason?: unknown) => void) | null = null; - private readonly pendingRequests = new Map< - string, - { resolve: (text: string) => void; reject: (reason?: unknown) => void } - >(); - - constructor( - private readonly options: WhisperTranscriberOptions, - private readonly logger: LoggerLike, - ) {} - - async transcribe(requestId: string, audioPath: string): Promise { - await this.ensureWorker(); - - if (!this.worker || this.worker.stdin.destroyed) { - throw new Error('The Whisper worker is not available.'); - } - - return new Promise((resolve, reject) => { - this.pendingRequests.set(requestId, { resolve, reject }); - - try { - this.worker?.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`); - } catch (error) { - this.pendingRequests.delete(requestId); - reject(error); - } - }); - } - - private async ensureWorker(): Promise { - if (this.readyPromise) { - return this.readyPromise; - } - - this.worker = spawn( - this.options.pythonExecutable, - [ - this.options.scriptPath, - '--model', - this.options.model, - '--device', - this.options.device, - '--compute-type', - this.options.computeType, - ], - { stdio: ['pipe', 'pipe', 'pipe'] }, - ); - - this.readyPromise = new Promise((resolve, reject) => { - this.resolveReady = resolve; - this.rejectReady = reject; - }); - - const stdout = createInterface({ input: this.worker.stdout }); - stdout.on('line', (line) => { - this.handleWorkerLine(line); - }); - - this.worker.stderr.on('data', (chunk) => { - const message = chunk.toString().trim(); - - if (message) { - this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr'); - } - }); - - this.worker.on('error', (error) => { - this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.')); - }); - - this.worker.on('exit', (code, signal) => { - this.failWorker( - new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`), - ); - }); - - return this.readyPromise; - } - - private handleWorkerLine(line: string): void { - let payload: WorkerEvent; - - try { - payload = JSON.parse(line) as WorkerEvent; - } catch { - this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output'); - return; - } - - if (payload.type === 'ready') { - this.logger.info({ model: payload.model }, 'Whisper worker ready'); - this.resolveReady?.(); - this.resolveReady = null; - this.rejectReady = null; - return; - } - - if (payload.type === 'fatal') { - this.failWorker(new Error(payload.message)); - return; - } - - if (payload.type === 'error') { - if (!payload.requestId) { - this.failWorker(new Error(payload.message)); - return; - } - - const pendingRequest = this.pendingRequests.get(payload.requestId); - - if (!pendingRequest) { - return; - } - - this.pendingRequests.delete(payload.requestId); - pendingRequest.reject(new Error(payload.message)); - return; - } - - const pendingRequest = this.pendingRequests.get(payload.requestId); - - if (!pendingRequest) { - return; - } - - this.pendingRequests.delete(payload.requestId); - pendingRequest.resolve(payload.text.trim()); - } - - private failWorker(error: Error): void { - if (this.worker) { - this.worker.removeAllListeners(); - this.worker = null; - } - - this.rejectReady?.(error); - this.resolveReady = null; - this.rejectReady = null; - this.readyPromise = null; - - for (const { reject } of this.pendingRequests.values()) { - reject(error); - } - - this.pendingRequests.clear(); - this.logger.error({ err: error }, 'Whisper worker failed'); - } -}