From 189f989c0d702ed25de635b6509ab5a1d15d3c9f Mon Sep 17 00:00:00 2001 From: Laurent Dubertrand Date: Wed, 11 Mar 2026 00:26:49 +0100 Subject: [PATCH] Dictation through AI --- client/src/app/chat-page.component.html | 24 +++ client/src/app/chat-page.component.scss | 7 + client/src/app/chat-page.component.ts | 203 +++++++++++++++++++++++- client/src/app/chat-session.service.ts | 86 ++++++++++ client/src/app/models.ts | 10 ++ server/dist/index.js | 82 ++++++++++ server/dist/whisper-transcriber.js | 121 ++++++++++++++ server/package.json | 3 +- server/requirements-whisper.txt | 1 + server/scripts/transcribe_whisper.py | 92 +++++++++++ server/src/index.ts | 108 +++++++++++++ server/src/whisper-transcriber.ts | 176 ++++++++++++++++++++ 12 files changed, 911 insertions(+), 2 deletions(-) create mode 100644 server/dist/whisper-transcriber.js create mode 100644 server/requirements-whisper.txt create mode 100644 server/scripts/transcribe_whisper.py create mode 100644 server/src/whisper-transcriber.ts diff --git a/client/src/app/chat-page.component.html b/client/src/app/chat-page.component.html index bf4de09..22fcb28 100644 --- a/client/src/app/chat-page.component.html +++ b/client/src/app/chat-page.component.html @@ -314,6 +314,30 @@ {{ isRecordingVoice() ? 'âšī¸' : 'đŸŽ™ī¸' }} + + | null = null; + private resolveDictationCompletion: (() => void) | null = null; + private dictationApplyToken = 0; @ViewChild('callAudioElement') set callAudioElementRef(value: ElementRef | undefined) { this.callAudioElement = value; @@ -39,6 +48,8 @@ export class ChatPageComponent implements OnDestroy { readonly forwardingEntryId = signal(null); readonly emojiPickerOpen = signal(false); readonly isRecordingVoice = signal(false); + readonly isDictating = signal(false); + readonly isTranscribingDictation = signal(false); readonly emojiOptions = [ '😀', '😁', '😂', 'đŸ¤Ŗ', '😊', '😉', '😍', '😘', '😎', '🤔', @@ -152,6 +163,7 @@ export class ChatPageComponent implements OnDestroy { } ngOnDestroy(): void { + void this.stopDictation(true); this.stopVoiceRecording(true); this.detachCallAudioSource(); } @@ -174,6 +186,7 @@ export class ChatPageComponent implements OnDestroy { return; } + await this.stopDictation(false); await this.session.sendText(peerId, this.messageText); this.messageText = ''; this.emojiPickerOpen.set(false); @@ -188,6 +201,7 @@ export class ChatPageComponent implements OnDestroy { return; } + await this.stopDictation(false); const requested = await this.session.requestGeneratedImage(peerId, this.messageText); if (!requested) { @@ -262,6 +276,92 @@ export class ChatPageComponent implements OnDestroy { input.value = ''; } + async toggleDictation(textarea: HTMLTextAreaElement): Promise { + if (this.isDictating()) { + await this.stopDictation(false); + return; + } + + if (this.isTranscribingDictation()) { + return; + } + const peerId = this.peerId(); + + if (!peerId) { + return; + } + + if (typeof MediaRecorder === 'undefined' || typeof navigator === 'undefined') { + this.session.error.set('This browser does not support dictation recording.'); + return; + } + + if (typeof navigator.mediaDevices?.getUserMedia !== 'function') { + this.session.error.set('This browser cannot access the microphone for dictation.'); + return; + } + + this.dictationBaseText = this.messageText; + this.discardRecordedDictation = false; + this.dictationApplyToken += 1; + + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + const preferredMimeType = this.preferredVoiceMimeType(); + const recorder = preferredMimeType + ? new MediaRecorder(stream, { mimeType: preferredMimeType }) + : new MediaRecorder(stream); + const applyToken = this.dictationApplyToken; + + this.dictationChunks = []; + this.dictationStream = stream; + this.dictationRecorder = recorder; + this.dictationCompletionPromise = new Promise((resolve) => { + this.resolveDictationCompletion = resolve; + }); + + recorder.ondataavailable = (event) => { + if (event.data.size > 0) { + this.dictationChunks.push(event.data); + } + }; + + recorder.onerror = () => { + this.ngZone.run(() => { + this.session.error.set('Could not record dictation audio.'); + this.cleanupDictationRecorder(); + this.finishDictationCompletion(); + }); + }; + + recorder.onstop = () => { + const shouldDiscard = this.discardRecordedDictation; + const mimeType = recorder.mimeType || preferredMimeType || 'audio/webm'; + const blob = new Blob(this.dictationChunks, { type: mimeType }); + + this.ngZone.run(() => { + this.cleanupDictationRecorder(); + + if (shouldDiscard || blob.size === 0) { + this.finishDictationCompletion(); + return; + } + + this.isTranscribingDictation.set(true); + void this.transcribeDictation(blob, textarea, applyToken); + }); + }; + + recorder.start(); + this.isDictating.set(true); + this.session.error.set(null); + } catch { + this.session.error.set('Could not start dictation recording.'); + this.cleanupDictationRecorder(); + this.finishDictationCompletion(); + } + } + async toggleVoiceRecording(): Promise { if (this.isRecordingVoice()) { this.stopVoiceRecording(false); @@ -482,6 +582,7 @@ export class ChatPageComponent implements OnDestroy { return; } + await this.stopDictation(true); this.stopVoiceRecording(true); this.forwardingEntryId.set(null); this.emojiPickerOpen.set(false); @@ -532,6 +633,106 @@ export class ChatPageComponent implements OnDestroy { return candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) ?? ''; } + private async stopDictation(discard: boolean): Promise { + const completion = this.dictationCompletionPromise; + + if (discard) { + this.dictationApplyToken += 1; + this.messageText = this.dictationBaseText || this.messageText; + this.handleMessageTextChange(this.messageText); + this.isTranscribingDictation.set(false); + } else { + this.dictationBaseText = this.messageText; + } + + if (this.dictationRecorder) { + this.discardRecordedDictation = discard; + + if (this.dictationRecorder.state !== 'inactive') { + this.dictationRecorder.stop(); + } else { + this.cleanupDictationRecorder(); + this.finishDictationCompletion(); + } + } else if (!completion) { + this.dictationBaseText = ''; + } + + if (completion) { + await completion; + } + } + + private cleanupDictationRecorder(): void { + if (this.dictationStream) { + for (const track of this.dictationStream.getTracks()) { + track.stop(); + } + } + + this.dictationRecorder = null; + this.dictationStream = null; + this.dictationChunks = []; + this.discardRecordedDictation = false; + this.isDictating.set(false); + } + + private finishDictationCompletion(): void { + this.resolveDictationCompletion?.(); + this.resolveDictationCompletion = null; + this.dictationCompletionPromise = null; + this.dictationBaseText = ''; + } + + private async transcribeDictation(blob: Blob, textarea: HTMLTextAreaElement, applyToken: number): Promise { + try { + const transcript = await this.session.requestSpeechTranscription(blob); + + if (applyToken !== this.dictationApplyToken) { + return; + } + + this.applyDictatedText(this.mergeDictatedText(this.dictationBaseText, transcript), textarea); + } catch { + if (applyToken === this.dictationApplyToken) { + this.session.error.set('Dictation transcription failed.'); + } + } finally { + if (applyToken === this.dictationApplyToken) { + this.isTranscribingDictation.set(false); + } + + this.finishDictationCompletion(); + } + } + + private mergeDictatedText(baseText: string, transcript: string): string { + const trimmedTranscript = transcript.trim(); + + if (!trimmedTranscript) { + return baseText; + } + + if (!baseText.trim()) { + return trimmedTranscript; + } + + return `${baseText.trimEnd()} ${trimmedTranscript}`; + } + + private applyDictatedText(text: string, textarea: HTMLTextAreaElement): void { + this.messageText = text; + textarea.value = text; + this.composerSelectionStart = text.length; + this.composerSelectionEnd = text.length; + this.handleMessageTextChange(text); + + queueMicrotask(() => { + textarea.focus(); + textarea.setSelectionRange(text.length, text.length); + }); + } + private syncCallAudioSource(): void { const audio = this.callAudioElement?.nativeElement; diff --git a/client/src/app/chat-session.service.ts b/client/src/app/chat-session.service.ts index a354a67..69f79d9 100644 --- a/client/src/app/chat-session.service.ts +++ b/client/src/app/chat-session.service.ts @@ -170,6 +170,10 @@ export class ChatSessionService { string, { peerId: string; prompt: string; waitMessageId: string } >(); + private readonly pendingSpeechTranscriptionRequests = new Map< + string, + { resolve: (text: string) => void; reject: (reason?: unknown) => void } + >(); private readonly remoteVideoStreams = signal>([]); private readonly remoteAudioStreams = signal>([]); private readonly activeCameraPeerId = signal(null); @@ -916,6 +920,32 @@ export class ChatSessionService { return true; } + async requestSpeechTranscription(audioBlob: Blob): Promise { + if (!this.websocket || this.websocket.readyState !== WebSocket.OPEN) { + throw new Error('You must be connected to signaling before using dictation.'); + } + + const requestId = crypto.randomUUID(); + const audioBase64 = await this.blobToBase64(audioBlob); + + return new Promise((resolve, reject) => { + this.pendingSpeechTranscriptionRequests.set(requestId, { resolve, reject }); + + try { + this.error.set(null); + this.websocket?.send(JSON.stringify({ + type: 'speech-transcription', + requestId, + mimeType: audioBlob.type || 'audio/webm', + audioBase64, + })); + } catch (error) { + this.pendingSpeechTranscriptionRequests.delete(requestId); + reject(error); + } + }); + } + private async loadAccessKeys(): Promise { const token = this.token(); @@ -990,6 +1020,7 @@ export class ChatSessionService { const shouldReconnect = this.websocket === websocket && !this.suppressSocketReconnect; this.stopWebSocketHeartbeat(); + this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.'); this.signalingState.set('disconnected'); this.status.set('Signaling connection closed.'); @@ -1014,6 +1045,7 @@ export class ChatSessionService { private disconnectWebSocket(): void { this.stopWebSocketHeartbeat(); + this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.'); if (this.websocket) { this.suppressSocketReconnect = true; @@ -1055,6 +1087,12 @@ export class ChatSessionService { case 'image-generation-error': this.handleGeneratedImageError(event); break; + case 'speech-transcribed': + this.handleSpeechTranscribed(event); + break; + case 'speech-transcription-error': + this.handleSpeechTranscriptionError(event); + break; case 'pong': break; case 'error': @@ -1109,6 +1147,28 @@ export class ChatSessionService { this.error.set(event.message); } + private handleSpeechTranscribed(event: Extract): void { + const pendingRequest = this.pendingSpeechTranscriptionRequests.get(event.requestId); + + if (!pendingRequest) { + return; + } + + this.pendingSpeechTranscriptionRequests.delete(event.requestId); + pendingRequest.resolve(event.text); + } + + private handleSpeechTranscriptionError(event: Extract): void { + const pendingRequest = this.pendingSpeechTranscriptionRequests.get(event.requestId); + + if (pendingRequest) { + this.pendingSpeechTranscriptionRequests.delete(event.requestId); + pendingRequest.reject(new Error(event.message)); + } + + this.error.set(event.message); + } + private async restoreSession(): Promise { const token = this.token(); @@ -2024,6 +2084,18 @@ export class ChatSessionService { } } + private rejectPendingSpeechTranscriptions(message: string): void { + if (this.pendingSpeechTranscriptionRequests.size === 0) { + return; + } + + for (const { reject } of this.pendingSpeechTranscriptionRequests.values()) { + reject(new Error(message)); + } + + this.pendingSpeechTranscriptionRequests.clear(); + } + private clearLocalAuth(statusMessage: string): void { this.clearWebSocketReconnect(); this.disconnectWebSocket(); @@ -2034,6 +2106,7 @@ export class ChatSessionService { this.stopRingtone(); this.releasePreloadedRingtone(); this.pendingImageGenerationRequests.clear(); + this.rejectPendingSpeechTranscriptions('Session ended during dictation.'); this.remoteVideoStreams.set([]); this.remoteAudioStreams.set([]); this.remoteVideoModalPeerId.set(null); @@ -2060,6 +2133,19 @@ export class ChatSessionService { this.removeStorage('privatechat.user'); } + private async blobToBase64(blob: Blob): Promise { + const buffer = await blob.arrayBuffer(); + let binary = ''; + const bytes = new Uint8Array(buffer); + const chunkSize = 0x8000; + + for (let index = 0; index < bytes.length; index += chunkSize) { + binary += String.fromCharCode(...bytes.subarray(index, index + chunkSize)); + } + + return btoa(binary); + } + private async loadPersistedMessages(userId: string): Promise { const messageEncryptionKey = this.messageEncryptionKey; diff --git a/client/src/app/models.ts b/client/src/app/models.ts index 667631c..1704cd7 100644 --- a/client/src/app/models.ts +++ b/client/src/app/models.ts @@ -130,6 +130,16 @@ export type ServerEvent = peerId: string; message: string; } + | { + type: 'speech-transcribed'; + requestId: string; + text: string; + } + | { + type: 'speech-transcription-error'; + requestId: string; + message: string; + } | { type: 'pong' } | { type: 'error'; message: string }; diff --git a/server/dist/index.js b/server/dist/index.js index 19f452f..cd13c26 100644 --- a/server/dist/index.js +++ b/server/dist/index.js @@ -1,5 +1,7 @@ import crypto from 'node:crypto'; import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import os from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { TextEncoder } from 'node:util'; @@ -13,6 +15,7 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe import Fastify from 'fastify'; import { Redis } from 'ioredis'; import { z } from 'zod'; +import { WhisperTranscriber } from './whisper-transcriber.js'; dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) }); const projectRootPath = fileURLToPath(new URL('../../', import.meta.url)); const registerSchema = z.object({ @@ -81,6 +84,12 @@ const signalMessageSchema = z.discriminatedUnion('type', [ z.object({ type: z.literal('ping'), }), + z.object({ + type: z.literal('speech-transcription'), + requestId: z.string().uuid(), + mimeType: z.string().trim().min(1).max(128), + audioBase64: z.string().min(1).max(32_000_000), + }), ]); const app = Fastify({ logger: true, trustProxy: true }); const approvalAdminUsername = 'ladparis'; @@ -91,6 +100,11 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, ''); const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest'; const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024'; +const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3'; +const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small'; +const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu'; +const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8'; +const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py'); const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12); const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60); const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN); @@ -101,6 +115,13 @@ const webAuthnRpName = process.env.WEBAUTHN_RP_NAME ?? 'PrivateChat'; const webAuthnUserVerification = resolveWebAuthnUserVerification(process.env.WEBAUTHN_USER_VERIFICATION); const frontendIndexPath = path.join(frontendDistPath, 'index.html'); const hasFrontendBuild = fs.existsSync(frontendIndexPath); +const whisperTranscriber = new WhisperTranscriber({ + pythonExecutable: whisperPythonExecutable, + scriptPath: whisperScriptPath, + model: whisperModel, + device: whisperDevice, + computeType: whisperComputeType, +}, app.log); fs.mkdirSync(path.dirname(sqlitePath), { recursive: true }); fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true }); const encryptionKey = deriveEncryptionKey(loadOrCreateMasterKey(masterKeyPath)); @@ -719,6 +740,25 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) { } return; } + if (parsed.type === 'speech-transcription') { + try { + const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType); + send(socket, { + type: 'speech-transcribed', + requestId: parsed.requestId, + text, + }); + } + catch (error) { + app.log.warn({ err: error, userId }, 'Whisper transcription failed'); + send(socket, { + type: 'speech-transcription-error', + requestId: parsed.requestId, + message: error instanceof Error ? error.message : 'Speech transcription failed.', + }); + } + return; + } let delivered = 0; const recipientSockets = socketsByUserId.get(parsed.to); if (recipientSockets) { @@ -1095,12 +1135,54 @@ function parseClientMessage(rawMessage) { prompt: parsed.data.prompt, }; } + if (parsed.data.type === 'speech-transcription') { + return { + type: 'speech-transcription', + requestId: parsed.data.requestId, + mimeType: parsed.data.mimeType, + audioBase64: parsed.data.audioBase64, + }; + } return { type: 'signal', to: parsed.data.to, signal: normalizeSignal(parsed.data.signal), }; } +async function transcribeAudioPayload(requestId, audioBase64, mimeType) { + const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-')); + const extension = audioExtensionForMimeType(mimeType); + const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`); + try { + await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64')); + return await whisperTranscriber.transcribe(requestId, audioPath); + } + finally { + await fsPromises.rm(tempDirectory, { recursive: true, force: true }); + } +} +function audioExtensionForMimeType(mimeType) { + switch (mimeType.toLowerCase()) { + case 'audio/webm': + case 'audio/webm;codecs=opus': + return 'webm'; + case 'audio/ogg': + case 'audio/ogg;codecs=opus': + return 'ogg'; + case 'audio/mp4': + case 'audio/m4a': + return 'm4a'; + case 'audio/mpeg': + case 'audio/mp3': + return 'mp3'; + case 'audio/wav': + case 'audio/wave': + case 'audio/x-wav': + return 'wav'; + default: + return 'webm'; + } +} async function generateImageFromPrompt(prompt) { const abortController = new AbortController(); const timeoutId = setTimeout(() => abortController.abort(), 120_000); diff --git a/server/dist/whisper-transcriber.js b/server/dist/whisper-transcriber.js new file mode 100644 index 0000000..6f145e6 --- /dev/null +++ b/server/dist/whisper-transcriber.js @@ -0,0 +1,121 @@ +import { spawn } from 'node:child_process'; +import { createInterface } from 'node:readline'; +export class WhisperTranscriber { + options; + logger; + worker = null; + readyPromise = null; + resolveReady = null; + rejectReady = null; + pendingRequests = new Map(); + constructor(options, logger) { + this.options = options; + this.logger = logger; + } + async transcribe(requestId, audioPath) { + await this.ensureWorker(); + if (!this.worker || this.worker.stdin.destroyed) { + throw new Error('The Whisper worker is not available.'); + } + return new Promise((resolve, reject) => { + this.pendingRequests.set(requestId, { resolve, reject }); + try { + this.worker?.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`); + } + catch (error) { + this.pendingRequests.delete(requestId); + reject(error); + } + }); + } + async ensureWorker() { + if (this.readyPromise) { + return this.readyPromise; + } + this.worker = spawn(this.options.pythonExecutable, [ + this.options.scriptPath, + '--model', + this.options.model, + '--device', + this.options.device, + '--compute-type', + this.options.computeType, + ], { stdio: ['pipe', 'pipe', 'pipe'] }); + this.readyPromise = new Promise((resolve, reject) => { + this.resolveReady = resolve; + this.rejectReady = reject; + }); + const stdout = createInterface({ input: this.worker.stdout }); + stdout.on('line', (line) => { + this.handleWorkerLine(line); + }); + this.worker.stderr.on('data', (chunk) => { + const message = chunk.toString().trim(); + if (message) { + this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr'); + } + }); + this.worker.on('error', (error) => { + this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.')); + }); + this.worker.on('exit', (code, signal) => { + this.failWorker(new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`)); + }); + return this.readyPromise; + } + handleWorkerLine(line) { + let payload; + try { + payload = JSON.parse(line); + } + catch { + this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output'); + return; + } + if (payload.type === 'ready') { + this.logger.info({ model: payload.model }, 'Whisper worker ready'); + this.resolveReady?.(); + this.resolveReady = null; + this.rejectReady = null; + return; + } + if (payload.type === 'fatal') { + this.failWorker(new Error(payload.message)); + return; + } + if (payload.type === 'error') { + if (!payload.requestId) { + this.failWorker(new Error(payload.message)); + return; + } + const pendingRequest = this.pendingRequests.get(payload.requestId); + if (!pendingRequest) { + return; + } + this.pendingRequests.delete(payload.requestId); + pendingRequest.reject(new Error(payload.message)); + return; + } + const pendingRequest = this.pendingRequests.get(payload.requestId); + if (!pendingRequest) { + return; + } + this.pendingRequests.delete(payload.requestId); + pendingRequest.resolve(payload.text.trim()); + } + failWorker(error) { + if (this.worker) { + this.worker.removeAllListeners(); + this.worker = null; + } + this.rejectReady?.(error); + this.resolveReady = null; + this.rejectReady = null; + this.readyPromise = null; + for (const { reject } of this.pendingRequests.values()) { + reject(error); + } + this.pendingRequests.clear(); + this.logger.error({ err: error }, 'Whisper worker failed'); + } +} diff --git a/server/package.json b/server/package.json index 85c9a81..7e513de 100644 --- a/server/package.json +++ b/server/package.json @@ -6,7 +6,8 @@ "scripts": { "dev": "node node_modules/tsx/dist/cli.mjs watch src/index.ts", "build": "node node_modules/typescript/bin/tsc -p tsconfig.json", - "start": "node dist/index.js" + "start": "node dist/index.js", + "setup-whisper": "python3 -m pip install -r requirements-whisper.txt" }, "dependencies": { "@fastify/cors": "^11.2.0", diff --git a/server/requirements-whisper.txt b/server/requirements-whisper.txt new file mode 100644 index 0000000..144536a --- /dev/null +++ b/server/requirements-whisper.txt @@ -0,0 +1 @@ +faster-whisper>=1.0.0 diff --git a/server/scripts/transcribe_whisper.py b/server/scripts/transcribe_whisper.py new file mode 100644 index 0000000..a1c8db8 --- /dev/null +++ b/server/scripts/transcribe_whisper.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +import argparse +import json +import sys + + +def emit(payload): + print(json.dumps(payload), flush=True) + + +def load_model(model_name, device, compute_type): + try: + from faster_whisper import WhisperModel + except Exception as exc: + emit( + { + "type": "fatal", + "message": "faster-whisper is not installed. Run `python3 -m pip install -r server/requirements-whisper.txt`.", + } + ) + raise SystemExit(1) from exc + + try: + return WhisperModel(model_name, device=device, compute_type=compute_type) + except Exception as exc: + emit( + { + "type": "fatal", + "message": f"Could not load the faster-whisper model '{model_name}': {exc}", + } + ) + raise SystemExit(1) from exc + + +def transcribe(model, request_id, audio_path): + try: + segments, _ = model.transcribe(audio_path, vad_filter=True, beam_size=5) + text = "".join(segment.text for segment in segments).strip() + emit({"type": "result", "requestId": request_id, "text": text}) + except Exception as exc: + emit( + { + "type": "error", + "requestId": request_id, + "message": f"Whisper transcription failed: {exc}", + } + ) + + +def main(): + parser = argparse.ArgumentParser(description="Persistent faster-whisper transcription worker") + parser.add_argument("--model", default="small") + parser.add_argument("--device", default="cpu") + parser.add_argument("--compute-type", default="int8") + args = parser.parse_args() + + model = load_model(args.model, args.device, args.compute_type) + emit({"type": "ready", "model": args.model}) + + for raw_line in sys.stdin: + line = raw_line.strip() + + if not line: + continue + + try: + payload = json.loads(line) + except Exception as exc: + emit({"type": "error", "message": f"Invalid request JSON: {exc}"}) + continue + + request_id = payload.get("requestId") + audio_path = payload.get("audioPath") + + if not request_id or not audio_path: + emit( + { + "type": "error", + "requestId": request_id, + "message": "Missing requestId or audioPath.", + } + ) + continue + + transcribe(model, request_id, audio_path) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/server/src/index.ts b/server/src/index.ts index 51e79b3..4e19e56 100644 --- a/server/src/index.ts +++ b/server/src/index.ts @@ -1,5 +1,7 @@ import crypto from 'node:crypto'; import fs from 'node:fs'; +import fsPromises from 'node:fs/promises'; +import os from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { TextEncoder } from 'node:util'; @@ -23,6 +25,8 @@ import { Redis } from 'ioredis'; import type WebSocket from 'ws'; import { z } from 'zod'; +import { WhisperTranscriber } from './whisper-transcriber.js'; + dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) }); const projectRootPath = fileURLToPath(new URL('../../', import.meta.url)); @@ -120,6 +124,12 @@ type ClientMessage = } | { type: 'ping'; + } + | { + type: 'speech-transcription'; + requestId: string; + mimeType: string; + audioBase64: string; }; type ServerMessage = @@ -142,6 +152,16 @@ type ServerMessage = peerId: string; message: string; } + | { + type: 'speech-transcribed'; + requestId: string; + text: string; + } + | { + type: 'speech-transcription-error'; + requestId: string; + message: string; + } | { type: 'pong' } | { type: 'error'; message: string }; @@ -289,6 +309,12 @@ const signalMessageSchema = z.discriminatedUnion('type', [ z.object({ type: z.literal('ping'), }), + z.object({ + type: z.literal('speech-transcription'), + requestId: z.string().uuid(), + mimeType: z.string().trim().min(1).max(128), + audioBase64: z.string().min(1).max(32_000_000), + }), ]); const app = Fastify({ logger: true, trustProxy: true }); @@ -307,6 +333,11 @@ const frontendDistPath = resolveProjectPath( const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, ''); const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest'; const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024'; +const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3'; +const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small'; +const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu'; +const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8'; +const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py'); const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12); const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60); const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN); @@ -320,6 +351,17 @@ const webAuthnUserVerification = resolveWebAuthnUserVerification( const frontendIndexPath = path.join(frontendDistPath, 'index.html'); const hasFrontendBuild = fs.existsSync(frontendIndexPath); +const whisperTranscriber = new WhisperTranscriber( + { + pythonExecutable: whisperPythonExecutable, + scriptPath: whisperScriptPath, + model: whisperModel, + device: whisperDevice, + computeType: whisperComputeType, + }, + app.log, +); + fs.mkdirSync(path.dirname(sqlitePath), { recursive: true }); fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true }); @@ -1127,6 +1169,27 @@ async function handleSocketMessage( return; } + if (parsed.type === 'speech-transcription') { + try { + const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType); + + send(socket, { + type: 'speech-transcribed', + requestId: parsed.requestId, + text, + }); + } catch (error) { + app.log.warn({ err: error, userId }, 'Whisper transcription failed'); + send(socket, { + type: 'speech-transcription-error', + requestId: parsed.requestId, + message: error instanceof Error ? error.message : 'Speech transcription failed.', + }); + } + + return; + } + let delivered = 0; const recipientSockets = socketsByUserId.get(parsed.to); @@ -1668,6 +1731,15 @@ function parseClientMessage(rawMessage: string): ClientMessage | null { }; } + if (parsed.data.type === 'speech-transcription') { + return { + type: 'speech-transcription', + requestId: parsed.data.requestId, + mimeType: parsed.data.mimeType, + audioBase64: parsed.data.audioBase64, + }; + } + return { type: 'signal', to: parsed.data.to, @@ -1675,6 +1747,42 @@ function parseClientMessage(rawMessage: string): ClientMessage | null { }; } +async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise { + const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-')); + const extension = audioExtensionForMimeType(mimeType); + const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`); + + try { + await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64')); + return await whisperTranscriber.transcribe(requestId, audioPath); + } finally { + await fsPromises.rm(tempDirectory, { recursive: true, force: true }); + } +} + +function audioExtensionForMimeType(mimeType: string): string { + switch (mimeType.toLowerCase()) { + case 'audio/webm': + case 'audio/webm;codecs=opus': + return 'webm'; + case 'audio/ogg': + case 'audio/ogg;codecs=opus': + return 'ogg'; + case 'audio/mp4': + case 'audio/m4a': + return 'm4a'; + case 'audio/mpeg': + case 'audio/mp3': + return 'mp3'; + case 'audio/wav': + case 'audio/wave': + case 'audio/x-wav': + return 'wav'; + default: + return 'webm'; + } +} + async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> { const abortController = new AbortController(); const timeoutId = setTimeout(() => abortController.abort(), 120_000); diff --git a/server/src/whisper-transcriber.ts b/server/src/whisper-transcriber.ts new file mode 100644 index 0000000..ce39a7d --- /dev/null +++ b/server/src/whisper-transcriber.ts @@ -0,0 +1,176 @@ +import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process'; +import { createInterface } from 'node:readline'; + +type LoggerLike = { + info: (payload: unknown, message?: string) => void; + warn: (payload: unknown, message?: string) => void; + error: (payload: unknown, message?: string) => void; +}; + +type WhisperTranscriberOptions = { + pythonExecutable: string; + scriptPath: string; + model: string; + device: string; + computeType: string; +}; + +type WorkerEvent = + | { type: 'ready'; model: string } + | { type: 'result'; requestId: string; text: string } + | { type: 'error'; requestId?: string; message: string } + | { type: 'fatal'; message: string }; + +export class WhisperTranscriber { + private worker: ChildProcessWithoutNullStreams | null = null; + private readyPromise: Promise | null = null; + private resolveReady: (() => void) | null = null; + private rejectReady: ((reason?: unknown) => void) | null = null; + private readonly pendingRequests = new Map< + string, + { resolve: (text: string) => void; reject: (reason?: unknown) => void } + >(); + + constructor( + private readonly options: WhisperTranscriberOptions, + private readonly logger: LoggerLike, + ) {} + + async transcribe(requestId: string, audioPath: string): Promise { + await this.ensureWorker(); + + if (!this.worker || this.worker.stdin.destroyed) { + throw new Error('The Whisper worker is not available.'); + } + + return new Promise((resolve, reject) => { + this.pendingRequests.set(requestId, { resolve, reject }); + + try { + this.worker?.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`); + } catch (error) { + this.pendingRequests.delete(requestId); + reject(error); + } + }); + } + + private async ensureWorker(): Promise { + if (this.readyPromise) { + return this.readyPromise; + } + + this.worker = spawn( + this.options.pythonExecutable, + [ + this.options.scriptPath, + '--model', + this.options.model, + '--device', + this.options.device, + '--compute-type', + this.options.computeType, + ], + { stdio: ['pipe', 'pipe', 'pipe'] }, + ); + + this.readyPromise = new Promise((resolve, reject) => { + this.resolveReady = resolve; + this.rejectReady = reject; + }); + + const stdout = createInterface({ input: this.worker.stdout }); + stdout.on('line', (line) => { + this.handleWorkerLine(line); + }); + + this.worker.stderr.on('data', (chunk) => { + const message = chunk.toString().trim(); + + if (message) { + this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr'); + } + }); + + this.worker.on('error', (error) => { + this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.')); + }); + + this.worker.on('exit', (code, signal) => { + this.failWorker( + new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`), + ); + }); + + return this.readyPromise; + } + + private handleWorkerLine(line: string): void { + let payload: WorkerEvent; + + try { + payload = JSON.parse(line) as WorkerEvent; + } catch { + this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output'); + return; + } + + if (payload.type === 'ready') { + this.logger.info({ model: payload.model }, 'Whisper worker ready'); + this.resolveReady?.(); + this.resolveReady = null; + this.rejectReady = null; + return; + } + + if (payload.type === 'fatal') { + this.failWorker(new Error(payload.message)); + return; + } + + if (payload.type === 'error') { + if (!payload.requestId) { + this.failWorker(new Error(payload.message)); + return; + } + + const pendingRequest = this.pendingRequests.get(payload.requestId); + + if (!pendingRequest) { + return; + } + + this.pendingRequests.delete(payload.requestId); + pendingRequest.reject(new Error(payload.message)); + return; + } + + const pendingRequest = this.pendingRequests.get(payload.requestId); + + if (!pendingRequest) { + return; + } + + this.pendingRequests.delete(payload.requestId); + pendingRequest.resolve(payload.text.trim()); + } + + private failWorker(error: Error): void { + if (this.worker) { + this.worker.removeAllListeners(); + this.worker = null; + } + + this.rejectReady?.(error); + this.resolveReady = null; + this.rejectReady = null; + this.readyPromise = null; + + for (const { reject } of this.pendingRequests.values()) { + reject(error); + } + + this.pendingRequests.clear(); + this.logger.error({ err: error }, 'Whisper worker failed'); + } +}