diff --git a/client/src/app/chat-page.component.html b/client/src/app/chat-page.component.html
index bf4de09..22fcb28 100644
--- a/client/src/app/chat-page.component.html
+++ b/client/src/app/chat-page.component.html
@@ -314,6 +314,30 @@
{{ isRecordingVoice() ? 'âšī¸' : 'đī¸' }}
+
+
| null = null;
+ private resolveDictationCompletion: (() => void) | null = null;
+ private dictationApplyToken = 0;
@ViewChild('callAudioElement')
set callAudioElementRef(value: ElementRef | undefined) {
this.callAudioElement = value;
@@ -39,6 +48,8 @@ export class ChatPageComponent implements OnDestroy {
readonly forwardingEntryId = signal(null);
readonly emojiPickerOpen = signal(false);
readonly isRecordingVoice = signal(false);
+ readonly isDictating = signal(false);
+ readonly isTranscribingDictation = signal(false);
readonly emojiOptions = [
'đ', 'đ', 'đ', 'đ¤Ŗ', 'đ',
'đ', 'đ', 'đ', 'đ', 'đ¤',
@@ -152,6 +163,7 @@ export class ChatPageComponent implements OnDestroy {
}
ngOnDestroy(): void {
+ void this.stopDictation(true);
this.stopVoiceRecording(true);
this.detachCallAudioSource();
}
@@ -174,6 +186,7 @@ export class ChatPageComponent implements OnDestroy {
return;
}
+ await this.stopDictation(false);
await this.session.sendText(peerId, this.messageText);
this.messageText = '';
this.emojiPickerOpen.set(false);
@@ -188,6 +201,7 @@ export class ChatPageComponent implements OnDestroy {
return;
}
+ await this.stopDictation(false);
const requested = await this.session.requestGeneratedImage(peerId, this.messageText);
if (!requested) {
@@ -262,6 +276,92 @@ export class ChatPageComponent implements OnDestroy {
input.value = '';
}
+  /**
+   * Starts microphone dictation for the composer, or stops the dictation that is already recording.
+   *
+   * Does nothing while a previous recording is still being transcribed or when no peer is selected.
+   * Browser capability problems and start-up failures are reported through `session.error`.
+   *
+   * @param textarea Composer textarea that receives the transcript once transcription completes.
+   */
+  async toggleDictation(textarea: HTMLTextAreaElement): Promise<void> {
+    if (this.isDictating()) {
+      await this.stopDictation(false);
+      return;
+    }
+
+    if (this.isTranscribingDictation()) {
+      return;
+    }
+
+    if (!this.peerId()) {
+      return;
+    }
+
+    if (typeof MediaRecorder === 'undefined' || typeof navigator === 'undefined') {
+      this.session.error.set('This browser does not support dictation recording.');
+      return;
+    }
+
+    if (typeof navigator.mediaDevices?.getUserMedia !== 'function') {
+      this.session.error.set('This browser cannot access the microphone for dictation.');
+      return;
+    }
+
+    this.dictationBaseText = this.messageText;
+    this.discardRecordedDictation = false;
+    this.dictationApplyToken += 1;
+    // Captured before the permission prompt so that anything bumping the token while we wait
+    // (a discard, component teardown, or a second toggle) can be detected afterwards.
+    const applyToken = this.dictationApplyToken;
+
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+
+      if (applyToken !== this.dictationApplyToken) {
+        // This start request was superseded while waiting for the microphone: release it and bail out.
+        for (const track of stream.getTracks()) {
+          track.stop();
+        }
+
+        return;
+      }
+
+      // Store the stream right away so the catch block below releases the microphone
+      // even when constructing or starting the MediaRecorder throws.
+      this.dictationStream = stream;
+
+      const preferredMimeType = this.preferredVoiceMimeType();
+      const recorder = preferredMimeType
+        ? new MediaRecorder(stream, { mimeType: preferredMimeType })
+        : new MediaRecorder(stream);
+
+      this.dictationChunks = [];
+      this.dictationRecorder = recorder;
+      this.dictationCompletionPromise = new Promise<void>((resolve) => {
+        this.resolveDictationCompletion = resolve;
+      });
+
+      recorder.ondataavailable = (event) => {
+        if (event.data.size > 0) {
+          this.dictationChunks.push(event.data);
+        }
+      };
+
+      recorder.onerror = () => {
+        this.ngZone.run(() => {
+          this.session.error.set('Could not record dictation audio.');
+          this.cleanupDictationRecorder();
+          this.finishDictationCompletion();
+        });
+      };
+
+      recorder.onstop = () => {
+        // Read the discard flag and build the blob before cleanup resets both.
+        const shouldDiscard = this.discardRecordedDictation;
+        const mimeType = recorder.mimeType || preferredMimeType || 'audio/webm';
+        const blob = new Blob(this.dictationChunks, { type: mimeType });
+
+        this.ngZone.run(() => {
+          this.cleanupDictationRecorder();
+
+          if (shouldDiscard || blob.size === 0) {
+            this.finishDictationCompletion();
+            return;
+          }
+
+          this.isTranscribingDictation.set(true);
+          void this.transcribeDictation(blob, textarea, applyToken);
+        });
+      };
+
+      recorder.start();
+      this.isDictating.set(true);
+      this.session.error.set(null);
+    } catch {
+      this.session.error.set('Could not start dictation recording.');
+      this.cleanupDictationRecorder();
+      this.finishDictationCompletion();
+    }
+  }
+
async toggleVoiceRecording(): Promise {
if (this.isRecordingVoice()) {
this.stopVoiceRecording(false);
@@ -482,6 +582,7 @@ export class ChatPageComponent implements OnDestroy {
return;
}
+ await this.stopDictation(true);
this.stopVoiceRecording(true);
this.forwardingEntryId.set(null);
this.emojiPickerOpen.set(false);
@@ -532,6 +633,106 @@ export class ChatPageComponent implements OnDestroy {
return candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) ?? '';
}
+  /**
+   * Stops the active dictation recorder, if any, and waits for the pending dictation work to complete.
+   *
+   * @param discard When true, the apply token is bumped so an in-flight transcript is ignored, the
+   *   composer text is restored from the dictation base text (when that is non-empty) and the
+   *   transcribing flag is cleared. When false, the current composer text becomes the base text.
+   */
+  private async stopDictation(discard: boolean): Promise<void> {
+    const pendingCompletion = this.dictationCompletionPromise;
+
+    if (!discard) {
+      this.dictationBaseText = this.messageText;
+    } else {
+      this.dictationApplyToken += 1;
+      this.messageText = this.dictationBaseText || this.messageText;
+      this.handleMessageTextChange(this.messageText);
+      this.isTranscribingDictation.set(false);
+    }
+
+    const activeRecorder = this.dictationRecorder;
+
+    if (activeRecorder) {
+      this.discardRecordedDictation = discard;
+
+      if (activeRecorder.state === 'inactive') {
+        // Nothing left to stop, so no `onstop` will fire: tidy up here instead.
+        this.cleanupDictationRecorder();
+        this.finishDictationCompletion();
+      } else {
+        activeRecorder.stop();
+      }
+    } else if (!pendingCompletion) {
+      this.dictationBaseText = '';
+    }
+
+    if (pendingCompletion) {
+      await pendingCompletion;
+    }
+  }
+
+  /** Stops every track of the dictation stream and resets all recorder-related state. */
+  private cleanupDictationRecorder(): void {
+    this.dictationStream?.getTracks().forEach((track) => {
+      track.stop();
+    });
+
+    this.dictationRecorder = null;
+    this.dictationStream = null;
+    this.dictationChunks = [];
+    this.discardRecordedDictation = false;
+    this.isDictating.set(false);
+  }
+
+  /** Resolves the dictation completion promise (if one is pending) and clears the completion state and base text. */
+  private finishDictationCompletion(): void {
+    const resolveCompletion = this.resolveDictationCompletion;
+
+    this.resolveDictationCompletion = null;
+    this.dictationCompletionPromise = null;
+    this.dictationBaseText = '';
+
+    // Promise continuations run asynchronously, so resolving after the reset is equivalent.
+    resolveCompletion?.();
+  }
+
+  /**
+   * Sends the recorded audio for transcription and appends the transcript to the composer.
+   *
+   * The result (or the failure message) is applied only while `applyToken` still equals the
+   * current dictation token; the dictation completion is finished in every case.
+   *
+   * @param blob Recorded dictation audio.
+   * @param textarea Composer textarea to update.
+   * @param applyToken Token captured when this dictation started.
+   */
+  private async transcribeDictation(blob: Blob, textarea: HTMLTextAreaElement, applyToken: number): Promise<void> {
+    const isStillCurrent = (): boolean => applyToken === this.dictationApplyToken;
+
+    try {
+      const transcript = await this.session.requestSpeechTranscription(blob);
+
+      if (isStillCurrent()) {
+        const mergedText = this.mergeDictatedText(this.dictationBaseText, transcript);
+        this.applyDictatedText(mergedText, textarea);
+      }
+    } catch {
+      if (isStillCurrent()) {
+        this.session.error.set('Dictation transcription failed.');
+      }
+    } finally {
+      if (isStillCurrent()) {
+        this.isTranscribingDictation.set(false);
+      }
+
+      this.finishDictationCompletion();
+    }
+  }
+
+  /**
+   * Combines the text that was in the composer with a freshly dictated transcript.
+   *
+   * @param baseText Composer text from before the dictation.
+   * @param transcript Raw transcript; surrounding whitespace is ignored.
+   * @returns `baseText` unchanged when the transcript is blank, the trimmed transcript when the
+   *   base text is blank, otherwise both joined by a single space.
+   */
+  private mergeDictatedText(baseText: string, transcript: string): string {
+    const spokenText = transcript.trim();
+
+    if (spokenText.length === 0) {
+      return baseText;
+    }
+
+    if (baseText.trim().length === 0) {
+      return spokenText;
+    }
+
+    return [baseText.trimEnd(), spokenText].join(' ');
+  }
+
+  /**
+   * Writes `text` into the composer model and the textarea, then moves the caret to the end.
+   *
+   * @param text New composer content.
+   * @param textarea Composer textarea element to update and focus.
+   */
+  private applyDictatedText(text: string, textarea: HTMLTextAreaElement): void {
+    const caretPosition = text.length;
+
+    this.messageText = text;
+    textarea.value = text;
+    this.composerSelectionStart = caretPosition;
+    this.composerSelectionEnd = caretPosition;
+    this.handleMessageTextChange(text);
+
+    // Focus and caret placement are deferred to a microtask, after the current synchronous work.
+    queueMicrotask(() => {
+      textarea.focus();
+      textarea.setSelectionRange(caretPosition, caretPosition);
+    });
+  }
+
private syncCallAudioSource(): void {
const audio = this.callAudioElement?.nativeElement;
diff --git a/client/src/app/chat-session.service.ts b/client/src/app/chat-session.service.ts
index a354a67..69f79d9 100644
--- a/client/src/app/chat-session.service.ts
+++ b/client/src/app/chat-session.service.ts
@@ -170,6 +170,10 @@ export class ChatSessionService {
string,
{ peerId: string; prompt: string; waitMessageId: string }
>();
+ private readonly pendingSpeechTranscriptionRequests = new Map<
+ string,
+ { resolve: (text: string) => void; reject: (reason?: unknown) => void }
+ >();
private readonly remoteVideoStreams = signal>([]);
private readonly remoteAudioStreams = signal>([]);
private readonly activeCameraPeerId = signal(null);
@@ -916,6 +920,32 @@ export class ChatSessionService {
return true;
}
+  /**
+   * Sends recorded audio over the signaling websocket for transcription.
+   *
+   * @param audioBlob Recorded audio; its MIME type is forwarded, defaulting to `audio/webm`.
+   * @returns A promise that is resolved with the transcript by the `speech-transcribed` handler.
+   * @throws Error when the signaling websocket is not open, either on entry or after the audio
+   *   has been encoded. The returned promise rejects when sending fails, when the server reports
+   *   a transcription error, or when pending requests are rejected on disconnect.
+   */
+  async requestSpeechTranscription(audioBlob: Blob): Promise<string> {
+    if (!this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
+      throw new Error('You must be connected to signaling before using dictation.');
+    }
+
+    const requestId = crypto.randomUUID();
+    const audioBase64 = await this.blobToBase64(audioBlob);
+
+    // Encoding is asynchronous, so the socket may have closed or been replaced in the meantime.
+    // Registering a request now would leave it pending forever: the close handler has already
+    // rejected the requests that existed at that point.
+    const websocket = this.websocket;
+
+    if (!websocket || websocket.readyState !== WebSocket.OPEN) {
+      throw new Error('Signaling connection closed during dictation.');
+    }
+
+    return new Promise<string>((resolve, reject) => {
+      this.pendingSpeechTranscriptionRequests.set(requestId, { resolve, reject });
+
+      try {
+        this.error.set(null);
+        websocket.send(JSON.stringify({
+          type: 'speech-transcription',
+          requestId,
+          mimeType: audioBlob.type || 'audio/webm',
+          audioBase64,
+        }));
+      } catch (error) {
+        this.pendingSpeechTranscriptionRequests.delete(requestId);
+        reject(error);
+      }
+    });
+  }
+
private async loadAccessKeys(): Promise {
const token = this.token();
@@ -990,6 +1020,7 @@ export class ChatSessionService {
const shouldReconnect = this.websocket === websocket && !this.suppressSocketReconnect;
this.stopWebSocketHeartbeat();
+ this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
this.signalingState.set('disconnected');
this.status.set('Signaling connection closed.');
@@ -1014,6 +1045,7 @@ export class ChatSessionService {
private disconnectWebSocket(): void {
this.stopWebSocketHeartbeat();
+ this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
if (this.websocket) {
this.suppressSocketReconnect = true;
@@ -1055,6 +1087,12 @@ export class ChatSessionService {
case 'image-generation-error':
this.handleGeneratedImageError(event);
break;
+ case 'speech-transcribed':
+ this.handleSpeechTranscribed(event);
+ break;
+ case 'speech-transcription-error':
+ this.handleSpeechTranscriptionError(event);
+ break;
case 'pong':
break;
case 'error':
@@ -1109,6 +1147,28 @@ export class ChatSessionService {
this.error.set(event.message);
}
+  /** Resolves the pending transcription request matching the event's `requestId`; unknown ids are ignored. */
+  private handleSpeechTranscribed(event: Extract<ServerEvent, { type: 'speech-transcribed' }>): void {
+    const { requestId, text } = event;
+    const pending = this.pendingSpeechTranscriptionRequests.get(requestId);
+
+    if (pending) {
+      this.pendingSpeechTranscriptionRequests.delete(requestId);
+      pending.resolve(text);
+    }
+  }
+
+  /**
+   * Rejects the pending transcription request matching the event's `requestId` (when there is one)
+   * and always publishes the server's message through the `error` signal.
+   */
+  private handleSpeechTranscriptionError(event: Extract<ServerEvent, { type: 'speech-transcription-error' }>): void {
+    const { requestId, message } = event;
+    const pending = this.pendingSpeechTranscriptionRequests.get(requestId);
+
+    if (pending) {
+      this.pendingSpeechTranscriptionRequests.delete(requestId);
+      pending.reject(new Error(message));
+    }
+
+    this.error.set(message);
+  }
+
private async restoreSession(): Promise {
const token = this.token();
@@ -2024,6 +2084,18 @@ export class ChatSessionService {
}
}
+  /**
+   * Rejects every pending transcription request with a new `Error(message)` and empties the map.
+   *
+   * @param message Text of the error each pending request is rejected with.
+   */
+  private rejectPendingSpeechTranscriptions(message: string): void {
+    const pendingRequests = this.pendingSpeechTranscriptionRequests;
+
+    if (pendingRequests.size === 0) {
+      return;
+    }
+
+    pendingRequests.forEach(({ reject }) => {
+      reject(new Error(message));
+    });
+    pendingRequests.clear();
+  }
+
private clearLocalAuth(statusMessage: string): void {
this.clearWebSocketReconnect();
this.disconnectWebSocket();
@@ -2034,6 +2106,7 @@ export class ChatSessionService {
this.stopRingtone();
this.releasePreloadedRingtone();
this.pendingImageGenerationRequests.clear();
+ this.rejectPendingSpeechTranscriptions('Session ended during dictation.');
this.remoteVideoStreams.set([]);
this.remoteAudioStreams.set([]);
this.remoteVideoModalPeerId.set(null);
@@ -2060,6 +2133,19 @@ export class ChatSessionService {
this.removeStorage('privatechat.user');
}
+  /**
+   * Encodes a blob's bytes as a base64 string.
+   *
+   * The bytes are turned into a binary string in 0x8000-byte slices, so each
+   * `String.fromCharCode` call receives a bounded number of arguments.
+   */
+  private async blobToBase64(blob: Blob): Promise<string> {
+    const bytes = new Uint8Array(await blob.arrayBuffer());
+    const sliceLength = 0x8000;
+    const pieces: string[] = [];
+
+    for (let offset = 0; offset < bytes.length; offset += sliceLength) {
+      pieces.push(String.fromCharCode(...bytes.subarray(offset, offset + sliceLength)));
+    }
+
+    return btoa(pieces.join(''));
+  }
+
private async loadPersistedMessages(userId: string): Promise {
const messageEncryptionKey = this.messageEncryptionKey;
diff --git a/client/src/app/models.ts b/client/src/app/models.ts
index 667631c..1704cd7 100644
--- a/client/src/app/models.ts
+++ b/client/src/app/models.ts
@@ -130,6 +130,16 @@ export type ServerEvent =
peerId: string;
message: string;
}
+ | {
+ type: 'speech-transcribed';
+ requestId: string;
+ text: string;
+ }
+ | {
+ type: 'speech-transcription-error';
+ requestId: string;
+ message: string;
+ }
| { type: 'pong' }
| { type: 'error'; message: string };
diff --git a/server/dist/index.js b/server/dist/index.js
index 19f452f..cd13c26 100644
--- a/server/dist/index.js
+++ b/server/dist/index.js
@@ -1,5 +1,7 @@
import crypto from 'node:crypto';
import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import os from 'node:os';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { TextEncoder } from 'node:util';
@@ -13,6 +15,7 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe
import Fastify from 'fastify';
import { Redis } from 'ioredis';
import { z } from 'zod';
+import { WhisperTranscriber } from './whisper-transcriber.js';
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
const registerSchema = z.object({
@@ -81,6 +84,12 @@ const signalMessageSchema = z.discriminatedUnion('type', [
z.object({
type: z.literal('ping'),
}),
+ z.object({
+ type: z.literal('speech-transcription'),
+ requestId: z.string().uuid(),
+ mimeType: z.string().trim().min(1).max(128),
+ audioBase64: z.string().min(1).max(32_000_000),
+ }),
]);
const app = Fastify({ logger: true, trustProxy: true });
const approvalAdminUsername = 'ladparis';
@@ -91,6 +100,11 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
+// Whisper dictation settings. The interpreter, model, device and compute type are read from
+// PRIVATECHAT_WHISPER_* environment variables and fall back to the literals below when unset.
+const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
+const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
+const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
+const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
+// Path of the Python worker script, resolved through resolveProjectPath.
+const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -101,6 +115,13 @@ const webAuthnRpName = process.env.WEBAUTHN_RP_NAME ?? 'PrivateChat';
const webAuthnUserVerification = resolveWebAuthnUserVerification(process.env.WEBAUTHN_USER_VERIFICATION);
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
+// Single transcriber instance for the server, configured from the whisper* settings and logging through app.log.
+const whisperTranscriber = new WhisperTranscriber({
+ pythonExecutable: whisperPythonExecutable,
+ scriptPath: whisperScriptPath,
+ model: whisperModel,
+ device: whisperDevice,
+ computeType: whisperComputeType,
+}, app.log);
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
const encryptionKey = deriveEncryptionKey(loadOrCreateMasterKey(masterKeyPath));
@@ -719,6 +740,25 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) {
}
return;
}
+ if (parsed.type === 'speech-transcription') {
+ try {
+ const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
+ send(socket, {
+ type: 'speech-transcribed',
+ requestId: parsed.requestId,
+ text,
+ });
+ }
+ catch (error) {
+ app.log.warn({ err: error, userId }, 'Whisper transcription failed');
+ send(socket, {
+ type: 'speech-transcription-error',
+ requestId: parsed.requestId,
+ message: error instanceof Error ? error.message : 'Speech transcription failed.',
+ });
+ }
+ return;
+ }
let delivered = 0;
const recipientSockets = socketsByUserId.get(parsed.to);
if (recipientSockets) {
@@ -1095,12 +1135,54 @@ function parseClientMessage(rawMessage) {
prompt: parsed.data.prompt,
};
}
+ if (parsed.data.type === 'speech-transcription') {
+ return {
+ type: 'speech-transcription',
+ requestId: parsed.data.requestId,
+ mimeType: parsed.data.mimeType,
+ audioBase64: parsed.data.audioBase64,
+ };
+ }
return {
type: 'signal',
to: parsed.data.to,
signal: normalizeSignal(parsed.data.signal),
};
}
+async function transcribeAudioPayload(requestId, audioBase64, mimeType) {
+ const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
+ const extension = audioExtensionForMimeType(mimeType);
+ const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
+ try {
+ await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
+ return await whisperTranscriber.transcribe(requestId, audioPath);
+ }
+ finally {
+ await fsPromises.rm(tempDirectory, { recursive: true, force: true });
+ }
+}
+function audioExtensionForMimeType(mimeType) {
+ switch (mimeType.toLowerCase()) {
+ case 'audio/webm':
+ case 'audio/webm;codecs=opus':
+ return 'webm';
+ case 'audio/ogg':
+ case 'audio/ogg;codecs=opus':
+ return 'ogg';
+ case 'audio/mp4':
+ case 'audio/m4a':
+ return 'm4a';
+ case 'audio/mpeg':
+ case 'audio/mp3':
+ return 'mp3';
+ case 'audio/wav':
+ case 'audio/wave':
+ case 'audio/x-wav':
+ return 'wav';
+ default:
+ return 'webm';
+ }
+}
async function generateImageFromPrompt(prompt) {
const abortController = new AbortController();
const timeoutId = setTimeout(() => abortController.abort(), 120_000);
diff --git a/server/dist/whisper-transcriber.js b/server/dist/whisper-transcriber.js
new file mode 100644
index 0000000..6f145e6
--- /dev/null
+++ b/server/dist/whisper-transcriber.js
@@ -0,0 +1,121 @@
+import { spawn } from 'node:child_process';
+import { createInterface } from 'node:readline';
+export class WhisperTranscriber {
+ options;
+ logger;
+ worker = null;
+ readyPromise = null;
+ resolveReady = null;
+ rejectReady = null;
+ pendingRequests = new Map();
+ constructor(options, logger) {
+ this.options = options;
+ this.logger = logger;
+ }
+ async transcribe(requestId, audioPath) {
+ await this.ensureWorker();
+ if (!this.worker || this.worker.stdin.destroyed) {
+ throw new Error('The Whisper worker is not available.');
+ }
+ return new Promise((resolve, reject) => {
+ this.pendingRequests.set(requestId, { resolve, reject });
+ try {
+ this.worker?.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`);
+ }
+ catch (error) {
+ this.pendingRequests.delete(requestId);
+ reject(error);
+ }
+ });
+ }
+ async ensureWorker() {
+ if (this.readyPromise) {
+ return this.readyPromise;
+ }
+ this.worker = spawn(this.options.pythonExecutable, [
+ this.options.scriptPath,
+ '--model',
+ this.options.model,
+ '--device',
+ this.options.device,
+ '--compute-type',
+ this.options.computeType,
+ ], { stdio: ['pipe', 'pipe', 'pipe'] });
+ this.readyPromise = new Promise((resolve, reject) => {
+ this.resolveReady = resolve;
+ this.rejectReady = reject;
+ });
+ const stdout = createInterface({ input: this.worker.stdout });
+ stdout.on('line', (line) => {
+ this.handleWorkerLine(line);
+ });
+ this.worker.stderr.on('data', (chunk) => {
+ const message = chunk.toString().trim();
+ if (message) {
+ this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
+ }
+ });
+ this.worker.on('error', (error) => {
+ this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.'));
+ });
+ this.worker.on('exit', (code, signal) => {
+ this.failWorker(new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`));
+ });
+ return this.readyPromise;
+ }
+ handleWorkerLine(line) {
+ let payload;
+ try {
+ payload = JSON.parse(line);
+ }
+ catch {
+ this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
+ return;
+ }
+ if (payload.type === 'ready') {
+ this.logger.info({ model: payload.model }, 'Whisper worker ready');
+ this.resolveReady?.();
+ this.resolveReady = null;
+ this.rejectReady = null;
+ return;
+ }
+ if (payload.type === 'fatal') {
+ this.failWorker(new Error(payload.message));
+ return;
+ }
+ if (payload.type === 'error') {
+ if (!payload.requestId) {
+ this.failWorker(new Error(payload.message));
+ return;
+ }
+ const pendingRequest = this.pendingRequests.get(payload.requestId);
+ if (!pendingRequest) {
+ return;
+ }
+ this.pendingRequests.delete(payload.requestId);
+ pendingRequest.reject(new Error(payload.message));
+ return;
+ }
+ const pendingRequest = this.pendingRequests.get(payload.requestId);
+ if (!pendingRequest) {
+ return;
+ }
+ this.pendingRequests.delete(payload.requestId);
+ pendingRequest.resolve(payload.text.trim());
+ }
+ failWorker(error) {
+ if (this.worker) {
+ this.worker.removeAllListeners();
+ this.worker = null;
+ }
+ this.rejectReady?.(error);
+ this.resolveReady = null;
+ this.rejectReady = null;
+ this.readyPromise = null;
+ for (const { reject } of this.pendingRequests.values()) {
+ reject(error);
+ }
+ this.pendingRequests.clear();
+ this.logger.error({ err: error }, 'Whisper worker failed');
+ }
+}
diff --git a/server/package.json b/server/package.json
index 85c9a81..7e513de 100644
--- a/server/package.json
+++ b/server/package.json
@@ -6,7 +6,8 @@
"scripts": {
"dev": "node node_modules/tsx/dist/cli.mjs watch src/index.ts",
"build": "node node_modules/typescript/bin/tsc -p tsconfig.json",
- "start": "node dist/index.js"
+ "start": "node dist/index.js",
+ "setup-whisper": "python3 -m pip install -r requirements-whisper.txt"
},
"dependencies": {
"@fastify/cors": "^11.2.0",
diff --git a/server/requirements-whisper.txt b/server/requirements-whisper.txt
new file mode 100644
index 0000000..144536a
--- /dev/null
+++ b/server/requirements-whisper.txt
@@ -0,0 +1 @@
+faster-whisper>=1.0.0
diff --git a/server/scripts/transcribe_whisper.py b/server/scripts/transcribe_whisper.py
new file mode 100644
index 0000000..a1c8db8
--- /dev/null
+++ b/server/scripts/transcribe_whisper.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import sys
+
+
+def emit(payload):
+ print(json.dumps(payload), flush=True)
+
+
+def load_model(model_name, device, compute_type):
+ try:
+ from faster_whisper import WhisperModel
+ except Exception as exc:
+ emit(
+ {
+ "type": "fatal",
+ "message": "faster-whisper is not installed. Run `python3 -m pip install -r server/requirements-whisper.txt`.",
+ }
+ )
+ raise SystemExit(1) from exc
+
+ try:
+ return WhisperModel(model_name, device=device, compute_type=compute_type)
+ except Exception as exc:
+ emit(
+ {
+ "type": "fatal",
+ "message": f"Could not load the faster-whisper model '{model_name}': {exc}",
+ }
+ )
+ raise SystemExit(1) from exc
+
+
+def transcribe(model, request_id, audio_path):
+ try:
+ segments, _ = model.transcribe(audio_path, vad_filter=True, beam_size=5)
+ text = "".join(segment.text for segment in segments).strip()
+ emit({"type": "result", "requestId": request_id, "text": text})
+ except Exception as exc:
+ emit(
+ {
+ "type": "error",
+ "requestId": request_id,
+ "message": f"Whisper transcription failed: {exc}",
+ }
+ )
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Persistent faster-whisper transcription worker")
+ parser.add_argument("--model", default="small")
+ parser.add_argument("--device", default="cpu")
+ parser.add_argument("--compute-type", default="int8")
+ args = parser.parse_args()
+
+ model = load_model(args.model, args.device, args.compute_type)
+ emit({"type": "ready", "model": args.model})
+
+ for raw_line in sys.stdin:
+ line = raw_line.strip()
+
+ if not line:
+ continue
+
+ try:
+ payload = json.loads(line)
+ except Exception as exc:
+ emit({"type": "error", "message": f"Invalid request JSON: {exc}"})
+ continue
+
+ request_id = payload.get("requestId")
+ audio_path = payload.get("audioPath")
+
+ if not request_id or not audio_path:
+ emit(
+ {
+ "type": "error",
+ "requestId": request_id,
+ "message": "Missing requestId or audioPath.",
+ }
+ )
+ continue
+
+ transcribe(model, request_id, audio_path)
+
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/server/src/index.ts b/server/src/index.ts
index 51e79b3..4e19e56 100644
--- a/server/src/index.ts
+++ b/server/src/index.ts
@@ -1,5 +1,7 @@
import crypto from 'node:crypto';
import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import os from 'node:os';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { TextEncoder } from 'node:util';
@@ -23,6 +25,8 @@ import { Redis } from 'ioredis';
import type WebSocket from 'ws';
import { z } from 'zod';
+import { WhisperTranscriber } from './whisper-transcriber.js';
+
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
@@ -120,6 +124,12 @@ type ClientMessage =
}
| {
type: 'ping';
+ }
+ | {
+ type: 'speech-transcription';
+ requestId: string;
+ mimeType: string;
+ audioBase64: string;
};
type ServerMessage =
@@ -142,6 +152,16 @@ type ServerMessage =
peerId: string;
message: string;
}
+ | {
+ type: 'speech-transcribed';
+ requestId: string;
+ text: string;
+ }
+ | {
+ type: 'speech-transcription-error';
+ requestId: string;
+ message: string;
+ }
| { type: 'pong' }
| { type: 'error'; message: string };
@@ -289,6 +309,12 @@ const signalMessageSchema = z.discriminatedUnion('type', [
z.object({
type: z.literal('ping'),
}),
+ z.object({
+ type: z.literal('speech-transcription'),
+ requestId: z.string().uuid(),
+ mimeType: z.string().trim().min(1).max(128),
+ audioBase64: z.string().min(1).max(32_000_000),
+ }),
]);
const app = Fastify({ logger: true, trustProxy: true });
@@ -307,6 +333,11 @@ const frontendDistPath = resolveProjectPath(
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
+// Whisper dictation settings. The interpreter, model, device and compute type are read from
+// PRIVATECHAT_WHISPER_* environment variables and fall back to the literals below when unset.
+const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
+const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
+const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
+const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
+// Path of the Python worker script, resolved through resolveProjectPath.
+const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -320,6 +351,17 @@ const webAuthnUserVerification = resolveWebAuthnUserVerification(
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
+// Single transcriber instance for the server, configured from the whisper* settings and logging through app.log.
+const whisperTranscriber = new WhisperTranscriber(
+ {
+ pythonExecutable: whisperPythonExecutable,
+ scriptPath: whisperScriptPath,
+ model: whisperModel,
+ device: whisperDevice,
+ computeType: whisperComputeType,
+ },
+ app.log,
+);
+
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
@@ -1127,6 +1169,27 @@ async function handleSocketMessage(
return;
}
+ if (parsed.type === 'speech-transcription') {
+ try {
+ const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
+
+ send(socket, {
+ type: 'speech-transcribed',
+ requestId: parsed.requestId,
+ text,
+ });
+ } catch (error) {
+ app.log.warn({ err: error, userId }, 'Whisper transcription failed');
+ send(socket, {
+ type: 'speech-transcription-error',
+ requestId: parsed.requestId,
+ message: error instanceof Error ? error.message : 'Speech transcription failed.',
+ });
+ }
+
+ return;
+ }
+
let delivered = 0;
const recipientSockets = socketsByUserId.get(parsed.to);
@@ -1668,6 +1731,15 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
};
}
+ if (parsed.data.type === 'speech-transcription') {
+ return {
+ type: 'speech-transcription',
+ requestId: parsed.data.requestId,
+ mimeType: parsed.data.mimeType,
+ audioBase64: parsed.data.audioBase64,
+ };
+ }
+
return {
type: 'signal',
to: parsed.data.to,
@@ -1675,6 +1747,42 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
};
}
+/**
+ * Writes base64-encoded audio to a fresh temporary directory, transcribes it with the shared
+ * Whisper transcriber and removes the directory afterwards, whether or not transcription succeeded.
+ *
+ * @param requestId Client-supplied request id; used only in the temporary file name.
+ * @param audioBase64 Base64-encoded audio bytes.
+ * @param mimeType MIME type reported by the client; selects the file extension.
+ * @returns The transcript returned by the transcriber.
+ */
+async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
+  const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
+  const extension = audioExtensionForMimeType(mimeType);
+  const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
+  // The transcriber correlates worker replies by id. A client can repeat a requestId while an
+  // earlier request is still running, which would overwrite that request's pending entry and
+  // leave it (and its temporary directory) hanging forever, so use a server-generated id.
+  const workerRequestId = crypto.randomUUID();
+
+  try {
+    await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
+    return await whisperTranscriber.transcribe(workerRequestId, audioPath);
+  } finally {
+    await fsPromises.rm(tempDirectory, { recursive: true, force: true });
+  }
+}
+
+/**
+ * Maps an audio MIME type to a file extension for the temporary dictation file.
+ *
+ * Parameters such as `;codecs=opus` are ignored and matching is case-insensitive, so for example
+ * `audio/mp4;codecs=mp4a.40.2` maps to `m4a`.
+ *
+ * @param mimeType MIME type, optionally followed by parameters.
+ * @returns `webm`, `ogg`, `m4a`, `mp3` or `wav`; unrecognised types fall back to `webm`.
+ */
+function audioExtensionForMimeType(mimeType: string): string {
+  // Keep only the "type/subtype" part in front of the first ';'.
+  const [rawType = ''] = mimeType.split(';', 1);
+
+  switch (rawType.trim().toLowerCase()) {
+    case 'audio/webm':
+      return 'webm';
+    case 'audio/ogg':
+      return 'ogg';
+    case 'audio/mp4':
+    case 'audio/m4a':
+      return 'm4a';
+    case 'audio/mpeg':
+    case 'audio/mp3':
+      return 'mp3';
+    case 'audio/wav':
+    case 'audio/wave':
+    case 'audio/x-wav':
+      return 'wav';
+    default:
+      return 'webm';
+  }
+}
+
async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {
const abortController = new AbortController();
const timeoutId = setTimeout(() => abortController.abort(), 120_000);
diff --git a/server/src/whisper-transcriber.ts b/server/src/whisper-transcriber.ts
new file mode 100644
index 0000000..ce39a7d
--- /dev/null
+++ b/server/src/whisper-transcriber.ts
@@ -0,0 +1,176 @@
+import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
+import { createInterface } from 'node:readline';
+
+/** Minimal logger surface the transcriber needs: `info`, `warn` and `error`, each taking a payload and an optional message. */
+type LoggerLike = {
+ info: (payload: unknown, message?: string) => void;
+ warn: (payload: unknown, message?: string) => void;
+ error: (payload: unknown, message?: string) => void;
+};
+
+/** Settings used to spawn the worker process: the executable, the script to run and the values passed as `--model`, `--device` and `--compute-type`. */
+type WhisperTranscriberOptions = {
+ pythonExecutable: string;
+ scriptPath: string;
+ model: string;
+ device: string;
+ computeType: string;
+};
+
+/** Messages parsed from the worker's stdout, discriminated by `type`. An `error` may or may not carry a `requestId`. */
+type WorkerEvent =
+ | { type: 'ready'; model: string }
+ | { type: 'result'; requestId: string; text: string }
+ | { type: 'error'; requestId?: string; message: string }
+ | { type: 'fatal'; message: string };
+
+/**
+ * Runs a Python Whisper worker as a child process and exchanges newline-delimited JSON with it:
+ * requests are written to the worker's stdin, replies are read line by line from its stdout.
+ * The worker is spawned on the first transcription and spawned again after a failure.
+ */
+export class WhisperTranscriber {
+  private worker: ChildProcessWithoutNullStreams | null = null;
+  private readyPromise: Promise<void> | null = null;
+  private resolveReady: (() => void) | null = null;
+  private rejectReady: ((reason?: unknown) => void) | null = null;
+  private readonly pendingRequests = new Map<
+    string,
+    { resolve: (text: string) => void; reject: (reason?: unknown) => void }
+  >();
+
+  /**
+   * @param options Python executable, script path, model, device and compute type for the worker.
+   * @param logger Logger exposing `info`, `warn` and `error`.
+   */
+  constructor(
+    private readonly options: WhisperTranscriberOptions,
+    private readonly logger: LoggerLike,
+  ) {}
+
+  /**
+   * Transcribes the audio file at `audioPath`.
+   *
+   * @param requestId Correlation id; must not match a request that is still pending.
+   * @param audioPath Path of the audio file the worker should read.
+   * @returns The trimmed transcript.
+   * @throws Error when the worker is unavailable or `requestId` is already pending. The returned
+   *   promise rejects when the worker reports an error for the request or the worker fails.
+   */
+  async transcribe(requestId: string, audioPath: string): Promise<string> {
+    await this.ensureWorker();
+
+    const worker = this.worker;
+
+    if (!worker || worker.stdin.destroyed) {
+      throw new Error('The Whisper worker is not available.');
+    }
+
+    if (this.pendingRequests.has(requestId)) {
+      // Overwriting the entry would leave the earlier caller waiting forever.
+      throw new Error(`A transcription for request ${requestId} is already in progress.`);
+    }
+
+    return new Promise<string>((resolve, reject) => {
+      this.pendingRequests.set(requestId, { resolve, reject });
+
+      try {
+        worker.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`);
+      } catch (error) {
+        this.pendingRequests.delete(requestId);
+        reject(error);
+      }
+    });
+  }
+
+  /** Spawns the worker when none is starting or running; resolves once it has reported `ready`. */
+  private async ensureWorker(): Promise<void> {
+    if (this.readyPromise) {
+      return this.readyPromise;
+    }
+
+    const worker = spawn(
+      this.options.pythonExecutable,
+      [
+        this.options.scriptPath,
+        '--model',
+        this.options.model,
+        '--device',
+        this.options.device,
+        '--compute-type',
+        this.options.computeType,
+      ],
+      { stdio: ['pipe', 'pipe', 'pipe'] },
+    );
+
+    this.worker = worker;
+    this.readyPromise = new Promise<void>((resolve, reject) => {
+      this.resolveReady = resolve;
+      this.rejectReady = reject;
+    });
+
+    // Stream events of a worker that has been replaced must not touch its successor's state.
+    const isCurrentWorker = (): boolean => this.worker === worker;
+
+    const stdout = createInterface({ input: worker.stdout });
+    stdout.on('line', (line) => {
+      if (isCurrentWorker()) {
+        this.handleWorkerLine(line);
+      }
+    });
+
+    worker.stderr.on('data', (chunk) => {
+      const message = chunk.toString().trim();
+
+      if (message) {
+        this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
+      }
+    });
+
+    // Without a listener, a failed write to the worker (e.g. EPIPE) is an unhandled 'error' event.
+    worker.stdin.on('error', (error) => {
+      if (isCurrentWorker()) {
+        this.failWorker(error);
+      }
+    });
+
+    worker.on('error', (error) => {
+      this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.'));
+    });
+
+    worker.on('exit', (code, signal) => {
+      this.failWorker(
+        new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`),
+      );
+    });
+
+    return this.readyPromise;
+  }
+
+  /**
+   * Validates a parsed stdout message.
+   *
+   * @returns A normalised worker event, or `null` when the value does not have a known shape.
+   */
+  private static toWorkerEvent(value: unknown): WorkerEvent | null {
+    if (typeof value !== 'object' || value === null) {
+      return null;
+    }
+
+    const { type, requestId, message, text, model } = value as Record<string, unknown>;
+
+    switch (type) {
+      case 'ready':
+        return { type, model: typeof model === 'string' ? model : 'unknown' };
+      case 'fatal':
+        return typeof message === 'string' ? { type, message } : null;
+      case 'error':
+        if (typeof message !== 'string') {
+          return null;
+        }
+
+        return typeof requestId === 'string' ? { type, requestId, message } : { type, message };
+      case 'result':
+        return typeof requestId === 'string' && typeof text === 'string' ? { type, requestId, text } : null;
+      default:
+        return null;
+    }
+  }
+
+  /** Handles one stdout line from the worker: readiness, fatal errors, per-request errors and results. */
+  private handleWorkerLine(line: string): void {
+    let parsed: unknown;
+
+    try {
+      parsed = JSON.parse(line);
+    } catch {
+      this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
+      return;
+    }
+
+    const payload = WhisperTranscriber.toWorkerEvent(parsed);
+
+    if (!payload) {
+      // A message with missing fields used to throw inside this event handler.
+      this.logger.warn({ whisperStdout: line }, 'Ignored malformed Whisper worker message');
+      return;
+    }
+
+    switch (payload.type) {
+      case 'ready':
+        this.logger.info({ model: payload.model }, 'Whisper worker ready');
+        this.resolveReady?.();
+        this.resolveReady = null;
+        this.rejectReady = null;
+        return;
+      case 'fatal':
+        this.failWorker(new Error(payload.message));
+        return;
+      case 'error': {
+        if (!payload.requestId) {
+          // An error that cannot be attributed to a request is treated as a worker failure.
+          this.failWorker(new Error(payload.message));
+          return;
+        }
+
+        const failedRequest = this.pendingRequests.get(payload.requestId);
+
+        if (failedRequest) {
+          this.pendingRequests.delete(payload.requestId);
+          failedRequest.reject(new Error(payload.message));
+        }
+
+        return;
+      }
+      case 'result': {
+        const completedRequest = this.pendingRequests.get(payload.requestId);
+
+        if (completedRequest) {
+          this.pendingRequests.delete(payload.requestId);
+          completedRequest.resolve(payload.text.trim());
+        }
+
+        return;
+      }
+    }
+  }
+
+  /**
+   * Tears down the current worker, rejects the readiness promise and every pending request with
+   * `error`, and resets the state so the next transcription spawns a new worker.
+   */
+  private failWorker(error: Error): void {
+    const worker = this.worker;
+    this.worker = null;
+
+    if (worker) {
+      worker.removeAllListeners();
+      worker.on('error', () => {
+        // Swallow late errors from the discarded process (for example a failed kill).
+      });
+
+      // Terminate a worker that is still running instead of leaving it orphaned.
+      if (worker.exitCode === null && worker.signalCode === null) {
+        worker.kill();
+      }
+    }
+
+    this.rejectReady?.(error);
+    this.resolveReady = null;
+    this.rejectReady = null;
+    this.readyPromise = null;
+
+    for (const { reject } of this.pendingRequests.values()) {
+      reject(error);
+    }
+
+    this.pendingRequests.clear();
+    this.logger.error({ err: error }, 'Whisper worker failed');
+  }
+}