From f0e2b60f43852637745f43b1fd13540ec9f484af Mon Sep 17 00:00:00 2001
From: Laurent Dubertrand <laurent@dubertrand.com>
Date: Wed, 11 Mar 2026 03:08:27 +0100
Subject: [PATCH] Transcription on mac

---
 server/dist/index.js                 |  57 ++-------
 server/dist/speech-transcriber.js    | 124 +++++++++++++++++++
 server/dist/whisper-transcriber.js   | 121 ------------------
 server/package-lock.json             |   1 +
 server/package.json                  |   4 +-
 server/requirements-whisper.txt      |   1 -
 server/scripts/transcribe_whisper.py |  92 --------------
 server/src/index.ts                  |  58 ++-------
 server/src/speech-transcriber.ts     | 173 ++++++++++++++++++++++++++
 server/src/whisper-transcriber.ts    | 176 ---------------------------
 10 files changed, 320 insertions(+), 487 deletions(-)
 create mode 100644 server/dist/speech-transcriber.js
 delete mode 100644 server/dist/whisper-transcriber.js
 delete mode 100644 server/requirements-whisper.txt
 delete mode 100644 server/scripts/transcribe_whisper.py
 create mode 100644 server/src/speech-transcriber.ts
 delete mode 100644 server/src/whisper-transcriber.ts

diff --git a/server/dist/index.js b/server/dist/index.js
index cd13c26..11b9a12 100644
--- a/server/dist/index.js
+++ b/server/dist/index.js
@@ -1,7 +1,5 @@
 import crypto from 'node:crypto';
 import fs from 'node:fs';
-import fsPromises from 'node:fs/promises';
-import os from 'node:os';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { TextEncoder } from 'node:util';
@@ -15,7 +13,7 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe
 import Fastify from 'fastify';
 import { Redis } from 'ioredis';
 import { z } from 'zod';
-import { WhisperTranscriber } from './whisper-transcriber.js';
+import { SpeechTranscriber } from './speech-transcriber.js';
 dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
 const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
 const registerSchema = z.object({
@@ -100,11 +98,9 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR
 const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
 const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
 const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
-const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
-const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
-const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
-const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
-const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
+const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'ws://192.168.1.19:8080';
+const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
+const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
 const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
 const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
 const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -115,12 +111,10 @@ const webAuthnRpName = process.env.WEBAUTHN_RP_NAME ?? 'PrivateChat';
 const webAuthnUserVerification = resolveWebAuthnUserVerification(process.env.WEBAUTHN_USER_VERIFICATION);
 const frontendIndexPath = path.join(frontendDistPath, 'index.html');
 const hasFrontendBuild = fs.existsSync(frontendIndexPath);
-const whisperTranscriber = new WhisperTranscriber({
-    pythonExecutable: whisperPythonExecutable,
-    scriptPath: whisperScriptPath,
-    model: whisperModel,
-    device: whisperDevice,
-    computeType: whisperComputeType,
+const speechTranscriber = new SpeechTranscriber({
+    serviceUrl: speechTranscriptionServiceUrl,
+    language: speechTranscriptionLanguage,
+    requestTimeoutMs: speechTranscriptionTimeoutMs,
 }, app.log);
 fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
 fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
@@ -750,7 +744,7 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) {
             });
         }
         catch (error) {
-            app.log.warn({ err: error, userId }, 'Whisper transcription failed');
+            app.log.warn({ err: error, userId }, 'Speech transcription failed');
             send(socket, {
                 type: 'speech-transcription-error',
                 requestId: parsed.requestId,
@@ -1150,38 +1144,7 @@ function parseClientMessage(rawMessage) {
     };
 }
 async function transcribeAudioPayload(requestId, audioBase64, mimeType) {
-    const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
-    const extension = audioExtensionForMimeType(mimeType);
-    const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
-    try {
-        await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
-        return await whisperTranscriber.transcribe(requestId, audioPath);
-    }
-    finally {
-        await fsPromises.rm(tempDirectory, { recursive: true, force: true });
-    }
-}
-function audioExtensionForMimeType(mimeType) {
-    switch (mimeType.toLowerCase()) {
-        case 'audio/webm':
-        case 'audio/webm;codecs=opus':
-            return 'webm';
-        case 'audio/ogg':
-        case 'audio/ogg;codecs=opus':
-            return 'ogg';
-        case 'audio/mp4':
-        case 'audio/m4a':
-            return 'm4a';
-        case 'audio/mpeg':
-        case 'audio/mp3':
-            return 'mp3';
-        case 'audio/wav':
-        case 'audio/wave':
-        case 'audio/x-wav':
-            return 'wav';
-        default:
-            return 'webm';
-    }
+    return await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
 }
 async function generateImageFromPrompt(prompt) {
     const abortController = new AbortController();
diff --git a/server/dist/speech-transcriber.js b/server/dist/speech-transcriber.js
new file mode 100644
index 0000000..b75fe0b
--- /dev/null
+++ b/server/dist/speech-transcriber.js
@@ -0,0 +1,124 @@
+import WebSocket from 'ws';
+export class SpeechTranscriber {
+    options;
+    logger;
+    constructor(options, logger) {
+        this.options = options;
+        this.logger = logger;
+    }
+    async transcribe(requestId, audioBase64, mimeType) {
+        const audio = this.normalizeAudioPayload(audioBase64, mimeType);
+        return await new Promise((resolve, reject) => {
+            let settled = false;
+            const socket = new WebSocket(this.options.serviceUrl);
+            const finish = (handler) => {
+                if (settled) {
+                    return;
+                }
+                settled = true;
+                clearTimeout(timeout);
+                socket.removeAllListeners();
+                if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
+                    socket.close();
+                }
+                handler();
+            };
+            const timeout = setTimeout(() => {
+                finish(() => {
+                    reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`));
+                });
+            }, this.options.requestTimeoutMs);
+            socket.on('open', () => {
+                try {
+                    socket.send(JSON.stringify({
+                        type: 'transcribe',
+                        id: requestId,
+                        language: this.options.language,
+                        audio,
+                    }));
+                }
+                catch (error) {
+                    finish(() => {
+                        reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
+                    });
+                }
+            });
+            socket.on('message', (payload) => {
+                const event = this.parseEvent(payload);
+                if (!event) {
+                    return;
+                }
+                if (event.id && event.id !== requestId) {
+                    this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
+                    return;
+                }
+                if (event.type === 'start') {
+                    this.logger.info({ requestId, model: event.model, language: event.language }, 'Speech transcription started');
+                    return;
+                }
+                if (event.type === 'delta') {
+                    return;
+                }
+                if (event.type === 'done') {
+                    finish(() => {
+                        resolve(event.text.trim());
+                    });
+                    return;
+                }
+                finish(() => {
+                    reject(new Error(event.message));
+                });
+            });
+            socket.on('error', (error) => {
+                finish(() => {
+                    reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
+                });
+            });
+            socket.on('close', (code, reasonBuffer) => {
+                if (settled) {
+                    return;
+                }
+                const reason = reasonBuffer.toString().trim();
+                const detail = reason
+                    ? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
+                    : `The transcription service closed the connection unexpectedly (code=${code}).`;
+                finish(() => {
+                    reject(new Error(detail));
+                });
+            });
+        });
+    }
+    normalizeAudioPayload(audioBase64, mimeType) {
+        const trimmedAudio = audioBase64.trim();
+        if (trimmedAudio.startsWith('data:')) {
+            return trimmedAudio;
+        }
+        const normalizedMimeType = mimeType.trim() || 'audio/webm';
+        return `data:${normalizedMimeType};base64,${trimmedAudio}`;
+    }
+    parseEvent(payload) {
+        const message = this.rawDataToString(payload).trim();
+        if (!message) {
+            return null;
+        }
+        try {
+            return JSON.parse(message);
+        }
+        catch {
+            this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
+            return null;
+        }
+    }
+    rawDataToString(payload) {
+        if (typeof payload === 'string') {
+            return payload;
+        }
+        if (payload instanceof ArrayBuffer) {
+            return Buffer.from(payload).toString('utf8');
+        }
+        if (Array.isArray(payload)) {
+            return Buffer.concat(payload).toString('utf8');
+        }
+        return payload.toString('utf8');
+    }
+}
diff --git a/server/dist/whisper-transcriber.js b/server/dist/whisper-transcriber.js
deleted file mode 100644
index 6f145e6..0000000
--- a/server/dist/whisper-transcriber.js
+++ /dev/null
@@ -1,121 +0,0 @@
-import { spawn } from 'node:child_process';
-import { createInterface } from 'node:readline';
-export class WhisperTranscriber {
-    options;
-    logger;
-    worker = null;
-    readyPromise = null;
-    resolveReady = null;
-    rejectReady = null;
-    pendingRequests = new Map();
-    constructor(options, logger) {
-        this.options = options;
-        this.logger = logger;
-    }
-    async transcribe(requestId, audioPath) {
-        await this.ensureWorker();
-        if (!this.worker || this.worker.stdin.destroyed) {
-            throw new Error('The Whisper worker is not available.');
-        }
-        return new Promise((resolve, reject) => {
-            this.pendingRequests.set(requestId, { resolve, reject });
-            try {
-                this.worker?.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`);
-            }
-            catch (error) {
-                this.pendingRequests.delete(requestId);
-                reject(error);
-            }
-        });
-    }
-    async ensureWorker() {
-        if (this.readyPromise) {
-            return this.readyPromise;
-        }
-        this.worker = spawn(this.options.pythonExecutable, [
-            this.options.scriptPath,
-            '--model',
-            this.options.model,
-            '--device',
-            this.options.device,
-            '--compute-type',
-            this.options.computeType,
-        ], { stdio: ['pipe', 'pipe', 'pipe'] });
-        this.readyPromise = new Promise((resolve, reject) => {
-            this.resolveReady = resolve;
-            this.rejectReady = reject;
-        });
-        const stdout = createInterface({ input: this.worker.stdout });
-        stdout.on('line', (line) => {
-            this.handleWorkerLine(line);
-        });
-        this.worker.stderr.on('data', (chunk) => {
-            const message = chunk.toString().trim();
-            if (message) {
-                this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
-            }
-        });
-        this.worker.on('error', (error) => {
-            this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.'));
-        });
-        this.worker.on('exit', (code, signal) => {
-            this.failWorker(new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`));
-        });
-        return this.readyPromise;
-    }
-    handleWorkerLine(line) {
-        let payload;
-        try {
-            payload = JSON.parse(line);
-        }
-        catch {
-            this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
-            return;
-        }
-        if (payload.type === 'ready') {
-            this.logger.info({ model: payload.model }, 'Whisper worker ready');
-            this.resolveReady?.();
-            this.resolveReady = null;
-            this.rejectReady = null;
-            return;
-        }
-        if (payload.type === 'fatal') {
-            this.failWorker(new Error(payload.message));
-            return;
-        }
-        if (payload.type === 'error') {
-            if (!payload.requestId) {
-                this.failWorker(new Error(payload.message));
-                return;
-            }
-            const pendingRequest = this.pendingRequests.get(payload.requestId);
-            if (!pendingRequest) {
-                return;
-            }
-            this.pendingRequests.delete(payload.requestId);
-            pendingRequest.reject(new Error(payload.message));
-            return;
-        }
-        const pendingRequest = this.pendingRequests.get(payload.requestId);
-        if (!pendingRequest) {
-            return;
-        }
-        this.pendingRequests.delete(payload.requestId);
-        pendingRequest.resolve(payload.text.trim());
-    }
-    failWorker(error) {
-        if (this.worker) {
-            this.worker.removeAllListeners();
-            this.worker = null;
-        }
-        this.rejectReady?.(error);
-        this.resolveReady = null;
-        this.rejectReady = null;
-        this.readyPromise = null;
-        for (const { reject } of this.pendingRequests.values()) {
-            reject(error);
-        }
-        this.pendingRequests.clear();
-        this.logger.error({ err: error }, 'Whisper worker failed');
-    }
-}
diff --git a/server/package-lock.json b/server/package-lock.json
index d9af003..06af055 100644
--- a/server/package-lock.json
+++ b/server/package-lock.json
@@ -16,6 +16,7 @@
         "dotenv": "^17.3.1",
         "fastify": "^5.8.2",
         "ioredis": "^5.10.0",
+        "ws": "^8.19.0",
         "zod": "^4.3.6"
       },
       "devDependencies": {
diff --git a/server/package.json b/server/package.json
index 7e513de..3d8df28 100644
--- a/server/package.json
+++ b/server/package.json
@@ -6,8 +6,7 @@
   "scripts": {
     "dev": "node node_modules/tsx/dist/cli.mjs watch src/index.ts",
     "build": "node node_modules/typescript/bin/tsc -p tsconfig.json",
-    "start": "node dist/index.js",
-    "setup-whisper": "python3 -m pip install -r requirements-whisper.txt"
+    "start": "node dist/index.js"
   },
   "dependencies": {
     "@fastify/cors": "^11.2.0",
@@ -18,6 +17,7 @@
     "dotenv": "^17.3.1",
     "fastify": "^5.8.2",
     "ioredis": "^5.10.0",
+    "ws": "^8.19.0",
     "zod": "^4.3.6"
   },
   "devDependencies": {
diff --git a/server/requirements-whisper.txt b/server/requirements-whisper.txt
deleted file mode 100644
index 144536a..0000000
--- a/server/requirements-whisper.txt
+++ /dev/null
@@ -1 +0,0 @@
-faster-whisper>=1.0.0
diff --git a/server/scripts/transcribe_whisper.py b/server/scripts/transcribe_whisper.py
deleted file mode 100644
index a1c8db8..0000000
--- a/server/scripts/transcribe_whisper.py
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import json
-import sys
-
-
-def emit(payload):
-    print(json.dumps(payload), flush=True)
-
-
-def load_model(model_name, device, compute_type):
-    try:
-        from faster_whisper import WhisperModel
-    except Exception as exc:
-        emit(
-            {
-                "type": "fatal",
-                "message": "faster-whisper is not installed. Run `python3 -m pip install -r server/requirements-whisper.txt`.",
-            }
-        )
-        raise SystemExit(1) from exc
-
-    try:
-        return WhisperModel(model_name, device=device, compute_type=compute_type)
-    except Exception as exc:
-        emit(
-            {
-                "type": "fatal",
-                "message": f"Could not load the faster-whisper model '{model_name}': {exc}",
-            }
-        )
-        raise SystemExit(1) from exc
-
-
-def transcribe(model, request_id, audio_path):
-    try:
-        segments, _ = model.transcribe(audio_path, vad_filter=True, beam_size=5)
-        text = "".join(segment.text for segment in segments).strip()
-        emit({"type": "result", "requestId": request_id, "text": text})
-    except Exception as exc:
-        emit(
-            {
-                "type": "error",
-                "requestId": request_id,
-                "message": f"Whisper transcription failed: {exc}",
-            }
-        )
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Persistent faster-whisper transcription worker")
-    parser.add_argument("--model", default="small")
-    parser.add_argument("--device", default="cpu")
-    parser.add_argument("--compute-type", default="int8")
-    args = parser.parse_args()
-
-    model = load_model(args.model, args.device, args.compute_type)
-    emit({"type": "ready", "model": args.model})
-
-    for raw_line in sys.stdin:
-        line = raw_line.strip()
-
-        if not line:
-            continue
-
-        try:
-            payload = json.loads(line)
-        except Exception as exc:
-            emit({"type": "error", "message": f"Invalid request JSON: {exc}"})
-            continue
-
-        request_id = payload.get("requestId")
-        audio_path = payload.get("audioPath")
-
-        if not request_id or not audio_path:
-            emit(
-                {
-                    "type": "error",
-                    "requestId": request_id,
-                    "message": "Missing requestId or audioPath.",
-                }
-            )
-            continue
-
-        transcribe(model, request_id, audio_path)
-
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/server/src/index.ts b/server/src/index.ts
index 4e19e56..208343f 100644
--- a/server/src/index.ts
+++ b/server/src/index.ts
@@ -1,7 +1,5 @@
 import crypto from 'node:crypto';
 import fs from 'node:fs';
-import fsPromises from 'node:fs/promises';
-import os from 'node:os';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { TextEncoder } from 'node:util';
@@ -25,7 +23,7 @@ import { Redis } from 'ioredis';
 import type WebSocket from 'ws';
 import { z } from 'zod';
 
-import { WhisperTranscriber } from './whisper-transcriber.js';
+import { SpeechTranscriber } from './speech-transcriber.js';
 
 dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
 
@@ -333,11 +331,9 @@ const frontendDistPath = resolveProjectPath(
 const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
 const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
 const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
-const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
-const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
-const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
-const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
-const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
+const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'ws://192.168.1.19:8080';
+const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
+const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
 const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
 const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
 const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -351,13 +347,11 @@ const webAuthnUserVerification = resolveWebAuthnUserVerification(
 const frontendIndexPath = path.join(frontendDistPath, 'index.html');
 const hasFrontendBuild = fs.existsSync(frontendIndexPath);
 
-const whisperTranscriber = new WhisperTranscriber(
+const speechTranscriber = new SpeechTranscriber(
   {
-    pythonExecutable: whisperPythonExecutable,
-    scriptPath: whisperScriptPath,
-    model: whisperModel,
-    device: whisperDevice,
-    computeType: whisperComputeType,
+    serviceUrl: speechTranscriptionServiceUrl,
+    language: speechTranscriptionLanguage,
+    requestTimeoutMs: speechTranscriptionTimeoutMs,
   },
   app.log,
 );
@@ -1179,7 +1173,7 @@ async function handleSocketMessage(
         text,
       });
     } catch (error) {
-      app.log.warn({ err: error, userId }, 'Whisper transcription failed');
+      app.log.warn({ err: error, userId }, 'Speech transcription failed');
       send(socket, {
         type: 'speech-transcription-error',
         requestId: parsed.requestId,
@@ -1748,39 +1742,7 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
 }
 
 async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
-  const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
-  const extension = audioExtensionForMimeType(mimeType);
-  const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
-
-  try {
-    await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
-    return await whisperTranscriber.transcribe(requestId, audioPath);
-  } finally {
-    await fsPromises.rm(tempDirectory, { recursive: true, force: true });
-  }
-}
-
-function audioExtensionForMimeType(mimeType: string): string {
-  switch (mimeType.toLowerCase()) {
-    case 'audio/webm':
-    case 'audio/webm;codecs=opus':
-      return 'webm';
-    case 'audio/ogg':
-    case 'audio/ogg;codecs=opus':
-      return 'ogg';
-    case 'audio/mp4':
-    case 'audio/m4a':
-      return 'm4a';
-    case 'audio/mpeg':
-    case 'audio/mp3':
-      return 'mp3';
-    case 'audio/wav':
-    case 'audio/wave':
-    case 'audio/x-wav':
-      return 'wav';
-    default:
-      return 'webm';
-  }
+  return await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
 }
 
 async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {
diff --git a/server/src/speech-transcriber.ts b/server/src/speech-transcriber.ts
new file mode 100644
index 0000000..6b8148f
--- /dev/null
+++ b/server/src/speech-transcriber.ts
@@ -0,0 +1,173 @@
+import WebSocket, { type RawData } from 'ws';
+
+type LoggerLike = {
+  info: (payload: unknown, message?: string) => void;
+  warn: (payload: unknown, message?: string) => void;
+  error: (payload: unknown, message?: string) => void;
+};
+
+type SpeechTranscriberOptions = {
+  serviceUrl: string;
+  language: string;
+  requestTimeoutMs: number;
+};
+
+type ServiceEvent =
+  | { type: 'start'; id: string; model: string; language: string }
+  | { type: 'delta'; id: string; text: string; fullText: string }
+  | { type: 'done'; id: string; text: string }
+  | { type: 'error'; id?: string; message: string };
+
+export class SpeechTranscriber {
+  constructor(
+    private readonly options: SpeechTranscriberOptions,
+    private readonly logger: LoggerLike,
+  ) {}
+
+  async transcribe(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
+    const audio = this.normalizeAudioPayload(audioBase64, mimeType);
+
+    return await new Promise<string>((resolve, reject) => {
+      let settled = false;
+      const socket = new WebSocket(this.options.serviceUrl);
+
+      const finish = (handler: () => void): void => {
+        if (settled) {
+          return;
+        }
+
+        settled = true;
+        clearTimeout(timeout);
+        socket.removeAllListeners();
+
+        if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
+          socket.close();
+        }
+
+        handler();
+      };
+
+      const timeout = setTimeout(() => {
+        finish(() => {
+          reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`));
+        });
+      }, this.options.requestTimeoutMs);
+
+      socket.on('open', () => {
+        try {
+          socket.send(
+            JSON.stringify({
+              type: 'transcribe',
+              id: requestId,
+              language: this.options.language,
+              audio,
+            }),
+          );
+        } catch (error) {
+          finish(() => {
+            reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
+          });
+        }
+      });
+
+      socket.on('message', (payload) => {
+        const event = this.parseEvent(payload);
+
+        if (!event) {
+          return;
+        }
+
+        if (event.id && event.id !== requestId) {
+          this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
+          return;
+        }
+
+        if (event.type === 'start') {
+          this.logger.info(
+            { requestId, model: event.model, language: event.language },
+            'Speech transcription started',
+          );
+          return;
+        }
+
+        if (event.type === 'delta') {
+          return;
+        }
+
+        if (event.type === 'done') {
+          finish(() => {
+            resolve(event.text.trim());
+          });
+          return;
+        }
+
+        finish(() => {
+          reject(new Error(event.message));
+        });
+      });
+
+      socket.on('error', (error) => {
+        finish(() => {
+          reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
+        });
+      });
+
+      socket.on('close', (code, reasonBuffer) => {
+        if (settled) {
+          return;
+        }
+
+        const reason = reasonBuffer.toString().trim();
+        const detail = reason
+          ? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
+          : `The transcription service closed the connection unexpectedly (code=${code}).`;
+
+        finish(() => {
+          reject(new Error(detail));
+        });
+      });
+    });
+  }
+
+  private normalizeAudioPayload(audioBase64: string, mimeType: string): string {
+    const trimmedAudio = audioBase64.trim();
+
+    if (trimmedAudio.startsWith('data:')) {
+      return trimmedAudio;
+    }
+
+    const normalizedMimeType = mimeType.trim() || 'audio/webm';
+    return `data:${normalizedMimeType};base64,${trimmedAudio}`;
+  }
+
+  private parseEvent(payload: RawData): ServiceEvent | null {
+    const message = this.rawDataToString(payload).trim();
+
+    if (!message) {
+      return null;
+    }
+
+    try {
+      return JSON.parse(message) as ServiceEvent;
+    } catch {
+      this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
+      return null;
+    }
+  }
+
+  private rawDataToString(payload: RawData): string {
+    if (typeof payload === 'string') {
+      return payload;
+    }
+
+    if (payload instanceof ArrayBuffer) {
+      return Buffer.from(payload).toString('utf8');
+    }
+
+    if (Array.isArray(payload)) {
+      return Buffer.concat(payload).toString('utf8');
+    }
+
+    return payload.toString('utf8');
+  }
+}
diff --git a/server/src/whisper-transcriber.ts b/server/src/whisper-transcriber.ts
deleted file mode 100644
index ce39a7d..0000000
--- a/server/src/whisper-transcriber.ts
+++ /dev/null
@@ -1,176 +0,0 @@
-import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
-import { createInterface } from 'node:readline';
-
-type LoggerLike = {
-  info: (payload: unknown, message?: string) => void;
-  warn: (payload: unknown, message?: string) => void;
-  error: (payload: unknown, message?: string) => void;
-};
-
-type WhisperTranscriberOptions = {
-  pythonExecutable: string;
-  scriptPath: string;
-  model: string;
-  device: string;
-  computeType: string;
-};
-
-type WorkerEvent =
-  | { type: 'ready'; model: string }
-  | { type: 'result'; requestId: string; text: string }
-  | { type: 'error'; requestId?: string; message: string }
-  | { type: 'fatal'; message: string };
-
-export class WhisperTranscriber {
-  private worker: ChildProcessWithoutNullStreams | null = null;
-  private readyPromise: Promise<void> | null = null;
-  private resolveReady: (() => void) | null = null;
-  private rejectReady: ((reason?: unknown) => void) | null = null;
-  private readonly pendingRequests = new Map<
-    string,
-    { resolve: (text: string) => void; reject: (reason?: unknown) => void }
-  >();
-
-  constructor(
-    private readonly options: WhisperTranscriberOptions,
-    private readonly logger: LoggerLike,
-  ) {}
-
-  async transcribe(requestId: string, audioPath: string): Promise<string> {
-    await this.ensureWorker();
-
-    if (!this.worker || this.worker.stdin.destroyed) {
-      throw new Error('The Whisper worker is not available.');
-    }
-
-    return new Promise<string>((resolve, reject) => {
-      this.pendingRequests.set(requestId, { resolve, reject });
-
-      try {
-        this.worker?.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`);
-      } catch (error) {
-        this.pendingRequests.delete(requestId);
-        reject(error);
-      }
-    });
-  }
-
-  private async ensureWorker(): Promise<void> {
-    if (this.readyPromise) {
-      return this.readyPromise;
-    }
-
-    this.worker = spawn(
-      this.options.pythonExecutable,
-      [
-        this.options.scriptPath,
-        '--model',
-        this.options.model,
-        '--device',
-        this.options.device,
-        '--compute-type',
-        this.options.computeType,
-      ],
-      { stdio: ['pipe', 'pipe', 'pipe'] },
-    );
-
-    this.readyPromise = new Promise<void>((resolve, reject) => {
-      this.resolveReady = resolve;
-      this.rejectReady = reject;
-    });
-
-    const stdout = createInterface({ input: this.worker.stdout });
-    stdout.on('line', (line) => {
-      this.handleWorkerLine(line);
-    });
-
-    this.worker.stderr.on('data', (chunk) => {
-      const message = chunk.toString().trim();
-
-      if (message) {
-        this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
-      }
-    });
-
-    this.worker.on('error', (error) => {
-      this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.'));
-    });
-
-    this.worker.on('exit', (code, signal) => {
-      this.failWorker(
-        new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`),
-      );
-    });
-
-    return this.readyPromise;
-  }
-
-  private handleWorkerLine(line: string): void {
-    let payload: WorkerEvent;
-
-    try {
-      payload = JSON.parse(line) as WorkerEvent;
-    } catch {
-      this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
-      return;
-    }
-
-    if (payload.type === 'ready') {
-      this.logger.info({ model: payload.model }, 'Whisper worker ready');
-      this.resolveReady?.();
-      this.resolveReady = null;
-      this.rejectReady = null;
-      return;
-    }
-
-    if (payload.type === 'fatal') {
-      this.failWorker(new Error(payload.message));
-      return;
-    }
-
-    if (payload.type === 'error') {
-      if (!payload.requestId) {
-        this.failWorker(new Error(payload.message));
-        return;
-      }
-
-      const pendingRequest = this.pendingRequests.get(payload.requestId);
-
-      if (!pendingRequest) {
-        return;
-      }
-
-      this.pendingRequests.delete(payload.requestId);
-      pendingRequest.reject(new Error(payload.message));
-      return;
-    }
-
-    const pendingRequest = this.pendingRequests.get(payload.requestId);
-
-    if (!pendingRequest) {
-      return;
-    }
-
-    this.pendingRequests.delete(payload.requestId);
-    pendingRequest.resolve(payload.text.trim());
-  }
-
-  private failWorker(error: Error): void {
-    if (this.worker) {
-      this.worker.removeAllListeners();
-      this.worker = null;
-    }
-
-    this.rejectReady?.(error);
-    this.resolveReady = null;
-    this.rejectReady = null;
-    this.readyPromise = null;
-
-    for (const { reject } of this.pendingRequests.values()) {
-      reject(error);
-    }
-
-    this.pendingRequests.clear();
-    this.logger.error({ err: error }, 'Whisper worker failed');
-  }
-}