From 189f989c0d702ed25de635b6509ab5a1d15d3c9f Mon Sep 17 00:00:00 2001
From: Laurent Dubertrand <laurent@dubertrand.com>
Date: Wed, 11 Mar 2026 00:26:49 +0100
Subject: [PATCH] Dictation through AI

---
 client/src/app/chat-page.component.html |  24 +++
 client/src/app/chat-page.component.scss |   7 +
 client/src/app/chat-page.component.ts   | 203 +++++++++++++++++++++++-
 client/src/app/chat-session.service.ts  |  86 ++++++++++
 client/src/app/models.ts                |  10 ++
 server/dist/index.js                    |  82 ++++++++++
 server/dist/whisper-transcriber.js      | 121 ++++++++++++++
 server/package.json                     |   3 +-
 server/requirements-whisper.txt         |   1 +
 server/scripts/transcribe_whisper.py    |  92 +++++++++++
 server/src/index.ts                     | 108 +++++++++++++
 server/src/whisper-transcriber.ts       | 176 ++++++++++++++++++++
 12 files changed, 911 insertions(+), 2 deletions(-)
 create mode 100644 server/dist/whisper-transcriber.js
 create mode 100644 server/requirements-whisper.txt
 create mode 100644 server/scripts/transcribe_whisper.py
 create mode 100644 server/src/whisper-transcriber.ts
diff --git a/client/src/app/chat-page.component.html b/client/src/app/chat-page.component.html
index bf4de09..22fcb28 100644
--- a/client/src/app/chat-page.component.html
+++ b/client/src/app/chat-page.component.html
@@ -314,6 +314,30 @@
                   {{ isRecordingVoice() ? '⏹️' : '🎙️' }}
                 </button>
 
+                <!-- Dictation toggle. It stays enabled while recording so the microphone can always be released, even if the peer or signaling drops. -->
+                <button class="composer-dictation"
+                  type="button"
+                  [disabled]="isTranscribingDictation() || (!isDictating() && (!session.isSelectedPeerReady() || session.signalingState() !== 'connected'))"
+                  (click)="toggleDictation(composerTextarea)"
+                  [title]="
+                    isDictating()
+                      ? 'Stop dictation and transcribe'
+                      : isTranscribingDictation()
+                        ? 'Transcribing dictated audio'
+                        : 'Start dictation'
+                  "
+                  [attr.aria-label]="
+                    isDictating()
+                      ? 'Stop dictation and transcribe'
+                      : isTranscribingDictation()
+                        ? 'Transcribing dictated audio'
+                        : 'Start dictation'
+                  "
+                  [class.composer-dictation-active]="isDictating() || isTranscribingDictation()"
+                >
+                  {{ isDictating() ? '🛑' : isTranscribingDictation() ? '⏳' : '🗣️' }}
+                </button>
+
                 <input
                   #fileInput
                   class="composer-file-input"
diff --git a/client/src/app/chat-page.component.scss b/client/src/app/chat-page.component.scss
index 59a4b18..5a2b5c5 100644
--- a/client/src/app/chat-page.component.scss
+++ b/client/src/app/chat-page.component.scss
@@ -357,6 +357,7 @@
 
 .composer-camera,
 .composer-call,
+.composer-dictation,
 .composer-hangup,
 .composer-voice,
 .composer-image-generate,
@@ -398,6 +399,12 @@
   background: var(--badge-background);
 }
 
+.composer-dictation { // Base look of the dictation toggle; the template adds .composer-dictation-active while dictating or transcribing.
+  color: var(--page-text);
+  background: linear-gradient(135deg, #f6d8ff, #ffcadb);
+}
+
+.composer-dictation-active,
 .composer-hangup,
 .composer-voice-recording {
   color: #fff;
diff --git a/client/src/app/chat-page.component.ts b/client/src/app/chat-page.component.ts
index fb7cd40..ee9cd44 100644
--- a/client/src/app/chat-page.component.ts
+++ b/client/src/app/chat-page.component.ts
@@ -1,5 +1,5 @@
 import { CommonModule } from '@angular/common';
-import { Component, computed, effect, ElementRef, inject, OnDestroy, signal, ViewChild } from '@angular/core';
+import { Component, computed, effect, ElementRef, inject, NgZone, OnDestroy, signal, ViewChild } from '@angular/core';
 import { toSignal } from '@angular/core/rxjs-interop';
 import { FormsModule } from '@angular/forms';
 import { ActivatedRoute, Router, RouterLink } from '@angular/router';
@@ -18,6 +18,7 @@ import type { ChatEntry, ConnectionState, PeerSummary } from './models';
 export class ChatPageComponent implements OnDestroy {
   private readonly route = inject(ActivatedRoute);
   private readonly router = inject(Router);
+  private readonly ngZone = inject(NgZone);
   private readonly routeParamMap = toSignal(this.route.paramMap, {
     initialValue: this.route.snapshot.paramMap,
   });
@@ -28,6 +29,14 @@ export class ChatPageComponent implements OnDestroy {
   private voiceChunks: Blob[] = [];
   private discardRecordedVoice = false;
   private recordingPeerId: string | null = null;
+  private dictationRecorder: MediaRecorder | null = null;
+  private dictationStream: MediaStream | null = null;
+  private dictationChunks: Blob[] = [];
+  private dictationBaseText = '';
+  private discardRecordedDictation = false;
+  private dictationCompletionPromise: Promise<void> | null = null;
+  private resolveDictationCompletion: (() => void) | null = null;
+  private dictationApplyToken = 0;
   @ViewChild('callAudioElement')
   set callAudioElementRef(value: ElementRef<HTMLAudioElement> | undefined) {
     this.callAudioElement = value;
@@ -39,6 +48,8 @@ export class ChatPageComponent implements OnDestroy {
   readonly forwardingEntryId = signal<string | null>(null);
   readonly emojiPickerOpen = signal(false);
   readonly isRecordingVoice = signal(false);
+  readonly isDictating = signal(false);
+  readonly isTranscribingDictation = signal(false);
   readonly emojiOptions = [
     '😀', '😁', '😂', '🤣', '😊',
     '😉', '😍', '😘', '😎', '🤔',
@@ -152,6 +163,7 @@ export class ChatPageComponent implements OnDestroy {
   }
 
   ngOnDestroy(): void {
+    void this.stopDictation(true);
     this.stopVoiceRecording(true);
     this.detachCallAudioSource();
   }
@@ -174,6 +186,7 @@ export class ChatPageComponent implements OnDestroy {
       return;
     }
 
+    await this.stopDictation(false);
     await this.session.sendText(peerId, this.messageText);
     this.messageText = '';
     this.emojiPickerOpen.set(false);
@@ -188,6 +201,7 @@ export class ChatPageComponent implements OnDestroy {
       return;
     }
 
+    await this.stopDictation(false);
     const requested = await this.session.requestGeneratedImage(peerId, this.messageText);
 
     if (!requested) {
@@ -262,6 +276,92 @@ export class ChatPageComponent implements OnDestroy {
     input.value = '';
   }
 
+  /** Starts microphone dictation, or stops it (which triggers transcription into `textarea`) when already recording. */
+  async toggleDictation(textarea: HTMLTextAreaElement): Promise<void> {
+    if (this.isDictating()) {
+      await this.stopDictation(false);
+      return;
+    }
+
+    if (this.isTranscribingDictation() || !this.peerId()) {
+      return;
+    }
+
+    if (typeof MediaRecorder === 'undefined' || typeof navigator === 'undefined') {
+      this.session.error.set('This browser does not support dictation recording.');
+      return;
+    }
+
+    if (typeof navigator.mediaDevices?.getUserMedia !== 'function') {
+      this.session.error.set('This browser cannot access the microphone for dictation.');
+      return;
+    }
+
+    this.dictationBaseText = this.messageText;
+    this.discardRecordedDictation = false;
+    this.dictationApplyToken += 1;
+
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      // Track the stream before building the recorder so the catch block below releases the
+      // microphone if the MediaRecorder constructor rejects the stream or MIME type.
+      this.dictationStream = stream;
+      const preferredMimeType = this.preferredVoiceMimeType();
+      const recorder = preferredMimeType
+        ? new MediaRecorder(stream, { mimeType: preferredMimeType })
+        : new MediaRecorder(stream);
+      const applyToken = this.dictationApplyToken;
+
+      this.dictationChunks = [];
+      this.dictationRecorder = recorder;
+      this.dictationCompletionPromise = new Promise<void>((resolve) => {
+        this.resolveDictationCompletion = resolve;
+      });
+
+      recorder.ondataavailable = (event) => {
+        if (event.data.size > 0) {
+          this.dictationChunks.push(event.data);
+        }
+      };
+
+      recorder.onerror = () => {
+        // Detach onstop: the stop event a recorder fires after an error must not transcribe partial audio.
+        recorder.onstop = null;
+        this.ngZone.run(() => {
+          this.session.error.set('Could not record dictation audio.');
+          this.cleanupDictationRecorder();
+          this.finishDictationCompletion();
+        });
+      };
+
+      recorder.onstop = () => {
+        const shouldDiscard = this.discardRecordedDictation;
+        const mimeType = recorder.mimeType || preferredMimeType || 'audio/webm';
+        const blob = new Blob(this.dictationChunks, { type: mimeType });
+
+        this.ngZone.run(() => {
+          this.cleanupDictationRecorder();
+
+          if (shouldDiscard || blob.size === 0) {
+            this.finishDictationCompletion();
+            return;
+          }
+
+          this.isTranscribingDictation.set(true);
+          void this.transcribeDictation(blob, textarea, applyToken);
+        });
+      };
+
+      recorder.start();
+      this.isDictating.set(true);
+      this.session.error.set(null);
+    } catch {
+      this.session.error.set('Could not start dictation recording.');
+      this.cleanupDictationRecorder();
+      this.finishDictationCompletion();
+    }
+  }
+
   async toggleVoiceRecording(): Promise<void> {
     if (this.isRecordingVoice()) {
       this.stopVoiceRecording(false);
@@ -482,6 +582,7 @@ export class ChatPageComponent implements OnDestroy {
       return;
     }
 
+    await this.stopDictation(true);
     this.stopVoiceRecording(true);
     this.forwardingEntryId.set(null);
     this.emojiPickerOpen.set(false);
@@ -532,6 +633,106 @@ export class ChatPageComponent implements OnDestroy {
     return candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) ?? '';
   }
 
+  /** Stops dictation (`discard` drops the recording and pending transcript) and awaits the completion promise captured on entry. */
+  private async stopDictation(discard: boolean): Promise<void> {
+    const completion = this.dictationCompletionPromise;
+    if (discard) {
+      this.dictationApplyToken += 1; // a stale token makes transcribeDictation skip applying its result
+      this.messageText = this.dictationBaseText || this.messageText; // restore the pre-dictation text when one was saved
+      this.handleMessageTextChange(this.messageText);
+      this.isTranscribingDictation.set(false);
+    } else {
+      this.dictationBaseText = this.messageText; // the transcript is merged onto the text as it is now
+    }
+
+    if (this.dictationRecorder) {
+      this.discardRecordedDictation = discard;
+      // stop() leads to the recorder's onstop handler; an already inactive recorder is cleaned up directly.
+      if (this.dictationRecorder.state !== 'inactive') {
+        this.dictationRecorder.stop();
+      } else {
+        this.cleanupDictationRecorder();
+        this.finishDictationCompletion();
+      }
+    } else if (!completion) {
+      this.dictationBaseText = '';
+    }
+
+    if (completion) {
+      await completion;
+    }
+  }
+
+  /** Stops every track of the dictation stream and resets the recorder, stream, chunk buffer, discard flag and `isDictating`. */
+  private cleanupDictationRecorder(): void {
+    if (this.dictationStream) {
+      for (const track of this.dictationStream.getTracks()) {
+        track.stop();
+      }
+    }
+    this.dictationRecorder = null;
+    this.dictationStream = null;
+    this.dictationChunks = [];
+    this.discardRecordedDictation = false;
+    this.isDictating.set(false);
+  }
+
+  private finishDictationCompletion(): void { // Settles the current dictation cycle and clears its saved base text.
+    this.resolveDictationCompletion?.(); // releases anything awaiting the completion promise, e.g. stopDictation()
+    this.resolveDictationCompletion = null;
+    this.dictationCompletionPromise = null;
+    this.dictationBaseText = '';
+  }
+
+  /** Requests a transcript for `blob` and merges it into the composer, unless `applyToken` is no longer current. */
+  private async transcribeDictation(blob: Blob, textarea: HTMLTextAreaElement, applyToken: number): Promise<void> {
+    try {
+      const transcript = await this.session.requestSpeechTranscription(blob);
+      // A changed token means this dictation was discarded or a newer one was started; drop the result.
+      if (applyToken !== this.dictationApplyToken) {
+        return;
+      }
+      this.applyDictatedText(this.mergeDictatedText(this.dictationBaseText, transcript), textarea);
+    } catch {
+      if (applyToken === this.dictationApplyToken) {
+        this.session.error.set('Dictation transcription failed.');
+      }
+    } finally {
+      if (applyToken === this.dictationApplyToken) {
+        this.isTranscribingDictation.set(false);
+      }
+
+      this.finishDictationCompletion();
+    }
+  }
+
+  /**
+   * Appends a dictated transcript to the existing composer text.
+   * A blank transcript leaves `baseText` untouched; blank base text yields just the
+   * trimmed transcript; otherwise both are joined with a single space.
+   */
+  private mergeDictatedText(baseText: string, transcript: string): string {
+    const spoken = transcript.trim();
+    if (spoken.length === 0) {
+      return baseText;
+    }
+    const hasExistingText = baseText.trim().length > 0;
+    return hasExistingText ? [baseText.trimEnd(), spoken].join(' ') : spoken;
+  }
+
+  /** Writes `text` into the model and the textarea, then moves the caret to the end of the text. */
+  private applyDictatedText(text: string, textarea: HTMLTextAreaElement): void {
+    this.messageText = text;
+    textarea.value = text;
+    this.composerSelectionStart = text.length;
+    this.composerSelectionEnd = text.length;
+    this.handleMessageTextChange(text);
+    queueMicrotask(() => { // focus and caret placement are deferred to a microtask
+      textarea.focus();
+      textarea.setSelectionRange(text.length, text.length);
+    });
+  }
+
   private syncCallAudioSource(): void {
     const audio = this.callAudioElement?.nativeElement;
 
diff --git a/client/src/app/chat-session.service.ts b/client/src/app/chat-session.service.ts
index a354a67..69f79d9 100644
--- a/client/src/app/chat-session.service.ts
+++ b/client/src/app/chat-session.service.ts
@@ -170,6 +170,10 @@ export class ChatSessionService {
     string,
     { peerId: string; prompt: string; waitMessageId: string }
   >();
+  private readonly pendingSpeechTranscriptionRequests = new Map<
+    string,
+    { resolve: (text: string) => void; reject: (reason?: unknown) => void }
+  >();
   private readonly remoteVideoStreams = signal<Array<{ peerId: string; stream: MediaStream }>>([]);
   private readonly remoteAudioStreams = signal<Array<{ peerId: string; stream: MediaStream }>>([]);
   private readonly activeCameraPeerId = signal<string | null>(null);
@@ -916,6 +920,32 @@ export class ChatSessionService {
     return true;
   }
 
+  /** Sends recorded audio over the signaling socket; resolves with the transcript, rejects on send failure or server error. */
+  async requestSpeechTranscription(audioBlob: Blob): Promise<string> {
+    // Encode first: the socket can close while the blob is being read, so it is checked afterwards.
+    const audioBase64 = await this.blobToBase64(audioBlob);
+    const websocket = this.websocket;
+
+    if (!websocket || websocket.readyState !== WebSocket.OPEN) {
+      throw new Error('You must be connected to signaling before using dictation.');
+    }
+
+    const requestId = crypto.randomUUID();
+
+    return new Promise<string>((resolve, reject) => {
+      this.pendingSpeechTranscriptionRequests.set(requestId, { resolve, reject });
+
+      try {
+        this.error.set(null);
+        const mimeType = audioBlob.type || 'audio/webm';
+        websocket.send(JSON.stringify({ type: 'speech-transcription', requestId, mimeType, audioBase64 }));
+      } catch (error) {
+        this.pendingSpeechTranscriptionRequests.delete(requestId);
+        reject(error);
+      }
+    });
+  }
+
   private async loadAccessKeys(): Promise<void> {
     const token = this.token();
 
@@ -990,6 +1020,7 @@ export class ChatSessionService {
       const shouldReconnect = this.websocket === websocket && !this.suppressSocketReconnect;
 
       this.stopWebSocketHeartbeat();
+      this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
       this.signalingState.set('disconnected');
       this.status.set('Signaling connection closed.');
 
@@ -1014,6 +1045,7 @@ export class ChatSessionService {
 
   private disconnectWebSocket(): void {
     this.stopWebSocketHeartbeat();
+    this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
 
     if (this.websocket) {
       this.suppressSocketReconnect = true;
@@ -1055,6 +1087,12 @@ export class ChatSessionService {
       case 'image-generation-error':
         this.handleGeneratedImageError(event);
         break;
+      case 'speech-transcribed':
+        this.handleSpeechTranscribed(event);
+        break;
+      case 'speech-transcription-error':
+        this.handleSpeechTranscriptionError(event);
+        break;
       case 'pong':
         break;
       case 'error':
@@ -1109,6 +1147,28 @@ export class ChatSessionService {
     this.error.set(event.message);
   }
 
+  /** Resolves the pending dictation request matching `event.requestId`; unknown ids are ignored. */
+  private handleSpeechTranscribed(event: Extract<ServerEvent, { type: 'speech-transcribed' }>): void {
+    const { requestId, text } = event;
+    const waiter = this.pendingSpeechTranscriptionRequests.get(requestId);
+
+    if (waiter) {
+      this.pendingSpeechTranscriptionRequests.delete(requestId);
+      waiter.resolve(text);
+    }
+  }
+
+  /** Rejects the matching pending dictation request (if any) and always publishes the server message as the session error. */
+  private handleSpeechTranscriptionError(event: Extract<ServerEvent, { type: 'speech-transcription-error' }>): void {
+    const { requestId, message } = event;
+    const waiter = this.pendingSpeechTranscriptionRequests.get(requestId);
+    if (waiter) {
+      this.pendingSpeechTranscriptionRequests.delete(requestId);
+      waiter.reject(new Error(message));
+    }
+    this.error.set(message);
+  }
+
   private async restoreSession(): Promise<void> {
     const token = this.token();
 
@@ -2024,6 +2084,18 @@ export class ChatSessionService {
     }
   }
 
+  /**
+   * Rejects every in-flight dictation request with a fresh `Error(message)` and empties the
+   * pending map, so callers awaiting a transcript are released.
+   */
+  private rejectPendingSpeechTranscriptions(message: string): void {
+    const pending = this.pendingSpeechTranscriptionRequests;
+    pending.forEach((waiter) => {
+      waiter.reject(new Error(message));
+    });
+    pending.clear();
+  }
+
   private clearLocalAuth(statusMessage: string): void {
     this.clearWebSocketReconnect();
     this.disconnectWebSocket();
@@ -2034,6 +2106,7 @@ export class ChatSessionService {
     this.stopRingtone();
     this.releasePreloadedRingtone();
     this.pendingImageGenerationRequests.clear();
+    this.rejectPendingSpeechTranscriptions('Session ended during dictation.');
     this.remoteVideoStreams.set([]);
     this.remoteAudioStreams.set([]);
     this.remoteVideoModalPeerId.set(null);
@@ -2060,6 +2133,19 @@ export class ChatSessionService {
     this.removeStorage('privatechat.user');
   }
 
+  /** Encodes a blob as base64, converting its bytes in 0x8000-byte slices (presumably to keep each spread call small). */
+  private async blobToBase64(blob: Blob): Promise<string> {
+    const bytes = new Uint8Array(await blob.arrayBuffer());
+    const sliceLength = 0x8000;
+    const pieces: string[] = [];
+
+    for (let offset = 0; offset < bytes.length; offset += sliceLength) {
+      pieces.push(String.fromCharCode(...bytes.subarray(offset, offset + sliceLength)));
+    }
+
+    return btoa(pieces.join(''));
+  }
+
   private async loadPersistedMessages(userId: string): Promise<void> {
     const messageEncryptionKey = this.messageEncryptionKey;
 
diff --git a/client/src/app/models.ts b/client/src/app/models.ts
index 667631c..1704cd7 100644
--- a/client/src/app/models.ts
+++ b/client/src/app/models.ts
@@ -130,6 +130,16 @@ export type ServerEvent =
       peerId: string;
       message: string;
     }
+  | {
+      type: 'speech-transcribed';
+      requestId: string;
+      text: string;
+    }
+  | {
+      type: 'speech-transcription-error';
+      requestId: string;
+      message: string;
+    }
   | { type: 'pong' }
   | { type: 'error'; message: string };
 
diff --git a/server/dist/index.js b/server/dist/index.js
index 19f452f..cd13c26 100644
--- a/server/dist/index.js
+++ b/server/dist/index.js
@@ -1,5 +1,7 @@
 import crypto from 'node:crypto';
 import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import os from 'node:os';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { TextEncoder } from 'node:util';
@@ -13,6 +15,7 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe
 import Fastify from 'fastify';
 import { Redis } from 'ioredis';
 import { z } from 'zod';
+import { WhisperTranscriber } from './whisper-transcriber.js';
 dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
 const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
 const registerSchema = z.object({
@@ -81,6 +84,12 @@ const signalMessageSchema = z.discriminatedUnion('type', [
     z.object({
         type: z.literal('ping'),
     }),
+    z.object({
+        type: z.literal('speech-transcription'),
+        requestId: z.string().uuid(),
+        mimeType: z.string().trim().min(1).max(128),
+        audioBase64: z.string().min(1).max(32_000_000),
+    }),
 ]);
 const app = Fastify({ logger: true, trustProxy: true });
 const approvalAdminUsername = 'ladparis';
@@ -91,6 +100,11 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR
 const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
 const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
 const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
+const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
+const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
+const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
+const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
+const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
 const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
 const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
 const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -101,6 +115,13 @@ const webAuthnRpName = process.env.WEBAUTHN_RP_NAME ?? 'PrivateChat';
 const webAuthnUserVerification = resolveWebAuthnUserVerification(process.env.WEBAUTHN_USER_VERIFICATION);
 const frontendIndexPath = path.join(frontendDistPath, 'index.html');
 const hasFrontendBuild = fs.existsSync(frontendIndexPath);
+const whisperTranscriber = new WhisperTranscriber({
+    pythonExecutable: whisperPythonExecutable,
+    scriptPath: whisperScriptPath,
+    model: whisperModel,
+    device: whisperDevice,
+    computeType: whisperComputeType,
+}, app.log);
 fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
 fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
 const encryptionKey = deriveEncryptionKey(loadOrCreateMasterKey(masterKeyPath));
@@ -719,6 +740,25 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) {
         }
         return;
     }
+    if (parsed.type === 'speech-transcription') {
+        try {
+            const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
+            send(socket, {
+                type: 'speech-transcribed',
+                requestId: parsed.requestId,
+                text,
+            });
+        }
+        catch (error) {
+            app.log.warn({ err: error, userId }, 'Whisper transcription failed');
+            send(socket, {
+                type: 'speech-transcription-error',
+                requestId: parsed.requestId,
+                message: error instanceof Error ? error.message : 'Speech transcription failed.',
+            });
+        }
+        return;
+    }
     let delivered = 0;
     const recipientSockets = socketsByUserId.get(parsed.to);
     if (recipientSockets) {
@@ -1095,12 +1135,54 @@ function parseClientMessage(rawMessage) {
             prompt: parsed.data.prompt,
         };
     }
+    if (parsed.data.type === 'speech-transcription') {
+        return {
+            type: 'speech-transcription',
+            requestId: parsed.data.requestId,
+            mimeType: parsed.data.mimeType,
+            audioBase64: parsed.data.audioBase64,
+        };
+    }
     return {
         type: 'signal',
         to: parsed.data.to,
         signal: normalizeSignal(parsed.data.signal),
     };
 }
+/** Writes the base64 audio into a fresh temp directory, transcribes it via the Whisper worker, and always removes the directory. */
+async function transcribeAudioPayload(requestId, audioBase64, mimeType) {
+    const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
+    const extension = audioExtensionForMimeType(mimeType);
+    const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
+    try {
+        await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
+        return await whisperTranscriber.transcribe(requestId, audioPath);
+    } finally {
+        await fsPromises.rm(tempDirectory, { recursive: true, force: true });
+    }
+}
+/** Maps a recorder MIME type to a file extension, ignoring parameters such as ";codecs=opus"; unknown types fall back to 'webm'. (Compiled output: mirror in server/src/index.ts.) */
+function audioExtensionForMimeType(mimeType) {
+    const mediaType = mimeType.split(';', 1)[0].trim().toLowerCase();
+    switch (mediaType) {
+        case 'audio/webm':
+            return 'webm';
+        case 'audio/ogg':
+            return 'ogg';
+        case 'audio/mp4':
+        case 'audio/m4a':
+            return 'm4a';
+        case 'audio/mpeg':
+        case 'audio/mp3':
+            return 'mp3';
+        case 'audio/wav':
+        case 'audio/wave':
+        case 'audio/x-wav':
+            return 'wav';
+        default:
+            return 'webm';
+    }
+}
 async function generateImageFromPrompt(prompt) {
     const abortController = new AbortController();
     const timeoutId = setTimeout(() => abortController.abort(), 120_000);
diff --git a/server/dist/whisper-transcriber.js b/server/dist/whisper-transcriber.js
new file mode 100644
index 0000000..6f145e6
--- /dev/null
+++ b/server/dist/whisper-transcriber.js
@@ -0,0 +1,121 @@
+import { spawn } from 'node:child_process';
+import { createInterface } from 'node:readline';
+/** Runs the faster-whisper script as a long-lived child process, exchanging one JSON message per line. (Compiled output: mirror changes in server/src/whisper-transcriber.ts.) */
+export class WhisperTranscriber {
+    options;
+    logger;
+    worker = null;
+    readyPromise = null;
+    resolveReady = null;
+    rejectReady = null;
+    pendingRequests = new Map();
+    constructor(options, logger) {
+        this.options = options;
+        this.logger = logger;
+    }
+    /** Sends `audioPath` to the worker and resolves with the trimmed transcript; rejects if the worker fails or reports an error. */
+    async transcribe(requestId, audioPath) {
+        await this.ensureWorker();
+        const worker = this.worker;
+        if (!worker || worker.stdin.destroyed) {
+            throw new Error('The Whisper worker is not available.');
+        }
+        return new Promise((resolve, reject) => {
+            this.pendingRequests.set(requestId, { resolve, reject });
+            try {
+                worker.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`);
+            } catch (error) {
+                this.pendingRequests.delete(requestId);
+                reject(error);
+            }
+        });
+    }
+    /** Spawns the worker on first use and returns a promise that settles when it reports `ready` or fails. */
+    async ensureWorker() {
+        if (this.readyPromise) {
+            return this.readyPromise;
+        }
+        const { pythonExecutable, scriptPath, model, device, computeType } = this.options;
+        const args = [scriptPath, '--model', model, '--device', device, '--compute-type', computeType];
+        const worker = spawn(pythonExecutable, args, { stdio: ['pipe', 'pipe', 'pipe'] });
+        this.worker = worker;
+        this.readyPromise = new Promise((resolve, reject) => {
+            this.resolveReady = resolve;
+            this.rejectReady = reject;
+        });
+        // Listeners stay attached but act only while `worker` is current, so late events from a failed worker are harmless.
+        const failIfCurrent = (error) => {
+            if (this.worker === worker) {
+                this.failWorker(error);
+            }
+        };
+        createInterface({ input: worker.stdout }).on('line', (line) => {
+            if (this.worker === worker) {
+                this.handleWorkerLine(line);
+            }
+        });
+        worker.stderr.on('data', (chunk) => {
+            const message = chunk.toString().trim();
+            if (message) {
+                this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
+            }
+        });
+        // Write failures such as EPIPE surface as stream 'error' events; without a listener they would crash the process.
+        worker.stdin.on('error', failIfCurrent);
+        worker.on('error', failIfCurrent);
+        worker.on('exit', (code, signal) => {
+            failIfCurrent(new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`));
+        });
+        return this.readyPromise;
+    }
+    /** Dispatches one stdout line: `ready`, `fatal`, per-request `error`, or a result; non-object lines are logged and malformed results rejected. */
+    handleWorkerLine(line) {
+        let payload;
+        try {
+            payload = JSON.parse(line);
+        } catch {
+            payload = null;
+        }
+        if (typeof payload !== 'object' || payload === null) {
+            this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
+            return;
+        }
+        if (payload.type === 'ready') {
+            this.logger.info({ model: payload.model }, 'Whisper worker ready');
+            this.resolveReady?.();
+            this.resolveReady = null;
+            this.rejectReady = null;
+            return;
+        }
+        if (payload.type === 'fatal' || (payload.type === 'error' && !payload.requestId)) {
+            this.failWorker(new Error(payload.message));
+            return;
+        }
+        const pendingRequest = this.pendingRequests.get(payload.requestId);
+        if (!pendingRequest) {
+            return;
+        }
+        this.pendingRequests.delete(payload.requestId);
+        if (payload.type === 'error' || typeof payload.text !== 'string') {
+            pendingRequest.reject(new Error(payload.message ?? 'The Whisper worker returned a malformed result.'));
+            return;
+        }
+        pendingRequest.resolve(payload.text.trim());
+    }
+    /** Tears down the current worker, rejects the readiness promise and all pending requests, and allows a respawn on next use. */
+    failWorker(error) {
+        const worker = this.worker;
+        this.worker = null;
+        // Terminate the child so a worker abandoned after a protocol error does not linger with the model loaded.
+        worker?.kill();
+        this.rejectReady?.(error);
+        this.resolveReady = null;
+        this.rejectReady = null;
+        this.readyPromise = null;
+        for (const { reject } of this.pendingRequests.values()) {
+            reject(error);
+        }
+        this.pendingRequests.clear();
+        this.logger.error({ err: error }, 'Whisper worker failed');
+    }
+}
diff --git a/server/package.json b/server/package.json
index 85c9a81..7e513de 100644
--- a/server/package.json
+++ b/server/package.json
@@ -6,7 +6,8 @@
   "scripts": {
     "dev": "node node_modules/tsx/dist/cli.mjs watch src/index.ts",
     "build": "node node_modules/typescript/bin/tsc -p tsconfig.json",
-    "start": "node dist/index.js"
+    "start": "node dist/index.js",
+    "setup-whisper": "python3 -m pip install -r requirements-whisper.txt"
   },
   "dependencies": {
     "@fastify/cors": "^11.2.0",
diff --git a/server/requirements-whisper.txt b/server/requirements-whisper.txt
new file mode 100644
index 0000000..144536a
--- /dev/null
+++ b/server/requirements-whisper.txt
@@ -0,0 +1 @@
+faster-whisper>=1.0.0
diff --git a/server/scripts/transcribe_whisper.py b/server/scripts/transcribe_whisper.py
new file mode 100644
index 0000000..a1c8db8
--- /dev/null
+++ b/server/scripts/transcribe_whisper.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import sys
+
+
+def emit(payload):  # Write `payload` to stdout as a single JSON line and flush immediately.
+    print(json.dumps(payload), flush=True)
+
+
+def load_model(model_name, device, compute_type):
+    """Import faster-whisper and build the model; on either failure emit a "fatal" message and exit with status 1."""
+    try:
+        from faster_whisper import WhisperModel
+    except Exception as exc:
+        emit(
+            {
+                "type": "fatal",
+                "message": "faster-whisper is not installed. Run `python3 -m pip install -r server/requirements-whisper.txt`.",
+            }
+        )
+        raise SystemExit(1) from exc
+    try:
+        return WhisperModel(model_name, device=device, compute_type=compute_type)
+    except Exception as exc:
+        emit(
+            {
+                "type": "fatal",
+                "message": f"Could not load the faster-whisper model '{model_name}': {exc}",
+            }
+        )
+        raise SystemExit(1) from exc
+
+
+def transcribe(model, request_id, audio_path):  # Emits a "result" message, or an "error" message tagged with request_id.
+    try:
+        segments, _ = model.transcribe(audio_path, vad_filter=True, beam_size=5)
+        text = "".join(segment.text for segment in segments).strip()  # concatenate all segment texts
+        emit({"type": "result", "requestId": request_id, "text": text})
+    except Exception as exc:
+        emit(
+            {
+                "type": "error",
+                "requestId": request_id,
+                "message": f"Whisper transcription failed: {exc}",
+            }
+        )
+
+
+def main():
+    """Load the model, announce readiness, then handle one JSON transcription request per stdin line."""
+    parser = argparse.ArgumentParser(description="Persistent faster-whisper transcription worker")
+    parser.add_argument("--model", default="small")
+    parser.add_argument("--device", default="cpu")
+    parser.add_argument("--compute-type", default="int8")
+    args = parser.parse_args()
+    model = load_model(args.model, args.device, args.compute_type)
+    emit({"type": "ready", "model": args.model})
+    # The loop ends when stdin is exhausted; blank lines are skipped.
+    for raw_line in sys.stdin:
+        line = raw_line.strip()
+
+        if not line:
+            continue
+
+        try:
+            payload = json.loads(line)
+        except Exception as exc:
+            emit({"type": "error", "message": f"Invalid request JSON: {exc}"})  # no requestId can be attached here
+            continue
+
+        request_id = payload.get("requestId")
+        audio_path = payload.get("audioPath")
+
+        if not request_id or not audio_path:
+            emit(
+                {
+                    "type": "error",
+                    "requestId": request_id,
+                    "message": "Missing requestId or audioPath.",
+                }
+            )
+            continue
+
+        transcribe(model, request_id, audio_path)
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/server/src/index.ts b/server/src/index.ts
index 51e79b3..4e19e56 100644
--- a/server/src/index.ts
+++ b/server/src/index.ts
@@ -1,5 +1,7 @@
 import crypto from 'node:crypto';
 import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import os from 'node:os';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { TextEncoder } from 'node:util';
@@ -23,6 +25,8 @@ import { Redis } from 'ioredis';
 import type WebSocket from 'ws';
 import { z } from 'zod';
 
+import { WhisperTranscriber } from './whisper-transcriber.js';
+
 dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
 
 const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
@@ -120,6 +124,12 @@ type ClientMessage =
     }
   | {
       type: 'ping';
+    }
+  | {
+      type: 'speech-transcription';
+      requestId: string;
+      mimeType: string;
+      audioBase64: string;
     };
 
 type ServerMessage =
@@ -142,6 +152,16 @@ type ServerMessage =
       peerId: string;
       message: string;
     }
+  | {
+      type: 'speech-transcribed';
+      requestId: string;
+      text: string;
+    }
+  | {
+      type: 'speech-transcription-error';
+      requestId: string;
+      message: string;
+    }
   | { type: 'pong' }
   | { type: 'error'; message: string };
 
@@ -289,6 +309,12 @@ const signalMessageSchema = z.discriminatedUnion('type', [
   z.object({
     type: z.literal('ping'),
   }),
+  z.object({
+    type: z.literal('speech-transcription'),
+    requestId: z.string().uuid(),
+    mimeType: z.string().trim().min(1).max(128),
+    audioBase64: z.string().min(1).max(32_000_000),
+  }),
 ]);
 
 const app = Fastify({ logger: true, trustProxy: true });
@@ -307,6 +333,11 @@ const frontendDistPath = resolveProjectPath(
 const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
 const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
 const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
+const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3'; // runs the worker script
+const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small'; // forwarded as --model
+const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu'; // forwarded as --device
+const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8'; // forwarded as --compute-type
+const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py'); // worker entry point
 const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
 const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
 const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -320,6 +351,17 @@ const webAuthnUserVerification = resolveWebAuthnUserVerification(
 const frontendIndexPath = path.join(frontendDistPath, 'index.html');
 const hasFrontendBuild = fs.existsSync(frontendIndexPath);
 
+const whisperTranscriber = new WhisperTranscriber(
+  {
+    pythonExecutable: whisperPythonExecutable,
+    scriptPath: whisperScriptPath,
+    model: whisperModel,
+    device: whisperDevice,
+    computeType: whisperComputeType,
+  },
+  app.log, // worker readiness, stderr output and failures are reported through the Fastify logger
+); // constructing this does not start the worker; it is spawned on the first transcribe() call
+
 fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
 fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
 
@@ -1127,6 +1169,27 @@ async function handleSocketMessage(
     return;
   }
 
+  if (parsed.type === 'speech-transcription') {
+    try {
+      const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
+
+      send(socket, {
+        type: 'speech-transcribed',
+        requestId: parsed.requestId,
+        text,
+      });
+    } catch (error) {
+      app.log.warn({ err: error, userId }, 'Whisper transcription failed');
+      send(socket, {
+        type: 'speech-transcription-error',
+        requestId: parsed.requestId,
+        message: error instanceof Error ? error.message : 'Speech transcription failed.',
+      });
+    }
+
+    return;
+  }
+
   let delivered = 0;
   const recipientSockets = socketsByUserId.get(parsed.to);
 
@@ -1668,6 +1731,15 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
     };
   }
 
+  if (parsed.data.type === 'speech-transcription') {
+    return {
+      type: 'speech-transcription',
+      requestId: parsed.data.requestId,
+      mimeType: parsed.data.mimeType,
+      audioBase64: parsed.data.audioBase64,
+    };
+  }
+
   return {
     type: 'signal',
     to: parsed.data.to,
@@ -1675,6 +1747,42 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
   };
 }
 
+/** Decodes the base64 audio into a throwaway temp directory, transcribes it, and always removes the directory. */
+async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
+  const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
+  const audioPath = path.join(tempDirectory, `dictation-${requestId}.${audioExtensionForMimeType(mimeType)}`);
+  try {
+    await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
+    // requestId is client-chosen: key the worker job on a server-generated id so concurrent reuse cannot collide.
+    return await whisperTranscriber.transcribe(crypto.randomUUID(), audioPath);
+  } finally {
+    await fsPromises.rm(tempDirectory, { recursive: true, force: true });
+  }
+}
+
+/** Picks the temp-file extension for an audio MIME type; unrecognised types fall back to `webm`. */
+function audioExtensionForMimeType(mimeType: string): string {
+  // Match on the bare type so parameters (`;codecs=opus`, `; codecs="mp4a.40.2"`) cannot hide the container.
+  switch ((mimeType.split(';')[0] ?? '').trim().toLowerCase()) {
+    case 'audio/webm':
+      return 'webm';
+    case 'audio/ogg':
+      return 'ogg';
+    case 'audio/mp4':
+    case 'audio/m4a':
+      return 'm4a';
+    case 'audio/mpeg':
+    case 'audio/mp3':
+      return 'mp3';
+    case 'audio/wav':
+    case 'audio/wave':
+    case 'audio/x-wav':
+      return 'wav';
+    default:
+      return 'webm';
+  }
+}
+
 async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {
   const abortController = new AbortController();
   const timeoutId = setTimeout(() => abortController.abort(), 120_000);
diff --git a/server/src/whisper-transcriber.ts b/server/src/whisper-transcriber.ts
new file mode 100644
index 0000000..ce39a7d
--- /dev/null
+++ b/server/src/whisper-transcriber.ts
@@ -0,0 +1,176 @@
+import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
+import { createInterface } from 'node:readline';
+
+type LoggerLike = {
+  info: (payload: unknown, message?: string) => void;
+  warn: (payload: unknown, message?: string) => void;
+  error: (payload: unknown, message?: string) => void;
+};
+
+type WhisperTranscriberOptions = {
+  pythonExecutable: string;
+  scriptPath: string;
+  model: string;
+  device: string;
+  computeType: string;
+};
+
+type WorkerEvent =
+  | { type: 'ready'; model: string }
+  | { type: 'result'; requestId: string; text: string }
+  | { type: 'error'; requestId?: string; message: string }
+  | { type: 'fatal'; message: string };
+
+/** Runs the Python Whisper script as one long-lived child process and talks to it in newline-delimited JSON. */
+export class WhisperTranscriber {
+  private worker: ChildProcessWithoutNullStreams | null = null;
+  private readyPromise: Promise<void> | null = null;
+  private resolveReady: (() => void) | null = null;
+  private rejectReady: ((reason?: unknown) => void) | null = null;
+  private readonly pendingRequests = new Map<
+    string,
+    { resolve: (text: string) => void; reject: (reason?: unknown) => void }
+  >();
+
+  constructor(
+    private readonly options: WhisperTranscriberOptions,
+    private readonly logger: LoggerLike,
+  ) {}
+
+  /**
+   * Sends one transcription job to the worker, starting the worker first if necessary.
+   * @param requestId Key that matches the worker's reply to this call; must not already be in flight.
+   * @param audioPath Path of the audio file the worker should read.
+   * @returns The trimmed transcript reported by the worker.
+   * @throws If the worker is unavailable or fails, the id is in flight, or the worker reports an error.
+   */
+  async transcribe(requestId: string, audioPath: string): Promise<string> {
+    await this.ensureWorker();
+    const worker = this.worker;
+
+    if (!worker || worker.stdin.destroyed) {
+      throw new Error('The Whisper worker is not available.');
+    }
+
+    // Overwriting an in-flight entry would strand the first caller and misroute its transcript to the second.
+    if (this.pendingRequests.has(requestId)) {
+      throw new Error(`A Whisper transcription with id ${requestId} is already in progress.`);
+    }
+
+    return new Promise<string>((resolve, reject) => {
+      this.pendingRequests.set(requestId, { resolve, reject });
+      try {
+        worker.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`);
+      } catch (error) {
+        this.pendingRequests.delete(requestId);
+        reject(error);
+      }
+    });
+  }
+
+  /** Spawns the worker unless one is already starting or running; resolves once it has reported `ready`. */
+  private async ensureWorker(): Promise<void> {
+    if (this.readyPromise) {
+      return this.readyPromise;
+    }
+
+    const { pythonExecutable, scriptPath, model, device, computeType } = this.options;
+    const worker = spawn(
+      pythonExecutable,
+      [scriptPath, '--model', model, '--device', device, '--compute-type', computeType],
+      { stdio: ['pipe', 'pipe', 'pipe'] },
+    );
+    this.worker = worker;
+    this.readyPromise = new Promise<void>((resolve, reject) => {
+      this.resolveReady = resolve;
+      this.rejectReady = reject;
+    });
+
+    // Listeners stay attached for the life of the process (so a late `error` event is never unhandled) and
+    // therefore ignore anything coming from a worker that has already been replaced.
+    const failIfCurrent = (error: Error): void => {
+      if (this.worker === worker) {
+        this.failWorker(error);
+      }
+    };
+
+    createInterface({ input: worker.stdout }).on('line', (line) => {
+      if (this.worker === worker) {
+        this.handleWorkerLine(line);
+      }
+    });
+
+    worker.stderr.on('data', (chunk) => {
+      const message = chunk.toString().trim();
+      if (message) {
+        this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
+      }
+    });
+
+    worker.on('error', failIfCurrent);
+    // A write to a dead worker fails asynchronously on the stream (e.g. EPIPE), not inside `write()`.
+    worker.stdin.on('error', failIfCurrent);
+    worker.on('exit', (code, signal) => {
+      const detail = `code=${code ?? 'null'}, signal=${signal ?? 'null'}`;
+      failIfCurrent(new Error(`The Whisper worker exited unexpectedly (${detail}).`));
+    });
+
+    return this.readyPromise;
+  }
+
+  /** Routes one stdout line from the worker to the readiness promise or to the matching pending request. */
+  private handleWorkerLine(line: string): void {
+    let payload: WorkerEvent;
+    try {
+      payload = JSON.parse(line) as WorkerEvent;
+    } catch {
+      this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
+      return;
+    }
+
+    if (payload.type === 'ready') {
+      this.logger.info({ model: payload.model }, 'Whisper worker ready');
+      this.resolveReady?.();
+      this.resolveReady = null;
+      this.rejectReady = null;
+      return;
+    }
+
+    // A failure that cannot be attributed to a request is treated as a failure of the whole worker.
+    if (payload.type === 'fatal' || (payload.type === 'error' && !payload.requestId)) {
+      this.failWorker(new Error(payload.message));
+      return;
+    }
+
+    const requestId = payload.requestId ?? '';
+    const pendingRequest = this.pendingRequests.get(requestId);
+    if (!pendingRequest) {
+      return;
+    }
+
+    this.pendingRequests.delete(requestId);
+    if (payload.type === 'error') {
+      pendingRequest.reject(new Error(payload.message));
+    } else {
+      pendingRequest.resolve(payload.text.trim());
+    }
+  }
+
+  /** Stops the current worker and rejects the readiness promise and every in-flight request with `error`. */
+  private failWorker(error: Error): void {
+    const worker = this.worker;
+    this.worker = null;
+    // Kill the process too: after a non-fatal error event it is still running and would otherwise be orphaned.
+    worker?.kill();
+    this.rejectReady?.(error);
+    this.resolveReady = null;
+    this.rejectReady = null;
+    this.readyPromise = null;
+
+    for (const { reject } of this.pendingRequests.values()) {
+      reject(error);
+    }
+    this.pendingRequests.clear();
+    this.logger.error({ err: error }, 'Whisper worker failed');
+  }
+}