local multi language STT

2026-03-26 19:38:54 +01:00
parent cc14b4d1b7
commit f2bf70bc7d
18 changed files with 1334 additions and 517 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ server/server/data/master.key
 client/dist/*
 client/apple-client/WebApp/**
 server/data/master.key
+.vscode/extensions.json
--- a/client/angular.json
+++ b/client/angular.json
@@ -54,6 +54,11 @@
                "glob": "magick.wasm",
                "input": "node_modules/@imagemagick/magick-wasm/dist",
                "output": "/"
+              },
+              {
+                "glob": "ort-wasm-simd-threaded.jsep.*",
+                "input": "node_modules/@huggingface/transformers/dist",
+                "output": "/transformers-wasm"
              }
            ],
            "styles": [
--- a/client/package-lock.json
+++ b/client/package-lock.json
--- a/client/package.json
+++ b/client/package.json
@@ -18,6 +18,7 @@
    "@angular/forms": "^21.2.0",
    "@angular/platform-browser": "^21.2.0",
    "@angular/router": "^21.2.0",
+    "@huggingface/transformers": "^3.8.1",
    "@imagemagick/magick-wasm": "^0.0.39",
    "bootstrap": "^5.3.8",
    "ngx-extended-pdf-viewer": "^25.6.4",
--- a/client/public/env.js
+++ b/client/public/env.js
@@ -1,3 +1,6 @@
 window.__PRIVATECHAT_ENV__ = {
-  "PRIVATECHAT_CLIENT_SERVER_URL": "https://chatter.dubertrand.fr"
+  "PRIVATECHAT_CLIENT_SERVER_URL": "https://chatter.dubertrand.fr",
+  "PRIVATECHAT_CLIENT_WHISPER_MODEL": "Xenova/whisper-small",
+  "PRIVATECHAT_CLIENT_WHISPER_LANGUAGE": "auto",
+  "PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH": "/transformers-wasm/"
 };
--- a/client/scripts/write-env.js
+++ b/client/scripts/write-env.js
@@ -9,6 +9,10 @@ dotenv.config({ path: rootEnvPath });

 const runtimeEnv = {
  PRIVATECHAT_CLIENT_SERVER_URL: process.env.PRIVATECHAT_CLIENT_SERVER_URL ?? 'http://localhost:3000',
+  PRIVATECHAT_CLIENT_WHISPER_MODEL: process.env.PRIVATECHAT_CLIENT_WHISPER_MODEL ?? 'Xenova/whisper-small',
+  PRIVATECHAT_CLIENT_WHISPER_LANGUAGE: process.env.PRIVATECHAT_CLIENT_WHISPER_LANGUAGE ?? 'auto',
+  PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH:
+    process.env.PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH ?? '/transformers-wasm/',
 };

 const fileContents = `window.__PRIVATECHAT_ENV__ = ${JSON.stringify(runtimeEnv, null, 2)};\n`;
--- a/client/src/app/browser-speech-transcriber.service.ts
+++ b/client/src/app/browser-speech-transcriber.service.ts
@@ -0,0 +1,273 @@
+import { inject, Injectable } from '@angular/core';
+
+import { ChatSessionService } from './chat-session.service';
+import type { DictationLanguage } from './models';
+
+/** Optional dictation settings read from `window.__PRIVATECHAT_ENV__`. */
+type PrivateChatRuntimeEnv = {
+  PRIVATECHAT_CLIENT_WHISPER_MODEL?: string;
+  PRIVATECHAT_CLIENT_WHISPER_LANGUAGE?: string;
+  PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH?: string;
+};
+
+/** The part of a transcription result that this service consumes. */
+type AutomaticSpeechRecognitionOutput = { text: string };
+
+/** Per-call options this service passes to the speech recognition pipeline. */
+type AutomaticSpeechRecognitionOptions = {
+  task?: 'transcribe';
+  language?: string;
+  chunk_length_s?: number;
+  stride_length_s?: number;
+};
+
+/** Callable pipeline: takes a waveform and resolves to one result or a list of results. */
+type AutomaticSpeechRecognitionPipeline = (
+  audio: Float32Array,
+  options?: AutomaticSpeechRecognitionOptions,
+) => Promise<AutomaticSpeechRecognitionOutput | AutomaticSpeechRecognitionOutput[]>;
+
+/** Options accepted by the `pipeline` factory as typed for this service. */
+type TransformersPipelineOptions = {
+  device?: 'wasm' | 'webgpu';
+  dtype?: 'fp32';
+  model_file_name?: string;
+  subfolder?: string;
+};
+
+/** ONNX WASM backend settings touched by this service. */
+type OnnxWasmSettings = { wasmPaths?: string };
+
+/** Minimal structural view of the dynamically imported `@huggingface/transformers` module. */
+type TransformersModule = {
+  env: { backends: { onnx: { wasm?: OnnxWasmSettings } } };
+  pipeline: (task: string, model: string, options?: TransformersPipelineOptions) => Promise<unknown>;
+};
+
+/** Sample rate, in Hz, that decoded audio is resampled to before transcription. */
+const whisperTargetSampleRate = 16_000;
+/** Model id used when the runtime environment does not name one. */
+const defaultWhisperModel = 'Xenova/whisper-small';
+/** Location of the ONNX runtime WASM files used when the runtime environment does not name one. */
+const defaultTransformersWasmPath = '/transformers-wasm/';
+/** Value passed to the pipeline as `chunk_length_s`. */
+const defaultChunkLengthSeconds = 30;
+/** Value passed to the pipeline as `stride_length_s`. */
+const defaultStrideLengthSeconds = 5;
+/** Maps each dictation language code to the language name passed to the pipeline. */
+const whisperLanguageNames: Record<DictationLanguage, string> = {
+  en: 'english',
+  fr: 'french',
+  es: 'spanish',
+};
+
+/**
+ * Returns the runtime configuration object published on `window.__PRIVATECHAT_ENV__`.
+ *
+ * @returns The published object, or an empty object when there is no `window`
+ *   or the property is null or undefined.
+ */
+function readRuntimeEnv(): PrivateChatRuntimeEnv {
+  const hasWindow = typeof window !== 'undefined';
+
+  if (!hasWindow) {
+    return {};
+  }
+
+  const host = window as typeof window & { __PRIVATECHAT_ENV__?: PrivateChatRuntimeEnv };
+
+  return host.__PRIVATECHAT_ENV__ ?? {};
+}
+
+/**
+ * Finds the `AudioContext` constructor exposed by the current environment.
+ *
+ * @returns `window.AudioContext`, else the prefixed `window.webkitAudioContext`,
+ *   else `null` (also when there is no `window`).
+ */
+function resolveAudioContextConstructor(): typeof AudioContext | null {
+  if (typeof window === 'undefined') {
+    return null;
+  }
+
+  const prefixedHost = window as typeof window & { webkitAudioContext?: typeof AudioContext };
+  const standardConstructor = window.AudioContext;
+
+  return standardConstructor ?? prefixedHost.webkitAudioContext ?? null;
+}
+
+/**
+ * Transcribes recorded audio in the browser with a Whisper model loaded through
+ * `@huggingface/transformers`.
+ *
+ * Audio is decoded with the Web Audio API, mixed down to mono and resampled to
+ * `whisperTargetSampleRate` before it is handed to a lazily created
+ * `automatic-speech-recognition` pipeline that is cached on the service instance.
+ */
+@Injectable({ providedIn: 'root' })
+export class BrowserSpeechTranscriberService {
+  private readonly session = inject(ChatSessionService);
+  private readonly runtimeEnv = readRuntimeEnv();
+  /** Model id from the runtime environment, or `defaultWhisperModel` when unset or blank. */
+  private readonly modelId = this.runtimeEnv.PRIVATECHAT_CLIENT_WHISPER_MODEL?.trim() || defaultWhisperModel;
+  /** Language from the runtime environment; `null` when unset, blank or `auto`. */
+  private readonly fallbackLanguage = this.normalizeLanguage(
+    this.runtimeEnv.PRIVATECHAT_CLIENT_WHISPER_LANGUAGE,
+  );
+  /** In-flight or completed library import; reset to `null` when the import fails. */
+  private transformersModulePromise: Promise<TransformersModule> | null = null;
+  /** In-flight or completed pipeline load; reset to `null` when loading fails. */
+  private pipelinePromise: Promise<AutomaticSpeechRecognitionPipeline> | null = null;
+
+  /**
+   * Starts loading the speech recognition pipeline, or joins a load already in progress.
+   *
+   * @throws Error when the pipeline cannot be loaded on any candidate device.
+   */
+  async preload(): Promise<void> {
+    await this.getPipeline();
+  }
+
+  /**
+   * Transcribes a recorded audio blob to text.
+   *
+   * The language passed to the model is the session's dictation language when a user is
+   * signed in. Otherwise it is derived from the runtime environment, where an unset,
+   * blank, `auto` or unrecognised value resolves to English.
+   *
+   * @param audioBlob Recorded audio that `decodeAudioData` is able to decode.
+   * @returns The trimmed transcript, or `''` for an empty blob or an empty result list.
+   * @throws Error when the audio cannot be decoded or the pipeline cannot be loaded or run.
+   */
+  async transcribe(audioBlob: Blob): Promise<string> {
+    if (audioBlob.size === 0) {
+      return '';
+    }
+
+    const waveform = await this.decodeToWhisperWaveform(audioBlob);
+    const transcriber = await this.getPipeline();
+    const inputLanguage = this.session.currentUser()
+      ? this.session.dictationLanguage()
+      : this.resolveFallbackInputLanguage();
+    const output = await transcriber(waveform, {
+      chunk_length_s: defaultChunkLengthSeconds,
+      stride_length_s: defaultStrideLengthSeconds,
+      task: 'transcribe',
+      language: whisperLanguageNames[inputLanguage],
+    });
+    // The pipeline type allows a list of results; an empty list yields an empty transcript.
+    const transcription = Array.isArray(output) ? output[0] : output;
+
+    return transcription?.text.trim() ?? '';
+  }
+
+  /**
+   * Returns the shared pipeline, creating it on first use.
+   *
+   * A failed load is not cached: the stored promise is cleared so the next call retries
+   * instead of replaying the same rejection for the lifetime of the page.
+   */
+  private async getPipeline(): Promise<AutomaticSpeechRecognitionPipeline> {
+    let pipelinePromise = this.pipelinePromise;
+
+    if (!pipelinePromise) {
+      pipelinePromise = this.createPreferredPipeline<AutomaticSpeechRecognitionPipeline>(
+        'automatic-speech-recognition',
+        this.modelId,
+      ).catch((error: unknown) => {
+        this.pipelinePromise = null;
+        throw error;
+      });
+      this.pipelinePromise = pipelinePromise;
+    }
+
+    return await pipelinePromise;
+  }
+
+  /**
+   * Returns the configured `@huggingface/transformers` module, importing it on first use.
+   *
+   * A failed import is not cached, so a later call can retry.
+   */
+  private async getTransformersModule(): Promise<TransformersModule> {
+    let modulePromise = this.transformersModulePromise;
+
+    if (!modulePromise) {
+      modulePromise = this.loadTransformersModule().catch((error: unknown) => {
+        this.transformersModulePromise = null;
+        throw error;
+      });
+      this.transformersModulePromise = modulePromise;
+    }
+
+    return await modulePromise;
+  }
+
+  /** Imports the library and points its ONNX WASM backend at the configured asset path. */
+  private async loadTransformersModule(): Promise<TransformersModule> {
+    const transformersModule = await (import('@huggingface/transformers') as Promise<TransformersModule>);
+    const onnxWasmEnv = transformersModule.env.backends.onnx.wasm;
+
+    if (onnxWasmEnv) {
+      // Always apply the configured path. Assigning only when `wasmPaths` is empty would
+      // leave the self-hosted path unused if the library pre-populates its own default.
+      // NOTE(review): the library is believed to set a default location at import time;
+      // confirm against the installed @huggingface/transformers version.
+      onnxWasmEnv.wasmPaths =
+        this.runtimeEnv.PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH?.trim() || defaultTransformersWasmPath;
+    }
+
+    return transformersModule;
+  }
+
+  /**
+   * Creates a pipeline on the first device that loads successfully.
+   *
+   * WebGPU is tried first when `navigator.gpu` exists, then WASM. Each failure is logged
+   * as a warning and the next device is tried.
+   *
+   * @param task Pipeline task name passed to the library.
+   * @param model Model id passed to the library.
+   * @param options Extra pipeline options; `device` is filled in per attempt.
+   * @returns The loaded pipeline, cast to `T` without a runtime check.
+   * @throws The last load error, or a generic `Error` when that value is not an `Error`.
+   */
+  private async createPreferredPipeline<T>(
+    task: string,
+    model: string,
+    options?: {
+      dtype?: 'fp32';
+      model_file_name?: string;
+      subfolder?: string;
+    },
+  ): Promise<T> {
+    const transformersModule = await this.getTransformersModule();
+    const candidateDevices: Array<'webgpu' | 'wasm'> = this.browserSupportsWebGpu()
+      ? ['webgpu', 'wasm']
+      : ['wasm'];
+    let lastError: unknown = null;
+
+    for (const device of candidateDevices) {
+      try {
+        const pipeline = await transformersModule.pipeline(task, model, {
+          ...options,
+          device,
+        });
+
+        console.info(`[dictation] Loaded ${task} pipeline for ${model} on ${device}.`);
+        return pipeline as T;
+      } catch (error) {
+        lastError = error;
+        console.warn(`[dictation] Could not load ${task} pipeline for ${model} on ${device}.`, error);
+      }
+    }
+
+    throw lastError instanceof Error ? lastError : new Error(`Could not load ${task} pipeline for ${model}.`);
+  }
+
+  /**
+   * Decodes a recorded blob into a mono waveform at `whisperTargetSampleRate`.
+   *
+   * @throws Error when no `AudioContext` is available, or when decoding or conversion fails.
+   */
+  private async decodeToWhisperWaveform(audioBlob: Blob): Promise<Float32Array> {
+    const audioContextConstructor = resolveAudioContextConstructor();
+
+    if (!audioContextConstructor) {
+      throw new Error('This browser cannot decode recorded audio for dictation.');
+    }
+
+    const arrayBuffer = await audioBlob.arrayBuffer();
+    const audioContext = new audioContextConstructor();
+
+    try {
+      // The buffer is private to this call, so it is handed over without a defensive copy.
+      const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+
+      // `resampleMonoChannel` returns its input unchanged when the rates already match.
+      return this.resampleMonoChannel(
+        this.mixToMono(audioBuffer),
+        audioBuffer.sampleRate,
+        whisperTargetSampleRate,
+      );
+    } catch (error) {
+      throw error instanceof Error
+        ? error
+        : new Error('Could not decode the recorded dictation audio.');
+    } finally {
+      // Best effort: a failure to close the context must not mask the decode outcome.
+      await audioContext.close().catch(() => undefined);
+    }
+  }
+
+  /** Averages all channels of `audioBuffer` into a single channel of the same length. */
+  private mixToMono(audioBuffer: AudioBuffer): Float32Array {
+    const mixed = new Float32Array(audioBuffer.length);
+
+    for (let channelIndex = 0; channelIndex < audioBuffer.numberOfChannels; channelIndex += 1) {
+      const channel = audioBuffer.getChannelData(channelIndex);
+
+      for (let sampleIndex = 0; sampleIndex < channel.length; sampleIndex += 1) {
+        mixed[sampleIndex] += channel[sampleIndex];
+      }
+    }
+
+    if (audioBuffer.numberOfChannels > 1) {
+      for (let sampleIndex = 0; sampleIndex < mixed.length; sampleIndex += 1) {
+        mixed[sampleIndex] /= audioBuffer.numberOfChannels;
+      }
+    }
+
+    return mixed;
+  }
+
+  /**
+   * Resamples a mono signal by linear interpolation between neighbouring source samples.
+   *
+   * No low-pass filter is applied before the rate change.
+   *
+   * @returns `monoChannel` itself when the rates match, otherwise a new array with at
+   *   least one sample.
+   */
+  private resampleMonoChannel(
+    monoChannel: Float32Array,
+    sourceSampleRate: number,
+    targetSampleRate: number,
+  ): Float32Array {
+    if (sourceSampleRate === targetSampleRate) {
+      return monoChannel;
+    }
+
+    const targetLength = Math.max(1, Math.round(monoChannel.length * targetSampleRate / sourceSampleRate));
+    const resampled = new Float32Array(targetLength);
+    const positionRatio = sourceSampleRate / targetSampleRate;
+
+    for (let sampleIndex = 0; sampleIndex < targetLength; sampleIndex += 1) {
+      const sourcePosition = sampleIndex * positionRatio;
+      const sourceIndex = Math.floor(sourcePosition);
+      // Clamp so the final output samples interpolate against the last source sample.
+      const nextSourceIndex = Math.min(sourceIndex + 1, monoChannel.length - 1);
+      const interpolationWeight = sourcePosition - sourceIndex;
+      const currentValue = monoChannel[sourceIndex] ?? 0;
+      const nextValue = monoChannel[nextSourceIndex] ?? currentValue;
+
+      resampled[sampleIndex] = currentValue + ((nextValue - currentValue) * interpolationWeight);
+    }
+
+    return resampled;
+  }
+
+  /** Trims a configured language; returns `null` for unset, blank or `auto` (any case). */
+  private normalizeLanguage(language: string | undefined): string | null {
+    const trimmedLanguage = language?.trim();
+
+    if (!trimmedLanguage || trimmedLanguage.toLowerCase() === 'auto') {
+      return null;
+    }
+
+    return trimmedLanguage;
+  }
+
+  /** Reports whether `navigator.gpu` exists; it does not check that a GPU adapter is usable. */
+  private browserSupportsWebGpu(): boolean {
+    return typeof navigator !== 'undefined' && 'gpu' in navigator;
+  }
+
+  /**
+   * Maps the configured fallback language to a dictation language code.
+   *
+   * Accepts `french`/`fr` and `spanish`/`es` in any case; everything else, including
+   * a `null` fallback, maps to `en`.
+   */
+  private resolveFallbackInputLanguage(): DictationLanguage {
+    switch (this.fallbackLanguage?.toLowerCase()) {
+      case 'french':
+      case 'fr':
+        return 'fr';
+      case 'spanish':
+      case 'es':
+        return 'es';
+      default:
+        return 'en';
+    }
+  }
+}
--- a/client/src/app/chat-page.component.html
+++ b/client/src/app/chat-page.component.html
@@ -261,7 +261,7 @@
 	                    <button
 	                      class="composer-dictation"
 	                      type="button"
-	                      [disabled]="!session.isSelectedPeerReady() || session.signalingState() !== 'connected' || isTranscribingDictation()"
+	                      [disabled]="!selectedPeerId || isTranscribingDictation()"
 	                      (click)="toggleDictation(composerTextarea)"
 	                      [title]="
 	                        isDictating()
--- a/client/src/app/chat-page.component.ts
+++ b/client/src/app/chat-page.component.ts
@@ -4,6 +4,7 @@ import { toSignal } from '@angular/core/rxjs-interop';
 import { FormsModule } from '@angular/forms';
 import { ActivatedRoute, Router, RouterLink } from '@angular/router';

+import { BrowserSpeechTranscriberService } from './browser-speech-transcriber.service';
 import { PeerCallModalComponent } from './peer-call-modal.component';
 import { ChatSessionService } from './chat-session.service';
 import { JsonFileViewerComponent } from './json-file-viewer.component';
@@ -36,6 +37,7 @@ export class ChatPageComponent implements OnDestroy {
  private readonly route = inject(ActivatedRoute);
  private readonly router = inject(Router);
  private readonly ngZone = inject(NgZone);
+  private readonly speechTranscriber = inject(BrowserSpeechTranscriberService);
  private readonly routeParamMap = toSignal(this.route.paramMap, {
    initialValue: this.route.snapshot.paramMap,
  });
@@ -274,6 +276,10 @@ export class ChatPageComponent implements OnDestroy {
      void this.router.navigateByUrl('/');
    }

+    queueMicrotask(() => {
+      void this.speechTranscriber.preload().catch(() => undefined);
+    });
+
    effect(() => {
      const currentUserId = this.currentUser()?.id ?? null;
      this.knownPeers.set(this.readKnownPeers(currentUserId));
@@ -1115,16 +1121,16 @@ export class ChatPageComponent implements OnDestroy {

  private async transcribeDictation(blob: Blob, textarea: HTMLTextAreaElement, applyToken: number): Promise<void> {
    try {
-      const transcript = await this.session.requestSpeechTranscription(blob);
+      const transcript = await this.speechTranscriber.transcribe(blob);

      if (applyToken !== this.dictationApplyToken) {
        return;
      }

      this.applyDictatedText(this.mergeDictatedText(this.dictationBaseText, transcript), textarea);
-    } catch {
+    } catch (error) {
      if (applyToken === this.dictationApplyToken) {
-        this.session.error.set('Dictation transcription failed.');
+        this.session.error.set(error instanceof Error ? error.message : 'Dictation transcription failed.');
      }
    } finally {
      if (applyToken === this.dictationApplyToken) {
--- a/client/src/app/chat-session.service.ts
+++ b/client/src/app/chat-session.service.ts
@@ -1,5 +1,5 @@
 import { HttpClient, HttpErrorResponse } from '@angular/common/http';
-import { computed, Injectable, signal } from '@angular/core';
+import { computed, effect, Injectable, signal } from '@angular/core';
 import { ImageMagick, MagickFormat, initializeImageMagick } from '@imagemagick/magick-wasm';
 import { firstValueFrom } from 'rxjs';

@@ -12,6 +12,7 @@ import {
  ChatEntry,
  ConnectionState,
  DataEnvelope,
+  DictationLanguage,
  DeliveryState,
  PendingApprovalResponse,
  PendingApprovalUser,
@@ -126,6 +127,7 @@ export class ChatSessionService {
  private static readonly messageStoreName = 'conversation_messages';
  private static readonly knownPeersStoragePrefix = 'privatechat.knownPeers';
  private static readonly incomingMessageSoundStorageKey = 'privatechat.incomingMessageSoundEnabled';
+  private static readonly dictationLanguageStoragePrefix = 'privatechat.dictationLanguage';
  private static readonly messageRetentionLimit = 256;
  private static readonly sessionKeepaliveMs = 5 * 60 * 1000;
  private static readonly signalingHeartbeatMs = 25 * 1000;
@@ -158,6 +160,7 @@ export class ChatSessionService {
  readonly incomingMessageSoundEnabled = signal(
    this.readStorage(ChatSessionService.incomingMessageSoundStorageKey) !== '0',
  );
+  readonly dictationLanguage = signal<DictationLanguage>('en');
  readonly webAuthnSupported = signal(
    typeof window !== 'undefined' &&
      typeof window.PublicKeyCredential !== 'undefined' &&
@@ -193,10 +196,6 @@ export class ChatSessionService {
    string,
    { peerId: string; prompt: string; waitMessageId: string }
  >();
-  private readonly pendingSpeechTranscriptionRequests = new Map<
-    string,
-    { resolve: (text: string) => void; reject: (reason?: unknown) => void }
-  >();
  private readonly incomingCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
  private readonly outgoingCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
  private readonly activeCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
@@ -224,6 +223,17 @@ export class ChatSessionService {
  constructor(private readonly http: HttpClient) {
    this.installConnectionRecoveryListeners();

+    effect(() => {
+      const currentUserId = this.currentUser()?.id;
+
+      if (!currentUserId) {
+        this.dictationLanguage.set('en');
+        return;
+      }
+
+      this.dictationLanguage.set(this.readStoredDictationLanguage(currentUserId));
+    });
+
    if (this.token() && this.currentUser()) {
      queueMicrotask(() => {
        void this.restoreSession();
@@ -331,6 +341,19 @@ export class ChatSessionService {
    this.writeStorage(ChatSessionService.incomingMessageSoundStorageKey, enabled ? '1' : '0');
  }

+  /**
+   * Sets the active dictation language and, when a user is signed in, writes it to
+   * storage under that user's key.
+   *
+   * @param language Requested language; values outside the supported set become `en`.
+   */
+  setDictationLanguage(language: DictationLanguage): void {
+    const normalizedLanguage = this.normalizeDictationLanguage(language);
+    this.dictationLanguage.set(normalizedLanguage);
+
+    const userId = this.currentUser()?.id;
+
+    if (userId) {
+      this.writeStorage(this.dictationLanguageStorageKey(userId), normalizedLanguage);
+    }
+  }
+
  selectPeer(peerId: string): void {
    this.activePeerId.set(peerId);
    this.clearUnreadPeer(peerId);
@@ -1263,32 +1286,6 @@ export class ChatSessionService {
    return true;
  }

-  async requestSpeechTranscription(audioBlob: Blob): Promise<string> {
-    if (!this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
-      throw new Error('You must be connected to signaling before using dictation.');
-    }
-
-    const requestId = crypto.randomUUID();
-    const audioBase64 = await this.blobToBase64(audioBlob);
-
-    return new Promise<string>((resolve, reject) => {
-      this.pendingSpeechTranscriptionRequests.set(requestId, { resolve, reject });
-
-      try {
-        this.error.set(null);
-        this.websocket?.send(JSON.stringify({
-          type: 'speech-transcription',
-          requestId,
-          mimeType: audioBlob.type || 'audio/webm',
-          audioBase64,
-        }));
-      } catch (error) {
-        this.pendingSpeechTranscriptionRequests.delete(requestId);
-        reject(error);
-      }
-    });
-  }
-
  private async loadAccessKeys(): Promise<void> {
    const token = this.token();

@@ -1365,7 +1362,6 @@ export class ChatSessionService {
      const shouldReconnect = this.websocket === websocket && !this.suppressSocketReconnect;

      this.stopWebSocketHeartbeat();
-      this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
      this.signalingState.set('disconnected');
      this.status.set('Signaling connection closed.');

@@ -1408,8 +1404,6 @@ export class ChatSessionService {

  private disconnectWebSocket(): void {
    this.stopWebSocketHeartbeat();
-    this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
-
    if (this.websocket) {
      this.suppressSocketReconnect = true;
      this.websocket.close();
@@ -1450,12 +1444,6 @@ export class ChatSessionService {
      case 'image-generation-error':
        this.handleGeneratedImageError(event);
        break;
-      case 'speech-transcribed':
-        this.handleSpeechTranscribed(event);
-        break;
-      case 'speech-transcription-error':
-        this.handleSpeechTranscriptionError(event);
-        break;
      case 'pong':
        break;
      case 'error':
@@ -1515,28 +1503,6 @@ export class ChatSessionService {
    this.error.set(event.message);
  }

-  private handleSpeechTranscribed(event: Extract<ServerEvent, { type: 'speech-transcribed' }>): void {
-    const pendingRequest = this.pendingSpeechTranscriptionRequests.get(event.requestId);
-
-    if (!pendingRequest) {
-      return;
-    }
-
-    this.pendingSpeechTranscriptionRequests.delete(event.requestId);
-    pendingRequest.resolve(event.text);
-  }
-
-  private handleSpeechTranscriptionError(event: Extract<ServerEvent, { type: 'speech-transcription-error' }>): void {
-    const pendingRequest = this.pendingSpeechTranscriptionRequests.get(event.requestId);
-
-    if (pendingRequest) {
-      this.pendingSpeechTranscriptionRequests.delete(event.requestId);
-      pendingRequest.reject(new Error(event.message));
-    }
-
-    this.error.set(event.message);
-  }
-
  private async restoreSession(): Promise<void> {
    const token = this.token();

@@ -2530,18 +2496,6 @@ export class ChatSessionService {
    }
  }

-  private rejectPendingSpeechTranscriptions(message: string): void {
-    if (this.pendingSpeechTranscriptionRequests.size === 0) {
-      return;
-    }
-
-    for (const { reject } of this.pendingSpeechTranscriptionRequests.values()) {
-      reject(new Error(message));
-    }
-
-    this.pendingSpeechTranscriptionRequests.clear();
-  }
-
  private clearLocalAuth(statusMessage: string): void {
    this.clearWebSocketReconnect();
    this.signalingRecoveryPromise = null;
@@ -2555,7 +2509,6 @@ export class ChatSessionService {
    this.releasePreloadedRingtone();
    this.pendingImageGenerationRequests.clear();
    this.pendingOutgoingFlushes.clear();
-    this.rejectPendingSpeechTranscriptions('Session ended during dictation.');
    this.incomingCallModes.set([]);
    this.outgoingCallModes.set([]);
    this.activeCallModes.set([]);
@@ -3865,4 +3818,25 @@ export class ChatSessionService {

    return responseMessage ?? thrownMessage ?? fallback;
  }
+
+  /** Reads the dictation language stored for `currentUserId`; missing or unknown values yield `en`. */
+  private readStoredDictationLanguage(currentUserId: string): DictationLanguage {
+    const storageKey = this.dictationLanguageStorageKey(currentUserId);
+
+    return this.normalizeDictationLanguage(this.readStorage(storageKey));
+  }
+
+  /** Builds the storage key for a user's dictation language: `<prefix>.<currentUserId>`. */
+  private dictationLanguageStorageKey(currentUserId: string): string {
+    const prefix = ChatSessionService.dictationLanguageStoragePrefix;
+
+    return prefix + '.' + currentUserId;
+  }
+
+  /** Returns `value` when it is `en`, `fr` or `es`; any other value, `null` or `undefined` yields `en`. */
+  private normalizeDictationLanguage(value: string | null | undefined): DictationLanguage {
+    if (value === 'fr' || value === 'es' || value === 'en') {
+      return value;
+    }
+
+    return 'en';
+  }
 }
--- a/client/src/app/home-page.component.html
+++ b/client/src/app/home-page.component.html
@@ -188,6 +188,31 @@
              <div class="alert alert-success mb-4">{{ session.notice() }}</div>
            }

+            <section class="access-key-panel mb-4">
+              <div class="dictation-language-panel">
+                <div>
+                  <h3 class="h5 mb-1">Dictation language</h3>
+                  <p class="small text-secondary mb-0">
+                    Speech input and text output use the same selected language.
+                  </p>
+                </div>
+
+                <div class="dictation-language-select-shell mt-3">
+                  <label class="form-label small mb-2" for="dictationLanguage">Language</label>
+                  <select
+                    id="dictationLanguage"
+                    class="form-select"
+                    [ngModel]="session.dictationLanguage()"
+                    (ngModelChange)="setDictationLanguage($event)"
+                  >
+                    @for (option of dictationLanguageOptions; track option.value) {
+                      <option [ngValue]="option.value">{{ option.label }}</option>
+                    }
+                  </select>
+                </div>
+              </div>
+            </section>
+
            <section class="access-key-panel mb-4">
              <div class="d-flex justify-content-between align-items-start gap-3">
                <div>
--- a/client/src/app/home-page.component.scss
+++ b/client/src/app/home-page.component.scss
@@ -114,6 +114,12 @@
  background: var(--panel-soft-background);
 }

+.dictation-language-panel,
+.dictation-language-select-shell {
+  display: grid;
+  gap: 0.75rem;
+}
+
 .user-search-panel {
  display: grid;
  gap: 0.75rem;
@@ -183,7 +189,9 @@
 }

 .form-control,
-.form-control:focus {
+.form-control:focus,
+.form-select,
+.form-select:focus {
  color: var(--page-text);
  background-color: var(--input-background);
  border-color: var(--input-border);
--- a/client/src/app/home-page.component.ts
+++ b/client/src/app/home-page.component.ts
@@ -4,7 +4,7 @@ import { FormsModule } from '@angular/forms';
 import { Router, RouterLink } from '@angular/router';

 import { ChatSessionService } from './chat-session.service';
-import type { AdminUserSummary, UserProfile } from './models';
+import type { AdminUserSummary, DictationLanguage, UserProfile } from './models';
 import { ThemeService } from './theme.service';

@Component({
@@ -32,6 +32,11 @@ export class HomePageComponent {
  readonly loadingAdminUsers = signal(false);
  readonly deletingUserId = signal<string | null>(null);
  readonly adminUsersError = signal<string | null>(null);
+  readonly dictationLanguageOptions: Array<{ value: DictationLanguage; label: string }> = [
+    { value: 'en', label: 'English' },
+    { value: 'fr', label: 'French' },
+    { value: 'es', label: 'Spanish' },
+  ];
  readonly filteredKnownUsers = computed(() => {
    const query = this.userSearch.trim().toLowerCase();
    const users = this.knownUsers();
@@ -202,4 +207,8 @@ export class HomePageComponent {
  setIncomingMessageSound(enabled: boolean): void {
    this.session.setIncomingMessageSoundEnabled(enabled);
  }
+
+  /**
+   * Forwards the language chosen in the dictation language select to the session service.
+   *
+   * @param language Selected option value; it is cast to `DictationLanguage` without a check here.
+   */
+  setDictationLanguage(language: string): void {
+    const selectedLanguage = language as DictationLanguage;
+
+    this.session.setDictationLanguage(selectedLanguage);
+  }
 }
--- a/client/src/app/models.ts
+++ b/client/src/app/models.ts
@@ -113,6 +113,7 @@ export interface ChatEntry {
 }

 export type CallMode = 'audio' | 'video';
+export type DictationLanguage = 'en' | 'fr' | 'es';

 export type SignalPayload =
  | { type: 'sdp'; description: RTCSessionDescriptionInit }
@@ -138,16 +139,6 @@ export type ServerEvent =
      peerId: string;
      message: string;
    }
-  | {
-      type: 'speech-transcribed';
-      requestId: string;
-      text: string;
-    }
-  | {
-      type: 'speech-transcription-error';
-      requestId: string;
-      message: string;
-    }
  | { type: 'pong' }
  | { type: 'error'; message: string };

--- a/server/dist/index.js
+++ b/server/dist/index.js
@@ -16,7 +16,6 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe
 import Fastify from 'fastify';
 import { Redis } from 'ioredis';
 import { z } from 'zod';
-import { SpeechTranscriber } from './speech-transcriber.js';
 dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
 const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
 const registerSchema = z.object({
@@ -90,12 +89,6 @@ const signalMessageSchema = z.discriminatedUnion('type', [
    z.object({
        type: z.literal('ping'),
    }),
-    z.object({
-        type: z.literal('speech-transcription'),
-        requestId: z.string().uuid(),
-        mimeType: z.string().trim().min(1).max(128),
-        audioBase64: z.string().min(1).max(32_000_000),
-    }),
 ]);
 const app = Fastify({ logger: true, trustProxy: true });
 const approvalAdminUsername = 'ladparis';
@@ -106,9 +99,6 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR
 const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
 const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
 const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
-const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'wss://whisper.dubertrand.fr';
-const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
-const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
 const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
 const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
 const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -121,11 +111,6 @@ const frontendIndexPath = path.join(frontendDistPath, 'index.html');
 const hasFrontendBuild = fs.existsSync(frontendIndexPath);
 const convertOfficeDocument = promisify(libreOffice.convertWithOptions);
 const execFileAsync = promisify(execFile);
-const speechTranscriber = new SpeechTranscriber({
-    serviceUrl: speechTranscriptionServiceUrl,
-    language: speechTranscriptionLanguage,
-    requestTimeoutMs: speechTranscriptionTimeoutMs,
-}, app.log);
 fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
 fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
 const encryptionKey = deriveEncryptionKey(loadOrCreateMasterKey(masterKeyPath));
@@ -782,25 +767,6 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) {
        }
        return;
    }
-    if (parsed.type === 'speech-transcription') {
-        try {
-            const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
-            send(socket, {
-                type: 'speech-transcribed',
-                requestId: parsed.requestId,
-                text,
-            });
-        }
-        catch (error) {
-            app.log.warn({ err: error, userId }, 'Speech transcription failed');
-            send(socket, {
-                type: 'speech-transcription-error',
-                requestId: parsed.requestId,
-                message: error instanceof Error ? error.message : 'Speech transcription failed.',
-            });
-        }
-        return;
-    }
    let delivered = 0;
    const recipientSockets = socketsByUserId.get(parsed.to);
    if (recipientSockets) {
@@ -1257,23 +1223,12 @@ function parseClientMessage(rawMessage) {
            prompt: parsed.data.prompt,
        };
    }
-    if (parsed.data.type === 'speech-transcription') {
-        return {
-            type: 'speech-transcription',
-            requestId: parsed.data.requestId,
-            mimeType: parsed.data.mimeType,
-            audioBase64: parsed.data.audioBase64,
-        };
-    }
    return {
        type: 'signal',
        to: parsed.data.to,
        signal: normalizeSignal(parsed.data.signal),
    };
 }
-async function transcribeAudioPayload(requestId, audioBase64, mimeType) {
-    return await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
-}
 async function generateImageFromPrompt(prompt) {
    const abortController = new AbortController();
    const timeoutId = setTimeout(() => abortController.abort(), 120_000);
--- a/server/dist/speech-transcriber.js
+++ b/server/dist/speech-transcriber.js
@@ -1,124 +0,0 @@
-import WebSocket from 'ws';
-export class SpeechTranscriber {
-    options;
-    logger;
-    constructor(options, logger) {
-        this.options = options;
-        this.logger = logger;
-    }
-    async transcribe(requestId, audioBase64, mimeType) {
-        const audio = this.normalizeAudioPayload(audioBase64, mimeType);
-        return await new Promise((resolve, reject) => {
-            let settled = false;
-            const socket = new WebSocket(this.options.serviceUrl);
-            const finish = (handler) => {
-                if (settled) {
-                    return;
-                }
-                settled = true;
-                clearTimeout(timeout);
-                socket.removeAllListeners();
-                if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
-                    socket.close();
-                }
-                handler();
-            };
-            const timeout = setTimeout(() => {
-                finish(() => {
-                    reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`));
-                });
-            }, this.options.requestTimeoutMs);
-            socket.on('open', () => {
-                try {
-                    socket.send(JSON.stringify({
-                        type: 'transcribe',
-                        id: requestId,
-                        language: this.options.language,
-                        audio,
-                    }));
-                }
-                catch (error) {
-                    finish(() => {
-                        reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
-                    });
-                }
-            });
-            socket.on('message', (payload) => {
-                const event = this.parseEvent(payload);
-                if (!event) {
-                    return;
-                }
-                if (event.id && event.id !== requestId) {
-                    this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
-                    return;
-                }
-                if (event.type === 'start') {
-                    this.logger.info({ requestId, model: event.model, language: event.language }, 'Speech transcription started');
-                    return;
-                }
-                if (event.type === 'delta') {
-                    return;
-                }
-                if (event.type === 'done') {
-                    finish(() => {
-                        resolve(event.text.trim());
-                    });
-                    return;
-                }
-                finish(() => {
-                    reject(new Error(event.message));
-                });
-            });
-            socket.on('error', (error) => {
-                finish(() => {
-                    reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
-                });
-            });
-            socket.on('close', (code, reasonBuffer) => {
-                if (settled) {
-                    return;
-                }
-                const reason = reasonBuffer.toString().trim();
-                const detail = reason
-                    ? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
-                    : `The transcription service closed the connection unexpectedly (code=${code}).`;
-                finish(() => {
-                    reject(new Error(detail));
-                });
-            });
-        });
-    }
-    normalizeAudioPayload(audioBase64, mimeType) {
-        const trimmedAudio = audioBase64.trim();
-        if (trimmedAudio.startsWith('data:')) {
-            return trimmedAudio;
-        }
-        const normalizedMimeType = mimeType.trim() || 'audio/webm';
-        return `data:${normalizedMimeType};base64,${trimmedAudio}`;
-    }
-    parseEvent(payload) {
-        const message = this.rawDataToString(payload).trim();
-        if (!message) {
-            return null;
-        }
-        try {
-            return JSON.parse(message);
-        }
-        catch {
-            this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
-            return null;
-        }
-    }
-    rawDataToString(payload) {
-        if (typeof payload === 'string') {
-            return payload;
-        }
-        if (payload instanceof ArrayBuffer) {
-            return Buffer.from(payload).toString('utf8');
-        }
-        if (Array.isArray(payload)) {
-            return Buffer.concat(payload).toString('utf8');
-        }
-        return payload.toString('utf8');
-    }
-}
--- a/server/src/index.ts
+++ b/server/src/index.ts
@@ -26,8 +26,6 @@ import { Redis } from 'ioredis';
 import type WebSocket from 'ws';
 import { z } from 'zod';

-import { SpeechTranscriber } from './speech-transcriber.js';
-
 dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });

 const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
@@ -125,12 +123,6 @@ type ClientMessage =
    }
  | {
      type: 'ping';
-    }
-  | {
-      type: 'speech-transcription';
-      requestId: string;
-      mimeType: string;
-      audioBase64: string;
    };

 type ServerMessage =
@@ -153,16 +145,6 @@ type ServerMessage =
      peerId: string;
      message: string;
    }
-  | {
-      type: 'speech-transcribed';
-      requestId: string;
-      text: string;
-    }
-  | {
-      type: 'speech-transcription-error';
-      requestId: string;
-      message: string;
-    }
  | { type: 'pong' }
  | { type: 'error'; message: string };

@@ -316,12 +298,6 @@ const signalMessageSchema = z.discriminatedUnion('type', [
  z.object({
    type: z.literal('ping'),
  }),
-  z.object({
-    type: z.literal('speech-transcription'),
-    requestId: z.string().uuid(),
-    mimeType: z.string().trim().min(1).max(128),
-    audioBase64: z.string().min(1).max(32_000_000),
-  }),
 ]);

 const app = Fastify({ logger: true, trustProxy: true });
@@ -340,9 +316,6 @@ const frontendDistPath = resolveProjectPath(
 const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
 const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
 const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
-const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'wss://whisper.dubertrand.fr';
-const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
-const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
 const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
 const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
 const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -358,15 +331,6 @@ const hasFrontendBuild = fs.existsSync(frontendIndexPath);
 const convertOfficeDocument = promisify(libreOffice.convertWithOptions);
 const execFileAsync = promisify(execFile);

-const speechTranscriber = new SpeechTranscriber(
-  {
-    serviceUrl: speechTranscriptionServiceUrl,
-    language: speechTranscriptionLanguage,
-    requestTimeoutMs: speechTranscriptionTimeoutMs,
-  },
-  app.log,
-);
-
 fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
 fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });

@@ -1225,27 +1189,6 @@ async function handleSocketMessage(
    return;
  }

-  if (parsed.type === 'speech-transcription') {
-    try {
-      const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
-
-      send(socket, {
-        type: 'speech-transcribed',
-        requestId: parsed.requestId,
-        text,
-      });
-    } catch (error) {
-      app.log.warn({ err: error, userId }, 'Speech transcription failed');
-      send(socket, {
-        type: 'speech-transcription-error',
-        requestId: parsed.requestId,
-        message: error instanceof Error ? error.message : 'Speech transcription failed.',
-      });
-    }
-
-    return;
-  }
-
  let delivered = 0;
  const recipientSockets = socketsByUserId.get(parsed.to);

@@ -1897,15 +1840,6 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
    };
  }

-  if (parsed.data.type === 'speech-transcription') {
-    return {
-      type: 'speech-transcription',
-      requestId: parsed.data.requestId,
-      mimeType: parsed.data.mimeType,
-      audioBase64: parsed.data.audioBase64,
-    };
-  }
-
  return {
    type: 'signal',
    to: parsed.data.to,
@@ -1913,10 +1847,6 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
  };
 }

-async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
-  return await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
-}
-
 async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {
  const abortController = new AbortController();
  const timeoutId = setTimeout(() => abortController.abort(), 120_000);
--- a/server/src/speech-transcriber.ts
+++ b/server/src/speech-transcriber.ts
@@ -1,173 +0,0 @@
-import WebSocket, { type RawData } from 'ws';
-
-type LoggerLike = {
-  info: (payload: unknown, message?: string) => void;
-  warn: (payload: unknown, message?: string) => void;
-  error: (payload: unknown, message?: string) => void;
-};
-
-type SpeechTranscriberOptions = {
-  serviceUrl: string;
-  language: string;
-  requestTimeoutMs: number;
-};
-
-type ServiceEvent =
-  | { type: 'start'; id: string; model: string; language: string }
-  | { type: 'delta'; id: string; text: string; fullText: string }
-  | { type: 'done'; id: string; text: string }
-  | { type: 'error'; id?: string; message: string };
-
-export class SpeechTranscriber {
-  constructor(
-    private readonly options: SpeechTranscriberOptions,
-    private readonly logger: LoggerLike,
-  ) {}
-
-  async transcribe(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
-    const audio = this.normalizeAudioPayload(audioBase64, mimeType);
-
-    return await new Promise<string>((resolve, reject) => {
-      let settled = false;
-      const socket = new WebSocket(this.options.serviceUrl);
-
-      const finish = (handler: () => void): void => {
-        if (settled) {
-          return;
-        }
-
-        settled = true;
-        clearTimeout(timeout);
-        socket.removeAllListeners();
-
-        if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
-          socket.close();
-        }
-
-        handler();
-      };
-
-      const timeout = setTimeout(() => {
-        finish(() => {
-          reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`));
-        });
-      }, this.options.requestTimeoutMs);
-
-      socket.on('open', () => {
-        try {
-          socket.send(
-            JSON.stringify({
-              type: 'transcribe',
-              id: requestId,
-              language: this.options.language,
-              audio,
-            }),
-          );
-        } catch (error) {
-          finish(() => {
-            reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
-          });
-        }
-      });
-
-      socket.on('message', (payload) => {
-        const event = this.parseEvent(payload);
-
-        if (!event) {
-          return;
-        }
-
-        if (event.id && event.id !== requestId) {
-          this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
-          return;
-        }
-
-        if (event.type === 'start') {
-          this.logger.info(
-            { requestId, model: event.model, language: event.language },
-            'Speech transcription started',
-          );
-          return;
-        }
-
-        if (event.type === 'delta') {
-          return;
-        }
-
-        if (event.type === 'done') {
-          finish(() => {
-            resolve(event.text.trim());
-          });
-          return;
-        }
-
-        finish(() => {
-          reject(new Error(event.message));
-        });
-      });
-
-      socket.on('error', (error) => {
-        finish(() => {
-          reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
-        });
-      });
-
-      socket.on('close', (code, reasonBuffer) => {
-        if (settled) {
-          return;
-        }
-
-        const reason = reasonBuffer.toString().trim();
-        const detail = reason
-          ? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
-          : `The transcription service closed the connection unexpectedly (code=${code}).`;
-
-        finish(() => {
-          reject(new Error(detail));
-        });
-      });
-    });
-  }
-
-  private normalizeAudioPayload(audioBase64: string, mimeType: string): string {
-    const trimmedAudio = audioBase64.trim();
-
-    if (trimmedAudio.startsWith('data:')) {
-      return trimmedAudio;
-    }
-
-    const normalizedMimeType = mimeType.trim() || 'audio/webm';
-    return `data:${normalizedMimeType};base64,${trimmedAudio}`;
-  }
-
-  private parseEvent(payload: RawData): ServiceEvent | null {
-    const message = this.rawDataToString(payload).trim();
-
-    if (!message) {
-      return null;
-    }
-
-    try {
-      return JSON.parse(message) as ServiceEvent;
-    } catch {
-      this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
-      return null;
-    }
-  }
-
-  private rawDataToString(payload: RawData): string {
-    if (typeof payload === 'string') {
-      return payload;
-    }
-
-    if (payload instanceof ArrayBuffer) {
-      return Buffer.from(payload).toString('utf8');
-    }
-
-    if (Array.isArray(payload)) {
-      return Buffer.concat(payload).toString('utf8');
-    }
-
-    return payload.toString('utf8');
-  }
-}