local multi language STT

2026-03-26 19:38:54 +01:00
parent cc14b4d1b7
commit f2bf70bc7d
18 changed files with 1334 additions and 517 deletions
--- a/client/angular.json
+++ b/client/angular.json
@@ -54,6 +54,11 @@
                "glob": "magick.wasm",
                "input": "node_modules/@imagemagick/magick-wasm/dist",
                "output": "/"
+              },
+              {
+                "glob": "ort-wasm-simd-threaded.jsep.*",
+                "input": "node_modules/@huggingface/transformers/dist",
+                "output": "/transformers-wasm"
              }
            ],
            "styles": [
--- a/client/package-lock.json
+++ b/client/package-lock.json
--- a/client/package.json
+++ b/client/package.json
@@ -18,6 +18,7 @@
    "@angular/forms": "^21.2.0",
    "@angular/platform-browser": "^21.2.0",
    "@angular/router": "^21.2.0",
+    "@huggingface/transformers": "^3.8.1",
    "@imagemagick/magick-wasm": "^0.0.39",
    "bootstrap": "^5.3.8",
    "ngx-extended-pdf-viewer": "^25.6.4",
--- a/client/public/env.js
+++ b/client/public/env.js
@@ -1,3 +1,6 @@
 window.__PRIVATECHAT_ENV__ = {
-  "PRIVATECHAT_CLIENT_SERVER_URL": "https://chatter.dubertrand.fr"
+  "PRIVATECHAT_CLIENT_SERVER_URL": "https://chatter.dubertrand.fr",
+  "PRIVATECHAT_CLIENT_WHISPER_MODEL": "Xenova/whisper-small",
+  "PRIVATECHAT_CLIENT_WHISPER_LANGUAGE": "auto",
+  "PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH": "/transformers-wasm/"
 };
--- a/client/scripts/write-env.js
+++ b/client/scripts/write-env.js
@@ -9,6 +9,10 @@ dotenv.config({ path: rootEnvPath });

 const runtimeEnv = {
  PRIVATECHAT_CLIENT_SERVER_URL: process.env.PRIVATECHAT_CLIENT_SERVER_URL ?? 'http://localhost:3000',
+  PRIVATECHAT_CLIENT_WHISPER_MODEL: process.env.PRIVATECHAT_CLIENT_WHISPER_MODEL ?? 'Xenova/whisper-small',
+  PRIVATECHAT_CLIENT_WHISPER_LANGUAGE: process.env.PRIVATECHAT_CLIENT_WHISPER_LANGUAGE ?? 'auto',
+  PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH:
+    process.env.PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH ?? '/transformers-wasm/',
 };

 const fileContents = `window.__PRIVATECHAT_ENV__ = ${JSON.stringify(runtimeEnv, null, 2)};\n`;
--- a/client/src/app/browser-speech-transcriber.service.ts
+++ b/client/src/app/browser-speech-transcriber.service.ts
@@ -0,0 +1,357 @@
+import { inject, Injectable } from '@angular/core';
+
+import { ChatSessionService } from './chat-session.service';
+import type { DictationLanguage } from './models';
+
+/** Runtime configuration published on `window.__PRIVATECHAT_ENV__`. */
+type PrivateChatRuntimeEnv = {
+  PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH?: string;
+  PRIVATECHAT_CLIENT_WHISPER_LANGUAGE?: string;
+  PRIVATECHAT_CLIENT_WHISPER_MODEL?: string;
+};
+
+/** The only part of a recognition result that this service reads. */
+type AutomaticSpeechRecognitionOutput = {
+  text: string;
+};
+
+/** Call signature of the speech-recognition pipeline as this service uses it. */
+type AutomaticSpeechRecognitionPipeline = (
+  audio: Float32Array,
+  options?: {
+    chunk_length_s?: number;
+    stride_length_s?: number;
+    task?: 'transcribe';
+    language?: string;
+  },
+) => Promise<AutomaticSpeechRecognitionOutput | AutomaticSpeechRecognitionOutput[]>;
+
+/** Structural view of the parts of `@huggingface/transformers` that this service touches. */
+type TransformersModule = {
+  env: {
+    backends: {
+      onnx: {
+        wasm?: {
+          wasmPaths?: string;
+        };
+      };
+    };
+  };
+  pipeline: (
+    task: string,
+    model: string,
+    options?: {
+      device?: 'wasm' | 'webgpu';
+      dtype?: 'fp32';
+      model_file_name?: string;
+      subfolder?: string;
+    },
+  ) => Promise<unknown>;
+};
+
+// Sample rate the decoded waveform is converted to before it is handed to the pipeline.
+const whisperTargetSampleRate = 16_000;
+const defaultWhisperModel = 'Xenova/whisper-small';
+const defaultTransformersWasmPath = '/transformers-wasm/';
+const defaultChunkLengthSeconds = 30;
+const defaultStrideLengthSeconds = 5;
+// Language names passed to the pipeline for each selectable dictation language.
+const whisperLanguageNames: Record<DictationLanguage, string> = {
+  en: 'english',
+  fr: 'french',
+  es: 'spanish',
+};
+
+/** Returns the runtime environment object, or `{}` outside a browser or when it was never set. */
+function readRuntimeEnv(): PrivateChatRuntimeEnv {
+  if (typeof window === 'undefined') {
+    return {};
+  }
+
+  return (window as typeof window & { __PRIVATECHAT_ENV__?: PrivateChatRuntimeEnv }).__PRIVATECHAT_ENV__ ?? {};
+}
+
+/** Returns the standard or `webkit`-prefixed `AudioContext` constructor, or `null` if neither exists. */
+function resolveAudioContextConstructor(): typeof AudioContext | null {
+  if (typeof window === 'undefined') {
+    return null;
+  }
+
+  return window.AudioContext
+    ?? (window as typeof window & { webkitAudioContext?: typeof AudioContext }).webkitAudioContext
+    ?? null;
+}
+
+/**
+ * Creates the context used for decoding. A context running at the target rate is requested first so
+ * that `decodeAudioData` hands back audio already resampled by the browser; if the constructor
+ * rejects that option, a default-rate context is returned and the caller resamples itself.
+ */
+function createDecodingAudioContext(audioContextConstructor: typeof AudioContext): AudioContext {
+  try {
+    return new audioContextConstructor({ sampleRate: whisperTargetSampleRate });
+  } catch {
+    return new audioContextConstructor();
+  }
+}
+
+/**
+ * Transcribes recorded audio in the browser through an `automatic-speech-recognition` pipeline from
+ * `@huggingface/transformers`. The library and the pipeline are loaded lazily and then shared.
+ */
+@Injectable({ providedIn: 'root' })
+export class BrowserSpeechTranscriberService {
+  private readonly session = inject(ChatSessionService);
+  private readonly runtimeEnv = readRuntimeEnv();
+  private readonly modelId = this.runtimeEnv.PRIVATECHAT_CLIENT_WHISPER_MODEL?.trim() || defaultWhisperModel;
+  private readonly fallbackLanguage = this.normalizeLanguage(
+    this.runtimeEnv.PRIVATECHAT_CLIENT_WHISPER_LANGUAGE,
+  );
+  private transformersModulePromise: Promise<TransformersModule> | null = null;
+  private pipelinePromise: Promise<AutomaticSpeechRecognitionPipeline> | null = null;
+
+  /** Loads the pipeline ahead of the first dictation. Rejects when no device could load it. */
+  async preload(): Promise<void> {
+    await this.getPipeline();
+  }
+
+  /**
+   * Converts a recorded audio blob into text.
+   *
+   * @param audioBlob Recorded audio in a format the browser's `decodeAudioData` can decode.
+   * @returns The trimmed transcript; `''` for an empty blob or when the pipeline returns no result.
+   * @throws When the audio cannot be decoded or the pipeline cannot be loaded or run.
+   */
+  async transcribe(audioBlob: Blob): Promise<string> {
+    if (audioBlob.size === 0) {
+      return '';
+    }
+
+    // Decoding does not depend on the pipeline, so a still-pending model load can overlap with it.
+    const [waveform, transcriber] = await Promise.all([
+      this.decodeToWhisperWaveform(audioBlob),
+      this.getPipeline(),
+    ]);
+    // NOTE(review): without a signed-in user, an `auto` or unrecognised configured language ends up
+    // as English here rather than leaving detection to the model — confirm that this is intended.
+    const inputLanguage = this.session.currentUser()
+      ? this.session.dictationLanguage()
+      : this.resolveFallbackInputLanguage();
+    const output = await transcriber(waveform, {
+      chunk_length_s: defaultChunkLengthSeconds,
+      stride_length_s: defaultStrideLengthSeconds,
+      task: 'transcribe',
+      language: whisperLanguageNames[inputLanguage],
+    });
+    // The result may be a list; an empty list must yield an empty transcript, not a TypeError.
+    const transcription = Array.isArray(output) ? output[0] : output;
+
+    return (transcription?.text ?? '').trim();
+  }
+
+  /**
+   * Returns the shared pipeline, creating it on first use. A failed load is dropped from the cache so
+   * that the next call retries instead of replaying the same rejection until the page is reloaded.
+   */
+  private getPipeline(): Promise<AutomaticSpeechRecognitionPipeline> {
+    if (this.pipelinePromise) {
+      return this.pipelinePromise;
+    }
+
+    const pendingPipeline = this.createPreferredPipeline<AutomaticSpeechRecognitionPipeline>(
+      'automatic-speech-recognition',
+      this.modelId,
+    );
+
+    this.pipelinePromise = pendingPipeline;
+    void pendingPipeline.catch(() => {
+      if (this.pipelinePromise === pendingPipeline) {
+        this.pipelinePromise = null;
+      }
+    });
+
+    return pendingPipeline;
+  }
+
+  /**
+   * Imports `@huggingface/transformers` once and points its ONNX wasm backend at the configured asset
+   * path unless a path is already set. A failed import is dropped from the cache so it can be retried.
+   */
+  private async getTransformersModule(): Promise<TransformersModule> {
+    let modulePromise = this.transformersModulePromise;
+
+    if (!modulePromise) {
+      const pendingModule = import('@huggingface/transformers') as Promise<TransformersModule>;
+
+      this.transformersModulePromise = pendingModule;
+      void pendingModule.catch(() => {
+        if (this.transformersModulePromise === pendingModule) {
+          this.transformersModulePromise = null;
+        }
+      });
+      modulePromise = pendingModule;
+    }
+
+    const transformersModule = await modulePromise;
+    const onnxWasmEnv = transformersModule.env.backends.onnx.wasm;
+
+    if (onnxWasmEnv && !onnxWasmEnv.wasmPaths) {
+      onnxWasmEnv.wasmPaths =
+        this.runtimeEnv.PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH?.trim() || defaultTransformersWasmPath;
+    }
+
+    return transformersModule;
+  }
+
+  /**
+   * Builds a pipeline on the first device that loads: WebGPU when the browser exposes
+   * `navigator.gpu`, then wasm. Every failed attempt is logged as a warning.
+   *
+   * @throws The error of the last attempted device when none could load the pipeline.
+   */
+  private async createPreferredPipeline<T>(
+    task: string,
+    model: string,
+    options?: {
+      dtype?: 'fp32';
+      model_file_name?: string;
+      subfolder?: string;
+    },
+  ): Promise<T> {
+    const transformersModule = await this.getTransformersModule();
+    const candidateDevices: Array<'webgpu' | 'wasm'> = this.browserSupportsWebGpu()
+      ? ['webgpu', 'wasm']
+      : ['wasm'];
+    let lastError: unknown = null;
+
+    for (const device of candidateDevices) {
+      try {
+        const pipeline = await transformersModule.pipeline(task, model, {
+          ...options,
+          device,
+        });
+
+        console.info(`[dictation] Loaded ${task} pipeline for ${model} on ${device}.`);
+        return pipeline as T;
+      } catch (error) {
+        lastError = error;
+        console.warn(`[dictation] Could not load ${task} pipeline for ${model} on ${device}.`, error);
+      }
+    }
+
+    throw lastError instanceof Error ? lastError : new Error(`Could not load ${task} pipeline for ${model}.`);
+  }
+
+  /**
+   * Decodes a recording into the mono 16 kHz waveform passed to the pipeline.
+   *
+   * @throws When no `AudioContext` implementation exists or the audio data cannot be decoded.
+   */
+  private async decodeToWhisperWaveform(audioBlob: Blob): Promise<Float32Array> {
+    const audioContextConstructor = resolveAudioContextConstructor();
+
+    if (!audioContextConstructor) {
+      throw new Error('This browser cannot decode recorded audio for dictation.');
+    }
+
+    const encodedAudio = await audioBlob.arrayBuffer();
+    const audioContext = createDecodingAudioContext(audioContextConstructor);
+
+    try {
+      // Decoding detaches the buffer; it is never read afterwards, so no defensive copy is made.
+      const audioBuffer = await audioContext.decodeAudioData(encodedAudio);
+      const monoChannel = this.mixToMono(audioBuffer);
+
+      // Returns the input untouched when the browser already decoded at the target rate.
+      return this.resampleMonoChannel(monoChannel, audioBuffer.sampleRate, whisperTargetSampleRate);
+    } catch (error) {
+      throw error instanceof Error
+        ? error
+        : new Error('Could not decode the recorded dictation audio.');
+    } finally {
+      await audioContext.close().catch(() => undefined);
+    }
+  }
+
+  /** Averages all channels of the buffer into a single channel of the same length. */
+  private mixToMono(audioBuffer: AudioBuffer): Float32Array {
+    const mixed = new Float32Array(audioBuffer.length);
+
+    for (let channelIndex = 0; channelIndex < audioBuffer.numberOfChannels; channelIndex += 1) {
+      const channel = audioBuffer.getChannelData(channelIndex);
+
+      for (let sampleIndex = 0; sampleIndex < channel.length; sampleIndex += 1) {
+        mixed[sampleIndex] += channel[sampleIndex];
+      }
+    }
+
+    if (audioBuffer.numberOfChannels > 1) {
+      for (let sampleIndex = 0; sampleIndex < mixed.length; sampleIndex += 1) {
+        mixed[sampleIndex] /= audioBuffer.numberOfChannels;
+      }
+    }
+
+    return mixed;
+  }
+
+  /**
+   * Linearly interpolates the samples to another sample rate. No low-pass filter is applied, so this
+   * is only the fallback for browsers that could not decode at the target rate directly.
+   */
+  private resampleMonoChannel(
+    monoChannel: Float32Array,
+    sourceSampleRate: number,
+    targetSampleRate: number,
+  ): Float32Array {
+    if (sourceSampleRate === targetSampleRate) {
+      return monoChannel;
+    }
+
+    const targetLength = Math.max(1, Math.round(monoChannel.length * targetSampleRate / sourceSampleRate));
+    const resampled = new Float32Array(targetLength);
+    const positionRatio = sourceSampleRate / targetSampleRate;
+
+    for (let sampleIndex = 0; sampleIndex < targetLength; sampleIndex += 1) {
+      const sourcePosition = sampleIndex * positionRatio;
+      const sourceIndex = Math.floor(sourcePosition);
+      const nextSourceIndex = Math.min(sourceIndex + 1, monoChannel.length - 1);
+      const interpolationWeight = sourcePosition - sourceIndex;
+      const currentValue = monoChannel[sourceIndex] ?? 0;
+      const nextValue = monoChannel[nextSourceIndex] ?? currentValue;
+
+      resampled[sampleIndex] = currentValue + ((nextValue - currentValue) * interpolationWeight);
+    }
+
+    return resampled;
+  }
+
+  /** Trims the configured language; a blank, missing, or `auto` (any case) value becomes `null`. */
+  private normalizeLanguage(language: string | undefined): string | null {
+    const trimmedLanguage = language?.trim();
+
+    if (!trimmedLanguage || trimmedLanguage.toLowerCase() === 'auto') {
+      return null;
+    }
+
+    return trimmedLanguage;
+  }
+
+  /** Reports whether `navigator.gpu` exists; it does not check that an adapter is actually usable. */
+  private browserSupportsWebGpu(): boolean {
+    return typeof navigator !== 'undefined' && 'gpu' in navigator;
+  }
+
+  /** Maps the configured fallback language to a dictation language; anything unrecognised is English. */
+  private resolveFallbackInputLanguage(): DictationLanguage {
+    switch (this.fallbackLanguage?.toLowerCase()) {
+      case 'french':
+      case 'fr':
+        return 'fr';
+      case 'spanish':
+      case 'es':
+        return 'es';
+      default:
+        return 'en';
+    }
+  }
+}
--- a/client/src/app/chat-page.component.html
+++ b/client/src/app/chat-page.component.html
@@ -261,7 +261,7 @@
 	                    <button
 	                      class="composer-dictation"
 	                      type="button"
-	                      [disabled]="!session.isSelectedPeerReady() || session.signalingState() !== 'connected' || isTranscribingDictation()"
+	                      [disabled]="!selectedPeerId || isTranscribingDictation()"
 	                      (click)="toggleDictation(composerTextarea)"
 	                      [title]="
 	                        isDictating()
--- a/client/src/app/chat-page.component.ts
+++ b/client/src/app/chat-page.component.ts
@@ -4,6 +4,7 @@ import { toSignal } from '@angular/core/rxjs-interop';
 import { FormsModule } from '@angular/forms';
 import { ActivatedRoute, Router, RouterLink } from '@angular/router';

+import { BrowserSpeechTranscriberService } from './browser-speech-transcriber.service';
 import { PeerCallModalComponent } from './peer-call-modal.component';
 import { ChatSessionService } from './chat-session.service';
 import { JsonFileViewerComponent } from './json-file-viewer.component';
@@ -36,6 +37,7 @@ export class ChatPageComponent implements OnDestroy {
  private readonly route = inject(ActivatedRoute);
  private readonly router = inject(Router);
  private readonly ngZone = inject(NgZone);
+  private readonly speechTranscriber = inject(BrowserSpeechTranscriberService);
  private readonly routeParamMap = toSignal(this.route.paramMap, {
    initialValue: this.route.snapshot.paramMap,
  });
@@ -274,6 +276,10 @@ export class ChatPageComponent implements OnDestroy {
      void this.router.navigateByUrl('/');
    }

+    queueMicrotask(() => {
+      void this.speechTranscriber.preload().catch(() => undefined);
+    });
+
    effect(() => {
      const currentUserId = this.currentUser()?.id ?? null;
      this.knownPeers.set(this.readKnownPeers(currentUserId));
@@ -1115,16 +1121,16 @@ export class ChatPageComponent implements OnDestroy {

  private async transcribeDictation(blob: Blob, textarea: HTMLTextAreaElement, applyToken: number): Promise<void> {
    try {
-      const transcript = await this.session.requestSpeechTranscription(blob);
+      const transcript = await this.speechTranscriber.transcribe(blob);

      if (applyToken !== this.dictationApplyToken) {
        return;
      }

      this.applyDictatedText(this.mergeDictatedText(this.dictationBaseText, transcript), textarea);
-    } catch {
+    } catch (error) {
      if (applyToken === this.dictationApplyToken) {
-        this.session.error.set('Dictation transcription failed.');
+        this.session.error.set(error instanceof Error ? error.message : 'Dictation transcription failed.');
      }
    } finally {
      if (applyToken === this.dictationApplyToken) {
--- a/client/src/app/chat-session.service.ts
+++ b/client/src/app/chat-session.service.ts
@@ -1,5 +1,5 @@
 import { HttpClient, HttpErrorResponse } from '@angular/common/http';
-import { computed, Injectable, signal } from '@angular/core';
+import { computed, effect, Injectable, signal } from '@angular/core';
 import { ImageMagick, MagickFormat, initializeImageMagick } from '@imagemagick/magick-wasm';
 import { firstValueFrom } from 'rxjs';

@@ -12,6 +12,7 @@ import {
  ChatEntry,
  ConnectionState,
  DataEnvelope,
+  DictationLanguage,
  DeliveryState,
  PendingApprovalResponse,
  PendingApprovalUser,
@@ -126,6 +127,7 @@ export class ChatSessionService {
  private static readonly messageStoreName = 'conversation_messages';
  private static readonly knownPeersStoragePrefix = 'privatechat.knownPeers';
  private static readonly incomingMessageSoundStorageKey = 'privatechat.incomingMessageSoundEnabled';
+  private static readonly dictationLanguageStoragePrefix = 'privatechat.dictationLanguage';
  private static readonly messageRetentionLimit = 256;
  private static readonly sessionKeepaliveMs = 5 * 60 * 1000;
  private static readonly signalingHeartbeatMs = 25 * 1000;
@@ -158,6 +160,7 @@ export class ChatSessionService {
  readonly incomingMessageSoundEnabled = signal(
    this.readStorage(ChatSessionService.incomingMessageSoundStorageKey) !== '0',
  );
+  readonly dictationLanguage = signal<DictationLanguage>('en');
  readonly webAuthnSupported = signal(
    typeof window !== 'undefined' &&
      typeof window.PublicKeyCredential !== 'undefined' &&
@@ -193,10 +196,6 @@ export class ChatSessionService {
    string,
    { peerId: string; prompt: string; waitMessageId: string }
  >();
-  private readonly pendingSpeechTranscriptionRequests = new Map<
-    string,
-    { resolve: (text: string) => void; reject: (reason?: unknown) => void }
-  >();
  private readonly incomingCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
  private readonly outgoingCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
  private readonly activeCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
@@ -224,6 +223,17 @@ export class ChatSessionService {
  constructor(private readonly http: HttpClient) {
    this.installConnectionRecoveryListeners();

+    effect(() => {
+      const currentUserId = this.currentUser()?.id;
+
+      if (!currentUserId) {
+        this.dictationLanguage.set('en');
+        return;
+      }
+
+      this.dictationLanguage.set(this.readStoredDictationLanguage(currentUserId));
+    });
+
    if (this.token() && this.currentUser()) {
      queueMicrotask(() => {
        void this.restoreSession();
@@ -331,6 +341,19 @@ export class ChatSessionService {
    this.writeStorage(ChatSessionService.incomingMessageSoundStorageKey, enabled ? '1' : '0');
  }

+  /** Applies the language and, when a user is signed in, stores it under that user's storage key. */
+  setDictationLanguage(language: DictationLanguage): void {
+    // Normalised first: anything other than the supported codes falls back to 'en'.
+    const normalized = this.normalizeDictationLanguage(language);
+    this.dictationLanguage.set(normalized);
+
+    const userId = this.currentUser()?.id;
+
+    if (userId) {
+      this.writeStorage(this.dictationLanguageStorageKey(userId), normalized);
+    }
+  }
+
  selectPeer(peerId: string): void {
    this.activePeerId.set(peerId);
    this.clearUnreadPeer(peerId);
@@ -1263,32 +1286,6 @@ export class ChatSessionService {
    return true;
  }

-  async requestSpeechTranscription(audioBlob: Blob): Promise<string> {
-    if (!this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
-      throw new Error('You must be connected to signaling before using dictation.');
-    }
-
-    const requestId = crypto.randomUUID();
-    const audioBase64 = await this.blobToBase64(audioBlob);
-
-    return new Promise<string>((resolve, reject) => {
-      this.pendingSpeechTranscriptionRequests.set(requestId, { resolve, reject });
-
-      try {
-        this.error.set(null);
-        this.websocket?.send(JSON.stringify({
-          type: 'speech-transcription',
-          requestId,
-          mimeType: audioBlob.type || 'audio/webm',
-          audioBase64,
-        }));
-      } catch (error) {
-        this.pendingSpeechTranscriptionRequests.delete(requestId);
-        reject(error);
-      }
-    });
-  }
-
  private async loadAccessKeys(): Promise<void> {
    const token = this.token();

@@ -1365,7 +1362,6 @@ export class ChatSessionService {
      const shouldReconnect = this.websocket === websocket && !this.suppressSocketReconnect;

      this.stopWebSocketHeartbeat();
-      this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
      this.signalingState.set('disconnected');
      this.status.set('Signaling connection closed.');

@@ -1408,8 +1404,6 @@ export class ChatSessionService {

  private disconnectWebSocket(): void {
    this.stopWebSocketHeartbeat();
-    this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
-
    if (this.websocket) {
      this.suppressSocketReconnect = true;
      this.websocket.close();
@@ -1450,12 +1444,6 @@ export class ChatSessionService {
      case 'image-generation-error':
        this.handleGeneratedImageError(event);
        break;
-      case 'speech-transcribed':
-        this.handleSpeechTranscribed(event);
-        break;
-      case 'speech-transcription-error':
-        this.handleSpeechTranscriptionError(event);
-        break;
      case 'pong':
        break;
      case 'error':
@@ -1515,28 +1503,6 @@ export class ChatSessionService {
    this.error.set(event.message);
  }

-  private handleSpeechTranscribed(event: Extract<ServerEvent, { type: 'speech-transcribed' }>): void {
-    const pendingRequest = this.pendingSpeechTranscriptionRequests.get(event.requestId);
-
-    if (!pendingRequest) {
-      return;
-    }
-
-    this.pendingSpeechTranscriptionRequests.delete(event.requestId);
-    pendingRequest.resolve(event.text);
-  }
-
-  private handleSpeechTranscriptionError(event: Extract<ServerEvent, { type: 'speech-transcription-error' }>): void {
-    const pendingRequest = this.pendingSpeechTranscriptionRequests.get(event.requestId);
-
-    if (pendingRequest) {
-      this.pendingSpeechTranscriptionRequests.delete(event.requestId);
-      pendingRequest.reject(new Error(event.message));
-    }
-
-    this.error.set(event.message);
-  }
-
  private async restoreSession(): Promise<void> {
    const token = this.token();

@@ -2530,18 +2496,6 @@ export class ChatSessionService {
    }
  }

-  private rejectPendingSpeechTranscriptions(message: string): void {
-    if (this.pendingSpeechTranscriptionRequests.size === 0) {
-      return;
-    }
-
-    for (const { reject } of this.pendingSpeechTranscriptionRequests.values()) {
-      reject(new Error(message));
-    }
-
-    this.pendingSpeechTranscriptionRequests.clear();
-  }
-
  private clearLocalAuth(statusMessage: string): void {
    this.clearWebSocketReconnect();
    this.signalingRecoveryPromise = null;
@@ -2555,7 +2509,6 @@ export class ChatSessionService {
    this.releasePreloadedRingtone();
    this.pendingImageGenerationRequests.clear();
    this.pendingOutgoingFlushes.clear();
-    this.rejectPendingSpeechTranscriptions('Session ended during dictation.');
    this.incomingCallModes.set([]);
    this.outgoingCallModes.set([]);
    this.activeCallModes.set([]);
@@ -3865,4 +3818,25 @@ export class ChatSessionService {

    return responseMessage ?? thrownMessage ?? fallback;
  }
+
+  /** Reads the dictation language saved for the given user; unset or unsupported values give 'en'. */
+  private readStoredDictationLanguage(currentUserId: string): DictationLanguage {
+    const storageKey = this.dictationLanguageStorageKey(currentUserId);
+
+    return this.normalizeDictationLanguage(this.readStorage(storageKey));
+  }
+
+  /** Builds the per-user storage key: the dictation-language prefix, a dot, then the user id. */
+  private dictationLanguageStorageKey(currentUserId: string): string {
+    return [ChatSessionService.dictationLanguageStoragePrefix, currentUserId].join('.');
+  }
+
+  /** Passes the supported codes ('en', 'fr', 'es') through and maps every other value to 'en'. */
+  private normalizeDictationLanguage(value: string | null | undefined): DictationLanguage {
+    if (value === 'en' || value === 'fr' || value === 'es') {
+      return value;
+    }
+
+    return 'en';
+  }
 }
--- a/client/src/app/home-page.component.html
+++ b/client/src/app/home-page.component.html
@@ -188,6 +188,31 @@
              <div class="alert alert-success mb-4">{{ session.notice() }}</div>
            }

+            <section class="access-key-panel mb-4">
+              <div class="dictation-language-panel">
+                <div>
+                  <h3 class="h5 mb-1">Dictation language</h3>
+                  <p class="small text-secondary mb-0">
+                    Speech input and text output use the same selected language.
+                  </p>
+                </div>
+
+                <div class="dictation-language-select-shell mt-3">
+                  <label class="form-label small mb-2" for="dictationLanguage">Language</label>
+                  <select
+                    id="dictationLanguage"
+                    class="form-select"
+                    [ngModel]="session.dictationLanguage()"
+                    (ngModelChange)="setDictationLanguage($event)"
+                  >
+                    @for (option of dictationLanguageOptions; track option.value) {
+                      <option [ngValue]="option.value">{{ option.label }}</option>
+                    }
+                  </select>
+                </div>
+              </div>
+            </section>
+
            <section class="access-key-panel mb-4">
              <div class="d-flex justify-content-between align-items-start gap-3">
                <div>
--- a/client/src/app/home-page.component.scss
+++ b/client/src/app/home-page.component.scss
@@ -114,6 +114,12 @@
  background: var(--panel-soft-background);
 }

+.dictation-language-panel,
+.dictation-language-select-shell {
+  display: grid;
+  gap: 0.75rem;
+}
+
 .user-search-panel {
  display: grid;
  gap: 0.75rem;
@@ -183,7 +189,9 @@
 }

 .form-control,
-.form-control:focus {
+.form-control:focus,
+.form-select,
+.form-select:focus {
  color: var(--page-text);
  background-color: var(--input-background);
  border-color: var(--input-border);
--- a/client/src/app/home-page.component.ts
+++ b/client/src/app/home-page.component.ts
@@ -4,7 +4,7 @@ import { FormsModule } from '@angular/forms';
 import { Router, RouterLink } from '@angular/router';

 import { ChatSessionService } from './chat-session.service';
-import type { AdminUserSummary, UserProfile } from './models';
+import type { AdminUserSummary, DictationLanguage, UserProfile } from './models';
 import { ThemeService } from './theme.service';

@Component({
@@ -32,6 +32,11 @@ export class HomePageComponent {
  readonly loadingAdminUsers = signal(false);
  readonly deletingUserId = signal<string | null>(null);
  readonly adminUsersError = signal<string | null>(null);
+  readonly dictationLanguageOptions: Array<{ value: DictationLanguage; label: string }> = [
+    { value: 'en', label: 'English' },
+    { value: 'fr', label: 'French' },
+    { value: 'es', label: 'Spanish' },
+  ];
  readonly filteredKnownUsers = computed(() => {
    const query = this.userSearch.trim().toLowerCase();
    const users = this.knownUsers();
@@ -202,4 +207,8 @@ export class HomePageComponent {
  setIncomingMessageSound(enabled: boolean): void {
    this.session.setIncomingMessageSoundEnabled(enabled);
  }
+
+  setDictationLanguage(language: string): void {
+    this.session.setDictationLanguage(language as DictationLanguage);
+  }
 }
--- a/client/src/app/models.ts
+++ b/client/src/app/models.ts
@@ -113,6 +113,7 @@ export interface ChatEntry {
 }

 export type CallMode = 'audio' | 'video';
+export type DictationLanguage = 'en' | 'fr' | 'es';

 export type SignalPayload =
  | { type: 'sdp'; description: RTCSessionDescriptionInit }
@@ -138,16 +139,6 @@ export type ServerEvent =
      peerId: string;
      message: string;
    }
-  | {
-      type: 'speech-transcribed';
-      requestId: string;
-      text: string;
-    }
-  | {
-      type: 'speech-transcription-error';
-      requestId: string;
-      message: string;
-    }
  | { type: 'pong' }
  | { type: 'error'; message: string };