local multi language STT

This commit is contained in:
2026-03-26 19:38:54 +01:00
parent cc14b4d1b7
commit f2bf70bc7d
18 changed files with 1334 additions and 517 deletions

View File

@@ -54,6 +54,11 @@
"glob": "magick.wasm",
"input": "node_modules/@imagemagick/magick-wasm/dist",
"output": "/"
},
{
"glob": "ort-wasm-simd-threaded.jsep.*",
"input": "node_modules/@huggingface/transformers/dist",
"output": "/transformers-wasm"
}
],
"styles": [

955
client/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -18,6 +18,7 @@
"@angular/forms": "^21.2.0",
"@angular/platform-browser": "^21.2.0",
"@angular/router": "^21.2.0",
"@huggingface/transformers": "^3.8.1",
"@imagemagick/magick-wasm": "^0.0.39",
"bootstrap": "^5.3.8",
"ngx-extended-pdf-viewer": "^25.6.4",

View File

@@ -1,3 +1,6 @@
window.__PRIVATECHAT_ENV__ = {
"PRIVATECHAT_CLIENT_SERVER_URL": "https://chatter.dubertrand.fr"
"PRIVATECHAT_CLIENT_SERVER_URL": "https://chatter.dubertrand.fr",
"PRIVATECHAT_CLIENT_WHISPER_MODEL": "Xenova/whisper-small",
"PRIVATECHAT_CLIENT_WHISPER_LANGUAGE": "auto",
"PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH": "/transformers-wasm/"
};

View File

@@ -9,6 +9,10 @@ dotenv.config({ path: rootEnvPath });
const runtimeEnv = {
PRIVATECHAT_CLIENT_SERVER_URL: process.env.PRIVATECHAT_CLIENT_SERVER_URL ?? 'http://localhost:3000',
PRIVATECHAT_CLIENT_WHISPER_MODEL: process.env.PRIVATECHAT_CLIENT_WHISPER_MODEL ?? 'Xenova/whisper-small',
PRIVATECHAT_CLIENT_WHISPER_LANGUAGE: process.env.PRIVATECHAT_CLIENT_WHISPER_LANGUAGE ?? 'auto',
PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH:
process.env.PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH ?? '/transformers-wasm/',
};
const fileContents = `window.__PRIVATECHAT_ENV__ = ${JSON.stringify(runtimeEnv, null, 2)};\n`;

View File

@@ -0,0 +1,273 @@
import { inject, Injectable } from '@angular/core';
import { ChatSessionService } from './chat-session.service';
import type { DictationLanguage } from './models';
/** Optional dictation settings carried by the `window.__PRIVATECHAT_ENV__` object. */
interface PrivateChatRuntimeEnv {
  PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH?: string;
  PRIVATECHAT_CLIENT_WHISPER_LANGUAGE?: string;
  PRIVATECHAT_CLIENT_WHISPER_MODEL?: string;
}

/** The part of a speech-recognition result that this file reads. */
interface AutomaticSpeechRecognitionOutput {
  text: string;
}

/** Per-call options passed to the speech-recognition pipeline. */
interface AutomaticSpeechRecognitionOptions {
  chunk_length_s?: number;
  stride_length_s?: number;
  task?: 'transcribe';
  language?: string;
}

/** Callable pipeline: takes a waveform and resolves to one result or a list of results. */
type AutomaticSpeechRecognitionPipeline = (
  audio: Float32Array,
  options?: AutomaticSpeechRecognitionOptions,
) => Promise<AutomaticSpeechRecognitionOutput | AutomaticSpeechRecognitionOutput[]>;

/** Options accepted when a pipeline is created. */
interface PipelineLoadOptions {
  device?: 'wasm' | 'webgpu';
  dtype?: 'fp32';
  model_file_name?: string;
  subfolder?: string;
}

/** Minimal structural view of the `@huggingface/transformers` module used by this file. */
interface TransformersModule {
  env: {
    backends: {
      onnx: {
        wasm?: {
          wasmPaths?: string;
        };
      };
    };
  };
  pipeline: (task: string, model: string, options?: PipelineLoadOptions) => Promise<unknown>;
}
/** Sample rate, in Hz, that recorded audio is converted to before transcription. */
const whisperTargetSampleRate = 16000;

/** Model id used when the runtime environment does not name one. */
const defaultWhisperModel = 'Xenova/whisper-small';

/** Location of the ONNX runtime WASM files used when the runtime environment does not set one. */
const defaultTransformersWasmPath = '/transformers-wasm/';

/** `chunk_length_s` value passed to the pipeline on every transcription. */
const defaultChunkLengthSeconds = 30;

/** `stride_length_s` value passed to the pipeline on every transcription. */
const defaultStrideLengthSeconds = 5;

/** Maps each dictation language code to the language name passed to the pipeline. */
const whisperLanguageNames: Record<DictationLanguage, string> = {
  es: 'spanish',
  fr: 'french',
  en: 'english',
};
/**
 * Reads the runtime configuration object from `window.__PRIVATECHAT_ENV__`.
 *
 * @returns The configured object, or an empty object when `window` does not
 *          exist or the property is missing.
 */
function readRuntimeEnv(): PrivateChatRuntimeEnv {
  type EnvHost = typeof window & { __PRIVATECHAT_ENV__?: PrivateChatRuntimeEnv };
  const host = typeof window !== 'undefined' ? (window as EnvHost) : undefined;
  return host?.__PRIVATECHAT_ENV__ ?? {};
}
/**
 * Finds the `AudioContext` constructor exposed on `window`.
 *
 * @returns `window.AudioContext` when present, otherwise
 *          `window.webkitAudioContext`, otherwise `null` (also `null` when
 *          `window` does not exist).
 */
function resolveAudioContextConstructor(): typeof AudioContext | null {
  if (typeof window === 'undefined') {
    return null;
  }

  const host = window as typeof window & { webkitAudioContext?: typeof AudioContext };
  const standardConstructor = host.AudioContext;
  if (standardConstructor !== undefined && standardConstructor !== null) {
    return standardConstructor;
  }

  return host.webkitAudioContext ?? null;
}
/**
 * Transcribes recorded audio in the browser with a Whisper pipeline from
 * `@huggingface/transformers`.
 *
 * The library module and the pipeline are created lazily and shared between
 * calls. A failed load is not kept: the cached promise is cleared on rejection
 * so the next `preload()` / `transcribe()` call starts a fresh attempt instead
 * of replaying the first error for the lifetime of the page.
 *
 * The transcription language is `session.dictationLanguage()` while
 * `session.currentUser()` is set. Otherwise it comes from
 * `PRIVATECHAT_CLIENT_WHISPER_LANGUAGE`, where `auto`, blank, or unrecognized
 * values resolve to English.
 */
@Injectable({ providedIn: 'root' })
export class BrowserSpeechTranscriberService {
  private readonly session = inject(ChatSessionService);
  private readonly runtimeEnv = readRuntimeEnv();
  private readonly modelId = this.runtimeEnv.PRIVATECHAT_CLIENT_WHISPER_MODEL?.trim() || defaultWhisperModel;
  private readonly fallbackLanguage = this.normalizeLanguage(
    this.runtimeEnv.PRIVATECHAT_CLIENT_WHISPER_LANGUAGE,
  );
  private transformersModulePromise: Promise<TransformersModule> | null = null;
  private pipelinePromise: Promise<AutomaticSpeechRecognitionPipeline> | null = null;

  /**
   * Loads the speech-recognition pipeline ahead of the first dictation.
   *
   * @throws When the pipeline cannot be loaded on any candidate device.
   */
  async preload(): Promise<void> {
    await this.getPipeline();
  }

  /**
   * Transcribes a recorded audio blob to text.
   *
   * @param audioBlob Recorded audio; an empty blob short-circuits to `''`.
   * @returns The trimmed transcript, or `''` when the pipeline returns no result.
   * @throws When the audio cannot be decoded or the pipeline cannot be loaded.
   */
  async transcribe(audioBlob: Blob): Promise<string> {
    if (audioBlob.size === 0) {
      return '';
    }

    const waveform = await this.decodeToWhisperWaveform(audioBlob);
    const transcriber = await this.getPipeline();
    const inputLanguage = this.session.currentUser()
      ? this.session.dictationLanguage()
      : this.resolveFallbackInputLanguage();

    const output = await transcriber(waveform, {
      chunk_length_s: defaultChunkLengthSeconds,
      stride_length_s: defaultStrideLengthSeconds,
      task: 'transcribe',
      language: whisperLanguageNames[inputLanguage],
    });

    // The pipeline type allows an array result; an empty array has no first
    // element, so guard instead of dereferencing `undefined`.
    const transcription = Array.isArray(output) ? output[0] : output;
    return transcription?.text.trim() ?? '';
  }

  /**
   * Returns the shared pipeline, creating it on first use.
   *
   * A rejected creation is dropped from the cache so a later call retries.
   */
  private async getPipeline(): Promise<AutomaticSpeechRecognitionPipeline> {
    const cached = this.pipelinePromise;
    if (cached) {
      return cached;
    }

    const created = this.createPreferredPipeline<AutomaticSpeechRecognitionPipeline>(
      'automatic-speech-recognition',
      this.modelId,
    );
    this.pipelinePromise = created;
    // Only clear the slot if it still holds this attempt; the caller still
    // observes the rejection through `created` itself.
    void created.catch(() => {
      if (this.pipelinePromise === created) {
        this.pipelinePromise = null;
      }
    });
    return created;
  }

  /**
   * Imports `@huggingface/transformers` once and points its ONNX WASM backend
   * at the configured asset path when no path has been set yet.
   *
   * A rejected import is dropped from the cache so a later call retries.
   */
  private async getTransformersModule(): Promise<TransformersModule> {
    let modulePromise = this.transformersModulePromise;
    if (!modulePromise) {
      const created = import('@huggingface/transformers') as Promise<TransformersModule>;
      this.transformersModulePromise = created;
      void created.catch(() => {
        if (this.transformersModulePromise === created) {
          this.transformersModulePromise = null;
        }
      });
      modulePromise = created;
    }

    const transformersModule = await modulePromise;
    const onnxWasmEnv = transformersModule.env.backends.onnx.wasm;
    if (onnxWasmEnv && !onnxWasmEnv.wasmPaths) {
      onnxWasmEnv.wasmPaths =
        this.runtimeEnv.PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH?.trim() || defaultTransformersWasmPath;
    }
    return transformersModule;
  }

  /**
   * Creates a pipeline on the first device that loads successfully.
   *
   * WebGPU is tried first when `navigator.gpu` exists, then WASM.
   *
   * @param task Pipeline task name passed to the library.
   * @param model Model id passed to the library.
   * @param options Extra load options forwarded alongside the device.
   * @returns The loaded pipeline, cast to `T`.
   * @throws The last load error when every candidate device fails.
   */
  private async createPreferredPipeline<T>(
    task: string,
    model: string,
    options?: {
      dtype?: 'fp32';
      model_file_name?: string;
      subfolder?: string;
    },
  ): Promise<T> {
    const transformersModule = await this.getTransformersModule();
    const candidateDevices: Array<'webgpu' | 'wasm'> = this.browserSupportsWebGpu()
      ? ['webgpu', 'wasm']
      : ['wasm'];
    let lastError: unknown = null;

    for (const device of candidateDevices) {
      try {
        const pipeline = await transformersModule.pipeline(task, model, {
          ...options,
          device,
        });
        console.info(`[dictation] Loaded ${task} pipeline for ${model} on ${device}.`);
        return pipeline as T;
      } catch (error) {
        lastError = error;
        console.warn(`[dictation] Could not load ${task} pipeline for ${model} on ${device}.`, error);
      }
    }

    throw lastError instanceof Error ? lastError : new Error(`Could not load ${task} pipeline for ${model}.`);
  }

  /**
   * Decodes a recorded blob into a mono waveform at `whisperTargetSampleRate`.
   *
   * @throws When no `AudioContext` constructor is available or decoding fails.
   */
  private async decodeToWhisperWaveform(audioBlob: Blob): Promise<Float32Array> {
    const audioContextConstructor = resolveAudioContextConstructor();
    if (!audioContextConstructor) {
      throw new Error('This browser cannot decode recorded audio for dictation.');
    }

    const arrayBuffer = await audioBlob.arrayBuffer();
    const audioContext = new audioContextConstructor();
    try {
      const audioBuffer = await audioContext.decodeAudioData(arrayBuffer.slice(0));
      const monoChannel = this.mixToMono(audioBuffer);
      if (audioBuffer.sampleRate === whisperTargetSampleRate) {
        return monoChannel;
      }
      return this.resampleMonoChannel(monoChannel, audioBuffer.sampleRate, whisperTargetSampleRate);
    } catch (error) {
      // Normalize non-Error rejections so callers always get a message.
      throw error instanceof Error
        ? error
        : new Error('Could not decode the recorded dictation audio.');
    } finally {
      // Best effort: a failure to close must not mask the decode result.
      await audioContext.close().catch(() => undefined);
    }
  }

  /** Averages all channels of the buffer into a single channel. */
  private mixToMono(audioBuffer: AudioBuffer): Float32Array {
    const mixed = new Float32Array(audioBuffer.length);
    for (let channelIndex = 0; channelIndex < audioBuffer.numberOfChannels; channelIndex += 1) {
      const channel = audioBuffer.getChannelData(channelIndex);
      for (let sampleIndex = 0; sampleIndex < channel.length; sampleIndex += 1) {
        mixed[sampleIndex] += channel[sampleIndex];
      }
    }

    if (audioBuffer.numberOfChannels > 1) {
      for (let sampleIndex = 0; sampleIndex < mixed.length; sampleIndex += 1) {
        mixed[sampleIndex] /= audioBuffer.numberOfChannels;
      }
    }
    return mixed;
  }

  /**
   * Resamples a mono signal by linear interpolation between neighboring samples.
   *
   * @returns The input unchanged when the rates match; otherwise a new array
   *          of at least one sample.
   */
  private resampleMonoChannel(
    monoChannel: Float32Array,
    sourceSampleRate: number,
    targetSampleRate: number,
  ): Float32Array {
    if (sourceSampleRate === targetSampleRate) {
      return monoChannel;
    }

    const targetLength = Math.max(1, Math.round(monoChannel.length * targetSampleRate / sourceSampleRate));
    const resampled = new Float32Array(targetLength);
    const positionRatio = sourceSampleRate / targetSampleRate;

    for (let sampleIndex = 0; sampleIndex < targetLength; sampleIndex += 1) {
      const sourcePosition = sampleIndex * positionRatio;
      const sourceIndex = Math.floor(sourcePosition);
      // Clamp so the last output sample interpolates against the final input sample.
      const nextSourceIndex = Math.min(sourceIndex + 1, monoChannel.length - 1);
      const interpolationWeight = sourcePosition - sourceIndex;
      const currentValue = monoChannel[sourceIndex] ?? 0;
      const nextValue = monoChannel[nextSourceIndex] ?? currentValue;
      resampled[sampleIndex] = currentValue + ((nextValue - currentValue) * interpolationWeight);
    }
    return resampled;
  }

  /** Trims the configured language; blank or `auto` (any case) becomes `null`. */
  private normalizeLanguage(language: string | undefined): string | null {
    const trimmedLanguage = language?.trim();
    if (!trimmedLanguage || trimmedLanguage.toLowerCase() === 'auto') {
      return null;
    }
    return trimmedLanguage;
  }

  /** Reports whether `navigator.gpu` exists; it does not probe for a usable adapter. */
  private browserSupportsWebGpu(): boolean {
    return typeof navigator !== 'undefined' && 'gpu' in navigator;
  }

  /**
   * Maps the configured fallback language to a dictation language code.
   * Anything other than French or Spanish, including no value, maps to `'en'`.
   */
  private resolveFallbackInputLanguage(): DictationLanguage {
    switch (this.fallbackLanguage?.toLowerCase()) {
      case 'french':
      case 'fr':
        return 'fr';
      case 'spanish':
      case 'es':
        return 'es';
      default:
        return 'en';
    }
  }
}

View File

@@ -261,7 +261,7 @@
<button
class="composer-dictation"
type="button"
[disabled]="!session.isSelectedPeerReady() || session.signalingState() !== 'connected' || isTranscribingDictation()"
[disabled]="!selectedPeerId || isTranscribingDictation()"
(click)="toggleDictation(composerTextarea)"
[title]="
isDictating()

View File

@@ -4,6 +4,7 @@ import { toSignal } from '@angular/core/rxjs-interop';
import { FormsModule } from '@angular/forms';
import { ActivatedRoute, Router, RouterLink } from '@angular/router';
import { BrowserSpeechTranscriberService } from './browser-speech-transcriber.service';
import { PeerCallModalComponent } from './peer-call-modal.component';
import { ChatSessionService } from './chat-session.service';
import { JsonFileViewerComponent } from './json-file-viewer.component';
@@ -36,6 +37,7 @@ export class ChatPageComponent implements OnDestroy {
private readonly route = inject(ActivatedRoute);
private readonly router = inject(Router);
private readonly ngZone = inject(NgZone);
private readonly speechTranscriber = inject(BrowserSpeechTranscriberService);
private readonly routeParamMap = toSignal(this.route.paramMap, {
initialValue: this.route.snapshot.paramMap,
});
@@ -274,6 +276,10 @@ export class ChatPageComponent implements OnDestroy {
void this.router.navigateByUrl('/');
}
queueMicrotask(() => {
void this.speechTranscriber.preload().catch(() => undefined);
});
effect(() => {
const currentUserId = this.currentUser()?.id ?? null;
this.knownPeers.set(this.readKnownPeers(currentUserId));
@@ -1115,16 +1121,16 @@ export class ChatPageComponent implements OnDestroy {
private async transcribeDictation(blob: Blob, textarea: HTMLTextAreaElement, applyToken: number): Promise<void> {
try {
const transcript = await this.session.requestSpeechTranscription(blob);
const transcript = await this.speechTranscriber.transcribe(blob);
if (applyToken !== this.dictationApplyToken) {
return;
}
this.applyDictatedText(this.mergeDictatedText(this.dictationBaseText, transcript), textarea);
} catch {
} catch (error) {
if (applyToken === this.dictationApplyToken) {
this.session.error.set('Dictation transcription failed.');
this.session.error.set(error instanceof Error ? error.message : 'Dictation transcription failed.');
}
} finally {
if (applyToken === this.dictationApplyToken) {

View File

@@ -1,5 +1,5 @@
import { HttpClient, HttpErrorResponse } from '@angular/common/http';
import { computed, Injectable, signal } from '@angular/core';
import { computed, effect, Injectable, signal } from '@angular/core';
import { ImageMagick, MagickFormat, initializeImageMagick } from '@imagemagick/magick-wasm';
import { firstValueFrom } from 'rxjs';
@@ -12,6 +12,7 @@ import {
ChatEntry,
ConnectionState,
DataEnvelope,
DictationLanguage,
DeliveryState,
PendingApprovalResponse,
PendingApprovalUser,
@@ -126,6 +127,7 @@ export class ChatSessionService {
private static readonly messageStoreName = 'conversation_messages';
private static readonly knownPeersStoragePrefix = 'privatechat.knownPeers';
private static readonly incomingMessageSoundStorageKey = 'privatechat.incomingMessageSoundEnabled';
private static readonly dictationLanguageStoragePrefix = 'privatechat.dictationLanguage';
private static readonly messageRetentionLimit = 256;
private static readonly sessionKeepaliveMs = 5 * 60 * 1000;
private static readonly signalingHeartbeatMs = 25 * 1000;
@@ -158,6 +160,7 @@ export class ChatSessionService {
readonly incomingMessageSoundEnabled = signal(
this.readStorage(ChatSessionService.incomingMessageSoundStorageKey) !== '0',
);
readonly dictationLanguage = signal<DictationLanguage>('en');
readonly webAuthnSupported = signal(
typeof window !== 'undefined' &&
typeof window.PublicKeyCredential !== 'undefined' &&
@@ -193,10 +196,6 @@ export class ChatSessionService {
string,
{ peerId: string; prompt: string; waitMessageId: string }
>();
private readonly pendingSpeechTranscriptionRequests = new Map<
string,
{ resolve: (text: string) => void; reject: (reason?: unknown) => void }
>();
private readonly incomingCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
private readonly outgoingCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
private readonly activeCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
@@ -224,6 +223,17 @@ export class ChatSessionService {
constructor(private readonly http: HttpClient) {
this.installConnectionRecoveryListeners();
effect(() => {
const currentUserId = this.currentUser()?.id;
if (!currentUserId) {
this.dictationLanguage.set('en');
return;
}
this.dictationLanguage.set(this.readStoredDictationLanguage(currentUserId));
});
if (this.token() && this.currentUser()) {
queueMicrotask(() => {
void this.restoreSession();
@@ -331,6 +341,19 @@ export class ChatSessionService {
this.writeStorage(ChatSessionService.incomingMessageSoundStorageKey, enabled ? '1' : '0');
}
/**
 * Updates the dictation language signal and, when a user is present,
 * stores the choice under that user's storage key.
 *
 * @param language Requested language; normalized before use.
 */
setDictationLanguage(language: DictationLanguage): void {
  const normalized = this.normalizeDictationLanguage(language);
  this.dictationLanguage.set(normalized);

  const userId = this.currentUser()?.id;
  if (userId) {
    this.writeStorage(this.dictationLanguageStorageKey(userId), normalized);
  }
}
selectPeer(peerId: string): void {
this.activePeerId.set(peerId);
this.clearUnreadPeer(peerId);
@@ -1263,32 +1286,6 @@ export class ChatSessionService {
return true;
}
async requestSpeechTranscription(audioBlob: Blob): Promise<string> {
if (!this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
throw new Error('You must be connected to signaling before using dictation.');
}
const requestId = crypto.randomUUID();
const audioBase64 = await this.blobToBase64(audioBlob);
return new Promise<string>((resolve, reject) => {
this.pendingSpeechTranscriptionRequests.set(requestId, { resolve, reject });
try {
this.error.set(null);
this.websocket?.send(JSON.stringify({
type: 'speech-transcription',
requestId,
mimeType: audioBlob.type || 'audio/webm',
audioBase64,
}));
} catch (error) {
this.pendingSpeechTranscriptionRequests.delete(requestId);
reject(error);
}
});
}
private async loadAccessKeys(): Promise<void> {
const token = this.token();
@@ -1365,7 +1362,6 @@ export class ChatSessionService {
const shouldReconnect = this.websocket === websocket && !this.suppressSocketReconnect;
this.stopWebSocketHeartbeat();
this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
this.signalingState.set('disconnected');
this.status.set('Signaling connection closed.');
@@ -1408,8 +1404,6 @@ export class ChatSessionService {
private disconnectWebSocket(): void {
this.stopWebSocketHeartbeat();
this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
if (this.websocket) {
this.suppressSocketReconnect = true;
this.websocket.close();
@@ -1450,12 +1444,6 @@ export class ChatSessionService {
case 'image-generation-error':
this.handleGeneratedImageError(event);
break;
case 'speech-transcribed':
this.handleSpeechTranscribed(event);
break;
case 'speech-transcription-error':
this.handleSpeechTranscriptionError(event);
break;
case 'pong':
break;
case 'error':
@@ -1515,28 +1503,6 @@ export class ChatSessionService {
this.error.set(event.message);
}
private handleSpeechTranscribed(event: Extract<ServerEvent, { type: 'speech-transcribed' }>): void {
const pendingRequest = this.pendingSpeechTranscriptionRequests.get(event.requestId);
if (!pendingRequest) {
return;
}
this.pendingSpeechTranscriptionRequests.delete(event.requestId);
pendingRequest.resolve(event.text);
}
private handleSpeechTranscriptionError(event: Extract<ServerEvent, { type: 'speech-transcription-error' }>): void {
const pendingRequest = this.pendingSpeechTranscriptionRequests.get(event.requestId);
if (pendingRequest) {
this.pendingSpeechTranscriptionRequests.delete(event.requestId);
pendingRequest.reject(new Error(event.message));
}
this.error.set(event.message);
}
private async restoreSession(): Promise<void> {
const token = this.token();
@@ -2530,18 +2496,6 @@ export class ChatSessionService {
}
}
private rejectPendingSpeechTranscriptions(message: string): void {
if (this.pendingSpeechTranscriptionRequests.size === 0) {
return;
}
for (const { reject } of this.pendingSpeechTranscriptionRequests.values()) {
reject(new Error(message));
}
this.pendingSpeechTranscriptionRequests.clear();
}
private clearLocalAuth(statusMessage: string): void {
this.clearWebSocketReconnect();
this.signalingRecoveryPromise = null;
@@ -2555,7 +2509,6 @@ export class ChatSessionService {
this.releasePreloadedRingtone();
this.pendingImageGenerationRequests.clear();
this.pendingOutgoingFlushes.clear();
this.rejectPendingSpeechTranscriptions('Session ended during dictation.');
this.incomingCallModes.set([]);
this.outgoingCallModes.set([]);
this.activeCallModes.set([]);
@@ -3865,4 +3818,25 @@ export class ChatSessionService {
return responseMessage ?? thrownMessage ?? fallback;
}
/** Reads the stored dictation language for the given user and normalizes it. */
private readStoredDictationLanguage(currentUserId: string): DictationLanguage {
  const storageKey = this.dictationLanguageStorageKey(currentUserId);
  return this.normalizeDictationLanguage(this.readStorage(storageKey));
}
/** Builds the per-user storage key: the shared prefix, a dot, then the user id. */
private dictationLanguageStorageKey(currentUserId: string): string {
  const prefix = ChatSessionService.dictationLanguageStoragePrefix;
  return [prefix, currentUserId].join('.');
}
/**
 * Coerces an arbitrary value to a supported dictation language.
 *
 * @returns `value` when it is exactly `'fr'`, `'es'` or `'en'`; otherwise `'en'`.
 */
private normalizeDictationLanguage(value: string | null | undefined): DictationLanguage {
  if (value === 'fr' || value === 'es' || value === 'en') {
    return value;
  }
  return 'en';
}
}

View File

@@ -188,6 +188,31 @@
<div class="alert alert-success mb-4">{{ session.notice() }}</div>
}
<section class="access-key-panel mb-4">
<div class="dictation-language-panel">
<div>
<h3 class="h5 mb-1">Dictation language</h3>
<p class="small text-secondary mb-0">
Speech input and text output use the same selected language.
</p>
</div>
<div class="dictation-language-select-shell mt-3">
<label class="form-label small mb-2" for="dictationLanguage">Language</label>
<select
id="dictationLanguage"
class="form-select"
[ngModel]="session.dictationLanguage()"
(ngModelChange)="setDictationLanguage($event)"
>
@for (option of dictationLanguageOptions; track option.value) {
<option [ngValue]="option.value">{{ option.label }}</option>
}
</select>
</div>
</div>
</section>
<section class="access-key-panel mb-4">
<div class="d-flex justify-content-between align-items-start gap-3">
<div>

View File

@@ -114,6 +114,12 @@
background: var(--panel-soft-background);
}
.dictation-language-panel,
.dictation-language-select-shell {
display: grid;
gap: 0.75rem;
}
.user-search-panel {
display: grid;
gap: 0.75rem;
@@ -183,7 +189,9 @@
}
.form-control,
.form-control:focus {
.form-control:focus,
.form-select,
.form-select:focus {
color: var(--page-text);
background-color: var(--input-background);
border-color: var(--input-border);

View File

@@ -4,7 +4,7 @@ import { FormsModule } from '@angular/forms';
import { Router, RouterLink } from '@angular/router';
import { ChatSessionService } from './chat-session.service';
import type { AdminUserSummary, UserProfile } from './models';
import type { AdminUserSummary, DictationLanguage, UserProfile } from './models';
import { ThemeService } from './theme.service';
@Component({
@@ -32,6 +32,11 @@ export class HomePageComponent {
readonly loadingAdminUsers = signal(false);
readonly deletingUserId = signal<string | null>(null);
readonly adminUsersError = signal<string | null>(null);
readonly dictationLanguageOptions: Array<{ value: DictationLanguage; label: string }> = [
{ value: 'en', label: 'English' },
{ value: 'fr', label: 'French' },
{ value: 'es', label: 'Spanish' },
];
readonly filteredKnownUsers = computed(() => {
const query = this.userSearch.trim().toLowerCase();
const users = this.knownUsers();
@@ -202,4 +207,8 @@ export class HomePageComponent {
setIncomingMessageSound(enabled: boolean): void {
this.session.setIncomingMessageSoundEnabled(enabled);
}
/** Forwards the language picked in the template's select to the session service. */
setDictationLanguage(language: string): void {
  const selectedLanguage = language as DictationLanguage;
  this.session.setDictationLanguage(selectedLanguage);
}
}

View File

@@ -113,6 +113,7 @@ export interface ChatEntry {
}
export type CallMode = 'audio' | 'video';
export type DictationLanguage = 'en' | 'fr' | 'es';
export type SignalPayload =
| { type: 'sdp'; description: RTCSessionDescriptionInit }
@@ -138,16 +139,6 @@ export type ServerEvent =
peerId: string;
message: string;
}
| {
type: 'speech-transcribed';
requestId: string;
text: string;
}
| {
type: 'speech-transcription-error';
requestId: string;
message: string;
}
| { type: 'pong' }
| { type: 'error'; message: string };