local multi language STT
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -9,3 +9,4 @@ server/server/data/master.key
|
||||
client/dist/*
|
||||
client/apple-client/WebApp/**
|
||||
server/data/master.key
|
||||
.vscode/extensions.json
|
||||
|
||||
@@ -54,6 +54,11 @@
|
||||
"glob": "magick.wasm",
|
||||
"input": "node_modules/@imagemagick/magick-wasm/dist",
|
||||
"output": "/"
|
||||
},
|
||||
{
|
||||
"glob": "ort-wasm-simd-threaded.jsep.*",
|
||||
"input": "node_modules/@huggingface/transformers/dist",
|
||||
"output": "/transformers-wasm"
|
||||
}
|
||||
],
|
||||
"styles": [
|
||||
|
||||
955
client/package-lock.json
generated
955
client/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -18,6 +18,7 @@
|
||||
"@angular/forms": "^21.2.0",
|
||||
"@angular/platform-browser": "^21.2.0",
|
||||
"@angular/router": "^21.2.0",
|
||||
"@huggingface/transformers": "^3.8.1",
|
||||
"@imagemagick/magick-wasm": "^0.0.39",
|
||||
"bootstrap": "^5.3.8",
|
||||
"ngx-extended-pdf-viewer": "^25.6.4",
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
window.__PRIVATECHAT_ENV__ = {
|
||||
"PRIVATECHAT_CLIENT_SERVER_URL": "https://chatter.dubertrand.fr"
|
||||
"PRIVATECHAT_CLIENT_SERVER_URL": "https://chatter.dubertrand.fr",
|
||||
"PRIVATECHAT_CLIENT_WHISPER_MODEL": "Xenova/whisper-small",
|
||||
"PRIVATECHAT_CLIENT_WHISPER_LANGUAGE": "auto",
|
||||
"PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH": "/transformers-wasm/"
|
||||
};
|
||||
|
||||
@@ -9,6 +9,10 @@ dotenv.config({ path: rootEnvPath });
|
||||
|
||||
// Value written for each client setting when the matching process.env variable is unset.
// Key order here is the key order of the generated file.
const runtimeEnvDefaults = {
  PRIVATECHAT_CLIENT_SERVER_URL: 'http://localhost:3000',
  PRIVATECHAT_CLIENT_WHISPER_MODEL: 'Xenova/whisper-small',
  PRIVATECHAT_CLIENT_WHISPER_LANGUAGE: 'auto',
  PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH: '/transformers-wasm/',
};

// Resolve every setting from the environment, falling back to its default only
// when the variable is null/undefined (an empty string is kept as-is).
const runtimeEnv = Object.fromEntries(
  Object.entries(runtimeEnvDefaults).map(([name, fallback]) => [name, process.env[name] ?? fallback]),
);

// Script source that assigns the resolved settings to window.__PRIVATECHAT_ENV__.
const fileContents = `window.__PRIVATECHAT_ENV__ = ${JSON.stringify(runtimeEnv, null, 2)};\n`;
|
||||
|
||||
273
client/src/app/browser-speech-transcriber.service.ts
Normal file
273
client/src/app/browser-speech-transcriber.service.ts
Normal file
@@ -0,0 +1,273 @@
|
||||
import { inject, Injectable } from '@angular/core';
|
||||
|
||||
import { ChatSessionService } from './chat-session.service';
|
||||
import type { DictationLanguage } from './models';
|
||||
|
||||
type PrivateChatRuntimeEnv = {
|
||||
PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH?: string;
|
||||
PRIVATECHAT_CLIENT_WHISPER_LANGUAGE?: string;
|
||||
PRIVATECHAT_CLIENT_WHISPER_MODEL?: string;
|
||||
};
|
||||
|
||||
type AutomaticSpeechRecognitionOutput = {
|
||||
text: string;
|
||||
};
|
||||
|
||||
type AutomaticSpeechRecognitionPipeline = (
|
||||
audio: Float32Array,
|
||||
options?: {
|
||||
chunk_length_s?: number;
|
||||
stride_length_s?: number;
|
||||
task?: 'transcribe';
|
||||
language?: string;
|
||||
},
|
||||
) => Promise<AutomaticSpeechRecognitionOutput | AutomaticSpeechRecognitionOutput[]>;
|
||||
|
||||
type TransformersModule = {
|
||||
env: {
|
||||
backends: {
|
||||
onnx: {
|
||||
wasm?: {
|
||||
wasmPaths?: string;
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
pipeline: (
|
||||
task: string,
|
||||
model: string,
|
||||
options?: {
|
||||
device?: 'wasm' | 'webgpu';
|
||||
dtype?: 'fp32';
|
||||
model_file_name?: string;
|
||||
subfolder?: string;
|
||||
},
|
||||
) => Promise<unknown>;
|
||||
};
|
||||
|
||||
const whisperTargetSampleRate = 16_000;
|
||||
const defaultWhisperModel = 'Xenova/whisper-small';
|
||||
const defaultTransformersWasmPath = '/transformers-wasm/';
|
||||
const defaultChunkLengthSeconds = 30;
|
||||
const defaultStrideLengthSeconds = 5;
|
||||
const whisperLanguageNames: Record<DictationLanguage, string> = {
|
||||
en: 'english',
|
||||
fr: 'french',
|
||||
es: 'spanish',
|
||||
};
|
||||
|
||||
/**
 * Reads the runtime configuration object published on `window.__PRIVATECHAT_ENV__`.
 *
 * @returns The published settings, or an empty object when there is no `window`
 *          or the property is null/undefined.
 */
function readRuntimeEnv(): PrivateChatRuntimeEnv {
  if (typeof window !== 'undefined') {
    const host = window as typeof window & { __PRIVATECHAT_ENV__?: PrivateChatRuntimeEnv };

    return host.__PRIVATECHAT_ENV__ ?? {};
  }

  return {};
}
|
||||
|
||||
/**
 * Picks the `AudioContext` constructor exposed by the current environment.
 *
 * @returns `window.AudioContext` when present, otherwise `window.webkitAudioContext`,
 *          or `null` when neither exists or there is no `window` at all.
 */
function resolveAudioContextConstructor(): typeof AudioContext | null {
  if (typeof window !== 'undefined') {
    const host = window as typeof window & { webkitAudioContext?: typeof AudioContext };
    const standardConstructor = host.AudioContext;
    const prefixedConstructor = host.webkitAudioContext;

    return standardConstructor ?? prefixedConstructor ?? null;
  }

  return null;
}
|
||||
|
||||
/**
 * Runs Whisper speech-to-text locally in the browser through `@huggingface/transformers`.
 *
 * Recorded audio is decoded with the Web Audio API, mixed down to one channel,
 * resampled to `whisperTargetSampleRate` and passed to an
 * `automatic-speech-recognition` pipeline. The model id, fallback language and ONNX
 * WASM asset path are read from `window.__PRIVATECHAT_ENV__`, with built-in defaults.
 */
@Injectable({ providedIn: 'root' })
export class BrowserSpeechTranscriberService {
  private readonly session = inject(ChatSessionService);
  private readonly runtimeEnv = readRuntimeEnv();
  // `||` (not `??`) on purpose: a blank/whitespace-only setting also falls back to the default.
  private readonly modelId = this.runtimeEnv.PRIVATECHAT_CLIENT_WHISPER_MODEL?.trim() || defaultWhisperModel;
  private readonly fallbackLanguage = this.normalizeLanguage(
    this.runtimeEnv.PRIVATECHAT_CLIENT_WHISPER_LANGUAGE,
  );
  // In-flight or completed loads, shared by concurrent callers. Both are reset to
  // null when the load rejects so that a later call can retry.
  private transformersModulePromise: Promise<TransformersModule> | null = null;
  private pipelinePromise: Promise<AutomaticSpeechRecognitionPipeline> | null = null;

  /**
   * Starts loading the speech recognition pipeline ahead of the first dictation.
   *
   * @throws Whatever error the pipeline load rejects with.
   */
  async preload(): Promise<void> {
    await this.getPipeline();
  }

  /**
   * Transcribes one recorded audio blob to text.
   *
   * The language is the session's dictation language when a user is signed in,
   * otherwise the language derived from the runtime configuration.
   *
   * @param audioBlob Recorded audio in any container the browser can decode.
   * @returns The trimmed transcript; an empty string for an empty blob or an empty result.
   * @throws When the audio cannot be decoded or the pipeline cannot be loaded or run.
   */
  async transcribe(audioBlob: Blob): Promise<string> {
    if (audioBlob.size === 0) {
      return '';
    }

    const waveform = await this.decodeToWhisperWaveform(audioBlob);
    const transcriber = await this.getPipeline();
    const inputLanguage = this.session.currentUser()
      ? this.session.dictationLanguage()
      : this.resolveFallbackInputLanguage();
    const output = await transcriber(waveform, {
      chunk_length_s: defaultChunkLengthSeconds,
      stride_length_s: defaultStrideLengthSeconds,
      task: 'transcribe',
      language: whisperLanguageNames[inputLanguage],
    });
    const transcription = Array.isArray(output) ? output[0] : output;

    // An empty result array yields no entry; report that as an empty transcript
    // rather than failing on `undefined.text`.
    return transcription?.text.trim() ?? '';
  }

  /**
   * Returns the shared pipeline, creating it on first use.
   *
   * A rejected load is not kept: the cache is cleared so the next call starts a
   * fresh attempt instead of replaying the old failure for the rest of the page's life.
   */
  private async getPipeline(): Promise<AutomaticSpeechRecognitionPipeline> {
    let pending = this.pipelinePromise;

    if (!pending) {
      const created = this.createPreferredPipeline<AutomaticSpeechRecognitionPipeline>(
        'automatic-speech-recognition',
        this.modelId,
      );

      this.pipelinePromise = created;
      // Callers still observe the rejection through `created`; this handler only
      // clears the cache, and only if no newer attempt has replaced it.
      void created.catch(() => {
        if (this.pipelinePromise === created) {
          this.pipelinePromise = null;
        }
      });
      pending = created;
    }

    return await pending;
  }

  /**
   * Lazily imports `@huggingface/transformers` and points its ONNX WASM backend at
   * the configured asset path, unless a path has already been set.
   */
  private async getTransformersModule(): Promise<TransformersModule> {
    let pending = this.transformersModulePromise;

    if (!pending) {
      const created = import('@huggingface/transformers') as Promise<TransformersModule>;

      this.transformersModulePromise = created;
      // Drop a failed import from the cache so a later call can try again.
      // NOTE(review): whether the runtime itself re-fetches a failed dynamic import
      // depends on the browser/bundler — confirm.
      void created.catch(() => {
        if (this.transformersModulePromise === created) {
          this.transformersModulePromise = null;
        }
      });
      pending = created;
    }

    const transformersModule = await pending;
    const onnxWasmEnv = transformersModule.env.backends.onnx.wasm;

    if (onnxWasmEnv && !onnxWasmEnv.wasmPaths) {
      onnxWasmEnv.wasmPaths =
        this.runtimeEnv.PRIVATECHAT_CLIENT_TRANSFORMERS_WASM_PATH?.trim() || defaultTransformersWasmPath;
    }

    return transformersModule;
  }

  /**
   * Creates a pipeline on the first device that loads successfully.
   *
   * WebGPU is tried first when `navigator.gpu` exists; WASM is always the last candidate.
   *
   * @param task Pipeline task name passed to `pipeline()`.
   * @param model Model id passed to `pipeline()`.
   * @param options Extra pipeline options; `device` is filled in per attempt.
   * @returns The loaded pipeline, cast to `T` without runtime validation.
   * @throws The last load error once every candidate device has failed.
   */
  private async createPreferredPipeline<T>(
    task: string,
    model: string,
    options?: {
      dtype?: 'fp32';
      model_file_name?: string;
      subfolder?: string;
    },
  ): Promise<T> {
    const transformersModule = await this.getTransformersModule();
    const candidateDevices: Array<'webgpu' | 'wasm'> = this.browserSupportsWebGpu()
      ? ['webgpu', 'wasm']
      : ['wasm'];
    let lastError: unknown = null;

    for (const device of candidateDevices) {
      try {
        const pipeline = await transformersModule.pipeline(task, model, {
          ...options,
          device,
        });

        console.info(`[dictation] Loaded ${task} pipeline for ${model} on ${device}.`);
        return pipeline as T;
      } catch (error) {
        // Remember the failure and fall through to the next candidate device.
        lastError = error;
        console.warn(`[dictation] Could not load ${task} pipeline for ${model} on ${device}.`, error);
      }
    }

    throw lastError instanceof Error ? lastError : new Error(`Could not load ${task} pipeline for ${model}.`);
  }

  /**
   * Decodes a recorded blob into a mono waveform at `whisperTargetSampleRate`.
   *
   * @throws When no `AudioContext` implementation exists or decoding fails.
   */
  private async decodeToWhisperWaveform(audioBlob: Blob): Promise<Float32Array> {
    const audioContextConstructor = resolveAudioContextConstructor();

    if (!audioContextConstructor) {
      throw new Error('This browser cannot decode recorded audio for dictation.');
    }

    const arrayBuffer = await audioBlob.arrayBuffer();
    const audioContext = new audioContextConstructor();

    try {
      // The buffer is private to this call and never read again, so it is handed
      // over directly instead of being copied first.
      const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
      const monoChannel = this.mixToMono(audioBuffer);

      if (audioBuffer.sampleRate === whisperTargetSampleRate) {
        return monoChannel;
      }

      return this.resampleMonoChannel(monoChannel, audioBuffer.sampleRate, whisperTargetSampleRate);
    } catch (error) {
      // Always surface an Error instance, even if the rejection value was not one.
      throw error instanceof Error
        ? error
        : new Error('Could not decode the recorded dictation audio.');
    } finally {
      // Best-effort release of the context; a failing close() must not mask the result.
      await audioContext.close().catch(() => undefined);
    }
  }

  /**
   * Averages all channels of `audioBuffer` into a single channel.
   *
   * @returns A new array of `audioBuffer.length` samples.
   */
  private mixToMono(audioBuffer: AudioBuffer): Float32Array {
    const mixed = new Float32Array(audioBuffer.length);

    for (let channelIndex = 0; channelIndex < audioBuffer.numberOfChannels; channelIndex += 1) {
      const channel = audioBuffer.getChannelData(channelIndex);

      for (let sampleIndex = 0; sampleIndex < channel.length; sampleIndex += 1) {
        mixed[sampleIndex] += channel[sampleIndex];
      }
    }

    // A single channel is already the mix; only divide when several were summed.
    if (audioBuffer.numberOfChannels > 1) {
      for (let sampleIndex = 0; sampleIndex < mixed.length; sampleIndex += 1) {
        mixed[sampleIndex] /= audioBuffer.numberOfChannels;
      }
    }

    return mixed;
  }

  /**
   * Resamples a mono waveform by linear interpolation between neighbouring samples.
   *
   * @returns `monoChannel` itself when both rates are equal, otherwise a new array
   *          of at least one sample.
   */
  private resampleMonoChannel(
    monoChannel: Float32Array,
    sourceSampleRate: number,
    targetSampleRate: number,
  ): Float32Array {
    if (sourceSampleRate === targetSampleRate) {
      return monoChannel;
    }

    const targetLength = Math.max(1, Math.round(monoChannel.length * targetSampleRate / sourceSampleRate));
    const resampled = new Float32Array(targetLength);
    const positionRatio = sourceSampleRate / targetSampleRate;

    for (let sampleIndex = 0; sampleIndex < targetLength; sampleIndex += 1) {
      const sourcePosition = sampleIndex * positionRatio;
      const sourceIndex = Math.floor(sourcePosition);
      // Clamp so the final output sample interpolates against the last input sample.
      const nextSourceIndex = Math.min(sourceIndex + 1, monoChannel.length - 1);
      const interpolationWeight = sourcePosition - sourceIndex;
      // Out-of-range reads (e.g. empty input) count as silence.
      const currentValue = monoChannel[sourceIndex] ?? 0;
      const nextValue = monoChannel[nextSourceIndex] ?? currentValue;

      resampled[sampleIndex] = currentValue + ((nextValue - currentValue) * interpolationWeight);
    }

    return resampled;
  }

  /**
   * Trims a configured language value.
   *
   * @returns The trimmed value, or `null` when it is missing, blank or `auto` (any casing).
   */
  private normalizeLanguage(language: string | undefined): string | null {
    const trimmedLanguage = language?.trim();

    if (!trimmedLanguage || trimmedLanguage.toLowerCase() === 'auto') {
      return null;
    }

    return trimmedLanguage;
  }

  /** Reports whether `navigator.gpu` exists; it does not check that an adapter is usable. */
  private browserSupportsWebGpu(): boolean {
    return typeof navigator !== 'undefined' && 'gpu' in navigator;
  }

  /**
   * Maps the configured fallback language to a supported dictation language.
   *
   * Accepts `french`/`fr` and `spanish`/`es` in any casing. Every other value,
   * including an unset or `auto` configuration, resolves to English.
   */
  private resolveFallbackInputLanguage(): DictationLanguage {
    switch (this.fallbackLanguage?.toLowerCase()) {
      case 'french':
      case 'fr':
        return 'fr';
      case 'spanish':
      case 'es':
        return 'es';
      default:
        return 'en';
    }
  }
}
|
||||
@@ -261,7 +261,7 @@
|
||||
<button
|
||||
class="composer-dictation"
|
||||
type="button"
|
||||
[disabled]="!session.isSelectedPeerReady() || session.signalingState() !== 'connected' || isTranscribingDictation()"
|
||||
[disabled]="!selectedPeerId || isTranscribingDictation()"
|
||||
(click)="toggleDictation(composerTextarea)"
|
||||
[title]="
|
||||
isDictating()
|
||||
|
||||
@@ -4,6 +4,7 @@ import { toSignal } from '@angular/core/rxjs-interop';
|
||||
import { FormsModule } from '@angular/forms';
|
||||
import { ActivatedRoute, Router, RouterLink } from '@angular/router';
|
||||
|
||||
import { BrowserSpeechTranscriberService } from './browser-speech-transcriber.service';
|
||||
import { PeerCallModalComponent } from './peer-call-modal.component';
|
||||
import { ChatSessionService } from './chat-session.service';
|
||||
import { JsonFileViewerComponent } from './json-file-viewer.component';
|
||||
@@ -36,6 +37,7 @@ export class ChatPageComponent implements OnDestroy {
|
||||
private readonly route = inject(ActivatedRoute);
|
||||
private readonly router = inject(Router);
|
||||
private readonly ngZone = inject(NgZone);
|
||||
private readonly speechTranscriber = inject(BrowserSpeechTranscriberService);
|
||||
private readonly routeParamMap = toSignal(this.route.paramMap, {
|
||||
initialValue: this.route.snapshot.paramMap,
|
||||
});
|
||||
@@ -274,6 +276,10 @@ export class ChatPageComponent implements OnDestroy {
|
||||
void this.router.navigateByUrl('/');
|
||||
}
|
||||
|
||||
queueMicrotask(() => {
|
||||
void this.speechTranscriber.preload().catch(() => undefined);
|
||||
});
|
||||
|
||||
effect(() => {
|
||||
const currentUserId = this.currentUser()?.id ?? null;
|
||||
this.knownPeers.set(this.readKnownPeers(currentUserId));
|
||||
@@ -1115,16 +1121,16 @@ export class ChatPageComponent implements OnDestroy {
|
||||
|
||||
private async transcribeDictation(blob: Blob, textarea: HTMLTextAreaElement, applyToken: number): Promise<void> {
|
||||
try {
|
||||
const transcript = await this.session.requestSpeechTranscription(blob);
|
||||
const transcript = await this.speechTranscriber.transcribe(blob);
|
||||
|
||||
if (applyToken !== this.dictationApplyToken) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.applyDictatedText(this.mergeDictatedText(this.dictationBaseText, transcript), textarea);
|
||||
} catch {
|
||||
} catch (error) {
|
||||
if (applyToken === this.dictationApplyToken) {
|
||||
this.session.error.set('Dictation transcription failed.');
|
||||
this.session.error.set(error instanceof Error ? error.message : 'Dictation transcription failed.');
|
||||
}
|
||||
} finally {
|
||||
if (applyToken === this.dictationApplyToken) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { HttpClient, HttpErrorResponse } from '@angular/common/http';
|
||||
import { computed, Injectable, signal } from '@angular/core';
|
||||
import { computed, effect, Injectable, signal } from '@angular/core';
|
||||
import { ImageMagick, MagickFormat, initializeImageMagick } from '@imagemagick/magick-wasm';
|
||||
import { firstValueFrom } from 'rxjs';
|
||||
|
||||
@@ -12,6 +12,7 @@ import {
|
||||
ChatEntry,
|
||||
ConnectionState,
|
||||
DataEnvelope,
|
||||
DictationLanguage,
|
||||
DeliveryState,
|
||||
PendingApprovalResponse,
|
||||
PendingApprovalUser,
|
||||
@@ -126,6 +127,7 @@ export class ChatSessionService {
|
||||
private static readonly messageStoreName = 'conversation_messages';
|
||||
private static readonly knownPeersStoragePrefix = 'privatechat.knownPeers';
|
||||
private static readonly incomingMessageSoundStorageKey = 'privatechat.incomingMessageSoundEnabled';
|
||||
private static readonly dictationLanguageStoragePrefix = 'privatechat.dictationLanguage';
|
||||
private static readonly messageRetentionLimit = 256;
|
||||
private static readonly sessionKeepaliveMs = 5 * 60 * 1000;
|
||||
private static readonly signalingHeartbeatMs = 25 * 1000;
|
||||
@@ -158,6 +160,7 @@ export class ChatSessionService {
|
||||
readonly incomingMessageSoundEnabled = signal(
|
||||
this.readStorage(ChatSessionService.incomingMessageSoundStorageKey) !== '0',
|
||||
);
|
||||
readonly dictationLanguage = signal<DictationLanguage>('en');
|
||||
readonly webAuthnSupported = signal(
|
||||
typeof window !== 'undefined' &&
|
||||
typeof window.PublicKeyCredential !== 'undefined' &&
|
||||
@@ -193,10 +196,6 @@ export class ChatSessionService {
|
||||
string,
|
||||
{ peerId: string; prompt: string; waitMessageId: string }
|
||||
>();
|
||||
private readonly pendingSpeechTranscriptionRequests = new Map<
|
||||
string,
|
||||
{ resolve: (text: string) => void; reject: (reason?: unknown) => void }
|
||||
>();
|
||||
private readonly incomingCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
|
||||
private readonly outgoingCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
|
||||
private readonly activeCallModes = signal<Array<{ peerId: string; mode: CallMode }>>([]);
|
||||
@@ -224,6 +223,17 @@ export class ChatSessionService {
|
||||
constructor(private readonly http: HttpClient) {
|
||||
this.installConnectionRecoveryListeners();
|
||||
|
||||
effect(() => {
|
||||
const currentUserId = this.currentUser()?.id;
|
||||
|
||||
if (!currentUserId) {
|
||||
this.dictationLanguage.set('en');
|
||||
return;
|
||||
}
|
||||
|
||||
this.dictationLanguage.set(this.readStoredDictationLanguage(currentUserId));
|
||||
});
|
||||
|
||||
if (this.token() && this.currentUser()) {
|
||||
queueMicrotask(() => {
|
||||
void this.restoreSession();
|
||||
@@ -331,6 +341,19 @@ export class ChatSessionService {
|
||||
this.writeStorage(ChatSessionService.incomingMessageSoundStorageKey, enabled ? '1' : '0');
|
||||
}
|
||||
|
||||
/**
 * Updates the active dictation language and, when a user is signed in, stores the
 * choice under that user's storage key.
 *
 * @param language Requested language; it is passed through `normalizeDictationLanguage` first.
 */
setDictationLanguage(language: DictationLanguage): void {
  const normalized = this.normalizeDictationLanguage(language);
  this.dictationLanguage.set(normalized);

  // Without a signed-in user there is no per-user key, so only the signal is updated.
  const userId = this.currentUser()?.id;

  if (userId) {
    this.writeStorage(this.dictationLanguageStorageKey(userId), normalized);
  }
}
|
||||
|
||||
selectPeer(peerId: string): void {
|
||||
this.activePeerId.set(peerId);
|
||||
this.clearUnreadPeer(peerId);
|
||||
@@ -1263,32 +1286,6 @@ export class ChatSessionService {
|
||||
return true;
|
||||
}
|
||||
|
||||
async requestSpeechTranscription(audioBlob: Blob): Promise<string> {
|
||||
if (!this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
|
||||
throw new Error('You must be connected to signaling before using dictation.');
|
||||
}
|
||||
|
||||
const requestId = crypto.randomUUID();
|
||||
const audioBase64 = await this.blobToBase64(audioBlob);
|
||||
|
||||
return new Promise<string>((resolve, reject) => {
|
||||
this.pendingSpeechTranscriptionRequests.set(requestId, { resolve, reject });
|
||||
|
||||
try {
|
||||
this.error.set(null);
|
||||
this.websocket?.send(JSON.stringify({
|
||||
type: 'speech-transcription',
|
||||
requestId,
|
||||
mimeType: audioBlob.type || 'audio/webm',
|
||||
audioBase64,
|
||||
}));
|
||||
} catch (error) {
|
||||
this.pendingSpeechTranscriptionRequests.delete(requestId);
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private async loadAccessKeys(): Promise<void> {
|
||||
const token = this.token();
|
||||
|
||||
@@ -1365,7 +1362,6 @@ export class ChatSessionService {
|
||||
const shouldReconnect = this.websocket === websocket && !this.suppressSocketReconnect;
|
||||
|
||||
this.stopWebSocketHeartbeat();
|
||||
this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
|
||||
this.signalingState.set('disconnected');
|
||||
this.status.set('Signaling connection closed.');
|
||||
|
||||
@@ -1408,8 +1404,6 @@ export class ChatSessionService {
|
||||
|
||||
private disconnectWebSocket(): void {
|
||||
this.stopWebSocketHeartbeat();
|
||||
this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
|
||||
|
||||
if (this.websocket) {
|
||||
this.suppressSocketReconnect = true;
|
||||
this.websocket.close();
|
||||
@@ -1450,12 +1444,6 @@ export class ChatSessionService {
|
||||
case 'image-generation-error':
|
||||
this.handleGeneratedImageError(event);
|
||||
break;
|
||||
case 'speech-transcribed':
|
||||
this.handleSpeechTranscribed(event);
|
||||
break;
|
||||
case 'speech-transcription-error':
|
||||
this.handleSpeechTranscriptionError(event);
|
||||
break;
|
||||
case 'pong':
|
||||
break;
|
||||
case 'error':
|
||||
@@ -1515,28 +1503,6 @@ export class ChatSessionService {
|
||||
this.error.set(event.message);
|
||||
}
|
||||
|
||||
private handleSpeechTranscribed(event: Extract<ServerEvent, { type: 'speech-transcribed' }>): void {
|
||||
const pendingRequest = this.pendingSpeechTranscriptionRequests.get(event.requestId);
|
||||
|
||||
if (!pendingRequest) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.pendingSpeechTranscriptionRequests.delete(event.requestId);
|
||||
pendingRequest.resolve(event.text);
|
||||
}
|
||||
|
||||
private handleSpeechTranscriptionError(event: Extract<ServerEvent, { type: 'speech-transcription-error' }>): void {
|
||||
const pendingRequest = this.pendingSpeechTranscriptionRequests.get(event.requestId);
|
||||
|
||||
if (pendingRequest) {
|
||||
this.pendingSpeechTranscriptionRequests.delete(event.requestId);
|
||||
pendingRequest.reject(new Error(event.message));
|
||||
}
|
||||
|
||||
this.error.set(event.message);
|
||||
}
|
||||
|
||||
private async restoreSession(): Promise<void> {
|
||||
const token = this.token();
|
||||
|
||||
@@ -2530,18 +2496,6 @@ export class ChatSessionService {
|
||||
}
|
||||
}
|
||||
|
||||
private rejectPendingSpeechTranscriptions(message: string): void {
|
||||
if (this.pendingSpeechTranscriptionRequests.size === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const { reject } of this.pendingSpeechTranscriptionRequests.values()) {
|
||||
reject(new Error(message));
|
||||
}
|
||||
|
||||
this.pendingSpeechTranscriptionRequests.clear();
|
||||
}
|
||||
|
||||
private clearLocalAuth(statusMessage: string): void {
|
||||
this.clearWebSocketReconnect();
|
||||
this.signalingRecoveryPromise = null;
|
||||
@@ -2555,7 +2509,6 @@ export class ChatSessionService {
|
||||
this.releasePreloadedRingtone();
|
||||
this.pendingImageGenerationRequests.clear();
|
||||
this.pendingOutgoingFlushes.clear();
|
||||
this.rejectPendingSpeechTranscriptions('Session ended during dictation.');
|
||||
this.incomingCallModes.set([]);
|
||||
this.outgoingCallModes.set([]);
|
||||
this.activeCallModes.set([]);
|
||||
@@ -3865,4 +3818,25 @@ export class ChatSessionService {
|
||||
|
||||
return responseMessage ?? thrownMessage ?? fallback;
|
||||
}
|
||||
|
||||
/**
 * Loads the dictation language stored for the given user.
 *
 * @returns The stored language after normalization by `normalizeDictationLanguage`.
 */
private readStoredDictationLanguage(currentUserId: string): DictationLanguage {
  const storageKey = this.dictationLanguageStorageKey(currentUserId);

  return this.normalizeDictationLanguage(this.readStorage(storageKey));
}
|
||||
|
||||
/** Builds the per-user storage key: the dictation-language prefix, a dot, then the user id. */
private dictationLanguageStorageKey(currentUserId: string): string {
  const prefix = ChatSessionService.dictationLanguageStoragePrefix;

  return prefix + '.' + currentUserId;
}
|
||||
|
||||
/**
 * Coerces an arbitrary value to a supported dictation language.
 *
 * @returns `value` when it is exactly `'fr'`, `'es'` or `'en'`; otherwise `'en'`.
 */
private normalizeDictationLanguage(value: string | null | undefined): DictationLanguage {
  const isSupported = value === 'fr' || value === 'es' || value === 'en';

  if (!isSupported) {
    return 'en';
  }

  return value;
}
|
||||
}
|
||||
|
||||
@@ -188,6 +188,31 @@
|
||||
<div class="alert alert-success mb-4">{{ session.notice() }}</div>
|
||||
}
|
||||
|
||||
<section class="access-key-panel mb-4">
|
||||
<div class="dictation-language-panel">
|
||||
<div>
|
||||
<h3 class="h5 mb-1">Dictation language</h3>
|
||||
<p class="small text-secondary mb-0">
|
||||
Speech input and text output use the same selected language.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="dictation-language-select-shell mt-3">
|
||||
<label class="form-label small mb-2" for="dictationLanguage">Language</label>
|
||||
<select
|
||||
id="dictationLanguage"
|
||||
class="form-select"
|
||||
[ngModel]="session.dictationLanguage()"
|
||||
(ngModelChange)="setDictationLanguage($event)"
|
||||
>
|
||||
@for (option of dictationLanguageOptions; track option.value) {
|
||||
<option [ngValue]="option.value">{{ option.label }}</option>
|
||||
}
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="access-key-panel mb-4">
|
||||
<div class="d-flex justify-content-between align-items-start gap-3">
|
||||
<div>
|
||||
|
||||
@@ -114,6 +114,12 @@
|
||||
background: var(--panel-soft-background);
|
||||
}
|
||||
|
||||
.dictation-language-panel,
|
||||
.dictation-language-select-shell {
|
||||
display: grid;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.user-search-panel {
|
||||
display: grid;
|
||||
gap: 0.75rem;
|
||||
@@ -183,7 +189,9 @@
|
||||
}
|
||||
|
||||
.form-control,
|
||||
.form-control:focus {
|
||||
.form-control:focus,
|
||||
.form-select,
|
||||
.form-select:focus {
|
||||
color: var(--page-text);
|
||||
background-color: var(--input-background);
|
||||
border-color: var(--input-border);
|
||||
|
||||
@@ -4,7 +4,7 @@ import { FormsModule } from '@angular/forms';
|
||||
import { Router, RouterLink } from '@angular/router';
|
||||
|
||||
import { ChatSessionService } from './chat-session.service';
|
||||
import type { AdminUserSummary, UserProfile } from './models';
|
||||
import type { AdminUserSummary, DictationLanguage, UserProfile } from './models';
|
||||
import { ThemeService } from './theme.service';
|
||||
|
||||
@Component({
|
||||
@@ -32,6 +32,11 @@ export class HomePageComponent {
|
||||
readonly loadingAdminUsers = signal(false);
|
||||
readonly deletingUserId = signal<string | null>(null);
|
||||
readonly adminUsersError = signal<string | null>(null);
|
||||
readonly dictationLanguageOptions: Array<{ value: DictationLanguage; label: string }> = [
|
||||
{ value: 'en', label: 'English' },
|
||||
{ value: 'fr', label: 'French' },
|
||||
{ value: 'es', label: 'Spanish' },
|
||||
];
|
||||
readonly filteredKnownUsers = computed(() => {
|
||||
const query = this.userSearch.trim().toLowerCase();
|
||||
const users = this.knownUsers();
|
||||
@@ -202,4 +207,8 @@ export class HomePageComponent {
|
||||
setIncomingMessageSound(enabled: boolean): void {
|
||||
this.session.setIncomingMessageSoundEnabled(enabled);
|
||||
}
|
||||
|
||||
/**
 * Forwards a selected dictation language to the session service.
 *
 * @param language Selected value; it is cast, not validated, before being handed on.
 */
setDictationLanguage(language: string): void {
  const selectedLanguage = language as DictationLanguage;

  this.session.setDictationLanguage(selectedLanguage);
}
|
||||
}
|
||||
|
||||
@@ -113,6 +113,7 @@ export interface ChatEntry {
|
||||
}
|
||||
|
||||
export type CallMode = 'audio' | 'video';
|
||||
export type DictationLanguage = 'en' | 'fr' | 'es';
|
||||
|
||||
export type SignalPayload =
|
||||
| { type: 'sdp'; description: RTCSessionDescriptionInit }
|
||||
@@ -138,16 +139,6 @@ export type ServerEvent =
|
||||
peerId: string;
|
||||
message: string;
|
||||
}
|
||||
| {
|
||||
type: 'speech-transcribed';
|
||||
requestId: string;
|
||||
text: string;
|
||||
}
|
||||
| {
|
||||
type: 'speech-transcription-error';
|
||||
requestId: string;
|
||||
message: string;
|
||||
}
|
||||
| { type: 'pong' }
|
||||
| { type: 'error'; message: string };
|
||||
|
||||
|
||||
45
server/dist/index.js
vendored
45
server/dist/index.js
vendored
@@ -16,7 +16,6 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe
|
||||
import Fastify from 'fastify';
|
||||
import { Redis } from 'ioredis';
|
||||
import { z } from 'zod';
|
||||
import { SpeechTranscriber } from './speech-transcriber.js';
|
||||
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
|
||||
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
|
||||
const registerSchema = z.object({
|
||||
@@ -90,12 +89,6 @@ const signalMessageSchema = z.discriminatedUnion('type', [
|
||||
z.object({
|
||||
type: z.literal('ping'),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal('speech-transcription'),
|
||||
requestId: z.string().uuid(),
|
||||
mimeType: z.string().trim().min(1).max(128),
|
||||
audioBase64: z.string().min(1).max(32_000_000),
|
||||
}),
|
||||
]);
|
||||
const app = Fastify({ logger: true, trustProxy: true });
|
||||
const approvalAdminUsername = 'ladparis';
|
||||
@@ -106,9 +99,6 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR
|
||||
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
|
||||
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
|
||||
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
|
||||
const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'wss://whisper.dubertrand.fr';
|
||||
const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
|
||||
const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
|
||||
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
|
||||
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
|
||||
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
|
||||
@@ -121,11 +111,6 @@ const frontendIndexPath = path.join(frontendDistPath, 'index.html');
|
||||
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
|
||||
const convertOfficeDocument = promisify(libreOffice.convertWithOptions);
|
||||
const execFileAsync = promisify(execFile);
|
||||
const speechTranscriber = new SpeechTranscriber({
|
||||
serviceUrl: speechTranscriptionServiceUrl,
|
||||
language: speechTranscriptionLanguage,
|
||||
requestTimeoutMs: speechTranscriptionTimeoutMs,
|
||||
}, app.log);
|
||||
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
|
||||
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
|
||||
const encryptionKey = deriveEncryptionKey(loadOrCreateMasterKey(masterKeyPath));
|
||||
@@ -782,25 +767,6 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) {
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (parsed.type === 'speech-transcription') {
|
||||
try {
|
||||
const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
|
||||
send(socket, {
|
||||
type: 'speech-transcribed',
|
||||
requestId: parsed.requestId,
|
||||
text,
|
||||
});
|
||||
}
|
||||
catch (error) {
|
||||
app.log.warn({ err: error, userId }, 'Speech transcription failed');
|
||||
send(socket, {
|
||||
type: 'speech-transcription-error',
|
||||
requestId: parsed.requestId,
|
||||
message: error instanceof Error ? error.message : 'Speech transcription failed.',
|
||||
});
|
||||
}
|
||||
return;
|
||||
}
|
||||
let delivered = 0;
|
||||
const recipientSockets = socketsByUserId.get(parsed.to);
|
||||
if (recipientSockets) {
|
||||
@@ -1257,23 +1223,12 @@ function parseClientMessage(rawMessage) {
|
||||
prompt: parsed.data.prompt,
|
||||
};
|
||||
}
|
||||
if (parsed.data.type === 'speech-transcription') {
|
||||
return {
|
||||
type: 'speech-transcription',
|
||||
requestId: parsed.data.requestId,
|
||||
mimeType: parsed.data.mimeType,
|
||||
audioBase64: parsed.data.audioBase64,
|
||||
};
|
||||
}
|
||||
return {
|
||||
type: 'signal',
|
||||
to: parsed.data.to,
|
||||
signal: normalizeSignal(parsed.data.signal),
|
||||
};
|
||||
}
|
||||
async function transcribeAudioPayload(requestId, audioBase64, mimeType) {
|
||||
return await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
|
||||
}
|
||||
async function generateImageFromPrompt(prompt) {
|
||||
const abortController = new AbortController();
|
||||
const timeoutId = setTimeout(() => abortController.abort(), 120_000);
|
||||
|
||||
124
server/dist/speech-transcriber.js
vendored
124
server/dist/speech-transcriber.js
vendored
@@ -1,124 +0,0 @@
|
||||
import WebSocket from 'ws';
|
||||
/**
 * Client for a WebSocket based transcription service.
 *
 * Each call to `transcribe` opens its own connection, sends a single
 * `transcribe` request and settles once the service answers with a terminal
 * event, the connection fails or closes, or the configured timeout elapses.
 */
export class SpeechTranscriber {
    options;
    logger;

    /**
     * @param options `{ serviceUrl, language, requestTimeoutMs }`.
     * @param logger  Logger exposing `info` and `warn` (payload, message).
     */
    constructor(options, logger) {
        this.options = options;
        this.logger = logger;
    }

    /**
     * Sends the audio to the transcription service and resolves with the
     * trimmed transcript.
     *
     * @param requestId   Identifier echoed by the service in its events.
     * @param audioBase64 Base64 audio, or an already formed `data:` URL.
     * @param mimeType    MIME type used when a `data:` URL has to be built.
     * @returns Promise resolving to the trimmed transcript text.
     * @throws Error (as a rejection) on timeout, connection failure,
     *         unexpected close, or an error/malformed event from the service.
     */
    async transcribe(requestId, audioBase64, mimeType) {
        const audio = this.normalizeAudioPayload(audioBase64, mimeType);
        const timeoutMs = this.resolveTimeoutMs();

        return await new Promise((resolve, reject) => {
            let settled = false;
            const socket = new WebSocket(this.options.serviceUrl);

            // Settles the promise exactly once and releases the socket and timer.
            const finish = (handler) => {
                if (settled) {
                    return;
                }
                settled = true;
                clearTimeout(timeout);
                socket.removeAllListeners();
                // Closing a socket that is still connecting makes `ws` emit an
                // 'error' event afterwards. Without a listener that event would
                // surface as an uncaught exception, so keep a no-op handler.
                socket.on('error', () => undefined);
                if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
                    socket.close();
                }
                handler();
            };

            const timeout = setTimeout(() => {
                finish(() => {
                    reject(new Error(`The transcription service timed out after ${timeoutMs}ms.`));
                });
            }, timeoutMs);

            socket.on('open', () => {
                try {
                    socket.send(JSON.stringify({
                        type: 'transcribe',
                        id: requestId,
                        language: this.options.language,
                        audio,
                    }));
                }
                catch (error) {
                    finish(() => {
                        reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
                    });
                }
            });

            socket.on('message', (payload) => {
                const event = this.parseEvent(payload);
                if (!event) {
                    return;
                }
                if (event.id && event.id !== requestId) {
                    this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
                    return;
                }
                if (event.type === 'start') {
                    this.logger.info({ requestId, model: event.model, language: event.language }, 'Speech transcription started');
                    return;
                }
                if (event.type === 'delta') {
                    // Partial results are not forwarded; only the final text is used.
                    return;
                }
                if (event.type === 'done') {
                    finish(() => {
                        resolve(event.text.trim());
                    });
                    return;
                }
                // Only normalized 'error' events remain at this point.
                finish(() => {
                    reject(new Error(event.message));
                });
            });

            socket.on('error', (error) => {
                finish(() => {
                    reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
                });
            });

            socket.on('close', (code, reasonBuffer) => {
                if (settled) {
                    return;
                }
                const reason = reasonBuffer.toString().trim();
                const detail = reason
                    ? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
                    : `The transcription service closed the connection unexpectedly (code=${code}).`;
                finish(() => {
                    reject(new Error(detail));
                });
            });
        });
    }

    /**
     * Returns the configured timeout, falling back to 120 seconds when the
     * configured value is not a positive finite number (for example NaN from
     * a malformed environment variable, which would make the timer fire
     * almost immediately).
     */
    resolveTimeoutMs() {
        const configured = this.options.requestTimeoutMs;
        return Number.isFinite(configured) && configured > 0 ? configured : 120_000;
    }

    /**
     * Returns the audio as a `data:` URL. Input that already is a `data:` URL
     * is returned trimmed; otherwise one is built from the MIME type, using
     * `audio/webm` when the MIME type is blank.
     */
    normalizeAudioPayload(audioBase64, mimeType) {
        const trimmedAudio = audioBase64.trim();
        if (trimmedAudio.startsWith('data:')) {
            return trimmedAudio;
        }
        const normalizedMimeType = mimeType.trim() || 'audio/webm';
        return `data:${normalizedMimeType};base64,${trimmedAudio}`;
    }

    /**
     * Decodes one WebSocket frame into a normalized service event.
     *
     * @returns The normalized event, or `null` when the frame is empty, is
     *          not JSON, or does not contain a JSON object (those frames are
     *          ignored, with a warning for the latter two).
     */
    parseEvent(payload) {
        const message = this.rawDataToString(payload).trim();
        if (!message) {
            return null;
        }

        let parsed;
        try {
            parsed = JSON.parse(message);
        }
        catch {
            this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
            return null;
        }

        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            this.logger.warn({ transcriptionPayload: message }, 'Ignored malformed transcription event');
            return null;
        }
        return this.toServiceEvent(parsed);
    }

    /**
     * Validates the fields the caller relies on so that a malformed frame can
     * never throw inside the socket listener. A `done` event without text and
     * any unknown event type are converted into `error` events, which keeps
     * them terminal while giving the caller a meaningful message.
     */
    toServiceEvent(raw) {
        const asString = (value) => (typeof value === 'string' ? value : undefined);
        const id = asString(raw.id);

        switch (raw.type) {
            case 'start':
                return { type: 'start', id, model: asString(raw.model), language: asString(raw.language) };
            case 'delta':
                return { type: 'delta', id, text: asString(raw.text), fullText: asString(raw.fullText) };
            case 'done': {
                const text = asString(raw.text);
                return text === undefined
                    ? { type: 'error', id, message: 'The transcription service finished without returning a transcript.' }
                    : { type: 'done', id, text };
            }
            case 'error':
                return { type: 'error', id, message: asString(raw.message) || 'The transcription service reported an error.' };
            default:
                return { type: 'error', id, message: 'The transcription service sent an unexpected event.' };
        }
    }

    /** Converts any frame representation delivered by `ws` into a UTF-8 string. */
    rawDataToString(payload) {
        if (typeof payload === 'string') {
            return payload;
        }
        if (payload instanceof ArrayBuffer) {
            return Buffer.from(payload).toString('utf8');
        }
        if (Array.isArray(payload)) {
            return Buffer.concat(payload).toString('utf8');
        }
        return payload.toString('utf8');
    }
}
|
||||
@@ -26,8 +26,6 @@ import { Redis } from 'ioredis';
|
||||
import type WebSocket from 'ws';
|
||||
import { z } from 'zod';
|
||||
|
||||
import { SpeechTranscriber } from './speech-transcriber.js';
|
||||
|
||||
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
|
||||
|
||||
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
|
||||
@@ -125,12 +123,6 @@ type ClientMessage =
|
||||
}
|
||||
| {
|
||||
type: 'ping';
|
||||
}
|
||||
| {
|
||||
type: 'speech-transcription';
|
||||
requestId: string;
|
||||
mimeType: string;
|
||||
audioBase64: string;
|
||||
};
|
||||
|
||||
type ServerMessage =
|
||||
@@ -153,16 +145,6 @@ type ServerMessage =
|
||||
peerId: string;
|
||||
message: string;
|
||||
}
|
||||
| {
|
||||
type: 'speech-transcribed';
|
||||
requestId: string;
|
||||
text: string;
|
||||
}
|
||||
| {
|
||||
type: 'speech-transcription-error';
|
||||
requestId: string;
|
||||
message: string;
|
||||
}
|
||||
| { type: 'pong' }
|
||||
| { type: 'error'; message: string };
|
||||
|
||||
@@ -316,12 +298,6 @@ const signalMessageSchema = z.discriminatedUnion('type', [
|
||||
z.object({
|
||||
type: z.literal('ping'),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal('speech-transcription'),
|
||||
requestId: z.string().uuid(),
|
||||
mimeType: z.string().trim().min(1).max(128),
|
||||
audioBase64: z.string().min(1).max(32_000_000),
|
||||
}),
|
||||
]);
|
||||
|
||||
const app = Fastify({ logger: true, trustProxy: true });
|
||||
@@ -340,9 +316,6 @@ const frontendDistPath = resolveProjectPath(
|
||||
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
|
||||
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
|
||||
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
|
||||
const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'wss://whisper.dubertrand.fr';
|
||||
const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
|
||||
const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
|
||||
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
|
||||
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
|
||||
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
|
||||
@@ -358,15 +331,6 @@ const hasFrontendBuild = fs.existsSync(frontendIndexPath);
|
||||
const convertOfficeDocument = promisify(libreOffice.convertWithOptions);
|
||||
const execFileAsync = promisify(execFile);
|
||||
|
||||
const speechTranscriber = new SpeechTranscriber(
|
||||
{
|
||||
serviceUrl: speechTranscriptionServiceUrl,
|
||||
language: speechTranscriptionLanguage,
|
||||
requestTimeoutMs: speechTranscriptionTimeoutMs,
|
||||
},
|
||||
app.log,
|
||||
);
|
||||
|
||||
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
|
||||
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
|
||||
|
||||
@@ -1225,27 +1189,6 @@ async function handleSocketMessage(
|
||||
return;
|
||||
}
|
||||
|
||||
if (parsed.type === 'speech-transcription') {
|
||||
try {
|
||||
const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
|
||||
|
||||
send(socket, {
|
||||
type: 'speech-transcribed',
|
||||
requestId: parsed.requestId,
|
||||
text,
|
||||
});
|
||||
} catch (error) {
|
||||
app.log.warn({ err: error, userId }, 'Speech transcription failed');
|
||||
send(socket, {
|
||||
type: 'speech-transcription-error',
|
||||
requestId: parsed.requestId,
|
||||
message: error instanceof Error ? error.message : 'Speech transcription failed.',
|
||||
});
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
let delivered = 0;
|
||||
const recipientSockets = socketsByUserId.get(parsed.to);
|
||||
|
||||
@@ -1897,15 +1840,6 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
|
||||
};
|
||||
}
|
||||
|
||||
if (parsed.data.type === 'speech-transcription') {
|
||||
return {
|
||||
type: 'speech-transcription',
|
||||
requestId: parsed.data.requestId,
|
||||
mimeType: parsed.data.mimeType,
|
||||
audioBase64: parsed.data.audioBase64,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
type: 'signal',
|
||||
to: parsed.data.to,
|
||||
@@ -1913,10 +1847,6 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Forwards one audio payload to the shared `speechTranscriber` and resolves
 * with whatever transcript it produces.
 *
 * @param requestId   Identifier of the transcription request.
 * @param audioBase64 Audio content, base64 encoded.
 * @param mimeType    MIME type describing the audio content.
 * @returns The value resolved by `speechTranscriber.transcribe`.
 */
async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
  const transcript = await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
  return transcript;
}
|
||||
|
||||
async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {
|
||||
const abortController = new AbortController();
|
||||
const timeoutId = setTimeout(() => abortController.abort(), 120_000);
|
||||
|
||||
@@ -1,173 +0,0 @@
|
||||
import WebSocket, { type RawData } from 'ws';
|
||||
|
||||
/** Minimal logger contract used by the transcriber (payload first, message second). */
type LoggerLike = {
  info: (payload: unknown, message?: string) => void;
  warn: (payload: unknown, message?: string) => void;
  error: (payload: unknown, message?: string) => void;
};

/** Connection settings for the transcription service. */
type SpeechTranscriberOptions = {
  serviceUrl: string;
  language: string;
  requestTimeoutMs: number;
};

/**
 * Events received from the transcription service after normalization.
 * Fields the service may omit on the wire are optional; `done.text` and
 * `error.message` are guaranteed by `toServiceEvent`.
 */
type ServiceEvent =
  | { type: 'start'; id?: string; model?: string; language?: string }
  | { type: 'delta'; id?: string; text?: string; fullText?: string }
  | { type: 'done'; id?: string; text: string }
  | { type: 'error'; id?: string; message: string };

/**
 * Client for a WebSocket based transcription service.
 *
 * Each call to `transcribe` opens its own connection, sends a single
 * `transcribe` request and settles once the service answers with a terminal
 * event, the connection fails or closes, or the configured timeout elapses.
 */
export class SpeechTranscriber {
  constructor(
    private readonly options: SpeechTranscriberOptions,
    private readonly logger: LoggerLike,
  ) {}

  /**
   * Sends the audio to the transcription service and resolves with the
   * trimmed transcript.
   *
   * @param requestId   Identifier echoed by the service in its events.
   * @param audioBase64 Base64 audio, or an already formed `data:` URL.
   * @param mimeType    MIME type used when a `data:` URL has to be built.
   * @returns Promise resolving to the trimmed transcript text.
   * @throws Error (as a rejection) on timeout, connection failure,
   *         unexpected close, or an error/malformed event from the service.
   */
  async transcribe(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
    const audio = this.normalizeAudioPayload(audioBase64, mimeType);
    const timeoutMs = this.resolveTimeoutMs();

    return await new Promise<string>((resolve, reject) => {
      let settled = false;
      const socket = new WebSocket(this.options.serviceUrl);

      // Settles the promise exactly once and releases the socket and timer.
      const finish = (handler: () => void): void => {
        if (settled) {
          return;
        }

        settled = true;
        clearTimeout(timeout);
        socket.removeAllListeners();
        // Closing a socket that is still connecting makes `ws` emit an
        // 'error' event afterwards. Without a listener that event would
        // surface as an uncaught exception, so keep a no-op handler.
        socket.on('error', () => undefined);

        if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
          socket.close();
        }

        handler();
      };

      const timeout = setTimeout(() => {
        finish(() => {
          reject(new Error(`The transcription service timed out after ${timeoutMs}ms.`));
        });
      }, timeoutMs);

      socket.on('open', () => {
        try {
          socket.send(
            JSON.stringify({
              type: 'transcribe',
              id: requestId,
              language: this.options.language,
              audio,
            }),
          );
        } catch (error) {
          finish(() => {
            reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
          });
        }
      });

      socket.on('message', (payload) => {
        const event = this.parseEvent(payload);

        if (!event) {
          return;
        }

        if (event.id && event.id !== requestId) {
          this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
          return;
        }

        if (event.type === 'start') {
          this.logger.info(
            { requestId, model: event.model, language: event.language },
            'Speech transcription started',
          );
          return;
        }

        if (event.type === 'delta') {
          // Partial results are not forwarded; only the final text is used.
          return;
        }

        if (event.type === 'done') {
          finish(() => {
            resolve(event.text.trim());
          });
          return;
        }

        // Only normalized 'error' events remain at this point.
        finish(() => {
          reject(new Error(event.message));
        });
      });

      socket.on('error', (error) => {
        finish(() => {
          reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
        });
      });

      socket.on('close', (code, reasonBuffer) => {
        if (settled) {
          return;
        }

        const reason = reasonBuffer.toString().trim();
        const detail = reason
          ? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
          : `The transcription service closed the connection unexpectedly (code=${code}).`;

        finish(() => {
          reject(new Error(detail));
        });
      });
    });
  }

  /**
   * Returns the configured timeout, falling back to 120 seconds when the
   * configured value is not a positive finite number (for example NaN from
   * a malformed environment variable, which would make the timer fire
   * almost immediately).
   */
  private resolveTimeoutMs(): number {
    const configured = this.options.requestTimeoutMs;
    return Number.isFinite(configured) && configured > 0 ? configured : 120_000;
  }

  /**
   * Returns the audio as a `data:` URL. Input that already is a `data:` URL
   * is returned trimmed; otherwise one is built from the MIME type, using
   * `audio/webm` when the MIME type is blank.
   */
  private normalizeAudioPayload(audioBase64: string, mimeType: string): string {
    const trimmedAudio = audioBase64.trim();

    if (trimmedAudio.startsWith('data:')) {
      return trimmedAudio;
    }

    const normalizedMimeType = mimeType.trim() || 'audio/webm';
    return `data:${normalizedMimeType};base64,${trimmedAudio}`;
  }

  /**
   * Decodes one WebSocket frame into a normalized service event.
   *
   * @returns The normalized event, or `null` when the frame is empty, is not
   *          JSON, or does not contain a JSON object (those frames are
   *          ignored, with a warning for the latter two).
   */
  private parseEvent(payload: RawData): ServiceEvent | null {
    const message = this.rawDataToString(payload).trim();

    if (!message) {
      return null;
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(message);
    } catch {
      this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
      return null;
    }

    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      this.logger.warn({ transcriptionPayload: message }, 'Ignored malformed transcription event');
      return null;
    }

    return this.toServiceEvent(parsed as Record<string, unknown>);
  }

  /**
   * Validates the fields the caller relies on so that a malformed frame can
   * never throw inside the socket listener. A `done` event without text and
   * any unknown event type are converted into `error` events, which keeps
   * them terminal while giving the caller a meaningful message.
   */
  private toServiceEvent(raw: Record<string, unknown>): ServiceEvent {
    const asString = (value: unknown): string | undefined => (typeof value === 'string' ? value : undefined);
    const id = asString(raw.id);

    switch (raw.type) {
      case 'start':
        return { type: 'start', id, model: asString(raw.model), language: asString(raw.language) };
      case 'delta':
        return { type: 'delta', id, text: asString(raw.text), fullText: asString(raw.fullText) };
      case 'done': {
        const text = asString(raw.text);
        return text === undefined
          ? { type: 'error', id, message: 'The transcription service finished without returning a transcript.' }
          : { type: 'done', id, text };
      }
      case 'error':
        return { type: 'error', id, message: asString(raw.message) || 'The transcription service reported an error.' };
      default:
        return { type: 'error', id, message: 'The transcription service sent an unexpected event.' };
    }
  }

  /** Converts any frame representation delivered by `ws` into a UTF-8 string. */
  private rawDataToString(payload: RawData): string {
    if (typeof payload === 'string') {
      return payload;
    }

    if (payload instanceof ArrayBuffer) {
      return Buffer.from(payload).toString('utf8');
    }

    if (Array.isArray(payload)) {
      return Buffer.concat(payload).toString('utf8');
    }

    return payload.toString('utf8');
  }
}
|
||||
Reference in New Issue
Block a user