local multi language STT
This commit is contained in:
45
server/dist/index.js
vendored
45
server/dist/index.js
vendored
@@ -16,7 +16,6 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe
|
||||
import Fastify from 'fastify';
|
||||
import { Redis } from 'ioredis';
|
||||
import { z } from 'zod';
|
||||
import { SpeechTranscriber } from './speech-transcriber.js';
|
||||
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
|
||||
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
|
||||
const registerSchema = z.object({
|
||||
@@ -90,12 +89,6 @@ const signalMessageSchema = z.discriminatedUnion('type', [
|
||||
z.object({
|
||||
type: z.literal('ping'),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal('speech-transcription'),
|
||||
requestId: z.string().uuid(),
|
||||
mimeType: z.string().trim().min(1).max(128),
|
||||
audioBase64: z.string().min(1).max(32_000_000),
|
||||
}),
|
||||
]);
|
||||
const app = Fastify({ logger: true, trustProxy: true });
|
||||
const approvalAdminUsername = 'ladparis';
|
||||
@@ -106,9 +99,6 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR
|
||||
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
|
||||
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
|
||||
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
|
||||
const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'wss://whisper.dubertrand.fr';
|
||||
const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
|
||||
const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
|
||||
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
|
||||
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
|
||||
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
|
||||
@@ -121,11 +111,6 @@ const frontendIndexPath = path.join(frontendDistPath, 'index.html');
|
||||
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
|
||||
const convertOfficeDocument = promisify(libreOffice.convertWithOptions);
|
||||
const execFileAsync = promisify(execFile);
|
||||
const speechTranscriber = new SpeechTranscriber({
|
||||
serviceUrl: speechTranscriptionServiceUrl,
|
||||
language: speechTranscriptionLanguage,
|
||||
requestTimeoutMs: speechTranscriptionTimeoutMs,
|
||||
}, app.log);
|
||||
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
|
||||
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
|
||||
const encryptionKey = deriveEncryptionKey(loadOrCreateMasterKey(masterKeyPath));
|
||||
@@ -782,25 +767,6 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) {
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (parsed.type === 'speech-transcription') {
|
||||
try {
|
||||
const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
|
||||
send(socket, {
|
||||
type: 'speech-transcribed',
|
||||
requestId: parsed.requestId,
|
||||
text,
|
||||
});
|
||||
}
|
||||
catch (error) {
|
||||
app.log.warn({ err: error, userId }, 'Speech transcription failed');
|
||||
send(socket, {
|
||||
type: 'speech-transcription-error',
|
||||
requestId: parsed.requestId,
|
||||
message: error instanceof Error ? error.message : 'Speech transcription failed.',
|
||||
});
|
||||
}
|
||||
return;
|
||||
}
|
||||
let delivered = 0;
|
||||
const recipientSockets = socketsByUserId.get(parsed.to);
|
||||
if (recipientSockets) {
|
||||
@@ -1257,23 +1223,12 @@ function parseClientMessage(rawMessage) {
|
||||
prompt: parsed.data.prompt,
|
||||
};
|
||||
}
|
||||
if (parsed.data.type === 'speech-transcription') {
|
||||
return {
|
||||
type: 'speech-transcription',
|
||||
requestId: parsed.data.requestId,
|
||||
mimeType: parsed.data.mimeType,
|
||||
audioBase64: parsed.data.audioBase64,
|
||||
};
|
||||
}
|
||||
return {
|
||||
type: 'signal',
|
||||
to: parsed.data.to,
|
||||
signal: normalizeSignal(parsed.data.signal),
|
||||
};
|
||||
}
|
||||
/**
 * Forwards one audio clip to the shared `speechTranscriber` instance.
 *
 * @param requestId Identifier of the transcription request.
 * @param audioBase64 Audio payload as received from the client.
 * @param mimeType MIME type announced for the audio payload.
 * @returns The text produced by `speechTranscriber.transcribe`.
 */
async function transcribeAudioPayload(requestId, audioBase64, mimeType) {
    const transcript = await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
    return transcript;
}
|
||||
async function generateImageFromPrompt(prompt) {
|
||||
const abortController = new AbortController();
|
||||
const timeoutId = setTimeout(() => abortController.abort(), 120_000);
|
||||
|
||||
124
server/dist/speech-transcriber.js
vendored
124
server/dist/speech-transcriber.js
vendored
@@ -1,124 +0,0 @@
|
||||
import WebSocket from 'ws';
|
||||
export class SpeechTranscriber {
|
||||
options;
|
||||
logger;
|
||||
constructor(options, logger) {
|
||||
this.options = options;
|
||||
this.logger = logger;
|
||||
}
|
||||
async transcribe(requestId, audioBase64, mimeType) {
|
||||
const audio = this.normalizeAudioPayload(audioBase64, mimeType);
|
||||
return await new Promise((resolve, reject) => {
|
||||
let settled = false;
|
||||
const socket = new WebSocket(this.options.serviceUrl);
|
||||
const finish = (handler) => {
|
||||
if (settled) {
|
||||
return;
|
||||
}
|
||||
settled = true;
|
||||
clearTimeout(timeout);
|
||||
socket.removeAllListeners();
|
||||
if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
|
||||
socket.close();
|
||||
}
|
||||
handler();
|
||||
};
|
||||
const timeout = setTimeout(() => {
|
||||
finish(() => {
|
||||
reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`));
|
||||
});
|
||||
}, this.options.requestTimeoutMs);
|
||||
socket.on('open', () => {
|
||||
try {
|
||||
socket.send(JSON.stringify({
|
||||
type: 'transcribe',
|
||||
id: requestId,
|
||||
language: this.options.language,
|
||||
audio,
|
||||
}));
|
||||
}
|
||||
catch (error) {
|
||||
finish(() => {
|
||||
reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
|
||||
});
|
||||
}
|
||||
});
|
||||
socket.on('message', (payload) => {
|
||||
const event = this.parseEvent(payload);
|
||||
if (!event) {
|
||||
return;
|
||||
}
|
||||
if (event.id && event.id !== requestId) {
|
||||
this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
|
||||
return;
|
||||
}
|
||||
if (event.type === 'start') {
|
||||
this.logger.info({ requestId, model: event.model, language: event.language }, 'Speech transcription started');
|
||||
return;
|
||||
}
|
||||
if (event.type === 'delta') {
|
||||
return;
|
||||
}
|
||||
if (event.type === 'done') {
|
||||
finish(() => {
|
||||
resolve(event.text.trim());
|
||||
});
|
||||
return;
|
||||
}
|
||||
finish(() => {
|
||||
reject(new Error(event.message));
|
||||
});
|
||||
});
|
||||
socket.on('error', (error) => {
|
||||
finish(() => {
|
||||
reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
|
||||
});
|
||||
});
|
||||
socket.on('close', (code, reasonBuffer) => {
|
||||
if (settled) {
|
||||
return;
|
||||
}
|
||||
const reason = reasonBuffer.toString().trim();
|
||||
const detail = reason
|
||||
? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
|
||||
: `The transcription service closed the connection unexpectedly (code=${code}).`;
|
||||
finish(() => {
|
||||
reject(new Error(detail));
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
normalizeAudioPayload(audioBase64, mimeType) {
|
||||
const trimmedAudio = audioBase64.trim();
|
||||
if (trimmedAudio.startsWith('data:')) {
|
||||
return trimmedAudio;
|
||||
}
|
||||
const normalizedMimeType = mimeType.trim() || 'audio/webm';
|
||||
return `data:${normalizedMimeType};base64,${trimmedAudio}`;
|
||||
}
|
||||
parseEvent(payload) {
|
||||
const message = this.rawDataToString(payload).trim();
|
||||
if (!message) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return JSON.parse(message);
|
||||
}
|
||||
catch {
|
||||
this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
|
||||
return null;
|
||||
}
|
||||
}
|
||||
rawDataToString(payload) {
|
||||
if (typeof payload === 'string') {
|
||||
return payload;
|
||||
}
|
||||
if (payload instanceof ArrayBuffer) {
|
||||
return Buffer.from(payload).toString('utf8');
|
||||
}
|
||||
if (Array.isArray(payload)) {
|
||||
return Buffer.concat(payload).toString('utf8');
|
||||
}
|
||||
return payload.toString('utf8');
|
||||
}
|
||||
}
|
||||
@@ -26,8 +26,6 @@ import { Redis } from 'ioredis';
|
||||
import type WebSocket from 'ws';
|
||||
import { z } from 'zod';
|
||||
|
||||
import { SpeechTranscriber } from './speech-transcriber.js';
|
||||
|
||||
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
|
||||
|
||||
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
|
||||
@@ -125,12 +123,6 @@ type ClientMessage =
|
||||
}
|
||||
| {
|
||||
type: 'ping';
|
||||
}
|
||||
| {
|
||||
type: 'speech-transcription';
|
||||
requestId: string;
|
||||
mimeType: string;
|
||||
audioBase64: string;
|
||||
};
|
||||
|
||||
type ServerMessage =
|
||||
@@ -153,16 +145,6 @@ type ServerMessage =
|
||||
peerId: string;
|
||||
message: string;
|
||||
}
|
||||
| {
|
||||
type: 'speech-transcribed';
|
||||
requestId: string;
|
||||
text: string;
|
||||
}
|
||||
| {
|
||||
type: 'speech-transcription-error';
|
||||
requestId: string;
|
||||
message: string;
|
||||
}
|
||||
| { type: 'pong' }
|
||||
| { type: 'error'; message: string };
|
||||
|
||||
@@ -316,12 +298,6 @@ const signalMessageSchema = z.discriminatedUnion('type', [
|
||||
z.object({
|
||||
type: z.literal('ping'),
|
||||
}),
|
||||
z.object({
|
||||
type: z.literal('speech-transcription'),
|
||||
requestId: z.string().uuid(),
|
||||
mimeType: z.string().trim().min(1).max(128),
|
||||
audioBase64: z.string().min(1).max(32_000_000),
|
||||
}),
|
||||
]);
|
||||
|
||||
const app = Fastify({ logger: true, trustProxy: true });
|
||||
@@ -340,9 +316,6 @@ const frontendDistPath = resolveProjectPath(
|
||||
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
|
||||
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
|
||||
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
|
||||
const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'wss://whisper.dubertrand.fr';
|
||||
const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
|
||||
const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
|
||||
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
|
||||
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
|
||||
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
|
||||
@@ -358,15 +331,6 @@ const hasFrontendBuild = fs.existsSync(frontendIndexPath);
|
||||
const convertOfficeDocument = promisify(libreOffice.convertWithOptions);
|
||||
const execFileAsync = promisify(execFile);
|
||||
|
||||
const speechTranscriber = new SpeechTranscriber(
|
||||
{
|
||||
serviceUrl: speechTranscriptionServiceUrl,
|
||||
language: speechTranscriptionLanguage,
|
||||
requestTimeoutMs: speechTranscriptionTimeoutMs,
|
||||
},
|
||||
app.log,
|
||||
);
|
||||
|
||||
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
|
||||
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
|
||||
|
||||
@@ -1225,27 +1189,6 @@ async function handleSocketMessage(
|
||||
return;
|
||||
}
|
||||
|
||||
if (parsed.type === 'speech-transcription') {
|
||||
try {
|
||||
const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
|
||||
|
||||
send(socket, {
|
||||
type: 'speech-transcribed',
|
||||
requestId: parsed.requestId,
|
||||
text,
|
||||
});
|
||||
} catch (error) {
|
||||
app.log.warn({ err: error, userId }, 'Speech transcription failed');
|
||||
send(socket, {
|
||||
type: 'speech-transcription-error',
|
||||
requestId: parsed.requestId,
|
||||
message: error instanceof Error ? error.message : 'Speech transcription failed.',
|
||||
});
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
let delivered = 0;
|
||||
const recipientSockets = socketsByUserId.get(parsed.to);
|
||||
|
||||
@@ -1897,15 +1840,6 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
|
||||
};
|
||||
}
|
||||
|
||||
if (parsed.data.type === 'speech-transcription') {
|
||||
return {
|
||||
type: 'speech-transcription',
|
||||
requestId: parsed.data.requestId,
|
||||
mimeType: parsed.data.mimeType,
|
||||
audioBase64: parsed.data.audioBase64,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
type: 'signal',
|
||||
to: parsed.data.to,
|
||||
@@ -1913,10 +1847,6 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Forwards one audio clip to the shared `speechTranscriber` instance.
 *
 * @param requestId Identifier of the transcription request.
 * @param audioBase64 Audio payload as received from the client.
 * @param mimeType MIME type announced for the audio payload.
 * @returns The text produced by `speechTranscriber.transcribe`.
 */
async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
  const transcript = await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
  return transcript;
}
|
||||
|
||||
async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {
|
||||
const abortController = new AbortController();
|
||||
const timeoutId = setTimeout(() => abortController.abort(), 120_000);
|
||||
|
||||
@@ -1,173 +0,0 @@
|
||||
import WebSocket, { type RawData } from 'ws';
|
||||
|
||||
/** Signature shared by every logging method the transcriber relies on. */
type LogMethod = (payload: unknown, message?: string) => void;

/** Minimal logger surface expected by the transcriber. */
type LoggerLike = {
  info: LogMethod;
  warn: LogMethod;
  error: LogMethod;
};

/** Settings handed to the transcriber's constructor. */
type SpeechTranscriberOptions = {
  /** Address of the transcription service. */
  serviceUrl: string;
  /** Language value associated with transcription requests. */
  language: string;
  /** Request timeout, in milliseconds. */
  requestTimeoutMs: number;
};

/** Announces that the service started working on request `id`. */
type ServiceStartEvent = { type: 'start'; id: string; model: string; language: string };

/** Intermediate result for request `id` (presumably incremental text; confirm against the service). */
type ServiceDeltaEvent = { type: 'delta'; id: string; text: string; fullText: string };

/** Final result for request `id`. */
type ServiceDoneEvent = { type: 'done'; id: string; text: string };

/** Failure report; `id` may be absent. */
type ServiceErrorEvent = { type: 'error'; id?: string; message: string };

/** Every event shape the transcription service is expected to send, discriminated by `type`. */
type ServiceEvent = ServiceStartEvent | ServiceDeltaEvent | ServiceDoneEvent | ServiceErrorEvent;
|
||||
|
||||
/**
 * Client for a remote speech-to-text service reached over WebSocket.
 *
 * Each call to {@link SpeechTranscriber.transcribe} opens its own connection to
 * `options.serviceUrl`, sends a single `transcribe` request and waits for the
 * matching `done` (or error) event.
 */
export class SpeechTranscriber {
  constructor(
    private readonly options: SpeechTranscriberOptions,
    private readonly logger: LoggerLike,
  ) {}

  /**
   * Transcribes one audio clip.
   *
   * @param requestId Identifier sent with the request; events carrying a different id are ignored.
   * @param audioBase64 Base64 audio, or a complete `data:` URL.
   * @param mimeType MIME type used to build the data URL when `audioBase64` is bare base64.
   * @returns The trimmed text of the `done` event.
   * @throws Error when the service reports an error, sends a `done` event without text,
   *   closes the connection early, the connection fails, or `requestTimeoutMs` elapses.
   */
  async transcribe(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
    const audio = this.normalizeAudioPayload(audioBase64, mimeType);

    return await new Promise<string>((resolve, reject) => {
      let settled = false;
      const socket = new WebSocket(this.options.serviceUrl);

      // Settles the promise at most once and releases the timer and the socket.
      const finish = (handler: () => void): void => {
        if (settled) {
          return;
        }

        settled = true;
        clearTimeout(timeout);
        socket.removeAllListeners();
        // An EventEmitter throws when 'error' is emitted without a listener, and the
        // socket can still report an error after this point (for instance while a
        // connecting socket is being closed). Keep a no-op listener so that such a
        // late error cannot surface as an uncaught exception.
        socket.on('error', () => undefined);

        if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
          socket.close();
        }

        handler();
      };

      const timeout = setTimeout(() => {
        finish(() => {
          reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`));
        });
      }, this.options.requestTimeoutMs);

      socket.on('open', () => {
        try {
          socket.send(
            JSON.stringify({
              type: 'transcribe',
              id: requestId,
              language: this.options.language,
              audio,
            }),
          );
        } catch (error) {
          finish(() => {
            reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
          });
        }
      });

      socket.on('message', (payload) => {
        const event = this.parseEvent(payload);

        if (!event) {
          return;
        }

        if (event.id && event.id !== requestId) {
          this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
          return;
        }

        if (event.type === 'start') {
          this.logger.info(
            { requestId, model: event.model, language: event.language },
            'Speech transcription started',
          );
          return;
        }

        if (event.type === 'delta') {
          // Partial results are not forwarded; only the final text is returned.
          return;
        }

        if (event.type === 'done') {
          // The declared type says `string`, but the value comes from the network.
          const text: unknown = event.text;

          finish(() => {
            if (typeof text === 'string') {
              resolve(text.trim());
            } else {
              reject(new Error('The transcription service returned a result without any text.'));
            }
          });
          return;
        }

        // Anything else (including the 'error' event) ends the request as a failure.
        const message: unknown = event.message;

        finish(() => {
          reject(
            new Error(
              typeof message === 'string' && message
                ? message
                : 'The transcription service reported an unspecified error.',
            ),
          );
        });
      });

      socket.on('error', (error) => {
        finish(() => {
          reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
        });
      });

      socket.on('close', (code, reasonBuffer) => {
        if (settled) {
          return;
        }

        const reason = reasonBuffer.toString().trim();
        const detail = reason
          ? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
          : `The transcription service closed the connection unexpectedly (code=${code}).`;

        finish(() => {
          reject(new Error(detail));
        });
      });
    });
  }

  /**
   * Returns the audio as a `data:` URL: an input that already starts with `data:` is
   * returned trimmed, otherwise it is wrapped using `mimeType` (or `audio/webm` when
   * the MIME type is blank).
   */
  private normalizeAudioPayload(audioBase64: string, mimeType: string): string {
    const trimmedAudio = audioBase64.trim();

    if (trimmedAudio.startsWith('data:')) {
      return trimmedAudio;
    }

    const normalizedMimeType = mimeType.trim() || 'audio/webm';
    return `data:${normalizedMimeType};base64,${trimmedAudio}`;
  }

  /**
   * Decodes one incoming frame into an event object.
   *
   * @returns The parsed object, or `null` when the frame is empty, is not valid JSON,
   *   or is JSON that is not a plain object.
   */
  private parseEvent(payload: RawData): ServiceEvent | null {
    const message = this.rawDataToString(payload).trim();

    if (!message) {
      return null;
    }

    let parsed: unknown;

    try {
      parsed = JSON.parse(message);
    } catch {
      this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
      return null;
    }

    // Scalars and arrays are valid JSON but cannot be service events.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      this.logger.warn({ transcriptionPayload: message }, 'Ignored malformed transcription event');
      return null;
    }

    // Field-level checks happen where each field is consumed.
    return parsed as ServiceEvent;
  }

  /** Converts a received frame (string, ArrayBuffer, Buffer list or Buffer) to UTF-8 text. */
  private rawDataToString(payload: RawData): string {
    if (typeof payload === 'string') {
      return payload;
    }

    if (payload instanceof ArrayBuffer) {
      return Buffer.from(payload).toString('utf8');
    }

    if (Array.isArray(payload)) {
      return Buffer.concat(payload).toString('utf8');
    }

    return payload.toString('utf8');
  }
}
|
||||
Reference in New Issue
Block a user