Transcription on mac

2026-03-11 03:08:27 +01:00
parent 0da98bfd96
commit f0e2b60f43
10 changed files with 320 additions and 487 deletions
--- a/server/dist/index.js
+++ b/server/dist/index.js
@@ -1,7 +1,5 @@
 import crypto from 'node:crypto';
 import fs from 'node:fs';
-import fsPromises from 'node:fs/promises';
-import os from 'node:os';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { TextEncoder } from 'node:util';
@@ -15,7 +13,7 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe
 import Fastify from 'fastify';
 import { Redis } from 'ioredis';
 import { z } from 'zod';
-import { WhisperTranscriber } from './whisper-transcriber.js';
+import { SpeechTranscriber } from './speech-transcriber.js';
 dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
 const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
 const registerSchema = z.object({
@@ -100,11 +98,9 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR
 const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
 const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
 const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
-const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
-const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
-const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
-const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
-const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
+const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'ws://192.168.1.19:8080';
+const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
+const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
 const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
 const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
 const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -115,12 +111,10 @@ const webAuthnRpName = process.env.WEBAUTHN_RP_NAME ?? 'PrivateChat';
 const webAuthnUserVerification = resolveWebAuthnUserVerification(process.env.WEBAUTHN_USER_VERIFICATION);
 const frontendIndexPath = path.join(frontendDistPath, 'index.html');
 const hasFrontendBuild = fs.existsSync(frontendIndexPath);
-const whisperTranscriber = new WhisperTranscriber({
-    pythonExecutable: whisperPythonExecutable,
-    scriptPath: whisperScriptPath,
-    model: whisperModel,
-    device: whisperDevice,
-    computeType: whisperComputeType,
+const speechTranscriber = new SpeechTranscriber({
+    serviceUrl: speechTranscriptionServiceUrl,
+    language: speechTranscriptionLanguage,
+    requestTimeoutMs: speechTranscriptionTimeoutMs,
 }, app.log);
 fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
 fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
@@ -750,7 +744,7 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) {
            });
        }
        catch (error) {
-            app.log.warn({ err: error, userId }, 'Whisper transcription failed');
+            app.log.warn({ err: error, userId }, 'Speech transcription failed');
            send(socket, {
                type: 'speech-transcription-error',
                requestId: parsed.requestId,
@@ -1150,38 +1144,7 @@ function parseClientMessage(rawMessage) {
    };
 }
 async function transcribeAudioPayload(requestId, audioBase64, mimeType) {
-    const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
-    const extension = audioExtensionForMimeType(mimeType);
-    const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
-    try {
-        await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
-        return await whisperTranscriber.transcribe(requestId, audioPath);
-    }
-    finally {
-        await fsPromises.rm(tempDirectory, { recursive: true, force: true });
-    }
-}
-function audioExtensionForMimeType(mimeType) {
-    switch (mimeType.toLowerCase()) {
-        case 'audio/webm':
-        case 'audio/webm;codecs=opus':
-            return 'webm';
-        case 'audio/ogg':
-        case 'audio/ogg;codecs=opus':
-            return 'ogg';
-        case 'audio/mp4':
-        case 'audio/m4a':
-            return 'm4a';
-        case 'audio/mpeg':
-        case 'audio/mp3':
-            return 'mp3';
-        case 'audio/wav':
-        case 'audio/wave':
-        case 'audio/x-wav':
-            return 'wav';
-        default:
-            return 'webm';
-    }
+    return await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
 }
 async function generateImageFromPrompt(prompt) {
    const abortController = new AbortController();