Transcription on mac

This commit is contained in:
2026-03-11 03:08:27 +01:00
parent 0da98bfd96
commit f0e2b60f43
10 changed files with 320 additions and 487 deletions

View File

@@ -1,7 +1,5 @@
import crypto from 'node:crypto';
import fs from 'node:fs';
import fsPromises from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { TextEncoder } from 'node:util';
@@ -25,7 +23,7 @@ import { Redis } from 'ioredis';
import type WebSocket from 'ws';
import { z } from 'zod';
import { WhisperTranscriber } from './whisper-transcriber.js';
import { SpeechTranscriber } from './speech-transcriber.js';
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
@@ -333,11 +331,9 @@ const frontendDistPath = resolveProjectPath(
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'ws://192.168.1.19:8080';
const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -351,13 +347,11 @@ const webAuthnUserVerification = resolveWebAuthnUserVerification(
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
const whisperTranscriber = new WhisperTranscriber(
const speechTranscriber = new SpeechTranscriber(
{
pythonExecutable: whisperPythonExecutable,
scriptPath: whisperScriptPath,
model: whisperModel,
device: whisperDevice,
computeType: whisperComputeType,
serviceUrl: speechTranscriptionServiceUrl,
language: speechTranscriptionLanguage,
requestTimeoutMs: speechTranscriptionTimeoutMs,
},
app.log,
);
@@ -1179,7 +1173,7 @@ async function handleSocketMessage(
text,
});
} catch (error) {
app.log.warn({ err: error, userId }, 'Whisper transcription failed');
app.log.warn({ err: error, userId }, 'Speech transcription failed');
send(socket, {
type: 'speech-transcription-error',
requestId: parsed.requestId,
@@ -1748,39 +1742,7 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
}
async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
const extension = audioExtensionForMimeType(mimeType);
const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
try {
await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
return await whisperTranscriber.transcribe(requestId, audioPath);
} finally {
await fsPromises.rm(tempDirectory, { recursive: true, force: true });
}
}
function audioExtensionForMimeType(mimeType: string): string {
switch (mimeType.toLowerCase()) {
case 'audio/webm':
case 'audio/webm;codecs=opus':
return 'webm';
case 'audio/ogg':
case 'audio/ogg;codecs=opus':
return 'ogg';
case 'audio/mp4':
case 'audio/m4a':
return 'm4a';
case 'audio/mpeg':
case 'audio/mp3':
return 'mp3';
case 'audio/wav':
case 'audio/wave':
case 'audio/x-wav':
return 'wav';
default:
return 'webm';
}
return await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
}
async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {