Transcription on mac
This commit is contained in:
@@ -1,7 +1,5 @@
|
||||
import crypto from 'node:crypto';
|
||||
import fs from 'node:fs';
|
||||
import fsPromises from 'node:fs/promises';
|
||||
import os from 'node:os';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { TextEncoder } from 'node:util';
|
||||
@@ -25,7 +23,7 @@ import { Redis } from 'ioredis';
|
||||
import type WebSocket from 'ws';
|
||||
import { z } from 'zod';
|
||||
|
||||
import { WhisperTranscriber } from './whisper-transcriber.js';
|
||||
import { SpeechTranscriber } from './speech-transcriber.js';
|
||||
|
||||
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
|
||||
|
||||
@@ -333,11 +331,9 @@ const frontendDistPath = resolveProjectPath(
|
||||
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
|
||||
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
|
||||
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
|
||||
const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
|
||||
const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
|
||||
const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
|
||||
const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
|
||||
const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
|
||||
const speechTranscriptionServiceUrl = process.env.PRIVATECHAT_TRANSCRIPTION_WS_URL ?? 'ws://192.168.1.19:8080';
|
||||
const speechTranscriptionLanguage = process.env.PRIVATECHAT_TRANSCRIPTION_LANGUAGE ?? 'auto';
|
||||
const speechTranscriptionTimeoutMs = Number(process.env.PRIVATECHAT_TRANSCRIPTION_TIMEOUT_MS ?? 120_000);
|
||||
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
|
||||
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
|
||||
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
|
||||
@@ -351,13 +347,11 @@ const webAuthnUserVerification = resolveWebAuthnUserVerification(
|
||||
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
|
||||
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
|
||||
|
||||
const whisperTranscriber = new WhisperTranscriber(
|
||||
const speechTranscriber = new SpeechTranscriber(
|
||||
{
|
||||
pythonExecutable: whisperPythonExecutable,
|
||||
scriptPath: whisperScriptPath,
|
||||
model: whisperModel,
|
||||
device: whisperDevice,
|
||||
computeType: whisperComputeType,
|
||||
serviceUrl: speechTranscriptionServiceUrl,
|
||||
language: speechTranscriptionLanguage,
|
||||
requestTimeoutMs: speechTranscriptionTimeoutMs,
|
||||
},
|
||||
app.log,
|
||||
);
|
||||
@@ -1179,7 +1173,7 @@ async function handleSocketMessage(
|
||||
text,
|
||||
});
|
||||
} catch (error) {
|
||||
app.log.warn({ err: error, userId }, 'Whisper transcription failed');
|
||||
app.log.warn({ err: error, userId }, 'Speech transcription failed');
|
||||
send(socket, {
|
||||
type: 'speech-transcription-error',
|
||||
requestId: parsed.requestId,
|
||||
@@ -1748,39 +1742,7 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
|
||||
}
|
||||
|
||||
async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
|
||||
const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
|
||||
const extension = audioExtensionForMimeType(mimeType);
|
||||
const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
|
||||
|
||||
try {
|
||||
await fsPromises.writeFile(audioPath, Buffer.from(audioBase64, 'base64'));
|
||||
return await whisperTranscriber.transcribe(requestId, audioPath);
|
||||
} finally {
|
||||
await fsPromises.rm(tempDirectory, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
function audioExtensionForMimeType(mimeType: string): string {
|
||||
switch (mimeType.toLowerCase()) {
|
||||
case 'audio/webm':
|
||||
case 'audio/webm;codecs=opus':
|
||||
return 'webm';
|
||||
case 'audio/ogg':
|
||||
case 'audio/ogg;codecs=opus':
|
||||
return 'ogg';
|
||||
case 'audio/mp4':
|
||||
case 'audio/m4a':
|
||||
return 'm4a';
|
||||
case 'audio/mpeg':
|
||||
case 'audio/mp3':
|
||||
return 'mp3';
|
||||
case 'audio/wav':
|
||||
case 'audio/wave':
|
||||
case 'audio/x-wav':
|
||||
return 'wav';
|
||||
default:
|
||||
return 'webm';
|
||||
}
|
||||
return await speechTranscriber.transcribe(requestId, audioBase64, mimeType);
|
||||
}
|
||||
|
||||
async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {
|
||||
|
||||
173
server/src/speech-transcriber.ts
Normal file
173
server/src/speech-transcriber.ts
Normal file
@@ -0,0 +1,173 @@
|
||||
import WebSocket, { type RawData } from 'ws';
|
||||
|
||||
type LoggerLike = {
|
||||
info: (payload: unknown, message?: string) => void;
|
||||
warn: (payload: unknown, message?: string) => void;
|
||||
error: (payload: unknown, message?: string) => void;
|
||||
};
|
||||
|
||||
type SpeechTranscriberOptions = {
|
||||
serviceUrl: string;
|
||||
language: string;
|
||||
requestTimeoutMs: number;
|
||||
};
|
||||
|
||||
type ServiceEvent =
|
||||
| { type: 'start'; id: string; model: string; language: string }
|
||||
| { type: 'delta'; id: string; text: string; fullText: string }
|
||||
| { type: 'done'; id: string; text: string }
|
||||
| { type: 'error'; id?: string; message: string };
|
||||
|
||||
export class SpeechTranscriber {
|
||||
constructor(
|
||||
private readonly options: SpeechTranscriberOptions,
|
||||
private readonly logger: LoggerLike,
|
||||
) {}
|
||||
|
||||
async transcribe(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
|
||||
const audio = this.normalizeAudioPayload(audioBase64, mimeType);
|
||||
|
||||
return await new Promise<string>((resolve, reject) => {
|
||||
let settled = false;
|
||||
const socket = new WebSocket(this.options.serviceUrl);
|
||||
|
||||
const finish = (handler: () => void): void => {
|
||||
if (settled) {
|
||||
return;
|
||||
}
|
||||
|
||||
settled = true;
|
||||
clearTimeout(timeout);
|
||||
socket.removeAllListeners();
|
||||
|
||||
if (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN) {
|
||||
socket.close();
|
||||
}
|
||||
|
||||
handler();
|
||||
};
|
||||
|
||||
const timeout = setTimeout(() => {
|
||||
finish(() => {
|
||||
reject(new Error(`The transcription service timed out after ${this.options.requestTimeoutMs}ms.`));
|
||||
});
|
||||
}, this.options.requestTimeoutMs);
|
||||
|
||||
socket.on('open', () => {
|
||||
try {
|
||||
socket.send(
|
||||
JSON.stringify({
|
||||
type: 'transcribe',
|
||||
id: requestId,
|
||||
language: this.options.language,
|
||||
audio,
|
||||
}),
|
||||
);
|
||||
} catch (error) {
|
||||
finish(() => {
|
||||
reject(error instanceof Error ? error : new Error('Could not send transcription request.'));
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
socket.on('message', (payload) => {
|
||||
const event = this.parseEvent(payload);
|
||||
|
||||
if (!event) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (event.id && event.id !== requestId) {
|
||||
this.logger.warn({ requestId, event }, 'Ignored transcription event for another request');
|
||||
return;
|
||||
}
|
||||
|
||||
if (event.type === 'start') {
|
||||
this.logger.info(
|
||||
{ requestId, model: event.model, language: event.language },
|
||||
'Speech transcription started',
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
if (event.type === 'delta') {
|
||||
return;
|
||||
}
|
||||
|
||||
if (event.type === 'done') {
|
||||
finish(() => {
|
||||
resolve(event.text.trim());
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
finish(() => {
|
||||
reject(new Error(event.message));
|
||||
});
|
||||
});
|
||||
|
||||
socket.on('error', (error) => {
|
||||
finish(() => {
|
||||
reject(error instanceof Error ? error : new Error('The transcription service connection failed.'));
|
||||
});
|
||||
});
|
||||
|
||||
socket.on('close', (code, reasonBuffer) => {
|
||||
if (settled) {
|
||||
return;
|
||||
}
|
||||
|
||||
const reason = reasonBuffer.toString().trim();
|
||||
const detail = reason
|
||||
? `The transcription service closed the connection unexpectedly (code=${code}, reason=${reason}).`
|
||||
: `The transcription service closed the connection unexpectedly (code=${code}).`;
|
||||
|
||||
finish(() => {
|
||||
reject(new Error(detail));
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
private normalizeAudioPayload(audioBase64: string, mimeType: string): string {
|
||||
const trimmedAudio = audioBase64.trim();
|
||||
|
||||
if (trimmedAudio.startsWith('data:')) {
|
||||
return trimmedAudio;
|
||||
}
|
||||
|
||||
const normalizedMimeType = mimeType.trim() || 'audio/webm';
|
||||
return `data:${normalizedMimeType};base64,${trimmedAudio}`;
|
||||
}
|
||||
|
||||
private parseEvent(payload: RawData): ServiceEvent | null {
|
||||
const message = this.rawDataToString(payload).trim();
|
||||
|
||||
if (!message) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
return JSON.parse(message) as ServiceEvent;
|
||||
} catch {
|
||||
this.logger.warn({ transcriptionPayload: message }, 'Ignored non-JSON transcription event');
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private rawDataToString(payload: RawData): string {
|
||||
if (typeof payload === 'string') {
|
||||
return payload;
|
||||
}
|
||||
|
||||
if (payload instanceof ArrayBuffer) {
|
||||
return Buffer.from(payload).toString('utf8');
|
||||
}
|
||||
|
||||
if (Array.isArray(payload)) {
|
||||
return Buffer.concat(payload).toString('utf8');
|
||||
}
|
||||
|
||||
return payload.toString('utf8');
|
||||
}
|
||||
}
|
||||
@@ -1,176 +0,0 @@
|
||||
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
|
||||
import { createInterface } from 'node:readline';
|
||||
|
||||
type LoggerLike = {
|
||||
info: (payload: unknown, message?: string) => void;
|
||||
warn: (payload: unknown, message?: string) => void;
|
||||
error: (payload: unknown, message?: string) => void;
|
||||
};
|
||||
|
||||
type WhisperTranscriberOptions = {
|
||||
pythonExecutable: string;
|
||||
scriptPath: string;
|
||||
model: string;
|
||||
device: string;
|
||||
computeType: string;
|
||||
};
|
||||
|
||||
type WorkerEvent =
|
||||
| { type: 'ready'; model: string }
|
||||
| { type: 'result'; requestId: string; text: string }
|
||||
| { type: 'error'; requestId?: string; message: string }
|
||||
| { type: 'fatal'; message: string };
|
||||
|
||||
export class WhisperTranscriber {
|
||||
private worker: ChildProcessWithoutNullStreams | null = null;
|
||||
private readyPromise: Promise<void> | null = null;
|
||||
private resolveReady: (() => void) | null = null;
|
||||
private rejectReady: ((reason?: unknown) => void) | null = null;
|
||||
private readonly pendingRequests = new Map<
|
||||
string,
|
||||
{ resolve: (text: string) => void; reject: (reason?: unknown) => void }
|
||||
>();
|
||||
|
||||
constructor(
|
||||
private readonly options: WhisperTranscriberOptions,
|
||||
private readonly logger: LoggerLike,
|
||||
) {}
|
||||
|
||||
async transcribe(requestId: string, audioPath: string): Promise<string> {
|
||||
await this.ensureWorker();
|
||||
|
||||
if (!this.worker || this.worker.stdin.destroyed) {
|
||||
throw new Error('The Whisper worker is not available.');
|
||||
}
|
||||
|
||||
return new Promise<string>((resolve, reject) => {
|
||||
this.pendingRequests.set(requestId, { resolve, reject });
|
||||
|
||||
try {
|
||||
this.worker?.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`);
|
||||
} catch (error) {
|
||||
this.pendingRequests.delete(requestId);
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private async ensureWorker(): Promise<void> {
|
||||
if (this.readyPromise) {
|
||||
return this.readyPromise;
|
||||
}
|
||||
|
||||
this.worker = spawn(
|
||||
this.options.pythonExecutable,
|
||||
[
|
||||
this.options.scriptPath,
|
||||
'--model',
|
||||
this.options.model,
|
||||
'--device',
|
||||
this.options.device,
|
||||
'--compute-type',
|
||||
this.options.computeType,
|
||||
],
|
||||
{ stdio: ['pipe', 'pipe', 'pipe'] },
|
||||
);
|
||||
|
||||
this.readyPromise = new Promise<void>((resolve, reject) => {
|
||||
this.resolveReady = resolve;
|
||||
this.rejectReady = reject;
|
||||
});
|
||||
|
||||
const stdout = createInterface({ input: this.worker.stdout });
|
||||
stdout.on('line', (line) => {
|
||||
this.handleWorkerLine(line);
|
||||
});
|
||||
|
||||
this.worker.stderr.on('data', (chunk) => {
|
||||
const message = chunk.toString().trim();
|
||||
|
||||
if (message) {
|
||||
this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
|
||||
}
|
||||
});
|
||||
|
||||
this.worker.on('error', (error) => {
|
||||
this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.'));
|
||||
});
|
||||
|
||||
this.worker.on('exit', (code, signal) => {
|
||||
this.failWorker(
|
||||
new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`),
|
||||
);
|
||||
});
|
||||
|
||||
return this.readyPromise;
|
||||
}
|
||||
|
||||
private handleWorkerLine(line: string): void {
|
||||
let payload: WorkerEvent;
|
||||
|
||||
try {
|
||||
payload = JSON.parse(line) as WorkerEvent;
|
||||
} catch {
|
||||
this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
|
||||
return;
|
||||
}
|
||||
|
||||
if (payload.type === 'ready') {
|
||||
this.logger.info({ model: payload.model }, 'Whisper worker ready');
|
||||
this.resolveReady?.();
|
||||
this.resolveReady = null;
|
||||
this.rejectReady = null;
|
||||
return;
|
||||
}
|
||||
|
||||
if (payload.type === 'fatal') {
|
||||
this.failWorker(new Error(payload.message));
|
||||
return;
|
||||
}
|
||||
|
||||
if (payload.type === 'error') {
|
||||
if (!payload.requestId) {
|
||||
this.failWorker(new Error(payload.message));
|
||||
return;
|
||||
}
|
||||
|
||||
const pendingRequest = this.pendingRequests.get(payload.requestId);
|
||||
|
||||
if (!pendingRequest) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.pendingRequests.delete(payload.requestId);
|
||||
pendingRequest.reject(new Error(payload.message));
|
||||
return;
|
||||
}
|
||||
|
||||
const pendingRequest = this.pendingRequests.get(payload.requestId);
|
||||
|
||||
if (!pendingRequest) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.pendingRequests.delete(payload.requestId);
|
||||
pendingRequest.resolve(payload.text.trim());
|
||||
}
|
||||
|
||||
private failWorker(error: Error): void {
|
||||
if (this.worker) {
|
||||
this.worker.removeAllListeners();
|
||||
this.worker = null;
|
||||
}
|
||||
|
||||
this.rejectReady?.(error);
|
||||
this.resolveReady = null;
|
||||
this.rejectReady = null;
|
||||
this.readyPromise = null;
|
||||
|
||||
for (const { reject } of this.pendingRequests.values()) {
|
||||
reject(error);
|
||||
}
|
||||
|
||||
this.pendingRequests.clear();
|
||||
this.logger.error({ err: error }, 'Whisper worker failed');
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user