Dictation through AI

This commit is contained in:
2026-03-11 00:26:49 +01:00
parent d2c4152ea7
commit 189f989c0d
12 changed files with 911 additions and 2 deletions

82
server/dist/index.js vendored
View File

@@ -1,5 +1,7 @@
import crypto from 'node:crypto';
import fs from 'node:fs';
import fsPromises from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { TextEncoder } from 'node:util';
@@ -13,6 +15,7 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe
import Fastify from 'fastify';
import { Redis } from 'ioredis';
import { z } from 'zod';
import { WhisperTranscriber } from './whisper-transcriber.js';
// Load environment variables from the `.env` file located two directories above this module.
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
// Filesystem path of the directory two levels above this module (same base as the `.env` lookup above).
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
const registerSchema = z.object({
@@ -81,6 +84,12 @@ const signalMessageSchema = z.discriminatedUnion('type', [
z.object({
type: z.literal('ping'),
}),
z.object({
type: z.literal('speech-transcription'),
requestId: z.string().uuid(),
mimeType: z.string().trim().min(1).max(128),
audioBase64: z.string().min(1).max(32_000_000),
}),
]);
// Fastify application instance, created with the built-in logger and `trustProxy` enabled.
const app = Fastify({ logger: true, trustProxy: true });
// NOTE(review): hard-coded username; presumably the account allowed to approve registrations — confirm against its usages.
const approvalAdminUsername = 'ladparis';
@@ -91,6 +100,11 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR
// --- Ollama image generation settings (each falls back to the literal default when the env var is unset) ---
// Base URL of the Ollama server; any trailing slashes are stripped.
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
// --- Whisper dictation settings; these are handed to the WhisperTranscriber constructed later in this module ---
// Executable used to launch the transcription script.
const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
// Forwarded to the script as `--model`.
const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
// Forwarded to the script as `--device`.
const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
// Forwarded to the script as `--compute-type`.
const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
// Location of the worker script; presumably resolved relative to the project root — confirm in resolveProjectPath.
const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
// Session lifetime in seconds; defaults to 12 hours.
// NOTE(review): Number() yields NaN for a non-numeric env value — confirm that consumers tolerate this.
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
// WebAuthn challenge lifetime in seconds; defaults to 5 minutes.
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
// Derived from the CORS_ORIGIN env var by parseAllowedOrigins (defined elsewhere in this file).
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -101,6 +115,13 @@ const webAuthnRpName = process.env.WEBAUTHN_RP_NAME ?? 'PrivateChat';
// Derived from WEBAUTHN_USER_VERIFICATION by resolveWebAuthnUserVerification (defined elsewhere in this file).
const webAuthnUserVerification = resolveWebAuthnUserVerification(process.env.WEBAUTHN_USER_VERIFICATION);
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
// Evaluated once, synchronously, when this module loads.
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
// Shared transcriber for dictation requests. Constructing it only stores the options and logger;
// the worker process is spawned later, on the first call that needs it.
const whisperTranscriber = new WhisperTranscriber({
pythonExecutable: whisperPythonExecutable,
scriptPath: whisperScriptPath,
model: whisperModel,
device: whisperDevice,
computeType: whisperComputeType,
}, app.log);
// Make sure the parent directories of the SQLite file and the master key file exist.
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
// Key produced by deriveEncryptionKey from the value loadOrCreateMasterKey returns for masterKeyPath
// (both helpers are defined elsewhere in this file).
const encryptionKey = deriveEncryptionKey(loadOrCreateMasterKey(masterKeyPath));
@@ -719,6 +740,25 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) {
}
return;
}
if (parsed.type === 'speech-transcription') {
try {
const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
send(socket, {
type: 'speech-transcribed',
requestId: parsed.requestId,
text,
});
}
catch (error) {
app.log.warn({ err: error, userId }, 'Whisper transcription failed');
send(socket, {
type: 'speech-transcription-error',
requestId: parsed.requestId,
message: error instanceof Error ? error.message : 'Speech transcription failed.',
});
}
return;
}
let delivered = 0;
const recipientSockets = socketsByUserId.get(parsed.to);
if (recipientSockets) {
@@ -1095,12 +1135,54 @@ function parseClientMessage(rawMessage) {
prompt: parsed.data.prompt,
};
}
if (parsed.data.type === 'speech-transcription') {
return {
type: 'speech-transcription',
requestId: parsed.data.requestId,
mimeType: parsed.data.mimeType,
audioBase64: parsed.data.audioBase64,
};
}
return {
type: 'signal',
to: parsed.data.to,
signal: normalizeSignal(parsed.data.signal),
};
}
/**
 * Decodes a base64 audio clip into a freshly created temporary directory,
 * passes the resulting file to the shared Whisper transcriber and removes the
 * directory again whether or not transcription succeeded.
 *
 * @param {string} requestId - Identifier of the request; used in the temp file name and forwarded to the transcriber.
 * @param {string} audioBase64 - Base64-encoded audio data.
 * @param {string} mimeType - MIME type of the clip; selects the temp file extension.
 * @returns {Promise<*>} Whatever `whisperTranscriber.transcribe` resolves with.
 */
async function transcribeAudioPayload(requestId, audioBase64, mimeType) {
    const scratchDirPrefix = path.join(os.tmpdir(), 'privatechat-whisper-');
    const scratchDir = await fsPromises.mkdtemp(scratchDirPrefix);
    const clipFileName = `dictation-${requestId}.${audioExtensionForMimeType(mimeType)}`;
    const clipPath = path.join(scratchDir, clipFileName);
    try {
        const clipBytes = Buffer.from(audioBase64, 'base64');
        await fsPromises.writeFile(clipPath, clipBytes);
        const transcript = await whisperTranscriber.transcribe(requestId, clipPath);
        return transcript;
    }
    finally {
        // Always discard the clip, including when writing or transcribing threw.
        await fsPromises.rm(scratchDir, { recursive: true, force: true });
    }
}
/**
 * Chooses the file extension for the temporary audio file of a dictation upload.
 *
 * Only the media type itself is compared: parameters such as `;codecs=opus`,
 * surrounding whitespace and letter case are ignored, so `audio/ogg; codecs=opus`
 * and `AUDIO/OGG` both map to `ogg`.
 *
 * @param {string} mimeType - MIME type reported for the recording.
 * @returns {string} Extension without a leading dot; `webm` for any unrecognised type.
 */
function audioExtensionForMimeType(mimeType) {
    // Drop everything after the first ';' so codec/parameter variations cannot
    // push a known container into the default branch.
    const [mediaType] = mimeType.split(';');
    switch (mediaType.trim().toLowerCase()) {
        case 'audio/webm':
            return 'webm';
        case 'audio/ogg':
            return 'ogg';
        case 'audio/mp4':
        case 'audio/m4a':
            return 'm4a';
        case 'audio/mpeg':
        case 'audio/mp3':
            return 'mp3';
        case 'audio/wav':
        case 'audio/wave':
        case 'audio/x-wav':
            return 'wav';
        default:
            return 'webm';
    }
}
async function generateImageFromPrompt(prompt) {
const abortController = new AbortController();
const timeoutId = setTimeout(() => abortController.abort(), 120_000);

121
server/dist/whisper-transcriber.js vendored Normal file
View File

@@ -0,0 +1,121 @@
import { spawn } from 'node:child_process';
import { createInterface } from 'node:readline';
/**
 * Runs one long-lived Whisper worker process and multiplexes transcription
 * requests over it.
 *
 * Protocol (newline-delimited JSON):
 *  - to the worker's stdin:  `{ type: 'transcribe', requestId, audioPath }`
 *  - from the worker's stdout: `{ type: 'ready', model }`, `{ type: 'fatal', message }`,
 *    `{ type: 'error', requestId?, message }`, and any other object carrying
 *    `requestId` + `text`, which is treated as a transcription result.
 *
 * The worker is spawned lazily on the first `transcribe` call and respawned on
 * the next call after it failed.
 */
export class WhisperTranscriber {
    options;
    logger;
    /** Currently active child process, or null when none is running. */
    worker = null;
    /** Settles once the current worker announced `ready` (or failed before doing so). */
    readyPromise = null;
    resolveReady = null;
    rejectReady = null;
    /** requestId -> `{ resolve, reject }` of the transcription awaiting a worker reply. */
    pendingRequests = new Map();
    /**
     * @param {{pythonExecutable: string, scriptPath: string, model: string, device: string, computeType: string}} options
     *   Executable, script and `--model` / `--device` / `--compute-type` values used to launch the worker.
     * @param {{info: Function, warn: Function, error: Function}} logger - Logger receiving worker diagnostics.
     */
    constructor(options, logger) {
        this.options = options;
        this.logger = logger;
    }
    /**
     * Asks the worker to transcribe one audio file.
     *
     * @param {string} requestId - Correlation id echoed back by the worker; must be unique among in-flight requests.
     * @param {string} audioPath - Path of the audio file the worker should read.
     * @returns {Promise<string>} The trimmed transcription text.
     * @throws {Error} When the worker is unavailable or fails, when the worker reports an
     *   error for this request, or when `requestId` is already in flight.
     */
    async transcribe(requestId, audioPath) {
        await this.ensureWorker();
        const worker = this.worker;
        if (!worker || worker.stdin.destroyed) {
            throw new Error('The Whisper worker is not available.');
        }
        // Overwriting an existing entry would leave the earlier caller's promise
        // pending forever, so a clashing id is rejected instead.
        if (this.pendingRequests.has(requestId)) {
            throw new Error(`A transcription with request id ${requestId} is already in progress.`);
        }
        return new Promise((resolve, reject) => {
            this.pendingRequests.set(requestId, { resolve, reject });
            try {
                worker.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`);
            }
            catch (error) {
                this.pendingRequests.delete(requestId);
                reject(error);
            }
        });
    }
    /**
     * Spawns the worker if none is starting or running and wires up its streams.
     *
     * @returns {Promise<void>} Resolves when the worker reported `ready`; rejects if it fails first.
     */
    async ensureWorker() {
        if (this.readyPromise) {
            return this.readyPromise;
        }
        const child = spawn(this.options.pythonExecutable, [
            this.options.scriptPath,
            '--model',
            this.options.model,
            '--device',
            this.options.device,
            '--compute-type',
            this.options.computeType,
        ], { stdio: ['pipe', 'pipe', 'pipe'] });
        this.worker = child;
        const readyPromise = new Promise((resolve, reject) => {
            this.resolveReady = resolve;
            this.rejectReady = reject;
        });
        this.readyPromise = readyPromise;
        // Handlers stay attached for the child's whole life (so late 'error'
        // events never go unhandled) but only act while it is the active worker.
        const isActiveWorker = () => this.worker === child;
        const stdout = createInterface({ input: child.stdout });
        stdout.on('line', (line) => {
            if (isActiveWorker()) {
                this.handleWorkerLine(line);
            }
        });
        child.stderr.on('data', (chunk) => {
            const message = chunk.toString().trim();
            if (message) {
                this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
            }
        });
        // Writing to a pipe whose reader is gone surfaces asynchronously as an
        // 'error' event on stdin; without a listener it would be uncaught.
        child.stdin.on('error', (error) => {
            if (isActiveWorker()) {
                this.failWorker(error instanceof Error ? error : new Error('Writing to the Whisper worker failed.'));
            }
        });
        child.on('error', (error) => {
            if (isActiveWorker()) {
                this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.'));
            }
        });
        child.on('exit', (code, signal) => {
            if (isActiveWorker()) {
                this.failWorker(new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`));
            }
        });
        return readyPromise;
    }
    /**
     * Processes one line printed by the worker on stdout.
     *
     * @param {string} line - A single line of worker output, expected to be a JSON object.
     */
    handleWorkerLine(line) {
        let payload;
        try {
            payload = JSON.parse(line);
        }
        catch {
            this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
            return;
        }
        // Valid JSON that is not an object (null, numbers, strings) has no fields to read.
        if (payload === null || typeof payload !== 'object') {
            this.logger.warn({ whisperStdout: line }, 'Ignored unexpected Whisper worker output');
            return;
        }
        const messageOr = (fallback) => (typeof payload.message === 'string' && payload.message !== '' ? payload.message : fallback);
        if (payload.type === 'ready') {
            this.logger.info({ model: payload.model }, 'Whisper worker ready');
            this.resolveReady?.();
            this.resolveReady = null;
            this.rejectReady = null;
            return;
        }
        if (payload.type === 'fatal') {
            this.failWorker(new Error(messageOr('The Whisper worker reported a fatal error.')));
            return;
        }
        if (payload.type === 'error') {
            // An error that names no request cannot be attributed, so the whole worker is failed.
            if (!payload.requestId) {
                this.failWorker(new Error(messageOr('The Whisper worker reported an error.')));
                return;
            }
            const failedRequest = this.pendingRequests.get(payload.requestId);
            if (!failedRequest) {
                return;
            }
            this.pendingRequests.delete(payload.requestId);
            failedRequest.reject(new Error(messageOr('The Whisper worker reported an error.')));
            return;
        }
        // Anything else is treated as the result for `payload.requestId`.
        const pendingRequest = this.pendingRequests.get(payload.requestId);
        if (!pendingRequest) {
            return;
        }
        this.pendingRequests.delete(payload.requestId);
        if (typeof payload.text !== 'string') {
            // Reject instead of throwing from inside the stream event handler.
            pendingRequest.reject(new Error('The Whisper worker returned a result without text.'));
            return;
        }
        pendingRequest.resolve(payload.text.trim());
    }
    /**
     * Gives up on the current worker: rejects the startup promise and every
     * in-flight request with `error`, resets state so the next `transcribe`
     * call spawns a fresh worker, and terminates the old process.
     *
     * @param {Error} error - Reason reported to all waiting callers.
     */
    failWorker(error) {
        const child = this.worker;
        this.worker = null;
        this.rejectReady?.(error);
        this.resolveReady = null;
        this.rejectReady = null;
        this.readyPromise = null;
        const abandonedRequests = [...this.pendingRequests.values()];
        this.pendingRequests.clear();
        for (const { reject } of abandonedRequests) {
            reject(error);
        }
        this.logger.error({ err: error }, 'Whisper worker failed');
        if (child) {
            // The process may still be alive (e.g. after a 'fatal' message); make
            // sure it does not linger once it has been given up on.
            try {
                child.kill();
            }
            catch (killError) {
                this.logger.warn({ err: killError }, 'Could not terminate the Whisper worker');
            }
        }
    }
}