Dictation through AI

2026-03-11 00:26:49 +01:00
parent d2c4152ea7
commit 189f989c0d
12 changed files with 911 additions and 2 deletions
--- a/server/src/index.ts
+++ b/server/src/index.ts
@@ -1,5 +1,7 @@
 import crypto from 'node:crypto';
 import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import os from 'node:os';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { TextEncoder } from 'node:util';
@@ -23,6 +25,8 @@ import { Redis } from 'ioredis';
 import type WebSocket from 'ws';
 import { z } from 'zod';

+import { WhisperTranscriber } from './whisper-transcriber.js';
+
 dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });

 const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
@@ -120,6 +124,12 @@ type ClientMessage =
    }
  | {
      type: 'ping';
+    }
+  | {
+      type: 'speech-transcription';
+      requestId: string;
+      mimeType: string;
+      audioBase64: string;
    };

 type ServerMessage =
@@ -142,6 +152,16 @@ type ServerMessage =
      peerId: string;
      message: string;
    }
+  | {
+      type: 'speech-transcribed';
+      requestId: string;
+      text: string;
+    }
+  | {
+      type: 'speech-transcription-error';
+      requestId: string;
+      message: string;
+    }
  | { type: 'pong' }
  | { type: 'error'; message: string };

@@ -289,6 +309,12 @@ const signalMessageSchema = z.discriminatedUnion('type', [
  z.object({
    type: z.literal('ping'),
  }),
+  z.object({
+    type: z.literal('speech-transcription'),
+    requestId: z.string().uuid(),
+    mimeType: z.string().trim().min(1).max(128),
+    audioBase64: z.string().min(1).max(32_000_000),
+  }),
 ]);

 const app = Fastify({ logger: true, trustProxy: true });
@@ -307,6 +333,11 @@ const frontendDistPath = resolveProjectPath(
 const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
 const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
 const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
+// Whisper speech-to-text settings. Each PRIVATECHAT_WHISPER_* environment variable, when set,
+// replaces the default given here. The transcription script location is not configurable; it is
+// resolved through resolveProjectPath.
+const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
+const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
+const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
+const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
+const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
 const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
 const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
 const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
@@ -320,6 +351,17 @@ const webAuthnUserVerification = resolveWebAuthnUserVerification(
 const frontendIndexPath = path.join(frontendDistPath, 'index.html');
 const hasFrontendBuild = fs.existsSync(frontendIndexPath);

+// Module-level transcriber used for 'speech-transcription' requests. It is built from the
+// Whisper settings read from the environment and logs through the Fastify logger (app.log).
+const whisperTranscriber = new WhisperTranscriber(
+  {
+    pythonExecutable: whisperPythonExecutable,
+    scriptPath: whisperScriptPath,
+    model: whisperModel,
+    device: whisperDevice,
+    computeType: whisperComputeType,
+  },
+  app.log,
+);
+
 fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
 fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });

@@ -1127,6 +1169,27 @@ async function handleSocketMessage(
    return;
  }

+  if (parsed.type === 'speech-transcription') {
+    try {
+      const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
+
+      send(socket, {
+        type: 'speech-transcribed',
+        requestId: parsed.requestId,
+        text,
+      });
+    } catch (error) {
+      app.log.warn({ err: error, userId }, 'Whisper transcription failed');
+      send(socket, {
+        type: 'speech-transcription-error',
+        requestId: parsed.requestId,
+        message: error instanceof Error ? error.message : 'Speech transcription failed.',
+      });
+    }
+
+    return;
+  }
+
  let delivered = 0;
  const recipientSockets = socketsByUserId.get(parsed.to);

@@ -1668,6 +1731,15 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
    };
  }

+  if (parsed.data.type === 'speech-transcription') {
+    return {
+      type: 'speech-transcription',
+      requestId: parsed.data.requestId,
+      mimeType: parsed.data.mimeType,
+      audioBase64: parsed.data.audioBase64,
+    };
+  }
+
  return {
    type: 'signal',
    to: parsed.data.to,
@@ -1675,6 +1747,42 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
  };
 }

+/**
+ * Decodes a base64 audio recording into a freshly created temporary directory, hands the file to
+ * `whisperTranscriber`, and removes the directory again whether or not transcription succeeded.
+ *
+ * @param requestId Request identifier from the client message; used in the temporary file name
+ *   and forwarded to the transcriber.
+ * @param audioBase64 Base64-encoded audio bytes.
+ * @param mimeType MIME type of the recording; selects the temporary file's extension.
+ * @returns The text produced by `whisperTranscriber.transcribe`.
+ * @throws Error when the payload decodes to zero bytes. Failures from creating or writing the
+ *   temporary file and from the transcriber propagate unchanged.
+ */
+async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
+  // Buffer.from does not throw on malformed base64, so an unusable payload can decode to zero
+  // bytes. Reject it here, before any temporary directory exists, with a clear message.
+  const audioBytes = Buffer.from(audioBase64, 'base64');
+  if (audioBytes.length === 0) {
+    throw new Error('Speech audio payload is empty or not valid base64.');
+  }
+
+  const tempDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
+  const extension = audioExtensionForMimeType(mimeType);
+  const audioPath = path.join(tempDirectory, `dictation-${requestId}.${extension}`);
+
+  try {
+    await fsPromises.writeFile(audioPath, audioBytes);
+    return await whisperTranscriber.transcribe(requestId, audioPath);
+  } finally {
+    // Cleanup is best-effort. A rejection thrown from `finally` would replace a successful
+    // transcript (or hide the real transcription error), so it is logged instead of rethrown.
+    try {
+      await fsPromises.rm(tempDirectory, { recursive: true, force: true });
+    } catch (cleanupError) {
+      app.log.warn({ err: cleanupError, tempDirectory }, 'Failed to remove Whisper temp directory');
+    }
+  }
+}
+
+/**
+ * Picks the file extension for the temporary audio file from the MIME type sent with a recording.
+ *
+ * Only the media type is matched. Parameters after the first `;` (for example `codecs=opus`),
+ * surrounding whitespace, and letter case are ignored, so `audio/ogg; codecs=opus` and
+ * `audio/mp4;codecs=mp4a.40.2` map to their own container rather than to the fallback.
+ *
+ * @param mimeType MIME type string, optionally followed by parameters.
+ * @returns The extension without a leading dot; `webm` when the media type is not recognised.
+ */
+function audioExtensionForMimeType(mimeType: string): string {
+  // Everything after the first ';' is a parameter list and must not influence the match.
+  const [mediaType = ''] = mimeType.split(';');
+
+  switch (mediaType.trim().toLowerCase()) {
+    case 'audio/webm':
+      return 'webm';
+    case 'audio/ogg':
+      return 'ogg';
+    case 'audio/mp4':
+    case 'audio/m4a':
+      return 'm4a';
+    case 'audio/mpeg':
+    case 'audio/mp3':
+      return 'mp3';
+    case 'audio/wav':
+    case 'audio/wave':
+    case 'audio/x-wav':
+      return 'wav';
+    default:
+      return 'webm';
+  }
+}
+
 async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {
  const abortController = new AbortController();
  const timeoutId = setTimeout(() => abortController.abort(), 120_000);