Dictation through AI
This commit is contained in:
@@ -314,6 +314,30 @@
|
|||||||
{{ isRecordingVoice() ? '⏹️' : '🎙️' }}
|
{{ isRecordingVoice() ? '⏹️' : '🎙️' }}
|
||||||
</button>
|
</button>
|
||||||
|
|
||||||
|
<button
|
||||||
|
class="composer-dictation"
|
||||||
|
type="button"
|
||||||
|
[disabled]="!session.isSelectedPeerReady() || session.signalingState() !== 'connected' || isTranscribingDictation()"
|
||||||
|
(click)="toggleDictation(composerTextarea)"
|
||||||
|
[title]="
|
||||||
|
isDictating()
|
||||||
|
? 'Stop dictation and transcribe'
|
||||||
|
: isTranscribingDictation()
|
||||||
|
? 'Transcribing dictated audio'
|
||||||
|
: 'Start dictation'
|
||||||
|
"
|
||||||
|
[attr.aria-label]="
|
||||||
|
isDictating()
|
||||||
|
? 'Stop dictation and transcribe'
|
||||||
|
: isTranscribingDictation()
|
||||||
|
? 'Transcribing dictated audio'
|
||||||
|
: 'Start dictation'
|
||||||
|
"
|
||||||
|
[class.composer-dictation-active]="isDictating() || isTranscribingDictation()"
|
||||||
|
>
|
||||||
|
{{ isDictating() ? '🛑' : isTranscribingDictation() ? '⏳' : '🗣️' }}
|
||||||
|
</button>
|
||||||
|
|
||||||
<input
|
<input
|
||||||
#fileInput
|
#fileInput
|
||||||
class="composer-file-input"
|
class="composer-file-input"
|
||||||
|
|||||||
@@ -357,6 +357,7 @@
|
|||||||
|
|
||||||
.composer-camera,
|
.composer-camera,
|
||||||
.composer-call,
|
.composer-call,
|
||||||
|
.composer-dictation,
|
||||||
.composer-hangup,
|
.composer-hangup,
|
||||||
.composer-voice,
|
.composer-voice,
|
||||||
.composer-image-generate,
|
.composer-image-generate,
|
||||||
@@ -398,6 +399,12 @@
|
|||||||
background: var(--badge-background);
|
background: var(--badge-background);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.composer-dictation {
|
||||||
|
color: var(--page-text);
|
||||||
|
background: linear-gradient(135deg, #f6d8ff, #ffcadb);
|
||||||
|
}
|
||||||
|
|
||||||
|
.composer-dictation-active,
|
||||||
.composer-hangup,
|
.composer-hangup,
|
||||||
.composer-voice-recording {
|
.composer-voice-recording {
|
||||||
color: #fff;
|
color: #fff;
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import { CommonModule } from '@angular/common';
|
import { CommonModule } from '@angular/common';
|
||||||
import { Component, computed, effect, ElementRef, inject, OnDestroy, signal, ViewChild } from '@angular/core';
|
import { Component, computed, effect, ElementRef, inject, NgZone, OnDestroy, signal, ViewChild } from '@angular/core';
|
||||||
import { toSignal } from '@angular/core/rxjs-interop';
|
import { toSignal } from '@angular/core/rxjs-interop';
|
||||||
import { FormsModule } from '@angular/forms';
|
import { FormsModule } from '@angular/forms';
|
||||||
import { ActivatedRoute, Router, RouterLink } from '@angular/router';
|
import { ActivatedRoute, Router, RouterLink } from '@angular/router';
|
||||||
@@ -18,6 +18,7 @@ import type { ChatEntry, ConnectionState, PeerSummary } from './models';
|
|||||||
export class ChatPageComponent implements OnDestroy {
|
export class ChatPageComponent implements OnDestroy {
|
||||||
private readonly route = inject(ActivatedRoute);
|
private readonly route = inject(ActivatedRoute);
|
||||||
private readonly router = inject(Router);
|
private readonly router = inject(Router);
|
||||||
|
private readonly ngZone = inject(NgZone);
|
||||||
private readonly routeParamMap = toSignal(this.route.paramMap, {
|
private readonly routeParamMap = toSignal(this.route.paramMap, {
|
||||||
initialValue: this.route.snapshot.paramMap,
|
initialValue: this.route.snapshot.paramMap,
|
||||||
});
|
});
|
||||||
@@ -28,6 +29,14 @@ export class ChatPageComponent implements OnDestroy {
|
|||||||
private voiceChunks: Blob[] = [];
|
private voiceChunks: Blob[] = [];
|
||||||
private discardRecordedVoice = false;
|
private discardRecordedVoice = false;
|
||||||
private recordingPeerId: string | null = null;
|
private recordingPeerId: string | null = null;
|
||||||
|
private dictationRecorder: MediaRecorder | null = null;
|
||||||
|
private dictationStream: MediaStream | null = null;
|
||||||
|
private dictationChunks: Blob[] = [];
|
||||||
|
private dictationBaseText = '';
|
||||||
|
private discardRecordedDictation = false;
|
||||||
|
private dictationCompletionPromise: Promise<void> | null = null;
|
||||||
|
private resolveDictationCompletion: (() => void) | null = null;
|
||||||
|
private dictationApplyToken = 0;
|
||||||
@ViewChild('callAudioElement')
|
@ViewChild('callAudioElement')
|
||||||
set callAudioElementRef(value: ElementRef<HTMLAudioElement> | undefined) {
|
set callAudioElementRef(value: ElementRef<HTMLAudioElement> | undefined) {
|
||||||
this.callAudioElement = value;
|
this.callAudioElement = value;
|
||||||
@@ -39,6 +48,8 @@ export class ChatPageComponent implements OnDestroy {
|
|||||||
readonly forwardingEntryId = signal<string | null>(null);
|
readonly forwardingEntryId = signal<string | null>(null);
|
||||||
readonly emojiPickerOpen = signal(false);
|
readonly emojiPickerOpen = signal(false);
|
||||||
readonly isRecordingVoice = signal(false);
|
readonly isRecordingVoice = signal(false);
|
||||||
|
readonly isDictating = signal(false);
|
||||||
|
readonly isTranscribingDictation = signal(false);
|
||||||
readonly emojiOptions = [
|
readonly emojiOptions = [
|
||||||
'😀', '😁', '😂', '🤣', '😊',
|
'😀', '😁', '😂', '🤣', '😊',
|
||||||
'😉', '😍', '😘', '😎', '🤔',
|
'😉', '😍', '😘', '😎', '🤔',
|
||||||
@@ -152,6 +163,7 @@ export class ChatPageComponent implements OnDestroy {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ngOnDestroy(): void {
|
ngOnDestroy(): void {
|
||||||
|
void this.stopDictation(true);
|
||||||
this.stopVoiceRecording(true);
|
this.stopVoiceRecording(true);
|
||||||
this.detachCallAudioSource();
|
this.detachCallAudioSource();
|
||||||
}
|
}
|
||||||
@@ -174,6 +186,7 @@ export class ChatPageComponent implements OnDestroy {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
await this.stopDictation(false);
|
||||||
await this.session.sendText(peerId, this.messageText);
|
await this.session.sendText(peerId, this.messageText);
|
||||||
this.messageText = '';
|
this.messageText = '';
|
||||||
this.emojiPickerOpen.set(false);
|
this.emojiPickerOpen.set(false);
|
||||||
@@ -188,6 +201,7 @@ export class ChatPageComponent implements OnDestroy {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
await this.stopDictation(false);
|
||||||
const requested = await this.session.requestGeneratedImage(peerId, this.messageText);
|
const requested = await this.session.requestGeneratedImage(peerId, this.messageText);
|
||||||
|
|
||||||
if (!requested) {
|
if (!requested) {
|
||||||
@@ -262,6 +276,92 @@ export class ChatPageComponent implements OnDestroy {
|
|||||||
input.value = '';
|
input.value = '';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Starts a dictation recording, or stops the one currently in progress.
 *
 * While a recording is active the call stops it (keeping the audio) and waits
 * for `stopDictation` to finish. While a transcription is pending the call is
 * ignored. Otherwise the microphone is requested, a `MediaRecorder` is wired
 * up, and recording starts; when the recorder stops, the captured audio is
 * handed to `transcribeDictation` unless it was discarded or is empty.
 *
 * @param textarea Composer textarea that is passed on to `transcribeDictation`
 *                 once the recording has been stopped.
 */
async toggleDictation(textarea: HTMLTextAreaElement): Promise<void> {
  if (this.isDictating()) {
    await this.stopDictation(false);
    return;
  }

  if (this.isTranscribingDictation()) {
    return;
  }

  const peerId = this.peerId();

  if (!peerId) {
    return;
  }

  if (typeof MediaRecorder === 'undefined' || typeof navigator === 'undefined') {
    this.session.error.set('This browser does not support dictation recording.');
    return;
  }

  if (typeof navigator.mediaDevices?.getUserMedia !== 'function') {
    this.session.error.set('This browser cannot access the microphone for dictation.');
    return;
  }

  this.dictationBaseText = this.messageText;
  this.discardRecordedDictation = false;
  this.dictationApplyToken += 1;

  // Captured BEFORE the await so that a newer start attempt or a discarding
  // stop that happens while the permission prompt is open can be detected.
  const applyToken = this.dictationApplyToken;

  // Stream that has been acquired but not yet handed over to the component;
  // it must be released here if setup fails before the hand-over.
  let unownedStream: MediaStream | null = null;

  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    unownedStream = stream;

    if (applyToken !== this.dictationApplyToken) {
      // Superseded while waiting for the microphone: release it and leave the
      // shared dictation state to whoever bumped the token.
      for (const track of stream.getTracks()) {
        track.stop();
      }

      return;
    }

    const preferredMimeType = this.preferredVoiceMimeType();
    const recorder = preferredMimeType
      ? new MediaRecorder(stream, { mimeType: preferredMimeType })
      : new MediaRecorder(stream);

    this.dictationChunks = [];
    this.dictationStream = stream;
    // From here on cleanupDictationRecorder() is responsible for the stream.
    unownedStream = null;
    this.dictationRecorder = recorder;
    this.dictationCompletionPromise = new Promise<void>((resolve) => {
      this.resolveDictationCompletion = resolve;
    });

    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        this.dictationChunks.push(event.data);
      }
    };

    recorder.onerror = () => {
      this.ngZone.run(() => {
        this.session.error.set('Could not record dictation audio.');
        this.cleanupDictationRecorder();
        this.finishDictationCompletion();
      });
    };

    recorder.onstop = () => {
      // Snapshot the flag and the chunks before cleanup resets them.
      const shouldDiscard = this.discardRecordedDictation;
      const mimeType = recorder.mimeType || preferredMimeType || 'audio/webm';
      const blob = new Blob(this.dictationChunks, { type: mimeType });

      this.ngZone.run(() => {
        this.cleanupDictationRecorder();

        if (shouldDiscard || blob.size === 0) {
          this.finishDictationCompletion();
          return;
        }

        this.isTranscribingDictation.set(true);
        void this.transcribeDictation(blob, textarea, applyToken);
      });
    };

    recorder.start();
    this.isDictating.set(true);
    this.session.error.set(null);
  } catch {
    if (applyToken !== this.dictationApplyToken) {
      // A superseded attempt failed; do not disturb the newer attempt's state.
      return;
    }

    if (unownedStream) {
      // MediaRecorder construction failed after the microphone was granted.
      for (const track of unownedStream.getTracks()) {
        track.stop();
      }
    }

    this.session.error.set('Could not start dictation recording.');
    this.cleanupDictationRecorder();
    this.finishDictationCompletion();
  }
}
|
||||||
|
|
||||||
async toggleVoiceRecording(): Promise<void> {
|
async toggleVoiceRecording(): Promise<void> {
|
||||||
if (this.isRecordingVoice()) {
|
if (this.isRecordingVoice()) {
|
||||||
this.stopVoiceRecording(false);
|
this.stopVoiceRecording(false);
|
||||||
@@ -482,6 +582,7 @@ export class ChatPageComponent implements OnDestroy {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
await this.stopDictation(true);
|
||||||
this.stopVoiceRecording(true);
|
this.stopVoiceRecording(true);
|
||||||
this.forwardingEntryId.set(null);
|
this.forwardingEntryId.set(null);
|
||||||
this.emojiPickerOpen.set(false);
|
this.emojiPickerOpen.set(false);
|
||||||
@@ -532,6 +633,106 @@ export class ChatPageComponent implements OnDestroy {
|
|||||||
return candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) ?? '';
|
return candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) ?? '';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Stops the active dictation recording, if any, and waits for the dictation
 * completion promise that was pending when the call started.
 *
 * @param discard When true, bumps the apply token, restores the composer text
 *                to the dictation base text when one is set, and clears the
 *                transcribing flag. When false, the current composer text
 *                becomes the new dictation base text.
 */
private async stopDictation(discard: boolean): Promise<void> {
  // Snapshot first: the steps below may clear the field.
  const pendingCompletion = this.dictationCompletionPromise;

  if (!discard) {
    this.dictationBaseText = this.messageText;
  } else {
    this.dictationApplyToken += 1;
    this.messageText = this.dictationBaseText || this.messageText;
    this.handleMessageTextChange(this.messageText);
    this.isTranscribingDictation.set(false);
  }

  const recorder = this.dictationRecorder;

  if (!recorder) {
    if (!pendingCompletion) {
      // Nothing recording and nothing pending: drop the stale base text.
      this.dictationBaseText = '';
    }
  } else {
    this.discardRecordedDictation = discard;

    if (recorder.state === 'inactive') {
      // No stop event will follow, so finish the bookkeeping here.
      this.cleanupDictationRecorder();
      this.finishDictationCompletion();
    } else {
      recorder.stop();
    }
  }

  if (pendingCompletion) {
    await pendingCompletion;
  }
}
|
||||||
|
|
||||||
|
/**
 * Stops every track of the dictation stream (if one is held) and resets the
 * recorder, stream, chunk buffer, discard flag and `isDictating` signal.
 */
private cleanupDictationRecorder(): void {
  this.dictationStream?.getTracks().forEach((track) => {
    track.stop();
  });

  this.dictationRecorder = null;
  this.dictationStream = null;
  this.dictationChunks = [];
  this.discardRecordedDictation = false;
  this.isDictating.set(false);
}
|
||||||
|
|
||||||
|
/**
 * Resolves the pending dictation completion promise (if any) and clears the
 * resolver, the promise and the dictation base text.
 */
private finishDictationCompletion(): void {
  const resolve = this.resolveDictationCompletion;

  if (resolve) {
    resolve();
  }

  this.resolveDictationCompletion = null;
  this.dictationCompletionPromise = null;
  this.dictationBaseText = '';
}
|
||||||
|
|
||||||
|
/**
 * Requests a transcription of the recorded audio and, if this request is
 * still the current one, merges the transcript into the composer text.
 *
 * @param blob       Recorded dictation audio.
 * @param textarea   Composer textarea that receives the merged text.
 * @param applyToken Value of `dictationApplyToken` captured when the
 *                   recording started; a mismatch means the result is stale.
 */
private async transcribeDictation(blob: Blob, textarea: HTMLTextAreaElement, applyToken: number): Promise<void> {
  const isCurrent = (): boolean => applyToken === this.dictationApplyToken;

  try {
    const transcript = await this.session.requestSpeechTranscription(blob);

    if (isCurrent()) {
      const merged = this.mergeDictatedText(this.dictationBaseText, transcript);
      this.applyDictatedText(merged, textarea);
    }
  } catch {
    if (isCurrent()) {
      this.session.error.set('Dictation transcription failed.');
    }
  } finally {
    if (isCurrent()) {
      this.isTranscribingDictation.set(false);
    }

    // Always runs, even for stale results.
    this.finishDictationCompletion();
  }
}
|
||||||
|
|
||||||
|
/**
 * Appends a transcript to the existing composer text.
 *
 * @param baseText   Text that was in the composer before dictation.
 * @param transcript Raw transcript; surrounding whitespace is ignored.
 * @returns `baseText` unchanged when the transcript is blank, the trimmed
 *          transcript alone when `baseText` is blank, otherwise `baseText`
 *          (trailing whitespace removed), a single space, and the transcript.
 */
private mergeDictatedText(baseText: string, transcript: string): string {
  const spoken = transcript.trim();

  if (spoken.length === 0) {
    return baseText;
  }

  const hasExistingText = baseText.trim().length > 0;

  return hasExistingText ? `${baseText.trimEnd()} ${spoken}` : spoken;
}
|
||||||
|
|
||||||
|
/**
 * Writes `text` into the composer model and the textarea, moves the tracked
 * selection to the end, and refocuses the textarea in a microtask.
 *
 * @param text     New composer content.
 * @param textarea Textarea element to update and focus.
 */
private applyDictatedText(text: string, textarea: HTMLTextAreaElement): void {
  const caret = text.length;

  this.messageText = text;
  textarea.value = text;
  this.composerSelectionStart = caret;
  this.composerSelectionEnd = caret;
  this.handleMessageTextChange(text);

  // Deferred to a microtask, after the synchronous updates above.
  queueMicrotask(() => {
    textarea.focus();
    textarea.setSelectionRange(caret, caret);
  });
}
|
||||||
|
|
||||||
private syncCallAudioSource(): void {
|
private syncCallAudioSource(): void {
|
||||||
const audio = this.callAudioElement?.nativeElement;
|
const audio = this.callAudioElement?.nativeElement;
|
||||||
|
|
||||||
|
|||||||
@@ -170,6 +170,10 @@ export class ChatSessionService {
|
|||||||
string,
|
string,
|
||||||
{ peerId: string; prompt: string; waitMessageId: string }
|
{ peerId: string; prompt: string; waitMessageId: string }
|
||||||
>();
|
>();
|
||||||
|
private readonly pendingSpeechTranscriptionRequests = new Map<
|
||||||
|
string,
|
||||||
|
{ resolve: (text: string) => void; reject: (reason?: unknown) => void }
|
||||||
|
>();
|
||||||
private readonly remoteVideoStreams = signal<Array<{ peerId: string; stream: MediaStream }>>([]);
|
private readonly remoteVideoStreams = signal<Array<{ peerId: string; stream: MediaStream }>>([]);
|
||||||
private readonly remoteAudioStreams = signal<Array<{ peerId: string; stream: MediaStream }>>([]);
|
private readonly remoteAudioStreams = signal<Array<{ peerId: string; stream: MediaStream }>>([]);
|
||||||
private readonly activeCameraPeerId = signal<string | null>(null);
|
private readonly activeCameraPeerId = signal<string | null>(null);
|
||||||
@@ -916,6 +920,32 @@ export class ChatSessionService {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Sends recorded audio over the signaling websocket as a
 * `speech-transcription` request and resolves with the transcribed text.
 *
 * The returned promise is settled later by the `speech-transcribed` /
 * `speech-transcription-error` handlers, or rejected when pending requests
 * are flushed by `rejectPendingSpeechTranscriptions`.
 *
 * @param audioBlob Recorded audio; its `type` is sent as the MIME type, with
 *                  `audio/webm` as the fallback.
 * @returns The transcript text.
 * @throws Error when the websocket is not open, either before or after the
 *         audio has been encoded.
 */
async requestSpeechTranscription(audioBlob: Blob): Promise<string> {
  // Fail fast before doing the encoding work.
  if (!this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
    throw new Error('You must be connected to signaling before using dictation.');
  }

  const requestId = crypto.randomUUID();
  const audioBase64 = await this.blobToBase64(audioBlob);

  // The socket may have been closed or replaced while the audio was being
  // encoded. Sending on a missing or non-open socket would leave the promise
  // below pending forever, so validate again and pin the instance we send on.
  const websocket = this.websocket;

  if (!websocket || websocket.readyState !== WebSocket.OPEN) {
    throw new Error('Signaling connection closed during dictation.');
  }

  return new Promise<string>((resolve, reject) => {
    this.pendingSpeechTranscriptionRequests.set(requestId, { resolve, reject });

    try {
      this.error.set(null);
      websocket.send(JSON.stringify({
        type: 'speech-transcription',
        requestId,
        mimeType: audioBlob.type || 'audio/webm',
        audioBase64,
      }));
    } catch (error) {
      this.pendingSpeechTranscriptionRequests.delete(requestId);
      reject(error);
    }
  });
}
|
||||||
|
|
||||||
private async loadAccessKeys(): Promise<void> {
|
private async loadAccessKeys(): Promise<void> {
|
||||||
const token = this.token();
|
const token = this.token();
|
||||||
|
|
||||||
@@ -990,6 +1020,7 @@ export class ChatSessionService {
|
|||||||
const shouldReconnect = this.websocket === websocket && !this.suppressSocketReconnect;
|
const shouldReconnect = this.websocket === websocket && !this.suppressSocketReconnect;
|
||||||
|
|
||||||
this.stopWebSocketHeartbeat();
|
this.stopWebSocketHeartbeat();
|
||||||
|
this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
|
||||||
this.signalingState.set('disconnected');
|
this.signalingState.set('disconnected');
|
||||||
this.status.set('Signaling connection closed.');
|
this.status.set('Signaling connection closed.');
|
||||||
|
|
||||||
@@ -1014,6 +1045,7 @@ export class ChatSessionService {
|
|||||||
|
|
||||||
private disconnectWebSocket(): void {
|
private disconnectWebSocket(): void {
|
||||||
this.stopWebSocketHeartbeat();
|
this.stopWebSocketHeartbeat();
|
||||||
|
this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
|
||||||
|
|
||||||
if (this.websocket) {
|
if (this.websocket) {
|
||||||
this.suppressSocketReconnect = true;
|
this.suppressSocketReconnect = true;
|
||||||
@@ -1055,6 +1087,12 @@ export class ChatSessionService {
|
|||||||
case 'image-generation-error':
|
case 'image-generation-error':
|
||||||
this.handleGeneratedImageError(event);
|
this.handleGeneratedImageError(event);
|
||||||
break;
|
break;
|
||||||
|
case 'speech-transcribed':
|
||||||
|
this.handleSpeechTranscribed(event);
|
||||||
|
break;
|
||||||
|
case 'speech-transcription-error':
|
||||||
|
this.handleSpeechTranscriptionError(event);
|
||||||
|
break;
|
||||||
case 'pong':
|
case 'pong':
|
||||||
break;
|
break;
|
||||||
case 'error':
|
case 'error':
|
||||||
@@ -1109,6 +1147,28 @@ export class ChatSessionService {
|
|||||||
this.error.set(event.message);
|
this.error.set(event.message);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Resolves the pending transcription request that matches the event's
 * `requestId` with the transcribed text. Events for unknown ids are ignored.
 */
private handleSpeechTranscribed(event: Extract<ServerEvent, { type: 'speech-transcribed' }>): void {
  const { requestId, text } = event;
  const pending = this.pendingSpeechTranscriptionRequests.get(requestId);

  if (pending) {
    this.pendingSpeechTranscriptionRequests.delete(requestId);
    pending.resolve(text);
  }
}
|
||||||
|
|
||||||
|
/**
 * Rejects the pending transcription request that matches the event's
 * `requestId` (when there is one) and publishes the server message through
 * the `error` signal in every case.
 */
private handleSpeechTranscriptionError(event: Extract<ServerEvent, { type: 'speech-transcription-error' }>): void {
  const { requestId, message } = event;
  const pending = this.pendingSpeechTranscriptionRequests.get(requestId);

  if (pending !== undefined) {
    this.pendingSpeechTranscriptionRequests.delete(requestId);
    pending.reject(new Error(message));
  }

  this.error.set(message);
}
|
||||||
|
|
||||||
private async restoreSession(): Promise<void> {
|
private async restoreSession(): Promise<void> {
|
||||||
const token = this.token();
|
const token = this.token();
|
||||||
|
|
||||||
@@ -2024,6 +2084,18 @@ export class ChatSessionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Rejects every pending speech transcription request with an `Error` that
 * carries `message`, then empties the pending-request map.
 *
 * @param message Text used for each rejection error.
 */
private rejectPendingSpeechTranscriptions(message: string): void {
  const pending = this.pendingSpeechTranscriptionRequests;

  if (pending.size > 0) {
    pending.forEach(({ reject }) => {
      reject(new Error(message));
    });
    pending.clear();
  }
}
|
||||||
|
|
||||||
private clearLocalAuth(statusMessage: string): void {
|
private clearLocalAuth(statusMessage: string): void {
|
||||||
this.clearWebSocketReconnect();
|
this.clearWebSocketReconnect();
|
||||||
this.disconnectWebSocket();
|
this.disconnectWebSocket();
|
||||||
@@ -2034,6 +2106,7 @@ export class ChatSessionService {
|
|||||||
this.stopRingtone();
|
this.stopRingtone();
|
||||||
this.releasePreloadedRingtone();
|
this.releasePreloadedRingtone();
|
||||||
this.pendingImageGenerationRequests.clear();
|
this.pendingImageGenerationRequests.clear();
|
||||||
|
this.rejectPendingSpeechTranscriptions('Session ended during dictation.');
|
||||||
this.remoteVideoStreams.set([]);
|
this.remoteVideoStreams.set([]);
|
||||||
this.remoteAudioStreams.set([]);
|
this.remoteAudioStreams.set([]);
|
||||||
this.remoteVideoModalPeerId.set(null);
|
this.remoteVideoModalPeerId.set(null);
|
||||||
@@ -2060,6 +2133,19 @@ export class ChatSessionService {
|
|||||||
this.removeStorage('privatechat.user');
|
this.removeStorage('privatechat.user');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Encodes the bytes of a blob as a base64 string.
 *
 * The bytes are turned into a binary string in slices of 0x8000 so that the
 * spread into `String.fromCharCode` never receives more than that many
 * arguments at once.
 *
 * @param blob Blob to encode.
 * @returns Base64 representation of the blob's bytes.
 */
private async blobToBase64(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer());
  const sliceLength = 0x8000;
  const pieces: string[] = [];

  for (let offset = 0; offset < bytes.length; offset += sliceLength) {
    pieces.push(String.fromCharCode(...bytes.subarray(offset, offset + sliceLength)));
  }

  return btoa(pieces.join(''));
}
|
||||||
|
|
||||||
private async loadPersistedMessages(userId: string): Promise<void> {
|
private async loadPersistedMessages(userId: string): Promise<void> {
|
||||||
const messageEncryptionKey = this.messageEncryptionKey;
|
const messageEncryptionKey = this.messageEncryptionKey;
|
||||||
|
|
||||||
|
|||||||
@@ -130,6 +130,16 @@ export type ServerEvent =
|
|||||||
peerId: string;
|
peerId: string;
|
||||||
message: string;
|
message: string;
|
||||||
}
|
}
|
||||||
|
| {
|
||||||
|
type: 'speech-transcribed';
|
||||||
|
requestId: string;
|
||||||
|
text: string;
|
||||||
|
}
|
||||||
|
| {
|
||||||
|
type: 'speech-transcription-error';
|
||||||
|
requestId: string;
|
||||||
|
message: string;
|
||||||
|
}
|
||||||
| { type: 'pong' }
|
| { type: 'pong' }
|
||||||
| { type: 'error'; message: string };
|
| { type: 'error'; message: string };
|
||||||
|
|
||||||
|
|||||||
82
server/dist/index.js
vendored
82
server/dist/index.js
vendored
@@ -1,5 +1,7 @@
|
|||||||
import crypto from 'node:crypto';
|
import crypto from 'node:crypto';
|
||||||
import fs from 'node:fs';
|
import fs from 'node:fs';
|
||||||
|
import fsPromises from 'node:fs/promises';
|
||||||
|
import os from 'node:os';
|
||||||
import path from 'node:path';
|
import path from 'node:path';
|
||||||
import { fileURLToPath } from 'node:url';
|
import { fileURLToPath } from 'node:url';
|
||||||
import { TextEncoder } from 'node:util';
|
import { TextEncoder } from 'node:util';
|
||||||
@@ -13,6 +15,7 @@ import { generateAuthenticationOptions, generateRegistrationOptions, verifyAuthe
|
|||||||
import Fastify from 'fastify';
|
import Fastify from 'fastify';
|
||||||
import { Redis } from 'ioredis';
|
import { Redis } from 'ioredis';
|
||||||
import { z } from 'zod';
|
import { z } from 'zod';
|
||||||
|
import { WhisperTranscriber } from './whisper-transcriber.js';
|
||||||
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
|
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
|
||||||
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
|
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
|
||||||
const registerSchema = z.object({
|
const registerSchema = z.object({
|
||||||
@@ -81,6 +84,12 @@ const signalMessageSchema = z.discriminatedUnion('type', [
|
|||||||
z.object({
|
z.object({
|
||||||
type: z.literal('ping'),
|
type: z.literal('ping'),
|
||||||
}),
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal('speech-transcription'),
|
||||||
|
requestId: z.string().uuid(),
|
||||||
|
mimeType: z.string().trim().min(1).max(128),
|
||||||
|
audioBase64: z.string().min(1).max(32_000_000),
|
||||||
|
}),
|
||||||
]);
|
]);
|
||||||
const app = Fastify({ logger: true, trustProxy: true });
|
const app = Fastify({ logger: true, trustProxy: true });
|
||||||
const approvalAdminUsername = 'ladparis';
|
const approvalAdminUsername = 'ladparis';
|
||||||
@@ -91,6 +100,11 @@ const frontendDistPath = resolveProjectPath(process.env.PRIVATECHAT_WEB_DIST_DIR
|
|||||||
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
|
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
|
||||||
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
|
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
|
||||||
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
|
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
|
||||||
|
// Whisper worker settings; each one falls back to a default when the
// corresponding environment variable is not set.
const {
    PRIVATECHAT_WHISPER_PYTHON: whisperPythonExecutable = 'python3',
    PRIVATECHAT_WHISPER_MODEL: whisperModel = 'small',
    PRIVATECHAT_WHISPER_DEVICE: whisperDevice = 'cpu',
    PRIVATECHAT_WHISPER_COMPUTE_TYPE: whisperComputeType = 'int8',
} = process.env;
// Script path handed to the Whisper worker, resolved via resolveProjectPath.
const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
|
||||||
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
|
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
|
||||||
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
|
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
|
||||||
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
|
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
|
||||||
@@ -101,6 +115,13 @@ const webAuthnRpName = process.env.WEBAUTHN_RP_NAME ?? 'PrivateChat';
|
|||||||
const webAuthnUserVerification = resolveWebAuthnUserVerification(process.env.WEBAUTHN_USER_VERIFICATION);
|
const webAuthnUserVerification = resolveWebAuthnUserVerification(process.env.WEBAUTHN_USER_VERIFICATION);
|
||||||
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
|
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
|
||||||
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
|
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
|
||||||
|
// Single WhisperTranscriber instance for the module, built from the whisper
// settings above and logging through the Fastify logger.
const whisperTranscriber = new WhisperTranscriber(
    {
        pythonExecutable: whisperPythonExecutable,
        scriptPath: whisperScriptPath,
        model: whisperModel,
        device: whisperDevice,
        computeType: whisperComputeType,
    },
    app.log,
);
|
||||||
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
|
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
|
||||||
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
|
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
|
||||||
const encryptionKey = deriveEncryptionKey(loadOrCreateMasterKey(masterKeyPath));
|
const encryptionKey = deriveEncryptionKey(loadOrCreateMasterKey(masterKeyPath));
|
||||||
@@ -719,6 +740,25 @@ async function handleSocketMessage(userId, sessionId, socket, rawMessage) {
|
|||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (parsed.type === 'speech-transcription') {
|
||||||
|
try {
|
||||||
|
const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
|
||||||
|
send(socket, {
|
||||||
|
type: 'speech-transcribed',
|
||||||
|
requestId: parsed.requestId,
|
||||||
|
text,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
catch (error) {
|
||||||
|
app.log.warn({ err: error, userId }, 'Whisper transcription failed');
|
||||||
|
send(socket, {
|
||||||
|
type: 'speech-transcription-error',
|
||||||
|
requestId: parsed.requestId,
|
||||||
|
message: error instanceof Error ? error.message : 'Speech transcription failed.',
|
||||||
|
});
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
let delivered = 0;
|
let delivered = 0;
|
||||||
const recipientSockets = socketsByUserId.get(parsed.to);
|
const recipientSockets = socketsByUserId.get(parsed.to);
|
||||||
if (recipientSockets) {
|
if (recipientSockets) {
|
||||||
@@ -1095,12 +1135,54 @@ function parseClientMessage(rawMessage) {
|
|||||||
prompt: parsed.data.prompt,
|
prompt: parsed.data.prompt,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
if (parsed.data.type === 'speech-transcription') {
|
||||||
|
return {
|
||||||
|
type: 'speech-transcription',
|
||||||
|
requestId: parsed.data.requestId,
|
||||||
|
mimeType: parsed.data.mimeType,
|
||||||
|
audioBase64: parsed.data.audioBase64,
|
||||||
|
};
|
||||||
|
}
|
||||||
return {
|
return {
|
||||||
type: 'signal',
|
type: 'signal',
|
||||||
to: parsed.data.to,
|
to: parsed.data.to,
|
||||||
signal: normalizeSignal(parsed.data.signal),
|
signal: normalizeSignal(parsed.data.signal),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
/**
 * Writes a base64 audio payload to a fresh temporary directory, asks the
 * Whisper transcriber to transcribe that file, and removes the directory
 * afterwards whether or not transcription succeeded.
 *
 * @param requestId   Request identifier; used in the temp file name and
 *                    forwarded to the transcriber.
 * @param audioBase64 Base64-encoded audio bytes.
 * @param mimeType    MIME type used to choose the file extension.
 * @returns The value produced by `whisperTranscriber.transcribe`.
 */
async function transcribeAudioPayload(requestId, audioBase64, mimeType) {
    const workDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
    const fileName = `dictation-${requestId}.${audioExtensionForMimeType(mimeType)}`;
    const audioPath = path.join(workDirectory, fileName);
    try {
        const audioBytes = Buffer.from(audioBase64, 'base64');
        await fsPromises.writeFile(audioPath, audioBytes);
        return await whisperTranscriber.transcribe(requestId, audioPath);
    }
    finally {
        await fsPromises.rm(workDirectory, { recursive: true, force: true });
    }
}
|
||||||
|
/**
 * Maps an audio MIME type to the file extension used for the temporary
 * dictation file.
 *
 * Parameters after the first `;` (for example `codecs=opus`) and surrounding
 * whitespace are ignored, and matching is case-insensitive, so values such as
 * `audio/mp4;codecs=mp4a.40.2` or `audio/ogg; codecs=opus` resolve to their
 * container's extension instead of falling through to the default.
 *
 * @param mimeType MIME type string reported by the client.
 * @returns One of `webm`, `ogg`, `m4a`, `mp3`, `wav`; unknown types yield
 *          `webm`.
 */
function audioExtensionForMimeType(mimeType) {
    // Keep only the "type/subtype" part; codec parameters do not affect the container.
    const essence = mimeType.split(';', 1)[0].trim().toLowerCase();
    switch (essence) {
        case 'audio/webm':
            return 'webm';
        case 'audio/ogg':
            return 'ogg';
        case 'audio/mp4':
        case 'audio/m4a':
            return 'm4a';
        case 'audio/mpeg':
        case 'audio/mp3':
            return 'mp3';
        case 'audio/wav':
        case 'audio/wave':
        case 'audio/x-wav':
            return 'wav';
        default:
            return 'webm';
    }
}
|
||||||
async function generateImageFromPrompt(prompt) {
|
async function generateImageFromPrompt(prompt) {
|
||||||
const abortController = new AbortController();
|
const abortController = new AbortController();
|
||||||
const timeoutId = setTimeout(() => abortController.abort(), 120_000);
|
const timeoutId = setTimeout(() => abortController.abort(), 120_000);
|
||||||
|
|||||||
121
server/dist/whisper-transcriber.js
vendored
Normal file
121
server/dist/whisper-transcriber.js
vendored
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
import { spawn } from 'node:child_process';
|
||||||
|
import { createInterface } from 'node:readline';
|
||||||
|
/**
 * Validates one decoded stdout message from the Whisper worker.
 *
 * @param value Result of `JSON.parse` on a worker output line.
 * @returns A normalized event object, or `null` when the value is not a usable worker event.
 */
function parseWorkerEvent(value) {
    if (typeof value !== 'object' || value === null) {
        return null;
    }
    const { type, requestId, message, text, model } = value;
    const id = typeof requestId === 'string' && requestId !== '' ? requestId : undefined;
    const detail = typeof message === 'string' ? message : 'The Whisper worker reported an unspecified error.';
    switch (type) {
        case 'ready':
            return { type: 'ready', model: typeof model === 'string' ? model : '' };
        case 'fatal':
            return { type: 'fatal', message: detail };
        case 'error':
            return { type: 'error', requestId: id, message: detail };
        case 'result':
            if (id === undefined) {
                return null;
            }
            if (typeof text !== 'string') {
                // Surface a malformed result as a request error so the caller is not left waiting.
                return { type: 'error', requestId: id, message: 'The Whisper worker returned a result without text.' };
            }
            return { type: 'result', requestId: id, text };
        default:
            return null;
    }
}

/**
 * Runs transcription requests through a long-lived Whisper worker process.
 *
 * The worker is spawned lazily on the first request. Requests are written to
 * its stdin as newline-delimited JSON, and `ready` / `result` / `error` /
 * `fatal` events are read line by line from its stdout. When the worker fails
 * it is terminated, every pending request is rejected, and the next request
 * spawns a fresh worker.
 */
export class WhisperTranscriber {
    options;
    logger;
    worker = null;
    readyPromise = null;
    resolveReady = null;
    rejectReady = null;
    pendingRequests = new Map();

    /**
     * @param options Executable, script path and model settings passed to the worker on spawn.
     * @param logger Logger exposing `info`, `warn` and `error`.
     */
    constructor(options, logger) {
        this.options = options;
        this.logger = logger;
    }

    /**
     * Asks the worker to transcribe the audio file at `audioPath`.
     *
     * @param requestId Identifier echoed back by the worker; must not already be in flight.
     * @param audioPath Path of the audio file the worker should read.
     * @returns The trimmed transcript text.
     * @throws When the worker is unavailable, the id is already pending, the request
     *         cannot be written, or the worker reports an error.
     */
    async transcribe(requestId, audioPath) {
        await this.ensureWorker();
        const worker = this.worker;
        if (!worker || worker.stdin.destroyed) {
            throw new Error('The Whisper worker is not available.');
        }
        if (this.pendingRequests.has(requestId)) {
            // Overwriting the entry would leave the first caller's promise unsettled forever.
            throw new Error(`A Whisper transcription with request id ${requestId} is already in progress.`);
        }
        return new Promise((resolve, reject) => {
            const entry = { resolve, reject };
            this.pendingRequests.set(requestId, entry);
            const abandon = (error) => {
                if (this.pendingRequests.get(requestId) === entry) {
                    this.pendingRequests.delete(requestId);
                    reject(error);
                }
            };
            try {
                // Write failures are reported asynchronously, so they must be observed via the callback.
                worker.stdin.write(`${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`, (error) => {
                    if (error) {
                        abandon(error);
                    }
                });
            }
            catch (error) {
                abandon(error);
            }
        });
    }

    /**
     * Spawns the worker if none is starting or running.
     *
     * @returns A promise that resolves once the worker has reported `ready`.
     */
    async ensureWorker() {
        if (this.readyPromise) {
            return this.readyPromise;
        }
        const worker = spawn(this.options.pythonExecutable, [
            this.options.scriptPath,
            '--model',
            this.options.model,
            '--device',
            this.options.device,
            '--compute-type',
            this.options.computeType,
        ], { stdio: ['pipe', 'pipe', 'pipe'] });
        this.worker = worker;
        this.readyPromise = new Promise((resolve, reject) => {
            this.resolveReady = resolve;
            this.rejectReady = reject;
        });
        // Listeners stay attached for the life of the process; events from a worker
        // that has already been replaced are ignored instead of corrupting current state.
        const isCurrent = () => this.worker === worker;
        const stdout = createInterface({ input: worker.stdout });
        stdout.on('line', (line) => {
            if (isCurrent()) {
                this.handleWorkerLine(line);
            }
        });
        worker.stderr.on('data', (chunk) => {
            const message = chunk.toString().trim();
            if (message) {
                this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
            }
        });
        // Without a listener, a broken stdin pipe would surface as an unhandled 'error' event.
        worker.stdin.on('error', (error) => {
            if (isCurrent()) {
                this.failWorker(error instanceof Error ? error : new Error('The Whisper worker stdin failed.'));
            }
        });
        worker.on('error', (error) => {
            if (isCurrent()) {
                this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.'));
            }
        });
        worker.on('exit', (code, signal) => {
            if (isCurrent()) {
                this.failWorker(new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`));
            }
        });
        return this.readyPromise;
    }

    /**
     * Handles one line of worker stdout.
     *
     * Lines that are not JSON, or not a recognized event, are logged and ignored.
     *
     * @param line A single stdout line without its trailing newline.
     */
    handleWorkerLine(line) {
        let decoded;
        try {
            decoded = JSON.parse(line);
        }
        catch {
            this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
            return;
        }
        const event = parseWorkerEvent(decoded);
        if (!event) {
            this.logger.warn({ whisperStdout: line }, 'Ignored unrecognized Whisper worker output');
            return;
        }
        switch (event.type) {
            case 'ready':
                this.logger.info({ model: event.model }, 'Whisper worker ready');
                this.resolveReady?.();
                this.resolveReady = null;
                this.rejectReady = null;
                return;
            case 'fatal':
                this.failWorker(new Error(event.message));
                return;
            case 'error':
                if (!event.requestId) {
                    // An error that cannot be attributed to a request is treated as a worker failure.
                    this.failWorker(new Error(event.message));
                    return;
                }
                this.takePendingRequest(event.requestId)?.reject(new Error(event.message));
                return;
            case 'result':
                this.takePendingRequest(event.requestId)?.resolve(event.text.trim());
                return;
        }
    }

    /**
     * Removes and returns the pending entry for `requestId`, if any.
     *
     * @param requestId Identifier of the request being settled.
     * @returns The `{ resolve, reject }` pair, or `undefined` when nothing is pending under that id.
     */
    takePendingRequest(requestId) {
        const pending = this.pendingRequests.get(requestId);
        if (pending) {
            this.pendingRequests.delete(requestId);
        }
        return pending;
    }

    /**
     * Tears down the current worker and rejects everything waiting on it.
     *
     * The child process is killed if it is still running so that a replacement
     * worker does not leave the old one behind.
     *
     * @param error Reason passed to the readiness promise and to each pending request.
     */
    failWorker(error) {
        const worker = this.worker;
        const rejectReady = this.rejectReady;
        const pending = [...this.pendingRequests.values()];
        this.worker = null;
        this.resolveReady = null;
        this.rejectReady = null;
        this.readyPromise = null;
        this.pendingRequests.clear();
        if (worker && worker.exitCode === null && worker.signalCode === null) {
            try {
                worker.kill();
            }
            catch (killError) {
                this.logger.warn({ err: killError }, 'Could not terminate the Whisper worker');
            }
        }
        rejectReady?.(error);
        for (const { reject } of pending) {
            reject(error);
        }
        this.logger.error({ err: error }, 'Whisper worker failed');
    }
}
|
||||||
@@ -6,7 +6,8 @@
|
|||||||
"scripts": {
|
"scripts": {
|
||||||
"dev": "node node_modules/tsx/dist/cli.mjs watch src/index.ts",
|
"dev": "node node_modules/tsx/dist/cli.mjs watch src/index.ts",
|
||||||
"build": "node node_modules/typescript/bin/tsc -p tsconfig.json",
|
"build": "node node_modules/typescript/bin/tsc -p tsconfig.json",
|
||||||
"start": "node dist/index.js"
|
"start": "node dist/index.js",
|
||||||
|
"setup-whisper": "python3 -m pip install -r requirements-whisper.txt"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@fastify/cors": "^11.2.0",
|
"@fastify/cors": "^11.2.0",
|
||||||
|
|||||||
1
server/requirements-whisper.txt
Normal file
1
server/requirements-whisper.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
faster-whisper>=1.0.0
|
||||||
92
server/scripts/transcribe_whisper.py
Normal file
92
server/scripts/transcribe_whisper.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def emit(payload):
    """Write ``payload`` to stdout as one JSON line and flush it immediately."""
    sys.stdout.write(json.dumps(payload) + "\n")
    sys.stdout.flush()
|
||||||
|
|
||||||
|
|
||||||
|
def load_model(model_name, device, compute_type):
    """Import faster-whisper and construct the model, or report a fatal error and exit.

    Args:
        model_name: Model name passed to ``WhisperModel``.
        device: Value passed as ``device``.
        compute_type: Value passed as ``compute_type``.

    Returns:
        The constructed ``WhisperModel`` instance.

    Raises:
        SystemExit: With code 1, after emitting a ``fatal`` event, when the
            package cannot be imported or the model cannot be constructed.
    """
    try:
        from faster_whisper import WhisperModel
    except Exception as exc:
        # The import can fail for reasons other than a missing package, so the
        # underlying error is included instead of only claiming "not installed".
        emit(
            {
                "type": "fatal",
                "message": (
                    f"faster-whisper could not be imported ({exc}). "
                    "Run `python3 -m pip install -r server/requirements-whisper.txt`."
                ),
            }
        )
        raise SystemExit(1) from exc

    try:
        return WhisperModel(model_name, device=device, compute_type=compute_type)
    except Exception as exc:
        emit(
            {
                "type": "fatal",
                "message": f"Could not load the faster-whisper model '{model_name}': {exc}",
            }
        )
        raise SystemExit(1) from exc
|
||||||
|
|
||||||
|
|
||||||
|
def transcribe(model, request_id, audio_path):
    """Transcribe one audio file and emit the outcome for ``request_id``.

    Emits a ``result`` event carrying the stripped transcript, or an ``error``
    event carrying the exception text when anything in the attempt raises.
    """
    try:
        segments, _info = model.transcribe(audio_path, vad_filter=True, beam_size=5)
        pieces = [segment.text for segment in segments]
        transcript = "".join(pieces).strip()
        emit({"type": "result", "requestId": request_id, "text": transcript})
    except Exception as exc:
        failure = {
            "type": "error",
            "requestId": request_id,
            "message": f"Whisper transcription failed: {exc}",
        }
        emit(failure)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the persistent transcription worker.

    Parses ``--model``, ``--device`` and ``--compute-type``, loads the model,
    emits a ``ready`` event, then handles one JSON request per stdin line until
    stdin is exhausted. Blank lines are skipped. Requests that are not valid
    JSON objects, or that lack ``requestId`` / ``audioPath``, produce an
    ``error`` event and the loop continues.

    Returns:
        ``0`` once stdin has been fully consumed.
    """
    parser = argparse.ArgumentParser(description="Persistent faster-whisper transcription worker")
    parser.add_argument("--model", default="small")
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--compute-type", default="int8")
    args = parser.parse_args()

    model = load_model(args.model, args.device, args.compute_type)
    emit({"type": "ready", "model": args.model})

    for raw_line in sys.stdin:
        line = raw_line.strip()

        if not line:
            continue

        try:
            payload = json.loads(line)
        except Exception as exc:
            emit({"type": "error", "message": f"Invalid request JSON: {exc}"})
            continue

        # json.loads also accepts lists, strings and numbers; calling .get() on
        # those would raise AttributeError and take the whole worker down.
        if not isinstance(payload, dict):
            emit({"type": "error", "message": "Invalid request JSON: expected an object."})
            continue

        request_id = payload.get("requestId")
        audio_path = payload.get("audioPath")

        if not request_id or not audio_path:
            emit(
                {
                    "type": "error",
                    "requestId": request_id,
                    "message": "Missing requestId or audioPath.",
                }
            )
            continue

        transcribe(model, request_id, audio_path)

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: exit with whatever status main() returns.
if __name__ == "__main__":
    sys.exit(main())
|
||||||
@@ -1,5 +1,7 @@
|
|||||||
import crypto from 'node:crypto';
|
import crypto from 'node:crypto';
|
||||||
import fs from 'node:fs';
|
import fs from 'node:fs';
|
||||||
|
import fsPromises from 'node:fs/promises';
|
||||||
|
import os from 'node:os';
|
||||||
import path from 'node:path';
|
import path from 'node:path';
|
||||||
import { fileURLToPath } from 'node:url';
|
import { fileURLToPath } from 'node:url';
|
||||||
import { TextEncoder } from 'node:util';
|
import { TextEncoder } from 'node:util';
|
||||||
@@ -23,6 +25,8 @@ import { Redis } from 'ioredis';
|
|||||||
import type WebSocket from 'ws';
|
import type WebSocket from 'ws';
|
||||||
import { z } from 'zod';
|
import { z } from 'zod';
|
||||||
|
|
||||||
|
import { WhisperTranscriber } from './whisper-transcriber.js';
|
||||||
|
|
||||||
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
|
dotenv.config({ path: fileURLToPath(new URL('../../.env', import.meta.url)) });
|
||||||
|
|
||||||
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
|
const projectRootPath = fileURLToPath(new URL('../../', import.meta.url));
|
||||||
@@ -120,6 +124,12 @@ type ClientMessage =
|
|||||||
}
|
}
|
||||||
| {
|
| {
|
||||||
type: 'ping';
|
type: 'ping';
|
||||||
|
}
|
||||||
|
| {
|
||||||
|
type: 'speech-transcription';
|
||||||
|
requestId: string;
|
||||||
|
mimeType: string;
|
||||||
|
audioBase64: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
type ServerMessage =
|
type ServerMessage =
|
||||||
@@ -142,6 +152,16 @@ type ServerMessage =
|
|||||||
peerId: string;
|
peerId: string;
|
||||||
message: string;
|
message: string;
|
||||||
}
|
}
|
||||||
|
| {
|
||||||
|
type: 'speech-transcribed';
|
||||||
|
requestId: string;
|
||||||
|
text: string;
|
||||||
|
}
|
||||||
|
| {
|
||||||
|
type: 'speech-transcription-error';
|
||||||
|
requestId: string;
|
||||||
|
message: string;
|
||||||
|
}
|
||||||
| { type: 'pong' }
|
| { type: 'pong' }
|
||||||
| { type: 'error'; message: string };
|
| { type: 'error'; message: string };
|
||||||
|
|
||||||
@@ -289,6 +309,12 @@ const signalMessageSchema = z.discriminatedUnion('type', [
|
|||||||
z.object({
|
z.object({
|
||||||
type: z.literal('ping'),
|
type: z.literal('ping'),
|
||||||
}),
|
}),
|
||||||
|
z.object({
|
||||||
|
type: z.literal('speech-transcription'),
|
||||||
|
requestId: z.string().uuid(),
|
||||||
|
mimeType: z.string().trim().min(1).max(128),
|
||||||
|
audioBase64: z.string().min(1).max(32_000_000),
|
||||||
|
}),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const app = Fastify({ logger: true, trustProxy: true });
|
const app = Fastify({ logger: true, trustProxy: true });
|
||||||
@@ -307,6 +333,11 @@ const frontendDistPath = resolveProjectPath(
|
|||||||
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
|
const ollamaServerUrl = (process.env.PRIVATECHAT_OLLAMA_URL ?? 'http://192.168.1.19:11434').replace(/\/+$/, '');
|
||||||
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
|
const ollamaImageModel = process.env.PRIVATECHAT_OLLAMA_IMAGE_MODEL ?? 'x/z-image-turbo:latest';
|
||||||
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
|
const ollamaImageSize = process.env.PRIVATECHAT_OLLAMA_IMAGE_SIZE ?? '1024x1024';
|
||||||
|
// Whisper dictation settings. Each value comes from its PRIVATECHAT_WHISPER_*
// environment variable and falls back to the literal on the right when that
// variable is not set. The script path is resolved through resolveProjectPath.
const whisperPythonExecutable = process.env.PRIVATECHAT_WHISPER_PYTHON ?? 'python3';
|
||||||
|
const whisperModel = process.env.PRIVATECHAT_WHISPER_MODEL ?? 'small';
|
||||||
|
const whisperDevice = process.env.PRIVATECHAT_WHISPER_DEVICE ?? 'cpu';
|
||||||
|
const whisperComputeType = process.env.PRIVATECHAT_WHISPER_COMPUTE_TYPE ?? 'int8';
|
||||||
|
const whisperScriptPath = resolveProjectPath('server/scripts/transcribe_whisper.py');
|
||||||
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
|
const sessionTtlSeconds = Number(process.env.SESSION_TTL_SECONDS ?? 60 * 60 * 12);
|
||||||
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
|
const webAuthnChallengeTtlSeconds = Number(process.env.WEBAUTHN_CHALLENGE_TTL_SECONDS ?? 5 * 60);
|
||||||
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
|
const allowedCorsOrigins = parseAllowedOrigins(process.env.CORS_ORIGIN);
|
||||||
@@ -320,6 +351,17 @@ const webAuthnUserVerification = resolveWebAuthnUserVerification(
|
|||||||
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
|
const frontendIndexPath = path.join(frontendDistPath, 'index.html');
|
||||||
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
|
const hasFrontendBuild = fs.existsSync(frontendIndexPath);
|
||||||
|
|
||||||
|
// Single WhisperTranscriber instance for this module, built from the Whisper
// settings declared earlier in this file and logging through the Fastify
// application logger.
const whisperTranscriber = new WhisperTranscriber(
|
||||||
|
{
|
||||||
|
pythonExecutable: whisperPythonExecutable,
|
||||||
|
scriptPath: whisperScriptPath,
|
||||||
|
model: whisperModel,
|
||||||
|
device: whisperDevice,
|
||||||
|
computeType: whisperComputeType,
|
||||||
|
},
|
||||||
|
app.log,
|
||||||
|
);
|
||||||
|
|
||||||
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
|
fs.mkdirSync(path.dirname(sqlitePath), { recursive: true });
|
||||||
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
|
fs.mkdirSync(path.dirname(masterKeyPath), { recursive: true });
|
||||||
|
|
||||||
@@ -1127,6 +1169,27 @@ async function handleSocketMessage(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (parsed.type === 'speech-transcription') {
|
||||||
|
try {
|
||||||
|
const text = await transcribeAudioPayload(parsed.requestId, parsed.audioBase64, parsed.mimeType);
|
||||||
|
|
||||||
|
send(socket, {
|
||||||
|
type: 'speech-transcribed',
|
||||||
|
requestId: parsed.requestId,
|
||||||
|
text,
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
app.log.warn({ err: error, userId }, 'Whisper transcription failed');
|
||||||
|
send(socket, {
|
||||||
|
type: 'speech-transcription-error',
|
||||||
|
requestId: parsed.requestId,
|
||||||
|
message: error instanceof Error ? error.message : 'Speech transcription failed.',
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
let delivered = 0;
|
let delivered = 0;
|
||||||
const recipientSockets = socketsByUserId.get(parsed.to);
|
const recipientSockets = socketsByUserId.get(parsed.to);
|
||||||
|
|
||||||
@@ -1668,6 +1731,15 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (parsed.data.type === 'speech-transcription') {
|
||||||
|
return {
|
||||||
|
type: 'speech-transcription',
|
||||||
|
requestId: parsed.data.requestId,
|
||||||
|
mimeType: parsed.data.mimeType,
|
||||||
|
audioBase64: parsed.data.audioBase64,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
type: 'signal',
|
type: 'signal',
|
||||||
to: parsed.data.to,
|
to: parsed.data.to,
|
||||||
@@ -1675,6 +1747,42 @@ function parseClientMessage(rawMessage: string): ClientMessage | null {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Transcribes a base64-encoded audio recording with the shared Whisper transcriber.
 *
 * The audio is decoded into a file inside a freshly created temporary
 * directory; that directory is removed again whether transcription succeeds
 * or fails.
 *
 * @param requestId Identifier used in the temporary file name and forwarded to the transcriber.
 * @param audioBase64 Base64-encoded audio data.
 * @param mimeType MIME type of the audio, used to choose the file extension.
 * @returns The text produced by `whisperTranscriber.transcribe`.
 */
async function transcribeAudioPayload(requestId: string, audioBase64: string, mimeType: string): Promise<string> {
  const scratchDirectory = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'privatechat-whisper-'));
  const fileName = `dictation-${requestId}.${audioExtensionForMimeType(mimeType)}`;
  const audioPath = path.join(scratchDirectory, fileName);

  try {
    const audioBytes = Buffer.from(audioBase64, 'base64');
    await fsPromises.writeFile(audioPath, audioBytes);

    const transcript = await whisperTranscriber.transcribe(requestId, audioPath);
    return transcript;
  } finally {
    // Always clean up, including when writing or transcribing throws.
    await fsPromises.rm(scratchDirectory, { recursive: true, force: true });
  }
}
|
||||||
|
|
||||||
|
/**
 * Maps an audio MIME type to the file extension used for the temporary
 * dictation file.
 *
 * Only the bare media type is matched: parameters such as `;codecs=opus`
 * (with or without whitespace around the semicolon) are ignored, so
 * `audio/ogg; codecs=opus` and `audio/mp4;codecs=mp4a.40.2` map to the same
 * extension as their unparameterized forms instead of falling through to the
 * default.
 *
 * @param mimeType MIME type reported for the recorded audio.
 * @returns File extension without the leading dot; `webm` when the type is not recognized.
 */
function audioExtensionForMimeType(mimeType: string): string {
  // Drop everything after the first ';' so codec parameters never affect the lookup.
  const [mediaType = ''] = mimeType.split(';', 1);

  switch (mediaType.trim().toLowerCase()) {
    case 'audio/webm':
      return 'webm';
    case 'audio/ogg':
      return 'ogg';
    case 'audio/mp4':
    case 'audio/m4a':
    case 'audio/x-m4a':
      return 'm4a';
    case 'audio/mpeg':
    case 'audio/mp3':
      return 'mp3';
    case 'audio/wav':
    case 'audio/wave':
    case 'audio/x-wav':
      return 'wav';
    default:
      return 'webm';
  }
}
|
||||||
|
|
||||||
async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {
|
async function generateImageFromPrompt(prompt: string): Promise<{ imageBase64: string; mimeType: string }> {
|
||||||
const abortController = new AbortController();
|
const abortController = new AbortController();
|
||||||
const timeoutId = setTimeout(() => abortController.abort(), 120_000);
|
const timeoutId = setTimeout(() => abortController.abort(), 120_000);
|
||||||
|
|||||||
176
server/src/whisper-transcriber.ts
Normal file
176
server/src/whisper-transcriber.ts
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process';
|
||||||
|
import { createInterface } from 'node:readline';
|
||||||
|
|
||||||
|
/** Minimal logger surface the transcriber relies on. */
type LoggerLike = {
  info: (payload: unknown, message?: string) => void;
  warn: (payload: unknown, message?: string) => void;
  error: (payload: unknown, message?: string) => void;
};

/** Settings used to spawn the worker process. */
type WhisperTranscriberOptions = {
  pythonExecutable: string;
  scriptPath: string;
  model: string;
  device: string;
  computeType: string;
};

/** Events the worker prints on stdout, one JSON object per line. */
type WorkerEvent =
  | { type: 'ready'; model: string }
  | { type: 'result'; requestId: string; text: string }
  | { type: 'error'; requestId?: string; message: string }
  | { type: 'fatal'; message: string };

/** Resolve/reject pair stored for each in-flight transcription. */
type PendingRequest = { resolve: (text: string) => void; reject: (reason?: unknown) => void };

/**
 * Validates one decoded stdout message from the Whisper worker.
 *
 * @param value Result of `JSON.parse` on a worker output line.
 * @returns A well-formed event, or `null` when the value is not a usable worker event.
 */
function parseWorkerEvent(value: unknown): WorkerEvent | null {
  if (typeof value !== 'object' || value === null) {
    return null;
  }

  const { type, requestId, message, text, model } = value as Record<string, unknown>;
  const id = typeof requestId === 'string' && requestId !== '' ? requestId : undefined;
  const detail = typeof message === 'string' ? message : 'The Whisper worker reported an unspecified error.';

  switch (type) {
    case 'ready':
      return { type: 'ready', model: typeof model === 'string' ? model : '' };
    case 'fatal':
      return { type: 'fatal', message: detail };
    case 'error':
      return id === undefined ? { type: 'error', message: detail } : { type: 'error', requestId: id, message: detail };
    case 'result':
      if (id === undefined) {
        return null;
      }
      if (typeof text !== 'string') {
        // Surface a malformed result as a request error so the caller is not left waiting.
        return { type: 'error', requestId: id, message: 'The Whisper worker returned a result without text.' };
      }
      return { type: 'result', requestId: id, text };
    default:
      return null;
  }
}

/**
 * Runs transcription requests through a long-lived Whisper worker process.
 *
 * The worker is spawned lazily on the first request. Requests are written to
 * its stdin as newline-delimited JSON, and `ready` / `result` / `error` /
 * `fatal` events are read line by line from its stdout. When the worker fails
 * it is terminated, every pending request is rejected, and the next request
 * spawns a fresh worker.
 */
export class WhisperTranscriber {
  private worker: ChildProcessWithoutNullStreams | null = null;
  private readyPromise: Promise<void> | null = null;
  private resolveReady: (() => void) | null = null;
  private rejectReady: ((reason?: unknown) => void) | null = null;
  private readonly pendingRequests = new Map<string, PendingRequest>();

  /**
   * @param options Executable, script path and model settings passed to the worker on spawn.
   * @param logger Logger used for worker diagnostics.
   */
  constructor(
    private readonly options: WhisperTranscriberOptions,
    private readonly logger: LoggerLike,
  ) {}

  /**
   * Asks the worker to transcribe the audio file at `audioPath`.
   *
   * @param requestId Identifier echoed back by the worker; must not already be in flight.
   * @param audioPath Path of the audio file the worker should read.
   * @returns The trimmed transcript text.
   * @throws When the worker is unavailable, the id is already pending, the request
   *         cannot be written, or the worker reports an error.
   */
  async transcribe(requestId: string, audioPath: string): Promise<string> {
    await this.ensureWorker();

    const worker = this.worker;

    if (!worker || worker.stdin.destroyed) {
      throw new Error('The Whisper worker is not available.');
    }

    if (this.pendingRequests.has(requestId)) {
      // Overwriting the entry would leave the first caller's promise unsettled forever.
      throw new Error(`A Whisper transcription with request id ${requestId} is already in progress.`);
    }

    return new Promise<string>((resolve, reject) => {
      const entry: PendingRequest = { resolve, reject };
      this.pendingRequests.set(requestId, entry);

      const abandon = (error: unknown): void => {
        if (this.pendingRequests.get(requestId) === entry) {
          this.pendingRequests.delete(requestId);
          reject(error);
        }
      };

      try {
        // Write failures are reported asynchronously, so they must be observed via the callback.
        worker.stdin.write(
          `${JSON.stringify({ type: 'transcribe', requestId, audioPath })}\n`,
          (error?: Error | null) => {
            if (error) {
              abandon(error);
            }
          },
        );
      } catch (error) {
        abandon(error);
      }
    });
  }

  /**
   * Spawns the worker if none is starting or running.
   *
   * @returns A promise that resolves once the worker has reported `ready`.
   */
  private async ensureWorker(): Promise<void> {
    if (this.readyPromise) {
      return this.readyPromise;
    }

    const worker = spawn(
      this.options.pythonExecutable,
      [
        this.options.scriptPath,
        '--model',
        this.options.model,
        '--device',
        this.options.device,
        '--compute-type',
        this.options.computeType,
      ],
      { stdio: ['pipe', 'pipe', 'pipe'] },
    );
    this.worker = worker;

    this.readyPromise = new Promise<void>((resolve, reject) => {
      this.resolveReady = resolve;
      this.rejectReady = reject;
    });

    // Listeners stay attached for the life of the process; events from a worker
    // that has already been replaced are ignored instead of corrupting current state.
    const isCurrent = (): boolean => this.worker === worker;

    const stdout = createInterface({ input: worker.stdout });
    stdout.on('line', (line) => {
      if (isCurrent()) {
        this.handleWorkerLine(line);
      }
    });

    worker.stderr.on('data', (chunk) => {
      const message = chunk.toString().trim();

      if (message) {
        this.logger.warn({ whisperStderr: message }, 'Whisper worker stderr');
      }
    });

    // Without a listener, a broken stdin pipe would surface as an unhandled 'error' event.
    worker.stdin.on('error', (error: unknown) => {
      if (isCurrent()) {
        this.failWorker(error instanceof Error ? error : new Error('The Whisper worker stdin failed.'));
      }
    });

    worker.on('error', (error) => {
      if (isCurrent()) {
        this.failWorker(error instanceof Error ? error : new Error('The Whisper worker could not start.'));
      }
    });

    worker.on('exit', (code, signal) => {
      if (isCurrent()) {
        this.failWorker(
          new Error(`The Whisper worker exited unexpectedly (code=${code ?? 'null'}, signal=${signal ?? 'null'}).`),
        );
      }
    });

    return this.readyPromise;
  }

  /**
   * Handles one line of worker stdout.
   *
   * Lines that are not JSON, or not a recognized event, are logged and ignored.
   *
   * @param line A single stdout line without its trailing newline.
   */
  private handleWorkerLine(line: string): void {
    let decoded: unknown;

    try {
      decoded = JSON.parse(line);
    } catch {
      this.logger.warn({ whisperStdout: line }, 'Ignored non-JSON Whisper worker output');
      return;
    }

    const event = parseWorkerEvent(decoded);

    if (!event) {
      this.logger.warn({ whisperStdout: line }, 'Ignored unrecognized Whisper worker output');
      return;
    }

    switch (event.type) {
      case 'ready':
        this.logger.info({ model: event.model }, 'Whisper worker ready');
        this.resolveReady?.();
        this.resolveReady = null;
        this.rejectReady = null;
        return;
      case 'fatal':
        this.failWorker(new Error(event.message));
        return;
      case 'error':
        if (!event.requestId) {
          // An error that cannot be attributed to a request is treated as a worker failure.
          this.failWorker(new Error(event.message));
          return;
        }
        this.takePendingRequest(event.requestId)?.reject(new Error(event.message));
        return;
      case 'result':
        this.takePendingRequest(event.requestId)?.resolve(event.text.trim());
        return;
    }
  }

  /**
   * Removes and returns the pending entry for `requestId`, if any.
   *
   * @param requestId Identifier of the request being settled.
   * @returns The stored resolve/reject pair, or `undefined` when nothing is pending under that id.
   */
  private takePendingRequest(requestId: string): PendingRequest | undefined {
    const pending = this.pendingRequests.get(requestId);

    if (pending) {
      this.pendingRequests.delete(requestId);
    }

    return pending;
  }

  /**
   * Tears down the current worker and rejects everything waiting on it.
   *
   * The child process is killed if it is still running so that a replacement
   * worker does not leave the old one behind.
   *
   * @param error Reason passed to the readiness promise and to each pending request.
   */
  private failWorker(error: Error): void {
    const worker = this.worker;
    const rejectReady = this.rejectReady;
    const pending = [...this.pendingRequests.values()];

    this.worker = null;
    this.resolveReady = null;
    this.rejectReady = null;
    this.readyPromise = null;
    this.pendingRequests.clear();

    if (worker && worker.exitCode === null && worker.signalCode === null) {
      try {
        worker.kill();
      } catch (killError) {
        this.logger.warn({ err: killError }, 'Could not terminate the Whisper worker');
      }
    }

    rejectReady?.(error);

    for (const { reject } of pending) {
      reject(error);
    }

    this.logger.error({ err: error }, 'Whisper worker failed');
  }
}
|
||||||
Reference in New Issue
Block a user