Dictation through AI

This commit is contained in:
2026-03-11 00:26:49 +01:00
parent d2c4152ea7
commit 189f989c0d
12 changed files with 911 additions and 2 deletions

View File

@@ -314,6 +314,30 @@
{{ isRecordingVoice() ? '⏹️' : '🎙️' }}
</button>
<!--
Dictation toggle button for the composer.
Disabled while the selected peer is not ready, signaling is not connected,
or a dictated recording is currently being transcribed.
The tooltip, aria-label and emoji all follow the same three states:
recording (isDictating), transcribing (isTranscribingDictation), idle.
-->
<button
class="composer-dictation"
type="button"
[disabled]="!session.isSelectedPeerReady() || session.signalingState() !== 'connected' || isTranscribingDictation()"
(click)="toggleDictation(composerTextarea)"
[title]="
isDictating()
? 'Stop dictation and transcribe'
: isTranscribingDictation()
? 'Transcribing dictated audio'
: 'Start dictation'
"
[attr.aria-label]="
isDictating()
? 'Stop dictation and transcribe'
: isTranscribingDictation()
? 'Transcribing dictated audio'
: 'Start dictation'
"
[class.composer-dictation-active]="isDictating() || isTranscribingDictation()"
>
{{ isDictating() ? '🛑' : isTranscribingDictation() ? '⏳' : '🗣️' }}
</button>
<input
#fileInput
class="composer-file-input"

View File

@@ -357,6 +357,7 @@
.composer-camera,
.composer-call,
.composer-dictation,
.composer-hangup,
.composer-voice,
.composer-image-generate,
@@ -398,6 +399,12 @@
background: var(--badge-background);
}
/* Base look of the dictation toggle button: page text colour on a pink gradient. */
.composer-dictation {
color: var(--page-text);
background: linear-gradient(135deg, #f6d8ff, #ffcadb);
}
.composer-dictation-active,
.composer-hangup,
.composer-voice-recording {
color: #fff;

View File

@@ -1,5 +1,5 @@
import { CommonModule } from '@angular/common';
import { Component, computed, effect, ElementRef, inject, OnDestroy, signal, ViewChild } from '@angular/core';
import { Component, computed, effect, ElementRef, inject, NgZone, OnDestroy, signal, ViewChild } from '@angular/core';
import { toSignal } from '@angular/core/rxjs-interop';
import { FormsModule } from '@angular/forms';
import { ActivatedRoute, Router, RouterLink } from '@angular/router';
@@ -18,6 +18,7 @@ import type { ChatEntry, ConnectionState, PeerSummary } from './models';
export class ChatPageComponent implements OnDestroy {
private readonly route = inject(ActivatedRoute);
private readonly router = inject(Router);
private readonly ngZone = inject(NgZone);
private readonly routeParamMap = toSignal(this.route.paramMap, {
initialValue: this.route.snapshot.paramMap,
});
@@ -28,6 +29,14 @@ export class ChatPageComponent implements OnDestroy {
private voiceChunks: Blob[] = [];
private discardRecordedVoice = false;
private recordingPeerId: string | null = null;
/** Recorder for the dictation in progress; null when no dictation is recording. */
private dictationRecorder: MediaRecorder | null = null;
/** Microphone stream feeding the dictation recorder; its tracks are stopped on cleanup. */
private dictationStream: MediaStream | null = null;
/** Audio chunks collected from the recorder's dataavailable events. */
private dictationChunks: Blob[] = [];
/** Composer text that the transcript is merged onto once transcription completes. */
private dictationBaseText = '';
/** Read by the recorder's stop handler: when true the recorded audio is dropped instead of transcribed. */
private discardRecordedDictation = false;
/** Settles when the current dictation (recording plus transcription) has finished; awaited by stopDictation. */
private dictationCompletionPromise: Promise<void> | null = null;
/** Resolver for dictationCompletionPromise. */
private resolveDictationCompletion: (() => void) | null = null;
/** Incremented on every start and every discard; a transcription only applies its result if its token is still current. */
private dictationApplyToken = 0;
@ViewChild('callAudioElement')
set callAudioElementRef(value: ElementRef<HTMLAudioElement> | undefined) {
this.callAudioElement = value;
@@ -39,6 +48,8 @@ export class ChatPageComponent implements OnDestroy {
readonly forwardingEntryId = signal<string | null>(null);
readonly emojiPickerOpen = signal(false);
readonly isRecordingVoice = signal(false);
/** True while the dictation recorder is capturing audio. */
readonly isDictating = signal(false);
/** True while recorded dictation audio is waiting for its transcript. */
readonly isTranscribingDictation = signal(false);
readonly emojiOptions = [
'😀', '😁', '😂', '🤣', '😊',
'😉', '😍', '😘', '😎', '🤔',
@@ -152,6 +163,7 @@ export class ChatPageComponent implements OnDestroy {
}
/**
 * Tears down media resources owned by the component: discards any dictation
 * in progress, discards any voice recording and detaches the call audio source.
 */
ngOnDestroy(): void {
// stopDictation is async; the hook does not wait for it, hence the explicit `void`.
void this.stopDictation(true);
this.stopVoiceRecording(true);
this.detachCallAudioSource();
}
@@ -174,6 +186,7 @@ export class ChatPageComponent implements OnDestroy {
return;
}
await this.stopDictation(false);
await this.session.sendText(peerId, this.messageText);
this.messageText = '';
this.emojiPickerOpen.set(false);
@@ -188,6 +201,7 @@ export class ChatPageComponent implements OnDestroy {
return;
}
await this.stopDictation(false);
const requested = await this.session.requestGeneratedImage(peerId, this.messageText);
if (!requested) {
@@ -262,6 +276,92 @@ export class ChatPageComponent implements OnDestroy {
input.value = '';
}
/**
 * Starts a dictation recording, or stops the one in progress and sends it for
 * transcription.
 *
 * Does nothing while a previous recording is still being transcribed or when
 * no peer is selected. Reports an error through `session.error` when the
 * browser lacks `MediaRecorder` / `getUserMedia` or when recording cannot be
 * started.
 *
 * @param textarea Composer textarea that receives the transcript and focus
 *                 once transcription has finished.
 */
async toggleDictation(textarea: HTMLTextAreaElement): Promise<void> {
  if (this.isDictating()) {
    await this.stopDictation(false);
    return;
  }

  if (this.isTranscribingDictation()) {
    return;
  }

  const peerId = this.peerId();
  if (!peerId) {
    return;
  }

  if (typeof MediaRecorder === 'undefined' || typeof navigator === 'undefined') {
    this.session.error.set('This browser does not support dictation recording.');
    return;
  }

  if (typeof navigator.mediaDevices?.getUserMedia !== 'function') {
    this.session.error.set('This browser cannot access the microphone for dictation.');
    return;
  }

  this.dictationBaseText = this.messageText;
  this.discardRecordedDictation = false;
  this.dictationApplyToken += 1;
  // Captured BEFORE the await so that a discard or a second start request
  // arriving while the permission prompt is open can be detected afterwards.
  const applyToken = this.dictationApplyToken;
  let stream: MediaStream | null = null;

  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });

    if (applyToken !== this.dictationApplyToken) {
      // Superseded while waiting for the microphone (discarded, component
      // destroyed, or the button was clicked again): release the microphone
      // and leave the shared dictation state to whoever superseded us.
      for (const track of stream.getTracks()) {
        track.stop();
      }
      return;
    }

    const preferredMimeType = this.preferredVoiceMimeType();
    const recorder = preferredMimeType
      ? new MediaRecorder(stream, { mimeType: preferredMimeType })
      : new MediaRecorder(stream);
    // Set by onerror so the `stop` event that follows a recorder error does
    // not transcribe partial audio on top of already-reset dictation state.
    let recordingFailed = false;

    this.dictationChunks = [];
    this.dictationStream = stream;
    this.dictationRecorder = recorder;
    this.dictationCompletionPromise = new Promise<void>((resolve) => {
      this.resolveDictationCompletion = resolve;
    });

    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        this.dictationChunks.push(event.data);
      }
    };

    recorder.onerror = () => {
      recordingFailed = true;
      this.ngZone.run(() => {
        this.session.error.set('Could not record dictation audio.');
        this.cleanupDictationRecorder();
        this.finishDictationCompletion();
      });
    };

    recorder.onstop = () => {
      if (recordingFailed) {
        // onerror already cleaned up and settled this dictation.
        return;
      }

      // Snapshot before cleanup, which resets both the flag and the chunks.
      const shouldDiscard = this.discardRecordedDictation;
      const mimeType = recorder.mimeType || preferredMimeType || 'audio/webm';
      const blob = new Blob(this.dictationChunks, { type: mimeType });

      this.ngZone.run(() => {
        this.cleanupDictationRecorder();
        if (shouldDiscard || blob.size === 0) {
          this.finishDictationCompletion();
          return;
        }

        this.isTranscribingDictation.set(true);
        void this.transcribeDictation(blob, textarea, applyToken);
      });
    };

    recorder.start();
    this.isDictating.set(true);
    this.session.error.set(null);
  } catch {
    // A stream that was acquired but never stored on the component (e.g. the
    // MediaRecorder constructor threw) would otherwise keep the microphone on.
    if (stream && stream !== this.dictationStream) {
      for (const track of stream.getTracks()) {
        track.stop();
      }
    }

    if (applyToken !== this.dictationApplyToken) {
      // A newer start or a discard owns the dictation state now; do not
      // tear it down because of this stale attempt.
      return;
    }

    this.session.error.set('Could not start dictation recording.');
    this.cleanupDictationRecorder();
    this.finishDictationCompletion();
  }
}
async toggleVoiceRecording(): Promise<void> {
if (this.isRecordingVoice()) {
this.stopVoiceRecording(false);
@@ -482,6 +582,7 @@ export class ChatPageComponent implements OnDestroy {
return;
}
await this.stopDictation(true);
this.stopVoiceRecording(true);
this.forwardingEntryId.set(null);
this.emojiPickerOpen.set(false);
@@ -532,6 +633,106 @@ export class ChatPageComponent implements OnDestroy {
return candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) ?? '';
}
/**
 * Stops the dictation in progress (if any) and waits until it has fully
 * finished, including a pending transcription.
 *
 * @param discard When true the recording/transcript is thrown away and the
 *                composer text is restored; when false the recording is
 *                handed to transcription.
 */
private async stopDictation(discard: boolean): Promise<void> {
// Captured up front: the completion fields may be reset further down.
const completion = this.dictationCompletionPromise;
if (discard) {
// Bumping the token makes an in-flight transcription ignore its result.
this.dictationApplyToken += 1;
// Restore the text captured for this dictation; an empty base text leaves the composer untouched.
this.messageText = this.dictationBaseText || this.messageText;
this.handleMessageTextChange(this.messageText);
this.isTranscribingDictation.set(false);
} else {
// Re-base on the current composer text so the transcript is merged onto whatever is in the composer now.
this.dictationBaseText = this.messageText;
}
if (this.dictationRecorder) {
// Read by the recorder's stop handler to decide between discard and transcription.
this.discardRecordedDictation = discard;
if (this.dictationRecorder.state !== 'inactive') {
this.dictationRecorder.stop();
} else {
// Recorder already inactive: clean up and settle directly.
this.cleanupDictationRecorder();
this.finishDictationCompletion();
}
} else if (!completion) {
// Nothing recording and nothing pending: drop the base text set above.
this.dictationBaseText = '';
}
if (completion) {
await completion;
}
}
/**
 * Releases the dictation microphone stream and resets all recorder-related
 * state, including the `isDictating` signal.
 */
private cleanupDictationRecorder(): void {
  // Stopping every track releases the microphone.
  this.dictationStream?.getTracks().forEach((mediaTrack) => {
    mediaTrack.stop();
  });

  this.dictationStream = null;
  this.dictationRecorder = null;
  this.discardRecordedDictation = false;
  this.dictationChunks = [];
  this.isDictating.set(false);
}
/**
 * Settles the current dictation: resolves the completion promise (if one is
 * pending) and clears the completion fields and the dictation base text.
 */
private finishDictationCompletion(): void {
  const settle = this.resolveDictationCompletion;

  this.dictationBaseText = '';
  this.dictationCompletionPromise = null;
  this.resolveDictationCompletion = null;

  if (settle) {
    settle();
  }
}
/**
 * Sends recorded dictation audio for transcription and merges the transcript
 * into the composer.
 *
 * The result (or the failure message) is only applied when `applyToken` still
 * matches the current dictation token, i.e. the dictation has not been
 * discarded or superseded in the meantime.
 *
 * @param blob       Recorded dictation audio.
 * @param textarea   Composer textarea that receives the merged text.
 * @param applyToken Dictation token captured when the recording started.
 */
private async transcribeDictation(blob: Blob, textarea: HTMLTextAreaElement, applyToken: number): Promise<void> {
  // Remember which completion belongs to THIS dictation. If the dictation is
  // discarded and a new one starts before the transcript arrives, the shared
  // fields will point at the new session and must not be touched from here.
  const resolveCompletion = this.resolveDictationCompletion;

  try {
    const transcript = await this.session.requestSpeechTranscription(blob);
    if (applyToken !== this.dictationApplyToken) {
      return;
    }

    this.applyDictatedText(this.mergeDictatedText(this.dictationBaseText, transcript), textarea);
  } catch {
    if (applyToken === this.dictationApplyToken) {
      this.session.error.set('Dictation transcription failed.');
    }
  } finally {
    if (applyToken === this.dictationApplyToken) {
      this.isTranscribingDictation.set(false);
    }

    if (this.resolveDictationCompletion === resolveCompletion) {
      this.finishDictationCompletion();
    } else {
      // A newer dictation owns the completion fields and the base text.
      // Only release whoever is still awaiting this (older) dictation.
      resolveCompletion?.();
    }
  }
}
/**
 * Appends a transcript to existing composer text.
 *
 * @param baseText   Text already present in the composer.
 * @param transcript Raw transcript text.
 * @returns `baseText` unchanged when the transcript is blank; the trimmed
 *          transcript alone when `baseText` is blank; otherwise both joined
 *          by a single space with trailing whitespace of `baseText` removed.
 */
private mergeDictatedText(baseText: string, transcript: string): string {
  const spoken = transcript.trim();
  if (spoken === '') {
    return baseText;
  }

  const existing = baseText.trimEnd();
  return existing === '' ? spoken : `${existing} ${spoken}`;
}
/**
 * Writes dictated text into the composer model and the textarea, places the
 * caret at the end and re-focuses the textarea.
 *
 * @param text     Full composer text to apply.
 * @param textarea Composer textarea element.
 */
private applyDictatedText(text: string, textarea: HTMLTextAreaElement): void {
  const caret = text.length;

  this.messageText = text;
  textarea.value = text;
  this.composerSelectionStart = caret;
  this.composerSelectionEnd = caret;
  this.handleMessageTextChange(text);

  // Focus and caret placement are deferred to a microtask.
  queueMicrotask(() => {
    textarea.focus();
    textarea.setSelectionRange(caret, caret);
  });
}
private syncCallAudioSource(): void {
const audio = this.callAudioElement?.nativeElement;

View File

@@ -170,6 +170,10 @@ export class ChatSessionService {
string,
{ peerId: string; prompt: string; waitMessageId: string }
>();
/**
 * In-flight speech transcription requests keyed by request id. Each entry
 * holds the resolve/reject callbacks of the promise returned to the caller.
 */
private readonly pendingSpeechTranscriptionRequests = new Map<
string,
{ resolve: (text: string) => void; reject: (reason?: unknown) => void }
>();
private readonly remoteVideoStreams = signal<Array<{ peerId: string; stream: MediaStream }>>([]);
private readonly remoteAudioStreams = signal<Array<{ peerId: string; stream: MediaStream }>>([]);
private readonly activeCameraPeerId = signal<string | null>(null);
@@ -916,6 +920,32 @@ export class ChatSessionService {
return true;
}
/**
 * Sends recorded audio over the signaling websocket for transcription.
 *
 * @param audioBlob Recorded audio; its MIME type is forwarded, falling back
 *                  to `audio/webm` when the blob has none.
 * @returns The transcript text delivered by the matching
 *          `speech-transcribed` event.
 * @throws Error when the signaling websocket is not open, either on entry or
 *         after the audio has been encoded. The returned promise also rejects
 *         when sending fails, when a `speech-transcription-error` event
 *         arrives for this request, or when pending requests are rejected.
 */
async requestSpeechTranscription(audioBlob: Blob): Promise<string> {
  if (!this.websocket || this.websocket.readyState !== WebSocket.OPEN) {
    throw new Error('You must be connected to signaling before using dictation.');
  }

  const requestId = crypto.randomUUID();
  const audioBase64 = await this.blobToBase64(audioBlob);

  // The socket may have been closed or replaced while the audio was being
  // encoded. Registering a request that can no longer be sent would leave
  // the returned promise pending forever, so check again.
  const websocket = this.websocket;
  if (!websocket || websocket.readyState !== WebSocket.OPEN) {
    throw new Error('Signaling connection closed during dictation.');
  }

  return new Promise<string>((resolve, reject) => {
    this.pendingSpeechTranscriptionRequests.set(requestId, { resolve, reject });

    try {
      this.error.set(null);
      websocket.send(JSON.stringify({
        type: 'speech-transcription',
        requestId,
        mimeType: audioBlob.type || 'audio/webm',
        audioBase64,
      }));
    } catch (error) {
      this.pendingSpeechTranscriptionRequests.delete(requestId);
      reject(error);
    }
  });
}
private async loadAccessKeys(): Promise<void> {
const token = this.token();
@@ -990,6 +1020,7 @@ export class ChatSessionService {
const shouldReconnect = this.websocket === websocket && !this.suppressSocketReconnect;
this.stopWebSocketHeartbeat();
this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
this.signalingState.set('disconnected');
this.status.set('Signaling connection closed.');
@@ -1014,6 +1045,7 @@ export class ChatSessionService {
private disconnectWebSocket(): void {
this.stopWebSocketHeartbeat();
this.rejectPendingSpeechTranscriptions('Signaling connection closed during dictation.');
if (this.websocket) {
this.suppressSocketReconnect = true;
@@ -1055,6 +1087,12 @@ export class ChatSessionService {
case 'image-generation-error':
this.handleGeneratedImageError(event);
break;
case 'speech-transcribed':
this.handleSpeechTranscribed(event);
break;
case 'speech-transcription-error':
this.handleSpeechTranscriptionError(event);
break;
case 'pong':
break;
case 'error':
@@ -1109,6 +1147,28 @@ export class ChatSessionService {
this.error.set(event.message);
}
/**
 * Resolves the pending transcription request named by the event with the
 * transcript text. Events for unknown request ids are ignored.
 */
private handleSpeechTranscribed(event: Extract<ServerEvent, { type: 'speech-transcribed' }>): void {
  const { requestId, text } = event;
  const waiter = this.pendingSpeechTranscriptionRequests.get(requestId);

  if (waiter) {
    this.pendingSpeechTranscriptionRequests.delete(requestId);
    waiter.resolve(text);
  }
}
/**
 * Rejects the pending transcription request named by the event (when one
 * exists) and publishes the server's message through the `error` signal.
 */
private handleSpeechTranscriptionError(event: Extract<ServerEvent, { type: 'speech-transcription-error' }>): void {
  const { requestId, message } = event;
  const waiter = this.pendingSpeechTranscriptionRequests.get(requestId);

  // Deleting an unknown id is a no-op, and the optional call skips the
  // rejection (and the Error construction) when nothing was pending.
  this.pendingSpeechTranscriptionRequests.delete(requestId);
  waiter?.reject(new Error(message));

  this.error.set(message);
}
private async restoreSession(): Promise<void> {
const token = this.token();
@@ -2024,6 +2084,18 @@ export class ChatSessionService {
}
}
/**
 * Rejects every in-flight speech transcription request with an `Error`
 * carrying `message` and empties the pending-request map.
 */
private rejectPendingSpeechTranscriptions(message: string): void {
  const waiters = Array.from(this.pendingSpeechTranscriptionRequests.values());
  this.pendingSpeechTranscriptionRequests.clear();

  waiters.forEach((waiter) => {
    waiter.reject(new Error(message));
  });
}
private clearLocalAuth(statusMessage: string): void {
this.clearWebSocketReconnect();
this.disconnectWebSocket();
@@ -2034,6 +2106,7 @@ export class ChatSessionService {
this.stopRingtone();
this.releasePreloadedRingtone();
this.pendingImageGenerationRequests.clear();
this.rejectPendingSpeechTranscriptions('Session ended during dictation.');
this.remoteVideoStreams.set([]);
this.remoteAudioStreams.set([]);
this.remoteVideoModalPeerId.set(null);
@@ -2060,6 +2133,19 @@ export class ChatSessionService {
this.removeStorage('privatechat.user');
}
/**
 * Encodes the bytes of a blob as a base64 string.
 *
 * @param blob Blob to encode.
 * @returns Base64 representation of the blob's bytes.
 */
private async blobToBase64(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer());
  // Convert in slices so the spread below never passes more than 0x8000
  // arguments to String.fromCharCode in a single call.
  const sliceLength = 0x8000;
  const pieces: string[] = [];

  for (let offset = 0; offset < bytes.length; offset += sliceLength) {
    pieces.push(String.fromCharCode(...bytes.subarray(offset, offset + sliceLength)));
  }

  return btoa(pieces.join(''));
}
private async loadPersistedMessages(userId: string): Promise<void> {
const messageEncryptionKey = this.messageEncryptionKey;

View File

@@ -130,6 +130,16 @@ export type ServerEvent =
peerId: string;
message: string;
}
| {
type: 'speech-transcribed';
requestId: string;
text: string;
}
| {
type: 'speech-transcription-error';
requestId: string;
message: string;
}
| { type: 'pong' }
| { type: 'error'; message: string };