main.ts
import {
Hume,
HumeClient,
convertBlobToBase64,
convertBase64ToBlob,
ensureSingleValidAudioTrack,
getAudioStream,
getBrowserSupportedMimeType,
MimeType,
} from 'hume';
import { handleToolCallMessage } from './handleToolCall';
import './styles.css';
(async () => {
const startBtn =
document.querySelector<HTMLButtonElement>("button#start-btn");
const stopBtn = document.querySelector<HTMLButtonElement>("button#stop-btn");
const chat = document.querySelector<HTMLDivElement>("div#chat");
startBtn?.addEventListener("click", connect);
stopBtn?.addEventListener("click", disconnect);
/**
* the HumeClient instance, which includes methods for connecting to EVI and managing the WebSocket connection
*/
let client: HumeClient | null = null;
/**
* the WebSocket instance
*/
let socket: Hume.empathicVoice.chat.ChatSocket | null = null;
/**
* flag which denotes the intended state of the WebSocket
*/
let connected = false;
/**
* the MediaRecorder responsible for capturing the audio stream that is sent as audio input
*/
let recorder: MediaRecorder | null = null;
/**
* the stream of audio captured from the user's microphone
*/
let audioStream: MediaStream | null = null;
/**
* the current audio element to be played
*/
let currentAudio: HTMLAudioElement | null = null;
/**
* flag which denotes whether audio is currently playing or not
*/
let isPlaying = false;
/**
* flag which denotes whether to utilize chat resumability (preserve context from one chat to the next)
*/
let resumeChats = true;
/**
* The ChatGroup ID used to resume the chat if disconnected unexpectedly
*/
let chatGroupId: string | undefined;
/**
* audio playback queue
*/
const audioQueue: Blob[] = [];
/**
* mime type supported by the browser the application is running in
*/
const mimeType: MimeType = (() => {
const result = getBrowserSupportedMimeType();
return result.success ? result.mimeType : MimeType.WEBM;
})();
/**
* instantiates the HumeClient, sets up WebSocket event handlers, and establishes a secure, authenticated WebSocket connection
*/
async function connect(): Promise<void> {
// instantiate the HumeClient with credentials to make authenticated requests
if (!client) {
client = new HumeClient({
apiKey: import.meta.env.VITE_HUME_API_KEY || "",
secretKey: import.meta.env.VITE_HUME_SECRET_KEY || "",
});
}
// instantiates WebSocket and establishes an authenticated connection
socket = await client.empathicVoice.chat.connect({
// configuration that includes the get_current_weather tool
configId: import.meta.env.VITE_HUME_WEATHER_ASSISTANT_CONFIG_ID || null,
resumedChatGroupId: chatGroupId,
});
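// The VITE_* variables above are read from the environment at build time (for example,
// from a local .env file). A minimal sketch, with placeholder values:
//   VITE_HUME_API_KEY=<your Hume API key>
//   VITE_HUME_SECRET_KEY=<your Hume secret key>
//   VITE_HUME_WEATHER_ASSISTANT_CONFIG_ID=<ID of an EVI config that includes the get_current_weather tool>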
socket.on("open", handleWebSocketOpenEvent);
socket.on("message", handleWebSocketMessageEvent);
socket.on("error", handleWebSocketErrorEvent);
socket.on("close", handleWebSocketCloseEvent);
// update ui state
toggleBtnStates();
}
/**
* stops audio capture and playback, and closes the WebSocket connection
*/
function disconnect(): void {
// update ui state
toggleBtnStates();
// stop audio playback
stopAudio();
// stop audio capture
recorder?.stop();
recorder = null;
audioStream = null;
// set connected state to false to prevent automatic reconnect
connected = false;
// IF resumeChats flag is false, reset chatGroupId so a new conversation is started when reconnecting
if (!resumeChats) {
chatGroupId = undefined;
}
// close the WebSocket connection
socket?.close();
}
/**
* captures and records audio stream, and sends audio stream through the socket
*
* API Reference:
* - `audio_input`: https://dev.hume.ai/reference/empathic-voice-interface-evi/chat/chat#send.Audio%20Input.type
*/
async function captureAudio(): Promise<void> {
audioStream = await getAudioStream();
// ensure there is only one audio track in the stream
ensureSingleValidAudioTrack(audioStream);
// instantiate the media recorder
recorder = new MediaRecorder(audioStream, { mimeType });
// callback for when recorded chunk is available to be processed
recorder.ondataavailable = async ({ data }) => {
// IF size of data is smaller than 1 byte then do nothing
if (data.size < 1) return;
// base64 encode audio data
const encodedAudioData = await convertBlobToBase64(data);
// define the audio_input message JSON
const audioInput: Omit<Hume.empathicVoice.AudioInput, "type"> = {
data: encodedAudioData,
};
// send audio_input message
socket?.sendAudioInput(audioInput);
};
// capture audio input at a rate of 100ms (recommended)
const timeSlice = 100;
recorder.start(timeSlice);
}
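// Note: each recorded chunk is sent as an `audio_input` message. Per the API reference
// linked above, the serialized payload has the shape { "type": "audio_input", "data": "<base64 audio>" };
// the "type" field is omitted from the object built here because the SDK's sendAudioInput
// helper is expected to supply it.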
/**
* play the audio within the playback queue, converting each Blob into playable HTMLAudioElements
*/
function playAudio(): void {
// IF there is nothing in the audioQueue OR audio is currently playing then do nothing
if (!audioQueue.length || isPlaying) return;
// update isPlaying state
isPlaying = true;
// pull next audio output from the queue
const audioBlob = audioQueue.shift();
// IF audioBlob is unexpectedly undefined then do nothing
if (!audioBlob) return;
// converts Blob to AudioElement for playback
const audioUrl = URL.createObjectURL(audioBlob);
currentAudio = new Audio(audioUrl);
// play audio
currentAudio.play();
// callback for when audio finishes playing
currentAudio.onended = () => {
// update isPlaying state
isPlaying = false;
// attempt to pull next audio output from queue
if (audioQueue.length) playAudio();
};
}
/**
* stops audio playback, clears audio playback queue, and updates audio playback state
*/
function stopAudio(): void {
// stop the audio playback
currentAudio?.pause();
currentAudio = null;
// update audio playback state
isPlaying = false;
// clear the audioQueue
audioQueue.length = 0;
}
/**
* callback function to handle a WebSocket opened event
*/
async function handleWebSocketOpenEvent(): Promise<void> {
/* place logic here which you would like invoked when the socket opens */
console.log("Web socket connection opened");
// ensures socket will reconnect if disconnected unintentionally
connected = true;
await captureAudio();
}
/**
* callback function to handle a WebSocket message event
*
* API Reference:
* - `chat_metadata`: https://dev.hume.ai/reference/empathic-voice-interface-evi/chat/chat#receive.Chat%20Metadata.type
* - `user_message`: https://dev.hume.ai/reference/empathic-voice-interface-evi/chat/chat#receive.User%20Message.type
* - `assistant_message`: https://dev.hume.ai/reference/empathic-voice-interface-evi/chat/chat#receive.Assistant%20Message.type
* - `audio_output`: https://dev.hume.ai/reference/empathic-voice-interface-evi/chat/chat#receive.Audio%20Output.type
* - `user_interruption`: https://dev.hume.ai/reference/empathic-voice-interface-evi/chat/chat#receive.User%20Interruption.type
* - `tool_call`: https://dev.hume.ai/reference/empathic-voice-interface-evi/chat/chat#receive.Tool%20Call%20Message.type
*/
async function handleWebSocketMessageEvent(
message: Hume.empathicVoice.SubscribeEvent
): Promise<void> {
/* place logic here which you would like to invoke when receiving a message through the socket */
console.log(message);
// handle messages received through the WebSocket (messages are distinguished by their "type" field.)
switch (message.type) {
// save chat_group_id to resume chat if disconnected
case "chat_metadata":
chatGroupId = message.chatGroupId;
break;
// append user and assistant messages to UI for chat visibility
case "user_message":
case "assistant_message":
const { role, content } = message.message;
const topThreeEmotions = extractTopThreeEmotions(message);
appendMessage(role, content ?? "", topThreeEmotions);
break;
// add received audio to the playback queue, and play next audio output
case "audio_output":
// convert base64 encoded audio to a Blob
const audioOutput = message.data;
const blob = convertBase64ToBlob(audioOutput);
// add audio Blob to audioQueue
audioQueue.push(blob);
// play the next audio output
if (audioQueue.length >= 1) playAudio();
break;
// stop audio playback, clear audio playback queue, and update audio playback state on interrupt
case "user_interruption":
stopAudio();
break;
// invoke tool upon receiving a tool_call message
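// (handleToolCallMessage is defined in ./handleToolCall.ts; it is expected to run the
// requested tool, in this example get_current_weather, and report the result back over
// the socket as a tool response, or as a tool error if the call fails)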
case "tool_call":
handleToolCallMessage(message, socket);
break;
}
}
/**
* callback function to handle a WebSocket error event
*/
function handleWebSocketErrorEvent(error: Error): void {
/* place logic here which you would like invoked when receiving an error through the socket */
console.error(error);
}
/**
* callback function to handle a WebSocket closed event
*/
async function handleWebSocketCloseEvent(): Promise<void> {
/* place logic here which you would like invoked when the socket closes */
// reconnect to the socket if disconnect was unintentional
if (connected) await connect();
console.log("Web socket connection closed");
}
/**
* adds message to Chat in the webpage's UI
*
* @param role the speaker associated with the audio transcription
* @param content transcript of the audio
* @param topThreeEmotions the top three emotion prediction scores for the message
*/
function appendMessage(
role: Hume.empathicVoice.Role,
content: string,
topThreeEmotions: { emotion: string; score: string }[]
): void {
// generate chat card component with message content and emotion scores
const chatCard = new ChatCard({
role,
timestamp: new Date().toLocaleTimeString(),
content,
scores: topThreeEmotions,
});
// append chat card to the UI
chat?.appendChild(chatCard.render());
// scroll to the bottom to view most recently added message
if (chat) chat.scrollTop = chat.scrollHeight;
}
/**
* toggles `start` and `stop` buttons' disabled states
*/
function toggleBtnStates(): void {
if (startBtn) startBtn.disabled = !startBtn.disabled;
if (stopBtn) stopBtn.disabled = !stopBtn.disabled;
}
/**
* takes a received `user_message` or `assistant_message` and extracts the top 3 emotions from the
* predicted expression measurement scores.
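*
* returns an array like [{ emotion: "Joy", score: "0.87" }, ...] (illustrative values)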
*/
function extractTopThreeEmotions(
message:
| Hume.empathicVoice.UserMessage
| Hume.empathicVoice.AssistantMessage
): { emotion: string; score: string }[] {
// extract emotion scores from the message
const scores = message.models.prosody?.scores;
// convert the emotions object into an array of key-value pairs
const scoresArray = Object.entries(scores || {});
// sort the array by the values in descending order
scoresArray.sort((a, b) => b[1] - a[1]);
// extract the top three emotions and convert them back to an object
const topThreeEmotions = scoresArray
.slice(0, 3)
.map(([emotion, score]) => ({
emotion,
score: (Math.round(Number(score) * 100) / 100).toFixed(2),
}));
return topThreeEmotions;
}
})();
/**
* The code below does not pertain to the EVI implementation, and only serves to style the UI.
*/
interface Score {
emotion: string;
score: string;
}
interface ChatMessage {
role: Hume.empathicVoice.Role;
timestamp: string;
content: string;
scores: Score[];
}
class ChatCard {
private message: ChatMessage;
constructor(message: ChatMessage) {
this.message = message;
}
private createScoreItem(score: Score): HTMLElement {
const scoreItem = document.createElement('div');
scoreItem.className = 'score-item';
scoreItem.innerHTML = `${score.emotion}: <strong>${score.score}</strong>`;
return scoreItem;
}
public render(): HTMLElement {
const card = document.createElement('div');
card.className = `chat-card ${this.message.role}`;
const role = document.createElement('div');
role.className = 'role';
role.textContent =
this.message.role.charAt(0).toUpperCase() + this.message.role.slice(1);
const timestamp = document.createElement('div');
timestamp.className = 'timestamp';
timestamp.innerHTML = `<strong>${this.message.timestamp}</strong>`;
const content = document.createElement('div');
content.className = 'content';
content.textContent = this.message.content;
const scores = document.createElement('div');
scores.className = 'scores';
this.message.scores.forEach((score) => {
scores.appendChild(this.createScoreItem(score));
});
card.appendChild(role);
card.appendChild(timestamp);
card.appendChild(content);
card.appendChild(scores);
return card;
}
}