Commit 4e3b170

ElevenLabs: support Streaming (output) endpoint & extract Voices Dropdown

enricoros committed Aug 20, 2023
1 parent 9495a50 commit 4e3b170
Showing 5 changed files with 325 additions and 41 deletions.
185 changes: 181 additions & 4 deletions src/common/util/audioUtils.ts
@@ -1,3 +1,10 @@
import * as React from 'react';

export function playSoundUrl(url: string) {
  const audio = new Audio(url);
  audio.play().catch(error => console.error('Error playing audio:', url, error));
}

export async function playSoundBuffer(audioBuffer: ArrayBuffer) {
  const audioContext = new AudioContext();
  const bufferSource = audioContext.createBufferSource();
@@ -6,7 +13,177 @@ export async function playSoundBuffer(audioBuffer: ArrayBuffer) {
  bufferSource.start();
}

-export async function playSoundUrl(url: string) {
-  const audio = new Audio(url);
-  await audio.play();
-}

/**
 * React hook: plays a sound from a URL, and optionally repeats it after a delay.
 * @param url The URL of the sound to play.
 * @param firstDelay The delay before the first play, in milliseconds.
 * @param repeatMs The delay between each repeat, in milliseconds. If 0, the sound will only play once.
 */
export function usePlaySoundUrl(url: string | null, firstDelay: number = 0, repeatMs: number = 0) {
  React.useEffect(() => {
    if (!url) return;

    let timer2: any = null;

    const playFirstTime = () => {
      const playAudio = () => playSoundUrl(url);
      playAudio();
      timer2 = repeatMs > 0 ? setInterval(playAudio, repeatMs) : null;
    };

    const timer1 = setTimeout(playFirstTime, firstDelay);

    return () => {
      clearTimeout(timer1);
      if (timer2)
        clearInterval(timer2);
    };
  }, [firstDelay, repeatMs, url]);
}


/**
 * Streams audio through a MediaSource attached to an <audio> element,
 * trimming already-played data whenever more than `bufferSizeLimit` seconds
 * are buffered ahead of the playhead.
 */
export class LiveAudioPlayer {
  private readonly audioContext: AudioContext;
  private readonly audioElement: HTMLAudioElement;
  private readonly mediaSource: MediaSource;
  private reader: ReadableStreamDefaultReader<Uint8Array> | null;
  private bufferSizeLimit: number;
  private onStart: (() => void) | null;
  private onStop: (() => void) | null;

  constructor() {
    this.audioContext = new AudioContext();
    this.audioElement = new Audio();
    this.mediaSource = new MediaSource();
    this.reader = null;
    this.bufferSizeLimit = 5; // in seconds
    this.onStart = null;
    this.onStop = null;
  }

  async EXPERIMENTAL_playStream(edgeResponse: Response) {
    if (this.reader) {
      await this.stop();
    }

    if (!edgeResponse.body) {
      return;
    }
    const edgeReadableStream = edgeResponse.body;

    // route the <audio> element through the WebAudio graph
    const sourceNode = this.audioContext.createMediaElementSource(this.audioElement);
    sourceNode.connect(this.audioContext.destination);

    const mimeType = 'audio/mpeg';
    this.mediaSource.addEventListener('sourceopen', async () => {
      const sourceBuffer: SourceBuffer = this.mediaSource.addSourceBuffer(mimeType);
      this.reader = edgeReadableStream.getReader();

      if (this.onStart) {
        this.onStart();
      }

      while (true) {
        const { done, value } = await this.reader.read();
        if (done) {
          // close the stream once the final append has settled
          sourceBuffer.onupdateend = () => this.mediaSource.endOfStream();
          break;
        }

        // wait for any in-flight buffer update to complete
        await new Promise((resolve) => {
          if (!sourceBuffer.updating) {
            resolve(null);
          } else {
            sourceBuffer.addEventListener('updateend', () => resolve(null), { once: true });
          }
        });

        if (this.audioElement.buffered.length > 0) {
          const currentTime = this.audioElement.currentTime;
          const bufferedEnd = this.audioElement.buffered.end(this.audioElement.buffered.length - 1);
          const remainingBuffer = bufferedEnd - currentTime;

          if (remainingBuffer > this.bufferSizeLimit) {
            // E: just made this a bit more resilient, but not much
            try {
              // Remove old data from the buffer
              sourceBuffer.remove(0, currentTime - 1);
              await new Promise((resolve) => {
                sourceBuffer.addEventListener('updateend', () => resolve(null), { once: true });
              });
            } catch (e) {
              console.warn('Error removing old data from the buffer:', e);
            }
          }
        }

        // Wait for the sourceBuffer to finish updating before appending new data
        await new Promise((resolve) => {
          if (!sourceBuffer.updating) {
            resolve(null);
          } else {
            sourceBuffer.addEventListener('updateend', () => resolve(null), { once: true });
          }
        });

        // Append new data to the buffer
        sourceBuffer.appendBuffer(value);
      }

      if (this.onStop) {
        this.onStop();
      }
    });

    this.audioElement.src = URL.createObjectURL(this.mediaSource);
    this.audioElement.autoplay = true;
  }

  async stop() {
    if (this.reader) {
      await this.reader.cancel();
      this.reader = null;
      this.mediaSource.endOfStream();
      this.audioElement.pause();
    }
  }

  // setOnStart(callback) {
  //   this.onStart = callback;
  // }
  //
  // setOnStop(callback) {
  //   this.onStop = callback;
  // }
}


/*export async function playLiveAudioStream(stream: ReadableStream<Uint8Array>, mimeType: string = 'audio/mpeg') {
  const mediaSource = new MediaSource();
  const audio = new Audio(URL.createObjectURL(mediaSource));
  audio.autoplay = true;
  mediaSource.addEventListener('sourceopen', async () => {
    const sourceBuffer = mediaSource.addSourceBuffer(mimeType);
    const reader = stream.getReader();
    const processStream = async () => {
      const { done, value } = await reader.read();
      if (done) {
        mediaSource.endOfStream();
        return;
      }
      if (sourceBuffer.updating) {
        await new Promise(resolve => sourceBuffer.addEventListener('updateend', resolve, { once: true }));
      }
      sourceBuffer.appendBuffer(value);
      processStream();
    };
    processStream();
  });
}*/
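
For context, a minimal sketch of how the new usePlaySoundUrl hook could be consumed; the component and the sound URL below are hypothetical, not part of this commit:

  import * as React from 'react';
  import { usePlaySoundUrl } from '~/common/util/audioUtils';

  // Hypothetical component: chime after 2s of waiting, then re-chime every 10s
  function WaitingChime(props: { waiting: boolean }) {
    usePlaySoundUrl(props.waiting ? '/sounds/chime.mp3' : null, 2000, 10000);
    return props.waiting ? <div>Waiting…</div> : null;
  }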
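
Similarly, a sketch of driving LiveAudioPlayer from a fetch Response, mirroring how EXPERIMENTAL_speakTextStream uses it in elevenlabs.client.ts below; the endpoint URL is a placeholder:

  import { LiveAudioPlayer } from '~/common/util/audioUtils';

  async function demoStreamPlayback() {
    // Placeholder endpoint: any route that streams 'audio/mpeg' in its response body works
    const response = await fetch('/api/some-streaming-audio-endpoint', { method: 'POST' });

    const player = new LiveAudioPlayer();
    await player.EXPERIMENTAL_playStream(response);

    // ...later, to interrupt playback:
    // await player.stop();
  }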
38 changes: 7 additions & 31 deletions src/modules/elevenlabs/ElevenlabsSettings.tsx
@@ -1,37 +1,29 @@
import * as React from 'react';
import { shallow } from 'zustand/shallow';

-import { Box, CircularProgress, FormControl, FormHelperText, FormLabel, Option, Radio, RadioGroup, Select, Stack, Tooltip } from '@mui/joy';
-import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
-import RecordVoiceOverIcon from '@mui/icons-material/RecordVoiceOver';

-import { apiQuery } from '~/modules/trpc/trpc.client';
import { Box, FormControl, FormHelperText, FormLabel, Radio, RadioGroup, Stack, Tooltip } from '@mui/joy';

import { FormInputKey } from '~/common/components/FormInputKey';
import { LanguageSelect } from '~/common/components/LanguageSelect';
import { settingsCol1Width, settingsGap } from '~/common/theme';

import { isElevenLabsEnabled, requireUserKeyElevenLabs } from './elevenlabs.client';
import { useElevenlabsStore } from './store-elevenlabs';
import { useVoiceDropdown } from './useVoiceDropdown';


export function ElevenlabsSettings() {
  // external state
-  const { apiKey, setApiKey, voiceId, setVoiceId, autoSpeak, setAutoSpeak } = useElevenlabsStore(state => ({
  const { apiKey, setApiKey, autoSpeak, setAutoSpeak } = useElevenlabsStore(state => ({
    apiKey: state.elevenLabsApiKey, setApiKey: state.setElevenLabsApiKey,
-    voiceId: state.elevenLabsVoiceId, setVoiceId: state.setElevenLabsVoiceId,
    autoSpeak: state.elevenLabsAutoSpeak, setAutoSpeak: state.setElevenLabsAutoSpeak,
  }), shallow);

  const requiresKey = requireUserKeyElevenLabs;
  const isValidKey = isElevenLabsEnabled(apiKey);

-  const { data: voicesData, isLoading: loadingVoices } = apiQuery.elevenlabs.listVoices.useQuery({ elevenKey: apiKey }, {
-    enabled: isValidKey,
-    staleTime: 1000 * 60 * 5, // 5 minutes
-  });
  const { hasVoices, voicesDropdown } = useVoiceDropdown(true);

-  const handleVoiceChange = (_event: any, value: string | null) => setVoiceId(value || '');

  const handleAutoSpeakChange = (e: React.ChangeEvent<HTMLInputElement>) => setAutoSpeak((e.target.value || 'off') as 'off' | 'firstLine');

@@ -68,23 +60,7 @@ export function ElevenlabsSettings() {
        <FormLabel sx={{ minWidth: settingsCol1Width }}>
          Assistant Voice
        </FormLabel>
-        <Select
-          variant='outlined' placeholder={isValidKey ? 'Select a voice' : 'Enter valid API Key'}
-          value={voiceId} onChange={handleVoiceChange}
-          startDecorator={<RecordVoiceOverIcon />}
-          endDecorator={isValidKey && loadingVoices && <CircularProgress size='sm' />}
-          indicator={<KeyboardArrowDownIcon />}
-          slotProps={{
-            root: { sx: { width: '100%' } },
-            indicator: { sx: { opacity: 0.5 } },
-          }}
-        >
-          {voicesData && voicesData.voices?.map(voice => (
-            <Option key={voice.id} value={voice.id}>
-              {voice.name}
-            </Option>
-          ))}
-        </Select>
        {voicesDropdown}
      </FormControl>

      <FormControl orientation='horizontal' sx={{ alignItems: 'center', justifyContent: 'space-between' }}>
@@ -93,8 +69,8 @@ export function ElevenlabsSettings() {
          <FormHelperText>{autoSpeak === 'off' ? 'Off' : 'First paragraph'}</FormHelperText>
        </Box>
        <RadioGroup orientation='horizontal' value={autoSpeak} onChange={handleAutoSpeakChange}>
-          <Radio disabled={!voicesData?.voices} value='off' label='Off' />
-          <Radio disabled={!voicesData?.voices} value='firstLine' label='Start' />
          <Radio disabled={!hasVoices} value='off' label='Off' />
          <Radio disabled={!hasVoices} value='firstLine' label='Start' />
          <Radio disabled={true} value='all' label='Full' />
        </RadioGroup>
      </FormControl>
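
The new useVoiceDropdown hook (imported from ./useVoiceDropdown) is not expanded in this view; inferring from the call sites above, its contract is roughly the following — an assumed sketch, not the committed implementation:

  import type { ReactNode } from 'react';

  // Assumed contract, inferred from `const { hasVoices, voicesDropdown } = useVoiceDropdown(true);`
  export interface VoiceDropdownHook {
    hasVoices: boolean;        // true once voices have loaded for the current API key
    voicesDropdown: ReactNode; // ready-to-render voice selector, wired to the voice store
  }

  // declaration only; the parameter name is a guess, and the real hook lives in useVoiceDropdown.tsx
  export declare function useVoiceDropdown(enabled: boolean): VoiceDropdownHook;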
29 changes: 25 additions & 4 deletions src/modules/elevenlabs/elevenlabs.client.ts
@@ -1,4 +1,4 @@
-import { playSoundBuffer } from '~/common/util/audioUtils';
import { LiveAudioPlayer, playSoundBuffer } from '~/common/util/audioUtils';
import { useUIPreferencesStore } from '~/common/state/store-ui';

import type { SpeechInputSchema } from './elevenlabs.router';
@@ -24,23 +24,44 @@ export async function speakText(text: string, voiceId?: string) {
  const nonEnglish = !(preferredLanguage?.toLowerCase()?.startsWith('en'));

  try {
-    const audioBuffer = await callElevenlabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish);
    const edgeResponse = await fetchApiElevenlabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, false);
    const audioBuffer = await edgeResponse.arrayBuffer();
    await playSoundBuffer(audioBuffer);
  } catch (error) {
    console.error('Error playing first text:', error);
  }
}

// let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;

export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
  if (!(text?.trim())) return;

  const { elevenLabsApiKey, elevenLabsVoiceId } = useElevenlabsStore.getState();
  if (!isElevenLabsEnabled(elevenLabsApiKey)) return;

  const { preferredLanguage } = useUIPreferencesStore.getState();
  const nonEnglish = !(preferredLanguage?.toLowerCase()?.startsWith('en'));

  const edgeResponse = await fetchApiElevenlabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, true);

  // if (!liveAudioPlayer)
  const liveAudioPlayer = new LiveAudioPlayer();
  liveAudioPlayer.EXPERIMENTAL_playStream(edgeResponse).then();
}


/**
 * Note: we have to use this client-side fetch (instead of tRPC) because the response is binary (ArrayBuffer / stream).
 */
-async function callElevenlabsSpeech(text: string, elevenLabsApiKey: string, elevenLabsVoiceId: string, nonEnglish: boolean): Promise<ArrayBuffer> {
async function fetchApiElevenlabsSpeech(text: string, elevenLabsApiKey: string, elevenLabsVoiceId: string, nonEnglish: boolean, streaming: boolean): Promise<Response> {
  // NOTE: hardcoded 1000 as a failsafe, since the API will take very long and consume lots of credits for longer texts
  const speechInput: SpeechInputSchema = {
    elevenKey: elevenLabsApiKey,
    text: text.slice(0, 1000),
    voiceId: elevenLabsVoiceId,
    nonEnglish,
    ...(streaming && { streaming: true, streamOptimization: 4 }),
  };

  const response = await fetch('/api/elevenlabs/speech', {
@@ -54,5 +75,5 @@ async function callElevenlabsSpeech(text: string, elevenLabsApiKey: string, elev
    throw new Error(errorData.error || errorData.message || 'Unknown error');
  }

-  return await response.arrayBuffer();
  return response;
}
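
A usage sketch of the two entry points this file now exports; the spoken strings are examples, and the optional voiceId argument falls back to the voice selected in settings:

  import { speakText, EXPERIMENTAL_speakTextStream } from '~/modules/elevenlabs/elevenlabs.client';

  async function demoSpeech() {
    // buffered path: downloads the full clip (non-streaming), then plays it
    await speakText('Settings saved.');

    // streaming path: playback starts while the audio is still being generated
    await EXPERIMENTAL_speakTextStream('This longer reply starts playing almost immediately.');
  }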
20 changes: 18 additions & 2 deletions src/modules/elevenlabs/elevenlabs.router.ts
@@ -15,18 +15,34 @@ export const speechInputSchema = z.object({

export type SpeechInputSchema = z.infer<typeof speechInputSchema>;

-const voicesInputSchema = z.object({
const listVoicesInputSchema = z.object({
  elevenKey: z.string().optional(),
});

const voiceSchema = z.object({
  id: z.string(),
  name: z.string(),
  description: z.string().nullable(),
  previewUrl: z.string().nullable(),
  category: z.string(),
  default: z.boolean(),
});

export type VoiceSchema = z.infer<typeof voiceSchema>;

const listVoicesOutputSchema = z.object({
  voices: z.array(voiceSchema),
});


export const elevenlabsRouter = createTRPCRouter({

  /**
   * List Voices available to this api key
   */
  listVoices: publicProcedure
-    .input(voicesInputSchema)
    .input(listVoicesInputSchema)
    .output(listVoicesOutputSchema)
    .query(async ({ input }) => {

      const { elevenKey } = input;
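
The rest of the query handler is collapsed in this view; a sketch of how an upstream ElevenLabs GET /v1/voices entry could be mapped onto voiceSchema — the snake_case field names follow the public ElevenLabs API, but the committed mapping is not shown here:

  import { type VoiceSchema } from './elevenlabs.router';

  // hypothetical mapper from an upstream ElevenLabs voice object to VoiceSchema
  function toVoiceSchema(upstream: any, defaultVoiceId: string): VoiceSchema {
    return {
      id: upstream.voice_id,
      name: upstream.name,
      description: upstream.description ?? null,
      previewUrl: upstream.preview_url ?? null,
      category: upstream.category,
      default: upstream.voice_id === defaultVoiceId,
    };
  }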
