Commit 4e3b170

ElevenLabs: support Streaming (output) endpoint & extract Voices Dropdown

enricoros committed Aug 20, 2023
1 parent 9495a50 commit 4e3b170
Showing 5 changed files with 325 additions and 41 deletions.
185 changes: 181 additions & 4 deletions src/common/util/audioUtils.ts
@@ -1,3 +1,10 @@
import * as React from 'react';

export function playSoundUrl(url: string) {
  const audio = new Audio(url);
  audio.play().catch(error => console.error('Error playing audio:', url, error));
}

export async function playSoundBuffer(audioBuffer: ArrayBuffer) {
  const audioContext = new AudioContext();
  const bufferSource = audioContext.createBufferSource();
@@ -6,7 +13,177 @@ export async function playSoundBuffer(audioBuffer: ArrayBuffer) {
  bufferSource.start();
}

-export async function playSoundUrl(url: string) {
-  const audio = new Audio(url);
-  await audio.play();
-}

/**
 * React hook: plays a sound from a URL, and optionally repeats it after a delay.
 * @param url The URL of the sound to play.
 * @param firstDelay The delay before the first play, in milliseconds.
 * @param repeatMs The delay between each repeat, in milliseconds. If 0, the sound will only play once.
 */
export function usePlaySoundUrl(url: string | null, firstDelay: number = 0, repeatMs: number = 0) {
  React.useEffect(() => {
    if (!url) return;

    let timer2: any = null;

    const playFirstTime = () => {
      const playAudio = () => playSoundUrl(url);
      playAudio();
      timer2 = repeatMs > 0 ? setInterval(playAudio, repeatMs) : null;
    };

    const timer1 = setTimeout(playFirstTime, firstDelay);

    return () => {
      clearTimeout(timer1);
      if (timer2)
        clearInterval(timer2);
    };
  }, [firstDelay, repeatMs, url]);
}


/**
 * Streams audio through a MediaSource attached to an <audio> element,
 * trimming already-played data whenever more than `bufferSizeLimit` seconds
 * are buffered ahead of the playhead.
 */
export class LiveAudioPlayer {
  private readonly audioContext: AudioContext;
  private readonly audioElement: HTMLAudioElement;
  private readonly mediaSource: MediaSource;
  private reader: ReadableStreamDefaultReader<Uint8Array> | null;
  private bufferSizeLimit: number;
  private onStart: (() => void) | null;
  private onStop: (() => void) | null;

  constructor() {
    this.audioContext = new AudioContext();
    this.audioElement = new Audio();
    this.mediaSource = new MediaSource();
    this.reader = null;
    this.bufferSizeLimit = 5; // in seconds
    this.onStart = null;
    this.onStop = null;
  }

  async EXPERIMENTAL_playStream(edgeResponse: Response) {
    if (this.reader) {
      await this.stop();
    }

    if (!edgeResponse.body) {
      return;
    }
    const edgeReadableStream = edgeResponse.body;

    // route the <audio> element through the WebAudio graph
    const sourceNode = this.audioContext.createMediaElementSource(this.audioElement);
    sourceNode.connect(this.audioContext.destination);

    const mimeType = 'audio/mpeg';
    this.mediaSource.addEventListener('sourceopen', async () => {
      const sourceBuffer: SourceBuffer = this.mediaSource.addSourceBuffer(mimeType);
      this.reader = edgeReadableStream.getReader();

      if (this.onStart) {
        this.onStart();
      }

      while (true) {
        const { done, value } = await this.reader.read();
        if (done) {
          // close the stream once the final append has settled
          sourceBuffer.onupdateend = () => this.mediaSource.endOfStream();
          break;
        }

        // wait for any in-flight buffer update to complete
        await new Promise((resolve) => {
          if (!sourceBuffer.updating) {
            resolve(null);
          } else {
            sourceBuffer.addEventListener('updateend', () => resolve(null), { once: true });
          }
        });

        if (this.audioElement.buffered.length > 0) {
          const currentTime = this.audioElement.currentTime;
          const bufferedEnd = this.audioElement.buffered.end(this.audioElement.buffered.length - 1);
          const remainingBuffer = bufferedEnd - currentTime;

          if (remainingBuffer > this.bufferSizeLimit) {
            // E: just made this a bit more resilient, but not much
            try {
              // Remove old data from the buffer
              sourceBuffer.remove(0, currentTime - 1);
              await new Promise((resolve) => {
                sourceBuffer.addEventListener('updateend', () => resolve(null), { once: true });
              });
            } catch (e) {
              console.warn('Error removing old data from the buffer:', e);
            }
          }
        }

        // Wait for the sourceBuffer to finish updating before appending new data
        await new Promise((resolve) => {
          if (!sourceBuffer.updating) {
            resolve(null);
          } else {
            sourceBuffer.addEventListener('updateend', () => resolve(null), { once: true });
          }
        });

        // Append new data to the buffer
        sourceBuffer.appendBuffer(value);
      }

      if (this.onStop) {
        this.onStop();
      }
    });

    this.audioElement.src = URL.createObjectURL(this.mediaSource);
    this.audioElement.autoplay = true;
  }

  async stop() {
    if (this.reader) {
      await this.reader.cancel();
      this.reader = null;
      this.mediaSource.endOfStream();
      this.audioElement.pause();
    }
  }

  // setOnStart(callback) {
  //   this.onStart = callback;
  // }
  //
  // setOnStop(callback) {
  //   this.onStop = callback;
  // }
}


/*export async function playLiveAudioStream(stream: ReadableStream<Uint8Array>, mimeType: string = 'audio/mpeg') {
  const mediaSource = new MediaSource();
  const audio = new Audio(URL.createObjectURL(mediaSource));
  audio.autoplay = true;
  mediaSource.addEventListener('sourceopen', async () => {
    const sourceBuffer = mediaSource.addSourceBuffer(mimeType);
    const reader = stream.getReader();
    const processStream = async () => {
      const { done, value } = await reader.read();
      if (done) {
        mediaSource.endOfStream();
        return;
      }
      if (sourceBuffer.updating) {
        await new Promise(resolve => sourceBuffer.addEventListener('updateend', resolve, { once: true }));
      }
      sourceBuffer.appendBuffer(value);
      processStream();
    };
    processStream();
  });
}*/
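
For context, a minimal sketch of how the new usePlaySoundUrl hook could be consumed; the component and the sound URL below are hypothetical, not part of this commit:

  import * as React from 'react';
  import { usePlaySoundUrl } from '~/common/util/audioUtils';

  // Hypothetical component: chime after 2s of waiting, then re-chime every 10s
  function WaitingChime(props: { waiting: boolean }) {
    usePlaySoundUrl(props.waiting ? '/sounds/chime.mp3' : null, 2000, 10000);
    return props.waiting ? <div>Waiting…</div> : null;
  }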
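
Similarly, a sketch of driving LiveAudioPlayer from a fetch Response, mirroring how EXPERIMENTAL_speakTextStream uses it in elevenlabs.client.ts below; the endpoint URL is a placeholder:

  import { LiveAudioPlayer } from '~/common/util/audioUtils';

  async function demoStreamPlayback() {
    // Placeholder endpoint: any route that streams 'audio/mpeg' in its response body works
    const response = await fetch('/api/some-streaming-audio-endpoint', { method: 'POST' });

    const player = new LiveAudioPlayer();
    await player.EXPERIMENTAL_playStream(response);

    // ...later, to interrupt playback:
    // await player.stop();
  }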
38 changes: 7 additions & 31 deletions src/modules/elevenlabs/ElevenlabsSettings.tsx
@@ -1,37 +1,29 @@
import * as React from 'react';
import { shallow } from 'zustand/shallow';

-import { Box, CircularProgress, FormControl, FormHelperText, FormLabel, Option, Radio, RadioGroup, Select, Stack, Tooltip } from '@mui/joy';
-import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
-import RecordVoiceOverIcon from '@mui/icons-material/RecordVoiceOver';

-import { apiQuery } from '~/modules/trpc/trpc.client';
import { Box, FormControl, FormHelperText, FormLabel, Radio, RadioGroup, Stack, Tooltip } from '@mui/joy';

import { FormInputKey } from '~/common/components/FormInputKey';
import { LanguageSelect } from '~/common/components/LanguageSelect';
import { settingsCol1Width, settingsGap } from '~/common/theme';

import { isElevenLabsEnabled, requireUserKeyElevenLabs } from './elevenlabs.client';
import { useElevenlabsStore } from './store-elevenlabs';
import { useVoiceDropdown } from './useVoiceDropdown';


export function ElevenlabsSettings() {
  // external state
-  const { apiKey, setApiKey, voiceId, setVoiceId, autoSpeak, setAutoSpeak } = useElevenlabsStore(state => ({
  const { apiKey, setApiKey, autoSpeak, setAutoSpeak } = useElevenlabsStore(state => ({
    apiKey: state.elevenLabsApiKey, setApiKey: state.setElevenLabsApiKey,
-    voiceId: state.elevenLabsVoiceId, setVoiceId: state.setElevenLabsVoiceId,
    autoSpeak: state.elevenLabsAutoSpeak, setAutoSpeak: state.setElevenLabsAutoSpeak,
  }), shallow);

  const requiresKey = requireUserKeyElevenLabs;
  const isValidKey = isElevenLabsEnabled(apiKey);

-  const { data: voicesData, isLoading: loadingVoices } = apiQuery.elevenlabs.listVoices.useQuery({ elevenKey: apiKey }, {
-    enabled: isValidKey,
-    staleTime: 1000 * 60 * 5, // 5 minutes
-  });
  const { hasVoices, voicesDropdown } = useVoiceDropdown(true);

-  const handleVoiceChange = (_event: any, value: string | null) => setVoiceId(value || '');

  const handleAutoSpeakChange = (e: React.ChangeEvent<HTMLInputElement>) => setAutoSpeak((e.target.value || 'off') as 'off' | 'firstLine');

@@ -68,23 +60,7 @@ export function ElevenlabsSettings() {
        <FormLabel sx={{ minWidth: settingsCol1Width }}>
          Assistant Voice
        </FormLabel>
-        <Select
-          variant='outlined' placeholder={isValidKey ? 'Select a voice' : 'Enter valid API Key'}
-          value={voiceId} onChange={handleVoiceChange}
-          startDecorator={<RecordVoiceOverIcon />}
-          endDecorator={isValidKey && loadingVoices && <CircularProgress size='sm' />}
-          indicator={<KeyboardArrowDownIcon />}
-          slotProps={{
-            root: { sx: { width: '100%' } },
-            indicator: { sx: { opacity: 0.5 } },
-          }}
-        >
-          {voicesData && voicesData.voices?.map(voice => (
-            <Option key={voice.id} value={voice.id}>
-              {voice.name}
-            </Option>
-          ))}
-        </Select>
        {voicesDropdown}
      </FormControl>

      <FormControl orientation='horizontal' sx={{ alignItems: 'center', justifyContent: 'space-between' }}>
@@ -93,8 +69,8 @@ export function ElevenlabsSettings() {
          <FormHelperText>{autoSpeak === 'off' ? 'Off' : 'First paragraph'}</FormHelperText>
        </Box>
        <RadioGroup orientation='horizontal' value={autoSpeak} onChange={handleAutoSpeakChange}>
-          <Radio disabled={!voicesData?.voices} value='off' label='Off' />
-          <Radio disabled={!voicesData?.voices} value='firstLine' label='Start' />
          <Radio disabled={!hasVoices} value='off' label='Off' />
          <Radio disabled={!hasVoices} value='firstLine' label='Start' />
          <Radio disabled={true} value='all' label='Full' />
        </RadioGroup>
      </FormControl>
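
The new useVoiceDropdown hook (imported from ./useVoiceDropdown) is not expanded in this view; inferring from the call sites above, its contract is roughly the following — an assumed sketch, not the committed implementation:

  import type { ReactNode } from 'react';

  // Assumed contract, inferred from `const { hasVoices, voicesDropdown } = useVoiceDropdown(true);`
  export interface VoiceDropdownHook {
    hasVoices: boolean;        // true once voices have loaded for the current API key
    voicesDropdown: ReactNode; // ready-to-render voice selector, wired to the voice store
  }

  // declaration only; the parameter name is a guess, and the real hook lives in useVoiceDropdown.tsx
  export declare function useVoiceDropdown(enabled: boolean): VoiceDropdownHook;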
29 changes: 25 additions & 4 deletions src/modules/elevenlabs/elevenlabs.client.ts
@@ -1,4 +1,4 @@
-import { playSoundBuffer } from '~/common/util/audioUtils';
import { LiveAudioPlayer, playSoundBuffer } from '~/common/util/audioUtils';
import { useUIPreferencesStore } from '~/common/state/store-ui';

import type { SpeechInputSchema } from './elevenlabs.router';
@@ -24,23 +24,44 @@ export async function speakText(text: string, voiceId?: string) {
  const nonEnglish = !(preferredLanguage?.toLowerCase()?.startsWith('en'));

  try {
-    const audioBuffer = await callElevenlabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish);
    const edgeResponse = await fetchApiElevenlabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, false);
    const audioBuffer = await edgeResponse.arrayBuffer();
    await playSoundBuffer(audioBuffer);
  } catch (error) {
    console.error('Error playing first text:', error);
  }
}

// let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;

export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
  if (!(text?.trim())) return;

  const { elevenLabsApiKey, elevenLabsVoiceId } = useElevenlabsStore.getState();
  if (!isElevenLabsEnabled(elevenLabsApiKey)) return;

  const { preferredLanguage } = useUIPreferencesStore.getState();
  const nonEnglish = !(preferredLanguage?.toLowerCase()?.startsWith('en'));

  const edgeResponse = await fetchApiElevenlabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, true);

  // if (!liveAudioPlayer)
  const liveAudioPlayer = new LiveAudioPlayer();
  liveAudioPlayer.EXPERIMENTAL_playStream(edgeResponse).then();
}


/**
 * Note: we have to use this client-side fetch (instead of tRPC) because the response is binary (ArrayBuffer / stream).
 */
-async function callElevenlabsSpeech(text: string, elevenLabsApiKey: string, elevenLabsVoiceId: string, nonEnglish: boolean): Promise<ArrayBuffer> {
async function fetchApiElevenlabsSpeech(text: string, elevenLabsApiKey: string, elevenLabsVoiceId: string, nonEnglish: boolean, streaming: boolean): Promise<Response> {
  // NOTE: hardcoded 1000 as a failsafe, since the API will take very long and consume lots of credits for longer texts
  const speechInput: SpeechInputSchema = {
    elevenKey: elevenLabsApiKey,
    text: text.slice(0, 1000),
    voiceId: elevenLabsVoiceId,
    nonEnglish,
    ...(streaming && { streaming: true, streamOptimization: 4 }),
  };

  const response = await fetch('/api/elevenlabs/speech', {
@@ -54,5 +75,5 @@ async function callElevenlabsSpeech(text: string, elevenLabsApiKey: string, elev
    throw new Error(errorData.error || errorData.message || 'Unknown error');
  }

-  return await response.arrayBuffer();
  return response;
}
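
A usage sketch of the two entry points this file now exports; the spoken strings are examples, and the optional voiceId argument falls back to the voice selected in settings:

  import { speakText, EXPERIMENTAL_speakTextStream } from '~/modules/elevenlabs/elevenlabs.client';

  async function demoSpeech() {
    // buffered path: downloads the full clip (non-streaming), then plays it
    await speakText('Settings saved.');

    // streaming path: playback starts while the audio is still being generated
    await EXPERIMENTAL_speakTextStream('This longer reply starts playing almost immediately.');
  }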
20 changes: 18 additions & 2 deletions src/modules/elevenlabs/elevenlabs.router.ts
@@ -15,18 +15,34 @@ export const speechInputSchema = z.object({

export type SpeechInputSchema = z.infer<typeof speechInputSchema>;

-const voicesInputSchema = z.object({
const listVoicesInputSchema = z.object({
  elevenKey: z.string().optional(),
});

const voiceSchema = z.object({
  id: z.string(),
  name: z.string(),
  description: z.string().nullable(),
  previewUrl: z.string().nullable(),
  category: z.string(),
  default: z.boolean(),
});

export type VoiceSchema = z.infer<typeof voiceSchema>;

const listVoicesOutputSchema = z.object({
  voices: z.array(voiceSchema),
});


export const elevenlabsRouter = createTRPCRouter({

  /**
   * List Voices available to this api key
   */
  listVoices: publicProcedure
-    .input(voicesInputSchema)
    .input(listVoicesInputSchema)
    .output(listVoicesOutputSchema)
    .query(async ({ input }) => {

      const { elevenKey } = input;
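
The rest of the query handler is collapsed in this view; a sketch of how an upstream ElevenLabs GET /v1/voices entry could be mapped onto voiceSchema — the snake_case field names follow the public ElevenLabs API, but the committed mapping is not shown here:

  import { type VoiceSchema } from './elevenlabs.router';

  // hypothetical mapper from an upstream ElevenLabs voice object to VoiceSchema
  function toVoiceSchema(upstream: any, defaultVoiceId: string): VoiceSchema {
    return {
      id: upstream.voice_id,
      name: upstream.name,
      description: upstream.description ?? null,
      previewUrl: upstream.preview_url ?? null,
      category: upstream.category,
      default: upstream.voice_id === defaultVoiceId,
    };
  }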
