
End-to-end sample code for streaming text-to-speech #48

Open
iwasrobbed opened this issue Jun 9, 2024 · 4 comments

Comments


Issue

Understanding how to stream and play the generated audio isn't straightforward for developers who are new to working with audio streams.

Expected

Clear sample code showing an end-to-end example, or convenience methods within this JS library that help people stream and play audio for both server- and client-side needs.

Example code

For now, here is some sample code others can build on (and hopefully improve upon):

Text-to-Speech (server-side)

import { ElevenLabsClient } from 'elevenlabs'
import { OptimizeStreamingLatency } from 'elevenlabs/api'
import { Readable } from 'stream'

const elevenlabs = new ElevenLabsClient({
  apiKey: process.env.ELEVENLABS_API_KEY
})

type TextToSpeechProps = {
  text: string
}

enum ElevenLabsVoice {
  MyVoice = 'abcd1234'
}

enum ElevenLabsModel {
  MultilingualV2 = 'eleven_multilingual_v2',
  TurboV2 = 'eleven_turbo_v2'
}

export async function textToSpeech({
  text
}: TextToSpeechProps): Promise<Readable> {
  const voiceId = ElevenLabsVoice.MyVoice
  const modelId = ElevenLabsModel.TurboV2

  const audioStream = await elevenlabs.generate({
    stream: true,
    voice: voiceId,
    text: text,
    model_id: modelId,
    optimize_streaming_latency: OptimizeStreamingLatency.Three,
    voice_settings: {
      stability: 0.7,
      similarity_boost: 1.0,
      style: 0.5,
      use_speaker_boost: true
    }
  })
  return audioStream
}

API route to wrap it (since the ElevenLabs library currently works only in Node environments)

// /api/tts/route.ts
import { textToSpeech } from '@/lib/voice/elevenlabs/textToSpeech'
import { NextResponse } from 'next/server'

export async function POST(req: Request) {
  const { message } = await req.json()
  try {
    console.time('textToSpeech latency')
    const audioStream = await textToSpeech({ text: message })
    console.timeEnd('textToSpeech latency')

    return new Response(audioStream as unknown as BodyInit, {
      headers: { 'Content-Type': 'audio/mpeg' }
    })
  } catch (error: any) {
    console.error(error)
    return NextResponse.json(
      { error: error.message },
      { status: error.statusCode || 500 }
    )
  }
}

Hook to call into API route

import { useCallback, useRef, useState } from 'react'
import { playAudioFromResponse } from '@/lib/audioPlayer/audioPlayer'

export const useTextToSpeech = () => {
  const audioRef = useRef<HTMLAudioElement | null>(null)
  const [audioLoaded, setAudioLoaded] = useState(false)
  const [isPlaying, setIsPlaying] = useState(false)
  const [isLoading, setIsLoading] = useState(false)

  const stop = useCallback(() => {
    if (audioRef.current) {
      audioRef.current.pause()
      audioRef.current.currentTime = 0
      setIsPlaying(false)
    }
  }, [])

  const speak = useCallback(async (text: string) => {
    setIsLoading(true)

    const response = await fetch('/api/tts', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ message: text })
    })

    if (!response.ok) {
      throw new Error('Network response was not ok')
    }

    playAudioFromResponse(
      response,
      audioRef,
      () => setAudioLoaded(true),
      isPlaying => setIsPlaying(isPlaying),
      isLoading => setIsLoading(isLoading)
    )
  }, [])

  return { speak, stop, audioLoaded, isPlaying, isLoading }
}

Audio player to play streamed readable

export function playAudioFromResponse(
  response: Response,
  audioRef: React.MutableRefObject<HTMLAudioElement | null>,
  onAudioLoaded: () => void,
  onIsPlayingChange: (isPlaying: boolean) => void,
  onIsLoadingChange: (isLoading: boolean) => void
) {
  if (!MediaSource.isTypeSupported('audio/mpeg')) {
    throw new Error('Unsupported MIME type or codec: audio/mpeg')
  }

  const mediaSource = new MediaSource()
  const audio = new Audio()
  audio.src = URL.createObjectURL(mediaSource)
  audioRef.current = audio

  mediaSource.addEventListener('sourceopen', () => {
    const sourceBuffer = mediaSource.addSourceBuffer('audio/mpeg')
    onAudioLoaded()
    readAudioChunks(response.body!.getReader(), sourceBuffer, mediaSource)
    onIsLoadingChange(false)
    onIsPlayingChange(true)
    audio.play()
  })

  audio.onended = () => {
    onIsPlayingChange(false)
    onIsLoadingChange(false)
  }

  audio.addEventListener('error', e => {
    console.error('Error playing audio', e)
  })
}

function readAudioChunks(
  reader: ReadableStreamDefaultReader<Uint8Array>,
  sourceBuffer: SourceBuffer,
  mediaSource: MediaSource
) {
  let queue: Uint8Array[] = []
  let isAppendingInProgress = false

  function processQueue() {
    if (queue.length > 0 && !sourceBuffer.updating) {
      sourceBuffer.appendBuffer(queue.shift()!)
    }
  }

  function push() {
    reader.read().then(({ done, value }) => {
      if (done) {
        mediaSource.endOfStream()
        return
      }
      queue.push(value!)
      if (!isAppendingInProgress) {
        isAppendingInProgress = true
        processQueue()
      }
      push()
    })
  }

  sourceBuffer.addEventListener('updateend', () => {
    isAppendingInProgress = false
    processQueue()
  })

  push()
}

React code to bring it all together

import Image from 'next/image'
import { MdPlayCircle, MdStopCircle } from 'react-icons/md'
import { ImSpinner8 } from 'react-icons/im'
import { useTextToSpeech } from '@/lib/hooks/use-text-to-speech'
import { cn } from '@/lib/utils'

export function EmptyScreen() {
  const { speak, stop, isLoading, isPlaying } = useTextToSpeech()

  const handlePlayAudio = () => {
    isPlaying ? stop() : speak("Hello, I'm a virtual assistant! Welcome to our AI chatbot. Here, you can chat with me and get assistance with various tasks. What can I help you with?")
  }

  return (
    <div className="mx-auto max-w-2xl px-4">
      <div className="flex flex-col gap-2 rounded-lg border bg-background p-8">
        <Image
          src="/images/assistant.svg"
          alt="Virtual Assistant"
          width={700}
          height={392}
          className="mx-auto"
          priority
        />
        <h1 className="text-lg font-semibold">Hello, I&apos;m your Virtual Assistant!</h1>
        <p className="leading-normal text-muted-foreground">
          Welcome to our AI chatbot. Here, you can chat with me and get assistance with various tasks.
        </p>
        <button
          onClick={handlePlayAudio}
          className={cn(
            'flex gap-2 items-center justify-center mt-4 px-4 py-2 bg-blue-500 text-white rounded',
            isLoading && 'opacity-50 cursor-not-allowed'
          )}
        >
          {isLoading ? (
            <ImSpinner8 className="size-4 shrink-0 animate-spin" />
          ) : isPlaying ? (
            <MdStopCircle />
          ) : (
            <MdPlayCircle />
          )}
          {isLoading ? 'Loading...' : isPlaying ? 'Stop' : 'Play'}
        </button>
      </div>
    </div>
  )
}

emorling commented Jul 4, 2024

Thanks for sharing; this should be part of the official documentation.

@Bilal-io

Has anyone tested this on iPhone?
I had no luck with the MediaSource API on iPhone.
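
That matches the platform limitation: iOS Safari has historically not supported the MediaSource API for this use case, so the MSE path above simply isn't available there. A minimal fallback sketch, assuming you are willing to trade progressive playback for compatibility (playAudioWithFallback is a hypothetical helper, not part of the library; the MSE path is injected as a callback so the sketch stays self-contained):

```typescript
// Hypothetical fallback sketch: when MediaSource is unavailable (e.g. iOS
// Safari), buffer the entire response into a Blob and play it via an object
// URL. Playback starts later, but it works wherever HTMLAudioElement does.
async function playAudioWithFallback(
  response: Response,
  // MSE path injected so this sketch stays self-contained; in a real app,
  // pass something like the playAudioFromResponse logic from above
  playViaMse: (response: Response, audio: HTMLAudioElement) => void
): Promise<HTMLAudioElement> {
  const audio = new Audio()
  const mseSupported =
    typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg')
  if (mseSupported) {
    playViaMse(response, audio)
  } else {
    const blob = await response.blob() // waits for the full download
    audio.src = URL.createObjectURL(blob)
  }
  await audio.play()
  return audio
}
```

The fetch in the hook above can stay the same either way; only the playback side branches on feature detection.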


eerikson commented Aug 5, 2024

Thanks so much for sharing your solution and for flagging the current docs as lacking. Unfortunately, I'm not able to get your solution working due to this error:

InvalidStateError: Failed to execute 'endOfStream' on 'MediaSource': The 'updating' attribute is true on one or more of this MediaSource's SourceBuffers.

In any case, this is a brick wall for many developers looking to incorporate the streaming API from Eleven Labs. I hope more attention is given to this part of the documentation.
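
For what it's worth, that InvalidStateError looks like a race in readAudioChunks: mediaSource.endOfStream() is called as soon as the reader reports done, even if queued chunks are still being appended. A minimal sketch of one way to fix it, with the SourceBuffer interactions injected as callbacks so the ordering logic is shown in isolation (createChunkQueue and the Sink shape are made-up names, not part of any library): defer endOfStream until the queue is drained and the buffer is idle.

```typescript
// Sketch of a chunk queue that defers end-of-stream until every queued
// append has finished, avoiding "The 'updating' attribute is true" errors.
type Sink = {
  append: (chunk: Uint8Array) => void // wraps sourceBuffer.appendBuffer(...)
  isUpdating: () => boolean           // wraps sourceBuffer.updating
  endOfStream: () => void             // wraps mediaSource.endOfStream()
}

function createChunkQueue(sink: Sink) {
  const queue: Uint8Array[] = []
  let streamDone = false

  function pump() {
    if (sink.isUpdating()) return   // an append is in flight; wait for 'updateend'
    if (queue.length > 0) {
      sink.append(queue.shift()!)   // triggers a later 'updateend'
    } else if (streamDone) {
      sink.endOfStream()            // safe: queue drained and buffer idle
    }
  }

  return {
    push(chunk: Uint8Array) { queue.push(chunk); pump() },
    finish() { streamDone = true; pump() },  // call when reader.read() reports done
    onUpdateEnd() { pump() }                 // wire to the 'updateend' event
  }
}
```

Wire onUpdateEnd to the SourceBuffer's 'updateend' listener, push each value from reader.read(), and call finish() instead of calling mediaSource.endOfStream() directly when done is true.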

@jtmuller5

I created a simple example of streaming generated text from server to client in a NextJS app. You can find it here:

https://github.com/jtmuller5/elevenlabs-nextjs-stream-example

Important parts:

  • Client page is located at app/(features)/elevenlabs/page.tsx
  • Server route is located at app/api/stream/tts
  • In the browser, navigate to /elevenlabs to test it out
