
Commit 018c140

Start/stop recording from the backend. Add guide on conversational chatbots (#9419)

* Add code

* stop dispatch

* first draft

* edit

* add changeset

* lint

* Docstring

* Make recording

* fix video

* fix guide link

* redirect

* add changeset

---------

Co-authored-by: gradio-pr-bot <[email protected]>
freddyaboulton and gradio-pr-bot authored Sep 27, 2024
1 parent 4d75f02 commit 018c140
Showing 12 changed files with 228 additions and 5 deletions.
7 changes: 7 additions & 0 deletions .changeset/bright-apes-fly.md
@@ -0,0 +1,7 @@
---
"@gradio/audio": minor
"gradio": minor
"website": minor
---

feat:Start/stop recording from the backend. Add guide on conversational chatbots
6 changes: 5 additions & 1 deletion gradio/blocks.py
@@ -1846,7 +1846,11 @@ async def handle_streaming_outputs(

for i, block in enumerate(block_fn.outputs):
output_id = block._id
if isinstance(block, components.StreamingOutput) and block.streaming:
if (
isinstance(block, components.StreamingOutput)
and block.streaming
and not utils.is_prop_update(data[i])
):
if final:
stream_run[output_id].end_stream()
first_chunk = output_id not in stream_run
3 changes: 3 additions & 0 deletions gradio/components/audio.py
@@ -110,6 +110,7 @@ def __init__(
max_length: int | None = None,
waveform_options: WaveformOptions | dict | None = None,
loop: bool = False,
recording: bool = False,
):
"""
Parameters:
@@ -139,6 +140,7 @@ def __init__(
max_length: The maximum length of audio (in seconds) that the user can pass into the prediction function. If None, there is no maximum length.
waveform_options: A dictionary of options for the waveform display. Options include: waveform_color (str), waveform_progress_color (str), show_controls (bool), skip_length (int), trim_region_color (str). Default is None, which uses the default values for these options. [See `gr.WaveformOptions` docs](#waveform-options).
loop: If True, the audio will loop when it reaches the end and continue playing from the beginning.
recording: If True, the audio component will be set to record audio from the microphone if the source is set to "microphone". Defaults to False.
"""
valid_sources: list[Literal["upload", "microphone"]] = ["upload", "microphone"]
if sources is None:
@@ -190,6 +192,7 @@ def __init__(
self.waveform_options = waveform_options
self.min_length = min_length
self.max_length = max_length
self.recording = recording
super().__init__(
label=label,
every=every,
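The practical effect of the new `recording` parameter is that event handlers can now start or stop the microphone from the backend by returning a `gr.Audio` update. A minimal sketch (not part of this diff; the buttons and handlers are illustrative only):

```python
import gradio as gr

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], type="numpy", label="Microphone")
    start = gr.Button("Start recording")
    stop = gr.Button("Stop recording")

    # Returning an Audio update with recording=True/False toggles
    # recording from the server side.
    start.click(lambda: gr.Audio(recording=True), None, mic)
    stop.click(lambda: gr.Audio(recording=False), None, mic)

if __name__ == "__main__":
    demo.launch()
```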
2 changes: 1 addition & 1 deletion guides/04_additional-features/03_streaming-inputs.md
@@ -63,4 +63,4 @@ demo.launch()

## End-to-End Examples

For an end-to-end example of streaming from the webcam, see the object detection from webcam [guide](/main/guides/object-detection-from-webcam).
For an end-to-end example of streaming from the webcam, see the object detection from webcam [guide](/main/guides/object-detection-from-webcam-with-webrtc).
189 changes: 189 additions & 0 deletions guides/07_streaming/04_conversational-chatbot.md
@@ -0,0 +1,189 @@
# Building Conversational Chatbots with Gradio

Tags: AUDIO, STREAMING, CHATBOTS

## Introduction

The next generation of AI user interfaces is moving towards audio-native experiences. Users will be able to speak to chatbots and receive spoken responses in return. Several models have been built under this paradigm, including GPT-4o and [mini omni](https://github.com/gpt-omni/mini-omni).

In this guide, we'll walk you through building your own conversational chat application using mini omni as an example. You can see a demo of the finished app below:

<video src="https://github.com/user-attachments/assets/db36f4db-7535-49f1-a2dd-bd36c487ebdf" controls
height="600" width="600" style="display: block; margin: auto;" autoplay="true" loop="true">
</video>

## Application Overview

Our application will enable the following user experience:

1. Users click a button to start recording their message
2. The app detects when the user has finished speaking and stops recording
3. The user's audio is passed to the mini omni model, which streams back a response
4. After mini omni finishes speaking, the user's microphone is reactivated
5. All previous spoken audio, from both the user and mini omni, is displayed in a chatbot component

Let's dive into the implementation details.

## Processing User Audio

We'll stream the user's audio from their microphone to the server and determine if the user has stopped speaking on each new chunk of audio.

Here's our `process_audio` function:

```python
import gradio as gr
import numpy as np
from utils import determine_pause

def process_audio(audio: tuple, state: AppState):
    if state.stream is None:
        state.stream = audio[1]
        state.sampling_rate = audio[0]
    else:
        state.stream = np.concatenate((state.stream, audio[1]))

    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
    state.pause_detected = pause_detected

    if state.pause_detected and state.started_talking:
        return gr.Audio(recording=False), state
    return None, state
```

This function takes two inputs:
1. The current audio chunk (a tuple of `(sampling_rate, numpy array of audio)`)
2. The current application state

We'll use the following `AppState` dataclass to manage our application state:

```python
from dataclasses import dataclass, field

import numpy as np

@dataclass
class AppState:
    stream: np.ndarray | None = None
    sampling_rate: int = 0
    pause_detected: bool = False
    started_talking: bool = False
    stopped: bool = False
    # Mutable defaults in a dataclass need a factory
    conversation: list = field(default_factory=list)
```

The function concatenates new audio chunks to the existing stream and checks if the user has stopped speaking. If a pause is detected, it returns an update to stop recording. Otherwise, it returns `None` to indicate no changes.

The implementation of the `determine_pause` function is specific to the omni-mini project and can be found [here](https://huggingface.co/spaces/gradio/omni-mini/blob/eb027808c7bfe5179b46d9352e3fa1813a45f7c3/app.py#L98).
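
For intuition, here is a minimal stand-in for such a pause detector. It assumes a simple RMS-energy threshold over roughly the last second of 16-bit microphone audio, unlike the VAD-based logic in the omni-mini Space, and the threshold value is an assumption that would need tuning:

```python
import numpy as np

def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
    """Rough sketch: report a pause once speech has started and the most
    recent ~1 second of audio drops below an energy threshold."""
    window = audio[-sampling_rate:]  # last ~1 second of samples
    if len(window) == 0:
        return False
    rms = np.sqrt(np.mean(window.astype(np.float64) ** 2))
    if rms > 200:  # assumed threshold for 16-bit integer audio
        state.started_talking = True
        return False
    return state.started_talking
```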

## Generating the Response

After processing the user's audio, we need to generate and stream the chatbot's response. Here's our `response` function:

```python
import io
import tempfile
from pydub import AudioSegment

def response(state: AppState):
    if not state.pause_detected and not state.started_talking:
        return None, AppState()

    audio_buffer = io.BytesIO()

    segment = AudioSegment(
        state.stream.tobytes(),
        frame_rate=state.sampling_rate,
        sample_width=state.stream.dtype.itemsize,
        channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
    )
    segment.export(audio_buffer, format="wav")

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())

    state.conversation.append({"role": "user",
                               "content": {"path": f.name,
                                           "mime_type": "audio/wav"}})

    output_buffer = b""

    for mp3_bytes in speaking(audio_buffer.getvalue()):
        output_buffer += mp3_bytes
        yield mp3_bytes, state

    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(output_buffer)

    state.conversation.append({"role": "assistant",
                               "content": {"path": f.name,
                                           "mime_type": "audio/mp3"}})
    yield None, AppState(conversation=state.conversation)
```

This function:
1. Converts the user's audio to a WAV file
2. Adds the user's message to the conversation history
3. Generates and streams the chatbot's response using the `speaking` function
4. Saves the chatbot's response as an MP3 file
5. Adds the chatbot's response to the conversation history

Note: The implementation of the `speaking` function is specific to the omni-mini project and can be found [here](https://huggingface.co/spaces/gradio/omni-mini/blob/main/app.py#L116).
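
To give a sense of the expected interface, `speaking` is a generator that takes the recorded WAV bytes and yields MP3 bytes as they become available. A self-contained stand-in that simply re-encodes the user's audio to MP3 in roughly one-second chunks (instead of calling the model) could look like this:

```python
import io
from collections.abc import Iterator

from pydub import AudioSegment

def speaking(wav_bytes: bytes) -> Iterator[bytes]:
    """Stand-in for the model call: echo the user's audio back as MP3
    chunks so the streaming plumbing can be exercised without mini omni."""
    segment = AudioSegment.from_file(io.BytesIO(wav_bytes), format="wav")
    for start_ms in range(0, len(segment), 1000):
        chunk = segment[start_ms:start_ms + 1000]
        buffer = io.BytesIO()
        chunk.export(buffer, format="mp3")
        yield buffer.getvalue()
```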

## Building the Gradio App

Now let's put it all together using Gradio's Blocks API:

```python
import gradio as gr

def start_recording_user(state: AppState):
    if not state.stopped:
        return gr.Audio(recording=True)

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(
                label="Input Audio", sources="microphone", type="numpy"
            )
        with gr.Column():
            chatbot = gr.Chatbot(label="Conversation", type="messages")
            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
    state = gr.State(value=AppState())

    stream = input_audio.stream(
        process_audio,
        [input_audio, state],
        [input_audio, state],
        stream_every=0.5,
        time_limit=30,
    )
    respond = input_audio.stop_recording(
        response,
        [state],
        [output_audio, state]
    )
    respond.then(lambda s: s.conversation, [state], [chatbot])

    restart = output_audio.stop(
        start_recording_user,
        [state],
        [input_audio]
    )
    cancel = gr.Button("Stop Conversation", variant="stop")
    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
                 [state, input_audio], cancels=[respond, restart])

if __name__ == "__main__":
    demo.launch()
```

This setup creates a user interface with:
- An input audio component for recording user messages
- A chatbot component to display the conversation history
- An output audio component for the chatbot's responses
- A button to stop and reset the conversation

The app streams user audio in 0.5-second chunks, processes it, generates responses, and updates the conversation history accordingly.

## Conclusion

This guide demonstrates how to build a conversational chatbot application using Gradio and the mini omni model. You can adapt this framework to create various audio-based chatbot demos. To see the full application in action, visit the Hugging Face Spaces demo: https://huggingface.co/spaces/gradio/omni-mini

Feel free to experiment with different models, audio processing techniques, or user interface designs to create your own unique conversational AI experiences!
3 changes: 2 additions & 1 deletion js/_website/src/routes/redirects.js
@@ -18,5 +18,6 @@ export const redirects = {
"/docs/python-client/python-client": "/docs/python-client/introduction",
"/guides/sharing-your-app#security-and-file-access": "/guides/file-access",
"/docs/gradio/interface#interface-queue": "/docs/gradio/interface",
"/using_hugging_face_integrations": "/guides/using-hugging-face-integrations"
"/using_hugging_face_integrations": "/guides/using-hugging-face-integrations",
"/main/guides/object-detection-from-webcam": "/main/guides/object-detection-from-webcam-with-webrtc"
};
2 changes: 2 additions & 0 deletions js/audio/Index.svelte
@@ -41,6 +41,7 @@
export let streaming: boolean;
export let stream_every: number;
export let input_ready: boolean;
export let recording = false;
let uploading = false;
$: input_ready = !uploading;
@@ -240,6 +241,7 @@
{active_source}
{pending}
{streaming}
bind:recording
{loop}
max_file_size={gradio.max_file_size}
{handle_reset_value}
7 changes: 5 additions & 2 deletions js/audio/interactive/InteractiveAudio.svelte
@@ -40,6 +40,7 @@
export let stream_handler: Client["stream"];
export let stream_every: number;
export let uploading = false;
export let recording = false;
let time_limit: number | null = null;
let stream_state: "open" | "waiting" | "closed" = "closed";
@@ -65,14 +66,12 @@
// TODO: make use of this
// export let type: "normal" | "numpy" = "normal";
let recording = false;
let recorder: IMediaRecorder;
let mode = "";
let header: Uint8Array | undefined = undefined;
let pending_stream: Uint8Array[] = [];
let submit_pending_stream_on_pending_end = false;
let inited = false;
let stream_open = false;
const NUM_HEADER_BYTES = 44;
let audio_chunks: Blob[] = [];
@@ -236,6 +235,9 @@
mode = "";
}
}
$: if (!recording && recorder) stop();
$: if (recording && recorder) record();
</script>

<BlockLabel
@@ -264,6 +266,7 @@
bind:mode
{i18n}
{editable}
{recording}
{dispatch_blob}
{waveform_settings}
{waveform_options}
2 changes: 2 additions & 0 deletions js/audio/player/AudioPlayer.svelte
@@ -208,6 +208,8 @@
autoplay={waveform_settings.autoplay}
on:load
bind:this={audio_player}
on:ended={() => dispatch("stop")}
on:play={() => dispatch("play")}
/>
{#if value === null}
<Empty size="small">
2 changes: 2 additions & 0 deletions js/audio/recorder/AudioRecorder.svelte
@@ -23,6 +23,7 @@
};
export let handle_reset_value: () => void;
export let editable = true;
export let recording = false;
let micWaveform: WaveSurfer;
let recordingWaveform: WaveSurfer;
@@ -226,6 +227,7 @@
bind:record
{i18n}
{timing}
{recording}
show_recording_waveform={waveform_options.show_recording_waveform}
record_time={format_time(seconds)}
/>
10 changes: 10 additions & 0 deletions js/audio/shared/WaveformRecordControls.svelte
@@ -6,13 +6,15 @@
export let record: RecordPlugin;
export let i18n: I18nFormatter;
export let recording = false;
let micDevices: MediaDeviceInfo[] = [];
let recordButton: HTMLButtonElement;
let pauseButton: HTMLButtonElement;
let resumeButton: HTMLButtonElement;
let stopButton: HTMLButtonElement;
let stopButtonPaused: HTMLButtonElement;
let recording_ongoing = false;
export let record_time: string;
export let show_recording_waveform: boolean | undefined;
@@ -53,6 +55,14 @@
stopButton.style.display = "flex";
stopButtonPaused.style.display = "none";
});
$: if (recording && !recording_ongoing) {
record.startRecording();
recording_ongoing = true;
} else {
record.stopRecording();
recording_ongoing = false;
}
</script>

<div class="controls">
