
Commit 018c140

Start/stop recording from the backend. Add guide on conversational chatbots (#9419)

* Add code

* stop dispatch

* first draft

* edit

* add changeset

* lint

* Docstring

* Make recording

* fix video

* fix guide link

* redirect

* add changeset

---------

Co-authored-by: gradio-pr-bot <[email protected]>
freddyaboulton and gradio-pr-bot authored Sep 27, 2024
1 parent 4d75f02 commit 018c140
Showing 12 changed files with 228 additions and 5 deletions.
7 changes: 7 additions & 0 deletions .changeset/bright-apes-fly.md
@@ -0,0 +1,7 @@
---
"@gradio/audio": minor
"gradio": minor
"website": minor
---

feat:Start/stop recording from the backend. Add guide on conversational chatbots
6 changes: 5 additions & 1 deletion gradio/blocks.py
@@ -1846,7 +1846,11 @@ async def handle_streaming_outputs(

for i, block in enumerate(block_fn.outputs):
output_id = block._id
if isinstance(block, components.StreamingOutput) and block.streaming:
if (
isinstance(block, components.StreamingOutput)
and block.streaming
and not utils.is_prop_update(data[i])
):
if final:
stream_run[output_id].end_stream()
first_chunk = output_id not in stream_run
3 changes: 3 additions & 0 deletions gradio/components/audio.py
@@ -110,6 +110,7 @@ def __init__(
max_length: int | None = None,
waveform_options: WaveformOptions | dict | None = None,
loop: bool = False,
recording: bool = False,
):
"""
Parameters:
@@ -139,6 +140,7 @@ def __init__(
max_length: The maximum length of audio (in seconds) that the user can pass into the prediction function. If None, there is no maximum length.
waveform_options: A dictionary of options for the waveform display. Options include: waveform_color (str), waveform_progress_color (str), show_controls (bool), skip_length (int), trim_region_color (str). Default is None, which uses the default values for these options. [See `gr.WaveformOptions` docs](#waveform-options).
loop: If True, the audio will loop when it reaches the end and continue playing from the beginning.
recording: If True, the audio component will be set to record audio from the microphone if the source is set to "microphone". Defaults to False.
"""
valid_sources: list[Literal["upload", "microphone"]] = ["upload", "microphone"]
if sources is None:
@@ -190,6 +192,7 @@ def __init__(
self.waveform_options = waveform_options
self.min_length = min_length
self.max_length = max_length
self.recording = recording
super().__init__(
label=label,
every=every,
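The practical effect of the new `recording` parameter is that event handlers can now start or stop the microphone from the backend by returning a `gr.Audio` update. A minimal sketch (not part of this diff; the buttons and handlers are illustrative only):

```python
import gradio as gr

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], type="numpy", label="Microphone")
    start = gr.Button("Start recording")
    stop = gr.Button("Stop recording")

    # Returning an Audio update with recording=True/False toggles
    # recording from the server side.
    start.click(lambda: gr.Audio(recording=True), None, mic)
    stop.click(lambda: gr.Audio(recording=False), None, mic)

if __name__ == "__main__":
    demo.launch()
```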
2 changes: 1 addition & 1 deletion guides/04_additional-features/03_streaming-inputs.md
@@ -63,4 +63,4 @@ demo.launch()

## End-to-End Examples

For an end-to-end example of streaming from the webcam, see the object detection from webcam [guide](/main/guides/object-detection-from-webcam).
For an end-to-end example of streaming from the webcam, see the object detection from webcam [guide](/main/guides/object-detection-from-webcam-with-webrtc).
189 changes: 189 additions & 0 deletions guides/07_streaming/04_conversational-chatbot.md
@@ -0,0 +1,189 @@
# Building Conversational Chatbots with Gradio

Tags: AUDIO, STREAMING, CHATBOTS

## Introduction

The next generation of AI user interfaces is moving towards audio-native experiences. Users will be able to speak to chatbots and receive spoken responses in return. Several models have been built under this paradigm, including GPT-4o and [mini omni](https://github.com/gpt-omni/mini-omni).

In this guide, we'll walk you through building your own conversational chat application using mini omni as an example. You can see a demo of the finished app below:

<video src="https://github.com/user-attachments/assets/db36f4db-7535-49f1-a2dd-bd36c487ebdf" controls
height="600" width="600" style="display: block; margin: auto;" autoplay="true" loop="true">
</video>

## Application Overview

Our application will enable the following user experience:

1. Users click a button to start recording their message
2. The app detects when the user has finished speaking and stops recording
3. The user's audio is passed to the mini omni model, which streams back a response
4. After mini omni finishes speaking, the user's microphone is reactivated
5. All previous spoken audio, from both the user and mini omni, is displayed in a chatbot component

Let's dive into the implementation details.

## Processing User Audio

We'll stream the user's audio from their microphone to the server and determine if the user has stopped speaking on each new chunk of audio.

Here's our `process_audio` function:

```python
import gradio as gr
import numpy as np
from utils import determine_pause

def process_audio(audio: tuple, state: AppState):
    if state.stream is None:
        state.stream = audio[1]
        state.sampling_rate = audio[0]
    else:
        state.stream = np.concatenate((state.stream, audio[1]))

    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
    state.pause_detected = pause_detected

    if state.pause_detected and state.started_talking:
        return gr.Audio(recording=False), state
    return None, state
```

This function takes two inputs:
1. The current audio chunk (a tuple of `(sampling_rate, numpy array of audio)`)
2. The current application state

We'll use the following `AppState` dataclass to manage our application state:

```python
from dataclasses import dataclass, field

import numpy as np

@dataclass
class AppState:
    stream: np.ndarray | None = None
    sampling_rate: int = 0
    pause_detected: bool = False
    started_talking: bool = False
    stopped: bool = False
    # Mutable defaults in a dataclass need a factory
    conversation: list = field(default_factory=list)
```

The function concatenates new audio chunks to the existing stream and checks if the user has stopped speaking. If a pause is detected, it returns an update to stop recording. Otherwise, it returns `None` to indicate no changes.

The implementation of the `determine_pause` function is specific to the omni-mini project and can be found [here](https://huggingface.co/spaces/gradio/omni-mini/blob/eb027808c7bfe5179b46d9352e3fa1813a45f7c3/app.py#L98).
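
For intuition, here is a minimal stand-in for such a pause detector. It assumes a simple RMS-energy threshold over roughly the last second of 16-bit microphone audio, unlike the VAD-based logic in the omni-mini Space, and the threshold value is an assumption that would need tuning:

```python
import numpy as np

def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
    """Rough sketch: report a pause once speech has started and the most
    recent ~1 second of audio drops below an energy threshold."""
    window = audio[-sampling_rate:]  # last ~1 second of samples
    if len(window) == 0:
        return False
    rms = np.sqrt(np.mean(window.astype(np.float64) ** 2))
    if rms > 200:  # assumed threshold for 16-bit integer audio
        state.started_talking = True
        return False
    return state.started_talking
```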

## Generating the Response

After processing the user's audio, we need to generate and stream the chatbot's response. Here's our `response` function:

```python
import io
import tempfile
from pydub import AudioSegment

def response(state: AppState):
    if not state.pause_detected and not state.started_talking:
        return None, AppState()

    audio_buffer = io.BytesIO()

    segment = AudioSegment(
        state.stream.tobytes(),
        frame_rate=state.sampling_rate,
        sample_width=state.stream.dtype.itemsize,
        channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
    )
    segment.export(audio_buffer, format="wav")

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_buffer.getvalue())

    state.conversation.append({"role": "user",
                               "content": {"path": f.name,
                                           "mime_type": "audio/wav"}})

    output_buffer = b""

    for mp3_bytes in speaking(audio_buffer.getvalue()):
        output_buffer += mp3_bytes
        yield mp3_bytes, state

    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(output_buffer)

    state.conversation.append({"role": "assistant",
                               "content": {"path": f.name,
                                           "mime_type": "audio/mp3"}})
    yield None, AppState(conversation=state.conversation)
```

This function:
1. Converts the user's audio to a WAV file
2. Adds the user's message to the conversation history
3. Generates and streams the chatbot's response using the `speaking` function
4. Saves the chatbot's response as an MP3 file
5. Adds the chatbot's response to the conversation history

Note: The implementation of the `speaking` function is specific to the omni-mini project and can be found [here](https://huggingface.co/spaces/gradio/omni-mini/blob/main/app.py#L116).
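
To give a sense of the expected interface, `speaking` is a generator that takes the recorded WAV bytes and yields MP3 bytes as they become available. A self-contained stand-in that simply re-encodes the user's audio to MP3 in roughly one-second chunks (instead of calling the model) could look like this:

```python
import io
from collections.abc import Iterator

from pydub import AudioSegment

def speaking(wav_bytes: bytes) -> Iterator[bytes]:
    """Stand-in for the model call: echo the user's audio back as MP3
    chunks so the streaming plumbing can be exercised without mini omni."""
    segment = AudioSegment.from_file(io.BytesIO(wav_bytes), format="wav")
    for start_ms in range(0, len(segment), 1000):
        chunk = segment[start_ms:start_ms + 1000]
        buffer = io.BytesIO()
        chunk.export(buffer, format="mp3")
        yield buffer.getvalue()
```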

## Building the Gradio App

Now let's put it all together using Gradio's Blocks API:

```python
import gradio as gr

def start_recording_user(state: AppState):
    if not state.stopped:
        return gr.Audio(recording=True)

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(
                label="Input Audio", sources="microphone", type="numpy"
            )
        with gr.Column():
            chatbot = gr.Chatbot(label="Conversation", type="messages")
            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
    state = gr.State(value=AppState())

    stream = input_audio.stream(
        process_audio,
        [input_audio, state],
        [input_audio, state],
        stream_every=0.5,
        time_limit=30,
    )
    respond = input_audio.stop_recording(
        response,
        [state],
        [output_audio, state]
    )
    respond.then(lambda s: s.conversation, [state], [chatbot])

    restart = output_audio.stop(
        start_recording_user,
        [state],
        [input_audio]
    )
    cancel = gr.Button("Stop Conversation", variant="stop")
    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
                 [state, input_audio], cancels=[respond, restart])

if __name__ == "__main__":
    demo.launch()
```

This setup creates a user interface with:
- An input audio component for recording user messages
- A chatbot component to display the conversation history
- An output audio component for the chatbot's responses
- A button to stop and reset the conversation

The app streams user audio in 0.5-second chunks, processes it, generates responses, and updates the conversation history accordingly.

## Conclusion

This guide demonstrates how to build a conversational chatbot application using Gradio and the mini omni model. You can adapt this framework to create various audio-based chatbot demos. To see the full application in action, visit the Hugging Face Spaces demo: https://huggingface.co/spaces/gradio/omni-mini

Feel free to experiment with different models, audio processing techniques, or user interface designs to create your own unique conversational AI experiences!
3 changes: 2 additions & 1 deletion js/_website/src/routes/redirects.js
@@ -18,5 +18,6 @@ export const redirects = {
"/docs/python-client/python-client": "/docs/python-client/introduction",
"/guides/sharing-your-app#security-and-file-access": "/guides/file-access",
"/docs/gradio/interface#interface-queue": "/docs/gradio/interface",
"/using_hugging_face_integrations": "/guides/using-hugging-face-integrations"
"/using_hugging_face_integrations": "/guides/using-hugging-face-integrations",
"/main/guides/object-detection-from-webcam": "/main/guides/object-detection-from-webcam-with-webrtc"
};
2 changes: 2 additions & 0 deletions js/audio/Index.svelte
@@ -41,6 +41,7 @@
export let streaming: boolean;
export let stream_every: number;
export let input_ready: boolean;
export let recording = false;
let uploading = false;
$: input_ready = !uploading;
@@ -240,6 +241,7 @@
{active_source}
{pending}
{streaming}
bind:recording
{loop}
max_file_size={gradio.max_file_size}
{handle_reset_value}
7 changes: 5 additions & 2 deletions js/audio/interactive/InteractiveAudio.svelte
@@ -40,6 +40,7 @@
export let stream_handler: Client["stream"];
export let stream_every: number;
export let uploading = false;
export let recording = false;
let time_limit: number | null = null;
let stream_state: "open" | "waiting" | "closed" = "closed";
@@ -65,14 +66,12 @@
// TODO: make use of this
// export let type: "normal" | "numpy" = "normal";
let recording = false;
let recorder: IMediaRecorder;
let mode = "";
let header: Uint8Array | undefined = undefined;
let pending_stream: Uint8Array[] = [];
let submit_pending_stream_on_pending_end = false;
let inited = false;
let stream_open = false;
const NUM_HEADER_BYTES = 44;
let audio_chunks: Blob[] = [];
@@ -236,6 +235,9 @@
mode = "";
}
}
$: if (!recording && recorder) stop();
$: if (recording && recorder) record();
</script>

<BlockLabel
@@ -264,6 +266,7 @@
bind:mode
{i18n}
{editable}
{recording}
{dispatch_blob}
{waveform_settings}
{waveform_options}
2 changes: 2 additions & 0 deletions js/audio/player/AudioPlayer.svelte
@@ -208,6 +208,8 @@
autoplay={waveform_settings.autoplay}
on:load
bind:this={audio_player}
on:ended={() => dispatch("stop")}
on:play={() => dispatch("play")}
/>
{#if value === null}
<Empty size="small">
2 changes: 2 additions & 0 deletions js/audio/recorder/AudioRecorder.svelte
@@ -23,6 +23,7 @@
};
export let handle_reset_value: () => void;
export let editable = true;
export let recording = false;
let micWaveform: WaveSurfer;
let recordingWaveform: WaveSurfer;
@@ -226,6 +227,7 @@
bind:record
{i18n}
{timing}
{recording}
show_recording_waveform={waveform_options.show_recording_waveform}
record_time={format_time(seconds)}
/>
10 changes: 10 additions & 0 deletions js/audio/shared/WaveformRecordControls.svelte
@@ -6,13 +6,15 @@
export let record: RecordPlugin;
export let i18n: I18nFormatter;
export let recording = false;
let micDevices: MediaDeviceInfo[] = [];
let recordButton: HTMLButtonElement;
let pauseButton: HTMLButtonElement;
let resumeButton: HTMLButtonElement;
let stopButton: HTMLButtonElement;
let stopButtonPaused: HTMLButtonElement;
let recording_ongoing = false;
export let record_time: string;
export let show_recording_waveform: boolean | undefined;
@@ -53,6 +55,14 @@
stopButton.style.display = "flex";
stopButtonPaused.style.display = "none";
});
$: if (recording && !recording_ongoing) {
record.startRecording();
recording_ongoing = true;
} else {
record.stopRecording();
recording_ongoing = false;
}
</script>

<div class="controls">
