Object Detection From Webcam Stream Guide #9336

Merged: 42 commits, Sep 19, 2024

Changes from all commits (42 commits)
cfe4247
guides
freddyaboulton Sep 12, 2024
3a28eac
Add demo
freddyaboulton Sep 12, 2024
dadda8c
guide
freddyaboulton Sep 12, 2024
c9130ed
Merge branch '5.0-dev' into object-detection-guide
freddyaboulton Sep 12, 2024
3d54d4e
Add info about Powershell client (#9343)
abidlabs Sep 12, 2024
a9b5182
Remove lite/theme.css from the Git-managed file tree (#9335)
whitphx Sep 13, 2024
f7f7885
9227 chatinterface retry bug (#9316)
freddyaboulton Sep 13, 2024
e57f086
Move icons into `IconButtonWrapper` (#9261)
hannahblair Sep 13, 2024
c8510c1
Added gradio-in-r (#9340)
Ifeanyi55 Sep 13, 2024
2e034c6
Enhance Lite E2E tests and fix a networking problem on Lite (#9333)
whitphx Sep 13, 2024
4ddb5db
Do not attach `content_disposition_type = "attachment"` headers for f…
abidlabs Sep 13, 2024
d04ab18
Fix overflowing markdown in Chatbot (#9260)
hannahblair Sep 13, 2024
bae18df
demo name
freddyaboulton Sep 16, 2024
ee3a05a
Merge branch '5.0-dev' into object-detection-guide
freddyaboulton Sep 16, 2024
500f4e7
Guide on Streaming Video for Object Detection (#9365)
freddyaboulton Sep 18, 2024
029e310
Small tweak to how thoughts are shown in `gr.Chatbot` (#9359)
abidlabs Sep 16, 2024
dc05f53
Use `container` param in `gr.Markdown` (#9356)
hannahblair Sep 16, 2024
1cc71c3
small fixes (#9347)
julien-c Sep 13, 2024
7c5d26e
Updated Guide: Real Time Speech Recognition (#9349)
Nik-Kras Sep 16, 2024
b9e5b3e
chunk space uploads (#9360)
pngwn Sep 17, 2024
4d41c80
add find (#9368)
aliabd Sep 17, 2024
bdc9e95
New branch (#9369)
aliabd Sep 17, 2024
74eba65
New branch (#9370)
aliabd Sep 17, 2024
9dc7bb6
run format
hannahblair Sep 17, 2024
ee0ae3c
Testing CI (#9379)
aliabd Sep 18, 2024
69b5fdc
Fixes website build in 5.0-dev (#9382)
aliabd Sep 18, 2024
633e75c
Small tweaks to improve the DX for the "tuples"/"messages" argument i…
abidlabs Sep 18, 2024
7a725c4
Update babylon.js to `v7` for `gr.Model3D` (#9377)
abidlabs Sep 18, 2024
498996e
Fix `gr.ImageEditor` toolbar cutoff (#9371)
hannahblair Sep 18, 2024
deef3b7
add lite upload (#9385)
aliabd Sep 18, 2024
f4b335c
fix sha (#9386)
aliabd Sep 18, 2024
9d017ae
Fix lite ci (#9387)
aliabd Sep 18, 2024
788f5cb
Add code
freddyaboulton Sep 18, 2024
3b9019b
feedback
freddyaboulton Sep 18, 2024
f69b7fe
merge latest
freddyaboulton Sep 18, 2024
5552aca
link
freddyaboulton Sep 18, 2024
96fc032
add changeset
gradio-pr-bot Sep 18, 2024
a7fc03d
code
freddyaboulton Sep 18, 2024
43fe4df
check
freddyaboulton Sep 18, 2024
e7ce4c5
Update guides/04_additional-features/02_streaming-outputs.md
abidlabs Sep 19, 2024
0fd2b0d
Update guides/07_streaming/02_object-detection-from-webcam.md
abidlabs Sep 19, 2024
dd97fb5
Merge branch '5.0-dev' into object-detection-guide
abidlabs Sep 19, 2024
5 changes: 5 additions & 0 deletions .changeset/floppy-nails-grab.md
@@ -0,0 +1,5 @@
---
"gradio": minor
---

feat:Object Detection From Webcam Stream Guide
45 changes: 45 additions & 0 deletions demo/rt-detr-object-detection/draw_boxes.py
@@ -0,0 +1,45 @@
from PIL import ImageDraw, ImageFont # type: ignore
import colorsys


def get_color(label):
# Simple hash function to generate consistent colors for each label
hash_value = hash(label)
hue = (hash_value % 100) / 100.0
saturation = 0.7
value = 0.9
rgb = colorsys.hsv_to_rgb(hue, saturation, value)
return tuple(int(x * 255) for x in rgb)


def draw_bounding_boxes(image, results: dict, model, threshold=0.3):
draw = ImageDraw.Draw(image)
font = ImageFont.load_default()

for score, label_id, box in zip(
results["scores"], results["labels"], results["boxes"]
):
if score > threshold:
label = model.config.id2label[label_id.item()]
box = [round(i, 2) for i in box.tolist()]
color = get_color(label)

# Draw bounding box
draw.rectangle(box, outline=color, width=3) # type: ignore

# Prepare text
text = f"{label}: {score:.2f}"
text_bbox = draw.textbbox((0, 0), text, font=font)
text_width = text_bbox[2] - text_bbox[0]
text_height = text_bbox[3] - text_bbox[1]

# Draw text background
draw.rectangle(
[box[0], box[1] - text_height - 4, box[0] + text_width, box[1]], # type: ignore
fill=color, # type: ignore
)

# Draw text
draw.text((box[0], box[1] - text_height - 4), text, fill="white", font=font)

return image
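
For reference, a minimal sketch of how `draw_bounding_boxes` is meant to be called with RT-DETR outputs (the input path is illustrative; the checkpoint mirrors the `run.py` demo below):

```python
from PIL import Image
import torch
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

from draw_boxes import draw_bounding_boxes

# Same checkpoint as the RT-DETR demo in this PR.
image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd")

image = Image.open("example.jpg")  # illustrative input path
inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post_process_object_detection returns one dict per image with "scores", "labels", "boxes",
# which is exactly the structure draw_bounding_boxes expects.
results = image_processor.post_process_object_detection(
    outputs,
    target_sizes=torch.tensor([image.size[::-1]]),  # (height, width)
    threshold=0.3,
)[0]

annotated = draw_bounding_boxes(image, results, model, threshold=0.3)
annotated.save("annotated.jpg")
```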
4 changes: 4 additions & 0 deletions demo/rt-detr-object-detection/requirements.txt
@@ -0,0 +1,4 @@
safetensors==0.4.3
opencv-python
torch
transformers>=4.43.0
1 change: 1 addition & 0 deletions demo/rt-detr-object-detection/run.ipynb
@@ -0,0 +1 @@
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: rt-detr-object-detection"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio safetensors==0.4.3 opencv-python torch transformers>=4.43.0"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/rt-detr-object-detection/draw_boxes.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import spaces\n", "import gradio as gr\n", "import cv2\n", "from PIL import Image\n", "import torch\n", "import time\n", "import numpy as np\n", "import uuid\n", "\n", "from transformers import RTDetrForObjectDetection, RTDetrImageProcessor # type: ignore\n", "\n", "from draw_boxes import draw_bounding_boxes\n", "\n", "image_processor = RTDetrImageProcessor.from_pretrained(\"PekingU/rtdetr_r50vd\")\n", "model = RTDetrForObjectDetection.from_pretrained(\"PekingU/rtdetr_r50vd\").to(\"cuda\")\n", "\n", "\n", "SUBSAMPLE = 2\n", "\n", "\n", "@spaces.GPU\n", "def stream_object_detection(video, conf_threshold):\n", " cap = cv2.VideoCapture(video)\n", "\n", " video_codec = cv2.VideoWriter_fourcc(*\"mp4v\") # type: ignore\n", " fps = int(cap.get(cv2.CAP_PROP_FPS))\n", "\n", " desired_fps = fps // SUBSAMPLE\n", " width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) // 2\n", " height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) // 2\n", "\n", " iterating, frame = cap.read()\n", "\n", " n_frames = 0\n", "\n", " name = f\"output_{uuid.uuid4()}.mp4\"\n", " segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height)) # type: ignore\n", " batch = []\n", "\n", " while iterating:\n", " frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)\n", " frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n", " if n_frames % SUBSAMPLE == 0:\n", " batch.append(frame)\n", " if len(batch) == 2 * desired_fps:\n", " inputs = image_processor(images=batch, return_tensors=\"pt\").to(\"cuda\")\n", "\n", " print(f\"starting batch of size {len(batch)}\")\n", " start = time.time()\n", " with torch.no_grad():\n", " outputs = model(**inputs)\n", " end = time.time()\n", " print(\"time taken for inference\", end - start)\n", "\n", " start = time.time()\n", " boxes = image_processor.post_process_object_detection(\n", " outputs,\n", " target_sizes=torch.tensor([(height, width)] * len(batch)),\n", " threshold=conf_threshold,\n", " )\n", "\n", " for _, (array, box) in enumerate(zip(batch, boxes)):\n", " pil_image = draw_bounding_boxes(\n", " Image.fromarray(array), box, model, conf_threshold\n", " )\n", " frame = np.array(pil_image)\n", " # Convert RGB to BGR\n", " frame = frame[:, :, ::-1].copy()\n", " segment_file.write(frame)\n", "\n", " batch = []\n", " segment_file.release()\n", " yield name\n", " end = time.time()\n", " print(\"time taken for processing boxes\", end - start)\n", " name = f\"output_{uuid.uuid4()}.mp4\"\n", " segment_file = cv2.VideoWriter(\n", " name, video_codec, desired_fps, (width, height)\n", " ) # type: ignore\n", "\n", " iterating, frame = cap.read()\n", " n_frames += 1\n", "\n", "\n", "with gr.Blocks() as demo:\n", " gr.HTML(\n", " \"\"\"\n", " <h1 style='text-align: center'>\n", " Video 
Object Detection with <a href='https://huggingface.co/PekingU/rtdetr_r101vd_coco_o365' target='_blank'>RT-DETR</a>\n", " </h1>\n", " \"\"\"\n", " )\n", " with gr.Row():\n", " with gr.Column():\n", " video = gr.Video(label=\"Video Source\")\n", " conf_threshold = gr.Slider(\n", " label=\"Confidence Threshold\",\n", " minimum=0.0,\n", " maximum=1.0,\n", " step=0.05,\n", " value=0.30,\n", " )\n", " with gr.Column():\n", " output_video = gr.Video(\n", " label=\"Processed Video\", streaming=True, autoplay=True\n", " )\n", "\n", " video.upload(\n", " fn=stream_object_detection,\n", " inputs=[video, conf_threshold],\n", " outputs=[output_video],\n", " )\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
115 changes: 115 additions & 0 deletions demo/rt-detr-object-detection/run.py
@@ -0,0 +1,115 @@
import spaces
import gradio as gr
import cv2
from PIL import Image
import torch
import time
import numpy as np
import uuid

from transformers import RTDetrForObjectDetection, RTDetrImageProcessor # type: ignore

from draw_boxes import draw_bounding_boxes

image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd").to("cuda")


SUBSAMPLE = 2


@spaces.GPU
def stream_object_detection(video, conf_threshold):
cap = cv2.VideoCapture(video)

video_codec = cv2.VideoWriter_fourcc(*"mp4v") # type: ignore
fps = int(cap.get(cv2.CAP_PROP_FPS))

desired_fps = fps // SUBSAMPLE
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) // 2
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) // 2

iterating, frame = cap.read()

n_frames = 0

name = f"output_{uuid.uuid4()}.mp4"
segment_file = cv2.VideoWriter(name, video_codec, desired_fps, (width, height)) # type: ignore
batch = []

while iterating:
frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
if n_frames % SUBSAMPLE == 0:
batch.append(frame)
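# Each batch holds 2 * desired_fps subsampled frames, i.e. roughly two seconds of video.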
if len(batch) == 2 * desired_fps:
inputs = image_processor(images=batch, return_tensors="pt").to("cuda")

print(f"starting batch of size {len(batch)}")
start = time.time()
with torch.no_grad():
outputs = model(**inputs)
end = time.time()
print("time taken for inference", end - start)

start = time.time()
boxes = image_processor.post_process_object_detection(
outputs,
target_sizes=torch.tensor([(height, width)] * len(batch)),
threshold=conf_threshold,
)

for _, (array, box) in enumerate(zip(batch, boxes)):
pil_image = draw_bounding_boxes(
Image.fromarray(array), box, model, conf_threshold
)
frame = np.array(pil_image)
# Convert RGB to BGR
frame = frame[:, :, ::-1].copy()
segment_file.write(frame)

batch = []
segment_file.release()
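# Yield the finished segment's filename so the gr.Video(streaming=True) output can start playing it.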
yield name
end = time.time()
print("time taken for processing boxes", end - start)
name = f"output_{uuid.uuid4()}.mp4"
segment_file = cv2.VideoWriter(
name, video_codec, desired_fps, (width, height)
) # type: ignore

iterating, frame = cap.read()
n_frames += 1


with gr.Blocks() as demo:
gr.HTML(
"""
<h1 style='text-align: center'>
Video Object Detection with <a href='https://huggingface.co/PekingU/rtdetr_r101vd_coco_o365' target='_blank'>RT-DETR</a>
</h1>
"""
)
with gr.Row():
with gr.Column():
video = gr.Video(label="Video Source")
conf_threshold = gr.Slider(
label="Confidence Threshold",
minimum=0.0,
maximum=1.0,
step=0.05,
value=0.30,
)
with gr.Column():
output_video = gr.Video(
label="Processed Video", streaming=True, autoplay=True
)

video.upload(
fn=stream_object_detection,
inputs=[video, conf_threshold],
outputs=[output_video],
)

if __name__ == "__main__":
demo.launch()
2 changes: 2 additions & 0 deletions demo/yolov10_webcam_stream/requirements.txt
@@ -0,0 +1,2 @@
safetensors==0.4.3
git+https://github.com/THU-MIG/yolov10.git
1 change: 1 addition & 0 deletions demo/yolov10_webcam_stream/run.ipynb
@@ -0,0 +1 @@
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: yolov10_webcam_stream"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio safetensors==0.4.3 git+https://github.com/THU-MIG/yolov10.git"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "\n", "from ultralytics import YOLOv10\n", "\n", "model = YOLOv10.from_pretrained(\"jameslahm/yolov10n\")\n", "\n", "\n", "def yolov10_inference(image, conf_threshold):\n", " width, _ = image.size\n", " import time\n", "\n", " start = time.time()\n", " results = model.predict(source=image, imgsz=width, conf=conf_threshold)\n", " end = time.time()\n", " annotated_image = results[0].plot()\n", " print(\"time\", end - start)\n", " return annotated_image[:, :, ::-1]\n", "\n", "\n", "css = \"\"\".my-group {max-width: 600px !important; max-height: 600 !important;}\n", " .my-column {display: flex !important; justify-content: center !important; align-items: center !important};\"\"\"\n", "\n", "\n", "with gr.Blocks(css=css) as app:\n", " gr.HTML(\n", " \"\"\"\n", " <h1 style='text-align: center'>\n", " <a href='https://github.com/THU-MIG/yolov10' target='_blank'>YOLO V10</a> Webcam Stream Object Detection\n", " </h1>\n", " \"\"\"\n", " )\n", " with gr.Column(elem_classes=[\"my-column\"]):\n", " with gr.Group(elem_classes=[\"my-group\"]):\n", " image = gr.Image(type=\"pil\", label=\"Image\", sources=\"webcam\")\n", " conf_threshold = gr.Slider(\n", " label=\"Confidence Threshold\",\n", " minimum=0.0,\n", " maximum=1.0,\n", " step=0.05,\n", " value=0.30,\n", " )\n", " image.stream(\n", " fn=yolov10_inference,\n", " inputs=[image, conf_threshold],\n", " outputs=[image],\n", " stream_every=0.1,\n", " time_limit=30,\n", " )\n", "\n", "if __name__ == \"__main__\":\n", " app.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
51 changes: 51 additions & 0 deletions demo/yolov10_webcam_stream/run.py
@@ -0,0 +1,51 @@
import gradio as gr

from ultralytics import YOLOv10

model = YOLOv10.from_pretrained("jameslahm/yolov10n")


def yolov10_inference(image, conf_threshold):
width, _ = image.size
import time

start = time.time()
results = model.predict(source=image, imgsz=width, conf=conf_threshold)
end = time.time()
annotated_image = results[0].plot()
print("time", end - start)
return annotated_image[:, :, ::-1]


css = """.my-group {max-width: 600px !important; max-height: 600 !important;}
.my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""


with gr.Blocks(css=css) as app:
gr.HTML(
"""
<h1 style='text-align: center'>
<a href='https://github.com/THU-MIG/yolov10' target='_blank'>YOLO V10</a> Webcam Stream Object Detection
</h1>
"""
)
with gr.Column(elem_classes=["my-column"]):
with gr.Group(elem_classes=["my-group"]):
image = gr.Image(type="pil", label="Image", sources="webcam")
conf_threshold = gr.Slider(
label="Confidence Threshold",
minimum=0.0,
maximum=1.0,
step=0.05,
value=0.30,
)
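# Send a new webcam frame to the server every 0.1 s; each streaming session is capped at 30 s.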
image.stream(
fn=yolov10_inference,
inputs=[image, conf_threshold],
outputs=[image],
stream_every=0.1,
time_limit=30,
)

if __name__ == "__main__":
app.launch()
4 changes: 2 additions & 2 deletions gradio/route_utils.py
@@ -897,15 +897,15 @@ def __init__(self):
self.ended = False
self.segment_index = 0
self.playlist = "#EXTM3U\n#EXT-X-PLAYLIST-TYPE:EVENT\n#EXT-X-TARGETDURATION:10\n#EXT-X-VERSION:4\n#EXT-X-MEDIA-SEQUENCE:0\n"
self.max_length = 5
self.max_duration = 5

async def add_segment(self, data: MediaStreamChunk | None):
if not data:
return

segment_id = str(uuid.uuid4())
self.segments.append({"id": segment_id, **data})
self.max_duration = max(self.max_length, data["duration"]) + 1
self.max_duration = max(self.max_duration, data["duration"]) + 1

def end_stream(self):
self.ended = True
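
The two changed lines above fix a subtle bug: `add_segment` previously compared each new chunk's duration against the constant `self.max_length` floor, so the duration of a long earlier segment was forgotten; with the rename, `max_duration` is a true running maximum. A standalone illustration of the difference (not the actual `MediaStream` class):

```python
def new_max_duration(durations, floor=5):
    # Fixed behaviour: max_duration only ever grows as segments arrive.
    max_duration = floor
    for d in durations:
        max_duration = max(max_duration, d) + 1
    return max_duration

def old_max_duration(durations, floor=5):
    # Old behaviour: each chunk was compared only against the constant floor,
    # so the maximum from earlier, longer segments was lost.
    max_duration = floor
    for d in durations:
        max_duration = max(floor, d) + 1
    return max_duration

print(new_max_duration([12, 3]))  # 14
print(old_max_duration([12, 3]))  # 6, underestimating the 12-second segment
```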
8 changes: 6 additions & 2 deletions guides/04_additional-features/02_streaming-outputs.md
@@ -33,7 +33,7 @@ For audio, the next "chunk" can be either an `.mp3` or `.wav` file or a `bytes`
For video, the next "chunk" has to be either an `.mp4` file or a file encoded with the `h.264` codec and a `.ts` extension.
For smooth playback, make sure chunks have consistent lengths and are longer than 1 second.

We'll finish with some simple examples illustrating these points. For a deeper tutorial on streaming audio with [transformers](https://huggingface.co/docs/transformers/index), please see [this guide](/main/guides/streaming-ai-generated-audio).
We'll finish with some simple examples illustrating these points.

### Streaming Audio

@@ -67,4 +67,8 @@ gr.Interface(keep_repeating,
gr.Video(sources=["webcam"], format="mp4"),
gr.Video(streaming=True, autoplay=True)
).launch()
```
```
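
To make the chunk contract above concrete, a streaming video handler is just a generator that yields successive chunk files. A minimal sketch, assuming a helper `split_into_mp4_segments` that writes consecutive ~2-second `.mp4` files (the helper is illustrative, not part of Gradio):

```python
import gradio as gr

def stream_segments(video_path):
    # split_into_mp4_segments is a hypothetical helper that yields paths
    # to consecutive ~2-second .mp4 files cut from the input video.
    for segment_path in split_into_mp4_segments(video_path, seconds_per_segment=2):
        yield segment_path

gr.Interface(
    stream_segments,
    gr.Video(),
    gr.Video(streaming=True, autoplay=True),
).launch()
```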

## End-to-End Examples

For an end-to-end example of streaming media, see the object detection from video [guide](/main/guides/object-detection-from-video) or the streaming AI-generated audio with [transformers](https://huggingface.co/docs/transformers/index) [guide](/main/guides/streaming-ai-generated-audio).
6 changes: 5 additions & 1 deletion guides/04_additional-features/03_streaming-inputs.md
@@ -38,7 +38,7 @@ $demo_streaming_filter_unified

Your streaming function should be stateless. It should take the current input and return its corresponding output. However, there are cases where you may want to keep track of past inputs or outputs. For example, you may want to keep a buffer of the previous `k` inputs to improve the accuracy of your transcription demo. You can do this with Gradio's `gr.State()` component.

Let's showcase this with a sample demo
Let's showcase this with a sample demo:

```python
def transcribe_handler(current_audio, state, transcript):
@@ -60,3 +60,7 @@ with gr.Blocks() as demo:

demo.launch()
```
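
The essential wiring of that pattern, a `.stream()` event whose inputs and outputs include a `gr.State` buffer, looks roughly like the sketch below (`transcribe` is a stand-in for a real speech-to-text call, and the buffer size is illustrative):

```python
import gradio as gr

def buffered_transcribe(current_chunk, buffer, transcript):
    # Keep only the most recent k audio chunks in the gr.State buffer.
    buffer = ((buffer or []) + [current_chunk])[-3:]
    new_text = transcribe(buffer)  # stand-in for a real transcription model
    return buffer, transcript + " " + new_text

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], streaming=True)
    buffer = gr.State([])
    transcript = gr.Textbox(label="Transcript")
    mic.stream(
        buffered_transcribe,
        inputs=[mic, buffer, transcript],
        outputs=[buffer, transcript],
    )

demo.launch()
```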

## End-to-End Examples

For an end-to-end example of streaming from the webcam, see the object detection from webcam [guide](/main/guides/object-detection-from-webcam).