Added video support (working)

valentinfrlch · Jul 27, 2024 · b73507f · b73507f
1 parent 8796342
commit b73507f
Show file tree

Hide file tree

Showing 5 changed files with 55 additions and 38 deletions.
diff --git a/custom_components/llmvision/__init__.py b/custom_components/llmvision/__init__.py
@@ -17,6 +17,8 @@
     MESSAGE,
     IMAGE_FILE,
     IMAGE_ENTITY,
+    VIDEO_FILE,
+    INTERVAL,
     TEMPERATURE,
     DETAIL,
     INCLUDE_FILENAME
@@ -76,6 +78,9 @@ def __init__(self, data_call):
         self.image_paths = data_call.data.get(IMAGE_FILE, "").split(
             "\n") if data_call.data.get(IMAGE_FILE) else None
         self.image_entities = data_call.data.get(IMAGE_ENTITY)
+        self.video_paths = data_call.data.get(VIDEO_FILE, "").split(
+            "\n") if data_call.data.get(VIDEO_FILE) else None
+        self.interval = int(data_call.data.get(INTERVAL, 3))
         self.target_width = data_call.data.get(TARGET_WIDTH, 1280)
         self.temperature = float(data_call.data.get(TEMPERATURE, 0.5))
         self.max_tokens = int(data_call.data.get(MAXTOKENS, 100))
@@ -114,7 +119,7 @@ async def image_analyzer(data_call):
         # Fetch and preprocess images
         processor = MediaProcessor(hass, client)
         # Send images to RequestHandler client
-        client = await processor.add_image(call.image_entities, call.image_paths, call.target_width, call.include_filename)
+        client = await processor.add_images(call.image_entities, call.image_paths, call.target_width, call.include_filename)
 
         # Validate configuration, input data and make the call
         try:
@@ -127,13 +132,14 @@ async def image_analyzer(data_call):
     async def video_analyzer(data_call):
         """Handle the service call to analyze a video (future implementation)"""
         call = ServiceCallData(data_call).get_service_call_data()
+        call.message = "The attached images are frames from a video." + call.message
         client = RequestHandler(hass,
                                 message=call.message,
                                 max_tokens=call.max_tokens,
                                 temperature=call.temperature,
                                 detail=call.detail)
         processor = MediaProcessor(hass, client)
-        client = await processor.add_video(call.video_paths, call.interval, call.target_width, call.include_filename)
+        client = await processor.add_videos(call.video_paths, call.interval, call.target_width, call.include_filename)
         try:
             response = await client.make_request(call)
         except ServiceValidationError as e:

diff --git a/custom_components/llmvision/const.py b/custom_components/llmvision/const.py
@@ -22,6 +22,8 @@
 MESSAGE = 'message'
 IMAGE_FILE = 'image_file'
 IMAGE_ENTITY = 'image_entity'
+VIDEO_FILE = 'video_file'
+INTERVAL = 'interval'
 DETAIL = 'detail'
 TEMPERATURE = 'temperature'
 INCLUDE_FILENAME = 'include_filename'

diff --git a/custom_components/llmvision/media_handlers.py b/custom_components/llmvision/media_handlers.py
@@ -1,11 +1,14 @@
 import base64
 import io
 import os
+import shutil
+import logging
 from homeassistant.helpers.network import get_url
-# TODO: Use ffmpeg instead of moviepy
 from PIL import Image
 from homeassistant.exceptions import ServiceValidationError
 
+_LOGGER = logging.getLogger(__name__)
+
 
 class MediaProcessor:
     def __init__(self, hass, client):
@@ -15,14 +18,7 @@ def __init__(self, hass, client):
         self.filenames = []
 
     async def resize_image(self, target_width, image_path=None, image_data=None, img=None):
-        """Encode image as base64
-
-        Args:
-            image_path (string): path where image is stored e.g.: "/config/www/tmp/image.jpg"
-
-        Returns:
-            string: image encoded as base64
-        """
+        """Resize image to target_width"""
         loop = self.hass.loop
         if image_path:
             # Open the image file
@@ -70,13 +66,15 @@ async def resize_image(self, target_width, image_path=None, image_data=None, img
         return base64_image
 
     async def _encode_image(self, img):
+        """Save img as PNG into an in-memory buffer and return those bytes base64-encoded as a UTF-8 string"""
         img_byte_arr = io.BytesIO()
         img.save(img_byte_arr, format='PNG')
         base64_image = base64.b64encode(
             img_byte_arr.getvalue()).decode('utf-8')
         return base64_image
 
-    async def add_image(self, image_entities, image_paths, target_width, include_filename):
+    async def add_images(self, image_entities, image_paths, target_width, include_filename):
+        """Wrapper for client.add_image"""
         if image_entities:
             for image_entity in image_entities:
                 try:
@@ -124,26 +122,47 @@ async def add_image(self, image_entities, image_paths, target_width, include_fil
                     raise ServiceValidationError(f"Error: {e}")
         return self.client
 
-    async def add_video(self, video_paths, interval, target_width, include_filename):
+    async def add_videos(self, video_paths, interval, target_width, include_filename):
+        """Extract one frame per `interval` seconds from each video with ffmpeg and pass it to client.add_image.
+
+        Frames are resized to target_width and added in chronological order; returns self.client (also when video_paths is empty) and raises ServiceValidationError on a missing file or an ffmpeg failure.
+        """
+        import subprocess  # function-scope imports: only video processing needs them
+        import tempfile
         if video_paths:
+            _LOGGER.debug(f"Processing videos: {video_paths}")
+            loop = self.hass.loop
             for video_path in video_paths:
+                tmp_dir = None  # set once a frame directory exists, so cleanup knows what to remove
                 try:
                     video_path = video_path.strip()
                     if os.path.exists(video_path):
-                        # extract frames from video every interval seconds
-                        clip = VideoFileClip(video_path)
-                        duration = clip.duration
-                        for t in range(0, int(duration), interval):
-                            frame = clip.get_frame(t)
-                            # Convert frame (numpy array) to image and encode it
-                            img = Image.fromarray(frame)
+                        # extract one frame every 'interval' seconds into a private temp dir (fps alone does the sampling; chaining a select filter after it would thin the frames a second time)
+                        tmp_dir = await loop.run_in_executor(None, tempfile.mkdtemp)
+                        ffmpeg_cmd = [
+                            "ffmpeg",
+                            "-i", video_path,
+                            "-vf", f"fps=1/{interval}",
+                            os.path.join(tmp_dir, "frame%04d.png")
+                        ]
+                        # argv list without a shell: paths containing spaces or shell metacharacters stay intact; a non-zero exit raises
+                        await loop.run_in_executor(None, subprocess.check_call, ffmpeg_cmd)
+                        # os.listdir order is arbitrary; sort the zero-padded names to keep frames chronological
+                        frame_files = sorted(await loop.run_in_executor(None, os.listdir, tmp_dir))
+                        for frame_counter, frame_file in enumerate(frame_files):
+                            _LOGGER.debug(f"Adding frame {frame_file}")
+                            frame_path = os.path.join(tmp_dir, frame_file)
                             self.client.add_image(
-                                base64_image=await self.resize_image(img=img, target_width=target_width),
+                                base64_image=await self.resize_image(image_path=frame_path, target_width=target_width),
                                 filename=video_path.split(
-                                    '/')[-1].split('.')[-2] if include_filename else ""
+                                    '/')[-1].split('.')[-2] + " (frame " + str(frame_counter) + ")" if include_filename else "Video frame " + str(frame_counter)
                             )
                     if not os.path.exists(video_path):
                         raise ServiceValidationError(
                             f"File {video_path} does not exist")
                 except Exception as e:
                     raise ServiceValidationError(f"Error: {e}")
+                finally:  # remove extracted frames whether or not processing succeeded
+                    if tmp_dir is not None:
+                        await loop.run_in_executor(None, shutil.rmtree, tmp_dir, True)
+        return self.client
diff --git a/custom_components/llmvision/request_handlers.py b/custom_components/llmvision/request_handlers.py
@@ -55,7 +55,7 @@ def __init__(self, hass, message, max_tokens, temperature, detail):
         self.filenames = []
 
     async def make_request(self, call):
-        _LOGGER.debug(f"Base64 Images: {self.base64_images}")
+        _LOGGER.debug(f"Base64 Images: {sanitize_data(self.base64_images)}")
         if call.provider == 'OpenAI':
             api_key = self.hass.data.get(DOMAIN).get(CONF_OPENAI_API_KEY)
             model = call.model
@@ -338,17 +338,7 @@ async def _fetch(self, url):
         return data
 
     def _validate_call(self, provider, api_key, base64_images, ip_address=None, port=None):
-        """Validate the configuration for the component
-
-        Args:
-            mode (string): "OpenAI" or "LocalAI"
-            api_key (string): OpenAI API key
-            ip_address (string): LocalAI server IP address
-            port (string): LocalAI server port
-
-        Raises:
-            ServiceValidationError: if configuration is invalid
-        """
+        """Validate the service call data"""
         # Checks for OpenAI
         if provider == 'OpenAI':
             if not api_key:
@@ -368,7 +358,7 @@ def _validate_call(self, provider, api_key, base64_images, ip_address=None, port
         elif provider == 'Ollama':
             if not ip_address or not port:
                 raise ServiceValidationError(ERROR_OLLAMA_NOT_CONFIGURED)
-        # File path validation
+        # Check media input
         if base64_images == []:
             raise ServiceValidationError(ERROR_NO_IMAGE_INPUT)
 

diff --git a/custom_components/llmvision/services.yaml b/custom_components/llmvision/services.yaml
@@ -132,7 +132,7 @@ video_analyzer:
         text:
           multiline: true
     video_file:
-      name: Image File
+      name: Video File
       required: true
       description: 'Local path to video'
       example: "/config/www/recordings/front_door.mp4"
@@ -147,8 +147,8 @@ video_analyzer:
       default: 3
       selector:
         number:
-          min: 0.1
-          max: 100.0
+          min: 1
+          max: 60
     include_filename:
       name: Include Filename
       required: false