Skip to content

Commit

Permalink
video ingestion with intervals #1
Browse files Browse the repository at this point in the history
  • Loading branch information
root committed Jun 10, 2024
1 parent eb4f924 commit c3f2b0b
Show file tree
Hide file tree
Showing 4 changed files with 326 additions and 15 deletions.
17 changes: 13 additions & 4 deletions VideoRAGQnA/docs/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

# Path to all videos
videos: video_ingest/videos/
videos: /home/intel-admin/plischwe
# Path to video description generated by open-source vision models (ex. video-llama, video-llava, etc.)
description: video_ingest/scene_description/
# Do you want to extract frames of videos (True if not done already, else False)
Expand All @@ -15,12 +15,21 @@ image_output_dir: video_ingest/frames/
meta_output_dir: video_ingest/frame_metadata/
# Number of frames to extract per second,
# if 24 fps, and this value is 2, then it will extract 12th and 24th frame
number_of_frames_per_second: 2
number_of_frames_per_second: 24
# Chunk duration defines the interval of time that each embedding will occur
chunk_duration: 30
# Clip duration defines the length of the interval in which the embedding will occur
clip_duration: 10
# e.g. For every <chunk_duration>, you embed the first <clip_duration> seconds of frames of that interval





vector_db:
choice_of_db: 'vdms' #'chroma' # #Supported databases [vdms, chroma]
choice_of_db: 'chroma' #'chroma' # #Supported databases [vdms, chroma]
host: 0.0.0.0
port: 55555 #8000 #
port: 8000 #55555

# LLM path
model_path: meta-llama/Llama-2-7b-chat-hf
1 change: 0 additions & 1 deletion VideoRAGQnA/embedding/vector_stores/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,6 @@ def MultiModalRetrieval(

self.update_db(query, n_images)
image_results = self.update_image_retriever.invoke(query)

for r in image_results:
print("images:", r.metadata["video"], "\t", r.metadata["date"], "\t", r.metadata["time"], "\n")

Expand Down
23 changes: 13 additions & 10 deletions VideoRAGQnA/video-rag-ui.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
from utils import config_reader as reader
from utils import prompt_handler as ph

# from vector_stores import db
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")

set_seed(22)

if "config" not in st.session_state.keys():
Expand All @@ -27,7 +24,8 @@

model_path = config["model_path"]
video_dir = config["videos"]
print(video_dir)
HUGGINGFACEHUB_API_TOKEN = os.getenv("ENTER HF TOKEN HERE", "")
#print(video_dir)
video_dir = video_dir.replace("../", "")
print(video_dir)
st.set_page_config(initial_sidebar_state="collapsed", layout="wide")
Expand Down Expand Up @@ -55,7 +53,6 @@ def load_models():
model = AutoModelForCausalLM.from_pretrained(
model_path, torch_dtype=torch.float32, device_map="auto", trust_remote_code=True, token=HUGGINGFACEHUB_API_TOKEN
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, token=HUGGINGFACEHUB_API_TOKEN)
tokenizer.padding_size = "right"
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
Expand Down Expand Up @@ -117,6 +114,7 @@ def get_top_doc(results, qcnt):
for r in results:
try:
video_name = r.metadata["video"]
# timestamp = r.metadata["start of interval in sec"]
if video_name not in hit_score.keys():
hit_score[video_name] = 0
hit_score[video_name] += 1
Expand All @@ -131,15 +129,15 @@ def get_top_doc(results, qcnt):
return {"video": list(x)[qcnt]}


def play_video(x):
def play_video(x, offset):
if x is not None:
video_file = x.replace(".pt", "")
path = video_dir + video_file

video_file = open(path, "rb")
video_bytes = video_file.read()

st.video(video_bytes, start_time=0)
st.video(video_bytes, start_time=offset)


if "llm" not in st.session_state.keys():
Expand Down Expand Up @@ -182,8 +180,11 @@ def RAG(prompt):
if top_doc == None:
return None, None
video_name = top_doc["video"]
timestamp = top_doc["start of interval in sec"]
print('Video from top doc: ', video_name)
print('Timestamp for playback: ', timestamp)

return video_name, top_doc
return video_name, timestamp, top_doc


def get_description(vn):
Expand Down Expand Up @@ -225,14 +226,16 @@ def handle_message():
else:
st.session_state["qcnt"] = 0
st.session_state["prevprompt"] = prompt
video_name, top_doc = RAG(prompt)
video_name, start_time, top_doc = RAG(prompt)
if video_name == None:
full_response = "No more relevant videos found. Select a different query. \n\n"
placeholder.markdown(full_response)
end = time.time()
else:
with col2:
play_video(video_name)
# get metadata of the video (the metadata contains the global timestamp of the start of the 10 sec embedding interval) and use it
play_video_from_timestamp(video_name, start_time)
# play_video(video_name)

scene_des = get_description(video_name)
formatted_prompt = ph.get_formatted_prompt(scene=scene_des, prompt=prompt, history=get_history())
Expand Down
Loading

0 comments on commit c3f2b0b

Please sign in to comment.