From a24575102dc9026c995076d4af023c40dbb6793a Mon Sep 17 00:00:00 2001
From: Anand Bodas
Date: Wed, 29 May 2024 21:02:50 +0530
Subject: [PATCH] Cleaned up Readme, requirements and VectorDB; Added env variables for HF Token and VectorDB service

Signed-off-by: Anand Bodas
---
 VideoRAGQnA/README.md                        | 13 +++----
 VideoRAGQnA/docs/config.yaml                 | 14 ++++----
 VideoRAGQnA/docs/requirements.txt            |  4 ++-
 .../embedding/generate_store_embeddings.py   |  6 ++--
 VideoRAGQnA/utils/prompt_handler.py          |  4 +--
 VideoRAGQnA/video-rag-ui.py                  | 34 +++++++------------
 6 files changed, 35 insertions(+), 40 deletions(-)

diff --git a/VideoRAGQnA/README.md b/VideoRAGQnA/README.md
index 0915abc0c..33f99a75b 100644
--- a/VideoRAGQnA/README.md
+++ b/VideoRAGQnA/README.md
@@ -15,13 +15,13 @@ Visual RAG is a framework that retrieves videos based on a provided user prompt. It
 ## Prerequisites
-There are 10 example videos present in ```files/videos``` along with their description generated by open-source vision model.
+There are 10 example videos present in ```video_ingest/videos``` along with their descriptions generated by an open-source vision model.
 If you want this visual RAG to work on your own videos, make sure they match the format below.
 ## File Structure
 ```bash
-files/
+video_ingest/
 .
 ├── scene_description
 │   ├── op_10_0320241830.mp4.txt
@@ -52,7 +52,8 @@ files/
 Install pip requirements
 ```bash
-pip3 install -r VideoRAGQnA/requirements.txt
+cd VideoRAGQnA
+pip3 install -r docs/requirements.txt
 ```
 The current framework supports both Chroma DB and Intel's VDMS; use either of them,
@@ -72,12 +73,12 @@ docker run -d -p 55555:55555 intellabs/vdms:latest
 Update your choice of db and port in ```config.yaml```.
-Generating Image embeddigns and store them into selected db, specify config file location and video input location
+Generate image embeddings and store them in the selected DB; specify the config file location and the video input location
 ```bash
-python3 VideoRAGQnA/embedding/generate_store_embeddings.py VideoRAGQnA/docs/config.yaml VideoRAGQnA/video_ingest/videos/
+python3 embedding/generate_store_embeddings.py docs/config.yaml video_ingest/videos/
 ```
 **Web UI Video RAG**
 ```bash
 streamlit run video-rag-ui.py --server.address 0.0.0.0 --server.port 50055
-```
\ No newline at end of file
+```
diff --git a/VideoRAGQnA/docs/config.yaml b/VideoRAGQnA/docs/config.yaml
index 1f86c967b..391b2f79b 100755
--- a/VideoRAGQnA/docs/config.yaml
+++ b/VideoRAGQnA/docs/config.yaml
@@ -1,23 +1,23 @@
 # Path to all videos
-videos: VideoRAGQnA/video_ingest/videos/
+videos: video_ingest/videos/
 # Path to video description generated by open-source vision models (ex. video-llama, video-llava, etc.)
-description: VideoRAGQnA/video_ingest/scene_description/
+description: video_ingest/scene_description/
 # Do you want to extract frames of videos (True if not done already, else False)
 generate_frames: True
 # Do you want to generate image embeddings?
 embed_frames: True
 # Path to store extracted frames
-image_output_dir: VideoRAGQnA/video_ingest/frames/
+image_output_dir: video_ingest/frames/
 # Path to store metadata files
-meta_output_dir: VideoRAGQnA/video_ingest/frame_metadata/
+meta_output_dir: video_ingest/frame_metadata/
 # Number of frames to extract per second,
 # if 24 fps, and this value is 2, then it will extract 12th and 24th frame
 number_of_frames_per_second: 2
 vector_db:
   choice_of_db: 'vdms' #'chroma' # #Supported databases [vdms, chroma]
-  host: 10.190.167.193
-  port: 55556 #8000 #
+  host: 0.0.0.0
+  port: 55555 #8000 #
 # LLM path
-model_path: VideoRAGQnA/ckpt/llama-2-7b-chat-hf
\ No newline at end of file
+model_path: meta-llama/Llama-2-7b-chat-hf
diff --git a/VideoRAGQnA/docs/requirements.txt b/VideoRAGQnA/docs/requirements.txt
index 84a5eb474..d228806b5 100644
--- a/VideoRAGQnA/docs/requirements.txt
+++ b/VideoRAGQnA/docs/requirements.txt
@@ -6,4 +6,6 @@ streamlit
 metafunctions
 sentence-transformers
 accelerate
-vdms
\ No newline at end of file
+vdms
+tzlocal
+dateparser
diff --git a/VideoRAGQnA/embedding/generate_store_embeddings.py b/VideoRAGQnA/embedding/generate_store_embeddings.py
index d8547c7b2..8848fec8a 100644
--- a/VideoRAGQnA/embedding/generate_store_embeddings.py
+++ b/VideoRAGQnA/embedding/generate_store_embeddings.py
@@ -9,6 +9,7 @@
 # Add the parent directory of the current script to the Python path
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+VECTORDB_SERVICE_HOST_IP = os.getenv("VECTORDB_SERVICE_HOST_IP", "0.0.0.0")
 # sys.path.append(os.path.abspath('../utils'))
@@ -150,16 +151,17 @@ def retrieval_testing():
 meta_output_dir = config['meta_output_dir']
 N = config['number_of_frames_per_second']
-host = config['vector_db']['host']
+host = VECTORDB_SERVICE_HOST_IP
 port = int(config['vector_db']['port'])
 selected_db = config['vector_db']['choice_of_db']
 # Creating DB
 print ('Creating DB with text and image embedding support, \nIt may take a few minutes to download and load all required models if you are running for the first time.')
+print('Connect to {} at {}:{}'.format(selected_db, host, port))
 vs = db.VS(host, port, selected_db)
 generate_image_embeddings(selected_db)
 retrieval_testing()
-
\ No newline at end of file
+
diff --git a/VideoRAGQnA/utils/prompt_handler.py b/VideoRAGQnA/utils/prompt_handler.py
index 500f738b0..35b368fd3 100644
--- a/VideoRAGQnA/utils/prompt_handler.py
+++ b/VideoRAGQnA/utils/prompt_handler.py
@@ -1,8 +1,8 @@
 from jinja2 import Environment, BaseLoader
-PROMPT = open("VideoRAGQnA/utils/prompt_template.jinja2").read().strip()
+PROMPT = open("utils/prompt_template.jinja2").read().strip()
 def get_formatted_prompt(scene, prompt):
     env = Environment(loader=BaseLoader())
     template = env.from_string(PROMPT)
-    return template.render(scene=scene, prompt=prompt)
\ No newline at end of file
+    return template.render(scene=scene, prompt=prompt)
diff --git a/VideoRAGQnA/video-rag-ui.py b/VideoRAGQnA/video-rag-ui.py
index d9ea6e78d..7c4cc2a55 100644
--- a/VideoRAGQnA/video-rag-ui.py
+++ b/VideoRAGQnA/video-rag-ui.py
@@ -3,26 +3,25 @@
 from embedding.vector_stores import db
 import time
 import torch
-import streamlit as st
 import torch
 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from transformers import set_seed
-from transformers import TextIteratorStreamer
 from typing import Any, List, Mapping, Optional
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain.llms.base import LLM
 import threading
-from transformers import set_seed
 from utils import config_reader as reader
 from utils import prompt_handler as ph
 # from vector_stores import db
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
 set_seed(22)
 if 'config' not in st.session_state.keys():
-    st.session_state.config = reader.read_config('VideoRAGQnA/docs/config.yaml')
+    st.session_state.config = reader.read_config('docs/config.yaml')
 config = st.session_state.config
@@ -51,11 +50,12 @@
 @st.cache_resource
 def load_models():
+    #print("HF Token: ", HUGGINGFACEHUB_API_TOKEN)
     model = AutoModelForCausalLM.from_pretrained(
-        model_path, torch_dtype=torch.float32, device_map='auto', trust_remote_code=True,
+        model_path, torch_dtype=torch.float32, device_map='auto', trust_remote_code=True, token=HUGGINGFACEHUB_API_TOKEN
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, token=HUGGINGFACEHUB_API_TOKEN)
     tokenizer.padding_size = 'right'
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
@@ -248,22 +248,12 @@ def display_messages():
             'Find similar videos',
             'Man wearing glasses',
             'People reading item description',
-            'Man wearing khaki pants',
-            'Man laughing',
-            'Black tshirt guy holding red basket',
             'Man holding red shopping basket',
-            'Man wearing blue shirt',
-            'Man putting object into his pocket',
-            'Was there any shoplifting reported?',
-            'Was there any shoplifting reported today?',
-            'Was there any shoplifting reported in the last 6 hours?',
-            'Was there any shoplifting reported last Sunday?',
-            'Was there any shoplifting reported last Monday?',
-            'Have there been instances of shoplifting?',
-            'Have there been instances of shoplifting last Friday?',
-            'Have there been any instances of theft or shoplifting in the last 30 minutes?',
-            'Have there been any instances of theft or shoplifting in the last 48 hours?',
-            'Have there been any instances of theft or shoplifting in the last 72 hours?',
+            'Was there any person wearing a blue shirt seen today?',
+            'Was there any person wearing a blue shirt seen in the last 6 hours?',
+            'Was there any person wearing a blue shirt seen last Sunday?',
+            'Was a person wearing glasses seen in the last 30 minutes?',
+            'Was a person wearing glasses seen in the last 72 hours?',
         ),
         key='example_video'
     )
@@ -290,4 +280,4 @@ def display_messages():
     with col1:
         display_messages()
-        handle_message()
\ No newline at end of file
+        handle_message()
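
A minimal usage sketch (not part of the patch) of how the two environment variables introduced above are expected to be set before running the ingestion script and the web UI; the token value is a placeholder, and the host assumes a vector DB container reachable on the local machine.

```bash
cd VideoRAGQnA

# Host/IP of the running VDMS or Chroma service; generate_store_embeddings.py falls back to 0.0.0.0 if unset.
export VECTORDB_SERVICE_HOST_IP=0.0.0.0

# Hugging Face token passed to from_pretrained() by video-rag-ui.py, needed to download the gated meta-llama/Llama-2-7b-chat-hf weights.
export HUGGINGFACEHUB_API_TOKEN="<your_hf_token>"  # placeholder value

python3 embedding/generate_store_embeddings.py docs/config.yaml video_ingest/videos/
streamlit run video-rag-ui.py --server.address 0.0.0.0 --server.port 50055
```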