Skip to content

Commit

Permalink
video ingestion with intervals #1
Browse files Browse the repository at this point in the history
  • Loading branch information
root committed Jun 10, 2024
1 parent eb4f924 commit c3f2b0b
Show file tree
Hide file tree
Showing 4 changed files with 326 additions and 15 deletions.
17 changes: 13 additions & 4 deletions VideoRAGQnA/docs/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

# Path to all videos
videos: video_ingest/videos/
videos: /home/intel-admin/plischwe
# Path to video description generated by open-source vision models (ex. video-llama, video-llava, etc.)
description: video_ingest/scene_description/
# Do you want to extract frames of videos (True if not done already, else False)
Expand All @@ -15,12 +15,21 @@ image_output_dir: video_ingest/frames/
meta_output_dir: video_ingest/frame_metadata/
# Number of frames to extract per second,
# if 24 fps, and this value is 2, then it will extract 12th and 24th frame
number_of_frames_per_second: 2
number_of_frames_per_second: 24
# Chunk duration defines the interval of time that each embedding will occur
chunk_duration: 30
# Clip duration defines the length of the interval in which the embedding will occur
clip_duration: 10
# e.g. For every <chunk_duration>, you embed the first <clip_duration> seconds of frames of that interval





vector_db:
choice_of_db: 'vdms' #'chroma' # #Supported databases [vdms, chroma]
choice_of_db: 'chroma' #'chroma' # #Supported databases [vdms, chroma]
host: 0.0.0.0
port: 55555 #8000 #
port: 8000 #55555

# LLM path
model_path: meta-llama/Llama-2-7b-chat-hf
1 change: 0 additions & 1 deletion VideoRAGQnA/embedding/vector_stores/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,6 @@ def MultiModalRetrieval(

self.update_db(query, n_images)
image_results = self.update_image_retriever.invoke(query)

for r in image_results:
print("images:", r.metadata["video"], "\t", r.metadata["date"], "\t", r.metadata["time"], "\n")

Expand Down
23 changes: 13 additions & 10 deletions VideoRAGQnA/video-rag-ui.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
from utils import config_reader as reader
from utils import prompt_handler as ph

# from vector_stores import db
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")

set_seed(22)

if "config" not in st.session_state.keys():
Expand All @@ -27,7 +24,8 @@

model_path = config["model_path"]
video_dir = config["videos"]
print(video_dir)
HUGGINGFACEHUB_API_TOKEN = os.getenv("ENTER HF TOKEN HERE", "")
#print(video_dir)
video_dir = video_dir.replace("../", "")
print(video_dir)
st.set_page_config(initial_sidebar_state="collapsed", layout="wide")
Expand Down Expand Up @@ -55,7 +53,6 @@ def load_models():
model = AutoModelForCausalLM.from_pretrained(
model_path, torch_dtype=torch.float32, device_map="auto", trust_remote_code=True, token=HUGGINGFACEHUB_API_TOKEN
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, token=HUGGINGFACEHUB_API_TOKEN)
tokenizer.padding_size = "right"
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
Expand Down Expand Up @@ -117,6 +114,7 @@ def get_top_doc(results, qcnt):
for r in results:
try:
video_name = r.metadata["video"]
# timestamp = r.metadata["start of interval in sec"]
if video_name not in hit_score.keys():
hit_score[video_name] = 0
hit_score[video_name] += 1
Expand All @@ -131,15 +129,15 @@ def get_top_doc(results, qcnt):
return {"video": list(x)[qcnt]}


def play_video(x):
def play_video(x, offset):
if x is not None:
video_file = x.replace(".pt", "")
path = video_dir + video_file

video_file = open(path, "rb")
video_bytes = video_file.read()

st.video(video_bytes, start_time=0)
st.video(video_bytes, start_time=offset)


if "llm" not in st.session_state.keys():
Expand Down Expand Up @@ -182,8 +180,11 @@ def RAG(prompt):
if top_doc == None:
return None, None
video_name = top_doc["video"]
timestamp = top_doc["start of interval in sec"]
print('Video from top doc: ', video_name)
print('Timestamp for playback: ', timestamp)

return video_name, top_doc
return video_name, timestamp, top_doc


def get_description(vn):
Expand Down Expand Up @@ -225,14 +226,16 @@ def handle_message():
else:
st.session_state["qcnt"] = 0
st.session_state["prevprompt"] = prompt
video_name, top_doc = RAG(prompt)
video_name, start_time, top_doc = RAG(prompt)
if video_name == None:
full_response = "No more relevant videos found. Select a different query. \n\n"
placeholder.markdown(full_response)
end = time.time()
else:
with col2:
play_video(video_name)
# get metadata of the video (the metadata contains the global timestamp of the start of the 10 sec embedding interval) and use it
play_video_from_timestamp(video_name, start_time)
# play_video(video_name)

scene_des = get_description(video_name)
formatted_prompt = ph.get_formatted_prompt(scene=scene_des, prompt=prompt, history=get_history())
Expand Down
Loading

0 comments on commit c3f2b0b

Please sign in to comment.