diff --git a/VideoRAGQnA/docs/config.yaml b/VideoRAGQnA/docs/config.yaml
index 2d501cdc2..bf004bd10 100755
--- a/VideoRAGQnA/docs/config.yaml
+++ b/VideoRAGQnA/docs/config.yaml
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Path to all videos
-videos: video_ingest/videos/
+videos: /home/intel-admin/plischwe
 # Path to video description generated by open-source vision models (ex. video-llama, video-llava, etc.)
 description: video_ingest/scene_description/
 # Do you want to extract frames of videos (True if not done already, else False)
@@ -15,12 +15,21 @@ image_output_dir: video_ingest/frames/
 meta_output_dir: video_ingest/frame_metadata/
 # Number of frames to extract per second,
 # if 24 fps, and this value is 2, then it will extract 12th and 24th frame
-number_of_frames_per_second: 2
+number_of_frames_per_second: 24
+# Chunk duration defines how often (in seconds) an interval of the video is sampled for embedding
+chunk_duration: 30
+# Clip duration defines the length (in seconds) of the interval that is embedded
+clip_duration: 10
+# e.g. for every chunk_duration seconds of video, only the first clip_duration seconds of that chunk are embedded
+
+
+
+
 
 vector_db:
-  choice_of_db: 'vdms' #'chroma' # #Supported databases [vdms, chroma]
+  choice_of_db: 'chroma' #'vdms' # #Supported databases [vdms, chroma]
   host: 0.0.0.0
-  port: 55555 #8000 #
+  port: 8000 #55555 #
 
 # LLM path
 model_path: meta-llama/Llama-2-7b-chat-hf
diff --git a/VideoRAGQnA/embedding/vector_stores/db.py b/VideoRAGQnA/embedding/vector_stores/db.py
index 19137145c..19fdf4dc3 100644
--- a/VideoRAGQnA/embedding/vector_stores/db.py
+++ b/VideoRAGQnA/embedding/vector_stores/db.py
@@ -197,7 +197,6 @@ def MultiModalRetrieval(
 
         self.update_db(query, n_images)
         image_results = self.update_image_retriever.invoke(query)
-
         for r in image_results:
             print("images:", r.metadata["video"], "\t", r.metadata["date"], "\t", r.metadata["time"], "\n")
 
diff --git a/VideoRAGQnA/video-rag-ui.py b/VideoRAGQnA/video-rag-ui.py
old mode 100644
new mode 100755
index 139a8e6ea..1a2ed6228
--- a/VideoRAGQnA/video-rag-ui.py
+++ b/VideoRAGQnA/video-rag-ui.py
@@ -15,9 +15,6 @@
 from utils import config_reader as reader
 from utils import prompt_handler as ph
 
-# from vector_stores import db
-HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
-
 set_seed(22)
 
 if "config" not in st.session_state.keys():
@@ -27,7 +24,8 @@
 model_path = config["model_path"]
 video_dir = config["videos"]
 
-print(video_dir)
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
+#print(video_dir)
 video_dir = video_dir.replace("../", "")
 print(video_dir)
 st.set_page_config(initial_sidebar_state="collapsed", layout="wide")
@@ -55,7 +53,6 @@ def load_models():
     model = AutoModelForCausalLM.from_pretrained(
         model_path, torch_dtype=torch.float32, device_map="auto", trust_remote_code=True, token=HUGGINGFACEHUB_API_TOKEN
     )
-
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, token=HUGGINGFACEHUB_API_TOKEN)
     tokenizer.padding_size = "right"
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
@@ -117,6 +114,7 @@ def get_top_doc(results, qcnt):
     for r in results:
         try:
             video_name = r.metadata["video"]
+            # timestamp = r.metadata["start of interval in sec"]
             if video_name not in hit_score.keys():
                 hit_score[video_name] = 0
             hit_score[video_name] += 1
@@ -131,7 +129,7 @@
     return {"video": list(x)[qcnt]}
 
 
-def play_video(x):
+def play_video(x, offset):
     if x is not None:
         video_file = x.replace(".pt", "")
         path = video_dir + video_file
@@ -139,7 +137,7 @@ def play_video(x, offset):
         video_file = open(path, "rb")
         video_bytes = video_file.read()
 
-        st.video(video_bytes, start_time=0)
+        st.video(video_bytes, start_time=offset)
 
 
 if "llm" not in st.session_state.keys():
@@ -182,8 +180,11 @@ def RAG(prompt):
 
     if top_doc == None:
-        return None, None
+        return None, None, None
     video_name = top_doc["video"]
+    timestamp = top_doc.get("start of interval in sec", 0)  # falls back to 0 until get_top_doc exposes the interval start
+    print("Video from top doc: ", video_name)
+    print("Timestamp for playback: ", timestamp)
 
-    return video_name, top_doc
+    return video_name, timestamp, top_doc
 
 
 def get_description(vn):
@@ -225,14 +226,16 @@ def handle_message():
             else:
                 st.session_state["qcnt"] = 0
                 st.session_state["prevprompt"] = prompt
-            video_name, top_doc = RAG(prompt)
+            video_name, start_time, top_doc = RAG(prompt)
             if video_name == None:
                 full_response = "No more relevant videos found. Select a different query. \n\n"
                 placeholder.markdown(full_response)
                 end = time.time()
             else:
                 with col2:
-                    play_video(video_name)
+                    # The top document's metadata carries the global start time (in seconds) of the embedded interval; use it as the playback offset.
+                    play_video(video_name, start_time)
 
                 scene_des = get_description(video_name)
                 formatted_prompt = ph.get_formatted_prompt(scene=scene_des, prompt=prompt, history=get_history())
diff --git a/VideoRAGQnA/video_ingest/ingest.py b/VideoRAGQnA/video_ingest/ingest.py
new file mode 100755
index 000000000..4184555d9
--- /dev/null
+++ b/VideoRAGQnA/video_ingest/ingest.py
@@ -0,0 +1,300 @@
+
+# This script reads a long retail video and, for every chunk_duration seconds, embeds a clip_duration-second interval, storing the embeddings together with their metadata.
+
+# from VideoRAGQnA.utils import config_reader as reader
+import sys
+import os
+
+# Add the parent directory of the current script to the Python path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+VECTORDB_SERVICE_HOST_IP = os.getenv("VECTORDB_SERVICE_HOST_IP", "0.0.0.0")
+
+import yaml
+import chromadb
+import json
+import argparse
+from langchain_experimental.open_clip import OpenCLIPEmbeddings
+from utils import config_reader as reader
+from embedding.vector_stores import db
+import cv2
+import random
+import datetime
+from tzlocal import get_localzone
+
+
+def read_json(path):
+    with open(path) as f:
+        x = json.load(f)
+    return x
+
+# EMBEDDING MODEL
+clip_embd = OpenCLIPEmbeddings(model_name="ViT-g-14", checkpoint="laion2b_s34b_b88k")
+
+# Extract frames from the [start_time, end_time] interval of the video and build per-frame metadata.
+def extract_frames(video_path, start_time, end_time, interval_count, date_time, local_timezone, meta_output_dir, image_output_dir, N=100, selected_db='chroma'):
+    # video = video_path.split('/')[-1]
+
+    video = os.path.basename(video_path)
+    video, _ = os.path.splitext(video)
+    # Create a directory to store frames and metadata
+    image_output_dir = os.path.join(image_output_dir, f'{video}', 'interval_' + f'{interval_count}')
+    print(image_output_dir)
+    os.makedirs(image_output_dir, exist_ok=True)
+    os.makedirs(meta_output_dir, exist_ok=True)
+
+    # Open the video file
+    cap = cv2.VideoCapture(video_path)
+
+    if int(cv2.__version__.split('.')[0]) < 3:
+        fps = cap.get(cv2.cv.CV_CAP_PROP_FPS)
+    else:
+        fps = cap.get(cv2.CAP_PROP_FPS)
+
+    start_frame = int(start_time * fps)
+    end_frame = int(end_time * fps)
+    interval_frames = end_frame - start_frame
+
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    #print(f'fps {fps}')
+    #print(f'total frames {total_frames}')
+
+    mod = int(fps // N)
+    if mod == 0: mod = 1
+
+    print(f'total frames in interval {interval_frames}, N {N}, mod {mod}')
+
+    # Metadata dictionary to store timestamp and image paths
+    metadata = {}
+
+    # Move video to start time
+    cap.set(cv2.CAP_PROP_POS_MSEC, start_time * 1000)
+
+    # Variables to track frame count and desired frames
+    frame_count = 0
+
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        pos_msec = cap.get(cv2.CAP_PROP_POS_MSEC)
+
+        if pos_msec >= end_time * 1000:
+            break
+
+        frame_count += 1
+
+        # image_output_dir = image_output_dir + f'{video} + f{interval_counter}'
+        # print('frame_path: ', os.path.join(image_output_dir, f"{video}_{frame_count}.jpg"))
+
+        if frame_count % mod == 0:
+            timestamp = start_time
+            frame_path = os.path.join(image_output_dir, f"{video}_{frame_count}.jpg")
+            time = date_time.strftime("%H:%M:%S")
+            date = date_time.strftime("%Y-%m-%d")
+            hours, minutes, seconds = map(float, time.split(":"))
+            year, month, day = map(int, date.split("-"))
+
+            cv2.imwrite(frame_path, frame)  # Save the frame as an image
+
+            metadata[frame_count] = {"timestamp": timestamp, "frame_path": frame_path, "date": date, "year": year, "month": month, "day": day,
+                                     "time": time, "hours": hours, "minutes": minutes, "seconds": seconds, "video": video_path}
+            if selected_db == 'vdms':
+                # Localize the current time to the local timezone of the machine
+                current_time_local = date_time.replace(tzinfo=datetime.timezone.utc).astimezone(local_timezone)
+
+                # Convert the localized time to ISO 8601 format with timezone offset
+                iso_date_time = current_time_local.isoformat()
+                metadata[frame_count]['date_time'] = {"_date": str(iso_date_time)}
+
+    # Save metadata to a JSON file
+    # metadata_file = os.path.join(meta_output_dir, f"{video}_metadata.json")
+    # with open(metadata_file, "w") as f:
+    #     json.dump(metadata, f, indent=4)
+
+    # Release the video capture and close all windows
+    cap.release()
+    print(f"{frame_count // mod} frames extracted and metadata saved successfully.")
+    return fps, interval_frames, metadata
+
+# Store the frames of one interval (with their metadata) directly into the vector DB. Embeddings are generated from the saved frame images, one interval at a time, without writing intermediate videos.
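+# Ingestion flow, for reference: calculate_intervals() (defined below) picks one clip per chunk,
+# extract_frames() samples frames from that clip and builds per-frame metadata, and
+# store_into_vectordb() pushes the frame images plus metadata into the vector store.
+# e.g. with the default config values (chunk_duration=30, clip_duration=10) and a 30 fps video,
+# the embedded intervals are 0.0-10.0 s, 30.0-40.0 s, 60.0-70.0 s, and so on.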
+def store_into_vectordb(metadata_dict, selected_db):
+    global_frame_counter = 0
+
+    image_name_list = []
+    embedding_list = []
+    metadata_list = []
+    ids = []
+
+    # process frames
+    for frame_id, frame_details in metadata_dict.items():
+        global_frame_counter += 1
+        if selected_db == 'vdms':
+            meta_data = {
+                'start of interval in sec': frame_details['timestamp'],
+                'frame_path': frame_details['frame_path'],
+                'video': frame_details['video'],
+                # 'embedding_path': curr_data['embedding_path'],
+                'date_time': frame_details['date_time'],  # {"_date": frame_details['date_time']},
+                'date': frame_details['date'],
+                'year': frame_details['year'],
+                'month': frame_details['month'],
+                'day': frame_details['day'],
+                'time': frame_details['time'],
+                'hours': frame_details['hours'],
+                'minutes': frame_details['minutes'],
+                'seconds': frame_details['seconds'],
+            }
+        if selected_db == 'chroma':
+            meta_data = {
+                'start of interval in sec': frame_details['timestamp'],
+                'frame_path': frame_details['frame_path'],
+                'video': frame_details['video'],
+                # 'embedding_path': curr_data['embedding_path'],
+                'date': frame_details['date'],
+                'year': frame_details['year'],
+                'month': frame_details['month'],
+                'day': frame_details['day'],
+                'time': frame_details['time'],
+                'hours': frame_details['hours'],
+                'minutes': frame_details['minutes'],
+                'seconds': frame_details['seconds'],
+            }
+        image_path = frame_details['frame_path']
+        image_name_list.append(image_path)
+
+        metadata_list.append(meta_data)
+        ids.append(str(global_frame_counter))
+        # print('datetime', meta_data['date_time'])
+    # generate clip embeddings
+    # embedding_list.extend(clip_embd.embed_image(image_name_list))
+
+    vs.add_images(
+        uris=image_name_list,
+        metadatas=metadata_list
+    )
+
+    print("✅ Finished creating embeddings for interval")
+
+    # print(f'✅ {_+1}/{total_videos} video {video}, len {len(image_name_list)}, {len(metadata_list)}, {len(embedding_list)}')
+
+def calculate_intervals(video_path, chunk_duration, clip_duration):
+    cap = cv2.VideoCapture(video_path)
+
+    if not cap.isOpened():
+        print("Error: Could not open video.")
+        return []
+
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    total_seconds = total_frames / fps
+
+    intervals = []
+
+    chunk_frames = int(chunk_duration * fps)
+    clip_frames = int(clip_duration * fps)
+
+    for start_frame in range(0, total_frames, chunk_frames):
+        end_frame = min(start_frame + clip_frames, total_frames)
+        start_time = start_frame / fps
+        end_time = end_frame / fps
+        intervals.append((start_frame, end_frame, start_time, end_time))
+
+    cap.release()
+    return intervals
+
+global interval_counter
+interval_counter = 0
+def process_video(video_path, selected_db, chunk_duration, clip_duration):
+    interval_count = 0
+    intervals = calculate_intervals(video_path, chunk_duration, clip_duration)
+
+    for interval in intervals:
+        start_frame, end_frame, start_time, end_time = interval
+
+        date_time = datetime.datetime.now()
+
+        local_timezone = get_localzone()
+        # With this interval, extract frames to create metadata
+        fps, interval_frames, metadata_dict = extract_frames(video_path, start_time, end_time, interval_count, date_time, local_timezone, meta_output_dir, image_output_dir, N=100, selected_db=selected_db)
+
+        video = os.path.basename(video_path)
+        video, _ = os.path.splitext(video)
+
+        first_frame_id, first_frame_details = list(metadata_dict.items())[0]
+        frames_path = os.path.dirname(first_frame_details['frame_path'])
+
+        metadata = {}
+        global_metadata_file = os.path.join(meta_output_dir, 'metadata.json')
+        metadata[video + '_interval_' + f"{interval_count}"] = {
+            "start datetime":
+                {
+                    'start of interval in sec': first_frame_details['timestamp'],
+                    'date': first_frame_details['date'],
+                    'year': first_frame_details['year'],
+                    'month': first_frame_details['month'],
+                    'day': first_frame_details['day'],
+                    'time': first_frame_details['time'],
+                    'hours': first_frame_details['hours'],
+                    'minutes': first_frame_details['minutes'],
+                    'seconds': first_frame_details['seconds'],
+                },
+            "fps": fps,
+            "total_frames": interval_frames,
+            "embedding_path": f"embeddings/{video}.pt",
+            "video_path": f"{video_path}",
+            "frames_path": frames_path
+        }
+
+        # Append this interval's metadata (note: the file ends up holding one JSON object per interval, not a single JSON document)
+        with open(global_metadata_file, "a") as f:
+            json.dump(metadata, f, indent=4)
+        print("DICTIONARY USED FOR EMBEDDING: ", metadata_dict)
+        store_into_vectordb(metadata_dict, selected_db=selected_db)
+        interval_count += 1
+
+if __name__ == "__main__":
+    print("Reading config file")
+
+    # Create argument parser
+    parser = argparse.ArgumentParser(description="Process configuration file for generating and storing embeddings.")
+
+    # Add argument for configuration file
+    parser.add_argument("config_file", type=str, help="Path to configuration file (e.g., config.yaml)")
+
+    # Parse command-line arguments
+    args = parser.parse_args()
+
+    # Read configuration file
+    config = reader.read_config(args.config_file)
+
+    print("Config file data \n", yaml.dump(config, default_flow_style=False, sort_keys=False))
+
+    generate_frames = config["generate_frames"]
+    embed_frames = config["embed_frames"]
+    path = config["videos"]  # args.videos_folder #
+    image_output_dir = config["image_output_dir"]
+    meta_output_dir = config["meta_output_dir"]
+    N = config["number_of_frames_per_second"]
+    chunk_duration = config["chunk_duration"]
+    clip_duration = config["clip_duration"]
+
+    host = VECTORDB_SERVICE_HOST_IP
+    port = int(config["vector_db"]["port"])
+    selected_db = config["vector_db"]["choice_of_db"]
+
+    # Creating DB
+    print(
+        "Creating DB with text and image embedding support, \nIt may take a few minutes to download and load all required models if you are running for the first time."
+    )
+    print("Connect to {} at {}:{}".format(selected_db, host, port))
+
+    vs = db.VS(host, port, selected_db)
+
+    videos = [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".mp4")]
+    for video in videos:
+        process_video(video, selected_db, chunk_duration, clip_duration)
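Usage sketch (not part of the diff above): each stored frame now carries a "start of interval in sec" metadata field, and play_video() in video-rag-ui.py accepts an offset, so the UI can jump straight to the retrieved interval. The helper below is a minimal, hypothetical example of wiring the two together; `results` stands for whatever list of documents (objects with a .metadata dict) the retriever in embedding/vector_stores/db.py returns.

    def top_video_and_offset(results):
        # Count hits per video and remember the interval start of the first hit for each one.
        hits, first_offset = {}, {}
        for r in results:
            video = r.metadata["video"]
            hits[video] = hits.get(video, 0) + 1
            first_offset.setdefault(video, r.metadata.get("start of interval in sec", 0))
        if not hits:
            return None, 0
        best = max(hits, key=hits.get)  # most frequently retrieved video
        return best, first_offset[best]

    # video_name, offset = top_video_and_offset(results)
    # play_video(video_name, offset)  # internally calls st.video(video_bytes, start_time=offset)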