diff --git a/poetry.lock b/poetry.lock index 8466ddbf0..dfdafd2fd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "accelerate" @@ -1273,13 +1273,13 @@ grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] [[package]] name = "gradio" -version = "4.10.0" +version = "4.19.0" description = "Python library for easily interacting with trained machine learning models" optional = false python-versions = ">=3.8" files = [ - {file = "gradio-4.10.0-py3-none-any.whl", hash = "sha256:7595185716aff430381d010087d6ebc4eadef06fefc3dc1cfa76edcdd2c109db"}, - {file = "gradio-4.10.0.tar.gz", hash = "sha256:d4ca039aa7f5c2783b2bbf7b465153c80bb4257edcca4d8b9c59ce6f61a75b97"}, + {file = "gradio-4.19.0-py3-none-any.whl", hash = "sha256:d09732190acc0f33b5e7ea3235d267472bf74beeea62dabb7a82f93193155e09"}, + {file = "gradio-4.19.0.tar.gz", hash = "sha256:e77e3ce8a4113865abd1dcf92cc9426d9da4896e0a6fd2824a0c90ec751dd442"}, ] [package.dependencies] @@ -1287,7 +1287,7 @@ aiofiles = ">=22.0,<24.0" altair = ">=4.2.0,<6.0" fastapi = "*" ffmpy = "*" -gradio-client = "0.7.3" +gradio-client = "0.10.0" httpx = "*" huggingface-hub = ">=0.19.3" importlib-resources = ">=1.3,<7.0" @@ -1303,6 +1303,7 @@ pydantic = ">=2.0" pydub = "*" python-multipart = "*" pyyaml = ">=5.0,<7.0" +ruff = ">=0.1.7" semantic-version = ">=2.0,<3.0" tomlkit = "0.12.0" typer = {version = ">=0.9,<1.0", extras = ["all"]} @@ -1314,13 +1315,13 @@ oauth = ["authlib", "itsdangerous"] [[package]] name = "gradio-client" -version = "0.7.3" +version = "0.10.0" description = "Python library for easily interacting with trained machine learning models" optional = false python-versions = ">=3.8" files = [ - {file = "gradio_client-0.7.3-py3-none-any.whl", hash = "sha256:b91073770470ceb9f284977064c35bc0cffaf868eb887bf352db77aa01fe342a"}, - {file = "gradio_client-0.7.3.tar.gz", hash = "sha256:8146a1d19a125b38088dd201ddacd0008ea47ef9b0504d1c5b87ca09a43f4dcd"}, + {file = "gradio_client-0.10.0-py3-none-any.whl", hash = "sha256:2bcfe61710f9f1c8f336fa9ff0f5c5f0ea52079233196cd753ad30cccdfd585c"}, + {file = "gradio_client-0.10.0.tar.gz", hash = "sha256:feaee70f18363d76f81a7d25fc3456f40ed5f92417e642c8f1bf86dc65e3a981"}, ] [package.dependencies] @@ -6111,4 +6112,4 @@ chroma = ["chromadb"] [metadata] lock-version = "2.0" python-versions = ">=3.11,<3.12" -content-hash = "c2bcf29b5c894a0fae9682145cd001dfb57bb4919c9097b5e27323ddee58fc8c" +content-hash = "121bf7797b74c02efaf11712e178c9c01880b79701eeff6485ede9ca8b25d307" diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py index ed65c203a..8e0229498 100644 --- a/private_gpt/settings/settings.py +++ b/private_gpt/settings/settings.py @@ -189,6 +189,12 @@ class UISettings(BaseModel): default_query_system_prompt: str = Field( None, description="The default system prompt to use for the query mode." ) + delete_file_button_enabled: bool = Field( + True, description="If the button to delete a file is enabled or not." + ) + delete_all_files_button_enabled: bool = Field( + False, description="If the button to delete all files is enabled or not." + ) class QdrantSettings(BaseModel): diff --git a/private_gpt/ui/ui.py b/private_gpt/ui/ui.py index c7b538a3e..a4b131fe8 100644 --- a/private_gpt/ui/ui.py +++ b/private_gpt/ui/ui.py @@ -15,6 +15,7 @@ from private_gpt.constants import PROJECT_ROOT_PATH from private_gpt.di import global_injector +from private_gpt.open_ai.extensions.context_filter import ContextFilter from private_gpt.server.chat.chat_service import ChatService, CompletionGen from private_gpt.server.chunks.chunks_service import Chunk, ChunksService from private_gpt.server.ingest.ingest_service import IngestService @@ -31,7 +32,7 @@ SOURCES_SEPARATOR = "\n\n Sources: \n" -MODES = ["Query Docs", "Search in Docs", "LLM Chat"] +MODES = ["Query Files", "Search Files", "LLM Chat (no context from files)"] class Source(BaseModel): @@ -74,6 +75,8 @@ def __init__( # Cache the UI blocks self._ui_block = None + self._selected_filename = None + # Initialize system prompt based on default mode self.mode = MODES[0] self._system_prompt = self._get_default_system_prompt(self.mode) @@ -132,20 +135,34 @@ def build_history() -> list[ChatMessage]: ), ) match mode: - case "Query Docs": + case "Query Files": + + # Use only the selected file for the query + context_filter = None + if self._selected_filename is not None: + docs_ids = [] + for ingested_document in self._ingest_service.list_ingested(): + if ( + ingested_document.doc_metadata["file_name"] + == self._selected_filename + ): + docs_ids.append(ingested_document.doc_id) + context_filter = ContextFilter(docs_ids=docs_ids) + query_stream = self._chat_service.stream_chat( messages=all_messages, use_context=True, + context_filter=context_filter, ) yield from yield_deltas(query_stream) - case "LLM Chat": + case "LLM Chat (no context from files)": llm_stream = self._chat_service.stream_chat( messages=all_messages, use_context=False, ) yield from yield_deltas(llm_stream) - case "Search in Docs": + case "Search Files": response = self._chunks_service.retrieve_relevant( text=message, limit=4, prev_next_chunks=0 ) @@ -166,10 +183,10 @@ def _get_default_system_prompt(mode: str) -> str: p = "" match mode: # For query chat mode, obtain default system prompt from settings - case "Query Docs": + case "Query Files": p = settings().ui.default_query_system_prompt # For chat mode, obtain default system prompt from settings - case "LLM Chat": + case "LLM Chat (no context from files)": p = settings().ui.default_chat_system_prompt # For any other mode, clear the system prompt case _: @@ -205,8 +222,71 @@ def _list_ingested_files(self) -> list[list[str]]: def _upload_file(self, files: list[str]) -> None: logger.debug("Loading count=%s files", len(files)) paths = [Path(file) for file in files] + + # remove all existing Documents with name identical to a new file upload: + file_names = [path.name for path in paths] + doc_ids_to_delete = [] + for ingested_document in self._ingest_service.list_ingested(): + if ( + ingested_document.doc_metadata + and ingested_document.doc_metadata["file_name"] in file_names + ): + doc_ids_to_delete.append(ingested_document.doc_id) + if len(doc_ids_to_delete) > 0: + logger.info( + "Uploading file(s) which were already ingested: %s document(s) will be replaced.", + len(doc_ids_to_delete), + ) + for doc_id in doc_ids_to_delete: + self._ingest_service.delete(doc_id) + self._ingest_service.bulk_ingest([(str(path.name), path) for path in paths]) + def _delete_all_files(self) -> Any: + ingested_files = self._ingest_service.list_ingested() + logger.debug("Deleting count=%s files", len(ingested_files)) + for ingested_document in ingested_files: + self._ingest_service.delete(ingested_document.doc_id) + return [ + gr.List(self._list_ingested_files()), + gr.components.Button(interactive=False), + gr.components.Button(interactive=False), + gr.components.Textbox("All files"), + ] + + def _delete_selected_file(self) -> Any: + logger.debug("Deleting selected %s", self._selected_filename) + # Note: keep looping for pdf's (each page became a Document) + for ingested_document in self._ingest_service.list_ingested(): + if ( + ingested_document.doc_metadata + and ingested_document.doc_metadata["file_name"] + == self._selected_filename + ): + self._ingest_service.delete(ingested_document.doc_id) + return [ + gr.List(self._list_ingested_files()), + gr.components.Button(interactive=False), + gr.components.Button(interactive=False), + gr.components.Textbox("All files"), + ] + + def _deselect_selected_file(self) -> Any: + self._selected_filename = None + return [ + gr.components.Button(interactive=False), + gr.components.Button(interactive=False), + gr.components.Textbox("All files"), + ] + + def _selected_a_file(self, select_data: gr.SelectData) -> Any: + self._selected_filename = select_data.value + return [ + gr.components.Button(interactive=True), + gr.components.Button(interactive=True), + gr.components.Textbox(self._selected_filename), + ] + def _build_ui_blocks(self) -> gr.Blocks: logger.debug("Creating the UI blocks") with gr.Blocks( @@ -235,7 +315,7 @@ def _build_ui_blocks(self) -> gr.Blocks: mode = gr.Radio( MODES, label="Mode", - value="Query Docs", + value="Query Files", ) upload_button = gr.components.UploadButton( "Upload File(s)", @@ -247,6 +327,7 @@ def _build_ui_blocks(self) -> gr.Blocks: self._list_ingested_files, headers=["File name"], label="Ingested Files", + height=235, interactive=False, render=False, # Rendered under the button ) @@ -260,6 +341,57 @@ def _build_ui_blocks(self) -> gr.Blocks: outputs=ingested_dataset, ) ingested_dataset.render() + deselect_file_button = gr.components.Button( + "De-select selected file", size="sm", interactive=False + ) + selected_text = gr.components.Textbox( + "All files", label="Selected for Query or Deletion", max_lines=1 + ) + delete_file_button = gr.components.Button( + "🗑️ Delete selected file", + size="sm", + visible=settings().ui.delete_file_button_enabled, + interactive=False, + ) + delete_files_button = gr.components.Button( + "⚠️ Delete ALL files", + size="sm", + visible=settings().ui.delete_all_files_button_enabled, + ) + deselect_file_button.click( + self._deselect_selected_file, + outputs=[ + delete_file_button, + deselect_file_button, + selected_text, + ], + ) + ingested_dataset.select( + fn=self._selected_a_file, + outputs=[ + delete_file_button, + deselect_file_button, + selected_text, + ], + ) + delete_file_button.click( + self._delete_selected_file, + outputs=[ + ingested_dataset, + delete_file_button, + deselect_file_button, + selected_text, + ], + ) + delete_files_button.click( + self._delete_all_files, + outputs=[ + ingested_dataset, + delete_file_button, + deselect_file_button, + selected_text, + ], + ) system_prompt_input = gr.Textbox( placeholder=self._system_prompt, label="System Prompt", diff --git a/pyproject.toml b/pyproject.toml index e75a7cb9a..97db9986a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ types-pyyaml = "^6.0.12.12" [tool.poetry.group.ui] optional = true [tool.poetry.group.ui.dependencies] -gradio = "^4.4.1" +gradio = "^4.19.0" [tool.poetry.group.local] optional = true diff --git a/scripts/ingest_folder.py b/scripts/ingest_folder.py index 8c6acad1c..ccda87cc5 100755 --- a/scripts/ingest_folder.py +++ b/scripts/ingest_folder.py @@ -18,10 +18,11 @@ def __init__(self, ingest_service: IngestService) -> None: self.total_documents = 0 self.current_document_count = 0 - self._files_under_root_folder: list[Path] = list() + self._files_under_root_folder: list[Path] = [] def _find_all_files_in_folder(self, root_path: Path, ignored: list[str]) -> None: """Search all files under the root folder recursively. + Count them at the same time """ for file_path in root_path.iterdir(): diff --git a/settings.yaml b/settings.yaml index 0ffbfcaef..632c12ce2 100644 --- a/settings.yaml +++ b/settings.yaml @@ -31,6 +31,9 @@ ui: You can only answer questions about the provided context. If you know the answer but it is not based in the provided context, don't provide the answer, just state the answer is not in the context provided. + delete_file_button_enabled: true + delete_all_files_button_enabled: true + llm: mode: local