From 04731935cfb808deb5fc0385bab8c466b37f1307 Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Thu, 12 Sep 2024 21:07:40 +0800 Subject: [PATCH 01/31] add graphrag for neo4j Signed-off-by: XuhuiRen --- comps/dataprep/neo4j/langchain/Dockerfile | 38 ++++ comps/dataprep/neo4j/langchain/README.md | 115 +++++++++++ comps/dataprep/neo4j/langchain/__init__.py | 2 + comps/dataprep/neo4j/langchain/config.py | 15 ++ .../docker-compose-dataprep-neo4j.yaml | 48 +++++ .../neo4j/langchain/prepare_doc_neo4j.py | 195 ++++++++++++++++++ .../dataprep/neo4j/langchain/requirements.txt | 31 +++ comps/retrievers/neo4j/langchain/Dockerfile | 31 +++ comps/retrievers/neo4j/langchain/README.md | 111 ++++++++++ comps/retrievers/neo4j/langchain/__init__.py | 2 + comps/retrievers/neo4j/langchain/config.py | 16 ++ .../neo4j/langchain/requirements.txt | 22 ++ .../neo4j/langchain/retriever_neo4j.py | 94 +++++++++ 13 files changed, 720 insertions(+) create mode 100644 comps/dataprep/neo4j/langchain/Dockerfile create mode 100644 comps/dataprep/neo4j/langchain/README.md create mode 100644 comps/dataprep/neo4j/langchain/__init__.py create mode 100644 comps/dataprep/neo4j/langchain/config.py create mode 100644 comps/dataprep/neo4j/langchain/docker-compose-dataprep-neo4j.yaml create mode 100644 comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py create mode 100644 comps/dataprep/neo4j/langchain/requirements.txt create mode 100644 comps/retrievers/neo4j/langchain/Dockerfile create mode 100644 comps/retrievers/neo4j/langchain/README.md create mode 100644 comps/retrievers/neo4j/langchain/__init__.py create mode 100644 comps/retrievers/neo4j/langchain/config.py create mode 100644 comps/retrievers/neo4j/langchain/requirements.txt create mode 100644 comps/retrievers/neo4j/langchain/retriever_neo4j.py diff --git a/comps/dataprep/neo4j/langchain/Dockerfile b/comps/dataprep/neo4j/langchain/Dockerfile new file mode 100644 index 000000000..5c1884359 --- /dev/null +++ b/comps/dataprep/neo4j/langchain/Dockerfile @@ 
-0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + default-jre \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/neo4j/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +USER root + +RUN mkdir -p /home/user/comps/dataprep/qdrant/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/neo4j/langchain/uploaded_files + +USER user + +WORKDIR /home/user/comps/dataprep/neo4j/langchain + +ENTRYPOINT ["python", "prepare_doc_neo4j.py"] diff --git a/comps/dataprep/neo4j/langchain/README.md b/comps/dataprep/neo4j/langchain/README.md new file mode 100644 index 000000000..df6c0547f --- /dev/null +++ b/comps/dataprep/neo4j/langchain/README.md @@ -0,0 +1,115 @@ +# Dataprep Microservice with Neo4J + +## 🚀Start Microservice with Python + +### Install Requirements + +```bash +pip install -r requirements.txt +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y +``` + +### Start Neo4J Server + +To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. 
+```bash +docker run \ + -p 7474:7474 -p 7687:7687 \ + -v $PWD/data:/data -v $PWD/plugins:/plugins \ + --name neo4j-apoc \ + -d \ + -e NEO4J_AUTH=neo4j/password \ + -e NEO4J_PLUGINS=\[\"apoc\"\] \ + neo4j:latest +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export NEO4J_URI=${your_neo4j_url} +export NEO4J_USERNAME=${your_neo4j_username} +export NEO4J_PASSWORD=${your_neo4j_password} +export PYTHONPATH=${path_to_comps} +``` + +### Start Document Preparation Microservice for Neo4J with Python Script + +Start document preparation microservice for Neo4J with below command. + +```bash +python prepare_doc_neo4j.py +``` + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ../../../../ +docker build -t opea/dataprep-neo4j:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/neo4j/langchain/Dockerfile . +``` + +### Run Docker with CLI + +```bash +docker run -d --name="dataprep-neo4j-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-neo4j:latest +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export NEO4J_URI=${your_neo4j_url} +export NEO4J_USERNAME=${your_neo4j_username} +export NEO4J_PASSWORD=${your_neo4j_password} +``` + +### Run Docker with Docker Compose + +```bash +cd comps/dataprep/neo4j/langchain +docker compose -f docker-compose-dataprep-neo4j.yaml up -d +``` + +## Invoke Microservice + +Once document preparation microservice for Neo4J is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. 
+ +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.txt" \ + http://localhost:6007/v1/dataprep +``` + +You can specify chunk_size and chunk_overlap by the following commands. + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.txt" \ + -F "chunk_size=1500" \ + -F "chunk_overlap=100" \ + http://localhost:6007/v1/dataprep +``` + +We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". + +Note: If you specify "table_strategy=llm", you should first start the TGI Service (please refer to 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md), and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. + +To ensure the quality and comprehensiveness of the extracted entities, we recommend using `gpt-4o` as the default model for parsing the document. To enable the OpenAI service, please `export OPENAI_API_KEY=xxxx` before using this service. 
+ +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./your_file.pdf" \ + -F "process_table=true" \ + -F "table_strategy=hq" \ + http://localhost:6007/v1/dataprep +``` diff --git a/comps/dataprep/neo4j/langchain/__init__.py b/comps/dataprep/neo4j/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/neo4j/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/neo4j/langchain/config.py b/comps/dataprep/neo4j/langchain/config.py new file mode 100644 index 000000000..07bb9b27b --- /dev/null +++ b/comps/dataprep/neo4j/langchain/config.py @@ -0,0 +1,15 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Neo4J configuration +NEO4J_URL = os.getenv("NEO4J_URI", "bolt://localhost:7687") +NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j") +NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "test") + +# LLM/Embedding endpoints +TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") +TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081") +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT") +OPENAI_KEY = os.getenv('OPENAI_API_KEY') \ No newline at end of file diff --git a/comps/dataprep/neo4j/langchain/docker-compose-dataprep-neo4j.yaml b/comps/dataprep/neo4j/langchain/docker-compose-dataprep-neo4j.yaml new file mode 100644 index 000000000..d7d210adf --- /dev/null +++ b/comps/dataprep/neo4j/langchain/docker-compose-dataprep-neo4j.yaml @@ -0,0 +1,48 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + neo4j-vector-db: + image: neo4j/neo4j + container_name: neo4j-graph-db + ports: + - "6337:6337" + - "6338:6338" + tgi_gaudi_service: + image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + container_name: tgi-service + ports: + - "8088:80" + volumes: + - "./data:/data" + 
shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HF_TOKEN} + command: --model-id ${LLM_MODEL_ID} --auto-truncate --max-input-tokens 1024 --max-total-tokens 2048 + dataprep-neo4j: + image: opea/gen-ai-comps:dataprep-neo4j-xeon-server + container_name: dataprep-neo4j-server + depends_on: + - neo4j-vector-db + - tgi_gaudi_service + ports: + - "6007:6007" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + NEO4J_URL: ${NEO4J_URL} + NEO4J_USERNAME: ${NEO4J_USERNAME} + NEO4J_PASSWORD: ${NEO4J_PASSWORD} + TGI_LLM_ENDPOINT: ${TEI_ENDPOINT} + OPENAI_KEY: ${OPENAI_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py b/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py new file mode 100644 index 000000000..1e7c15315 --- /dev/null +++ b/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py @@ -0,0 +1,195 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +from typing import List, Optional, Union +import openai +from config import TGI_LLM_ENDPOINT, OPENAI_KEY, Neo4J_URL, NEO4J_USERNAME, NEO4J_PASSWORD +from fastapi import File, Form, HTTPException, UploadFile +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_experimental.graph_transformers import LLMGraphTransformer +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.graphs.graph_document import GraphDocument +from langchain_core.documents import Document +from langchain_text_splitters import HTMLHeaderTextSplitter + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + document_loader, + encode_filename, + get_separators, + get_tables_result, + parse_html, + save_content_to_local_disk, +) +from langchain_community.graphs 
import Neo4jGraph + +logger = CustomLogger("prepare_doc_neo4j") +logflag = os.getenv("LOGFLAG", False) + +upload_folder = "./uploaded_files/" + + +def ingest_data_to_neo4j(doc_path: DocPath): + """Ingest document to Neo4J.""" + path = doc_path.path + if logflag: + logger.info(f"Parsing document {path}.") + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), + ) + + content = document_loader(path) + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks + if logflag: + logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.") + + if OPENAI_KEY: + logger.info("OpenAI API Key is set. 
Verifying its validity...") + openai.api_key = OPENAI_KEY + + try: + response = openai.Engine.list() + logger.info("OpenAI API Key is valid.") + llm = ChatOpenAI(temperature=0, model_name="gpt-4o") + except openai.error.AuthenticationError: + logger.info("OpenAI API Key is invalid.") + except Exception as e: + logger.info(f"An error occurred while verifying the API Key: {e}") + else: + llm = HuggingFaceEndpoint( + endpoint_url=TGI_LLM_ENDPOINT, + max_new_tokens=512, + top_k=40, + top_p=0.9, + temperature=0.8, + timeout=600, + ) + + llm_transformer = LLMGraphTransformer( + llm=llm, + node_properties=["description"], + relationship_properties=["description"] + ) + + doc_list = [Document(page_content=text) for text in chunks] + graph_doc = llm_transformer.convert_to_graph_documents(doc_list) + + graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD) + + graph.add_graph_documents( + graph_documents, + baseEntityLabel=True, + include_source=True + ) + + if logflag: + logger.info("The graph is built.") + + return True + + +@register_microservice( + name="opea_service@prepare_doc_neo4j", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), +): + if logflag: + logger.info(f"files:{files}") + logger.info(f"link_list:{link_list}") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + for file in files: + encode_file = encode_filename(file.filename) + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) + ingest_data_to_neo4j( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + 
table_strategy=table_strategy, + ) + ) + uploaded_files.append(save_path) + if logflag: + logger.info(f"Successfully saved file {save_path}") + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + for link in link_list: + encoded_link = encode_filename(link) + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + try: + await save_content_to_local_disk(save_path, content) + ingest_data_to_neo4j( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + except json.JSONDecodeError: + raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") + + if logflag: + logger.info(f"Successfully saved link {link}") + + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_neo4j"].start() diff --git a/comps/dataprep/neo4j/langchain/requirements.txt b/comps/dataprep/neo4j/langchain/requirements.txt new file mode 100644 index 000000000..3fe0ad588 --- /dev/null +++ b/comps/dataprep/neo4j/langchain/requirements.txt @@ -0,0 +1,31 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +langchain-community +langchain-text-splitters +langchain_huggingface +langchain-experimental +langchain-openai +openai +neo4j +markdown +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +python-docx +python-pptx 
+sentence_transformers +shortuuid +unstructured[all-docs]==0.15.7 +uvicorn + diff --git a/comps/retrievers/neo4j/langchain/Dockerfile b/comps/retrievers/neo4j/langchain/Dockerfile new file mode 100644 index 000000000..00ce186cb --- /dev/null +++ b/comps/retrievers/neo4j/langchain/Dockerfile @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/retrievers/neo4j/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/retrievers/neo4j/langchain + +ENTRYPOINT ["python", "retriever_neo4j.py"] diff --git a/comps/retrievers/neo4j/langchain/README.md b/comps/retrievers/neo4j/langchain/README.md new file mode 100644 index 000000000..64661bec8 --- /dev/null +++ b/comps/retrievers/neo4j/langchain/README.md @@ -0,0 +1,111 @@ +# Retriever Microservice with Neo4J + +## 🚀Start Microservice with Python + +### Install Requirements + +```bash +pip install -r requirements.txt +``` + +### Start Neo4J Server + +To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. 
+```bash +docker run \ + -p 7474:7474 -p 7687:7687 \ + -v $PWD/data:/data -v $PWD/plugins:/plugins \ + --name neo4j-apoc \ + -d \ + -e NEO4J_AUTH=neo4j/password \ + -e NEO4J_PLUGINS=\[\"apoc\"\] \ + neo4j:latest +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export NEO4J_URI=${your_neo4j_url} +export NEO4J_USERNAME=${your_neo4j_username} +export NEO4J_PASSWORD=${your_neo4j_password} +``` + +### Start Retriever Service + +```bash +python retriever_neo4j.py +``` + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ../../../../ +docker build -t opea/retriever-neo4j:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . +``` + +### Run Docker with CLI + +```bash +docker run -d --name="retriever-neo4j-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI=${your_neo4j_host_ip} opea/retriever-neo4j:latest +``` + +## 🚀3. Consume Retriever Service + +### 3.1 Check Service Status + +```bash +curl http://${your_ip}:7000/v1/health_check \ + -X GET \ + -H 'Content-Type: application/json' +``` + +### 3.2 Consume Retriever Service + +To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python. + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://${your_ip}:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \ + -H 'Content-Type: application/json' +``` + +You can set the parameters for the retriever. 
+ +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \ + -H 'Content-Type: application/json' +``` diff --git a/comps/retrievers/neo4j/langchain/__init__.py b/comps/retrievers/neo4j/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/retrievers/neo4j/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/neo4j/langchain/config.py 
b/comps/retrievers/neo4j/langchain/config.py new file mode 100644 index 000000000..0bad9de83 --- /dev/null +++ b/comps/retrievers/neo4j/langchain/config.py @@ -0,0 +1,16 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Neo4J configuration +NEO4J_URL = os.getenv("NEO4J_URI", "bolt://localhost:7687") +NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j") +NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "test") + +# Embedding model +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# Embedding endpoints +EMBED_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") + diff --git a/comps/retrievers/neo4j/langchain/requirements.txt b/comps/retrievers/neo4j/langchain/requirements.txt new file mode 100644 index 000000000..789642a04 --- /dev/null +++ b/comps/retrievers/neo4j/langchain/requirements.txt @@ -0,0 +1,22 @@ +docarray[full] +fastapi +frontend==0.0.3 +huggingface_hub +langchain +langchain-community +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pydantic==2.7.3 +pymupdf==1.24.5 +python-docx==0.8.11 +sentence_transformers +shortuuid +tiktoken +uvicorn +neo4j +tiktoken \ No newline at end of file diff --git a/comps/retrievers/neo4j/langchain/retriever_neo4j.py b/comps/retrievers/neo4j/langchain/retriever_neo4j.py new file mode 100644 index 000000000..e9d9aae26 --- /dev/null +++ b/comps/retrievers/neo4j/langchain/retriever_neo4j.py @@ -0,0 +1,94 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import os +import time +from typing import List, Optional + +from config import ( + EMBED_ENDPOINT, + EMBED_MODEL, + NEO4J_URL, + NEO4J_USERNAME, + NEO4J_PASSWORD, +) +from langchain_community.embeddings import HuggingFaceHubEmbeddings, HuggingFaceBgeEmbeddings +from langchain_community.vectorstores import Neo4jVector + +from comps import ( + CustomLogger, + EmbedDoc, + SearchedDoc, + 
ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) + +logger = CustomLogger("retriever_neo4j") +logflag = os.getenv("LOGFLAG", False) + + +@register_microservice( + name="opea_service@retriever_neo4j", + service_type=ServiceType.RETRIEVER, + endpoint="/v1/retrieval", + host="0.0.0.0", + port=7000, +) +@register_statistics(names=["opea_service@retriever_neo4j"]) +def retrieve(input: EmbedDoc) -> SearchedDoc: + if logflag: + logger.info(input) + + start = time.time() + if input.search_type == "similarity": + search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k) + elif input.search_type == "similarity_distance_threshold": + if input.distance_threshold is None: + raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") + search_res = vector_db.similarity_search_by_vector( + embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold + ) + elif input.search_type == "similarity_score_threshold": + docs_and_similarities = vector_db.similarity_search_with_relevance_scores( + query=input.text, k=input.k, score_threshold=input.score_threshold + ) + search_res = [doc for doc, _ in docs_and_similarities] + elif input.search_type == "mmr": + search_res = vector_db.max_marginal_relevance_search( + query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult + ) + searched_docs = [] + for r in search_res: + searched_docs.append(TextDoc(text=r.page_content)) + result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text) + + statistics_dict["opea_service@retriever_neo4j"].append_latency(time.time() - start, None) + if logflag: + logger.info(result) + return result + + +if __name__ == "__main__": + + if EMBED_ENDPOINT: + # create embeddings using TEI endpoint service + embeddings = HuggingFaceHubEmbeddings(model=EMBED_ENDPOINT) + else: + # create embeddings using local embedding 
model + embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + vector_db = Neo4jVector.from_existing_graph( + embedding=embeddings, + url=NEO4J_URL, + username=NEO4J_USERNAME, + password=NEO4J_PASSWORD, + node_label='__Entity__', + text_node_properties=['id', 'description'], + embedding_node_property='embedding' + ) + opea_microservices["opea_service@retriever_neo4j"].start() From 4ce444e8e89a5115ac26cfac45b6360afd6dcf72 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Sep 2024 13:14:12 +0000 Subject: [PATCH 02/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/neo4j/langchain/README.md | 3 ++- comps/dataprep/neo4j/langchain/config.py | 2 +- .../neo4j/langchain/prepare_doc_neo4j.py | 27 ++++++++----------- .../dataprep/neo4j/langchain/requirements.txt | 8 +++--- comps/retrievers/neo4j/langchain/README.md | 3 ++- comps/retrievers/neo4j/langchain/config.py | 1 - .../neo4j/langchain/requirements.txt | 3 +-- .../neo4j/langchain/retriever_neo4j.py | 24 +++++++---------- 8 files changed, 30 insertions(+), 41 deletions(-) diff --git a/comps/dataprep/neo4j/langchain/README.md b/comps/dataprep/neo4j/langchain/README.md index df6c0547f..31f92548b 100644 --- a/comps/dataprep/neo4j/langchain/README.md +++ b/comps/dataprep/neo4j/langchain/README.md @@ -12,7 +12,8 @@ apt-get install poppler-utils -y ### Start Neo4J Server -To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. +To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. 
+ ```bash docker run \ -p 7474:7474 -p 7687:7687 \ diff --git a/comps/dataprep/neo4j/langchain/config.py b/comps/dataprep/neo4j/langchain/config.py index 07bb9b27b..bb21d57e3 100644 --- a/comps/dataprep/neo4j/langchain/config.py +++ b/comps/dataprep/neo4j/langchain/config.py @@ -12,4 +12,4 @@ TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081") TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT") -OPENAI_KEY = os.getenv('OPENAI_API_KEY') \ No newline at end of file +OPENAI_KEY = os.getenv("OPENAI_API_KEY") diff --git a/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py b/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py index 1e7c15315..915c02de6 100644 --- a/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py +++ b/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py @@ -4,14 +4,16 @@ import json import os from typing import List, Optional, Union + import openai -from config import TGI_LLM_ENDPOINT, OPENAI_KEY, Neo4J_URL, NEO4J_USERNAME, NEO4J_PASSWORD +from config import NEO4J_PASSWORD, NEO4J_USERNAME, OPENAI_KEY, TGI_LLM_ENDPOINT, Neo4J_URL from fastapi import File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_experimental.graph_transformers import LLMGraphTransformer -from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.graphs import Neo4jGraph from langchain_community.graphs.graph_document import GraphDocument +from langchain_community.llms import HuggingFaceEndpoint from langchain_core.documents import Document +from langchain_experimental.graph_transformers import LLMGraphTransformer from langchain_text_splitters import HTMLHeaderTextSplitter from comps import CustomLogger, DocPath, opea_microservices, register_microservice @@ -23,7 +25,6 @@ parse_html, save_content_to_local_disk, ) -from langchain_community.graphs import Neo4jGraph logger = 
CustomLogger("prepare_doc_neo4j") logflag = os.getenv("LOGFLAG", False) @@ -67,11 +68,11 @@ def ingest_data_to_neo4j(doc_path: DocPath): chunks = chunks + table_chunks if logflag: logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.") - + if OPENAI_KEY: logger.info("OpenAI API Key is set. Verifying its validity...") openai.api_key = OPENAI_KEY - + try: response = openai.Engine.list() logger.info("OpenAI API Key is valid.") @@ -91,22 +92,16 @@ def ingest_data_to_neo4j(doc_path: DocPath): ) llm_transformer = LLMGraphTransformer( - llm=llm, - node_properties=["description"], - relationship_properties=["description"] + llm=llm, node_properties=["description"], relationship_properties=["description"] ) - + doc_list = [Document(page_content=text) for text in chunks] graph_doc = llm_transformer.convert_to_graph_documents(doc_list) graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD) - graph.add_graph_documents( - graph_documents, - baseEntityLabel=True, - include_source=True - ) - + graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True) + if logflag: logger.info("The graph is built.") diff --git a/comps/dataprep/neo4j/langchain/requirements.txt b/comps/dataprep/neo4j/langchain/requirements.txt index 3fe0ad588..b8326a623 100644 --- a/comps/dataprep/neo4j/langchain/requirements.txt +++ b/comps/dataprep/neo4j/langchain/requirements.txt @@ -7,14 +7,14 @@ fastapi huggingface_hub langchain langchain-community -langchain-text-splitters -langchain_huggingface langchain-experimental langchain-openai -openai -neo4j +langchain-text-splitters +langchain_huggingface markdown +neo4j numpy +openai opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk diff --git a/comps/retrievers/neo4j/langchain/README.md b/comps/retrievers/neo4j/langchain/README.md index 64661bec8..731abc20f 100644 --- a/comps/retrievers/neo4j/langchain/README.md +++ b/comps/retrievers/neo4j/langchain/README.md @@ 
-10,7 +10,8 @@ pip install -r requirements.txt ### Start Neo4J Server -To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. +To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. + ```bash docker run \ -p 7474:7474 -p 7687:7687 \ diff --git a/comps/retrievers/neo4j/langchain/config.py b/comps/retrievers/neo4j/langchain/config.py index 0bad9de83..39adf6d89 100644 --- a/comps/retrievers/neo4j/langchain/config.py +++ b/comps/retrievers/neo4j/langchain/config.py @@ -13,4 +13,3 @@ # Embedding endpoints EMBED_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") - diff --git a/comps/retrievers/neo4j/langchain/requirements.txt b/comps/retrievers/neo4j/langchain/requirements.txt index 789642a04..55e858b50 100644 --- a/comps/retrievers/neo4j/langchain/requirements.txt +++ b/comps/retrievers/neo4j/langchain/requirements.txt @@ -4,6 +4,7 @@ frontend==0.0.3 huggingface_hub langchain langchain-community +neo4j numpy opentelemetry-api opentelemetry-exporter-otlp @@ -18,5 +19,3 @@ sentence_transformers shortuuid tiktoken uvicorn -neo4j -tiktoken \ No newline at end of file diff --git a/comps/retrievers/neo4j/langchain/retriever_neo4j.py b/comps/retrievers/neo4j/langchain/retriever_neo4j.py index e9d9aae26..a00834100 100644 --- a/comps/retrievers/neo4j/langchain/retriever_neo4j.py +++ b/comps/retrievers/neo4j/langchain/retriever_neo4j.py @@ -6,14 +6,8 @@ import time from typing import List, Optional -from config import ( - EMBED_ENDPOINT, - EMBED_MODEL, - NEO4J_URL, - NEO4J_USERNAME, - NEO4J_PASSWORD, -) -from langchain_community.embeddings import HuggingFaceHubEmbeddings, HuggingFaceBgeEmbeddings +from config import EMBED_ENDPOINT, EMBED_MODEL, NEO4J_PASSWORD, NEO4J_URL, NEO4J_USERNAME +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores import 
Neo4jVector from comps import ( @@ -43,7 +37,7 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: if logflag: logger.info(input) - + start = time.time() if input.search_type == "similarity": search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k) @@ -66,7 +60,7 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: for r in search_res: searched_docs.append(TextDoc(text=r.page_content)) result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text) - + statistics_dict["opea_service@retriever_neo4j"].append_latency(time.time() - start, None) if logflag: logger.info(result) @@ -74,21 +68,21 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: if __name__ == "__main__": - + if EMBED_ENDPOINT: # create embeddings using TEI endpoint service embeddings = HuggingFaceHubEmbeddings(model=EMBED_ENDPOINT) else: # create embeddings using local embedding model embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) - + vector_db = Neo4jVector.from_existing_graph( embedding=embeddings, url=NEO4J_URL, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, - node_label='__Entity__', - text_node_properties=['id', 'description'], - embedding_node_property='embedding' + node_label="__Entity__", + text_node_properties=["id", "description"], + embedding_node_property="embedding", ) opea_microservices["opea_service@retriever_neo4j"].start() From abee3d26f142ff04565a2f75ea927679ec3cc306 Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Thu, 12 Sep 2024 21:19:33 +0800 Subject: [PATCH 03/31] add Signed-off-by: XuhuiRen --- comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py b/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py index 915c02de6..9cb7d4f56 100644 --- a/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py +++ b/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py @@ -6,7 +6,7 @@ from typing import List, Optional, Union import 
openai -from config import NEO4J_PASSWORD, NEO4J_USERNAME, OPENAI_KEY, TGI_LLM_ENDPOINT, Neo4J_URL +from config import NEO4J_PASSWORD, NEO4J_USERNAME, OPENAI_KEY, TGI_LLM_ENDPOINT, NEO4J_URL from fastapi import File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.graphs import Neo4jGraph @@ -15,6 +15,7 @@ from langchain_core.documents import Document from langchain_experimental.graph_transformers import LLMGraphTransformer from langchain_text_splitters import HTMLHeaderTextSplitter +from langchain_openai import ChatOpenAI from comps import CustomLogger, DocPath, opea_microservices, register_microservice from comps.dataprep.utils import ( @@ -98,9 +99,9 @@ def ingest_data_to_neo4j(doc_path: DocPath): doc_list = [Document(page_content=text) for text in chunks] graph_doc = llm_transformer.convert_to_graph_documents(doc_list) - graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD) + graph = Neo4jGraph(url=NEO4J_URL, username=NEO4J_USERNAME, password=NEO4J_PASSWORD) - graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True) + graph.add_graph_documents(graph_doc, baseEntityLabel=True, include_source=True) if logflag: logger.info("The graph is built.") From 333b344d34e86442379892cad3ae04322a182989 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Sep 2024 13:19:57 +0000 Subject: [PATCH 04/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py b/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py index 9cb7d4f56..39d88d055 100644 --- a/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py +++ b/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py @@ 
-6,7 +6,7 @@ from typing import List, Optional, Union import openai -from config import NEO4J_PASSWORD, NEO4J_USERNAME, OPENAI_KEY, TGI_LLM_ENDPOINT, NEO4J_URL +from config import NEO4J_PASSWORD, NEO4J_URL, NEO4J_USERNAME, OPENAI_KEY, TGI_LLM_ENDPOINT from fastapi import File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.graphs import Neo4jGraph @@ -14,8 +14,8 @@ from langchain_community.llms import HuggingFaceEndpoint from langchain_core.documents import Document from langchain_experimental.graph_transformers import LLMGraphTransformer -from langchain_text_splitters import HTMLHeaderTextSplitter from langchain_openai import ChatOpenAI +from langchain_text_splitters import HTMLHeaderTextSplitter from comps import CustomLogger, DocPath, opea_microservices, register_microservice from comps.dataprep.utils import ( From 9c7931628859faceadb0dd8e79fcc5e07e1a6606 Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Thu, 12 Sep 2024 21:46:10 +0800 Subject: [PATCH 05/31] add Signed-off-by: XuhuiRen --- .../test_retrievers_neo4j_langchain.sh | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 tests/retrievers/test_retrievers_neo4j_langchain.sh diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh new file mode 100644 index 000000000..723abadba --- /dev/null +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/retriever-neo4j built fail" + exit 1 + else + echo "opea/retriever-neo4j built successful" + fi +} + +function start_service() { + # neo4j + docker run -d -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest + sleep 60s + + # tei endpoint + tei_endpoint=5434 + model="BAAI/bge-base-en-v1.5" + docker run -d --name="test-comps-retriever-neo4j-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model + sleep 30s + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" + + # Neo4J retriever + export NEO4J_URI="bolt://${ip_address}:7687" + export NEO4J_USERNAME="neo4j" + export NEO4J_PASSWORD="password" + # unset http_proxy + docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI=$NEO4J_URI -e NEO4J_USERNAME=$NEO4J_USERNAME NEO4J_PASSWORD=$NEO4J_PASSWORD opea/retriever-neo4j:comps + + sleep 3m +} + +function validate_microservice() { + retriever_port=5435 + export PATH="${HOME}/miniforge3/bin:$PATH" + source activate + URL="http://${ip_address}:$retriever_port/v1/retrieval" + + test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ retriever ] HTTP status is 200. Checking content..." 
+ local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) + + if echo "$CONTENT" | grep -q "retrieved_docs"; then + echo "[ retriever ] Content is as expected." + else + echo "[ retriever ] Content does not match the expected result: $CONTENT" + docker logs test-comps-retriever-redis-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-redis-tei-endpoint >> ${LOG_PATH}/tei.log + exit 1 + fi + else + echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-retriever-redis-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-redis-tei-endpoint >> ${LOG_PATH}/tei.log + exit 1 + fi +} + +function stop_docker() { + cid_retrievers=$(docker ps -aq --filter "name=test-comps-retriever-neo4j*") + if [[ ! -z "$cid_retrievers" ]]; then + docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s + fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From 97d97f3316b526f49b632bfa1e79d4f514afdf6e Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Thu, 12 Sep 2024 22:05:08 +0800 Subject: [PATCH 06/31] add Signed-off-by: XuhuiRen --- tests/retrievers/test_retrievers_neo4j_langchain.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index 723abadba..b7f9c88ae 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -58,14 +58,14 @@ function validate_microservice() { echo "[ retriever ] Content is as expected." 
else echo "[ retriever ] Content does not match the expected result: $CONTENT" - docker logs test-comps-retriever-redis-server >> ${LOG_PATH}/retriever.log - docker logs test-comps-retriever-redis-tei-endpoint >> ${LOG_PATH}/tei.log + docker logs test-comps-retriever-neo4j-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-neo4j-tei-endpoint >> ${LOG_PATH}/tei.log exit 1 fi else echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-retriever-redis-server >> ${LOG_PATH}/retriever.log - docker logs test-comps-retriever-redis-tei-endpoint >> ${LOG_PATH}/tei.log + docker logs test-comps-retriever-neo4j-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-neo4j-tei-endpoint >> ${LOG_PATH}/tei.log exit 1 fi } From 18711777af89181b2df6e4d7e9f17aae803f1f12 Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Fri, 13 Sep 2024 11:06:37 +0800 Subject: [PATCH 07/31] fix ut Signed-off-by: XuhuiRen --- comps/retrievers/neo4j/langchain/retriever_neo4j.py | 4 ++-- tests/retrievers/test_retrievers_neo4j_langchain.sh | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/comps/retrievers/neo4j/langchain/retriever_neo4j.py b/comps/retrievers/neo4j/langchain/retriever_neo4j.py index a00834100..08850a1a3 100644 --- a/comps/retrievers/neo4j/langchain/retriever_neo4j.py +++ b/comps/retrievers/neo4j/langchain/retriever_neo4j.py @@ -40,12 +40,12 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: start = time.time() if input.search_type == "similarity": - search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, k=input.k) + search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, query=input.text, k=input.k) elif input.search_type == "similarity_distance_threshold": if input.distance_threshold is None: raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") search_res = 
vector_db.similarity_search_by_vector( - embedding=input.embedding, k=input.k, distance_threshold=input.distance_threshold + embedding=input.embedding, query=input.text, k=input.k, distance_threshold=input.distance_threshold ) elif input.search_type == "similarity_score_threshold": docs_and_similarities = vector_db.similarity_search_with_relevance_scores( diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index b7f9c88ae..56d8a3a7e 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -10,6 +10,9 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH + docker run -d -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest + sleep 60s + docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . if [ $? 
-ne 0 ]; then echo "opea/retriever-neo4j built fail" @@ -20,10 +23,6 @@ function build_docker_images() { } function start_service() { - # neo4j - docker run -d -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest - sleep 60s - # tei endpoint tei_endpoint=5434 model="BAAI/bge-base-en-v1.5" @@ -36,7 +35,7 @@ function start_service() { export NEO4J_USERNAME="neo4j" export NEO4J_PASSWORD="password" # unset http_proxy - docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI=$NEO4J_URI -e NEO4J_USERNAME=$NEO4J_USERNAME NEO4J_PASSWORD=$NEO4J_PASSWORD opea/retriever-neo4j:comps + docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/retriever-neo4j:comps sleep 3m } @@ -75,6 +74,10 @@ function stop_docker() { if [[ ! -z "$cid_retrievers" ]]; then docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s fi + cid_db=$(docker ps -aq --filter "name=neo4j-apoc1") + if [[ ! 
-z "$cid_retrievers" ]]; then + docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s + fi } function main() { From 72f94f39e6feee2f9bb4a675b0ed8e8620dffe58 Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Fri, 13 Sep 2024 11:28:53 +0800 Subject: [PATCH 08/31] fix Signed-off-by: XuhuiRen --- comps/retrievers/neo4j/langchain/Dockerfile | 4 ++++ comps/retrievers/neo4j/langchain/retriever_neo4j.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/comps/retrievers/neo4j/langchain/Dockerfile b/comps/retrievers/neo4j/langchain/Dockerfile index 00ce186cb..5d8e8d254 100644 --- a/comps/retrievers/neo4j/langchain/Dockerfile +++ b/comps/retrievers/neo4j/langchain/Dockerfile @@ -5,6 +5,10 @@ FROM python:3.11-slim ENV LANG=C.UTF-8 +ENV no_proxy=localhost,127.0.0.1 + +ENV HUGGINGFACEHUB_API_TOKEN=dummy + ARG ARCH="cpu" RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ diff --git a/comps/retrievers/neo4j/langchain/retriever_neo4j.py b/comps/retrievers/neo4j/langchain/retriever_neo4j.py index 08850a1a3..25ade8625 100644 --- a/comps/retrievers/neo4j/langchain/retriever_neo4j.py +++ b/comps/retrievers/neo4j/langchain/retriever_neo4j.py @@ -71,7 +71,8 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: if EMBED_ENDPOINT: # create embeddings using TEI endpoint service - embeddings = HuggingFaceHubEmbeddings(model=EMBED_ENDPOINT) + hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN") + embeddings = HuggingFaceHubEmbeddings(model=EMBED_ENDPOINT, huggingfacehub_api_token=hf_token) else: # create embeddings using local embedding model embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) From e546eea54f2938e39ba592f2fbeed95cffaf0fa4 Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Fri, 13 Sep 2024 19:10:16 +0800 Subject: [PATCH 09/31] add Signed-off-by: XuhuiRen --- .../neo4j/langchain/retriever_neo4j.py | 34 +++++++++++++++---- .../test_retrievers_neo4j_langchain.sh | 5 +-- 2 files changed, 31 insertions(+), 8 deletions(-) 
diff --git a/comps/retrievers/neo4j/langchain/retriever_neo4j.py b/comps/retrievers/neo4j/langchain/retriever_neo4j.py index 25ade8625..51332b0d3 100644 --- a/comps/retrievers/neo4j/langchain/retriever_neo4j.py +++ b/comps/retrievers/neo4j/langchain/retriever_neo4j.py @@ -34,11 +34,19 @@ port=7000, ) @register_statistics(names=["opea_service@retriever_neo4j"]) -def retrieve(input: EmbedDoc) -> SearchedDoc: +def retrieve( + input: Union[EmbedDoc, RetrievalRequest, ChatCompletionRequest] +) -> Union[SearchedDoc, RetrievalResponse, ChatCompletionRequest]: if logflag: logger.info(input) - start = time.time() + + if isinstance(input, EmbedDoc): + query = input.text + else: + # for RetrievalRequest, ChatCompletionRequest + query = input.input + if input.search_type == "similarity": search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, query=input.text, k=input.k) elif input.search_type == "similarity_distance_threshold": @@ -56,10 +64,24 @@ def retrieve(input: EmbedDoc) -> SearchedDoc: search_res = vector_db.max_marginal_relevance_search( query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult ) - searched_docs = [] - for r in search_res: - searched_docs.append(TextDoc(text=r.page_content)) - result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text) + else: + raise ValueError(f"{input.search_type} not valid") + + # return different response format + retrieved_docs = [] + if isinstance(input, EmbedDoc): + for r in search_res: + retrieved_docs.append(TextDoc(text=r.page_content)) + result = SearchedDoc(retrieved_docs=retrieved_docs, initial_query=input.text) + else: + for r in search_res: + retrieved_docs.append(RetrievalResponseData(text=r.page_content, metadata=r.metadata)) + if isinstance(input, RetrievalRequest): + result = RetrievalResponse(retrieved_docs=retrieved_docs) + elif isinstance(input, ChatCompletionRequest): + input.retrieved_docs = retrieved_docs + input.documents = [doc.text for doc in 
retrieved_docs] + result = input statistics_dict["opea_service@retriever_neo4j"].append_latency(time.time() - start, None) if logflag: diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index 56d8a3a7e..cd3e44ab5 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -11,7 +11,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH docker run -d -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest - sleep 60s + sleep 30s docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . if [ $? -ne 0 ]; then @@ -34,8 +34,9 @@ function start_service() { export NEO4J_URI="bolt://${ip_address}:7687" export NEO4J_USERNAME="neo4j" export NEO4J_PASSWORD="password" + retriever_port=5435 # unset http_proxy - docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/retriever-neo4j:comps + docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI="bolt://${ip_address}:7687" -e NEO4J_USERNAME="neo4j" -e NEO4J_PASSWORD="password" opea/retriever-neo4j:comps sleep 3m } From c6fab2966d3b9c05feb18940006ac1d2fee2caf9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Sep 2024 11:10:37 +0000 Subject: [PATCH 10/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- 
comps/retrievers/neo4j/langchain/retriever_neo4j.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/retrievers/neo4j/langchain/retriever_neo4j.py b/comps/retrievers/neo4j/langchain/retriever_neo4j.py index 51332b0d3..ae8c9068f 100644 --- a/comps/retrievers/neo4j/langchain/retriever_neo4j.py +++ b/comps/retrievers/neo4j/langchain/retriever_neo4j.py @@ -46,7 +46,7 @@ def retrieve( else: # for RetrievalRequest, ChatCompletionRequest query = input.input - + if input.search_type == "similarity": search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, query=input.text, k=input.k) elif input.search_type == "similarity_distance_threshold": From 94897fd1214a2176cf883eb4940bc4440c813a24 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Fri, 13 Sep 2024 19:14:53 +0800 Subject: [PATCH 11/31] Update retriever_neo4j.py --- comps/retrievers/neo4j/langchain/retriever_neo4j.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/comps/retrievers/neo4j/langchain/retriever_neo4j.py b/comps/retrievers/neo4j/langchain/retriever_neo4j.py index ae8c9068f..d109fa22c 100644 --- a/comps/retrievers/neo4j/langchain/retriever_neo4j.py +++ b/comps/retrievers/neo4j/langchain/retriever_neo4j.py @@ -4,7 +4,7 @@ import argparse import os import time -from typing import List, Optional +from typing import Union from config import EMBED_ENDPOINT, EMBED_MODEL, NEO4J_PASSWORD, NEO4J_URL, NEO4J_USERNAME from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings @@ -22,10 +22,16 @@ statistics_dict, ) +from comps.cores.proto.api_protocol import ( + ChatCompletionRequest, + RetrievalRequest, + RetrievalResponse, + RetrievalResponseData, +) + logger = CustomLogger("retriever_neo4j") logflag = os.getenv("LOGFLAG", False) - @register_microservice( name="opea_service@retriever_neo4j", service_type=ServiceType.RETRIEVER, From 7e81a707946e31bef17c841ece9cc54be9d44ed3 
Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Sep 2024 11:15:10 +0000 Subject: [PATCH 12/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/retrievers/neo4j/langchain/retriever_neo4j.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/retrievers/neo4j/langchain/retriever_neo4j.py b/comps/retrievers/neo4j/langchain/retriever_neo4j.py index d109fa22c..47ce4a544 100644 --- a/comps/retrievers/neo4j/langchain/retriever_neo4j.py +++ b/comps/retrievers/neo4j/langchain/retriever_neo4j.py @@ -21,7 +21,6 @@ register_statistics, statistics_dict, ) - from comps.cores.proto.api_protocol import ( ChatCompletionRequest, RetrievalRequest, @@ -32,6 +31,7 @@ logger = CustomLogger("retriever_neo4j") logflag = os.getenv("LOGFLAG", False) + @register_microservice( name="opea_service@retriever_neo4j", service_type=ServiceType.RETRIEVER, From e26e98ab8f65c03600b74583d1ab42f6bce1cd7a Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Fri, 13 Sep 2024 20:28:39 +0800 Subject: [PATCH 13/31] add Signed-off-by: XuhuiRen --- tests/retrievers/test_retrievers_neo4j_langchain.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index cd3e44ab5..27e6d51ea 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -29,14 +29,14 @@ function start_service() { docker run -d --name="test-comps-retriever-neo4j-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model sleep 30s export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" - + export no_proxy="localhost,127.0.0.1,"${ip_address} # Neo4J retriever export NEO4J_URI="bolt://${ip_address}:7687" export 
NEO4J_USERNAME="neo4j" export NEO4J_PASSWORD="password" retriever_port=5435 # unset http_proxy - docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI="bolt://${ip_address}:7687" -e NEO4J_USERNAME="neo4j" -e NEO4J_PASSWORD="password" opea/retriever-neo4j:comps + docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI="bolt://${ip_address}:7687" -e NEO4J_USERNAME="neo4j" -e NEO4J_PASSWORD="password" opea/retriever-neo4j:comps sleep 3m } From 446696c3e570ca001b1a23e96cdad1951c1f77d3 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Fri, 13 Sep 2024 20:47:15 +0800 Subject: [PATCH 14/31] Update test_retrievers_neo4j_langchain.sh --- tests/retrievers/test_retrievers_neo4j_langchain.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index 27e6d51ea..2251c17e9 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH - docker run -d -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest + docker run --name neo4j-apoc1 -p 7474:7474 -p 7687:7687 -d -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest sleep 30s docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . 
From f31e648b3ac4cc724c6bdfeb0ccd91e0b8d9a5ab Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Fri, 13 Sep 2024 20:56:27 +0800 Subject: [PATCH 15/31] add Signed-off-by: XuhuiRen --- tests/retrievers/test_retrievers_neo4j_langchain.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index 2251c17e9..3e4f3948f 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH - docker run --name neo4j-apoc1 -p 7474:7474 -p 7687:7687 -d -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest + docker run -d -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest sleep 30s docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . 
@@ -29,16 +29,17 @@ function start_service() { docker run -d --name="test-comps-retriever-neo4j-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model sleep 30s export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" - export no_proxy="localhost,127.0.0.1,"${ip_address} + # Neo4J retriever export NEO4J_URI="bolt://${ip_address}:7687" export NEO4J_USERNAME="neo4j" export NEO4J_PASSWORD="password" retriever_port=5435 # unset http_proxy + export no_proxy="localhost,127.0.0.1,"${ip_address} docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI="bolt://${ip_address}:7687" -e NEO4J_USERNAME="neo4j" -e NEO4J_PASSWORD="password" opea/retriever-neo4j:comps - sleep 3m + sleep 1m } function validate_microservice() { From 193a9243e90291c229958cd3201e8a6e093e2f09 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Fri, 13 Sep 2024 21:04:00 +0800 Subject: [PATCH 16/31] Update test_retrievers_neo4j_langchain.sh --- tests/retrievers/test_retrievers_neo4j_langchain.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index 3e4f3948f..9c1a6ac62 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH - docker run -d -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/plugins:/plugins --name neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest + docker run -d -p 7474:7474 -p 7687:7687 -v ./data:/data -v ./plugins:/plugins --name neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest sleep 30s 
docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . From 3da1ef4707eae03946130f3ff4675bd3fb4d2511 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Fri, 13 Sep 2024 21:44:26 +0800 Subject: [PATCH 17/31] Update test_retrievers_neo4j_langchain.sh --- tests/retrievers/test_retrievers_neo4j_langchain.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index 9c1a6ac62..35911e27b 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH - docker run -d -p 7474:7474 -p 7687:7687 -v ./data:/data -v ./plugins:/plugins --name neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest + docker run -d -p 7474:7474 -p 7687:7687 -v ./data:/data -v ./plugins:/plugins --name test-neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest sleep 30s docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . @@ -76,7 +76,7 @@ function stop_docker() { if [[ ! -z "$cid_retrievers" ]]; then docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s fi - cid_db=$(docker ps -aq --filter "name=neo4j-apoc1") + cid_db=$(docker ps -aq --filter "name=test-neo4j-apoc1") if [[ ! 
-z "$cid_retrievers" ]]; then docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s fi From df56f2c53701965e408251c413102a3d0af6c9c2 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Fri, 13 Sep 2024 21:53:28 +0800 Subject: [PATCH 18/31] Update test_retrievers_neo4j_langchain.sh --- tests/retrievers/test_retrievers_neo4j_langchain.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index 35911e27b..9855fe75f 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH - docker run -d -p 7474:7474 -p 7687:7687 -v ./data:/data -v ./plugins:/plugins --name test-neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest + docker run -d -p 7474:7474 -p 7687:7687 -v ./data:/data -v ./plugins:/plugins --name test-comps-neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest sleep 30s docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . @@ -76,7 +76,7 @@ function stop_docker() { if [[ ! -z "$cid_retrievers" ]]; then docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s fi - cid_db=$(docker ps -aq --filter "name=test-neo4j-apoc1") + cid_db=$(docker ps -aq --filter "name=test-comps-neo4j-apoc1") if [[ ! 
-z "$cid_retrievers" ]]; then docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s fi From 71577d0eb1490cb9666b32feac3d3a56bccba9cb Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Sat, 14 Sep 2024 14:17:32 +0800 Subject: [PATCH 19/31] add docker Signed-off-by: XuhuiRen --- .github/workflows/docker/compose/dataprep-compose-cd.yaml | 4 ++++ .github/workflows/docker/compose/retrievers-compose-cd.yaml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/.github/workflows/docker/compose/dataprep-compose-cd.yaml b/.github/workflows/docker/compose/dataprep-compose-cd.yaml index fb08b51fa..6622a2921 100644 --- a/.github/workflows/docker/compose/dataprep-compose-cd.yaml +++ b/.github/workflows/docker/compose/dataprep-compose-cd.yaml @@ -27,3 +27,7 @@ services: build: dockerfile: comps/dataprep/vdms/langchain/Dockerfile image: ${REGISTRY:-opea}/dataprep-vdms:${TAG:-latest} + dataprep-neo4j: + build: + dockerfile: comps/dataprep/neo4j/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-neo4j:${TAG:-latest} diff --git a/.github/workflows/docker/compose/retrievers-compose-cd.yaml b/.github/workflows/docker/compose/retrievers-compose-cd.yaml index f9230412d..c4e26ffab 100644 --- a/.github/workflows/docker/compose/retrievers-compose-cd.yaml +++ b/.github/workflows/docker/compose/retrievers-compose-cd.yaml @@ -27,3 +27,7 @@ services: build: dockerfile: comps/retrievers/multimodal/redis/langchain/Dockerfile image: ${REGISTRY:-opea}/multimodal-retriever-redis:${TAG:-latest} + retriever-neo4j: + build: + dockerfile: comps/retrievers/neo4j/langchain/Dockerfile + image: ${REGISTRY:-opea}/retriever-neo4j:${TAG:-latest} \ No newline at end of file From f7f2fce44aa9142cec22b2b79f56c72941397841 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 15:58:35 +0800 Subject: [PATCH 20/31] Update retrievers-compose-cd.yaml --- .github/workflows/docker/compose/retrievers-compose-cd.yaml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker/compose/retrievers-compose-cd.yaml b/.github/workflows/docker/compose/retrievers-compose-cd.yaml index c4e26ffab..67b44fd0f 100644 --- a/.github/workflows/docker/compose/retrievers-compose-cd.yaml +++ b/.github/workflows/docker/compose/retrievers-compose-cd.yaml @@ -30,4 +30,4 @@ services: retriever-neo4j: build: dockerfile: comps/retrievers/neo4j/langchain/Dockerfile - image: ${REGISTRY:-opea}/retriever-neo4j:${TAG:-latest} \ No newline at end of file + image: ${REGISTRY:-opea}/retriever-neo4j:${TAG:-latest} From 0da11d94916983360c4e8cfe92ee19d84694e724 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 19:53:54 +0800 Subject: [PATCH 21/31] Update test_retrievers_neo4j_langchain.sh --- tests/retrievers/test_retrievers_neo4j_langchain.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index 9855fe75f..e8a179fbd 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH - docker run -d -p 7474:7474 -p 7687:7687 -v ./data:/data -v ./plugins:/plugins --name test-comps-neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest + docker run -d -p 7475:7475 -p 7688:7688 -v ./data:/data -v ./plugins:/plugins --name test-comps-neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest sleep 30s docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . 
From a5309cc756bc66c25eac1a57224aa1ddc4b772ae Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 20:23:04 +0800 Subject: [PATCH 22/31] Update config.py --- comps/retrievers/neo4j/langchain/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/retrievers/neo4j/langchain/config.py b/comps/retrievers/neo4j/langchain/config.py index 39adf6d89..bfd0eade4 100644 --- a/comps/retrievers/neo4j/langchain/config.py +++ b/comps/retrievers/neo4j/langchain/config.py @@ -4,7 +4,7 @@ import os # Neo4J configuration -NEO4J_URL = os.getenv("NEO4J_URI", "bolt://localhost:7687") +NEO4J_URL = os.getenv("NEO4J_URI", "bolt://localhost:7688") NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j") NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "test") From 96c8fe3d25fdcc35858bcbabd3af3d3eeb0f1ed6 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 20:36:25 +0800 Subject: [PATCH 23/31] Update test_retrievers_neo4j_langchain.sh --- tests/retrievers/test_retrievers_neo4j_langchain.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index e8a179fbd..91afe100c 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -31,7 +31,7 @@ function start_service() { export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" # Neo4J retriever - export NEO4J_URI="bolt://${ip_address}:7687" + export NEO4J_URI="bolt://${ip_address}:7688" export NEO4J_USERNAME="neo4j" export NEO4J_PASSWORD="password" retriever_port=5435 From 32240b3188247f7b86603600aecca06c69e3fd3a Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 20:48:14 +0800 Subject: [PATCH 24/31] Update test_retrievers_neo4j_langchain.sh --- 
tests/retrievers/test_retrievers_neo4j_langchain.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index 91afe100c..570241a88 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -37,7 +37,7 @@ function start_service() { retriever_port=5435 # unset http_proxy export no_proxy="localhost,127.0.0.1,"${ip_address} - docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI="bolt://${ip_address}:7687" -e NEO4J_USERNAME="neo4j" -e NEO4J_PASSWORD="password" opea/retriever-neo4j:comps + docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI="bolt://${ip_address}:7688" -e NEO4J_USERNAME="neo4j" -e NEO4J_PASSWORD="password" opea/retriever-neo4j:comps sleep 1m } From 355739074a801c124c234c702cea761f03e0ae3f Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 21:04:13 +0800 Subject: [PATCH 25/31] Update config.py --- comps/retrievers/neo4j/langchain/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/retrievers/neo4j/langchain/config.py b/comps/retrievers/neo4j/langchain/config.py index bfd0eade4..39adf6d89 100644 --- a/comps/retrievers/neo4j/langchain/config.py +++ b/comps/retrievers/neo4j/langchain/config.py @@ -4,7 +4,7 @@ import os # Neo4J configuration -NEO4J_URL = os.getenv("NEO4J_URI", "bolt://localhost:7688") +NEO4J_URL = os.getenv("NEO4J_URI", "bolt://localhost:7687") NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j") NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "test") From 6e3e1d0c25b3956bc70063eb4def9477a37c68bd Mon Sep 17 00:00:00 2001 From: XuhuiRen 
<44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 21:05:17 +0800 Subject: [PATCH 26/31] Update test_retrievers_neo4j_langchain.sh --- tests/retrievers/test_retrievers_neo4j_langchain.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh index 570241a88..9855fe75f 100644 --- a/tests/retrievers/test_retrievers_neo4j_langchain.sh +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH - docker run -d -p 7475:7475 -p 7688:7688 -v ./data:/data -v ./plugins:/plugins --name test-comps-neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest + docker run -d -p 7474:7474 -p 7687:7687 -v ./data:/data -v ./plugins:/plugins --name test-comps-neo4j-apoc1 -e NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest sleep 30s docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . 
@@ -31,13 +31,13 @@ function start_service() { export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" # Neo4J retriever - export NEO4J_URI="bolt://${ip_address}:7688" + export NEO4J_URI="bolt://${ip_address}:7687" export NEO4J_USERNAME="neo4j" export NEO4J_PASSWORD="password" retriever_port=5435 # unset http_proxy export no_proxy="localhost,127.0.0.1,"${ip_address} - docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI="bolt://${ip_address}:7688" -e NEO4J_USERNAME="neo4j" -e NEO4J_PASSWORD="password" opea/retriever-neo4j:comps + docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI="bolt://${ip_address}:7687" -e NEO4J_USERNAME="neo4j" -e NEO4J_PASSWORD="password" opea/retriever-neo4j:comps sleep 1m } From 0373c57047ca6e1f0aefd52abf629636e62be942 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 21:08:56 +0800 Subject: [PATCH 27/31] Update requirements.txt --- comps/retrievers/neo4j/langchain/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/retrievers/neo4j/langchain/requirements.txt b/comps/retrievers/neo4j/langchain/requirements.txt index 55e858b50..659afeaed 100644 --- a/comps/retrievers/neo4j/langchain/requirements.txt +++ b/comps/retrievers/neo4j/langchain/requirements.txt @@ -2,7 +2,6 @@ docarray[full] fastapi frontend==0.0.3 huggingface_hub -langchain langchain-community neo4j numpy @@ -19,3 +18,4 @@ sentence_transformers shortuuid tiktoken uvicorn +langchain==0.2.39 From 5d9a870040a63baa1bd844423c26eab528ed4511 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 14 Sep 2024 13:09:13 +0000 Subject: [PATCH 28/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci --- comps/retrievers/neo4j/langchain/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/retrievers/neo4j/langchain/requirements.txt b/comps/retrievers/neo4j/langchain/requirements.txt index 659afeaed..de50d4002 100644 --- a/comps/retrievers/neo4j/langchain/requirements.txt +++ b/comps/retrievers/neo4j/langchain/requirements.txt @@ -2,6 +2,7 @@ docarray[full] fastapi frontend==0.0.3 huggingface_hub +langchain==0.2.39 langchain-community neo4j numpy @@ -18,4 +19,3 @@ sentence_transformers shortuuid tiktoken uvicorn -langchain==0.2.39 From b533fcf42e90a33a9a595684ca3bbe509017130e Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 21:13:46 +0800 Subject: [PATCH 29/31] Update requirements.txt From 33cdd89d5e7e9afb63d6cbda2bbcd142a3415181 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 21:17:55 +0800 Subject: [PATCH 30/31] Update requirements.txt --- comps/retrievers/neo4j/langchain/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/retrievers/neo4j/langchain/requirements.txt b/comps/retrievers/neo4j/langchain/requirements.txt index de50d4002..55e858b50 100644 --- a/comps/retrievers/neo4j/langchain/requirements.txt +++ b/comps/retrievers/neo4j/langchain/requirements.txt @@ -2,7 +2,7 @@ docarray[full] fastapi frontend==0.0.3 huggingface_hub -langchain==0.2.39 +langchain langchain-community neo4j numpy From 5e887913b4d4df3ca998d9b55f5ae0e3ae863143 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sat, 14 Sep 2024 21:23:01 +0800 Subject: [PATCH 31/31] Update requirements.txt --- comps/retrievers/neo4j/langchain/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/retrievers/neo4j/langchain/requirements.txt b/comps/retrievers/neo4j/langchain/requirements.txt index 
55e858b50..24f579c6a 100644 --- a/comps/retrievers/neo4j/langchain/requirements.txt +++ b/comps/retrievers/neo4j/langchain/requirements.txt @@ -2,7 +2,7 @@ docarray[full] fastapi frontend==0.0.3 huggingface_hub -langchain +langchain==0.2 langchain-community neo4j numpy