add: Pathway vector store and retriever as LangChain component #342

Merged Aug 29, 2024 · 36 commits
Commits:
c126649
nb
berkecanrizai Jul 23, 2024
2c6fd04
init changes
berkecanrizai Jul 23, 2024
1eff95a
docker
berkecanrizai Jul 23, 2024
ac192a6
example data
berkecanrizai Jul 23, 2024
0d893f8
docs(readme): update, add commands
berkecanrizai Jul 23, 2024
b7cd748
fix: formatting, data sources
berkecanrizai Jul 24, 2024
0dbfeca
docs(readme): update instructions, add comments
berkecanrizai Jul 24, 2024
652b007
fix: rm unused parts
berkecanrizai Jul 24, 2024
98657f3
fix: image name, compose env vars
berkecanrizai Jul 24, 2024
7c729c3
fix: rm unused part
berkecanrizai Jul 24, 2024
9f1b879
fix: logging name
berkecanrizai Jul 24, 2024
6a0e849
fix: env var
berkecanrizai Jul 24, 2024
69d4cdc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 24, 2024
e36e7dc
fix: rename pw docker
berkecanrizai Jul 29, 2024
7127eb1
docs(readme): update input sources
berkecanrizai Jul 29, 2024
bbb59d2
nb
berkecanrizai Jul 23, 2024
0071d37
init changes
berkecanrizai Jul 23, 2024
9d8ad10
fix: formatting, data sources
berkecanrizai Jul 24, 2024
3aae9b1
docs(readme): update instructions, add comments
berkecanrizai Jul 24, 2024
28bf03b
fix: rm unused part
berkecanrizai Jul 24, 2024
732d4ee
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 24, 2024
7788a3e
fix: rename pw docker
berkecanrizai Jul 29, 2024
0c29407
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 29, 2024
5eb14d2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 29, 2024
b3d8ea4
Merge branch 'main' into main
Spycsh Aug 20, 2024
4aa4ffc
feat: mv vector store, naming, clarify instructions, improve ingestio…
berkecanrizai Aug 22, 2024
4d2bf40
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 22, 2024
f7555c5
tests: add pw retriever test
berkecanrizai Aug 22, 2024
0f2b88e
Merge branch 'main' of https://github.com/berkecanrizai/GenAIComps
berkecanrizai Aug 22, 2024
f19e3d5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 22, 2024
8ec79ee
implement suggestions from review, entrypoint, reqs, comments, https_…
berkecanrizai Aug 23, 2024
30c3eee
Merge branch 'main' of https://github.com/berkecanrizai/GenAIComps
berkecanrizai Aug 23, 2024
32e1158
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 23, 2024
807848f
fix: update docker tags in test and readme
berkecanrizai Aug 28, 2024
6853a14
Merge branch 'main' of https://github.com/berkecanrizai/GenAIComps
berkecanrizai Aug 28, 2024
05f8b6a
tests: add separate pathway vectorstore test
berkecanrizai Aug 28, 2024
25 changes: 25 additions & 0 deletions comps/retrievers/langchain/pathway/Dockerfile
@@ -0,0 +1,25 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM pathwaycom/pathway:0.13.2-slim

ENV DOCKER_BUILDKIT=1
ENV PYTHONUNBUFFERED=1

RUN apt-get update && apt-get install -y \
poppler-utils \
libreoffice \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /app

RUN pip install -U "pathway[xpack-llm]" && \
    pip install "unstructured[all-docs] >= 0.10.28, < 0.15" && \
    pip install langchain_openai sentence_transformers

COPY pathway_vs.py /app/


CMD ["python", "pathway_vs.py"]

124 changes: 124 additions & 0 deletions comps/retrievers/langchain/pathway/README.md
@@ -0,0 +1,124 @@
# Retriever Microservice with Pathway

# 🚀Start Microservice with Python

## Install Requirements

```bash
pip install -r requirements.txt
```

### With Docker Compose

(Optional) Start the embedder service separately.

> Note: Docker Compose starts this service as well, so this step is optional.

```bash
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=${your_langchain_api_key}
export LANGCHAIN_PROJECT="opea/retriever"
model=BAAI/bge-base-en-v1.5
revision=refs/pr/4
# TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060" # if you want to use the hosted embedding service, example: "http://127.0.0.1:6060"

# then run:
docker run -p 6060:80 -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision
```

Check that the embedding service is up:

```bash
curl 127.0.0.1:6060/embed -X POST -d '{"inputs":"What is Deep Learning?"}' -H 'Content-Type: application/json'
```

## Start the Pathway Vector DB Server

Make sure to set the Pathway environment variables.

> Note: If you are using `TEI_EMBEDDING_ENDPOINT`, make sure the embedding service is already running.

```bash
export PATHWAY_HOST=0.0.0.0
export PATHWAY_PORT=8666
model=BAAI/bge-base-en-v1.5
revision=refs/pr/4
# TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060" # uncomment if you want to use the hosted embedding service, example: "http://127.0.0.1:6060"
```

## Setting up the Pathway data sources

Pathway can listen to many sources simultaneously, such as local files, S3 folders, cloud storage, and any data stream. Whenever a new file is added or an existing file is modified, Pathway parses, chunks, and indexes the documents in real time.

See [pathway-io](https://pathway.com/developers/api-docs/pathway-io) for more information.

You can easily connect to the data inside the folder with the Pathway file system connector. The data will automatically be updated by Pathway whenever the content of the folder changes. In this example, we create a single data source that reads the files under the `./data` folder.

You can manage your data sources by configuring the `data_sources` in `pathway_vs.py`.

```python
import pathway as pw

data = pw.io.fs.read(
"./data",
format="binary",
mode="streaming",
with_metadata=True,
) # This creates a Pathway connector that tracks
# all the files in the ./data directory

data_sources = [data]
```

Build the Docker image and run the Pathway Vector Store:

```bash
cd comps/retrievers/langchain/pathway

docker build -t vectorstore-pathway .

# with a locally loaded model; you can add the `EMBED_MODEL` env variable to configure the model.
docker run -e PATHWAY_HOST=${PATHWAY_HOST} -e PATHWAY_PORT=${PATHWAY_PORT} -v ./data:/app/data -p ${PATHWAY_PORT}:${PATHWAY_PORT} vectorstore-pathway

# with the hosted embedder (the network argument is needed for the vector server to reach the embedding service)
docker run -e PATHWAY_HOST=${PATHWAY_HOST} -e PATHWAY_PORT=${PATHWAY_PORT} -e TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT} -v ./data:/app/data -p ${PATHWAY_PORT}:${PATHWAY_PORT} --network="host" vectorstore-pathway
```

## Start Retriever Service

Note that the retriever service also expects the Pathway host and port variables to connect to the vector DB.

### With the Docker CLI

```bash
# make sure you are in the root folder of the repo
docker build -t opea/retriever-pathway:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/langchain/pathway/docker/Dockerfile .

docker run -p 7000:7000 -e PATHWAY_HOST=${PATHWAY_HOST} -e PATHWAY_PORT=${PATHWAY_PORT} --network="host" opea/retriever-pathway:latest
```

### With Docker Compose

```bash
cd comps/retrievers/langchain/pathway/docker

docker compose -f docker_compose_retriever.yaml build
docker compose -f docker_compose_retriever.yaml up

# shut down the containers
docker compose -f docker_compose_retriever.yaml down
```

Make sure the retriever service is working as expected:

```bash
curl http://0.0.0.0:7000/v1/health_check -X GET -H 'Content-Type: application/json'
```

Send an example query:

```bash
exm_embeddings=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")

curl http://0.0.0.0:7000/v1/retrieval -X POST -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${exm_embeddings}}" -H 'Content-Type: application/json'
```
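The request body above can also be built in plain Python rather than with the inline `python -c` one-liner. A minimal sketch mirroring the curl example (the 768-dimension random vector matches the `EmbedDoc768` schema used by the retriever; the HTTP call itself is omitted since it needs the running service):

```python
import json
import random

# Dummy 768-dimensional embedding, matching the EmbedDoc768 request schema
embedding = [random.uniform(-1, 1) for _ in range(768)]

# Request body for POST /v1/retrieval, as in the curl example above
payload = json.dumps(
    {
        "text": "What is the revenue of Nike in 2023?",
        "embedding": embedding,
    }
)
```

`payload` can then be POSTed to `http://0.0.0.0:7000/v1/retrieval` with any HTTP client.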
30 changes: 30 additions & 0 deletions comps/retrievers/langchain/pathway/docker/Dockerfile
@@ -0,0 +1,30 @@

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM langchain/langchain:latest

ARG ARCH="cpu"

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
libgl1-mesa-glx \
libjemalloc-dev \
vim

RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/

COPY comps /home/user/comps

USER user

RUN pip install --no-cache-dir --upgrade pip && \
if [ ${ARCH} = "cpu" ]; then pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
pip install --no-cache-dir -r /home/user/comps/retrievers/langchain/pathway/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/retrievers/langchain/pathway

ENTRYPOINT ["python", "retriever_pathway.py"]
@@ -0,0 +1,34 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3.8"

services:
tei_xeon_service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
container_name: tei-xeon-server
ports:
- "6060:80"
volumes:
- "./data:/data"
shm_size: 1g
command: --model-id ${RETRIEVE_MODEL_ID}
retriever:
image: opea/retriever-pathway:latest
container_name: retriever-pathway-server
ports:
- "7000:7000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
PATHWAY_HOST: ${PATHWAY_HOST}
PATHWAY_PORT: ${PATHWAY_PORT}
TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
restart: unless-stopped

networks:
default:
driver: bridge
59 changes: 59 additions & 0 deletions comps/retrievers/langchain/pathway/pathway_vs.py
@@ -0,0 +1,59 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import logging
import os

import pathway as pw
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
from pathway.xpacks.llm.parsers import ParseUnstructured
from pathway.xpacks.llm.vector_store import VectorStoreServer

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(name)s %(levelname)s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)

data = pw.io.fs.read(
"./data",
format="binary",
mode="streaming",
with_metadata=True,
)

data_sources = [data]

splitter = CharacterTextSplitter()

host = os.getenv("PATHWAY_HOST", "127.0.0.1")
port = int(os.getenv("PATHWAY_PORT", 8666))

EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")

tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT")

if __name__ == "__main__":
# Create vectorstore
if tei_embedding_endpoint:
# create embeddings using TEI endpoint service
logging.info(f"Initializing the embedder from tei_embedding_endpoint: {tei_embedding_endpoint}")
embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
else:
# create embeddings using local embedding model
embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)

server = VectorStoreServer.from_langchain_components(
*data_sources,
embedder=embeddings,
parser=ParseUnstructured(),
splitter=splitter,
)

server.run_server(
host,
port=port,
with_cache=True,
cache_backend=pw.persistence.Backend.filesystem("./Cache"),
)
13 changes: 13 additions & 0 deletions comps/retrievers/langchain/pathway/requirements.txt
@@ -0,0 +1,13 @@
docarray[full]
fastapi
frontend==0.0.3
huggingface_hub
langchain_community == 0.2.0
langsmith
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
pathway
prometheus-fastapi-instrumentator
sentence_transformers
shortuuid
52 changes: 52 additions & 0 deletions comps/retrievers/langchain/pathway/retriever_pathway.py
@@ -0,0 +1,52 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os
import time

from langchain_community.vectorstores import PathwayVectorClient
from langsmith import traceable

from comps import (
EmbedDoc768,
SearchedDoc,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)

host = os.getenv("PATHWAY_HOST", "127.0.0.1")
port = int(os.getenv("PATHWAY_PORT", 8666))

EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")

tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT")


@register_microservice(
name="opea_service@retriever_pathway",
service_type=ServiceType.RETRIEVER,
endpoint="/v1/retrieval",
host="0.0.0.0",
port=7000,
)
@traceable(run_type="retriever")
@register_statistics(names=["opea_service@retriever_pathway"])
def retrieve(input: EmbedDoc768) -> SearchedDoc:
start = time.time()
documents = pw_client.similarity_search(input.text, input.fetch_k)

docs = [TextDoc(text=r.page_content) for r in documents]

time_spent = time.time() - start
statistics_dict["opea_service@retriever_pathway"].append_latency(time_spent, None) # noqa: E501
return SearchedDoc(retrieved_docs=docs, initial_query=input.text)


if __name__ == "__main__":
# Create the vectorstore client
pw_client = PathwayVectorClient(host=host, port=port)
opea_microservices["opea_service@retriever_pathway"].start()