diff --git a/.github/workflows/docker/compose/reranks-compose-cd.yaml b/.github/workflows/docker/compose/reranks-compose-cd.yaml index d805b570e..0cebf4ca3 100644 --- a/.github/workflows/docker/compose/reranks-compose-cd.yaml +++ b/.github/workflows/docker/compose/reranks-compose-cd.yaml @@ -14,3 +14,11 @@ services: build: dockerfile: comps/reranks/langchain-mosec/docker/Dockerfile image: ${REGISTRY:-opea}/reranking-langchain-mosec:${TAG:-latest} + reranking-mosec-neural-speed: + build: + dockerfile: comps/reranks/neural-speed/docker/Dockerfile + image: ${REGISTRY:-opea}/reranking-mosec-neural-speed:${TAG:-latest} + reranking-mosec-neural-speed-endpoint: + build: + dockerfile: comps/reranks/neural-speed/neuralspeed-docker/Dockerfile + image: ${REGISTRY:-opea}/reranking-mosec-neural-speed-endpoint:${TAG:-latest} diff --git a/comps/reranks/neural-speed/README.md b/comps/reranks/neural-speed/README.md new file mode 100644 index 000000000..c1841e16a --- /dev/null +++ b/comps/reranks/neural-speed/README.md @@ -0,0 +1,32 @@ +# build Mosec endpoint docker image + +``` +docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t langchain-mosec:neuralspeed-reranks -f comps/reranks/neural-speed/neuralspeed-docker/Dockerfile . +``` + +# build Reranking microservice docker image + +``` +docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/reranking-langchain-mosec:neuralspeed -f comps/reranks/neural-speed/docker/Dockerfile . +``` + +Note: Please contact us to request model files before building images. + +# launch Mosec endpoint docker container + +``` +docker run -d --name="reranking-langchain-mosec-endpoint" -p 6001:8000 langchain-mosec:neuralspeed-reranks +``` + +# launch Reranking microservice docker container + +``` +export MOSEC_RERANKING_ENDPOINT=http://127.0.0.1:6001 +docker run -d --name="reranking-langchain-mosec-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 6000:8000 --ipc=host -e MOSEC_RERANKING_ENDPOINT=$MOSEC_RERANKING_ENDPOINT opea/reranking-langchain-mosec:neuralspeed +``` + +# run client test + +``` +curl http://localhost:6000/v1/reranking -X POST -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' -H 'Content-Type: application/json' +``` diff --git a/comps/reranks/neural-speed/__init__.py b/comps/reranks/neural-speed/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/reranks/neural-speed/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/reranks/neural-speed/docker/Dockerfile b/comps/reranks/neural-speed/docker/Dockerfile new file mode 100644 index 000000000..8ffed65ec --- /dev/null +++ b/comps/reranks/neural-speed/docker/Dockerfile @@ -0,0 +1,31 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM langchain/langchain:latest + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/reranks/neural-speed/requirements.txt + +RUN pip3 install llmspec mosec msgspec httpx requests +RUN pip3 install torch==2.2.2 --trusted-host download.pytorch.org --index-url https://download.pytorch.org/whl/cpu + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/reranks/neural-speed + +ENTRYPOINT ["python", "reranking_neuralspeed_svc.py"] + diff --git a/comps/reranks/neural-speed/docker/docker_compose_embedding.yaml b/comps/reranks/neural-speed/docker/docker_compose_embedding.yaml new file mode 100644 index 000000000..d5f59b4a0 --- /dev/null +++ b/comps/reranks/neural-speed/docker/docker_compose_embedding.yaml @@ -0,0 +1,22 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + reranking: + image: opea/reranking-langchain-mosec:neuralspeed + container_name: reranking-langchain-mosec-server + ports: + - "6000:8000" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + MOSEC_RERANKING_ENDPOINT: ${MOSEC_RERANKING_ENDPOINT} + LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/reranks/neural-speed/neuralspeed-docker/Dockerfile b/comps/reranks/neural-speed/neuralspeed-docker/Dockerfile new file mode 100644 index 000000000..42dcbad8c --- /dev/null +++ b/comps/reranks/neural-speed/neuralspeed-docker/Dockerfile @@ -0,0 +1,27 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +From ubuntu:22.04 +ARG DEBIAN_FRONTEND=noninteractive + +ENV GLIBC_TUNABLES glibc.cpu.x86_shstk=permissive + +COPY comps /root/comps +COPY neural_speed-0.1.dev45+g41ea0aa-cp310-cp310-linux_x86_64.whl /root/ +COPY bge-large-r-q8.bin /root/ +COPY libstdc++.so.6 /root/ + +RUN apt update && apt install -y python3 python3-pip +RUN pip3 install -r /root/comps/reranks/neural-speed/neuralspeed-docker/requirements.txt +RUN pip3 install llmspec mosec msgspec httpx requests +RUN pip3 install /root/neural_speed-0.1.dev45+g41ea0aa-cp310-cp310-linux_x86_64.whl + +RUN cd /root/ && export HF_ENDPOINT=https://hf-mirror.com && huggingface-cli download --resume-download BAAI/bge-reranker-large --local-dir /root/bge-reranker-large + + +ENV LD_PRELOAD=/root/libstdc++.so.6 + + +WORKDIR /root/comps/reranks/neural-speed/neuralspeed-docker + +CMD ["python3", "server.py"] diff --git a/comps/reranks/neural-speed/neuralspeed-docker/client.py b/comps/reranks/neural-speed/neuralspeed-docker/client.py new file mode 100644 index 000000000..02017faaf --- /dev/null +++ b/comps/reranks/neural-speed/neuralspeed-docker/client.py @@ -0,0 +1,35 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from http import HTTPStatus + +import httpx +import msgspec +import requests + +req = { + "query": "talk is cheap, show me the code", + "docs": [ + "what a nice day", + "life is short, use python", + "early bird catches the worm", + ], +} + +httpx_response = httpx.post("http://127.0.0.1:8080/inference", content=msgspec.msgpack.encode(req)) + +requests_response = requests.post("http://127.0.0.1:8080/inference", data=msgspec.msgpack.encode(req)) + +MOSEC_RERANKING_ENDPOINT = os.environ.get("MOSEC_RERANKING_ENDPOINT", "http://127.0.0.1:8080") + +request_url = MOSEC_RERANKING_ENDPOINT + "/inference" +print(f"request_url = {request_url}") +resp_3 = requests.post(request_url, data=msgspec.msgpack.encode(req)) + +if httpx_response.status_code == HTTPStatus.OK and requests_response.status_code == HTTPStatus.OK: + print(f"OK: \n {msgspec.msgpack.decode(httpx_response.content)}") + print(f"OK: \n {msgspec.msgpack.decode(requests_response.content)}") + print(f"OK: \n {msgspec.msgpack.decode(resp_3.content)}") +else: + print(f"err[{httpx_response.status_code}] {httpx_response.text}") diff --git a/comps/reranks/neural-speed/neuralspeed-docker/client_multibatch.py b/comps/reranks/neural-speed/neuralspeed-docker/client_multibatch.py new file mode 100644 index 000000000..09eee1dfb --- /dev/null +++ b/comps/reranks/neural-speed/neuralspeed-docker/client_multibatch.py @@ -0,0 +1,45 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from http import HTTPStatus +from threading import Thread + +import httpx +import msgspec + +req = { + "query": "talk is cheap, show me the code", + "docs": [ + "what a nice day", + "life is short, use python", + "early bird catches the worm", + ], +} +reqs = [] +BATCH = 32 +for i in range(BATCH): + reqs.append(msgspec.msgpack.encode(req)) + + +def post_func(threadIdx): + resp = httpx.post("http://127.0.0.1:8080/inference", content=reqs[threadIdx]) + ret = f"thread {threadIdx} \n" + if resp.status_code == HTTPStatus.OK: + ret += f"OK: {msgspec.msgpack.decode(resp.content)['scores']}" + else: + ret += f"err[{resp.status_code}] {resp.text}" + print(ret) + + +threads = [] +for i in range(BATCH): + t = Thread( + target=post_func, + args=[ + i, + ], + ) + threads.append(t) + +for i in range(BATCH): + threads[i].start() diff --git a/comps/reranks/neural-speed/neuralspeed-docker/requirements.txt b/comps/reranks/neural-speed/neuralspeed-docker/requirements.txt new file mode 100644 index 000000000..50dc540fc --- /dev/null +++ b/comps/reranks/neural-speed/neuralspeed-docker/requirements.txt @@ -0,0 +1,16 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +accelerate +cmake +datasets +huggingface_hub +matplotlib +numpy +peft +protobuf<3.20 +py-cpuinfo +sentencepiece +tiktoken +torch +transformers +transformers_stream_generator +zipfile38 diff --git a/comps/reranks/neural-speed/neuralspeed-docker/server.py b/comps/reranks/neural-speed/neuralspeed-docker/server.py new file mode 100644 index 000000000..0176abcfb --- /dev/null +++ b/comps/reranks/neural-speed/neuralspeed-docker/server.py @@ -0,0 +1,91 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +from typing import Any, List + +import numpy +from mosec import Server, Worker, get_logger +from mosec.mixin import TypedMsgPackMixin +from msgspec import Struct +from neural_speed import Model +from transformers import AutoModelForSequenceClassification, AutoTokenizer + +logger = get_logger() + +INFERENCE_BATCH_SIZE = 128 +INFERENCE_MAX_WAIT_TIME = 10 +INFERENCE_WORKER_NUM = 1 +INFERENCE_CONTEXT = 512 + +TorchModel = "/root/bge-reranker-large" +NS_Bin = "/root/bge-large-r-q8.bin" + +NS_Model = "bert" + + +class Request(Struct, kw_only=True): + query: str + docs: List[str] + + +class Response(Struct, kw_only=True): + scores: List[float] + + +class Inference(TypedMsgPackMixin, Worker): + + def __init__(self): + super().__init__() + self.tokenizer = AutoTokenizer.from_pretrained(TorchModel) + self.model = Model() + self.model.init_from_bin( + NS_Model, + NS_Bin, + batch_size=INFERENCE_BATCH_SIZE, + n_ctx=INFERENCE_CONTEXT + 2, + ) + + def forward(self, data: List[Request]) -> List[Response]: + batch = len(data) + ndoc = [] + inps = [] + for data in data: + inp = [[data.query, doc] for doc in data.docs] + inps.extend(inp) + ndoc.append(len(data.docs)) + outs = [] + for i in range(0, len(inps), INFERENCE_BATCH_SIZE): + inp_bs = inps[i : i + INFERENCE_BATCH_SIZE] + inputs = self.tokenizer( + inp_bs, padding=True, truncation=True, max_length=INFERENCE_CONTEXT, return_tensors="pt" + ) + st = time.time() + output = self.model( + **inputs, + reinit=True, + logits_all=True, + continuous_batching=False, + ignore_padding=True, + ) + logger.info(f"Toal batch {batch} input shape {inputs.input_ids.shape} time {time.time()-st}") + outs.append(output) + ns_outputs = numpy.concatenate(outs, axis=0) + resps = [] + pos = 0 + for i in range(batch): + resp = Response(scores=ns_outputs[pos : pos + ndoc[i]].tolist()) + pos += ndoc[i] + resps.append(resp) + return resps + + +if __name__ == "__main__": + INFERENCE_BATCH_SIZE = int(os.environ.get("MAX_BATCH_SIZE", 128)) + INFERENCE_MAX_WAIT_TIME = int(os.environ.get("MAX_WAIT_TIME", 1)) + server = Server() + server.append_worker( + Inference, max_batch_size=INFERENCE_BATCH_SIZE, max_wait_time=INFERENCE_MAX_WAIT_TIME, num=INFERENCE_WORKER_NUM + ) + server.run() diff --git a/comps/reranks/neural-speed/requirements.txt b/comps/reranks/neural-speed/requirements.txt new file mode 100644 index 000000000..9fa1a059c --- /dev/null +++ b/comps/reranks/neural-speed/requirements.txt @@ -0,0 +1,11 @@ +docarray[full] +fastapi +langchain +langchain_community +openai +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +shortuuid +uvicorn diff --git a/comps/reranks/neural-speed/reranking_neuralspeed_svc.py b/comps/reranks/neural-speed/reranking_neuralspeed_svc.py new file mode 100644 index 000000000..098378a52 --- /dev/null +++ b/comps/reranks/neural-speed/reranking_neuralspeed_svc.py @@ -0,0 +1,93 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import heapq +import json +import os +import re +import time +from typing import List, Optional, Union + +import httpx +import msgspec +import requests +import torch +from langchain_core.prompts import ChatPromptTemplate +from langsmith import traceable + +from comps import ( + CustomLogger, + LLMParamsDoc, + SearchedDoc, + ServiceType, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) +from comps.cores.proto.api_protocol import ( + ChatCompletionRequest, + RerankingRequest, + RerankingResponse, + RerankingResponseData, +) + + +@register_microservice( + name="opea_service@reranking_mosec", + service_type=ServiceType.RERANK, + endpoint="/v1/reranking", + host="0.0.0.0", + port=8000, + input_datatype=SearchedDoc, + output_datatype=LLMParamsDoc, +) +@traceable(run_type="reranking") +@register_statistics(names=["opea_service@reranking_mosec"]) +def reranking( + input: Union[SearchedDoc, RerankingRequest, ChatCompletionRequest] +) -> Union[LLMParamsDoc, RerankingResponse, ChatCompletionRequest]: + start = time.time() + reranking_results = [] + if input.retrieved_docs: + docs = [doc.text for doc in input.retrieved_docs] + url = mosec_reranking_endpoint + "/inference" + if isinstance(input, SearchedDoc): + query = input.initial_query + else: + # for RerankingRequest, ChatCompletionRequest + query = input.input + data = {"query": query, "docs": docs} + resp = requests.post(url, data=msgspec.msgpack.encode(data)) + response_list = msgspec.msgpack.decode(resp.content)["scores"] + response = torch.nn.functional.sigmoid(torch.tensor(response_list)) + length = len(response) + resp_list = response.tolist() + sorted_score = heapq.nlargest(length, resp_list) + sorted_score_index = map(resp_list.index, sorted_score) + + for i in range(input.top_n): + reranking_results.append( + {"text": input.retrieved_docs[list(sorted_score_index)[i]].text, "score": sorted_score[i]} + ) + + statistics_dict["opea_service@reranking_mosec"].append_latency(time.time() - start, None) + if isinstance(input, SearchedDoc): + return LLMParamsDoc(query=input.initial_query, documents=[doc["text"] for doc in reranking_results]) + else: + reranking_docs = [] + for doc in reranking_results: + reranking_docs.append(RerankingResponseData(text=doc["text"], score=doc["score"])) + if isinstance(input, RerankingRequest): + return RerankingResponse(reranked_docs=reranking_docs) + + if isinstance(input, ChatCompletionRequest): + input.reranked_docs = reranking_docs + input.documents = [doc["text"] for doc in reranking_results] + return input + + +if __name__ == "__main__": + mosec_reranking_endpoint = os.getenv("MOSEC_RERANKING_ENDPOINT", "http://localhost:8080") + print("NeuralSpeed Reranking Microservice Initialized.") + opea_microservices["opea_service@reranking_mosec"].start() diff --git a/tests/test_reranks_mosec-neuralspeed.sh b/tests/test_reranks_mosec-neuralspeed.sh new file mode 100644 index 000000000..4512dc794 --- /dev/null +++ b/tests/test_reranks_mosec-neuralspeed.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +function build_mosec_docker_images() { + cd $WORKPATH + echo $(pwd) + cp /data2/nswhl/* ./ + docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t langchain-mosec:neuralspeed-reranks -f comps/reranks/neural-speed/neuralspeed-docker/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/reranking-langchain-mosec-endpoint built fail" + exit 1 + else + echo "opea/reranking-langchain-mosec-endpoint built successful" + fi +} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/reranking-langchain-mosec:neuralspeed -f comps/reranks/neural-speed/docker/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/reranking-langchain-mosec built fail" + exit 1 + else + echo "opea/reranking-langchain-mosec built successful" + fi +} + +function start_service() { + mosec_endpoint=5006 + model="BAAI/bge-reranker-large" + unset http_proxy + docker run -d --name="test-comps-reranking-langchain-mosec-endpoint" -p $mosec_endpoint:8000 langchain-mosec:neuralspeed-reranks + export MOSEC_RERANKING_ENDPOINT="http://${ip_address}:${mosec_endpoint}" + mosec_service_port=5007 + docker run -d --name="test-comps-reranking-langchain-mosec-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p ${mosec_service_port}:8000 --ipc=host -e MOSEC_RERANKING_ENDPOINT=$MOSEC_RERANKING_ENDPOINT opea/reranking-langchain-mosec:neuralspeed + sleep 3m +} + +function validate_microservice() { + mosec_service_port=5007 + result=$(http_proxy="" curl http://${ip_address}:${mosec_service_port}/v1/reranking\ + -X POST \ + -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \ + -H 'Content-Type: application/json') + if [[ $result == *"Deep"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-reranking-langchain-mosec-endpoint + docker logs test-comps-reranking-langchain-mosec-server + exit 1 + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-reranking-langchain-mosec-*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_mosec_docker_images + + build_docker_images + + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main