add dynamic batching embedding/reranking (#774)
* draft static batching embedding/reranking on a single Gaudi card

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* resolve segfault, deadlock and other issues

* narrow down default timeout

* add dockerfile

* fix hpu local microservice start

* openai format

* configurable timeout

* lower timeout

* fix

* lower default timeout

* bf16

* log, pad max_len

* autocast, 128

* fix accuracy issue

* performance fallback with no accuracy drop

* revert no-padding ones

* fix hpu graph wrapper

* add padding batch

* habana 1.18

* static -> dynamic

* add UT, add param in_single_process

* add Dockerfile

* fix the empty-doc case, and pass the model id from env

* CI

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: chen, suyue <[email protected]>
Co-authored-by: ZePan110 <[email protected]>
4 people authored Nov 6, 2024
1 parent a8e5adc commit 518cdfb
Showing 5 changed files with 439 additions and 3 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/docker/compose/embeddings-compose-cd.yaml
@@ -22,3 +22,7 @@ services:
    build:
      dockerfile: comps/embeddings/predictionguard/Dockerfile
    image: ${REGISTRY:-opea}/embedding-predictionguard:${TAG:-latest}
  embedding-reranking-local:
    build:
      dockerfile: comps/embeddings/tei/langchain/Dockerfile.dynamic_batching
    image: ${REGISTRY:-opea}/embedding-reranking-local:${TAG:-latest}
69 changes: 66 additions & 3 deletions comps/cores/mega/micro_service.py
Expand Up @@ -3,14 +3,21 @@

import asyncio
import multiprocessing
import os
from collections import defaultdict, deque
from enum import Enum
from typing import Any, List, Optional, Type

from ..proto.docarray import TextDoc
from .constants import ServiceRoleType, ServiceType
from .logger import CustomLogger
from .utils import check_ports_availability

opea_microservices = {}

logger = CustomLogger("micro_service")
logflag = os.getenv("LOGFLAG", False)


class MicroService:
    """MicroService class to create a microservice."""
@@ -31,6 +38,9 @@ def __init__(
        provider: Optional[str] = None,
        provider_endpoint: Optional[str] = None,
        use_remote_service: Optional[bool] = False,
        dynamic_batching: bool = False,
        dynamic_batching_timeout: int = 1,
        dynamic_batching_max_batch_size: int = 32,
    ):
        """Init the microservice."""
        self.name = f"{name}/{self.__class__.__name__}" if name else self.__class__.__name__
@@ -43,6 +53,9 @@ def __init__(
        self.input_datatype = input_datatype
        self.output_datatype = output_datatype
        self.use_remote_service = use_remote_service
        self.dynamic_batching = dynamic_batching
        self.dynamic_batching_timeout = dynamic_batching_timeout
        self.dynamic_batching_max_batch_size = dynamic_batching_max_batch_size
        self.uvicorn_kwargs = {}

        if ssl_keyfile:
@@ -58,10 +71,50 @@

        self.server = self._get_server()
        self.app = self.server.app
        # create a batch request processor loop if using dynamic batching
        if self.dynamic_batching:
            self.buffer_lock = asyncio.Lock()
            self.request_buffer = defaultdict(deque)

            @self.app.on_event("startup")
            async def startup_event():
                asyncio.create_task(self._dynamic_batch_processor())

        self.event_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self.event_loop)
        self.event_loop.run_until_complete(self._async_setup())

    async def _dynamic_batch_processor(self):
        if logflag:
            logger.info("dynamic batch processor looping...")
        while True:
            await asyncio.sleep(self.dynamic_batching_timeout)
            runtime_batch: dict[Enum, list[dict]] = {}  # {ServiceType.Embedding: [{"request": xx, "response": yy}, {}]}

            async with self.buffer_lock:
                # prepare the runtime batch, access to buffer is locked
                if self.request_buffer:
                    for service_type, request_lst in self.request_buffer.items():
                        batch = []
                        # grab min(MAX_BATCH_SIZE, REQUEST_SIZE) requests from buffer
                        for _ in range(min(self.dynamic_batching_max_batch_size, len(request_lst))):
                            batch.append(request_lst.popleft())

                        runtime_batch[service_type] = batch

            # Run batched inference on the batch and set results
            for service_type, batch in runtime_batch.items():
                if not batch:
                    continue
                results = await self.dynamic_batching_infer(service_type, batch)

                for req, result in zip(batch, results):
                    req["response"].set_result(result)

    async def dynamic_batching_infer(self, service_type: Enum, batch: list[dict]):
        """Need to implement."""
        raise NotImplementedError("Unimplemented dynamic batching inference!")
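
    # Illustrative sketch, not part of this commit: a concrete service is
    # expected to supply the actual batched inference here, e.g. for
    # embeddings (model and field names hypothetical):
    #
    #     async def dynamic_batching_infer(self, service_type, batch):
    #         texts = [item["request"].text for item in batch]
    #         vectors = model.encode(texts)  # one padded forward pass
    #         return list(vectors)  # one result per buffered request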

    def _validate_env(self):
        """Check whether to use the microservice locally."""
        if self.use_remote_service:
@@ -116,10 +169,14 @@ def run(self):
        self._validate_env()
        self.event_loop.run_until_complete(self._async_run_forever())

    def start(self):
    def start(self, in_single_process=False):
        self._validate_env()
        self.process = multiprocessing.Process(target=self.run, daemon=False, name=self.name)
        self.process.start()
        if in_single_process:
            # Resolve HPU segmentation fault and potential tokenizer issues by limiting to same process
            self.run()
        else:
            self.process = multiprocessing.Process(target=self.run, daemon=False, name=self.name)
            self.process.start()

    async def _async_teardown(self):
        """Shutdown the server."""
@@ -155,6 +212,9 @@ def register_microservice(
    provider: Optional[str] = None,
    provider_endpoint: Optional[str] = None,
    methods: List[str] = ["POST"],
    dynamic_batching: bool = False,
    dynamic_batching_timeout: int = 1,
    dynamic_batching_max_batch_size: int = 32,
):
    def decorator(func):
        if name not in opea_microservices:
@@ -172,6 +232,9 @@ def decorator(func):
            output_datatype=output_datatype,
            provider=provider,
            provider_endpoint=provider_endpoint,
            dynamic_batching=dynamic_batching,
            dynamic_batching_timeout=dynamic_batching_timeout,
            dynamic_batching_max_batch_size=dynamic_batching_max_batch_size,
        )
        opea_microservices[name] = micro_service
        opea_microservices[name].app.router.add_api_route(endpoint, func, methods=methods)
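The serving script wired to this API, local_embedding_reranking.py (the entrypoint of the Dockerfile below), is among the changed files not rendered here. What follows is a minimal sketch of how a handler could feed the batch processor, assuming the usual comps package exports, a hypothetical service name and port, and the {"request": ..., "response": future} item shape that _dynamic_batch_processor resolves via set_result(); the actual script may differ:

import asyncio

from comps import ServiceType, TextDoc, opea_microservices, register_microservice


@register_microservice(
    name="opea_service@local_embedding",  # hypothetical service name
    endpoint="/v1/embeddings",
    host="0.0.0.0",
    port=6001,  # hypothetical port
    dynamic_batching=True,  # start the background batch-processor loop
    dynamic_batching_timeout=1,  # seconds between batch flushes
    dynamic_batching_max_batch_size=32,
)
async def embedding(input: TextDoc) -> TextDoc:
    service = opea_microservices["opea_service@local_embedding"]
    # Queue the request and wait for _dynamic_batch_processor to resolve
    # the future via req["response"].set_result(result).
    response = asyncio.get_running_loop().create_future()
    async with service.buffer_lock:
        service.request_buffer[ServiceType.EMBEDDING].append({"request": input, "response": response})
    return await response


if __name__ == "__main__":
    # in_single_process=True keeps model init and serving in one process,
    # avoiding the HPU fork segfault that start() works around.
    opea_microservices["opea_service@local_embedding"].start(in_single_process=True)

Collecting requests behind a short timeout this way is what lets the Gaudi card run padded, fixed-shape bf16 batches (the padding and autocast work listed in the commit message) instead of one forward pass per request.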
28 changes: 28 additions & 0 deletions comps/embeddings/tei/langchain/Dockerfile.dynamic_batching
@@ -0,0 +1,28 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# FROM opea/habanalabs:1.16.1-pytorch-installer-2.2.2 as hpu
FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest as hpu

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

# Disable user for now
# USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/embeddings/tei/langchain/requirements.txt && \
    pip install git+https://github.com/huggingface/optimum-habana.git

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/embeddings/tei/langchain

ENTRYPOINT ["python", "local_embedding_reranking.py"]
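
To match the compose entry above, the image would presumably be built from the repository root with docker build -f comps/embeddings/tei/langchain/Dockerfile.dynamic_batching -t opea/embedding-reranking-local:latest . and run with the Habana container runtime (--runtime=habana) so the service can reach the Gaudi card.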
