feat: increased chat timeout to allow working with vision models; decreased frequency of the progress bar updates #197

Merged
merged 8 commits on Sep 20, 2024
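Of the two changes in the title, the chat timeout is handled by the new `CHAT_KEEP_ALIVE_TIMEOUT` variable introduced below, while the progress-bar change comes down to creating the tqdm bar with `mininterval=1` so it redraws at most once per second instead of on every chunk. A minimal sketch of that throttling, assuming a hypothetical download loop and a writer modeled on the one added in `ollama_setup/utils.py`:

```python
import sys
import time
from tqdm import tqdm


# Hypothetical stand-in for ollama_setup/utils.py::Writer — every refresh
# becomes its own log line, so fewer refreshes mean a quieter `docker compose up`.
class LineWriter:
    @classmethod
    def write(cls, s: str):
        if s.strip():
            print(s.strip(), file=sys.stderr, flush=True)

    @classmethod
    def flush(cls):
        sys.stderr.flush()


total = 4 * 1024**3  # pretend we are pulling a ~4 GB model
bar = tqdm(
    total=total,
    unit="B",
    unit_scale=True,
    desc="[pulling]",
    mininterval=1,   # redraw at most once per second
    file=LineWriter,
)
for _ in range(100):
    time.sleep(0.05)          # simulated chunk arrival
    bar.update(total // 100)  # without mininterval=1, this could log ~100 lines
bar.close()
```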
3 changes: 2 additions & 1 deletion dial-docker-compose/ci/ollama/test/app.py
@@ -58,7 +58,8 @@ async def dial_chat_completion(deployment_id: str, messages: list) -> str:
payload = {
"model": deployment_id,
"messages": messages,
"stream": False,
"temperature": 0.0,
"stream": False
}
headers = {"api-key": DIAL_API_KEY}
params = {"api-version": DIAL_API_VERSION}
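For context, the hunk above sits inside the CI test's request helper; a sketch of how such a call might look end to end — the payload keys, `api-key` header, and `api-version` parameter come from the surrounding file, while the `DIAL_URL` variable, the endpoint path, the default `api-version`, and the timeout are assumptions:

```python
import os

import httpx

DIAL_URL = os.environ["DIAL_URL"]  # assumed env var for the DIAL core host
DIAL_API_KEY = os.environ.get("DIAL_API_KEY", "dial_api_key")
DIAL_API_VERSION = os.environ.get("DIAL_API_VERSION", "2024-02-01")  # hypothetical default


async def dial_chat_completion(deployment_id: str, messages: list) -> str:
    payload = {
        "model": deployment_id,
        "messages": messages,
        "temperature": 0.0,  # deterministic output keeps the CI assertion stable
        "stream": False,
    }
    headers = {"api-key": DIAL_API_KEY}
    params = {"api-version": DIAL_API_VERSION}
    async with httpx.AsyncClient(timeout=600) as client:  # generous timeout for vision models
        resp = await client.post(
            f"{DIAL_URL}/openai/deployments/{deployment_id}/chat/completions",
            json=payload,
            headers=headers,
            params=params,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
```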
1 change: 1 addition & 0 deletions dial-docker-compose/common.yml
@@ -17,6 +17,7 @@ services:
DIAL_API_HOST: "http://core:8080"
DIAL_API_KEY: "dial_api_key"
ENABLED_FEATURES: "conversations-section,prompts-section,top-settings,top-clear-conversation,top-chat-info,top-chat-model-settings,empty-chat-settings,header,footer,request-api-key,report-an-issue,likes,input-files,attachments-manager"
KEEP_ALIVE_TIMEOUT: ${CHAT_KEEP_ALIVE_TIMEOUT}

redis:
image: redis:7.2.4-alpine3.19
2 changes: 2 additions & 0 deletions dial-docker-compose/ollama/.env
@@ -1,4 +1,6 @@
DIAL_DIR="./ollama"
CHAT_KEEP_ALIVE_TIMEOUT=600000

OLLAMA_CHAT_MODEL=
OLLAMA_VISION_MODEL=
OLLAMA_EMBEDDING_MODEL=
5 changes: 0 additions & 5 deletions dial-docker-compose/ollama/docker-compose.yml
@@ -20,11 +20,6 @@ services:
- OLLAMA_CHAT_MODEL=${OLLAMA_CHAT_MODEL}
- OLLAMA_VISION_MODEL=${OLLAMA_VISION_MODEL}
- OLLAMA_EMBEDDING_MODEL=${OLLAMA_EMBEDDING_MODEL}
healthcheck:
test: ["CMD", "test", "-f", "/healthy"]
interval: 10s
start_period: 10s
retries: 10

adapter-openai:
image: epam/ai-dial-adapter-openai:0.14.0
2 changes: 2 additions & 0 deletions dial-docker-compose/ollama/ollama_setup/.dockerignore
@@ -0,0 +1,2 @@
.dockerignore
.venv
9 changes: 8 additions & 1 deletion dial-docker-compose/ollama/ollama_setup/Dockerfile
@@ -1,7 +1,14 @@
FROM python:3.11-alpine

RUN apk --no-cache add curl

WORKDIR /app
COPY * /app
RUN pip install -r requirements.txt

CMD ["sh", "-c", "python setup.py && tail -f /dev/null"]
EXPOSE 5000

HEALTHCHECK --interval=10s --timeout=1s --start-period=10s --retries=10 \
CMD curl --fail http://localhost:5000/health || exit 1

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "5000"]
@@ -1,12 +1,13 @@
import asyncio
from contextlib import asynccontextmanager
import os
import sys
import time
import asyncio
from fastapi import FastAPI
from ollama import AsyncClient
from tqdm import tqdm

from utils import Writer, print_info, timer

OLLAMA_URL = os.getenv("OLLAMA_URL")
if OLLAMA_URL is None:
raise RuntimeError("OLLAMA_URL env var isn't set")
@@ -15,46 +16,15 @@
OLLAMA_VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL")
OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL")

HEALTH_FILE = "/healthy"


class Writer:
@classmethod
def write(cls, s: str):
# NOTE: every tqdm progress bar update is deliberately ended with "\n",
# otherwise one wouldn't see the bar running in console upon running `docker compose up`.
print(s, file=sys.stdout, flush=True, end="\n")

@classmethod
def flush(cls):
sys.stdout.flush()


print_info = Writer.write

print_info(f"OLLAMA_URL = {OLLAMA_URL}")
print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")
print_info(f"OLLAMA_EMBEDDING_MODEL = {OLLAMA_EMBEDDING_MODEL}")


@asynccontextmanager
async def timer(name: str):
print_info(f"[{name}] Starting...")
start = time.perf_counter()
yield
elapsed = time.perf_counter() - start
print_info(f"[{name}] Finished in {elapsed:.2f} seconds")


async def wait_for_startup():
attempt = 0
attempts = 0
while True:
attempt += 1
attempts += 1
try:
await AsyncClient(host=OLLAMA_URL, timeout=5).ps()
except Exception:
print_info(f"[{attempt:>3}] Waiting for Ollama to start...")
print_info(f"[{attempts:>3}] Waiting for Ollama to start...")
await asyncio.sleep(5)
else:
break
@@ -73,30 +43,34 @@ async def pull_model(client: AsyncClient, model: str):

if status != prev_status and total:
prev_status = status
if progress_bar:
progress_bar.close()
progress_bar = tqdm(
total=total, unit="B", unit_scale=True, desc=f"[{status}]", file=Writer
total=total,
unit="B",
unit_scale=True,
desc=f"[{status}]",
mininterval=1,
file=Writer,
)

if completed and progress_bar and total:
if completed and total and progress_bar:
progress_bar.n = completed
progress_bar.set_description(f"[{status}]")
progress_bar.refresh()
progress_bar.update(n=0)

if total and total == completed and progress_bar:
progress_bar.close()
progress_bar = None

if not completed and not total:
print_info(f"[{status}]")


async def create_health_mark():
open(HEALTH_FILE, "w").close()

async def startup():
print_info(f"OLLAMA_URL = {OLLAMA_URL}")
print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")
print_info(f"OLLAMA_EMBEDDING_MODEL = {OLLAMA_EMBEDDING_MODEL}")

async def main():
client = AsyncClient(host=OLLAMA_URL, timeout=300000)
client = AsyncClient(host=OLLAMA_URL, timeout=300)

async with timer("Waiting for Ollama to start"):
await wait_for_startup()
@@ -117,11 +91,18 @@ async def main():
async with timer(f"Loading model {model_to_load} into memory"):
await client.generate(model_to_load)

await create_health_mark()

print_info("The Ollama server is up and running.")


if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
@asynccontextmanager
async def lifespan(app):
await startup()
yield


app = FastAPI(lifespan=lifespan)


@app.get("/health")
def health_check():
return {"status": "ok"}
4 changes: 3 additions & 1 deletion dial-docker-compose/ollama/ollama_setup/requirements.txt
@@ -1,3 +1,5 @@
httpx==0.27.2
tqdm==4.66.5
ollama==0.3.3
ollama==0.3.3
fastapi==0.115.0
uvicorn==0.30.6
38 changes: 38 additions & 0 deletions dial-docker-compose/ollama/ollama_setup/utils.py
@@ -0,0 +1,38 @@
import logging
import sys
import time
from contextlib import asynccontextmanager


class Writer:
@classmethod
def write(cls, s: str):
# NOTE: every tqdm progress bar update is deliberately ended with "\n",
# otherwise one wouldn't see the bar running in console upon running `docker compose up`.
if s in ["\n", ""]:
return
print(s.strip(), file=sys.stderr, flush=True, end="\n")

@classmethod
def flush(cls):
sys.stderr.flush()


print_info = Writer.write


@asynccontextmanager
async def timer(name: str):
print_info(f"[{name}] Starting...")
start = time.perf_counter()
yield
elapsed = time.perf_counter() - start
print_info(f"[{name}] Finished in {elapsed:.2f} seconds")


class HealthFilter(logging.Filter):
def filter(self, record: logging.LogRecord) -> bool:
return record.getMessage().find("/health") == -1


logging.getLogger("uvicorn.access").addFilter(HealthFilter())
8 changes: 4 additions & 4 deletions docs/tutorials/quick-start-with-self-hosted-model.md
@@ -65,15 +65,15 @@ All the models support streaming.
docker compose up --abort-on-container-exit
```

> Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or dozens of minutes)_ to download them on the first run depending on your Internet bandwidth.
> Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or more)_ to download it on the first run, depending on your internet bandwidth and the size of the model you choose.
>
> The model is fully loaded once `ollama-setup` service prints `The Ollama server is up and running.`
> The models are fully loaded once `ollama-setup` service prints `The Ollama server is up and running.`

3. Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select an appropriate AI DIAL deployment to converse with:

* `Self-hosted chat model` deployment for the `OLLAMA_CHAT_MODEL`
* `Self-hosted vision model` deployment for the `OLLAMA_VISION_MODEL`

> Note that the vision models we tested do not support response streaming. Moreover, they are typically more computationally expensive than the chat models, so it may take minutes for a vision model to respond.

The embedding model will become available in AI DIAL under the deployment name `embedding-model` and can be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`.
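A hedged example of calling that embeddings endpoint with the default `dial_api_key` from `common.yml`; the request body follows the OpenAI-style embeddings API, and the `api-version` value is an assumption:

```python
import httpx

DIAL_URL = "http://localhost:8080"
API_KEY = "dial_api_key"  # default key from dial-docker-compose/common.yml

resp = httpx.post(
    f"{DIAL_URL}/openai/deployments/embedding-model/embeddings",
    headers={"api-key": API_KEY},
    params={"api-version": "2024-02-01"},  # assumed; use the version your deployment expects
    json={"input": ["Hello from the self-hosted embedding model"]},
    timeout=60,
)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
print(f"embedding dimension: {len(embedding)}")
```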