feat: increased chat timeout to allow working with vision models; decreased frequency of the progress bar updates #197

Merged
merged 8 commits on Sep 20, 2024
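Of the two changes in the title, the chat timeout is handled by the new `CHAT_KEEP_ALIVE_TIMEOUT` variable introduced below, while the progress-bar change comes down to creating the tqdm bar with `mininterval=1` so it redraws at most once per second instead of on every chunk. A minimal sketch of that throttling, assuming a hypothetical download loop and a writer modeled on the one added in `ollama_setup/utils.py`:

```python
import sys
import time
from tqdm import tqdm


# Hypothetical stand-in for ollama_setup/utils.py::Writer — every refresh
# becomes its own log line, so fewer refreshes mean a quieter `docker compose up`.
class LineWriter:
    @classmethod
    def write(cls, s: str):
        if s.strip():
            print(s.strip(), file=sys.stderr, flush=True)

    @classmethod
    def flush(cls):
        sys.stderr.flush()


total = 4 * 1024**3  # pretend we are pulling a ~4 GB model
bar = tqdm(
    total=total,
    unit="B",
    unit_scale=True,
    desc="[pulling]",
    mininterval=1,   # redraw at most once per second
    file=LineWriter,
)
for _ in range(100):
    time.sleep(0.05)          # simulated chunk arrival
    bar.update(total // 100)  # without mininterval=1, this could log ~100 lines
bar.close()
```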
3 changes: 2 additions & 1 deletion dial-docker-compose/ci/ollama/test/app.py
@@ -58,7 +58,8 @@ async def dial_chat_completion(deployment_id: str, messages: list) -> str:
payload = {
"model": deployment_id,
"messages": messages,
"stream": False,
"temperature": 0.0,
"stream": False
}
headers = {"api-key": DIAL_API_KEY}
params = {"api-version": DIAL_API_VERSION}
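For context, the hunk above sits inside the CI test's request helper; a sketch of how such a call might look end to end — the payload keys, `api-key` header, and `api-version` parameter come from the surrounding file, while the `DIAL_URL` variable, the endpoint path, the default `api-version`, and the timeout are assumptions:

```python
import os

import httpx

DIAL_URL = os.environ["DIAL_URL"]  # assumed env var for the DIAL core host
DIAL_API_KEY = os.environ.get("DIAL_API_KEY", "dial_api_key")
DIAL_API_VERSION = os.environ.get("DIAL_API_VERSION", "2024-02-01")  # hypothetical default


async def dial_chat_completion(deployment_id: str, messages: list) -> str:
    payload = {
        "model": deployment_id,
        "messages": messages,
        "temperature": 0.0,  # deterministic output keeps the CI assertion stable
        "stream": False,
    }
    headers = {"api-key": DIAL_API_KEY}
    params = {"api-version": DIAL_API_VERSION}
    async with httpx.AsyncClient(timeout=600) as client:  # generous timeout for vision models
        resp = await client.post(
            f"{DIAL_URL}/openai/deployments/{deployment_id}/chat/completions",
            json=payload,
            headers=headers,
            params=params,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
```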
1 change: 1 addition & 0 deletions dial-docker-compose/common.yml
@@ -17,6 +17,7 @@ services:
DIAL_API_HOST: "http://core:8080"
DIAL_API_KEY: "dial_api_key"
ENABLED_FEATURES: "conversations-section,prompts-section,top-settings,top-clear-conversation,top-chat-info,top-chat-model-settings,empty-chat-settings,header,footer,request-api-key,report-an-issue,likes,input-files,attachments-manager"
KEEP_ALIVE_TIMEOUT: ${CHAT_KEEP_ALIVE_TIMEOUT}

redis:
image: redis:7.2.4-alpine3.19
2 changes: 2 additions & 0 deletions dial-docker-compose/ollama/.env
@@ -1,4 +1,6 @@
DIAL_DIR="./ollama"
CHAT_KEEP_ALIVE_TIMEOUT=600000

OLLAMA_CHAT_MODEL=
OLLAMA_VISION_MODEL=
OLLAMA_EMBEDDING_MODEL=
5 changes: 0 additions & 5 deletions dial-docker-compose/ollama/docker-compose.yml
@@ -20,11 +20,6 @@ services:
- OLLAMA_CHAT_MODEL=${OLLAMA_CHAT_MODEL}
- OLLAMA_VISION_MODEL=${OLLAMA_VISION_MODEL}
- OLLAMA_EMBEDDING_MODEL=${OLLAMA_EMBEDDING_MODEL}
healthcheck:
test: ["CMD", "test", "-f", "/healthy"]
interval: 10s
start_period: 10s
retries: 10

adapter-openai:
image: epam/ai-dial-adapter-openai:0.14.0
2 changes: 2 additions & 0 deletions dial-docker-compose/ollama/ollama_setup/.dockerignore
@@ -0,0 +1,2 @@
.dockerignore
.venv
9 changes: 8 additions & 1 deletion dial-docker-compose/ollama/ollama_setup/Dockerfile
@@ -1,7 +1,14 @@
FROM python:3.11-alpine

RUN apk --no-cache add curl

WORKDIR /app
COPY * /app
RUN pip install -r requirements.txt

CMD ["sh", "-c", "python setup.py && tail -f /dev/null"]
EXPOSE 5000

HEALTHCHECK --interval=10s --timeout=1s --start-period=10s --retries=10 \
CMD curl --fail http://localhost:5000/health || exit 1

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "5000"]
@@ -1,12 +1,13 @@
import asyncio
from contextlib import asynccontextmanager
import os
import sys
import time
import asyncio
from fastapi import FastAPI
from ollama import AsyncClient
from tqdm import tqdm

from utils import Writer, print_info, timer

OLLAMA_URL = os.getenv("OLLAMA_URL")
if OLLAMA_URL is None:
raise RuntimeError("OLLAMA_URL env var isn't set")
@@ -15,46 +16,15 @@
OLLAMA_VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL")
OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL")

HEALTH_FILE = "/healthy"


class Writer:
@classmethod
def write(cls, s: str):
# NOTE: every tqdm progress bar update is deliberately ended with "\n",
# otherwise one wouldn't see the bar running in console upon running `docker compose up`.
print(s, file=sys.stdout, flush=True, end="\n")

@classmethod
def flush(cls):
sys.stdout.flush()


print_info = Writer.write

print_info(f"OLLAMA_URL = {OLLAMA_URL}")
print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")
print_info(f"OLLAMA_EMBEDDING_MODEL = {OLLAMA_EMBEDDING_MODEL}")


@asynccontextmanager
async def timer(name: str):
print_info(f"[{name}] Starting...")
start = time.perf_counter()
yield
elapsed = time.perf_counter() - start
print_info(f"[{name}] Finished in {elapsed:.2f} seconds")


async def wait_for_startup():
attempt = 0
attempts = 0
while True:
attempt += 1
attempts += 1
try:
await AsyncClient(host=OLLAMA_URL, timeout=5).ps()
except Exception:
print_info(f"[{attempt:>3}] Waiting for Ollama to start...")
print_info(f"[{attempts:>3}] Waiting for Ollama to start...")
await asyncio.sleep(5)
else:
break
@@ -73,30 +43,34 @@ async def pull_model(client: AsyncClient, model: str):

if status != prev_status and total:
prev_status = status
if progress_bar:
progress_bar.close()
progress_bar = tqdm(
total=total, unit="B", unit_scale=True, desc=f"[{status}]", file=Writer
total=total,
unit="B",
unit_scale=True,
desc=f"[{status}]",
mininterval=1,
file=Writer,
)

if completed and progress_bar and total:
if completed and total and progress_bar:
progress_bar.n = completed
progress_bar.set_description(f"[{status}]")
progress_bar.refresh()
progress_bar.update(n=0)

if total and total == completed and progress_bar:
progress_bar.close()
progress_bar = None

if not completed and not total:
print_info(f"[{status}]")


async def create_health_mark():
open(HEALTH_FILE, "w").close()

async def startup():
print_info(f"OLLAMA_URL = {OLLAMA_URL}")
print_info(f"OLLAMA_CHAT_MODEL = {OLLAMA_CHAT_MODEL}")
print_info(f"OLLAMA_VISION_MODEL = {OLLAMA_VISION_MODEL}")
print_info(f"OLLAMA_EMBEDDING_MODEL = {OLLAMA_EMBEDDING_MODEL}")

async def main():
client = AsyncClient(host=OLLAMA_URL, timeout=300000)
client = AsyncClient(host=OLLAMA_URL, timeout=300)

async with timer("Waiting for Ollama to start"):
await wait_for_startup()
@@ -117,11 +91,18 @@ async def main():
async with timer(f"Loading model {model_to_load} into memory"):
await client.generate(model_to_load)

await create_health_mark()

print_info("The Ollama server is up and running.")


if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
@asynccontextmanager
async def lifespan(app):
await startup()
yield


app = FastAPI(lifespan=lifespan)


@app.get("/health")
def health_check():
return {"status": "ok"}
4 changes: 3 additions & 1 deletion dial-docker-compose/ollama/ollama_setup/requirements.txt
@@ -1,3 +1,5 @@
httpx==0.27.2
tqdm==4.66.5
ollama==0.3.3
ollama==0.3.3
fastapi==0.115.0
uvicorn==0.30.6
38 changes: 38 additions & 0 deletions dial-docker-compose/ollama/ollama_setup/utils.py
@@ -0,0 +1,38 @@
import logging
import sys
import time
from contextlib import asynccontextmanager


class Writer:
@classmethod
def write(cls, s: str):
# NOTE: every tqdm progress bar update is deliberately ended with "\n",
# otherwise one wouldn't see the bar running in console upon running `docker compose up`.
if s in ["\n", ""]:
return
print(s.strip(), file=sys.stderr, flush=True, end="\n")

@classmethod
def flush(cls):
sys.stderr.flush()


print_info = Writer.write


@asynccontextmanager
async def timer(name: str):
print_info(f"[{name}] Starting...")
start = time.perf_counter()
yield
elapsed = time.perf_counter() - start
print_info(f"[{name}] Finished in {elapsed:.2f} seconds")


class HealthFilter(logging.Filter):
def filter(self, record: logging.LogRecord) -> bool:
return record.getMessage().find("/health") == -1


logging.getLogger("uvicorn.access").addFilter(HealthFilter())
8 changes: 4 additions & 4 deletions docs/tutorials/quick-start-with-self-hosted-model.md
@@ -65,15 +65,15 @@ All the models support streaming.
docker compose up --abort-on-container-exit
```

> Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or dozens of minutes)_ to download them on the first run depending on your Internet bandwidth.
> Keep in mind that a typical size of a lightweight Ollama model is around a few gigabytes. So it may take a few minutes _(or more)_ to download it on the first run, depending on your internet bandwidth and the size of the model you choose.
>
> The model is fully loaded once `ollama-setup` service prints `The Ollama server is up and running.`
> The models are fully loaded once `ollama-setup` service prints `The Ollama server is up and running.`

3. Finally, open http://localhost:3000/ in your browser to launch the AI DIAL Chat application and select an appropriate AI DIAL deployment to converse with:

* `Self-hosted chat model` deployment for the `OLLAMA_CHAT_MODEL`
* `Self-hosted vision model` deployment for the `OLLAMA_VISION_MODEL`

> Note that the vision models we tested do not support response streaming. Moreover, they are typically more computationally expensive than the chat models, so it may take minutes for a vision model to respond.

The embedding model will become available in AI DIAL under the deployment name `embedding-model` and can be called via the endpoint: `localhost:8080/openai/deployments/embedding-model/embeddings`.
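A hedged example of calling that embeddings endpoint with the default `dial_api_key` from `common.yml`; the request body follows the OpenAI-style embeddings API, and the `api-version` value is an assumption:

```python
import httpx

DIAL_URL = "http://localhost:8080"
API_KEY = "dial_api_key"  # default key from dial-docker-compose/common.yml

resp = httpx.post(
    f"{DIAL_URL}/openai/deployments/embedding-model/embeddings",
    headers={"api-key": API_KEY},
    params={"api-version": "2024-02-01"},  # assumed; use the version your deployment expects
    json={"input": ["Hello from the self-hosted embedding model"]},
    timeout=60,
)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
print(f"embedding dimension: {len(embedding)}")
```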