From c5647da04a1ae22a634c093b743b9ad7d723f4cc Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Mon, 14 Oct 2024 23:41:42 -0700 Subject: [PATCH 1/3] add jinja docker templates --- .github/workflows/release.yaml | 29 ++++- .../workflows/release_docker_container.yaml | 13 +- infra/sap/sap-core-ai | 2 +- libs/infinity_emb/Dockerfile.amd_auto | 115 ++++++++++++++++++ libs/infinity_emb/Dockerfile.cpu_auto | 115 ++++++++++++++++++ libs/infinity_emb/Dockerfile.jinja2 | 115 ++++++++++++++++++ .../{Dockerfile => Dockerfile.nvidia_auto} | 16 +-- libs/infinity_emb/Makefile | 7 +- libs/infinity_emb/docker.template.yaml | 19 +++ .../infinity_emb/transformer/audio/torch.py | 2 +- libs/infinity_emb/poetry.lock | 67 ++++++---- libs/infinity_emb/pyproject.toml | 23 +++- 12 files changed, 485 insertions(+), 38 deletions(-) create mode 100644 libs/infinity_emb/Dockerfile.amd_auto create mode 100644 libs/infinity_emb/Dockerfile.cpu_auto create mode 100644 libs/infinity_emb/Dockerfile.jinja2 rename libs/infinity_emb/{Dockerfile => Dockerfile.nvidia_auto} (88%) create mode 100644 libs/infinity_emb/docker.template.yaml diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index dbdaddb6..1b84ffff 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -10,17 +10,40 @@ env: WORKDIR: "libs/infinity_emb" jobs: - docker-container-push: + docker-container-push-nvidia: uses: ./.github/workflows/release_docker_container.yaml with: # working-directory: libs/infinity_emb - dockerfile: libs/infinity_emb/Dockerfile + dockerfile: libs/infinity_emb/Dockerfile.nvidia_auto image: michaelf34/infinity + appendix_tag: "" + platforms: "linux/amd64,linux/arm64" secrets: inherit + + docker-container-push-cpu: + uses: + ./.github/workflows/release_docker_container.yaml + with: + # working-directory: libs/infinity_emb + dockerfile: libs/infinity_emb/Dockerfile.cpu_auto + image: michaelf34/infinity + appendix_tag: "-cpu" + platforms: "linux/amd64,linux/arm64" + 
secrets: inherit + + # docker-container-push-amd: + # uses: + # ./.github/workflows/release_docker_container.yaml + # with: + # # working-directory: libs/infinity_emb + # dockerfile: libs/infinity_emb/Dockerfile.amd_auto + # image: michaelf34/infinity + # appendix_tag: "-amd" + # platforms: "linux/amd64" + # secrets: inherit modal-deploy: - needs: docker-container-push uses: ./.github/workflows/release_modal_com.yaml secrets: inherit \ No newline at end of file diff --git a/.github/workflows/release_docker_container.yaml b/.github/workflows/release_docker_container.yaml index c7933557..d71e8039 100644 --- a/.github/workflows/release_docker_container.yaml +++ b/.github/workflows/release_docker_container.yaml @@ -11,11 +11,20 @@ on: required: true type: string description: "Name of the image to build" + appendix_tag: + required: false + type: string + description: "Appendix tag to add to the image, e.g. `-cpu`" context: required: false type: string description: "Path to the build context" default: "./libs/infinity_emb" + platforms: + required: false + type: string + description: "Platforms to build for" + default: "linux/amd64,linux/arm64" env: TEST_TAG: ${{ inputs.image }}:test @@ -78,6 +87,6 @@ jobs: # QEMU and base python image, for now build only for # linux/amd64 and linux/arm64 # cache-from: type=registry,ref=${{ env.LATEST_TAG }} - platforms: linux/amd64,linux/arm64 - tags: ${{ env.LATEST_TAG }},${{ env.VERSION_TAG }} + platforms: ${{ inputs.platforms }} + tags: ${{ env.LATEST_TAG }}${{ inputs.appendix_tag }},${{ env.VERSION_TAG }}${{ inputs.appendix_tag }} push: true diff --git a/infra/sap/sap-core-ai b/infra/sap/sap-core-ai index 8eb2ce3e..881d03d5 160000 --- a/infra/sap/sap-core-ai +++ b/infra/sap/sap-core-ai @@ -1 +1 @@ -Subproject commit 8eb2ce3e7994b4419ca98d8fc2c19690c2808eb0 +Subproject commit 881d03d5288efe293da2e31d8650b86bc6a5fa48 diff --git a/libs/infinity_emb/Dockerfile.amd_auto b/libs/infinity_emb/Dockerfile.amd_auto new file mode 100644 index
00000000..e648bf8a --- /dev/null +++ b/libs/infinity_emb/Dockerfile.amd_auto @@ -0,0 +1,115 @@ +# Autogenerated warning: +# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly. +# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd + +FROM rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 AS base + +ENV PYTHONUNBUFFERED=1 \ + \ + # pip + PIP_NO_CACHE_DIR=off \ + PIP_DISABLE_PIP_VERSION_CHECK=on \ + PIP_DEFAULT_TIMEOUT=100 \ + \ + # make poetry create the virtual environment in the project's root + # it gets named `.venv` + POETRY_VIRTUALENVS_IN_PROJECT=true \ + # do not ask any interactive question + POETRY_NO_INTERACTION=1 \ + EXTRAS="all" \ + PYTHON="python3.11" +RUN apt-get update && apt-get install build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl -y +WORKDIR /app + +FROM base as builder +# Set the working directory for the app +# Define the version of Poetry to install (default is 1.7.1) +# Define the directory to install Poetry to (default is /opt/poetry) +ARG POETRY_VERSION=1.7.1 +ARG POETRY_HOME=/opt/poetry +# Create a Python virtual environment for Poetry and install it +RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=$POETRY_HOME POETRY_VERSION=$POETRY_VERSION $PYTHON - +ENV PATH=$POETRY_HOME/bin:$PATH +# Test if Poetry is installed in the expected path +RUN echo "Poetry version:" && poetry --version +# Copy the rest of the app source code (this layer will be invalidated and rebuilt whenever the source code changes) +COPY poetry.lock poetry.toml pyproject.toml README.md /app/ +# Install dependencies only +RUN sed -i 's|"pypi"|"pytorch_rocm"|' pyproject.toml && sed -i 's|torch = "2.4.1"|#|' pyproject.toml && rm poetry.lock +RUN poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all +COPY infinity_emb infinity_emb +# Install dependency 
with infinity_emb package +RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all + +FROM builder as testing +# install lint and test dependencies +RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --with lint,test && poetry cache clear pypi --all +# lint +RUN poetry run ruff . +RUN poetry run black --check . +RUN poetry run mypy . +# pytest +COPY tests tests +# run end to end tests because of duration of build in github ci. +# Run tests/end_to_end on TARGETPLATFORM x86_64 otherwise run tests/end_to_end_gpu +# poetry run python -m pytest tests/end_to_end -x # TODO: does not work. +RUN if [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \ +poetry run python -m pytest tests/end_to_end -x ; \ +else \ +poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py -x ; \ +fi +RUN echo "all tests passed" > "test_results.txt" + + +# Use a multi-stage build -> production version, with download +FROM base AS tested-builder +COPY --from=builder /app /app +# force testing stage to run +COPY --from=testing /app/test_results.txt /app/test_results.txt +ENV HF_HOME=/app/.cache/huggingface +ENV PATH=/app/.venv/bin:$PATH +# do nothing +RUN echo "copied all files" + + +# Export with tensorrt, not recommended. +# docker buildx build --target=production-tensorrt -f Dockerfile . 
+FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt +ENV PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=off \ + PYTHON="python3.11" +RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y +COPY --from=builder /app /app +# force testing stage to run +COPY --from=testing /app/test_results.txt /app/test_results.txt +ENV HF_HOME=/app/.cache/torch +ENV PATH=/app/.venv/bin:$PATH +RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*" +ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH} +ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH} +ENTRYPOINT ["infinity_emb"] + + +# Use a multi-stage build -> production version, with download +# docker buildx build --target=production-with-download \ +# --build-arg MODEL_NAME=BAAI/bge-small-en-v1.5 --build-arg ENGINE=torch -f Dockerfile -t infinity-BAAI-small . +FROM tested-builder AS production-with-download +# collect model name and engine from build args +ARG MODEL_NAME +RUN if [ -z "${MODEL_NAME}" ]; then echo "Error: Build argument MODEL_NAME not set." && exit 1; fi +ARG ENGINE +RUN if [ -z "${ENGINE}" ]; then echo "Error: Build argument ENGINE not set." && exit 1; fi +ARG EXTRA_PACKAGES +RUN if [ -n "${EXTRA_PACKAGES}" ]; then python -m pip install --no-cache-dir ${EXTRA_PACKAGES} ; fi +# will exit with 3 if model is downloaded # TODO: better exit code +RUN infinity_emb v2 --model-id $MODEL_NAME --engine $ENGINE --preload-only || [ $? 
-eq 3 ] +ENTRYPOINT ["infinity_emb"] + +# flash attention fa2 +FROM tested-builder AS production-with-fa2 +RUN python -m pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.3cxx11abiFalse-cp310-cp310-linux_x86_64.whl +ENTRYPOINT ["infinity_emb"] + +# Use a multi-stage build -> production version +FROM tested-builder AS production +ENTRYPOINT ["infinity_emb"] diff --git a/libs/infinity_emb/Dockerfile.cpu_auto b/libs/infinity_emb/Dockerfile.cpu_auto new file mode 100644 index 00000000..8db05cc3 --- /dev/null +++ b/libs/infinity_emb/Dockerfile.cpu_auto @@ -0,0 +1,115 @@ +# Autogenerated warning: +# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly. +# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd + +FROM ubuntu:22.04 AS base + +ENV PYTHONUNBUFFERED=1 \ + \ + # pip + PIP_NO_CACHE_DIR=off \ + PIP_DISABLE_PIP_VERSION_CHECK=on \ + PIP_DEFAULT_TIMEOUT=100 \ + \ + # make poetry create the virtual environment in the project's root + # it gets named `.venv` + POETRY_VIRTUALENVS_IN_PROJECT=true \ + # do not ask any interactive question + POETRY_NO_INTERACTION=1 \ + EXTRAS="all" \ + PYTHON="python3.11" +RUN apt-get update && apt-get install build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl -y +WORKDIR /app + +FROM base as builder +# Set the working directory for the app +# Define the version of Poetry to install (default is 1.7.1) +# Define the directory to install Poetry to (default is /opt/poetry) +ARG POETRY_VERSION=1.7.1 +ARG POETRY_HOME=/opt/poetry +# Create a Python virtual environment for Poetry and install it +RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=$POETRY_HOME POETRY_VERSION=$POETRY_VERSION $PYTHON - +ENV PATH=$POETRY_HOME/bin:$PATH +# Test if Poetry is installed in the expected path +RUN echo "Poetry version:" && poetry --version +# Copy the rest of the 
app source code (this layer will be invalidated and rebuilt whenever the source code changes) +COPY poetry.lock poetry.toml pyproject.toml README.md /app/ +# Install dependencies only +RUN sed -i 's|"pypi"|"pytorch_cpu"|' pyproject.toml && rm poetry.lock +RUN poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all +COPY infinity_emb infinity_emb +# Install dependency with infinity_emb package +RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all + +FROM builder as testing +# install lint and test dependencies +RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --with lint,test && poetry cache clear pypi --all +# lint +RUN poetry run ruff . +RUN poetry run black --check . +RUN poetry run mypy . +# pytest +COPY tests tests +# run end to end tests because of duration of build in github ci. +# Run tests/end_to_end on TARGETPLATFORM x86_64 otherwise run tests/end_to_end_gpu +# poetry run python -m pytest tests/end_to_end -x # TODO: does not work. +RUN if [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \ +poetry run python -m pytest tests/end_to_end -x ; \ +else \ +poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py -x ; \ +fi +RUN echo "all tests passed" > "test_results.txt" + + +# Use a multi-stage build -> production version, with download +FROM base AS tested-builder +COPY --from=builder /app /app +# force testing stage to run +COPY --from=testing /app/test_results.txt /app/test_results.txt +ENV HF_HOME=/app/.cache/huggingface +ENV PATH=/app/.venv/bin:$PATH +# do nothing +RUN echo "copied all files" + + +# Export with tensorrt, not recommended. +# docker buildx build --target=production-tensorrt -f Dockerfile . 
+FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt +ENV PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=off \ + PYTHON="python3.11" +RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y +COPY --from=builder /app /app +# force testing stage to run +COPY --from=testing /app/test_results.txt /app/test_results.txt +ENV HF_HOME=/app/.cache/torch +ENV PATH=/app/.venv/bin:$PATH +RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*" +ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH} +ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH} +ENTRYPOINT ["infinity_emb"] + + +# Use a multi-stage build -> production version, with download +# docker buildx build --target=production-with-download \ +# --build-arg MODEL_NAME=BAAI/bge-small-en-v1.5 --build-arg ENGINE=torch -f Dockerfile -t infinity-BAAI-small . +FROM tested-builder AS production-with-download +# collect model name and engine from build args +ARG MODEL_NAME +RUN if [ -z "${MODEL_NAME}" ]; then echo "Error: Build argument MODEL_NAME not set." && exit 1; fi +ARG ENGINE +RUN if [ -z "${ENGINE}" ]; then echo "Error: Build argument ENGINE not set." && exit 1; fi +ARG EXTRA_PACKAGES +RUN if [ -n "${EXTRA_PACKAGES}" ]; then python -m pip install --no-cache-dir ${EXTRA_PACKAGES} ; fi +# will exit with 3 if model is downloaded # TODO: better exit code +RUN infinity_emb v2 --model-id $MODEL_NAME --engine $ENGINE --preload-only || [ $? 
-eq 3 ] +ENTRYPOINT ["infinity_emb"] + +# flash attention fa2 +FROM tested-builder AS production-with-fa2 +RUN python -m pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.3cxx11abiFalse-cp310-cp310-linux_x86_64.whl +ENTRYPOINT ["infinity_emb"] + +# Use a multi-stage build -> production version +FROM tested-builder AS production +ENTRYPOINT ["infinity_emb"] diff --git a/libs/infinity_emb/Dockerfile.jinja2 b/libs/infinity_emb/Dockerfile.jinja2 new file mode 100644 index 00000000..9c2079e6 --- /dev/null +++ b/libs/infinity_emb/Dockerfile.jinja2 @@ -0,0 +1,115 @@ +# Autogenerated warning: +# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly. +# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd + +FROM {{ base_image }} AS base + +ENV PYTHONUNBUFFERED=1 \ + \ + # pip + PIP_NO_CACHE_DIR=off \ + PIP_DISABLE_PIP_VERSION_CHECK=on \ + PIP_DEFAULT_TIMEOUT=100 \ + \ + # make poetry create the virtual environment in the project's root + # it gets named `.venv` + POETRY_VIRTUALENVS_IN_PROJECT=true \ + # do not ask any interactive question + POETRY_NO_INTERACTION=1 \ + EXTRAS="all" \ + PYTHON="python3.11" +RUN apt-get update && apt-get install build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl -y +WORKDIR /app + +FROM base as builder +# Set the working directory for the app +# Define the version of Poetry to install (default is 1.7.1) +# Define the directory to install Poetry to (default is /opt/poetry) +ARG POETRY_VERSION=1.7.1 +ARG POETRY_HOME=/opt/poetry +# Create a Python virtual environment for Poetry and install it +RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=$POETRY_HOME POETRY_VERSION=$POETRY_VERSION $PYTHON - +ENV PATH=$POETRY_HOME/bin:$PATH +# Test if Poetry is installed in the expected path +RUN echo "Poetry version:" && poetry --version +# Copy the rest of the app 
source code (this layer will be invalidated and rebuilt whenever the source code changes) +COPY poetry.lock poetry.toml pyproject.toml README.md /app/ +# Install dependencies only +RUN {{pyproject_sed}} +RUN {{main_install}} && poetry cache clear pypi --all +COPY infinity_emb infinity_emb +# Install dependency with infinity_emb package +RUN {{main_install|replace("--no-root","")}} && poetry cache clear pypi --all + +FROM builder as testing +# install lint and test dependencies +RUN {{main_install|replace("--without", "--with")|replace("--no-root","")}} && poetry cache clear pypi --all +# lint +RUN poetry run ruff . +RUN poetry run black --check . +RUN poetry run mypy . +# pytest +COPY tests tests +# run end to end tests because of duration of build in github ci. +# Run tests/end_to_end on TARGETPLATFORM x86_64 otherwise run tests/end_to_end_gpu +# poetry run python -m pytest tests/end_to_end -x # TODO: does not work. +RUN if [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \ +poetry run python -m pytest tests/end_to_end -x ; \ +else \ +poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py -x ; \ +fi +RUN echo "all tests passed" > "test_results.txt" + + +# Use a multi-stage build -> production version, with download +FROM base AS tested-builder +COPY --from=builder /app /app +# force testing stage to run +COPY --from=testing /app/test_results.txt /app/test_results.txt +ENV HF_HOME=/app/.cache/huggingface +ENV PATH=/app/.venv/bin:$PATH +# do nothing +RUN echo "copied all files" + + +# Export with tensorrt, not recommended. +# docker buildx build --target=production-tensorrt -f Dockerfile . 
+FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt +ENV PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=off \ + PYTHON="python3.11" +RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y +COPY --from=builder /app /app +# force testing stage to run +COPY --from=testing /app/test_results.txt /app/test_results.txt +ENV HF_HOME=/app/.cache/torch +ENV PATH=/app/.venv/bin:$PATH +RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*" +ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH} +ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH} +ENTRYPOINT ["infinity_emb"] + + +# Use a multi-stage build -> production version, with download +# docker buildx build --target=production-with-download \ +# --build-arg MODEL_NAME=BAAI/bge-small-en-v1.5 --build-arg ENGINE=torch -f Dockerfile -t infinity-BAAI-small . +FROM tested-builder AS production-with-download +# collect model name and engine from build args +ARG MODEL_NAME +RUN if [ -z "${MODEL_NAME}" ]; then echo "Error: Build argument MODEL_NAME not set." && exit 1; fi +ARG ENGINE +RUN if [ -z "${ENGINE}" ]; then echo "Error: Build argument ENGINE not set." && exit 1; fi +ARG EXTRA_PACKAGES +RUN if [ -n "${EXTRA_PACKAGES}" ]; then python -m pip install --no-cache-dir ${EXTRA_PACKAGES} ; fi +# will exit with 3 if model is downloaded # TODO: better exit code +RUN infinity_emb v2 --model-id $MODEL_NAME --engine $ENGINE --preload-only || [ $? 
-eq 3 ] +ENTRYPOINT ["infinity_emb"] + +# flash attention fa2 +FROM tested-builder AS production-with-fa2 +RUN python -m pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.3cxx11abiFalse-cp310-cp310-linux_x86_64.whl +ENTRYPOINT ["infinity_emb"] + +# Use a multi-stage build -> production version +FROM tested-builder AS production +ENTRYPOINT ["infinity_emb"] diff --git a/libs/infinity_emb/Dockerfile b/libs/infinity_emb/Dockerfile.nvidia_auto similarity index 88% rename from libs/infinity_emb/Dockerfile rename to libs/infinity_emb/Dockerfile.nvidia_auto index 80193b83..6ec70999 100644 --- a/libs/infinity_emb/Dockerfile +++ b/libs/infinity_emb/Dockerfile.nvidia_auto @@ -1,4 +1,7 @@ -# Use the Python base image +# Autogenerated warning: +# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly. +# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd + FROM nvidia/cuda:12.1.1-base-ubuntu22.04 AS base ENV PYTHONUNBUFFERED=1 \ @@ -32,16 +35,15 @@ RUN echo "Poetry version:" && poetry --version # Copy the rest of the app source code (this layer will be invalidated and rebuilt whenever the source code changes) COPY poetry.lock poetry.toml pyproject.toml README.md /app/ # Install dependencies only -RUN poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test +RUN true +RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --no-root --without lint,test && poetry cache clear pypi --all COPY infinity_emb infinity_emb # Install dependency with infinity_emb package -RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --without lint,test -# remove cache -RUN poetry cache clear pypi --all +RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all FROM builder as testing # install lint and test 
dependencies -RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" +RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --with lint,test && poetry cache clear pypi --all # lint RUN poetry run ruff . RUN poetry run black --check . @@ -50,7 +52,7 @@ RUN poetry run mypy . COPY tests tests # run end to end tests because of duration of build in github ci. # Run tests/end_to_end on TARGETPLATFORM x86_64 otherwise run tests/end_to_end_gpu -# poetry run python -m pytest tests/end_to_end -x +# poetry run python -m pytest tests/end_to_end -x # TODO: does not work. RUN if [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \ poetry run python -m pytest tests/end_to_end -x ; \ else \ diff --git a/libs/infinity_emb/Makefile b/libs/infinity_emb/Makefile index b9e6faa4..4c98942e 100644 --- a/libs/infinity_emb/Makefile +++ b/libs/infinity_emb/Makefile @@ -1,4 +1,4 @@ -.PHONY: all clean docs_build docs_clean docs_linkcheck api_docs_build api_docs_clean api_docs_linkcheck format lint test tests test_watch integration_tests docker_tests help extended_tests +.PHONY: all clean docs_build docs_clean docs_linkcheck api_docs_build api_docs_clean api_docs_linkcheck format lint test tests test_watch template_docker integration_tests docker_tests help extended_tests # Default target executed when no arguments are given to make. 
all: help @@ -42,6 +42,11 @@ format format_diff: [ "$(PYTHON_FILES)" = "" ] || poetry run black $(PYTHON_FILES) [ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I --fix $(PYTHON_FILES) +template_docker: + jinja2 Dockerfile.jinja2 docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto + jinja2 Dockerfile.jinja2 docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto + jinja2 Dockerfile.jinja2 docker.template.yaml --format=yaml -s nvidia > Dockerfile.nvidia_auto + poetry_check: poetry check diff --git a/libs/infinity_emb/docker.template.yaml b/libs/infinity_emb/docker.template.yaml new file mode 100644 index 00000000..acfb2131 --- /dev/null +++ b/libs/infinity_emb/docker.template.yaml @@ -0,0 +1,19 @@ +# 1. Guide: pip install jinja2 jinja2-cli +nvidia: + # 2 .command: jinja2 Dockerfile.jinja2 docker.template.yaml --format=yaml -s nvidia > Dockerfile.nvidia_auto + base_image: 'nvidia/cuda:12.1.1-base-ubuntu22.04' + main_install: poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test + pyproject_sed: "true" + +cpu: + # 2. command: jinja2 Dockerfile.jinja2 docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto + base_image: 'ubuntu:22.04' + main_install: poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test + pyproject_sed: sed -i 's|"pypi"|"pytorch_cpu"|' pyproject.toml && rm poetry.lock + +amd: + # 2 . 
command: jinja2 Dockerfile.jinja2 docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto + base_image: 'rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0' + main_install: poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test + pyproject_sed: sed -i 's|"pypi"|"pytorch_rocm"|' pyproject.toml && sed -i 's|torch = "2.4.1"|#|' pyproject.toml && rm poetry.lock + diff --git a/libs/infinity_emb/infinity_emb/transformer/audio/torch.py b/libs/infinity_emb/infinity_emb/transformer/audio/torch.py index 41ce443d..311cc8fd 100644 --- a/libs/infinity_emb/infinity_emb/transformer/audio/torch.py +++ b/libs/infinity_emb/infinity_emb/transformer/audio/torch.py @@ -16,7 +16,7 @@ if CHECK_TORCH.is_available: import torch -if CHECK_TRANSFORMERS.is_available: +if CHECK_TORCH.is_available and CHECK_TRANSFORMERS.is_available: from transformers import AutoModel, AutoProcessor # type: ignore diff --git a/libs/infinity_emb/poetry.lock b/libs/infinity_emb/poetry.lock index d5956cee..35b56e19 100644 --- a/libs/infinity_emb/poetry.lock +++ b/libs/infinity_emb/poetry.lock @@ -1339,6 +1339,26 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] +[[package]] +name = "jinja2-cli" +version = "0.8.2" +description = "A CLI interface to Jinja2" +optional = false +python-versions = "*" +files = [ + {file = "jinja2-cli-0.8.2.tar.gz", hash = "sha256:a16bb1454111128e206f568c95938cdef5b5a139929378f72bb8cf6179e18e50"}, + {file = "jinja2_cli-0.8.2-py2.py3-none-any.whl", hash = "sha256:b91715c79496beaddad790171e7258a87db21c1a0b6d2b15bca3ba44b74aac5d"}, +] + +[package.dependencies] +jinja2 = "*" + +[package.extras] +tests = ["flake8", "jinja2", "pytest"] +toml = ["jinja2", "toml"] +xml = ["jinja2", "xmltodict"] +yaml = ["jinja2", "pyyaml"] + [[package]] name = "jiter" version = "0.6.1" @@ -2270,13 +2290,13 @@ datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] [[package]] name = "optimum" -version = "1.17.1" 
+version = "1.23.1" description = "Optimum Library is an extension of the Hugging Face Transformers library, providing a framework to integrate third-party libraries from Hardware Partners and interface with their specific functionality." optional = true python-versions = ">=3.7.0" files = [ - {file = "optimum-1.17.1-py3-none-any.whl", hash = "sha256:508bc55db3c9434f4e8d5a30c39a46ac63c4cdb45bcc5a641b6c1c77cae88d23"}, - {file = "optimum-1.17.1.tar.gz", hash = "sha256:e59af717e8691b11903fe2cfb8c6efd6f6798b0417f3e70d231e578a02448ceb"}, + {file = "optimum-1.23.1-py3-none-any.whl", hash = "sha256:9a910601b665ac617ef14df99a44fe06e51040bcf945093f7b111d0e692fa5ac"}, + {file = "optimum-1.23.1.tar.gz", hash = "sha256:bdef34c20d702a0856b0f35720287f561e55854e0fc4655512a99365ac480dde"}, ] [package.dependencies] @@ -2294,30 +2314,35 @@ packaging = "*" protobuf = {version = ">=3.20.1", optional = true, markers = "extra == \"onnxruntime\""} sympy = "*" torch = ">=1.11" -transformers = {version = ">=4.26.0", extras = ["sentencepiece"]} +transformers = [ + {version = ">=4.29", extras = ["sentencepiece"]}, + {version = "<4.46.0", optional = true, markers = "extra == \"onnxruntime\""}, +] [package.extras] amd = ["optimum-amd"] benchmark = ["evaluate (>=0.2.0)", "optuna", "scikit-learn", "seqeval", "torchvision", "tqdm"] -dev = ["Pillow", "accelerate", "black (>=23.1,<24.0)", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest", "pytest-xdist", "requests", "rjieba", "ruff (==0.1.5)", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"] +dev = ["Pillow", "accelerate", "black (>=23.1,<24.0)", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "ruff (==0.1.5)", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"] diffusers = ["diffusers"] doc-build = ["accelerate"] -exporters = ["onnx", "onnxruntime", "timm"] -exporters-gpu = ["onnx", 
"onnxruntime-gpu", "timm"] -exporters-tf = ["h5py", "numpy (<1.24.0)", "onnx", "onnxruntime", "tensorflow (>=2.4,<=2.12.1)", "tf2onnx", "timm"] +exporters = ["onnx", "onnxruntime", "timm", "transformers (<4.46.0)"] +exporters-gpu = ["onnx", "onnxruntime-gpu", "timm", "transformers (<4.46.0)"] +exporters-tf = ["datasets (<=2.16)", "h5py", "numpy (<1.24.0)", "onnx", "onnxruntime", "tensorflow (>=2.4,<=2.12.1)", "tf2onnx", "timm", "transformers[sentencepiece] (>=4.26,<4.38)"] furiosa = ["optimum-furiosa"] graphcore = ["optimum-graphcore"] -habana = ["optimum-habana", "transformers (>=4.37.0,<4.38.0)"] -intel = ["optimum-intel (>=1.15.0)"] -neural-compressor = ["optimum-intel[neural-compressor] (>=1.15.0)"] -neuron = ["optimum-neuron[neuron]"] -neuronx = ["optimum-neuron[neuronx]"] -nncf = ["optimum-intel[nncf] (>=1.15.0)"] -onnxruntime = ["datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime (>=1.11.0)", "protobuf (>=3.20.1)"] -onnxruntime-gpu = ["accelerate", "datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime-gpu (>=1.11.0)", "protobuf (>=3.20.1)"] -openvino = ["optimum-intel[openvino] (>=1.15.0)"] +habana = ["optimum-habana", "transformers (>=4.43.0,<4.44.0)"] +intel = ["optimum-intel (>=1.18.0)"] +ipex = ["optimum-intel[ipex] (>=1.18.0)"] +neural-compressor = ["optimum-intel[neural-compressor] (>=1.18.0)"] +neuron = ["optimum-neuron[neuron] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"] +neuronx = ["optimum-neuron[neuronx] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"] +nncf = ["optimum-intel[nncf] (>=1.18.0)"] +onnxruntime = ["datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime (>=1.11.0)", "protobuf (>=3.20.1)", "transformers (<4.46.0)"] +onnxruntime-gpu = ["accelerate", "datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime-gpu (>=1.11.0)", "protobuf (>=3.20.1)", "transformers (<4.46.0)"] +openvino = ["optimum-intel[openvino] (>=1.18.0)"] quality = ["black (>=23.1,<24.0)", "ruff (==0.1.5)"] -tests = ["Pillow", "accelerate", "diffusers (>=0.17.0)", 
"einops", "invisible-watermark", "parameterized", "pytest", "pytest-xdist", "requests", "rjieba", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"] +quanto = ["optimum-quanto (>=0.2.4)"] +tests = ["Pillow", "accelerate", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"] [[package]] name = "orjson" @@ -5054,7 +5079,7 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", type = ["pytest-mypy"] [extras] -all = ["colpali-engine", "ctranslate2", "diskcache", "einops", "fastapi", "optimum", "orjson", "pillow", "posthog", "prometheus-fastapi-instrumentator", "pydantic", "rich", "sentence-transformers", "soundfile", "timm", "torch", "typer", "uvicorn"] +all = ["colpali-engine", "ctranslate2", "diskcache", "einops", "fastapi", "optimum", "orjson", "pillow", "posthog", "prometheus-fastapi-instrumentator", "pydantic", "rich", "sentence-transformers", "soundfile", "timm", "torch", "torchvision", "typer", "uvicorn"] audio = ["soundfile"] cache = ["diskcache"] ct2 = ["ctranslate2", "sentence-transformers", "torch", "transformers"] @@ -5065,9 +5090,9 @@ optimum = ["optimum"] server = ["fastapi", "orjson", "posthog", "prometheus-fastapi-instrumentator", "pydantic", "rich", "typer", "uvicorn"] tensorrt = ["tensorrt"] torch = ["sentence-transformers", "torch"] -vision = ["colpali-engine", "pillow", "timm"] +vision = ["colpali-engine", "pillow", "timm", "torchvision"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4" -content-hash = "ddb2234dae1ebb22503ff767029fcaa8a7f15b394ace3f15864d4547fd866966" +content-hash = "88abd4ba04370e75955d60208cbaa746d0de66247d123cfba0d8a5a4e79371dd" diff --git a/libs/infinity_emb/pyproject.toml b/libs/infinity_emb/pyproject.toml index acab8757..4180267e 100644 --- a/libs/infinity_emb/pyproject.toml +++ b/libs/infinity_emb/pyproject.toml @@ -1,3 +1,4 
@@ + [tool.poetry] name = "infinity_emb" version = "0.0.63" @@ -26,17 +27,20 @@ typer = {version = "^0.9.0", optional=true, extras = ["all"]} pydantic = {version = ">=2.4.0,<3", optional=true} posthog = {version = "*", optional=true} # backend +# pin torch to a specific source, but default to pypi. use sed to overwrite. torch = {version = ">=2.2.1", source = "pypi", optional=true} sentence-transformers = {version = "^3.0.1", optional=true} transformers = {version = ">4.34.0,<=5.0", optional=true} ctranslate2 = {version = "^4.0.0", optional=true} -optimum = {version = ">=1.16.2", optional=true, extras=["onnxruntime"]} +optimum = {version = ">=1.23.1", optional=true, extras=["onnxruntime"]} hf_transfer = {version=">=0.1.5"} einops = {version = "*", optional=true} # vision pillow = {version = "*", optional=true} timm = {version = "*", optional=true} colpali-engine = {version="^0.3.1", optional=true} +# pin torchvision to a specific source, but default to pypi. use sed to overwrite. +torchvision = {version = "*", source = "pypi", optional=true} # cache diskcache = {version = "*", optional=true} # gpu @@ -60,6 +64,8 @@ mypy = "^1.5.1" requests = "2.28.1" types-requests = "2.28.1" openai = "*" # 1.51.0 works +jinja2 = "*" +jinja2-cli = "*" # preferred dev dependencies torch = "2.4.1" @@ -90,7 +96,7 @@ torch=["sentence-transformers","torch"] einops=["einops"] logging=["rich"] cache=["diskcache"] -vision=["colpali-engine","pillow","timm"] +vision=["colpali-engine","pillow","timm","torchvision"] audio=["soundfile"] server=[ "fastapi", @@ -118,6 +124,7 @@ all=[ "sentence-transformers", "timm", "torch", + "torchvision", "typer", "uvicorn", "soundfile" @@ -126,6 +133,18 @@ all=[ tensorrt=["tensorrt"] onnxruntime-gpu=["onnxruntime-gpu"] +[[tool.poetry.source]] +# used for monkey-patching cpu onlu +name = "pytorch_cpu" +url = "https://download.pytorch.org/whl/cpu" +priority = "explicit" + +[[tool.poetry.source]] +# used for monkey-patching rocm only +name = "pytorch_rocm" +url = 
"https://download.pytorch.org/whl/rocm6.1" +priority = "explicit" + [tool.pytest.ini_options] markers = [ "performance: tests that measure performance (deselect with '-m \"not performance\"')", From cfa4f02a2cb4f99098668a3e52f6a572f8350c09 Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Mon, 14 Oct 2024 23:44:45 -0700 Subject: [PATCH 2/3] add jinja docker templates 2 --- libs/infinity_emb/{Dockerfile.nvidia_auto => Dockerfile} | 6 +++--- libs/infinity_emb/Makefile | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) rename libs/infinity_emb/{Dockerfile.nvidia_auto => Dockerfile} (92%) diff --git a/libs/infinity_emb/Dockerfile.nvidia_auto b/libs/infinity_emb/Dockerfile similarity index 92% rename from libs/infinity_emb/Dockerfile.nvidia_auto rename to libs/infinity_emb/Dockerfile index 6ec70999..4168262a 100644 --- a/libs/infinity_emb/Dockerfile.nvidia_auto +++ b/libs/infinity_emb/Dockerfile @@ -36,14 +36,14 @@ RUN echo "Poetry version:" && poetry --version COPY poetry.lock poetry.toml pyproject.toml README.md /app/ # Install dependencies only RUN true -RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --no-root --without lint,test && poetry cache clear pypi --all +RUN poetry install --no-interaction --no-ansi --no-root --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all COPY infinity_emb infinity_emb # Install dependency with infinity_emb package -RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all +RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --without lint,test && poetry cache clear pypi --all FROM builder as testing # install lint and test dependencies -RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --with lint,test && poetry cache clear pypi --all +RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --with lint,test && poetry cache clear pypi --all # lint RUN poetry run ruff . 
RUN poetry run black --check . diff --git a/libs/infinity_emb/Makefile b/libs/infinity_emb/Makefile index 4c98942e..5392d0aa 100644 --- a/libs/infinity_emb/Makefile +++ b/libs/infinity_emb/Makefile @@ -3,7 +3,7 @@ # Default target executed when no arguments are given to make. all: help -precommit : | format spell_fix spell_check lint poetry_check cli_v2_docs openapi test +precommit : | format spell_fix spell_check lint poetry_check cli_v2_docs template_docker openapi test ###################### # TESTING AND COVERAGE From e300d76fbee8bd263d7aaa22c8c4063ff61b379c Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Tue, 15 Oct 2024 00:09:25 -0700 Subject: [PATCH 3/3] fix docker --- libs/infinity_emb/{Dockerfile => Dockerfile.nvidia_auto} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libs/infinity_emb/{Dockerfile => Dockerfile.nvidia_auto} (100%) diff --git a/libs/infinity_emb/Dockerfile b/libs/infinity_emb/Dockerfile.nvidia_auto similarity index 100% rename from libs/infinity_emb/Dockerfile rename to libs/infinity_emb/Dockerfile.nvidia_auto