Docker image multi #24

Merged
merged 17 commits on Nov 4, 2023
18 changes: 12 additions & 6 deletions README.md
@@ -14,10 +14,10 @@
[![MIT License][license-shield]][license-url]
[![LinkedIn][linkedin-shield]][linkedin-url]


# Infinity ♾️
![codecov](https://codecov.io/gh/michaelfeil/infinity/branch/main/graph/badge.svg?token=NMVQY5QOFQ)
![CI](https://github.com/michaelfeil/infinity/actions/workflows/ci.yaml/badge.svg)
[![codecov][codecov-shield]][codecov-url]
[![ci][ci-shield]][ci-url]
[![Downloads][pepa-shield]][pepa-url]

Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting a wide range of sentence-transformer models and frameworks. Infinity is developed under the MIT License: https://github.com/michaelfeil/infinity

@@ -106,11 +106,11 @@ The download path at runtime can be controlled via the environment variable `SE

Serving multiple models on one GPU is experimental. You can use the following temporary workaround:
```Dockerfile
# Dockerfile for multiple models via multiple ports
FROM michaelf34/infinity:latest
# Dockerfile-ENTRYPOINT for multiple models via multiple ports
ENTRYPOINT ["/bin/sh", "-c", \
"(/opt/poetry/bin/poetry run infinity_emb --port 8080 --model-name-or-path BAAI/bge-small-en-v1.5 &);\
(/opt/poetry/bin/poetry run infinity_emb --port 8081 --model-name-or-path intfloat/e5-large-v2 )"]
"(. /app/.venv/bin/activate && infinity_emb --port 8080 --model-name-or-path sentence-transformers/all-MiniLM-L6-v2 &);\
(. /app/.venv/bin/activate && infinity_emb --port 8081 --model-name-or-path intfloat/e5-large-v2 )"]
```

You can build and run it via:
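The build and run commands themselves are collapsed in this diff view; below is a minimal sketch of what they typically look like, assuming the Dockerfile above is saved in the current directory (the image tag `infinity-multi` is illustrative, not part of this PR):

```bash
# Build the two-model image from the Dockerfile above (tag name is an example)
docker build -t infinity-multi .

# Run it and expose both embedding servers; --gpus all assumes the NVIDIA container toolkit is installed
docker run --gpus all -p 8080:8080 -p 8081:8081 infinity-multi
```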
@@ -157,3 +157,9 @@ poetry run pytest ./tests
[license-url]: https://github.com/michaelfeil/infinity/blob/master/LICENSE.txt
[linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=for-the-badge&logo=linkedin&colorB=555
[linkedin-url]: https://linkedin.com/in/michael-feil
[pepa-shield]: https://static.pepy.tech/badge/infinity-emb
[pepa-url]: https://www.pepy.tech/projects/infinity-emb
[codecov-shield]: https://codecov.io/gh/michaelfeil/infinity/branch/main/graph/badge.svg?token=NMVQY5QOFQ
[codecov-url]: https://codecov.io/gh/michaelfeil/infinity/branch/main
[ci-shield]: https://github.com/michaelfeil/infinity/actions/workflows/ci.yaml/badge.svg
[ci-url]: https://github.com/michaelfeil/infinity/actions
54 changes: 25 additions & 29 deletions libs/infinity_emb/Dockerfile
@@ -1,5 +1,5 @@
# Use the Python base image
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS builder
FROM nvidia/cuda:12.2.0-base-ubuntu22.04 AS base

ENV PYTHONUNBUFFERED=1 \
# prevents python creating .pyc files
@@ -16,47 +16,43 @@ ENV PYTHONUNBUFFERED=1 \
# do not ask any interactive question
POETRY_NO_INTERACTION=1

RUN apt-get update && apt-get install python3.10 python3.10-venv -y
RUN apt-get update && apt-get install python3.10 curl -y
# python3.10-venv

# Define the version of Poetry to install (default is 1.4.2)
ARG POETRY_VERSION=1.6.1
FROM base as builder

# Set the working directory for the app
WORKDIR /app

# Define the version of Poetry to install (default is 1.6.1)
# Define the directory to install Poetry to (default is /opt/poetry)
ARG POETRY_VERSION=1.6.1
ARG POETRY_HOME=/opt/poetry

# Create a Python virtual environment for Poetry and install it
RUN python3.10 -m venv ${POETRY_HOME} && \
$POETRY_HOME/bin/pip install --upgrade pip && \
$POETRY_HOME/bin/pip install poetry==${POETRY_VERSION}
RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=$POETRY_HOME POETRY_VERSION=$POETRY_VERSION python3.10 -

ENV PATH="${PATH}:${POETRY_VENV}/bin"
ENV PATH=$POETRY_HOME/bin:$PATH

# Test if Poetry is installed in the expected path
RUN echo "Poetry version:" && $POETRY_HOME/bin/poetry --version

# Set the working directory for the app
WORKDIR /app

# Use a multi-stage build to install dependencies
FROM builder AS dependencies

ARG POETRY_HOME

# Copy only the dependency files for installation
COPY pyproject.toml poetry.lock poetry.toml ./
RUN echo "Poetry version:" && poetry --version

RUN $POETRY_HOME/bin/poetry config installer.max-workers 10
# Copy the rest of the app source code (this layer will be invalidated and rebuilt whenever the source code changes)
COPY poetry.lock poetry.toml pyproject.toml README.md .
COPY infinity_emb infinity_emb

# Install the Poetry dependencies (this layer will be cached as long as the dependencies don't change)
RUN $POETRY_HOME/bin/poetry install --no-root --no-interaction --no-ansi --extras all
# Install dependencies and project
RUN poetry config virtualenvs.create false
RUN poetry install --no-interaction --no-ansi --extras all
# remove cache
RUN poetry cache clear pypi --all

# Use a multi-stage build to run tests
FROM dependencies AS finisher
# Use a multi-stage build -> production version
FROM base AS production

# Copy the rest of the app source code (this layer will be invalidated and rebuilt whenever the source code changes)
COPY . .
COPY --from=builder /app /app
WORKDIR /app

RUN $POETRY_HOME/bin/poetry install --no-interaction --no-ansi --extras all
ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/torch

ENTRYPOINT ["/opt/poetry/bin/poetry","run","infinity_emb"]
CMD . /app/.venv/bin/activate && infinity_emb
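Because the production stage copies the in-project virtualenv from the builder and the `CMD` is in shell form, overriding the command at `docker run` time means re-activating the venv explicitly. A sketch under those assumptions (image tag, model, and port are examples, not from this PR):

```bash
# Build from the libs/infinity_emb directory (tag name is an example)
docker build -t infinity-emb:local libs/infinity_emb

# Override the default CMD with explicit CLI flags; the in-project venv must be activated again
docker run --gpus all -p 7997:7997 infinity-emb:local \
  sh -c ". /app/.venv/bin/activate && infinity_emb --model-name-or-path BAAI/bge-small-en-v1.5 --port 7997"
```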
18 changes: 12 additions & 6 deletions libs/infinity_emb/README.md
@@ -14,10 +14,10 @@
[![MIT License][license-shield]][license-url]
[![LinkedIn][linkedin-shield]][linkedin-url]


# Infinity ♾️
![codecov](https://codecov.io/gh/michaelfeil/infinity/branch/main/graph/badge.svg?token=NMVQY5QOFQ)
![CI](https://github.com/michaelfeil/infinity/actions/workflows/ci.yaml/badge.svg)
[![codecov][codecov-shield]][codecov-url]
[![ci][ci-shield]][ci-url]
[![Downloads][pepa-shield]][pepa-url]

Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting a wide range of sentence-transformer models and frameworks. Infinity is developed under the MIT License: https://github.com/michaelfeil/infinity

@@ -106,11 +106,11 @@ The download path at runtime can be controlled via the environment variable `SE

Serving multiple models on one GPU is experimental. You can use the following temporary workaround:
```Dockerfile
# Dockerfile for multiple models via multiple ports
FROM michaelf34/infinity:latest
# Dockerfile-ENTRYPOINT for multiple models via multiple ports
ENTRYPOINT ["/bin/sh", "-c", \
"(/opt/poetry/bin/poetry run infinity_emb --port 8080 --model-name-or-path BAAI/bge-small-en-v1.5 &);\
(/opt/poetry/bin/poetry run infinity_emb --port 8081 --model-name-or-path intfloat/e5-large-v2 )"]
"(. /app/.venv/bin/activate && infinity_emb --port 8080 --model-name-or-path sentence-transformers/all-MiniLM-L6-v2 &);\
(. /app/.venv/bin/activate && infinity_emb --port 8081 --model-name-or-path intfloat/e5-large-v2 )"]
```

You can build and run it via:
@@ -157,3 +157,9 @@ poetry run pytest ./tests
[license-url]: https://github.com/michaelfeil/infinity/blob/master/LICENSE.txt
[linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=for-the-badge&logo=linkedin&colorB=555
[linkedin-url]: https://linkedin.com/in/michael-feil
[pepa-shield]: https://static.pepy.tech/badge/infinity-emb
[pepa-url]: https://www.pepy.tech/projects/infinity-emb
[codecov-shield]: https://codecov.io/gh/michaelfeil/infinity/branch/main/graph/badge.svg?token=NMVQY5QOFQ
[codecov-url]: https://codecov.io/gh/michaelfeil/infinity/branch/main
[ci-shield]: https://github.com/michaelfeil/infinity/actions/workflows/ci.yaml/badge.svg
[ci-url]: https://github.com/michaelfeil/infinity/actions
8 changes: 4 additions & 4 deletions libs/infinity_emb/infinity_emb/transformer/abstract.py
@@ -10,16 +10,16 @@
class BaseTransformer(ABC): # Inherit from ABC(Abstract base class)
@abstractmethod # Decorator to define an abstract method
def encode_pre(self, sentences: List[str]) -> INPUT_FEATURE:
pass
"""takes care of the tokenization and feature preparation"""

@abstractmethod
def encode_core(self, features: INPUT_FEATURE) -> OUT_FEATURES:
pass
"""runs plain inference, on cpu/gpu"""

@abstractmethod
def encode_post(self, embedding: OUT_FEATURES) -> NpEmbeddingType:
pass
"""runs post encoding such as normlization"""

@abstractmethod
def tokenize_lengths(self, sentences: List[str]) -> List[int]:
pass
"""gets the lengths of each sentences according to tokenize/len etc."""
54 changes: 2 additions & 52 deletions libs/infinity_emb/infinity_emb/transformer/sentence_transformer.py
@@ -11,12 +11,12 @@
try:
import torch
from sentence_transformers import SentenceTransformer, util # type: ignore
from torch import Tensor, device, dtype
from torch import Tensor
from torch.nn import Module

TORCH_AVAILABLE = True
except ImportError:
torch, Tensor, device, dtype = None, None, None, None
torch, Tensor = None, None

class SentenceTransformer:
pass
@@ -233,56 +233,6 @@ def children(self):
# child module so that it will stay on the CPU.
return []

def half(self):
self.to(dtype="float16")
return self

def to(
self,
device: int | device | None = None,
dtype: dtype | str | None = None,
non_blocking: bool = False,
) -> "CT2Transformer":
if not isinstance(device, int):
raise ValueError("param `dtype` needs to be of type int")
if not isinstance(dtype, str) or dtype is not None:
raise ValueError("param `dtype` needs to be of type str")

if dtype and not ("float" in dtype or "int" in dtype):
raise ValueError(
"dtype should be one of `int8`, `float16`, `int8_float16`, `float32`"
)
elif dtype:
new_dtype = True
self.compute_type = new_dtype
else:
new_dtype = False

if device and (device.startswith("cuda") or device.startswith("cpu")):
raise ValueError(
"for param `device`, f'cuda:{index}' or f'cpu:{index}' are supported"
)
elif device:
if ":" in device:
new_device = device.split(":")[0]
new_index = device.split(":")[1]
else:
new_device = device
new_index = "0"
else:
new_device = ""
new_index = ""

if new_device or new_dtype or new_index:
self.encoder = self._ctranslate2_encoder_cls(
self.ct2_model_dir,
device=new_device,
device_index=new_index,
intra_threads=torch.get_num_threads(),
compute_type=self.compute_type,
)
return self

def forward(self, features):
"""overwrites torch forward method with CTranslate model"""
device = features["input_ids"].device
46 changes: 24 additions & 22 deletions libs/infinity_emb/poetry.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions libs/infinity_emb/poetry.toml
@@ -1,5 +1,7 @@
[virtualenvs]
create = true
in-project = true

[installer]
modern-installation = false
max-workers = 10