From 0a2ecf72ab5f2d4a1a2f22c5cc0f5197c6edb39e Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Sat, 8 Jun 2024 16:40:20 -0700 Subject: [PATCH 1/6] add first version of clip Co-authored-by: lckr <15931380+lckr@users.noreply.github.com> --- .../infinity_emb/_optional_imports.py | 2 +- libs/infinity_emb/infinity_emb/engine.py | 23 ++++ .../infinity_emb/inference/batch_handler.py | 31 +++++ .../infinity_emb/inference/select_model.py | 3 + libs/infinity_emb/infinity_emb/primitives.py | 53 +++++++- libs/infinity_emb/infinity_emb/py.typed | 0 .../infinity_emb/transformer/abstract.py | 44 ++++++- .../infinity_emb/transformer/utils.py | 12 ++ .../transformer/vision/__init__.py | 0 .../transformer/vision/torch_vision.py | 122 ++++++++++++++++++ .../infinity_emb/transformer/vision/utils.py | 12 ++ libs/infinity_emb/pyproject.toml | 3 +- .../tests/unit_test/test_engine.py | 42 ++++++ .../transformer/vision/test_torch_vision.py | 36 ++++++ 14 files changed, 378 insertions(+), 5 deletions(-) create mode 100644 libs/infinity_emb/infinity_emb/py.typed create mode 100644 libs/infinity_emb/infinity_emb/transformer/vision/__init__.py create mode 100644 libs/infinity_emb/infinity_emb/transformer/vision/torch_vision.py create mode 100644 libs/infinity_emb/infinity_emb/transformer/vision/utils.py create mode 100644 libs/infinity_emb/tests/unit_test/transformer/vision/test_torch_vision.py diff --git a/libs/infinity_emb/infinity_emb/_optional_imports.py b/libs/infinity_emb/infinity_emb/_optional_imports.py index 7f3651ac..716baaa5 100644 --- a/libs/infinity_emb/infinity_emb/_optional_imports.py +++ b/libs/infinity_emb/infinity_emb/_optional_imports.py @@ -60,7 +60,7 @@ def _raise_error(self) -> None: CHECK_OPTIMUM_NEURON = OptionalImports("optimum.neuron", "neuronx") CHECK_SENTENCE_TRANSFORMERS = OptionalImports("sentence_transformers", "torch") CHECK_TRANSFORMERS = OptionalImports("transformers", "torch") -CHECK_TORCH = OptionalImports("torch.nn", "torch") +CHECK_TORCH = OptionalImports("torch", "torch") CHECK_PYDANTIC = OptionalImports("pydantic", "server") CHECK_TYPER = OptionalImports("typer", "server") CHECK_UVICORN = OptionalImports("uvicorn", "server") diff --git a/libs/infinity_emb/infinity_emb/engine.py b/libs/infinity_emb/infinity_emb/engine.py index 86437434..de889f4d 100644 --- a/libs/infinity_emb/infinity_emb/engine.py +++ b/libs/infinity_emb/infinity_emb/engine.py @@ -194,6 +194,29 @@ async def classify( return scores, usage + async def image_embed( + self, *, images: list[str] + ) -> tuple[list[EmbeddingReturnType], int]: + """embed multiple images + + Args: + images (list[str]): list of image urls, to be embedded + + Raises: + ValueError: raised if engine is not started yet + ModelNotDeployedError: If loaded model does not expose `image_embed` + capabilities + + Returns: + list[EmbeddingReturnType]: embeddings + 2D list-array of shape( len(sentences),embed_dim ) + int: token usage + """ + + self._assert_running() + embeddings, usage = await self._batch_handler.image_embed(images=images) + return embeddings, usage + def _assert_running(self): if not self.running: raise ValueError( diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py index dac2ddd1..56f7047c 100644 --- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py +++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py @@ -18,6 +18,7 @@ ClassifyReturnType, EmbeddingReturnType, EmbeddingSingle, + ImageSingle, ModelCapabilites, ModelNotDeployedError, 
OverloadStatus, @@ -28,6 +29,7 @@ ) from infinity_emb.transformer.abstract import BaseTransformer from infinity_emb.transformer.utils import get_lengths_with_tokenize +from infinity_emb.transformer.vision.utils import resolve_images class ShutdownReadOnly: @@ -200,6 +202,35 @@ async def classify( return classifications, usage + async def image_embed( + self, + *, + images: list[str], + ) -> tuple[list[EmbeddingReturnType], int]: + """Schedule a images and sentences to be embedded. Awaits until embedded. + + Args: + images (list[str]): list of pre-signed urls + + Raises: + ModelNotDeployedError: If loaded model does not expose `embed` + capabilities + + Returns: + list[EmbeddingReturnType]: list of embedding as 1darray + int: token usage + """ + + if "image_embed" not in self.model_worker.capabilities: + raise ModelNotDeployedError( + "the loaded moded cannot fullyfill `image_embed`." + f"options are {self.model_worker.capabilities}." + ) + + items = await asyncio.to_thread(resolve_images, images) + embeddings, usage = await self._schedule(items) + return embeddings, usage + async def _schedule( self, list_queueitem: Sequence[AbstractSingle] ) -> tuple[list[Any], int]: diff --git a/libs/infinity_emb/infinity_emb/inference/select_model.py b/libs/infinity_emb/infinity_emb/inference/select_model.py index d52f667f..ed4bb6d6 100644 --- a/libs/infinity_emb/infinity_emb/inference/select_model.py +++ b/libs/infinity_emb/infinity_emb/inference/select_model.py @@ -8,6 +8,7 @@ from infinity_emb.log_handler import logger from infinity_emb.transformer.abstract import BaseCrossEncoder, BaseEmbedder from infinity_emb.transformer.utils import ( + ClipLikeEngine, EmbedderEngine, InferenceEngine, PredictEngine, @@ -44,6 +45,8 @@ def get_engine_type_from_config( return RerankEngine.from_inference_engine(engine_args.engine) else: return PredictEngine.from_inference_engine(engine_args.engine) + if config.get("vision_config") and "clip" in config.get("model_type", "").lower(): + return ClipLikeEngine.from_inference_engine(engine_args.engine) else: return EmbedderEngine.from_inference_engine(engine_args.engine) diff --git a/libs/infinity_emb/infinity_emb/primitives.py b/libs/infinity_emb/infinity_emb/primitives.py index 3af74e7e..87fb152d 100644 --- a/libs/infinity_emb/infinity_emb/primitives.py +++ b/libs/infinity_emb/infinity_emb/primitives.py @@ -13,11 +13,23 @@ # cached_porperty from functools import lru_cache -from typing import Generic, Literal, Optional, Type, TypedDict, TypeVar, Union +from typing import ( + TYPE_CHECKING, + Generic, + Literal, + Optional, + Type, + TypedDict, + TypeVar, + Union, +) import numpy as np import numpy.typing as npt +if TYPE_CHECKING: + from PIL.Image import Image as ImageClass + # if python>=3.10 use kw_only dataclass_args = {"kw_only": True} if sys.version_info >= (3, 10) else {} @@ -153,6 +165,18 @@ class PredictSingle(EmbeddingSingle): pass +@dataclass(**dataclass_args) +class ImageSingle(AbstractSingle): + image: "ImageClass" + + def str_repr(self) -> str: + """creates a dummy representation of the image to count tokens relative to shape""" + return f"an image is worth a repeated {'token' * self.image.height}" + + def to_input(self) -> "ImageClass": + return self.image + + AbstractInnerType = TypeVar("AbstractInnerType") @@ -242,12 +266,37 @@ async def get_result(self) -> ClassifyReturnType: return self.class_encoding -QueueItemInner = Union[EmbeddingInner, ReRankInner, PredictInner] +@dataclass(order=True, **dataclass_args) +class ImageInner(AbstractInner): + content: 
ImageSingle + embedding: Optional[EmbeddingReturnType] = None + + async def complete(self, result: EmbeddingReturnType) -> None: + """marks the future for completion. + only call from the same thread as created future.""" + self.embedding = result + + if self.embedding is None: + raise ValueError("embedding is None") + try: + self.future.set_result(self.embedding) + except asyncio.exceptions.InvalidStateError: + pass + + async def get_result(self) -> EmbeddingReturnType: + """waits for future to complete and returns result""" + await self.future + assert self.embedding is not None + return self.embedding + + +QueueItemInner = Union[EmbeddingInner, ReRankInner, PredictInner, ImageInner] _type_to_inner_item_map = { EmbeddingSingle: EmbeddingInner, ReRankSingle: ReRankInner, PredictSingle: PredictInner, + ImageSingle: ImageInner, } diff --git a/libs/infinity_emb/infinity_emb/py.typed b/libs/infinity_emb/infinity_emb/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/libs/infinity_emb/infinity_emb/transformer/abstract.py b/libs/infinity_emb/infinity_emb/transformer/abstract.py index 6af6ad79..428dc823 100644 --- a/libs/infinity_emb/infinity_emb/transformer/abstract.py +++ b/libs/infinity_emb/infinity_emb/transformer/abstract.py @@ -1,12 +1,15 @@ +import random from abc import ABC, abstractmethod from time import perf_counter -from typing import Any, Set +from typing import Any, Set, Union from infinity_emb.primitives import ( EmbeddingDtype, EmbeddingInner, EmbeddingReturnType, EmbeddingSingle, + ImageInner, + ImageSingle, ModelCapabilites, PredictInner, PredictSingle, @@ -72,6 +75,45 @@ def warmup(self, *, batch_size: int = 64, n_tokens=1) -> tuple[float, float, str return run_warmup(self, inp) +class BaseClipVisionModel(BaseTransformer): # Inherit from ABC(Abstract base class) + capabilities = {"embed", "image_embed"} + + @property + def embedding_dtype(self) -> EmbeddingDtype: + """returns the dtype of the embeddings""" + return self.engine_args.embedding_dtype # type: ignore + + @abstractmethod # Decorator to define an abstract method + def encode_pre(self, sentences_or_images: Union[list[str], Any]) -> INPUT_FEATURE: + """ + takes a list of sentences, or a list of images. 
+ Images could be url or numpy arrays/pil + """ + + @abstractmethod + def encode_post( + self, embedding: OUT_FEATURES, skip_quanitzation=True + ) -> EmbeddingReturnType: + """runs post encoding such as normalization""" + + def warmup(self, *, batch_size: int = 64, n_tokens=1) -> tuple[float, float, str]: + sample_text = ["warm " * n_tokens] * max(1, batch_size // 2) + sample_image = [] * max(1, batch_size // 2) + inp = [ + # TODO: warmup for images + ImageInner(content=ImageSingle(image=img), future=None) # type: ignore + for img in sample_image + ] + [ + EmbeddingInner( + content=EmbeddingSingle(sentence=s), future=None # type: ignore + ) + for s in sample_text + ] + random.shuffle(inp) + + return run_warmup(self, inp) + + class BaseClassifer(BaseTransformer): # Inherit from ABC(Abstract base class) capabilities = {"classify"} diff --git a/libs/infinity_emb/infinity_emb/transformer/utils.py b/libs/infinity_emb/infinity_emb/transformer/utils.py index ecbde3fb..4ba30fbb 100644 --- a/libs/infinity_emb/infinity_emb/transformer/utils.py +++ b/libs/infinity_emb/infinity_emb/transformer/utils.py @@ -14,6 +14,7 @@ from infinity_emb.transformer.embedder.sentence_transformer import ( SentenceTransformerPatched, ) +from infinity_emb.transformer.vision.torch_vision import ClipLikeModel __all__ = [ "length_tokenizer", @@ -58,6 +59,17 @@ def from_inference_engine(engine: InferenceEngine): raise NotImplementedError(f"RerankEngine for {engine} not implemented") +class ClipLikeEngine(Enum): + torch = ClipLikeModel + + @staticmethod + def from_inference_engine(engine: InferenceEngine): + if engine == InferenceEngine.torch: + return ClipLikeEngine.torch + else: + raise NotImplementedError(f"ClipLikeEngine for {engine} not implemented") + + class PredictEngine(Enum): torch = SentenceClassifier diff --git a/libs/infinity_emb/infinity_emb/transformer/vision/__init__.py b/libs/infinity_emb/infinity_emb/transformer/vision/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libs/infinity_emb/infinity_emb/transformer/vision/torch_vision.py b/libs/infinity_emb/infinity_emb/transformer/vision/torch_vision.py new file mode 100644 index 00000000..631e4111 --- /dev/null +++ b/libs/infinity_emb/infinity_emb/transformer/vision/torch_vision.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Union, Optional + +from infinity_emb._optional_imports import CHECK_TORCH, CHECK_TRANSFORMERS +from infinity_emb.args import EngineArgs +from infinity_emb.primitives import Dtype +from infinity_emb.transformer.abstract import BaseClipVisionModel + +from infinity_emb.transformer.quantization.interface import quant_embedding_decorator + +if TYPE_CHECKING: + from PIL.Image import Image as ImageClass + from torch import Tensor + +if CHECK_TORCH.is_available: + import torch + + +class ClipLikeModel(BaseClipVisionModel): + """CrossEncoder with .encode_core() and no microbatching""" + + def __init__(self, *, engine_args: EngineArgs): + CHECK_TORCH.mark_required() + CHECK_TRANSFORMERS.mark_required() + from transformers import AutoModel, AutoProcessor + + self.model = AutoModel.from_pretrained( + engine_args.model_name_or_path, + revision=engine_args.revision, + ) + if torch.cuda.is_available(): + self.model = self.model.cuda() + if engine_args.dtype in (Dtype.float16, Dtype.auto): + self.model = self.model.half() + self.processor = AutoProcessor.from_pretrained( + engine_args.model_name_or_path, + revision=engine_args.revision, + ) + self.engine_args = engine_args + + if 
engine_args.compile: + self.model.vision_model = torch.compile(self.model.vision_model) + self.model.text_model = torch.compile(self.model.text_model) + + assert hasattr( + self.model, "get_text_features" + ), f"AutoModel of {engine_args.model_name_or_path} does not have get_text_features method" + assert hasattr( + self.model, "get_image_features" + ), f"AutoModel of {engine_args.model_name_or_path} does not have get_image_features method" + + def encode_pre(self, input_tuples: list[Union[str, ImageClass]]): + # return input_tuples + text_list: list[str] = [] + image_list = [] + type_is_img: list[bool] = [] + + for im_or_text in input_tuples: + if isinstance(im_or_text, str): + text_list.append(im_or_text) + type_is_img.append(False) + else: + image_list.append(im_or_text) + type_is_img.append(True) + if not image_list: + image_list = None + if not text_list: + text_list = None + + preprocessed = self.processor( + images=image_list, + text=text_list, + return_tensors="pt", + padding=True, + truncation=True, + ) + preprocessed = {k: v.to(self.model.device) for k, v in preprocessed.items()} + + return (preprocessed, type_is_img) + + def _normalize_cpu(self, tensor: Optional["Tensor"]) -> iter["Tensor"]: + if tensor is None: + return iter([]) + return iter((tensor / tensor.norm(p=2, dim=-1, keepdim=True)).cpu()) + + def encode_core(self, features_and_types: tuple[dict[str, "Tensor"], list[bool]]) -> list["Tensor"]: + """ + Computes sentence embeddings + """ + features, type_is_img = features_and_types + with torch.no_grad(): + # TODO: torch.cuda.stream() + if "input_ids" in features: + text_embeds = self.model.get_text_features( + input_ids=features.get("input_ids"), + attention_mask=features.get("attention_mask"), + ) + else: + text_embeds = None # type: ignore + if "pixel_values" in features: + image_embeds = self.model.get_image_features( + pixel_values=features.get("pixel_values"), + ) + else: + image_embeds = None + + return text_embeds, image_embeds, type_is_img + + @quant_embedding_decorator() + def encode_post(self, out_features) -> list[float]: + text_embeds, image_embeds, type_is_img = out_features + text_embeds = self._normalize_cpu(text_embeds) + image_embeds = self._normalize_cpu(image_embeds) + embeddings = list( + next(image_embeds if is_img else text_embeds) for is_img in type_is_img + ) + return embeddings + + def tokenize_lengths(self, text_list: list[str]) -> list[int]: + preprocessed = self.processor(text=text_list, truncation=True) + return [len(t) for t in preprocessed["input_ids"]] diff --git a/libs/infinity_emb/infinity_emb/transformer/vision/utils.py b/libs/infinity_emb/infinity_emb/transformer/vision/utils.py new file mode 100644 index 00000000..eadd042d --- /dev/null +++ b/libs/infinity_emb/infinity_emb/transformer/vision/utils.py @@ -0,0 +1,12 @@ +import requests +from PIL import Image + +from infinity_emb.primitives import ImageSingle + + +def resolve_images(image_urls) -> list[ImageSingle]: + # TODO: improve parallel requests, safety, error handling + return [ + ImageSingle(image=Image.open(requests.get(url, stream=True).raw)) + for url in image_urls + ] diff --git a/libs/infinity_emb/pyproject.toml b/libs/infinity_emb/pyproject.toml index a8101c4a..1a9fc643 100644 --- a/libs/infinity_emb/pyproject.toml +++ b/libs/infinity_emb/pyproject.toml @@ -73,8 +73,9 @@ torch=["sentence-transformers","torch"] einops=["einops"] logging=["rich"] cache=["diskcache"] +vision=["pillow"] server=["fastapi", "pydantic", "orjson", "prometheus-fastapi-instrumentator", "uvicorn", 
"typer","rich"] -all=["ctranslate2", "fastapi", "optimum", "orjson", "prometheus-fastapi-instrumentator", "pydantic", "rich", "sentence-transformers", "torch", "typer", "uvicorn","diskcache", "einops"] +all=["ctranslate2", "fastapi", "optimum", "orjson", "prometheus-fastapi-instrumentator", "pydantic", "rich", "sentence-transformers", "torch", "typer", "uvicorn","diskcache", "einops", "vision"] # non-default gpu tensorrt=["tensorrt"] onnxruntime-gpu=["onnxruntime-gpu"] diff --git a/libs/infinity_emb/tests/unit_test/test_engine.py b/libs/infinity_emb/tests/unit_test/test_engine.py index af6851bf..2ed4c9a5 100644 --- a/libs/infinity_emb/tests/unit_test/test_engine.py +++ b/libs/infinity_emb/tests/unit_test/test_engine.py @@ -1,3 +1,4 @@ +import asyncio import sys import numpy as np @@ -141,6 +142,47 @@ async def test_async_api_torch_lengths_via_tokenize_usage(): assert embeddings.shape[1] >= 10 +@pytest.mark.anyio +async def test_torch_clip_embed(): + image_urls = [ + "http://images.cocodataset.org/val2017/000000039769.jpg" + ] # a photo of two cats + sentences = [ + "a photo of two cats", + "a photo of a cat", + "a photo of a dog", + "a photo of a car", + ] + engine = AsyncEmbeddingEngine.from_args( + EngineArgs( + model_name_or_path="wkcn/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M", + engine=InferenceEngine.torch, + model_warmup=False, + ) + ) + async with engine: + t1, t2 = asyncio.create_task(engine.embed(sentences)), asyncio.create_task( + engine.image_embed(images=image_urls) + ) + emb_text, usage_text = await t1 + emb_image, usage_image = await t2 + emb_text_np = np.array(emb_text) # type: ignore + emb_image_np = np.array(emb_image) # type: ignore + + assert emb_text_np.shape[0] == len(sentences) + assert emb_image_np.shape[0] == len(image_urls) + assert emb_text_np.shape[1] >= 10 + assert emb_image_np.shape == emb_image_np[: len(image_urls)].shape + + assert usage_text == sum([len(s) for s in sentences]) + + # check if cat image and two cats are most similar + for i in range(1, len(sentences)): + assert np.dot(emb_text_np[0], emb_image_np[0]) > np.dot( + emb_text_np[i], emb_image_np[0] + ) + + @pytest.mark.anyio @pytest.mark.skipif(sys.platform != "linux", reason="only run these on Linux") @pytest.mark.parametrize( diff --git a/libs/infinity_emb/tests/unit_test/transformer/vision/test_torch_vision.py b/libs/infinity_emb/tests/unit_test/transformer/vision/test_torch_vision.py new file mode 100644 index 00000000..bc2b9edd --- /dev/null +++ b/libs/infinity_emb/tests/unit_test/transformer/vision/test_torch_vision.py @@ -0,0 +1,36 @@ +import requests +import torch +from PIL import Image +from transformers import CLIPModel, CLIPProcessor + +from infinity_emb.args import EngineArgs +from infinity_emb.transformer.vision.torch_vision import ClipLikeModel + + +def test_clip_like_model(): + model_name = "openai/clip-vit-base-patch32" + model = ClipLikeModel(engine_args=EngineArgs(model_name_or_path=model_name, dtype="float32")) + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + inputs = [ + "a photo of a cat", + image, + "a photo of a dog", + image, + ] + embeddings = model.encode_post(model.encode_core(model.encode_pre(inputs))) + assert len(embeddings) == len(inputs) + + model = CLIPModel.from_pretrained(model_name) + processor = CLIPProcessor.from_pretrained(model_name) + + inputs_clip = processor( + text=["a photo of a cat"], images=[image], return_tensors="pt", padding=True + ) + + outputs = model(**inputs_clip) + + 
torch.testing.assert_close(outputs.text_embeds[0], embeddings[0], check_dtype=False) + torch.testing.assert_close(outputs.image_embeds[0], embeddings[3], check_dtype=False) + torch.testing.assert_close(embeddings[1], embeddings[3]) From 43447b16a28517e18506d5be45c909d630a2a046 Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Sat, 8 Jun 2024 16:52:05 -0700 Subject: [PATCH 2/6] add typing and additional unit test Co-authored-by: lckr <15931380+lckr@users.noreply.github.com> --- .../infinity_emb/_optional_imports.py | 2 ++ .../infinity_emb/inference/batch_handler.py | 1 - libs/infinity_emb/infinity_emb/primitives.py | 4 +-- .../infinity_emb/transformer/abstract.py | 15 ++++++---- .../transformer/vision/torch_vision.py | 28 +++++++++---------- .../infinity_emb/transformer/vision/utils.py | 14 +++++++--- .../transformer/vision/test_torch_vision.py | 16 +++++++---- 7 files changed, 47 insertions(+), 33 deletions(-) diff --git a/libs/infinity_emb/infinity_emb/_optional_imports.py b/libs/infinity_emb/infinity_emb/_optional_imports.py index 716baaa5..86a916ab 100644 --- a/libs/infinity_emb/infinity_emb/_optional_imports.py +++ b/libs/infinity_emb/infinity_emb/_optional_imports.py @@ -61,6 +61,8 @@ def _raise_error(self) -> None: CHECK_SENTENCE_TRANSFORMERS = OptionalImports("sentence_transformers", "torch") CHECK_TRANSFORMERS = OptionalImports("transformers", "torch") CHECK_TORCH = OptionalImports("torch", "torch") +CHECK_REQUESTS = OptionalImports("requests", "server") +CHECK_PIL = OptionalImports("PIL", "vision") CHECK_PYDANTIC = OptionalImports("pydantic", "server") CHECK_TYPER = OptionalImports("typer", "server") CHECK_UVICORN = OptionalImports("uvicorn", "server") diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py index 56f7047c..90111328 100644 --- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py +++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py @@ -18,7 +18,6 @@ ClassifyReturnType, EmbeddingReturnType, EmbeddingSingle, - ImageSingle, ModelCapabilites, ModelNotDeployedError, OverloadStatus, diff --git a/libs/infinity_emb/infinity_emb/primitives.py b/libs/infinity_emb/infinity_emb/primitives.py index 87fb152d..211bd922 100644 --- a/libs/infinity_emb/infinity_emb/primitives.py +++ b/libs/infinity_emb/infinity_emb/primitives.py @@ -133,7 +133,7 @@ def str_repr(self) -> str: pass @abstractmethod - def to_input(self) -> Union[str, tuple[str, str]]: + def to_input(self) -> Union[str, tuple[str, str], "ImageClass"]: pass @@ -324,4 +324,4 @@ class ModelNotDeployedError(Exception): pass -ModelCapabilites = Literal["embed", "rerank", "classify"] +ModelCapabilites = Literal["embed", "rerank", "classify", "image_embed"] diff --git a/libs/infinity_emb/infinity_emb/transformer/abstract.py b/libs/infinity_emb/infinity_emb/transformer/abstract.py index 428dc823..4340cdcd 100644 --- a/libs/infinity_emb/infinity_emb/transformer/abstract.py +++ b/libs/infinity_emb/infinity_emb/transformer/abstract.py @@ -1,7 +1,7 @@ import random from abc import ABC, abstractmethod from time import perf_counter -from typing import Any, Set, Union +from typing import TYPE_CHECKING, Any, Set, Union from infinity_emb.primitives import ( EmbeddingDtype, @@ -21,6 +21,9 @@ INPUT_FEATURE = Any OUT_FEATURES = Any +if TYPE_CHECKING: + from PIL.Image import Image as ImageClass + class BaseTransformer(ABC): # Inherit from ABC(Abstract base class) capabilities: Set[ModelCapabilites] = set() @@ -57,7 +60,7 @@ def embedding_dtype(self) -> 
EmbeddingDtype: return self.engine_args.embedding_dtype # type: ignore @abstractmethod # Decorator to define an abstract method - def encode_pre(self, sentences: list[str]) -> INPUT_FEATURE: + def encode_pre(self, sentences: list[Union[str, Any]]) -> INPUT_FEATURE: """takes care of the tokenization and feature preparation""" @abstractmethod @@ -75,7 +78,7 @@ def warmup(self, *, batch_size: int = 64, n_tokens=1) -> tuple[float, float, str return run_warmup(self, inp) -class BaseClipVisionModel(BaseTransformer): # Inherit from ABC(Abstract base class) +class BaseClipVisionModel(BaseEmbedder): # Inherit from ABC(Abstract base class) capabilities = {"embed", "image_embed"} @property @@ -84,7 +87,9 @@ def embedding_dtype(self) -> EmbeddingDtype: return self.engine_args.embedding_dtype # type: ignore @abstractmethod # Decorator to define an abstract method - def encode_pre(self, sentences_or_images: Union[list[str], Any]) -> INPUT_FEATURE: + def encode_pre( + self, sentences_or_images: list[Union[str, "ImageClass"]] + ) -> INPUT_FEATURE: """ takes a list of sentences, or a list of images. Images could be url or numpy arrays/pil @@ -98,7 +103,7 @@ def encode_post( def warmup(self, *, batch_size: int = 64, n_tokens=1) -> tuple[float, float, str]: sample_text = ["warm " * n_tokens] * max(1, batch_size // 2) - sample_image = [] * max(1, batch_size // 2) + sample_image = [] * max(1, batch_size // 2) # type: ignore inp = [ # TODO: warmup for images ImageInner(content=ImageSingle(image=img), future=None) # type: ignore diff --git a/libs/infinity_emb/infinity_emb/transformer/vision/torch_vision.py b/libs/infinity_emb/infinity_emb/transformer/vision/torch_vision.py index 631e4111..7748aba2 100644 --- a/libs/infinity_emb/infinity_emb/transformer/vision/torch_vision.py +++ b/libs/infinity_emb/infinity_emb/transformer/vision/torch_vision.py @@ -1,12 +1,11 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Union, Optional +from typing import TYPE_CHECKING, Any, Iterable, Optional, Union from infinity_emb._optional_imports import CHECK_TORCH, CHECK_TRANSFORMERS from infinity_emb.args import EngineArgs from infinity_emb.primitives import Dtype from infinity_emb.transformer.abstract import BaseClipVisionModel - from infinity_emb.transformer.quantization.interface import quant_embedding_decorator if TYPE_CHECKING: @@ -15,6 +14,8 @@ if CHECK_TORCH.is_available: import torch +if CHECK_TRANSFORMERS.is_available: + from transformers import AutoModel, AutoProcessor # type: ignore class ClipLikeModel(BaseClipVisionModel): @@ -23,7 +24,6 @@ class ClipLikeModel(BaseClipVisionModel): def __init__(self, *, engine_args: EngineArgs): CHECK_TORCH.mark_required() CHECK_TRANSFORMERS.mark_required() - from transformers import AutoModel, AutoProcessor self.model = AutoModel.from_pretrained( engine_args.model_name_or_path, @@ -50,27 +50,23 @@ def __init__(self, *, engine_args: EngineArgs): self.model, "get_image_features" ), f"AutoModel of {engine_args.model_name_or_path} does not have get_image_features method" - def encode_pre(self, input_tuples: list[Union[str, ImageClass]]): + def encode_pre(self, sentences_or_images: list[Union[str, "ImageClass"]]): # return input_tuples text_list: list[str] = [] - image_list = [] + image_list: list[Any] = [] type_is_img: list[bool] = [] - for im_or_text in input_tuples: + for im_or_text in sentences_or_images: if isinstance(im_or_text, str): text_list.append(im_or_text) type_is_img.append(False) else: image_list.append(im_or_text) type_is_img.append(True) - if not 
image_list: - image_list = None - if not text_list: - text_list = None preprocessed = self.processor( - images=image_list, - text=text_list, + images=image_list if image_list else None, + text=text_list if text_list else None, return_tensors="pt", padding=True, truncation=True, @@ -79,12 +75,14 @@ def encode_pre(self, input_tuples: list[Union[str, ImageClass]]): return (preprocessed, type_is_img) - def _normalize_cpu(self, tensor: Optional["Tensor"]) -> iter["Tensor"]: + def _normalize_cpu(self, tensor: Optional["Tensor"]) -> Iterable["Tensor"]: if tensor is None: return iter([]) return iter((tensor / tensor.norm(p=2, dim=-1, keepdim=True)).cpu()) - def encode_core(self, features_and_types: tuple[dict[str, "Tensor"], list[bool]]) -> list["Tensor"]: + def encode_core( + self, features_and_types: tuple[dict[str, "Tensor"], list[bool]] + ) -> tuple["Tensor", "Tensor", list[bool]]: """ Computes sentence embeddings """ @@ -114,7 +112,7 @@ def encode_post(self, out_features) -> list[float]: image_embeds = self._normalize_cpu(image_embeds) embeddings = list( next(image_embeds if is_img else text_embeds) for is_img in type_is_img - ) + ) return embeddings def tokenize_lengths(self, text_list: list[str]) -> list[int]: diff --git a/libs/infinity_emb/infinity_emb/transformer/vision/utils.py b/libs/infinity_emb/infinity_emb/transformer/vision/utils.py index eadd042d..774b1632 100644 --- a/libs/infinity_emb/infinity_emb/transformer/vision/utils.py +++ b/libs/infinity_emb/infinity_emb/transformer/vision/utils.py @@ -1,11 +1,17 @@ -import requests -from PIL import Image - +from infinity_emb._optional_imports import CHECK_PIL, CHECK_REQUESTS from infinity_emb.primitives import ImageSingle +if CHECK_PIL.is_available: + from PIL import Image # type: ignore +if CHECK_REQUESTS.is_available: + import requests # type: ignore + -def resolve_images(image_urls) -> list[ImageSingle]: +def resolve_images(image_urls: list[str]) -> list[ImageSingle]: + """Resolve images from URLs.""" # TODO: improve parallel requests, safety, error handling + CHECK_REQUESTS.mark_required() + CHECK_PIL.mark_required() return [ ImageSingle(image=Image.open(requests.get(url, stream=True).raw)) for url in image_urls diff --git a/libs/infinity_emb/tests/unit_test/transformer/vision/test_torch_vision.py b/libs/infinity_emb/tests/unit_test/transformer/vision/test_torch_vision.py index bc2b9edd..c65e4013 100644 --- a/libs/infinity_emb/tests/unit_test/transformer/vision/test_torch_vision.py +++ b/libs/infinity_emb/tests/unit_test/transformer/vision/test_torch_vision.py @@ -1,7 +1,7 @@ -import requests +import requests # type: ignore import torch -from PIL import Image -from transformers import CLIPModel, CLIPProcessor +from PIL import Image # type: ignore +from transformers import CLIPModel, CLIPProcessor # type: ignore from infinity_emb.args import EngineArgs from infinity_emb.transformer.vision.torch_vision import ClipLikeModel @@ -9,7 +9,9 @@ def test_clip_like_model(): model_name = "openai/clip-vit-base-patch32" - model = ClipLikeModel(engine_args=EngineArgs(model_name_or_path=model_name, dtype="float32")) + model = ClipLikeModel( + engine_args=EngineArgs(model_name_or_path=model_name, dtype="float32") + ) url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) @@ -19,7 +21,7 @@ def test_clip_like_model(): "a photo of a dog", image, ] - embeddings = model.encode_post(model.encode_core(model.encode_pre(inputs))) + embeddings = 
model.encode_post(model.encode_core(model.encode_pre(inputs))) assert len(embeddings) == len(inputs) model = CLIPModel.from_pretrained(model_name) @@ -32,5 +34,7 @@ def test_clip_like_model(): outputs = model(**inputs_clip) torch.testing.assert_close(outputs.text_embeds[0], embeddings[0], check_dtype=False) - torch.testing.assert_close(outputs.image_embeds[0], embeddings[3], check_dtype=False) + torch.testing.assert_close( + outputs.image_embeds[0], embeddings[3], check_dtype=False + ) torch.testing.assert_close(embeddings[1], embeddings[3]) From 2412177e5e85c495c972e1e76f7d913ca3bfa4f6 Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Sat, 8 Jun 2024 17:01:43 -0700 Subject: [PATCH 3/6] update pre-commit --- .../infinity_emb/_optional_imports.py | 2 +- libs/infinity_emb/poetry.lock | 9 +++--- libs/infinity_emb/pyproject.toml | 28 +++++++++++++++++-- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/libs/infinity_emb/infinity_emb/_optional_imports.py b/libs/infinity_emb/infinity_emb/_optional_imports.py index 86a916ab..f99d738d 100644 --- a/libs/infinity_emb/infinity_emb/_optional_imports.py +++ b/libs/infinity_emb/infinity_emb/_optional_imports.py @@ -60,7 +60,7 @@ def _raise_error(self) -> None: CHECK_OPTIMUM_NEURON = OptionalImports("optimum.neuron", "neuronx") CHECK_SENTENCE_TRANSFORMERS = OptionalImports("sentence_transformers", "torch") CHECK_TRANSFORMERS = OptionalImports("transformers", "torch") -CHECK_TORCH = OptionalImports("torch", "torch") +CHECK_TORCH = OptionalImports("torch.nn", "torch") CHECK_REQUESTS = OptionalImports("requests", "server") CHECK_PIL = OptionalImports("PIL", "vision") CHECK_PYDANTIC = OptionalImports("pydantic", "server") diff --git a/libs/infinity_emb/poetry.lock b/libs/infinity_emb/poetry.lock index 8dd91ee6..81f00fa5 100644 --- a/libs/infinity_emb/poetry.lock +++ b/libs/infinity_emb/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
[[package]] name = "aiohttp" @@ -2303,7 +2303,6 @@ optional = true python-versions = ">=3.9" files = [ {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, - {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, @@ -2324,7 +2323,6 @@ files = [ {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, - {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, @@ -4544,7 +4542,7 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -all = ["ctranslate2", "diskcache", "einops", "fastapi", "optimum", "orjson", "prometheus-fastapi-instrumentator", "pydantic", "rich", "sentence-transformers", "torch", "typer", "uvicorn"] +all = ["ctranslate2", "diskcache", "einops", "fastapi", "optimum", "orjson", "pillow", "prometheus-fastapi-instrumentator", "pydantic", "rich", "sentence-transformers", "torch", "typer", "uvicorn"] cache = ["diskcache"] ct2 = ["ctranslate2", "sentence-transformers", "torch", "transformers"] einops = ["einops"] @@ -4554,8 +4552,9 @@ optimum = ["optimum"] server = ["fastapi", "orjson", "prometheus-fastapi-instrumentator", "pydantic", "rich", "typer", "uvicorn"] tensorrt = ["tensorrt"] torch = ["sentence-transformers", "torch"] +vision = ["pillow"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "c6bcf42a770abac42e9bd11a55eef7cb65d0620cd66a930267eaf10360e464f4" +content-hash = "3256c9ea0b03caaa75894e463f07d47a7dac6158718d3854b2e56f247b6e9736" diff --git a/libs/infinity_emb/pyproject.toml b/libs/infinity_emb/pyproject.toml index 1a9fc643..5b908024 100644 --- a/libs/infinity_emb/pyproject.toml +++ b/libs/infinity_emb/pyproject.toml @@ -29,6 +29,7 @@ ctranslate2 = {version = "^4.0.0", optional=true} optimum = {version = ">=1.16.2", 
optional=true, extras=["onnxruntime"]} hf_transfer = {version=">=0.1.5"} einops = {version = "*", optional=true} +pillow = {version = "*", optional=true} # cache diskcache = {version = "*", optional=true} # gpu @@ -74,8 +75,31 @@ einops=["einops"] logging=["rich"] cache=["diskcache"] vision=["pillow"] -server=["fastapi", "pydantic", "orjson", "prometheus-fastapi-instrumentator", "uvicorn", "typer","rich"] -all=["ctranslate2", "fastapi", "optimum", "orjson", "prometheus-fastapi-instrumentator", "pydantic", "rich", "sentence-transformers", "torch", "typer", "uvicorn","diskcache", "einops", "vision"] +server=[ + "fastapi", + "orjson", + "prometheus-fastapi-instrumentator", + "pydantic", + "rich" + "typer", + "uvicorn", +] +all=[ + "ctranslate2", + "diskcache", + "einops", + "fastapi", + "optimum", + "orjson", + "pillow", + "prometheus-fastapi-instrumentator", + "pydantic", + "rich", + "sentence-transformers", + "torch", + "typer", + "uvicorn", +] # non-default gpu tensorrt=["tensorrt"] onnxruntime-gpu=["onnxruntime-gpu"] From 40738043de5f90a7907ef4a7c4255adf962abee0 Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Sat, 8 Jun 2024 17:02:41 -0700 Subject: [PATCH 4/6] update pyproject --- libs/infinity_emb/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/infinity_emb/pyproject.toml b/libs/infinity_emb/pyproject.toml index 5b908024..b43e7197 100644 --- a/libs/infinity_emb/pyproject.toml +++ b/libs/infinity_emb/pyproject.toml @@ -80,7 +80,7 @@ server=[ "orjson", "prometheus-fastapi-instrumentator", "pydantic", - "rich" + "rich", "typer", "uvicorn", ] From d111e6ff2956fe955ae361bbde4ea9014c68f9b6 Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Sat, 8 Jun 2024 18:48:21 -0700 Subject: [PATCH 5/6] update typing --- .../infinity_emb/transformer/crossencoder/torch.py | 2 +- .../transformer/embedder/sentence_transformer.py | 2 +- libs/infinity_emb/tests/unit_test/inference/test_models.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libs/infinity_emb/infinity_emb/transformer/crossencoder/torch.py b/libs/infinity_emb/infinity_emb/transformer/crossencoder/torch.py index dfbd8d2c..638b5c99 100644 --- a/libs/infinity_emb/infinity_emb/transformer/crossencoder/torch.py +++ b/libs/infinity_emb/infinity_emb/transformer/crossencoder/torch.py @@ -38,7 +38,7 @@ def __init__(self, *, engine_args: EngineArgs): super().__init__( engine_args.model_name_or_path, revision=engine_args.revision, - device=engine_args.device.resolve(), + device=engine_args.device.resolve(), # type: ignore trust_remote_code=engine_args.trust_remote_code, ) self.model.to(self._target_device) # type: ignore diff --git a/libs/infinity_emb/infinity_emb/transformer/embedder/sentence_transformer.py b/libs/infinity_emb/infinity_emb/transformer/embedder/sentence_transformer.py index 41b3876b..245b333b 100644 --- a/libs/infinity_emb/infinity_emb/transformer/embedder/sentence_transformer.py +++ b/libs/infinity_emb/infinity_emb/transformer/embedder/sentence_transformer.py @@ -95,7 +95,7 @@ def encode_core(self, features: Mapping[str, "Tensor"]) -> "Tensor": """ with torch.no_grad(): - features = util.batch_to_device(features, self.device) + features = util.batch_to_device(features, self.device) # type: ignore out_features: "Tensor" = self.forward(features)["sentence_embedding"] return out_features.detach().cpu() diff --git a/libs/infinity_emb/tests/unit_test/inference/test_models.py b/libs/infinity_emb/tests/unit_test/inference/test_models.py index e3f649f1..13717c18 100644 --- 
a/libs/infinity_emb/tests/unit_test/inference/test_models.py +++ b/libs/infinity_emb/tests/unit_test/inference/test_models.py @@ -4,6 +4,7 @@ import copy import sys +from typing import Union import pytest import torch @@ -26,7 +27,7 @@ def _pretrained_model_score( ct2_compute_type: str = "", ): test_samples = dataset[::3] - + model: Union[SentenceTransformerPatched, CT2SentenceTransformer] if ct2_compute_type: model = CT2SentenceTransformer( engine_args=EngineArgs(model_name_or_path=model_name), @@ -44,7 +45,7 @@ def _pretrained_model_score( test_samples, name="sts-test" ) - score = model.evaluate(evaluator)["sts-test_spearman_cosine"] * 100 + score = model.evaluate(evaluator)["sts-test_spearman_cosine"] * 100 # type: ignore print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score)) assert score > expected_score or abs(score - expected_score) < 0.01 From 49663fa03508235aeeaa5726c2f8d7a86b63b452 Mon Sep 17 00:00:00 2001 From: michaelfeil Date: Sat, 8 Jun 2024 18:51:34 -0700 Subject: [PATCH 6/6] update poetry lock --- libs/infinity_emb/poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/infinity_emb/poetry.lock b/libs/infinity_emb/poetry.lock index 81f00fa5..67927f4f 100644 --- a/libs/infinity_emb/poetry.lock +++ b/libs/infinity_emb/poetry.lock @@ -4557,4 +4557,4 @@ vision = ["pillow"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "3256c9ea0b03caaa75894e463f07d47a7dac6158718d3854b2e56f247b6e9736" +content-hash = "8bc6998513bef7dec5ebfe2ea59d37246690d83d53c8dcc8d662d04fab28a607"
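A minimal usage sketch of the `image_embed` API introduced by this series, mirroring the `test_torch_clip_embed` unit test added in patch 1. It assumes the package's top-level exports `AsyncEmbeddingEngine` and `EngineArgs` and reuses the same TinyCLIP checkpoint and image URL as the test; it is illustrative only, not part of the patches.

    import asyncio

    import numpy as np

    from infinity_emb import AsyncEmbeddingEngine, EngineArgs


    async def main():
        # Checkpoint taken from the unit test in this series; any CLIP-like
        # model exposing get_text_features/get_image_features should work.
        engine = AsyncEmbeddingEngine.from_args(
            EngineArgs(
                model_name_or_path="wkcn/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M",
                model_warmup=False,
            )
        )
        async with engine:
            # text and image requests use the same batch handler and can be
            # scheduled concurrently; both return (embeddings, token_usage)
            text_emb, text_usage = await engine.embed(["a photo of two cats"])
            img_emb, img_usage = await engine.image_embed(
                images=["http://images.cocodataset.org/val2017/000000039769.jpg"]
            )
        print(np.array(text_emb).shape, np.array(img_emb).shape, text_usage, img_usage)


    asyncio.run(main())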