Add OTX deploy for visual prompting task #2311

Merged
4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -12,7 +12,9 @@ All notable changes to this project will be documented in this file.
- Add per-class XAI saliency maps for Mask R-CNN model (https://github.com/openvinotoolkit/training_extensions/pull/2227)
- Add new object detector Deformable DETR (<https://github.com/openvinotoolkit/training_extensions/pull/2249>)
- Add new object detector DINO (<https://github.com/openvinotoolkit/training_extensions/pull/2266>)
- Add new visual prompting task (https://github.com/openvinotoolkit/training_extensions/pull/2203), (https://github.com/openvinotoolkit/training_extensions/pull/2274)
- Add new visual prompting task: train/eval (https://github.com/openvinotoolkit/training_extensions/pull/2203)
- Add new visual prompting task: export (https://github.com/openvinotoolkit/training_extensions/pull/2274)
- Add new visual prompting task: deploy (https://github.com/openvinotoolkit/training_extensions/pull/2311)
- Add new object detector ResNeXt101-ATSS (<https://github.com/openvinotoolkit/training_extensions/pull/2309>)

### Enhancements
@@ -13,3 +13,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.

from .model_wrappers import * # noqa: F403
@@ -0,0 +1,18 @@
"""Wrapper Initialization of OTX Visual Prompting."""

# Copyright (C) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.

from .openvino_adapters import VisualPromptingOpenvinoAdapter # noqa: F401
from .openvino_models import Decoder, ImageEncoder # noqa: F401
@@ -0,0 +1,164 @@
"""Openvino Adapter Wrappers of OTX Visual Prompting.

There is a bug in the `fit_to_window` resize module of the model API.
VisualPromptingOpenvinoAdapter is a temporary workaround that uses the updated `fit_to_window` resize function.
It can be removed once the model API version used by OTX is upgraded.

Issue: https://github.com/openvinotoolkit/model_api/issues/99
Updated PR: https://github.com/openvinotoolkit/model_api/pull/100
"""

# Copyright (C) 2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

from functools import partial
from typing import Tuple

import numpy as np
import openvino.runtime as ov
from openvino.model_api.adapters import OpenvinoAdapter
from openvino.preprocess import ColorFormat, PrePostProcessor
from openvino.runtime import Output, Type
from openvino.runtime import opset10 as opset
from openvino.runtime.utils.decorators import custom_preprocess_function


def resize_image_with_aspect_pad(input: Output, size, keep_aspect_ratio, interpolation, pad_value):
"""https://github.com/openvinotoolkit/model_api/blob/0.1.3/model_api/python/openvino/model_api/adapters/utils.py#L273-L341."""
h_axis = 1
w_axis = 2
w, h = size

target_size = list(size)
target_size.reverse()

image_shape = opset.shape_of(input, name="shape")
iw = opset.convert(
opset.gather(image_shape, opset.constant(w_axis), axis=0),
destination_type="f32",
)
ih = opset.convert(
opset.gather(image_shape, opset.constant(h_axis), axis=0),
destination_type="f32",
)
w_ratio = opset.divide(np.float32(w), iw)
h_ratio = opset.divide(np.float32(h), ih)
scale = opset.minimum(w_ratio, h_ratio)
nw = opset.convert(opset.round(opset.multiply(iw, scale), "half_to_even"), destination_type="i32")
nh = opset.convert(opset.round(opset.multiply(ih, scale), "half_to_even"), destination_type="i32")
new_size = opset.concat([opset.unsqueeze(nh, 0), opset.unsqueeze(nw, 0)], axis=0)
image = opset.interpolate(
input,
new_size,
scales=np.array([0.0, 0.0], dtype=np.float32),
axes=[h_axis, w_axis],
mode=interpolation,
shape_calculation_mode="sizes",
)

dx_border = opset.subtract(opset.constant(w, dtype=np.int32), nw)
dy_border = opset.subtract(opset.constant(h, dtype=np.int32), nh)
pads_begin = np.array([0, 0, 0, 0], np.int32)
pads_end = opset.concat(
[
opset.constant([0], dtype=np.int32),
opset.unsqueeze(dy_border, 0),
opset.unsqueeze(dx_border, 0),
opset.constant([0], dtype=np.int32),
],
axis=0,
)
return opset.pad(
image,
pads_begin,
pads_end,
"constant",
opset.constant(pad_value, dtype=np.uint8),
)
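
For readers less familiar with the opset graph API, the function above computes the same result as this plain NumPy/OpenCV sketch (illustration only, not part of the diff; cv2.resize stands in for opset.interpolate):

import cv2
import numpy as np

def fit_to_window_reference(image: np.ndarray, size=(1024, 1024), pad_value=0) -> np.ndarray:
    # Resize so the image fits inside `size` while keeping aspect ratio,
    # then pad only on the bottom/right, mirroring pads_begin/pads_end above.
    w, h = size
    ih, iw = image.shape[:2]
    scale = min(w / iw, h / ih)                    # opset.minimum(w_ratio, h_ratio)
    nw, nh = round(iw * scale), round(ih * scale)  # Python round() is half-to-even, like "half_to_even"
    resized = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_LINEAR)
    padded = np.full((h, w) + image.shape[2:], pad_value, dtype=image.dtype)
    padded[:nh, :nw] = resized
    return padded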


def resize_image_with_aspect(size, interpolation, pad_value):
"""https://github.com/openvinotoolkit/model_api/blob/0.1.3/model_api/python/openvino/model_api/adapters/utils.py#L356-L365."""
return custom_preprocess_function(
partial(
resize_image_with_aspect_pad,
size=size,
keep_aspect_ratio=True,
interpolation=interpolation,
pad_value=pad_value,
)
)


class VisualPromptingOpenvinoAdapter(OpenvinoAdapter):
"""Openvino Adapter Wrappers of OTX Visual Prompting.

This class uses the fixed `fit_to_window` resize module.
It can be removed once the model API version used by OTX is upgraded.
"""

def embed_preprocessing(
self,
layout,
resize_mode: str,
interpolation_mode,
target_shape: Tuple[int],
pad_value,
dtype=type(int),
brg2rgb=False,
mean=None,
scale=None,
input_idx=0,
):
"""https://github.com/openvinotoolkit/model_api/blob/0.1.3/model_api/python/openvino/model_api/adapters/openvino_adapter.py#L340-L411."""
ppp = PrePostProcessor(self.model) # type: ignore[has-type]

# Change the input type to the 8-bit image
if dtype == type(int):
ppp.input(input_idx).tensor().set_element_type(Type.u8)

ppp.input(input_idx).tensor().set_layout(ov.Layout("NHWC")).set_color_format(ColorFormat.BGR)

INTERPOLATION_MODE_MAP = {
"LINEAR": "linear",
"CUBIC": "cubic",
"NEAREST": "nearest",
}

RESIZE_MODE_MAP = {"fit_to_window": resize_image_with_aspect}

# Handle resize
# Change to dynamic shape to handle various image size
# TODO: check the number of input channels and rank of input shape
if resize_mode and target_shape:
if resize_mode in RESIZE_MODE_MAP:
input_shape = [1, -1, -1, 3]
ppp.input(input_idx).tensor().set_shape(input_shape)
ppp.input(input_idx).preprocess().custom(
RESIZE_MODE_MAP[resize_mode](
target_shape,
INTERPOLATION_MODE_MAP[interpolation_mode],
pad_value,
)
)

else:
raise ValueError(f"Upsupported resize type in model preprocessing: {resize_mode}")

# Handle layout
ppp.input(input_idx).model().set_layout(ov.Layout(layout))

# Handle color format
if brg2rgb:
ppp.input(input_idx).preprocess().convert_color(ColorFormat.RGB)

ppp.input(input_idx).preprocess().convert_element_type(Type.f32)

if mean:
ppp.input(input_idx).preprocess().mean(mean)
if scale:
ppp.input(input_idx).preprocess().scale(scale)

self.model = ppp.build()
self.load_model()
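
A hypothetical usage sketch of the adapter (not part of the PR): the constructor arguments follow model API 0.1.x, and the import path is inferred from this diff's package layout, so both are assumptions.

from openvino.model_api.adapters import create_core

# Import path inferred from this PR's package layout (assumption).
from otx.algorithms.visual_prompting.adapters.openvino.model_wrappers import (
    VisualPromptingOpenvinoAdapter,
)

adapter = VisualPromptingOpenvinoAdapter(create_core(), "image_encoder.xml", device="CPU")
adapter.embed_preprocessing(
    layout="NCHW",
    resize_mode="fit_to_window",   # routed to resize_image_with_aspect above
    interpolation_mode="LINEAR",
    target_shape=(1024, 1024),
    pad_value=0,
    brg2rgb=True,                  # parameter name as spelled in model API 0.1.x
    mean=[123.675, 116.28, 103.53],
    scale=[58.395, 57.12, 57.375],
)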
@@ -1,4 +1,4 @@
"""Model Wrapper of OTX Visual Prompting."""
"""Openvino Model Wrappers of OTX Visual Prompting."""

# Copyright (C) 2023 Intel Corporation
#
@@ -14,16 +14,15 @@
# See the License for the specific language governing permissions
# and limitations under the License.

from typing import Any, Dict, Tuple
from copy import deepcopy
from typing import Any, Dict, List, Optional, Tuple, Union

import cv2
import numpy as np
from openvino.model_api.models import ImageModel
from openvino.model_api.models.types import NumericalValue
from openvino.model_api.adapters.inference_adapter import InferenceAdapter
from openvino.model_api.models import ImageModel, SegmentationModel
from openvino.model_api.models.types import NumericalValue, StringValue

from otx.algorithms.segmentation.adapters.openvino.model_wrappers.blur import (
BlurSegmentation,
)
from otx.api.utils.segmentation_utils import create_hard_prediction_from_soft_prediction


@@ -32,63 +31,93 @@ class ImageEncoder(ImageModel):

__model__ = "image_encoder"

def __init__(self, inference_adapter, configuration=None, preload=False):
super().__init__(inference_adapter, configuration, preload)

@classmethod
def parameters(cls) -> Dict[str, Any]: # noqa: D102
parameters = super().parameters()
parameters["resize_type"].default_value = "fit_to_window"
parameters["mean_values"].default_value = [123.675, 116.28, 103.53]
parameters["scale_values"].default_value = [58.395, 57.12, 57.375]
parameters.update(
{
"resize_type": StringValue(default_value="fit_to_window"),
}
)
return parameters

def preprocess(self, inputs: np.ndarray) -> Tuple[Dict[str, np.ndarray], Dict[str, Any]]:
"""Update meta for image encoder."""
dict_inputs, meta = super().preprocess(inputs)
meta["resize_type"] = self.resize_type
return dict_inputs, meta

class Decoder(BlurSegmentation):
"""Decoder class for visual prompting of openvino model wrapper.

TODO (sungchul): change parent class
"""
class Decoder(SegmentationModel):
"""Decoder class for visual prompting of openvino model wrapper."""

__model__ = "decoder"

def preprocess(self, bbox: np.ndarray, original_size: Tuple[int]) -> Dict[str, Any]:
"""Ready decoder inputs."""
point_coords = bbox.reshape((-1, 2, 2))
point_labels = np.array([2, 3], dtype=np.float32).reshape((-1, 2))
inputs_decoder = {
"point_coords": point_coords,
"point_labels": point_labels,
# TODO (sungchul): how to generate mask_input and has_mask_input
"mask_input": np.zeros((1, 1, 256, 256), dtype=np.float32),
"has_mask_input": np.zeros((1, 1), dtype=np.float32),
"orig_size": np.array(original_size, dtype=np.float32).reshape((-1, 2)),
}
return inputs_decoder
def __init__(
self,
model_adapter: InferenceAdapter,
configuration: Optional[dict] = None,
preload: bool = False,
):
super().__init__(model_adapter, configuration, preload)
self.output_blob_name = "low_res_masks"

@classmethod
def parameters(cls): # noqa: D102
parameters = super().parameters()
parameters.update({"image_size": NumericalValue(value_type=int, default_value=1024, min=0, max=2048)})
return parameters

def preprocess(self, inputs: Dict[str, Any], meta: Dict[str, Any]):
"""Preprocess prompts."""
processed_prompts = []
# TODO (sungchul): process points
for bbox, label in zip(inputs["bboxes"], inputs["labels"]):
# TODO (sungchul): add condition to check whether using bbox or point
point_coords = self._apply_coords(bbox.reshape(-1, 2, 2), inputs["original_size"])
point_labels = np.array([2, 3], dtype=np.float32).reshape((-1, 2))
processed_prompts.append(
{
"point_coords": point_coords,
"point_labels": point_labels,
# TODO (sungchul): how to generate mask_input and has_mask_input
"mask_input": np.zeros((1, 1, 256, 256), dtype=np.float32),
"has_mask_input": np.zeros((1, 1), dtype=np.float32),
"orig_size": np.array(inputs["original_size"], dtype=np.float32).reshape((-1, 2)),
"label": label,
}
)
return processed_prompts
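
For concreteness, a single box prompt is encoded as two labelled corner points (illustration only, not part of the diff; labels 2 and 3 are the SAM convention for the top-left and bottom-right box corners):

import numpy as np

bbox = np.array([40, 30, 200, 160], dtype=np.float32)             # x1, y1, x2, y2
point_coords = bbox.reshape(-1, 2, 2)                             # (1, 2, 2): two corner points
point_labels = np.array([2, 3], dtype=np.float32).reshape(-1, 2)  # box-corner labels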

def _apply_coords(self, coords: np.ndarray, original_size: Union[List[int], Tuple[int, int]]) -> np.ndarray:
"""Process coords according to preprocessed image size using image meta."""
old_h, old_w = original_size
new_h, new_w = self._get_preprocess_shape(original_size[0], original_size[1], self.image_size)
coords = deepcopy(coords).astype(np.float32)
coords[..., 0] = coords[..., 0] * (new_w / old_w)
coords[..., 1] = coords[..., 1] * (new_h / old_h)
return coords

def _get_preprocess_shape(self, old_h: int, old_w: int, image_size: int) -> Tuple[int, int]:
"""Compute the output size given input size and target image size."""
scale = image_size / max(old_h, old_w)
new_h, new_w = old_h * scale, old_w * scale
new_w = int(new_w + 0.5)
new_h = int(new_h + 0.5)
return (new_h, new_w)
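
A worked example of the two helpers above (illustration only): for a 480x640 image and image_size=1024, the longest side is scaled by 1024 / 640 = 1.6, and prompt coordinates are scaled by the same per-axis ratios.

scale = 1024 / max(480, 640)                                   # 1.6
new_h, new_w = int(480 * scale + 0.5), int(640 * scale + 0.5)  # (768, 1024)
x_new = 100 * (new_w / 640)                                    # x = 100 -> 160.0
y_new = 50 * (new_h / 480)                                     # y = 50  -> 80.0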

def _check_io_number(self, number_of_inputs, number_of_outputs):
pass

def _get_inputs(self):
"""Get input layer name and shape."""
image_blob_names = [name for name in self.inputs.keys()]
image_info_blob_names = []
return image_blob_names, image_info_blob_names

def _get_outputs(self):
"""Get output layer name and shape."""
layer_name = "low_res_masks"
layer_shape = self.outputs[layer_name].shape

if len(layer_shape) == 3:
self.out_channels = 0
elif len(layer_shape) == 4:
self.out_channels = layer_shape[1]
else:
raise Exception(f"Unexpected output layer shape {layer_shape}. Only 4D and 3D output layers are supported")

return layer_name

def postprocess(self, outputs: Dict[str, np.ndarray], meta: Dict[str, Any]) -> Tuple[np.ndarray, np.ndarray]:
"""Postprocess to convert soft prediction to hard prediction.

@@ -102,10 +131,10 @@ def postprocess(self, outputs: Dict[str, np.ndarray], meta: Dict[str, Any]) -> T
"""

def sigmoid(x):
return 1 / (1 + np.exp(-x))
return np.tanh(x * 0.5) * 0.5 + 0.5 # to avoid overflow

soft_prediction = outputs[self.output_blob_name].squeeze()
soft_prediction = self.resize_and_crop(soft_prediction, meta["original_size"])
soft_prediction = self.resize_and_crop(soft_prediction, meta["original_size"][0])
soft_prediction = sigmoid(soft_prediction)
meta["soft_prediction"] = soft_prediction

@@ -134,18 +163,18 @@ def resize_and_crop(self, soft_prediction: np.ndarray, original_size: np.ndarray
soft_prediction, (self.image_size, self.image_size), 0, 0, interpolation=cv2.INTER_LINEAR
)

prepadded_size = self.resize_longest_image_size(original_size, self.image_size).astype(np.int64)
prepadded_size = self.get_padded_size(original_size, self.image_size).astype(np.int64)
resized_cropped_soft_prediction = resized_soft_prediction[..., : prepadded_size[0], : prepadded_size[1]]

original_size = original_size.astype(np.int64)
h, w = original_size[0], original_size[1]
h, w = original_size
final_soft_prediction = cv2.resize(
resized_cropped_soft_prediction, (w, h), 0, 0, interpolation=cv2.INTER_LINEAR
)
return final_soft_prediction

def resize_longest_image_size(self, original_size: np.ndarray, longest_side: int) -> np.ndarray:
"""Resizes the longest side of the image to the given size.
def get_padded_size(self, original_size: np.ndarray, longest_side: int) -> np.ndarray:
"""Get padded size from original size and longest side of the image.

Args:
original_size (np.ndarray): The original image size with shape Bx2.
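
The body of `get_padded_size` is collapsed in this view; assuming it keeps the SAM-style rounding of the renamed `resize_longest_image_size`, the computation is (sketch under that assumption, not part of the diff):

import numpy as np

original_size = np.array([480, 640])
longest_side = 1024
scale = longest_side / np.max(original_size)                     # 1.6
padded = np.floor(original_size * scale + 0.5).astype(np.int64)  # [768, 1024]
# The low-res mask is cropped to this size before the final resize back to 480x640.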