From d1bd1d5d04d62cdcf57ff270ba2a3a33f52a333f Mon Sep 17 00:00:00 2001
From: Yunchu Lee
Date: Tue, 20 Aug 2024 09:16:45 +0900
Subject: [PATCH 01/53] update for releases 2.2.0rc0

---
 CHANGELOG.md                              |  2 +-
 README.md                                 | 89 ++++++-----------------
 docs/source/guide/release_notes/index.rst | 35 +++++++++
 3 files changed, 57 insertions(+), 69 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5917a13519e..460454bad2a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@

 All notable changes to this project will be documented in this file.

-## \[unreleased\]
+## \[2.2.0\]

 ### New features

diff --git a/README.md b/README.md
index 4f81c83d28b..f678fd9ecd5 100644
--- a/README.md
+++ b/README.md
@@ -166,83 +166,36 @@ In addition to the examples above, please refer to the documentation for tutoria

 ---

-## Updates
-
-### v2.1.0 (3Q24)
-
-> _**NOTES**_
->
-> OpenVINO™ Training Extensions, version 2.1.0 does not include the latest functional and security updates. OpenVINO™ Training Extensions, version 2.2.0 is targeted to be released in September 2024 and will include additional functional and security updates. Customers should update to the latest version as it becomes available.
+## Updates - v2.2.0 (3Q24)

 ### New features

-- Add a flag to enable OV inference on dGPU
-- Add early stopping with warmup. Remove mandatory background label in semantic segmentation task
-- RTMDet-tiny enablement for detection task
-- Add data_format validation and update in OTXDataModule
-- Add torchvision.MaskRCNN
-- Add Semi-SL for Multi-class Classification (EfficientNet-B0)
-- Decoupling mmaction for action classification (MoviNet, X3D)
-- Add Semi-SL Algorithms for mv3-large, effnet-v2, deit-tiny, dino-v2
-- RTMDet-tiny enablement for detection task (export/optimize)
-- Enable ruff & ruff-format into otx/algo/classification/backbones
-- Add TV MaskRCNN Tile Recipe
-- Add rotated det OV recipe
+- Add RT-DETR model for Object Detection
+- Add Multi-Label & H-label Classification with torchvision models
+- Add Hugging-Face Model Wrapper for Classification
+- Add LoRA finetuning capability for ViT Architectures
+- Add Hugging-Face Model Wrapper for Object Detection
+- Add Hugging-Face Model Wrapper for Semantic Segmentation
+- Enable torch.compile to work with classification
+- Add `otx benchmark` subcommand
+- Add RTMPose for Keypoint Detection Task
+- Add Semi-SL MeanTeacher algorithm for Semantic Segmentation
+- Update head and h-label format for hierarchical label classification
+- Support configurable input size

 ### Enhancements

-- Change load_stat_dict to on_load_checkpoint
-- Add try - except to keep running the remaining tests
-- Update instance_segmentation.py to resolve conflict with 2.0.0
-- Update XPU install
-- Sync rgb order between torch and ov inference of action classification task
-- Make Perf test available to load pervious Perf test to skip training stage
-- Reenable e2e classification XAI tests
-- Remove action detection task support
-- Increase readability of pickling error log during HPO & fix minor bug
-- Update RTMDet checkpoint url
-- Refactor Torchvision Model for Classification Semi-SL
-- Add coverage omit mm-related code
-- Add docs semi-sl part
-- Refactor docs design & Add contents
-- Add execution example of auto batch size in docs
-- Add Semi-SL for cls Benchmark Test
-- Move value to device before logging for metric
-- Add .codecov.yaml
-- Update benchmark tool for otx2.1
-- Collect pretrained weight binary files in one place
-- Minimize compiled dependency files
-- Update README & CODEOWNERS
-- Update Engine's docstring & CLI --help outputs
-- Align integration test to exportable code interface update for release branch
-- Refactor exporter for anomaly task and fix a bug with exportable code
-- Update pandas version constraint
-- Include more models to export test into test_otx_e2e
-- Move assigning tasks to Models from Engine to Anomaly Model Classes
-- Refactoring detection modules
+- Reimplement of ViT Architecture following TIMM
+- Enable to override data configurations
+- Enable to use input_size at transforms in recipe
+- Enable to use polygon and bitmap mask as prompt inputs for zero-shot learning
+- Refactoring `ConvModule` by removing `conv_cfg`, `norm_cfg`, and `act_cfg`

 ### Bug fixes

-- Fix conflicts between develop and 2.0.0
-- Fix polygon mask
-- Fix vpm intg test error
-- Fix anomaly
-- Bug fix in Semantic Segmentation + enable DINOV2 export in ONNX
-- Fix some export issues. Remove EXPORTABLE_CODE as export parameter.
-- Fix `load_from_checkpoint` to apply original model's hparams
-- Fix `load_from_checkpoint` args to apply original model's hparams
-- Fix zero-shot `learn` for ov model
-- Various fixes for XAI in 2.1
-- Fix tests to work in a mm-free environment
-- Fix a bug in benchmark code
-- Update exportable code dependency & fix a bug
-- Fix getting wrong shape during resizing
-- Fix detection prediction outputs
-- Fix RTMDet PTQ performance
-- Fix segmentation fault on VPM PTQ
-- Fix NNCF MaskRCNN-Eff accuracy drop
-- Fix optimize with Semi-SL data pipeline
-- Fix MaskRCNN SwinT NNCF Accuracy Drop
+- Fix Combined Dataloader & unlabeled warmup loss in Semi-SL
+- Revert #3579 to fix issues with replacing coco_instance with a different format in some dataset
+- Add num_devices in Engine for multi-gpu training

 ### Known issues

diff --git a/docs/source/guide/release_notes/index.rst b/docs/source/guide/release_notes/index.rst
index c074a39a4f2..1871c1b9438 100644
--- a/docs/source/guide/release_notes/index.rst
+++ b/docs/source/guide/release_notes/index.rst
@@ -4,6 +4,41 @@ Releases

 .. toctree::
    :maxdepth: 1

+v2.2.0 (2024.09)
+----------------
+
+New features
+^^^^^^^^^^^^
+
+- Add RT-DETR model for Object Detection
+- Add Multi-Label & H-label Classification with torchvision models
+- Add Hugging-Face Model Wrapper for Classification
+- Add LoRA finetuning capability for ViT Architectures
+- Add Hugging-Face Model Wrapper for Object Detection
+- Add Hugging-Face Model Wrapper for Semantic Segmentation
+- Enable torch.compile to work with classification
+- Add `otx benchmark` subcommand
+- Add RTMPose for Keypoint Detection Task
+- Add Semi-SL MeanTeacher algorithm for Semantic Segmentation
+- Update head and h-label format for hierarchical label classification
+- Support configurable input size
+
+Enhancements
+^^^^^^^^^^^^
+
+- Reimplement of ViT Architecture following TIMM
+- Enable to override data configurations
+- Enable to use input_size at transforms in recipe
+- Enable to use polygon and bitmap mask as prompt inputs for zero-shot learning
+- Refactoring `ConvModule` by removing `conv_cfg`, `norm_cfg`, and `act_cfg`
+
+Bug fixes
+^^^^^^^^^
+
+- Fix Combined Dataloader & unlabeled warmup loss in Semi-SL
+- Revert #3579 to fix issues with replacing coco_instance with a different format in some dataset
+- Add num_devices in Engine for multi-gpu training
+
 v2.1.0 (2024.07)
 ----------------

From c16f9854fda8c212415730f513c18bb2234e0f1e Mon Sep 17 00:00:00 2001
From: Harim Kang
Date: Wed, 21 Aug 2024 11:37:05 +0900
Subject: [PATCH 02/53] Fix Classification explain forward issue (#3867)

Fix bug
---
 src/otx/algo/classification/classifier/base_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/otx/algo/classification/classifier/base_classifier.py b/src/otx/algo/classification/classifier/base_classifier.py
index b1c8daaf037..bc8a43195af 100644
--- a/src/otx/algo/classification/classifier/base_classifier.py
+++ b/src/otx/algo/classification/classifier/base_classifier.py
@@ -190,7 +190,7 @@ def _forward_explain(self, images: torch.Tensor) -> dict[str, torch.Tensor | lis
         logits = self.head(x)
         pred_results = self.head._get_predictions(logits)  # noqa: SLF001
         scores = pred_results.unbind(0)
-        preds = logits.argmax(-1, keepdim=True).unbind(0)
+        preds = pred_results.argmax(-1, keepdim=True).unbind(0)

         outputs = {
             "logits": logits,

From cba512088e15e69f37f078eb23d99d77490cdc8d Mon Sep 17 00:00:00 2001
From: Emily Chun
Date: Wed, 21 Aug 2024 19:29:40 +0900
Subject: [PATCH 03/53] Fix e2e code error (#3871)

* Update test_cli.py

* Update tests/e2e/cli/test_cli.py

Co-authored-by: Eunwoo Shin

* Update test_cli.py

* Update test_cli.py

---------

Co-authored-by: Eunwoo Shin
---
 tests/e2e/cli/test_cli.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tests/e2e/cli/test_cli.py b/tests/e2e/cli/test_cli.py
index bc9f8459074..07ac0daf103 100644
--- a/tests/e2e/cli/test_cli.py
+++ b/tests/e2e/cli/test_cli.py
@@ -133,13 +133,16 @@ def test_otx_e2e_cli(
     assert (latest_dir / "csv").exists()

     # 3) otx export
-    fxt_export_list = []
-    if task in ("visual_prompting", "zero_shot_visual_prompting"):
-        fxt_export_list.append(ExportCase2Test("ONNX", False, "exported_model_decoder.onnx"))
-        fxt_export_list.append(ExportCase2Test("OPENVINO", False, "exported_model_decoder.xml"))
-    elif "anomaly" in task or "keypoint_detection" in task:
-        fxt_export_list.append(ExportCase2Test("ONNX", False, "exported_model.onnx"))
-        fxt_export_list.append(ExportCase2Test("OPENVINO", False, "exported_model.xml"))
+    if task in (OTXTaskType.VISUAL_PROMPTING, OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING):
+        fxt_export_list = [
+            ExportCase2Test("ONNX", False, "exported_model_decoder.onnx"),
+            ExportCase2Test("OPENVINO", False, "exported_model_decoder.xml"),
+        ]
+    elif "ANOMALY" in task or OTXTaskType.KEYPOINT_DETECTION in task:
+        fxt_export_list = [
+            ExportCase2Test("ONNX", False, "exported_model.onnx"),
+            ExportCase2Test("OPENVINO", False, "exported_model.xml"),
+        ]

     overrides = fxt_cli_override_command_per_task[task]
     if "anomaly" in task:
@@ -182,7 +185,7 @@ def test_otx_e2e_cli(
             msg = "There is no OV IR."
             raise RuntimeError(msg)
         exported_model_path = str(ov_files[0])
-    if task in ("visual_prompting", "zero_shot_visual_prompting"):
+    if task in (OTXTaskType.VISUAL_PROMPTING, OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING):
         recipe = str(Path(recipe).parents[0] / "openvino_model.yaml")

     overrides = fxt_cli_override_command_per_task[task]

From b807c9d13f08e271f8b8511400b51ed1b3887dab Mon Sep 17 00:00:00 2001
From: Eunwoo Shin
Date: Thu, 22 Aug 2024 01:31:38 +0900
Subject: [PATCH 04/53] Add documentation about configurable input size (#3870)

* add docs about configurable input size

* update api usecase and fix bug
---
 .../configurable_input_size.rst               | 116 ++++++++++++++++++
 .../explanation/additional_features/hpo.rst   |   8 +-
 .../explanation/additional_features/index.rst |   1 +
 src/otx/engine/engine.py                      |   3 +-
 tests/unit/engine/test_engine.py              |   8 ++
 5 files changed, 134 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/guide/explanation/additional_features/configurable_input_size.rst

diff --git a/docs/source/guide/explanation/additional_features/configurable_input_size.rst b/docs/source/guide/explanation/additional_features/configurable_input_size.rst
new file mode 100644
index 00000000000..eb36879d0e4
--- /dev/null
+++ b/docs/source/guide/explanation/additional_features/configurable_input_size.rst
@@ -0,0 +1,116 @@
+Configurable Input Size
+=======================
+
+The Configurable Input Size feature allows users to adjust the input resolution of their deep learning models
+to balance between training and inference speed and model performance.
+This flexibility enables users to tailor the input size to their specific needs without manually altering
+the data pipeline configurations.
+
+To utilize this feature, simply specify the desired input size as an argument during the train command.
+Additionally, OTX ensures compatibility with model trained on non-default input sizes by automatically adjusting
+the data pipeline to match the input size during other engine entry points.
+
+Usage example:
+
+.. code-block::
+
+    $ otx train \
+    --config ... \
+
+.. tab-set::
+
+    .. tab-item:: API 1
+
+        .. code-block:: python
+
+            from otx.algo.detection.yolox import YOLOXS
+            from otx.core.data.module import OTXDataModule
+            from otx.engine import Engine
+
+            input_size = (512, 512)
+            model = YOLOXS(label_info=5, input_size=input_size)  # should be tuple[int, int]
+            datamodule = OTXDataModule(..., input_size=input_size)
+            engine = Engine(model=model, datamodule=datamodule)
+            engine.train()

+    .. tab-item:: API 2
+
+        .. code-block:: python
+
+            from otx.core.data.module import OTXDataModule
+            from otx.engine import Engine
+
+            datamodule = OTXDataModule(..., input_size=(512, 512))
+            engine = Engine(model="yolox_s", datamodule=datamodule)  # model input size will be aligned with the datamodule input size
+            engine.train()
+
+    .. tab-item:: CLI
+
+        .. code-block:: bash
+
+            (otx) ...$ otx train ... --data.input_size 512
+
+.. _adaptive-input-size:
+
+Adaptive Input Size
+-------------------
+
+The Adaptive Input Size feature intelligently determines an optimal input size for the model
+by analyzing the dataset's statistics.
+It operates in two distinct modes: "auto" and "downscale".
+In "auto" mode, the input size may increase or decrease based on the dataset's characteristics.
+In "downscale" mode, the input size will either decrease or remain unchanged, ensuring that the model training or inference speed doesn't drop.
+
+To activate this feature, use the following command with the desired mode:
+
+.. tab-set::
+
+    .. tab-item:: API
+
+        .. code-block:: python
+
+            from otx.algo.detection.yolox import YOLOXS
+            from otx.core.data.module import OTXDataModule
+            from otx.engine import Engine
+
+            datamodule = OTXDataModule(
+                ...
+                adaptive_input_size="auto",  # auto or downscale
+                input_size_multiplier=YOLOXS.input_size_multiplier,  # should set the input_size_multiplier of the model
+            )
+            model = YOLOXS(label_info=5, input_size=datamodule.input_size)
+            engine = Engine(model=model, datamodule=datamodule)
+            engine.train()
+
+    .. tab-item:: CLI
+
+        .. code-block:: bash
+
+            (otx) ...$ otx train ... --data.adaptive_input_size "auto | downscale"
+
+The adaptive process includes the following steps:
+
+1. OTX computes robust statistics from the input dataset.
+
+2. The initial input size is set based on the typical large image size within the dataset.
+
+3. (Optional) The input size may be further refined to account for the sizes of objects present in the dataset.
+   The model's minimum recognizable object size, typically ranging from 16x16 to 32x32 pixels, serves as a reference to
+   proportionally adjust the input size relative to the average small object size observed in the dataset.
+   For instance, if objects are generally 64x64 pixels in a 512x512 image, the input size would be adjusted
+   to 256x256 to maintain detectability.
+
+   Adjustments are subject to the following constraints:
+
+   * If the recalculated input size exceeds the maximum image size determined in the previous step, it will be capped at that maximum size.
+   * If the recalculated input size falls below the minimum threshold defined by MIN_DETECTION_INPUT_SIZE, the input size will be scaled up. This is done by increasing the smaller dimension (width or height) to MIN_DETECTION_INPUT_SIZE while maintaining the aspect ratio, ensuring that the model's minimum criteria for object detection are met.
+
+4. (downscale only) Any scale-up beyond the default model input size is restricted.
+
+.. Note::
+    Opting for a smaller input size can be advantageous for datasets with lower-resolution images or larger objects,
+    as it may improve speed with minimal impact on model accuracy. However, it is important to consider that selecting
+    a smaller input size could affect model performance depending on the task, model architecture, and dataset
+    properties.
diff --git a/docs/source/guide/explanation/additional_features/hpo.rst b/docs/source/guide/explanation/additional_features/hpo.rst
index 0304ef8c02d..5a40051197c 100644
--- a/docs/source/guide/explanation/additional_features/hpo.rst
+++ b/docs/source/guide/explanation/additional_features/hpo.rst
@@ -143,10 +143,16 @@ Here is explanation of all HPO configuration.

 - **mode** (*str*, *default='max'*) - Optimization mode for the metric. It determines whether the metric should be maximized or minimized. The possible values are 'max' and 'min', respectively.

-- **num_workers** (*int*, *default=1*) How many trials will be executed in parallel.
+- **num_trials** (*int*, *default=None*) The number of training trials to perform during HPO. If not provided, the number of trials will be determined based on the expected time ratio. Defaults to None.
+
+- **num_workers** (*int*, *default=None*) The number of trials that will be run concurrently.

 - **expected_time_ratio** (*int*, *default=4*) How many times to use for HPO compared to training time.

+- **metric_name** (*str*, *default=None*) The name of the performance metric to be optimized during HPO. If not specified, the metric will be selected based on the configured callbacks. Defaults to None.
+
+- **adapt_bs_search_space_max_val** (*Literal["None", "Safe", "Full"]*, *default="None"*) Whether to execute `Auto-adapt batch size` prior to HPO. This step finds the maximum batch size value, which then serves as the upper limit for the batch size search space during HPO. For further information on `Auto-adapt batch size`, please refer to the `Auto-configuration` documentation. Defaults to "None".
+
 - **maximum_resource** (*int*, *default=None*) - Maximum number of training epochs for each trial. When the training epochs reaches this value, the trial stop to train.

 - **minimum_resource** (*int*, *default=None*) - Minimum number of training epochs for each trial. Each trial will run at least this epochs, even if the performance of the model is not improving.
diff --git a/docs/source/guide/explanation/additional_features/index.rst b/docs/source/guide/explanation/additional_features/index.rst
index f0e7f1f370d..8ec0b87deef 100644
--- a/docs/source/guide/explanation/additional_features/index.rst
+++ b/docs/source/guide/explanation/additional_features/index.rst
@@ -14,3 +14,4 @@ Additional Features
    fast_data_loading
    tiling
    class_incremental_sampler
+   configurable_input_size
diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py
index 4954105002c..47647caf7d6 100644
--- a/src/otx/engine/engine.py
+++ b/src/otx/engine/engine.py
@@ -143,7 +143,8 @@ def __init__(
         get_model_args: dict[str, Any] = {}
         if self._datamodule is not None:
             get_model_args["label_info"] = self._datamodule.label_info
-            get_model_args["input_size"] = self._datamodule.input_size
+            if (input_size := self._datamodule.input_size) is not None:
+                get_model_args["input_size"] = (input_size, input_size) if isinstance(input_size, int) else input_size
         self._model: OTXModel = (
             model if isinstance(model, OTXModel) else self._auto_configurator.get_model(**get_model_args)
         )
diff --git a/tests/unit/engine/test_engine.py b/tests/unit/engine/test_engine.py
index db52a3f871e..879987f19cc 100644
--- a/tests/unit/engine/test_engine.py
+++ b/tests/unit/engine/test_engine.py
@@ -72,6 +72,14 @@ def test_model_init(self, tmp_path, mock_datamodule):
         assert engine._model.input_size == (1234, 1234)
         assert engine._model.label_info.num_classes == 4321

+    def test_model_init_datamodule_ipt_size_int(self, tmp_path, mock_datamodule):
+        mock_datamodule.input_size = 1234
+        data_root = "tests/assets/classification_dataset"
+        engine = Engine(work_dir=tmp_path, data_root=data_root)
+
+        assert engine._model.input_size == (1234, 1234)
+        assert engine._model.label_info.num_classes == 4321
+
     def test_model_setter(self, fxt_engine, mocker) -> None:
         assert isinstance(fxt_engine.model, TVModelForMulticlassCls)
         fxt_engine.model = "efficientnet_b0"

From 2835aba7d7066e0c8bf1958442a52f4c05280aab Mon Sep 17 00:00:00 2001
From: "Kim, Sungchul"
Date: Fri, 23 Aug 2024 12:56:45 +0900
Subject: [PATCH 05/53] Fix zero-shot e2e (#3876) Fix --- src/otx/algo/visual_prompting/sam.py | 82 ++++++++++---------- src/otx/core/exporter/visual_prompting.py | 37 +++++---- tests/unit/algo/visual_prompting/test_sam.py | 58 +++++++------- 3 files changed, 95 insertions(+), 82 deletions(-) diff --git a/src/otx/algo/visual_prompting/sam.py b/src/otx/algo/visual_prompting/sam.py index 0aab3a3d857..e691acf60e2 100644 --- a/src/otx/algo/visual_prompting/sam.py +++ b/src/otx/algo/visual_prompting/sam.py @@ -109,6 +109,47 @@ def freeze_networks( for param in self.model.mask_decoder.parameters(): param.requires_grad = not freeze_mask_decoder + @torch.no_grad() + def forward_for_tracing( + self, + image_embeddings: Tensor, + point_coords: Tensor, + point_labels: Tensor, + mask_input: Tensor, + has_mask_input: Tensor, + ori_shape: Tensor, + ) -> tuple[Tensor, ...]: + """Forward method for SAM inference (export/deploy). + + Args: + image_embeddings (Tensor): The image embedding with a batch index of length 1. + If it is a zero tensor, the image embedding will be computed from the image. + point_coords (Tensor): Coordinates of sparse input prompts, + corresponding to both point inputs and box inputs. + Boxes are encoded using two points, one for the top-left corner and one for the bottom-right corner. + Coordinates must already be transformed to long-side 1024. Has a batch index of length 1. + point_labels (Tensor): Labels for the sparse input prompts. + 0 is a negative input point, 1 is a positive input point, + 2 is a top-left box corner, 3 is a bottom-right box corner, and -1 is a padding point. + If there is no box input, a single padding point with label -1 and + coordinates (0.0, 0.0) should be concatenated. + mask_input (Tensor): A mask input to the model with shape 1x1x256x256. + This must be supplied even if there is no mask input. In this case, it can just be zeros. + has_mask_input (Tensor): An indicator for the mask input. + 1 indicates a mask input, 0 indicates no mask input. + This input has 1x1 shape due to supporting openvino input layout. + ori_shape (Tensor): The size of the input image in (H,W) format, before any transformation. + This input has 1x2 shape due to supporting openvino input layout. + """ + return self.model.forward_for_tracing( + image_embeddings=image_embeddings, + point_coords=point_coords, + point_labels=point_labels, + mask_input=mask_input, + has_mask_input=has_mask_input, + ori_shape=ori_shape, + ) + class SAM(OTXVisualPromptingModel, CommonSettingMixin): """OTX visual prompting model class for Segment Anything Model (SAM).""" @@ -177,47 +218,6 @@ def _build_model(self) -> nn.Module: stability_score_offset=self.stability_score_offset, ) - @torch.no_grad() - def forward_for_tracing( - self, - image_embeddings: Tensor, - point_coords: Tensor, - point_labels: Tensor, - mask_input: Tensor, - has_mask_input: Tensor, - ori_shape: Tensor, - ) -> tuple[Tensor, ...]: - """Forward method for SAM inference (export/deploy). - - Args: - image_embeddings (Tensor): The image embedding with a batch index of length 1. - If it is a zero tensor, the image embedding will be computed from the image. - point_coords (Tensor): Coordinates of sparse input prompts, - corresponding to both point inputs and box inputs. - Boxes are encoded using two points, one for the top-left corner and one for the bottom-right corner. - Coordinates must already be transformed to long-side 1024. Has a batch index of length 1. - point_labels (Tensor): Labels for the sparse input prompts. 
- 0 is a negative input point, 1 is a positive input point, - 2 is a top-left box corner, 3 is a bottom-right box corner, and -1 is a padding point. - If there is no box input, a single padding point with label -1 and - coordinates (0.0, 0.0) should be concatenated. - mask_input (Tensor): A mask input to the model with shape 1x1x256x256. - This must be supplied even if there is no mask input. In this case, it can just be zeros. - has_mask_input (Tensor): An indicator for the mask input. - 1 indicates a mask input, 0 indicates no mask input. - This input has 1x1 shape due to supporting openvino input layout. - ori_shape (Tensor): The size of the input image in (H,W) format, before any transformation. - This input has 1x2 shape due to supporting openvino input layout. - """ - return self.model.forward_for_tracing( - image_embeddings=image_embeddings, - point_coords=point_coords, - point_labels=point_labels, - mask_input=mask_input, - has_mask_input=has_mask_input, - ori_shape=ori_shape, - ) - class ZeroShotSAM(OTXZeroShotVisualPromptingModel, CommonSettingMixin): """Zero-Shot Visual Prompting model.""" diff --git a/src/otx/core/exporter/visual_prompting.py b/src/otx/core/exporter/visual_prompting.py index beba5899654..12f2820c887 100644 --- a/src/otx/core/exporter/visual_prompting.py +++ b/src/otx/core/exporter/visual_prompting.py @@ -49,13 +49,6 @@ def export( # type: ignore[override] Returns: dict[str, Path]: paths to the exported models """ - # NOTE: Rather than using OTXModel.forward_for_tracing() - # Use the nested `image_encoder` and `decoder` models' forward functions directly - models: dict[str, torch.nn.Module] = { - "image_encoder": model.model.image_encoder, - "decoder": model, - } - if export_format == OTXExportFormatType.OPENVINO: if to_exportable_code: msg = "Exportable code option is not supported and will be ignored." 
@@ -67,11 +60,29 @@ def export( # type: ignore[override] msg = f"Unsupported export format: {export_format}" raise ValueError(msg) - return { # type: ignore[return-value] - module: fn(models[module], output_dir, f"{base_model_name}_{module}", precision, model_type=f"sam_{module}") - for module in ["image_encoder", "decoder"] + models: dict[str, torch.nn.Module] = { + "image_encoder": model.model.image_encoder, + "decoder": model.model, } + orig_decoder_forward = models["decoder"].forward + try: + models["decoder"].forward = models["decoder"].forward_for_tracing + + return { # type: ignore[return-value] + module: fn( + models[module], + output_dir, + f"{base_model_name}_{module}", + precision, + model_type=f"sam_{module}", + ) + for module in ["image_encoder", "decoder"] + } + + finally: + models["decoder"].forward = orig_decoder_forward + def to_openvino( self, model: OTXModel | torch.nn.Module, @@ -170,8 +181,8 @@ def get_onnx_dummy_inputs( dummy_inputs = { "image_embeddings": torch.zeros( 1, - model.model.prompt_encoder.embed_dim, - *model.model.prompt_encoder.image_embedding_size, + model.prompt_encoder.embed_dim, + *model.prompt_encoder.image_embedding_size, dtype=torch.float32, ), "point_coords": torch.randint(low=0, high=self.input_size[0], size=(1, 2, 2), dtype=torch.float32), @@ -179,7 +190,7 @@ def get_onnx_dummy_inputs( "mask_input": torch.randn( 1, 1, - *(4 * size for size in model.model.prompt_encoder.image_embedding_size), + *(4 * size for size in model.prompt_encoder.image_embedding_size), dtype=torch.float32, ), "has_mask_input": torch.tensor([[1]], dtype=torch.float32), diff --git a/tests/unit/algo/visual_prompting/test_sam.py b/tests/unit/algo/visual_prompting/test_sam.py index c88aa279d5d..94f4f1fc3d9 100644 --- a/tests/unit/algo/visual_prompting/test_sam.py +++ b/tests/unit/algo/visual_prompting/test_sam.py @@ -94,6 +94,36 @@ def __init__(self): for param in mock_model.mask_decoder.parameters(): assert param.requires_grad != freeze_mask_decoder + def test_forward_for_tracing(self, mocker) -> None: + mixin = CommonSettingMixin() + mixin.model = mock.Mock() + mock_forward_for_tracing = mocker.patch.object(mixin.model, "forward_for_tracing") + + image_embeddings = torch.zeros((1, 256, 64, 64)) + point_coords = torch.zeros((1, 10, 2)) + point_labels = torch.zeros((1, 10)) + mask_input = torch.zeros((1, 1, 256, 256)) + has_mask_input = torch.zeros((1, 1)) + ori_shape = torch.zeros((1, 2)) + + mixin.forward_for_tracing( + image_embeddings=image_embeddings, + point_coords=point_coords, + point_labels=point_labels, + mask_input=mask_input, + has_mask_input=has_mask_input, + ori_shape=ori_shape, + ) + + mock_forward_for_tracing.assert_called_once_with( + image_embeddings=image_embeddings, + point_coords=point_coords, + point_labels=point_labels, + mask_input=mask_input, + has_mask_input=has_mask_input, + ori_shape=ori_shape, + ) + class TestSAM: @pytest.fixture() @@ -128,34 +158,6 @@ def test_build_model(self, sam: SAM) -> None: assert isinstance(segment_anything.mask_decoder, SAMMaskDecoder) assert isinstance(segment_anything.criterion, SAMCriterion) - def test_forward_for_tracing(self, mocker, sam) -> None: - mock_forward_for_tracing = mocker.patch.object(sam.model, "forward_for_tracing") - - image_embeddings = torch.zeros((1, 256, 64, 64)) - point_coords = torch.zeros((1, 10, 2)) - point_labels = torch.zeros((1, 10)) - mask_input = torch.zeros((1, 1, 256, 256)) - has_mask_input = torch.zeros((1, 1)) - ori_shape = torch.zeros((1, 2)) - - sam.forward_for_tracing( - 
image_embeddings=image_embeddings, - point_coords=point_coords, - point_labels=point_labels, - mask_input=mask_input, - has_mask_input=has_mask_input, - ori_shape=ori_shape, - ) - - mock_forward_for_tracing.assert_called_once_with( - image_embeddings=image_embeddings, - point_coords=point_coords, - point_labels=point_labels, - mask_input=mask_input, - has_mask_input=has_mask_input, - ori_shape=ori_shape, - ) - class TestZeroShotSAM: @pytest.fixture() From ccf2d508738e24d3c309a9ce40a792fad6228d99 Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Fri, 23 Aug 2024 14:24:09 +0900 Subject: [PATCH 06/53] Fix DeiT for multi-label classification (#3881) Remove init_args --- src/otx/algo/classification/vit.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py index 6fcc77b6cbe..46c4e030d8d 100644 --- a/src/otx/algo/classification/vit.py +++ b/src/otx/algo/classification/vit.py @@ -373,10 +373,6 @@ def _create_model(self) -> nn.Module: return model def _build_model(self, num_classes: int) -> nn.Module: - init_cfg = [ - {"std": 0.2, "layer": "Linear", "type": "TruncNormal"}, - {"bias": 0.0, "val": 1.0, "layer": "LayerNorm", "type": "Constant"}, - ] vit_backbone = VisionTransformer(arch=self.arch, img_size=self.input_size, lora=self.lora) return ImageClassifier( backbone=vit_backbone, @@ -386,7 +382,6 @@ def _build_model(self, num_classes: int) -> nn.Module: in_channels=vit_backbone.embed_dim, ), loss=AsymmetricAngularLossWithIgnore(gamma_pos=0.0, gamma_neg=1.0, reduction="sum"), - init_cfg=init_cfg, ) From e577b6a2b62cb97f85020007c74a3e7fc3f883ba Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Fri, 23 Aug 2024 18:51:01 +0900 Subject: [PATCH 07/53] Fix Semi-SL for ViT accuracy drop (#3883) Remove init_args --- src/otx/algo/classification/vit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/otx/algo/classification/vit.py b/src/otx/algo/classification/vit.py index 46c4e030d8d..55beb76deff 100644 --- a/src/otx/algo/classification/vit.py +++ b/src/otx/algo/classification/vit.py @@ -290,7 +290,6 @@ def _build_model(self, num_classes: int) -> nn.Module: in_channels=vit_backbone.embed_dim, ), loss=nn.CrossEntropyLoss(reduction="none"), - init_cfg=init_cfg, ) return ImageClassifier( From d1dd2b02c17a6ff12972f1d6334fab4cb245657a Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Fri, 23 Aug 2024 19:44:22 +0900 Subject: [PATCH 08/53] Update docs for 2.2 (#3884) Update docs --- .../algorithms/action/action_detection.rst | 47 --- .../explanation/algorithms/action/index.rst | 1 - .../source/guide/get_started/cli_commands.rst | 4 +- .../source/guide/get_started/installation.rst | 2 +- .../source/guide/tutorials/advanced/index.rst | 2 + .../advanced/low_rank_adaptation.rst | 39 +++ .../tutorials/advanced/torch_compile.rst | 41 +++ .../base/how_to_train/action_detection.rst | 275 ------------------ .../tutorials/base/how_to_train/index.rst | 8 - 9 files changed, 85 insertions(+), 334 deletions(-) delete mode 100644 docs/source/guide/explanation/algorithms/action/action_detection.rst create mode 100644 docs/source/guide/tutorials/advanced/low_rank_adaptation.rst create mode 100644 docs/source/guide/tutorials/advanced/torch_compile.rst delete mode 100644 docs/source/guide/tutorials/base/how_to_train/action_detection.rst diff --git a/docs/source/guide/explanation/algorithms/action/action_detection.rst b/docs/source/guide/explanation/algorithms/action/action_detection.rst deleted file mode 100644 index 1eae4fae0d3..00000000000 --- 
a/docs/source/guide/explanation/algorithms/action/action_detection.rst +++ /dev/null @@ -1,47 +0,0 @@ -Action Detection -================ - -Sptio-Temporal action detection is the problem of localizing the actor(spatial detection) and action(temporal detection). We solve this problem by combining 3D action classification backbone and 2D object detection model. We can combine these two models in several ways. Currently, we support the simplest way. The other ways will be supported in near future. - -X3D + Fast-RCNN architecture comes from `X3D paper `_. This model requires pre-computed actor proposals. Actor pre-proposals can be obtained from `COCO `_ pre-trained 2D object detector (e.g. `Faster-RCNN `_, `ATSS `_). If the custom dataset requires finetuning of 2d object detector, please refer :doc:`otx.algorithms.detection <../object_detection/object_detection>`. Region-of-interest (RoI) features are extracted at the last feature map of X3D by extending a 2D proposal at a keyframe into a 3D RoI by replicating it along the temporal axis. The RoI features fed into the roi head of Fast-RCNN. - -For better transfer learning we use the following algorithm components: - -- ``Augmentations``: We use only random crop and random flip for the training pipeline - -- ``Optimizer``: We use `SGD `_ optimizer with the weight decay set to **1e-4** and momentum set to **0.9**. - -- ``Loss functions``: For the multi-label case binary cross entropy loss is used. In the other case, `Cross Entropy Loss `_ is used for the categories classification. - -************** -Dataset Format -************** - -We support the popular action classification formats, `AVA dataset `_ format. - - -****** -Models -****** - -We support the following ready-to-use model recipes for transfer learning: - -+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------+-------------------------+ -| Recipe ID | Name | Complexity (GFLOPs) | Model size (MB) | -+=========================================================================================================================================================================================+===============+=====================+=========================+ -| `Custom_Action_Detection_X3D_FAST_RCNN `_ | x3d_fast_rcnn | 13.04 | 8.32 | -+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------------+-------------------------+ - -To see which models are available for the task, the following command can be executed: - -.. code-block:: shell - - (otx) ...$ otx find --task ACTION_DETECTION - -In the table below the **mAP** on some academic datasets are presented. Each model is trained using `Kinetics-400 `_ pre-trained weight with single Nvidia GeForce RTX3090. 
- -+----------------+-------+-----------+ -| Model name | JHMDB | UCF101-24 | -+================+=======+===========+ -| x3d_fast_rcnn | 92.14 | 80.7 | -+----------------+-------+-----------+ diff --git a/docs/source/guide/explanation/algorithms/action/index.rst b/docs/source/guide/explanation/algorithms/action/index.rst index c2965183809..1d5de506b5a 100644 --- a/docs/source/guide/explanation/algorithms/action/index.rst +++ b/docs/source/guide/explanation/algorithms/action/index.rst @@ -6,4 +6,3 @@ Action Recognition action_classification - action_detection diff --git a/docs/source/guide/get_started/cli_commands.rst b/docs/source/guide/get_started/cli_commands.rst index a7b03590e60..7231c6d0dd7 100644 --- a/docs/source/guide/get_started/cli_commands.rst +++ b/docs/source/guide/get_started/cli_commands.rst @@ -339,11 +339,11 @@ The results will be saved in ``./otx-workspace/`` folder by default. The output (otx) ...$ otx train --model --task --data_root - For example, if you want to use the ``otx.algo.detection.atss.ATSS`` model class, you can train it as shown below. + For example, if you want to use the ``otx.algo.classification.torchvision_model.TVModelForMulticlassCls`` model class, you can train it as shown below. .. code-block:: shell - (otx) ...$ otx train --model otx.algo.detection.atss.ATSS --model.variant mobilenetv2 --task DETECTION ... + (otx) ...$ otx train --model otx.algo.classification.torchvision_model.TVModelForMulticlassCls --model.backbone mobilenet_v3_small ... .. note:: You also can visualize the training using ``Tensorboard`` as these logs are located in ``/tensorboard``. diff --git a/docs/source/guide/get_started/installation.rst b/docs/source/guide/get_started/installation.rst index 3ab889bbb57..94de82e29a3 100644 --- a/docs/source/guide/get_started/installation.rst +++ b/docs/source/guide/get_started/installation.rst @@ -68,7 +68,7 @@ according to your system environment. .. note:: - Currently, only torch==2.1.1 was fully validated. (older versions are not supported due to security issues). + Currently, only torch==2.2 was fully validated. (older versions are not supported due to security issues). 3. Once the package is installed in the virtual environment, you can use full diff --git a/docs/source/guide/tutorials/advanced/index.rst b/docs/source/guide/tutorials/advanced/index.rst index 8524b3a8200..a4b69cdacc8 100644 --- a/docs/source/guide/tutorials/advanced/index.rst +++ b/docs/source/guide/tutorials/advanced/index.rst @@ -8,5 +8,7 @@ Advanced Tutorials semi_supervised_learning huggingface_model multi_gpu + low_rank_adaptation + torch_compile .. Once we have enough material, we might need to categorize these into `data`, `model learning` sections. \ No newline at end of file diff --git a/docs/source/guide/tutorials/advanced/low_rank_adaptation.rst b/docs/source/guide/tutorials/advanced/low_rank_adaptation.rst new file mode 100644 index 00000000000..06b31d2b7b4 --- /dev/null +++ b/docs/source/guide/tutorials/advanced/low_rank_adaptation.rst @@ -0,0 +1,39 @@ +LoRA: Low Rank Adaptation for Classification Tasks +=================================================== + +.. note:: + + LoRA is only supported for VisionTransformer models. + See the model in otx.algo.classification.vit. + +Overview +-------- + +OpenVINO™ Training Extensions now supports Low Rank Adaptation (LoRA) for classification tasks using Transformer models. 
+LoRA is a parameter-efficient approach to adapt pre-trained models by introducing low-rank matrices that capture important adaptations without the need to retrain the entire model. + +Benefits of LoRA +---------------- + +- **Efficiency**: LoRA allows for efficient adaptation of large pre-trained models with minimal additional parameters. +- **Performance**: By focusing on key parameters, LoRA can achieve competitive performance with less computational overhead. +- **Flexibility**: LoRA can be applied to various parts of the transformer model, providing flexibility in model tuning. + +How to Use LoRA in OpenVINO™ Training Extensions +------------------------------------------------ + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.algo.classification.vit import VisionTransformerForMulticlassCls + + model = VisionTransformerForMulticlassCls(..., lora=True) + + .. tab-item:: CLI + + .. code-block:: bash + + (otx) ...$ otx train ... --model.lora True diff --git a/docs/source/guide/tutorials/advanced/torch_compile.rst b/docs/source/guide/tutorials/advanced/torch_compile.rst new file mode 100644 index 00000000000..97a4f325923 --- /dev/null +++ b/docs/source/guide/tutorials/advanced/torch_compile.rst @@ -0,0 +1,41 @@ +[BETA] Enable torch.compile +============================ + +.. warning:: + Not currently supported on all models. + As far as we check, it is available for Classification Task models and some segmentation models. + We will continue to optimize this and do not guarantee performance for now. + +Overview +-------- + +OpenVINO™ Training Extensions now integrates the `torch.compile` feature from PyTorch, allowing users to optimize their models for better performance. +This feature compiles the model's operations into optimized lower-level code, which can significantly improve execution speed and reduce memory usage. + +Benefits of torch.compile +------------------------- + +- **Performance Optimization**: Compiled models run faster by executing optimized low-level operations. +- **Reduced Memory Footprint**: Optimized models can use less memory, which is beneficial for deploying models on resource-constrained devices. +For more information on the benefits of `torch.compile`, refer to the official `PyTorch documentation `_. + +How to Use torch.compile in OpenVINO™ Training Extensions +---------------------------------------------------------- + +**Prepare OTXModel**: Ensure that model is compatible with `torch.compile`. When building the model, give the `torch_compile` option `True`. + +.. tab-set:: + + .. tab-item:: API + + .. code-block:: python + + from otx.algo.classification.vit import VisionTransformerForMulticlassCls + + model = VisionTransformerForMulticlassCls(..., torch_compile=True) + + .. tab-item:: CLI + + .. code-block:: bash + + (otx) ...$ otx train ... --model.torch_compile True diff --git a/docs/source/guide/tutorials/base/how_to_train/action_detection.rst b/docs/source/guide/tutorials/base/how_to_train/action_detection.rst deleted file mode 100644 index 7707e803e62..00000000000 --- a/docs/source/guide/tutorials/base/how_to_train/action_detection.rst +++ /dev/null @@ -1,275 +0,0 @@ -Action Detection model -================================ - -This live example shows how to easily train and validate for spatio-temporal action detection model on the subset of `JHMDB `_. -To learn more about Action Detection task, refer to :doc:`../../../explanation/algorithms/action/action_detection`. - -.. 
note:: - - To learn deeper how to manage training process of the model including additional parameters and its modification, refer to :doc:`./detection`. - -The process has been tested on the following configuration. - -- Ubuntu 20.04 -- NVIDIA GeForce RTX 3090 -- Intel(R) Core(TM) i9-10980XE -- CUDA Toolkit 11.1 - -************************* -Setup virtual environment -************************* - -1. You can follow the installation process from a :doc:`quick start guide <../../../get_started/installation>` -to create a universal virtual environment for OpenVINO™ Training Extensions. - -2. Activate your virtual -environment: - -.. code-block:: - - .otx/bin/activate - # or by this line, if you created an environment, using tox - . venv/otx/bin/activate - - -*************************** -Dataset preparation -*************************** - -For action detection task, you need to prepare dataset whose format is `AVA `_ dataset. -For easy beginning, we provide `sample dataset `_ - -If you download data from link and extract to ``training_extensions/data`` folder(you should make data folder at first), you can see the structure below: - -.. code-block:: - - training_extensions - └── data - └── JHMDB_10% - ├── annotations - │ └── ava_action_list_v2.2.pbtxt - │ └── ava_test.csv - │ └── ava_train.csv - │ └── ava_val.csv - │ └── test.pkl - │ └── train.pkl - │ └── val.pkl - │ - └── frames - │── train_video001 - │ └── train_video001_0001.jpg - └── test_video001 - └── test_video001_0001.jpg - - - -********* -Training -********* - -1. First of all, you need to choose which action detection model you want to train. -The list of supported recipes for action detection is available with the command line below: - -.. note:: - - The characteristics and detailed comparison of the models could be found in :doc:`Explanation section <../../../explanation/algorithms/action/action_detection>`. - -.. code-block:: - - (otx) ...$ otx find --task ACTION_DETECTION - - +-----------------------+--------------------------------------+---------------------------------------------------------------------------------+ - | TASK | Model Name | Recipe PATH | - +-----------------------+--------------------------------------+---------------------------------------------------------------------------------+ - | ACTION_DETECTION | x3d_fast_rcnn | ../otx/recipe/action/action_detection/x3d_fast_rcnn.yaml | - +-----------------------+--------------------------------------+---------------------------------------------------------------------------------+ - -To have a specific example in this tutorial, all commands will be run on the X3D_FAST_RCNN model. It's a light model, that achieves competitive accuracy while keeping the inference fast. - -2. ``otx train`` trains a model (a particular model template) -on a dataset and results: - -Here are the main outputs can expect with CLI: -- ``{work_dir}/{timestamp}/checkpoints/epoch_*.ckpt`` - a model checkpoint file. -- ``{work_dir}/{timestamp}/configs.yaml`` - The configuration file used in the training can be reused to reproduce the training. -- ``{work_dir}/.latest`` - The results of each of the most recently executed subcommands are soft-linked. This allows you to skip checkpoints and config file entry as a workspace. - -.. tab-set:: - - .. tab-item:: CLI (auto-config) - - .. code-block:: shell - - (otx) ...$ otx train --data_root data/JHMDB_10% - - .. tab-item:: CLI (with config) - - .. 
code-block:: shell - - (otx) ...$ otx train --config src/otx/recipe/action/action_detection/x3d_fast_rcnn.yaml --data_root data/JHMDB_10% - - .. tab-item:: API (from_config) - - .. code-block:: python - - from otx.engine import Engine - - data_root = "data/JHMDB_10%" - recipe = "src/otx/recipe/action/action_detection/x3d_fast_rcnn.yaml" - - engine = Engine.from_config( - config_path=recipe, - data_root=data_root, - work_dir="otx-workspace", - ) - - engine.train(...) - - .. tab-item:: API - - .. code-block:: python - - from otx.engine import Engine - - data_root = "data/JHMDB_10%" - - engine = Engine( - model="x3d", - data_root=data_root, - work_dir="otx-workspace", - ) - - engine.train(...) - - -3. ``(Optional)`` Additionally, we can tune training parameters such as batch size, learning rate, patience epochs or warm-up iterations. -Learn more about specific parameters using ``otx train --help -v`` or ``otx train --help -vv``. - -For example, to decrease the batch size to 4, fix the number of epochs to 100, extend the command line above with the following line. - -.. tab-set:: - - .. tab-item:: CLI - - .. code-block:: shell - - (otx) ...$ otx train ... --data.train_subset.batch_size 4 \ - --max_epochs 100 - - .. tab-item:: API - - .. code-block:: python - - from otx.core.config.data import SubsetConfig - from otx.core.data.module import OTXDataModule - from otx.engine import Engine - - datamodule = OTXDataModule(..., train_subset=SubsetConfig(..., batch_size=4)) - - engine = Engine(..., datamodule=datamodule) - - engine.train(max_epochs=100) - - -4. The training result ``checkpoints/*.ckpt`` file is located in ``{work_dir}`` folder, -while training logs can be found in the ``{work_dir}/{timestamp}`` dir. - -.. note:: - We also can visualize the training using ``Tensorboard`` as these logs are located in ``{work_dir}/{timestamp}/tensorboard``. - -.. code-block:: - - otx-workspace - ├── 20240403_134256/ - ├── csv/ - ├── checkpoints/ - | └── epoch_*.pth - ├── tensorboard/ - └── configs.yaml - └── .latest - └── train/ - ... - -The training time highly relies on the hardware characteristics, for example on 1 NVIDIA GeForce RTX 3090 the training took about 3 minutes. - -After that, we have the PyTorch object detection model trained with OpenVINO™ Training Extensions, which we can use for evaluation, export, optimization and deployment. - -*********** -Evaluation -*********** - -1. ``otx test`` runs evaluation of a -trained model on a particular dataset. - -Test function receives test annotation information and model snapshot, trained in previous step. - -The default metric is mAP_50 measure. - -2. That's how we can evaluate the snapshot in ``otx-workspace`` -folder on JHMDB_10% dataset and save results to ``otx-workspace``: - -.. tab-set:: - - .. tab-item:: CLI (with work_dir) - - .. 
code-block:: shell - - (otx) ...$ otx test --work_dir otx-workspace - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ Test metric ┃ DataLoader 0 ┃ - ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ - │ test/data_time │ 0.006367621477693319 │ - │ test/iter_time │ 0.02698644995689392 │ - │ test/map │ 0.10247182101011276 │ - │ test/map_50 │ 0.3779516816139221 │ - │ test/map_75 │ 0.03639398142695427 │ - │ test/map_large │ 0.11831618845462799 │ - │ test/map_medium │ 0.02958027645945549 │ - │ test/map_per_class │ -1.0 │ - │ test/map_small │ 0.0 │ - │ test/mar_1 │ 0.12753313779830933 │ - │ test/mar_10 │ 0.1305265873670578 │ - │ test/mar_100 │ 0.1305265873670578 │ - │ test/mar_100_per_class │ -1.0 │ - │ test/mar_large │ 0.14978596568107605 │ - │ test/mar_medium │ 0.06217033043503761 │ - │ test/mar_small │ 0.0 │ - └───────────────────────────┴───────────────────────────┘ - - .. tab-item:: CLI (with config) - - .. code-block:: shell - - (otx) ...$ otx test --config src/otx/recipe/action/action_detection/x3d_fast_rcnn.yaml \ - --data_root data/JHMDB_10% \ - --checkpoint otx-workspace/20240312_051135/checkpoints/epoch_033.ckpt - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ Test metric ┃ DataLoader 0 ┃ - ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ - │ test/data_time │ 0.006367621477693319 │ - │ test/iter_time │ 0.02698644995689392 │ - │ test/map │ 0.10247182101011276 │ - │ test/map_50 │ 0.3779516816139221 │ - │ test/map_75 │ 0.03639398142695427 │ - │ test/map_large │ 0.11831618845462799 │ - │ test/map_medium │ 0.02958027645945549 │ - │ test/map_per_class │ -1.0 │ - │ test/map_small │ 0.0 │ - │ test/mar_1 │ 0.12753313779830933 │ - │ test/mar_10 │ 0.1305265873670578 │ - │ test/mar_100 │ 0.1305265873670578 │ - │ test/mar_100_per_class │ -1.0 │ - │ test/mar_large │ 0.14978596568107605 │ - │ test/mar_medium │ 0.06217033043503761 │ - │ test/mar_small │ 0.0 │ - └───────────────────────────┴───────────────────────────┘ - - .. tab-item:: API - - .. code-block:: python - - engine.test() - - -3. The output of ``{work_dir}/{timestamp}/csv/version_0/metrics.csv`` consists of -a dict with target metric name and its value. \ No newline at end of file diff --git a/docs/source/guide/tutorials/base/how_to_train/index.rst b/docs/source/guide/tutorials/base/how_to_train/index.rst index 7d224cb46cf..87c673cfd3f 100644 --- a/docs/source/guide/tutorials/base/how_to_train/index.rst +++ b/docs/source/guide/tutorials/base/how_to_train/index.rst @@ -47,13 +47,6 @@ Training to deployment tutorials Learn how to train an action classification model - .. grid-item-card:: Action Detection - :link: action_detection - :link-type: doc - :text-align: center - - Learn how to train an action detection model - .. 
grid-item-card:: Visual Prompting :link: visual_prompting :link-type: doc @@ -71,5 +64,4 @@ Training to deployment tutorials semantic_segmentation anomaly_detection action_classification - action_detection visual_prompting From c17a9239390c29d3e7ab27b6a19631983a66b8ce Mon Sep 17 00:00:00 2001 From: Prokofiev Kirill Date: Fri, 23 Aug 2024 14:39:26 +0200 Subject: [PATCH 09/53] Fix mean and scale for segmentation task (#3885) fix mean and scale --- src/otx/core/model/segmentation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index 330deb89bec..a2972d7eb86 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -41,8 +41,8 @@ class OTXSegmentationModel(OTXModel[SegBatchDataEntity, SegBatchPredEntity]): """Base class for the semantic segmentation models used in OTX.""" - mean: ClassVar[tuple[float, float, float]] = (0.485, 0.456, 0.406) - scale: ClassVar[tuple[float, float, float]] = (0.229, 0.224, 0.225) + mean: ClassVar[tuple[float, float, float]] = (123.675, 116.28, 103.53) + scale: ClassVar[tuple[float, float, float]] = (58.395, 57.12, 57.375) def __init__( self, From d72feeb8fa102f2416431c2e8bc9d08a10683148 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Mon, 26 Aug 2024 02:46:37 +0200 Subject: [PATCH 10/53] Update MAPI in 2.2 (#3889) * Bump MAPI * Update exportable code requirements --- pyproject.toml | 4 ++-- src/otx/core/exporter/exportable_code/demo/requirements.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index eb28b058258..f555f283653 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ xpu = [ "timm==1.0.3", "openvino==2024.3", "openvino-dev==2024.3", - "openvino-model-api==0.2.2.1", + "openvino-model-api==0.2.2.2", "onnx==1.16.2", "onnxconverter-common==1.14.0", "nncf==2.12.0", @@ -97,7 +97,7 @@ base = [ "timm==1.0.3", "openvino==2024.3", "openvino-dev==2024.3", - "openvino-model-api==0.2.2.1", + "openvino-model-api==0.2.2.2", "onnx==1.16.2", "onnxconverter-common==1.14.0", "nncf==2.12.0", diff --git a/src/otx/core/exporter/exportable_code/demo/requirements.txt b/src/otx/core/exporter/exportable_code/demo/requirements.txt index 6f798a02346..0b693bfebd9 100644 --- a/src/otx/core/exporter/exportable_code/demo/requirements.txt +++ b/src/otx/core/exporter/exportable_code/demo/requirements.txt @@ -1,3 +1,3 @@ openvino==2024.3.0 -openvino-model-api==0.2.2.1 +openvino-model-api==0.2.2.2 numpy==1.26.4 From 00ed3a01eb3603196fa8101757f3f20b23fb52c0 Mon Sep 17 00:00:00 2001 From: Prokofiev Kirill Date: Mon, 26 Aug 2024 04:47:43 +0200 Subject: [PATCH 11/53] Improve Semi-SL for LiteHRNet (small-medium case) (#3891) * change drop pixels value * go safe, change only tested models * minor --- .../recipe/semantic_segmentation/semisl/litehrnet_18_semisl.yaml | 1 + .../recipe/semantic_segmentation/semisl/litehrnet_s_semisl.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/src/otx/recipe/semantic_segmentation/semisl/litehrnet_18_semisl.yaml b/src/otx/recipe/semantic_segmentation/semisl/litehrnet_18_semisl.yaml index a98f1ab47a2..c06944dae29 100644 --- a/src/otx/recipe/semantic_segmentation/semisl/litehrnet_18_semisl.yaml +++ b/src/otx/recipe/semantic_segmentation/semisl/litehrnet_18_semisl.yaml @@ -4,6 +4,7 @@ model: label_info: 2 model_version: lite_hrnet_18 train_type: SEMI_SUPERVISED + drop_unreliable_pixels_percent: 80 optimizer: class_path: torch.optim.Adam diff --git 
a/src/otx/recipe/semantic_segmentation/semisl/litehrnet_s_semisl.yaml b/src/otx/recipe/semantic_segmentation/semisl/litehrnet_s_semisl.yaml index c0cd0de594f..8722e611494 100644 --- a/src/otx/recipe/semantic_segmentation/semisl/litehrnet_s_semisl.yaml +++ b/src/otx/recipe/semantic_segmentation/semisl/litehrnet_s_semisl.yaml @@ -4,6 +4,7 @@ model: label_info: 2 model_version: lite_hrnet_s train_type: SEMI_SUPERVISED + drop_unreliable_pixels_percent: 80 optimizer: class_path: torch.optim.Adam From 2c6b4debd4406bde95b3ce69182eff387456242b Mon Sep 17 00:00:00 2001 From: Sooah Lee Date: Mon, 26 Aug 2024 17:10:01 +0900 Subject: [PATCH 12/53] Improve h-cls for eff models (#3893) * Update step size for eff v2 * Update effb0 recipe --- src/otx/algo/classification/timm_model.py | 7 +++++-- .../recipe/classification/h_label_cls/efficientnet_b0.yaml | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/otx/algo/classification/timm_model.py b/src/otx/algo/classification/timm_model.py index 4bc3c694c2b..d7e171565a7 100644 --- a/src/otx/algo/classification/timm_model.py +++ b/src/otx/algo/classification/timm_model.py @@ -5,7 +5,8 @@ from __future__ import annotations -from copy import deepcopy +from copy import copy, deepcopy +from math import ceil from typing import TYPE_CHECKING, Literal import torch @@ -267,12 +268,14 @@ def _create_model(self) -> nn.Module: def _build_model(self, head_config: dict) -> nn.Module: backbone = TimmBackbone(backbone=self.backbone, pretrained=self.pretrained) + copied_head_config = copy(head_config) + copied_head_config["step_size"] = (ceil(self.input_size[0] / 32), ceil(self.input_size[1] / 32)) return HLabelClassifier( backbone=backbone, neck=nn.Identity(), head=HierarchicalCBAMClsHead( in_channels=backbone.num_features, - **head_config, + **copied_head_config, ), multiclass_loss=nn.CrossEntropyLoss(), multilabel_loss=AsymmetricAngularLossWithIgnore(gamma_pos=0.0, gamma_neg=1.0, reduction="sum"), diff --git a/src/otx/recipe/classification/h_label_cls/efficientnet_b0.yaml b/src/otx/recipe/classification/h_label_cls/efficientnet_b0.yaml index a734131ad05..d0ea7daec7b 100644 --- a/src/otx/recipe/classification/h_label_cls/efficientnet_b0.yaml +++ b/src/otx/recipe/classification/h_label_cls/efficientnet_b0.yaml @@ -7,6 +7,8 @@ model: class_path: torch.optim.SGD init_args: lr: 0.0049 + momentum: 0.9 + weight_decay: 0.0001 scheduler: class_path: lightning.pytorch.cli.ReduceLROnPlateau From 0dc7a29dac18737749e85dfd395b6d212a16e7f5 Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Tue, 27 Aug 2024 12:53:51 +0100 Subject: [PATCH 13/53] Fix maskrcnn swin nncf acc drop (#3900) update maskrcnn swimt model type to transformer --- src/otx/algo/instance_segmentation/maskrcnn.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py index 5e9610b794d..100b4b6f7b2 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn.py +++ b/src/otx/algo/instance_segmentation/maskrcnn.py @@ -675,15 +675,4 @@ def _build_model(self, num_classes: int) -> TwoStageDetector: @property def _optimization_config(self) -> dict[str, Any]: """PTQ config for MaskRCNN-SwinT.""" - return { - "ignored_scope": { - "types": [ - "Add", - "MVN", - "Divide", - "Multiply", - ], - "validate": False, - }, - "preset": "mixed", - } + return {"model_type": "transformer"} From 0d6799ca4c76b53c6c6426bc21f27a1aff130c4a Mon Sep 17 00:00:00 2001 From: Wonju Lee Date: Wed, 28 Aug 2024 16:23:02 
+0900 Subject: [PATCH 14/53] Add keypoint detection recipe for single object cases (#3903) * add rtmpose_tiny for single obj * add rtmpose_tiny for single obj * modify test subset name * fix unit test * update recipe with reset --- .../keypoint_detection/heads/rtmcc_head.py | 2 +- .../core/data/dataset/keypoint_detection.py | 4 +- .../core/data/transform_libs/torchvision.py | 62 +++++++++----- .../_base_/data/keypoint_detection.yaml | 3 + .../rtmpose_tiny_single_obj.yaml | 81 +++++++++++++++++++ .../data/transform_libs/test_torchvision.py | 14 +++- 6 files changed, 140 insertions(+), 26 deletions(-) create mode 100644 src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml diff --git a/src/otx/algo/keypoint_detection/heads/rtmcc_head.py b/src/otx/algo/keypoint_detection/heads/rtmcc_head.py index 6fcb5c50e78..63f12d2edca 100644 --- a/src/otx/algo/keypoint_detection/heads/rtmcc_head.py +++ b/src/otx/algo/keypoint_detection/heads/rtmcc_head.py @@ -192,7 +192,7 @@ def loss(self, x: tuple[Tensor], entity: KeypointDetBatchDataEntity) -> dict: mask=mask, ) - loss_pose = torch.tensor(avg_acc, device=device) + loss_pose = -1 * torch.tensor(avg_acc, device=device) losses.update(loss_pose=loss_pose) return losses diff --git a/src/otx/core/data/dataset/keypoint_detection.py b/src/otx/core/data/dataset/keypoint_detection.py index 2b8edbe9d46..f0e0d30c372 100644 --- a/src/otx/core/data/dataset/keypoint_detection.py +++ b/src/otx/core/data/dataset/keypoint_detection.py @@ -104,8 +104,8 @@ def _get_item_impl(self, index: int) -> KeypointDetDataEntity | None: ).reshape(-1, 2) keypoints_visible = np.minimum(1, keypoints)[..., 0] - bbox_center = (bboxes[0, 2:] + bboxes[0, :2]) * 0.5 - bbox_scale = (bboxes[0, 2:] - bboxes[0, :2]) * 1.25 + bbox_center = np.array(img_shape) / 2.0 + bbox_scale = np.array(img_shape) bbox_rotation = 0.0 entity = KeypointDetDataEntity( diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py index 03f37827174..8cfe3ea1636 100644 --- a/src/otx/core/data/transform_libs/torchvision.py +++ b/src/otx/core/data/transform_libs/torchvision.py @@ -3109,6 +3109,46 @@ def __call__(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None: return inputs +class GetBBoxCenterScale(tvt_v2.Transform): + """Convert bboxes from [x, y, w, h] to center and scale. + + The center is the coordinates of the bbox center, and the scale is the + bbox width and height normalized by a scale factor. + Required Keys: + - bbox + Modified Keys: + - bbox_center + - bbox_scale + Args: + padding (float): The bbox padding scale that will be multilied to + `bbox_scale`. Defaults to 1.25 + """ + + def __init__(self, padding: float = 1.25) -> None: + super().__init__() + + self.padding = padding + + def __call__(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None: + """Transform function to add bbox_infos from bboxes for keypoint detection task.""" + assert len(_inputs) == 1, "[tmp] Multiple entity is not supported yet." # noqa: S101 + inputs = _inputs[0] + + bbox = inputs.bboxes[0].numpy() + inputs.bbox_info.center = (bbox[2:] + bbox[:2]) * 0.5 + inputs.bbox_info.scale = (bbox[2:] - bbox[:2]) * self.padding + + return inputs + + def __repr__(self) -> str: + """Print the basic information of the transform. + + Returns: + str: Formatted string. + """ + return self.__class__.__name__ + f"(padding={self.padding})" + + class RandomBBoxTransform(tvt_v2.Transform): r"""Rnadomly shift, resize and rotate the bounding boxes. 
@@ -3202,16 +3242,7 @@ def _get_transform_params(self) -> tuple: return offset, scale, rotate def __call__(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None: - """The transform function of :class:`RandomBboxTransform`. - - See ``transform()`` method of :class:`BaseTransform` for details. - - Args: - results (dict): The result dict - - Returns: - dict: The result dict. - """ + """Transform function to adjust bbox_infos randomly.""" assert len(_inputs) == 1, "[tmp] Multiple entity is not supported yet." # noqa: S101 inputs = _inputs[0] @@ -3378,16 +3409,7 @@ def _get_warp_image( return torch.from_numpy(warped_image).permute(2, 0, 1) def __call__(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None: - """The transform function of :class:`TopdownAffine`. - - See ``transform()`` method of :class:`BaseTransform` for details. - - Args: - results (dict): The result dict - - Returns: - dict: The result dict. - """ + """Transform function to affine image through warp matrix.""" assert len(_inputs) == 1, "[tmp] Multiple entity is not supported yet." # noqa: S101 inputs = _inputs[0] diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml index bc6bf54540e..756bddba960 100644 --- a/src/otx/recipe/_base_/data/keypoint_detection.yaml +++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml @@ -12,6 +12,7 @@ train_subset: subset_name: train batch_size: 32 transforms: + - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.RandomBBoxTransform - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine init_args: @@ -30,6 +31,7 @@ val_subset: subset_name: val batch_size: 32 transforms: + - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine init_args: input_size: $(input_size) @@ -45,6 +47,7 @@ test_subset: subset_name: test batch_size: 32 transforms: + - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine init_args: input_size: $(input_size) diff --git a/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml b/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml new file mode 100644 index 00000000000..8b22c757330 --- /dev/null +++ b/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml @@ -0,0 +1,81 @@ +model: + class_path: otx.algo.keypoint_detection.rtmpose.RTMPoseTiny + init_args: + label_info: 17 + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.001 + weight_decay: 0.0001 + + scheduler: + class_path: otx.core.schedulers.LinearWarmupSchedulerCallable + init_args: + num_warmup_steps: 3 + main_scheduler_callable: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 9 + monitor: val/accuracy + +engine: + task: KEYPOINT_DETECTION + device: auto + +callback_monitor: val/accuracy + +data: ../_base_/data/keypoint_detection.yaml + +overrides: + gradient_clip_val: 35.0 + reset: + - data.train_subset.transforms + - data.val_subset.transforms + - data.test_subset.transforms + input_size: + - 512 + - 512 + train_subset: + transforms: + - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine + init_args: + input_size: $(input_size) + - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug + init_args: + is_numpy_to_tvtensor: true + - class_path: 
torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + val_subset: + transforms: + - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine + init_args: + input_size: $(input_size) + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + test_subset: + transforms: + - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine + init_args: + input_size: $(input_size) + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] diff --git a/tests/unit/core/data/transform_libs/test_torchvision.py b/tests/unit/core/data/transform_libs/test_torchvision.py index 19de5aa8f36..1a1363d6821 100644 --- a/tests/unit/core/data/transform_libs/test_torchvision.py +++ b/tests/unit/core/data/transform_libs/test_torchvision.py @@ -20,8 +20,10 @@ from otx.core.data.transform_libs.torchvision import ( CachedMixUp, CachedMosaic, + Compose, DecodeVideo, FilterAnnotations, + GetBBoxCenterScale, MinIoURandomCrop, PackVideo, Pad, @@ -901,12 +903,18 @@ def keypoint_det_entity(self) -> KeypointDetDataEntity: labels=torch.LongTensor([0]), keypoints=tv_tensors.TVTensor(np.array([[0, 4], [4, 2], [2, 6], [6, 0]])), keypoints_visible=tv_tensors.TVTensor(np.array([1, 1, 1, 0])), - bbox_info=BboxInfo(center=np.array([3.5, 3.5]), scale=np.array([8.75, 8.75]), rotation=0), + bbox_info=BboxInfo(center=np.array([5, 5]), scale=np.array([10, 10]), rotation=0), ) def test_forward(self, keypoint_det_entity) -> None: - transform = TopdownAffine(input_size=(5, 5)) + transform = Compose( + [ + GetBBoxCenterScale(), + TopdownAffine(input_size=(5, 5)), + ], + ) results = transform(deepcopy(keypoint_det_entity)) - assert hasattr(results, "keypoints") + assert np.array_equal(results.bbox_info.center, np.array([3.5, 3.5])) + assert np.array_equal(results.bbox_info.scale, np.array([8.75, 8.75])) assert results.keypoints.shape == (4, 2) From 8115b529e5ecac52362f63dd507bd92a79f716df Mon Sep 17 00:00:00 2001 From: Sooah Lee Date: Thu, 29 Aug 2024 18:43:40 +0900 Subject: [PATCH 15/53] Improve acc drop of efficientnetv2 for h-label cls (#3907) * Add warmup_iters for effv2 * Update max_epochs --- .../classification/h_label_cls/efficientnet_v2.yaml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/otx/recipe/classification/h_label_cls/efficientnet_v2.yaml b/src/otx/recipe/classification/h_label_cls/efficientnet_v2.yaml index 848a985d433..9c06011a1c9 100644 --- a/src/otx/recipe/classification/h_label_cls/efficientnet_v2.yaml +++ b/src/otx/recipe/classification/h_label_cls/efficientnet_v2.yaml @@ -10,14 +10,6 @@ model: momentum: 0.9 weight_decay: 0.0001 - scheduler: - class_path: lightning.pytorch.cli.ReduceLROnPlateau - init_args: - mode: max - factor: 0.5 - patience: 1 - monitor: val/accuracy - engine: task: H_LABEL_CLS device: auto @@ -34,7 +26,7 @@ overrides: - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup init_args: patience: 3 - + warmup_iters: 750 data: task: H_LABEL_CLS data_format: datumaro 
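For reference, the box-to-center/scale conversion performed by the GetBBoxCenterScale transform added in PATCH 14 above is plain arithmetic on an xyxy box. A minimal numpy sketch, using a hypothetical box chosen to reproduce the values asserted in the updated unit test:

    import numpy as np

    bbox = np.array([0.0, 0.0, 7.0, 7.0])  # hypothetical [x1, y1, x2, y2] box
    padding = 1.25                          # default padding in GetBBoxCenterScale

    center = (bbox[2:] + bbox[:2]) * 0.5      # -> [3.5, 3.5]
    scale = (bbox[2:] - bbox[:2]) * padding   # -> [8.75, 8.75]

The single-object recipe added in the same patch does not use this transform: its dataset items seed bbox_center/bbox_scale from the full image shape, and TopdownAffine consumes those values directly.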
From 4c8555e10d7aebdf23cc8f6ac90c5049edb34f1c Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Thu, 29 Aug 2024 23:16:24 +0900 Subject: [PATCH 16/53] Fix pretrained weight cached dir for timm (#3909) * Fix pretrained_weight for timm * Fix unit-test --- src/otx/algo/classification/backbones/timm.py | 32 ++++++++----------- .../classification/backbones/test_timm.py | 1 - 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/src/otx/algo/classification/backbones/timm.py b/src/otx/algo/classification/backbones/timm.py index e2f17044680..ec3e6fed06b 100644 --- a/src/otx/algo/classification/backbones/timm.py +++ b/src/otx/algo/classification/backbones/timm.py @@ -9,14 +9,13 @@ """ from __future__ import annotations -from pathlib import Path from typing import Literal import timm import torch from torch import nn -from otx.algo.utils.mmengine_utils import load_checkpoint_to_model, load_from_http +from otx.algo.utils.mmengine_utils import load_from_http PRETRAINED_ROOT = "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/" pretrained_urls = { @@ -28,7 +27,7 @@ "mobilenetv3_large_21k": "mobilenetv3_large_100_miil_in21k", "mobilenetv3_large_1k": "mobilenetv3_large_100_miil", "tresnet": "tresnet_m", - "efficientnetv2_s_21k": "tf_efficientnetv2_s_in21k", + "efficientnetv2_s_21k": "tf_efficientnetv2_s.in21k", "efficientnetv2_s_1k": "tf_efficientnetv2_s_in21ft1k", "efficientnetv2_m_21k": "tf_efficientnetv2_m_in21k", "efficientnetv2_m_1k": "tf_efficientnetv2_m_in21ft1k", @@ -59,12 +58,19 @@ def __init__( ): super().__init__(**kwargs) self.backbone = backbone - self.pretrained = pretrained + self.pretrained: bool | dict = pretrained self.is_mobilenet = backbone.startswith("mobilenet") + if pretrained and self.backbone in pretrained_urls: + # This pretrained weight is saved into ~/.cache/torch/hub/checkpoints + # Otherwise, it is stored in ~/.cache/huggingface/hub. (timm defaults) + self.pretrained = load_from_http(filename=pretrained_urls[self.backbone]) + + self.model = timm.create_model( + TIMM_MODEL_NAME_DICT[self.backbone], + pretrained=self.pretrained, + num_classes=1000, + ) - self.model = timm.create_model(TIMM_MODEL_NAME_DICT[self.backbone], pretrained=pretrained, num_classes=1000) - if self.pretrained: - print(f"init weight - {pretrained_urls[self.backbone]}") self.model.classifier = None # Detach classifier. Only use 'backbone' part in otx. 
self.num_head_features = self.model.num_features self.num_features = self.model.conv_head.in_channels if self.is_mobilenet else self.model.num_features @@ -97,15 +103,3 @@ def get_config_optim(self, lrs: list[float] | float) -> list[dict[str, float]]: param_dict["lr"] = lrs return parameters - - def init_weights(self, pretrained: str | bool | None = None) -> None: - """Initialize weights.""" - checkpoint = None - if isinstance(pretrained, str) and Path(pretrained).exists(): - checkpoint = torch.load(pretrained, None) - print(f"init weight - {pretrained}") - elif pretrained is not None: - checkpoint = load_from_http(pretrained_urls[self.key]) - print(f"init weight - {pretrained_urls[self.key]}") - if checkpoint is not None: - load_checkpoint_to_model(self, checkpoint) diff --git a/tests/unit/algo/classification/backbones/test_timm.py b/tests/unit/algo/classification/backbones/test_timm.py index 0716e2c08cb..7a1bb6a657b 100644 --- a/tests/unit/algo/classification/backbones/test_timm.py +++ b/tests/unit/algo/classification/backbones/test_timm.py @@ -8,7 +8,6 @@ class TestOTXEfficientNetV2: def test_forward(self): model = TimmBackbone(backbone="efficientnetv2_s_21k") - model.init_weights() assert model(torch.randn(1, 3, 244, 244))[0].shape == torch.Size([1, 1280, 8, 8]) def test_get_config_optim(self): From 52221e3eb53e3cf04f726d833b162ec3acf17d78 Mon Sep 17 00:00:00 2001 From: Wonju Lee Date: Fri, 30 Aug 2024 15:08:09 +0900 Subject: [PATCH 17/53] Fix keypoint detection single obj recipe (#3915) * add rtmpose_tiny for single obj * modify test subset name * fix unit test * property for pck --- src/otx/core/metrics/pck.py | 18 +++- src/otx/core/model/keypoint_detection.py | 5 + .../keypoint_detection/rtmpose_tiny.yaml | 2 +- .../rtmpose_tiny_single_obj.yaml | 93 ++++++++++--------- tests/perf/test_keypoint_detection.py | 90 ++++++++++++++++-- 5 files changed, 153 insertions(+), 55 deletions(-) diff --git a/src/otx/core/metrics/pck.py b/src/otx/core/metrics/pck.py index 61ab3909865..941ed679775 100644 --- a/src/otx/core/metrics/pck.py +++ b/src/otx/core/metrics/pck.py @@ -147,6 +147,22 @@ def __init__( self.label_info: LabelInfo = label_info self.reset() + @property + def input_size(self) -> tuple[int, int]: + """Getter for input_size.""" + return self._input_size + + @input_size.setter + def input_size(self, size: tuple[int, int]) -> None: + """Setter for input_size.""" + if not isinstance(size, tuple) or len(size) != 2: + msg = "input_size must be a tuple of two integers." + raise ValueError(msg) + if not all(isinstance(dim, int) for dim in size): + msg = "input_size dimensions must be integers." + raise ValueError(msg) + self._input_size = size + def reset(self) -> None: """Reset for every validation and test epoch. 
@@ -177,7 +193,7 @@ def compute(self) -> dict: gt_kpts = np.stack([p[0] for p in self.targets]) kpts_visible = np.stack([p[1] for p in self.targets]) - normalize = np.tile(np.array([[256, 192]]), (pred_kpts.shape[0], 1)) + normalize = np.tile(np.array([self.input_size]), (pred_kpts.shape[0], 1)) _, avg_acc, _ = keypoint_pck_accuracy( pred_kpts, gt_kpts, diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py index 406c6c8031e..ea91d00cd67 100644 --- a/src/otx/core/model/keypoint_detection.py +++ b/src/otx/core/model/keypoint_detection.py @@ -104,6 +104,11 @@ def _customize_outputs( bbox_info=[], ) + def configure_metric(self) -> None: + """Configure the metric.""" + super().configure_metric() + self._metric.input_size = self.input_size + def _convert_pred_entity_to_compute_metric( self, preds: KeypointDetBatchPredEntity, diff --git a/src/otx/recipe/keypoint_detection/rtmpose_tiny.yaml b/src/otx/recipe/keypoint_detection/rtmpose_tiny.yaml index 1a25a2d39d4..447d4fd5218 100644 --- a/src/otx/recipe/keypoint_detection/rtmpose_tiny.yaml +++ b/src/otx/recipe/keypoint_detection/rtmpose_tiny.yaml @@ -6,7 +6,7 @@ model: optimizer: class_path: torch.optim.AdamW init_args: - lr: 0.004 + lr: 0.001 weight_decay: 0.0001 scheduler: diff --git a/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml b/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml index 8b22c757330..8045bb5e85c 100644 --- a/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml +++ b/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml @@ -2,6 +2,9 @@ model: class_path: otx.algo.keypoint_detection.rtmpose.RTMPoseTiny init_args: label_info: 17 + input_size: + - 512 + - 512 optimizer: class_path: torch.optim.AdamW @@ -35,47 +38,49 @@ overrides: - data.train_subset.transforms - data.val_subset.transforms - data.test_subset.transforms - input_size: - - 512 - - 512 - train_subset: - transforms: - - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine - init_args: - input_size: $(input_size) - - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug - init_args: - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - - class_path: torchvision.transforms.v2.Normalize - init_args: - mean: [123.675, 116.28, 103.53] - std: [58.395, 57.12, 57.375] - val_subset: - transforms: - - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine - init_args: - input_size: $(input_size) - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - - class_path: torchvision.transforms.v2.Normalize - init_args: - mean: [123.675, 116.28, 103.53] - std: [58.395, 57.12, 57.375] - test_subset: - transforms: - - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine - init_args: - input_size: $(input_size) - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - - class_path: torchvision.transforms.v2.Normalize - init_args: - mean: [123.675, 116.28, 103.53] - std: [58.395, 57.12, 57.375] + data: + input_size: + - 512 + - 512 + train_subset: + transforms: + - class_path: otx.core.data.transform_libs.torchvision.RandomBBoxTransform + - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine + init_args: + input_size: $(input_size) + - class_path: 
otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug + init_args: + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + val_subset: + transforms: + - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine + init_args: + input_size: $(input_size) + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + test_subset: + transforms: + - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine + init_args: + input_size: $(input_size) + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] diff --git a/tests/perf/test_keypoint_detection.py b/tests/perf/test_keypoint_detection.py index 3a2f2a299c9..1ff150a03d6 100644 --- a/tests/perf/test_keypoint_detection.py +++ b/tests/perf/test_keypoint_detection.py @@ -5,6 +5,7 @@ from __future__ import annotations from pathlib import Path +from typing import ClassVar import pytest @@ -19,26 +20,97 @@ class TestPerfKeypointDetection(PerfTestBase): Benchmark.Model(task="keypoint_detection", name="rtmpose_tiny", category="speed"), ] - DATASET_TEST_CASES = [ + DATASET_TEST_CASES: ClassVar = [ Benchmark.Dataset( - name=f"coco_person_keypoint_small_{idx}", - path=Path("keypoint_detection/coco_keypoint_small") / f"{idx}", + name="coco_person_keypoint_small", + path=Path("keypoint_detection/coco_keypoint/small"), group="small", num_repeat=5, extra_overrides={}, - ) - for idx in (1, 2, 3) - ] + [ + ), Benchmark.Dataset( name="coco_person_keypoint_medium", - path=Path("keypoint_detection/coco_keypoint_medium"), + path=Path("keypoint_detection/coco_keypoint/medium"), + group="medium", + num_repeat=5, + extra_overrides={}, + ), + Benchmark.Dataset( + name="coco_person_keypoint_large", + path=Path("keypoint_detection/coco_keypoint/large"), + group="large", + num_repeat=5, + extra_overrides={}, + ), + ] + + BENCHMARK_CRITERIA = [ # noqa: RUF012 + Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test(optimize)/e2e_time", 
summary="max", compare=">", margin=0.1), + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + criteria=self.BENCHMARK_CRITERIA, + ) + + +class TestPerfKeypointDetectionSingleObj(PerfTestBase): + """Benchmark visual prompting.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="keypoint_detection", name="rtmpose_tiny_single_obj", category="speed"), + ] + + DATASET_TEST_CASES: ClassVar = [ + Benchmark.Dataset( + name="coco_person_keypoint_single_obj_small", + path=Path("keypoint_detection/coco_keypoint_single_obj/small"), + group="small", + num_repeat=5, + extra_overrides={}, + ), + Benchmark.Dataset( + name="coco_person_keypoint_single_obj_medium", + path=Path("keypoint_detection/coco_keypoint_single_obj/medium"), group="medium", num_repeat=5, extra_overrides={}, ), Benchmark.Dataset( - name="mpii_large", - path=Path("keypoint_detection/mpii_large"), + name="coco_person_keypoint_single_obj_large", + path=Path("keypoint_detection/coco_keypoint_single_obj/large"), group="large", num_repeat=5, extra_overrides={}, From 9265c596b03754c91d4247f5d2fb55a145161e1e Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Fri, 30 Aug 2024 19:44:36 +0900 Subject: [PATCH 18/53] Fix cached dir for timm & hugging-face (#3914) * Fix cached dir * Pretrained weight download unit-test * Fix pre-commit --- src/otx/__init__.py | 11 +++++ src/otx/algo/classification/backbones/timm.py | 43 +++++-------------- .../h_label_cls/efficientnet_v2.yaml | 2 +- .../multi_class_cls/efficientnet_v2.yaml | 2 +- .../semisl/efficientnet_v2_semisl.yaml | 2 +- .../multi_label_cls/efficientnet_v2.yaml | 2 +- .../classification/backbones/test_timm.py | 16 ++++++- .../algo/classification/test_timm_model.py | 6 +-- 8 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/otx/__init__.py b/src/otx/__init__.py index e10623ab5c6..58cf9a5f332 100644 --- a/src/otx/__init__.py +++ b/src/otx/__init__.py @@ -5,8 +5,19 @@ __version__ = "2.2.0rc0" +import os +from pathlib import Path + from otx.core.types import * # noqa: F403 +# Set the value of HF_HUB_CACHE to set the cache folder that stores the pretrained weights for timm and huggingface. 
+# Refer: huggingface_hub/constants.py::HF_HUB_CACHE +# Default, Pretrained weight is saved into ~/.cache/torch/hub/checkpoints +os.environ["HF_HUB_CACHE"] = os.getenv( + "HF_HUB_CACHE", + str(Path.home() / ".cache" / "torch" / "hub" / "checkpoints"), +) + OTX_LOGO: str = """ ██████╗ ████████╗ ██╗ ██╗ diff --git a/src/otx/algo/classification/backbones/timm.py b/src/otx/algo/classification/backbones/timm.py index ec3e6fed06b..7bafa0b1dbb 100644 --- a/src/otx/algo/classification/backbones/timm.py +++ b/src/otx/algo/classification/backbones/timm.py @@ -15,34 +15,15 @@ import torch from torch import nn -from otx.algo.utils.mmengine_utils import load_from_http - -PRETRAINED_ROOT = "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/" -pretrained_urls = { - "efficientnetv2_s_21k": PRETRAINED_ROOT + "tf_efficientnetv2_s_21k-6337ad01.pth", - "efficientnetv2_s_1k": PRETRAINED_ROOT + "tf_efficientnetv2_s_21ft1k-d7dafa41.pth", -} - -TIMM_MODEL_NAME_DICT = { - "mobilenetv3_large_21k": "mobilenetv3_large_100_miil_in21k", - "mobilenetv3_large_1k": "mobilenetv3_large_100_miil", - "tresnet": "tresnet_m", - "efficientnetv2_s_21k": "tf_efficientnetv2_s.in21k", - "efficientnetv2_s_1k": "tf_efficientnetv2_s_in21ft1k", - "efficientnetv2_m_21k": "tf_efficientnetv2_m_in21k", - "efficientnetv2_m_1k": "tf_efficientnetv2_m_in21ft1k", - "efficientnetv2_b0": "tf_efficientnetv2_b0", -} - TimmModelType = Literal[ - "mobilenetv3_large_21k", - "mobilenetv3_large_1k", - "tresnet", - "efficientnetv2_s_21k", - "efficientnetv2_s_1k", - "efficientnetv2_m_21k", - "efficientnetv2_m_1k", - "efficientnetv2_b0", + "mobilenetv3_large_100_miil_in21k", + "mobilenetv3_large_100_miil", + "tresnet_m", + "tf_efficientnetv2_s.in21k", + "tf_efficientnetv2_s.in21ft1k", + "tf_efficientnetv2_m.in21k", + "tf_efficientnetv2_m.in21ft1k", + "tf_efficientnetv2_b0", ] @@ -60,14 +41,10 @@ def __init__( self.backbone = backbone self.pretrained: bool | dict = pretrained self.is_mobilenet = backbone.startswith("mobilenet") - if pretrained and self.backbone in pretrained_urls: - # This pretrained weight is saved into ~/.cache/torch/hub/checkpoints - # Otherwise, it is stored in ~/.cache/huggingface/hub. 
(timm defaults) - self.pretrained = load_from_http(filename=pretrained_urls[self.backbone]) self.model = timm.create_model( - TIMM_MODEL_NAME_DICT[self.backbone], - pretrained=self.pretrained, + self.backbone, + pretrained=pretrained, num_classes=1000, ) diff --git a/src/otx/recipe/classification/h_label_cls/efficientnet_v2.yaml b/src/otx/recipe/classification/h_label_cls/efficientnet_v2.yaml index 9c06011a1c9..fc3f6abeab8 100644 --- a/src/otx/recipe/classification/h_label_cls/efficientnet_v2.yaml +++ b/src/otx/recipe/classification/h_label_cls/efficientnet_v2.yaml @@ -1,7 +1,7 @@ model: class_path: otx.algo.classification.timm_model.TimmModelForHLabelCls init_args: - backbone: efficientnetv2_s_21k + backbone: tf_efficientnetv2_s.in21k optimizer: class_path: torch.optim.SGD diff --git a/src/otx/recipe/classification/multi_class_cls/efficientnet_v2.yaml b/src/otx/recipe/classification/multi_class_cls/efficientnet_v2.yaml index 2ca3c354f73..0cb77ef8852 100644 --- a/src/otx/recipe/classification/multi_class_cls/efficientnet_v2.yaml +++ b/src/otx/recipe/classification/multi_class_cls/efficientnet_v2.yaml @@ -2,7 +2,7 @@ model: class_path: otx.algo.classification.timm_model.TimmModelForMulticlassCls init_args: label_info: 1000 - backbone: efficientnetv2_s_21k + backbone: tf_efficientnetv2_s.in21k optimizer: class_path: torch.optim.SGD diff --git a/src/otx/recipe/classification/multi_class_cls/semisl/efficientnet_v2_semisl.yaml b/src/otx/recipe/classification/multi_class_cls/semisl/efficientnet_v2_semisl.yaml index b1f87665bde..0bf81b8e05d 100644 --- a/src/otx/recipe/classification/multi_class_cls/semisl/efficientnet_v2_semisl.yaml +++ b/src/otx/recipe/classification/multi_class_cls/semisl/efficientnet_v2_semisl.yaml @@ -2,7 +2,7 @@ model: class_path: otx.algo.classification.timm_model.TimmModelForMulticlassCls init_args: label_info: 1000 - backbone: efficientnetv2_s_21k + backbone: tf_efficientnetv2_s.in21k train_type: SEMI_SUPERVISED optimizer: diff --git a/src/otx/recipe/classification/multi_label_cls/efficientnet_v2.yaml b/src/otx/recipe/classification/multi_label_cls/efficientnet_v2.yaml index cc6ec415ec2..87177eb1e17 100644 --- a/src/otx/recipe/classification/multi_label_cls/efficientnet_v2.yaml +++ b/src/otx/recipe/classification/multi_label_cls/efficientnet_v2.yaml @@ -2,7 +2,7 @@ model: class_path: otx.algo.classification.timm_model.TimmModelForMultilabelCls init_args: label_info: 1000 - backbone: efficientnetv2_s_21k + backbone: tf_efficientnetv2_s.in21k optimizer: class_path: torch.optim.SGD diff --git a/tests/unit/algo/classification/backbones/test_timm.py b/tests/unit/algo/classification/backbones/test_timm.py index 7a1bb6a657b..800f45520f3 100644 --- a/tests/unit/algo/classification/backbones/test_timm.py +++ b/tests/unit/algo/classification/backbones/test_timm.py @@ -1,16 +1,28 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import os +import shutil +from pathlib import Path + import torch from otx.algo.classification.backbones.timm import TimmBackbone class TestOTXEfficientNetV2: def test_forward(self): - model = TimmBackbone(backbone="efficientnetv2_s_21k") + model = TimmBackbone(backbone="tf_efficientnetv2_s.in21k") assert model(torch.randn(1, 3, 244, 244))[0].shape == torch.Size([1, 1280, 8, 8]) def test_get_config_optim(self): - model = TimmBackbone(backbone="efficientnetv2_s_21k") + model = TimmBackbone(backbone="tf_efficientnetv2_s.in21k") assert model.get_config_optim([0.01])[0]["lr"] == 0.01 assert model.get_config_optim(0.01)[0]["lr"] == 
0.01 + + def test_check_pretrained_weight_download(self): + target = Path(os.environ.get("HF_HUB_CACHE")) / "models--timm--tf_efficientnetv2_s.in21k" + if target.exists(): + shutil.rmtree(target) + assert not target.exists() + TimmBackbone(backbone="tf_efficientnetv2_s.in21k", pretrained=True) + assert target.exists() diff --git a/tests/unit/algo/classification/test_timm_model.py b/tests/unit/algo/classification/test_timm_model.py index b20bcf7eba9..1cacecf2b5b 100644 --- a/tests/unit/algo/classification/test_timm_model.py +++ b/tests/unit/algo/classification/test_timm_model.py @@ -21,7 +21,7 @@ def fxt_multi_class_cls_model(): return TimmModelForMulticlassCls( label_info=10, - backbone="efficientnetv2_s_21k", + backbone="tf_efficientnetv2_s.in21k", ) @@ -59,7 +59,7 @@ def test_predict_step(self, fxt_multi_class_cls_model, fxt_multiclass_cls_batch_ def fxt_multi_label_cls_model(): return TimmModelForMultilabelCls( label_info=10, - backbone="efficientnetv2_s_21k", + backbone="tf_efficientnetv2_s.in21k", ) @@ -97,7 +97,7 @@ def test_predict_step(self, fxt_multi_label_cls_model, fxt_multilabel_cls_batch_ def fxt_h_label_cls_model(fxt_hlabel_cifar): return TimmModelForHLabelCls( label_info=fxt_hlabel_cifar, - backbone="efficientnetv2_s_21k", + backbone="tf_efficientnetv2_s.in21k", ) From 517073674948547c5ac8080be94fa240d2355984 Mon Sep 17 00:00:00 2001 From: Yunchu Lee Date: Fri, 30 Aug 2024 19:54:21 +0900 Subject: [PATCH 19/53] Fix wrong template id mapping for anomaly (#3916) --- src/otx/tools/converter.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index 5b19febd73d..e7f8e980855 100644 --- a/src/otx/tools/converter.py +++ b/src/otx/tools/converter.py @@ -122,29 +122,29 @@ }, # ANOMALY_CLASSIFICATION "ote_anomaly_classification_padim": { - "task": OTXTaskType.SEMANTIC_SEGMENTATION, + "task": OTXTaskType.ANOMALY_CLASSIFICATION, "model_name": "padim", }, "ote_anomaly_classification_stfpm": { - "task": OTXTaskType.SEMANTIC_SEGMENTATION, + "task": OTXTaskType.ANOMALY_CLASSIFICATION, "model_name": "stfpm", }, # ANOMALY_DETECTION "ote_anomaly_detection_padim": { - "task": OTXTaskType.SEMANTIC_SEGMENTATION, + "task": OTXTaskType.ANOMALY_DETECTION, "model_name": "padim", }, "ote_anomaly_detection_stfpm": { - "task": OTXTaskType.SEMANTIC_SEGMENTATION, + "task": OTXTaskType.ANOMALY_DETECTION, "model_name": "stfpm", }, # ANOMALY_SEGMENTATION "ote_anomaly_segmentation_padim": { - "task": OTXTaskType.SEMANTIC_SEGMENTATION, + "task": OTXTaskType.ANOMALY_SEGMENTATION, "model_name": "padim", }, "ote_anomaly_segmentation_stfpm": { - "task": OTXTaskType.SEMANTIC_SEGMENTATION, + "task": OTXTaskType.ANOMALY_SEGMENTATION, "model_name": "stfpm", }, } From f611cc1ebd6d677bb75add55d9ded8f79062b734 Mon Sep 17 00:00:00 2001 From: Yunchu Lee Date: Fri, 30 Aug 2024 19:54:59 +0900 Subject: [PATCH 20/53] Update script to allow setting otx version using env. variable (#3913) --- docker/build.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker/build.sh b/docker/build.sh index 5888516d481..8d4085dd5c5 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -1,7 +1,9 @@ #!/bin/bash -# shellcheck disable=SC2154 +# shellcheck disable=SC2154,SC2035,SC2046 -OTX_VERSION=$(python -c 'import otx; print(otx.__version__)') +if [ "$OTX_VERSION" == "" ]; then + OTX_VERSION=$(python -c 'import otx; print(otx.__version__)') +fi THIS_DIR=$(dirname "$0") echo "Build OTX ${OTX_VERSION} CUDA Docker image..." 
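With the docker/build.sh change just above, the image version can now be pinned from the environment, e.g. OTX_VERSION=2.2.0rc0 ./docker/build.sh ... (any remaining arguments unchanged); when the variable is unset, the script still falls back to the version of the installed otx package.

For PATCH 18 above, a short illustrative check of the new weight-caching behaviour; this is only a sketch and assumes otx is imported before timm/huggingface_hub so the HF_HUB_CACHE default takes effect, and pretrained=True will trigger a weight download:

    import os
    from pathlib import Path

    import otx  # sets HF_HUB_CACHE to ~/.cache/torch/hub/checkpoints unless already defined
    import timm

    # Backbones now pass timm's own dotted model names straight to create_model.
    backbone = timm.create_model("tf_efficientnetv2_s.in21k", pretrained=True, num_classes=1000)

    # Mirrors the new unit test: the downloaded weights land under the HF hub cache layout.
    cached = Path(os.environ["HF_HUB_CACHE"]) / "models--timm--tf_efficientnetv2_s.in21k"
    print(cached.exists())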
From 425a479a8af1e721e25db0617a8cb3682272124f Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Mon, 2 Sep 2024 11:48:12 +0900 Subject: [PATCH 21/53] Fix Datamodule creation for OV in AutoConfigurator (#3920) Fix datamodule for ov --- src/otx/engine/utils/auto_configurator.py | 1 + tests/unit/engine/utils/test_auto_configurator.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 5b2b50f33bd..cdafe49b248 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -410,6 +410,7 @@ def update_ov_subset_pipeline(self, datamodule: OTXDataModule, subset: str = "te subset_config.to_tv_image = ov_config[f"{subset}_subset"]["to_tv_image"] datamodule.image_color_channel = ov_config["image_color_channel"] datamodule.tile_config.enable_tiler = False + datamodule.unlabeled_subset.data_root = None msg = ( f"For OpenVINO IR models, Update the following {subset} \n" f"\t transforms: {subset_config.transforms} \n" diff --git a/tests/unit/engine/utils/test_auto_configurator.py b/tests/unit/engine/utils/test_auto_configurator.py index 0b28d834168..9e1c273ea15 100644 --- a/tests/unit/engine/utils/test_auto_configurator.py +++ b/tests/unit/engine/utils/test_auto_configurator.py @@ -212,3 +212,5 @@ def test_update_ov_subset_pipeline(self) -> None: assert updated_datamodule.test_subset.transforms == [{"class_path": "torchvision.transforms.v2.ToImage"}] assert updated_datamodule.test_subset.transform_lib_type == TransformLibType.TORCHVISION + assert not updated_datamodule.tile_config.enable_tiler + assert updated_datamodule.unlabeled_subset.data_root is None From 7f1c7da57365ae362b81542616a8d0ed6517a8f9 Mon Sep 17 00:00:00 2001 From: Yunchu Lee Date: Mon, 2 Sep 2024 16:06:09 +0900 Subject: [PATCH 22/53] Update tpp file for 2.2.0 (#3921) --- third-party-programs.txt | 232 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) diff --git a/third-party-programs.txt b/third-party-programs.txt index 3b40683feeb..3297eebc18f 100644 --- a/third-party-programs.txt +++ b/third-party-programs.txt @@ -2473,3 +2473,235 @@ Apache-2.0 See the License for the specific language governing permissions and limitations under the License. ------------------------------------------------------------- +typeguard + +MIT + +This is the MIT license: http://www.opensource.org/licenses/mit-license.php + +Copyright (c) Alex Grönholm + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons +to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE +FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+------------------------------------------------------------- +transformers + +Apache-2.0 + +Copyright 2018- The Hugging Face team. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +------------------------------------------------------------- From 51d1adfb7445429dadff9a7959f0a172e44060b5 Mon Sep 17 00:00:00 2001 From: Prokofiev Kirill Date: Tue, 3 Sep 2024 02:30:57 +0200 Subject: [PATCH 23/53] Fix names for ignored scope [HOT-FIX, 2.2.0] (#3924) fix names for ignored scope --- src/otx/algo/segmentation/litehrnet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py index 11b20d421f7..73ade597fd8 100644 --- a/src/otx/algo/segmentation/litehrnet.py +++ b/src/otx/algo/segmentation/litehrnet.py @@ -87,7 +87,7 @@ def _exporter(self) -> OTXModelExporter: @property def ignore_scope(self) -> dict[str, Any]: """Get the ignored scope for LiteHRNet.""" - if self.model_version == "large": + if self.model_version == "lite_hrnet_x": return { "ignored_scope": { "patterns": ["__module.model.decode_head.aggregator/*"], @@ -175,7 +175,7 @@ def ignore_scope(self) -> dict[str, Any]: "preset": "performance", } - if self.model_version == "medium": + if self.model_version == "lite_hrnet_18": return { "ignored_scope": { "patterns": ["__module.model.backbone/*"], @@ -263,7 +263,7 @@ def ignore_scope(self) -> dict[str, Any]: "preset": "mixed", } - if self.model_version == "small": + if self.model_version == "lite_hrnet_s": return { "ignored_scope": { "names": [ From 2bcf1b2812cf962a4eb3bbb7dc6d945045f5e52b Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Tue, 3 Sep 2024 04:40:24 +0200 Subject: [PATCH 24/53] Fix classification rt_info (#3922) * Restore output_raw_scores for classificaiton * Add uts * Fix linter --- src/otx/core/model/classification.py | 3 +++ src/otx/core/types/export.py | 6 ++++++ tests/unit/core/model/test_classification.py | 1 + tests/unit/core/types/test_export.py | 3 +++ 4 files changed, 13 insertions(+) diff --git a/src/otx/core/model/classification.py b/src/otx/core/model/classification.py index 7e042f19ed2..d5615014959 100644 --- a/src/otx/core/model/classification.py +++ b/src/otx/core/model/classification.py @@ -154,6 +154,7 @@ def _export_parameters(self) -> TaskLevelExportParameters: task_type="classification", multilabel=False, hierarchical=False, + output_raw_scores=True, ) @property @@ -279,6 +280,7 @@ def _export_parameters(self) -> TaskLevelExportParameters: multilabel=True, hierarchical=False, confidence_threshold=0.5, + output_raw_scores=True, ) @property @@ -401,6 +403,7 @@ def _export_parameters(self) -> TaskLevelExportParameters: multilabel=False, hierarchical=True, confidence_threshold=0.5, + output_raw_scores=True, ) @property diff --git a/src/otx/core/types/export.py b/src/otx/core/types/export.py index 932990a02ce..7f64febe607 100644 --- a/src/otx/core/types/export.py +++ b/src/otx/core/types/export.py @@ -34,6 +34,8 @@ class TaskLevelExportParameters: Only specified for the classification task. hierarchical (bool | None): Whether it is hierarchical or not. Only specified for the classification task. + output_raw_scores (bool | None): Whether to output raw scores. + Only specified for the classification task. 
confidence_threshold (float | None): Confidence threshold for model prediction probability. It is used only for classification tasks, detection and instance segmentation tasks. iou_threshold (float | None): The Intersection over Union (IoU) threshold @@ -60,6 +62,7 @@ class TaskLevelExportParameters: # (Optional) Classification tasks multilabel: bool | None = None hierarchical: bool | None = None + output_raw_scores: bool | None = None # (Optional) Classification tasks, detection and instance segmentation task confidence_threshold: float | None = None @@ -133,6 +136,9 @@ def to_metadata(self) -> dict[tuple[str, str], str]: if self.hierarchical is not None: metadata[("model_info", "hierarchical")] = str(self.hierarchical) + if self.output_raw_scores is not None: + metadata[("model_info", "output_raw_scores")] = str(self.output_raw_scores) + if self.confidence_threshold is not None: metadata[("model_info", "confidence_threshold")] = str(self.confidence_threshold) diff --git a/tests/unit/core/model/test_classification.py b/tests/unit/core/model/test_classification.py index 352bf9a331d..28430f309a4 100644 --- a/tests/unit/core/model/test_classification.py +++ b/tests/unit/core/model/test_classification.py @@ -48,6 +48,7 @@ def test_export_parameters( assert model._export_parameters.task_type.lower() == "classification" assert not model._export_parameters.multilabel assert not model._export_parameters.hierarchical + assert model._export_parameters.output_raw_scores model = OTXMultilabelClsModel( label_info=1, diff --git a/tests/unit/core/types/test_export.py b/tests/unit/core/types/test_export.py index 5a85f3b6fc8..72add6f3c31 100644 --- a/tests/unit/core/types/test_export.py +++ b/tests/unit/core/types/test_export.py @@ -17,6 +17,7 @@ def test_wrap(fxt_label_info, task_type): multilabel = False hierarchical = False + output_raw_scores = True confidence_threshold = 0.0 iou_threshold = 0.0 return_soft_prediction = False @@ -27,6 +28,7 @@ def test_wrap(fxt_label_info, task_type): params = params.wrap( multilabel=multilabel, hierarchical=hierarchical, + output_raw_scores=output_raw_scores, confidence_threshold=confidence_threshold, iou_threshold=iou_threshold, return_soft_prediction=return_soft_prediction, @@ -44,6 +46,7 @@ def test_wrap(fxt_label_info, task_type): assert metadata[("model_info", "return_soft_prediction")] == str(return_soft_prediction) assert metadata[("model_info", "soft_threshold")] == str(soft_threshold) assert metadata[("model_info", "blur_strength")] == str(blur_strength) + assert metadata[("model_info", "output_raw_scores")] == str(output_raw_scores) # Tile config assert ("model_info", "tile_size") in metadata From 112b2b2cb72636f5328cf31cf3d6f2e83a1e25fe Mon Sep 17 00:00:00 2001 From: Ashwin Vaidya Date: Wed, 4 Sep 2024 06:56:01 +0200 Subject: [PATCH 25/53] Update label info (#3925) add label info to init Signed-off-by: Ashwin Vaidya --- src/otx/algo/anomaly/padim.py | 5 ++++- src/otx/algo/anomaly/stfpm.py | 5 ++++- src/otx/core/model/anomaly.py | 7 ++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/otx/algo/anomaly/padim.py b/src/otx/algo/anomaly/padim.py index f667efa897d..091d652e9e3 100644 --- a/src/otx/algo/anomaly/padim.py +++ b/src/otx/algo/anomaly/padim.py @@ -14,6 +14,7 @@ from anomalib.models.image import Padim as AnomalibPadim from otx.core.model.anomaly import OTXAnomaly +from otx.core.types.label import AnomalyLabelInfo from otx.core.types.task import OTXTaskType if TYPE_CHECKING: @@ -21,6 +22,7 @@ from torch.optim.optimizer import 
Optimizer from otx.core.model.anomaly import AnomalyModelInputs, AnomalyModelOutputs + from otx.core.types.label import LabelInfoTypes class Padim(OTXAnomaly, AnomalibPadim): @@ -40,6 +42,7 @@ class Padim(OTXAnomaly, AnomalibPadim): def __init__( self, + label_info: LabelInfoTypes = AnomalyLabelInfo(), backbone: str = "resnet18", layers: list[str] = ["layer1", "layer2", "layer3"], # noqa: B006 pre_trained: bool = True, @@ -51,7 +54,7 @@ def __init__( ] = OTXTaskType.ANOMALY_CLASSIFICATION, input_size: tuple[int, int] = (256, 256), ) -> None: - OTXAnomaly.__init__(self, input_size) + OTXAnomaly.__init__(self, label_info=label_info, input_size=input_size) AnomalibPadim.__init__( self, backbone=backbone, diff --git a/src/otx/algo/anomaly/stfpm.py b/src/otx/algo/anomaly/stfpm.py index 614d3ad52f9..7e0696ad08e 100644 --- a/src/otx/algo/anomaly/stfpm.py +++ b/src/otx/algo/anomaly/stfpm.py @@ -14,6 +14,7 @@ from anomalib.models.image.stfpm import Stfpm as AnomalibStfpm from otx.core.model.anomaly import OTXAnomaly +from otx.core.types.label import AnomalyLabelInfo from otx.core.types.task import OTXTaskType if TYPE_CHECKING: @@ -21,6 +22,7 @@ from torch.optim.optimizer import Optimizer from otx.core.model.anomaly import AnomalyModelInputs, AnomalyModelOutputs + from otx.core.types.label import LabelInfoTypes class Stfpm(OTXAnomaly, AnomalibStfpm): @@ -38,6 +40,7 @@ class Stfpm(OTXAnomaly, AnomalibStfpm): def __init__( self, + label_info: LabelInfoTypes = AnomalyLabelInfo(), layers: Sequence[str] = ["layer1", "layer2", "layer3"], backbone: str = "resnet18", task: Literal[ @@ -48,7 +51,7 @@ def __init__( input_size: tuple[int, int] = (256, 256), **kwargs, ) -> None: - OTXAnomaly.__init__(self, input_size=input_size) + OTXAnomaly.__init__(self, label_info=label_info, input_size=input_size) AnomalibStfpm.__init__( self, backbone=backbone, diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py index 68abff41a59..cec2359a70f 100644 --- a/src/otx/core/model/anomaly.py +++ b/src/otx/core/model/anomaly.py @@ -39,7 +39,8 @@ from lightning.pytorch.callbacks.callback import Callback from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable from torchmetrics import Metric -from otx.core.types.label import AnomalyLabelInfo + + from otx.core.types.label import LabelInfoTypes AnomalyModelInputs: TypeAlias = ( AnomalyClassificationDataBatch | AnomalySegmentationDataBatch | AnomalyDetectionDataBatch @@ -57,8 +58,8 @@ class OTXAnomaly(OTXModel): Model input size in the order of height and width. Defaults to None. 
""" - def __init__(self, input_size: tuple[int, int]) -> None: - super().__init__(label_info=AnomalyLabelInfo(), input_size=input_size) + def __init__(self, label_info: LabelInfoTypes, input_size: tuple[int, int]) -> None: + super().__init__(label_info=label_info, input_size=input_size) self.optimizer: list[OptimizerCallable] | OptimizerCallable = None self.scheduler: list[LRSchedulerCallable] | LRSchedulerCallable = None self.trainer: Trainer From 929132d16ffa0c68fb4c46f8cb0b377c11b63b4a Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Thu, 5 Sep 2024 09:06:16 +0900 Subject: [PATCH 26/53] Fix binary classification metric task (#3928) * Fix binary classification * Add unit-tests --- src/otx/core/metrics/accuracy.py | 4 +++- tests/unit/core/metrics/test_accuracy.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/otx/core/metrics/accuracy.py b/src/otx/core/metrics/accuracy.py index 1ddd0ce4b99..2f6139ba364 100644 --- a/src/otx/core/metrics/accuracy.py +++ b/src/otx/core/metrics/accuracy.py @@ -346,8 +346,10 @@ def compute(self) -> torch.Tensor: def _multi_class_cls_metric_callable(label_info: LabelInfo) -> MetricCollection: + num_classes = label_info.num_classes + task = "binary" if num_classes == 1 else "multiclass" return MetricCollection( - {"accuracy": TorchmetricAcc(task="multiclass", num_classes=label_info.num_classes)}, + {"accuracy": TorchmetricAcc(task=task, num_classes=num_classes)}, ) diff --git a/tests/unit/core/metrics/test_accuracy.py b/tests/unit/core/metrics/test_accuracy.py index d04d253575f..8370fee09f6 100644 --- a/tests/unit/core/metrics/test_accuracy.py +++ b/tests/unit/core/metrics/test_accuracy.py @@ -9,9 +9,11 @@ HlabelAccuracy, MixedHLabelAccuracy, MulticlassAccuracywithLabelGroup, + MultiClassClsMetricCallable, MultilabelAccuracywithLabelGroup, ) from otx.core.types.label import HLabelInfo, LabelInfo +from torchmetrics.classification.accuracy import BinaryAccuracy, MulticlassAccuracy class TestAccuracy: @@ -45,6 +47,16 @@ def test_multiclass_accuracy(self, fxt_multiclass_labelinfo: LabelInfo) -> None: acc = result["accuracy"] assert round(acc.item(), 3) == 0.792 + def test_default_multi_class_cls_metric_callable(self, fxt_multiclass_labelinfo: LabelInfo) -> None: + assert fxt_multiclass_labelinfo.num_classes > 1 + metric = MultiClassClsMetricCallable(fxt_multiclass_labelinfo) + assert isinstance(metric.accuracy, MulticlassAccuracy) + + one_class_label_info = LabelInfo(label_names=["class1"], label_groups=[["class1"]]) + assert one_class_label_info.num_classes == 1 + binary_metric = MultiClassClsMetricCallable(one_class_label_info) + assert isinstance(binary_metric.accuracy, BinaryAccuracy) + def test_multilabel_accuracy(self, fxt_multilabel_labelinfo: LabelInfo) -> None: """Check whether accuracy is same with OTX1.x version.""" preds = [ From 706f99b02662e32418ebe23b8086ad7aba11098d Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Thu, 5 Sep 2024 02:36:13 +0100 Subject: [PATCH 27/53] Improve MaskRCNN SwinT NNCF (#3929) * ignore heads and disable smooth quant * add activations_range_estimator_params * update changelog --- CHANGELOG.md | 2 ++ .../algo/instance_segmentation/maskrcnn.py | 23 ++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 460454bad2a..37184eb4fb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -189,6 +189,8 @@ All notable changes to this project will be documented in this file. 
() - Fix MaskRCNN SwinT NNCF Accuracy Drop () +- Fix MaskRCNN SwinT NNCF Accuracy Drop By Adding More PTQ Configs + () ### Known issues diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py index 100b4b6f7b2..ede10ace96a 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn.py +++ b/src/otx/algo/instance_segmentation/maskrcnn.py @@ -675,4 +675,25 @@ def _build_model(self, num_classes: int) -> TwoStageDetector: @property def _optimization_config(self) -> dict[str, Any]: """PTQ config for MaskRCNN-SwinT.""" - return {"model_type": "transformer"} + return { + "model_type": "transformer", + "ignored_scope": { + "patterns": [".*head.*"], + "validate": False, + }, + "advanced_parameters": { + "smooth_quant_alpha": -1, + "activations_range_estimator_params": { + "min": { + "statistics_type": "QUANTILE", + "aggregator_type": "MIN", + "quantile_outlier_prob": "1e-4", + }, + "max": { + "statistics_type": "QUANTILE", + "aggregator_type": "MAX", + "quantile_outlier_prob": "1e-4", + }, + }, + }, + } From 53a7d9ab60a87d2cf43960cd1c678fdd80565e90 Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Thu, 5 Sep 2024 13:31:26 +0900 Subject: [PATCH 28/53] Fix get_item for Chained Tasks in Classification (#3931) * Fix Task Chain * Add multi-label case as well * Add multi-label case as well2 * Add H-label case --- src/otx/core/data/dataset/classification.py | 33 ++++++- tests/unit/core/data/conftest.py | 40 +++++++++ .../core/data/dataset/test_classification.py | 90 ++++++++++++++++++- 3 files changed, 159 insertions(+), 4 deletions(-) diff --git a/src/otx/core/data/dataset/classification.py b/src/otx/core/data/dataset/classification.py index 57170da967b..c5048dd7987 100644 --- a/src/otx/core/data/dataset/classification.py +++ b/src/otx/core/data/dataset/classification.py @@ -34,7 +34,16 @@ def _get_item_impl(self, index: int) -> MulticlassClsDataEntity | None: img = item.media_as(Image) img_data, img_shape = self._get_img_data_and_shape(img) - label_anns = [ann for ann in item.annotations if isinstance(ann, Label)] + label_anns = [] + for ann in item.annotations: + if isinstance(ann, Label): + label_anns.append(ann) + else: + # If the annotation is not Label, it should be converted to Label. + # For Chained Task: Detection (Bbox) -> Classification (Label) + label = Label(label=ann.label) + if label not in label_anns: + label_anns.append(label) if len(label_anns) > 1: msg = f"Multi-class Classification can't use the multi-label, currently len(labels) = {len(label_anns)}" raise ValueError(msg) @@ -71,7 +80,16 @@ def _get_item_impl(self, index: int) -> MultilabelClsDataEntity | None: ignored_labels: list[int] = [] # This should be assigned form item img_data, img_shape = self._get_img_data_and_shape(img) - label_anns = [ann for ann in item.annotations if isinstance(ann, Label)] + label_anns = [] + for ann in item.annotations: + if isinstance(ann, Label): + label_anns.append(ann) + else: + # If the annotation is not Label, it should be converted to Label. 
+ # For Chained Task: Detection (Bbox) -> Classification (Label) + label = Label(label=ann.label) + if label not in label_anns: + label_anns.append(label) labels = torch.as_tensor([ann.label for ann in label_anns]) entity = MultilabelClsDataEntity( @@ -179,7 +197,16 @@ def _get_item_impl(self, index: int) -> HlabelClsDataEntity | None: ignored_labels: list[int] = [] # This should be assigned form item img_data, img_shape = self._get_img_data_and_shape(img) - label_anns = [ann for ann in item.annotations if isinstance(ann, Label)] + label_anns = [] + for ann in item.annotations: + if isinstance(ann, Label): + label_anns.append(ann) + else: + # If the annotation is not Label, it should be converted to Label. + # For Chained Task: Detection (Bbox) -> Classification (Label) + label = Label(label=ann.label) + if label not in label_anns: + label_anns.append(label) hlabel_labels = self._convert_label_to_hlabel_format(label_anns, ignored_labels) entity = HlabelClsDataEntity( diff --git a/tests/unit/core/data/conftest.py b/tests/unit/core/data/conftest.py index 78ee2b1fb8c..6c3ca23dde6 100644 --- a/tests/unit/core/data/conftest.py +++ b/tests/unit/core/data/conftest.py @@ -96,6 +96,37 @@ def fxt_dm_item(request, tmpdir) -> DatasetItem: ) +@pytest.fixture(params=["bytes", "file"]) +def fxt_dm_item_bbox_only(request, tmpdir) -> DatasetItem: + np_img = np.zeros(shape=(10, 10, 3), dtype=np.uint8) + np_img[:, :, 0] = 0 # Set 0 for B channel + np_img[:, :, 1] = 1 # Set 1 for G channel + np_img[:, :, 2] = 2 # Set 2 for R channel + + if request.param == "bytes": + _, np_bytes = cv2.imencode(".png", np_img) + media = Image.from_bytes(np_bytes.tobytes()) + media.path = "" + elif request.param == "file": + fname = str(uuid.uuid4()) + fpath = str(Path(tmpdir) / f"{fname}.png") + cv2.imwrite(fpath, np_img) + media = Image.from_file(fpath) + else: + raise ValueError(request.param) + + return DatasetItem( + id="item", + subset="train", + media=media, + annotations=[ + Bbox(x=0, y=0, w=1, h=1, label=0), + Bbox(x=1, y=0, w=1, h=1, label=0), + Bbox(x=1, y=1, w=1, h=1, label=0), + ], + ) + + @pytest.fixture() def fxt_mock_dm_subset(mocker: MockerFixture, fxt_dm_item: DatasetItem) -> MagicMock: mock_dm_subset = mocker.MagicMock(spec=DmDataset) @@ -105,6 +136,15 @@ def fxt_mock_dm_subset(mocker: MockerFixture, fxt_dm_item: DatasetItem) -> Magic return mock_dm_subset +@pytest.fixture() +def fxt_mock_det_dm_subset(mocker: MockerFixture, fxt_dm_item_bbox_only: DatasetItem) -> MagicMock: + mock_dm_subset = mocker.MagicMock(spec=DmDataset) + mock_dm_subset.__getitem__.return_value = fxt_dm_item_bbox_only + mock_dm_subset.__len__.return_value = 1 + mock_dm_subset.categories().__getitem__.return_value = LabelCategories.from_iterable(_LABEL_NAMES) + return mock_dm_subset + + @pytest.fixture( params=[ (OTXHlabelClsDataset, HlabelClsDataEntity, {}), diff --git a/tests/unit/core/data/dataset/test_classification.py b/tests/unit/core/data/dataset/test_classification.py index 8bef7ffa4e2..bf2da750d9a 100644 --- a/tests/unit/core/data/dataset/test_classification.py +++ b/tests/unit/core/data/dataset/test_classification.py @@ -5,7 +5,65 @@ from unittest.mock import MagicMock -from otx.core.data.dataset.classification import OTXHlabelClsDataset +from otx.core.data.dataset.classification import ( + HLabelInfo, + OTXHlabelClsDataset, + OTXMulticlassClsDataset, + OTXMultilabelClsDataset, +) +from otx.core.data.entity.classification import HlabelClsDataEntity, MulticlassClsDataEntity, MultilabelClsDataEntity + + +class 
TestOTXMulticlassClsDataset: + def test_get_item( + self, + fxt_mock_dm_subset, + ) -> None: + dataset = OTXMulticlassClsDataset( + dm_subset=fxt_mock_dm_subset, + transforms=[lambda x: x], + mem_cache_img_max_size=None, + max_refetch=3, + ) + assert isinstance(dataset[0], MulticlassClsDataEntity) + + def test_get_item_from_bbox_dataset( + self, + fxt_mock_det_dm_subset, + ) -> None: + dataset = OTXMulticlassClsDataset( + dm_subset=fxt_mock_det_dm_subset, + transforms=[lambda x: x], + mem_cache_img_max_size=None, + max_refetch=3, + ) + assert isinstance(dataset[0], MulticlassClsDataEntity) + + +class TestOTXMultilabelClsDataset: + def test_get_item( + self, + fxt_mock_dm_subset, + ) -> None: + dataset = OTXMultilabelClsDataset( + dm_subset=fxt_mock_dm_subset, + transforms=[lambda x: x], + mem_cache_img_max_size=None, + max_refetch=3, + ) + assert isinstance(dataset[0], MultilabelClsDataEntity) + + def test_get_item_from_bbox_dataset( + self, + fxt_mock_det_dm_subset, + ) -> None: + dataset = OTXMultilabelClsDataset( + dm_subset=fxt_mock_det_dm_subset, + transforms=[lambda x: x], + mem_cache_img_max_size=None, + max_refetch=3, + ) + assert isinstance(dataset[0], MultilabelClsDataEntity) class TestOTXHlabelClsDataset: @@ -20,3 +78,33 @@ def test_add_ancestors(self, fxt_hlabel_dataset_subset): # Added the ancestor adjusted_anns = hlabel_dataset.dm_subset.get(id=0, subset="train").annotations assert len(adjusted_anns) == 2 + + def test_get_item( + self, + mocker, + fxt_mock_dm_subset, + fxt_mock_hlabelinfo, + ) -> None: + mocker.patch.object(HLabelInfo, "from_dm_label_groups", return_value=fxt_mock_hlabelinfo) + dataset = OTXHlabelClsDataset( + dm_subset=fxt_mock_dm_subset, + transforms=[lambda x: x], + mem_cache_img_max_size=None, + max_refetch=3, + ) + assert isinstance(dataset[0], HlabelClsDataEntity) + + def test_get_item_from_bbox_dataset( + self, + mocker, + fxt_mock_det_dm_subset, + fxt_mock_hlabelinfo, + ) -> None: + mocker.patch.object(HLabelInfo, "from_dm_label_groups", return_value=fxt_mock_hlabelinfo) + dataset = OTXHlabelClsDataset( + dm_subset=fxt_mock_det_dm_subset, + transforms=[lambda x: x], + mem_cache_img_max_size=None, + max_refetch=3, + ) + assert isinstance(dataset[0], HlabelClsDataEntity) From c3749e30b5bfbfd257050e2005e456eac076ad7d Mon Sep 17 00:00:00 2001 From: Sooah Lee Date: Thu, 5 Sep 2024 16:41:51 +0900 Subject: [PATCH 29/53] Correct Keyerror for h-label cls in label_groups for dm_label_categories using label's id/key (#3932) Modify label_groups for dm_label_categories with id/key of label --- src/otx/core/types/label.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/otx/core/types/label.py b/src/otx/core/types/label.py index cd472965336..7f00aa0b496 100644 --- a/src/otx/core/types/label.py +++ b/src/otx/core/types/label.py @@ -229,7 +229,32 @@ def get_label_tree_edges(dm_label_items: list[LabelCategories]) -> list[list[str """Get label tree edges information. 
Each edges represent [child, parent].""" return [[item.name, item.parent] for item in dm_label_items if item.parent != ""] - all_groups = [label_group.labels for label_group in dm_label_categories.label_groups] + def convert_labels_if_needed( + dm_label_categories: LabelCategories, + label_names: list[str], + ) -> list[list[str]]: + # Check if the labels need conversion and create name to ID mapping if required + name_to_id_mapping = None + for label_group in dm_label_categories.label_groups: + if label_group.labels and label_group.labels[0] not in label_names: + name_to_id_mapping = { + attr[len("__name__") :]: category.name + for category in dm_label_categories.items + for attr in category.attributes + if attr.startswith("__name__") + } + break + + # If mapping exists, update the labels + if name_to_id_mapping: + for label_group in dm_label_categories.label_groups: + label_group.labels = [name_to_id_mapping.get(label, label) for label in label_group.labels] + + # Retrieve all label groups after conversion + return [group.labels for group in dm_label_categories.label_groups] + + label_names = [item.name for item in dm_label_categories.items] + all_groups = convert_labels_if_needed(dm_label_categories, label_names) exclusive_group_info = get_exclusive_group_info(all_groups) single_label_group_info = get_single_label_group_info(all_groups, exclusive_group_info["num_multiclass_heads"]) @@ -240,7 +265,7 @@ def get_label_tree_edges(dm_label_items: list[LabelCategories]) -> list[list[str ) return HLabelInfo( - label_names=[item.name for item in dm_label_categories.items], + label_names=label_names, label_groups=all_groups, num_multiclass_heads=exclusive_group_info["num_multiclass_heads"], num_multilabel_classes=single_label_group_info["num_multilabel_classes"], From 98a9cacade7ebacc35b3f47d848b54064c5c64be Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Fri, 6 Sep 2024 01:04:08 +0100 Subject: [PATCH 30/53] Remove datumaro attribute id from tiling, add subset names (#3933) * remove datumaro attribute id from tiling * add subset names --- src/otx/core/data/dataset/tile.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/otx/core/data/dataset/tile.py b/src/otx/core/data/dataset/tile.py index a1fcee48621..73ab24fb4ea 100644 --- a/src/otx/core/data/dataset/tile.py +++ b/src/otx/core/data/dataset/tile.py @@ -53,6 +53,10 @@ # NOTE: Disable private-member-access (SLF001). # This is a workaround so we could apply the same transforms to tiles as the original dataset. +# NOTE: Datumaro subset name should be standardized. +TRAIN_SUBSET_NAMES = ("train", "TRAINING") +VAL_SUBSET_NAMES = ("val", "VALIDATION") + class OTXTileTransform(Tile): """OTX tile transform. @@ -188,7 +192,7 @@ def create( Returns: OTXTileDataset: Tile dataset. """ - if dataset.dm_subset[0].subset == "train": + if dataset.dm_subset[0].subset in TRAIN_SUBSET_NAMES: return OTXTileTrainDataset(dataset, tile_config) if task == OTXTaskType.DETECTION: @@ -230,12 +234,17 @@ def _get_item_impl(self, index: int) -> OTXDataEntity | None: """Get item implementation from the original dataset.""" return self._dataset._get_item_impl(index) - def _convert_entity(self, image: np.ndarray, dataset_item: DatasetItem) -> OTXDataEntity: + def _convert_entity(self, image: np.ndarray, dataset_item: DatasetItem, parent_idx: int) -> OTXDataEntity: """Convert a tile dataset item to OTXDataEntity.""" msg = "Method _convert_entity is not implemented." 
raise NotImplementedError(msg) - def get_tiles(self, image: np.ndarray, item: DatasetItem) -> tuple[list[OTXDataEntity], list[dict]]: + def get_tiles( + self, + image: np.ndarray, + item: DatasetItem, + parent_idx: int, + ) -> tuple[list[OTXDataEntity], list[dict]]: """Retrieves tiles from the given image and dataset item. Args: @@ -256,14 +265,14 @@ def get_tiles(self, image: np.ndarray, item: DatasetItem) -> tuple[list[OTXDataE with_full_img=self.tile_config.with_full_img, ) - if item.subset == "val": + if item.subset in VAL_SUBSET_NAMES: # NOTE: filter validation tiles with annotations only to avoid evaluation on empty tiles. tile_ds = tile_ds.filter("/item/annotation", filter_annotations=True, remove_empty=True) tile_entities: list[OTXDataEntity] = [] tile_attrs: list[dict] = [] for tile in tile_ds: - tile_entity = self._convert_entity(image, tile) + tile_entity = self._convert_entity(image, tile, parent_idx) # apply the same transforms as the original dataset transformed_tile = self._apply_transforms(tile_entity) if transformed_tile is None: @@ -346,7 +355,7 @@ def _get_item_impl(self, index: int) -> TileDetDataEntity: # type: ignore[overr ) labels = torch.as_tensor([ann.label for ann in bbox_anns]) - tile_entities, tile_attrs = self.get_tiles(img_data, item) + tile_entities, tile_attrs = self.get_tiles(img_data, item, index) return TileDetDataEntity( num_tiles=len(tile_entities), @@ -365,13 +374,13 @@ def _get_item_impl(self, index: int) -> TileDetDataEntity: # type: ignore[overr ori_labels=labels, ) - def _convert_entity(self, image: np.ndarray, dataset_item: DatasetItem) -> DetDataEntity: + def _convert_entity(self, image: np.ndarray, dataset_item: DatasetItem, parent_idx: int) -> DetDataEntity: """Convert a tile datumaro dataset item to DetDataEntity.""" x1, y1, w, h = dataset_item.attributes["roi"] tile_img = image[y1 : y1 + h, x1 : x1 + w] tile_shape = tile_img.shape[:2] img_info = ImageInfo( - img_idx=dataset_item.attributes["id"], + img_idx=parent_idx, img_shape=tile_shape, ori_shape=tile_shape, ) @@ -448,7 +457,7 @@ def _get_item_impl(self, index: int) -> TileInstSegDataEntity: # type: ignore[o masks = np.stack(gt_masks, axis=0) if gt_masks else np.zeros((0, *img_shape), dtype=bool) labels = np.array(gt_labels, dtype=np.int64) - tile_entities, tile_attrs = self.get_tiles(img_data, item) + tile_entities, tile_attrs = self.get_tiles(img_data, item, index) return TileInstSegDataEntity( num_tiles=len(tile_entities), @@ -469,13 +478,13 @@ def _get_item_impl(self, index: int) -> TileInstSegDataEntity: # type: ignore[o ori_polygons=gt_polygons, ) - def _convert_entity(self, image: np.ndarray, dataset_item: DatasetItem) -> InstanceSegDataEntity: + def _convert_entity(self, image: np.ndarray, dataset_item: DatasetItem, parent_idx: int) -> InstanceSegDataEntity: """Convert a tile dataset item to InstanceSegDataEntity.""" x1, y1, w, h = dataset_item.attributes["roi"] tile_img = image[y1 : y1 + h, x1 : x1 + w] tile_shape = tile_img.shape[:2] img_info = ImageInfo( - img_idx=dataset_item.attributes["id"], + img_idx=parent_idx, img_shape=tile_shape, ori_shape=tile_shape, ) From d8e6454cf8dce1e69ab6ae83c17f6b7b37c1c4a5 Mon Sep 17 00:00:00 2001 From: Prokofiev Kirill Date: Fri, 6 Sep 2024 02:05:08 +0200 Subject: [PATCH 31/53] Fix soft predictions for Semantic Segmentation (#3934) fix soft preds --- src/otx/core/model/segmentation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index 
a2972d7eb86..85182944474 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -219,7 +219,8 @@ def _dispatch_label_info(label_info: LabelInfoTypes) -> LabelInfo: def forward_for_tracing(self, image: Tensor) -> Tensor | dict[str, Tensor]: """Model forward function used for the model tracing during model exportation.""" - return self.model(inputs=image, mode="tensor") + raw_outputs = self.model(inputs=image, mode="tensor") + return torch.softmax(raw_outputs, dim=1) def get_dummy_input(self, batch_size: int = 1) -> SegBatchDataEntity: """Returns a dummy input for semantic segmentation model.""" From c2705df953dfee87434bb25a6821ff2721e6fe21 Mon Sep 17 00:00:00 2001 From: Ashwin Vaidya Date: Fri, 6 Sep 2024 09:28:30 +0200 Subject: [PATCH 32/53] Update STFPM config (#3935) --- src/otx/recipe/anomaly_classification/stfpm.yaml | 4 ++++ src/otx/recipe/anomaly_detection/stfpm.yaml | 4 ++++ src/otx/recipe/anomaly_segmentation/stfpm.yaml | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/src/otx/recipe/anomaly_classification/stfpm.yaml b/src/otx/recipe/anomaly_classification/stfpm.yaml index d4716047769..ec1c6af8ddc 100644 --- a/src/otx/recipe/anomaly_classification/stfpm.yaml +++ b/src/otx/recipe/anomaly_classification/stfpm.yaml @@ -19,6 +19,10 @@ overrides: - class_path: lightning.pytorch.callbacks.EarlyStopping init_args: patience: 5 + mode: max + monitor: pixel_F1Score - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling init_args: max_interval: 1 + gradient_clip_val: 0 + num_sanity_val_steps: 0 diff --git a/src/otx/recipe/anomaly_detection/stfpm.yaml b/src/otx/recipe/anomaly_detection/stfpm.yaml index 35e0bdf006b..b13534505a4 100644 --- a/src/otx/recipe/anomaly_detection/stfpm.yaml +++ b/src/otx/recipe/anomaly_detection/stfpm.yaml @@ -24,9 +24,13 @@ overrides: - class_path: lightning.pytorch.callbacks.EarlyStopping init_args: patience: 5 + mode: max + monitor: pixel_F1Score - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling init_args: max_interval: 1 + gradient_clip_val: 0 + num_sanity_val_steps: 0 data: task: ANOMALY_DETECTION diff --git a/src/otx/recipe/anomaly_segmentation/stfpm.yaml b/src/otx/recipe/anomaly_segmentation/stfpm.yaml index 17967143424..9a3d9c85d6e 100644 --- a/src/otx/recipe/anomaly_segmentation/stfpm.yaml +++ b/src/otx/recipe/anomaly_segmentation/stfpm.yaml @@ -19,9 +19,13 @@ overrides: - class_path: lightning.pytorch.callbacks.EarlyStopping init_args: patience: 5 + mode: max + monitor: pixel_F1Score - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling init_args: max_interval: 1 + gradient_clip_val: 0 + num_sanity_val_steps: 0 data: task: ANOMALY_SEGMENTATION From c2ccfc9d0c5772dfa4d5b86bae4bc9395a8070c0 Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Fri, 6 Sep 2024 16:29:25 +0900 Subject: [PATCH 33/53] Add missing pretrained weights when creating a docker image (#3938) * Fix pre-trained weight downloader * Remove if condition for pretrained wiehgt download --- docker/download_pretrained_weights.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docker/download_pretrained_weights.py b/docker/download_pretrained_weights.py index 7ff01433799..0d145fe9c73 100644 --- a/docker/download_pretrained_weights.py +++ b/docker/download_pretrained_weights.py @@ -32,10 +32,6 @@ def download_all() -> None: msg = f"Skip {config_path} since it is not a PyTorch config." 
logger.warning(msg) continue - if "anomaly_" in str(config_path) or "dino_v2" in str(config_path) or "h_label_cls" in str(config_path): - msg = f"Skip {config_path} since those models show errors on instantiation." - logger.warning(msg) - continue config = OmegaConf.load(config_path) init_model = next(iter(partial_instantiate_class(config.model))) From 8b747f9ddc2cd5f0ca44a65487725c3debca1512 Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Mon, 9 Sep 2024 15:56:53 +0900 Subject: [PATCH 34/53] Change default option 'full' to 'base' in otx install (#3937) * Change option full to base for otx install * Fix wrong code * Fix issue * Fix docs --- docs/source/guide/get_started/installation.rst | 2 +- src/otx/cli/install.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/source/guide/get_started/installation.rst b/docs/source/guide/get_started/installation.rst index 94de82e29a3..ed30443fefd 100644 --- a/docs/source/guide/get_started/installation.rst +++ b/docs/source/guide/get_started/installation.rst @@ -62,7 +62,7 @@ according to your system environment. .. code-block:: shell - otx install -v + otx install -v --option full [Optional] Refer to the `torch official installation guide `_ diff --git a/src/otx/cli/install.py b/src/otx/cli/install.py index b02a86d0a39..d1168d23ddc 100644 --- a/src/otx/cli/install.py +++ b/src/otx/cli/install.py @@ -49,8 +49,8 @@ def add_install_parser(subcommands_action: _ActionSubCommands) -> None: parser = ArgumentParser() parser.add_argument( "--option", - help="Install the mmlab library or optional-dependencies.", - default="full", + help="Install optional-dependencies. The 'full' option will install all dependencies.", + default="base", type=str, ) parser.add_argument( @@ -123,7 +123,7 @@ def otx_install( ) # Parse mmX requirements if the task requires mmX packages. - mmcv_install_args = ["--user"] if user else [] + mmcv_install_args = [] if mmcv_requirements: mmcv_install_args = get_mmcv_install_args(torch_requirement, mmcv_requirements) install_args += ["openmim"] @@ -146,6 +146,8 @@ def otx_install( # Install mmX requirements if the task requires mmX packages using mim. 
if mmcv_install_args and status_code == 0: + if user: + mmcv_install_args.append("--user") console.log(f"Installation list: [yellow]{mmcv_install_args}[/yellow]") status_code = mim_installation(mmcv_install_args) if status_code == 0: From d43226eaa33a8bd4b4b444f81a704253b66c3be2 Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Mon, 9 Sep 2024 17:45:19 +0900 Subject: [PATCH 35/53] Fix auto adapt batch size in Converter (#3939) * Enable auto adapt batch size into converter * Fix wrong --- src/otx/tools/converter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index e7f8e980855..c83dc7692f5 100644 --- a/src/otx/tools/converter.py +++ b/src/otx/tools/converter.py @@ -276,6 +276,9 @@ def update_use_adaptive_interval(param_value: bool) -> None: def update_auto_num_workers(param_value: bool) -> None: config["data"]["auto_num_workers"] = param_value + def update_auto_adapt_batch_size(param_value: str) -> None: + config["adaptive_bs"] = param_value + def update_enable_tiling(param_value: bool) -> None: config["data"]["tile_config"]["enable_tiler"] = param_value if param_value: @@ -312,11 +315,12 @@ def update_enable_tiling(param_value: bool) -> None: "use_adaptive_interval": update_use_adaptive_interval, "auto_num_workers": update_auto_num_workers, "enable_tiling": update_enable_tiling, + "auto_adapt_batch_size": update_auto_adapt_batch_size, } for param_name, param_value in param_dict.items(): update_func = param_update_funcs.get(param_name) if update_func: - update_func(param_value) + update_func(param_value) # type: ignore[operator] unused_params.pop(param_name) warn("Warning: These parameters are not updated", stacklevel=1) From 1d319cd0479cee860b287a9099590244c841b182 Mon Sep 17 00:00:00 2001 From: Eunwoo Shin Date: Mon, 9 Sep 2024 18:08:10 +0900 Subject: [PATCH 36/53] Fix hpo converter (#3940) * save best hp after hpo * add test --- src/otx/engine/hpo/hpo_api.py | 5 +++++ src/otx/tools/converter.py | 2 ++ tests/unit/engine/hpo/test_hpo_api.py | 1 + 3 files changed, 8 insertions(+) diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py index 3fe01e1d786..7fd5983618e 100644 --- a/src/otx/engine/hpo/hpo_api.py +++ b/src/otx/engine/hpo/hpo_api.py @@ -6,6 +6,7 @@ from __future__ import annotations import dataclasses +import json import logging import time from functools import partial @@ -128,6 +129,10 @@ def execute_hpo( hpo_algo.print_result() _remove_unused_model_weights(hpo_workdir, best_hpo_weight) + if best_config is not None: + with (hpo_workdir / "best_hp.json").open("w") as f: + json.dump(best_config, f) + return best_config, best_hpo_weight diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index c83dc7692f5..b6f7400f133 100644 --- a/src/otx/tools/converter.py +++ b/src/otx/tools/converter.py @@ -202,6 +202,8 @@ def convert(config_path: str, task: OTXTaskType | None = None) -> dict: task_info["task"] = task default_config = ConfigConverter._get_default_config(task_info) ConfigConverter._update_params(default_config, param_dict) + if (hpo_time_ratio := template_config.get("hpo_parameters", {}).get("hpo_time_ratio")) is not None: + default_config["hpo_config.expected_time_ratio"] = hpo_time_ratio ConfigConverter._remove_unused_key(default_config) return default_config diff --git a/tests/unit/engine/hpo/test_hpo_api.py b/tests/unit/engine/hpo/test_hpo_api.py index f2cef3fab31..bcc71d8bc9a 100644 --- a/tests/unit/engine/hpo/test_hpo_api.py +++ 
b/tests/unit/engine/hpo/test_hpo_api.py @@ -148,6 +148,7 @@ def test_execute_hpo( # check hpo workdir exists assert (engine_work_dir / "hpo").exists() + assert (engine_work_dir / "hpo" / "best_hp.json").exists() # check a case where progress_update_callback exists mock_thread.assert_called_once() assert mock_thread.call_args.kwargs["target"] == _update_hpo_progress From aaa27656ced11001c599bf4856902dba550dbbb1 Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Mon, 9 Sep 2024 13:40:25 +0100 Subject: [PATCH 37/53] Fix tiling XAI out of range (#3943) - Fix tile merge XAI out of range --- src/otx/core/model/detection.py | 1 + src/otx/core/model/instance_segmentation.py | 1 + src/otx/core/utils/tile_merge.py | 12 +++++++----- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py index 76b7c2d1538..064811189ce 100644 --- a/src/otx/core/model/detection.py +++ b/src/otx/core/model/detection.py @@ -237,6 +237,7 @@ def forward_tiles(self, inputs: OTXTileBatchDataEntity[DetBatchDataEntity]) -> D inputs.imgs_info, self.num_classes, self.tile_config, + self.explain_mode, ) for batch_tile_attrs, batch_tile_input in inputs.unbind(): output = self.forward_explain(batch_tile_input) if self.explain_mode else self.forward(batch_tile_input) diff --git a/src/otx/core/model/instance_segmentation.py b/src/otx/core/model/instance_segmentation.py index 2a26b688920..3a58ba00715 100644 --- a/src/otx/core/model/instance_segmentation.py +++ b/src/otx/core/model/instance_segmentation.py @@ -215,6 +215,7 @@ def forward_tiles(self, inputs: OTXTileBatchDataEntity[InstanceSegBatchDataEntit inputs.imgs_info, self.num_classes, self.tile_config, + self.explain_mode, ) for batch_tile_attrs, batch_tile_input in inputs.unbind(): output = self.forward_explain(batch_tile_input) if self.explain_mode else self.forward(batch_tile_input) diff --git a/src/otx/core/utils/tile_merge.py b/src/otx/core/utils/tile_merge.py index 99149e4e1f8..02457522055 100644 --- a/src/otx/core/utils/tile_merge.py +++ b/src/otx/core/utils/tile_merge.py @@ -27,9 +27,9 @@ class TileMerge(Generic[T_OTXDataEntity, T_OTXBatchPredEntity]): Args: img_infos (list[ImageInfo]): Original image information before tiling. - iou_threshold (float, optional): IoU threshold for non-maximum suppression. Defaults to 0.45. - max_num_instances (int, optional): Maximum number of instances to keep. Defaults to 500. - + num_classes (int): Number of classes. + tile_config (TileConfig): Tile configuration. + explain_mode (bool): Whether or not tiles have explain features. Default: False. 
""" def __init__( @@ -37,6 +37,7 @@ def __init__( img_infos: list[ImageInfo], num_classes: int, tile_config: TileConfig, + explain_mode: bool = False, ) -> None: self.img_infos = img_infos self.num_classes = num_classes @@ -44,6 +45,7 @@ def __init__( self.iou_threshold = tile_config.iou_threshold self.max_num_instances = tile_config.max_num_instances self.with_full_img = tile_config.with_full_img + self.explain_mode = explain_mode @abstractmethod def _merge_entities( @@ -115,7 +117,7 @@ def merge( """ entities_to_merge = defaultdict(list) img_ids = [] - explain_mode = len(batch_tile_preds[0].feature_vector) > 0 + explain_mode = self.explain_mode for tile_preds, tile_attrs in zip(batch_tile_preds, batch_tile_attrs): batch_size = tile_preds.batch_size @@ -315,7 +317,7 @@ def merge( """ entities_to_merge = defaultdict(list) img_ids = [] - explain_mode = len(batch_tile_preds[0].feature_vector) > 0 + explain_mode = self.explain_mode for tile_preds, tile_attrs in zip(batch_tile_preds, batch_tile_attrs): feature_vectors = tile_preds.feature_vector if explain_mode else [[] for _ in range(tile_preds.batch_size)] From ac87b49e8e79503bd27be27e21dbbb7e580670b8 Mon Sep 17 00:00:00 2001 From: Ashwin Vaidya Date: Thu, 12 Sep 2024 02:08:20 +0200 Subject: [PATCH 38/53] enable model export (#3952) Signed-off-by: Ashwin Vaidya --- src/otx/core/model/anomaly.py | 7 ------- tests/integration/cli/test_cli.py | 7 ------- 2 files changed, 14 deletions(-) diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py index cec2359a70f..f41823cdcc1 100644 --- a/src/otx/core/model/anomaly.py +++ b/src/otx/core/model/anomaly.py @@ -4,7 +4,6 @@ from __future__ import annotations -import logging as log from typing import TYPE_CHECKING, Any, TypeAlias import torch @@ -285,12 +284,6 @@ def export( Returns: Path: path to the exported model. """ - if export_format == OTXExportFormatType.OPENVINO: - if to_exportable_code: - msg = "Exportable code option is not supported yet for anomaly tasks and will be ignored." 
- log.warning(msg) - to_exportable_code = False - return self._exporter.export( model=self.model, output_dir=output_dir, diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py index a827de85fae..c754173f39d 100644 --- a/tests/integration/cli/test_cli.py +++ b/tests/integration/cli/test_cli.py @@ -158,15 +158,8 @@ def test_otx_e2e( ExportCase2Test("ONNX", False, "exported_model_decoder.onnx"), ExportCase2Test("OPENVINO", False, "exported_model_decoder.xml"), ] # TODO (sungchul): EXPORTABLE_CODE will be supported - elif "anomaly" in task: - fxt_export_list = [ - ExportCase2Test("ONNX", False, "exported_model.onnx"), - ExportCase2Test("OPENVINO", False, "exported_model.xml"), - ] # anomaly doesn't support exportable code overrides = fxt_cli_override_command_per_task[task] - if "anomaly" in task: - overrides = {} # Overrides are not needed in export tmp_path_test = tmp_path / f"otx_test_{model_name}" for export_case in fxt_export_list: From 8f96f27920af87cb2c270ee9893cc988b3d85d1d Mon Sep 17 00:00:00 2001 From: Prokofiev Kirill Date: Thu, 12 Sep 2024 14:55:25 +0200 Subject: [PATCH 39/53] Move templates from OTX1.X to OTX2.X (#3951) * add otx1.6 templates * added new models * delete entrypoints and nncf cfg * updated some hyperparams * fix for rtmdet_tiny * updated converter * Update classification templates * Update det, r-det, vpm * Update template.yaml * changed warmaup value in train.yaml --------- Co-authored-by: Kang, Harim Co-authored-by: Kim, Sungchul --- src/otx/recipe/detection/rtdetr_101.yaml | 2 +- src/otx/recipe/detection/rtdetr_18.yaml | 2 +- src/otx/recipe/detection/rtdetr_50.yaml | 2 +- .../recipe/semantic_segmentation/dino_v2.yaml | 2 +- .../semantic_segmentation/segnext_b.yaml | 2 +- .../semantic_segmentation/segnext_s.yaml | 2 +- .../semantic_segmentation/segnext_t.yaml | 2 +- src/otx/tools/converter.py | 36 + src/otx/tools/templates/__init__.py | 4 + .../action/classification/configuration.yaml | 453 +++++++++++ .../classification/movinet/template.yaml | 42 + .../action/classification/x3d/template.yaml | 42 + .../action/detection/configuration.yaml | 453 +++++++++++ .../detection/x3d_fast_rcnn/template.yaml | 42 + .../classification/padim/configuration.yaml | 183 +++++ .../classification/padim/template.yaml | 29 + .../classification/stfpm/configuration.yaml | 312 ++++++++ .../classification/stfpm/template.yaml | 34 + .../detection/padim/configuration.yaml | 183 +++++ .../anomaly/detection/padim/template.yaml | 29 + .../detection/stfpm/configuration.yaml | 312 ++++++++ .../anomaly/detection/stfpm/template.yaml | 34 + .../segmentation/padim/configuration.yaml | 183 +++++ .../anomaly/segmentation/padim/template.yaml | 29 + .../segmentation/stfpm/configuration.yaml | 312 ++++++++ .../anomaly/segmentation/stfpm/template.yaml | 34 + .../classification/configuration.yaml | 496 ++++++++++++ .../classification/deit_tiny/template.yaml | 44 ++ .../efficientnet_b0_cls_incr/template.yaml | 45 ++ .../efficientnet_b3/template.yaml | 41 + .../efficientnet_v2_l/template.yaml | 41 + .../efficientnet_v2_s_cls_incr/template.yaml | 44 ++ .../template.yaml | 44 ++ .../mobilenet_v3_small/template.yaml | 41 + .../detection/detection/configuration.yaml | 700 +++++++++++++++++ .../cspdarknet_yolox_l/template.yaml | 48 ++ .../cspdarknet_yolox_s/template.yaml | 48 ++ .../cspdarknet_yolox_tiny/template.yaml | 46 ++ .../cspdarknet_yolox_x/template.yaml | 48 ++ .../detection/mobilenetv2_atss/template.yaml | 47 ++ .../detection/mobilenetv2_ssd/template.yaml | 46 ++ 
.../detection/resnext101_atss/template.yaml | 48 ++ .../detection/rtdetr_101/template.yaml | 48 ++ .../detection/rtdetr_18/template.yaml | 48 ++ .../detection/rtdetr_50/template.yaml | 48 ++ .../detection/rtmdet_tiny/template.yaml | 48 ++ .../instance_segmentation/configuration.yaml | 720 ++++++++++++++++++ .../efficientnetb2b_maskrcnn/template.yaml | 47 ++ .../maskrcnn_swin_t/template.yaml | 43 ++ .../resnet50_maskrcnn/template.yaml | 46 ++ .../rtmdet_tiny/template.yaml | 43 ++ .../rotated_detection/configuration.yaml | 705 +++++++++++++++++ .../efficientnetb2b_maskrcnn/template.yaml | 47 ++ .../resnet50_maskrcnn/template.yaml | 46 ++ .../templates/segmentation/configuration.yaml | 473 ++++++++++++ .../segmentation/dinov2_small/template.yaml | 41 + .../segmentation/ham_segnext_b/template.yaml | 45 ++ .../segmentation/ham_segnext_s/template.yaml | 45 ++ .../segmentation/ham_segnext_t/template.yaml | 45 ++ .../ocr_lite_hrnet_18_mod2/template.yaml | 44 ++ .../ocr_lite_hrnet_s_mod2/template.yaml | 44 ++ .../ocr_lite_hrnet_x_mod3/template.yaml | 44 ++ .../visual_prompting/configuration.yaml | 235 ++++++ .../sam_tiny_vit/template.yaml | 25 + .../visual_prompting/sam_vit_b/template.yaml | 25 + .../zero_shot_sam_tiny_vit/configuration.yaml | 210 +++++ .../zero_shot_sam_tiny_vit/template.yaml | 33 + .../zero_shot_sam_vit_b/configuration.yaml | 210 +++++ .../zero_shot_sam_vit_b/template.yaml | 33 + 69 files changed, 8021 insertions(+), 7 deletions(-) create mode 100644 src/otx/tools/templates/__init__.py create mode 100644 src/otx/tools/templates/action/classification/configuration.yaml create mode 100644 src/otx/tools/templates/action/classification/movinet/template.yaml create mode 100644 src/otx/tools/templates/action/classification/x3d/template.yaml create mode 100644 src/otx/tools/templates/action/detection/configuration.yaml create mode 100644 src/otx/tools/templates/action/detection/x3d_fast_rcnn/template.yaml create mode 100644 src/otx/tools/templates/anomaly/classification/padim/configuration.yaml create mode 100644 src/otx/tools/templates/anomaly/classification/padim/template.yaml create mode 100644 src/otx/tools/templates/anomaly/classification/stfpm/configuration.yaml create mode 100644 src/otx/tools/templates/anomaly/classification/stfpm/template.yaml create mode 100644 src/otx/tools/templates/anomaly/detection/padim/configuration.yaml create mode 100644 src/otx/tools/templates/anomaly/detection/padim/template.yaml create mode 100644 src/otx/tools/templates/anomaly/detection/stfpm/configuration.yaml create mode 100644 src/otx/tools/templates/anomaly/detection/stfpm/template.yaml create mode 100644 src/otx/tools/templates/anomaly/segmentation/padim/configuration.yaml create mode 100644 src/otx/tools/templates/anomaly/segmentation/padim/template.yaml create mode 100644 src/otx/tools/templates/anomaly/segmentation/stfpm/configuration.yaml create mode 100644 src/otx/tools/templates/anomaly/segmentation/stfpm/template.yaml create mode 100644 src/otx/tools/templates/classification/configuration.yaml create mode 100644 src/otx/tools/templates/classification/deit_tiny/template.yaml create mode 100644 src/otx/tools/templates/classification/efficientnet_b0_cls_incr/template.yaml create mode 100644 src/otx/tools/templates/classification/efficientnet_b3/template.yaml create mode 100644 src/otx/tools/templates/classification/efficientnet_v2_l/template.yaml create mode 100644 src/otx/tools/templates/classification/efficientnet_v2_s_cls_incr/template.yaml create mode 100644 
src/otx/tools/templates/classification/mobilenet_v3_large_1_cls_incr/template.yaml create mode 100644 src/otx/tools/templates/classification/mobilenet_v3_small/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/configuration.yaml create mode 100644 src/otx/tools/templates/detection/detection/cspdarknet_yolox_l/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/cspdarknet_yolox_s/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/cspdarknet_yolox_tiny/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/cspdarknet_yolox_x/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/mobilenetv2_atss/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/mobilenetv2_ssd/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/resnext101_atss/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/rtdetr_101/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/rtdetr_18/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/rtdetr_50/template.yaml create mode 100644 src/otx/tools/templates/detection/detection/rtmdet_tiny/template.yaml create mode 100644 src/otx/tools/templates/detection/instance_segmentation/configuration.yaml create mode 100644 src/otx/tools/templates/detection/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml create mode 100644 src/otx/tools/templates/detection/instance_segmentation/maskrcnn_swin_t/template.yaml create mode 100644 src/otx/tools/templates/detection/instance_segmentation/resnet50_maskrcnn/template.yaml create mode 100644 src/otx/tools/templates/detection/instance_segmentation/rtmdet_tiny/template.yaml create mode 100644 src/otx/tools/templates/detection/rotated_detection/configuration.yaml create mode 100644 src/otx/tools/templates/detection/rotated_detection/efficientnetb2b_maskrcnn/template.yaml create mode 100644 src/otx/tools/templates/detection/rotated_detection/resnet50_maskrcnn/template.yaml create mode 100644 src/otx/tools/templates/segmentation/configuration.yaml create mode 100644 src/otx/tools/templates/segmentation/dinov2_small/template.yaml create mode 100644 src/otx/tools/templates/segmentation/ham_segnext_b/template.yaml create mode 100644 src/otx/tools/templates/segmentation/ham_segnext_s/template.yaml create mode 100644 src/otx/tools/templates/segmentation/ham_segnext_t/template.yaml create mode 100644 src/otx/tools/templates/segmentation/ocr_lite_hrnet_18_mod2/template.yaml create mode 100644 src/otx/tools/templates/segmentation/ocr_lite_hrnet_s_mod2/template.yaml create mode 100644 src/otx/tools/templates/segmentation/ocr_lite_hrnet_x_mod3/template.yaml create mode 100644 src/otx/tools/templates/visual_prompting/configuration.yaml create mode 100644 src/otx/tools/templates/visual_prompting/sam_tiny_vit/template.yaml create mode 100644 src/otx/tools/templates/visual_prompting/sam_vit_b/template.yaml create mode 100644 src/otx/tools/templates/visual_prompting/zero_shot_sam_tiny_vit/configuration.yaml create mode 100644 src/otx/tools/templates/visual_prompting/zero_shot_sam_tiny_vit/template.yaml create mode 100644 src/otx/tools/templates/visual_prompting/zero_shot_sam_vit_b/configuration.yaml create mode 100644 src/otx/tools/templates/visual_prompting/zero_shot_sam_vit_b/template.yaml diff --git a/src/otx/recipe/detection/rtdetr_101.yaml b/src/otx/recipe/detection/rtdetr_101.yaml index 1ae36dbc26b..b67c2d34b4d 
100644 --- a/src/otx/recipe/detection/rtdetr_101.yaml +++ b/src/otx/recipe/detection/rtdetr_101.yaml @@ -13,7 +13,7 @@ model: scheduler: class_path: otx.core.schedulers.LinearWarmupSchedulerCallable init_args: - num_warmup_steps: 5 + num_warmup_steps: 100 main_scheduler_callable: class_path: lightning.pytorch.cli.ReduceLROnPlateau init_args: diff --git a/src/otx/recipe/detection/rtdetr_18.yaml b/src/otx/recipe/detection/rtdetr_18.yaml index 4e11fa20499..ba1575e0d03 100644 --- a/src/otx/recipe/detection/rtdetr_18.yaml +++ b/src/otx/recipe/detection/rtdetr_18.yaml @@ -13,7 +13,7 @@ model: scheduler: class_path: otx.core.schedulers.LinearWarmupSchedulerCallable init_args: - num_warmup_steps: 5 + num_warmup_steps: 100 main_scheduler_callable: class_path: lightning.pytorch.cli.ReduceLROnPlateau init_args: diff --git a/src/otx/recipe/detection/rtdetr_50.yaml b/src/otx/recipe/detection/rtdetr_50.yaml index 9adb14819a7..298b30737d7 100644 --- a/src/otx/recipe/detection/rtdetr_50.yaml +++ b/src/otx/recipe/detection/rtdetr_50.yaml @@ -13,7 +13,7 @@ model: scheduler: class_path: otx.core.schedulers.LinearWarmupSchedulerCallable init_args: - num_warmup_steps: 5 + num_warmup_steps: 100 main_scheduler_callable: class_path: lightning.pytorch.cli.ReduceLROnPlateau init_args: diff --git a/src/otx/recipe/semantic_segmentation/dino_v2.yaml b/src/otx/recipe/semantic_segmentation/dino_v2.yaml index 713b8e92624..33c4e98d578 100644 --- a/src/otx/recipe/semantic_segmentation/dino_v2.yaml +++ b/src/otx/recipe/semantic_segmentation/dino_v2.yaml @@ -19,7 +19,7 @@ model: scheduler: class_path: torch.optim.lr_scheduler.PolynomialLR init_args: - total_iters: 100 + total_iters: 150 power: 0.9 last_epoch: -1 diff --git a/src/otx/recipe/semantic_segmentation/segnext_b.yaml b/src/otx/recipe/semantic_segmentation/segnext_b.yaml index 49626e58d6c..9d14a3af30b 100644 --- a/src/otx/recipe/semantic_segmentation/segnext_b.yaml +++ b/src/otx/recipe/semantic_segmentation/segnext_b.yaml @@ -20,7 +20,7 @@ model: main_scheduler_callable: class_path: torch.optim.lr_scheduler.PolynomialLR init_args: - total_iters: 100 + total_iters: 150 power: 0.9 last_epoch: -1 diff --git a/src/otx/recipe/semantic_segmentation/segnext_s.yaml b/src/otx/recipe/semantic_segmentation/segnext_s.yaml index e8eab1d22e7..3fccf0dd151 100644 --- a/src/otx/recipe/semantic_segmentation/segnext_s.yaml +++ b/src/otx/recipe/semantic_segmentation/segnext_s.yaml @@ -20,7 +20,7 @@ model: main_scheduler_callable: class_path: torch.optim.lr_scheduler.PolynomialLR init_args: - total_iters: 100 + total_iters: 150 power: 0.9 last_epoch: -1 diff --git a/src/otx/recipe/semantic_segmentation/segnext_t.yaml b/src/otx/recipe/semantic_segmentation/segnext_t.yaml index 755c26ee49c..a670ba3248f 100644 --- a/src/otx/recipe/semantic_segmentation/segnext_t.yaml +++ b/src/otx/recipe/semantic_segmentation/segnext_t.yaml @@ -20,7 +20,7 @@ model: main_scheduler_callable: class_path: torch.optim.lr_scheduler.PolynomialLR init_args: - total_iters: 100 + total_iters: 150 power: 0.9 last_epoch: -1 diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index b6f7400f133..98c9d4aee86 100644 --- a/src/otx/tools/converter.py +++ b/src/otx/tools/converter.py @@ -40,6 +40,18 @@ "task": OTXTaskType.MULTI_CLASS_CLS, "model_name": "mobilenet_v3_large", }, + "Custom_Image_Classification_EfficinetNet-B3": { + "task": OTXTaskType.MULTI_CLASS_CLS, + "model_name": "tv_efficientnet_b3", + }, + "Custom_Image_Classification_EfficinetNet-V2-L": { + "task": OTXTaskType.MULTI_CLASS_CLS, + 
"model_name": "tv_efficientnet_v2_l", + }, + "Custom_Image_Classification_MobileNet-V3-small": { + "task": OTXTaskType.MULTI_CLASS_CLS, + "model_name": "tv_mobilenet_v3_small", + }, # DETECTION "Custom_Object_Detection_Gen3_ATSS": { "task": OTXTaskType.DETECTION, @@ -69,6 +81,22 @@ "task": OTXTaskType.DETECTION, "model_name": "yolox_tiny", }, + "Object_Detection_RTDetr_18": { + "task": OTXTaskType.DETECTION, + "model_name": "rtdetr_18", + }, + "Object_Detection_RTDetr_50": { + "task": OTXTaskType.DETECTION, + "model_name": "rtdetr_50", + }, + "Object_Detection_RTDetr_101": { + "task": OTXTaskType.DETECTION, + "model_name": "rtdetr_101", + }, + "Object_Detection_RTMDet_tiny": { + "task": OTXTaskType.DETECTION, + "model_name": "rtmdet_tiny", + }, # INSTANCE_SEGMENTATION "Custom_Counting_Instance_Segmentation_MaskRCNN_ResNet50": { "task": OTXTaskType.INSTANCE_SEGMENTATION, @@ -82,6 +110,10 @@ "task": OTXTaskType.INSTANCE_SEGMENTATION, "model_name": "maskrcnn_efficientnetb2b", }, + "Custom_Instance_Segmentation_RTMDet_tiny": { + "task": OTXTaskType.INSTANCE_SEGMENTATION, + "model_name": "rtmdet_inst_tiny", + }, # ROTATED_DETECTION "Custom_Rotated_Detection_via_Instance_Segmentation_MaskRCNN_ResNet50": { "task": OTXTaskType.ROTATED_DETECTION, @@ -120,6 +152,10 @@ "task": OTXTaskType.SEMANTIC_SEGMENTATION, "model_name": "segnext_b", }, + "Custom_Semantic_Segmentation_DINOV2_S": { + "task": OTXTaskType.SEMANTIC_SEGMENTATION, + "model_name": "dino_v2", + }, # ANOMALY_CLASSIFICATION "ote_anomaly_classification_padim": { "task": OTXTaskType.ANOMALY_CLASSIFICATION, diff --git a/src/otx/tools/templates/__init__.py b/src/otx/tools/templates/__init__.py new file mode 100644 index 00000000000..9e41fd6f483 --- /dev/null +++ b/src/otx/tools/templates/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""YAML file templates for the models OTX provides.""" diff --git a/src/otx/tools/templates/action/classification/configuration.yaml b/src/otx/tools/templates/action/classification/configuration.yaml new file mode 100644 index 00000000000..78f3178f2ae --- /dev/null +++ b/src/otx/tools/templates/action/classification/configuration.yaml @@ -0,0 +1,453 @@ +description: Configuration for an action classification task +header: Configuration for an action classification task +learning_parameters: + batch_size: + affects_outcome_of: TRAINING + default_value: 5 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 5 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + auto_hpo_state: NOT_POSSIBLE + description: Learning Parameters + header: Learning Parameters + learning_rate: + affects_outcome_of: TRAINING + default_value: 0.01 + description: + Increasing this value will speed up training convergence but might + make it unstable. 
+ editable: true + header: Learning rate + max_value: 0.1 + min_value: 1.0e-07 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.01 + visible_in_ui: true + warning: null + auto_hpo_state: NOT_POSSIBLE + learning_rate_warmup_iters: + affects_outcome_of: TRAINING + default_value: 100 + description: + In this periods of initial training iterations, the model will be trained in low learning rate, + which will be increased incrementally up to the expected learning rate setting. + This warm-up phase is known to be helpful to stabilize training, thus result in better performance. + editable: true + header: Number of iterations for learning rate warmup + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: null + num_iters: + affects_outcome_of: TRAINING + default_value: 200 + description: + Increasing this value causes the results to be more robust but training + time will be longer. + editable: true + header: Number of training iterations + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1 + visible_in_ui: true + warning: null + num_workers: + affects_outcome_of: NONE + default_value: 0 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of cpu threads to use during batch generation + max_value: 8 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: null + enable_early_stopping: + affects_outcome_of: TRAINING + default_value: false + description: Early exit from training when validation accuracy isn't changed or decreased for several epochs. + editable: true + header: Enable early stopping of the training + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + early_stop_start: + affects_outcome_of: TRAINING + default_value: 3 + editable: true + header: Start epoch for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 3 + visible_in_ui: false + early_stop_patience: + affects_outcome_of: TRAINING + default_value: 10 + description: Training will stop if the model does not improve within the number of epochs of patience. + editable: true + header: Patience for early stopping + max_value: 50 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 10 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + early_stop_iteration_patience: + affects_outcome_of: TRAINING + default_value: 0 + description: + Training will stop if the model does not improve within the number of iterations of patience. + This ensures the model is trained enough with the number of iterations of patience before early stopping. 
+ editable: true + header: Iteration patience for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + use_adaptive_interval: + affects_outcome_of: TRAINING + default_value: false + description: Depending on the size of iteration per epoch, adaptively update the validation interval and related values. + editable: true + header: Use adaptive validation interval + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: This will automatically control the patience and interval when early stopping is enabled. + auto_adapt_batch_size: + affects_outcome_of: TRAINING + default_value: None + description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory. + editable: true + enum_name: BatchSizeAdaptType + header: Decrease batch size if current batch size isn't fit to CUDA memory. + options: + NONE: "None" + SAFE: "Safe" + FULL: "Full" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: None + visible_in_ui: true + warning: + Enabling this could change the actual batch size depending on the current GPU status. + The learning rate also could be adjusted according to the adapted batch size. This process might change + a model performance and take some extra computation time to try a few batch size candidates. + auto_num_workers: + affects_outcome_of: TRAINING + default_value: false + description: Adapt num_workers according to current hardware status automatically. + editable: true + header: Enable auto adaptive num_workers + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +postprocessing: + confidence_threshold: + affects_outcome_of: INFERENCE + default_value: 0.35 + description: + This threshold only takes effect if the threshold is not set based + on the result. + editable: true + header: Confidence threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + # value: 0.35 + value: 0.01 + visible_in_ui: true + warning: null + description: Postprocessing + header: Postprocessing + result_based_confidence_threshold: + affects_outcome_of: INFERENCE + default_value: true + description: Confidence threshold is derived from the results + editable: true + header: Result based confidence threshold + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: NONE + default_value: Incremental + description: Quantization preset that defines quantization scheme + editable: false + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 0 + description: Size of memory pool for caching decoded data to load data faster (bytes). 
+ editable: true + header: Size of memory pool + max_value: 10000000000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: True + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: True + warning: null + stat_subset_size: + affects_outcome_of: NONE + default_value: 300 + description: Number of data samples used for post-training optimization + editable: True + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: True + warning: null + stat_requests_number: + affects_outcome_of: NONE + default_value: 0 + description: Number of requests during statistics collection + editable: true + header: Number of requests + max_value: 200 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + header: Optimization by NNCF + enable_quantization: + affects_outcome_of: INFERENCE + default_value: True + description: Enable quantization algorithm + editable: false + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: false + warning: null + enable_pruning: + affects_outcome_of: INFERENCE + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + pruning_supported: + affects_outcome_of: TRAINING + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + maximal_accuracy_degradation: + affects_outcome_of: NONE + default_value: 1.0 + description: The maximal allowed accuracy metric drop in absolute values + editable: True + header: Maximum accuracy degradation + max_value: 100.0 + min_value: 0.0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: True + warning: null + type: 
PARAMETER_GROUP
+  visible_in_ui: True
diff --git a/src/otx/tools/templates/action/classification/movinet/template.yaml b/src/otx/tools/templates/action/classification/movinet/template.yaml
new file mode 100644
index 00000000000..e28eead71dc
--- /dev/null
+++ b/src/otx/tools/templates/action/classification/movinet/template.yaml
@@ -0,0 +1,42 @@
+# Description.
+model_template_id: Custom_Action_Classification_MoViNet
+name: MoViNet
+task_type: ACTION_CLASSIFICATION
+task_family: VISION
+instantiation: "CLASS"
+summary: Basic transfer learning template for MoViNet
+application: ~
+
+# Algo backend.
+framework: OTXAction v2.9.1
+
+# Capabilities.
+capabilities:
+  - compute_representations
+
+# Hyperparameters.
+hyper_parameters:
+  base_path: ../configuration.yaml
+  parameter_overrides:
+    learning_parameters:
+      batch_size:
+        default_value: 8
+        auto_hpo_state: POSSIBLE
+      learning_rate:
+        default_value: 0.0003
+        auto_hpo_state: POSSIBLE
+
+# Training resources.
+max_nodes: 1
+training_targets:
+  - GPU
+  - CPU
+
+# Stats.
+gigaflops: 2.71
+size: 3.1
+# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform.
+# inference_targets:
+#   - CPU
+#   - GPU
+#   - VPU
diff --git a/src/otx/tools/templates/action/classification/x3d/template.yaml b/src/otx/tools/templates/action/classification/x3d/template.yaml
new file mode 100644
index 00000000000..db27fc584dc
--- /dev/null
+++ b/src/otx/tools/templates/action/classification/x3d/template.yaml
@@ -0,0 +1,42 @@
+# Description.
+model_template_id: Custom_Action_Classification_X3D
+name: X3D
+task_type: ACTION_CLASSIFICATION
+task_family: VISION
+instantiation: "CLASS"
+summary: Basic transfer learning template for X3D
+application: ~
+
+# Algo backend.
+framework: OTXAction v2.9.1
+
+# Capabilities.
+capabilities:
+  - compute_representations
+
+# Hyperparameters.
+hyper_parameters:
+  base_path: ../configuration.yaml
+  parameter_overrides:
+    learning_parameters:
+      batch_size:
+        default_value: 8
+        auto_hpo_state: POSSIBLE
+      learning_rate:
+        default_value: 0.0001
+        auto_hpo_state: POSSIBLE
+
+# Training resources.
+max_nodes: 1
+training_targets:
+  - GPU
+  - CPU
+
+# Stats.
+gigaflops: 20.6
+size: 9.1
+# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform.
+# inference_targets:
+#   - CPU
+#   - GPU
+#   - VPU
diff --git a/src/otx/tools/templates/action/detection/configuration.yaml b/src/otx/tools/templates/action/detection/configuration.yaml
new file mode 100644
index 00000000000..78f3178f2ae
--- /dev/null
+++ b/src/otx/tools/templates/action/detection/configuration.yaml
@@ -0,0 +1,453 @@
+description: Configuration for an action detection task
+header: Configuration for an action detection task
+learning_parameters:
+  batch_size:
+    affects_outcome_of: TRAINING
+    default_value: 5
+    description:
+      The number of training samples seen in each iteration of training.
+      Increasing this value improves training time and may make the training more
+      stable. A larger batch size has higher memory requirements.
+    editable: true
+    header: Batch size
+    max_value: 512
+    min_value: 1
+    type: INTEGER
+    ui_rules:
+      action: DISABLE_EDITING
+      operator: AND
+      rules: []
+      type: UI_RULES
+    value: 5
+    visible_in_ui: true
+    warning:
+      Increasing this value may cause the system to use more memory than available,
+      potentially causing out of memory errors, please update with caution.
+ auto_hpo_state: NOT_POSSIBLE + description: Learning Parameters + header: Learning Parameters + learning_rate: + affects_outcome_of: TRAINING + default_value: 0.01 + description: + Increasing this value will speed up training convergence but might + make it unstable. + editable: true + header: Learning rate + max_value: 0.1 + min_value: 1.0e-07 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.01 + visible_in_ui: true + warning: null + auto_hpo_state: NOT_POSSIBLE + learning_rate_warmup_iters: + affects_outcome_of: TRAINING + default_value: 100 + description: + In this periods of initial training iterations, the model will be trained in low learning rate, + which will be increased incrementally up to the expected learning rate setting. + This warm-up phase is known to be helpful to stabilize training, thus result in better performance. + editable: true + header: Number of iterations for learning rate warmup + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: null + num_iters: + affects_outcome_of: TRAINING + default_value: 200 + description: + Increasing this value causes the results to be more robust but training + time will be longer. + editable: true + header: Number of training iterations + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1 + visible_in_ui: true + warning: null + num_workers: + affects_outcome_of: NONE + default_value: 0 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of cpu threads to use during batch generation + max_value: 8 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: null + enable_early_stopping: + affects_outcome_of: TRAINING + default_value: false + description: Early exit from training when validation accuracy isn't changed or decreased for several epochs. + editable: true + header: Enable early stopping of the training + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + early_stop_start: + affects_outcome_of: TRAINING + default_value: 3 + editable: true + header: Start epoch for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 3 + visible_in_ui: false + early_stop_patience: + affects_outcome_of: TRAINING + default_value: 10 + description: Training will stop if the model does not improve within the number of epochs of patience. + editable: true + header: Patience for early stopping + max_value: 50 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 10 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + early_stop_iteration_patience: + affects_outcome_of: TRAINING + default_value: 0 + description: + Training will stop if the model does not improve within the number of iterations of patience. 
+      This ensures the model is trained enough with the number of iterations of patience before early stopping.
+    editable: true
+    header: Iteration patience for early stopping
+    max_value: 1000
+    min_value: 0
+    type: INTEGER
+    ui_rules:
+      action: DISABLE_EDITING
+      operator: AND
+      rules: []
+      type: UI_RULES
+    value: 0
+    visible_in_ui: true
+    warning: This is applied exclusively when early stopping is enabled.
+  use_adaptive_interval:
+    affects_outcome_of: TRAINING
+    default_value: false
+    description: Depending on the number of iterations per epoch, adaptively update the validation interval and related values.
+    editable: true
+    header: Use adaptive validation interval
+    type: BOOLEAN
+    ui_rules:
+      action: DISABLE_EDITING
+      operator: AND
+      rules: []
+      type: UI_RULES
+    visible_in_ui: true
+    warning: This will automatically control the patience and interval when early stopping is enabled.
+  auto_adapt_batch_size:
+    affects_outcome_of: TRAINING
+    default_value: None
+    description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory.
+    editable: true
+    enum_name: BatchSizeAdaptType
+    header: Decrease the batch size if the current batch size does not fit into CUDA memory.
+    options:
+      NONE: "None"
+      SAFE: "Safe"
+      FULL: "Full"
+    type: SELECTABLE
+    ui_rules:
+      action: DISABLE_EDITING
+      operator: AND
+      rules: []
+      type: UI_RULES
+    value: None
+    visible_in_ui: true
+    warning:
+      Enabling this could change the actual batch size depending on the current GPU status.
+      The learning rate could also be adjusted according to the adapted batch size. This process might change
+      the model performance and take some extra computation time to try a few batch size candidates.
+  auto_num_workers:
+    affects_outcome_of: TRAINING
+    default_value: false
+    description: Automatically adapt num_workers according to the current hardware status.
+    editable: true
+    header: Enable auto adaptive num_workers
+    type: BOOLEAN
+    ui_rules:
+      action: DISABLE_EDITING
+      operator: AND
+      rules: []
+      type: UI_RULES
+    visible_in_ui: true
+    warning: null
+  type: PARAMETER_GROUP
+  visible_in_ui: true
+postprocessing:
+  confidence_threshold:
+    affects_outcome_of: INFERENCE
+    default_value: 0.35
+    description:
+      This threshold only takes effect if the threshold is not set based
+      on the result.
+ editable: true + header: Confidence threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + # value: 0.35 + value: 0.01 + visible_in_ui: true + warning: null + description: Postprocessing + header: Postprocessing + result_based_confidence_threshold: + affects_outcome_of: INFERENCE + default_value: true + description: Confidence threshold is derived from the results + editable: true + header: Result based confidence threshold + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: NONE + default_value: Incremental + description: Quantization preset that defines quantization scheme + editable: false + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 0 + description: Size of memory pool for caching decoded data to load data faster (bytes). + editable: true + header: Size of memory pool + max_value: 10000000000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: True + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: True + warning: null + stat_subset_size: + affects_outcome_of: NONE + default_value: 300 + description: Number of data samples used for post-training optimization + editable: True + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: True + warning: null + stat_requests_number: + affects_outcome_of: NONE + default_value: 0 + description: Number of requests during statistics collection + editable: true + header: Number of requests + max_value: 200 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + header: Optimization by NNCF + enable_quantization: + affects_outcome_of: INFERENCE + 
default_value: True + description: Enable quantization algorithm + editable: false + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: false + warning: null + enable_pruning: + affects_outcome_of: INFERENCE + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + pruning_supported: + affects_outcome_of: TRAINING + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + maximal_accuracy_degradation: + affects_outcome_of: NONE + default_value: 1.0 + description: The maximal allowed accuracy metric drop in absolute values + editable: True + header: Maximum accuracy degradation + max_value: 100.0 + min_value: 0.0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: True + warning: null + type: PARAMETER_GROUP + visible_in_ui: True diff --git a/src/otx/tools/templates/action/detection/x3d_fast_rcnn/template.yaml b/src/otx/tools/templates/action/detection/x3d_fast_rcnn/template.yaml new file mode 100644 index 00000000000..bac2499645f --- /dev/null +++ b/src/otx/tools/templates/action/detection/x3d_fast_rcnn/template.yaml @@ -0,0 +1,42 @@ +# Description. +model_template_id: Custom_Action_Detection_X3D_FAST_RCNN +name: X3D_FAST_RCNN +task_type: ACTION_DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Basic transfer learning template for Fast RCNN with 3D action backbone +application: ~ + +# Algo backend. +framework: OTXAction v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.005 + auto_hpo_state: POSSIBLE + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 20.6 +size: 9.1 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/src/otx/tools/templates/anomaly/classification/padim/configuration.yaml b/src/otx/tools/templates/anomaly/classification/padim/configuration.yaml new file mode 100644 index 00000000000..eac893d019c --- /dev/null +++ b/src/otx/tools/templates/anomaly/classification/padim/configuration.yaml @@ -0,0 +1,183 @@ +dataset: + description: Dataset Parameters + header: Dataset Parameters + num_workers: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 8 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. 
+ editable: true + header: Number of workers + max_value: 36 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 8 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +description: Configuration for Padim +header: Configuration for Padim +id: "" +learning_parameters: + backbone: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: resnet18 + description: Pre-trained backbone used for feature extraction + editable: false + enum_name: ModelBackbone + header: Model Backbone + options: + RESNET18: resnet18 + WIDE_RESNET_50: wide_resnet50_2 + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: resnet18 + visible_in_ui: false + warning: null + description: Learning Parameters + header: Learning Parameters + train_batch_size: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 32 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 32 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + type: PARAMETER_GROUP + visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + enable_pruning: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + enable_quantization: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: true + description: Enable quantization algorithm + editable: true + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + header: Optimization by NNCF + pruning_supported: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: true + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: true + warning: null + stat_subset_size: + affects_outcome_of: NONE 
+ auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 300 + description: Number of data samples used for post-training optimization + editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true diff --git a/src/otx/tools/templates/anomaly/classification/padim/template.yaml b/src/otx/tools/templates/anomaly/classification/padim/template.yaml new file mode 100644 index 00000000000..4f425ba2cb0 --- /dev/null +++ b/src/otx/tools/templates/anomaly/classification/padim/template.yaml @@ -0,0 +1,29 @@ +# Description. +model_template_id: ote_anomaly_classification_padim +name: PADIM +task_type: ANOMALY_CLASSIFICATION +task_family: VISION +instantiation: "CLASS" +summary: This model is faster and in many cases more accurate, but it requires a fixed position of the objects within the image. +application: ~ + +# Algo backend. +framework: OTXAnomalyClassification v0.1.0 + +# Hyper Parameters +hyper_parameters: + base_path: ./configuration.yaml + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Computational Complexity +gigaflops: 3.9 +size: 168.4 + +# Model spec +model_category: SPEED +is_default_for_task: true diff --git a/src/otx/tools/templates/anomaly/classification/stfpm/configuration.yaml b/src/otx/tools/templates/anomaly/classification/stfpm/configuration.yaml new file mode 100644 index 00000000000..ff3e8ca1517 --- /dev/null +++ b/src/otx/tools/templates/anomaly/classification/stfpm/configuration.yaml @@ -0,0 +1,312 @@ +dataset: + description: Dataset Parameters + header: Dataset Parameters + num_workers: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 8 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. 
+ editable: true + header: Number of workers + max_value: 36 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 8 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +description: Configuration for STFPM +header: Configuration for STFPM +id: "" +learning_parameters: + backbone: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: resnet18 + description: Pre-trained backbone used for feature extraction + editable: true + enum_name: ModelBackbone + header: Model Backbone + options: + RESNET18: resnet18 + WIDE_RESNET_50: wide_resnet50_2 + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: resnet18 + visible_in_ui: true + warning: null + description: Learning Parameters + early_stopping: + description: Early Stopping Parameters + header: Early Stopping Parameters + metric: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: image_F1Score + description: The metric used to determine if the model should stop training + editable: true + enum_name: EarlyStoppingMetrics + header: Early Stopping Metric + options: + IMAGE_F1: image_F1Score + IMAGE_ROC_AUC: image_AUROC + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: image_F1Score + visible_in_ui: true + warning: null + patience: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 10 + description: + Number of epochs to wait for an improvement in the monitored metric. + If the metric has not improved for this many epochs, the training will stop + and the best model will be returned. + editable: true + header: Early Stopping Patience + max_value: 100 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 10 + visible_in_ui: true + warning: + Setting this value too low might lead to underfitting. Setting the + value too high will increase the training time and might lead to overfitting. + type: PARAMETER_GROUP + visible_in_ui: true + header: Learning Parameters + lr: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 0.4 + description: Learning rate used for optimizing the Student network. + editable: true + header: Learning Rate + max_value: 1 + min_value: 0.001 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.4 + visible_in_ui: true + warning: null + max_epochs: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 100 + description: Maximum number of epochs to train the model for. + editable: true + header: Max Epochs + max_value: 500 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: + Training for very few epochs might lead to poor performance. If Early + Stopping is enabled then increasing the value of max epochs might not lead to + desired result. 
+ momentum: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 0.9 + description: Momentum used for SGD optimizer + editable: true + header: Momentum + max_value: 1.0 + min_value: 0.1 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.9 + visible_in_ui: true + warning: null + train_batch_size: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 32 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 32 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + type: PARAMETER_GROUP + visible_in_ui: true + weight_decay: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 0.0001 + description: Decay for SGD optimizer + editable: true + header: Weight Decay + max_value: 1 + min_value: 1.0e-05 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.0001 + visible_in_ui: true + warning: null +nncf_optimization: + description: Optimization by NNCF + enable_pruning: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + enable_quantization: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: true + description: Enable quantization algorithm + editable: true + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + header: Optimization by NNCF + pruning_supported: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: true + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: true + warning: null + stat_subset_size: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 300 + description: Number of data samples used for post-training optimization 
+ editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true diff --git a/src/otx/tools/templates/anomaly/classification/stfpm/template.yaml b/src/otx/tools/templates/anomaly/classification/stfpm/template.yaml new file mode 100644 index 00000000000..9b40b90ac48 --- /dev/null +++ b/src/otx/tools/templates/anomaly/classification/stfpm/template.yaml @@ -0,0 +1,34 @@ +# Description. +model_template_id: ote_anomaly_classification_stfpm +name: STFPM +task_type: ANOMALY_CLASSIFICATION +task_family: VISION +instantiation: "CLASS" +summary: Use this model when the position of the objects in the image frame might differ between images. +application: ~ + +# Algo backend. +framework: OTXAnomalyClassification v0.1.0 + +# Hyper Parameters +hyper_parameters: + base_path: ./configuration.yaml + parameter_overrides: + learning_parameters: + train_batch_size: + auto_hpo_state: POSSIBLE + lr: + auto_hpo_state: POSSIBLE + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Computational Complexity +gigaflops: 5.6 +size: 21.1 + +# Model spec +model_category: ACCURACY diff --git a/src/otx/tools/templates/anomaly/detection/padim/configuration.yaml b/src/otx/tools/templates/anomaly/detection/padim/configuration.yaml new file mode 100644 index 00000000000..eac893d019c --- /dev/null +++ b/src/otx/tools/templates/anomaly/detection/padim/configuration.yaml @@ -0,0 +1,183 @@ +dataset: + description: Dataset Parameters + header: Dataset Parameters + num_workers: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 8 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of workers + max_value: 36 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 8 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +description: Configuration for Padim +header: Configuration for Padim +id: "" +learning_parameters: + backbone: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: resnet18 + description: Pre-trained backbone used for feature extraction + editable: false + enum_name: ModelBackbone + header: Model Backbone + options: + RESNET18: resnet18 + WIDE_RESNET_50: wide_resnet50_2 + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: resnet18 + visible_in_ui: false + warning: null + description: Learning Parameters + header: Learning Parameters + train_batch_size: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 32 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. 
+ editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 32 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + type: PARAMETER_GROUP + visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + enable_pruning: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + enable_quantization: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: true + description: Enable quantization algorithm + editable: true + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + header: Optimization by NNCF + pruning_supported: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: true + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: true + warning: null + stat_subset_size: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 300 + description: Number of data samples used for post-training optimization + editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true diff --git a/src/otx/tools/templates/anomaly/detection/padim/template.yaml b/src/otx/tools/templates/anomaly/detection/padim/template.yaml new file mode 100644 index 00000000000..68d39898793 --- /dev/null +++ b/src/otx/tools/templates/anomaly/detection/padim/template.yaml @@ -0,0 +1,29 @@ +# Description. +model_template_id: ote_anomaly_detection_padim +name: PADIM +task_type: ANOMALY_DETECTION +task_family: VISION +instantiation: "CLASS" +summary: This model is faster and in many cases more accurate, but it requires a fixed position of the objects within the image. +application: ~ + +# Algo backend. 
+framework: OTXAnomalyClassification v0.1.0 # TODO: update after the name has been changed on the platform side + +# Hyper Parameters +hyper_parameters: + base_path: ./configuration.yaml + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Computational Complexity +gigaflops: 3.9 +size: 168.4 + +# Model spec +model_category: SPEED +is_default_for_task: true diff --git a/src/otx/tools/templates/anomaly/detection/stfpm/configuration.yaml b/src/otx/tools/templates/anomaly/detection/stfpm/configuration.yaml new file mode 100644 index 00000000000..ff3e8ca1517 --- /dev/null +++ b/src/otx/tools/templates/anomaly/detection/stfpm/configuration.yaml @@ -0,0 +1,312 @@ +dataset: + description: Dataset Parameters + header: Dataset Parameters + num_workers: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 8 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of workers + max_value: 36 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 8 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +description: Configuration for STFPM +header: Configuration for STFPM +id: "" +learning_parameters: + backbone: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: resnet18 + description: Pre-trained backbone used for feature extraction + editable: true + enum_name: ModelBackbone + header: Model Backbone + options: + RESNET18: resnet18 + WIDE_RESNET_50: wide_resnet50_2 + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: resnet18 + visible_in_ui: true + warning: null + description: Learning Parameters + early_stopping: + description: Early Stopping Parameters + header: Early Stopping Parameters + metric: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: image_F1Score + description: The metric used to determine if the model should stop training + editable: true + enum_name: EarlyStoppingMetrics + header: Early Stopping Metric + options: + IMAGE_F1: image_F1Score + IMAGE_ROC_AUC: image_AUROC + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: image_F1Score + visible_in_ui: true + warning: null + patience: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 10 + description: + Number of epochs to wait for an improvement in the monitored metric. + If the metric has not improved for this many epochs, the training will stop + and the best model will be returned. + editable: true + header: Early Stopping Patience + max_value: 100 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 10 + visible_in_ui: true + warning: + Setting this value too low might lead to underfitting. Setting the + value too high will increase the training time and might lead to overfitting. + type: PARAMETER_GROUP + visible_in_ui: true + header: Learning Parameters + lr: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 0.4 + description: Learning rate used for optimizing the Student network. 
+ editable: true + header: Learning Rate + max_value: 1 + min_value: 0.001 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.4 + visible_in_ui: true + warning: null + max_epochs: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 100 + description: Maximum number of epochs to train the model for. + editable: true + header: Max Epochs + max_value: 500 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: + Training for very few epochs might lead to poor performance. If Early + Stopping is enabled then increasing the value of max epochs might not lead to + desired result. + momentum: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 0.9 + description: Momentum used for SGD optimizer + editable: true + header: Momentum + max_value: 1.0 + min_value: 0.1 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.9 + visible_in_ui: true + warning: null + train_batch_size: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 32 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 32 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. 
+ type: PARAMETER_GROUP + visible_in_ui: true + weight_decay: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 0.0001 + description: Decay for SGD optimizer + editable: true + header: Weight Decay + max_value: 1 + min_value: 1.0e-05 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.0001 + visible_in_ui: true + warning: null +nncf_optimization: + description: Optimization by NNCF + enable_pruning: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + enable_quantization: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: true + description: Enable quantization algorithm + editable: true + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + header: Optimization by NNCF + pruning_supported: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: true + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: true + warning: null + stat_subset_size: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 300 + description: Number of data samples used for post-training optimization + editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true diff --git a/src/otx/tools/templates/anomaly/detection/stfpm/template.yaml b/src/otx/tools/templates/anomaly/detection/stfpm/template.yaml new file mode 100644 index 00000000000..b32df5c55d9 --- /dev/null +++ b/src/otx/tools/templates/anomaly/detection/stfpm/template.yaml @@ -0,0 +1,34 @@ +# Description. +model_template_id: ote_anomaly_detection_stfpm +name: STFPM +task_type: ANOMALY_DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Use this model when the position of the objects in the image frame might differ between images. +application: ~ + +# Algo backend. 
+framework: OTXAnomalyClassification v0.1.0 # TODO: update after the name has been changed on the platform side + +# Hyper Parameters +hyper_parameters: + base_path: ./configuration.yaml + parameter_overrides: + learning_parameters: + train_batch_size: + auto_hpo_state: POSSIBLE + lr: + auto_hpo_state: POSSIBLE + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Computational Complexity +gigaflops: 5.6 +size: 21.1 + +# Model spec +model_category: ACCURACY diff --git a/src/otx/tools/templates/anomaly/segmentation/padim/configuration.yaml b/src/otx/tools/templates/anomaly/segmentation/padim/configuration.yaml new file mode 100644 index 00000000000..eac893d019c --- /dev/null +++ b/src/otx/tools/templates/anomaly/segmentation/padim/configuration.yaml @@ -0,0 +1,183 @@ +dataset: + description: Dataset Parameters + header: Dataset Parameters + num_workers: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 8 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of workers + max_value: 36 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 8 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +description: Configuration for Padim +header: Configuration for Padim +id: "" +learning_parameters: + backbone: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: resnet18 + description: Pre-trained backbone used for feature extraction + editable: false + enum_name: ModelBackbone + header: Model Backbone + options: + RESNET18: resnet18 + WIDE_RESNET_50: wide_resnet50_2 + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: resnet18 + visible_in_ui: false + warning: null + description: Learning Parameters + header: Learning Parameters + train_batch_size: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 32 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 32 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. 
+ type: PARAMETER_GROUP + visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + enable_pruning: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + enable_quantization: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: true + description: Enable quantization algorithm + editable: true + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + header: Optimization by NNCF + pruning_supported: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: true + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: true + warning: null + stat_subset_size: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 300 + description: Number of data samples used for post-training optimization + editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true diff --git a/src/otx/tools/templates/anomaly/segmentation/padim/template.yaml b/src/otx/tools/templates/anomaly/segmentation/padim/template.yaml new file mode 100644 index 00000000000..c6702175a3a --- /dev/null +++ b/src/otx/tools/templates/anomaly/segmentation/padim/template.yaml @@ -0,0 +1,29 @@ +# Description. +model_template_id: ote_anomaly_segmentation_padim +name: PADIM +task_type: ANOMALY_SEGMENTATION +task_family: VISION +instantiation: "CLASS" +summary: This model is faster and in many cases more accurate, but it requires a fixed position of the objects within the image. +application: ~ + +# Algo backend. +framework: OTXAnomalyClassification v0.1.0 # TODO: update after the name has been changed on the platform side + +# Hyper Parameters +hyper_parameters: + base_path: ./configuration.yaml + +# Training resources. 
+max_nodes: 1 +training_targets: + - GPU + - CPU + +# Computational Complexity +gigaflops: 3.9 +size: 168.4 + +# Model spec +model_category: SPEED +is_default_for_task: true diff --git a/src/otx/tools/templates/anomaly/segmentation/stfpm/configuration.yaml b/src/otx/tools/templates/anomaly/segmentation/stfpm/configuration.yaml new file mode 100644 index 00000000000..ff3e8ca1517 --- /dev/null +++ b/src/otx/tools/templates/anomaly/segmentation/stfpm/configuration.yaml @@ -0,0 +1,312 @@ +dataset: + description: Dataset Parameters + header: Dataset Parameters + num_workers: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 8 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of workers + max_value: 36 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 8 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +description: Configuration for STFPM +header: Configuration for STFPM +id: "" +learning_parameters: + backbone: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: resnet18 + description: Pre-trained backbone used for feature extraction + editable: true + enum_name: ModelBackbone + header: Model Backbone + options: + RESNET18: resnet18 + WIDE_RESNET_50: wide_resnet50_2 + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: resnet18 + visible_in_ui: true + warning: null + description: Learning Parameters + early_stopping: + description: Early Stopping Parameters + header: Early Stopping Parameters + metric: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: image_F1Score + description: The metric used to determine if the model should stop training + editable: true + enum_name: EarlyStoppingMetrics + header: Early Stopping Metric + options: + IMAGE_F1: image_F1Score + IMAGE_ROC_AUC: image_AUROC + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: image_F1Score + visible_in_ui: true + warning: null + patience: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 10 + description: + Number of epochs to wait for an improvement in the monitored metric. + If the metric has not improved for this many epochs, the training will stop + and the best model will be returned. + editable: true + header: Early Stopping Patience + max_value: 100 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 10 + visible_in_ui: true + warning: + Setting this value too low might lead to underfitting. Setting the + value too high will increase the training time and might lead to overfitting. + type: PARAMETER_GROUP + visible_in_ui: true + header: Learning Parameters + lr: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 0.4 + description: Learning rate used for optimizing the Student network. 
+ editable: true + header: Learning Rate + max_value: 1 + min_value: 0.001 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.4 + visible_in_ui: true + warning: null + max_epochs: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 100 + description: Maximum number of epochs to train the model for. + editable: true + header: Max Epochs + max_value: 500 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: + Training for very few epochs might lead to poor performance. If Early + Stopping is enabled then increasing the value of max epochs might not lead to + desired result. + momentum: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 0.9 + description: Momentum used for SGD optimizer + editable: true + header: Momentum + max_value: 1.0 + min_value: 0.1 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.9 + visible_in_ui: true + warning: null + train_batch_size: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 32 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 32 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. 
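The learning parameters above (learning rate, momentum, batch size) all follow the same schema: a `default_value`, optional `min_value`/`max_value` bounds, and UI metadata. As a rough illustration of how such a file can be consumed, the sketch below reads a configuration.yaml with PyYAML and clamps an override into the declared range; the helper name and the clamping behaviour are assumptions, not the OTX loader itself.

```python
# Minimal sketch (not the OTX configuration loader): read a template
# configuration.yaml with PyYAML and resolve one learning parameter,
# clamping a user-supplied override into its declared [min, max] range.
import yaml


def load_parameter(config_path: str, group: str, name: str, override=None):
    """Return the effective value of `group.name`, honouring min/max bounds."""
    with open(config_path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    param = cfg[group][name]
    value = param.get("default_value") if override is None else override

    # Numeric parameters in these files declare min_value / max_value.
    if "min_value" in param:
        value = max(param["min_value"], value)
    if "max_value" in param:
        value = min(param["max_value"], value)
    return value


if __name__ == "__main__":
    # Hypothetical usage; any configuration.yaml added in this patch shares the layout.
    lr = load_parameter(
        "src/otx/tools/templates/anomaly/segmentation/stfpm/configuration.yaml",
        "learning_parameters",
        "lr",
        override=1.5,  # would be clamped to the declared max_value of 1
    )
    print(lr)
```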
+ type: PARAMETER_GROUP + visible_in_ui: true + weight_decay: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 0.0001 + description: Decay for SGD optimizer + editable: true + header: Weight Decay + max_value: 1 + min_value: 1.0e-05 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.0001 + visible_in_ui: true + warning: null +nncf_optimization: + description: Optimization by NNCF + enable_pruning: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + enable_quantization: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: true + description: Enable quantization algorithm + editable: true + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + header: Optimization by NNCF + pruning_supported: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: true + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: true + warning: null + stat_subset_size: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 300 + description: Number of data samples used for post-training optimization + editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true diff --git a/src/otx/tools/templates/anomaly/segmentation/stfpm/template.yaml b/src/otx/tools/templates/anomaly/segmentation/stfpm/template.yaml new file mode 100644 index 00000000000..bfb772c182b --- /dev/null +++ b/src/otx/tools/templates/anomaly/segmentation/stfpm/template.yaml @@ -0,0 +1,34 @@ +# Description. +model_template_id: ote_anomaly_segmentation_stfpm +name: STFPM +task_type: ANOMALY_SEGMENTATION +task_family: VISION +instantiation: "CLASS" +summary: Use this model when the position of the objects in the image frame might differ between images. +application: ~ + +# Algo backend. 
+framework: OTXAnomalyClassification v0.1.0 # TODO: update after the name has been changed on the platform side + +# Hyper Parameters +hyper_parameters: + base_path: ./configuration.yaml + parameter_overrides: + learning_parameters: + train_batch_size: + auto_hpo_state: POSSIBLE + lr: + auto_hpo_state: POSSIBLE + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Computational Complexity +gigaflops: 5.6 +size: 21.1 + +# Model spec +model_category: ACCURACY diff --git a/src/otx/tools/templates/classification/configuration.yaml b/src/otx/tools/templates/classification/configuration.yaml new file mode 100644 index 00000000000..ed91ea1cfa3 --- /dev/null +++ b/src/otx/tools/templates/classification/configuration.yaml @@ -0,0 +1,496 @@ +description: Configuration for an image classification task +header: Configuration for an image classification task +learning_parameters: + batch_size: + affects_outcome_of: TRAINING + default_value: 32 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 2048 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + auto_hpo_state: NOT_POSSIBLE + unlabeled_batch_size: + affects_outcome_of: TRAINING + default_value: 32 + description: + The number of unlabeled training samples seen in each iteration of semi-supervised learning. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Unlabeled batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + auto_hpo_state: NOT_POSSIBLE + description: Learning Parameters + header: Learning Parameters + learning_rate: + affects_outcome_of: TRAINING + default_value: 0.01 + description: + Increasing this value will speed up training convergence but might + make it unstable. + editable: true + header: Learning rate + max_value: 1.0 + min_value: 1.0e-07 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + auto_hpo_state: NOT_POSSIBLE + max_num_epochs: + affects_outcome_of: TRAINING + default_value: 200 + description: + Increasing this value causes the results to be more robust but training + time will be longer. + editable: true + header: Maximum number of training epochs + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + num_iters: + affects_outcome_of: TRAINING + default_value: 200 + description: + Increasing this value causes the results to be more robust but training + time will be longer. 
+ editable: true + header: Number of training iterations + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 200 + visible_in_ui: true + warning: null + num_workers: + affects_outcome_of: NONE + default_value: 2 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of cpu threads to use during batch generation + max_value: 8 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: null + learning_rate_warmup_iters: + affects_outcome_of: TRAINING + default_value: 100 + description: + In this periods of initial training iterations, the model will be trained in low learning rate, + which will be increased incrementally up to the expected learning rate setting. + This warm-up phase is known to be helpful to stabilize training, thus result in better performance. + editable: true + header: Number of iterations for learning rate warmup + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: null + enable_early_stopping: + affects_outcome_of: TRAINING + default_value: true + description: Early exit from training when validation accuracy is not changed or decreased for several epochs. + editable: true + header: Enable early stopping of the training + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + early_stop_start: + affects_outcome_of: TRAINING + default_value: 3 + editable: true + header: Start epoch for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 3 + visible_in_ui: false + early_stop_patience: + affects_outcome_of: TRAINING + default_value: 3 + description: Training will stop if the model does not improve within the number of epochs of patience. + editable: true + header: Patience for early stopping + max_value: 50 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 8 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + early_stop_iteration_patience: + affects_outcome_of: TRAINING + default_value: 0 + description: + Training will stop if the model does not improve within the number of iterations of patience. + This ensures the model is trained enough with the number of iterations of patience before early stopping. + editable: true + header: Iteration patience for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + use_adaptive_interval: + affects_outcome_of: TRAINING + default_value: true + description: Depending on the size of iteration per epoch, adaptively update the validation interval and related values. 
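The `learning_rate_warmup_iters` parameter above describes an initial phase in which the learning rate ramps up from a small value to the configured rate. A minimal sketch of such a schedule, assuming a plain linear ramp (the exact curve used by a given recipe may differ):

```python
# Minimal sketch of linear learning-rate warmup as described by
# `learning_rate_warmup_iters` (assumes a simple linear ramp; the actual
# schedule used by a given recipe may differ).
def warmup_lr(base_lr: float, step: int, warmup_iters: int) -> float:
    """Learning rate at `step`, ramping linearly from ~0 up to `base_lr`."""
    if warmup_iters <= 0 or step >= warmup_iters:
        return base_lr
    return base_lr * (step + 1) / warmup_iters


# Example: base LR 0.01 with the default 100 warmup iterations.
print([round(warmup_lr(0.01, s, 100), 5) for s in (0, 24, 49, 99, 100)])
```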
+ editable: true + header: Use adaptive validation interval + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: This will automatically control the patience and interval when early stopping is enabled. + enable_supcon: + affects_outcome_of: TRAINING + default_value: false + description: + Enable an auxiliar supervised contrastive loss, which might increase robustness + and accuracy for small datasets. + editable: true + header: Enable Supervised Contrastive helper loss + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + auto_adapt_batch_size: + affects_outcome_of: TRAINING + default_value: None + description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory. + editable: true + enum_name: BatchSizeAdaptType + header: Decrease batch size if current batch size isn't fit to CUDA memory. + options: + NONE: "None" + SAFE: "Safe" + FULL: "Full" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: None + visible_in_ui: true + warning: + Enabling this could change the actual batch size depending on the current GPU status. + The learning rate also could be adjusted according to the adapted batch size. This process might change + a model performance and take some extra computation time to try a few batch size candidates. + auto_num_workers: + affects_outcome_of: TRAINING + default_value: false + description: Adapt num_workers according to current hardware status automatically. + editable: true + header: Enable auto adaptive num_workers + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + input_size: + affects_outcome_of: INFERENCE + default_value: Default + description: + The input size of the given model could be configured to one of the predefined resolutions. + Reduced training and inference time could be expected by using smaller input size. + In Auto mode, the input size is automatically determined based on dataset statistics. + Defaults to per-model default resolution. + editable: true + enum_name: InputSizePreset + header: Configure model input size. + options: + DEFAULT: "Default" + AUTO: "Auto" + _64x64: "64x64" + _128x128: "128x128" + _224x224: "224x224" + _256x256: "256x256" + _384x384: "384x384" + _512x512: "512x512" + _768x768: "768x768" + _1024x1024: "1024x1024" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Default + visible_in_ui: false + warning: Modifying input size may decrease model performance. 
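The `input_size` selectable exposes fixed presets such as "224x224" alongside the special "Default" and "Auto" values. A small sketch of turning those option strings into concrete (height, width) pairs; the helper name and the None fallback for the special values are assumptions made for illustration.

```python
# Sketch of mapping the `input_size` preset strings shown above to concrete
# (height, width) pairs. "Default" keeps the per-model resolution and "Auto"
# defers to dataset statistics, so both return None here (an assumption).
from typing import Optional, Tuple


def parse_input_size(preset: str) -> Optional[Tuple[int, int]]:
    if preset in ("Default", "Auto"):
        return None  # resolved later (per-model default / dataset statistics)
    height, width = (int(v) for v in preset.split("x"))
    return height, width


print(parse_input_size("224x224"))  # (224, 224)
print(parse_input_size("Auto"))     # None
```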
+ type: PARAMETER_GROUP + visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: false + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + stat_subset_size: + affects_outcome_of: NONE + default_value: 300 + description: Number of data samples used for post-training optimization + editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + header: Optimization by NNCF + enable_quantization: + affects_outcome_of: TRAINING + default_value: true + description: Enable quantization algorithm + editable: true + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + enable_pruning: + affects_outcome_of: TRAINING + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + pruning_supported: + affects_outcome_of: TRAINING + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + maximal_accuracy_degradation: + affects_outcome_of: TRAINING + default_value: 1.0 + description: The maximal allowed accuracy metric drop + editable: true + header: Maximum accuracy degradation + max_value: 100.0 + min_value: 0.0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: TRAINING + default_value: Incremental + description: Training scheme option that determines how to train the model + editable: True + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + Semisupervised: "Semisupervised" + Selfsupervised: "Selfsupervised" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 100000000 + description: Size of memory pool for caching decoded data to load data faster (bytes). 
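The `mem_cache_size` parameter allots a byte budget for caching decoded samples. As a back-of-the-envelope sketch (uncompressed 8-bit RGB at 3 bytes per pixel is an assumption), one can estimate how many images the default 100 MB pool holds:

```python
# Rough estimate of how many decoded images fit in the `mem_cache_size`
# byte budget, assuming uncompressed 8-bit RGB (3 bytes per pixel).
def images_in_cache(mem_cache_size: int, height: int, width: int) -> int:
    bytes_per_image = height * width * 3
    return mem_cache_size // bytes_per_image


# Default pool of 100,000,000 bytes with 224x224 inputs.
print(images_in_cache(100_000_000, 224, 224))  # ~664 images
```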
+ editable: true + header: Size of memory pool + max_value: 10000000000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + enable_noisy_label_detection: + affects_outcome_of: TRAINING + default_value: false + description: Set to True to enable loss dynamics tracking for each sample to detect noisy labeled samples. + editable: true + header: Enable loss dynamics tracking for noisy label detection + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false diff --git a/src/otx/tools/templates/classification/deit_tiny/template.yaml b/src/otx/tools/templates/classification/deit_tiny/template.yaml new file mode 100644 index 00000000000..9e3842532e4 --- /dev/null +++ b/src/otx/tools/templates/classification/deit_tiny/template.yaml @@ -0,0 +1,44 @@ +# Description. +model_template_id: Custom_Image_Classification_DeiT-Tiny +name: DeiT-Tiny +task_type: CLASSIFICATION +task_family: VISION +instantiation: "CLASS" +summary: Custom Image Classification for DeiT-Tiny +application: ~ + +# Algo backend. +framework: OTXClassification v1.2.3 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 64 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.0001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 0 + num_iters: + default_value: 90 + algo_backend: + train_type: + default_value: Incremental + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 1.26 +size: 5.72 diff --git a/src/otx/tools/templates/classification/efficientnet_b0_cls_incr/template.yaml b/src/otx/tools/templates/classification/efficientnet_b0_cls_incr/template.yaml new file mode 100644 index 00000000000..b90904d13da --- /dev/null +++ b/src/otx/tools/templates/classification/efficientnet_b0_cls_incr/template.yaml @@ -0,0 +1,45 @@ +# Description. +model_template_id: Custom_Image_Classification_EfficinetNet-B0 +name: EfficientNet-B0 +task_type: CLASSIFICATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Image Classification for EfficientNet-B0 +application: ~ + +# Algo backend. +framework: OTXClassification v1.2.3 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 64 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.0049 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 0 + num_iters: + default_value: 90 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. 
+gigaflops: 0.81 +size: 4.09 + +# Model spec +model_category: BALANCE +is_default_for_task: true diff --git a/src/otx/tools/templates/classification/efficientnet_b3/template.yaml b/src/otx/tools/templates/classification/efficientnet_b3/template.yaml new file mode 100644 index 00000000000..72b0dfb3532 --- /dev/null +++ b/src/otx/tools/templates/classification/efficientnet_b3/template.yaml @@ -0,0 +1,41 @@ +# Description. +model_template_id: Custom_Image_Classification_EfficinetNet-B3 +name: EfficientNet-B3 +task_type: CLASSIFICATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Image Classification for EfficientNet-B3 +application: ~ + +# Algo backend. +framework: OTXClassification v1.2.3 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 64 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.01 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 0 + num_iters: + default_value: 90 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 1.92 +size: 10.3 diff --git a/src/otx/tools/templates/classification/efficientnet_v2_l/template.yaml b/src/otx/tools/templates/classification/efficientnet_v2_l/template.yaml new file mode 100644 index 00000000000..377368e3184 --- /dev/null +++ b/src/otx/tools/templates/classification/efficientnet_v2_l/template.yaml @@ -0,0 +1,41 @@ +# Description. +model_template_id: Custom_Image_Classification_EfficientNet-V2-L +name: EfficientNet-V2-L +task_type: CLASSIFICATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Image Classification for EfficientNet-V2-L +application: ~ + +# Algo backend. +framework: OTXClassification v1.2.3 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 64 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.01 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 0 + num_iters: + default_value: 90 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 24.46 +size: 117 diff --git a/src/otx/tools/templates/classification/efficientnet_v2_s_cls_incr/template.yaml b/src/otx/tools/templates/classification/efficientnet_v2_s_cls_incr/template.yaml new file mode 100644 index 00000000000..208369f2a64 --- /dev/null +++ b/src/otx/tools/templates/classification/efficientnet_v2_s_cls_incr/template.yaml @@ -0,0 +1,44 @@ +# Description. +model_template_id: Custom_Image_Classification_EfficientNet-V2-S +name: EfficientNet-V2-S +task_type: CLASSIFICATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Image Classification for EfficientNet-V2-S +application: ~ + +# Algo backend. +framework: OTXClassification v1.2.3 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 64 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.0071 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 0 + num_iters: + default_value: 90 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. 
+gigaflops: 5.76 +size: 20.23 + +# Model spec +model_category: ACCURACY diff --git a/src/otx/tools/templates/classification/mobilenet_v3_large_1_cls_incr/template.yaml b/src/otx/tools/templates/classification/mobilenet_v3_large_1_cls_incr/template.yaml new file mode 100644 index 00000000000..388c5275013 --- /dev/null +++ b/src/otx/tools/templates/classification/mobilenet_v3_large_1_cls_incr/template.yaml @@ -0,0 +1,44 @@ +# Description. +model_template_id: Custom_Image_Classification_MobileNet-V3-large-1x +name: MobileNet-V3-large-1x +task_type: CLASSIFICATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Image Classification for MobileNet-V3-large-1x +application: ~ + +# Algo backend. +framework: OTXClassification v1.2.3 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 64 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.0058 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 0 + num_iters: + default_value: 90 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 0.44 +size: 4.29 + +# Model spec +model_category: SPEED diff --git a/src/otx/tools/templates/classification/mobilenet_v3_small/template.yaml b/src/otx/tools/templates/classification/mobilenet_v3_small/template.yaml new file mode 100644 index 00000000000..efd02ec7350 --- /dev/null +++ b/src/otx/tools/templates/classification/mobilenet_v3_small/template.yaml @@ -0,0 +1,41 @@ +# Description. +model_template_id: Custom_Image_Classification_MobileNet-V3-small +name: MobileNet-V3-small +task_type: CLASSIFICATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Image Classification for MobileNet-V3-small +application: ~ + +# Algo backend. +framework: OTXClassification v1.2.3 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 64 + auto_hpo_state: POSSIBLE + learning_rate: + default_value: 0.01 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 0 + num_iters: + default_value: 90 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 0.11 +size: 1.6 diff --git a/src/otx/tools/templates/detection/detection/configuration.yaml b/src/otx/tools/templates/detection/detection/configuration.yaml new file mode 100644 index 00000000000..5cb11d83c9f --- /dev/null +++ b/src/otx/tools/templates/detection/detection/configuration.yaml @@ -0,0 +1,700 @@ +description: Configuration for an object detection task +header: Configuration for an object detection task +learning_parameters: + batch_size: + affects_outcome_of: TRAINING + default_value: 5 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 5 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. 
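Each template.yaml above points at a shared configuration.yaml through `hyper_parameters.base_path` and adjusts individual defaults through `parameter_overrides`. A rough sketch of how such overrides could be folded into the base defaults with a recursive dictionary merge; this is illustrative only, not the actual OTX template-resolution code.

```python
# Sketch of applying a template's `parameter_overrides` on top of the base
# configuration defaults via a recursive dictionary merge (illustrative only).
import copy


def apply_overrides(base: dict, overrides: dict) -> dict:
    merged = copy.deepcopy(base)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = apply_overrides(merged[key], value)
        else:
            merged[key] = value
    return merged


base = {"learning_parameters": {"batch_size": {"default_value": 5},
                                "learning_rate": {"default_value": 0.01}}}
overrides = {"learning_parameters": {"batch_size": {"default_value": 8,
                                                    "auto_hpo_state": "POSSIBLE"}}}
print(apply_overrides(base, overrides)["learning_parameters"]["batch_size"])
```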
+ auto_hpo_state: NOT_POSSIBLE + inference_batch_size: + affects_outcome_of: TRAINING + default_value: 1 + description: The number of samples seen in each iteration of inference. + Increasing this value improves inference time and may make the inference more + stable. A larger batch size has higher memory requirements. + editable: true + header: Inference batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + auto_hpo_state: NOT_POSSIBLE + description: Learning Parameters + header: Learning Parameters + learning_rate: + affects_outcome_of: TRAINING + default_value: 0.01 + description: + Increasing this value will speed up training convergence but might + make it unstable. + editable: true + header: Learning rate + max_value: 0.1 + min_value: 1.0e-08 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.01 + visible_in_ui: true + warning: null + auto_hpo_state: NOT_POSSIBLE + learning_rate_warmup_iters: + affects_outcome_of: TRAINING + default_value: 100 + description: + In this periods of initial training iterations, the model will be trained in low learning rate, + which will be increased incrementally up to the expected learning rate setting. + This warm-up phase is known to be helpful to stabilize training, thus result in better performance. + editable: true + header: Number of iterations for learning rate warmup + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: null + num_iters: + affects_outcome_of: TRAINING + default_value: 200 + description: + Increasing this value causes the results to be more robust but training + time will be longer. + editable: true + header: Number of training iterations + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 200 + visible_in_ui: true + warning: null + num_workers: + affects_outcome_of: NONE + default_value: 2 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of cpu threads to use during batch generation + max_value: 8 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: null + enable_early_stopping: + affects_outcome_of: TRAINING + default_value: true + description: Early exit from training when validation accuracy isn't changed or decreased for several epochs. 
+ editable: true + header: Enable early stopping of the training + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + early_stop_start: + affects_outcome_of: TRAINING + default_value: 3 + editable: true + header: Start epoch for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 3 + visible_in_ui: false + early_stop_patience: + affects_outcome_of: TRAINING + default_value: 10 + description: Training will stop if the model does not improve within the number of epochs of patience. + editable: true + header: Patience for early stopping + max_value: 50 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 10 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + early_stop_iteration_patience: + affects_outcome_of: TRAINING + default_value: 0 + description: + Training will stop if the model does not improve within the number of iterations of patience. + This ensures the model is trained enough with the number of iterations of patience before early stopping. + editable: true + header: Iteration patience for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + use_adaptive_interval: + affects_outcome_of: TRAINING + default_value: true + description: Depending on the size of iteration per epoch, adaptively update the validation interval and related values. + editable: true + header: Use adaptive validation interval + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: This will automatically control the patience and interval when early stopping is enabled. + auto_adapt_batch_size: + affects_outcome_of: TRAINING + default_value: None + description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory. + editable: true + enum_name: BatchSizeAdaptType + header: Decrease batch size if current batch size isn't fit to CUDA memory. + options: + NONE: "None" + SAFE: "Safe" + FULL: "Full" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: None + visible_in_ui: true + warning: + Enabling this could change the actual batch size depending on the current GPU status. + The learning rate also could be adjusted according to the adapted batch size. This process might change + a model performance and take some extra computation time to try a few batch size candidates. + auto_num_workers: + affects_outcome_of: TRAINING + default_value: false + description: Adapt num_workers according to current hardware status automatically. + editable: true + header: Enable auto adaptive num_workers + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + input_size: + affects_outcome_of: INFERENCE + default_value: Default + description: + The input size of the given model could be configured to one of the predefined resolutions. + Reduced training and inference time could be expected by using smaller input size. 
+ In Auto mode, the input size is automatically determined based on dataset statistics. + Defaults to per-model default resolution. + editable: true + enum_name: InputSizePreset + header: Configure model input size. + options: + DEFAULT: "Default" + AUTO: "Auto" + _256x256: "256x256" + _384x384: "384x384" + _512x512: "512x512" + _768x768: "768x768" + _1024x1024: "1024x1024" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Default + visible_in_ui: false + warning: Modifying input size may decrease model performance. + type: PARAMETER_GROUP + visible_in_ui: true +postprocessing: + confidence_threshold: + affects_outcome_of: INFERENCE + default_value: 0.35 + description: + This threshold only takes effect if the threshold is not set based + on the result. + editable: true + header: Confidence threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + # value: 0.35 + value: 0.01 + visible_in_ui: true + warning: null + nms_iou_threshold: + affects_outcome_of: INFERENCE + default_value: 0.5 + description: + IoU Threshold for NMS Postprocessing. Intersection over Union (IoU) threshold is set to remove overlapping predictions. + If the IoU between two predictions is greater than or equal to the IoU threshold, they are considered overlapping and will be discarded. + editable: true + header: NMS IoU Threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.01 + visible_in_ui: true + warning: If you want to chage the value of IoU Threshold of model, then you need to re-train model with new IoU threshold. + max_num_detections: + affects_outcome_of: INFERENCE + default_value: 0 + description: + Extra detection outputs will be discared in non-maximum suppression process. + Defaults to 0, which means per-model default values. 
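The `confidence_threshold` and `nms_iou_threshold` parameters govern standard detection post-processing: low-score boxes are dropped first, then overlapping boxes are suppressed by IoU. A compact sketch of that flow using plain greedy NMS on [x1, y1, x2, y2] boxes; it is illustrative and not tied to any particular OTX model or API.

```python
# Minimal sketch of the post-processing controlled by `confidence_threshold`
# and `nms_iou_threshold`: score filtering followed by greedy NMS.
# Boxes are [x1, y1, x2, y2]; illustrative only, not an OTX API.
def iou(a, b):
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0


def postprocess(boxes, scores, conf_thr=0.35, iou_thr=0.5, max_dets=0):
    # Keep detections above the confidence threshold, highest score first.
    order = sorted(
        (i for i, s in enumerate(scores) if s >= conf_thr),
        key=lambda i: scores[i],
        reverse=True,
    )
    keep = []
    for i in order:
        if all(iou(boxes[i], boxes[j]) < iou_thr for j in keep):
            keep.append(i)
    # max_num_detections == 0 means "use the per-model default" above,
    # so only a positive value truncates here.
    return keep[:max_dets] if max_dets > 0 else keep


boxes = [[0, 0, 10, 10], [1, 1, 11, 11], [20, 20, 30, 30]]
scores = [0.9, 0.6, 0.4]
print(postprocess(boxes, scores))  # [0, 2]
```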
+ editable: true + header: Maximum number of detections per image + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: null + use_ellipse_shapes: + affects_outcome_of: INFERENCE + default_value: false + description: Use direct ellipse shape in inference instead of polygon from mask + editable: true + header: Use ellipse shapes + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + description: Postprocessing + header: Postprocessing + result_based_confidence_threshold: + affects_outcome_of: INFERENCE + default_value: true + description: Confidence threshold is derived from the results + editable: true + header: Result based confidence threshold + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: TRAINING + default_value: Incremental + description: Training scheme option that determines how to train the model + editable: True + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 100000000 + description: Size of memory pool for caching decoded data to load data faster (bytes). + editable: true + header: Size of memory pool + max_value: 10000000000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + enable_noisy_label_detection: + affects_outcome_of: TRAINING + default_value: false + description: Set to True to enable loss dynamics tracking for each sample to detect noisy labeled samples. 
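The `storage_cache_scheme` selectable names how cached samples are re-encoded on disk ("NONE", "AS-IS", "JPEG/75", "JPEG/95", "PNG", "TIFF"). A small sketch of interpreting those option strings as an (image format, quality) pair; the mapping is an assumption for illustration only.

```python
# Sketch of interpreting the `storage_cache_scheme` options above as an
# (image format, quality) pair for re-encoding cached samples. The mapping
# is an assumption: "NONE" disables the storage cache and "AS-IS" keeps the
# original bytes untouched.
def parse_storage_cache_scheme(scheme: str):
    if scheme == "NONE":
        return None             # no storage cache
    if scheme == "AS-IS":
        return ("as-is", None)  # keep original encoding
    if "/" in scheme:           # e.g. "JPEG/75" -> lossy re-encode at quality 75
        fmt, quality = scheme.split("/")
        return (fmt.lower(), int(quality))
    return (scheme.lower(), None)  # "PNG", "TIFF": lossless re-encode


for s in ("NONE", "AS-IS", "JPEG/75", "PNG"):
    print(s, "->", parse_storage_cache_scheme(s))
```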
+ editable: true + header: Enable loss dynamics tracking for noisy label detection + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: True + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: True + warning: null + stat_subset_size: + affects_outcome_of: NONE + default_value: 300 + description: Number of data samples used for post-training optimization + editable: True + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: True + warning: null + stat_requests_number: + affects_outcome_of: NONE + default_value: 0 + description: Number of requests during statistics collection + editable: true + header: Number of requests + max_value: 100000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + header: Optimization by NNCF + enable_quantization: + affects_outcome_of: INFERENCE + default_value: True + description: Enable quantization algorithm + editable: false + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: false + warning: null + enable_pruning: + affects_outcome_of: INFERENCE + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + pruning_supported: + affects_outcome_of: TRAINING + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + maximal_accuracy_degradation: + affects_outcome_of: NONE + default_value: 1.0 + description: The maximal allowed accuracy metric drop in absolute values + editable: True + header: Maximum accuracy degradation + max_value: 100.0 + min_value: 0.0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: True + warning: null + type: PARAMETER_GROUP + visible_in_ui: True + +tiling_parameters: + header: Tiling + description: Crop dataset to tiles + + enable_tiling: + header: Enable tiling + description: Set to True to allow tiny objects to be better detected. 
+ default_value: false + editable: true + affects_outcome_of: TRAINING + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: "Tiling trades off speed for accuracy as it increases the number of images to be processed. In turn, it's memory efficient as smaller resolution patches are handled at onces so that the possibility of OOM issues could be reduced. Important: In the current version, depending on the dataset size and the available hardware resources, a model may not train successfully when tiling is enabled." + + enable_adaptive_params: + header: Enable adaptive tiling parameters + description: Config tile size and tile overlap adaptively based on annotated dataset statistic. Manual settings well be ignored if it's turned on. Please turn off this option in order to tune tiling parameters manually. + default_value: true + editable: true + affects_outcome_of: TRAINING + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + + tile_size: + header: Tile Image Size + description: Tile image size. (tile_size x tile_size) sub images will be the unit of computation. + affects_outcome_of: TRAINING + default_value: 400 + min_value: 100 + max_value: 4096 + type: INTEGER + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 400 + visible_in_ui: true + warning: null + + tile_overlap: + header: Tile Overlap + description: Overlap ratio between each two neighboring tiles. Recommend to set as large_object_size / tile_size. + affects_outcome_of: TRAINING + default_value: 0.2 + min_value: 0.0 + max_value: 0.9 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.2 + visible_in_ui: true + warning: null + + tile_max_number: + header: Max object per tile + description: Maximum number of objects per tile. If set to 1500, the tile adaptor will automatically determine the value. Otherwise, the manually set value will be used. + affects_outcome_of: TRAINING + default_value: 1500 + min_value: 1 + max_value: 5000 + type: INTEGER + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1500 + visible_in_ui: true + warning: null + + tile_sampling_ratio: + header: Sampling Ratio for entire tiling + description: Since tiling train and validation to all tile from large image, usually it takes lots of time than normal training. The tile_sampling_ratio is ratio for sampling entire tile dataset. Sampling tile dataset would save lots of time for training and validation time. Note that sampling will be applied to training and validation dataset, not test dataset. + affects_outcome_of: TRAINING + default_value: 1.0 + min_value: 0.000001 + max_value: 1.0 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: true + warning: null + + object_tile_ratio: + header: Object tile ratio + description: The desired ratio of min object size and tile size. 
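The `tile_size` and `tile_overlap` parameters define a sliding-window grid over the full image; with the defaults above (400 px tiles, 0.2 overlap) the stride is 320 px. A small sketch of enumerating tile origins for a given image, covering the geometry only:

```python
# Sketch of the tile grid implied by `tile_size` and `tile_overlap`:
# tiles of tile_size x tile_size pixels slide with a stride of
# tile_size * (1 - tile_overlap). Illustrative geometry only.
def tile_origins(image_w: int, image_h: int, tile_size: int = 400,
                 tile_overlap: float = 0.2):
    stride = max(1, int(tile_size * (1.0 - tile_overlap)))
    xs = list(range(0, max(image_w - tile_size, 0) + 1, stride))
    ys = list(range(0, max(image_h - tile_size, 0) + 1, stride))
    # Make sure the right/bottom borders are covered by a final tile.
    if xs[-1] + tile_size < image_w:
        xs.append(image_w - tile_size)
    if ys[-1] + tile_size < image_h:
        ys.append(image_h - tile_size)
    return [(x, y) for y in ys for x in xs]


origins = tile_origins(1920, 1080)
print(len(origins), origins[:3])  # 24 tiles for a 1920x1080 image
```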
+ affects_outcome_of: TRAINING + default_value: 0.03 + min_value: 0.00 + max_value: 1.00 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.03 + visible_in_ui: false + warning: null + + type: PARAMETER_GROUP + visible_in_ui: true diff --git a/src/otx/tools/templates/detection/detection/cspdarknet_yolox_l/template.yaml b/src/otx/tools/templates/detection/detection/cspdarknet_yolox_l/template.yaml new file mode 100644 index 00000000000..c161471d452 --- /dev/null +++ b/src/otx/tools/templates/detection/detection/cspdarknet_yolox_l/template.yaml @@ -0,0 +1,48 @@ +# Description. +model_template_id: Object_Detection_YOLOX_L +name: YOLOX-L +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for YOLOX_L +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 3 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 194.57 +size: 207 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/src/otx/tools/templates/detection/detection/cspdarknet_yolox_s/template.yaml b/src/otx/tools/templates/detection/detection/cspdarknet_yolox_s/template.yaml new file mode 100644 index 00000000000..f8ef1d4acd3 --- /dev/null +++ b/src/otx/tools/templates/detection/detection/cspdarknet_yolox_s/template.yaml @@ -0,0 +1,48 @@ +# Description. +model_template_id: Object_Detection_YOLOX_S +name: YOLOX-S +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for YOLOX_S +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 3 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 33.51 +size: 46 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/src/otx/tools/templates/detection/detection/cspdarknet_yolox_tiny/template.yaml b/src/otx/tools/templates/detection/detection/cspdarknet_yolox_tiny/template.yaml new file mode 100644 index 00000000000..e2e426840ed --- /dev/null +++ b/src/otx/tools/templates/detection/detection/cspdarknet_yolox_tiny/template.yaml @@ -0,0 +1,46 @@ +# Description. +model_template_id: Custom_Object_Detection_YOLOX +name: YOLOX-TINY +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for YOLOX-TINY +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. 
+capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.0002 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 3 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 6.5 +size: 20.4 + +# Model spec +model_category: SPEED diff --git a/src/otx/tools/templates/detection/detection/cspdarknet_yolox_x/template.yaml b/src/otx/tools/templates/detection/detection/cspdarknet_yolox_x/template.yaml new file mode 100644 index 00000000000..97f85fed008 --- /dev/null +++ b/src/otx/tools/templates/detection/detection/cspdarknet_yolox_x/template.yaml @@ -0,0 +1,48 @@ +# Description. +model_template_id: Object_Detection_YOLOX_X +name: YOLOX-X +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for YOLOX_X +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 4 + learning_rate: + default_value: 0.001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 3 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 352.42 +size: 378 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/src/otx/tools/templates/detection/detection/mobilenetv2_atss/template.yaml b/src/otx/tools/templates/detection/detection/mobilenetv2_atss/template.yaml new file mode 100644 index 00000000000..94dd429e1f1 --- /dev/null +++ b/src/otx/tools/templates/detection/detection/mobilenetv2_atss/template.yaml @@ -0,0 +1,47 @@ +# Description. +model_template_id: Custom_Object_Detection_Gen3_ATSS +name: MobileNetV2-ATSS +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for MobileNetV2-ATSS +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.004 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 3 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 20.6 +size: 9.1 + +# Model spec +model_category: ACCURACY +is_default_for_task: true diff --git a/src/otx/tools/templates/detection/detection/mobilenetv2_ssd/template.yaml b/src/otx/tools/templates/detection/detection/mobilenetv2_ssd/template.yaml new file mode 100644 index 00000000000..3cdde945a08 --- /dev/null +++ b/src/otx/tools/templates/detection/detection/mobilenetv2_ssd/template.yaml @@ -0,0 +1,46 @@ +# Description. 
+model_template_id: Custom_Object_Detection_Gen3_SSD +name: SSD +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for SSD +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.01 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 3 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 9.4 +size: 7.6 + +# Model spec +model_category: BALANCE diff --git a/src/otx/tools/templates/detection/detection/resnext101_atss/template.yaml b/src/otx/tools/templates/detection/detection/resnext101_atss/template.yaml new file mode 100644 index 00000000000..cf12454e78d --- /dev/null +++ b/src/otx/tools/templates/detection/detection/resnext101_atss/template.yaml @@ -0,0 +1,48 @@ +# Description. +model_template_id: Object_Detection_ResNeXt101_ATSS +name: ResNeXt101-ATSS +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for ResNeXt101-ATSS +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 4 + learning_rate: + default_value: 0.004 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 3 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 434.75 +size: 344 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/src/otx/tools/templates/detection/detection/rtdetr_101/template.yaml b/src/otx/tools/templates/detection/detection/rtdetr_101/template.yaml new file mode 100644 index 00000000000..1394cf44159 --- /dev/null +++ b/src/otx/tools/templates/detection/detection/rtdetr_101/template.yaml @@ -0,0 +1,48 @@ +# Description. +model_template_id: Object_Detection_RTDetr_101 +name: RTDetr_101 +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for RTDetr_101 +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.0001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 100 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 259 +size: 76 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. 
+# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/src/otx/tools/templates/detection/detection/rtdetr_18/template.yaml b/src/otx/tools/templates/detection/detection/rtdetr_18/template.yaml new file mode 100644 index 00000000000..7738c65f1b7 --- /dev/null +++ b/src/otx/tools/templates/detection/detection/rtdetr_18/template.yaml @@ -0,0 +1,48 @@ +# Description. +model_template_id: Object_Detection_RTDetr_18 +name: RTDetr_18 +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for RTDetr_18 +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.0001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 100 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 60 +size: 20 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/src/otx/tools/templates/detection/detection/rtdetr_50/template.yaml b/src/otx/tools/templates/detection/detection/rtdetr_50/template.yaml new file mode 100644 index 00000000000..408e48cd8fb --- /dev/null +++ b/src/otx/tools/templates/detection/detection/rtdetr_50/template.yaml @@ -0,0 +1,48 @@ +# Description. +model_template_id: Object_Detection_RTDetr_50 +name: RTDetr_50 +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for RTDetr_50 +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.0001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 100 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 136 +size: 42 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/src/otx/tools/templates/detection/detection/rtmdet_tiny/template.yaml b/src/otx/tools/templates/detection/detection/rtmdet_tiny/template.yaml new file mode 100644 index 00000000000..8b110503b62 --- /dev/null +++ b/src/otx/tools/templates/detection/detection/rtmdet_tiny/template.yaml @@ -0,0 +1,48 @@ +# Description. +model_template_id: Object_Detection_RTMDet_tiny +name: RTMDet_tiny +task_type: DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Object Detection for RTMDet-tiny +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. 
+hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 8 + learning_rate: + default_value: 0.0007 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 3 + num_iters: + default_value: 200 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 8.1 +size: 4.8 +# # Inference options. Defined by OpenVINO capabilities, not Algo Backend or Platform. +# inference_targets: +# - CPU +# - GPU +# - VPU diff --git a/src/otx/tools/templates/detection/instance_segmentation/configuration.yaml b/src/otx/tools/templates/detection/instance_segmentation/configuration.yaml new file mode 100644 index 00000000000..12277f05ddd --- /dev/null +++ b/src/otx/tools/templates/detection/instance_segmentation/configuration.yaml @@ -0,0 +1,720 @@ +description: Configuration for an instance segmentation task +header: Configuration for an instance segmentation task +learning_parameters: + batch_size: + affects_outcome_of: TRAINING + default_value: 5 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 5 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + auto_hpo_state: NOT_POSSIBLE + inference_batch_size: + affects_outcome_of: TRAINING + default_value: 1 + description: The number of samples seen in each iteration of inference. + Increasing this value improves inference time and may make the inference more + stable. A larger batch size has higher memory requirements. + editable: true + header: Inference batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + auto_hpo_state: NOT_POSSIBLE + description: Learning Parameters + header: Learning Parameters + learning_rate: + affects_outcome_of: TRAINING + default_value: 0.01 + description: + Increasing this value will speed up training convergence but might + make it unstable. + editable: true + header: Learning rate + max_value: 0.1 + min_value: 1.0e-08 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.01 + visible_in_ui: true + warning: null + auto_hpo_state: NOT_POSSIBLE + learning_rate_warmup_iters: + affects_outcome_of: TRAINING + default_value: 100 + description: + In this periods of initial training iterations, the model will be trained in low learning rate, + which will be increased incrementally up to the expected learning rate setting. + This warm-up phase is known to be helpful to stabilize training, thus result in better performance. 
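The warm-up described above can be read as a simple ramp of the learning rate over `learning_rate_warmup_iters` iterations. A minimal sketch, assuming a linear schedule (the scheduler OTX actually uses may differ):

```python
# Linear learning-rate warmup: scale the configured LR up over the first warmup_iters iterations.
def warmup_lr(base_lr: float, iteration: int, warmup_iters: int) -> float:
    """Return the learning rate to use at `iteration`, ramping linearly up to `base_lr`."""
    if warmup_iters <= 0 or iteration >= warmup_iters:
        return base_lr
    return base_lr * (iteration + 1) / warmup_iters


# With the defaults in this configuration (learning_rate 0.01, 100 warmup iterations):
assert warmup_lr(0.01, 0, 100) == 0.01 * 1 / 100   # small LR at the start
assert warmup_lr(0.01, 99, 100) == 0.01             # full LR at the end of warmup
assert warmup_lr(0.01, 500, 100) == 0.01             # unchanged afterwards
```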
+ editable: true + header: Number of iterations for learning rate warmup + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: null + num_iters: + affects_outcome_of: TRAINING + default_value: 200 + description: + Increasing this value causes the results to be more robust but training + time will be longer. + editable: true + header: Number of training iterations + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 200 + visible_in_ui: true + warning: null + num_workers: + affects_outcome_of: NONE + default_value: 2 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of cpu threads to use during batch generation + max_value: 8 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 2 + visible_in_ui: true + warning: null + enable_early_stopping: + affects_outcome_of: TRAINING + default_value: true + description: Early exit from training when validation accuracy isn't changed or decreased for several epochs. + editable: true + header: Enable early stopping of the training + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + early_stop_start: + affects_outcome_of: TRAINING + default_value: 3 + editable: true + header: Start epoch for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 3 + visible_in_ui: false + early_stop_patience: + affects_outcome_of: TRAINING + default_value: 10 + description: Training will stop if the model does not improve within the number of epochs of patience. + editable: true + header: Patience for early stopping + max_value: 50 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 10 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + early_stop_iteration_patience: + affects_outcome_of: TRAINING + default_value: 0 + description: + Training will stop if the model does not improve within the number of iterations of patience. + This ensures the model is trained enough with the number of iterations of patience before early stopping. + editable: true + header: Iteration patience for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + use_adaptive_interval: + affects_outcome_of: TRAINING + default_value: true + description: Depending on the size of iteration per epoch, adaptively update the validation interval and related values. + editable: true + header: Use adaptive validation interval + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: This will automatically control the patience and interval when early stopping is enabled. 
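The early-stopping parameters above describe a patience-based rule: training is never stopped before `early_stop_start`, and it stops once the validation metric has not improved for `early_stop_patience` consecutive evaluations. A rough sketch of the epoch-level logic, not the actual OTX callback:

```python
# Illustrative patience-based early stopping using the defaults of this configuration.
class EarlyStopper:
    def __init__(self, start: int = 3, patience: int = 10) -> None:
        self.start = start          # do not stop before this epoch (early_stop_start)
        self.patience = patience    # allowed epochs without improvement (early_stop_patience)
        self.best = float("-inf")
        self.bad_epochs = 0

    def should_stop(self, epoch: int, val_metric: float) -> bool:
        """Update the improvement counter and report whether training should stop."""
        if val_metric > self.best:
            self.best = val_metric
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
        return epoch >= self.start and self.bad_epochs >= self.patience
```

The separate `early_stop_iteration_patience` value adds the same kind of counter measured in training iterations rather than epochs, so a run is not cut short before it has trained for a minimum number of iterations.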
+ auto_adapt_batch_size: + affects_outcome_of: TRAINING + default_value: Safe + description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory. + editable: true + enum_name: BatchSizeAdaptType + header: Decrease batch size if current batch size isn't fit to CUDA memory. + options: + NONE: "None" + SAFE: "Safe" + FULL: "Full" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Safe + visible_in_ui: true + warning: + Enabling this could change the actual batch size depending on the current GPU status. + The learning rate also could be adjusted according to the adapted batch size. This process might change + a model performance and take some extra computation time to try a few batch size candidates. + auto_num_workers: + affects_outcome_of: TRAINING + default_value: false + description: Adapt num_workers according to current hardware status automatically. + editable: true + header: Enable auto adaptive num_workers + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + input_size: + affects_outcome_of: INFERENCE + default_value: Default + description: + The input size of the given model could be configured to one of the predefined resolutions. + Reduced training and inference time could be expected by using smaller input size. + In Auto mode, the input size is automatically determined based on dataset statistics. + Defaults to per-model default resolution. + editable: true + enum_name: InputSizePreset + header: Configure model input size. + options: + DEFAULT: "Default" + AUTO: "Auto" + _256x256: "256x256" + _384x384: "384x384" + _512x512: "512x512" + _768x768: "768x768" + _1024x1024: "1024x1024" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Default + visible_in_ui: false + warning: Modifying input size may decrease model performance. + type: PARAMETER_GROUP + visible_in_ui: true +postprocessing: + confidence_threshold: + affects_outcome_of: INFERENCE + default_value: 0.35 + description: + This threshold only takes effect if the threshold is not set based + on the result. + editable: true + header: Confidence threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + # value: 0.35 + value: 0.01 + visible_in_ui: true + warning: null + nms_iou_threshold: + affects_outcome_of: INFERENCE + default_value: 0.5 + description: + IoU Threshold for NMS Postprocessing. Intersection over Union (IoU) threshold is set to remove overlapping predictions. + If the IoU between two predictions is greater than or equal to the IoU threshold, they are considered overlapping and will be discarded. + editable: true + header: NMS IoU Threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.01 + visible_in_ui: true + warning: If you want to chage the value of IoU Threshold of model, then you need to re-train model with new IoU threshold. + max_num_detections: + affects_outcome_of: INFERENCE + default_value: 0 + description: + Extra detection outputs will be discared in non-maximum suppression process. + Defaults to 0, which means per-model default values. 
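Taken together, `confidence_threshold`, `nms_iou_threshold`, and `max_num_detections` describe a standard detection postprocessing chain: discard low-confidence predictions, suppress overlapping boxes, and optionally cap the number of outputs. An illustrative sketch with greedy NMS, assuming NumPy and axis-aligned `(x1, y1, x2, y2)` boxes; this is not the exact OTX or OpenVINO implementation:

```python
import numpy as np


def iou(box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
    """IoU between one box and an array of boxes, all in (x1, y1, x2, y2) format."""
    x1 = np.maximum(box[0], boxes[:, 0])
    y1 = np.maximum(box[1], boxes[:, 1])
    x2 = np.minimum(box[2], boxes[:, 2])
    y2 = np.minimum(box[3], boxes[:, 3])
    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
    area = (box[2] - box[0]) * (box[3] - box[1])
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    return inter / np.clip(area + areas - inter, 1e-9, None)


def postprocess(boxes, scores, confidence_threshold=0.35, nms_iou_threshold=0.5, max_num_detections=0):
    """Filter by confidence, apply greedy NMS, then cap outputs (0 keeps the model default)."""
    keep = scores >= confidence_threshold
    boxes, scores = boxes[keep], scores[keep]
    order = np.argsort(-scores)
    kept = []
    while order.size:
        current, rest = order[0], order[1:]
        kept.append(current)
        order = rest[iou(boxes[current], boxes[rest]) < nms_iou_threshold]
    if max_num_detections > 0:
        kept = kept[:max_num_detections]
    return boxes[kept], scores[kept]
```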
+ editable: true + header: Maximum number of detections per image + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: null + use_ellipse_shapes: + affects_outcome_of: INFERENCE + default_value: false + description: Use direct ellipse shape in inference instead of polygon from mask + editable: true + header: Use ellipse shapes + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + description: Postprocessing + header: Postprocessing + result_based_confidence_threshold: + affects_outcome_of: INFERENCE + default_value: true + description: Confidence threshold is derived from the results + editable: true + header: Result based confidence threshold + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: TRAINING + default_value: Incremental + description: Training scheme option that determines how to train the model + editable: True + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + Semisupervised: "Semisupervised" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 100000000 + description: Size of memory pool for caching decoded data to load data faster (bytes). 
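`mem_cache_size` bounds a byte-budgeted pool of decoded samples so that images do not have to be decoded again on later epochs. A rough sketch of such a pool, with hypothetical class and method names rather than the real OTX cache implementation:

```python
# Illustrative byte-budgeted cache for decoded images; default budget matches mem_cache_size.
import numpy as np


class DecodedImageCache:
    def __init__(self, mem_cache_size: int = 100_000_000) -> None:
        self.budget = mem_cache_size  # bytes
        self.used = 0
        self._items = {}              # key -> decoded np.ndarray

    def put(self, key: str, image: np.ndarray) -> bool:
        """Cache a decoded image only if it fits in the remaining byte budget."""
        if key in self._items or self.used + image.nbytes > self.budget:
            return False
        self._items[key] = image
        self.used += image.nbytes
        return True

    def get(self, key: str):
        """Return the cached array, or None so the caller falls back to decoding."""
        return self._items.get(key)
```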
+ editable: true + header: Size of memory pool + max_value: 10000000000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: True + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: True + warning: null + stat_subset_size: + affects_outcome_of: NONE + default_value: 300 + description: Number of data samples used for post-training optimization + editable: True + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: True + warning: null + stat_requests_number: + affects_outcome_of: NONE + default_value: 0 + description: Number of requests during statistics collection + editable: true + header: Number of requests + max_value: 200 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + header: Optimization by NNCF + enable_quantization: + affects_outcome_of: INFERENCE + default_value: True + description: Enable quantization algorithm + editable: false + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: false + warning: null + enable_pruning: + affects_outcome_of: INFERENCE + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + pruning_supported: + affects_outcome_of: TRAINING + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + maximal_accuracy_degradation: + affects_outcome_of: NONE + default_value: 1.0 + description: The maximal allowed accuracy metric drop in absolute values + editable: True + header: Maximum accuracy degradation + max_value: 100.0 + min_value: 0.0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: True + warning: null + type: 
PARAMETER_GROUP + visible_in_ui: True + +tiling_parameters: + header: Tiling + description: Crop dataset to tiles + + enable_tiling: + header: Enable tiling + description: Set to True to allow tiny objects to be better detected. + default_value: false + editable: true + affects_outcome_of: TRAINING + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: "Tiling trades off speed for accuracy as it increases the number of images to be processed. In turn, it's memory efficient as smaller resolution patches are handled at onces so that the possibility of OOM issues could be reduced. Important: In the current version, depending on the dataset size and the available hardware resources, a model may not train successfully when tiling is enabled." + + enable_tile_classifier: + header: Enable tile classifier + description: Enabling tile classifier enhances the speed of tiling inference by incorporating a tile classifier into the instance segmentation model. This feature prevents the detector from making predictions on tiles that do not contain any objects, thus optimizing its speed performance. + default_value: false + editable: true + affects_outcome_of: TRAINING + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: The tile classifier prioritizes inference speed over training speed, it requires more training in order to achieve its optimized performance. + + enable_adaptive_params: + header: Enable adaptive tiling parameters + description: Config tile size and tile overlap adaptively based on annotated dataset statistic. Manual settings well be ignored if it's turned on. Please turn off this option in order to tune tiling parameters manually. + default_value: true + editable: true + affects_outcome_of: TRAINING + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + + tile_size: + header: Tile Image Size + description: Tile image size. (tile_size x tile_size) sub images will be the unit of computation. + affects_outcome_of: TRAINING + default_value: 400 + min_value: 100 + max_value: 4096 + type: INTEGER + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 400 + visible_in_ui: true + warning: null + + tile_overlap: + header: Tile Overlap + description: Overlap ratio between each two neighboring tiles. Recommend to set as large_object_size / tile_size. + affects_outcome_of: TRAINING + default_value: 0.2 + min_value: 0.0 + max_value: 0.9 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.2 + visible_in_ui: true + warning: null + + tile_max_number: + header: Max object per tile + description: Maximum number of objects per tile. If set to 1500, the tile adaptor will automatically determine the value. Otherwise, the manually set value will be used. + affects_outcome_of: TRAINING + default_value: 1500 + min_value: 1 + max_value: 5000 + type: INTEGER + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1500 + visible_in_ui: true + warning: null + + tile_ir_scale_factor: + header: OpenVINO IR Scale Factor + description: The purpose of the scale parameter is to optimize the performance and efficiency of tiling in OpenVINO IR during inference. 
By controlling the increase in tile size and input size, the scale parameter allows for more efficient parallelization of the workload and improve the overall performance and efficiency of the inference process on OpenVINO. + affects_outcome_of: TRAINING + default_value: 1.0 + min_value: 1.0 + max_value: 4.0 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: true + warning: null + + tile_sampling_ratio: + header: Sampling Ratio for entire tiling + description: Since tiling train and validation to all tile from large image, usually it takes lots of time than normal training. The tile_sampling_ratio is ratio for sampling entire tile dataset. Sampling tile dataset would save lots of time for training and validation time. Note that sampling will be applied to training and validation dataset, not test dataset. + affects_outcome_of: TRAINING + default_value: 1.0 + min_value: 0.000001 + max_value: 1.0 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: true + warning: null + + object_tile_ratio: + header: Object tile ratio + description: The desired ratio of min object size and tile size. + affects_outcome_of: TRAINING + default_value: 0.03 + min_value: 0.00 + max_value: 1.00 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.03 + visible_in_ui: false + warning: null + + type: PARAMETER_GROUP + visible_in_ui: true diff --git a/src/otx/tools/templates/detection/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml b/src/otx/tools/templates/detection/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml new file mode 100644 index 00000000000..4b5e21a4f83 --- /dev/null +++ b/src/otx/tools/templates/detection/instance_segmentation/efficientnetb2b_maskrcnn/template.yaml @@ -0,0 +1,47 @@ +# Description. +model_template_id: Custom_Counting_Instance_Segmentation_MaskRCNN_EfficientNetB2B +name: MaskRCNN-EfficientNetB2B +task_type: INSTANCE_SEGMENTATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Instance Segmentation for MaskRCNN-EfficientNetB2B +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 1 + learning_rate: + default_value: 0.015 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 100 + num_iters: + default_value: 100 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 68.48 +size: 13.27 + +# Model spec +model_category: SPEED +is_default_for_task: true diff --git a/src/otx/tools/templates/detection/instance_segmentation/maskrcnn_swin_t/template.yaml b/src/otx/tools/templates/detection/instance_segmentation/maskrcnn_swin_t/template.yaml new file mode 100644 index 00000000000..e9e289c6bf6 --- /dev/null +++ b/src/otx/tools/templates/detection/instance_segmentation/maskrcnn_swin_t/template.yaml @@ -0,0 +1,43 @@ +# Description. 
+model_template_id: Custom_Counting_Instance_Segmentation_MaskRCNN_SwinT_FP16 +name: MaskRCNN-SwinT-FP16 +task_type: INSTANCE_SEGMENTATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Instance Segmentation for MaskRCNN-SwinT-FP16 +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 1 + learning_rate: + default_value: 0.0001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 100 + num_iters: + default_value: 100 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 407.32 +size: 191.46 diff --git a/src/otx/tools/templates/detection/instance_segmentation/resnet50_maskrcnn/template.yaml b/src/otx/tools/templates/detection/instance_segmentation/resnet50_maskrcnn/template.yaml new file mode 100644 index 00000000000..bd2248adbcd --- /dev/null +++ b/src/otx/tools/templates/detection/instance_segmentation/resnet50_maskrcnn/template.yaml @@ -0,0 +1,46 @@ +# Description. +model_template_id: Custom_Counting_Instance_Segmentation_MaskRCNN_ResNet50 +name: MaskRCNN-ResNet50 +task_type: INSTANCE_SEGMENTATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Instance Segmentation for MaskRCNN-ResNet50 +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 1 + learning_rate: + default_value: 0.007 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 100 + num_iters: + default_value: 100 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 533.8 +size: 177.9 + +# Model spec +model_category: ACCURACY diff --git a/src/otx/tools/templates/detection/instance_segmentation/rtmdet_tiny/template.yaml b/src/otx/tools/templates/detection/instance_segmentation/rtmdet_tiny/template.yaml new file mode 100644 index 00000000000..e904fc55ca8 --- /dev/null +++ b/src/otx/tools/templates/detection/instance_segmentation/rtmdet_tiny/template.yaml @@ -0,0 +1,43 @@ +# Description. +model_template_id: Custom_Instance_Segmentation_RTMDet_tiny +name: RTMDet_tiny +task_type: INSTANCE_SEGMENTATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Instance Segmentation for RTMDet_tiny +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 1 + learning_rate: + default_value: 0.001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 20 + num_iters: + default_value: 100 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. 
+gigaflops: 11.8 +size: 5.6 diff --git a/src/otx/tools/templates/detection/rotated_detection/configuration.yaml b/src/otx/tools/templates/detection/rotated_detection/configuration.yaml new file mode 100644 index 00000000000..b41ea7dda25 --- /dev/null +++ b/src/otx/tools/templates/detection/rotated_detection/configuration.yaml @@ -0,0 +1,705 @@ +description: Configuration for an rotated detection task +header: Configuration for an rotated detection task +learning_parameters: + batch_size: + affects_outcome_of: TRAINING + default_value: 5 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 5 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + auto_hpo_state: NOT_POSSIBLE + inference_batch_size: + affects_outcome_of: TRAINING + default_value: 1 + description: The number of samples seen in each iteration of inference. + Increasing this value improves inference time and may make the inference more + stable. A larger batch size has higher memory requirements. + editable: true + header: Inference batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + auto_hpo_state: NOT_POSSIBLE + description: Learning Parameters + header: Learning Parameters + learning_rate: + affects_outcome_of: TRAINING + default_value: 0.01 + description: + Increasing this value will speed up training convergence but might + make it unstable. + editable: true + header: Learning rate + max_value: 0.1 + min_value: 1.0e-07 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.01 + visible_in_ui: true + warning: null + auto_hpo_state: NOT_POSSIBLE + learning_rate_warmup_iters: + affects_outcome_of: TRAINING + default_value: 100 + description: + In this periods of initial training iterations, the model will be trained in low learning rate, + which will be increased incrementally up to the expected learning rate setting. + This warm-up phase is known to be helpful to stabilize training, thus result in better performance. + editable: true + header: Number of iterations for learning rate warmup + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: null + num_iters: + affects_outcome_of: TRAINING + default_value: 200 + description: + Increasing this value causes the results to be more robust but training + time will be longer. 
+ editable: true + header: Number of training iterations + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 200 + visible_in_ui: true + warning: null + num_workers: + affects_outcome_of: NONE + default_value: 2 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of cpu threads to use during batch generation + max_value: 8 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 2 + visible_in_ui: true + warning: null + enable_early_stopping: + affects_outcome_of: TRAINING + default_value: true + description: Early exit from training when validation accuracy isn't changed or decreased for several epochs. + editable: true + header: Enable early stopping of the training + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + early_stop_start: + affects_outcome_of: TRAINING + default_value: 3 + editable: true + header: Start epoch for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 3 + visible_in_ui: false + early_stop_patience: + affects_outcome_of: TRAINING + default_value: 10 + description: Training will stop if the model does not improve within the number of epochs of patience. + editable: true + header: Patience for early stopping + max_value: 50 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 10 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + early_stop_iteration_patience: + affects_outcome_of: TRAINING + default_value: 0 + description: + Training will stop if the model does not improve within the number of iterations of patience. + This ensures the model is trained enough with the number of iterations of patience before early stopping. + editable: true + header: Iteration patience for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + use_adaptive_interval: + affects_outcome_of: TRAINING + default_value: true + description: Depending on the size of iteration per epoch, adaptively update the validation interval and related values. + editable: true + header: Use adaptive validation interval + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: This will automatically control the patience and interval when early stopping is enabled. + auto_adapt_batch_size: + affects_outcome_of: TRAINING + default_value: Safe + description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory. + editable: true + enum_name: BatchSizeAdaptType + header: Decrease batch size if current batch size isn't fit to CUDA memory. 
+ options: + NONE: "None" + SAFE: "Safe" + FULL: "Full" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Safe + visible_in_ui: true + warning: + Enabling this could change the actual batch size depending on the current GPU status. + The learning rate also could be adjusted according to the adapted batch size. This process might change + a model performance and take some extra computation time to try a few batch size candidates. + auto_num_workers: + affects_outcome_of: TRAINING + default_value: false + description: Adapt num_workers according to current hardware status automatically. + editable: true + header: Enable auto adaptive num_workers + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + input_size: + affects_outcome_of: INFERENCE + default_value: Default + description: + The input size of the given model could be configured to one of the predefined resolutions. + Reduced training and inference time could be expected by using smaller input size. + In Auto mode, the input size is automatically determined based on dataset statistics. + Defaults to per-model default resolution. + editable: true + enum_name: InputSizePreset + header: Configure model input size. + options: + DEFAULT: "Default" + AUTO: "Auto" + _256x256: "256x256" + _384x384: "384x384" + _512x512: "512x512" + _768x768: "768x768" + _1024x1024: "1024x1024" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Default + visible_in_ui: false + warning: Modifying input size may decrease model performance. + type: PARAMETER_GROUP + visible_in_ui: true +postprocessing: + confidence_threshold: + affects_outcome_of: INFERENCE + default_value: 0.35 + description: + This threshold only takes effect if the threshold is not set based + on the result. + editable: true + header: Confidence threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + # value: 0.35 + value: 0.01 + visible_in_ui: true + warning: null + nms_iou_threshold: + affects_outcome_of: INFERENCE + default_value: 0.5 + description: + IoU Threshold for NMS Postprocessing. Intersection over Union (IoU) threshold is set to remove overlapping predictions. + If the IoU between two predictions is greater than or equal to the IoU threshold, they are considered overlapping and will be discarded. + editable: true + header: NMS IoU Threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.01 + visible_in_ui: true + warning: If you want to chage the value of IoU Threshold of model, then you need to re-train model with new IoU threshold. + description: Postprocessing + header: Postprocessing + result_based_confidence_threshold: + affects_outcome_of: INFERENCE + default_value: true + description: Confidence threshold is derived from the results + editable: true + header: Result based confidence threshold + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true + max_num_detections: + affects_outcome_of: INFERENCE + default_value: 0 + description: + Extra detection outputs will be discared in non-maximum suppression process. 
+ Defaults to 0, which means per-model default values. + editable: true + header: Maximum number of detections per image + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: null +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: TRAINING + default_value: Incremental + description: Training scheme option that determines how to train the model + editable: True + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + Semisupervised: "Semisupervised" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 100000000 + description: Size of memory pool for caching decoded data to load data faster (bytes). + editable: true + header: Size of memory pool + max_value: 10000000000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: True + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: True + warning: null + stat_subset_size: + affects_outcome_of: NONE + default_value: 300 + description: Number of data samples used for post-training optimization + editable: True + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: True + warning: null + stat_requests_number: + affects_outcome_of: NONE + default_value: 0 + description: Number of requests during statistics collection + editable: true + header: Number of requests + max_value: 200 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + header: Optimization by NNCF + enable_quantization: + affects_outcome_of: INFERENCE + default_value: True + description: Enable quantization algorithm + editable: false + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: false + warning: null + enable_pruning: + affects_outcome_of: INFERENCE + default_value: false + 
description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + pruning_supported: + affects_outcome_of: TRAINING + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + maximal_accuracy_degradation: + affects_outcome_of: NONE + default_value: 1.0 + description: The maximal allowed accuracy metric drop in absolute values + editable: True + header: Maximum accuracy degradation + max_value: 100.0 + min_value: 0.0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: True + warning: null + type: PARAMETER_GROUP + visible_in_ui: True + +tiling_parameters: + header: Tiling + description: Crop dataset to tiles + + enable_tiling: + header: Enable tiling + description: Set to True to allow tiny objects to be better detected. + default_value: false + editable: true + affects_outcome_of: TRAINING + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: "Tiling trades off speed for accuracy as it increases the number of images to be processed. In turn, it's memory efficient as smaller resolution patches are handled at onces so that the possibility of OOM issues could be reduced. Important: In the current version, depending on the dataset size and the available hardware resources, a model may not train successfully when tiling is enabled." + + enable_tile_classifier: + header: Enable tile classifier + description: Enabling tile classifier enhances the speed of tiling inference by incorporating a tile classifier into the instance segmentation model. This feature prevents the detector from making predictions on tiles that do not contain any objects, thus optimizing its speed performance. + default_value: false + editable: false + affects_outcome_of: TRAINING + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: The tile classifier prioritizes inference speed over training speed, it requires more training in order to achieve its optimized performance. + + enable_adaptive_params: + header: Enable adaptive tiling parameters + description: Config tile size and tile overlap adaptively based on annotated dataset statistic. Manual settings well be ignored if it's turned on. Please turn off this option in order to tune tiling parameters manually. + default_value: true + editable: true + affects_outcome_of: TRAINING + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + + tile_size: + header: Tile Image Size + description: Tile image size. (tile_size x tile_size) sub images will be the unit of computation. 
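`tile_size` and `tile_overlap` jointly determine the crop grid: neighbouring tiles are offset by roughly `tile_size * (1 - tile_overlap)` pixels. A small sketch of that arithmetic, assuming the grid is built this way (the OTX tile adaptor may compute it differently, and adaptive tiling overrides these values entirely):

```python
def tile_origins(image_w: int, image_h: int, tile_size: int = 400, tile_overlap: float = 0.2):
    """Return the top-left (x, y) of every tile_size x tile_size crop covering the image."""
    stride = max(1, int(tile_size * (1.0 - tile_overlap)))
    xs = list(range(0, max(image_w - tile_size, 0) + 1, stride)) or [0]
    ys = list(range(0, max(image_h - tile_size, 0) + 1, stride)) or [0]
    # Make sure the right and bottom borders are always covered by a final tile.
    if xs[-1] + tile_size < image_w:
        xs.append(image_w - tile_size)
    if ys[-1] + tile_size < image_h:
        ys.append(image_h - tile_size)
    return [(x, y) for y in ys for x in xs]


# A 1000x800 image with the defaults above (400 px tiles, 0.2 overlap) yields a 3x3 grid.
print(len(tile_origins(1000, 800)))  # 9
```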
+ affects_outcome_of: TRAINING + default_value: 400 + min_value: 100 + max_value: 4096 + type: INTEGER + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 400 + visible_in_ui: true + warning: null + + tile_overlap: + header: Tile Overlap + description: Overlap ratio between each two neighboring tiles. Recommend to set as large_object_size / tile_size. + affects_outcome_of: TRAINING + default_value: 0.2 + min_value: 0.0 + max_value: 0.9 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.2 + visible_in_ui: true + warning: null + + tile_max_number: + header: Max object per tile + description: Maximum number of objects per tile. If set to 1500, the tile adaptor will automatically determine the value. Otherwise, the manually set value will be used. + affects_outcome_of: TRAINING + default_value: 1500 + min_value: 1 + max_value: 5000 + type: INTEGER + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1500 + visible_in_ui: true + warning: null + + tile_ir_scale_factor: + header: OpenVINO IR Scale Factor + description: The purpose of the scale parameter is to optimize the performance and efficiency of tiling in OpenVINO IR during inference. By controlling the increase in tile size and input size, the scale parameter allows for more efficient parallelization of the workload and improve the overall performance and efficiency of the inference process on OpenVINO. + affects_outcome_of: TRAINING + default_value: 1.0 + min_value: 1.0 + max_value: 4.0 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: true + warning: null + + tile_sampling_ratio: + header: Sampling Ratio for entire tiling + description: Since tiling train and validation to all tile from large image, usually it takes lots of time than normal training. The tile_sampling_ratio is ratio for sampling entire tile dataset. Sampling tile dataset would save lots of time for training and validation time. Note that sampling will be applied to training and validation dataset, not test dataset. + affects_outcome_of: TRAINING + default_value: 1.0 + min_value: 0.000001 + max_value: 1.0 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: true + warning: null + + object_tile_ratio: + header: Object tile ratio + description: The desired ratio of min object size and tile size. + affects_outcome_of: TRAINING + default_value: 0.03 + min_value: 0.00 + max_value: 1.00 + type: FLOAT + editable: true + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.03 + visible_in_ui: false + warning: null + + type: PARAMETER_GROUP + visible_in_ui: true diff --git a/src/otx/tools/templates/detection/rotated_detection/efficientnetb2b_maskrcnn/template.yaml b/src/otx/tools/templates/detection/rotated_detection/efficientnetb2b_maskrcnn/template.yaml new file mode 100644 index 00000000000..21e079c489a --- /dev/null +++ b/src/otx/tools/templates/detection/rotated_detection/efficientnetb2b_maskrcnn/template.yaml @@ -0,0 +1,47 @@ +# Description. 
+model_template_id: Custom_Rotated_Detection_via_Instance_Segmentation_MaskRCNN_EfficientNetB2B +name: MaskRCNN-EfficientNetB2B +task_type: ROTATED_DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Rotated object detection for MaskRCNN-EfficientNetB2B +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 4 + learning_rate: + default_value: 0.007 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 100 + num_iters: + default_value: 100 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 68.48 +size: 13.27 + +# Model spec +model_category: SPEED +is_default_for_task: true diff --git a/src/otx/tools/templates/detection/rotated_detection/resnet50_maskrcnn/template.yaml b/src/otx/tools/templates/detection/rotated_detection/resnet50_maskrcnn/template.yaml new file mode 100644 index 00000000000..4cb51f466eb --- /dev/null +++ b/src/otx/tools/templates/detection/rotated_detection/resnet50_maskrcnn/template.yaml @@ -0,0 +1,46 @@ +# Description. +model_template_id: Custom_Rotated_Detection_via_Instance_Segmentation_MaskRCNN_ResNet50 +name: MaskRCNN-ResNet50 +task_type: ROTATED_DETECTION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Rotated object detection for MaskRCNN-ResNet50 +application: ~ + +# Algo backend. +framework: OTXDetection v2.9.1 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 4 + auto_hpo_state: POSSIBLE + inference_batch_size: + default_value: 4 + learning_rate: + default_value: 0.007 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 100 + num_iters: + default_value: 100 + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. +gigaflops: 533.8 +size: 177.9 + +# Model spec +model_category: ACCURACY diff --git a/src/otx/tools/templates/segmentation/configuration.yaml b/src/otx/tools/templates/segmentation/configuration.yaml new file mode 100644 index 00000000000..87a07515e02 --- /dev/null +++ b/src/otx/tools/templates/segmentation/configuration.yaml @@ -0,0 +1,473 @@ +description: Configuration for an semantic segmentation task +header: Configuration for an semantic segmentation task +id: "" +learning_parameters: + batch_size: + affects_outcome_of: TRAINING + default_value: 5 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 5 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. 
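The rotated-detection templates above reuse Mask R-CNN instance segmentation; a rotated box can then be derived from each predicted mask. One common way to do that is OpenCV's minimum-area rectangle, sketched below as an illustration of the idea rather than OTX's internal conversion:

```python
import cv2
import numpy as np


def mask_to_rotated_box(mask: np.ndarray):
    """Return ((cx, cy), (w, h), angle) for the largest blob in a binary mask, or None."""
    # [-2] picks the contour list in both OpenCV 3.x and 4.x return conventions.
    contours = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if not contours:
        return None
    largest = max(contours, key=cv2.contourArea)
    return cv2.minAreaRect(largest)


if __name__ == "__main__":
    demo = np.zeros((64, 64), dtype=np.uint8)
    demo[20:40, 10:50] = 1  # axis-aligned blob, so the returned angle is near 0 or 90 degrees
    print(mask_to_rotated_box(demo))
```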
+ auto_hpo_state: NOT_POSSIBLE + description: Learning Parameters + header: Learning Parameters + learning_rate: + affects_outcome_of: TRAINING + default_value: 0.01 + description: + Increasing this value will speed up training convergence but might + make it unstable. + editable: true + header: Learning rate + max_value: 0.1 + min_value: 1.0e-08 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.001 + visible_in_ui: true + warning: null + auto_hpo_state: NOT_POSSIBLE + learning_rate_warmup_iters: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 100 + description: + In this periods of initial training iterations, the model will be trained in low learning rate, + which will be increased incrementally up to the expected learning rate setting. + This warm-up phase is known to be helpful to stabilize training, thus result in better performance. + editable: true + header: Number of iterations for learning rate warmup + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: null + num_iters: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 200 + description: + Increasing this value causes the results to be more robust but training + time will be longer. + editable: true + header: Number of training iterations + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 200 + visible_in_ui: true + warning: null + num_workers: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 2 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of cpu threads to use during batch generation + max_value: 8 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: null + enable_early_stopping: + affects_outcome_of: TRAINING + default_value: true + description: Early exit from training when validation accuracy isn't changed or decreased for several epochs. + editable: true + header: Enable early stopping of the training + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + early_stop_start: + affects_outcome_of: TRAINING + default_value: 70 + editable: true + header: Start epoch for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 3 + visible_in_ui: false + early_stop_patience: + affects_outcome_of: TRAINING + default_value: 7 + description: Training will stop if the model does not improve within the number of epochs of patience. + editable: true + header: Patience for early stopping + max_value: 50 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 5 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. 
+ early_stop_iteration_patience: + affects_outcome_of: TRAINING + default_value: 0 + description: + Training will stop if the model does not improve within the number of iterations of patience. + This ensures the model is trained enough with the number of iterations of patience before early stopping. + editable: true + header: Iteration patience for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + enable_supcon: + affects_outcome_of: TRAINING + default_value: false + description: + Enable an auxiliar supervised contrastive loss, which might increase robustness + and accuracy for small datasets. + editable: true + header: Enable Supervised Contrastive helper loss + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + auto_adapt_batch_size: + affects_outcome_of: TRAINING + default_value: Safe + description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory. + editable: true + enum_name: BatchSizeAdaptType + header: Decrease batch size if current batch size isn't fit to CUDA memory. + options: + NONE: "None" + SAFE: "Safe" + FULL: "Full" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Safe + visible_in_ui: true + warning: + Enabling this could change the actual batch size depending on the current GPU status. + The learning rate also could be adjusted according to the adapted batch size. This process might change + a model performance and take some extra computation time to try a few batch size candidates. + auto_num_workers: + affects_outcome_of: TRAINING + default_value: false + description: Adapt num_workers according to current hardware status automatically. + editable: true + header: Enable auto adaptive num_workers + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + input_size: + affects_outcome_of: INFERENCE + default_value: Auto + description: + The input size of the given model could be configured to one of the predefined resolutions. + Reduced training and inference time could be expected by using smaller input size. + Defaults to Auto, in which input size is automatically determined based on dataset statistics. + editable: true + enum_name: InputSizePreset + header: Configure model input size. + options: + DEFAULT: "Default" + AUTO: "Auto" + _256x256: "256x256" + _384x384: "384x384" + _512x512: "512x512" + _768x768: "768x768" + _1024x1024: "1024x1024" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Default + visible_in_ui: false + warning: Modifying input size may decrease model performance. + type: PARAMETER_GROUP + visible_in_ui: true +postprocessing: + confidence_threshold: + affects_outcome_of: INFERENCE + default_value: 0.35 + description: + This threshold only takes effect if the threshold is not set based + on the result. 
+ editable: true + header: Confidence threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.35 + visible_in_ui: true + warning: null + description: Postprocessing + header: Postprocessing + result_based_confidence_threshold: + affects_outcome_of: INFERENCE + default_value: true + description: Confidence threshold is derived from the results + editable: true + header: Result based confidence threshold + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: TRAINING + default_value: Incremental + description: Training scheme option that determines how to train the model + editable: True + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + Semisupervised: "Semisupervised" + Selfsupervised: "Selfsupervised" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 100000000 + description: Size of memory pool for caching decoded data to load data faster (bytes). + editable: true + header: Size of memory pool + max_value: 10000000000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: True + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Performance + visible_in_ui: True + warning: null + stat_subset_size: + affects_outcome_of: NONE + default_value: 300 + description: Number of data samples used for post-training optimization + editable: True + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: True + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + header: Optimization by NNCF + enable_quantization: + affects_outcome_of: INFERENCE + default_value: True + description: Enable quantization algorithm + editable: false + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: false + warning: null + 
  enable_pruning:
+    affects_outcome_of: INFERENCE
+    default_value: false
+    description: Enable filter pruning algorithm
+    editable: true
+    header: Enable filter pruning algorithm
+    type: BOOLEAN
+    ui_rules:
+      action: DISABLE_EDITING
+      operator: AND
+      rules: []
+      type: UI_RULES
+    value: false
+    visible_in_ui: true
+    warning: null
+  pruning_supported:
+    affects_outcome_of: TRAINING
+    default_value: false
+    description: Whether filter pruning is supported
+    editable: false
+    header: Whether filter pruning is supported
+    type: BOOLEAN
+    ui_rules:
+      action: DISABLE_EDITING
+      operator: AND
+      rules: []
+      type: UI_RULES
+    value: false
+    visible_in_ui: false
+    warning: null
+  maximal_accuracy_degradation:
+    affects_outcome_of: NONE
+    default_value: 1.0
+    description: The maximal allowed accuracy metric drop in absolute values
+    editable: True
+    header: Maximum accuracy degradation
+    max_value: 100.0
+    min_value: 0.0
+    type: FLOAT
+    ui_rules:
+      action: DISABLE_EDITING
+      operator: AND
+      rules: []
+      type: UI_RULES
+    value: 1.0
+    visible_in_ui: True
+    warning: null
+  type: PARAMETER_GROUP
+  visible_in_ui: True
diff --git a/src/otx/tools/templates/segmentation/dinov2_small/template.yaml b/src/otx/tools/templates/segmentation/dinov2_small/template.yaml
new file mode 100644
index 00000000000..ac837fa5007
--- /dev/null
+++ b/src/otx/tools/templates/segmentation/dinov2_small/template.yaml
@@ -0,0 +1,41 @@
+# Description.
+model_template_id: Custom_Semantic_Segmentation_DINOV2_S
+name: DINOV2_S
+task_type: SEGMENTATION
+task_family: VISION
+instantiation: "CLASS"
+summary: Class-Incremental Semantic Segmentation with a larger architecture based on the foundational DINO model for better accuracy, especially on small datasets.
+application: ~
+
+# Algo backend.
+framework: OTXSegmentation v0.14.0
+
+# Capabilities.
+capabilities:
+  - compute_representations
+
+# Hyperparameters.
+hyper_parameters:
+  base_path: ../configuration.yaml
+  parameter_overrides:
+    learning_parameters:
+      batch_size:
+        default_value: 8
+        auto_hpo_state: POSSIBLE
+      learning_rate:
+        default_value: 0.001
+        auto_hpo_state: POSSIBLE
+      learning_rate_warmup_iters:
+        default_value: 100
+      num_iters:
+        default_value: 200
+
+# Training resources.
+max_nodes: 1
+training_targets:
+  - GPU
+  - CPU
+
+# Stats.
+gigaflops: 62
+size: 2.37
diff --git a/src/otx/tools/templates/segmentation/ham_segnext_b/template.yaml b/src/otx/tools/templates/segmentation/ham_segnext_b/template.yaml
new file mode 100644
index 00000000000..ef390639238
--- /dev/null
+++ b/src/otx/tools/templates/segmentation/ham_segnext_b/template.yaml
@@ -0,0 +1,45 @@
+# Description.
+model_template_id: Custom_Semantic_Segmentation_SegNext_B
+name: SegNext-B
+task_type: SEGMENTATION
+task_family: VISION
+instantiation: "CLASS"
+summary: Class-Incremental Semantic Segmentation with a larger architecture based on the MSCAN backbone for better accuracy.
+application: ~
+
+# Algo backend.
+framework: OTXSegmentation v0.14.0
+
+# Capabilities.
+capabilities:
+  - compute_representations
+
+# Hyperparameters.
+hyper_parameters:
+  base_path: ../configuration.yaml
+  parameter_overrides:
+    learning_parameters:
+      batch_size:
+        default_value: 8
+        auto_hpo_state: POSSIBLE
+      learning_rate:
+        default_value: 0.00006
+        auto_hpo_state: POSSIBLE
+      learning_rate_warmup_iters:
+        default_value: 100
+      num_iters:
+        default_value: 200
+      early_stop_start:
+        default_value: 100
+      early_stop_patience:
+        default_value: 10
+
+# Training resources.
+max_nodes: 1
+training_targets:
+  - GPU
+  - CPU
+
+# Stats.
+gigaflops: 32.08
+size: 27.56
diff --git a/src/otx/tools/templates/segmentation/ham_segnext_s/template.yaml b/src/otx/tools/templates/segmentation/ham_segnext_s/template.yaml
new file mode 100644
index 00000000000..9afd2660cf2
--- /dev/null
+++ b/src/otx/tools/templates/segmentation/ham_segnext_s/template.yaml
@@ -0,0 +1,45 @@
+# Description.
+model_template_id: Custom_Semantic_Segmentation_SegNext_s
+name: SegNext-s
+task_type: SEGMENTATION
+task_family: VISION
+instantiation: "CLASS"
+summary: Class-Incremental Semantic Segmentation with a medium-sized architecture based on the MSCAN backbone for a balance between accuracy and fast inference.
+application: ~
+
+# Algo backend.
+framework: OTXSegmentation v0.14.0
+
+# Capabilities.
+capabilities:
+  - compute_representations
+
+# Hyperparameters.
+hyper_parameters:
+  base_path: ../configuration.yaml
+  parameter_overrides:
+    learning_parameters:
+      batch_size:
+        default_value: 8
+        auto_hpo_state: POSSIBLE
+      learning_rate:
+        default_value: 0.00006
+        auto_hpo_state: POSSIBLE
+      learning_rate_warmup_iters:
+        default_value: 100
+      num_iters:
+        default_value: 200
+      early_stop_start:
+        default_value: 100
+      early_stop_patience:
+        default_value: 10
+
+# Training resources.
+max_nodes: 1
+training_targets:
+  - GPU
+  - CPU
+
+# Stats.
+gigaflops: 15.35
+size: 13.9
diff --git a/src/otx/tools/templates/segmentation/ham_segnext_t/template.yaml b/src/otx/tools/templates/segmentation/ham_segnext_t/template.yaml
new file mode 100644
index 00000000000..c5879535caa
--- /dev/null
+++ b/src/otx/tools/templates/segmentation/ham_segnext_t/template.yaml
@@ -0,0 +1,45 @@
+# Description.
+model_template_id: Custom_Semantic_Segmentation_SegNext_t
+name: SegNext-t
+task_type: SEGMENTATION
+task_family: VISION
+instantiation: "CLASS"
+summary: Class-Incremental Semantic Segmentation with a small-sized architecture based on the MSCAN backbone for faster inference while preserving competitive accuracy.
+application: ~
+
+# Algo backend.
+framework: OTXSegmentation v0.14.0
+
+# Capabilities.
+capabilities:
+  - compute_representations
+
+# Hyperparameters.
+hyper_parameters:
+  base_path: ../configuration.yaml
+  parameter_overrides:
+    learning_parameters:
+      batch_size:
+        default_value: 8
+        auto_hpo_state: POSSIBLE
+      learning_rate:
+        default_value: 0.00006
+        auto_hpo_state: POSSIBLE
+      learning_rate_warmup_iters:
+        default_value: 100
+      num_iters:
+        default_value: 200
+      early_stop_start:
+        default_value: 100
+      early_stop_patience:
+        default_value: 10
+
+# Training resources.
+max_nodes: 1
+training_targets:
+  - GPU
+  - CPU
+
+# Stats.
+gigaflops: 6.07
+size: 4.23
diff --git a/src/otx/tools/templates/segmentation/ocr_lite_hrnet_18_mod2/template.yaml b/src/otx/tools/templates/segmentation/ocr_lite_hrnet_18_mod2/template.yaml
new file mode 100644
index 00000000000..56af9f4b2b2
--- /dev/null
+++ b/src/otx/tools/templates/segmentation/ocr_lite_hrnet_18_mod2/template.yaml
@@ -0,0 +1,44 @@
+# Description.
+model_template_id: Custom_Semantic_Segmentation_Lite-HRNet-18-mod2_OCR
+name: Lite-HRNet-18-mod2
+task_type: SEGMENTATION
+task_family: VISION
+instantiation: "CLASS"
+summary: Class-Incremental Semantic Segmentation with a middle-sized architecture based on the Lite-HRNet backbone for a balance between fast inference and training time.
+application: ~
+
+# Algo backend.
+framework: OTXSegmentation v0.14.0
+
+# Capabilities.
+capabilities:
+  - compute_representations
+
+# Hyperparameters.
+hyper_parameters:
+  base_path: ../configuration.yaml
+  parameter_overrides:
+    learning_parameters:
+      batch_size:
+        default_value: 8
+      learning_rate:
+        default_value: 0.001
+        auto_hpo_state: POSSIBLE
+      learning_rate_warmup_iters:
+        default_value: 100
+      num_iters:
+        default_value: 200
+
+# Training resources.
+max_nodes: 1
+training_targets:
+  - GPU
+  - CPU
+
+# Stats.
+gigaflops: 3.63
+size: 4.8
+
+# Model spec
+model_category: BALANCE
+is_default_for_task: true
diff --git a/src/otx/tools/templates/segmentation/ocr_lite_hrnet_s_mod2/template.yaml b/src/otx/tools/templates/segmentation/ocr_lite_hrnet_s_mod2/template.yaml
new file mode 100644
index 00000000000..6637673c64e
--- /dev/null
+++ b/src/otx/tools/templates/segmentation/ocr_lite_hrnet_s_mod2/template.yaml
@@ -0,0 +1,44 @@
+# Description.
+model_template_id: Custom_Semantic_Segmentation_Lite-HRNet-s-mod2_OCR
+name: Lite-HRNet-s-mod2
+task_type: SEGMENTATION
+task_family: VISION
+instantiation: "CLASS"
+summary: Class-Incremental Semantic Segmentation with a lightweight architecture based on the Lite-HRNet backbone for fast inference and training on a limited amount of data.
+application: ~
+
+# Algo backend.
+framework: OTXSegmentation v0.14.0
+
+# Capabilities.
+capabilities:
+  - compute_representations
+
+# Hyperparameters.
+hyper_parameters:
+  base_path: ../configuration.yaml
+  parameter_overrides:
+    learning_parameters:
+      batch_size:
+        default_value: 8
+        auto_hpo_state: POSSIBLE
+      learning_rate:
+        default_value: 0.001
+        auto_hpo_state: POSSIBLE
+      learning_rate_warmup_iters:
+        default_value: 100
+      num_iters:
+        default_value: 200
+
+# Training resources.
+max_nodes: 1
+training_targets:
+  - GPU
+  - CPU
+
+# Stats.
+gigaflops: 1.82
+size: 3.5
+
+# Model spec
+model_category: SPEED
diff --git a/src/otx/tools/templates/segmentation/ocr_lite_hrnet_x_mod3/template.yaml b/src/otx/tools/templates/segmentation/ocr_lite_hrnet_x_mod3/template.yaml
new file mode 100644
index 00000000000..f4bc011554f
--- /dev/null
+++ b/src/otx/tools/templates/segmentation/ocr_lite_hrnet_x_mod3/template.yaml
@@ -0,0 +1,44 @@
+# Description.
+model_template_id: Custom_Semantic_Segmentation_Lite-HRNet-x-mod3_OCR
+name: Lite-HRNet-x-mod3
+task_type: SEGMENTATION
+task_family: VISION
+instantiation: "CLASS"
+summary: Class-Incremental Semantic Segmentation with a heavy-sized architecture based on the Lite-HRNet backbone for accurate predictions at the cost of longer training.
+application: ~
+
+# Algo backend.
+framework: OTXSegmentation v0.14.0
+
+# Capabilities.
+capabilities:
+  - compute_representations
+
+# Hyperparameters.
+hyper_parameters:
+  base_path: ../configuration.yaml
+  parameter_overrides:
+    learning_parameters:
+      batch_size:
+        default_value: 8
+        auto_hpo_state: POSSIBLE
+      learning_rate:
+        default_value: 0.001
+        auto_hpo_state: POSSIBLE
+      learning_rate_warmup_iters:
+        default_value: 100
+      num_iters:
+        default_value: 200
+
+# Training resources.
+max_nodes: 1
+training_targets:
+  - GPU
+  - CPU
+
+# Stats.
+gigaflops: 13.97 +size: 6.4 + +# Model spec +model_category: ACCURACY diff --git a/src/otx/tools/templates/visual_prompting/configuration.yaml b/src/otx/tools/templates/visual_prompting/configuration.yaml new file mode 100644 index 00000000000..a9c04e23137 --- /dev/null +++ b/src/otx/tools/templates/visual_prompting/configuration.yaml @@ -0,0 +1,235 @@ +description: Configuration for SAM +header: Configuration for SAM +id: "" +learning_parameters: + description: Learning Parameters + header: Learning Parameters + type: PARAMETER_GROUP + visible_in_ui: true + trainer: + description: Trainer Parameters + header: Trainer Parameters + type: PARAMETER_GROUP + visible_in_ui: true + max_epochs: + affects_outcome_of: TRAINING + default_value: 100 + description: + Maximum number of epochs to train for. If not specified, the training will + run until the early stopping criteria is met. + editable: true + header: Maximum number of epochs + max_value: 1000 + min_value: 1 + type: INTEGER + value: 100 + dataset: + description: Dataset Parameters + header: Dataset Parameters + type: PARAMETER_GROUP + visible_in_ui: true + use_mask: + header: Flag about using mask as label + affects_outcome_of: TRAINING + default_value: false + description: If using mask as-is (true) or converting it to polygon (false) + editable: true + value: false + type: BOOLEAN + train_batch_size: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 2 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 32 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + optimizer: + description: Optimizer Parameters + header: Optimizer Parameters + type: PARAMETER_GROUP + visible_in_ui: true + lr: + affects_outcome_of: TRAINING + default_value: 0.00001 + description: + Increasing this value will speed up training convergence but might + make it unstable. 
+ editable: true + header: Learning rate + max_value: 10 + min_value: 1.0e-07 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.0001 + visible_in_ui: true + warning: null + auto_hpo_state: NOT_POSSIBLE +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: Mixed + description: Quantization preset that defines quantization scheme + editable: true + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Mixed + visible_in_ui: true + warning: null + stat_subset_size: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 300 + description: Number of data samples used for post-training optimization + editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +postprocessing: + confidence_threshold: + affects_outcome_of: INFERENCE + default_value: 0.5 + description: + This threshold only takes effect if the threshold is not set based + on the result. + editable: true + header: Confidence threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.5 + visible_in_ui: true + warning: null + description: Postprocessing + header: Postprocessing + result_based_confidence_threshold: + affects_outcome_of: INFERENCE + default_value: false + description: Confidence threshold is derived from the results + editable: true + header: Result based confidence threshold + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: TRAINING + default_value: Incremental + description: Training scheme option that determines how to train the model + editable: True + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + Zeroshot: "Zeroshot" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 1000000000 + description: Size of memory pool for caching decoded data to load data faster (bytes). 
+ editable: true + header: Size of memory pool + max_value: 9223372036854775807 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true diff --git a/src/otx/tools/templates/visual_prompting/sam_tiny_vit/template.yaml b/src/otx/tools/templates/visual_prompting/sam_tiny_vit/template.yaml new file mode 100644 index 00000000000..9b84e3a9e15 --- /dev/null +++ b/src/otx/tools/templates/visual_prompting/sam_tiny_vit/template.yaml @@ -0,0 +1,25 @@ +# Description. +model_template_id: Visual_Prompting_SAM_Tiny_ViT +name: SAM_Tiny_ViT +task_type: VISUAL_PROMPTING +task_family: VISION +instantiation: "CLASS" +summary: Visual Prompting with TinyViT for the accurate predictions +application: ~ + +# Algo backend. +framework: OTXVisualPrompting v0.1.0 + +# Hyper Parameters +hyper_parameters: + base_path: ../configuration.yaml + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Computational Complexity +gigaflops: 38.95 +size: 47 diff --git a/src/otx/tools/templates/visual_prompting/sam_vit_b/template.yaml b/src/otx/tools/templates/visual_prompting/sam_vit_b/template.yaml new file mode 100644 index 00000000000..3bd17deaf3f --- /dev/null +++ b/src/otx/tools/templates/visual_prompting/sam_vit_b/template.yaml @@ -0,0 +1,25 @@ +# Description. +model_template_id: Visual_Prompting_SAM_ViT_B +name: SAM_ViT_B +task_type: VISUAL_PROMPTING +task_family: VISION +instantiation: "CLASS" +summary: Visual Prompting with ViT-B for the accurate predictions +application: ~ + +# Algo backend. +framework: OTXVisualPrompting v0.1.0 + +# Hyper Parameters +hyper_parameters: + base_path: ../configuration.yaml + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Computational Complexity +gigaflops: 483.71 +size: 362 diff --git a/src/otx/tools/templates/visual_prompting/zero_shot_sam_tiny_vit/configuration.yaml b/src/otx/tools/templates/visual_prompting/zero_shot_sam_tiny_vit/configuration.yaml new file mode 100644 index 00000000000..1740ecd8324 --- /dev/null +++ b/src/otx/tools/templates/visual_prompting/zero_shot_sam_tiny_vit/configuration.yaml @@ -0,0 +1,210 @@ +description: Configuration for SAM +header: Configuration for SAM +id: "" +learning_parameters: + description: Learning Parameters + header: Learning Parameters + type: PARAMETER_GROUP + visible_in_ui: true + trainer: + description: Trainer Parameters + header: Trainer Parameters + type: PARAMETER_GROUP + visible_in_ui: true + max_epochs: + affects_outcome_of: TRAINING + default_value: 1 + description: + Maximum number of epochs to train for. If not specified, the training will + run until the early stopping criteria is met. 
+ editable: true + header: Maximum number of epochs + max_value: 1 + min_value: 1 + type: INTEGER + value: 1 + dataset: + description: Dataset Parameters + header: Dataset Parameters + type: PARAMETER_GROUP + visible_in_ui: true + use_mask: + header: Flag about using mask as label + affects_outcome_of: TRAINING + default_value: false + description: If using mask as-is (true) or converting it to polygon (false) + editable: true + value: false + type: BOOLEAN + train_batch_size: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 1 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 32 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: Mixed + description: Quantization preset that defines quantization scheme + editable: true + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Mixed + visible_in_ui: true + warning: null + stat_subset_size: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 300 + description: Number of data samples used for post-training optimization + editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +postprocessing: + confidence_threshold: + affects_outcome_of: INFERENCE + default_value: 0.5 + description: + This threshold only takes effect if the threshold is not set based + on the result. 
+ editable: true + header: Confidence threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.5 + visible_in_ui: true + warning: null + description: Postprocessing + header: Postprocessing + result_based_confidence_threshold: + affects_outcome_of: INFERENCE + default_value: false + description: Confidence threshold is derived from the results + editable: true + header: Result based confidence threshold + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: TRAINING + default_value: Incremental + description: Training scheme option that determines how to train the model + editable: True + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + Zeroshot: "Zeroshot" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 1000000000 + description: Size of memory pool for caching decoded data to load data faster (bytes). + editable: true + header: Size of memory pool + max_value: 9223372036854775807 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true diff --git a/src/otx/tools/templates/visual_prompting/zero_shot_sam_tiny_vit/template.yaml b/src/otx/tools/templates/visual_prompting/zero_shot_sam_tiny_vit/template.yaml new file mode 100644 index 00000000000..c3f7a3c1d3f --- /dev/null +++ b/src/otx/tools/templates/visual_prompting/zero_shot_sam_tiny_vit/template.yaml @@ -0,0 +1,33 @@ +# Description. +model_template_id: Zero_Shot_SAM_Tiny_ViT +name: Zero_Shot_SAM_Tiny_ViT +task_type: VISUAL_PROMPTING +task_family: VISION +instantiation: "CLASS" +summary: Zero SHot Visual Prompting with TinyViT for the accurate predictions +application: ~ + +# Algo backend. +framework: OTXVisualPrompting v0.1.0 + +# Hyper Parameters +hyper_parameters: + base_path: ./configuration.yaml + parameter_overrides: + learning_parameters: + dataset: + train_batch_size: + default_value: 1 + algo_backend: + train_type: + default_value: Zeroshot + +# Training resources. 
+max_nodes: 1 +training_targets: + - GPU + - CPU + +# Computational Complexity +gigaflops: 38.18 +size: 25 diff --git a/src/otx/tools/templates/visual_prompting/zero_shot_sam_vit_b/configuration.yaml b/src/otx/tools/templates/visual_prompting/zero_shot_sam_vit_b/configuration.yaml new file mode 100644 index 00000000000..1740ecd8324 --- /dev/null +++ b/src/otx/tools/templates/visual_prompting/zero_shot_sam_vit_b/configuration.yaml @@ -0,0 +1,210 @@ +description: Configuration for SAM +header: Configuration for SAM +id: "" +learning_parameters: + description: Learning Parameters + header: Learning Parameters + type: PARAMETER_GROUP + visible_in_ui: true + trainer: + description: Trainer Parameters + header: Trainer Parameters + type: PARAMETER_GROUP + visible_in_ui: true + max_epochs: + affects_outcome_of: TRAINING + default_value: 1 + description: + Maximum number of epochs to train for. If not specified, the training will + run until the early stopping criteria is met. + editable: true + header: Maximum number of epochs + max_value: 1 + min_value: 1 + type: INTEGER + value: 1 + dataset: + description: Dataset Parameters + header: Dataset Parameters + type: PARAMETER_GROUP + visible_in_ui: true + use_mask: + header: Flag about using mask as label + affects_outcome_of: TRAINING + default_value: false + description: If using mask as-is (true) or converting it to polygon (false) + editable: true + value: false + type: BOOLEAN + train_batch_size: + affects_outcome_of: TRAINING + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 1 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 32 + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: Mixed + description: Quantization preset that defines quantization scheme + editable: true + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Mixed + visible_in_ui: true + warning: null + stat_subset_size: + affects_outcome_of: NONE + auto_hpo_state: not_possible + auto_hpo_value: null + default_value: 300 + description: Number of data samples used for post-training optimization + editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 300 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +postprocessing: + confidence_threshold: + affects_outcome_of: INFERENCE + default_value: 0.5 + description: + This threshold only takes effect if the threshold is not set based + on the result. 
+ editable: true + header: Confidence threshold + max_value: 1 + min_value: 0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0.5 + visible_in_ui: true + warning: null + description: Postprocessing + header: Postprocessing + result_based_confidence_threshold: + affects_outcome_of: INFERENCE + default_value: false + description: Confidence threshold is derived from the results + editable: true + header: Result based confidence threshold + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: TRAINING + default_value: Incremental + description: Training scheme option that determines how to train the model + editable: True + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + Zeroshot: "Zeroshot" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 1000000000 + description: Size of memory pool for caching decoded data to load data faster (bytes). + editable: true + header: Size of memory pool + max_value: 9223372036854775807 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true diff --git a/src/otx/tools/templates/visual_prompting/zero_shot_sam_vit_b/template.yaml b/src/otx/tools/templates/visual_prompting/zero_shot_sam_vit_b/template.yaml new file mode 100644 index 00000000000..ebf495e6ac8 --- /dev/null +++ b/src/otx/tools/templates/visual_prompting/zero_shot_sam_vit_b/template.yaml @@ -0,0 +1,33 @@ +# Description. +model_template_id: Zero_Shot_SAM_ViT_B +name: Zero_Shot_SAM_ViT_B +task_type: VISUAL_PROMPTING +task_family: VISION +instantiation: "CLASS" +summary: Zero SHot Visual Prompting with ViT-B for the accurate predictions +application: ~ + +# Algo backend. +framework: OTXVisualPrompting v0.1.0 + +# Hyper Parameters +hyper_parameters: + base_path: ./configuration.yaml + parameter_overrides: + learning_parameters: + dataset: + train_batch_size: + default_value: 1 + algo_backend: + train_type: + default_value: Zeroshot + +# Training resources. 
+max_nodes: 1 +training_targets: + - GPU + - CPU + +# Computational Complexity +gigaflops: 483.71 +size: 362 From 0f87c868f3d32da4ee9c45d506b79ae2a33c227f Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Thu, 12 Sep 2024 16:08:10 +0100 Subject: [PATCH 40/53] Add missing tile recipes and various tile recipe changes (#3942) * add missing tile recipes * Fix tiling XAI out of range (#3943) - Fix tile merge XAI out of range * update xai tile merge * update rtdetr * update tile recipes * update rtdetr tile postprocess * update rtdetr recipes and tile recipes * update tile recipes * fix rtdetr unittest * update recipes * refactor tile unit test * address pr reviews * remove unnecessary files * update color channel * fix image channel passing * include tiling in cli integration test * remove transform_bbox --------- Co-authored-by: Vladislav Sovrasov --- .../base_models/detection_transformer.py | 19 ++-- src/otx/algo/detection/rtdetr.py | 18 ++-- src/otx/algo/detection/yolox.py | 7 +- src/otx/core/data/dataset/tile.py | 3 + src/otx/core/data/entity/tile.py | 4 +- src/otx/core/utils/tile_merge.py | 16 ++-- .../recipe/_base_/data/detection_tile.yaml | 82 +++++++++++++++++ .../detection/atss_mobilenetv2_tile.yaml | 6 +- .../detection/atss_resnext101_tile.yaml | 51 +++++++++++ src/otx/recipe/detection/rtdetr_101.yaml | 2 - src/otx/recipe/detection/rtdetr_101_tile.yaml | 89 +++++++++++++++++++ src/otx/recipe/detection/rtdetr_18.yaml | 2 - src/otx/recipe/detection/rtdetr_18_tile.yaml | 88 ++++++++++++++++++ src/otx/recipe/detection/rtdetr_50.yaml | 2 - src/otx/recipe/detection/rtdetr_50_tile.yaml | 89 +++++++++++++++++++ .../recipe/detection/rtmdet_tiny_tile.yaml | 69 ++++++++++++++ .../detection/ssd_mobilenetv2_tile.yaml | 22 +---- src/otx/recipe/detection/yolox_l_tile.yaml | 30 +------ src/otx/recipe/detection/yolox_s_tile.yaml | 29 +----- src/otx/recipe/detection/yolox_tiny_tile.yaml | 31 +------ src/otx/recipe/detection/yolox_x_tile.yaml | 30 +------ .../integration/cli/test_export_inference.py | 2 - .../algo/detection/base_models/test_detr.py | 8 +- tests/unit/core/data/test_tiling.py | 87 ++++-------------- 24 files changed, 542 insertions(+), 244 deletions(-) create mode 100644 src/otx/recipe/_base_/data/detection_tile.yaml create mode 100644 src/otx/recipe/detection/atss_resnext101_tile.yaml create mode 100644 src/otx/recipe/detection/rtdetr_101_tile.yaml create mode 100644 src/otx/recipe/detection/rtdetr_18_tile.yaml create mode 100644 src/otx/recipe/detection/rtdetr_50_tile.yaml create mode 100644 src/otx/recipe/detection/rtmdet_tiny_tile.yaml diff --git a/src/otx/algo/detection/base_models/detection_transformer.py b/src/otx/algo/detection/base_models/detection_transformer.py index ed19a38dbd0..479cb7e9ce5 100644 --- a/src/otx/algo/detection/base_models/detection_transformer.py +++ b/src/otx/algo/detection/base_models/detection_transformer.py @@ -98,19 +98,24 @@ def export( if explain_mode: msg = "Explain mode is not supported for DETR models yet." raise NotImplementedError(msg) - return self.postprocess(self._forward_features(batch_inputs), deploy_mode=True) + + return self.postprocess( + self._forward_features(batch_inputs), + [meta["img_shape"] for meta in batch_img_metas], + deploy_mode=True, + ) def postprocess( self, outputs: dict[str, Tensor], - original_size: tuple[int, int] | None = None, + original_sizes: list[tuple[int, int]], deploy_mode: bool = False, ) -> dict[str, Tensor] | tuple[list[Tensor], list[Tensor], list[Tensor]]: """Post-processes the model outputs. 
Args: outputs (dict[str, Tensor]): The model outputs. - original_size (tuple[int, int], optional): The original size of the input images. Defaults to None. + original_sizes (list[tuple[int, int]]): The original image sizes. deploy_mode (bool, optional): Whether to run in deploy mode. Defaults to False. Returns: @@ -120,9 +125,9 @@ def postprocess( # convert bbox to xyxy and rescale back to original size (resize in OTX) bbox_pred = box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy") - if not deploy_mode and original_size is not None: - original_size_tensor = torch.tensor(original_size).to(bbox_pred.device) - bbox_pred *= original_size_tensor.repeat(1, 2).unsqueeze(1) + if not deploy_mode: + original_size_tensor = torch.tensor(original_sizes).to(bbox_pred.device) + bbox_pred *= original_size_tensor.flip(1).repeat(1, 2).unsqueeze(1) # perform scores computation and gather topk results scores = nn.functional.sigmoid(logits) @@ -136,7 +141,7 @@ def postprocess( scores_list, boxes_list, labels_list = [], [], [] - for sc, bb, ll in zip(scores, boxes, labels): + for sc, bb, ll, original_size in zip(scores, boxes, labels, original_sizes): scores_list.append(sc) boxes_list.append( BoundingBoxes(bb, format="xyxy", canvas_size=original_size), diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 9f487c04be3..8350dbc5fe0 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -75,13 +75,14 @@ def _customize_inputs( # prepare bboxes for the model for bb, ll in zip(entity.bboxes, entity.labels): # convert to cxcywh if needed - converted_bboxes = ( - box_convert(bb, in_fmt="xyxy", out_fmt="cxcywh") if bb.format == BoundingBoxFormat.XYXY else bb - ) - # normalize the bboxes - scaled_bboxes = converted_bboxes / torch.tensor(bb.canvas_size[::-1]).tile(2)[None].to( - converted_bboxes.device, - ) + if len(scaled_bboxes := bb): + converted_bboxes = ( + box_convert(bb, in_fmt="xyxy", out_fmt="cxcywh") if bb.format == BoundingBoxFormat.XYXY else bb + ) + # normalize the bboxes + scaled_bboxes = converted_bboxes / torch.tensor(bb.canvas_size[::-1]).tile(2)[None].to( + converted_bboxes.device, + ) targets.append({"boxes": scaled_bboxes, "labels": ll}) return { @@ -109,7 +110,8 @@ def _customize_outputs( raise TypeError(msg) return losses - scores, bboxes, labels = self.model.postprocess(outputs, [img_info.img_shape for img_info in inputs.imgs_info]) + original_sizes = [img_info.ori_shape for img_info in inputs.imgs_info] + scores, bboxes, labels = self.model.postprocess(outputs, original_sizes) return DetBatchPredEntity( batch_size=len(outputs), diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py index fd1a8765cad..dd7b61b921c 100644 --- a/src/otx/algo/detection/yolox.py +++ b/src/otx/algo/detection/yolox.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal from otx.algo.common.losses import CrossEntropyLoss, L1Loss from otx.algo.detection.backbones import CSPDarknet @@ -76,13 +76,16 @@ def _exporter(self) -> OTXModelExporter: raise ValueError(msg) swap_rgb = not isinstance(self, YOLOXTINY) # only YOLOX-TINY uses RGB + resize_mode: Literal["standard", "fit_to_window_letterbox"] = "fit_to_window_letterbox" + if self.tile_config.enable_tiler: + resize_mode = "standard" return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, input_size=(1, 3, *self.input_size), mean=self.mean, std=self.std, - 
resize_mode="fit_to_window_letterbox", + resize_mode=resize_mode, pad_value=114, swap_rgb=swap_rgb, via_onnx=True, diff --git a/src/otx/core/data/dataset/tile.py b/src/otx/core/data/dataset/tile.py index 73ab24fb4ea..6c132b25be1 100644 --- a/src/otx/core/data/dataset/tile.py +++ b/src/otx/core/data/dataset/tile.py @@ -218,6 +218,9 @@ def __init__(self, dataset: OTXDataset, tile_config: TileConfig) -> None: dataset.mem_cache_handler, dataset.mem_cache_img_max_size, dataset.max_refetch, + dataset.image_color_channel, + dataset.stack_images, + dataset.to_tv_image, ) self.tile_config = tile_config self._dataset = dataset diff --git a/src/otx/core/data/entity/tile.py b/src/otx/core/data/entity/tile.py index 8b9ce454e5b..a7c7b4a2d78 100644 --- a/src/otx/core/data/entity/tile.py +++ b/src/otx/core/data/entity/tile.py @@ -125,7 +125,7 @@ def unbind(self) -> list[tuple[TileAttrDictList, DetBatchDataEntity]]: labels=[[] for _ in range(self.batch_size)], ), ) - return list(zip(batch_tile_attr_list, batch_data_entities)) + return list(zip(batch_tile_attr_list, batch_data_entities, strict=True)) @classmethod def collate_fn(cls, batch_entities: list[TileDetDataEntity]) -> TileBatchDetDataEntity: @@ -218,7 +218,7 @@ def unbind(self) -> list[tuple[TileAttrDictList, InstanceSegBatchDataEntity]]: ) for i in range(0, len(tiles), self.batch_size) ] - return list(zip(batch_tile_attr_list, batch_data_entities)) + return list(zip(batch_tile_attr_list, batch_data_entities, strict=True)) @classmethod def collate_fn(cls, batch_entities: list[TileInstSegDataEntity]) -> TileBatchInstSegDataEntity: diff --git a/src/otx/core/utils/tile_merge.py b/src/otx/core/utils/tile_merge.py index 02457522055..df6a272b2ab 100644 --- a/src/otx/core/utils/tile_merge.py +++ b/src/otx/core/utils/tile_merge.py @@ -29,7 +29,7 @@ class TileMerge(Generic[T_OTXDataEntity, T_OTXBatchPredEntity]): img_infos (list[ImageInfo]): Original image information before tiling. num_classes (int): Number of classes. tile_config (TileConfig): Tile configuration. - explain_mode (bool): Whether or not tiles have explain features. Default: False. + explain_mode (bool, optional): Whether or not tiles have explain features. Default: False. 
""" def __init__( @@ -119,8 +119,8 @@ def merge( img_ids = [] explain_mode = self.explain_mode - for tile_preds, tile_attrs in zip(batch_tile_preds, batch_tile_attrs): - batch_size = tile_preds.batch_size + for tile_preds, tile_attrs in zip(batch_tile_preds, batch_tile_attrs, strict=True): + batch_size = len(tile_attrs) saliency_maps = tile_preds.saliency_map if explain_mode else [[] for _ in range(batch_size)] feature_vectors = tile_preds.feature_vector if explain_mode else [[] for _ in range(batch_size)] for tile_attr, tile_img_info, tile_bboxes, tile_labels, tile_scores, tile_s_map, tile_f_vect in zip( @@ -131,6 +131,7 @@ def merge( tile_preds.scores, saliency_maps, feature_vectors, + strict=True, ): offset_x, offset_y, _, _ = tile_attr["roi"] tile_bboxes[:, 0::2] += offset_x @@ -156,7 +157,7 @@ def merge( return [ self._merge_entities(image_info, entities_to_merge[img_id], explain_mode) - for img_id, image_info in zip(img_ids, self.img_infos) + for img_id, image_info in zip(img_ids, self.img_infos, strict=True) ] def _merge_entities( @@ -319,8 +320,8 @@ def merge( img_ids = [] explain_mode = self.explain_mode - for tile_preds, tile_attrs in zip(batch_tile_preds, batch_tile_attrs): - feature_vectors = tile_preds.feature_vector if explain_mode else [[] for _ in range(tile_preds.batch_size)] + for tile_preds, tile_attrs in zip(batch_tile_preds, batch_tile_attrs, strict=True): + feature_vectors = tile_preds.feature_vector if explain_mode else [[] for _ in range(len(tile_attrs))] for tile_attr, tile_img_info, tile_bboxes, tile_labels, tile_scores, tile_masks, tile_f_vect in zip( tile_attrs, tile_preds.imgs_info, @@ -329,6 +330,7 @@ def merge( tile_preds.scores, tile_preds.masks, feature_vectors, + strict=True, ): keep_indices = tile_masks.to_sparse().sum((1, 2)).to_dense() > 0 keep_indices = keep_indices.nonzero(as_tuple=True)[0] @@ -363,7 +365,7 @@ def merge( return [ self._merge_entities(image_info, entities_to_merge[img_id], explain_mode) - for img_id, image_info in zip(img_ids, self.img_infos) + for img_id, image_info in zip(img_ids, self.img_infos, strict=True) ] def _merge_entities( diff --git a/src/otx/recipe/_base_/data/detection_tile.yaml b/src/otx/recipe/_base_/data/detection_tile.yaml new file mode 100644 index 00000000000..ebfddab9e43 --- /dev/null +++ b/src/otx/recipe/_base_/data/detection_tile.yaml @@ -0,0 +1,82 @@ +task: DETECTION +input_size: + - 800 + - 800 +mem_cache_size: 1GB +mem_cache_img_max_size: null +image_color_channel: RGB +stack_images: true +data_format: coco_instances +unannotated_items_ratio: 0.0 +tile_config: + enable_tiler: true + enable_adaptive_tiling: true +train_subset: + subset_name: train + transform_lib_type: TORCHVISION + batch_size: 1 + num_workers: 2 + to_tv_image: false + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + transform_bbox: true + - class_path: otx.core.data.transform_libs.torchvision.RandomFlip + init_args: + prob: 0.5 + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [0.0, 0.0, 0.0] + std: [255.0, 255.0, 255.0] + sampler: + class_path: torch.utils.data.RandomSampler + +val_subset: + subset_name: val + transform_lib_type: TORCHVISION + batch_size: 1 + num_workers: 2 + to_tv_image: false + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) 
+ keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [0.0, 0.0, 0.0] + std: [255.0, 255.0, 255.0] + sampler: + class_path: torch.utils.data.RandomSampler + +test_subset: + subset_name: test + transform_lib_type: TORCHVISION + batch_size: 1 + num_workers: 2 + to_tv_image: false + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [0.0, 0.0, 0.0] + std: [255.0, 255.0, 255.0] + sampler: + class_path: torch.utils.data.RandomSampler diff --git a/src/otx/recipe/detection/atss_mobilenetv2_tile.yaml b/src/otx/recipe/detection/atss_mobilenetv2_tile.yaml index e1f6e9725f7..0d4dfd53727 100644 --- a/src/otx/recipe/detection/atss_mobilenetv2_tile.yaml +++ b/src/otx/recipe/detection/atss_mobilenetv2_tile.yaml @@ -28,14 +28,10 @@ engine: callback_monitor: val/map_50 -data: ../_base_/data/detection.yaml +data: ../_base_/data/detection_tile.yaml overrides: gradient_clip_val: 35.0 data: - tile_config: - enable_tiler: true - enable_adaptive_tiling: true - train_subset: batch_size: 8 sampler: diff --git a/src/otx/recipe/detection/atss_resnext101_tile.yaml b/src/otx/recipe/detection/atss_resnext101_tile.yaml new file mode 100644 index 00000000000..831d694caad --- /dev/null +++ b/src/otx/recipe/detection/atss_resnext101_tile.yaml @@ -0,0 +1,51 @@ +model: + class_path: otx.algo.detection.atss.ResNeXt101ATSS + init_args: + label_info: 80 + + optimizer: + class_path: torch.optim.SGD + init_args: + lr: 0.004 + momentum: 0.9 + weight_decay: 0.0001 + + scheduler: + class_path: otx.core.schedulers.LinearWarmupSchedulerCallable + init_args: + num_warmup_steps: 3 + main_scheduler_callable: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 4 + monitor: val/map_50 + +engine: + task: DETECTION + device: auto + +callback_monitor: val/map_50 + +data: ../_base_/data/detection_tile.yaml +overrides: + gradient_clip_val: 35.0 + callbacks: + - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling + init_args: + max_interval: 5 + decay: -0.025 + min_lrschedule_patience: 3 + + data: + train_subset: + batch_size: 4 + sampler: + class_path: otx.algo.samplers.balanced_sampler.BalancedSampler + + val_subset: + batch_size: 4 + + test_subset: + batch_size: 4 diff --git a/src/otx/recipe/detection/rtdetr_101.yaml b/src/otx/recipe/detection/rtdetr_101.yaml index b67c2d34b4d..8f2b13f3d0a 100644 --- a/src/otx/recipe/detection/rtdetr_101.yaml +++ b/src/otx/recipe/detection/rtdetr_101.yaml @@ -88,7 +88,6 @@ overrides: init_args: scale: $(input_size) keep_ratio: false - transform_bbox: true is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: @@ -103,7 +102,6 @@ overrides: init_args: scale: $(input_size) keep_ratio: false - transform_bbox: true is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: diff --git a/src/otx/recipe/detection/rtdetr_101_tile.yaml b/src/otx/recipe/detection/rtdetr_101_tile.yaml new file mode 100644 index 00000000000..23e86944a22 --- /dev/null +++ b/src/otx/recipe/detection/rtdetr_101_tile.yaml @@ -0,0 +1,89 @@ +model: + 
class_path: otx.algo.detection.rtdetr.RTDETR101 + init_args: + label_info: 80 + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + scheduler: + class_path: otx.core.schedulers.LinearWarmupSchedulerCallable + init_args: + num_warmup_steps: 5 + main_scheduler_callable: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 6 + monitor: val/map_50 + +engine: + task: DETECTION + device: auto + +callback_monitor: val/map_50 + +data: ../_base_/data/detection_tile.yaml +overrides: + callbacks: + - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling + init_args: + max_interval: 1 + decay: -0.025 + min_lrschedule_patience: 3 + - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup + init_args: + monitor: null + mode: max + patience: 10 + check_on_train_epoch_end: false + min_delta: 0.001 + warmup_iters: 100 + warmup_epochs: 7 + + data: + input_size: + - 640 + - 640 + task: DETECTION + stack_images: true + data_format: coco_instances + train_subset: + batch_size: 4 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + transform_bbox: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + sampler: + class_path: otx.algo.samplers.balanced_sampler.BalancedSampler + + val_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + + test_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true diff --git a/src/otx/recipe/detection/rtdetr_18.yaml b/src/otx/recipe/detection/rtdetr_18.yaml index ba1575e0d03..2038d6c2d76 100644 --- a/src/otx/recipe/detection/rtdetr_18.yaml +++ b/src/otx/recipe/detection/rtdetr_18.yaml @@ -87,7 +87,6 @@ overrides: init_args: scale: $(input_size) keep_ratio: false - transform_bbox: true is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: @@ -102,7 +101,6 @@ overrides: init_args: scale: $(input_size) keep_ratio: false - transform_bbox: true is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: diff --git a/src/otx/recipe/detection/rtdetr_18_tile.yaml b/src/otx/recipe/detection/rtdetr_18_tile.yaml new file mode 100644 index 00000000000..e932c387ce0 --- /dev/null +++ b/src/otx/recipe/detection/rtdetr_18_tile.yaml @@ -0,0 +1,88 @@ +model: + class_path: otx.algo.detection.rtdetr.RTDETR18 + init_args: + label_info: 80 + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + scheduler: + class_path: otx.core.schedulers.LinearWarmupSchedulerCallable + init_args: + num_warmup_steps: 5 + main_scheduler_callable: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 6 + monitor: val/map_50 +engine: + task: DETECTION + device: auto + +callback_monitor: val/map_50 + +data: ../_base_/data/detection_tile.yaml +overrides: + callbacks: + - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling + init_args: + max_interval: 1 + decay: -0.025 + min_lrschedule_patience: 3 + 
- class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup + init_args: + monitor: null + mode: max + patience: 10 + check_on_train_epoch_end: false + min_delta: 0.001 + warmup_iters: 100 + warmup_epochs: 7 + + data: + input_size: + - 640 + - 640 + task: DETECTION + stack_images: true + data_format: coco_instances + train_subset: + batch_size: 4 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + transform_bbox: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + sampler: + class_path: otx.algo.samplers.balanced_sampler.BalancedSampler + + val_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + + test_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true diff --git a/src/otx/recipe/detection/rtdetr_50.yaml b/src/otx/recipe/detection/rtdetr_50.yaml index 298b30737d7..9b10a0b24d1 100644 --- a/src/otx/recipe/detection/rtdetr_50.yaml +++ b/src/otx/recipe/detection/rtdetr_50.yaml @@ -88,7 +88,6 @@ overrides: init_args: scale: $(input_size) keep_ratio: false - transform_bbox: true is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: @@ -103,7 +102,6 @@ overrides: init_args: scale: $(input_size) keep_ratio: false - transform_bbox: true is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: diff --git a/src/otx/recipe/detection/rtdetr_50_tile.yaml b/src/otx/recipe/detection/rtdetr_50_tile.yaml new file mode 100644 index 00000000000..95a1384df45 --- /dev/null +++ b/src/otx/recipe/detection/rtdetr_50_tile.yaml @@ -0,0 +1,89 @@ +model: + class_path: otx.algo.detection.rtdetr.RTDETR50 + init_args: + label_info: 80 + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + scheduler: + class_path: otx.core.schedulers.LinearWarmupSchedulerCallable + init_args: + num_warmup_steps: 5 + main_scheduler_callable: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 6 + monitor: val/map_50 + +engine: + task: DETECTION + device: auto + +callback_monitor: val/map_50 + +data: ../_base_/data/detection_tile.yaml +overrides: + callbacks: + - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling + init_args: + max_interval: 1 + decay: -0.025 + min_lrschedule_patience: 3 + - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup + init_args: + monitor: null + mode: max + patience: 10 + check_on_train_epoch_end: false + min_delta: 0.001 + warmup_iters: 100 + warmup_epochs: 7 + + data: + input_size: + - 640 + - 640 + task: DETECTION + stack_images: true + data_format: coco_instances + train_subset: + batch_size: 4 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + transform_bbox: true + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + sampler: + class_path: otx.algo.samplers.balanced_sampler.BalancedSampler + + val_subset: + batch_size: 8 + 
to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true + + test_subset: + batch_size: 8 + to_tv_image: true + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + is_numpy_to_tvtensor: true diff --git a/src/otx/recipe/detection/rtmdet_tiny_tile.yaml b/src/otx/recipe/detection/rtmdet_tiny_tile.yaml new file mode 100644 index 00000000000..982d7b775d3 --- /dev/null +++ b/src/otx/recipe/detection/rtmdet_tiny_tile.yaml @@ -0,0 +1,69 @@ +model: + class_path: otx.algo.detection.rtmdet.RTMDetTiny + init_args: + label_info: 80 + + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 0.0007 + weight_decay: 0.05 + + scheduler: + class_path: otx.core.schedulers.LinearWarmupSchedulerCallable + init_args: + num_warmup_steps: 3 + main_scheduler_callable: + class_path: lightning.pytorch.cli.ReduceLROnPlateau + init_args: + mode: max + factor: 0.1 + patience: 4 + monitor: val/map_50 + +engine: + task: DETECTION + device: auto + +callback_monitor: val/map_50 + +data: ../_base_/data/detection_tile.yaml +overrides: + gradient_clip_val: 35.0 + data: + input_size: + - 640 + - 640 + image_color_channel: BGR + train_subset: + batch_size: 8 + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [103.53, 116.28, 123.675] + std: [57.375, 57.12, 58.395] + + val_subset: + batch_size: 8 + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [103.53, 116.28, 123.675] + std: [57.375, 57.12, 58.395] + + test_subset: + batch_size: 8 + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [103.53, 116.28, 123.675] + std: [57.375, 57.12, 58.395] diff --git a/src/otx/recipe/detection/ssd_mobilenetv2_tile.yaml b/src/otx/recipe/detection/ssd_mobilenetv2_tile.yaml index 33d6bf4c261..f806cee40ea 100644 --- a/src/otx/recipe/detection/ssd_mobilenetv2_tile.yaml +++ b/src/otx/recipe/detection/ssd_mobilenetv2_tile.yaml @@ -28,40 +28,20 @@ engine: callback_monitor: val/map_50 -data: ../_base_/data/detection.yaml +data: ../_base_/data/detection_tile.yaml overrides: - reset: - - data.train_subset.transforms - gradient_clip_val: 35.0 data: input_size: - 864 - 864 - tile_config: - enable_tiler: true - enable_adaptive_tiling: true train_subset: batch_size: 8 transforms: - - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion - - class_path: otx.core.data.transform_libs.torchvision.MinIoURandomCrop - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) - transform_bbox: true - - class_path: otx.core.data.transform_libs.torchvision.RandomFlip - init_args: - prob: 0.5 - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - - class_path: torchvision.transforms.v2.Normalize - init_args: - mean: [0.0, 0.0, 0.0] - std: [255.0, 255.0, 255.0] sampler: class_path: otx.algo.samplers.balanced_sampler.BalancedSampler diff --git a/src/otx/recipe/detection/yolox_l_tile.yaml b/src/otx/recipe/detection/yolox_l_tile.yaml 
index f69cd804357..1e07232ff03 100644 --- a/src/otx/recipe/detection/yolox_l_tile.yaml +++ b/src/otx/recipe/detection/yolox_l_tile.yaml @@ -28,23 +28,16 @@ engine: callback_monitor: val/map_50 -data: ../_base_/data/detection.yaml +data: ../_base_/data/detection_tile.yaml overrides: reset: - data.train_subset.transforms - - data.val_subset.transforms - - data.test_subset.transforms - gradient_clip_val: 35.0 data: input_size: - 640 - 640 image_color_channel: BGR - tile_config: - enable_tiler: true - enable_adaptive_tiling: true - train_subset: num_workers: 4 batch_size: 8 @@ -53,14 +46,11 @@ overrides: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) + keep_ratio: false transform_bbox: true - class_path: otx.core.data.transform_libs.torchvision.RandomFlip init_args: prob: 0.5 - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: @@ -77,14 +67,6 @@ overrides: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [0.0, 0.0, 0.0] @@ -97,14 +79,6 @@ overrides: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [0.0, 0.0, 0.0] diff --git a/src/otx/recipe/detection/yolox_s_tile.yaml b/src/otx/recipe/detection/yolox_s_tile.yaml index a5758eca47c..5d224c19f16 100644 --- a/src/otx/recipe/detection/yolox_s_tile.yaml +++ b/src/otx/recipe/detection/yolox_s_tile.yaml @@ -28,12 +28,10 @@ engine: callback_monitor: val/map_50 -data: ../_base_/data/detection.yaml +data: ../_base_/data/detection_tile.yaml overrides: reset: - data.train_subset.transforms - - data.val_subset.transforms - - data.test_subset.transforms gradient_clip_val: 35.0 data: @@ -41,10 +39,6 @@ overrides: - 640 - 640 image_color_channel: BGR - tile_config: - enable_tiler: true - enable_adaptive_tiling: true - train_subset: num_workers: 4 batch_size: 8 @@ -53,14 +47,11 @@ overrides: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) + keep_ratio: false transform_bbox: true - class_path: otx.core.data.transform_libs.torchvision.RandomFlip init_args: prob: 0.5 - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: @@ -77,14 +68,6 @@ overrides: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [0.0, 0.0, 0.0] @@ -97,14 +80,6 @@ overrides: 
- class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [0.0, 0.0, 0.0] diff --git a/src/otx/recipe/detection/yolox_tiny_tile.yaml b/src/otx/recipe/detection/yolox_tiny_tile.yaml index 61d9d59f765..c7843ac50f5 100644 --- a/src/otx/recipe/detection/yolox_tiny_tile.yaml +++ b/src/otx/recipe/detection/yolox_tiny_tile.yaml @@ -28,38 +28,29 @@ engine: callback_monitor: val/map_50 -data: ../_base_/data/detection.yaml +data: ../_base_/data/detection_tile.yaml overrides: reset: - data.train_subset.transforms - - data.val_subset.transforms - - data.test_subset.transforms - gradient_clip_val: 35.0 data: input_size: - 640 - 640 - tile_config: - enable_tiler: true - enable_adaptive_tiling: true train_subset: num_workers: 4 batch_size: 8 transforms: - - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion + - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) + keep_ratio: false transform_bbox: true - class_path: otx.core.data.transform_libs.torchvision.RandomFlip init_args: prob: 0.5 - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: @@ -76,14 +67,6 @@ overrides: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [123.675, 116.28, 103.53] @@ -96,14 +79,6 @@ overrides: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [123.675, 116.28, 103.53] diff --git a/src/otx/recipe/detection/yolox_x_tile.yaml b/src/otx/recipe/detection/yolox_x_tile.yaml index 0431814cb6e..1739ffe3f6a 100644 --- a/src/otx/recipe/detection/yolox_x_tile.yaml +++ b/src/otx/recipe/detection/yolox_x_tile.yaml @@ -28,23 +28,16 @@ engine: callback_monitor: val/map_50 -data: ../_base_/data/detection.yaml +data: ../_base_/data/detection_tile.yaml overrides: reset: - data.train_subset.transforms - - data.val_subset.transforms - - data.test_subset.transforms - gradient_clip_val: 35.0 data: input_size: - 640 - 640 image_color_channel: BGR - tile_config: - enable_tiler: true - enable_adaptive_tiling: true - train_subset: num_workers: 4 batch_size: 4 @@ -53,14 +46,11 @@ overrides: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) + keep_ratio: false transform_bbox: true - class_path: otx.core.data.transform_libs.torchvision.RandomFlip init_args: prob: 0.5 - - class_path: 
otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.ToDtype init_args: @@ -77,14 +67,6 @@ overrides: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [0.0, 0.0, 0.0] @@ -97,14 +79,6 @@ overrides: - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) - - class_path: otx.core.data.transform_libs.torchvision.Pad - init_args: - pad_to_square: true - pad_val: 114 - is_numpy_to_tvtensor: true - - class_path: torchvision.transforms.v2.ToDtype - init_args: - dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [0.0, 0.0, 0.0] diff --git a/tests/integration/cli/test_export_inference.py b/tests/integration/cli/test_export_inference.py index fdb46e90ea8..8071925aadb 100644 --- a/tests/integration/cli/test_export_inference.py +++ b/tests/integration/cli/test_export_inference.py @@ -89,8 +89,6 @@ def test_otx_export_infer( task == "instance_segmentation" and "maskrcnn_efficientnetb2b" not in recipe ): pytest.skip("To prevent memory bug from aborting integration test, test single model per task.") - elif "tile" in recipe: - pytest.skip("Exporting models with tiling isn't supported yet.") model_name = recipe.split("/")[-1].split(".")[0] diff --git a/tests/unit/algo/detection/base_models/test_detr.py b/tests/unit/algo/detection/base_models/test_detr.py index 71ce30cc1fb..6b6cc99c17c 100644 --- a/tests/unit/algo/detection/base_models/test_detr.py +++ b/tests/unit/algo/detection/base_models/test_detr.py @@ -79,15 +79,15 @@ def test_rt_detr_postprocess(self, rt_detr_model): "pred_logits": torch.randn(2, 100, 10), "pred_boxes": torch.randn(2, 100, 4), } - original_size = [640, 640] - result = rt_detr_model.postprocess(outputs, original_size) + original_sizes = [[640, 640], [640, 640]] + result = rt_detr_model.postprocess(outputs, original_sizes) assert isinstance(result, tuple) assert len(result) == 3 scores, boxes, labels = result assert isinstance(scores, list) assert isinstance(boxes, list) assert isinstance(boxes[0], torchvision.tv_tensors.BoundingBoxes) - assert boxes[0].canvas_size == original_size + assert boxes[0].canvas_size == original_sizes[0] assert isinstance(labels, list) assert len(scores) == 2 assert len(boxes) == 2 @@ -96,7 +96,7 @@ def test_rt_detr_postprocess(self, rt_detr_model): def test_rt_detr_export(self, rt_detr_model, images): rt_detr_model.eval() rt_detr_model.num_top_queries = 10 - batch_img_metas = {"img_shape": (740, 740), "scale_factor": 1.0} + batch_img_metas = [{"img_shape": (740, 740), "scale_factor": 1.0}] result = rt_detr_model.export(images, batch_img_metas) assert isinstance(result, dict) assert "bboxes" in result diff --git a/tests/unit/core/data/test_tiling.py b/tests/unit/core/data/test_tiling.py index 0eff56af3a0..baff06fb67b 100644 --- a/tests/unit/core/data/test_tiling.py +++ b/tests/unit/core/data/test_tiling.py @@ -17,7 +17,7 @@ from otx.algo.detection.atss import MobileNetV2ATSS from otx.algo.instance_segmentation.maskrcnn import MaskRCNNEfficientNet from otx.core.config.data import ( - SubsetConfig, + 
SamplerConfig, TileConfig, VisualPromptingConfig, ) @@ -28,7 +28,6 @@ from otx.core.data.module import OTXDataModule from otx.core.model.detection import OTXDetectionModel from otx.core.types.task import OTXTaskType -from otx.core.types.transformer_libs import TransformLibType from torchvision import tv_tensors from tests.test_helpers import generate_random_bboxes @@ -40,74 +39,24 @@ def mock_otx_det_model(self) -> OTXDetectionModel: return create_autospec(OTXDetectionModel) @pytest.fixture() - def fxt_tv_det_transform_config(self) -> list[DictConfig]: - mmdet_base = OmegaConf.load("src/otx/recipe/_base_/data/torchvision_base.yaml") - return mmdet_base.train_subset.transforms + def fxt_det_transform_config(self) -> DictConfig: + config = OmegaConf.load("src/otx/recipe/_base_/data/detection_tile.yaml") + config.train_subset.input_size = config.input_size + config.val_subset.input_size = config.input_size + config.test_subset.input_size = config.input_size + config.train_subset.sampler = SamplerConfig(**config.train_subset.sampler) + return config @pytest.fixture() - def fxt_det_data_config(self, fxt_tv_det_transform_config) -> dict: + def fxt_det_data_config(self, fxt_det_transform_config) -> dict: data_root = Path(__file__).parent.parent.parent.parent / "assets" / "car_tree_bug" - batch_size = 8 - num_workers = 0 return { "data_format": "coco_instances", "data_root": data_root, - "train_subset": SubsetConfig( - subset_name="train", - batch_size=batch_size, - num_workers=num_workers, - transform_lib_type=TransformLibType.TORCHVISION, - transforms=fxt_tv_det_transform_config, - ), - "val_subset": SubsetConfig( - subset_name="val", - batch_size=batch_size, - num_workers=num_workers, - transform_lib_type=TransformLibType.TORCHVISION, - transforms=fxt_tv_det_transform_config, - ), - "test_subset": SubsetConfig( - subset_name="test", - batch_size=batch_size, - num_workers=num_workers, - transform_lib_type=TransformLibType.TORCHVISION, - transforms=fxt_tv_det_transform_config, - ), - "tile_config": TileConfig(), - "vpm_config": VisualPromptingConfig(), - } - - @pytest.fixture() - def fxt_instseg_data_config(self, fxt_tv_det_transform_config) -> dict: - data_root = Path(__file__).parent.parent.parent.parent / "assets" / "car_tree_bug" - - batch_size = 8 - num_workers = 0 - return { - "data_format": "coco_instances", - "data_root": data_root, - "train_subset": SubsetConfig( - subset_name="train", - batch_size=batch_size, - num_workers=num_workers, - transform_lib_type=TransformLibType.TORCHVISION, - transforms=fxt_tv_det_transform_config, - ), - "val_subset": SubsetConfig( - subset_name="val", - batch_size=batch_size, - num_workers=num_workers, - transform_lib_type=TransformLibType.TORCHVISION, - transforms=fxt_tv_det_transform_config, - ), - "test_subset": SubsetConfig( - subset_name="test", - batch_size=batch_size, - num_workers=num_workers, - transform_lib_type=TransformLibType.TORCHVISION, - transforms=fxt_tv_det_transform_config, - ), + "train_subset": fxt_det_transform_config.train_subset, + "val_subset": fxt_det_transform_config.val_subset, + "test_subset": fxt_det_transform_config.test_subset, "tile_config": TileConfig(), "vpm_config": VisualPromptingConfig(), } @@ -379,13 +328,13 @@ def test_explain_det_tile_merge(self, fxt_det_data_config): assert prediction.saliency_map[0].ndim == 3 self.explain_mode = False - def test_instseg_tile_merge(self, fxt_instseg_data_config): + def test_instseg_tile_merge(self, fxt_det_data_config): model = MaskRCNNEfficientNet(label_info=3) # Enable tile 
adapter - fxt_instseg_data_config["tile_config"] = TileConfig(enable_tiler=True) + fxt_det_data_config["tile_config"] = TileConfig(enable_tiler=True) tile_datamodule = OTXDataModule( task=OTXTaskType.INSTANCE_SEGMENTATION, - **fxt_instseg_data_config, + **fxt_det_data_config, ) self.explain_mode = False @@ -395,13 +344,13 @@ def test_instseg_tile_merge(self, fxt_instseg_data_config): for batch in tile_datamodule.val_dataloader(): model.forward_tiles(batch) - def test_explain_instseg_tile_merge(self, fxt_instseg_data_config): + def test_explain_instseg_tile_merge(self, fxt_det_data_config): model = MaskRCNNEfficientNet(label_info=3) # Enable tile adapter - fxt_instseg_data_config["tile_config"] = TileConfig(enable_tiler=True, enable_adaptive_tiling=False) + fxt_det_data_config["tile_config"] = TileConfig(enable_tiler=True, enable_adaptive_tiling=False) tile_datamodule = OTXDataModule( task=OTXTaskType.INSTANCE_SEGMENTATION, - **fxt_instseg_data_config, + **fxt_det_data_config, ) self.explain_mode = model.explain_mode = True From c7efcbcb7bd8916837422bc3bda3f593d82f4e52 Mon Sep 17 00:00:00 2001 From: Ashwin Vaidya Date: Thu, 12 Sep 2024 21:02:29 +0200 Subject: [PATCH 41/53] Support ImageFromBytes (#3948) * add image_from_bytes Signed-off-by: Ashwin Vaidya * refactor code Signed-off-by: Ashwin Vaidya * allow empty anomalous masks Signed-off-by: Ashwin Vaidya --------- Signed-off-by: Ashwin Vaidya --- src/otx/core/data/dataset/anomaly.py | 152 +++++++++++++++++++++------ src/otx/core/model/anomaly.py | 18 ++-- 2 files changed, 128 insertions(+), 42 deletions(-) diff --git a/src/otx/core/data/dataset/anomaly.py b/src/otx/core/data/dataset/anomaly.py index b776ccda911..ec9b59ce499 100644 --- a/src/otx/core/data/dataset/anomaly.py +++ b/src/otx/core/data/dataset/anomaly.py @@ -5,13 +5,18 @@ from __future__ import annotations +from enum import Enum from pathlib import Path from typing import Callable +import cv2 +import numpy as np import torch from anomalib.data.utils import masks_to_boxes from datumaro import Dataset as DmDataset -from datumaro import Image +from datumaro import DatasetItem, Image +from datumaro.components.annotation import AnnotationType, Bbox, Ellipse, Polygon +from datumaro.components.media import ImageFromBytes, ImageFromFile from torchvision import io from torchvision.tv_tensors import BoundingBoxes, BoundingBoxFormat, Mask @@ -31,6 +36,13 @@ from otx.core.types.task import OTXTaskType +class AnomalyLabel(Enum): + """Anomaly label to tensor mapping.""" + + NORMAL = torch.tensor(0.0) + ANOMALOUS = torch.tensor(1.0) + + class AnomalyDataset(OTXDataset): """OTXDataset class for anomaly classification task.""" @@ -58,6 +70,7 @@ def __init__( to_tv_image, ) self.label_info = AnomalyLabelInfo() + self._label_mapping = self._map_id_to_label() def _get_item_impl( self, @@ -67,12 +80,9 @@ def _get_item_impl( img = datumaro_item.media_as(Image) # returns image in RGB format if self.image_color_channel is RGB img_data, img_shape = self._get_img_data_and_shape(img) - # Note: This assumes that the dataset is in MVTec format. - # We can't use datumaro label id as it returns some number like 3 for good from which it is hard to infer - # whether the image is Anomalous or Normal. Because it leads to other questions like what do numbers 0,1,2 mean? 
- label: torch.LongTensor = ( - torch.tensor(0.0, dtype=torch.long) if "good" in datumaro_item.id else torch.tensor(1.0, dtype=torch.long) - ) + + label = self._get_label(datumaro_item) + item: AnomalyClassificationDataItem | AnomalySegmentationDataItem | AnomalyDetectionDataItem if self.task_type == OTXTaskType.ANOMALY_CLASSIFICATION: item = AnomalyClassificationDataItem( @@ -88,15 +98,6 @@ def _get_item_impl( elif self.task_type == OTXTaskType.ANOMALY_SEGMENTATION: # Note: this part of code is brittle. Ideally Datumaro should return masks # Another major problem with this is that it assumes that the dataset passed is in MVTec format - mask_file_path = ( - Path("/".join(datumaro_item.media.path.split("/")[:-3])) - / "ground_truth" - / f"{('/'.join(datumaro_item.media.path.split('/')[-2:])).replace('.png','_mask.png')}" - ) - mask = torch.zeros(1, img_shape[0], img_shape[1], dtype=torch.uint8) - if mask_file_path.exists(): - # read and convert to binary mask - mask = (io.read_image(str(mask_file_path), mode=io.ImageReadMode.GRAY) / 255).to(torch.uint8) item = AnomalySegmentationDataItem( image=img_data, img_info=ImageInfo( @@ -106,20 +107,9 @@ def _get_item_impl( image_color_channel=self.image_color_channel, ), label=label, - mask=Mask(mask), + mask=Mask(self._get_mask(datumaro_item, label, img_shape)), ) elif self.task_type == OTXTaskType.ANOMALY_DETECTION: - # Note: this part of code is brittle. Ideally Datumaro should return masks - mask_file_path = ( - Path("/".join(datumaro_item.media.path.split("/")[:-3])) - / "ground_truth" - / f"{('/'.join(datumaro_item.media.path.split('/')[-2:])).replace('.png','_mask.png')}" - ) - mask = torch.zeros(1, img_shape[0], img_shape[1], dtype=torch.uint8) - if mask_file_path.exists(): - # read and convert to binary mask - mask = (io.read_image(str(mask_file_path), mode=io.ImageReadMode.GRAY) / 255).to(torch.uint8) - boxes, _ = masks_to_boxes(mask) item = AnomalyDetectionDataItem( image=img_data, img_info=ImageInfo( @@ -129,9 +119,9 @@ def _get_item_impl( image_color_channel=self.image_color_channel, ), label=label, - boxes=BoundingBoxes(boxes[0], format=BoundingBoxFormat.XYXY, canvas_size=img_shape), + boxes=self._get_boxes(datumaro_item, label, img_shape), # mask is used for pixel-level metric computation. We can't assume that this will always be available - mask=Mask(mask), + mask=Mask(self._get_mask(datumaro_item, label, img_shape)), ) else: msg = f"Task {self.task_type} is not supported yet." @@ -142,6 +132,108 @@ def _get_item_impl( # "AnomalyClassificationDataItem | AnomalySegmentationDataBatch | AnomalyDetectionDataBatch") return self._apply_transforms(item) # type: ignore[return-value] + def _get_mask(self, datumaro_item: DatasetItem, label: torch.Tensor, img_shape: tuple[int, int]) -> torch.Tensor: + """Get mask from datumaro_item. + + Converts bounding boxes to mask if mask is not available. 
+ """ + if isinstance(datumaro_item.media, ImageFromFile): + if label == AnomalyLabel.ANOMALOUS.value: + mask = self._mask_image_from_file(datumaro_item, img_shape) + else: + mask = torch.zeros(1, *img_shape).to(torch.uint8) + elif isinstance(datumaro_item.media, ImageFromBytes): + mask = torch.zeros(1, *img_shape).to(torch.uint8) + if label == AnomalyLabel.ANOMALOUS.value: + for annotation in datumaro_item.annotations: + # There is only one mask + if isinstance(annotation, (Ellipse, Polygon)): + polygons = np.asarray(annotation.as_polygon(), dtype=np.int32).reshape((-1, 1, 2)) + mask = np.zeros(img_shape, dtype=np.uint8) + mask = cv2.drawContours( + mask, + [polygons], + 0, + (1, 1, 1), + thickness=cv2.FILLED, + ) + mask = torch.from_numpy(mask).to(torch.uint8).unsqueeze(0) + break + # If there is no mask, create a mask from bbox + if isinstance(annotation, Bbox): + bbox = annotation + mask = self._bbox_to_mask(bbox, img_shape) + break + return mask + + def _get_boxes(self, datumaro_item: DatasetItem, label: torch.Tensor, img_shape: tuple[int, int]) -> BoundingBoxes: + """Get bounding boxes from datumaro item. + + Uses masks if available to get bounding boxes. + """ + boxes = BoundingBoxes(torch.empty(0, 4), format=BoundingBoxFormat.XYXY, canvas_size=img_shape) + if isinstance(datumaro_item.media, ImageFromFile): + if label == AnomalyLabel.ANOMALOUS.value: + mask = self._mask_image_from_file(datumaro_item, img_shape) + boxes, _ = masks_to_boxes(mask) + # Assumes only one bounding box is present + boxes = BoundingBoxes(boxes[0], format=BoundingBoxFormat.XYXY, canvas_size=img_shape) + elif isinstance(datumaro_item.media, ImageFromBytes) and label == AnomalyLabel.ANOMALOUS.value: + for annotation in datumaro_item.annotations: + if isinstance(annotation, Bbox): + bbox = annotation + boxes = BoundingBoxes(bbox.get_bbox(), format=BoundingBoxFormat.XYXY, canvas_size=img_shape) + break + return boxes + + def _bbox_to_mask(self, bbox: Bbox, img_shape: tuple[int, int]) -> torch.Tensor: + mask = torch.zeros(1, *img_shape).to(torch.uint8) + x1, y1, x2, y2 = bbox.get_bbox() + x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) + mask[:, y1:y2, x1:x2] = 1 + return mask + + def _get_label(self, datumaro_item: DatasetItem) -> torch.LongTensor: + """Get label from datumaro item.""" + if isinstance(datumaro_item.media, ImageFromFile): + # Note: This assumes that the dataset is in MVTec format. + # We can't use datumaro label id as it returns some number like 3 for good from which it is hard to infer + # whether the image is Anomalous or Normal. Because it leads to other questions like what do numbers 0,1,2 + # mean? + label: torch.LongTensor = AnomalyLabel.NORMAL if "good" in datumaro_item.id else AnomalyLabel.ANOMALOUS + elif isinstance(datumaro_item.media, ImageFromBytes): + label = self._label_mapping[datumaro_item.annotations[0].label] + else: + msg = f"Media type {type(datumaro_item.media)} is not supported." 
+ raise NotImplementedError(msg) + return label.value + + def _map_id_to_label(self) -> dict[int, torch.Tensor]: + """Map label id to label tensor.""" + id_label_mapping = {} + categories = self.dm_subset.categories()[AnnotationType.label] + for label_item in categories.items: + if any("normal" in attribute.lower() for attribute in label_item.attributes): + label = AnomalyLabel.NORMAL + else: + label = AnomalyLabel.ANOMALOUS + id_label_mapping[categories.find(label_item.name)[0]] = label + return id_label_mapping + + def _mask_image_from_file(self, datumaro_item: DatasetItem, img_shape: tuple[int, int]) -> torch.Tensor: + """Assumes MVTec format and returns mask from disk.""" + mask_file_path = ( + Path("/".join(datumaro_item.media.path.split("/")[:-3])) + / "ground_truth" + / f"{('/'.join(datumaro_item.media.path.split('/')[-2:])).replace('.png','_mask.png')}" + ) + if mask_file_path.exists(): + return (io.read_image(str(mask_file_path), mode=io.ImageReadMode.GRAY) / 255).to(torch.uint8) + + # Note: This is a workaround to handle the case where mask is not available otherwise the tests fail. + # This is problematic because it assigns empty masks to an Anomalous image. + return torch.zeros(1, *img_shape).to(torch.uint8) + @property def collate_fn(self) -> Callable: """Collection function to collect SegDataEntity into SegBatchDataEntity in data loader.""" diff --git a/src/otx/core/model/anomaly.py b/src/otx/core/model/anomaly.py index f41823cdcc1..8f78769fdb0 100644 --- a/src/otx/core/model/anomaly.py +++ b/src/otx/core/model/anomaly.py @@ -180,18 +180,12 @@ def _customize_inputs( inputs: AnomalyModelInputs, ) -> dict[str, Any]: """Customize inputs for the model.""" - return_dict = {} - if isinstance(inputs, AnomalyClassificationDataBatch): - return_dict = {"image": inputs.images, "label": torch.vstack(inputs.labels).squeeze()} - if isinstance(inputs, AnomalySegmentationDataBatch): - return_dict = {"image": inputs.images, "label": torch.vstack(inputs.labels).squeeze(), "mask": inputs.masks} - if isinstance(inputs, AnomalyDetectionDataBatch): - return_dict = { - "image": inputs.images, - "label": torch.vstack(inputs.labels).squeeze(), - "mask": inputs.masks, - "boxes": inputs.boxes, - } + return_dict = {"image": inputs.images, "label": torch.vstack(inputs.labels).squeeze()} + if isinstance(inputs, AnomalySegmentationDataBatch) and inputs.masks is not None: + return_dict["mask"] = inputs.masks + if isinstance(inputs, AnomalyDetectionDataBatch) and inputs.masks is not None and inputs.boxes is not None: + return_dict["mask"] = inputs.masks + return_dict["boxes"] = inputs.boxes if return_dict["label"].size() == torch.Size([]): # when last batch size is 1 return_dict["label"] = return_dict["label"].unsqueeze(0) From ecef545af5336c506329424d29267e74189297de Mon Sep 17 00:00:00 2001 From: Prokofiev Kirill Date: Fri, 13 Sep 2024 02:34:08 +0200 Subject: [PATCH 42/53] Change categories mapping logic (#3946) * change pre-filtering logic * Update src/otx/core/data/pre_filtering.py Co-authored-by: Eunwoo Shin --------- Co-authored-by: Eunwoo Shin --- src/otx/core/data/pre_filtering.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/otx/core/data/pre_filtering.py b/src/otx/core/data/pre_filtering.py index 459ef7be6f5..f78d8fe1db2 100644 --- a/src/otx/core/data/pre_filtering.py +++ b/src/otx/core/data/pre_filtering.py @@ -88,7 +88,15 @@ def remove_unused_labels(dataset: DmDataset, data_format: str, ignore_index: int raise ValueError(msg) if len(used_labels) == 
len(original_categories): return dataset + if data_format == "arrow" and max(used_labels) != len(original_categories) - 1: + # we assume that empty label is always the last one. If it is not explicitly added to the dataset, + # (not in the used labels) it will be filtered out. + mapping = {cat: cat for cat in original_categories[:-1]} + elif data_format == "arrow": + # this mean that some other class wasn't annotated, we don't need to filter the object classes + return dataset + else: + mapping = {original_categories[idx]: original_categories[idx] for idx in used_labels} msg = "There are unused labels in dataset, they will be filtered out before training." warnings.warn(msg, stacklevel=2) - mapping = {original_categories[idx]: original_categories[idx] for idx in used_labels} return dataset.transform("remap_labels", mapping=mapping, default="delete") From b1ec8e709abf7e362d76ab52b881b69cd2b97570 Mon Sep 17 00:00:00 2001 From: Yunchu Lee Date: Fri, 13 Sep 2024 11:29:23 +0900 Subject: [PATCH 43/53] Update for 2.2.0rc1 (#3956) --- CHANGELOG.md | 16 +++++++++++++--- README.md | 5 +++++ docs/source/guide/release_notes/index.rst | 6 +++++- src/otx/__init__.py | 2 +- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37184eb4fb5..2110dc39feb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,15 +43,25 @@ All notable changes to this project will be documented in this file. () - Refactoring `ConvModule` by removing `conv_cfg`, `norm_cfg`, and `act_cfg` (, , ) +- Support ImageFromBytes + () +- enable model export + () +- Move templates from OTX1.X to OTX2.X + () ### Bug fixes - Fix Combined Dataloader & unlabeled warmup loss in Semi-SL - (https://github.com/openvinotoolkit/training_extensions/pull/3723) + () - Revert #3579 to fix issues with replacing coco_instance with a different format in some dataset - (https://github.com/openvinotoolkit/training_extensions/pull/3753) + () - Add num_devices in Engine for multi-gpu training - (https://github.com/openvinotoolkit/training_extensions/pull/3778) + () +- Add missing tile recipes and various tile recipe changes + () +- Change categories mapping logic + () ## \[v2.1.0\] diff --git a/README.md b/README.md index f678fd9ecd5..c3938a2bc90 100644 --- a/README.md +++ b/README.md @@ -190,12 +190,17 @@ In addition to the examples above, please refer to the documentation for tutoria - Enable to use input_size at transforms in recipe - Enable to use polygon and bitmap mask as prompt inputs for zero-shot learning - Refactoring `ConvModule` by removing `conv_cfg`, `norm_cfg`, and `act_cfg` +- Support ImageFromBytes +- enable model export +- Move templates from OTX1.X to OTX2.X ### Bug fixes - Fix Combined Dataloader & unlabeled warmup loss in Semi-SL - Revert #3579 to fix issues with replacing coco_instance with a different format in some dataset - Add num_devices in Engine for multi-gpu training +- Add missing tile recipes and various tile recipe changes +- Change categories mapping logic ### Known issues diff --git a/docs/source/guide/release_notes/index.rst b/docs/source/guide/release_notes/index.rst index 1871c1b9438..cf950aae2f6 100644 --- a/docs/source/guide/release_notes/index.rst +++ b/docs/source/guide/release_notes/index.rst @@ -31,6 +31,9 @@ Enhancements - Enable to use input_size at transforms in recipe - Enable to use polygon and bitmap mask as prompt inputs for zero-shot learning - Refactoring `ConvModule` by removing `conv_cfg`, `norm_cfg`, and `act_cfg` +- Support ImageFromBytes +- enable model export +- 
Move templates from OTX1.X to OTX2.X Bug fixes ^^^^^^^^^ @@ -38,7 +41,8 @@ Bug fixes - Fix Combined Dataloader & unlabeled warmup loss in Semi-SL - Revert #3579 to fix issues with replacing coco_instance with a different format in some dataset - Add num_devices in Engine for multi-gpu training - +- Add missing tile recipes and various tile recipe changes +- Change categories mapping logic v2.1.0 (2024.07) ---------------- diff --git a/src/otx/__init__.py b/src/otx/__init__.py index 58cf9a5f332..cd62efaf949 100644 --- a/src/otx/__init__.py +++ b/src/otx/__init__.py @@ -3,7 +3,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -__version__ = "2.2.0rc0" +__version__ = "2.2.0rc1" import os from pathlib import Path From aa31dca4bd407a35b844dd037c3846b51040d180 Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Fri, 20 Sep 2024 01:54:47 +0100 Subject: [PATCH 44/53] Include Geti arrow dataset subset names (#3962) * restrited number of output masks by tiling * add geti subset name * update num of max pred --- src/otx/core/data/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otx/core/data/utils/utils.py b/src/otx/core/data/utils/utils.py index fc0c84dbf58..7bb96fd3588 100644 --- a/src/otx/core/data/utils/utils.py +++ b/src/otx/core/data/utils/utils.py @@ -250,7 +250,7 @@ def adapt_tile_config(tile_config: TileConfig, dataset: Dataset) -> None: tile_config (TileConfig): tiling parameters of the model dataset (Dataset): Datumaro dataset including all subsets """ - if (train_dataset := dataset.subsets().get("train")) is not None: + if (train_dataset := dataset.subsets().get("train") or dataset.subsets().get("TRAINING")) is not None: stat = compute_robust_dataset_statistics(train_dataset) max_num_objects = round(stat["annotation"]["num_per_image"]["max"]) avg_size = stat["annotation"]["size_of_shape"]["avg"] From 93f1a55aab464a6633107fb950873a0aeca58c27 Mon Sep 17 00:00:00 2001 From: Eugene Liu Date: Fri, 20 Sep 2024 02:32:03 +0100 Subject: [PATCH 45/53] Include full image with anno in case there's no tile in tile dataset (#3964) * include full image with anno incase there's no tile in dataset * update test --- src/otx/core/data/dataset/tile.py | 39 ++++++++++++++++++++++++------ tests/unit/core/utils/test_tile.py | 9 ++++--- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/src/otx/core/data/dataset/tile.py b/src/otx/core/data/dataset/tile.py index 6c132b25be1..a39ea5aa90d 100644 --- a/src/otx/core/data/dataset/tile.py +++ b/src/otx/core/data/dataset/tile.py @@ -148,12 +148,12 @@ def _extract_rois(self, image: Image) -> list[BboxIntCoords]: tile_h, tile_w = self._tile_size h_ovl, w_ovl = self._overlap - rois: list[BboxIntCoords] = [] + rois: set[BboxIntCoords] = set() cols = range(0, img_w, int(tile_w * (1 - w_ovl))) rows = range(0, img_h, int(tile_h * (1 - h_ovl))) if self.with_full_img: - rois += [x1y1x2y2_to_xywh(0, 0, img_w, img_h)] + rois.add(x1y1x2y2_to_xywh(0, 0, img_w, img_h)) for offset_x, offset_y in product(cols, rows): x2 = min(offset_x + tile_w, img_w) y2 = min(offset_y + tile_h, img_h) @@ -161,11 +161,11 @@ def _extract_rois(self, image: Image) -> list[BboxIntCoords]: x1, y1, x2, y2 = cxcywh_to_x1y1x2y2(c_x, c_y, w, h) x1, y1, x2, y2 = clip_x1y1x2y2(x1, y1, x2, y2, img_w, img_h) x1, y1, x2, y2 = (int(v) for v in [x1, y1, x2, y2]) - rois += [x1y1x2y2_to_xywh(x1, y1, x2, y2)] + rois.add(x1y1x2y2_to_xywh(x1, y1, x2, y2)) log.info(f"image: {img_h}x{img_w} ~ tile_size: {self._tile_size}") 
log.info(f"{len(rows)}x{len(cols)} tiles -> {len(rois)} tiles") - return rois + return list(rois) class OTXTileDatasetFactory: @@ -242,6 +242,23 @@ def _convert_entity(self, image: np.ndarray, dataset_item: DatasetItem, parent_i msg = "Method _convert_entity is not implemented." raise NotImplementedError(msg) + def transform_item( + self, + item: DatasetItem, + tile_size: tuple[int, int], + overlap: tuple[float, float], + with_full_img: bool, + ) -> DmDataset: + """Transform a dataset item to tile dataset which contains multiple tiles.""" + tile_ds = DmDataset.from_iterable([item]) + return tile_ds.transform( + OTXTileTransform, + tile_size=tile_size, + overlap=overlap, + threshold_drop_ann=0.5, + with_full_img=with_full_img, + ) + def get_tiles( self, image: np.ndarray, @@ -259,18 +276,24 @@ def get_tiles( - tile_entities (list[OTXDataEntity]): List of tile entities. - tile_attrs (list[dict]): List of tile attributes. """ - tile_ds = DmDataset.from_iterable([item]) - tile_ds = tile_ds.transform( - OTXTileTransform, + tile_ds = self.transform_item( + item, tile_size=self.tile_config.tile_size, overlap=(self.tile_config.overlap, self.tile_config.overlap), - threshold_drop_ann=0.5, with_full_img=self.tile_config.with_full_img, ) if item.subset in VAL_SUBSET_NAMES: # NOTE: filter validation tiles with annotations only to avoid evaluation on empty tiles. tile_ds = tile_ds.filter("/item/annotation", filter_annotations=True, remove_empty=True) + # if tile dataset is empty it means objects are too big to fit in any tile, in this case include full image + if len(tile_ds) == 0: + tile_ds = self.transform_item( + item, + tile_size=self.tile_config.tile_size, + overlap=(self.tile_config.overlap, self.tile_config.overlap), + with_full_img=True, + ) tile_entities: list[OTXDataEntity] = [] tile_attrs: list[dict] = [] diff --git a/tests/unit/core/utils/test_tile.py b/tests/unit/core/utils/test_tile.py index 6806fa801aa..b2c29e9ed8a 100644 --- a/tests/unit/core/utils/test_tile.py +++ b/tests/unit/core/utils/test_tile.py @@ -15,7 +15,7 @@ def test_tile_transform_consistency(mocker): - # Test that the tiler and tile transform are consistent + # Test that OV tiler and PyTorch tile transform are consistent rng = np.random.default_rng() rnd_tile_size = rng.integers(low=100, high=500) rnd_tile_overlap = rng.random() @@ -39,5 +39,8 @@ def test_tile_transform_consistency(mocker): tile_transform.with_full_img = True dm_rois = [xywh_to_x1y1x2y2(*roi) for roi in tile_transform._extract_rois(dm_image)] - # 0 index in tiler is the full image so we skip it - assert np.allclose(dm_rois, tiler._tile(np_image)) + ov_tiler_rois = tiler._tile(np_image) + + assert len(dm_rois) == len(ov_tiler_rois) + for dm_roi in dm_rois: + assert list(dm_roi) in ov_tiler_rois From 45f9a2461526379372b50488d56221ec2487f415 Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Fri, 20 Sep 2024 12:14:49 +0900 Subject: [PATCH 46/53] Add type checker in converter for callable functions (optimizer, scheduler) (#3968) Fix converter callable functions (optimizer, scheduler) --- src/otx/tools/converter.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index 98c9d4aee86..23818c8ce0a 100644 --- a/src/otx/tools/converter.py +++ b/src/otx/tools/converter.py @@ -277,12 +277,22 @@ def update_inference_batch_size(param_value: int) -> None: config["data"]["test_subset"]["batch_size"] = param_value def update_learning_rate(param_value: float) -> None: - 
config["model"]["init_args"]["optimizer"]["init_args"]["lr"] = param_value + optimizer = config["model"]["init_args"]["optimizer"] + if isinstance(optimizer, dict) and "init_args" in optimizer: + optimizer["init_args"]["lr"] = param_value + else: + warn("Warning: learning_rate is not updated", stacklevel=1) def update_learning_rate_warmup_iters(param_value: int) -> None: scheduler = config["model"]["init_args"]["scheduler"] - if scheduler["class_path"] == "otx.core.schedulers.LinearWarmupSchedulerCallable": + if ( + isinstance(scheduler, dict) + and "class_path" in scheduler + and scheduler["class_path"] == "otx.core.schedulers.LinearWarmupSchedulerCallable" + ): scheduler["init_args"]["num_warmup_steps"] = param_value + else: + warn("Warning: learning_rate_warmup_iters is not updated", stacklevel=1) def update_num_iters(param_value: int) -> None: config["max_epochs"] = param_value From 51fcb73604290af3ff99621c53ea25d26a47a7a7 Mon Sep 17 00:00:00 2001 From: Yunchu Lee Date: Fri, 20 Sep 2024 14:45:32 +0900 Subject: [PATCH 47/53] Update for 2.2.0rc2 (#3969) update for 2.2.0rc2 --- CHANGELOG.md | 6 ++++++ README.md | 3 +++ docs/source/guide/release_notes/index.rst | 3 +++ src/otx/__init__.py | 2 +- 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2110dc39feb..47efe68bac9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,12 @@ All notable changes to this project will be documented in this file. () - Move templates from OTX1.X to OTX2.X () +- Include Geti arrow dataset subset names + () +- Include full image with anno in case there's no tile in tile dataset + () +- Add type checker in converter for callable functions (optimizer, scheduler) + () ### Bug fixes diff --git a/README.md b/README.md index c3938a2bc90..f1c3043a6e1 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,9 @@ In addition to the examples above, please refer to the documentation for tutoria - Support ImageFromBytes - enable model export - Move templates from OTX1.X to OTX2.X +- Include Geti arrow dataset subset names +- Include full image with anno in case there's no tile in tile dataset +- Add type checker in converter for callable functions (optimizer, scheduler) ### Bug fixes diff --git a/docs/source/guide/release_notes/index.rst b/docs/source/guide/release_notes/index.rst index cf950aae2f6..75070a72926 100644 --- a/docs/source/guide/release_notes/index.rst +++ b/docs/source/guide/release_notes/index.rst @@ -34,6 +34,9 @@ Enhancements - Support ImageFromBytes - enable model export - Move templates from OTX1.X to OTX2.X +- Include Geti arrow dataset subset names +- Include full image with anno in case there's no tile in tile dataset +- Add type checker in converter for callable functions (optimizer, scheduler) Bug fixes ^^^^^^^^^ diff --git a/src/otx/__init__.py b/src/otx/__init__.py index cd62efaf949..225fbef0d04 100644 --- a/src/otx/__init__.py +++ b/src/otx/__init__.py @@ -3,7 +3,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -__version__ = "2.2.0rc1" +__version__ = "2.2.0rc2" import os from pathlib import Path From 7cb8b0dfab299785d714b6172afa267f860b9d70 Mon Sep 17 00:00:00 2001 From: Prokofiev Kirill Date: Fri, 20 Sep 2024 10:11:41 +0200 Subject: [PATCH 48/53] Update CHANGELOG.md Co-authored-by: Kim, Sungchul --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 709b8fa509a..0c84f871fdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,7 +47,7 @@ All notable 
changes to this project will be documented in this file. (, , ) - Support ImageFromBytes () -- enable model export +- Enable model export () - Move templates from OTX1.X to OTX2.X () From 901fd1646000002f02a4aa6b32eb6d36027f45f6 Mon Sep 17 00:00:00 2001 From: kprokofi Date: Sat, 21 Sep 2024 00:39:24 +0900 Subject: [PATCH 49/53] fix semantic seg tests --- src/otx/algo/segmentation/litehrnet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py index 75076fc9ef8..33269e04532 100644 --- a/src/otx/algo/segmentation/litehrnet.py +++ b/src/otx/algo/segmentation/litehrnet.py @@ -87,7 +87,7 @@ def _exporter(self) -> OTXModelExporter: @property def ignore_scope(self) -> dict[str, Any]: """Get the ignored scope for LiteHRNet.""" - if self.model_version == "lite_hrnet_x": + if self.model_name == "lite_hrnet_x": return { "ignored_scope": { "patterns": ["__module.model.decode_head.aggregator/*"], @@ -175,7 +175,7 @@ def ignore_scope(self) -> dict[str, Any]: "preset": "performance", } - if self.model_version == "lite_hrnet_18": + if self.model_name == "lite_hrnet_18": return { "ignored_scope": { "patterns": ["__module.model.backbone/*"], @@ -263,7 +263,7 @@ def ignore_scope(self) -> dict[str, Any]: "preset": "mixed", } - if self.model_version == "lite_hrnet_s": + if self.model_name == "lite_hrnet_s": return { "ignored_scope": { "names": [ From a30ef81b80e419310ad2c4ab70e22c32c480c6ca Mon Sep 17 00:00:00 2001 From: kprokofi Date: Sat, 21 Sep 2024 00:48:01 +0900 Subject: [PATCH 50/53] fix detection tiling --- src/otx/recipe/detection/atss_resnext101_tile.yaml | 3 ++- src/otx/recipe/detection/rtdetr_101_tile.yaml | 3 ++- src/otx/recipe/detection/rtdetr_18_tile.yaml | 3 ++- src/otx/recipe/detection/rtdetr_50_tile.yaml | 3 ++- src/otx/recipe/detection/rtmdet_tiny_tile.yaml | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/otx/recipe/detection/atss_resnext101_tile.yaml b/src/otx/recipe/detection/atss_resnext101_tile.yaml index 831d694caad..8f78195f637 100644 --- a/src/otx/recipe/detection/atss_resnext101_tile.yaml +++ b/src/otx/recipe/detection/atss_resnext101_tile.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.atss.ResNeXt101ATSS + class_path: otx.algo.detection.atss.ATSS init_args: + model_name: atss_resnext101 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/rtdetr_101_tile.yaml b/src/otx/recipe/detection/rtdetr_101_tile.yaml index 23e86944a22..918a173dedf 100644 --- a/src/otx/recipe/detection/rtdetr_101_tile.yaml +++ b/src/otx/recipe/detection/rtdetr_101_tile.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.rtdetr.RTDETR101 + class_path: otx.algo.detection.rtdetr.RTDETR init_args: + model_name: rtdetr_101 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/rtdetr_18_tile.yaml b/src/otx/recipe/detection/rtdetr_18_tile.yaml index e932c387ce0..d79091eb56c 100644 --- a/src/otx/recipe/detection/rtdetr_18_tile.yaml +++ b/src/otx/recipe/detection/rtdetr_18_tile.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.rtdetr.RTDETR18 + class_path: otx.algo.detection.rtdetr.RTDETR init_args: + model_name: rtdetr_18 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/rtdetr_50_tile.yaml b/src/otx/recipe/detection/rtdetr_50_tile.yaml index 95a1384df45..4c0bfdb1e64 100644 --- a/src/otx/recipe/detection/rtdetr_50_tile.yaml +++ b/src/otx/recipe/detection/rtdetr_50_tile.yaml @@ -1,6 +1,7 @@ model: - class_path: 
otx.algo.detection.rtdetr.RTDETR50 + class_path: otx.algo.detection.rtdetr.RTDETR init_args: + model_name: rtdetr_50 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/rtmdet_tiny_tile.yaml b/src/otx/recipe/detection/rtmdet_tiny_tile.yaml index 982d7b775d3..716f5151e8f 100644 --- a/src/otx/recipe/detection/rtmdet_tiny_tile.yaml +++ b/src/otx/recipe/detection/rtmdet_tiny_tile.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.rtmdet.RTMDetTiny + class_path: otx.algo.detection.rtmdet.RTMDet init_args: + model_name: rtmdet_tiny label_info: 80 optimizer: From b45fe11cd439d4675aba29958799921496f4e334 Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Mon, 23 Sep 2024 09:30:00 +0900 Subject: [PATCH 51/53] Update test_tiling.py --- tests/unit/core/data/test_tiling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/core/data/test_tiling.py b/tests/unit/core/data/test_tiling.py index 61277ea6393..d6f4e892057 100644 --- a/tests/unit/core/data/test_tiling.py +++ b/tests/unit/core/data/test_tiling.py @@ -20,7 +20,6 @@ from otx.algo.instance_segmentation.maskrcnn import MaskRCNN from otx.algo.segmentation.litehrnet import LiteHRNet from otx.core.config.data import ( - SamplerConfig, TileConfig, VisualPromptingConfig, ) From 4c8ecd153cb2cc1b2de16342125dcd104821eb75 Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Mon, 23 Sep 2024 09:42:31 +0900 Subject: [PATCH 52/53] Update test_tiling.py --- tests/unit/core/data/test_tiling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/core/data/test_tiling.py b/tests/unit/core/data/test_tiling.py index d6f4e892057..252de7d0a8f 100644 --- a/tests/unit/core/data/test_tiling.py +++ b/tests/unit/core/data/test_tiling.py @@ -20,6 +20,7 @@ from otx.algo.instance_segmentation.maskrcnn import MaskRCNN from otx.algo.segmentation.litehrnet import LiteHRNet from otx.core.config.data import ( + SubsetConfig, TileConfig, VisualPromptingConfig, ) @@ -32,6 +33,7 @@ from otx.core.model.detection import OTXDetectionModel from otx.core.model.seg_tiler import SegTiler from otx.core.types.task import OTXTaskType +from otx.core.types.transformer_libs import TransformLibType from torchvision import tv_tensors from tests.test_helpers import generate_random_bboxes From 31b250411356570509ccd4ec4adbffd83df6dddd Mon Sep 17 00:00:00 2001 From: kprokofi Date: Tue, 24 Sep 2024 06:02:37 +0900 Subject: [PATCH 53/53] fix unit test --- src/otx/tools/converter.py | 3 +++ tests/unit/core/data/test_tiling.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index 119ae3e530b..cb74298e910 100644 --- a/src/otx/tools/converter.py +++ b/src/otx/tools/converter.py @@ -448,6 +448,9 @@ def instantiate( model_parser.add_subclass_arguments(OTXModel, "model", required=False, fail_untyped=False, skip={"label_info"}) model = model_parser.instantiate_classes(Namespace(model=model_config)).get("model") + if hasattr(model, "tile_config"): + model.tile_config = datamodule.tile_config + # Instantiate Engine config_work_dir = config.pop("work_dir", config["engine"].pop("work_dir", None)) config["engine"]["work_dir"] = work_dir if work_dir is not None else config_work_dir diff --git a/tests/unit/core/data/test_tiling.py b/tests/unit/core/data/test_tiling.py index 252de7d0a8f..b595b6932fb 100644 --- a/tests/unit/core/data/test_tiling.py +++ b/tests/unit/core/data/test_tiling.py @@ -256,7 +256,7 @@ def test_tile_transform(self, task, fxt_data_roots): dataset = DmDataset.import_from(data_root, 
format=dataset_format) rng = np.random.default_rng() - tile_size = rng.integers(low=100, high=500, size=(2,)) + tile_size = rng.integers(low=50, high=128, size=(2,)) overlap = rng.random(2) overlap = overlap.clip(0, 0.9) threshold_drop_ann = rng.random()