Mergeback 1.5.0rc1 (#2562)
* Delete mem cache handler after training is done (#2535)

* Fix bug that auto batch size doesn't consider distributed training (#2533)

* consider distributed training while searching batch size

* update unit test

* revert gpu memory upper bound

* change allocated to reserved

* add unit test for distributed training

* Apply fix progress hook to release 1.5.0 (#2539)

* Fix hook's ordering issue. AdaptiveRepeatHook changes the runner.max_iters before the ProgressHook

* Fix multi-label, h-label issue

Co-authored-by: Eunwoo Shin <[email protected]>

* Revert adaptive hook sampler init

* Refactor the function name: get_data_cfg -> get_subset_data_cfg

* Remove adding AdaptiveRepeatDataHook for autobs

* Fix detection and segmentation case in Geti scenario

---------

Co-authored-by: Eunwoo Shin <[email protected]>

* Re-introduce adaptive training (#2543)

* Fix auto input size mismatch in eval & export (#2530)

* Re-enable E2E tests for Issue#2518

* Add input size check in export testing

* Format float numbers in log

* Fix NNCF export shape mismatch

* Fix saliency map issue

* Disable auto input size if tiling enabled

---------

Signed-off-by: Songki Choi <[email protected]>

* Update ref. fq number for anomaly e2e2 (#2547)

* Skip e2e det tests by issue2548 (#2550)

* Add skip to chained TC for issue #2548 (#2552)

---------

Signed-off-by: Songki Choi <[email protected]>
Co-authored-by: Eunwoo Shin <[email protected]>
Co-authored-by: Sungman Cho <[email protected]>
Co-authored-by: Jaeguk Hyun <[email protected]>
Co-authored-by: Songki Choi <[email protected]>
5 people authored Oct 18, 2023
1 parent b3f5fc4 commit 23296d5
Showing 35 changed files with 388 additions and 289 deletions.
16 changes: 9 additions & 7 deletions src/otx/algorithms/classification/adapters/mmcls/configurer.py
@@ -1,9 +1,9 @@
"""Base configurer for mmdet config."""

# Copyright (C) 2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

from typing import Optional
from typing import Optional, Tuple

import torch
from mmcv import build_from_cfg
@@ -22,7 +22,6 @@
recursively_update_cfg,
update_or_add_custom_hook,
)
from otx.algorithms.common.configs.configuration_enums import InputSizePreset
from otx.algorithms.common.utils.logger import get_logger

logger = get_logger()
@@ -162,16 +161,19 @@ def configure_topk(cfg):

@staticmethod
def configure_input_size(
cfg, input_size_config: InputSizePreset = InputSizePreset.DEFAULT, model_ckpt_path: Optional[str] = None
cfg, input_size=Optional[Tuple[int, int]], model_ckpt_path: Optional[str] = None, training=True
):
"""Change input size if necessary."""
manager = InputSizeManager(cfg)
input_size = manager.get_configured_input_size(input_size_config, model_ckpt_path)
if input_size is None: # InputSizePreset.DEFAULT
return

manager = InputSizeManager(cfg)

if input_size == (0, 0): # InputSizePreset.AUTO
input_size = BaseConfigurer.adapt_input_size_to_dataset(cfg, manager)
if training:
input_size = BaseConfigurer.adapt_input_size_to_dataset(cfg, manager, use_annotations=False)
else:
input_size = manager.get_trained_input_size(model_ckpt_path)
if input_size is None:
return

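In short, the updated configure_input_size above resolves the effective input size as follows; a condensed sketch, not the verbatim method (the trailing set_input_size call is an assumption about the part of the method not shown in the hunk):

def resolve_input_size(cfg, input_size, model_ckpt_path, training=True):
    if input_size is None:  # InputSizePreset.DEFAULT: keep the recipe's default size
        return

    manager = InputSizeManager(cfg)

    if input_size == (0, 0):  # InputSizePreset.AUTO
        if training:
            # derive the size from dataset image statistics (no annotations)
            input_size = BaseConfigurer.adapt_input_size_to_dataset(cfg, manager, use_annotations=False)
        else:
            # eval/export: reuse the size recorded in the trained checkpoint
            input_size = manager.get_trained_input_size(model_ckpt_path)
        if input_size is None:
            return

    manager.set_input_size(input_size)  # assumed tail of the method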
18 changes: 5 additions & 13 deletions src/otx/algorithms/classification/adapters/mmcls/nncf/task.py
@@ -1,18 +1,7 @@
"""NNCF Task for OTX Classification."""

# Copyright (C) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.
# Copyright (C) 2022-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from functools import partial
from typing import List, Optional
@@ -121,3 +110,6 @@ def _generate_training_metrics_group(self, learning_curves):
output.append(LineMetricsGroup(metrics=[metric_curve], visualization_info=visualization_info))

return output, best_acc

def _save_model_post_hook(self, modelinfo):
modelinfo["input_size"] = self._input_size
16 changes: 3 additions & 13 deletions src/otx/algorithms/classification/adapters/mmcls/task.py
@@ -1,18 +1,7 @@
"""Task of OTX Classification using mmclassification training backend."""

# Copyright (C) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.
# SPDX-License-Identifier: Apache-2.0

import glob
import os
@@ -194,11 +183,12 @@ def configure(
ir_options,
data_classes,
model_classes,
self._hyperparams.learning_parameters.input_size,
self._input_size,
options_for_patch_datasets=options_for_patch_datasets,
options_for_patch_evaluation=options_for_patch_evaluation,
)
self._config = cfg
self._input_size = cfg.model.pop("input_size", None)
return cfg

def build_model(
24 changes: 12 additions & 12 deletions src/otx/algorithms/classification/task.py
@@ -1,18 +1,7 @@
"""Task of OTX Classification."""

# Copyright (C) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.
# SPDX-License-Identifier: Apache-2.0

import io
import json
@@ -34,6 +23,7 @@
get_multihead_class_info as get_hierarchical_info,
)
from otx.algorithms.common.configs import TrainType
from otx.algorithms.common.configs.configuration_enums import InputSizePreset
from otx.algorithms.common.tasks.base_task import TRAIN_TYPE_DIR_PATH, OTXTask
from otx.algorithms.common.utils import embed_ir_model_data
from otx.algorithms.common.utils.callback import TrainingProgressCallback
@@ -80,6 +70,7 @@
from otx.api.utils.dataset_utils import add_saliency_maps_to_dataset_item
from otx.api.utils.labels_utils import get_empty_label
from otx.cli.utils.multi_gpu import is_multigpu_child_process
from otx.core.data.caching.mem_cache_handler import MemCacheHandlerSingleton

logger = get_logger()
RECIPE_TRAIN_TYPE = {
@@ -129,6 +120,12 @@ def __init__(self, task_environment: TaskEnvironment, output_path: Optional[str]
if self._task_environment.model is not None:
self._load_model()

if hasattr(self._hyperparams.learning_parameters, "input_size"):
input_size_cfg = InputSizePreset(self._hyperparams.learning_parameters.input_size.value)
else:
input_size_cfg = InputSizePreset.DEFAULT
self._input_size = input_size_cfg.tuple

def _is_multi_label(self, label_groups: List[LabelGroup], all_labels: List[LabelEntity]):
"""Check whether the current training mode is multi-label or not."""
# NOTE: In the current Geti, multi-label should have `___` symbol for all group names.
Expand Down Expand Up @@ -215,6 +212,8 @@ def train(

results = self._train_model(dataset)

MemCacheHandlerSingleton.delete()

# Check for stop signal when training has stopped. If should_stop is true, training was cancelled and no new
if self._should_stop:
logger.info("Training cancelled.")
@@ -476,6 +475,7 @@ def save_model(self, output_model: ModelEntity):
"model": model_ckpt,
"config": hyperparams_str,
"labels": labels,
"input_size": self._input_size,
"VERSION": 1,
}

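For reference, the constructor added above converts the input_size hyper-parameter string into a size tuple via InputSizePreset(...).tuple. Judging from the comments in the configurer hunk, DEFAULT resolves to None (keep the recipe default) and AUTO to the (0, 0) sentinel, while a fixed preset such as "512x512" gives a concrete pair. A hypothetical stand-in for that mapping (the exact enum string values are assumptions, not quoted from configuration_enums.py):

from typing import Optional, Tuple

def preset_to_tuple(value: str) -> Optional[Tuple[int, int]]:
    # Hypothetical mirror of InputSizePreset(...).tuple, for illustration only.
    if value == "Default":   # assumed string for InputSizePreset.DEFAULT
        return None          # keep the size defined by the model recipe
    if value == "Auto":      # assumed string for InputSizePreset.AUTO
        return (0, 0)        # sentinel that configure_input_size expands later
    dims = value.split("x")  # e.g. "512x512"
    return (int(dims[0]), int(dims[1]))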
15 changes: 11 additions & 4 deletions src/otx/algorithms/common/adapters/mmcv/configurer.py
@@ -26,7 +26,6 @@
recursively_update_cfg,
update_or_add_custom_hook,
)
from otx.algorithms.common.configs.configuration_enums import InputSizePreset
from otx.algorithms.common.tasks.base_task import OnHookInitialized
from otx.algorithms.common.utils import UncopiableDefaultDict, append_dist_rank_suffix
from otx.algorithms.common.utils.data import compute_robust_dataset_statistics
@@ -74,7 +73,7 @@ def configure(
ir_options: Optional[Config] = None,
data_classes: Optional[List[str]] = None,
model_classes: Optional[List[str]] = None,
input_size: InputSizePreset = InputSizePreset.DEFAULT,
input_size: Optional[Tuple[int, int]] = None,
**kwargs: Dict[Any, Any],
) -> Config:
"""Create MMCV-consumable config from given inputs."""
@@ -228,7 +227,7 @@ def configure_data_pipeline(self, cfg, input_size, model_ckpt_path, **kwargs):
"""Configuration data pipeline settings."""

patch_color_conversion(cfg)
self.configure_input_size(cfg, input_size, model_ckpt_path)
self.configure_input_size(cfg, input_size, model_ckpt_path, self.training)

def configure_recipe(self, cfg, **kwargs):
"""Configuration training recipe settings."""
@@ -533,7 +532,15 @@ def adapt_input_size_to_dataset(
stat = compute_robust_dataset_statistics(dataset, use_annotations)
if not stat:
return None
logger.info(f"Dataset stat: {json.dumps(stat, indent=4)}")

def format_float(obj):
if isinstance(obj, float):
return f"{obj:.2f}"
if isinstance(obj, dict):
return {k: format_float(v) for k, v in obj.items()}
return obj

logger.info(f"Dataset stat: {json.dumps(format_float(stat), indent=4)}")

# Fit to typical large image size (conservative)
# -> "avg" size might be preferrable for efficiency
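The format_float helper introduced above only rewrites floats (to two decimals) and recurses into dicts, so the logged statistics keep their structure. A quick illustration with made-up numbers:

import json

def format_float(obj):
    # Same logic as the helper added in the hunk above.
    if isinstance(obj, float):
        return f"{obj:.2f}"
    if isinstance(obj, dict):
        return {k: format_float(v) for k, v in obj.items()}
    return obj

stat = {"image": {"avg": 713.6666666666666, "robust_max": 1024.0}, "samples": 42}
print(json.dumps(format_float(stat), indent=4))
# Floats are printed as "713.67" and "1024.00"; the int 42 passes through unchanged.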
@@ -23,9 +23,15 @@
class AdaptiveTrainSchedulingHook(Hook):
"""Adaptive Training Scheduling Hook.
Depending on the number of iterations per epoch, adaptively update the validation interval.
Depending on the number of iterations per epoch, adaptively update the validation interval and related values.
Args:
base_lr_patience (int): LR drop patience expected over the whole training, in epochs.
Patience used when the validation interval is 1. Defaults to 5.
min_lr_patience (int): Minimum value of LR drop patience.
Defaults to 2.
base_es_patience (int): Early-stopping patience expected over the whole training, in epochs.
Patience used when the validation interval is 1. Defaults to 10.
min_es_patience (int): Minimum value of early-stopping patience.
Defaults to 3.
max_interval (int): Maximum value of validation interval.
Defaults to 5.
decay (float): Parameter to control the interval. This value is set manually.
@@ -39,6 +45,10 @@ class AdaptiveTrainSchedulingHook(Hook):
def __init__(
self,
max_interval=5,
base_lr_patience=5,
min_lr_patience=2,
base_es_patience=10,
min_es_patience=3,
decay=-0.025,
enable_adaptive_interval_hook=False,
enable_eval_before_run=False,
@@ -47,6 +57,10 @@ def __init__(
super().__init__(**kwargs)

self.max_interval = max_interval
self.base_lr_patience = base_lr_patience
self.min_lr_patience = min_lr_patience
self.base_es_patience = base_es_patience
self.min_es_patience = min_es_patience
self.decay = decay
self.enable_adaptive_interval_hook = enable_adaptive_interval_hook
self.enable_eval_before_run = enable_eval_before_run
@@ -84,13 +98,23 @@ def before_train_iter(self, runner):
logger.info(f"Update EvalHook interval: {hook.interval} -> {adaptive_interval}")
hook.interval = adaptive_interval
elif isinstance(hook, LrUpdaterHook):
patience = max(
math.ceil((self.base_lr_patience / adaptive_interval)),
self.min_lr_patience,
)
if hasattr(hook, "interval") and hasattr(hook, "patience"):
hook.interval = adaptive_interval
logger.info(f"Update LrUpdaterHook interval: {hook.interval} -> {adaptive_interval}")
hook.patience = patience
logger.info(f"Update LrUpdaterHook patience: {hook.patience} -> {patience}")
elif isinstance(hook, EarlyStoppingHook):
logger.info(f"Update EarlyStoppingHook interval: {hook.interval} -> {adaptive_interval}")
patience = max(
math.ceil((self.base_es_patience / adaptive_interval)),
self.min_es_patience,
)
logger.info(f"Update EarlyStoppingHook patience: {hook.patience} -> {patience}")
hook.start = adaptive_interval
hook.interval = adaptive_interval
hook.patience = patience
elif isinstance(hook, CheckpointHook):
# make sure checkpoint is saved at last
limit = runner.max_epochs if hook.by_epoch else runner.max_iters
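The patience scaling added to before_train_iter keeps LR drops and early stopping roughly constant in epoch terms: when validation only runs every adaptive_interval epochs, each non-improving check covers more training, so fewer checks are needed before acting. A small numeric sketch of the formula used above:

import math

def scaled_patience(base_patience, min_patience, adaptive_interval):
    # Same formula as in the hook: divide by the interval, clamp to a minimum.
    return max(math.ceil(base_patience / adaptive_interval), min_patience)

# With base_es_patience=10 and min_es_patience=3:
#   interval 1 -> 10, interval 2 -> 5, interval 5 -> 3 (clamped to the minimum)
for interval in (1, 2, 5):
    print(interval, scaled_patience(10, 3, interval))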
@@ -9,6 +9,7 @@
from otx.algorithms.common.adapters.torch.dataloaders.samplers import (
BalancedSampler,
ClsIncrSampler,
OTXSampler,
)
from otx.algorithms.common.utils.logger import get_logger

@@ -58,13 +59,19 @@ def before_epoch(self, runner):
collate_fn = runner.data_loader.collate_fn
worker_init_fn = runner.data_loader.worker_init_fn
rank, world_size = get_dist_info()

if isinstance(runner.data_loader.sampler, OTXSampler):
repeat = runner.data_loader.sampler.repeat
else:
repeat = 1
if self.sampler_type == "balanced":
sampler = BalancedSampler(
dataset,
batch_size,
efficient_mode=self.efficient_mode,
num_replicas=world_size,
rank=rank,
n_repeats=repeat,
)
else:
sampler = ClsIncrSampler(
Expand All @@ -73,6 +80,7 @@ def before_epoch(self, runner):
efficient_mode=self.efficient_mode,
num_replicas=world_size,
rank=rank,
n_repeats=repeat,
)
runner.data_loader = DataLoader(
dataset,
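The point of the repeat hand-off above: if the current dataloader sampler is already an OTXSampler (e.g. installed by adaptive repeat), the replacement class-incremental sampler keeps the same repeat factor instead of silently resetting it to 1 and shrinking the epoch. Condensed illustration of the change, not the full hook body:

sampler = runner.data_loader.sampler
repeat = sampler.repeat if isinstance(sampler, OTXSampler) else 1
new_sampler = BalancedSampler(dataset, batch_size, efficient_mode=self.efficient_mode,
                              num_replicas=world_size, rank=rank, n_repeats=repeat)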
40 changes: 15 additions & 25 deletions src/otx/algorithms/common/adapters/mmcv/utils/config_utils.py
@@ -682,12 +682,12 @@ def set_input_size(self, input_size: Union[int, List[int], Tuple[int, int]]):
self._set_pipeline_size_value(pipelines, resize_ratio)

# Set model size
# - needed only for YOLOX
model_cfg = self._config.get("model", {})
model_cfg["input_size"] = input_size
if model_cfg.get("type", "") == "CustomYOLOX":
# - needed only for YOLOX
if input_size[0] % 32 != 0 or input_size[1] % 32 != 0:
raise ValueError("YOLOX should have input size being multiple of 32.")
model_cfg["input_size"] = input_size

@property
def base_input_size(self) -> Union[Tuple[int, int], Dict[str, Tuple[int, int]]]:
@@ -862,38 +862,28 @@ def _set_size_value(pipeline: Dict, attr: str, scale: Tuple[Union[int, float], U
pipeline[attr] = (round(pipeline[attr][0] * scale[0]), round(pipeline[attr][1] * scale[1]))

@staticmethod
def get_configured_input_size(
input_size_config: InputSizePreset = InputSizePreset.DEFAULT, model_ckpt: Optional[str] = None
) -> Optional[Tuple[int, int]]:
"""Get configurable input size configuration. If it doesn't exist, return None.
def get_trained_input_size(model_ckpt: Optional[str] = None) -> Optional[Tuple[int, int]]:
"""Get trained input size from checkpoint. If it doesn't exist, return None.
Args:
input_size_config (InputSizePreset, optional): Input size setting. Defaults to InputSizePreset.DEFAULT.
model_ckpt (Optional[str], optional): Model weight to load. Defaults to None.
Returns:
Optional[Tuple[int, int]]: Pair of width and height. If there is no input size configuration, return None.
"""
input_size = None
if input_size_config == InputSizePreset.DEFAULT:
if model_ckpt is None:
return None

model_info = torch.load(model_ckpt, map_location="cpu")
for key in ["config", "learning_parameters", "input_size", "value"]:
if key not in model_info:
return None
model_info = model_info[key]
input_size = model_info

if input_size == InputSizePreset.DEFAULT.value:
return None
logger.info("Given model weight was trained with {} input size.".format(input_size))
if model_ckpt is None:
return None

else:
input_size = input_size_config.value
model_info = torch.load(model_ckpt, map_location="cpu")
if model_info is None:
return None

return InputSizePreset.parse(input_size)
input_size = model_info.get("input_size", None)
if not input_size:
return None

logger.info("Given model weight was trained with {} input size.".format(input_size))
return input_size

@staticmethod
def select_closest_size(input_size: Tuple[int, int], preset_sizes: List[Tuple[int, int]]):
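Combined with the "input_size" entry added to save_model earlier in this commit, the simplified lookup above reads the size straight from the top level of the checkpoint dict. A sketch of the round trip (file name and size are made up):

import torch

# Saving side (see the save_model hunk in classification/task.py above).
modelinfo = {"model": {}, "config": "<hyperparams>", "labels": {},
             "input_size": (512, 512), "VERSION": 1}
torch.save(modelinfo, "weights.pth")

# Loading side, equivalent to the new get_trained_input_size.
ckpt = torch.load("weights.pth", map_location="cpu")
input_size = ckpt.get("input_size", None)  # missing/falsy -> fall back to recipe default
print(input_size)  # (512, 512)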