Mergeback 1.5.0rc1 (#2562)
* Delete mem cache handler after training is done (#2535)

* Fix bug that auto batch size doesn't consider distributed training (#2533)

* consider distributed training while searching batch size

* update unit test

* revert gpu memory upper bound

* change allocated to reserved

* add unit test for distributed training

* Apply fix progress hook to release 1.5.0 (#2539)

* Fix hook's ordering issue. AdaptiveRepeatHook changes the runner.max_iters before the ProgressHook

* Fix multi-label, h-label issue

Co-authored-by: Eunwoo Shin <[email protected]>

* Revert adaptive hook sampler init

* Refactor the function name: get_data_cfg -> get_subset_data_cfg

* Remove adding AdaptiveRepeatDataHook for autobs

* Fix detection and segmentation case in Geti scenario

---------

Co-authored-by: Eunwoo Shin <[email protected]>

* Re-introduce adaptive training (#2543)

* Fix auto input size mismatch in eval & export (#2530)

* Re-enable E2E tests for Issue#2518

* Add input size check in export testing

* Format float numbers in log

* Fix NNCF export shape mismatch

* Fix saliency map issue

* Disable auto input size if tiling enabled

---------

Signed-off-by: Songki Choi <[email protected]>

* Update ref. fq number for anomaly e2e2 (#2547)

* Skip e2e det tests by issue2548 (#2550)

* Add skip to chained TC for issue #2548 (#2552)

---------

Signed-off-by: Songki Choi <[email protected]>
Co-authored-by: Eunwoo Shin <[email protected]>
Co-authored-by: Sungman Cho <[email protected]>
Co-authored-by: Jaeguk Hyun <[email protected]>
Co-authored-by: Songki Choi <[email protected]>
5 people authored Oct 18, 2023
1 parent b3f5fc4 commit 23296d5
Showing 35 changed files with 388 additions and 289 deletions.
16 changes: 9 additions & 7 deletions src/otx/algorithms/classification/adapters/mmcls/configurer.py
@@ -1,9 +1,9 @@
"""Base configurer for mmdet config."""

# Copyright (C) 2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

from typing import Optional
from typing import Optional, Tuple

import torch
from mmcv import build_from_cfg
@@ -22,7 +22,6 @@
recursively_update_cfg,
update_or_add_custom_hook,
)
from otx.algorithms.common.configs.configuration_enums import InputSizePreset
from otx.algorithms.common.utils.logger import get_logger

logger = get_logger()
@@ -162,16 +161,19 @@ def configure_topk(cfg):

@staticmethod
def configure_input_size(
cfg, input_size_config: InputSizePreset = InputSizePreset.DEFAULT, model_ckpt_path: Optional[str] = None
cfg, input_size=Optional[Tuple[int, int]], model_ckpt_path: Optional[str] = None, training=True
):
"""Change input size if necessary."""
manager = InputSizeManager(cfg)
input_size = manager.get_configured_input_size(input_size_config, model_ckpt_path)
if input_size is None: # InputSizePreset.DEFAULT
return

manager = InputSizeManager(cfg)

if input_size == (0, 0): # InputSizePreset.AUTO
input_size = BaseConfigurer.adapt_input_size_to_dataset(cfg, manager)
if training:
input_size = BaseConfigurer.adapt_input_size_to_dataset(cfg, manager, use_annotations=False)
else:
input_size = manager.get_trained_input_size(model_ckpt_path)
if input_size is None:
return

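In short, the updated configure_input_size above resolves the effective input size as follows; a condensed sketch, not the verbatim method (the trailing set_input_size call is an assumption about the part of the method not shown in the hunk):

def resolve_input_size(cfg, input_size, model_ckpt_path, training=True):
    if input_size is None:  # InputSizePreset.DEFAULT: keep the recipe's default size
        return

    manager = InputSizeManager(cfg)

    if input_size == (0, 0):  # InputSizePreset.AUTO
        if training:
            # derive the size from dataset image statistics (no annotations)
            input_size = BaseConfigurer.adapt_input_size_to_dataset(cfg, manager, use_annotations=False)
        else:
            # eval/export: reuse the size recorded in the trained checkpoint
            input_size = manager.get_trained_input_size(model_ckpt_path)
        if input_size is None:
            return

    manager.set_input_size(input_size)  # assumed tail of the method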
18 changes: 5 additions & 13 deletions src/otx/algorithms/classification/adapters/mmcls/nncf/task.py
@@ -1,18 +1,7 @@
"""NNCF Task for OTX Classification."""

# Copyright (C) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.
# Copyright (C) 2022-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from functools import partial
from typing import List, Optional
@@ -121,3 +110,6 @@ def _generate_training_metrics_group(self, learning_curves):
output.append(LineMetricsGroup(metrics=[metric_curve], visualization_info=visualization_info))

return output, best_acc

def _save_model_post_hook(self, modelinfo):
modelinfo["input_size"] = self._input_size
16 changes: 3 additions & 13 deletions src/otx/algorithms/classification/adapters/mmcls/task.py
@@ -1,18 +1,7 @@
"""Task of OTX Classification using mmclassification training backend."""

# Copyright (C) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.
# SPDX-License-Identifier: Apache-2.0

import glob
import os
@@ -194,11 +183,12 @@ def configure(
ir_options,
data_classes,
model_classes,
self._hyperparams.learning_parameters.input_size,
self._input_size,
options_for_patch_datasets=options_for_patch_datasets,
options_for_patch_evaluation=options_for_patch_evaluation,
)
self._config = cfg
self._input_size = cfg.model.pop("input_size", None)
return cfg

def build_model(
24 changes: 12 additions & 12 deletions src/otx/algorithms/classification/task.py
@@ -1,18 +1,7 @@
"""Task of OTX Classification."""

# Copyright (C) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.
# SPDX-License-Identifier: Apache-2.0

import io
import json
@@ -34,6 +23,7 @@
get_multihead_class_info as get_hierarchical_info,
)
from otx.algorithms.common.configs import TrainType
from otx.algorithms.common.configs.configuration_enums import InputSizePreset
from otx.algorithms.common.tasks.base_task import TRAIN_TYPE_DIR_PATH, OTXTask
from otx.algorithms.common.utils import embed_ir_model_data
from otx.algorithms.common.utils.callback import TrainingProgressCallback
@@ -80,6 +70,7 @@
from otx.api.utils.dataset_utils import add_saliency_maps_to_dataset_item
from otx.api.utils.labels_utils import get_empty_label
from otx.cli.utils.multi_gpu import is_multigpu_child_process
from otx.core.data.caching.mem_cache_handler import MemCacheHandlerSingleton

logger = get_logger()
RECIPE_TRAIN_TYPE = {
@@ -129,6 +120,12 @@ def __init__(self, task_environment: TaskEnvironment, output_path: Optional[str]
if self._task_environment.model is not None:
self._load_model()

if hasattr(self._hyperparams.learning_parameters, "input_size"):
input_size_cfg = InputSizePreset(self._hyperparams.learning_parameters.input_size.value)
else:
input_size_cfg = InputSizePreset.DEFAULT
self._input_size = input_size_cfg.tuple

def _is_multi_label(self, label_groups: List[LabelGroup], all_labels: List[LabelEntity]):
"""Check whether the current training mode is multi-label or not."""
# NOTE: In the current Geti, multi-label should have `___` symbol for all group names.
Expand Down Expand Up @@ -215,6 +212,8 @@ def train(

results = self._train_model(dataset)

MemCacheHandlerSingleton.delete()

# Check for stop signal when training has stopped. If should_stop is true, training was cancelled and no new
if self._should_stop:
logger.info("Training cancelled.")
@@ -476,6 +475,7 @@ def save_model(self, output_model: ModelEntity):
"model": model_ckpt,
"config": hyperparams_str,
"labels": labels,
"input_size": self._input_size,
"VERSION": 1,
}

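For reference, the constructor added above converts the input_size hyper-parameter string into a size tuple via InputSizePreset(...).tuple. Judging from the comments in the configurer hunk, DEFAULT resolves to None (keep the recipe default) and AUTO to the (0, 0) sentinel, while a fixed preset such as "512x512" gives a concrete pair. A hypothetical stand-in for that mapping (the exact enum string values are assumptions, not quoted from configuration_enums.py):

from typing import Optional, Tuple

def preset_to_tuple(value: str) -> Optional[Tuple[int, int]]:
    # Hypothetical mirror of InputSizePreset(...).tuple, for illustration only.
    if value == "Default":   # assumed string for InputSizePreset.DEFAULT
        return None          # keep the size defined by the model recipe
    if value == "Auto":      # assumed string for InputSizePreset.AUTO
        return (0, 0)        # sentinel that configure_input_size expands later
    dims = value.split("x")  # e.g. "512x512"
    return (int(dims[0]), int(dims[1]))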
15 changes: 11 additions & 4 deletions src/otx/algorithms/common/adapters/mmcv/configurer.py
@@ -26,7 +26,6 @@
recursively_update_cfg,
update_or_add_custom_hook,
)
from otx.algorithms.common.configs.configuration_enums import InputSizePreset
from otx.algorithms.common.tasks.base_task import OnHookInitialized
from otx.algorithms.common.utils import UncopiableDefaultDict, append_dist_rank_suffix
from otx.algorithms.common.utils.data import compute_robust_dataset_statistics
@@ -74,7 +73,7 @@ def configure(
ir_options: Optional[Config] = None,
data_classes: Optional[List[str]] = None,
model_classes: Optional[List[str]] = None,
input_size: InputSizePreset = InputSizePreset.DEFAULT,
input_size: Optional[Tuple[int, int]] = None,
**kwargs: Dict[Any, Any],
) -> Config:
"""Create MMCV-consumable config from given inputs."""
@@ -228,7 +227,7 @@ def configure_data_pipeline(self, cfg, input_size, model_ckpt_path, **kwargs):
"""Configuration data pipeline settings."""

patch_color_conversion(cfg)
self.configure_input_size(cfg, input_size, model_ckpt_path)
self.configure_input_size(cfg, input_size, model_ckpt_path, self.training)

def configure_recipe(self, cfg, **kwargs):
"""Configuration training recipe settings."""
@@ -533,7 +532,15 @@ def adapt_input_size_to_dataset(
stat = compute_robust_dataset_statistics(dataset, use_annotations)
if not stat:
return None
logger.info(f"Dataset stat: {json.dumps(stat, indent=4)}")

def format_float(obj):
if isinstance(obj, float):
return f"{obj:.2f}"
if isinstance(obj, dict):
return {k: format_float(v) for k, v in obj.items()}
return obj

logger.info(f"Dataset stat: {json.dumps(format_float(stat), indent=4)}")

# Fit to typical large image size (conservative)
# -> "avg" size might be preferrable for efficiency
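The format_float helper introduced above only rewrites floats (to two decimals) and recurses into dicts, so the logged statistics keep their structure. A quick illustration with made-up numbers:

import json

def format_float(obj):
    # Same logic as the helper added in the hunk above.
    if isinstance(obj, float):
        return f"{obj:.2f}"
    if isinstance(obj, dict):
        return {k: format_float(v) for k, v in obj.items()}
    return obj

stat = {"image": {"avg": 713.6666666666666, "robust_max": 1024.0}, "samples": 42}
print(json.dumps(format_float(stat), indent=4))
# Floats are printed as "713.67" and "1024.00"; the int 42 passes through unchanged.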
@@ -23,9 +23,15 @@
class AdaptiveTrainSchedulingHook(Hook):
"""Adaptive Training Scheduling Hook.
Depending on the number of iterations per epoch, adaptively update the validation interval.
Depending on the number of iterations per epoch, adaptively update the validation interval and related values.
Args:
base_lr_patience (int): LR drop patience expected over the whole training, in epochs.
Patience used when the validation interval is 1. Defaults to 5.
min_lr_patience (int): Minimum value of LR drop patience.
Defaults to 2.
base_es_patience (int): Early-stopping patience expected over the whole training, in epochs.
Patience used when the validation interval is 1. Defaults to 10.
min_es_patience (int): Minimum value of early-stopping patience.
Defaults to 3.
max_interval (int): Maximum value of validation interval.
Defaults to 5.
decay (float): Parameter to control the interval. This value is set manually.
@@ -39,6 +45,10 @@ class AdaptiveTrainSchedulingHook(Hook):
def __init__(
self,
max_interval=5,
base_lr_patience=5,
min_lr_patience=2,
base_es_patience=10,
min_es_patience=3,
decay=-0.025,
enable_adaptive_interval_hook=False,
enable_eval_before_run=False,
@@ -47,6 +57,10 @@ def __init__(
super().__init__(**kwargs)

self.max_interval = max_interval
self.base_lr_patience = base_lr_patience
self.min_lr_patience = min_lr_patience
self.base_es_patience = base_es_patience
self.min_es_patience = min_es_patience
self.decay = decay
self.enable_adaptive_interval_hook = enable_adaptive_interval_hook
self.enable_eval_before_run = enable_eval_before_run
@@ -84,13 +98,23 @@ def before_train_iter(self, runner):
logger.info(f"Update EvalHook interval: {hook.interval} -> {adaptive_interval}")
hook.interval = adaptive_interval
elif isinstance(hook, LrUpdaterHook):
patience = max(
math.ceil((self.base_lr_patience / adaptive_interval)),
self.min_lr_patience,
)
if hasattr(hook, "interval") and hasattr(hook, "patience"):
hook.interval = adaptive_interval
logger.info(f"Update LrUpdaterHook interval: {hook.interval} -> {adaptive_interval}")
hook.patience = patience
logger.info(f"Update LrUpdaterHook patience: {hook.patience} -> {patience}")
elif isinstance(hook, EarlyStoppingHook):
logger.info(f"Update EarlyStoppingHook interval: {hook.interval} -> {adaptive_interval}")
patience = max(
math.ceil((self.base_es_patience / adaptive_interval)),
self.min_es_patience,
)
logger.info(f"Update EarlyStoppingHook patience: {hook.patience} -> {patience}")
hook.start = adaptive_interval
hook.interval = adaptive_interval
hook.patience = patience
elif isinstance(hook, CheckpointHook):
# make sure checkpoint is saved at last
limit = runner.max_epochs if hook.by_epoch else runner.max_iters
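The patience scaling added to before_train_iter keeps LR drops and early stopping roughly constant in epoch terms: when validation only runs every adaptive_interval epochs, each non-improving check covers more training, so fewer checks are needed before acting. A small numeric sketch of the formula used above:

import math

def scaled_patience(base_patience, min_patience, adaptive_interval):
    # Same formula as in the hook: divide by the interval, clamp to a minimum.
    return max(math.ceil(base_patience / adaptive_interval), min_patience)

# With base_es_patience=10 and min_es_patience=3:
#   interval 1 -> 10, interval 2 -> 5, interval 5 -> 3 (clamped to the minimum)
for interval in (1, 2, 5):
    print(interval, scaled_patience(10, 3, interval))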
@@ -9,6 +9,7 @@
from otx.algorithms.common.adapters.torch.dataloaders.samplers import (
BalancedSampler,
ClsIncrSampler,
OTXSampler,
)
from otx.algorithms.common.utils.logger import get_logger

@@ -58,13 +59,19 @@ def before_epoch(self, runner):
collate_fn = runner.data_loader.collate_fn
worker_init_fn = runner.data_loader.worker_init_fn
rank, world_size = get_dist_info()

if isinstance(runner.data_loader.sampler, OTXSampler):
repeat = runner.data_loader.sampler.repeat
else:
repeat = 1
if self.sampler_type == "balanced":
sampler = BalancedSampler(
dataset,
batch_size,
efficient_mode=self.efficient_mode,
num_replicas=world_size,
rank=rank,
n_repeats=repeat,
)
else:
sampler = ClsIncrSampler(
Expand All @@ -73,6 +80,7 @@ def before_epoch(self, runner):
efficient_mode=self.efficient_mode,
num_replicas=world_size,
rank=rank,
n_repeats=repeat,
)
runner.data_loader = DataLoader(
dataset,
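The point of the repeat hand-off above: if the current dataloader sampler is already an OTXSampler (e.g. installed by adaptive repeat), the replacement class-incremental sampler keeps the same repeat factor instead of silently resetting it to 1 and shrinking the epoch. Condensed illustration of the change, not the full hook body:

sampler = runner.data_loader.sampler
repeat = sampler.repeat if isinstance(sampler, OTXSampler) else 1
new_sampler = BalancedSampler(dataset, batch_size, efficient_mode=self.efficient_mode,
                              num_replicas=world_size, rank=rank, n_repeats=repeat)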
40 changes: 15 additions & 25 deletions src/otx/algorithms/common/adapters/mmcv/utils/config_utils.py
@@ -682,12 +682,12 @@ def set_input_size(self, input_size: Union[int, List[int], Tuple[int, int]]):
self._set_pipeline_size_value(pipelines, resize_ratio)

# Set model size
# - needed only for YOLOX
model_cfg = self._config.get("model", {})
model_cfg["input_size"] = input_size
if model_cfg.get("type", "") == "CustomYOLOX":
# - needed only for YOLOX
if input_size[0] % 32 != 0 or input_size[1] % 32 != 0:
raise ValueError("YOLOX should have input size being multiple of 32.")
model_cfg["input_size"] = input_size

@property
def base_input_size(self) -> Union[Tuple[int, int], Dict[str, Tuple[int, int]]]:
@@ -862,38 +862,28 @@ def _set_size_value(pipeline: Dict, attr: str, scale: Tuple[Union[int, float], U
pipeline[attr] = (round(pipeline[attr][0] * scale[0]), round(pipeline[attr][1] * scale[1]))

@staticmethod
def get_configured_input_size(
input_size_config: InputSizePreset = InputSizePreset.DEFAULT, model_ckpt: Optional[str] = None
) -> Optional[Tuple[int, int]]:
"""Get configurable input size configuration. If it doesn't exist, return None.
def get_trained_input_size(model_ckpt: Optional[str] = None) -> Optional[Tuple[int, int]]:
"""Get trained input size from checkpoint. If it doesn't exist, return None.
Args:
input_size_config (InputSizePreset, optional): Input size setting. Defaults to InputSizePreset.DEFAULT.
model_ckpt (Optional[str], optional): Model weight to load. Defaults to None.
Returns:
Optional[Tuple[int, int]]: Pair of width and height. If there is no input size configuration, return None.
"""
input_size = None
if input_size_config == InputSizePreset.DEFAULT:
if model_ckpt is None:
return None

model_info = torch.load(model_ckpt, map_location="cpu")
for key in ["config", "learning_parameters", "input_size", "value"]:
if key not in model_info:
return None
model_info = model_info[key]
input_size = model_info

if input_size == InputSizePreset.DEFAULT.value:
return None
logger.info("Given model weight was trained with {} input size.".format(input_size))
if model_ckpt is None:
return None

else:
input_size = input_size_config.value
model_info = torch.load(model_ckpt, map_location="cpu")
if model_info is None:
return None

return InputSizePreset.parse(input_size)
input_size = model_info.get("input_size", None)
if not input_size:
return None

logger.info("Given model weight was trained with {} input size.".format(input_size))
return input_size

@staticmethod
def select_closest_size(input_size: Tuple[int, int], preset_sizes: List[Tuple[int, int]]):
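Combined with the "input_size" entry added to save_model earlier in this commit, the simplified lookup above reads the size straight from the top level of the checkpoint dict. A sketch of the round trip (file name and size are made up):

import torch

# Saving side (see the save_model hunk in classification/task.py above).
modelinfo = {"model": {}, "config": "<hyperparams>", "labels": {},
             "input_size": (512, 512), "VERSION": 1}
torch.save(modelinfo, "weights.pth")

# Loading side, equivalent to the new get_trained_input_size.
ckpt = torch.load("weights.pth", map_location="cpu")
input_size = ckpt.get("input_size", None)  # missing/falsy -> fall back to recipe default
print(input_size)  # (512, 512)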