Turn on/off augmentations for detection and instance segmentation (#4066)

* Add det and inst_segm recipes for configurable augmentation

* Add changelog

* Fixes from comments

* Minor
GalyaZalesskaya authored Oct 31, 2024
1 parent c29632d commit f280604
Showing 23 changed files with 250 additions and 31 deletions.
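
Each transform entry in the recipe diffs below can now carry an `enable` flag next to its `class_path`, so an augmentation can be switched off (or back on) by editing a single line instead of deleting the entry. The sketch below shows one way such a flag could be honored when the train pipeline is assembled; `build_transforms` and the plain config dicts are illustrative assumptions, not the actual OTX loader code.

```python
# Hypothetical sketch of honoring an `enable` flag in a transform config list.
# `build_transforms` and the dict layout are illustrative, not OTX internals.
from importlib import import_module


def build_transforms(transform_cfgs: list[dict]) -> list:
    """Instantiate only the entries whose `enable` flag is not False."""
    transforms = []
    for cfg in transform_cfgs:
        if not cfg.get("enable", True):  # disabled augmentations are skipped
            continue
        module_name, class_name = cfg["class_path"].rsplit(".", 1)
        cls = getattr(import_module(module_name), class_name)
        transforms.append(cls(**cfg.get("init_args", {})))
    return transforms


# GaussianBlur stays off until its `enable` flag is flipped to True.
cfgs = [
    {"class_path": "torchvision.transforms.v2.RandomVerticalFlip"},
    {"class_path": "torchvision.transforms.v2.GaussianBlur",
     "enable": False, "init_args": {"kernel_size": 5}},
]
print([type(t).__name__ for t in build_transforms(cfgs)])  # ['RandomVerticalFlip']
```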
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,8 @@ All notable changes to this project will be documented in this file.

- Turn on/off classification augmentations
(<https://github.com/openvinotoolkit/training_extensions/pull/4039>)
- Turn on/off detection and instance segmentation augmentations
(<https://github.com/openvinotoolkit/training_extensions/pull/4066>)

### Enhancements

3 changes: 1 addition & 2 deletions src/otx/core/data/transform_libs/torchvision.py
@@ -2050,8 +2050,7 @@ def forward(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None:
img_hsv[..., 0] = (img_hsv[..., 0] + hsv_gains[0]) % 180
img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_gains[1], 0, 255)
img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_gains[2], 0, 255)
cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img)

img = cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR)
inputs.image = img
return self.convert(inputs)
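
The only non-recipe change replaces the in-place `cv2.cvtColor(..., dst=img)` call with a plain reassignment. A minimal standalone sketch of the resulting HSV-jitter pattern follows (synthetic data, not the OTX transform class itself); rebinding `img` to the array `cvtColor` returns avoids relying on `img` being a writable buffer of matching size and dtype, which the `dst=` form requires.

```python
# Standalone illustration of the reassignment pattern now used for the
# HSV colour jitter; the gains and image here are synthetic placeholders.
import cv2
import numpy as np

img = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)  # BGR image
hsv_gains = np.random.uniform(-1, 1, 3) * np.array([18, 30, 30])

img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16)
img_hsv[..., 0] = (img_hsv[..., 0] + hsv_gains[0]) % 180
img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_gains[1], 0, 255)
img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_gains[2], 0, 255)

# New form: take the array cvtColor returns instead of writing through dst=img,
# which only works when `img` is a writable buffer of matching size and dtype.
img = cv2.cvtColor(img_hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)
```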

12 changes: 12 additions & 0 deletions src/otx/recipe/_base_/data/detection.yaml
@@ -20,17 +20,29 @@ train_subset:
init_args:
scale: $(input_size)
transform_bbox: true
- class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
enable: false
- class_path: otx.core.data.transform_libs.torchvision.RandomAffine
enable: false
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
init_args:
prob: 0.5
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: torchvision.transforms.v2.ToDtype
init_args:
dtype: ${as_torch_dtype:torch.float32}
- class_path: torchvision.transforms.v2.Normalize
init_args:
mean: [0.0, 0.0, 0.0]
std: [255.0, 255.0, 255.0]
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false
sampler:
class_path: torch.utils.data.RandomSampler
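
detection.yaml now lists several torchvision v2 augmentations with `enable: false`; the standalone sketch below shows what they do once switched on. This is plain torchvision usage rather than the OTX data pipeline; the `sigma` value is an illustrative choice (the recipe sets no `init_args` for `GaussianNoise`), and `v2.GaussianNoise` is only available in recent torchvision releases.

```python
# Plain torchvision-v2 composition of the augmentations the recipe exposes;
# not OTX pipeline code. GaussianNoise expects float input, hence scale=True.
import torch
from torchvision.transforms import v2

train_tf = v2.Compose([
    v2.RandomVerticalFlip(p=0.5),
    v2.GaussianBlur(kernel_size=5),          # matches `kernel_size: 5` in the recipe
    v2.ToDtype(torch.float32, scale=True),   # uint8 [0, 255] -> float32 [0, 1]
    v2.GaussianNoise(sigma=0.05),            # sigma chosen for illustration only
])

img = torch.randint(0, 256, (3, 320, 320), dtype=torch.uint8)
out = train_tf(img)
print(out.shape, out.dtype)  # torch.Size([3, 320, 320]) torch.float32
```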

12 changes: 12 additions & 0 deletions src/otx/recipe/_base_/data/detection_tile.yaml
@@ -23,17 +23,29 @@ train_subset:
scale: $(input_size)
keep_ratio: false
transform_bbox: true
- class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
enable: false
- class_path: otx.core.data.transform_libs.torchvision.RandomAffine
enable: false
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
init_args:
prob: 0.5
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: torchvision.transforms.v2.ToDtype
init_args:
dtype: ${as_torch_dtype:torch.float32}
- class_path: torchvision.transforms.v2.Normalize
init_args:
mean: [0.0, 0.0, 0.0]
std: [255.0, 255.0, 255.0]
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false
sampler:
class_path: torch.utils.data.RandomSampler

12 changes: 12 additions & 0 deletions src/otx/recipe/_base_/data/instance_segmentation.yaml
@@ -22,6 +22,10 @@ train_subset:
transform_bbox: true
transform_mask: true
scale: $(input_size)
- class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
enable: false
- class_path: otx.core.data.transform_libs.torchvision.RandomAffine
enable: false
- class_path: otx.core.data.transform_libs.torchvision.Pad
init_args:
pad_to_square: true
@@ -30,13 +34,21 @@ train_subset:
init_args:
prob: 0.5
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: torchvision.transforms.v2.ToDtype
init_args:
dtype: ${as_torch_dtype:torch.float32}
- class_path: torchvision.transforms.v2.Normalize
init_args:
mean: [123.675, 116.28, 103.53]
std: [58.395, 57.12, 57.375]
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false
sampler:
class_path: torch.utils.data.RandomSampler

20 changes: 15 additions & 5 deletions src/otx/recipe/detection/rtdetr_101.yaml
@@ -58,26 +58,36 @@ overrides:
batch_size: 4
to_tv_image: true
transforms:
- class_path: torchvision.transforms.v2.RandomPhotometricDistort
init_args:
p: 0.5
- class_path: torchvision.transforms.v2.RandomZoomOut
init_args:
fill: 0
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
- class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
init_args:
prob: 0.5
hue_delta: 13
- class_path: otx.core.data.transform_libs.torchvision.Resize
init_args:
scale: $(input_size)
keep_ratio: false
transform_bbox: true
- class_path: otx.core.data.transform_libs.torchvision.RandomAffine
enable: false
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
init_args:
prob: 0.5
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: torchvision.transforms.v2.ToDtype
init_args:
dtype: ${as_torch_dtype:torch.float32}
scale: true
- class_path: torchvision.transforms.v2.SanitizeBoundingBoxes
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false
sampler:
class_path: otx.algo.samplers.balanced_sampler.BalancedSampler

20 changes: 15 additions & 5 deletions src/otx/recipe/detection/rtdetr_18.yaml
@@ -57,26 +57,36 @@ overrides:
batch_size: 4
to_tv_image: true
transforms:
- class_path: torchvision.transforms.v2.RandomPhotometricDistort
init_args:
p: 0.5
- class_path: torchvision.transforms.v2.RandomZoomOut
init_args:
fill: 0
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
- class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
init_args:
prob: 0.5
hue_delta: 13
- class_path: otx.core.data.transform_libs.torchvision.Resize
init_args:
scale: $(input_size)
keep_ratio: false
transform_bbox: true
- class_path: otx.core.data.transform_libs.torchvision.RandomAffine
enable: false
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
init_args:
prob: 0.5
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: torchvision.transforms.v2.ToDtype
init_args:
dtype: ${as_torch_dtype:torch.float32}
scale: true
- class_path: torchvision.transforms.v2.SanitizeBoundingBoxes
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false
sampler:
class_path: otx.algo.samplers.balanced_sampler.BalancedSampler

20 changes: 15 additions & 5 deletions src/otx/recipe/detection/rtdetr_50.yaml
@@ -58,26 +58,36 @@ overrides:
batch_size: 4
to_tv_image: true
transforms:
- class_path: torchvision.transforms.v2.RandomPhotometricDistort
init_args:
p: 0.5
- class_path: torchvision.transforms.v2.RandomZoomOut
init_args:
fill: 0
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
- class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
init_args:
prob: 0.5
hue_delta: 13
- class_path: otx.core.data.transform_libs.torchvision.Resize
init_args:
scale: $(input_size)
keep_ratio: false
transform_bbox: true
- class_path: otx.core.data.transform_libs.torchvision.RandomAffine
enable: false
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
init_args:
prob: 0.5
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: torchvision.transforms.v2.ToDtype
init_args:
dtype: ${as_torch_dtype:torch.float32}
scale: true
- class_path: torchvision.transforms.v2.SanitizeBoundingBoxes
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false
sampler:
class_path: otx.algo.samplers.balanced_sampler.BalancedSampler

13 changes: 13 additions & 0 deletions src/otx/recipe/detection/rtmdet_tiny.yaml
@@ -60,10 +60,21 @@ overrides:
- class_path: otx.core.data.transform_libs.torchvision.RandomCrop
init_args:
crop_size: $(input_size)
- class_path: otx.core.data.transform_libs.torchvision.RandomAffine
enable: false
- class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
enable: false
- class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
init_args:
prob: 0.5
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: otx.core.data.transform_libs.torchvision.Pad
init_args:
size: $(input_size)
@@ -85,6 +96,8 @@ overrides:
init_args:
mean: [103.53, 116.28, 123.675]
std: [57.375, 57.12, 58.395]
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false

val_subset:
batch_size: 8
10 changes: 10 additions & 0 deletions src/otx/recipe/detection/ssd_mobilenetv2.yaml
@@ -48,17 +48,27 @@ overrides:
init_args:
scale: $(input_size)
transform_bbox: true
- class_path: otx.core.data.transform_libs.torchvision.RandomAffine
enable: false
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
init_args:
prob: 0.5
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: torchvision.transforms.v2.ToDtype
init_args:
dtype: ${as_torch_dtype:torch.float32}
- class_path: torchvision.transforms.v2.Normalize
init_args:
mean: [0.0, 0.0, 0.0]
std: [255.0, 255.0, 255.0]
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false
sampler:
class_path: otx.algo.samplers.balanced_sampler.BalancedSampler

13 changes: 12 additions & 1 deletion src/otx/recipe/detection/yolov9_c.yaml
@@ -65,15 +65,24 @@ overrides:
prob: 0.5
random_pop: false
max_cached_images: 10
- class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
- class_path: otx.core.data.transform_libs.torchvision.Resize
init_args:
scale: $(input_size)
keep_ratio: true
transform_bbox: true
- class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
enable: false
- class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
init_args:
prob: 0.5
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: otx.core.data.transform_libs.torchvision.Pad
init_args:
pad_to_square: true
@@ -86,6 +95,8 @@ overrides:
init_args:
mean: [0.0, 0.0, 0.0]
std: [255.0, 255.0, 255.0]
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false
sampler:
class_path: otx.algo.samplers.balanced_sampler.BalancedSampler

13 changes: 12 additions & 1 deletion src/otx/recipe/detection/yolov9_m.yaml
@@ -65,15 +65,24 @@ overrides:
prob: 0.5
random_pop: false
max_cached_images: 10
- class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
- class_path: otx.core.data.transform_libs.torchvision.Resize
init_args:
scale: $(input_size)
keep_ratio: true
transform_bbox: true
- class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
enable: false
- class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
init_args:
prob: 0.5
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: otx.core.data.transform_libs.torchvision.Pad
init_args:
pad_to_square: true
@@ -86,6 +95,8 @@ overrides:
init_args:
mean: [0.0, 0.0, 0.0]
std: [255.0, 255.0, 255.0]
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false
sampler:
class_path: otx.algo.samplers.balanced_sampler.BalancedSampler

10 changes: 10 additions & 0 deletions src/otx/recipe/detection/yolov9_s.yaml
@@ -58,6 +58,8 @@ overrides:
scale: $(input_size)
keep_ratio: true
transform_bbox: true
- class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
enable: false
- class_path: otx.core.data.transform_libs.torchvision.RandomFlip
init_args:
prob: 0.5
@@ -66,13 +68,21 @@ overrides:
pad_to_square: true
pad_val: 114
is_numpy_to_tvtensor: true
- class_path: torchvision.transforms.v2.RandomVerticalFlip
enable: false
- class_path: torchvision.transforms.v2.GaussianBlur
enable: false
init_args:
kernel_size: 5
- class_path: torchvision.transforms.v2.ToDtype
init_args:
dtype: ${as_torch_dtype:torch.float32}
- class_path: torchvision.transforms.v2.Normalize
init_args:
mean: [0.0, 0.0, 0.0]
std: [255.0, 255.0, 255.0]
- class_path: torchvision.transforms.v2.GaussianNoise
enable: false
sampler:
class_path: otx.algo.samplers.balanced_sampler.BalancedSampler
