openvinotoolkit · sooahleex · May 7, 2024 · Apr 17, 2024 · Apr 23, 2024 · Apr 23, 2024
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1459>)
 - Add DOTA data format for oriented object detection task
   (<https://github.com/openvinotoolkit/datumaro/pull/1475>)
+- Add AstypeAnnotations Transform
+  (<https://github.com/openvinotoolkit/datumaro/pull/1484>)
 
 ### Enhancements
 - Fix ambiguous COCO format detector

@@ -101,6 +101,7 @@ Basic dataset item manipulations:
 - [`remove_images`](#remove_images) - Removes specific images
 - [`remove_annotations`](#remove_annotations) - Removes annotations
 - [`remove_attributes`](#remove_attributes) - Removes attributes
+- [`astype_annotations`](#astype_annotations) - Converts annotation types
 
 Subset manipulations:
 - [`random_split`](#random_split) - Splits dataset into subsets
@@ -498,6 +499,31 @@ Examples:
     --id '2010_001705:train' --attr 'occluded'
   ```
 
+#### `astype_annotations`
+
+Enables the conversion of annotation types for the categories and individual items within a dataset.
+This transform only supports tabular datasets. If you want to change annotation types in datasets of other types, please use a different transform.
+
+By default, it converts annotations to 'Label' if they are categorical,
+and to 'Caption' if they are of type string, float, or integer.
+If a mapping is specified, annotation types are converted according to that mapping.
+
+Usage:
+```console
+astype_annotations [-h] [--mapping MAPPING]
+```
+
+Optional arguments:
+- `-h`, `--help` (flag) - Show this help message and exit
+- `--mapping` (str) - Annotation type mapping in the form of: '<src>:<dst>'; several comma-separated pairs may be given in one value (repeatable)
+
+Examples:
+- Convert the types of the `title` and `rating` annotations
+  ```console
+  datum transform -t astype_annotations -- \
+    --mapping 'title:text,rating:int'
+  ```
+
 #### `random_split`
 
 Joins all subsets into one and splits the result into few parts.

@@ -1234,7 +1234,7 @@ def columns(self) -> List[str]:
     def dtype(self, column: str) -> Optional[Type[TableDtype]]:
         """Returns native python type for a given column"""
         numpy_type = self.data.dtypes[column]
-        if numpy_type == object and self.data[column].nunique() / self.shape[0] < 0.1:  # TODO
+        if self.data[column].nunique() / self.shape[0] < 0.1:  # TODO
             # Convert to CategoricalDtype for efficient storage and categorical analysis
             return pd.api.types.CategoricalDtype()
         if numpy_type == object:

@@ -1949,6 +1949,11 @@
     "plugin_name": "sort",
     "plugin_type": "Transform"
   },
+  {
+    "import_path": "datumaro.plugins.transforms.AstypeAnnotations",
+    "plugin_name": "astype_annotations",
+    "plugin_type": "Transform"
+  },
   {
     "import_path": "datumaro.plugins.validators.ClassificationValidator",
     "plugin_name": "classification",

@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 Intel Corporation
+# Copyright (C) 2020-2024 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 
@@ -18,6 +18,7 @@
 import cv2
 import numpy as np
 import pycocotools.mask as mask_utils
+from pandas.api.types import CategoricalDtype
 
 import datumaro.util.mask_tools as mask_tools
 from datumaro.components.annotation import (
@@ -37,8 +38,8 @@
 )
 from datumaro.components.cli_plugin import CliPlugin
 from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, DatasetInfo, DatasetItem, IDataset
-from datumaro.components.errors import DatumaroError
-from datumaro.components.media import Image
+from datumaro.components.errors import DatumaroError, MediaTypeError
+from datumaro.components.media import Image, TableRow
 from datumaro.components.transformer import ItemTransform, Transform
 from datumaro.util import NOTSET, filter_dict, parse_json_file, parse_str_enum_value, take_by
 from datumaro.util.annotation_util import find_group_leader, find_instances
@@ -1447,3 +1448,97 @@
                         )
                     updated_anns.append(new_ann)
                 yield item.wrap(annotations=updated_anns)
+
+
+class AstypeAnnotations(ItemTransform):
+    """
+    Enables the conversion of annotation types for the categories and individual items within a dataset.|n
+    This transform only supports tabular datasets.|n
+    |n
+    By default, it transforms the annotation types,|m
+    changing them to 'Label' if they are categorical, and to 'Caption' if they are of type string, float, or integer.|n
+    |n
+    Examples:|n
+        - Convert type of `title` annotation|n
+
+        .. code-block::
+
+        |s|s%(prog)s --mapping 'title:Caption'
+    """
+
+    @staticmethod
+    def _split_arg(s):
+        """Parse a comma-separated string of '<src>:<dst>' pairs into a list of (src, dst) tuples.
+
+        Raises:
+            argparse.ArgumentTypeError: if an entry does not consist of exactly
+                two ':'-separated parts.
+        """
+        # Imported locally so this parser does not depend on a module-level import.
+        import argparse
+
+        results = []
+        for column in s.split(","):
+            parts = column.split(":")
+            if len(parts) != 2:
+                raise argparse.ArgumentTypeError(
+                    "Expected a mapping entry in the form '<src>:<dst>', got '%s'" % column
+                )
+            results.append((parts[0], parts[1]))
+        return results
+
+    @classmethod
+    def build_cmdline_parser(cls, **kwargs):
+        """Extend the base command line parser with the repeatable `--mapping` option."""
+        parser = super().build_cmdline_parser(**kwargs)
+        parser.add_argument(
+            "--mapping",
+            action="append",
+            type=cls._split_arg,
+            dest="mapping",
+            help="Annotations type in the form of: '<src>:<dst>' (repeatable)",
+        )
+        return parser
+
+    def __init__(
+        self,
+        extractor: IDataset,
+        mapping: Optional[Union[Dict[str, str], List[Tuple[str, str]]]] = None,
+    ):
+        """
+        Args:
+            extractor: Source dataset. If it reports a media type, that type must be
+                a subclass of TableRow.
+            mapping: Optional '<src>' -> '<dst>' mapping, given as a dict, as a list of
+                pairs, or as the list of pair lists produced by the command line parser.
+
+        Raises:
+            MediaTypeError: if the dataset media type is not tabular.
+        """
+        super().__init__(extractor)
+
+        media_type = extractor.media_type()
+        if media_type and not issubclass(media_type, TableRow):
+            raise MediaTypeError(
+                "Media type is not table. This transform only support tabular media"
+            )
+
+        # Turn off for default setting
+        assert mapping is None or isinstance(mapping, (dict, list)), "Mapping must be dict, or list"
+        if isinstance(mapping, list):
+            # The command line parser combines action="append" with a type callable that
+            # returns a list, so each "--mapping" occurrence contributes a *list of pairs*.
+            # Flatten those groups; plain (src, dst) pairs passed via the API are kept as is.
+            pairs = []
+            for entry in mapping:
+                if isinstance(entry, list) and all(isinstance(p, (tuple, list)) for p in entry):
+                    pairs.extend(entry)
+                else:
+                    pairs.append(entry)
+            mapping = dict(pairs)
+        # NOTE(review): the normalized mapping is not consulted below; the conversion is
+        # driven only by the dtype recorded for each tabular category.
+
+        self._categories = {}
+
+        src_categories = self._extractor.categories()
+        src_tabular_cat = src_categories.get(AnnotationType.tabular)
+        self._tabular_cat_types = {}
+
+        # Make LabelCategories
+        self._id_mapping = {}
+        dst_label_cat = LabelCategories()
+
+        # Without tabular categories there is nothing to convert; categories stay empty.
+        if src_tabular_cat is None:
+            return
+
+        for src_cat in src_tabular_cat:
+            if src_cat.dtype == CategoricalDtype():
+                # Every value of a categorical column becomes a label whose parent is the column.
+                dst_parent = src_cat.name
+                dst_labels = sorted(src_cat.labels)
+                for dst_label in dst_labels:
+                    dst_index = dst_label_cat.add(dst_label, parent=dst_parent, attributes={})
+                    self._id_mapping[dst_label] = dst_index
+                dst_label_cat.add_label_group(src_cat.name, src_cat.labels, group_type=0)
+            # Remember the dtype of every column so transform_item can pick the target type.
+            self._tabular_cat_types[src_cat.name] = src_cat.dtype
+        self._categories[AnnotationType.label] = dst_label_cat
+
+    def categories(self):
+        """Return the categories built in the constructor."""
+        return self._categories
+
+    def transform_item(self, item: DatasetItem):
+        """Replace the item's first annotation by one Label or Caption per contained value.
+
+        Values of columns recorded as categorical become Label annotations (looked up in
+        the label index built in the constructor); all other values become Caption
+        annotations. An item without annotations is returned with an empty annotation list.
+        """
+        if not item.annotations:
+            # Nothing to convert; avoid indexing into an empty annotation list.
+            return self.wrap_item(item, annotations=[])
+
+        annotations = []
+        # NOTE(review): only the first annotation is read, and it is expected to expose
+        # a `values` dict of column name -> value - confirm against the tabular importer.
+        for name, value in item.annotations[0].values.items():
+            dtype = self._tabular_cat_types.get(name, None)
+            if dtype == CategoricalDtype():
+                annotations.append(Label(label=self._id_mapping[value]))
+            else:
+                annotations.append(Caption(value))
+
+        return self.wrap_item(item, annotations=annotations)
@@ -5,6 +5,7 @@
 from unittest import TestCase
 
 import pytest
+from pandas.api.types import CategoricalDtype
 
 from datumaro.components.annotation import AnnotationType, TabularCategories
 from datumaro.components.dataset import Dataset
@@ -57,7 +58,10 @@ def test_can_import_tabular_file(self, fxt_electricity) -> None:
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)
     def test_can_import_tabular_folder(self, fxt_buddy) -> None:
         dataset: Type[Dataset] = fxt_buddy
-        expected_categories_keys = [("breed_category", float), ("pet_category", int)]
+        expected_categories_keys = [
+            ("breed_category", CategoricalDtype()),
+            ("pet_category", CategoricalDtype()),
+        ]
 
         assert [
             (cat.name, cat.dtype) for cat in dataset.categories()[AnnotationType.tabular].items
@@ -110,18 +114,18 @@ def test_can_export_tabular(self, fxt: str, target, request) -> None:
             (
                 {"input": "length(m)", "output": "breed_category"},
                 ["length(m)", "breed_category"],
-                [("breed_category", float)],
+                [("breed_category", CategoricalDtype())],
             ),
             (
                 {"input": "length", "output": "breed_category"},
                 ["breed_category"],
-                [("breed_category", float)],
+                [("breed_category", CategoricalDtype())],
             ),
             ({"input": "length(m)", "output": "breed"}, ["length(m)"], []),
             (
                 {"input": ["length(m)", "height(cm)"], "output": "breed_category"},
                 ["length(m)", "height(cm)", "breed_category"],
-                [("breed_category", float)],
+                [("breed_category", CategoricalDtype())],
             ),
         ],
     )