Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AstypeAnnotations Transform #1484

Merged
merged 17 commits into from
May 7, 2024
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1459>)
- Add DOTA data format for oriented object detection task
(<https://github.com/openvinotoolkit/datumaro/pull/1475>)
- Add AstypeAnnotations Transform
(<https://github.com/openvinotoolkit/datumaro/pull/1484>)

### Enhancements
- Fix ambiguous COCO format detector
Expand Down
26 changes: 26 additions & 0 deletions docs/source/docs/command-reference/context_free/transform.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ Basic dataset item manipulations:
- [`remove_images`](#remove_images) - Removes specific images
- [`remove_annotations`](#remove_annotations) - Removes annotations
- [`remove_attributes`](#remove_attributes) - Removes attributes
- [`astype_annotations`](#astype_annotations) - Converts annotation types

Subset manipulations:
- [`random_split`](#random_split) - Splits dataset into subsets
Expand Down Expand Up @@ -498,6 +499,31 @@ Examples:
--id '2010_001705:train' --attr 'occluded'
```

#### `astype_annotations`

Enables the conversion of annotation types for the categories and individual items within a dataset.
This transform only supports tabular datasets. If you want to change annotation types in datasets of other types, please use a different transform.

By default, it converts annotations to 'Label' if they are categorical,
and to 'Caption' if they are of type string, float, or integer.
If a mapping is specified, annotation types are changed according to that mapping.

Usage:
```console
astype_annotations [-h] [--mapping MAPPING]
```

Optional arguments:
- `-h`, `--help` (flag) - Show this help message and exit
- `--mapping` (str) - Annotation type mapping in the form of '<src>:<dst>'; several comma-separated pairs may be given in one value (repeatable)

Examples:
- Convert the types of the `title` and `rating` annotations
```console
datum transform -t astype_annotations -- \
--mapping 'title:text,rating:int'
wonjuleee marked this conversation as resolved.
Show resolved Hide resolved
```

#### `random_split`

Joins all subsets into one and splits the result into few parts.
Expand Down
2 changes: 1 addition & 1 deletion src/datumaro/components/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,7 +1234,7 @@ def columns(self) -> List[str]:
def dtype(self, column: str) -> Optional[Type[TableDtype]]:
"""Returns native python type for a given column"""
numpy_type = self.data.dtypes[column]
if numpy_type == object and self.data[column].nunique() / self.shape[0] < 0.1: # TODO
if self.data[column].nunique() / self.shape[0] < 0.1: # TODO
# Convert to CategoricalDtype for efficient storage and categorical analysis
return pd.api.types.CategoricalDtype()
if numpy_type == object:
Expand Down
5 changes: 5 additions & 0 deletions src/datumaro/plugins/specs.json
Original file line number Diff line number Diff line change
Expand Up @@ -1949,6 +1949,11 @@
"plugin_name": "sort",
"plugin_type": "Transform"
},
{
"import_path": "datumaro.plugins.transforms.AstypeAnnotations",
"plugin_name": "astype_annotations",
"plugin_type": "Transform"
},
{
"import_path": "datumaro.plugins.validators.ClassificationValidator",
"plugin_name": "classification",
Expand Down
101 changes: 98 additions & 3 deletions src/datumaro/plugins/transforms.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2020-2021 Intel Corporation
# Copyright (C) 2020-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT

Expand All @@ -18,6 +18,7 @@
import cv2
import numpy as np
import pycocotools.mask as mask_utils
from pandas.api.types import CategoricalDtype

import datumaro.util.mask_tools as mask_tools
from datumaro.components.annotation import (
Expand All @@ -37,8 +38,8 @@
)
from datumaro.components.cli_plugin import CliPlugin
from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME, DatasetInfo, DatasetItem, IDataset
from datumaro.components.errors import DatumaroError
from datumaro.components.media import Image
from datumaro.components.errors import DatumaroError, MediaTypeError
from datumaro.components.media import Image, TableRow
from datumaro.components.transformer import ItemTransform, Transform
from datumaro.util import NOTSET, filter_dict, parse_json_file, parse_str_enum_value, take_by
from datumaro.util.annotation_util import find_group_leader, find_instances
Expand Down Expand Up @@ -1447,3 +1448,97 @@
)
updated_anns.append(new_ann)
yield item.wrap(annotations=updated_anns)


class AstypeAnnotations(ItemTransform):
    """
    Enables the conversion of annotation types for the categories and individual items within a dataset.|n
    |n
    Based on a specified mapping, it transforms the annotation types,|m
    changing them to 'Label' if they are categorical, and to 'Caption' if they are of type string, float, or integer.|n
    |n
    Examples:|n
        - Convert type of `title` annotation|n

        .. code-block::

        |s|s%(prog)s --mapping 'title:Caption'
    """

    @staticmethod
    def _split_arg(s):
        """Parse one ``--mapping`` value into a list of ``(src, dst)`` pairs.

        The value is a comma-separated list of ``<src>:<dst>`` entries.

        Raises:
            argparse.ArgumentTypeError: if an entry does not consist of
                exactly two ``:``-separated parts.
        """
        results = []
        for column in s.split(","):
            parts = column.split(":")
            if len(parts) != 2:
                # Give argparse a message to show; an empty error leaves the
                # user without any hint about the expected syntax.
                raise argparse.ArgumentTypeError(
                    f"Invalid mapping '{column}': expected the form '<src>:<dst>'"
                )
            results.append((parts[0], parts[1]))
        return results

    @staticmethod
    def _normalize_mapping(mapping):
        """Turn a list-style mapping into a ``dict``.

        Accepts both a flat list of ``(src, dst)`` pairs and the grouped form
        produced by the command line, where every ``--mapping`` occurrence
        contributes its own list of pairs (``action="append"`` combined with
        a ``type`` callable that returns a list).
        """
        pairs = []
        for entry in mapping:
            # A group is a non-empty sequence whose members are themselves
            # sequences (pairs); a plain pair has scalar members.
            if entry and all(isinstance(part, (list, tuple)) for part in entry):
                pairs.extend(entry)
            else:
                pairs.append(entry)
        return dict(pairs)

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        """Extend the base command-line parser with the ``--mapping`` option."""
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument(
            "--mapping",
            action="append",
            type=cls._split_arg,
            dest="mapping",
            help="Annotations type in the form of: '<src>:<dst>' (repeatable)",
        )
        return parser

    def __init__(
        self,
        extractor: IDataset,
        mapping: Optional[Union[Dict[str, str], List[Tuple[str, str]]]] = None,
    ):
        """Prepare label categories for the categorical columns of a tabular dataset.

        Args:
            extractor: Source dataset; its media type, when set, must be a
                subclass of ``TableRow``.
            mapping: Optional ``{src: dst}`` dict, or a list of ``(src, dst)``
                pairs (flat, or grouped per ``--mapping`` occurrence as
                delivered by the command line).

        Raises:
            MediaTypeError: if the dataset media type is not tabular.
        """
        super().__init__(extractor)

        media_type = extractor.media_type()
        if media_type and not issubclass(media_type, TableRow):
            raise MediaTypeError(
                "Media type is not table. This transform only support tabular media"
            )

        # Turn off for default setting
        assert mapping is None or isinstance(mapping, (dict, list)), "Mapping must be dict, or list"
        if isinstance(mapping, list):
            mapping = self._normalize_mapping(mapping)
        # NOTE(review): the mapping is normalized and kept, but the conversion
        # below does not consult it yet - target types are derived from the
        # source column dtype only. Confirm the intended semantics.
        self._mapping = mapping

        self._categories = {}

        src_categories = self._extractor.categories()
        src_tabular_cat = src_categories.get(AnnotationType.tabular)
        self._tabular_cat_types = {}

        # Make LabelCategories
        self._id_mapping = {}
        dst_label_cat = LabelCategories()

        if src_tabular_cat is None:
            return

        for src_cat in src_tabular_cat:
            if src_cat.dtype == CategoricalDtype():
                # Each categorical column becomes a label group; its values
                # become labels whose parent is the column name.
                # NOTE(review): labels are keyed by value only, so two
                # categorical columns sharing a value would collide - verify.
                dst_parent = src_cat.name
                for dst_label in sorted(src_cat.labels):
                    dst_index = dst_label_cat.add(dst_label, parent=dst_parent, attributes={})
                    self._id_mapping[dst_label] = dst_index
                dst_label_cat.add_label_group(src_cat.name, src_cat.labels, group_type=0)
            self._tabular_cat_types[src_cat.name] = src_cat.dtype
        self._categories[AnnotationType.label] = dst_label_cat

    def categories(self):
        """Return the categories produced by this transform."""
        return self._categories

    def transform_item(self, item: DatasetItem):
        """Convert the values of the item's first annotation.

        Values of categorical columns become ``Label`` annotations, every
        other value becomes a ``Caption``. An item without annotations is
        passed through with an empty annotation list.
        """
        if not item.annotations:
            # Nothing to convert; avoid an IndexError on annotations[0].
            return self.wrap_item(item, annotations=[])

        annotations = []
        for name, value in item.annotations[0].values.items():
            dtype = self._tabular_cat_types.get(name, None)
            if dtype == CategoricalDtype():
                annotations.append(Label(label=self._id_mapping[value]))
            else:
                annotations.append(Caption(value))

        return self.wrap_item(item, annotations=annotations)
12 changes: 8 additions & 4 deletions tests/unit/test_tabular_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unittest import TestCase

import pytest
from pandas.api.types import CategoricalDtype

from datumaro.components.annotation import AnnotationType, TabularCategories
from datumaro.components.dataset import Dataset
Expand Down Expand Up @@ -57,7 +58,10 @@ def test_can_import_tabular_file(self, fxt_electricity) -> None:
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_can_import_tabular_folder(self, fxt_buddy) -> None:
dataset: Type[Dataset] = fxt_buddy
expected_categories_keys = [("breed_category", float), ("pet_category", int)]
expected_categories_keys = [
("breed_category", CategoricalDtype()),
("pet_category", CategoricalDtype()),
]

assert [
(cat.name, cat.dtype) for cat in dataset.categories()[AnnotationType.tabular].items
Expand Down Expand Up @@ -110,18 +114,18 @@ def test_can_export_tabular(self, fxt: str, target, request) -> None:
(
{"input": "length(m)", "output": "breed_category"},
["length(m)", "breed_category"],
[("breed_category", float)],
[("breed_category", CategoricalDtype())],
),
(
{"input": "length", "output": "breed_category"},
["breed_category"],
[("breed_category", float)],
[("breed_category", CategoricalDtype())],
),
({"input": "length(m)", "output": "breed"}, ["length(m)"], []),
(
{"input": ["length(m)", "height(cm)"], "output": "breed_category"},
["length(m)", "height(cm)", "breed_category"],
[("breed_category", float)],
[("breed_category", CategoricalDtype())],
),
],
)
Expand Down
Loading
Loading