Skip to content

Commit

Permalink
Enhance Datumaro data format detect() to be memory-bounded and performant (#1229)
Browse files Browse the repository at this point in the history

### Summary

- Ticket no. 127136

### How to test
Refer to #1224 for
details on how we obtained the following results.

1. Performance

- Before

```console
Duration for detecting Datumaro data format: 25784.5ms, format=datumaro
```
- After

```console
Duration for detecting Datumaro data format: 5966.8ms, format=datumaro
```

2. Memory usage
- Before

![before](https://github.com/openvinotoolkit/datumaro/assets/26541465/9f6432f7-108d-4d9f-a535-f954bfd55f02)
- After

![after](https://github.com/openvinotoolkit/datumaro/assets/26541465/8ff7a1a4-6106-46cc-9f16-74a4979b8a3b)

### Checklist
<!-- Put an 'x' in all the boxes that apply -->
- [ ] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [x] I have added the description of my changes into
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly.

### License

- [x] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [x] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Signed-off-by: Kim, Vinnam <[email protected]>
  • Loading branch information
vinnamkim authored Dec 18, 2023
1 parent 4c96422 commit 3a5f138
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 24 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1194>)
- Enhance visualizer to toggle plot title visibility
(<https://github.com/openvinotoolkit/datumaro/pull/1228>)
- Enhance Datumaro data format detect() to be memory-bounded and performant
(<https://github.com/openvinotoolkit/datumaro/pull/1229>)

### Bug fixes
- Fix wrong example of Datumaro dataset creation in document
Expand Down
12 changes: 7 additions & 5 deletions src/datumaro/plugins/data_formats/ade20k2020.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from datumaro.components.format_detection import FormatDetectionContext
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Image
from datumaro.rust_api import JsonSectionPageMapper
from datumaro.util import parse_json
from datumaro.util.image import IMAGE_EXTENSIONS, find_images, lazy_image, load_image
from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
Expand Down Expand Up @@ -223,11 +224,12 @@ def detect(cls, context: FormatDetectionContext) -> None:
with context.probe_text_file(
annot_path,
'must be a JSON object with an "annotation" key',
) as f:
contents = parse_json(f.read())
if not isinstance(contents, dict):
raise Exception
if "annotation" not in contents:
):
fpath = osp.join(context.root_path, annot_path)
page_mapper = JsonSectionPageMapper(fpath)
sections = page_mapper.sections()

if "annotation" not in sections.keys():
raise Exception

@classmethod
Expand Down
13 changes: 11 additions & 2 deletions src/datumaro/plugins/data_formats/kinetics.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Video
from datumaro.plugins.data_formats.video import VIDEO_EXTENSIONS
from datumaro.rust_api import JsonSectionPageMapper
from datumaro.util import parse_json, parse_json_file
from datumaro.util.os_util import find_files

Expand Down Expand Up @@ -143,10 +144,18 @@ def detect(cls, context: FormatDetectionContext) -> None:
ann_file,
"JSON file must contain an youtube 'url' key",
) as f:
contents = parse_json(f.read())
fpath = osp.join(context.root_path, ann_file)
page_mapper = JsonSectionPageMapper(fpath)
sections = page_mapper.sections()

page_map = next(iter(sections.values()))
offset, size = page_map["offset"], page_map["size"]

f.seek(offset, 0)
contents = parse_json(f.read(size))
if not isinstance(contents, dict):
raise Exception
if "youtube" not in next(iter(contents.values())).get("url", ""):
if "youtube" not in contents.get("url", ""):
raise Exception

with context.alternative():
Expand Down
49 changes: 32 additions & 17 deletions src/datumaro/plugins/data_formats/segment_anything/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.importer import Importer
from datumaro.errors import DatasetImportError
from datumaro.rust_api import JsonSectionPageMapper
from datumaro.util import parse_json


class SegmentAnythingImporter(Importer):
_N_JSON_TO_TEST = 10
_MAX_ANNOTATION_SECTION_BYTES = 100 * 1024 * 1024 # 100 MiB

@classmethod
def detect(
Expand All @@ -26,24 +28,37 @@ def detect(
with context.probe_text_file(
file, "Annotation format is not Segmentat-Anything format", is_binary_file=True
) as f:
anno = parse_json(f.read())
if (
set(anno.keys()) != {"annotations", "image"}
or (
set(anno["image"].keys())
!= {
"image_id",
"width",
"height",
"file_name",
}
)
or (
anno["annotations"]
and not {"id", "segmentation", "bbox"}.issubset(set(anno["annotations"][0]))
)
):
fpath = os.path.join(context.root_path, file)
page_mapper = JsonSectionPageMapper(fpath)
sections = page_mapper.sections()

if set(sections.keys()) != {"annotations", "image"}:
raise DatasetImportError

offset, size = sections["image"]["offset"], sections["image"]["size"]
f.seek(offset, 0)
img_contents = parse_json(f.read(size))

if set(img_contents.keys()) != {
"image_id",
"width",
"height",
"file_name",
}:
raise DatasetImportError

offset, size = sections["annotations"]["offset"], sections["annotations"]["size"]

if size > cls._MAX_ANNOTATION_SECTION_BYTES:
msg = f"Annotation section is too huge. It exceeded {cls._MAX_ANNOTATION_SECTION_BYTES} bytes."
raise DatasetImportError(msg)

f.seek(offset, 0)
ann_contents = parse_json(f.read(size))

if not {"id", "segmentation", "bbox"}.issubset(set(ann_contents[0])):
raise DatasetImportError

if ctr > cls._N_JSON_TO_TEST:
break

Expand Down

0 comments on commit 3a5f138

Please sign in to comment.