Enhance Datumaro data format detect() to be memory-bounded and performant (#1229)

### Summary

- Ticket no. 127136

### How to test

Refer to #1224 for details on how we obtained the following results.

1. Performance
   - Before
     ```console
     Duration for detecting Datumaro data format: 25784.5ms, format=datumaro
     ```
   - After
     ```console
     Duration for detecting Datumaro data format: 5966.8ms, format=datumaro
     ```
2. Memory usage
   - Before
     ![before](https://github.com/openvinotoolkit/datumaro/assets/26541465/9f6432f7-108d-4d9f-a535-f954bfd55f02)
   - After
     ![after](https://github.com/openvinotoolkit/datumaro/assets/26541465/8ff7a1a4-6106-46cc-9f16-74a4979b8a3b)

### Checklist

- [ ] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly

### License

- [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern.
- [x] I have updated the license header for each file (see an example below).

  ```python
  # Copyright (C) 2023 Intel Corporation
  #
  # SPDX-License-Identifier: MIT
  ```

---------

Signed-off-by: Kim, Vinnam <[email protected]>
openvinotoolkit · Dec 18, 2023 · 3a5f138 · 3a5f138
1 parent 4c96422
commit 3a5f138
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 24 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1194>)
 - Enhance visualizer to toggle plot title visibility
   (<https://github.com/openvinotoolkit/datumaro/pull/1228>)
+- Enhance Datumaro data format detect() to be memory-bounded and performant
+  (<https://github.com/openvinotoolkit/datumaro/pull/1229>)
 
 ### Bug fixes
 - Fix wrong example of Datumaro dataset creation in document

diff --git a/src/datumaro/plugins/data_formats/ade20k2020.py b/src/datumaro/plugins/data_formats/ade20k2020.py
@@ -23,6 +23,7 @@
 from datumaro.components.format_detection import FormatDetectionContext
 from datumaro.components.importer import ImportContext, Importer
 from datumaro.components.media import Image
+from datumaro.rust_api import JsonSectionPageMapper
 from datumaro.util import parse_json
 from datumaro.util.image import IMAGE_EXTENSIONS, find_images, lazy_image, load_image
 from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
@@ -223,11 +224,12 @@ def detect(cls, context: FormatDetectionContext) -> None:
         with context.probe_text_file(
             annot_path,
             'must be a JSON object with an "annotation" key',
-        ) as f:
-            contents = parse_json(f.read())
-            if not isinstance(contents, dict):
-                raise Exception
-            if "annotation" not in contents:
+        ):
+            fpath = osp.join(context.root_path, annot_path)
+            page_mapper = JsonSectionPageMapper(fpath)
+            sections = page_mapper.sections()
+
+            if "annotation" not in sections.keys():
                 raise Exception
 
     @classmethod

diff --git a/src/datumaro/plugins/data_formats/kinetics.py b/src/datumaro/plugins/data_formats/kinetics.py
@@ -14,6 +14,7 @@
 from datumaro.components.importer import ImportContext, Importer
 from datumaro.components.media import Video
 from datumaro.plugins.data_formats.video import VIDEO_EXTENSIONS
+from datumaro.rust_api import JsonSectionPageMapper
 from datumaro.util import parse_json, parse_json_file
 from datumaro.util.os_util import find_files
 
@@ -143,10 +144,18 @@ def detect(cls, context: FormatDetectionContext) -> None:
                     ann_file,
                     "JSON file must contain an youtube 'url' key",
                 ) as f:
-                    contents = parse_json(f.read())
+                    fpath = osp.join(context.root_path, ann_file)
+                    page_mapper = JsonSectionPageMapper(fpath)
+                    sections = page_mapper.sections()
+
+                    page_map = next(iter(sections.values()))
+                    offset, size = page_map["offset"], page_map["size"]
+
+                    f.seek(offset, 0)
+                    contents = parse_json(f.read(size))
                     if not isinstance(contents, dict):
                         raise Exception
-                    if "youtube" not in next(iter(contents.values())).get("url", ""):
+                    if "youtube" not in contents.get("url", ""):
                         raise Exception
 
             with context.alternative():

diff --git a/src/datumaro/plugins/data_formats/segment_anything/importer.py b/src/datumaro/plugins/data_formats/segment_anything/importer.py
@@ -8,11 +8,13 @@
 from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
 from datumaro.components.importer import Importer
 from datumaro.errors import DatasetImportError
+from datumaro.rust_api import JsonSectionPageMapper
 from datumaro.util import parse_json
 
 
 class SegmentAnythingImporter(Importer):
     _N_JSON_TO_TEST = 10
+    _MAX_ANNOTATION_SECTION_BYTES = 100 * 1024 * 1024  # 100 MiB
 
     @classmethod
     def detect(
@@ -26,24 +28,37 @@ def detect(
             with context.probe_text_file(
                 file, "Annotation format is not Segmentat-Anything format", is_binary_file=True
             ) as f:
-                anno = parse_json(f.read())
-                if (
-                    set(anno.keys()) != {"annotations", "image"}
-                    or (
-                        set(anno["image"].keys())
-                        != {
-                            "image_id",
-                            "width",
-                            "height",
-                            "file_name",
-                        }
-                    )
-                    or (
-                        anno["annotations"]
-                        and not {"id", "segmentation", "bbox"}.issubset(set(anno["annotations"][0]))
-                    )
-                ):
+                fpath = os.path.join(context.root_path, file)
+                page_mapper = JsonSectionPageMapper(fpath)
+                sections = page_mapper.sections()
+
+                if set(sections.keys()) != {"annotations", "image"}:
+                    raise DatasetImportError
+
+                offset, size = sections["image"]["offset"], sections["image"]["size"]
+                f.seek(offset, 0)
+                img_contents = parse_json(f.read(size))
+
+                if set(img_contents.keys()) != {
+                    "image_id",
+                    "width",
+                    "height",
+                    "file_name",
+                }:
+                    raise DatasetImportError
+
+                offset, size = sections["annotations"]["offset"], sections["annotations"]["size"]
+
+                if size > cls._MAX_ANNOTATION_SECTION_BYTES:
+                    msg = f"Annotation section is too huge. It exceeded {cls._MAX_ANNOTATION_SECTION_BYTES} bytes."
+                    raise DatasetImportError(msg)
+
+                f.seek(offset, 0)
+                ann_contents = parse_json(f.read(size))
+
+                if not {"id", "segmentation", "bbox"}.issubset(set(ann_contents[0])):
                     raise DatasetImportError
+
             if ctr > cls._N_JSON_TO_TEST:
                 break