Updating vision reader to also produce class probs and labels (#293)

* updating vision reader to also produce class probs and labels
* removing duplicate code
* removing unnecessary import
* oops, fix
allenai · Jul 30, 2021 · 4eb7c27 · 4eb7c27
1 parent 7b7b9c1
commit 4eb7c27
Show file tree

Hide file tree

Showing 8 changed files with 124 additions and 48 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Fixed tests for Spacy versions greater than 3.1
 
+### Changed
+
+- Updated `VisionReader` to yield all of `RegionDetectorOutput`'s keys in processing.
+
 ## [v2.6.0](https://github.com/allenai/allennlp-models/releases/tag/v2.6.0) - 2021-07-19
 
 ### Added

diff --git a/allennlp_models/vision/dataset_readers/flickr30k.py b/allennlp_models/vision/dataset_readers/flickr30k.py
@@ -199,7 +199,9 @@ def _read(self, file_path: str):
                 full_file_path = os.path.join(self.data_dir, filename)
                 caption_dicts.append(get_caption_data(full_file_path))
 
-        processed_images: Iterable[Optional[Tuple[Tensor, Tensor]]]
+        processed_images: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
         filenames = [f"{caption_dict['image_id']}.jpg" for caption_dict in caption_dicts]
         try:
             processed_images = self._process_image_paths(
@@ -221,7 +223,7 @@ def _read(self, file_path: str):
         averaged_features_list = []
         coordinates_list = []
         masks_list = []
-        for features, coords in processed_images:
+        for features, coords, _, _ in processed_images:
             features_list.append(TensorField(features))
             averaged_features_list.append(torch.mean(features, dim=0))
             coordinates_list.append(TensorField(coords))

diff --git a/allennlp_models/vision/dataset_readers/gqa.py b/allennlp_models/vision/dataset_readers/gqa.py
@@ -133,7 +133,9 @@ def _read(self, split_or_filename: str):
                 )
             )
 
-            processed_images: Iterable[Optional[Tuple[Tensor, Tensor]]]
+            processed_images: Iterable[
+                Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+            ]
             if self.produce_featurized_images:
                 # It would be much easier to just process one image at a time, but it's faster to process
                 # them in batches. So this code gathers up instances until it has enough to fill up a batch
@@ -170,7 +172,7 @@ def _read(self, split_or_filename: str):
     def text_to_instance(
         self,  # type: ignore
         question: str,
-        image: Optional[Union[str, Tuple[Tensor, Tensor]]],
+        image: Optional[Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]],
         answer: Optional[Dict[str, str]] = None,
         *,
         use_cache: bool = True,
@@ -195,9 +197,11 @@ def text_to_instance(
 
         if image is not None:
             if isinstance(image, str):
-                features, coords = next(self._process_image_paths([image], use_cache=use_cache))
+                features, coords, _, _ = next(
+                    self._process_image_paths([image], use_cache=use_cache)
+                )
             else:
-                features, coords = image
+                features, coords, _, _ = image
             fields["box_features"] = ArrayField(features)
             fields["box_coordinates"] = ArrayField(coords)
             fields["box_mask"] = ArrayField(

diff --git a/allennlp_models/vision/dataset_readers/nlvr2.py b/allennlp_models/vision/dataset_readers/nlvr2.py
@@ -125,8 +125,12 @@ def _read(self, split_or_filename: str):
             blobs.append(json_blob)
 
         blob_dicts = list(self.shard_iterable(blobs))
-        processed_images1: Iterable[Optional[Tuple[Tensor, Tensor]]]
-        processed_images2: Iterable[Optional[Tuple[Tensor, Tensor]]]
+        processed_images1: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
+        processed_images2: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
         if self.produce_featurized_images:
             # It would be much easier to just process one image at a time, but it's faster to process
             # them in batches. So this code gathers up instances until it has enough to fill up a batch
@@ -169,11 +173,15 @@ def _read(self, split_or_filename: str):
                 yield instance
         logger.info(f"Successfully yielded {attempted_instances} instances")
 
-    def extract_image_features(self, image: Union[str, Tuple[Tensor, Tensor]], use_cache: bool):
+    def extract_image_features(
+        self,
+        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
+        use_cache: bool,
+    ):
         if isinstance(image, str):
-            features, coords = next(self._process_image_paths([image], use_cache=use_cache))
+            features, coords, _, _ = next(self._process_image_paths([image], use_cache=use_cache))
         else:
-            features, coords = image
+            features, coords, _, _ = image
 
         return (
             ArrayField(features),
@@ -190,8 +198,8 @@ def text_to_instance(
         self,  # type: ignore
         identifier: Optional[str],
         hypothesis: str,
-        image1: Union[str, Tuple[Tensor, Tensor]],
-        image2: Union[str, Tuple[Tensor, Tensor]],
+        image1: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
+        image2: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
         label: bool,
         use_cache: bool = True,
     ) -> Instance:

diff --git a/allennlp_models/vision/dataset_readers/vgqa.py b/allennlp_models/vision/dataset_readers/vgqa.py
@@ -139,7 +139,9 @@ def _read(self, file_path: str):
         questions = questions[question_slice]
 
         question_dicts = list(self.shard_iterable(questions))
-        processed_images: Iterable[Optional[Tuple[Tensor, Tensor]]]
+        processed_images: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
         if self.produce_featurized_images:
             # It would be much easier to just process one image at a time, but it's faster to process
             # them in batches. So this code gathers up instances until it has enough to fill up a batch
@@ -196,7 +198,7 @@ def text_to_instance(
         qa_id: int,
         question: str,
         answer: Optional[str],
-        image: Union[str, Tuple[Tensor, Tensor]],
+        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
         use_cache: bool = True,
         keep_impossible_questions: bool = True,
     ) -> Optional[Instance]:
@@ -207,9 +209,9 @@ def text_to_instance(
         }
 
         if isinstance(image, str):
-            features, coords = next(self._process_image_paths([image], use_cache=use_cache))
+            features, coords, _, _ = next(self._process_image_paths([image], use_cache=use_cache))
         else:
-            features, coords = image
+            features, coords, _, _ = image
 
         fields["box_features"] = ArrayField(features)
         fields["box_coordinates"] = ArrayField(coords)

diff --git a/allennlp_models/vision/dataset_readers/vision_reader.py b/allennlp_models/vision/dataset_readers/vision_reader.py
@@ -122,6 +122,9 @@ def __init__(
         # feature cache
         self.feature_cache_dir = feature_cache_dir
         self.coordinates_cache_dir = feature_cache_dir
+        self.class_probs_cache_dir = feature_cache_dir
+        self.class_labels_cache_dir = feature_cache_dir
+
         if feature_cache_dir:
             self.write_to_cache = write_to_cache
         else:
@@ -130,6 +133,8 @@ def __init__(
             self.write_to_cache = True
         self._feature_cache_instance: Optional[MutableMapping[str, Tensor]] = None
         self._coordinates_cache_instance: Optional[MutableMapping[str, Tensor]] = None
+        self._class_probs_cache_instance: Optional[MutableMapping[str, Tensor]] = None
+        self._class_labels_cache_instance: Optional[MutableMapping[str, Tensor]] = None
 
         # image processors
         self.image_loader = None
@@ -206,37 +211,52 @@ def region_detector(self) -> Optional[RegionDetector]:
             self._region_detector.eval()  # type: ignore[attr-defined]
         return self._region_detector  # type: ignore[return-value]
 
+    def _create_cache(
+        self,
+        cache_name: str,
+        cache_dir: Optional[Union[str, PathLike]] = None,
+    ) -> MutableMapping[str, Tensor]:
+        if cache_dir is None:
+            return {}
+        os.makedirs(cache_dir, exist_ok=True)
+        return TensorCache(
+            os.path.join(cache_dir, cache_name),
+            read_only=not self.write_to_cache,
+        )
+
     @property
     def _feature_cache(self) -> MutableMapping[str, Tensor]:
         if self._feature_cache_instance is None:
-            if self.feature_cache_dir is None:
-                self._feature_cache_instance = {}
-            else:
-                os.makedirs(self.feature_cache_dir, exist_ok=True)
-                self._feature_cache_instance = TensorCache(
-                    os.path.join(self.feature_cache_dir, "features"),
-                    read_only=not self.write_to_cache,
-                )
-
+            self._feature_cache_instance = self._create_cache("features", self.feature_cache_dir)
         return self._feature_cache_instance
 
     @property
     def _coordinates_cache(self) -> MutableMapping[str, Tensor]:
         if self._coordinates_cache_instance is None:
-            if self.coordinates_cache_dir is None:
-                self._coordinates_cache_instance = {}
-            else:
-                os.makedirs(self.feature_cache_dir, exist_ok=True)  # type: ignore
-                self._coordinates_cache_instance = TensorCache(
-                    os.path.join(self.feature_cache_dir, "coordinates"),  # type: ignore
-                    read_only=not self.write_to_cache,
-                )
-
+            self._coordinates_cache_instance = self._create_cache(
+                "coordinates", self.coordinates_cache_dir
+            )
         return self._coordinates_cache_instance
 
+    @property
+    def _class_probs_cache(self) -> MutableMapping[str, Tensor]:
+        if self._class_probs_cache_instance is None:
+            self._class_probs_cache_instance = self._create_cache(
+                "class_probs", self.class_probs_cache_dir
+            )
+        return self._class_probs_cache_instance
+
+    @property
+    def _class_labels_cache(self) -> MutableMapping[str, Tensor]:
+        if self._class_labels_cache_instance is None:
+            self._class_labels_cache_instance = self._create_cache(
+                "class_labels", self.class_labels_cache_dir
+            )
+        return self._class_labels_cache_instance
+
     def _process_image_paths(
         self, image_paths: Iterable[str], *, use_cache: bool = True
-    ) -> Iterator[Tuple[Tensor, Tensor]]:
+    ) -> Iterator[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]:
         """
         Processes the given image paths and returns featurized images.
 
@@ -258,7 +278,7 @@ def _process_image_paths(
             "an image featurizer, and a region detector."
         )
 
-        batch: List[Union[str, Tuple[Tensor, Tensor]]] = []
+        batch: List[Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]] = []
         unprocessed_paths: Set[str] = set()
 
         def yield_batch():
@@ -272,16 +292,44 @@ def yield_batch():
                 detector_results = self.region_detector(images, sizes, featurized_images)
                 features = detector_results.features
                 coordinates = detector_results.boxes
+                class_probs = detector_results.class_probs
+                class_labels = detector_results.class_labels
 
             # store the processed results in memory, so we can complete the batch
-            paths_to_tensors = {path: (features[i], coordinates[i]) for i, path in enumerate(paths)}
+            paths_to_tensors = {}
+            for i, path in enumerate(paths):
+                if class_probs:
+                    class_probs_tensor = class_probs[i]
+                else:
+                    class_probs_tensor = None
+
+                if class_labels:
+                    class_labels_tensor = class_labels[i]
+                else:
+                    class_labels_tensor = None
+
+                paths_to_tensors[path] = (
+                    features[i],
+                    coordinates[i],
+                    class_probs_tensor,
+                    class_labels_tensor,
+                )
 
             # store the processed results in the cache
             if use_cache and self.write_to_cache:
-                for path, (features, coordinates) in paths_to_tensors.items():
+                for path, (
+                    features,
+                    coordinates,
+                    class_probs,
+                    class_labels,
+                ) in paths_to_tensors.items():
                     basename = os.path.basename(path)
                     self._feature_cache[basename] = features
                     self._coordinates_cache[basename] = coordinates
+                    if class_probs is not None:
+                        self._class_probs_cache[basename] = class_probs
+                    if class_labels is not None:
+                        self._class_labels_cache[basename] = class_labels
 
             # yield the batch
             for b in batch:
@@ -296,10 +344,12 @@ def yield_batch():
                 if use_cache:
                     features: Tensor = self._feature_cache[basename]
                     coordinates: Tensor = self._coordinates_cache[basename]
+                    class_probs: Optional[Tensor] = self._class_probs_cache.get(basename)
+                    class_labels: Optional[Tensor] = self._class_labels_cache.get(basename)
                     if len(batch) <= 0:
-                        yield features, coordinates
+                        yield features, coordinates, class_probs, class_labels
                     else:
-                        batch.append((features, coordinates))
+                        batch.append((features, coordinates, class_probs, class_labels))
                 else:
                     # If we're not using the cache, we pretend we had a cache miss here.
                     raise KeyError

diff --git a/allennlp_models/vision/dataset_readers/visual_entailment.py b/allennlp_models/vision/dataset_readers/visual_entailment.py
@@ -76,7 +76,7 @@ def _read(self, file_path: str):
     @overrides
     def text_to_instance(
         self,  # type: ignore
-        image: Union[str, Tuple[Tensor, Tensor]],
+        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
         hypothesis: str,
         label: Optional[str] = None,
         *,
@@ -90,9 +90,11 @@ def text_to_instance(
 
         if image is not None:
             if isinstance(image, str):
-                features, coords = next(self._process_image_paths([image], use_cache=use_cache))
+                features, coords, _, _ = next(
+                    self._process_image_paths([image], use_cache=use_cache)
+                )
             else:
-                features, coords = image
+                features, coords, _, _ = image
 
             fields["box_features"] = ArrayField(features)
             fields["box_coordinates"] = ArrayField(coords)

diff --git a/allennlp_models/vision/dataset_readers/vqav2.py b/allennlp_models/vision/dataset_readers/vqav2.py
@@ -231,7 +231,9 @@ class Split(NamedTuple):
         questions = questions[question_slice]
 
         question_dicts = list(self.shard_iterable(questions))
-        processed_images: Iterable[Optional[Tuple[Tensor, Tensor]]]
+        processed_images: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
         if self.produce_featurized_images:
             # It would be much easier to just process one image at a time, but it's faster to process
             # them in batches. So this code gathers up instances until it has enough to fill up a batch
@@ -279,7 +281,7 @@ class Split(NamedTuple):
     def text_to_instance(
         self,  # type: ignore
         question: str,
-        image: Union[str, Tuple[Tensor, Tensor]],
+        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
         answer_counts: Optional[MutableMapping[str, int]] = None,
         *,
         use_cache: bool = True,
@@ -293,9 +295,11 @@ def text_to_instance(
 
         if image is not None:
             if isinstance(image, str):
-                features, coords = next(self._process_image_paths([image], use_cache=use_cache))
+                features, coords, _, _ = next(
+                    self._process_image_paths([image], use_cache=use_cache)
+                )
             else:
-                features, coords = image
+                features, coords, _, _ = image
 
             fields["box_features"] = ArrayField(features)
             fields["box_coordinates"] = ArrayField(coords)