From 4eb7c27d7fad00ac886ffdefc6c152909fa28f23 Mon Sep 17 00:00:00 2001
From: Akshita Bhagia
Date: Thu, 29 Jul 2021 18:05:28 -0700
Subject: [PATCH] Updating vision reader to also produce class probs and labels (#293)

* updating vision reader to also produce class probs and labels

* removing duplicate code

* removing unnecessary import

* oops, fix
---
 CHANGELOG.md                                     |  4 +
 .../vision/dataset_readers/flickr30k.py         |  6 +-
 allennlp_models/vision/dataset_readers/gqa.py   | 12 ++-
 .../vision/dataset_readers/nlvr2.py             | 22 +++--
 .../vision/dataset_readers/vgqa.py              | 10 +-
 .../vision/dataset_readers/vision_reader.py     | 98 ++++++++++++++-----
 .../dataset_readers/visual_entailment.py        |  8 +-
 .../vision/dataset_readers/vqav2.py             | 12 ++-
 8 files changed, 124 insertions(+), 48 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 96b2d65b6..a8e4612b8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Fixed tests for Spacy versions greater than 3.1
 
+### Changed
+
+- Updated `VisionReader` to yield all of `RegionDetectorOutput`'s keys in processing.
+
 ## [v2.6.0](https://github.com/allenai/allennlp-models/releases/tag/v2.6.0) - 2021-07-19
 
 ### Added
diff --git a/allennlp_models/vision/dataset_readers/flickr30k.py b/allennlp_models/vision/dataset_readers/flickr30k.py
index 1b4771259..d1086f77d 100644
--- a/allennlp_models/vision/dataset_readers/flickr30k.py
+++ b/allennlp_models/vision/dataset_readers/flickr30k.py
@@ -199,7 +199,9 @@ def _read(self, file_path: str):
             full_file_path = os.path.join(self.data_dir, filename)
             caption_dicts.append(get_caption_data(full_file_path))
 
-        processed_images: Iterable[Optional[Tuple[Tensor, Tensor]]]
+        processed_images: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
         filenames = [f"{caption_dict['image_id']}.jpg" for caption_dict in caption_dicts]
         try:
             processed_images = self._process_image_paths(
@@ -221,7 +223,7 @@ def _read(self, file_path: str):
         averaged_features_list = []
         coordinates_list = []
         masks_list = []
-        for features, coords in processed_images:
+        for features, coords, _, _ in processed_images:
             features_list.append(TensorField(features))
             averaged_features_list.append(torch.mean(features, dim=0))
             coordinates_list.append(TensorField(coords))
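
The pattern above repeats through the rest of the patch: `_process_image_paths` now yields four-tuples, and call sites that only need features and coordinates discard the trailing entries with `_` placeholders. A minimal runnable sketch of that contract, with a stand-in generator and invented tensor shapes (none of this code is from the patch itself):

    import torch

    def fake_process_image_paths(paths):
        # Stand-in for VisionReader._process_image_paths after this patch:
        # yields (features, coords, class_probs, class_labels) per image.
        for _ in paths:
            yield (
                torch.rand(36, 2048),           # box features (shape assumed)
                torch.rand(36, 4),              # box coordinates
                torch.rand(36, 1600),           # class probs; may be None
                torch.randint(0, 1600, (36,)),  # class labels; may be None
            )

    # Readers that only need the first two entries unpack with placeholders:
    for features, coords, _, _ in fake_process_image_paths(["img1.jpg"]):
        print(features.shape, coords.shape)
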
diff --git a/allennlp_models/vision/dataset_readers/gqa.py b/allennlp_models/vision/dataset_readers/gqa.py
index 565bf142a..802d13a55 100644
--- a/allennlp_models/vision/dataset_readers/gqa.py
+++ b/allennlp_models/vision/dataset_readers/gqa.py
@@ -133,7 +133,9 @@ def _read(self, split_or_filename: str):
                 )
             )
 
-        processed_images: Iterable[Optional[Tuple[Tensor, Tensor]]]
+        processed_images: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
         if self.produce_featurized_images:
             # It would be much easier to just process one image at a time, but it's faster to process
             # them in batches. So this code gathers up instances until it has enough to fill up a batch
@@ -170,7 +172,7 @@ def _read(self, split_or_filename: str):
     def text_to_instance(
         self,  # type: ignore
         question: str,
-        image: Optional[Union[str, Tuple[Tensor, Tensor]]],
+        image: Optional[Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]],
         answer: Optional[Dict[str, str]] = None,
         *,
         use_cache: bool = True,
@@ -195,9 +197,11 @@ def text_to_instance(
 
         if image is not None:
             if isinstance(image, str):
-                features, coords = next(self._process_image_paths([image], use_cache=use_cache))
+                features, coords, _, _ = next(
+                    self._process_image_paths([image], use_cache=use_cache)
+                )
             else:
-                features, coords = image
+                features, coords, _, _ = image
             fields["box_features"] = ArrayField(features)
             fields["box_coordinates"] = ArrayField(coords)
             fields["box_mask"] = ArrayField(
diff --git a/allennlp_models/vision/dataset_readers/nlvr2.py b/allennlp_models/vision/dataset_readers/nlvr2.py
index 659142fc4..ed19974c6 100644
--- a/allennlp_models/vision/dataset_readers/nlvr2.py
+++ b/allennlp_models/vision/dataset_readers/nlvr2.py
@@ -125,8 +125,12 @@ def _read(self, split_or_filename: str):
             blobs.append(json_blob)
         blob_dicts = list(self.shard_iterable(blobs))
 
-        processed_images1: Iterable[Optional[Tuple[Tensor, Tensor]]]
-        processed_images2: Iterable[Optional[Tuple[Tensor, Tensor]]]
+        processed_images1: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
+        processed_images2: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
         if self.produce_featurized_images:
             # It would be much easier to just process one image at a time, but it's faster to process
             # them in batches. So this code gathers up instances until it has enough to fill up a batch
@@ -169,11 +173,15 @@ def _read(self, split_or_filename: str):
                     yield instance
             logger.info(f"Successfully yielded {attempted_instances} instances")
 
-    def extract_image_features(self, image: Union[str, Tuple[Tensor, Tensor]], use_cache: bool):
+    def extract_image_features(
+        self,
+        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
+        use_cache: bool,
+    ):
         if isinstance(image, str):
-            features, coords = next(self._process_image_paths([image], use_cache=use_cache))
+            features, coords, _, _ = next(self._process_image_paths([image], use_cache=use_cache))
         else:
-            features, coords = image
+            features, coords, _, _ = image
 
         return (
             ArrayField(features),
@@ -190,8 +198,8 @@ def text_to_instance(
         self,  # type: ignore
         identifier: Optional[str],
         hypothesis: str,
-        image1: Union[str, Tuple[Tensor, Tensor]],
-        image2: Union[str, Tuple[Tensor, Tensor]],
+        image1: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
+        image2: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
         label: bool,
         use_cache: bool = True,
     ) -> Instance:
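
After unpacking, gqa.py and nlvr2.py wrap the tensors in fields, including a `box_mask` with one entry per detected box. A plain-torch sketch of that masking idea (the shapes and the all-ones mask are assumptions for illustration, not code from this repository):

    import torch

    features = torch.rand(36, 2048)  # 36 boxes with 2048-d features (assumed)
    coords = torch.rand(36, 4)

    # One boolean per box; batching/padding later flips extra entries to False.
    box_mask = torch.ones(features.shape[0], dtype=torch.bool)
    assert box_mask.shape == (36,)
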
diff --git a/allennlp_models/vision/dataset_readers/vgqa.py b/allennlp_models/vision/dataset_readers/vgqa.py
index 8f2df21a7..f3bb3199c 100644
--- a/allennlp_models/vision/dataset_readers/vgqa.py
+++ b/allennlp_models/vision/dataset_readers/vgqa.py
@@ -139,7 +139,9 @@ def _read(self, file_path: str):
             questions = questions[question_slice]
 
         question_dicts = list(self.shard_iterable(questions))
-        processed_images: Iterable[Optional[Tuple[Tensor, Tensor]]]
+        processed_images: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
         if self.produce_featurized_images:
             # It would be much easier to just process one image at a time, but it's faster to process
             # them in batches. So this code gathers up instances until it has enough to fill up a batch
@@ -196,7 +198,7 @@ def text_to_instance(
         qa_id: int,
         question: str,
         answer: Optional[str],
-        image: Union[str, Tuple[Tensor, Tensor]],
+        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
         use_cache: bool = True,
         keep_impossible_questions: bool = True,
     ) -> Optional[Instance]:
@@ -207,9 +209,9 @@ def text_to_instance(
         }
 
         if isinstance(image, str):
-            features, coords = next(self._process_image_paths([image], use_cache=use_cache))
+            features, coords, _, _ = next(self._process_image_paths([image], use_cache=use_cache))
         else:
-            features, coords = image
+            features, coords, _, _ = image
 
         fields["box_features"] = ArrayField(features)
         fields["box_coordinates"] = ArrayField(coords)
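
The vision_reader.py diff below also removes the duplicated cache-construction blocks mentioned in the commit message ("removing duplicate code") by routing all four caches through one `_create_cache` helper. A dictionary-backed sketch of that lazily-initialized-property pattern, with `TensorCache` replaced by a plain dict so the example runs anywhere (class and attribute names here are illustrative):

    import os
    from typing import MutableMapping, Optional

    class Caches:
        def __init__(self, cache_dir: Optional[str] = None):
            self.cache_dir = cache_dir
            self._features: Optional[MutableMapping[str, str]] = None

        def _create_cache(self, name: str, cache_dir: Optional[str]) -> MutableMapping[str, str]:
            if cache_dir is None:
                return {}  # no cache dir: fall back to an in-memory mapping
            os.makedirs(cache_dir, exist_ok=True)
            return {}  # the real helper returns a TensorCache under cache_dir/name

        @property
        def features(self) -> MutableMapping[str, str]:
            # Built on first access, then reused; the patch repeats this shape
            # for features, coordinates, class_probs, and class_labels.
            if self._features is None:
                self._features = self._create_cache("features", self.cache_dir)
            return self._features

    print(Caches().features)  # {} -- the in-memory fallback
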
diff --git a/allennlp_models/vision/dataset_readers/vision_reader.py b/allennlp_models/vision/dataset_readers/vision_reader.py
index 705e8aaec..621f62c81 100644
--- a/allennlp_models/vision/dataset_readers/vision_reader.py
+++ b/allennlp_models/vision/dataset_readers/vision_reader.py
@@ -122,6 +122,9 @@ def __init__(
         # feature cache
         self.feature_cache_dir = feature_cache_dir
         self.coordinates_cache_dir = feature_cache_dir
+        self.class_probs_cache_dir = feature_cache_dir
+        self.class_labels_cache_dir = feature_cache_dir
+
         if feature_cache_dir:
             self.write_to_cache = write_to_cache
         else:
@@ -130,6 +133,8 @@ def __init__(
             self.write_to_cache = True
         self._feature_cache_instance: Optional[MutableMapping[str, Tensor]] = None
         self._coordinates_cache_instance: Optional[MutableMapping[str, Tensor]] = None
+        self._class_probs_cache_instance: Optional[MutableMapping[str, Tensor]] = None
+        self._class_labels_cache_instance: Optional[MutableMapping[str, Tensor]] = None
 
         # image processors
         self.image_loader = None
@@ -206,37 +211,52 @@ def region_detector(self) -> Optional[RegionDetector]:
                 self._region_detector.eval()  # type: ignore[attr-defined]
         return self._region_detector  # type: ignore[return-value]
 
+    def _create_cache(
+        self,
+        cache_name: str,
+        cache_dir: Optional[Union[str, PathLike]] = None,
+    ) -> MutableMapping[str, Tensor]:
+        if cache_dir is None:
+            return {}
+        os.makedirs(cache_dir, exist_ok=True)
+        return TensorCache(
+            os.path.join(cache_dir, cache_name),
+            read_only=not self.write_to_cache,
+        )
+
     @property
     def _feature_cache(self) -> MutableMapping[str, Tensor]:
         if self._feature_cache_instance is None:
-            if self.feature_cache_dir is None:
-                self._feature_cache_instance = {}
-            else:
-                os.makedirs(self.feature_cache_dir, exist_ok=True)
-                self._feature_cache_instance = TensorCache(
-                    os.path.join(self.feature_cache_dir, "features"),
-                    read_only=not self.write_to_cache,
-                )
-
+            self._feature_cache_instance = self._create_cache("features", self.feature_cache_dir)
         return self._feature_cache_instance
 
     @property
     def _coordinates_cache(self) -> MutableMapping[str, Tensor]:
         if self._coordinates_cache_instance is None:
-            if self.coordinates_cache_dir is None:
-                self._coordinates_cache_instance = {}
-            else:
-                os.makedirs(self.feature_cache_dir, exist_ok=True)  # type: ignore
-                self._coordinates_cache_instance = TensorCache(
-                    os.path.join(self.feature_cache_dir, "coordinates"),  # type: ignore
-                    read_only=not self.write_to_cache,
-                )
-
+            self._coordinates_cache_instance = self._create_cache(
+                "coordinates", self.coordinates_cache_dir
+            )
         return self._coordinates_cache_instance
 
+    @property
+    def _class_probs_cache(self) -> MutableMapping[str, Tensor]:
+        if self._class_probs_cache_instance is None:
+            self._class_probs_cache_instance = self._create_cache(
+                "class_probs", self.class_probs_cache_dir
+            )
+        return self._class_probs_cache_instance
+
+    @property
+    def _class_labels_cache(self) -> MutableMapping[str, Tensor]:
+        if self._class_labels_cache_instance is None:
+            self._class_labels_cache_instance = self._create_cache(
+                "class_labels", self.class_labels_cache_dir
+            )
+        return self._class_labels_cache_instance
+
     def _process_image_paths(
         self, image_paths: Iterable[str], *, use_cache: bool = True
-    ) -> Iterator[Tuple[Tensor, Tensor]]:
+    ) -> Iterator[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]:
         """
         Processes the given image paths and returns featurized images.
 
@@ -258,7 +278,7 @@ def _process_image_paths(
                 "an image featurizer, and a region detector."
             )
 
-        batch: List[Union[str, Tuple[Tensor, Tensor]]] = []
+        batch: List[Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]] = []
        unprocessed_paths: Set[str] = set()
 
         def yield_batch():
@@ -272,16 +292,44 @@ def yield_batch():
             detector_results = self.region_detector(images, sizes, featurized_images)
             features = detector_results.features
             coordinates = detector_results.boxes
+            class_probs = detector_results.class_probs
+            class_labels = detector_results.class_labels
 
             # store the processed results in memory, so we can complete the batch
-            paths_to_tensors = {path: (features[i], coordinates[i]) for i, path in enumerate(paths)}
+            paths_to_tensors = {}
+            for i, path in enumerate(paths):
+                if class_probs:
+                    class_probs_tensor = class_probs[i]
+                else:
+                    class_probs_tensor = None
+
+                if class_labels:
+                    class_labels_tensor = class_labels[i]
+                else:
+                    class_labels_tensor = None
+
+                paths_to_tensors[path] = (
+                    features[i],
+                    coordinates[i],
+                    class_probs_tensor,
+                    class_labels_tensor,
+                )
 
             # store the processed results in the cache
             if use_cache and self.write_to_cache:
-                for path, (features, coordinates) in paths_to_tensors.items():
+                for path, (
+                    features,
+                    coordinates,
+                    class_probs,
+                    class_labels,
+                ) in paths_to_tensors.items():
                     basename = os.path.basename(path)
                     self._feature_cache[basename] = features
                     self._coordinates_cache[basename] = coordinates
+                    if class_probs is not None:
+                        self._class_probs_cache[basename] = class_probs
+                    if class_labels is not None:
+                        self._class_labels_cache[basename] = class_labels
 
             # yield the batch
             for b in batch:
@@ -296,10 +344,12 @@ def yield_batch():
                 if use_cache:
                     features: Tensor = self._feature_cache[basename]
                     coordinates: Tensor = self._coordinates_cache[basename]
+                    class_probs: Optional[Tensor] = self._class_probs_cache.get(basename)
+                    class_labels: Optional[Tensor] = self._class_labels_cache.get(basename)
                     if len(batch) <= 0:
-                        yield features, coordinates
+                        yield features, coordinates, class_probs, class_labels
                     else:
-                        batch.append((features, coordinates))
+                        batch.append((features, coordinates, class_probs, class_labels))
                 else:
                     # If we're not using the cache, we pretend we had a cache miss here.
                     raise KeyError
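
One subtlety in the hunk above: features and coordinates are still read with indexing, so a missing image raises `KeyError` and triggers re-featurization, while the two new entries are read with `.get`, so feature caches written before this patch simply come back as `None`. A plain-dictionary sketch of the difference (values invented for illustration):

    feature_cache = {"img1.jpg": "features"}  # entry exists
    class_probs_cache = {}                    # pre-patch caches have no class_probs

    features = feature_cache["img1.jpg"]             # required: KeyError on a miss
    class_probs = class_probs_cache.get("img1.jpg")  # optional: None on a miss
    assert class_probs is None
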
diff --git a/allennlp_models/vision/dataset_readers/visual_entailment.py b/allennlp_models/vision/dataset_readers/visual_entailment.py
index 9a7c2ca36..f03eeaf2b 100644
--- a/allennlp_models/vision/dataset_readers/visual_entailment.py
+++ b/allennlp_models/vision/dataset_readers/visual_entailment.py
@@ -76,7 +76,7 @@ def _read(self, file_path: str):
     @overrides
     def text_to_instance(
         self,  # type: ignore
-        image: Union[str, Tuple[Tensor, Tensor]],
+        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
         hypothesis: str,
         label: Optional[str] = None,
         *,
@@ -90,9 +90,11 @@ def text_to_instance(
 
         if image is not None:
             if isinstance(image, str):
-                features, coords = next(self._process_image_paths([image], use_cache=use_cache))
+                features, coords, _, _ = next(
+                    self._process_image_paths([image], use_cache=use_cache)
+                )
             else:
-                features, coords = image
+                features, coords, _, _ = image
 
             fields["box_features"] = ArrayField(features)
             fields["box_coordinates"] = ArrayField(coords)
diff --git a/allennlp_models/vision/dataset_readers/vqav2.py b/allennlp_models/vision/dataset_readers/vqav2.py
index be1567151..91c6a7325 100644
--- a/allennlp_models/vision/dataset_readers/vqav2.py
+++ b/allennlp_models/vision/dataset_readers/vqav2.py
@@ -231,7 +231,9 @@ class Split(NamedTuple):
                 questions = questions[question_slice]
 
         question_dicts = list(self.shard_iterable(questions))
-        processed_images: Iterable[Optional[Tuple[Tensor, Tensor]]]
+        processed_images: Iterable[
+            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
+        ]
         if self.produce_featurized_images:
             # It would be much easier to just process one image at a time, but it's faster to process
             # them in batches. So this code gathers up instances until it has enough to fill up a batch
@@ -279,7 +281,7 @@ class Split(NamedTuple):
     def text_to_instance(
         self,  # type: ignore
         question: str,
-        image: Union[str, Tuple[Tensor, Tensor]],
+        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
         answer_counts: Optional[MutableMapping[str, int]] = None,
         *,
         use_cache: bool = True,
@@ -293,9 +295,11 @@ class Split(NamedTuple):
 
         if image is not None:
             if isinstance(image, str):
-                features, coords = next(self._process_image_paths([image], use_cache=use_cache))
+                features, coords, _, _ = next(
+                    self._process_image_paths([image], use_cache=use_cache)
+                )
             else:
-                features, coords = image
+                features, coords, _, _ = image
 
             fields["box_features"] = ArrayField(features)
             fields["box_coordinates"] = ArrayField(coords)
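
Taken together, every reader now accepts either an image path or a precomputed four-tuple, and nothing downstream of `box_features`/`box_coordinates` changes. A final sketch of the tuple form a caller might pass to `text_to_instance` (values invented for illustration; the `None` entries mirror what a pre-patch cache would produce):

    import torch

    precomputed = (
        torch.rand(36, 2048),  # box features
        torch.rand(36, 4),     # box coordinates
        None,                  # class_probs: absent, e.g. from an old cache
        None,                  # class_labels: absent as well
    )
    features, coords, _, _ = precomputed  # the unpacking each reader performs
    print(features.shape[0], "boxes")
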