
Commit

Nlvr2 (#265)
* Add a dataset reader, two models, and training configs for NLVR2

Co-authored-by: Pete <[email protected]>
Co-authored-by: epwalsh <[email protected]>
3 people committed Jun 4, 2021
1 parent e395e63 commit 664f38f
Showing 42 changed files with 1,083 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Added

- Added support for NLVR2 visual entailment, including a data loader, two models, and training configs.


## [v2.5.0](https://github.com/allenai/allennlp-models/releases/tag/v2.5.0) - 2021-06-03

2 changes: 2 additions & 0 deletions README.md
@@ -153,6 +153,8 @@ Here is a list of pre-trained models currently available.
- [`mc-roberta-commonsenseqa`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/mc-roberta-commonsenseqa.json) - RoBERTa-based multiple choice model for CommonSenseQA.
- [`mc-roberta-piqa`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/mc-roberta-piqa.json) - RoBERTa-based multiple choice model for PIQA.
- [`mc-roberta-swag`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/mc-roberta-swag.json) - RoBERTa-based multiple choice model for SWAG.
- [`nlvr2-vilbert-head`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/nlvr2-vilbert-head.json) - ViLBERT-based model for Visual Entailment, with an NLVR2-specific model head.
- [`nlvr2-vilbert`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/nlvr2-vilbert.json) - ViLBERT-based model for Visual Entailment.
- [`pair-classification-binary-gender-bias-mitigated-roberta-snli`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/pair-classification-binary-gender-bias-mitigated-roberta-snli.json) - RoBERTa finetuned on SNLI with binary gender bias mitigation.
- [`pair-classification-decomposable-attention-elmo`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/pair-classification-decomposable-attention-elmo.json) - The decomposable attention model (Parikh et al, 2017) combined with ELMo embeddings trained on SNLI.
- [`pair-classification-esim`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/pair-classification-esim.json) - Enhanced LSTM trained on SNLI.
66 changes: 66 additions & 0 deletions allennlp_models/modelcards/nlvr2-vilbert-head.json
@@ -0,0 +1,66 @@
{
"id": "nlvr2-vilbert",
"registered_model_name": "nlvr2",
"registered_predictor_name": null,
"display_name": "Visual Entailment - NLVR2",
"task_id": "nlvr2",
"model_details": {
"description": "This model uses a VilBERT-based backbone with an NLVR2-specific model head. The image features are obtained using the ResNet backbone and Faster RCNN (region detection).",
"short_description": "ViLBERT-based model for Visual Entailment.",
"developed_by": "Lu et al",
"contributed_by": "Jacob Morrison",
"date": "2021-05-27",
"version": "2",
"model_type": "ViLBERT based on BERT large",
"paper": {
"citation": "\n@inproceedings{Lu2019ViLBERTPT,\ntitle={ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks},\nauthor={Jiasen Lu and Dhruv Batra and D. Parikh and Stefan Lee},\nbooktitle={NeurIPS},\nyear={2019}",
"title": "ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks",
"url": "https://api.semanticscholar.org/CorpusID:199453025"
},
"license": null,
"contact": "[email protected]"
},
"intended_use": {
"primary_uses": "This model is developed for the AllenNLP demo.",
"primary_users": null,
"out_of_scope_use_cases": null
},
"factors": {
"relevant_factors": null,
"evaluation_factors": null
},
"metrics": {
"model_performance_measures": "Accuracy and F1-score",
"decision_thresholds": null,
"variation_approaches": null
},
"evaluation_data": {
"dataset": {
"name": "Natural Language for Visual Reasoning For Real dev set",
"url": "https://github.com/lil-lab/nlvr/tree/master/nlvr2",
"notes": "Evaluation requires a large amount of images to be accessible locally, so we cannot provide a command you can easily copy and paste."
},
"motivation": null,
"preprocessing": null
},
"training_data": {
"dataset": {
"name": "Natural Language for Visual Reasoning For Real train set",
"url": "https://github.com/lil-lab/nlvr/tree/master/nlvr2"
},
"motivation": null,
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": "On the validation set:\nF1: 33.7%\nAccuracy: 50.8%.\nThese scores do not match the performance in the 12-in-1 paper because this was trained as a standalone task, not as part of a multitask setup. Please contact us if you want to match those scores!",
"intersectional_results": null
},
"model_ethical_considerations": {
"ethical_considerations": null
},
"model_usage": {
"archive_file": "vilbert-nlvr2-head-2021.06.01.tar.gz",
"training_config": "vilbert_nlvr2_pretrained.jsonnet",
"install_instructions": "pip install allennlp>=2.5.1 allennlp-models>=2.5.1"
}
}
66 changes: 66 additions & 0 deletions allennlp_models/modelcards/nlvr2-vilbert.json
@@ -0,0 +1,66 @@
{
"id": "nlvr2-vilbert",
"registered_model_name": "nlvr2",
"registered_predictor_name": null,
"display_name": "Visual Entailment - NLVR2",
"task_id": "nlvr2",
"model_details": {
"description": "This model is based on the ViLBERT multitask architecture. The image features are obtained using the ResNet backbone and Faster RCNN (region detection).",
"short_description": "ViLBERT-based model for Visual Entailment.",
"developed_by": "Lu et al",
"contributed_by": "Jacob Morrison",
"date": "2021-05-27",
"version": "2",
"model_type": "ViLBERT based on BERT large",
"paper": {
"citation": "\n@inproceedings{Lu2019ViLBERTPT,\ntitle={ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks},\nauthor={Jiasen Lu and Dhruv Batra and D. Parikh and Stefan Lee},\nbooktitle={NeurIPS},\nyear={2019}",
"title": "ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks",
"url": "https://api.semanticscholar.org/CorpusID:199453025"
},
"license": null,
"contact": "[email protected]"
},
"intended_use": {
"primary_uses": "This model is developed for the AllenNLP demo.",
"primary_users": null,
"out_of_scope_use_cases": null
},
"factors": {
"relevant_factors": null,
"evaluation_factors": null
},
"metrics": {
"model_performance_measures": "Accuracy and F1-score",
"decision_thresholds": null,
"variation_approaches": null
},
"evaluation_data": {
"dataset": {
"name": "Natural Language for Visual Reasoning For Real dev set",
"url": "https://github.com/lil-lab/nlvr/tree/master/nlvr2",
"notes": "Evaluation requires a large amount of images to be accessible locally, so we cannot provide a command you can easily copy and paste."
},
"motivation": null,
"preprocessing": null
},
"training_data": {
"dataset": {
"name": "Natural Language for Visual Reasoning For Real train set",
"url": "https://github.com/lil-lab/nlvr/tree/master/nlvr2"
},
"motivation": null,
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": "On the validation set:\nF1: 33.7%\nAccuracy: 50.8%.\nThese scores do not match the performance in the 12-in-1 paper because this was trained as a standalone task, not as part of a multitask setup. Please contact us if you want to match those scores!",
"intersectional_results": null
},
"model_ethical_considerations": {
"ethical_considerations": null
},
"model_usage": {
"archive_file": "vilbert-nlvr2-2021.06.01.tar.gz",
"training_config": "vilbert_nlvr2_pretrained.jsonnet",
"install_instructions": "pip install allennlp>=2.5.1 allennlp-models>=2.5.1"
}
}
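
Neither model card ships a copy-paste command, but a downloaded archive can be loaded with AllenNLP's generic predictor API. The following is a hypothetical sketch, not part of this commit: the local archive path is an assumption (the cards list only file names), and because the cards register no predictor for "nlvr2", `Predictor.from_path` may need an explicit `predictor_name`.

# Hypothetical usage sketch (not from this commit). Assumes the archive named
# in the model card has already been downloaded to the working directory.
from allennlp.predictors import Predictor
import allennlp_models.vision  # noqa: F401 -- importing registers the "nlvr2" model

predictor = Predictor.from_path("vilbert-nlvr2-2021.06.01.tar.gz")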
1 change: 1 addition & 0 deletions allennlp_models/vision/dataset_readers/__init__.py
@@ -1,5 +1,6 @@
from allennlp_models.vision.dataset_readers.vision_reader import VisionReader
from allennlp_models.vision.dataset_readers.gqa import GQAReader
from allennlp_models.vision.dataset_readers.nlvr2 import Nlvr2Reader
from allennlp_models.vision.dataset_readers.vgqa import VGQAReader
from allennlp_models.vision.dataset_readers.vqav2 import VQAv2Reader
from allennlp_models.vision.dataset_readers.visual_entailment import VisualEntailmentReader
220 changes: 220 additions & 0 deletions allennlp_models/vision/dataset_readers/nlvr2.py
@@ -0,0 +1,220 @@
import logging
from os import PathLike
from typing import Any, Dict, Iterable, Tuple, Union, Optional

from overrides import overrides
import torch
from torch import Tensor

from allennlp.common.file_utils import cached_path, json_lines_from_file
from allennlp.common.lazy import Lazy
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import ArrayField, LabelField, ListField, MetadataField, TextField
from allennlp.data.image_loader import ImageLoader
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Tokenizer
from allennlp.modules.vision.grid_embedder import GridEmbedder
from allennlp.modules.vision.region_detector import RegionDetector

from allennlp_models.vision.dataset_readers.vision_reader import VisionReader

logger = logging.getLogger(__name__)


@DatasetReader.register("nlvr2")
class Nlvr2Reader(VisionReader):
"""
Reads the NLVR2 dataset from [http://lil.nlp.cornell.edu/nlvr/](http://lil.nlp.cornell.edu/nlvr/).
In this task, the model is presented with two images and a hypothesis referring to those images.
The task for the model is to identify whether the hypothesis is true or false.
Accordingly, the instances produced by this reader contain two images, featurized into the
fields "box_features" and "box_coordinates". In addition to that, it produces a `TextField`
called "hypothesis", and a `MetadataField` called "identifier". The latter contains the question
id from the question set.
Parameters
----------
image_dir: `str`
Path to directory containing `png` image files.
image_loader: `ImageLoader`
        An image loader to read the images with.
image_featurizer: `GridEmbedder`
The backbone image processor (like a ResNet), whose output will be passed to the region
detector for finding object boxes in the image.
region_detector: `RegionDetector`
For pulling out regions of the image (both coordinates and features) that will be used by
downstream models.
feature_cache_dir: `str`, optional
If given, the reader will attempt to use the featurized image cache in this directory.
Caching the featurized images can result in big performance improvements, so it is
recommended to set this.
tokenizer: `Tokenizer`, optional, defaults to `PretrainedTransformerTokenizer("bert-base-uncased")`
token_indexers: `Dict[str, TokenIndexer]`, optional,
        defaults to `{"tokens": PretrainedTransformerIndexer("bert-base-uncased")}`
cuda_device: `int`, optional
Set this to run image featurization on the given GPU. By default, image featurization runs on CPU.
max_instances: `int`, optional
If set, the reader only returns the first `max_instances` instances, and then stops.
This is useful for testing.
image_processing_batch_size: `int`
The number of images to process at one time while featurizing. Default is 8.
"""

def __init__(
self,
image_dir: Optional[Union[str, PathLike]] = None,
*,
image_loader: Optional[ImageLoader] = None,
image_featurizer: Optional[Lazy[GridEmbedder]] = None,
region_detector: Optional[Lazy[RegionDetector]] = None,
feature_cache_dir: Optional[Union[str, PathLike]] = None,
tokenizer: Optional[Tokenizer] = None,
token_indexers: Optional[Dict[str, TokenIndexer]] = None,
cuda_device: Optional[Union[int, torch.device]] = None,
max_instances: Optional[int] = None,
image_processing_batch_size: int = 8,
write_to_cache: bool = True,
) -> None:
run_featurization = image_loader and image_featurizer and region_detector
if image_dir is None and run_featurization:
raise ValueError(
"Because of the size of the image datasets, we don't download them automatically. "
"Please go to https://github.com/lil-lab/nlvr/tree/master/nlvr2, download the datasets you need, "
"and set the image_dir parameter to point to your download location. This dataset "
"reader does not care about the exact directory structure. It finds the images "
"wherever they are."
)

super().__init__(
image_dir,
image_loader=image_loader,
image_featurizer=image_featurizer,
region_detector=region_detector,
feature_cache_dir=feature_cache_dir,
tokenizer=tokenizer,
token_indexers=token_indexers,
cuda_device=cuda_device,
max_instances=max_instances,
image_processing_batch_size=image_processing_batch_size,
write_to_cache=write_to_cache,
)

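        # The annotation files below come from a pinned commit of the lil-lab/nlvr
        # repository, so the splits stay stable even if the upstream repo changes.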
github_url = "https://raw.githubusercontent.com/lil-lab/nlvr/"
nlvr_commit = "68a11a766624a5b665ec7594982b8ecbedc728c7"
data_dir = f"{github_url}{nlvr_commit}/nlvr2/data"
self.splits = {
"dev": f"{data_dir}/dev.json",
"test": f"{data_dir}/test1.json",
"train": f"{data_dir}/train.json",
"balanced_dev": f"{data_dir}/balanced/balanced_dev.json",
"balanced_test": f"{data_dir}/balanced/balanced_test1.json",
"unbalanced_dev": f"{data_dir}/balanced/unbalanced_dev.json",
"unbalanced_test": f"{data_dir}/balanced/unbalanced_test1.json",
}

@overrides
def _read(self, split_or_filename: str):
filename = self.splits.get(split_or_filename, split_or_filename)

json_file_path = cached_path(filename)

blobs = []
json_blob: Dict[str, Any]
for json_blob in json_lines_from_file(json_file_path):
blobs.append(json_blob)

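        # shard_iterable hands each data-loading worker (or distributed process)
        # its own share of the examples.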
blob_dicts = list(self.shard_iterable(blobs))
processed_images1: Iterable[Optional[Tuple[Tensor, Tensor]]]
processed_images2: Iterable[Optional[Tuple[Tensor, Tensor]]]
if self.produce_featurized_images:
# It would be much easier to just process one image at a time, but it's faster to process
# them in batches. So this code gathers up instances until it has enough to fill up a batch
# that needs processing, and then processes them all.

try:
image_paths1 = []
image_paths2 = []
for blob in blob_dicts:
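                    # Identifiers end in a sentence index; dropping the final "-<n>"
                    # leaves the base name shared by the pair's two images
                    # ("<base>-img0.png" and "<base>-img1.png").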
identifier = blob["identifier"]
image_name_base = identifier[: identifier.rindex("-")]
image_paths1.append(self.images[f"{image_name_base}-img0.png"])
image_paths2.append(self.images[f"{image_name_base}-img1.png"])
except KeyError as e:
missing_id = e.args[0]
raise KeyError(
missing_id,
f"We could not find an image with the id {missing_id}. "
"Because of the size of the image datasets, we don't download them automatically. "
"Please go to https://github.com/lil-lab/nlvr/tree/master/nlvr2, download the "
"datasets you need, and set the image_dir parameter to point to your download "
"location. This dataset reader does not care about the exact directory "
"structure. It finds the images wherever they are.",
)

processed_images1 = self._process_image_paths(image_paths1)
processed_images2 = self._process_image_paths(image_paths2)
else:
processed_images1 = [None for _ in range(len(blob_dicts))]
processed_images2 = [None for _ in range(len(blob_dicts))]

        instances_yielded = 0
        for json_blob, image1, image2 in zip(blob_dicts, processed_images1, processed_images2):
            identifier = json_blob["identifier"]
            hypothesis = json_blob["sentence"]
            label = json_blob["label"] == "True"
            instance = self.text_to_instance(identifier, hypothesis, image1, image2, label)
            if instance is not None:
                instances_yielded += 1
                yield instance
        logger.info(f"Successfully yielded {instances_yielded} instances")

def extract_image_features(self, image: Union[str, Tuple[Tensor, Tensor]], use_cache: bool):
if isinstance(image, str):
features, coords = next(self._process_image_paths([image], use_cache=use_cache))
else:
features, coords = image

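        # Return features, box coordinates, and a boolean mask marking every real
        # box; padding added later during batching is False.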
return (
ArrayField(features),
ArrayField(coords),
ArrayField(
features.new_ones((features.shape[0],), dtype=torch.bool),
padding_value=False,
dtype=torch.bool,
),
)

@overrides
def text_to_instance(
self, # type: ignore
identifier: Optional[str],
hypothesis: str,
image1: Union[str, Tuple[Tensor, Tensor]],
image2: Union[str, Tuple[Tensor, Tensor]],
        label: Optional[bool] = None,
use_cache: bool = True,
) -> Instance:
hypothesis_field = TextField(self._tokenizer.tokenize(hypothesis), None)
box_features1, box_coordinates1, box_mask1 = self.extract_image_features(image1, use_cache)
box_features2, box_coordinates2, box_mask2 = self.extract_image_features(image2, use_cache)

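        # Each instance pairs one sentence with two images, so every field is a
        # ListField of length two; the hypothesis is duplicated so that each image
        # gets its own copy of the text.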
fields = {
"hypothesis": ListField([hypothesis_field, hypothesis_field]),
"box_features": ListField([box_features1, box_features2]),
"box_coordinates": ListField([box_coordinates1, box_coordinates2]),
"box_mask": ListField([box_mask1, box_mask2]),
}

if identifier is not None:
fields["identifier"] = MetadataField(identifier)

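        # The boolean label is stored directly as 0/1; skip_indexing avoids
        # building a label vocabulary for it.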
if label is not None:
fields["label"] = LabelField(int(label), skip_indexing=True)

return Instance(fields)

@overrides
def apply_token_indexers(self, instance: Instance) -> None:
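        # Indexers are attached here rather than in text_to_instance so instances
        # can be created cheaply (e.g. in data-loader workers) before indexing.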
instance["hypothesis"][0].token_indexers = self._token_indexers # type: ignore
instance["hypothesis"][1].token_indexers = self._token_indexers # type: ignore
1 change: 1 addition & 0 deletions allennlp_models/vision/models/__init__.py
@@ -1,3 +1,4 @@
from allennlp_models.vision.models.nlvr2 import Nlvr2Model
from allennlp_models.vision.models.vision_text_model import VisionTextModel
from allennlp_models.vision.models.visual_entailment import VisualEntailmentModel
from allennlp_models.vision.models.vilbert_vqa import VqaVilbert
1 change: 1 addition & 0 deletions allennlp_models/vision/models/heads/__init__.py
@@ -1,2 +1,3 @@
from allennlp_models.vision.models.heads.nlvr2_head import Nlvr2Head
from allennlp_models.vision.models.heads.vqa_head import VqaHead
from allennlp_models.vision.models.heads.visual_entailment_head import VisualEntailmentHead
