VGQA dataset reader (#260)
* Adding a reader and training config for the VGQA dataset

Co-authored-by: Pete <[email protected]>
jacob-morrison and epwalsh authored May 11, 2021
1 parent 77315fc commit 45068bb
Showing 16 changed files with 782 additions and 206 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added a parameter called `source_prefix` to `CNNDailyMailDatasetReader`. This is useful with T5, for example, by setting `source_prefix` to "summarization: ".
- Tests for `VqaMeasure`.
- Distributed tests for `ConllCorefScores` and `SrlEvalScorer` metrics.
- Added a dataset reader for Visual Genome QA.

### Fixed

1 change: 1 addition & 0 deletions README.md
@@ -175,6 +175,7 @@ Here is a list of pre-trained models currently available.
- [`tagging-fine-grained-crf-tagger`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/tagging-fine-grained-crf-tagger.json) - This model identifies a broad range of 16 semantic types in the input text. It is a reimplementation of Lample (2016) and uses a biLSTM with a CRF layer, character embeddings and ELMo embeddings.
- [`tagging-fine-grained-transformer-crf-tagger`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/tagging-fine-grained-transformer-crf-tagger.json) - Fine-grained NER model
- [`ve-vilbert`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/ve-vilbert.json) - ViLBERT-based model for Visual Entailment.
- [`vgqa-vilbert`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/vgqa-vilbert.json) - ViLBERT (short for Vision-and-Language BERT) is a model for learning task-agnostic joint representations of image content and natural language.
- [`vqa-vilbert`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/vqa-vilbert.json) - ViLBERT (short for Vision-and-Language BERT) is a model for learning task-agnostic joint representations of image content and natural language.

<!-- End automatically generated section -->
72 changes: 72 additions & 0 deletions allennlp_models/modelcards/vgqa-vilbert.json
@@ -0,0 +1,72 @@
{
    "id": "vgqa-vilbert",
    "registered_model_name": "vgqa_vilbert",
    "registered_predictor_name": "vilbert_vgqa",
    "display_name": "ViLBERT - Visual Genome Question Answering",
    "task_id": "vgqa",
    "model_details": {
        "description": "ViLBERT (short for Vision-and-Language BERT) is a model for learning task-agnostic joint representations of image content and natural language.",
        "short_description": "ViLBERT (short for Vision-and-Language BERT) is a model for learning task-agnostic joint representations of image content and natural language.",
        "developed_by": "Lu et al.",
        "contributed_by": "Jacob Morrison",
        "date": "2021-05-07",
        "version": "2",
        "model_type": "ViLBERT based on BERT large",
        "paper": {
            "citation": "\n@inproceedings{Lu2019ViLBERTPT,\ntitle={ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks},\nauthor={Jiasen Lu and Dhruv Batra and D. Parikh and Stefan Lee},\nbooktitle={NeurIPS},\nyear={2019}\n}",
            "title": "ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks",
            "url": "https://api.semanticscholar.org/CorpusID:199453025"
        },
        "license": null,
        "contact": "[email protected]"
    },
    "intended_use": {
        "primary_uses": "This model is developed for the AllenNLP demo.",
        "primary_users": null,
        "out_of_scope_use_cases": null
    },
    "factors": {
        "relevant_factors": null,
        "evaluation_factors": null
    },
    "metrics": {
        "model_performance_measures": "F1 score and VQA score",
        "decision_thresholds": null,
        "variation_approaches": null
    },
    "evaluation_data": {
        "dataset": {
            "name": "VGQA dataset",
            "url": "https://visualgenome.org/",
            "notes": "Evaluation requires a large number of images to be accessible locally, so we cannot provide a command you can easily copy and paste. The first time you run it, you will get an error message that tells you how to get the rest of the data.",
            "processed_url": "https://visualgenome.org/static/data/dataset/question_answers.json.zip!question_answers.json[:5000]"
        },
        "motivation": null,
        "preprocessing": null
    },
    "training_data": {
        "dataset": {
            "name": "VGQA dataset",
            "url": "https://visualgenome.org/",
            "notes": "Training requires a large number of images to be accessible locally, so we cannot provide a command you can easily copy and paste. The first time you run it, you will get an error message that tells you how to get the rest of the data.",
            "processed_url": "https://visualgenome.org/static/data/dataset/question_answers.json.zip!question_answers.json[5000:]"
        },
        "motivation": null,
        "preprocessing": null
    },
    "quantitative_analyses": {
        "unitary_results": "On the validation set:\nF1: 29.6%\nVQA: 26.5%.\nThese scores do not match the performance in the ViLBERT paper. Please contact us if you want to match those scores!",
        "intersectional_results": null
    },
    "model_ethical_considerations": {
        "ethical_considerations": null
    },
    "model_caveats_and_recommendations": {
        "caveats_and_recommendations": null
    },
    "model_usage": {
        "archive_file": "vilbert-vgqa-pretrained.2021-05-10.tar.gz",
        "training_config": "vision/vilbert_vgqa_pretrained.jsonnet",
        "install_instructions": "pip install allennlp==2.5.0 allennlp-models==2.5.0"
    }
}
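
As a usage sketch (not part of this diff): loading the archive named in `model_usage` might look like the snippet below. The local archive path and the `question`/`image` input keys are assumptions that mirror the VQA ViLBERT predictor; `vilbert_vgqa` is the `registered_predictor_name` from the modelcard above.

from allennlp.predictors.predictor import Predictor
import allennlp_models.vision  # noqa: F401 (importing registers the vision models and readers)

# "vilbert-vgqa-pretrained.2021-05-10.tar.gz" is the archive_file from the
# modelcard; no download URL is given there, so a local copy is assumed.
predictor = Predictor.from_path(
    "vilbert-vgqa-pretrained.2021-05-10.tar.gz",
    predictor_name="vilbert_vgqa",
)
# The input keys are an assumption based on the VQA ViLBERT demo inputs;
# "bus.jpg" is a hypothetical local image path.
output = predictor.predict_json({"question": "What color is the bus?", "image": "bus.jpg"})
print(sorted(output.keys()))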
1 change: 1 addition & 0 deletions allennlp_models/vision/dataset_readers/__init__.py
@@ -1,4 +1,5 @@
from allennlp_models.vision.dataset_readers.vision_reader import VisionReader
from allennlp_models.vision.dataset_readers.gqa import GQAReader
from allennlp_models.vision.dataset_readers.vgqa import VGQAReader
from allennlp_models.vision.dataset_readers.vqav2 import VQAv2Reader
from allennlp_models.vision.dataset_readers.visual_entailment import VisualEntailmentReader
217 changes: 217 additions & 0 deletions allennlp_models/vision/dataset_readers/utils.py
@@ -0,0 +1,217 @@
"""
Utilities for vision dataset readers.
"""

import logging
from functools import lru_cache
from typing import Tuple
import re

logger = logging.getLogger(__name__)

# Canonical contractions, keyed by their apostrophe-less (or mis-apostrophed) spellings.
contractions = {
    "aint": "ain't",
    "arent": "aren't",
    "cant": "can't",
    "couldve": "could've",
    "couldnt": "couldn't",
    "couldn'tve": "couldn't've",
    "couldnt've": "couldn't've",
    "didnt": "didn't",
    "doesnt": "doesn't",
    "dont": "don't",
    "hadnt": "hadn't",
    "hadnt've": "hadn't've",
    "hadn'tve": "hadn't've",
    "hasnt": "hasn't",
    "havent": "haven't",
    "hed": "he'd",
    "hed've": "he'd've",
    "he'dve": "he'd've",
    "hes": "he's",
    "howd": "how'd",
    "howll": "how'll",
    "hows": "how's",
    "Id've": "I'd've",
    "I'dve": "I'd've",
    "Im": "I'm",
    "Ive": "I've",
    "isnt": "isn't",
    "itd": "it'd",
    "itd've": "it'd've",
    "it'dve": "it'd've",
    "itll": "it'll",
    "let's": "let's",
    "maam": "ma'am",
    "mightnt": "mightn't",
    "mightnt've": "mightn't've",
    "mightn'tve": "mightn't've",
    "mightve": "might've",
    "mustnt": "mustn't",
    "mustve": "must've",
    "neednt": "needn't",
    "notve": "not've",
    "oclock": "o'clock",
    "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at",
    "'ows'at": "'ow's'at",
    "'ow'sat": "'ow's'at",
    "shant": "shan't",
    "shed've": "she'd've",
    "she'dve": "she'd've",
    "she's": "she's",
    "shouldve": "should've",
    "shouldnt": "shouldn't",
    "shouldnt've": "shouldn't've",
    "shouldn'tve": "shouldn't've",
    "somebodyd": "somebody'd",
    "somebodyd've": "somebody'd've",
    "somebody'dve": "somebody'd've",
    "somebodyll": "somebody'll",
    "somebodys": "somebody's",
    "someoned": "someone'd",
    "someoned've": "someone'd've",
    "someone'dve": "someone'd've",
    "someonell": "someone'll",
    "someones": "someone's",
    "somethingd": "something'd",
    "somethingd've": "something'd've",
    "something'dve": "something'd've",
    "somethingll": "something'll",
    "thats": "that's",
    "thered": "there'd",
    "thered've": "there'd've",
    "there'dve": "there'd've",
    "therere": "there're",
    "theres": "there's",
    "theyd": "they'd",
    "theyd've": "they'd've",
    "they'dve": "they'd've",
    "theyll": "they'll",
    "theyre": "they're",
    "theyve": "they've",
    "twas": "'twas",
    "wasnt": "wasn't",
    "wed've": "we'd've",
    "we'dve": "we'd've",
    "weve": "we've",
    "werent": "weren't",
    "whatll": "what'll",
    "whatre": "what're",
    "whats": "what's",
    "whatve": "what've",
    "whens": "when's",
    "whered": "where'd",
    "wheres": "where's",
    "whereve": "where've",
    "whod": "who'd",
    "whod've": "who'd've",
    "who'dve": "who'd've",
    "wholl": "who'll",
    "whos": "who's",
    "whove": "who've",
    "whyll": "why'll",
    "whyre": "why're",
    "whys": "why's",
    "wont": "won't",
    "wouldve": "would've",
    "wouldnt": "wouldn't",
    "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've",
    "yall": "y'all",
    "yall'll": "y'all'll",
    "y'allll": "y'all'll",
    "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've",
    "y'all'dve": "y'all'd've",
    "youd": "you'd",
    "youd've": "you'd've",
    "you'dve": "you'd've",
    "youll": "you'll",
    "youre": "you're",
    "youve": "you've",
}
# Number words mapped to their digit forms.
manual_map = {
    "none": "0",
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
}
articles = ["a", "an", "the"]
# Matches a period that is not part of a number like "12.5".
period_strip = re.compile(r"(?<!\d)(\.)(?!\d)")
# Matches a comma used as a thousands separator, e.g. the "," in "1,000".
comma_strip = re.compile(r"(\d)(,)(\d)")
punct = [
    ";", r"/", "[", "]", '"', "{", "}",
    "(", ")", "=", "+", "\\", "_", "-",
    ">", "<", "@", "`", ",", "?", "!",
]


def process_punctuation(inText: str) -> str:
    outText = inText
    for p in punct:
        # If the character appears next to a space in the original text, or the
        # text contains a "1,000"-style number, drop it entirely; otherwise
        # replace it with a space so adjoining words stay separated.
        if (p + " " in inText or " " + p in inText) or (comma_strip.search(inText) is not None):
            outText = outText.replace(p, "")
        else:
            outText = outText.replace(p, " ")
    # Drop periods that are not decimal points in a number.
    outText = period_strip.sub("", outText)
    return outText


def process_digit_article(text: str) -> str:
    output = []
    for word in text.lower().split():
        # Map number words ("two") to digits ("2") and drop articles.
        word = manual_map.get(word, word)
        if word not in articles:
            output.append(word)
    # Restore apostrophes in common contractions ("dont" -> "don't").
    for index, word in enumerate(output):
        if word in contractions:
            output[index] = contractions[word]
    return " ".join(output)


@lru_cache(maxsize=None)
def preprocess_answer(answer: str) -> str:
    """Normalize an answer string: punctuation, number words, articles, contractions."""
    answer = process_digit_article(process_punctuation(answer))
    answer = answer.replace(",", "")
    return answer


def get_data_slice(file_path: str) -> Tuple[slice, str]:
    """
    Split an optional "[start:stop]" suffix off of `file_path`, returning the
    corresponding `slice` (an unrestricted slice when no suffix is present)
    together with the bare path.
    """
    slice_match = re.match(r"(.*)\[([0123456789:]*)]", file_path)
    if slice_match is None:
        return slice(None, None, None), file_path
    split_name = slice_match[1]
    slice_args = [int(a) if len(a) > 0 else None for a in slice_match[2].split(":")]
    return slice(*slice_args), split_name