VGQA dataset reader (#260)
* Adding a reader and training config for the VGQA dataset

Co-authored-by: Pete <[email protected]>
jacob-morrison and epwalsh authored May 11, 2021
1 parent 77315fc commit 45068bb
Showing 16 changed files with 782 additions and 206 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added a parameter called `source_prefix` to `CNNDailyMailDatasetReader`. This is useful with T5, for example, by setting `source_prefix` to "summarization: ".
- Tests for `VqaMeasure`.
- Distributed tests for `ConllCorefScores` and `SrlEvalScorer` metrics.
- Added a dataset reader for Visual Genome QA.

### Fixed

1 change: 1 addition & 0 deletions README.md
@@ -175,6 +175,7 @@ Here is a list of pre-trained models currently available.
- [`tagging-fine-grained-crf-tagger`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/tagging-fine-grained-crf-tagger.json) - This model identifies a broad range of 16 semantic types in the input text. It is a reimplementation of Lample (2016) and uses a biLSTM with a CRF layer, character embeddings and ELMo embeddings.
- [`tagging-fine-grained-transformer-crf-tagger`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/tagging-fine-grained-transformer-crf-tagger.json) - Fine-grained NER model
- [`ve-vilbert`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/ve-vilbert.json) - ViLBERT-based model for Visual Entailment.
- [`vgqa-vilbert`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/vgqa-vilbert.json) - ViLBERT (short for Vision-and-Language BERT) is a model for learning task-agnostic joint representations of image content and natural language.
- [`vqa-vilbert`](https://github.com/allenai/allennlp-models/tree/main/allennlp_models/modelcards/vqa-vilbert.json) - ViLBERT (short for Vision-and-Language BERT) is a model for learning task-agnostic joint representations of image content and natural language.

<!-- End automatically generated section -->
72 changes: 72 additions & 0 deletions allennlp_models/modelcards/vgqa-vilbert.json
@@ -0,0 +1,72 @@
{
    "id": "vgqa-vilbert",
    "registered_model_name": "vgqa_vilbert",
    "registered_predictor_name": "vilbert_vgqa",
    "display_name": "ViLBERT - Visual Genome Question Answering",
    "task_id": "vgqa",
    "model_details": {
        "description": "ViLBERT (short for Vision-and-Language BERT) is a model for learning task-agnostic joint representations of image content and natural language.",
        "short_description": "ViLBERT (short for Vision-and-Language BERT) is a model for learning task-agnostic joint representations of image content and natural language.",
        "developed_by": "Lu et al.",
        "contributed_by": "Jacob Morrison",
        "date": "2021-05-07",
        "version": "2",
        "model_type": "ViLBERT based on BERT large",
        "paper": {
            "citation": "\n@inproceedings{Lu2019ViLBERTPT,\ntitle={ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks},\nauthor={Jiasen Lu and Dhruv Batra and D. Parikh and Stefan Lee},\nbooktitle={NeurIPS},\nyear={2019}\n}",
            "title": "ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks",
            "url": "https://api.semanticscholar.org/CorpusID:199453025"
        },
        "license": null,
        "contact": "[email protected]"
    },
    "intended_use": {
        "primary_uses": "This model is developed for the AllenNLP demo.",
        "primary_users": null,
        "out_of_scope_use_cases": null
    },
    "factors": {
        "relevant_factors": null,
        "evaluation_factors": null
    },
    "metrics": {
        "model_performance_measures": "F1 score and VQA score",
        "decision_thresholds": null,
        "variation_approaches": null
    },
    "evaluation_data": {
        "dataset": {
            "name": "VGQA dataset",
            "url": "https://visualgenome.org/",
            "notes": "Evaluation requires a large number of images to be accessible locally, so we cannot provide a command you can easily copy and paste. The first time you run it, you will get an error message that tells you how to get the rest of the data.",
            "processed_url": "https://visualgenome.org/static/data/dataset/question_answers.json.zip!question_answers.json[:5000]"
        },
        "motivation": null,
        "preprocessing": null
    },
    "training_data": {
        "dataset": {
            "name": "VGQA dataset",
            "url": "https://visualgenome.org/",
            "notes": "Training requires a large number of images to be accessible locally, so we cannot provide a command you can easily copy and paste. The first time you run it, you will get an error message that tells you how to get the rest of the data.",
            "processed_url": "https://visualgenome.org/static/data/dataset/question_answers.json.zip!question_answers.json[5000:]"
        },
        "motivation": null,
        "preprocessing": null
    },
    "quantitative_analyses": {
        "unitary_results": "On the validation set:\nF1: 29.6%\nVQA: 26.5%.\nThese scores do not match the performance in the ViLBERT paper. Please contact us if you want to match those scores!",
        "intersectional_results": null
    },
    "model_ethical_considerations": {
        "ethical_considerations": null
    },
    "model_caveats_and_recommendations": {
        "caveats_and_recommendations": null
    },
    "model_usage": {
        "archive_file": "vilbert-vgqa-pretrained.2021-05-10.tar.gz",
        "training_config": "vision/vilbert_vgqa_pretrained.jsonnet",
        "install_instructions": "pip install allennlp==2.5.0 allennlp-models==2.5.0"
    }
}
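
As a usage sketch (not part of this diff): loading the archive named in `model_usage` might look like the snippet below. The local archive path and the `question`/`image` input keys are assumptions that mirror the VQA ViLBERT predictor; `vilbert_vgqa` is the `registered_predictor_name` from the modelcard above.

from allennlp.predictors.predictor import Predictor
import allennlp_models.vision  # noqa: F401 (importing registers the vision models and readers)

# "vilbert-vgqa-pretrained.2021-05-10.tar.gz" is the archive_file from the
# modelcard; no download URL is given there, so a local copy is assumed.
predictor = Predictor.from_path(
    "vilbert-vgqa-pretrained.2021-05-10.tar.gz",
    predictor_name="vilbert_vgqa",
)
# The input keys are an assumption based on the VQA ViLBERT demo inputs;
# "bus.jpg" is a hypothetical local image path.
output = predictor.predict_json({"question": "What color is the bus?", "image": "bus.jpg"})
print(sorted(output.keys()))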
1 change: 1 addition & 0 deletions allennlp_models/vision/dataset_readers/__init__.py
@@ -1,4 +1,5 @@
from allennlp_models.vision.dataset_readers.vision_reader import VisionReader
from allennlp_models.vision.dataset_readers.gqa import GQAReader
from allennlp_models.vision.dataset_readers.vgqa import VGQAReader
from allennlp_models.vision.dataset_readers.vqav2 import VQAv2Reader
from allennlp_models.vision.dataset_readers.visual_entailment import VisualEntailmentReader
217 changes: 217 additions & 0 deletions allennlp_models/vision/dataset_readers/utils.py
@@ -0,0 +1,217 @@
"""
Utilities for vision dataset readers.
"""

import logging
from functools import lru_cache
from typing import Tuple
import re

logger = logging.getLogger(__name__)

# Canonical contractions, keyed by their apostrophe-less (or mis-apostrophed) spellings.
contractions = {
    "aint": "ain't",
    "arent": "aren't",
    "cant": "can't",
    "couldve": "could've",
    "couldnt": "couldn't",
    "couldn'tve": "couldn't've",
    "couldnt've": "couldn't've",
    "didnt": "didn't",
    "doesnt": "doesn't",
    "dont": "don't",
    "hadnt": "hadn't",
    "hadnt've": "hadn't've",
    "hadn'tve": "hadn't've",
    "hasnt": "hasn't",
    "havent": "haven't",
    "hed": "he'd",
    "hed've": "he'd've",
    "he'dve": "he'd've",
    "hes": "he's",
    "howd": "how'd",
    "howll": "how'll",
    "hows": "how's",
    "Id've": "I'd've",
    "I'dve": "I'd've",
    "Im": "I'm",
    "Ive": "I've",
    "isnt": "isn't",
    "itd": "it'd",
    "itd've": "it'd've",
    "it'dve": "it'd've",
    "itll": "it'll",
    "let's": "let's",
    "maam": "ma'am",
    "mightnt": "mightn't",
    "mightnt've": "mightn't've",
    "mightn'tve": "mightn't've",
    "mightve": "might've",
    "mustnt": "mustn't",
    "mustve": "must've",
    "neednt": "needn't",
    "notve": "not've",
    "oclock": "o'clock",
    "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at",
    "'ows'at": "'ow's'at",
    "'ow'sat": "'ow's'at",
    "shant": "shan't",
    "shed've": "she'd've",
    "she'dve": "she'd've",
    "she's": "she's",
    "shouldve": "should've",
    "shouldnt": "shouldn't",
    "shouldnt've": "shouldn't've",
    "shouldn'tve": "shouldn't've",
    "somebodyd": "somebody'd",
    "somebodyd've": "somebody'd've",
    "somebody'dve": "somebody'd've",
    "somebodyll": "somebody'll",
    "somebodys": "somebody's",
    "someoned": "someone'd",
    "someoned've": "someone'd've",
    "someone'dve": "someone'd've",
    "someonell": "someone'll",
    "someones": "someone's",
    "somethingd": "something'd",
    "somethingd've": "something'd've",
    "something'dve": "something'd've",
    "somethingll": "something'll",
    "thats": "that's",
    "thered": "there'd",
    "thered've": "there'd've",
    "there'dve": "there'd've",
    "therere": "there're",
    "theres": "there's",
    "theyd": "they'd",
    "theyd've": "they'd've",
    "they'dve": "they'd've",
    "theyll": "they'll",
    "theyre": "they're",
    "theyve": "they've",
    "twas": "'twas",
    "wasnt": "wasn't",
    "wed've": "we'd've",
    "we'dve": "we'd've",
    "weve": "we've",
    "werent": "weren't",
    "whatll": "what'll",
    "whatre": "what're",
    "whats": "what's",
    "whatve": "what've",
    "whens": "when's",
    "whered": "where'd",
    "wheres": "where's",
    "whereve": "where've",
    "whod": "who'd",
    "whod've": "who'd've",
    "who'dve": "who'd've",
    "wholl": "who'll",
    "whos": "who's",
    "whove": "who've",
    "whyll": "why'll",
    "whyre": "why're",
    "whys": "why's",
    "wont": "won't",
    "wouldve": "would've",
    "wouldnt": "wouldn't",
    "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've",
    "yall": "y'all",
    "yall'll": "y'all'll",
    "y'allll": "y'all'll",
    "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've",
    "y'all'dve": "y'all'd've",
    "youd": "you'd",
    "youd've": "you'd've",
    "you'dve": "you'd've",
    "youll": "you'll",
    "youre": "you're",
    "youve": "you've",
}
# Number words mapped to their digit forms.
manual_map = {
    "none": "0",
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
}
articles = ["a", "an", "the"]
# Matches a period that is not part of a number like "12.5".
period_strip = re.compile(r"(?<!\d)(\.)(?!\d)")
# Matches a comma used as a thousands separator, e.g. the "," in "1,000".
comma_strip = re.compile(r"(\d)(,)(\d)")
punct = [
    ";", r"/", "[", "]", '"', "{", "}",
    "(", ")", "=", "+", "\\", "_", "-",
    ">", "<", "@", "`", ",", "?", "!",
]


def process_punctuation(inText: str) -> str:
    outText = inText
    for p in punct:
        # If the character appears next to a space in the original text, or the
        # text contains a "1,000"-style number, drop it entirely; otherwise
        # replace it with a space so adjoining words stay separated.
        if (p + " " in inText or " " + p in inText) or (comma_strip.search(inText) is not None):
            outText = outText.replace(p, "")
        else:
            outText = outText.replace(p, " ")
    # Drop periods that are not decimal points in a number.
    outText = period_strip.sub("", outText)
    return outText


def process_digit_article(text: str) -> str:
    output = []
    for word in text.lower().split():
        # Map number words ("two") to digits ("2") and drop articles.
        word = manual_map.get(word, word)
        if word not in articles:
            output.append(word)
    # Restore apostrophes in common contractions ("dont" -> "don't").
    for index, word in enumerate(output):
        if word in contractions:
            output[index] = contractions[word]
    return " ".join(output)


@lru_cache(maxsize=None)
def preprocess_answer(answer: str) -> str:
    """Normalize an answer string: punctuation, number words, articles, contractions."""
    answer = process_digit_article(process_punctuation(answer))
    answer = answer.replace(",", "")
    return answer


def get_data_slice(file_path: str) -> Tuple[slice, str]:
    """
    Split an optional "[start:stop]" suffix off of `file_path`, returning the
    corresponding `slice` (an unrestricted slice when no suffix is present)
    together with the bare path.
    """
    slice_match = re.match(r"(.*)\[([0123456789:]*)]", file_path)
    if slice_match is None:
        return slice(None, None, None), file_path
    split_name = slice_match[1]
    slice_args = [int(a) if len(a) > 0 else None for a in slice_match[2].split(":")]
    return slice(*slice_args), split_name