This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 176
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Adding a reader and training config for the VGQA dataset Co-authored-by: Pete <[email protected]>
- Loading branch information
1 parent
77315fc
commit 45068bb
Showing
16 changed files
with
782 additions
and
206 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
{ | ||
"id": "vgqa-vilbert", | ||
"registered_model_name": "vgqa_vilbert", | ||
"registered_predictor_name": "vilbert_vgqa", | ||
"display_name": "ViLBERT - Visual Genome Question Answering", | ||
"task_id": "vgqa", | ||
"model_details": { | ||
"description": "ViLBERT (short for Vision-and-Language BERT), is a model for learning task-agnostic joint representations of image content and natural language.", | ||
"short_description": "ViLBERT (short for Vision-and-Language BERT), is a model for learning task-agnostic joint representations of image content and natural language.", | ||
"developed_by": "Lu et al", | ||
"contributed_by": "Jacob Morrison", | ||
"date": "2021-05-07", | ||
"version": "2", | ||
"model_type": "ViLBERT based on BERT large", | ||
"paper": { | ||
"citation": "\n@inproceedings{Lu2019ViLBERTPT,\ntitle={ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks},\nauthor={Jiasen Lu and Dhruv Batra and D. Parikh and Stefan Lee},\nbooktitle={NeurIPS},\nyear={2019}\n}", | ||
"title": "ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks", | ||
"url": "https://api.semanticscholar.org/CorpusID:199453025" | ||
}, | ||
"license": null, | ||
"contact": "[email protected]" | ||
}, | ||
"intended_use": { | ||
"primary_uses": "This model is developed for the AllenNLP demo.", | ||
"primary_users": null, | ||
"out_of_scope_use_cases": null | ||
}, | ||
"factors": { | ||
"relevant_factors": null, | ||
"evaluation_factors": null | ||
}, | ||
"metrics": { | ||
"model_performance_measures": "F1-metric and VQA score", | ||
"decision_thresholds": null, | ||
"variation_approaches": null | ||
}, | ||
"evaluation_data": { | ||
"dataset": { | ||
"name": "VGQA dataset", | ||
"url": "https://visualgenome.org/", | ||
"notes": "Evaluation requires a large amount of images to be accessible locally, so we cannot provide a command you can easily copy and paste. The first time you run it, you will get an error message that tells you how to get the rest of the data.", | ||
"processed_url": "https://visualgenome.org/static/data/dataset/question_answers.json.zip!question_answers.json[:5000]" | ||
}, | ||
"motivation": null, | ||
"preprocessing": null | ||
}, | ||
"training_data": { | ||
"dataset": { | ||
"name": "VGQA dataset", | ||
"url": "https://visualgenome.org/", | ||
"notes": "Training requires a large amount of images to be accessible locally, so we cannot provide a command you can easily copy and paste. The first time you run it, you will get an error message that tells you how to get the rest of the data.", | ||
"processed_url": "https://visualgenome.org/static/data/dataset/question_answers.json.zip!question_answers.json[5000:]" | ||
}, | ||
"motivation": null, | ||
"preprocessing": null | ||
}, | ||
"quantitative_analyses": { | ||
"unitary_results": "On the validation set:\nF1: 29.6%\nVQA: 26.5%.\nThese scores do not match the performance in the VilBERT paper. Please contact us if you want to match those scores!", | ||
"intersectional_results": null | ||
}, | ||
"model_ethical_considerations": { | ||
"ethical_considerations": null | ||
}, | ||
"model_caveats_and_recommendations": { | ||
"caveats_and_recommendations": null | ||
}, | ||
"model_usage": { | ||
"archive_file": "vilbert-vgqa-pretrained.2021-05-10.tar.gz", | ||
"training_config": "vision/vilbert_vgqa_pretrained.jsonnet", | ||
"install_instructions": "pip install allennlp==2.5.0 allennlp-models==2.5.0" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
from allennlp_models.vision.dataset_readers.vision_reader import VisionReader | ||
from allennlp_models.vision.dataset_readers.gqa import GQAReader | ||
from allennlp_models.vision.dataset_readers.vgqa import VGQAReader | ||
from allennlp_models.vision.dataset_readers.vqav2 import VQAv2Reader | ||
from allennlp_models.vision.dataset_readers.visual_entailment import VisualEntailmentReader |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
""" | ||
Utilities for vision dataset readers. | ||
""" | ||
|
||
import logging | ||
from functools import lru_cache | ||
from typing import Tuple | ||
import re | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
# Maps common apostrophe-dropped (or apostrophe-misplaced) spellings of English
# contractions to their canonical spelling. Used when normalizing VQA-style
# answers so "dont" and "don't" compare equal.
# NOTE(review): a few entries (e.g. "let's", "she's") map to themselves; they
# are kept to stay aligned with the official VQA evaluation script.
contractions = {
    "aint": "ain't",
    "arent": "aren't",
    "cant": "can't",
    "couldve": "could've",
    "couldnt": "couldn't",
    "couldn'tve": "couldn't've",
    "couldnt've": "couldn't've",
    "didnt": "didn't",
    "doesnt": "doesn't",
    "dont": "don't",
    "hadnt": "hadn't",
    "hadnt've": "hadn't've",
    "hadn'tve": "hadn't've",
    "hasnt": "hasn't",
    "havent": "haven't",
    "hed": "he'd",
    "hed've": "he'd've",
    "he'dve": "he'd've",
    "hes": "he's",
    "howd": "how'd",
    "howll": "how'll",
    "hows": "how's",
    "Id've": "I'd've",
    "I'dve": "I'd've",
    "Im": "I'm",
    "Ive": "I've",
    "isnt": "isn't",
    "itd": "it'd",
    "itd've": "it'd've",
    "it'dve": "it'd've",
    "itll": "it'll",
    "let's": "let's",
    "maam": "ma'am",
    "mightnt": "mightn't",
    "mightnt've": "mightn't've",
    "mightn'tve": "mightn't've",
    "mightve": "might've",
    "mustnt": "mustn't",
    "mustve": "must've",
    "neednt": "needn't",
    "notve": "not've",
    "oclock": "o'clock",
    "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at",
    "'ows'at": "'ow's'at",
    "'ow'sat": "'ow's'at",
    "shant": "shan't",
    "shed've": "she'd've",
    "she'dve": "she'd've",
    "she's": "she's",
    "shouldve": "should've",
    "shouldnt": "shouldn't",
    "shouldnt've": "shouldn't've",
    "shouldn'tve": "shouldn't've",
    # Bug fix: this entry was reversed ("somebody'd": "somebodyd"), mapping the
    # canonical form to the misspelling. Every other entry — and the official
    # VQA evaluation script — maps misspelling -> canonical.
    "somebodyd": "somebody'd",
    "somebodyd've": "somebody'd've",
    "somebody'dve": "somebody'd've",
    "somebodyll": "somebody'll",
    "somebodys": "somebody's",
    "someoned": "someone'd",
    "someoned've": "someone'd've",
    "someone'dve": "someone'd've",
    "someonell": "someone'll",
    "someones": "someone's",
    "somethingd": "something'd",
    "somethingd've": "something'd've",
    "something'dve": "something'd've",
    "somethingll": "something'll",
    "thats": "that's",
    "thered": "there'd",
    "thered've": "there'd've",
    "there'dve": "there'd've",
    "therere": "there're",
    "theres": "there's",
    "theyd": "they'd",
    "theyd've": "they'd've",
    "they'dve": "they'd've",
    "theyll": "they'll",
    "theyre": "they're",
    "theyve": "they've",
    "twas": "'twas",
    "wasnt": "wasn't",
    "wed've": "we'd've",
    "we'dve": "we'd've",
    "weve": "we've",
    "werent": "weren't",
    "whatll": "what'll",
    "whatre": "what're",
    "whats": "what's",
    "whatve": "what've",
    "whens": "when's",
    "whered": "where'd",
    "wheres": "where's",
    "whereve": "where've",
    "whod": "who'd",
    "whod've": "who'd've",
    "who'dve": "who'd've",
    "wholl": "who'll",
    "whos": "who's",
    "whove": "who've",
    "whyll": "why'll",
    "whyre": "why're",
    "whys": "why's",
    "wont": "won't",
    "wouldve": "would've",
    "wouldnt": "wouldn't",
    "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've",
    "yall": "y'all",
    "yall'll": "y'all'll",
    "y'allll": "y'all'll",
    "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've",
    "y'all'dve": "y'all'd've",
    "youd": "you'd",
    "youd've": "you'd've",
    "you'dve": "you'd've",
    "youll": "you'll",
    "youre": "you're",
    "youve": "you've",
}
# Spelled-out numbers are mapped to digit strings so "two" and "2" compare
# equal during answer normalization.
manual_map = {
    "none": "0",
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
}
# English articles are dropped entirely by process_digit_article.
articles = ["a", "an", "the"]
# Strips periods that are not part of a decimal number.
# NOTE(review): "(?!<=\d)" looks like a typo for the negative lookbehind
# "(?<!\d)" (as written it is a negative lookahead for the literal text "<="
# followed by a digit, which almost always succeeds). The identical typo
# exists in the official VQA evaluation script, so it is deliberately left
# unchanged here to keep scores comparable -- confirm before "fixing".
period_strip = re.compile(r"(?!<=\d)(\.)(?!\d)")
# Matches a comma sandwiched between two digits (e.g. "1,000"); used by
# process_punctuation to decide whether punctuation should be deleted
# outright rather than replaced with a space.
comma_strip = re.compile(r"(\d)(\,)(\d)")
# Punctuation characters handled by process_punctuation. Note that "." and
# "'" are intentionally absent: periods are handled by period_strip and
# apostrophes are needed for contractions.
punct = [
    ";",
    r"/",
    "[",
    "]",
    '"',
    "{",
    "}",
    "(",
    ")",
    "=",
    "+",
    "\\",
    "_",
    "-",
    ">",
    "<",
    "@",
    "`",
    ",",
    "?",
    "!",
]
|
||
|
||
def process_punctuation(inText: str) -> str:
    """Remove or space-out punctuation in an answer string.

    A punctuation character is deleted outright when it is adjacent to a
    space in the original input, or when the input contains a
    digit-comma-digit sequence such as "1,000"; otherwise it is replaced
    with a space so the surrounding words stay separated. Finally, periods
    that are not part of a decimal number are stripped.
    """
    outText = inText
    for p in punct:
        # Adjacency checks run against the *original* input (inText), not the
        # partially-cleaned outText -- this matches the official VQA eval code.
        if (p + " " in inText or " " + p in inText) or (comma_strip.search(inText) is not None):
            outText = outText.replace(p, "")
        else:
            outText = outText.replace(p, " ")
    # Bug fix: re.UNICODE (== 32) was previously passed as re.sub's positional
    # ``count`` argument (compiled patterns' .sub takes no flags), silently
    # limiting the substitution to the first 32 periods. Substitute all.
    outText = period_strip.sub("", outText)
    return outText
|
||
|
||
def process_digit_article(input: str) -> str:
    """Normalize digits, articles, and contractions in an answer string.

    Lower-cases the input, maps spelled-out numbers ("two" -> "2") via
    ``manual_map``, drops English articles, and canonicalizes common
    contraction misspellings ("dont" -> "don't") via ``contractions``.

    NOTE(review): the parameter name shadows the ``input`` builtin; it is
    kept unchanged for backward compatibility with keyword callers.
    """
    output = []
    for word in input.lower().split():
        word = manual_map.get(word, word)
        # Articles carry no information for answer matching, so drop them.
        # (The original dead "else: pass" branch has been removed.)
        if word not in articles:
            output.append(word)
    # Canonicalize contraction spellings in place.
    for index, word in enumerate(output):
        if word in contractions:
            output[index] = contractions[word]
    return " ".join(output)
|
||
|
||
@lru_cache(maxsize=None)
def preprocess_answer(answer: str) -> str:
    """Return the fully normalized form of an answer string.

    Punctuation is processed first, then digits/articles/contractions, and
    any commas that survived are dropped. Results are memoized because the
    same answer strings recur many times across a dataset.
    """
    cleaned = process_punctuation(answer)
    normalized = process_digit_article(cleaned)
    return normalized.replace(",", "")
|
||
|
||
def get_data_slice(file_path: str) -> Tuple[slice, str]:
    """Split an optional ``[start:stop]`` suffix off ``file_path``.

    Returns a ``(slice, path)`` pair. A path such as
    ``"questions.json[5000:]"`` yields ``(slice(5000, None), "questions.json")``;
    a path with no slice annotation yields the all-inclusive ``slice(None)``
    together with the path unchanged.
    """
    match = re.match(r"(.*)\[([0-9:]*)]", file_path)
    if match is None:
        # No slice annotation: take everything.
        return slice(None, None, None), file_path
    base_path = match[1]
    # Empty slice components ("", as in "[5000:]") become None, exactly as
    # they would in a literal slice expression.
    parts = (int(piece) if piece else None for piece in match[2].split(":"))
    return slice(*parts), base_path
Oops, something went wrong.