Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Improve transformer_squad reader to avoid duplicate tokenization of context in training #263

Merged
merged 2 commits into from
May 17, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 26 additions & 13 deletions allennlp_models/rc/dataset_readers/transformer_squad.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def _read(self, file_path: str):
for article in dataset:
for paragraph_json in article["paragraphs"]:
context = paragraph_json["context"]
cached_tokenized_context = self._tokenize_context(context)
for question_answer in self.shard_iterable(paragraph_json["qas"]):
answers = [answer_json["text"] for answer_json in question_answer["answers"]]

Expand All @@ -163,6 +164,7 @@ def _read(self, file_path: str):
first_answer_offset=first_answer_offset,
always_add_answer_span=True,
is_training=True,
cached_tokenized_context=cached_tokenized_context,
)
instances_yielded = 0
for instance in instances:
Expand All @@ -179,19 +181,7 @@ def _read(self, file_path: str):
100 * questions_with_more_than_one_instance / yielded_question_count,
)

def make_instances(
self,
qid: str,
question: str,
answers: List[str],
context: str,
first_answer_offset: Optional[int],
always_add_answer_span: bool = False,
is_training: bool = False,
) -> Iterable[Instance]:
"""
Create training instances from a SQuAD example.
"""
def _tokenize_context(self, context: str) -> List[Token]:
# tokenize context by spaces first, and then with the wordpiece tokenizer
# For RoBERTa, this produces a bug where every token is marked as beginning-of-sentence. To fix it, we
# detect whether a space comes before a word, and if so, add "a " in front of the word.
Expand Down Expand Up @@ -220,6 +210,29 @@ def tokenize_slice(start: int, end: int) -> Iterable[Token]:
if wordpiece.idx is not None:
wordpiece.idx += token_start
tokenized_context.append(wordpiece)
return tokenized_context

def make_instances(
self,
qid: str,
question: str,
answers: List[str],
context: str,
first_answer_offset: Optional[int],
always_add_answer_span: bool = False,
is_training: bool = False,
cached_tokenized_context: Optional[List[Token]] = None,
) -> Iterable[Instance]:
"""
Create training instances from a SQuAD example.
"""
if cached_tokenized_context is not None:
# In training, we will use the same context in multiple instances, so we use
# cached_tokenized_context to avoid duplicate tokenization
tokenized_context = cached_tokenized_context
else:
# In prediction, no cached_tokenized_context is provided, so we tokenize context here
tokenized_context = self._tokenize_context(context)

if first_answer_offset is None:
(token_answer_span_start, token_answer_span_end) = (-1, -1)
Expand Down