This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit
Avoid duplicate tokenization of context in training (#263)
Co-authored-by: magialiao <[email protected]>
Co-authored-by: Pete <[email protected]>
3 people authored May 17, 2021
1 parent dc633f1 commit dea182c
Showing 1 changed file with 26 additions and 13 deletions.
allennlp_models/rc/dataset_readers/transformer_squad.py
@@ -146,6 +146,7 @@ def _read(self, file_path: str):
         for article in dataset:
             for paragraph_json in article["paragraphs"]:
                 context = paragraph_json["context"]
+                cached_tokenized_context = self._tokenize_context(context)
                 for question_answer in self.shard_iterable(paragraph_json["qas"]):
                     answers = [answer_json["text"] for answer_json in question_answer["answers"]]

@@ -163,6 +164,7 @@ def _read(self, file_path: str):
                         first_answer_offset=first_answer_offset,
                         always_add_answer_span=True,
                         is_training=True,
+                        cached_tokenized_context=cached_tokenized_context,
                     )
                     instances_yielded = 0
                     for instance in instances:
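The change itself is small: every question in a paragraph's `qas` list shares the same context, so `_read` now tokenizes that context once and hands the result to `make_instances`. A minimal sketch of the pattern, with hypothetical `tokenize` and `make_examples` names standing in for the real reader machinery:

from typing import Dict, List


def tokenize(text: str) -> List[str]:
    # Stand-in for the expensive wordpiece tokenization step.
    return text.split()


def make_examples(paragraphs: List[Dict]) -> List[Dict]:
    examples = []
    for paragraph in paragraphs:
        # Tokenize the shared context once per paragraph ...
        cached_tokens = tokenize(paragraph["context"])
        for qa in paragraph["qas"]:
            # ... and reuse the cached result for every question that draws
            # on that context, instead of re-tokenizing it each time.
            examples.append({"question": qa["question"], "context_tokens": cached_tokens})
    return examples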
@@ -179,19 +181,7 @@ def _read(self, file_path: str):
                 100 * questions_with_more_than_one_instance / yielded_question_count,
             )

-    def make_instances(
-        self,
-        qid: str,
-        question: str,
-        answers: List[str],
-        context: str,
-        first_answer_offset: Optional[int],
-        always_add_answer_span: bool = False,
-        is_training: bool = False,
-    ) -> Iterable[Instance]:
-        """
-        Create training instances from a SQuAD example.
-        """
+    def _tokenize_context(self, context: str) -> List[Token]:
         # tokenize context by spaces first, and then with the wordpiece tokenizer
         # For RoBERTa, this produces a bug where every token is marked as beginning-of-sentence. To fix it, we
         # detect whether a space comes before a word, and if so, add "a " in front of the word.
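The comment refers to a quirk of byte-level BPE tokenizers such as RoBERTa's: a word tokenized in isolation gets its sentence-initial wordpieces, which differ from the space-prefixed form it would have mid-sentence. The workaround is to prepend "a " before tokenizing and then drop the wordpieces contributed by the "a". A rough illustration of that trick against the HuggingFace tokenizer directly (hypothetical helper; roberta-base assumed):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")


def mid_sentence_wordpieces(word: str) -> list:
    # Prepending "a " forces the space-prefixed (mid-sentence) encoding of
    # `word`; the wordpieces produced by the leading "a" are then dropped.
    prefix_len = len(tokenizer.tokenize("a"))
    return tokenizer.tokenize("a " + word)[prefix_len:]


print(tokenizer.tokenize("extravaganza"))       # sentence-initial form
print(mid_sentence_wordpieces("extravaganza"))  # space-prefixed form ("Ġ..." pieces)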
@@ -220,6 +210,29 @@ def tokenize_slice(start: int, end: int) -> Iterable[Token]:
                 if wordpiece.idx is not None:
                     wordpiece.idx += token_start
                 tokenized_context.append(wordpiece)
+        return tokenized_context
+
+    def make_instances(
+        self,
+        qid: str,
+        question: str,
+        answers: List[str],
+        context: str,
+        first_answer_offset: Optional[int],
+        always_add_answer_span: bool = False,
+        is_training: bool = False,
+        cached_tokenized_context: Optional[List[Token]] = None,
+    ) -> Iterable[Instance]:
+        """
+        Create training instances from a SQuAD example.
+        """
+        if cached_tokenized_context is not None:
+            # In training, we will use the same context in multiple instances, so we use
+            # cached_tokenized_context to avoid duplicate tokenization
+            tokenized_context = cached_tokenized_context
+        else:
+            # In prediction, no cached_tokenized_context is provided, so we tokenize context here
+            tokenized_context = self._tokenize_context(context)

         if first_answer_offset is None:
             (token_answer_span_start, token_answer_span_end) = (-1, -1)
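Because `cached_tokenized_context` defaults to `None`, callers outside the training loop (e.g. a predictor building one instance per request) are unaffected: `make_instances` simply tokenizes the context itself. A sketch of both call paths, with illustrative values; the constructor argument is assumed from the reader's defaults:

from allennlp_models.rc.dataset_readers.transformer_squad import TransformerSquadReader

# Assumed construction; transformer_model_name follows the reader's default style.
reader = TransformerSquadReader(transformer_model_name="bert-base-cased")
context = "The quick brown fox jumps over the lazy dog."

# Training path: the context is tokenized once and reused for every
# question in the paragraph via the new keyword argument.
cached = reader._tokenize_context(context)
train_instances = list(
    reader.make_instances(
        qid="q1",
        question="What jumps over the dog?",
        answers=["The quick brown fox"],
        context=context,
        first_answer_offset=0,
        always_add_answer_span=True,
        is_training=True,
        cached_tokenized_context=cached,
    )
)

# Prediction path: no cache is passed, so make_instances calls
# _tokenize_context internally, exactly as before this commit.
predict_instances = list(
    reader.make_instances(
        qid="q1",
        question="What jumps over the dog?",
        answers=[],
        context=context,
        first_answer_offset=None,
    )
)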