From b58e58ced54ed8396babc284fd5310dd174e93a8 Mon Sep 17 00:00:00 2001
From: magialiao
Date: Sat, 15 May 2021 14:09:37 +0800
Subject: [PATCH] Avoid duplicate tokenization of context in training

---
 .../rc/dataset_readers/transformer_squad.py | 39 ++++++++++++-------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/allennlp_models/rc/dataset_readers/transformer_squad.py b/allennlp_models/rc/dataset_readers/transformer_squad.py
index 00fc528ce..e98acf2d6 100644
--- a/allennlp_models/rc/dataset_readers/transformer_squad.py
+++ b/allennlp_models/rc/dataset_readers/transformer_squad.py
@@ -146,6 +146,7 @@ def _read(self, file_path: str):
         for article in dataset:
             for paragraph_json in article["paragraphs"]:
                 context = paragraph_json["context"]
+                cached_tokenized_context = self._tokenize_context(context)
                 for question_answer in self.shard_iterable(paragraph_json["qas"]):
                     answers = [answer_json["text"] for answer_json in question_answer["answers"]]
 
@@ -163,6 +164,7 @@
                         first_answer_offset=first_answer_offset,
                         always_add_answer_span=True,
                         is_training=True,
+                        cached_tokenized_context=cached_tokenized_context,
                     )
                     instances_yielded = 0
                     for instance in instances:
@@ -179,19 +181,7 @@
                 100 * questions_with_more_than_one_instance / yielded_question_count,
             )
 
-    def make_instances(
-        self,
-        qid: str,
-        question: str,
-        answers: List[str],
-        context: str,
-        first_answer_offset: Optional[int],
-        always_add_answer_span: bool = False,
-        is_training: bool = False,
-    ) -> Iterable[Instance]:
-        """
-        Create training instances from a SQuAD example.
-        """
+    def _tokenize_context(self, context: str) -> List[Token]:
         # tokenize context by spaces first, and then with the wordpiece tokenizer
         # For RoBERTa, this produces a bug where every token is marked as beginning-of-sentence. To fix it, we
         # detect whether a space comes before a word, and if so, add "a " in front of the word.
@@ -220,6 +210,29 @@ def tokenize_slice(start: int, end: int) -> Iterable[Token]:
             if wordpiece.idx is not None:
                 wordpiece.idx += token_start
             tokenized_context.append(wordpiece)
+        return tokenized_context
+
+    def make_instances(
+        self,
+        qid: str,
+        question: str,
+        answers: List[str],
+        context: str,
+        first_answer_offset: Optional[int],
+        always_add_answer_span: bool = False,
+        is_training: bool = False,
+        cached_tokenized_context: Optional[List[Token]] = None,
+    ) -> Iterable[Instance]:
+        """
+        Create training instances from a SQuAD example.
+        """
+        if cached_tokenized_context is not None:
+            # In training, we will use the same context in multiple instances, so we use
+            # cached_tokenized_context to avoid duplicate tokenization
+            tokenized_context = cached_tokenized_context
+        else:
+            # In prediction, no cached_tokenized_context is provided, so we tokenize context here
+            tokenized_context = self._tokenize_context(context)
 
         if first_answer_offset is None:
             (token_answer_span_start, token_answer_span_end) = (-1, -1)
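
Note: the optimization above is the classic "hoist invariant work out of the inner loop" pattern: in SQuAD, one paragraph context is shared by many questions, so tokenizing it once per paragraph instead of once per question saves repeated wordpiece tokenization in training, while prediction (which calls make_instances directly with no cache) is unchanged. The following is a minimal, self-contained sketch of that pattern, not the real TransformerSquadReader API; the Reader class, its toy tokenizer, and the read() helper are hypothetical stand-ins.

from typing import Dict, Iterable, List, Optional


class Reader:
    def _tokenize_context(self, context: str) -> List[str]:
        # Stand-in for the expensive wordpiece tokenization in the patch.
        print(f"tokenizing: {context!r}")
        return context.split()

    def make_instances(
        self,
        question: str,
        context: str,
        cached_tokenized_context: Optional[List[str]] = None,
    ) -> Iterable[List[str]]:
        # Training passes a cached tokenization; prediction passes None and
        # pays the tokenization cost on every call, mirroring the patch.
        if cached_tokenized_context is not None:
            tokenized_context = cached_tokenized_context
        else:
            tokenized_context = self._tokenize_context(context)
        yield [question] + tokenized_context

    def read(self, paragraphs: List[Dict]) -> Iterable[List[str]]:
        for paragraph in paragraphs:
            context = paragraph["context"]
            # Hoisted out of the per-question loop: one tokenization per context.
            cached = self._tokenize_context(context)
            for qa in paragraph["qas"]:
                yield from self.make_instances(
                    qa["question"], context, cached_tokenized_context=cached
                )


if __name__ == "__main__":
    data = [{"context": "a shared passage", "qas": [{"question": "q1"}, {"question": "q2"}]}]
    for instance in Reader().read(data):
        print(instance)  # "tokenizing:" prints once, not once per question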