From 33746cec6b355d87c66de7236814c8c489b34c48 Mon Sep 17 00:00:00 2001
From: Maksym Del
Date: Sat, 4 Apr 2020 18:18:30 +0300
Subject: [PATCH 1/3] Fix transformers vocab not being saved to files

---
 .../pretrained_transformer_indexer.py | 36 +++++--------------
 allennlp/data/vocabulary.py           | 28 ++++++++++++++-
 2 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/allennlp/data/token_indexers/pretrained_transformer_indexer.py b/allennlp/data/token_indexers/pretrained_transformer_indexer.py
index 5ca2ffe4ad7..47a818e1597 100644
--- a/allennlp/data/token_indexers/pretrained_transformer_indexer.py
+++ b/allennlp/data/token_indexers/pretrained_transformer_indexer.py
@@ -28,7 +28,7 @@ class PretrainedTransformerIndexer(TokenIndexer):
 
     model_name : `str`
         The name of the `transformers` model to use.
-    namespace : `str`, optional (default=`tags`)
+    namespace : `str`, optional (default=`from_transformeres`)
         We will add the tokens in the pytorch_transformer vocabulary to this vocabulary namespace.
         We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
         tokens to this namespace, which would break on loading because we wouldn't find our default
@@ -41,7 +41,11 @@ class PretrainedTransformerIndexer(TokenIndexer):
     """
 
     def __init__(
-        self, model_name: str, namespace: str = "tags", max_length: int = None, **kwargs
+        self,
+        model_name: str,
+        namespace: str = "from_transformeres",
+        max_length: int = None,
+        **kwargs,
     ) -> None:
         super().__init__(**kwargs)
         self._namespace = namespace
@@ -65,35 +69,13 @@ def __init__(
     def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
         """
         Copies tokens from ```transformers``` model to the specified namespace.
-        Transformers vocab is taken from the <vocab>/<encoder> keys of the tokenizer object.
         """
         if self._added_to_vocabulary:
             return
 
-        # Find the key under which the vocab is hiding.
-        for vocab_field_name in ["vocab", "encoder", "sp_model"]:
-            if hasattr(self._tokenizer, vocab_field_name):
-                break
-        else:
-            vocab_field_name = None
-
-        if vocab_field_name is not None:
-            pretrained_vocab = getattr(self._tokenizer, vocab_field_name)
-            if vocab_field_name == "sp_model":
-                for idx in range(len(pretrained_vocab)):
-                    word = pretrained_vocab.id_to_piece(idx)
-                    vocab._token_to_index[self._namespace][word] = idx
-                    vocab._index_to_token[self._namespace][idx] = word
-            else:
-                for word, idx in pretrained_vocab.items():
-                    vocab._token_to_index[self._namespace][word] = idx
-                    vocab._index_to_token[self._namespace][idx] = word
-        else:
-            logger.warning(
-                """Wasn't able to fetch vocabulary from pretrained transformers lib.
-                Neither <vocab> nor <encoder> are the valid fields for vocab.
-                Your tokens will still be correctly indexed, but vocabulary file will not be saved."""
-            )
+        vocab.extend_from_precomputed_encoding(
+            self._tokenizer.get_vocab(), self._namespace, resave_to_files=True
+        )
 
         self._added_to_vocabulary = True
 
diff --git a/allennlp/data/vocabulary.py b/allennlp/data/vocabulary.py
index 470e700f404..327fa58a996 100644
--- a/allennlp/data/vocabulary.py
+++ b/allennlp/data/vocabulary.py
@@ -21,7 +21,7 @@
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels")
+DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels", "*from_transformers")
 DEFAULT_PADDING_TOKEN = "@@PADDING@@"
 DEFAULT_OOV_TOKEN = "@@UNKNOWN@@"
 NAMESPACE_PADDING_FILE = "non_padded_namespaces.txt"
@@ -223,6 +223,7 @@ def __init__(
 
         self._padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
         self._oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
+        self._last_save_directory = None
 
         self._non_padded_namespaces = set(non_padded_namespaces)
         self._token_to_index = _TokenToIndexDefaultDict(
@@ -603,6 +604,8 @@ def save_to_files(self, directory: str) -> None:
             for i in range(start_index, num_tokens):
                 print(mapping[i].replace("\n", "@@NEWLINE@@"), file=token_file)
 
+        self._last_save_directory = directory
+
     def is_padded(self, namespace: str) -> bool:
         """
         Returns whether or not there are padding and OOV tokens added to the given namespace.
@@ -627,6 +630,29 @@ def add_token_to_namespace(self, token: str, namespace: str = "tokens") -> int:
         else:
             return self._token_to_index[namespace][token]
 
+    def extend_from_precomputed_encoding(
+        self,
+        precomputed_encoding: Dict[str, int],
+        namespace: str = "from_transformers",
+        resave_to_files: bool = False,
+    ) -> None:
+        """
+        Populates given namespace with precomputed encoding, for example from pretrained transformers.
+        We also optionally resave vocabulary to files in this method since we understand it can be used
+        after our initial vocab construction and saving procedure.
+        """
+        for word, idx in precomputed_encoding.items():
+            self._token_to_index[namespace][word] = idx
+            self._index_to_token[namespace][idx] = word
+
+        if resave_to_files:
+            if self._last_save_directory is not None:
+                self.save_to_files(self._last_save_directory)
+            else:
+                logging.warning(
+                    "vocabulary folder on disk is missing, newly populated namespace will not be saved to files"
+                )
+
     def add_tokens_to_namespace(self, tokens: List[str], namespace: str = "tokens") -> List[int]:
         """
         Adds `tokens` to the index, if they are not already present. Either way, we return the

From d45fad695267a583565d7c607d5754fe908dc839 Mon Sep 17 00:00:00 2001
From: Maksym Del
Date: Sat, 4 Apr 2020 18:33:35 +0300
Subject: [PATCH 2/3] Fix typo

---
 .../data/token_indexers/pretrained_transformer_indexer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/allennlp/data/token_indexers/pretrained_transformer_indexer.py b/allennlp/data/token_indexers/pretrained_transformer_indexer.py
index 47a818e1597..42016d913b5 100644
--- a/allennlp/data/token_indexers/pretrained_transformer_indexer.py
+++ b/allennlp/data/token_indexers/pretrained_transformer_indexer.py
@@ -28,7 +28,7 @@ class PretrainedTransformerIndexer(TokenIndexer):
 
     model_name : `str`
         The name of the `transformers` model to use.
-    namespace : `str`, optional (default=`from_transformeres`)
+    namespace : `str`, optional (default=`from_transformers`)
         We will add the tokens in the pytorch_transformer vocabulary to this vocabulary namespace.
         We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
         tokens to this namespace, which would break on loading because we wouldn't find our default
@@ -43,7 +43,7 @@ class PretrainedTransformerIndexer(TokenIndexer):
     def __init__(
         self,
         model_name: str,
-        namespace: str = "from_transformeres",
+        namespace: str = "from_transformers",
         max_length: int = None,
         **kwargs,
     ) -> None:

From 21ecc86a7795ce2200eb37b3535442c58f46dfc5 Mon Sep 17 00:00:00 2001
From: Maksym Del
Date: Sat, 4 Apr 2020 20:26:34 +0300
Subject: [PATCH 3/3] Fix naming; merge

---
 .../pretrained_transformer_indexer.py |  2 +-
 allennlp/data/vocabulary.py           | 48 +++++++++++++++----------
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/allennlp/data/token_indexers/pretrained_transformer_indexer.py b/allennlp/data/token_indexers/pretrained_transformer_indexer.py
index 42016d913b5..2eacb832ce1 100644
--- a/allennlp/data/token_indexers/pretrained_transformer_indexer.py
+++ b/allennlp/data/token_indexers/pretrained_transformer_indexer.py
@@ -73,7 +73,7 @@ def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
         if self._added_to_vocabulary:
             return
 
-        vocab.extend_from_precomputed_encoding(
+        vocab.extend_from_dictionary(
             self._tokenizer.get_vocab(), self._namespace, resave_to_files=True
         )
 
diff --git a/allennlp/data/vocabulary.py b/allennlp/data/vocabulary.py
index 327fa58a996..525ed730baf 100644
--- a/allennlp/data/vocabulary.py
+++ b/allennlp/data/vocabulary.py
@@ -223,7 +223,7 @@ def __init__(
 
         self._padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
         self._oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
-        self._last_save_directory = None
+        self._last_save_directory: Optional[str] = None
 
         self._non_padded_namespaces = set(non_padded_namespaces)
         self._token_to_index = _TokenToIndexDefaultDict(
@@ -456,6 +456,29 @@ def extend_from_vocab(self, vocab: "Vocabulary") -> None:
             for token in vocab.get_token_to_index_vocabulary(namespace):
                 self.add_token_to_namespace(token, namespace)
 
+    def extend_from_dictionary(
+        self,
+        encoding_dictionary: Dict[str, int],
+        namespace: str = "from_transformers",
+        resave_to_files: bool = False,
+    ) -> None:
+        """
+        Populates given namespace with precomputed encoding, for example from pretrained transformers.
+        We also optionally resave vocabulary to files in this method since we understand it can be used
+        after our initial vocab construction and saving procedure.
+        """
+        for word, idx in encoding_dictionary.items():
+            self._token_to_index[namespace][word] = idx
+            self._index_to_token[namespace][idx] = word
+
+        if resave_to_files:
+            if self._last_save_directory is not None:
+                self.save_to_files(self._last_save_directory)
+            else:
+                logging.warning(
+                    "vocabulary folder on disk is missing, newly populated namespace will not be saved to files"
+                )
+
     def _extend(
         self,
         counter: Dict[str, Dict[str, int]] = None,
@@ -630,29 +653,6 @@ def add_token_to_namespace(self, token: str, namespace: str = "tokens") -> int:
         else:
             return self._token_to_index[namespace][token]
 
-    def extend_from_precomputed_encoding(
-        self,
-        precomputed_encoding: Dict[str, int],
-        namespace: str = "from_transformers",
-        resave_to_files: bool = False,
-    ) -> None:
-        """
-        Populates given namespace with precomputed encoding, for example from pretrained transformers.
-        We also optionally resave vocabulary to files in this method since we understand it can be used
-        after our initial vocab construction and saving procedure.
- """ - for word, idx in precomputed_encoding.items(): - self._token_to_index[namespace][word] = idx - self._index_to_token[namespace][idx] = word - - if resave_to_files: - if self._last_save_directory is not None: - self.save_to_files(self._last_save_directory) - else: - logging.warning( - "vocabulary folder on disk is missing, newly populated namespace will not be saved to files" - ) - def add_tokens_to_namespace(self, tokens: List[str], namespace: str = "tokens") -> List[int]: """ Adds `tokens` to the index, if they are not already present. Either way, we return the