This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Fix transformers vocab not being saved to files #4023

Open · wants to merge 3 commits into base: main
Changes from 2 commits
36 changes: 9 additions & 27 deletions allennlp/data/token_indexers/pretrained_transformer_indexer.py
@@ -28,7 +28,7 @@ class PretrainedTransformerIndexer(TokenIndexer):

model_name : `str`
The name of the `transformers` model to use.
namespace : `str`, optional (default=`tags`)
namespace : `str`, optional (default=`from_transformers`)
We will add the tokens in the pytorch_transformer vocabulary to this vocabulary namespace.
We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
Contributor:
Suggested change
We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
We use `from_transformers` as the default namespace, which gets special treatment in our
`Vocabulary` class to not get padding or UNK tokens.

Actually, how are UNK and padding handled here? Is it possible to set them using the transformer? We probably do want UNK handling and padding tokens; we just need them set in the right way, both here and during loading. This might need some additional changes to how we save and load vocabularies.

And, while we're at it, we probably want the padding and unk tokens to be namespace-dependent.
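
For context on the reviewer's question: the transformer tokenizer already exposes its own padding and UNK tokens, so in principle they could be read from it rather than relying on AllenNLP's defaults. A minimal inspection sketch, assuming the HuggingFace `transformers` library (not part of this PR):

```python
from transformers import AutoTokenizer

# The tokenizer carries its own special tokens and their ids.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.pad_token, tokenizer.pad_token_id)  # "[PAD]" 0
print(tokenizer.unk_token, tokenizer.unk_token_id)  # "[UNK]" 100
```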

tokens to this namespace, which would break on loading because we wouldn't find our default
@@ -41,7 +41,11 @@ class PretrainedTransformerIndexer(TokenIndexer):
"""

def __init__(
self, model_name: str, namespace: str = "tags", max_length: int = None, **kwargs
self,
model_name: str,
namespace: str = "from_transformers",
max_length: int = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
self._namespace = namespace
@@ -65,35 +69,13 @@ def __init__(
def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
"""
Copies tokens from ```transformers``` model to the specified namespace.
Transformers vocab is taken from the <vocab>/<encoder> keys of the tokenizer object.
"""
if self._added_to_vocabulary:
return

# Find the key under which the vocab is hiding.
for vocab_field_name in ["vocab", "encoder", "sp_model"]:
if hasattr(self._tokenizer, vocab_field_name):
break
else:
vocab_field_name = None

if vocab_field_name is not None:
pretrained_vocab = getattr(self._tokenizer, vocab_field_name)
if vocab_field_name == "sp_model":
for idx in range(len(pretrained_vocab)):
word = pretrained_vocab.id_to_piece(idx)
vocab._token_to_index[self._namespace][word] = idx
vocab._index_to_token[self._namespace][idx] = word
else:
for word, idx in pretrained_vocab.items():
vocab._token_to_index[self._namespace][word] = idx
vocab._index_to_token[self._namespace][idx] = word
else:
logger.warning(
"""Wasn't able to fetch vocabulary from pretrained transformers lib.
Neither <vocab> nor <encoder> are the valid fields for vocab.
Your tokens will still be correctly indexed, but vocabulary file will not be saved."""
)
vocab.extend_from_precomputed_encoding(
self._tokenizer.get_vocab(), self._namespace, resave_to_files=True
)

self._added_to_vocabulary = True

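The replacement relies on `tokenizer.get_vocab()`, which returns the full token-to-id mapping for any HuggingFace tokenizer, regardless of whether the tokenizer stores it under `vocab`, `encoder`, or `sp_model`. A rough illustration of that call (the model name is an arbitrary example):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
pretrained_vocab = tokenizer.get_vocab()  # Dict[str, int]
print(len(pretrained_vocab))  # vocabulary size, 50265 for roberta-base
```

This is why the indexer no longer has to probe those attributes by hand.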
28 changes: 27 additions & 1 deletion allennlp/data/vocabulary.py
@@ -21,7 +21,7 @@

logger = logging.getLogger(__name__)

DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels")
DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels", "*from_transformers")
DEFAULT_PADDING_TOKEN = "@@PADDING@@"
DEFAULT_OOV_TOKEN = "@@UNKNOWN@@"
NAMESPACE_PADDING_FILE = "non_padded_namespaces.txt"
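
The leading `*` makes the rule apply to any namespace whose name ends with `from_transformers`, mirroring the existing `*tags` and `*labels` patterns, so the new namespace never receives padding or OOV tokens. A simplified sketch of the matching logic (AllenNLP's actual helper is `namespace_match` in `allennlp.common.util`):

```python
def namespace_match(pattern: str, namespace: str) -> bool:
    # "*foo" matches any namespace ending in "foo"; otherwise require equality.
    if pattern.startswith("*") and namespace.endswith(pattern[1:]):
        return True
    return pattern == namespace

print(namespace_match("*from_transformers", "from_transformers"))  # True
print(namespace_match("*from_transformers", "tokens"))             # False
```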
@@ -223,6 +223,7 @@ def __init__(
self._padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
self._oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN

self._last_save_directory = None
self._non_padded_namespaces = set(non_padded_namespaces)

self._token_to_index = _TokenToIndexDefaultDict(
@@ -603,6 +604,8 @@ def save_to_files(self, directory: str) -> None:
for i in range(start_index, num_tokens):
print(mapping[i].replace("\n", "@@NEWLINE@@"), file=token_file)

self._last_save_directory = directory

def is_padded(self, namespace: str) -> bool:
"""
Returns whether or not there are padding and OOV tokens added to the given namespace.
@@ -627,6 +630,29 @@ def add_token_to_namespace(self, token: str, namespace: str = "tokens") -> int:
else:
return self._token_to_index[namespace][token]

def extend_from_precomputed_encoding(
self,
precomputed_encoding: Dict[str, int],
namespace: str = "from_transformers",
resave_to_files: bool = False,
) -> None:
"""
Populates the given namespace with a precomputed encoding (a token-to-index mapping),
for example from a pretrained transformer.
We also optionally re-save the vocabulary to files, since this method can be called
after the initial vocabulary construction and saving procedure.
"""
for word, idx in precomputed_encoding.items():
self._token_to_index[namespace][word] = idx
self._index_to_token[namespace][idx] = word

if resave_to_files:
if self._last_save_directory is not None:
self.save_to_files(self._last_save_directory)
else:
logger.warning(
"Vocabulary folder on disk is missing; the newly populated namespace will not be saved to files."
)

def add_tokens_to_namespace(self, tokens: List[str], namespace: str = "tokens") -> List[int]:
"""
Adds `tokens` to the index, if they are not already present. Either way, we return the
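A short usage sketch of the API added by this PR (the directory and model name are illustrative only):

```python
from allennlp.data import Vocabulary
from transformers import AutoTokenizer

vocab = Vocabulary()
vocab.save_to_files("/tmp/vocab")  # records the directory in _last_save_directory

encoding = AutoTokenizer.from_pretrained("bert-base-uncased").get_vocab()
vocab.extend_from_precomputed_encoding(encoding, resave_to_files=True)
# Because the vocabulary was saved once already, the "from_transformers"
# namespace is re-saved to /tmp/vocab alongside the existing namespaces.
```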