This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Fix transformers vocab not being saved to files #4023

Open
wants to merge 3 commits into main
allennlp/data/token_indexers/pretrained_transformer_indexer.py (36 changes: 9 additions & 27 deletions)

@@ -28,7 +28,7 @@ class PretrainedTransformerIndexer(TokenIndexer):

     model_name : `str`
         The name of the `transformers` model to use.
-    namespace : `str`, optional (default=`tags`)
+    namespace : `str`, optional (default=`from_transformers`)
         We will add the tokens in the pytorch_transformer vocabulary to this vocabulary namespace.
         We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
         tokens to this namespace, which would break on loading because we wouldn't find our default

Contributor:

Suggested change:
-        We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
+        We use `from_transformers` as the default namespace, which gets special treatment in our
+        `Vocabulary` class to not get padding or UNK tokens.

Actually, how are UNK and padding handled here? Is it possible to set them using the transformer? We probably do want UNK handling and padding tokens, we just need them set in the right way, both here and during loading. This might need some additional changes to how we save and load vocabularies.

And, while we're at it, we probably want the padding and unk tokens to be namespace-dependent.
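
As a sketch of that idea (an illustration, not part of the PR or this comment): HuggingFace tokenizers expose their special tokens as attributes, so namespace-dependent padding and UNK could plausibly be sourced from the tokenizer itself. The `set_padding_token`/`set_oov_token` helpers below are hypothetical; `Vocabulary` has no such methods today.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # HuggingFace tokenizers expose their special tokens directly.
    pad_token = tokenizer.pad_token  # e.g. "[PAD]"
    unk_token = tokenizer.unk_token  # e.g. "[UNK]"

    # Hypothetical namespace-aware setters (not in the AllenNLP API):
    # vocab.set_padding_token(pad_token, namespace="from_transformers")
    # vocab.set_oov_token(unk_token, namespace="from_transformers")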

@@ -41,7 +41,11 @@ class PretrainedTransformerIndexer(TokenIndexer):
"""

def __init__(
self, model_name: str, namespace: str = "tags", max_length: int = None, **kwargs
self,
model_name: str,
namespace: str = "from_transformers",
max_length: int = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
self._namespace = namespace

@@ -65,35 +69,13 @@ def __init__(
     def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
         """
         Copies tokens from ```transformers``` model to the specified namespace.
-        Transformers vocab is taken from the <vocab>/<encoder> keys of the tokenizer object.
         """
         if self._added_to_vocabulary:
             return

-        # Find the key under which the vocab is hiding.
-        for vocab_field_name in ["vocab", "encoder", "sp_model"]:
-            if hasattr(self._tokenizer, vocab_field_name):
-                break
-        else:
-            vocab_field_name = None
-
-        if vocab_field_name is not None:
-            pretrained_vocab = getattr(self._tokenizer, vocab_field_name)
-            if vocab_field_name == "sp_model":
-                for idx in range(len(pretrained_vocab)):
-                    word = pretrained_vocab.id_to_piece(idx)
-                    vocab._token_to_index[self._namespace][word] = idx
-                    vocab._index_to_token[self._namespace][idx] = word
-            else:
-                for word, idx in pretrained_vocab.items():
-                    vocab._token_to_index[self._namespace][word] = idx
-                    vocab._index_to_token[self._namespace][idx] = word
-        else:
-            logger.warning(
-                """Wasn't able to fetch vocabulary from pretrained transformers lib.
-                Neither <vocab> nor <encoder> are the valid fields for vocab.
-                Your tokens will still be correctly indexed, but vocabulary file will not be saved."""
-            )
+        vocab.extend_from_dictionary(
+            self._tokenizer.get_vocab(), self._namespace, resave_to_files=True
+        )

         self._added_to_vocabulary = True

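A minimal usage sketch of the behavior this diff enables, assuming `bert-base-uncased` and a scratch directory; in normal use `_add_encoding_to_vocabulary_if_needed` is invoked internally on the first `tokens_to_indices` call rather than directly:

    from allennlp.data import Vocabulary
    from allennlp.data.token_indexers import PretrainedTransformerIndexer

    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    vocab = Vocabulary()
    vocab.save_to_files("/tmp/vocab")  # the vocab now remembers this directory

    # Copies the transformer vocab into the "from_transformers" namespace; since
    # the indexer passes resave_to_files=True, the files under /tmp/vocab are
    # rewritten to include the new namespace.
    indexer._add_encoding_to_vocabulary_if_needed(vocab)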

allennlp/data/vocabulary.py (28 changes: 27 additions & 1 deletion)

@@ -21,7 +21,7 @@

 logger = logging.getLogger(__name__)

-DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels")
+DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels", "*from_transformers")
 DEFAULT_PADDING_TOKEN = "@@PADDING@@"
 DEFAULT_OOV_TOKEN = "@@UNKNOWN@@"
 NAMESPACE_PADDING_FILE = "non_padded_namespaces.txt"

@@ -223,6 +223,7 @@ def __init__(
         self._padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
         self._oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN

+        self._last_save_directory: Optional[str] = None
         self._non_padded_namespaces = set(non_padded_namespaces)

         self._token_to_index = _TokenToIndexDefaultDict(

@@ -455,6 +456,29 @@ def extend_from_vocab(self, vocab: "Vocabulary") -> None:
             for token in vocab.get_token_to_index_vocabulary(namespace):
                 self.add_token_to_namespace(token, namespace)

+    def extend_from_dictionary(
+        self,
+        encoding_dictionary: Dict[str, int],
+        namespace: str = "from_transformers",
+        resave_to_files: bool = False,

Contributor:

This is getting to be a bigger change, but it's probably necessary to really fix this the right way. But I think this function should take arguments for setting padding and oov tokens, and those should set them in a namespace-specific way. And the padding and oov tokens then would need to be serialized with the vocabulary and read from the serialized files. Then we wouldn't need the from_transformers namespace added to the non-padded namespaces.

I would wait to do any work on this, though, until @dirkgr has had a chance to glance at it and see if he agrees. I think this is the right approach, but I could be missing something.
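
Purely as an illustration of that proposal (nothing like this exists in the PR), the signature might grow along these lines:

    from typing import Dict, Optional

    # Hypothetical extension: namespace-specific padding/OOV tokens passed in
    # and later serialized alongside the vocabulary files.
    def extend_from_dictionary(
        self,
        encoding_dictionary: Dict[str, int],
        namespace: str = "from_transformers",
        padding_token: Optional[str] = None,  # e.g. tokenizer.pad_token
        oov_token: Optional[str] = None,  # e.g. tokenizer.unk_token
        resave_to_files: bool = False,
    ) -> None:
        ...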

+    ) -> None:
+        """
+        Populates the given namespace with a precomputed encoding, for example from
+        pretrained transformers. We also optionally resave the vocabulary to files in
+        this method, since it can be used after our initial vocab construction and
+        saving procedure.
+        """
+        for word, idx in encoding_dictionary.items():
+            self._token_to_index[namespace][word] = idx
+            self._index_to_token[namespace][idx] = word
+
+        if resave_to_files:
+            if self._last_save_directory is not None:
+                self.save_to_files(self._last_save_directory)
+            else:
+                logger.warning(
+                    "vocabulary folder on disk is missing, newly populated namespace "
+                    "will not be saved to files"
+                )

     def _extend(
         self,
         counter: Dict[str, Dict[str, int]] = None,

@@ -603,6 +627,8 @@ def save_to_files(self, directory: str) -> None:
         for i in range(start_index, num_tokens):
             print(mapping[i].replace("\n", "@@NEWLINE@@"), file=token_file)

+        self._last_save_directory = directory
+
     def is_padded(self, namespace: str) -> bool:
         """
         Returns whether or not there are padding and OOV tokens added to the given namespace.
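
A minimal end-to-end sketch of the `Vocabulary`-side change, assuming a HuggingFace tokenizer as the source of the token-to-id mapping and a scratch directory:

    from transformers import AutoTokenizer
    from allennlp.data import Vocabulary

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    vocab = Vocabulary()
    vocab.save_to_files("/tmp/vocab")  # save_to_files now records the directory

    # Populate a non-padded namespace from the tokenizer's token -> id mapping
    # and rewrite the files under /tmp/vocab to include it.
    vocab.extend_from_dictionary(
        tokenizer.get_vocab(),
        namespace="from_transformers",
        resave_to_files=True,
    )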