Fix transformers vocab not being saved to files #4023
base: main
@@ -21,7 +21,7 @@
 logger = logging.getLogger(__name__)

-DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels")
+DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels", "*from_transformers")
 DEFAULT_PADDING_TOKEN = "@@PADDING@@"
 DEFAULT_OOV_TOKEN = "@@UNKNOWN@@"
 NAMESPACE_PADDING_FILE = "non_padded_namespaces.txt"
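Why the one-line change above is enough to keep padding and OOV tokens out of the transformer namespace: non-padded namespaces are matched by suffix wildcards, so any namespace ending in "from_transformers" is treated as non-padded. A minimal sketch of that matching logic (a paraphrase of the semantics of allennlp.common.util.namespace_match, not the library source):

def namespace_match(pattern: str, namespace: str) -> bool:
    # A leading "*" makes the pattern a suffix wildcard: "*tags" matches
    # "passage_tags"; otherwise the pattern must equal the namespace exactly.
    if pattern.startswith("*") and namespace.endswith(pattern[1:]):
        return True
    return pattern == namespace

# With "*from_transformers" in DEFAULT_NON_PADDED_NAMESPACES, any namespace
# ending in "from_transformers" skips the @@PADDING@@/@@UNKNOWN@@ entries:
assert namespace_match("*from_transformers", "from_transformers")
assert namespace_match("*from_transformers", "tags_from_transformers")
assert not namespace_match("*from_transformers", "tokens")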
@@ -223,6 +223,7 @@ def __init__(
         self._padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
         self._oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN

+        self._last_save_directory: Optional[str] = None
         self._non_padded_namespaces = set(non_padded_namespaces)

         self._token_to_index = _TokenToIndexDefaultDict(
@@ -455,6 +456,29 @@ def extend_from_vocab(self, vocab: "Vocabulary") -> None:
             for token in vocab.get_token_to_index_vocabulary(namespace):
                 self.add_token_to_namespace(token, namespace)
+
+    def extend_from_dictionary(
+        self,
+        encoding_dictionary: Dict[str, int],
+        namespace: str = "from_transformers",
+        resave_to_files: bool = False,

Review comment (on `resave_to_files`): This is getting to be a bigger change, but it's probably necessary to really fix this the right way. But I think this function should take arguments for setting padding and OOV tokens, and those should set them in a namespace-specific way. And the padding and OOV tokens would then need to be serialized with the vocabulary and read from the serialized files. Then we wouldn't need the […]. I would wait to do any work on this, though, until @dirkgr has had a chance to glance at it and see if he agrees. I think this is the right approach, but I could be missing something.

+    ) -> None:
""" | ||
Populates given namespace with precomputed encoding, for example from pretrained transformers. | ||
We also optionally resave vocabulary to files in this method since we understand it can be used | ||
after our initial vocab construction and saving procedure. | ||
""" | ||
+        for word, idx in encoding_dictionary.items():
+            self._token_to_index[namespace][word] = idx
+            self._index_to_token[namespace][idx] = word
+
+        if resave_to_files:
+            if self._last_save_directory is not None:
+                self.save_to_files(self._last_save_directory)
+            else:
+                logger.warning(
+                    "vocabulary folder on disk is missing; the newly populated namespace "
+                    "will not be saved to files"
+                )
+
     def _extend(
         self,
         counter: Dict[str, Dict[str, int]] = None,
@@ -603,6 +627,8 @@ def save_to_files(self, directory: str) -> None:
             for i in range(start_index, num_tokens):
                 print(mapping[i].replace("\n", "@@NEWLINE@@"), file=token_file)

+        self._last_save_directory = directory
+
     def is_padded(self, namespace: str) -> bool:
         """
         Returns whether or not there are padding and OOV tokens added to the given namespace.

Review comment (on `is_padded`): Actually, how are UNK and padding handled here? Is it possible to set them using the transformer? We probably do want UNK handling and padding tokens, we just need them set in the right way, both here and during loading. This might need some additional changes to how we save and load vocabularies. And, while we're at it, we probably want the padding and UNK tokens to be namespace-dependent.
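One hypothetical shape for the reviewers' suggestion: namespace-specific padding/OOV tokens passed into `extend_from_dictionary` and stored so they could be serialized with the vocabulary. Everything here beyond the PR's own signature (the extra parameters, the `_namespace_padding_tokens` / `_namespace_oov_tokens` attributes, the toy class) is invented for illustration and is not part of the PR.

from collections import defaultdict
from typing import Dict, Optional

class VocabularySketch:
    """Toy stand-in for allennlp's Vocabulary, just to make the sketch run."""

    def __init__(self) -> None:
        self._token_to_index: Dict[str, Dict[str, int]] = defaultdict(dict)
        self._index_to_token: Dict[str, Dict[int, str]] = defaultdict(dict)
        # Hypothetical: per-namespace special tokens, which save_to_files()
        # would serialize and from_files() would restore.
        self._namespace_padding_tokens: Dict[str, str] = {}
        self._namespace_oov_tokens: Dict[str, str] = {}

    def extend_from_dictionary(
        self,
        encoding_dictionary: Dict[str, int],
        namespace: str = "from_transformers",
        padding_token: Optional[str] = None,  # e.g. "[PAD]" for BERT
        oov_token: Optional[str] = None,      # e.g. "[UNK]" for BERT
    ) -> None:
        for word, idx in encoding_dictionary.items():
            self._token_to_index[namespace][word] = idx
            self._index_to_token[namespace][idx] = word
        # Record the transformer's own special tokens for this namespace,
        # instead of relying on the global @@PADDING@@/@@UNKNOWN@@ defaults.
        if padding_token is not None:
            self._namespace_padding_tokens[namespace] = padding_token
        if oov_token is not None:
            self._namespace_oov_tokens[namespace] = oov_token

Under this design, `is_padded` and the loading path would consult the per-namespace mappings first, falling back to the global defaults, which would remove the need for the wildcard non-padded namespace workaround.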