Add automatic conversion of static heads when loaded via XModelWithHeads #181

Merged · 6 commits · Jun 11, 2021

2 changes: 1 addition & 1 deletion setup.cfg
@@ -47,5 +47,5 @@ multi_line_output = 3
use_parentheses = True

[flake8]
ignore = E203, E501, E741, W503, W605
ignore = E203, E501, E731, E741, W503, W605
max-line-length = 119
227 changes: 227 additions & 0 deletions src/transformers/adapters/head_utils.py
@@ -0,0 +1,227 @@
import logging
import re


logger = logging.getLogger(__name__)


STATIC_TO_FLEX_HEAD_MAP = {
# BERT
"BertForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 1,
"activation_function": None,
"use_pooler": True,
},
"layers": ["classifier"],
},
"BertForMultipleChoice": {
"config": {
"head_type": "multiple_choice",
"layers": 1,
"activation_function": None,
"use_pooler": True,
},
"layers": ["classifier"],
},
"BertForTokenClassification": {
"config": {
"head_type": "tagging",
"layers": 1,
"activation_function": None,
},
"layers": ["classifier"],
},
"BertForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# RoBERTa
"RobertaForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 2,
"activation_function": "tanh",
"use_pooler": False,
},
"layers": ["classifier.dense", "classifier.out_proj"],
},
"RobertaForMultipleChoice": {
"config": {
"head_type": "multiple_choice",
"layers": 1,
"activation_function": None,
"use_pooler": True,
},
"layers": ["classifier"],
},
"RobertaForTokenClassification": {
"config": {
"head_type": "tagging",
"layers": 1,
"activation_function": None,
},
"layers": ["classifier"],
},
"RobertaForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# XLM-RoBERTa
"XLMRobertaForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 2,
"activation_function": "tanh",
"use_pooler": False,
},
"layers": ["classifier.dense", "classifier.out_proj"],
},
"XLMRobertaForMultipleChoice": {
"config": {
"head_type": "multiple_choice",
"layers": 1,
"activation_function": None,
"use_pooler": True,
},
"layers": ["classifier"],
},
"XLMRobertaForTokenClassification": {
"config": {
"head_type": "tagging",
"layers": 1,
"activation_function": None,
},
"layers": ["classifier"],
},
"XLMRobertaForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# BART
"BartForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 2,
"activation_function": "tanh",
},
"layers": ["classification_head.dense", "classification_head.out_proj"],
},
"BartForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# MBART
"MBartForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 2,
"activation_function": "tanh",
},
"layers": ["classification_head.dense", "classification_head.out_proj"],
},
"MBartForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# DistilBERT
"DistilBertForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 2,
"activation_function": "relu",
},
"layers": ["pre_classifier", "classifier"],
},
"DistilBertForMultipleChoice": {
"config": {
"head_type": "multiple_choice",
"layers": 2,
"activation_function": "relu",
},
"layers": ["pre_classifier", "classifier"],
},
"DistilBertForTokenClassification": {
"config": {
"head_type": "tagging",
"layers": 1,
"activation_function": None,
},
"layers": ["classifier"],
},
"DistilBertForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# GPT-2
"GPT2ForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 1,
"activation_function": None,
"bias": False,
},
"layers": ["score"],
},
}


def _regex_list_rename_func(k, rename_list):
for o, n in rename_list:
match = re.match(o, k)
if match:
return n.format(match.group(1))
return k


def get_head_config_and_rename_list(model_class_name, head_name, label2id, num_labels=None):
if label2id is None:
logger.warning(
"No valid map of labels in label2id. Falling back to default (num_labels=2). This may cause errors during loading!"
)
label2id = {"LABEL_" + str(i): i for i in range(2)}
# num_labels is optional (e.g. for regression, when no map given)
num_labels = num_labels or len(label2id)
data = STATIC_TO_FLEX_HEAD_MAP[model_class_name]
# config
config = data["config"]
if config["head_type"] == "multiple_choice":
config["num_choices"] = num_labels
else:
config["num_labels"] = num_labels
config["label2id"] = label2id
# rename
rename_list = []
i = 0
for name in data["layers"]:
escaped_name = re.escape(name)
rename_list.append((rf"{escaped_name}\.(\S+)", f"heads.{head_name}.{i+1}.{{0}}"))
i += 3 if config["activation_function"] else 2 # there's always a dropout layer in between
rename_func = lambda k, rename_list=rename_list: _regex_list_rename_func(k, rename_list)

return config, rename_func
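
For illustration, a minimal usage sketch of the new helper; the head name, label map, and example keys below are assumptions for the example, not part of the diff:

# Hypothetical example: convert the static head of a BertForSequenceClassification
# checkpoint into a flex head named "sentiment".
from transformers.adapters.head_utils import get_head_config_and_rename_list

label2id = {"negative": 0, "positive": 1}
config, rename_func = get_head_config_and_rename_list(
    "BertForSequenceClassification", "sentiment", label2id
)
# config combines the entry from STATIC_TO_FLEX_HEAD_MAP with the label info:
# {"head_type": "classification", "layers": 1, "activation_function": None,
#  "use_pooler": True, "num_labels": 2, "label2id": {"negative": 0, "positive": 1}}
# rename_func rewrites static checkpoint keys to the flex head's module names,
# while unrelated keys pass through unchanged:
rename_func("classifier.weight")          # -> "heads.sentiment.1.weight"
rename_func("bert.pooler.dense.weight")   # -> unchanged

The per-layer offsets of 2 or 3 in the rename list account for the Dropout (and optional activation) modules that the flex head's build() inserts between the Linear layers, so the renamed indices line up with the resulting nn.Sequential.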
52 changes: 38 additions & 14 deletions src/transformers/adapters/heads.py
@@ -33,19 +33,20 @@ def __init__(self, name):
def build(self, model):
model_config = model.config
pred_head = []
bias = self.config.get("bias", True)
for l in range(self.config["layers"]):
pred_head.append(nn.Dropout(model_config.hidden_dropout_prob))
if l < self.config["layers"] - 1:
pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size))
pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size, bias=bias))
if self.config["activation_function"]:
pred_head.append(Activation_Function_Class(self.config["activation_function"]))
else:
if "num_labels" in self.config:
pred_head.append(nn.Linear(model_config.hidden_size, self.config["num_labels"]))
pred_head.append(nn.Linear(model_config.hidden_size, self.config["num_labels"], bias=bias))
elif "num_choices" in self.config: # used for multiple_choice head
pred_head.append(nn.Linear(model_config.hidden_size, 1))
pred_head.append(nn.Linear(model_config.hidden_size, 1, bias=bias))
else:
pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size))
pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size, bias=bias))
if self.config["activation_function"]:
pred_head.append(Activation_Function_Class(self.config["activation_function"]))
for i, module in enumerate(pred_head):
@@ -64,19 +65,27 @@ def __init__(
layers=2,
activation_function="tanh",
id2label=None,
use_pooler=False,
bias=True,
):
super().__init__(head_name)
self.config = {
"head_type": "classification",
"num_labels": num_labels,
"layers": layers,
"activation_function": activation_function,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label else None,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None,
"use_pooler": use_pooler,
"bias": bias,
}
self.build(model)

def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs):
cls_output = cls_output if cls_output is not None else outputs[0][:, 0]
if cls_output is None:
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
else:
cls_output = outputs[0][:, 0]
logits = super().forward(cls_output)
loss = None
labels = kwargs.pop("labels", None)
@@ -125,19 +134,27 @@ def __init__(
layers=2,
activation_function="tanh",
id2label=None,
use_pooler=False,
bias=True,
):
super().__init__(head_name)
self.config = {
"head_type": "multilabel_classification",
"num_labels": num_labels,
"layers": layers,
"activation_function": activation_function,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label else None,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None,
"use_pooler": use_pooler,
"bias": bias,
}
self.build(model)

def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs):
cls_output = cls_output if cls_output is not None else outputs[0][:, 0]
if cls_output is None:
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
else:
cls_output = outputs[0][:, 0]
logits = super().forward(cls_output)
loss = None
labels = kwargs.pop("labels", None)
@@ -183,19 +200,25 @@ def __init__(
layers=2,
activation_function="tanh",
id2label=None,
use_pooler=False,
):
super().__init__(head_name)
self.config = {
"head_type": "multiple_choice",
"num_choices": num_choices,
"layers": layers,
"activation_function": activation_function,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label else None,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None,
"use_pooler": use_pooler,
}
self.build(model)

def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=None, **kwargs):
cls_output = cls_output if cls_output is not None else outputs[0][:, 0]
if cls_output is None:
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
else:
cls_output = outputs[0][:, 0]
logits = super().forward(cls_output)
logits = logits.view(-1, self.config["num_choices"])
loss = None
@@ -234,7 +257,7 @@ def __init__(
"num_labels": num_labels,
"layers": layers,
"activation_function": activation_function,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label else None,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None,
}
self.build(model)

@@ -286,7 +309,7 @@ def __init__(
"num_labels": num_labels,
"layers": layers,
"activation_function": activation_function,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label else None,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None,
}
self.build(model)

@@ -356,6 +379,7 @@ class ModelWithFlexibleHeadsAdaptersMixin(ModelWithHeadsAdaptersMixin):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._convert_to_flex_head = True
if not hasattr(self.config, "custom_heads"):
self.config.custom_heads = {}
self._active_heads = []
@@ -373,9 +397,9 @@ def add_prediction_head_from_config(self, head_name, config, overwrite_ok=False)
head_type = config.pop("head_type")
# handle cases when id2label, label2id or both are available
id2label = config.pop("id2label", None)
if not id2label:
if id2label is None:
label2id = config.pop("label2id", None)
if label2id:
if label2id is not None:
id2label = {id_: label for label, id_ in label2id.items()}
else:
# don't pass label2id to head_class
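
End to end, the change is meant to let a checkpoint trained with a static head be loaded into a ModelWithHeads class without losing the head weights. A rough sketch under assumed names (the checkpoint and the printed attribute are illustrative, not taken from this diff):

from transformers import BertModelWithHeads

# Loading a BertForSequenceClassification checkpoint into the flexible-heads class
# should now convert its static classifier into a flex classification head
# rather than discarding those weights.
model = BertModelWithHeads.from_pretrained("textattack/bert-base-uncased-SST-2")
print(model.heads)   # expected to contain the converted classification head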