Add automatic conversion of static heads when loaded via XModelWithHeads #181

Merged · 6 commits · Jun 11, 2021

2 changes: 1 addition & 1 deletion setup.cfg
@@ -47,5 +47,5 @@ multi_line_output = 3
use_parentheses = True

[flake8]
ignore = E203, E501, E741, W503, W605
ignore = E203, E501, E731, E741, W503, W605
max-line-length = 119
227 changes: 227 additions & 0 deletions src/transformers/adapters/head_utils.py
@@ -0,0 +1,227 @@
import logging
import re


logger = logging.getLogger(__name__)


STATIC_TO_FLEX_HEAD_MAP = {
# BERT
"BertForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 1,
"activation_function": None,
"use_pooler": True,
},
"layers": ["classifier"],
},
"BertForMultipleChoice": {
"config": {
"head_type": "multiple_choice",
"layers": 1,
"activation_function": None,
"use_pooler": True,
},
"layers": ["classifier"],
},
"BertForTokenClassification": {
"config": {
"head_type": "tagging",
"layers": 1,
"activation_function": None,
},
"layers": ["classifier"],
},
"BertForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# RoBERTa
"RobertaForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 2,
"activation_function": "tanh",
"use_pooler": False,
},
"layers": ["classifier.dense", "classifier.out_proj"],
},
"RobertaForMultipleChoice": {
"config": {
"head_type": "multiple_choice",
"layers": 1,
"activation_function": None,
"use_pooler": True,
},
"layers": ["classifier"],
},
"RobertaForTokenClassification": {
"config": {
"head_type": "tagging",
"layers": 1,
"activation_function": None,
},
"layers": ["classifier"],
},
"RobertaForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# XLM-RoBERTa
"XLMRobertaForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 2,
"activation_function": "tanh",
"use_pooler": False,
},
"layers": ["classifier.dense", "classifier.out_proj"],
},
"XLMRobertaForMultipleChoice": {
"config": {
"head_type": "multiple_choice",
"layers": 1,
"activation_function": None,
"use_pooler": True,
},
"layers": ["classifier"],
},
"XLMRobertaForTokenClassification": {
"config": {
"head_type": "tagging",
"layers": 1,
"activation_function": None,
},
"layers": ["classifier"],
},
"XLMRobertaForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# BART
"BartForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 2,
"activation_function": "tanh",
},
"layers": ["classification_head.dense", "classification_head.out_proj"],
},
"BartForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# MBART
"MBartForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 2,
"activation_function": "tanh",
},
"layers": ["classification_head.dense", "classification_head.out_proj"],
},
"MBartForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# DistilBERT
"DistilBertForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 2,
"activation_function": "relu",
},
"layers": ["pre_classifier", "classifier"],
},
"DistilBertForMultipleChoice": {
"config": {
"head_type": "multiple_choice",
"layers": 2,
"activation_function": "relu",
},
"layers": ["pre_classifier", "classifier"],
},
"DistilBertForTokenClassification": {
"config": {
"head_type": "tagging",
"layers": 1,
"activation_function": None,
},
"layers": ["classifier"],
},
"DistilBertForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": ["qa_outputs"],
},
# GPT-2
"GPT2ForSequenceClassification": {
"config": {
"head_type": "classification",
"layers": 1,
"activation_function": None,
"bias": False,
},
"layers": ["score"],
},
}


def _regex_list_rename_func(k, rename_list):
for o, n in rename_list:
match = re.match(o, k)
if match:
return n.format(match.group(1))
return k


def get_head_config_and_rename_list(model_class_name, head_name, label2id, num_labels=None):
if label2id is None:
logger.warning(
"No valid map of labels in label2id. Falling back to default (num_labels=2). This may cause errors during loading!"
)
label2id = {"LABEL_" + str(i): i for i in range(2)}
# num_labels is optional (e.g. for regression, when no map given)
num_labels = num_labels or len(label2id)
data = STATIC_TO_FLEX_HEAD_MAP[model_class_name]
# config
config = data["config"]
if config["head_type"] == "multiple_choice":
config["num_choices"] = num_labels
else:
config["num_labels"] = num_labels
config["label2id"] = label2id
# rename
rename_list = []
i = 0
for name in data["layers"]:
escaped_name = re.escape(name)
rename_list.append((rf"{escaped_name}\.(\S+)", f"heads.{head_name}.{i+1}.{{0}}"))
i += 3 if config["activation_function"] else 2 # there's always a dropout layer in between
rename_func = lambda k, rename_list=rename_list: _regex_list_rename_func(k, rename_list)

return config, rename_func
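
For illustration, a minimal usage sketch of the new helper; the head name, label map, and example keys below are assumptions for the example, not part of the diff:

# Hypothetical example: convert the static head of a BertForSequenceClassification
# checkpoint into a flex head named "sentiment".
from transformers.adapters.head_utils import get_head_config_and_rename_list

label2id = {"negative": 0, "positive": 1}
config, rename_func = get_head_config_and_rename_list(
    "BertForSequenceClassification", "sentiment", label2id
)
# config combines the entry from STATIC_TO_FLEX_HEAD_MAP with the label info:
# {"head_type": "classification", "layers": 1, "activation_function": None,
#  "use_pooler": True, "num_labels": 2, "label2id": {"negative": 0, "positive": 1}}
# rename_func rewrites static checkpoint keys to the flex head's module names,
# while unrelated keys pass through unchanged:
rename_func("classifier.weight")          # -> "heads.sentiment.1.weight"
rename_func("bert.pooler.dense.weight")   # -> unchanged

The per-layer offsets of 2 or 3 in the rename list account for the Dropout (and optional activation) modules that the flex head's build() inserts between the Linear layers, so the renamed indices line up with the resulting nn.Sequential.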
52 changes: 38 additions & 14 deletions src/transformers/adapters/heads.py
@@ -33,19 +33,20 @@ def __init__(self, name):
def build(self, model):
model_config = model.config
pred_head = []
bias = self.config.get("bias", True)
for l in range(self.config["layers"]):
pred_head.append(nn.Dropout(model_config.hidden_dropout_prob))
if l < self.config["layers"] - 1:
pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size))
pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size, bias=bias))
if self.config["activation_function"]:
pred_head.append(Activation_Function_Class(self.config["activation_function"]))
else:
if "num_labels" in self.config:
pred_head.append(nn.Linear(model_config.hidden_size, self.config["num_labels"]))
pred_head.append(nn.Linear(model_config.hidden_size, self.config["num_labels"], bias=bias))
elif "num_choices" in self.config: # used for multiple_choice head
pred_head.append(nn.Linear(model_config.hidden_size, 1))
pred_head.append(nn.Linear(model_config.hidden_size, 1, bias=bias))
else:
pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size))
pred_head.append(nn.Linear(model_config.hidden_size, model_config.hidden_size, bias=bias))
if self.config["activation_function"]:
pred_head.append(Activation_Function_Class(self.config["activation_function"]))
for i, module in enumerate(pred_head):
@@ -64,19 +65,27 @@ def __init__(
layers=2,
activation_function="tanh",
id2label=None,
use_pooler=False,
bias=True,
):
super().__init__(head_name)
self.config = {
"head_type": "classification",
"num_labels": num_labels,
"layers": layers,
"activation_function": activation_function,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label else None,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None,
"use_pooler": use_pooler,
"bias": bias,
}
self.build(model)

def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs):
cls_output = cls_output if cls_output is not None else outputs[0][:, 0]
if cls_output is None:
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
else:
cls_output = outputs[0][:, 0]
logits = super().forward(cls_output)
loss = None
labels = kwargs.pop("labels", None)
@@ -125,19 +134,27 @@ def __init__(
layers=2,
activation_function="tanh",
id2label=None,
use_pooler=False,
bias=True,
):
super().__init__(head_name)
self.config = {
"head_type": "multilabel_classification",
"num_labels": num_labels,
"layers": layers,
"activation_function": activation_function,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label else None,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None,
"use_pooler": use_pooler,
"bias": bias,
}
self.build(model)

def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=False, **kwargs):
cls_output = cls_output if cls_output is not None else outputs[0][:, 0]
if cls_output is None:
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
else:
cls_output = outputs[0][:, 0]
logits = super().forward(cls_output)
loss = None
labels = kwargs.pop("labels", None)
@@ -183,19 +200,25 @@ def __init__(
layers=2,
activation_function="tanh",
id2label=None,
use_pooler=False,
):
super().__init__(head_name)
self.config = {
"head_type": "multiple_choice",
"num_choices": num_choices,
"layers": layers,
"activation_function": activation_function,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label else None,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None,
"use_pooler": use_pooler,
}
self.build(model)

def forward(self, outputs, cls_output=None, attention_mask=None, return_dict=None, **kwargs):
cls_output = cls_output if cls_output is not None else outputs[0][:, 0]
if cls_output is None:
if self.config["use_pooler"]:
cls_output = kwargs.pop("pooled_output")
else:
cls_output = outputs[0][:, 0]
logits = super().forward(cls_output)
logits = logits.view(-1, self.config["num_choices"])
loss = None
@@ -234,7 +257,7 @@ def __init__(
"num_labels": num_labels,
"layers": layers,
"activation_function": activation_function,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label else None,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None,
}
self.build(model)

@@ -286,7 +309,7 @@ def __init__(
"num_labels": num_labels,
"layers": layers,
"activation_function": activation_function,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label else None,
"label2id": {label: id_ for id_, label in id2label.items()} if id2label is not None else None,
}
self.build(model)

@@ -356,6 +379,7 @@ class ModelWithFlexibleHeadsAdaptersMixin(ModelWithHeadsAdaptersMixin):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._convert_to_flex_head = True
if not hasattr(self.config, "custom_heads"):
self.config.custom_heads = {}
self._active_heads = []
@@ -373,9 +397,9 @@ def add_prediction_head_from_config(self, head_name, config, overwrite_ok=False)
head_type = config.pop("head_type")
# handle cases when id2label, label2id or both are available
id2label = config.pop("id2label", None)
if not id2label:
if id2label is None:
label2id = config.pop("label2id", None)
if label2id:
if label2id is not None:
id2label = {id_: label for label, id_ in label2id.items()}
else:
# don't pass label2id to head_class
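
End to end, the change is meant to let a checkpoint trained with a static head be loaded into a ModelWithHeads class without losing the head weights. A rough sketch under assumed names (the checkpoint and the printed attribute are illustrative, not taken from this diff):

from transformers import BertModelWithHeads

# Loading a BertForSequenceClassification checkpoint into the flexible-heads class
# should now convert its static classifier into a flex classification head
# rather than discarding those weights.
model = BertModelWithHeads.from_pretrained("textattack/bert-base-uncased-SST-2")
print(model.heads)   # expected to contain the converted classification head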