Merge pull request #1 from adapter-hub/master
merge latest changes of master branch
jannik-brinkmann committed Oct 2, 2022
2 parents e36b429 + ca32936 commit 54f3a00
Showing 59 changed files with 632 additions and 75 deletions.
7 changes: 4 additions & 3 deletions README.md
@@ -36,7 +36,7 @@ Thus, most files in this repository are direct copies from the HuggingFace Trans

## Installation

-`adapter-transformers` currently supports **Python 3.6+** and **PyTorch 1.3.1+**.
+`adapter-transformers` currently supports **Python 3.7+** and **PyTorch 1.3.1+**.
After [installing PyTorch](https://pytorch.org/get-started/locally/), you can install `adapter-transformers` from PyPI ...

```
@@ -74,10 +74,11 @@ Currently, adapter-transformers integrates all architectures and methods listed
| AdapterDrop | [Rücklé et al. (2021)](https://arxiv.org/pdf/2010.11918.pdf) | [Notebook](https://colab.research.google.com/github/Adapter-Hub/adapter-transformers/blob/master/notebooks/05_Adapter_Drop_Training.ipynb) |
| MAD-X 2.0,<br> Embedding training | [Pfeiffer et al. (2021)](https://arxiv.org/pdf/2012.15562.pdf) | [Docs: Embeddings](https://docs.adapterhub.ml/embeddings.html), [Notebook](https://colab.research.google.com/github/Adapter-Hub/adapter-transformers/blob/master/notebooks/08_NER_Wikiann.ipynb) |
| Prefix Tuning | [Li and Liang (2021)](https://arxiv.org/pdf/2101.00190.pdf) | [Docs](https://docs.adapterhub.ml/overview.html#prefix-tuning) |
-| Parallel adapters,<br> Mix-and-Match adapters | [He et al. (2021)](https://arxiv.org/pdf/2110.04366.pdf) | [Docs](https://docs.adapterhub.ml/overview.html#combinations-mix-and-match-adapters) |
+| Parallel adapters,<br> Mix-and-Match adapters | [He et al. (2021)](https://arxiv.org/pdf/2110.04366.pdf) | [Docs](https://docs.adapterhub.ml/overview.html#mix-and-match-adapters) |
| Compacter | [Mahabadi et al. (2021)](https://arxiv.org/pdf/2106.04647.pdf) | [Docs](https://docs.adapterhub.ml/overview.html#compacter) |
| LoRA | [Hu et al. (2021)](https://arxiv.org/pdf/2106.09685.pdf) | [Docs](https://docs.adapterhub.ml/overview.html#lora) |
-| (IA)^3 | [Liu et al. (2022)](https://arxiv.org/pdf/2205.05638.pdf) | [Docs](https://docs.adapterhub.ml/overview.html#ia3) |
+| (IA)^3 | [Liu et al. (2022)](https://arxiv.org/pdf/2205.05638.pdf) | [Docs](https://docs.adapterhub.ml/overview.html#ia-3) |
| UniPELT | [Mao et al. (2022)](https://arxiv.org/pdf/2110.07577.pdf) | [Docs](https://docs.adapterhub.ml/overview.html#unipelt) |

## Supported Models

26 changes: 26 additions & 0 deletions adapter_docs/adapter_composition.md
@@ -109,6 +109,32 @@ To learn how training an _AdapterFusion_ layer works, check out [this Colab note
In v1.x of `adapter-transformers`, fusing adapters was done using a nested list of adapter names, i.e. the example from above would be defined as `[["d", "e", "f"]]`.
For backwards compatibility, you can still do this, although it is recommended to use the new syntax.
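
A minimal sketch of the new syntax (assuming `model` is an `adapter-transformers` model and the adapters `d`, `e` and `f`, plus a fusion layer over them, have already been added):

```python
import transformers.adapters.composition as ac

# activate the fusion of the three adapters via the Fuse composition block;
# the fusion layer itself would have been added earlier, e.g. with model.add_adapter_fusion(...)
model.set_active_adapters(ac.Fuse("d", "e", "f"))
```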

#### Retrieving AdapterFusion attentions

Finally, it is possible to retrieve the attention scores computed by each fusion layer in a forward pass of the model.
These scores can be used for analyzing the fused adapter blocks and can serve as the basis for visualizations similar to those in the AdapterFusion paper.
You can collect the fusion attention scores by passing `output_adapter_fusion_attentions=True` to the model forward call.
The scores for each layer will then be saved in the `adapter_fusion_attentions` attribute of the output:

```python
outputs = model(**inputs, output_adapter_fusion_attentions=True)
attention_scores = outputs.adapter_fusion_attentions
```
Note that this parameter is only available to base model classes and [AdapterModel classes](prediction_heads.md#adaptermodel-classes).
In the example, `attention_scores` holds a dictionary of the following form:
```
{
    '<fusion_name>': {
        <layer_id>: {
            '<module_location>': np.array([...]),
            ...
        },
        ...
    },
    ...
}
```
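
As a rough sketch of how these scores might be inspected (assuming `attention_scores` was obtained as above and that each array is shaped `(batch_size, sequence_length, num_adapters)`), one could average the attention each fused adapter receives per layer and location:

```python
import numpy as np

for fusion_name, layers in attention_scores.items():
    for layer_id, locations in layers.items():
        for location, scores in locations.items():
            # mean over batch and sequence dimensions -> one value per fused adapter
            per_adapter = np.asarray(scores).mean(axis=(0, 1))
            print(fusion_name, layer_id, location, per_adapter)
```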

## `Split`

```{eval-rst}
3 changes: 3 additions & 0 deletions adapter_docs/classes/adapter_config.rst
@@ -65,6 +65,9 @@ Combined configurations
.. autoclass:: transformers.MAMConfig
:members:

.. autoclass:: transformers.UniPELTConfig
:members:

Adapter Fusion
~~~~~~~~~~~~~~~

2 changes: 1 addition & 1 deletion adapter_docs/conf.py
@@ -26,7 +26,7 @@
docs_versions = [
"adapters1.1.1",
"adapters2.3.0",
"adapters3.0.1",
"adapters3.1.0",
]


Binary file added adapter_docs/img/compacter.png
Binary file added adapter_docs/img/ia3.png
Binary file added adapter_docs/img/lora.png
Binary file added adapter_docs/img/prefix.png
Binary file added adapter_docs/img/unipelt.png
2 changes: 1 addition & 1 deletion adapter_docs/installation.md
@@ -1,7 +1,7 @@
# Installation

Our *adapter-transformers* package is a drop-in replacement for Huggingface's *transformers* library.
-It currently supports Python 3.6+ and PyTorch 1.3.1+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first.
+It currently supports Python 3.7+ and PyTorch 1.3.1+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first.

```{eval-rst}
.. important::
118 changes: 117 additions & 1 deletion adapter_docs/overview.md
@@ -130,6 +130,15 @@ _Papers:_

_Configuration class_: [`PrefixTuningConfig`](transformers.PrefixTuningConfig)

```{eval-rst}
.. figure:: img/prefix.png
    :height: 300
    :align: center
    :alt: Illustration of Prefix Tuning.

    Illustration of the Prefix Tuning method within one Transformer layer. Trained components are colored in shades of magenta.
```

Prefix Tuning ([Li and Liang, 2021](https://aclanthology.org/2021.acl-long.353.pdf)) introduces new parameters in the multi-head attention blocks in each Transformer layer.
More specifically, it prepends trainable prefix vectors $P^K$ and $P^V$ to the keys and values of the attention head input, each of a configurable prefix length $l$ (`prefix_length` attribute):

@@ -162,6 +171,15 @@ _Papers:_

_Configuration class_: [`CompacterConfig`](transformers.CompacterConfig), [`CompacterPlusPlusConfig`](transformers.CompacterPlusPlusConfig)

```{eval-rst}
.. figure:: img/compacter.png
    :height: 300
    :align: center
    :alt: Illustration of Compacter.

    Illustration of the Compacter method within one Transformer layer. Trained components are colored in shades of magenta.
```

The Compacter architecture proposed by [Mahabadi et al., 2021](https://arxiv.org/pdf/2106.04647.pdf)
is similar to the bottleneck adapter architecture, but replaces the linear down- and
up-projection with a PHM layer. Unlike the linear layer, the PHM layer constructs its weight matrix from two smaller matrices, which reduces the number of parameters.
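
As a minimal usage sketch (assuming `model` is an `adapter-transformers` model instance), a Compacter module can be added and activated for training like any other adapter method:

```python
from transformers.adapters import CompacterConfig

# add a Compacter module with default settings to the (assumed) model
config = CompacterConfig()
model.add_adapter("compacter_adapter", config=config)
# freeze the pre-trained weights and activate the new adapter for training
model.train_adapter("compacter_adapter")
```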
@@ -187,6 +205,15 @@ _Papers:_

_Configuration class_: [`LoRAConfig`](transformers.LoRAConfig)

```{eval-rst}
.. figure:: img/lora.png
    :height: 300
    :align: center
    :alt: Illustration of LoRA.

    Illustration of the LoRA method within one Transformer layer. Trained components are colored in shades of magenta.
```

Low-Rank Adaptation (LoRA) is an efficient fine-tuning technique proposed by [Hu et al. (2021)](https://arxiv.org/pdf/2106.09685.pdf).
LoRA injects trainable low-rank decomposition matrices into the layers of a pre-trained model.
For any model layer expressed as a matrix multiplication of the form $h = W_0 x$, it therefore performs a reparameterization, such that:
@@ -229,6 +256,15 @@ _Papers:_

_Configuration class_: [`IA3Config`](transformers.IA3Config)

```{eval-rst}
.. figure:: img/ia3.png
    :height: 300
    :align: center
    :alt: Illustration of (IA)^3.

    Illustration of the (IA)^3 method within one Transformer layer. Trained components are colored in shades of magenta.
```

_Infused Adapter by Inhibiting and Amplifying Inner Activations ((IA)^3)_ is an efficient fine-tuning method proposed within the _T-Few_ fine-tuning approach by [Liu et al. (2022)](https://arxiv.org/pdf/2205.05638.pdf).
(IA)^3 introduces trainable vectors $l_W$ into different components of a Transformer model, which perform element-wise rescaling of inner model activations.
For any model layer expressed as a matrix multiplication of the form $h = W x$, it therefore performs an element-wise multiplication with $l_W$, such that:
@@ -271,7 +307,7 @@ model.reset_adapter("ia3_adapter")
_Papers:_
- [Few-Shot Parameter-Efficient Fine-Tuning is Better and Cheaper than In-Context Learning](https://arxiv.org/pdf/2205.05638.pdf) (Liu et al., 2022)

-## Combinations - Mix-and-Match Adapters
+## Method Combinations

_Configuration class_: [`ConfigUnion`](transformers.ConfigUnion)

@@ -290,6 +326,10 @@ config = ConfigUnion(
model.add_adapter("union_adapter", config=config)
```

### Mix-and-Match Adapters

_Configuration class_: [`MAMConfig`](transformers.MAMConfig)

[He et al. (2021)](https://arxiv.org/pdf/2110.04366.pdf) study various variants and combinations of efficient fine-tuning methods.
Among others, they propose _Mix-and-Match Adapters_ as a combination of Prefix Tuning and parallel bottleneck adapters.
This configuration is supported by adapter-transformers out-of-the-box:
@@ -315,3 +355,79 @@ model.add_adapter("mam_adapter", config=config)

_Papers:_
- [Towards a Unified View of Parameter-Efficient Transfer Learning](https://arxiv.org/pdf/2110.04366.pdf) (He et al., 2021)

### UniPELT

_Configuration class_: [`UniPELTConfig`](transformers.UniPELTConfig)

```{eval-rst}
.. figure:: img/unipelt.png
    :height: 300
    :align: center
    :alt: Illustration of UniPELT.

    Illustration of the UniPELT method within one Transformer layer. Trained components are colored in shades of magenta.
```

An approach similar to the work of [He et al. (2021)](https://arxiv.org/pdf/2110.04366.pdf) is taken by [Mao et al. (2022)](https://arxiv.org/pdf/2110.07577.pdf) in their _UniPELT_ framework.
They, too, combine multiple efficient fine-tuning methods, namely LoRA, Prefix Tuning and bottleneck adapters, in a single unified setup.
_UniPELT_ additionally introduces a gating mechanism that controls the activation of the different submodules.

Concretely, for each adapted module $m$, UniPELT adds a trainable gating value $\mathcal{G}_m \in (0, 1)$ that is computed via a feed-forward network ($W_{\mathcal{G}_m}$) and sigmoid activation ($\sigma$) from the Transformer layer input states ($x$):

$$\mathcal{G}_m \leftarrow \sigma(W_{\mathcal{G}_m} \cdot x)$$

These gating values are then used to scale the output activations of the injected adapter modules, e.g. for a LoRA layer:

$$
h \leftarrow W_0 x + \mathcal{G}_{LoRA} B A x
$$
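
As a toy PyTorch illustration of this gating idea (shapes and variable names are made up for the example; this is not the `adapter-transformers` implementation):

```python
import torch

hidden = torch.randn(1, 4, 16)          # layer input x: (batch, seq, hidden)
frozen_out = torch.randn(1, 4, 16)      # stands in for the frozen projection W_0 x
lora_update = torch.randn(1, 4, 16)     # stands in for the LoRA branch B A x
W_gate = torch.nn.Linear(16, 1)         # feed-forward W_G producing one gate per position
gate = torch.sigmoid(W_gate(hidden))    # G_LoRA in (0, 1), shape (1, 4, 1)
h = frozen_out + gate * lora_update     # h <- W_0 x + G_LoRA * B A x
```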

In the configuration classes of `adapter-transformers`, these gating mechanisms can be activated via `use_gating=True`.
The full UniPELT setup can be instantiated using `UniPELTConfig`[^unipelt]:

[^unipelt]: Note that the implementation of UniPELT in `adapter-transformers` follows the implementation in the original code, which is slightly different from the description in the paper. See [here](https://github.com/morningmoni/UniPELT/issues/1) for more details.

```python
from transformers.adapters import UniPELTConfig

config = UniPELTConfig()
model.add_adapter("unipelt", config=config)
```

which is identical to the following `ConfigUnion`:

```python
from transformers.adapters import ConfigUnion, LoRAConfig, PrefixTuningConfig, PfeifferConfig

config = ConfigUnion(
    LoRAConfig(r=8, use_gating=True),
    PrefixTuningConfig(prefix_length=10, use_gating=True),
    PfeifferConfig(reduction_factor=16, use_gating=True),
)
model.add_adapter("unipelt", config=config)
```

Finally, as the gating values for each adapter module might provide interesting insights for analysis, `adapter-transformers` comes with an integrated mechanism for returning all gating values computed during a model forward pass via the `output_adapter_gating_scores` parameter:

```python
outputs = model(**inputs, output_adapter_gating_scores=True)
gating_scores = outputs.adapter_gating_scores
```
Note that this parameter is only available to base model classes and [AdapterModel classes](prediction_heads.md#adaptermodel-classes).
In the example, `gating_scores` holds a dictionary of the following form:
```
{
    '<adapter_name>': {
        <layer_id>: {
            '<module_location>': np.array([...]),
            ...
        },
        ...
    },
    ...
}
```

_Papers:_
- [UNIPELT: A Unified Framework for Parameter-Efficient Language Model Tuning](https://arxiv.org/pdf/2110.07577.pdf) (Mao et al., 2022)
4 changes: 2 additions & 2 deletions setup.py
@@ -116,7 +116,7 @@
"fugashi>=1.0",
"GitPython<3.1.19",
"hf-doc-builder>=0.3.0",
"huggingface-hub>=0.1.0,<0.8.0",
"huggingface-hub>=0.1.0,<1.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"isort>=5.5.4",
@@ -417,7 +417,7 @@ def run(self):

setup(
name="adapter-transformers",
version="3.1.0a1",
version="3.1.0",
author="Jonas Pfeiffer, Andreas Rücklé, Clifton Poth, Hannah Sterz, based on work by the HuggingFace team and community",
author_email="[email protected]",
description="A friendly fork of HuggingFace's Transformers, adding Adapters to PyTorch language models",
4 changes: 3 additions & 1 deletion src/transformers/__init__.py
@@ -22,7 +22,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).

__version__ = "4.21.2"
__version__ = "4.21.3"

from typing import TYPE_CHECKING

@@ -2063,6 +2063,7 @@
"StaticAdapterFusionConfig",
"T5AdapterModel",
"T5ModelWithHeads",
"UniPELTConfig",
"ViTAdapterModel",
"XLMRobertaAdapterModel",
"XLMRobertaModelWithHeads",
@@ -4602,6 +4603,7 @@
StaticAdapterFusionConfig,
T5AdapterModel,
T5ModelWithHeads,
UniPELTConfig,
ViTAdapterModel,
XLMRobertaAdapterModel,
XLMRobertaModelWithHeads,
4 changes: 3 additions & 1 deletion src/transformers/adapters/__init__.py
@@ -16,7 +16,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "3.1.0a1"
__version__ = "3.1.0"

from typing import TYPE_CHECKING

@@ -57,6 +57,7 @@
"PfeifferInvConfig",
"PrefixTuningConfig",
"StaticAdapterFusionConfig",
"UniPELTConfig",
],
"context": [
"AdapterSetup",
@@ -176,6 +177,7 @@
PfeifferInvConfig,
PrefixTuningConfig,
StaticAdapterFusionConfig,
UniPELTConfig,
)
from .context import AdapterSetup, ForwardContext
from .heads import (