From 970c5d1b9ae827ac9bb20e435028fded216a753f Mon Sep 17 00:00:00 2001
From: hSterz <hsterz16@gmail.com>
Date: Thu, 16 Mar 2023 09:16:21 +0100
Subject: [PATCH] Squashed commit with changes since version 3.2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix resume_from_checkpoint (#514)

Add initialization of variable so invalid checkpoints throw an understandable error

Fix LoRA & (IA)³ implementation for Bart & MBart (#518)

Fixes a critical issue in the LoRA & (IA)³ implementation of Bart & MBart, where LoRA & (IA)³ weights were not added to the intermediate and output linear layers of the model's decoder blocks.

I.e., adapter configs having intermediate_lora=True or output_lora=True are added incorrectly to (M)Bart models. For LoRA, this does not affect the default config, for (IA)³ it does (intermediate_lora=True).

To ensure correct addition of weights in the future, get_adapter() tests are updated to count the number of modules added per adapter.

Fix python3.7 Compatibility (#510)

Compatibility with python3.8+ and pytorch1.12.1+

Restore compatibility in GPT-2 LoRALinear bias init (#525)

Fix compacter init weights (#516)

Update doc chapter "Getting Started" (#527)

Update version to 3.2.1

Fix Notebook01 Dataset column_rename (#543)

Update doc chapter "Adapter Methods" (#535)

Do not stale issues labeled as bugs (#550)
---
 .github/workflows/stale.yml                   |   4 +-
 .github/workflows/tests_torch.yml             |  18 +-
 README.md                                     |   2 +-
 adapter_docs/conf.py                          |   2 +-
 adapter_docs/installation.md                  |   6 +-
 adapter_docs/method_combinations.md           |  12 +-
 adapter_docs/methods.md                       |  50 +--
 adapter_docs/overview.md                      |  36 +-
 adapter_docs/quickstart.md                    |  60 ++--
 adapter_docs/training.md                      |  38 +-
 notebooks/01_Adapter_Training.ipynb           |   4 +-
 notebooks/03_Adapter_Fusion.ipynb             |   4 +-
 notebooks/04_Cross_Lingual_Transfer.ipynb     |   2 +-
 notebooks/05_Adapter_Drop_Training.ipynb      |   4 +-
 setup.py                                      |  11 +-
 src/transformers/adapters/__init__.py         |   2 +-
 src/transformers/adapters/lora.py             |  34 +-
 src/transformers/adapters/model_mixin.py      |  26 +-
 src/transformers/adapters/modeling.py         | 178 ++++++----
 src/transformers/adapters/trainer.py          |   1 +
 src/transformers/adapters/utils.py            |   3 +-
 .../adapters/wrappers/configuration.py        |   1 +
 src/transformers/dependency_versions_table.py |   4 +-
 src/transformers/models/bart/modeling_bart.py |   4 +-
 src/transformers/models/gpt2/modeling_gpt2.py |   9 +-
 .../models/mbart/modeling_mbart.py            |   4 +-
 tests_adapters/methods/base.py                |  14 +-
 tests_adapters/methods/test_adapter_common.py |  13 +-
 tests_adapters/methods/test_compacter.py      |   3 +-
 tests_adapters/methods/test_config_union.py   |  38 ++
 tests_adapters/methods/test_ia3.py            |   3 +-
 tests_adapters/methods/test_lora.py           |   3 +-
 tests_adapters/methods/test_prefix_tuning.py  |   9 +-
 tests_adapters/methods/test_unipelt.py        |   5 +-
 tests_adapters/test_adapter_heads.py          |  13 +-
 tests_adapters/test_adapter_trainer.py        | 336 ++++++++++--------
 tests_adapters/test_bart.py                   |   2 +
 tests_adapters/test_bert.py                   |   2 +
 tests_adapters/test_deberta.py                |   3 +-
 tests_adapters/test_debertaV2.py              |   2 +
 tests_adapters/test_distilbert.py             |   2 +
 tests_adapters/test_gpt2.py                   |   2 +
 tests_adapters/test_gptj.py                   |   2 +
 tests_adapters/test_roberta.py                |   2 +
 44 files changed, 590 insertions(+), 383 deletions(-)
 create mode 100644 tests_adapters/methods/test_config_union.py

diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index b8cd4c362..7a7e84a5c 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -12,10 +12,10 @@ jobs:
       issues: write
     
     steps:
-      - uses: actions/stale@v6
+      - uses: actions/stale@v8
         with:
           repo-token: "${{ secrets.BOT_TOKEN }}"
-          exempt-issue-labels: 'do-not-stale,enhancement'
+          exempt-issue-labels: 'do-not-stale,enhancement,bug'
           stale-issue-message: 'This issue has been automatically marked as stale because it has been without activity for 90 days. This issue will be closed in 14 days unless you comment or remove the stale label.'
           close-issue-message: 'This issue was closed because it was stale for 14 days without any activity.'
           days-before-issue-stale: 90
diff --git a/.github/workflows/tests_torch.yml b/.github/workflows/tests_torch.yml
index a13cf0fab..5bdc1e7a1 100644
--- a/.github/workflows/tests_torch.yml
+++ b/.github/workflows/tests_torch.yml
@@ -25,11 +25,11 @@ jobs:
   check_code_quality:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
         with:
           python-version: 3.8
-      - uses: actions/cache@v2
+      - uses: actions/cache@v3
         with:
           path: ~/.cache/pip
           key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
@@ -45,11 +45,11 @@ jobs:
     timeout-minutes: 60
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
         with:
           python-version: 3.8
-      - uses: actions/cache@v2
+      - uses: actions/cache@v3
         with:
           path: ~/.cache/pip
           key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
@@ -67,11 +67,11 @@ jobs:
     timeout-minutes: 60
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
         with:
           python-version: 3.8
-      - uses: actions/cache@v2
+      - uses: actions/cache@v3
         with:
           path: ~/.cache/pip
           key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
diff --git a/README.md b/README.md
index 92b8d1622..1e5f1a8ad 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Thus, most files in this repository are direct copies from the HuggingFace Trans
 
 ## Installation
 
-`adapter-transformers` currently supports **Python 3.7+** and **PyTorch 1.3.1+**.
+`adapter-transformers` currently supports **Python 3.8+** and **PyTorch 1.12.1+**.
 After [installing PyTorch](https://pytorch.org/get-started/locally/), you can install `adapter-transformers` from PyPI ...
 
 ```
diff --git a/adapter_docs/conf.py b/adapter_docs/conf.py
index c132ff41a..ba355cbc3 100644
--- a/adapter_docs/conf.py
+++ b/adapter_docs/conf.py
@@ -26,7 +26,7 @@
 docs_versions = [
     "adapters1.1.1",
     "adapters2.3.0",
-    "adapters3.2.0",
+    "adapters3.2.1",
 ]
 
 
diff --git a/adapter_docs/installation.md b/adapter_docs/installation.md
index ac6c72dfa..da6359e85 100644
--- a/adapter_docs/installation.md
+++ b/adapter_docs/installation.md
@@ -1,13 +1,13 @@
 # Installation
 
 Our *adapter-transformers* package is a drop-in replacement for Huggingface's *transformers* library.
-It currently supports Python 3.7+ and PyTorch 1.3.1+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first. 
+It currently supports Python 3.8+ and PyTorch 1.12.1+. You will have to [install PyTorch](https://pytorch.org/get-started/locally/) first. 
 
 ```{eval-rst}
 .. important::
     ``adapter-transformers`` is a direct fork of ``transformers``.
-    This means our package includes all the awesome features of HuggingFace's original package plus the adapter implementation.
-    As both packages share the same namespace, they ideally should not installed in the same environment.
+    This means our package includes all the awesome features of HuggingFace's original package, plus the adapter implementation.
+    As both packages share the same namespace, they ideally should not be installed in the same environment.
 ```
 
 ## Using pip
diff --git a/adapter_docs/method_combinations.md b/adapter_docs/method_combinations.md
index 33bf83edc..80ffe6f77 100644
--- a/adapter_docs/method_combinations.md
+++ b/adapter_docs/method_combinations.md
@@ -2,8 +2,8 @@
 
 _Configuration class_: [`ConfigUnion`](transformers.ConfigUnion)
 
-While different efficient fine-tuning methods and configurations have often been proposed as standalone, it might be beneficial to combine them for joint training.
-To make this process easier, adapter-transformers provides the possibility to group multiple configuration instances together using the `ConfigUnion` class.
+While different efficient fine-tuning methods and configurations have often been proposed as standalone, combining them for joint training might be beneficial. 
+To make this process easier, `adapter-transformers` provides the possibility to group multiple configuration instances using the [`ConfigUnion`](transformers.ConfigUnion) class.
 
 For example, this could be used to define different reduction factors for the adapter modules placed after the multi-head attention and the feed-forward blocks:
 
@@ -22,8 +22,8 @@ model.add_adapter("union_adapter", config=config)
 _Configuration class_: [`MAMConfig`](transformers.MAMConfig)
 
 [He et al. (2021)](https://arxiv.org/pdf/2110.04366.pdf) study various variants and combinations of efficient fine-tuning methods.
-Among others, they propose _Mix-and-Match Adapters_ as a combination of Prefix Tuning and parallel bottleneck adapters.
-This configuration is supported by adapter-transformers out-of-the-box:
+They propose _Mix-and-Match Adapters_ as a combination of Prefix Tuning and parallel bottleneck adapters.
+This configuration is supported by `adapter-transformers` out-of-the-box:
 
 ```python
 from transformers.adapters import MAMConfig
@@ -68,7 +68,7 @@ Concretely, for each adapted module $m$, UniPELT adds a trainable gating value $
 
 $$\mathcal{G}_m \leftarrow \sigma(W_{\mathcal{G}_m} \cdot x)$$
 
-These gating values are then used to scale the output activations of the injected adapter modules, e.g. for a LoRA layer:
+These gating values are then used to scale the output activations of the injected adapter modules, e.g., for a LoRA layer:
 
 $$
 h \leftarrow W_0 x + \mathcal{G}_{LoRA} B A x
@@ -77,7 +77,7 @@ $$
 In the configuration classes of `adapter-transformers`, these gating mechanisms can be activated via `use_gating=True`.
 The full UniPELT setup can be instantiated using `UniPELTConfig`[^unipelt]:
 
-[^unipelt]: Note that the implementation of UniPELT in `adapter-transformers` follows the implementation in the original code, which is slighlty different from the description in the paper. See [here](https://github.com/morningmoni/UniPELT/issues/1) for more.
+[^unipelt]: Note that the implementation of UniPELT in `adapter-transformers` follows the implementation in the original code, which is slightly different from the description in the paper. See [here](https://github.com/morningmoni/UniPELT/issues/1) for more.
 
 ```python
 from transformers.adapters import UniPELTConfig
diff --git a/adapter_docs/methods.md b/adapter_docs/methods.md
index 666e49886..b4ec41106 100644
--- a/adapter_docs/methods.md
+++ b/adapter_docs/methods.md
@@ -1,7 +1,7 @@
 # Adapter Methods
 
 On this page, we present all adapter methods currently integrated into the `adapter-transformers` library.
-A tabulary overview of adapter methods is provided [here](overview.html#table-of-adapter-methods)
+A tabular overview of adapter methods is provided [here](overview.html#table-of-adapter-methods). 
 Additionally, options to combine multiple adapter methods in a single setup are presented [on the next page](method_combinations.md).
 
 ## Bottleneck Adapters
@@ -15,7 +15,7 @@ $$
 h \leftarrow W_{up} \cdot f(W_{down} \cdot h) + r
 $$
 
-Depending on the concrete adapter configuration, these layers can be introduced at different locations within a Transformer block. Further, residual connections, layer norms, activation functions and bottleneck sizes etc. can be configured.
+Depending on the concrete adapter configuration, these layers can be introduced at different locations within a Transformer block. Further, residual connections, layer norms, activation functions and bottleneck sizes, etc., can be configured.
 
 The most important configuration hyperparameter to be highlighted here is the bottleneck dimension $d_{bottleneck}$.
 In adapter-transformers, this bottleneck dimension is specified indirectly via the `reduction_factor` attribute of a configuration.
@@ -25,7 +25,7 @@ $$
 \text{reduction_factor} = \frac{d_{hidden}}{d_{bottleneck}}
 $$
 
-A visualization of further configuration options related to the adapter structure is given in the figure below. For more details, refer to the documentation of [`AdapterConfig`](transformers.AdapterConfig).
+A visualization of further configuration options related to the adapter structure is given in the figure below. For more details, we refer to the documentation of [`AdapterConfig`](transformers.AdapterConfig).
 
 
 ```{eval-rst}
@@ -37,11 +37,11 @@ A visualization of further configuration options related to the adapter structur
     Visualization of possible adapter configurations with corresponding dictionary keys.
 ```
 
-adapter-transformers comes with pre-defined configurations for some bottleneck adapter architectures proposed in literature:
+`adapter-transformers` comes with pre-defined configurations for some bottleneck adapter architectures proposed in literature:
 
-- [`HoulsbyConfig`](transformers.HoulsbyConfig) as proposed by [Houlsby et al. (2019)](https://arxiv.org/pdf/1902.00751.pdf) places adapter layers after both the multi-head attention and feed-forward block in each Transformer layer.
-- [`PfeifferConfig`](transformers.PfeifferConfig) as proposed by [Pfeiffer et al. (2020)](https://arxiv.org/pdf/2005.00052.pdf) places an adapter layer only after the feed-forward block in each Transformer layer.
-- [`ParallelConfig`](transformers.ParallelConfig) as proposed by [He et al. (2021)](https://arxiv.org/pdf/2110.04366.pdf) places adapter layers in parallel to the original Transformer layers.
+- [`HoulsbyConfig`](transformers.HoulsbyConfig), as proposed by [Houlsby et al. (2019)](https://arxiv.org/pdf/1902.00751.pdf), places adapter layers after both the multi-head attention and feed-forward block in each Transformer layer.
+- [`PfeifferConfig`](transformers.PfeifferConfig), as proposed by [Pfeiffer et al. (2020)](https://arxiv.org/pdf/2005.00052.pdf), places an adapter layer only after the feed-forward block in each Transformer layer.
+- [`ParallelConfig`](transformers.ParallelConfig), as proposed by [He et al. (2021)](https://arxiv.org/pdf/2110.04366.pdf), places adapter layers in parallel to the original Transformer layers.
 
 _Example_:
 ```python
@@ -68,7 +68,7 @@ To perform zero-shot cross-lingual transfer, one language adapter can simply be
 
 In terms of architecture, language adapters are largely similar to regular bottleneck adapters, except for an additional _invertible adapter_ layer after the LM embedding layer.
 Embedding outputs are passed through this invertible adapter in the forward direction before entering the first Transformer layer and in the inverse direction after leaving the last Transformer layer.
-Invertible adapter architectures are further detailed in [Pfeiffer et al. (2020)](https://arxiv.org/pdf/2005.00052.pdf) and can be configured via the `inv_adapter` attribute of the `AdapterConfig` class.
+Invertible adapter architectures are further detailed in [Pfeiffer et al. (2020)](https://arxiv.org/pdf/2005.00052.pdf) and can be configured via the `inv_adapter` attribute of the [`AdapterConfig`](transformers.AdapterConfig) class.
 
 _Example_:
 ```python
@@ -101,13 +101,13 @@ _Configuration class_: [`PrefixTuningConfig`](transformers.PrefixTuningConfig)
 ```
 
 Prefix Tuning ([Li and Liang, 2021](https://aclanthology.org/2021.acl-long.353.pdf)) introduces new parameters in the multi-head attention blocks in each Transformer layer.
-More, specifically, it prepends trainable prefix vectors $P^K$ and $P^V$ to the keys and values of the attention head input, each of a configurable prefix length $l$ (`prefix_length` attribute):
+More specifically, it prepends trainable prefix vectors $P^K$ and $P^V$ to the keys and values of the attention head input, each of a configurable prefix length $l$ (`prefix_length` attribute):
 
 $$
 head_i = \text{Attention}(Q W_i^Q, [P_i^K, K W_i^K], [P_i^V, V W_i^V])
 $$
 
-Following the original authors, the prefix vectors in $P^K$ and $P^V$ are note optimized directly, but reparameterized via a bottleneck MLP.
+Following the original authors, the prefix vectors in $P^K$ and $P^V$ are not optimized directly but reparameterized via a bottleneck MLP.
 This behavior is controlled via the `flat` attribute of the configuration.
 Using `PrefixTuningConfig(flat=True)` will create prefix tuning vectors that are optimized without reparameterization.
 
@@ -119,7 +119,7 @@ config = PrefixTuningConfig(flat=False, prefix_length=30)
 model.add_adapter("prefix_tuning", config=config)
 ```
 
-As reparameterization using the bottleneck MLP is not necessary for performing inference on an already trained Prefix Tuning module, adapter-transformers includes a function to "eject" a reparameterized Prefix Tuning into a flat one:
+As reparameterization using the bottleneck MLP is not necessary for performing inference on an already trained Prefix Tuning module, `adapter-transformers` includes a function to "eject" a reparameterized Prefix Tuning into a flat one:
 ```python
 model.eject_prefix_tuning("prefix_tuning")
 ```
@@ -150,9 +150,9 @@ for a PHM layer by specifying `use_phm=True` in the config.
 The PHM layer has the following additional properties: `phm_dim`, `shared_phm_rule`, `factorized_phm_rule`, `learn_phm`, 
 `factorized_phm_W`, `shared_W_phm`, `phm_c_init`, `phm_init_range`, `hypercomplex_nonlinearity`
 
-For more information check out the [`AdapterConfig`](transformers.AdapterConfig) class.
+For more information, check out the [`AdapterConfig`](transformers.AdapterConfig) class.
 
-To add a Compacter to your model you can use the predefined configs:
+To add a Compacter to your model, you can use the predefined configs:
 ```python
 from transformers.adapters import CompacterConfig
 
@@ -177,7 +177,7 @@ _Configuration class_: [`LoRAConfig`](transformers.LoRAConfig)
 
 Low-Rank Adaptation (LoRA) is an efficient fine-tuning technique proposed by [Hu et al. (2021)](https://arxiv.org/pdf/2106.09685.pdf).
 LoRA injects trainable low-rank decomposition matrices into the layers of a pre-trained model.
-For any model layer expressed as a matrix multiplication of the form $h = W_0 x$, it therefore performs a reparameterization, such that:
+For any model layer expressed as a matrix multiplication of the form $h = W_0 x$, it performs a reparameterization such that:
 
 $$
 h = W_0 x + \frac{\alpha}{r} B A x
@@ -185,7 +185,7 @@ $$
 
 Here, $A \in \mathbb{R}^{r\times k}$ and $B \in \mathbb{R}^{d\times r}$ are the decomposition matrices and $r$, the low-dimensional rank of the decomposition, is the most important hyperparameter.
 
-While, in principle, this reparameterization can be applied to any weights matrix in a model, the original paper only adapts the attention weights of the Transformer self-attention sub-layer with LoRA.
+While, in principle, this reparameterization can be applied to any weight matrix in a model, the original paper only adapts the attention weights of the Transformer self-attention sub-layer with LoRA.
 `adapter-transformers` additionally allows injecting LoRA into the dense feed-forward layers in the intermediate and output components of a Transformer block.
 You can configure the locations where LoRA weights should be injected using the attributes in the [`LoRAConfig`](transformers.LoRAConfig) class.
 
@@ -200,14 +200,14 @@ model.add_adapter("lora_adapter", config=config)
 In the design of LoRA, Hu et al. (2021) also pay special attention to keeping the inference latency overhead compared to full fine-tuning at a minimum.
 To accomplish this, the LoRA reparameterization can be merged with the original pre-trained weights of a model for inference.
 Thus, the adapted weights are directly used in every forward pass without passing activations through an additional module.
-In `adapter-transformers`, this can be realized using the built-in `merge_adapter()` method:
+In `adapter-transformers`, this can be realized using the built-in [`merge_adapter()`](transformers.ModelAdaptersMixin.merge_adapter) method:
 ```python
 model.merge_adapter("lora_adapter")
 ```
 
 To continue training on this LoRA adapter or to deactivate it entirely, the merged weights first have to be reset again:
 ```python
-model.reset_adapter("lora_adapter")
+model.reset_adapter()
 ```
 
 _Papers:_
@@ -227,7 +227,7 @@ _Configuration class_: [`IA3Config`](transformers.IA3Config)
 ```
 
 _Infused Adapter by Inhibiting and Amplifying Inner Activations ((IA)^3)_ is an efficient fine-tuning method proposed within the _T-Few_ fine-tuning approach by [Liu et al. (2022)](https://arxiv.org/pdf/2205.05638.pdf).
-(IA)^3 introduces trainable vectors $l_W$ into different components of a Transformer model which perform element-wise rescaling of inner model activations.
+(IA)^3 introduces trainable vectors $l_W$ into different components of a Transformer model, which perform element-wise rescaling of inner model activations.
 For any model layer expressed as a matrix multiplication of the form $h = W x$, it therefore performs an element-wise multiplication with $l_W$, such that:
 
 $$
@@ -244,16 +244,16 @@ config = IA3Config()
 model.add_adapter("ia3_adapter", config=config)
 ```
 
-The implementation of (IA)^3, as well as the `IA3Config` class, are derived from the implementation of [LoRA](#lora), with a few main modifications.
-First, (IA)^3 uses multiplicative composition of weights instead of additive composition as in LoRA.
+The implementation of (IA)^3, as well as the [`IA3Config`](transformers.IA3Config) class, are derived from the implementation of [LoRA](#lora), with a few main modifications.
+First, (IA)^3 uses multiplicative composition of weights instead of additive composition, as in LoRA.
 Second, the added weights are not further decomposed into low-rank matrices.
-Both of these modifications are controlled via the `composition_mode` configuration attribute by setting `composition_mode="scale"`.
+These modifications are controlled via the `composition_mode` configuration attribute by setting `composition_mode="scale"`.
 Additionally, as the added weights are already of rank 1, `r=1` is set.
 
-Beyond that, both methods share the same configuration attributes that allow you to specify in which Transformer components rescaling vectors will be injected.
-Following the original implementation, `IA3Config` adds rescaling vectors to the self-attention weights (`selfattn_lora=True`) and the final feed-forward layer (`output_lora=True`).
+Beyond that, both methods share the same configuration attributes that allow you to specify into which Transformer components rescaling vectors will be injected.
+Following the original implementation, [`IA3Config`](transformers.IA3Config) adds rescaling vectors to the self-attention weights (`selfattn_lora=True`) and the intermediate feed-forward layer (`intermediate_lora=True`).
 Further, you can modify which matrices of the attention mechanism to rescale by leveraging the `attn_matrices` attribute.
-By default, (IA)^3 injects weights into the key ('k') and value ('v') matrices, but not in the query ('q') matrix.
+By default, (IA)^3 injects weights into the key ('k') and value ('v') matrices but not in the query ('q') matrix.
 
 Finally, similar to LoRA, (IA)^3 also allows merging the injected parameters with the original weight matrices of the Transformer model.
 E.g.:
@@ -262,7 +262,7 @@ E.g.:
 model.merge_adapter("ia3_adapter")
 
 # Reset merged weights
-model.reset_adapter("ia3_adapter")
+model.reset_adapter()
 ```
 
 _Papers:_
diff --git a/adapter_docs/overview.md b/adapter_docs/overview.md
index 3cd35da6e..4c1e9c87d 100644
--- a/adapter_docs/overview.md
+++ b/adapter_docs/overview.md
@@ -1,17 +1,17 @@
 # Overview and Configuration
 
 Large pre-trained Transformer-based language models (LMs) have become the foundation of NLP in recent years.
-While the most prevalent method of using these LMs for transfer learning involves costly *full fine-tuning* of all model parameters, a series of *efficient* and *lightweight* alternatives have been established in recent time.
-Instead of updating all parameters of the pre-trained LM towards a downstream target task, these methods commonly introduce a small amount of new parameters and only update these while keeping the pre-trained model weights fixed.
+While the most prevalent method of using these LMs for transfer learning involves costly *full fine-tuning* of all model parameters, a series of *efficient* and *lightweight* alternatives have recently been established.
+Instead of updating all parameters of the pre-trained LM towards a downstream target task, these methods commonly introduce a small number of new parameters and only update these while keeping the pre-trained model weights fixed.
 
 ```{admonition} Why use Efficient Fine-Tuning?
-Efficient fine-tuning methods offer multiple benefits over full fine-tuning of LMs:
+Efficient fine-tuning methods offer multiple benefits over the full fine-tuning of LMs:
 
-- They are **parameter-efficient**, i.e. they only update a very small subset (often under 1%) of a model's parameters.
-- They often are **modular**, i.e. the updated parameters can be extracted and shared independently of the base model parameters.
-- They are easy to share and easy to deploy due to their **small file sizes**, e.g. having only ~3MB per task instead of ~440MB for sharing a full model.
-- They **speed up training**, i.e. efficient fine-tuning often needs less time for training compared fully fine-tuning LMs.
-- They are **composable**, e.g. multiple adapters trained on different tasks can be stacked, fused or mixed to leverage their combined knowledge.
+- They are **parameter-efficient**, i.e., they only update a tiny subset (often under 1%) of a model's parameters.
+- They often are **modular**, i.e., the updated parameters can be extracted and shared independently of the base model parameters.
+- They are easy to share and deploy due to their **small file sizes**, e.g., having only ~3MB per task instead of ~440MB for sharing a full model.
+- They **speed up training**, i.e., efficient fine-tuning often requires less training time than fully fine-tuning LMs.
+- They are **composable**, e.g., multiple adapters trained on different tasks can be stacked, fused, or mixed to leverage their combined knowledge.
 - They often provide **on-par performance** with full fine-tuning.
 ```
 
@@ -30,12 +30,12 @@ While these adapters have laid the foundation of the adapter-transformers librar
 .. important::
     In literature, different terms are used to refer to efficient fine-tuning methods.
     The term "adapter" is usually only applied to bottleneck adapter modules.
-    However, most efficient fine-tuning methods follow the same general idea of inserting a small set of new parameters and by this "adapting" the pre-trained LM to a new task.
+    However, most efficient fine-tuning methods follow the same general idea of inserting a small set of new parameters and, by this, "adapting" the pre-trained LM to a new task.
     In adapter-transformers, the term "adapter" thus may refer to any efficient fine-tuning method if not specified otherwise.
 ```
 
 In the remaining sections, we will present how adapter methods can be configured in `adapter-transformers`.
-The next two pages will then present the methodological details of all currently supported adapter methods.
+The following two pages will offer the methodological details of all currently supported adapter methods.
 
 ## Table of Adapter Methods
 
@@ -48,14 +48,14 @@ Identifiers and configuration classes are explained in more detail in the [next
 | `houlsby` | `HoulsbyConfig()` | [Bottleneck Adapters](methods.html#bottleneck-adapters) |
 | `parallel` | `ParallelConfig()` | [Bottleneck Adapters](methods.html#bottleneck-adapters) |
 | `scaled_parallel` | `ParallelConfig(scaling="learned")` | [Bottleneck Adapters](methods.html#bottleneck-adapters) |
-| `pfeiffer+inv` | `PfeifferInvConfig()` | [Invertible Adapters](methods.html#language-adapters---invertible-adapters) |
-| `houlsby+inv` | `HoulsbyInvConfig()` | [Invertible Adapters](methods.html#language-adapters---invertible-adapters) |
+| `pfeiffer+inv` | `PfeifferInvConfig()` | [Invertible Adapters](methods.html#language-adapters-invertible-adapters) |
+| `houlsby+inv` | `HoulsbyInvConfig()` | [Invertible Adapters](methods.html#language-adapters-invertible-adapters) |
 | `compacter` | `CompacterConfig()` | [Compacter](methods.html#compacter) |
 | `compacter++` | `CompacterPlusPlusConfig()` | [Compacter](methods.html#compacter) |
 | `prefix_tuning` | `PrefixTuningConfig()` | [Prefix Tuning](methods.html#prefix-tuning) |
 | `prefix_tuning_flat` | `PrefixTuningConfig(flat=True)` | [Prefix Tuning](methods.html#prefix-tuning) |
 | `lora` | `LoRAConfig()` | [LoRA](methods.html#lora) |
-| `ia3` | `IA3Config()` | [IA³](methods.html#ia3) |
+| `ia3` | `IA3Config()` | [IA³](methods.html#ia-3) |
 | `mam` | `MAMConfig()` | [Mix-and-Match Adapters](method_combinations.html#mix-and-match-adapters) |
 | `unipelt` | `UniPELTConfig()` | [UniPELT](method_combinations.html#unipelt) |
 
@@ -83,11 +83,15 @@ Here, `<identifier>` refers to one of the identifiers listed in [the table above
 In square brackets after the identifier, you can set specific configuration attributes from the respective configuration class, e.g. `parallel[reduction_factor=2]`.
 If all attributes remain at their default values, this can be omitted.
 
-Finally, it is also possible to specify a [method combination](method_combinations.md) as a configuration string by joining multiple configuration strings with `|`.
-E.g., `prefix_tuning[bottleneck_size=800]|parallel` is identical to the following configuration class instance:
+Finally, it is also possible to specify a [method combination](method_combinations.md) as a configuration string by joining multiple configuration strings with `|`, e.g.:
+```python
+config = "prefix_tuning[bottleneck_size=800]|parallel"
+```
+
+is identical to the following `ConfigUnion`:
 
 ```python
-ConfigUnion(
+config = ConfigUnion(
     PrefixTuningConfig(bottleneck_size=800),
     ParallelConfig(),
 )
diff --git a/adapter_docs/quickstart.md b/adapter_docs/quickstart.md
index fb4010dd8..17f7f0b5c 100644
--- a/adapter_docs/quickstart.md
+++ b/adapter_docs/quickstart.md
@@ -10,7 +10,7 @@ storing (`save_adapter()`) and deletion (`delete_adapter()`) are added to the mo
 .. note::
     This document focuses on the adapter-related functionalities added by *adapter-transformers*.
     For a more general overview of the *transformers* library, visit
-    `the 'Usage' section in Huggingface's documentation <https://huggingface.co/transformers/usage.html>`_.
+    `the 'Usage' section in HuggingFace's documentation <https://huggingface.co/transformers/usage.html>`_.
 ```
 
 ## Quick Tour: Using a pre-trained adapter for inference
@@ -24,41 +24,43 @@ We use BERT in this example, so we first load a pre-trained `BertTokenizer` to e
 `bert-base-uncased` checkpoint from HuggingFace's Model Hub using the [`BertAdapterModel`](transformers.adapters.BertAdapterModel) class:
 
 ```python
+import os
+
 import torch
 from transformers import BertTokenizer
-from transformers.adapters import BertAdapterModel
+from transformers.adapters import BertAdapterModel, AutoAdapterModel
 
-# Load pre-trained BERT tokenizer from Huggingface.
+# Load pre-trained BERT tokenizer from HuggingFace
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
-# An input sentence.
+# An input sentence
 sentence = "It's also, clearly, great fun."
 
-# Tokenize the input sentence and create a PyTorch input tensor.
+# Tokenize the input sentence and create a PyTorch input tensor
 input_data = tokenizer(sentence, return_tensors="pt")
 
-# Load pre-trained BERT model from HuggingFace Hub.
-# The `BertAdapterModel` class is specifically designed for working with adapters.
-# It can be used with different prediction heads.
+# Load pre-trained BERT model from HuggingFace Hub
+# The `BertAdapterModel` class is specifically designed for working with adapters
+# It can be used with different prediction heads
 model = BertAdapterModel.from_pretrained('bert-base-uncased')
 ```
 
 Having loaded the model, we now add a pre-trained task adapter that is useful to our task from AdapterHub.
-As we're doing sentiment classification, we use [an adapter trained on the SST-2 dataset](https://adapterhub.ml/adapters/ukp/bert-base-uncased_sentiment_sst-2_pfeiffer/) in this case.
+In this case, for sentiment classification, we use [an adapter trained on the SST-2 dataset](https://adapterhub.ml/adapters/ukp/bert-base-uncased_sentiment_sst-2_pfeiffer/).
 The task prediction head loaded together with the adapter gives us a class label for our sentence:
 
 ```python
-# load pre-trained task adapter from Adapter Hub
-# this method call will also load a pre-trained classification head for the adapter task
-adapter_name = model.load_adapter('sst-2@ukp', config='pfeiffer')
+# Load pre-trained task adapter from Adapter Hub
+# This method call will also load a pre-trained classification head for the adapter task
+adapter_name = model.load_adapter("sentiment/sst-2@ukp", config='pfeiffer')
 
-# activate the adapter we just loaded, so that it is used in every forward pass
+# Activate the adapter we just loaded, so that it is used in every forward pass
 model.set_active_adapters(adapter_name)
 
-# predict output tensor
+# Predict output tensor
 outputs = model(**input_data)
 
-# retrieve the predicted class label
+# Retrieve the predicted class label
 predicted = torch.argmax(outputs[0]).item()
 assert predicted == 1
 ```
@@ -66,25 +68,29 @@ assert predicted == 1
 To save our pre-trained model and adapters, we can easily store and reload them as follows:
 
 ```python
-# save model
-model.save_pretrained('./path/to/model/directory/')
-# save adapter
-model.save_adapter('./path/to/adapter/directory/', 'sst-2')
-
-# load model
-model = AutoAdapterModel.from_pretrained('./path/to/model/directory/')
-model.load_adapter('./path/to/adapter/directory/')
+# For the sake of this demonstration an example path for loading and storing is given below
+example_path = os.path.join(os.getcwd(), "adapter-quickstart")
+
+# Save model
+model.save_pretrained(example_path)
+# Save adapter
+model.save_adapter(example_path, adapter_name)
+
+# Load model. Similar to HuggingFace's AutoModel class,
+# you can also use AutoAdapterModel instead of BertAdapterModel
+model = AutoAdapterModel.from_pretrained(example_path)
+model.load_adapter(example_path)
 ```
 
 Similar to how the weights of the full model are saved, the `save_adapter()` will create a file for saving the adapter weights and a file for saving the adapter configuration in the specified directory.
 
-Finally, if we have finished working with adapters, we can restore the base Transformer in its original form by deactivating and deleting the adapter:
+Finally, if we have finished working with adapters, we can restore the base Transformer to its original form by deactivating and deleting the adapter:
 
 ```python
-# deactivate all adapters
+# Deactivate all adapters
 model.set_active_adapters(None)
-# delete the added adapter
-model.delete_adapter('sst-2')
+# Delete the added adapter
+model.delete_adapter(adapter_name)
 ```
 
 ## Quick Tour: Adapter training
diff --git a/adapter_docs/training.md b/adapter_docs/training.md
index fe75e7697..ce4c9aaa7 100644
--- a/adapter_docs/training.md
+++ b/adapter_docs/training.md
@@ -1,7 +1,7 @@
 # Adapter Training
 
 This section describes some examples of training adapter methods for different scenarios. We focus on integrating adapter methods into existing training scripts for Transformer models.
-All presented scripts are only slightly modified from the original [examples from HuggingFace Transformers](https://huggingface.co/transformers/examples.html).
+All presented scripts are only slightly modified from the original [examples from HuggingFace Transformers](https://github.com/huggingface/transformers/tree/main/examples/pytorch#examples).
 To run the scripts, make sure you have the latest version of the repository and have installed some additional requirements:
 
 ```
@@ -13,14 +13,14 @@ pip install -r ./examples/pytorch/<your_examples_folder>/requirements.txt
 
 ## Train a Task Adapter
 
-Training a task adapter module on a dataset only requires minor modifications from training the full model.
+Training a task adapter module on a dataset only requires minor modifications compared to training the entire model.
 Suppose we have an existing script for training a Transformer model.
 In the following, we will use HuggingFace's [run_glue.py](https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/pytorch/text-classification/run_glue.py) example script for training on the GLUE benchmark.
 We go through all required changes step by step:
 
 ### Step A - Parse `AdapterArguments`
 
-The [`AdapterArguments`](transformers.adapters.training.AdapterArguments) class integrated into adapter-transformers provides a set of command-line options useful for training adapters.
+The [`AdapterArguments`](transformers.adapters.training.AdapterArguments) class integrated into `adapter-transformers` provides a set of command-line options useful for training adapters.
 These include options such as `--train_adapter` for activating adapter training and `--load_adapter` for loading adapters from checkpoints.
 Thus, the first step of integrating adapters is to add these arguments to the line where `HfArgumentParser` is instantiated:
 
@@ -43,17 +43,17 @@ model = AutoAdapterModel.from_pretrained(
 model.add_classification_head(data_args.task_name, num_labels=num_labels)
 ```
 
-Note that this change is entirely optional and training will also work with the original model class.
-Learn more about the benefits of AdapterModel classes [here](prediction_heads.md)
+Note that this change is optional and training will also work with the original model class.
+Learn more about the benefits of AdapterModel classes [here](prediction_heads.md).
 
 ### Step C - Setup adapter methods
 
 ```{eval-rst}
 .. tip::
-    In the following, we show how to setup adapters manually. In most cases, you can use the built-in ``setup_adapter_training()`` method to perform this job automatically. Just add a statement similar to this anywhere between model instantiation and training start in your script: ``setup_adapter_training(model, adapter_args, task_name)``
+    In the following, we show how to set up adapters manually. In most cases, you can use the built-in ``setup_adapter_training()`` method to perform this job automatically. Just add a statement similar to this anywhere between model instantiation and training start in your script: ``setup_adapter_training(model, adapter_args, task_name)``
 ```
 
-Compared to fine-tuning the full model, there is only this one significant adaptation we have to make: adding an adapter setup and activating it.
+Compared to fine-tuning the entire model, we have to make only one significant adaptation: adding an adapter setup and activating it.
 
 ```python
 # task adapter - only add if not existing
@@ -69,14 +69,14 @@ model.train_adapter(task_name)
 ```{eval-rst}
 .. important::
     The most crucial step when training an adapter module is to freeze all weights in the model except for those of the
-    adapter. In the previous snippet, this is achieved by calling the ``train_adapter()`` method which disables training
+    adapter. In the previous snippet, this is achieved by calling the ``train_adapter()`` method, which disables training
     of all weights outside the task adapter. In case you want to unfreeze all model weights later on, you can use
     ``freeze_model(False)``.
 ```
 
 Besides this, we only have to make sure that the task adapter and prediction head are activated so that they are used in every forward pass. To specify the adapter modules to use, we can use the `model.set_active_adapters()` 
 method and pass the adapter setup. If you only use a single adapter, you can simply pass the name of the adapter. For more information
-on complex setups checkout the [Composition Blocks](https://docs.adapterhub.ml/adapter_composition.html).
+on complex setups, check out the [Composition Blocks](https://docs.adapterhub.ml/adapter_composition.html).
 
 ```python
 model.set_active_adapters(task_name)
@@ -84,18 +84,18 @@ model.set_active_adapters(task_name)
 
 ### Step D - Switch to `AdapterTrainer` class
 
-Finally, we switch the `Trainer` class built into Transformers for adapter-transformers' [`AdapterTrainer`](transformers.adapters.AdapterTrainer) class that is optimized for training adapter methods.
+Finally, we exchange the `Trainer` class built into Transformers for adapter-transformers' [`AdapterTrainer`](transformers.adapters.AdapterTrainer) class that is optimized for training adapter methods.
 See [below for more information](#adaptertrainer).
 
 Technically, this change is not required as no changes to the training loop are required for training adapters.
-However, `AdapterTrainer` e.g. provides better support for checkpointing and reloading adapter weights.
+However, `AdapterTrainer` provides, e.g., better support for checkpointing and reloading adapter weights.
 
 ### Step E - Start training
 
 The rest of the training procedure does not require any further changes in code.
 
 You can find the full version of the modified training script for GLUE at [run_glue.py](https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/pytorch/text-classification/run_glue.py) in the `examples` folder of our repository.
-We also adapted [various other example scripts](https://github.com/Adapter-Hub/adapter-transformers/tree/master/examples/pytorch) (e.g. `run_glue.py`, `run_multiple_choice.py`, `run_squad.py`, ...) to support adapter training.
+We also adapted [various other example scripts](https://github.com/Adapter-Hub/adapter-transformers/tree/master/examples/pytorch) (e.g., `run_glue.py`, `run_multiple_choice.py`, `run_squad.py`, ...) to support adapter training.
 
 To start adapter training on a GLUE task, you can run something similar to:
 
@@ -117,16 +117,16 @@ python run_glue.py \
   --adapter_config pfeiffer
 ```
 
-The important flag here is `--train_adapter` which switches from fine-tuning the full model to training an adapter module for the given GLUE task.
+The important flag here is `--train_adapter`, which switches from fine-tuning the entire model to training an adapter module for the given GLUE task.
 
 ```{eval-rst}
 .. tip::
-    Adapter weights are usually initialized randomly. That is why we require a higher learning rate. We have found that a default adapter learning rate of ``1e-4`` works well for most settings.
+    Adapter weights are usually initialized randomly, which is why we require a higher learning rate. We have found that a default adapter learning rate of ``1e-4`` works well for most settings.
 ```
 
 ```{eval-rst}
 .. tip::
-    Depending on your data set size you might also need to train longer than usual. To avoid overfitting you can evaluating the adapters after each epoch on the development set and only save the best model.
+    Depending on your data set size, you might also need to train longer than usual. To avoid overfitting, you can evaluate the adapters after each epoch on the development set and only save the best model.
 ```
 
 ## Train a Language Adapter
@@ -160,12 +160,12 @@ You can adapt this script to train AdapterFusion with different pre-trained adap
 
 ```{eval-rst}
 .. important::
-    AdapterFusion on a target task is trained in a second training stage, after independently training adapters on individual tasks.
+    AdapterFusion on a target task is trained in a second training stage after independently training adapters on individual tasks.
     When setting up a fusion architecture on your model, make sure to load the pre-trained adapter modules to be fused using ``model.load_adapter()`` before adding a fusion layer.
     For more on AdapterFusion, also refer to `Pfeiffer et al., 2020 <https://arxiv.org/pdf/2005.00247>`_.
 ```
 
-To start fusion training on SST-2 as target task, you can run something like the following:
+To start fusion training on SST-2 as the target task, you can run something like the following:
 
 ```
 export GLUE_DIR=/path/to/glue
@@ -188,9 +188,9 @@ python run_fusion_glue.py \
 
 ## AdapterTrainer
 
-Similar to the `Trainer` class provided by HuggingFace, adapter-transformers provides an `AdapterTrainer` class. This class is only
+Similar to the `Trainer` class provided by HuggingFace, `adapter-transformers` provides an `AdapterTrainer` class. This class is only
 intended for training adapters. The `Trainer` class should still be used to fully fine-tune models. To train adapters with the `AdapterTrainer`
-class, simply initialize it the same way you would initialize the `Trainer` class e.g.: 
+class, simply initialize it the same way you would initialize the `Trainer` class, e.g.: 
 
 ```python
 model.add_adapter(task_name)
diff --git a/notebooks/01_Adapter_Training.ipynb b/notebooks/01_Adapter_Training.ipynb
index 68063753a..23a93b8a7 100644
--- a/notebooks/01_Adapter_Training.ipynb
+++ b/notebooks/01_Adapter_Training.ipynb
@@ -433,7 +433,7 @@
         "# Encode the input data\n",
         "dataset = dataset.map(encode_batch, batched=True)\n",
         "# The transformers model expects the target class column to be named \"labels\"\n",
-        "dataset.rename_column_(\"label\", \"labels\")\n",
+        "dataset = dataset.rename_column(\"label\", \"labels\")\n",
         "# Transform to pytorch tensors and only output the required columns\n",
         "dataset.set_format(type=\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])"
       ]
@@ -4209,4 +4209,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/notebooks/03_Adapter_Fusion.ipynb b/notebooks/03_Adapter_Fusion.ipynb
index 1a047f838..533bd501b 100644
--- a/notebooks/03_Adapter_Fusion.ipynb
+++ b/notebooks/03_Adapter_Fusion.ipynb
@@ -266,7 +266,7 @@
     "# Encode the input data\n",
     "dataset = dataset.map(encode_batch, batched=True)\n",
     "# The transformers model expects the target class column to be named \"labels\"\n",
-    "dataset.rename_column_(\"label\", \"labels\")\n",
+    "dataset = dataset.rename_column(\"label\", \"labels\")\n",
     "# Transform to pytorch tensors and only output the required columns\n",
     "dataset.set_format(type=\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])"
    ]
@@ -728,4 +728,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
\ No newline at end of file
+}
diff --git a/notebooks/04_Cross_Lingual_Transfer.ipynb b/notebooks/04_Cross_Lingual_Transfer.ipynb
index f2c6170d9..dfdbb70eb 100644
--- a/notebooks/04_Cross_Lingual_Transfer.ipynb
+++ b/notebooks/04_Cross_Lingual_Transfer.ipynb
@@ -356,7 +356,7 @@
     "  # Encode the input data\n",
     "  dataset = dataset.map(encode_batch, batched=True)\n",
     "  # The transformers model expects the target class column to be named \"labels\"\n",
-    "  dataset.rename_column_(\"label\", \"labels\")\n",
+    "  dataset = dataset.rename_column(\"label\", \"labels\")\n",
     "  # Transform to pytorch tensors and only output the required columns\n",
     "  dataset.set_format(columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n",
     "  return dataset\n",
diff --git a/notebooks/05_Adapter_Drop_Training.ipynb b/notebooks/05_Adapter_Drop_Training.ipynb
index cc3ac657e..288372aa2 100644
--- a/notebooks/05_Adapter_Drop_Training.ipynb
+++ b/notebooks/05_Adapter_Drop_Training.ipynb
@@ -169,7 +169,7 @@
     "# Encode the input data\n",
     "dataset = dataset.map(encode_batch, batched=True)\n",
     "# The transformers model expects the target class column to be named \"labels\"\n",
-    "dataset.rename_column_(\"label\", \"labels\")\n",
+    "dataset = dataset.rename_column(\"label\", \"labels\")\n",
     "# Transform to pytorch tensors and only output the required columns\n",
     "dataset.set_format(type=\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])"
    ]
@@ -475,4 +475,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
\ No newline at end of file
+}
diff --git a/setup.py b/setup.py
index 214ec4c0d..7a6c0381e 100644
--- a/setup.py
+++ b/setup.py
@@ -146,7 +146,7 @@
     "pytest-subtests",
     "pytest-timeout",
     "pytest-xdist",
-    "python>=3.7.0",
+    "python>=3.8.0",
     "ray[tune]",
     "myst-parser",
     "regex!=2019.12.17",
@@ -176,7 +176,7 @@
     "timeout-decorator",
     "timm",
     "tokenizers>=0.11.1,!=0.11.3,<0.14",
-    "torch>=1.7,!=1.12.0",
+    "torch>=1.12.1",
     "torchaudio",
     "pyctcdecode>=0.4.0",
     "tqdm>=4.27",
@@ -253,6 +253,7 @@ def run(self):
         with open(target, "w", encoding="utf-8", newline="\n") as f:
             f.write("\n".join(content))
 
+
 extras = {}
 
 extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "rhoknp")
@@ -431,7 +432,7 @@ def run(self):
 
 setup(
     name="adapter-transformers",
-    version="3.2.0",
+    version="3.2.1",
     author="Jonas Pfeiffer, Andreas Rücklé, Clifton Poth, Hannah Sterz, Leon Engländer, based on work by the HuggingFace team and community",
     author_email="pfeiffer@ukp.tu-darmstadt.de",
     description="A friendly fork of HuggingFace's Transformers, adding Adapters to PyTorch language models",
@@ -447,7 +448,7 @@ def run(self):
     zip_safe=False,
     extras_require=extras,
     entry_points={"console_scripts": ["transformers-cli=transformers.commands.transformers_cli:main"]},
-    python_requires=">=3.7.0",
+    python_requires=">=3.8.0",
     install_requires=install_requires,
     classifiers=[
         "Development Status :: 5 - Production/Stable",
@@ -457,9 +458,9 @@ def run(self):
         "License :: OSI Approved :: Apache Software License",
         "Operating System :: OS Independent",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
     cmdclass={"deps_table_update": DepsTableUpdateCommand},
diff --git a/src/transformers/adapters/__init__.py b/src/transformers/adapters/__init__.py
index a07b8e846..fea1f6396 100644
--- a/src/transformers/adapters/__init__.py
+++ b/src/transformers/adapters/__init__.py
@@ -16,7 +16,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "3.2.0"
+__version__ = "3.2.1"
 
 from typing import TYPE_CHECKING
 
diff --git a/src/transformers/adapters/lora.py b/src/transformers/adapters/lora.py
index 657cde095..5333ee4ef 100644
--- a/src/transformers/adapters/lora.py
+++ b/src/transformers/adapters/lora.py
@@ -154,7 +154,16 @@ def get_adapter(self, adapter_name: str) -> nn.Module:
 
 
 class Linear(LoRALayer, nn.Linear):
-    # LoRA implemented in a dense layer
+    """
+    LoRA implementation for Linear layer.
+
+    Args:
+        fan_in_fan_out (bool, optional):
+            Set this to True if the layer to replace stores weight like (fan_in, fan_out). Defaults to False.
+        no_init_bias (bool, optional): Use this to add a bias that is not initialized by PyTorch. Defaults to False.
+
+    """
+
     def __init__(
         self,
         in_features: int,
@@ -162,15 +171,20 @@ def __init__(
         location_key: str,
         config: PretrainedConfig,
         attn_key: str = None,
-        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
+        fan_in_fan_out: bool = False,
+        no_init_bias: bool = False,
         **kwargs
     ):
+        if no_init_bias and "bias" not in kwargs:
+            kwargs["bias"] = False
         LoRALayer.__init__(self, location_key, config, in_features, out_features, **kwargs)
 
         self.attn_key = attn_key
         self.fan_in_fan_out = fan_in_fan_out
         if fan_in_fan_out:
             self.weight.data = torch.t(self.weight.data)
+        if no_init_bias:
+            self.bias = nn.Parameter(torch.empty(out_features))
 
     def _check_lora_location(self, config: LoRAConfig):
         return self.attn_key is None or self.attn_key in config.attn_matrices
@@ -252,7 +266,16 @@ def T(w):
 
 
 class MergedLinear(LoRALayer, nn.Linear):
-    # LoRA implemented in a dense layer
+    """
+    LoRA implementation for merged attention layer.
+
+    Args:
+        fan_in_fan_out (bool, optional):
+            Set this to True if the layer to replace stores weight like (fan_in, fan_out). Defaults to False.
+        no_init_bias (bool, optional): Use this to add a bias that is not initialized by PyTorch. Defaults to False.
+
+    """
+
     def __init__(
         self,
         in_features: int,
@@ -260,13 +283,18 @@ def __init__(
         location_key: str,
         config: PretrainedConfig,
         fan_in_fan_out: bool = False,
+        no_init_bias: bool = False,
         **kwargs
     ):
+        if no_init_bias and "bias" not in kwargs:
+            kwargs["bias"] = False
         LoRALayer.__init__(self, location_key, config, in_features, out_features, **kwargs)
 
         self.fan_in_fan_out = fan_in_fan_out
         if fan_in_fan_out:
             self.weight.data = self.weight.data.T
+        if no_init_bias:
+            self.bias = nn.Parameter(torch.empty(out_features))
 
     def get_n_heads(self, lora: Union[LoRA, LoRAConfig]):
         return len(set(lora.attn_matrices))
diff --git a/src/transformers/adapters/model_mixin.py b/src/transformers/adapters/model_mixin.py
index 54bae8d65..4a1df88b4 100644
--- a/src/transformers/adapters/model_mixin.py
+++ b/src/transformers/adapters/model_mixin.py
@@ -22,10 +22,10 @@
 from .layer import AdapterLayer, AdapterLayerBase
 from .loading import AdapterFusionLoader, AdapterLoader, PredictionHeadLoader, WeightsLoader
 from .lora import LoRALayer
-from .modeling import Adapter, GLOWCouplingBlock, NICECouplingBlock
+from .modeling import Adapter, GLOWCouplingBlock, NICECouplingBlock, init_shared_parameters
 from .prefix_tuning import PrefixTuningPool, PrefixTuningShim
 from .utils import EMBEDDING_FILE, TOKENIZER_PATH, inherit_doc
-from .wrappers.configuration import wrap_config
+from .wrappers.configuration import SUBMODEL_NAMES, wrap_config
 
 
 logger = logging.getLogger(__name__)
@@ -495,12 +495,30 @@ def _add_adapter_weights(self, adapter_name: str):
         """Helper method that performs the actual parameter additions when adding a new adapter."""
         self.apply_to_adapter_layers(lambda i, layer: layer.add_adapter(adapter_name, i))
         # PHM Layer
-        if self.config.adapters.match(adapter_name, AdapterConfig, location_key="phm_layer"):
+        adapter_config = self.config.adapters.match(adapter_name, AdapterConfig, location_key="phm_layer")
+        if adapter_config:
             adapter_module = list(self.get_adapter(adapter_name)[0].values())[0]
             # if multiple adapters with same location key exist they are returned as a modulelist
             if isinstance(adapter_module, nn.ModuleList):
                 adapter_module = adapter_module[0]
-            self.base_model.shared_parameters[adapter_name] = adapter_module.adapter_down[0].init_shared_parameters()
+            if adapter_config["shared_phm_rule"] or adapter_config["shared_W_phm"]:
+                if self.config.model_type in SUBMODEL_NAMES:
+                    hidden_sizes = [
+                        getattr(self.config, key).hidden_size for key in SUBMODEL_NAMES[self.config.model_type]
+                    ]
+                    if all(hidden_sizes[0] == h for h in hidden_sizes):
+                        self.base_model.shared_parameters[adapter_name] = init_shared_parameters(
+                            adapter_config, hidden_sizes[0], self.device
+                        )
+                    else:
+                        raise ValueError(
+                            "The model has different hidden sizes {}. Sharing compacter weights is only possible if"
+                            " the hidden_sizes match.".format(hidden_sizes)
+                        )
+                else:
+                    self.base_model.shared_parameters[adapter_name] = init_shared_parameters(
+                        adapter_config, self.config.hidden_size, self.device
+                    )
         # Prefix Tuning
         for module in self.modules():
             if isinstance(module, PrefixTuningPool):
diff --git a/src/transformers/adapters/modeling.py b/src/transformers/adapters/modeling.py
index 2d6ac3b38..20538bc90 100644
--- a/src/transformers/adapters/modeling.py
+++ b/src/transformers/adapters/modeling.py
@@ -568,6 +568,7 @@ def __init__(
         assert (
             out_features % config["phm_dim"] == 0
         ), f"Argument `out_features`={out_features} is not divisble be `phm_dim`{config['phm_dim']}"
+        self.config = config
         self.name = adapter_name
         self.in_features = in_features
         self.out_features = out_features
@@ -616,42 +617,18 @@ def __init__(
             self.register_parameter("b", None)
         self.reset_parameters()
 
-    def init_W(self, W_left=None, W_right=None, W=None):
+    def _init_W(self, W_left=None, W_right=None, W=None):
         if self.factorized_phm_W:
             W_left = W_left if W_left is not None else self.W_left
             W_right = W_right if W_right is not None else self.W_right
+            return init_W(self.config, W_left, W_right, W)
         else:
             W = W if W is not None else self.W
-        if self.w_init == "glorot-normal":
-            if self.factorized_phm_W:
-                for i in range(self.phm_dim):
-                    W_left.data[i] = nn.init.xavier_normal_(W_left.data[i])
-                    W_right.data[i] = nn.init.xavier_normal_(W_right.data[i])
-            else:
-                for i in range(self.phm_dim):
-                    W.data[i] = nn.init.xavier_normal_(W.data[i])
-        elif self.w_init == "glorot-uniform":
-            if self.factorized_phm_W:
-                for i in range(self.phm_dim):
-                    W_left.data[i] = nn.init.xavier_uniform_(W_left.data[i])
-                    W_right.data[i] = nn.init.xavier_uniform_(W_right.data[i])
-            else:
-                for i in range(self.phm_dim):
-                    W.data[i] = nn.init.xavier_uniform_(W.data[i])
-        elif self.w_init == "normal":
-            if self.factorized_phm_W:
-                for i in range(self.phm_dim):
-                    W_left.data[i].normal_(mean=0, std=self.phm_init_range)
-                    W_right.data[i].normal_(mean=0, std=self.phm_init_range)
-            else:
-                for i in range(self.phm_dim):
-                    W.data[i].normal_(mean=0, std=self.phm_init_range)
-        else:
-            raise ValueError
+            return init_W(self.config, W_left, W_right, W)
 
     def reset_parameters(self):
         if not self.shared_W_phm:
-            self.init_W()
+            self._init_W()
 
         if self.bias_flag:
             self.b.data = torch.zeros_like(self.b.data)
@@ -723,54 +700,101 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             y += self.b
         return y
 
-    def init_shared_parameters(self):
-        parameters = nn.ParameterDict()
-        if self.shared_W_phm:
-            if self.factorized_phm_W:
-                W_down_left = torch.Tensor(size=(self.phm_dim, self._in_feats_per_axis, self.phm_rank))
-                W_down_right = torch.Tensor(size=(self.phm_dim, self.phm_rank, self._out_feats_per_axis))
-                W_up_left = torch.Tensor(size=(self.phm_dim, self._out_feats_per_axis, self.phm_rank))
-                W_up_right = torch.Tensor(size=(self.phm_dim, self.phm_rank, self._in_feats_per_axis))
-                self.init_W(W_left=W_down_left, W_right=W_down_right)
-                self.init_W(W_left=W_up_left, W_right=W_up_right)
-                parameters["W_down_left"] = nn.Parameter(W_down_left, requires_grad=True)
-                parameters["W_down_right"] = nn.Parameter(W_down_right, requires_grad=True)
-                parameters["W_up_left"] = nn.Parameter(W_up_left, requires_grad=True)
-                parameters["W_up_right"] = nn.Parameter(W_up_right, requires_grad=True)
+
+def init_shared_parameters(config, in_features, device):
+    """
+    Create and initialize the parameters shared by all compacter modules
+    """
+    parameters = nn.ParameterDict()
+    if config["shared_W_phm"]:
+        out_features = in_features // config["reduction_factor"]  # sizes are needed by both W layouts below
+        _in_feats_per_axis = in_features // config["phm_dim"]
+        _out_feats_per_axis = out_features // config["phm_dim"]
+        if config["factorized_phm_W"]:
+            W_down_left = torch.Tensor(size=(config["phm_dim"], _in_feats_per_axis, config["phm_rank"]))
+            W_down_right = torch.Tensor(size=(config["phm_dim"], config["phm_rank"], _out_feats_per_axis))
+            W_up_left = torch.Tensor(size=(config["phm_dim"], _out_feats_per_axis, config["phm_rank"]))
+            W_up_right = torch.Tensor(size=(config["phm_dim"], config["phm_rank"], _in_feats_per_axis))
+            init_W(config, W_left=W_down_left, W_right=W_down_right)
+            init_W(config, W_left=W_up_left, W_right=W_up_right)
+            parameters["W_down_left"] = nn.Parameter(W_down_left, requires_grad=True)
+            parameters["W_down_right"] = nn.Parameter(W_down_right, requires_grad=True)
+            parameters["W_up_left"] = nn.Parameter(W_up_left, requires_grad=True)
+            parameters["W_up_right"] = nn.Parameter(W_up_right, requires_grad=True)
+        else:
+            W_down = torch.Tensor(size=(config["phm_dim"], _in_feats_per_axis, _out_feats_per_axis))
+            W_up = torch.Tensor(size=(config["phm_dim"], _out_feats_per_axis, _in_feats_per_axis))
+            init_W(config, W=W_down)
+            init_W(config, W=W_up)
+            parameters["W_down"] = nn.Parameter(W_down, requires_grad=True)
+            parameters["W_up"] = nn.Parameter(W_up, requires_grad=True)
+    if config["shared_phm_rule"]:
+        if config["factorized_phm_rule"]:
+            phm_rule_left = nn.Parameter(
+                torch.FloatTensor(config["phm_dim"], config["phm_dim"], 1).to(device),
+                requires_grad=config["learn_phm"],
+            )
+            phm_rule_right = nn.Parameter(
+                torch.FloatTensor(config["phm_dim"], 1, config["phm_dim"]).to(device),
+                requires_grad=config["learn_phm"],
+            )
+            if config["phm_c_init"] == "normal":
+                phm_rule_left.data.normal_(mean=0, std=config["phm_init_range"])
+                phm_rule_right.data.normal_(mean=0, std=config["phm_init_range"])
+            elif config["phm_c_init"] == "uniform":
+                phm_rule_left.data.uniform_(-1, 1)
+                phm_rule_right.data.uniform_(-1, 1)
             else:
-                W_down = torch.Tensor(size=(self.phm_dim, self._in_feats_per_axis, self._out_feats_per_axis))
-                W_up = torch.Tensor(size=(self.phm_dim, self._out_feats_per_axis, self._in_feats_per_axis))
-                self.init_W(W=W_down)
-                self.init_W(W=W_up)
-                parameters["W_down"] = nn.Parameter(W_down, requires_grad=True)
-                parameters["W_up"] = nn.Parameter(W_up, requires_grad=True)
-        if self.shared_phm_rule:
-            if self.factorized_phm_rule:
-                phm_rule_left = nn.Parameter(
-                    torch.FloatTensor(self.phm_dim, self.phm_dim, 1).to(self.device), requires_grad=self.learn_phm
-                )
-                phm_rule_right = nn.Parameter(
-                    torch.FloatTensor(self.phm_dim, 1, self.phm_dim).to(self.device), requires_grad=self.learn_phm
-                )
-                if self.c_init == "normal":
-                    phm_rule_left.data.normal_(mean=0, std=self.phm_init_range)
-                    phm_rule_right.data.normal_(mean=0, std=self.phm_init_range)
-                elif self.c_init == "uniform":
-                    phm_rule_left.data.uniform_(-1, 1)
-                    phm_rule_right.data.uniform_(-1, 1)
-                else:
-                    raise NotImplementedError
-                parameters["phm_rule_left"] = phm_rule_left
-                parameters["phm_rule_right"] = phm_rule_right
+                raise NotImplementedError
+            parameters["phm_rule_left"] = phm_rule_left
+            parameters["phm_rule_right"] = phm_rule_right
+        else:
+            phm_rule = nn.Parameter(
+                torch.FloatTensor(config["phm_dim"], config["phm_dim"], config["phm_dim"]),
+                requires_grad=config["learn_phm"],
+            )
+            if config["phm_c_init"] == "normal":
+                phm_rule.data.normal_(mean=0, std=config["phm_init_range"])
+            elif config["phm_c_init"] == "uniform":
+                phm_rule.data.uniform_(-1, 1)
             else:
-                phm_rule = nn.Parameter(
-                    torch.FloatTensor(self.phm_dim, self.phm_dim, self.phm_dim), requires_grad=self.learn_phm
-                )
-                if self.c_init == "normal":
-                    phm_rule.data.normal_(mean=0, std=self.phm_init_range)
-                elif self.c_init == "uniform":
-                    phm_rule.data.uniform_(-1, 1)
-                else:
-                    raise NotImplementedError
-                parameters["phm_rule"] = phm_rule
-        return parameters
+                raise NotImplementedError
+            parameters["phm_rule"] = phm_rule
+    return parameters
+
+
+def init_W(config, W_left=None, W_right=None, W=None):
+    """
+    Initialize the weights for the compacter module or the shared parameters
+    """
+    # The scheme is selected by config["hypercomplex_nonlinearity"]:
+    # "glorot-normal", "glorot-uniform" or "normal"; anything else raises ValueError.
+    # Factorized configs initialize W_left and W_right, all others initialize W.
+    # Tensors are filled in place, one slice per PHM dimension.
+    # NOTE: the first branch must compare against "glorot-normal", not test truthiness.
+    if config["hypercomplex_nonlinearity"] == "glorot-normal":
+        if config["factorized_phm_W"]:
+            for i in range(config["phm_dim"]):
+                W_left.data[i] = nn.init.xavier_normal_(W_left.data[i])
+                W_right.data[i] = nn.init.xavier_normal_(W_right.data[i])
+        else:
+            for i in range(config["phm_dim"]):
+                W.data[i] = nn.init.xavier_normal_(W.data[i])
+    elif config["hypercomplex_nonlinearity"] == "glorot-uniform":
+        if config["factorized_phm_W"]:
+            for i in range(config["phm_dim"]):
+                W_left.data[i] = nn.init.xavier_uniform_(W_left.data[i])
+                W_right.data[i] = nn.init.xavier_uniform_(W_right.data[i])
+        else:
+            for i in range(config["phm_dim"]):
+                W.data[i] = nn.init.xavier_uniform_(W.data[i])
+    elif config["hypercomplex_nonlinearity"] == "normal":
+        if config["factorized_phm_W"]:
+            for i in range(config["phm_dim"]):
+                W_left.data[i].normal_(mean=0, std=config["phm_init_range"])
+                W_right.data[i].normal_(mean=0, std=config["phm_init_range"])
+        else:
+            for i in range(config["phm_dim"]):
+                W.data[i].normal_(mean=0, std=config["phm_init_range"])
+    else:
+        raise ValueError
diff --git a/src/transformers/adapters/trainer.py b/src/transformers/adapters/trainer.py
index 5746294d8..77cf4bc44 100644
--- a/src/transformers/adapters/trainer.py
+++ b/src/transformers/adapters/trainer.py
@@ -182,6 +182,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint):
             # will be resumed in deepspeed_init
             pass
         else:
+            adapter_loaded = False
             if os.path.isdir(resume_from_checkpoint):
                 adapter_loaded = self._load_adapters(resume_from_checkpoint)
                 self._load_adapter_fusions(resume_from_checkpoint)
diff --git a/src/transformers/adapters/utils.py b/src/transformers/adapters/utils.py
index a11d52845..096bc496f 100644
--- a/src/transformers/adapters/utils.py
+++ b/src/transformers/adapters/utils.py
@@ -408,7 +408,8 @@ def parse_adapter_config_string(config_string: str) -> List[Tuple[str, dict]]:
         if not match or not match.group("name"):
             raise ValueError(f"Invalid adapter config string format: '{config_string_chunk}'.")
         name = match.group("name")
-        if kvs := match.group("kvs"):
+        if match.group("kvs"):
+            kvs = match.group("kvs")
             # Replace "=" with ":" in key-value pairs for valid Python dict
             kvs = re.sub(r"(\w+)=", r"'\1':", kvs)
         else:
diff --git a/src/transformers/adapters/wrappers/configuration.py b/src/transformers/adapters/wrappers/configuration.py
index 56da1694b..3506d93f7 100644
--- a/src/transformers/adapters/wrappers/configuration.py
+++ b/src/transformers/adapters/wrappers/configuration.py
@@ -56,6 +56,7 @@
     "vit": {},
     "xlm_roberta": {},
 }
+SUBMODEL_NAMES = {"clip": ["vision_config", "text_config"], "encoder-decoder": ["encoder", "decoder"]}
 
 
 def wrap_config(config: PretrainedConfig) -> PretrainedConfig:
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 10fce437a..04740672a 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -52,7 +52,7 @@
     "pytest-subtests": "pytest-subtests",
     "pytest-timeout": "pytest-timeout",
     "pytest-xdist": "pytest-xdist",
-    "python": "python>=3.7.0",
+    "python": "python>=3.8.0",
     "ray[tune]": "ray[tune]",
     "myst-parser": "myst-parser",
     "regex": "regex!=2019.12.17",
@@ -82,7 +82,7 @@
     "timeout-decorator": "timeout-decorator",
     "timm": "timm",
     "tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.14",
-    "torch": "torch>=1.7,!=1.12.0",
+    "torch": "torch>=1.12.1",
     "torchaudio": "torchaudio",
     "pyctcdecode": "pyctcdecode>=0.4.0",
     "tqdm": "tqdm>=4.27",
diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index 77380b0d8..de3a05e7a 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -415,8 +415,8 @@ def __init__(self, config: BartConfig):
             location_key="cross",
         )
         self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
-        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.fc1 = LoRALinear(self.embed_dim, config.decoder_ffn_dim, "intermediate", config)
+        self.fc2 = LoRALinear(config.decoder_ffn_dim, self.embed_dim, "output", config)
         self.final_layer_norm = nn.LayerNorm(self.embed_dim)
 
         self._init_adapter_modules()
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index 9a268dfbd..155e96ad4 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -170,6 +170,7 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None):
                 "selfattn",
                 config,
                 fan_in_fan_out=True,
+                no_init_bias=True,
             )
         self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
 
@@ -366,8 +367,12 @@ def __init__(self, intermediate_size, config):
         super().__init__()
         embed_dim = config.hidden_size
         # Order of dimension inputs to LORALinear reversed compared to Conv1D
-        self.c_fc = LoRALinear(embed_dim, intermediate_size, "intermediate", config, fan_in_fan_out=True)
-        self.c_proj = LoRALinear(intermediate_size, embed_dim, "output", config, fan_in_fan_out=True)
+        self.c_fc = LoRALinear(
+            embed_dim, intermediate_size, "intermediate", config, fan_in_fan_out=True, no_init_bias=True
+        )
+        self.c_proj = LoRALinear(
+            intermediate_size, embed_dim, "output", config, fan_in_fan_out=True, no_init_bias=True
+        )
         self.act = ACT2FN[config.activation_function]
         self.dropout = nn.Dropout(config.resid_pdrop)
 
diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py
index d7b980fe7..3f78c5b95 100755
--- a/src/transformers/models/mbart/modeling_mbart.py
+++ b/src/transformers/models/mbart/modeling_mbart.py
@@ -412,8 +412,8 @@ def __init__(self, config: MBartConfig):
             location_key="cross",
         )
         self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
-        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.fc1 = LoRALinear(self.embed_dim, config.decoder_ffn_dim, "intermediate", config)
+        self.fc2 = LoRALinear(config.decoder_ffn_dim, self.embed_dim, "output", config)
         self.final_layer_norm = nn.LayerNorm(self.embed_dim)
 
         self._init_adapter_modules()
diff --git a/tests_adapters/methods/base.py b/tests_adapters/methods/base.py
index 742e3ec08..1539b5196 100644
--- a/tests_adapters/methods/base.py
+++ b/tests_adapters/methods/base.py
@@ -77,27 +77,23 @@ def run_delete_test(self, model, adapter_config, filter_keys):
             has_weights = True
         self.assertFalse(has_weights)
 
-    def run_get_test(self, model, adapter_config):
+    def run_get_test(self, model, adapter_config, num_expected_modules):
         model.eval()
 
         model.add_adapter("first", config=adapter_config)
-        model.add_adapter("second", config=adapter_config)
         model.set_active_adapters(["first"])
-        model.to(torch_device)
 
         # adapter is correctly added to config
         name = "first"
         self.assert_adapter_available(model, name)
 
-        first_adapter = model.get_adapter("first")
-        second_adapter = model.get_adapter("second")
+        adapter = model.get_adapter("first")
 
-        self.assertNotEqual(len(first_adapter), 0)
-        self.assertEqual(len(first_adapter), len(second_adapter))
-        self.assertNotEqual(first_adapter, second_adapter)
+        self.assertNotEqual(len(adapter), 0)
+        num_found_modules = sum([len(layer_modules) for layer_modules in adapter.values()])
+        self.assertEqual(num_expected_modules, num_found_modules)
 
         model.delete_adapter("first")
-        model.delete_adapter("second")
 
     def run_forward_test(self, model, adapter_config):
         model.eval()
diff --git a/tests_adapters/methods/test_adapter_common.py b/tests_adapters/methods/test_adapter_common.py
index 133625b20..4ffdf057b 100644
--- a/tests_adapters/methods/test_adapter_common.py
+++ b/tests_adapters/methods/test_adapter_common.py
@@ -85,10 +85,17 @@ def test_add_adapter_with_invertible(self):
     def test_get_adapter(self):
         model = self.get_model()
         model.eval()
-
-        for adapter_config, _ in self.adapter_configs_to_test:
+        n_layers = len(list(model.iter_layers()))
+        if model.config.is_encoder_decoder:
+            n_prefix_layers = 3
+        elif model.config.is_composition:
+            n_prefix_layers = 2
+        else:
+            n_prefix_layers = 1
+
+        for adapter_config, n_expected in [(HoulsbyConfig(), n_layers * 2), (MAMConfig(), n_layers + n_prefix_layers)]:
             with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__):
-                self.run_get_test(model, adapter_config)
+                self.run_get_test(model, adapter_config, n_expected)
 
     def test_add_adapter_multiple_reduction_factors(self):
         model = self.get_model()
diff --git a/tests_adapters/methods/test_compacter.py b/tests_adapters/methods/test_compacter.py
index a07662941..4a7758c05 100644
--- a/tests_adapters/methods/test_compacter.py
+++ b/tests_adapters/methods/test_compacter.py
@@ -16,7 +16,8 @@ def test_delete_compacter(self):
 
     def test_get_compacter(self):
         model = self.get_model()
-        self.run_get_test(model, CompacterPlusPlusConfig(phm_dim=2, reduction_factor=8))
+        n_layers = len(list(model.iter_layers()))
+        self.run_get_test(model, CompacterPlusPlusConfig(phm_dim=2, reduction_factor=8), n_layers + 1)
 
     def test_forward_compacter(self):
         model = self.get_model()
diff --git a/tests_adapters/methods/test_config_union.py b/tests_adapters/methods/test_config_union.py
new file mode 100644
index 000000000..e655c118d
--- /dev/null
+++ b/tests_adapters/methods/test_config_union.py
@@ -0,0 +1,38 @@
+from tests_adapters.methods.base import AdapterMethodBaseTestMixin
+from transformers.adapters.configuration import CompacterConfig, ConfigUnion, ParallelConfig, PrefixTuningConfig, LoRAConfig, PfeifferConfig
+from transformers.testing_utils import require_torch
+
+
+@require_torch
+class ConfigUnionAdapterTest(AdapterMethodBaseTestMixin):
+    adapter_configs_to_test = [
+        (ConfigUnion(
+            PrefixTuningConfig(),
+            ParallelConfig(),
+        ), ["adapters.{name}.",  "prefix_tunings.{name}."]),
+        (ConfigUnion(
+            CompacterConfig(),
+            LoRAConfig(),
+        ), ["adapters.{name}.", "loras.{name}."]),
+        (ConfigUnion(
+            PfeifferConfig(),
+            LoRAConfig(),
+        ), ["adapters.{name}.", "loras.{name}."]),
+
+    ]
+
+    def test_add_union_adapter(self):
+        model = self.get_model()
+        model.eval()
+
+        for adapter_config, filter_keys in self.adapter_configs_to_test:
+            with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__):
+                self.run_add_test(model, adapter_config, filter_keys)
+
+    def test_union_adapter_forward(self):
+        model = self.get_model()
+        model.eval()
+
+        for adapter_config, _ in self.adapter_configs_to_test:
+            with self.subTest(model_class=model.__class__.__name__, config=adapter_config.__class__.__name__):
+                self.run_forward_test(model, adapter_config)
\ No newline at end of file
diff --git a/tests_adapters/methods/test_ia3.py b/tests_adapters/methods/test_ia3.py
index b43adf48a..cda33b98b 100644
--- a/tests_adapters/methods/test_ia3.py
+++ b/tests_adapters/methods/test_ia3.py
@@ -19,7 +19,8 @@ def test_delete_ia3(self):
 
     def test_get_ia3(self):
         model = self.get_model()
-        self.run_get_test(model, IA3Config())
+        n_layers = len(list(model.iter_layers()))
+        self.run_get_test(model, IA3Config(intermediate_lora=True, output_lora=True), n_layers * 3)
 
     def test_forward_ia3(self):
         model = self.get_model()
diff --git a/tests_adapters/methods/test_lora.py b/tests_adapters/methods/test_lora.py
index 8d3557952..b0dd97870 100644
--- a/tests_adapters/methods/test_lora.py
+++ b/tests_adapters/methods/test_lora.py
@@ -19,7 +19,8 @@ def test_delete_lora(self):
 
     def test_get_lora(self):
         model = self.get_model()
-        self.run_get_test(model, LoRAConfig())
+        n_layers = len(list(model.iter_layers()))
+        self.run_get_test(model, LoRAConfig(intermediate_lora=True, output_lora=True), n_layers * 3)
 
     def test_forward_lora(self):
         model = self.get_model()
diff --git a/tests_adapters/methods/test_prefix_tuning.py b/tests_adapters/methods/test_prefix_tuning.py
index eefd68a3f..89f5933f0 100644
--- a/tests_adapters/methods/test_prefix_tuning.py
+++ b/tests_adapters/methods/test_prefix_tuning.py
@@ -18,7 +18,14 @@ def test_delete_prefix_tuning(self):
 
     def test_get_prefix_tuning(self):
         model = self.get_model()
-        self.run_get_test(model, PrefixTuningConfig(flat=True))
+        if model.config.is_encoder_decoder:
+            n_prefix_layers = 3
+        elif model.config.is_composition:
+            n_prefix_layers = 2
+        else:
+            n_prefix_layers = 1
+
+        self.run_get_test(model, PrefixTuningConfig(flat=True), n_prefix_layers)
 
     def test_forward_prefix_tuning(self):
         model = self.get_model()
diff --git a/tests_adapters/methods/test_unipelt.py b/tests_adapters/methods/test_unipelt.py
index 507aa7dd1..f6f453cea 100644
--- a/tests_adapters/methods/test_unipelt.py
+++ b/tests_adapters/methods/test_unipelt.py
@@ -18,7 +18,10 @@ def test_delete_unipelt(self):
 
     def test_get_unipelt(self):
         model = self.get_model()
-        self.run_get_test(model, UniPELTConfig())
+        n_layers = len(list(model.iter_layers()))
+        # In UniPELT, prefix tuning has gates in every layer
+        n_prefix_layers = 1.5 * n_layers if model.config.is_encoder_decoder else n_layers
+        self.run_get_test(model, UniPELTConfig(), n_layers * 2 + n_prefix_layers)
 
     def test_forward_unipelt(self):
         model = self.get_model()
diff --git a/tests_adapters/test_adapter_heads.py b/tests_adapters/test_adapter_heads.py
index a0139213f..d679d76fe 100644
--- a/tests_adapters/test_adapter_heads.py
+++ b/tests_adapters/test_adapter_heads.py
@@ -122,8 +122,6 @@ def test_causal_lm_head(self):
         model1.add_causal_lm_head("dummy")
 
         label_dict = {}
-        # Use a different length for the seq2seq output
-        seq_output_length = self.seq_length + 30
         label_dict["labels"] = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device)
 
         self.run_prediction_head_test(
@@ -137,8 +135,10 @@ def test_causal_lm_head(self):
         # Finally, also check if generation works properly
         input_ids = self.get_input_samples((1, self.seq_length), config=model1.config)["input_ids"]
         input_ids = input_ids.to(torch_device)
+        # Use a different length for the seq2seq output
+        seq_output_length = self.seq_length + 30
         generated = model1.generate(input_ids, max_length=seq_output_length)
-        self.assertEqual(generated.shape[0], 1) 
+        self.assertEqual(generated.shape[0], 1)
         self.assertLessEqual(generated.shape[1], seq_output_length)
 
     def test_seq2seq_lm_head(self):
@@ -149,8 +149,6 @@ def test_seq2seq_lm_head(self):
         model1.add_seq2seq_lm_head("dummy")
 
         label_dict = {}
-        # Use a different length for the seq2seq output
-        seq_output_length = self.seq_length + 30
         label_dict["labels"] = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device)
 
         # prepare decoder_input_ids similar to how DataCollatorForSeq2Seq does it
@@ -169,8 +167,11 @@ def test_seq2seq_lm_head(self):
         # Finally, also check if generation works properly
         input_ids = self.get_input_samples((1, self.seq_length), config=model1.config)["input_ids"]
         input_ids = input_ids.to(torch_device)
+        # Use a different length for the seq2seq output
+        seq_output_length = self.seq_length + 30
         generated = model1.generate(input_ids, max_length=seq_output_length)
-        self.assertEqual(generated.shape, (1, seq_output_length))
+        self.assertEqual(generated.shape[0], 1)
+        self.assertLessEqual(generated.shape[1], seq_output_length)
 
     def test_masked_lm_head(self):
         if not hasattr(ADAPTER_MODEL_MAPPING[self.config_class], "add_masked_lm_head"):
diff --git a/tests_adapters/test_adapter_trainer.py b/tests_adapters/test_adapter_trainer.py
index eb8ffd4a3..f1d4bb55f 100644
--- a/tests_adapters/test_adapter_trainer.py
+++ b/tests_adapters/test_adapter_trainer.py
@@ -36,48 +36,92 @@ def test_resume_training(self):
             task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
         )
         train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
+        with TemporaryDirectory() as tmpdirname:
+            model = AutoModelForSequenceClassification.from_config(self.get_model_config())
+            model.add_adapter("adapter")
+            model.add_adapter("additional_adapter")
+            model.set_active_adapters("adapter")
+            model.train_adapter("adapter")
 
-        model = AutoModelForSequenceClassification.from_config(self.get_model_config())
-        model.add_adapter("adapter")
-        model.add_adapter("additional_adapter")
-        model.set_active_adapters("adapter")
-        model.train_adapter("adapter")
+            training_args = TrainingArguments(
+                output_dir=tmpdirname,
+                do_train=True,
+                learning_rate=0.1,
+                logging_steps=1,
+                max_steps=1,
+                save_steps=1,
+                remove_unused_columns=False,
+            )
+            trainer = AdapterTrainer(
+                model=model,
+                args=training_args,
+                train_dataset=train_dataset,
+            )
 
-        training_args = TrainingArguments(
-            output_dir="./output",
-            do_train=True,
-            learning_rate=0.1,
-            logging_steps=1,
-            max_steps=1,
-            save_steps=1,
-            remove_unused_columns=False,
-        )
-        trainer = AdapterTrainer(
-            model=model,
-            args=training_args,
-            train_dataset=train_dataset,
-        )
+            trainer.train()
+            # create second model that should resume the training of the first
+            model_resume = AutoModelForSequenceClassification.from_config(self.get_model_config())
+            model_resume.add_adapter("adapter")
+            model_resume.add_adapter("additional_adapter")
+            model_resume.set_active_adapters("adapter")
+            model_resume.train_adapter("adapter")
+            trainer_resume = AdapterTrainer(
+                model=model_resume,
+                args=TrainingArguments(do_train=True, max_steps=1, output_dir=tmpdirname),
+                train_dataset=train_dataset,
+            )
+            trainer_resume.train(resume_from_checkpoint=True)
+
+            self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters)
+
+            for ((k1, v1), (k2, v2)) in zip(trainer.model.state_dict().items(), trainer_resume.model.state_dict().items()):
+                self.assertEqual(k1, k2)
+                if "adapter" in k1:
+                    self.assertTrue(torch.equal(v1, v2), k1)
+
+    def test_resume_training_invalid_checkpoint(self):
 
-        trainer.train()
-        # create second model that should resume the training of the first
-        model_resume = AutoModelForSequenceClassification.from_config(self.get_model_config())
-        model_resume.add_adapter("adapter")
-        model_resume.add_adapter("additional_adapter")
-        model_resume.set_active_adapters("adapter")
-        model_resume.train_adapter("adapter")
-        trainer_resume = AdapterTrainer(
-            model=model_resume,
-            args=TrainingArguments(do_train=True, max_steps=1, output_dir="./output"),
-            train_dataset=train_dataset,
+        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        data_args = GlueDataTrainingArguments(
+            task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
         )
-        trainer_resume.train(resume_from_checkpoint=True)
+        train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
+        with TemporaryDirectory() as tmpdirname:
+            model = AutoModelForSequenceClassification.from_config(self.get_model_config())
+            model.add_adapter("adapter")
+            model.add_adapter("additional_adapter")
+            model.set_active_adapters("adapter")
+            model.train_adapter("adapter")
 
-        self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters)
+            training_args = TrainingArguments(
+                output_dir=tmpdirname,
+                do_train=True,
+                learning_rate=0.1,
+                logging_steps=1,
+                max_steps=1,
+                save_steps=1,
+                remove_unused_columns=False,
+            )
+            trainer = AdapterTrainer(
+                model=model,
+                args=training_args,
+                train_dataset=train_dataset,
+            )
 
-        for ((k1, v1), (k2, v2)) in zip(trainer.model.state_dict().items(), trainer_resume.model.state_dict().items()):
-            self.assertEqual(k1, k2)
-            if "adapter" in k1:
-                self.assertTrue(torch.equal(v1, v2), k1)
+            trainer.train()
+            # create second model that should resume the training of the first
+            model_resume = AutoModelForSequenceClassification.from_config(self.get_model_config())
+            model_resume.add_adapter("adapter")
+            model_resume.add_adapter("additional_adapter")
+            model_resume.set_active_adapters("adapter")
+            model_resume.train_adapter("adapter")
+            trainer_resume = AdapterTrainer(
+                model=model_resume,
+                args=TrainingArguments(do_train=True, max_steps=1, output_dir=tmpdirname),
+                train_dataset=train_dataset,
+            )
+            with self.assertRaises(Exception):
+                trainer_resume.train(resume_from_checkpoint=tmpdirname+"_invalid")
 
     def test_resume_training_with_fusion(self):
         tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
@@ -85,51 +129,51 @@ def test_resume_training_with_fusion(self):
             task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
         )
         train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
+        with TemporaryDirectory() as tmpdirname:
+            model = AutoModelForSequenceClassification.from_config(self.get_model_config())
+            model.add_adapter("adapter")
+            model.add_adapter("additional_adapter")
+            model.add_adapter_fusion(Fuse("adapter", "additional_adapter"))
+            model.set_active_adapters(Fuse("adapter", "additional_adapter"))
+            model.train_fusion(Fuse("adapter", "additional_adapter"))
 
-        model = AutoModelForSequenceClassification.from_config(self.get_model_config())
-        model.add_adapter("adapter")
-        model.add_adapter("additional_adapter")
-        model.add_adapter_fusion(Fuse("adapter", "additional_adapter"))
-        model.set_active_adapters(Fuse("adapter", "additional_adapter"))
-        model.train_fusion(Fuse("adapter", "additional_adapter"))
-
-        training_args = TrainingArguments(
-            output_dir="./output",
-            do_train=True,
-            learning_rate=0.1,
-            logging_steps=1,
-            max_steps=1,
-            save_steps=1,
-            remove_unused_columns=False,
-        )
-        trainer = AdapterTrainer(
-            model=model,
-            args=training_args,
-            train_dataset=train_dataset,
-        )
+            training_args = TrainingArguments(
+                output_dir=tmpdirname,
+                do_train=True,
+                learning_rate=0.1,
+                logging_steps=1,
+                max_steps=1,
+                save_steps=1,
+                remove_unused_columns=False,
+            )
+            trainer = AdapterTrainer(
+                model=model,
+                args=training_args,
+                train_dataset=train_dataset,
+            )
 
-        trainer.train()
-        model_resume = AutoModelForSequenceClassification.from_config(self.get_model_config())
-        model_resume.add_adapter("adapter")
-        model_resume.add_adapter("additional_adapter")
-        model_resume.add_adapter_fusion(Fuse("adapter", "additional_adapter"))
-        model_resume.set_active_adapters(Fuse("adapter", "additional_adapter"))
-        model_resume.train_fusion(Fuse("adapter", "additional_adapter"))
-        trainer_resume = AdapterTrainer(
-            model=model_resume,
-            args=TrainingArguments(do_train=True, max_steps=1, output_dir="./output"),
-            train_dataset=train_dataset,
-        )
-        trainer_resume.train(resume_from_checkpoint=True)
+            trainer.train()
+            model_resume = AutoModelForSequenceClassification.from_config(self.get_model_config())
+            model_resume.add_adapter("adapter")
+            model_resume.add_adapter("additional_adapter")
+            model_resume.add_adapter_fusion(Fuse("adapter", "additional_adapter"))
+            model_resume.set_active_adapters(Fuse("adapter", "additional_adapter"))
+            model_resume.train_fusion(Fuse("adapter", "additional_adapter"))
+            trainer_resume = AdapterTrainer(
+                model=model_resume,
+                args=TrainingArguments(do_train=True, max_steps=1, output_dir=tmpdirname),
+                train_dataset=train_dataset,
+            )
+            trainer_resume.train(resume_from_checkpoint=True)
 
-        self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters)
+            self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters)
 
-        for ((k1, v1), (k2, v2)) in zip(
-            trainer.model.to("cpu").state_dict().items(), trainer_resume.model.to("cpu").state_dict().items()
-        ):
-            self.assertEqual(k1, k2)
-            if "adapter" in k1:
-                self.assertTrue(torch.equal(v1, v2), k1)
+            for ((k1, v1), (k2, v2)) in zip(
+                trainer.model.to("cpu").state_dict().items(), trainer_resume.model.to("cpu").state_dict().items()
+            ):
+                self.assertEqual(k1, k2)
+                if "adapter" in k1:
+                    self.assertTrue(torch.equal(v1, v2), k1)
 
     def test_auto_set_save_adapters(self):
         model = BertForSequenceClassification(
@@ -144,15 +188,16 @@ def test_auto_set_save_adapters(self):
         model.add_adapter("adapter2")
         model.add_adapter_fusion(Fuse("adapter1", "adapter2"))
         model.train_adapter_fusion(Fuse("adapter1", "adapter2"))
-
-        training_args = TrainingArguments(
-            output_dir="./output",
-        )
-        trainer = AdapterTrainer(
-            model=model,
-            args=training_args,
-        )
-        self.assertTrue(trainer.train_adapter_fusion)
+
+        with TemporaryDirectory() as tmpdirname:
+            training_args = TrainingArguments(
+                output_dir=tmpdirname,
+            )
+            trainer = AdapterTrainer(
+                model=model,
+                args=training_args,
+            )
+            self.assertTrue(trainer.train_adapter_fusion)
 
     @slow
     def test_training_load_best_model_at_end_full_model(self):
@@ -167,27 +212,28 @@ def test_training_load_best_model_at_end_full_model(self):
         model.add_adapter("adapter")
         model.train_adapter("adapter")
 
-        training_args = TrainingArguments(
-            output_dir="./output",
-            do_train=True,
-            learning_rate=0.001,
-            max_steps=1,
-            save_steps=1,
-            remove_unused_columns=False,
-            load_best_model_at_end=True,
-            evaluation_strategy="epoch",
-            save_strategy="epoch",
-            num_train_epochs=2,
-        )
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-        )
+        with TemporaryDirectory() as tmpdirname:
+            training_args = TrainingArguments(
+                output_dir=tmpdirname,
+                do_train=True,
+                learning_rate=0.001,
+                max_steps=1,
+                save_steps=1,
+                remove_unused_columns=False,
+                load_best_model_at_end=True,
+                evaluation_strategy="epoch",
+                save_strategy="epoch",
+                num_train_epochs=2,
+            )
+            trainer = Trainer(
+                model=model,
+                args=training_args,
+                train_dataset=train_dataset,
+                eval_dataset=eval_dataset,
+            )
 
-        trainer.train()
-        self.assertIsNotNone(trainer.model.active_adapters)
+            trainer.train()
+            self.assertIsNotNone(trainer.model.active_adapters)
 
     def test_training_load_best_model_at_end_adapter(self):
         tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
@@ -201,25 +247,26 @@ def test_training_load_best_model_at_end_adapter(self):
         model.add_adapter("adapter")
         model.train_adapter("adapter")
 
-        training_args = TrainingArguments(
-            output_dir="./output",
-            do_train=True,
-            learning_rate=0.001,
-            max_steps=1,
-            save_steps=1,
-            remove_unused_columns=False,
-            load_best_model_at_end=True,
-            evaluation_strategy="epoch",
-            save_strategy="epoch",
-            num_train_epochs=2,
-        )
-        trainer = AdapterTrainer(
-            model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
-        )
-        with self.assertLogs(logger) as cm:
-            trainer.train()
-            self.assertTrue(any("Loading best adapter(s) from" in line for line in cm.output))
-        self.assertEqual(Stack("adapter"), trainer.model.active_adapters)
+        with TemporaryDirectory() as tmpdirname:
+            training_args = TrainingArguments(
+                output_dir=tmpdirname,
+                do_train=True,
+                learning_rate=0.001,
+                max_steps=1,
+                save_steps=1,
+                remove_unused_columns=False,
+                load_best_model_at_end=True,
+                evaluation_strategy="epoch",
+                save_strategy="epoch",
+                num_train_epochs=2,
+            )
+            trainer = AdapterTrainer(
+                model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
+            )
+            with self.assertLogs(logger) as cm:
+                trainer.train()
+                self.assertTrue(any("Loading best adapter(s) from" in line for line in cm.output))
+            self.assertEqual(Stack("adapter"), trainer.model.active_adapters)
 
     def test_training_load_best_model_at_end_fusion(self):
         tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
@@ -235,25 +282,26 @@ def test_training_load_best_model_at_end_fusion(self):
         model.add_adapter_fusion(Fuse("fuse_adapter_1", "fuse_adapter_2"))
         model.train_adapter_fusion(Fuse("fuse_adapter_1", "fuse_adapter_2"))
 
-        training_args = TrainingArguments(
-            output_dir="./output",
-            do_train=True,
-            learning_rate=0.001,
-            max_steps=1,
-            save_steps=1,
-            remove_unused_columns=False,
-            load_best_model_at_end=True,
-            evaluation_strategy="epoch",
-            save_strategy="epoch",
-            num_train_epochs=2,
-        )
-        trainer = AdapterTrainer(
-            model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
-        )
-        with self.assertLogs(logger) as cm:
-            trainer.train()
-            self.assertTrue(any("Loading best adapter fusion(s) from" in line for line in cm.output))
-        self.assertEqual(Fuse("fuse_adapter_1", "fuse_adapter_2"), trainer.model.active_adapters)
+        with TemporaryDirectory() as tmpdirname:
+            training_args = TrainingArguments(
+                output_dir=tmpdirname,
+                do_train=True,
+                learning_rate=0.001,
+                max_steps=1,
+                save_steps=1,
+                remove_unused_columns=False,
+                load_best_model_at_end=True,
+                evaluation_strategy="epoch",
+                save_strategy="epoch",
+                num_train_epochs=2,
+            )
+            trainer = AdapterTrainer(
+                model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
+            )
+            with self.assertLogs(logger) as cm:
+                trainer.train()
+                self.assertTrue(any("Loading best adapter fusion(s) from" in line for line in cm.output))
+            self.assertEqual(Fuse("fuse_adapter_1", "fuse_adapter_2"), trainer.model.active_adapters)
 
     def test_reloading_prediction_head(self):
         tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
diff --git a/tests_adapters/test_bart.py b/tests_adapters/test_bart.py
index 09a1a4381..ca970207e 100644
--- a/tests_adapters/test_bart.py
+++ b/tests_adapters/test_bart.py
@@ -1,4 +1,5 @@
 import unittest
+from tests_adapters.methods.test_config_union import ConfigUnionAdapterTest
 
 from transformers import BartConfig
 from transformers.testing_utils import require_torch
@@ -49,6 +50,7 @@ class BartAdapterTest(
     PredictionHeadModelTestMixin,
     ParallelAdapterInferenceTestMixin,
     ParallelTrainingMixin,
+    ConfigUnionAdapterTest,
     BartAdapterTestBase,
     unittest.TestCase,
 ):
diff --git a/tests_adapters/test_bert.py b/tests_adapters/test_bert.py
index 1daca164d..f6c9da697 100644
--- a/tests_adapters/test_bert.py
+++ b/tests_adapters/test_bert.py
@@ -1,4 +1,5 @@
 import unittest
+from tests_adapters.methods.test_config_union import ConfigUnionAdapterTest
 
 from transformers import BertConfig
 from transformers.testing_utils import require_torch
@@ -46,6 +47,7 @@ class BertAdapterTest(
     PredictionHeadModelTestMixin,
     ParallelAdapterInferenceTestMixin,
     ParallelTrainingMixin,
+    ConfigUnionAdapterTest,
     BertAdapterTestBase,
     unittest.TestCase,
 ):
diff --git a/tests_adapters/test_deberta.py b/tests_adapters/test_deberta.py
index 99963e3e8..d5f1546fb 100644
--- a/tests_adapters/test_deberta.py
+++ b/tests_adapters/test_deberta.py
@@ -1,4 +1,5 @@
 import unittest
+from tests_adapters.methods.test_config_union import ConfigUnionAdapterTest
 
 from transformers import DebertaConfig
 from transformers.testing_utils import require_torch
@@ -50,7 +51,7 @@ class DebertaAdapterTest(
     UniPELTTestMixin,
     EmbeddingTestMixin,
     ParallelTrainingMixin,
-
+    ConfigUnionAdapterTest,
     DebertaAdapterTestBase,
     unittest.TestCase,
 ):
diff --git a/tests_adapters/test_debertaV2.py b/tests_adapters/test_debertaV2.py
index f0d3909e0..ed436f445 100644
--- a/tests_adapters/test_debertaV2.py
+++ b/tests_adapters/test_debertaV2.py
@@ -1,4 +1,5 @@
 import unittest
+from tests_adapters.methods.test_config_union import ConfigUnionAdapterTest
 
 from transformers import DebertaV2Config
 from transformers.testing_utils import require_torch
@@ -49,6 +50,7 @@ class DebertaV2AdapterTest(
     UniPELTTestMixin,
     EmbeddingTestMixin,
     ParallelTrainingMixin,
+    ConfigUnionAdapterTest,
     DebertaV2AdapterTestBase,
     unittest.TestCase,
 ):
diff --git a/tests_adapters/test_distilbert.py b/tests_adapters/test_distilbert.py
index 74008a54a..81daa08fc 100644
--- a/tests_adapters/test_distilbert.py
+++ b/tests_adapters/test_distilbert.py
@@ -1,4 +1,5 @@
 import unittest
+from tests_adapters.methods.test_config_union import ConfigUnionAdapterTest
 
 from transformers import DistilBertConfig
 from transformers.testing_utils import require_torch
@@ -46,6 +47,7 @@ class DistilBertAdapterTest(
     PredictionHeadModelTestMixin,
     ParallelAdapterInferenceTestMixin,
     ParallelTrainingMixin,
+    ConfigUnionAdapterTest,
     DistilBertAdapterTestBase,
     unittest.TestCase,
 ):
diff --git a/tests_adapters/test_gpt2.py b/tests_adapters/test_gpt2.py
index 87b3c5d95..5605c5895 100644
--- a/tests_adapters/test_gpt2.py
+++ b/tests_adapters/test_gpt2.py
@@ -1,4 +1,5 @@
 import unittest
+from tests_adapters.methods.test_config_union import ConfigUnionAdapterTest
 
 from transformers import GPT2Config
 from transformers.testing_utils import require_torch
@@ -47,6 +48,7 @@ class GPT2AdapterTest(
     PredictionHeadModelTestMixin,
     ParallelAdapterInferenceTestMixin,
     ParallelTrainingMixin,
+    ConfigUnionAdapterTest,
     GPT2AdapterTestBase,
     unittest.TestCase,
 ):
diff --git a/tests_adapters/test_gptj.py b/tests_adapters/test_gptj.py
index c6607a49d..4f294d705 100644
--- a/tests_adapters/test_gptj.py
+++ b/tests_adapters/test_gptj.py
@@ -1,4 +1,5 @@
 import unittest
+from tests_adapters.methods.test_config_union import ConfigUnionAdapterTest
 
 from transformers import GPTJConfig
 from transformers.testing_utils import require_torch
@@ -48,6 +49,7 @@ class GPTJAdapterTest(
     PredictionHeadModelTestMixin,
     ParallelAdapterInferenceTestMixin,
     ParallelTrainingMixin,
+    ConfigUnionAdapterTest,
     GPTJAdapterTestBase,
     unittest.TestCase,
 ):
diff --git a/tests_adapters/test_roberta.py b/tests_adapters/test_roberta.py
index 69c9b35eb..8d8cd23fc 100644
--- a/tests_adapters/test_roberta.py
+++ b/tests_adapters/test_roberta.py
@@ -1,4 +1,5 @@
 import unittest
+from tests_adapters.methods.test_config_union import ConfigUnionAdapterTest
 
 from transformers import RobertaConfig
 from transformers.testing_utils import require_torch
@@ -44,6 +45,7 @@ class RobertaAdapterTest(
     CompabilityTestMixin,
     PredictionHeadModelTestMixin,
     ParallelAdapterInferenceTestMixin,
+    ConfigUnionAdapterTest,
     RobertaAdapterTestBase,
     unittest.TestCase,
 ):