diff --git a/src/transformers/adapters/mixins/t5.py b/src/transformers/adapters/mixins/t5.py
index 4be1aefe7f..49117d827e 100644
--- a/src/transformers/adapters/mixins/t5.py
+++ b/src/transformers/adapters/mixins/t5.py
@@ -25,13 +25,13 @@ class T5ModelAdaptersMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, Mode
     """Adds adapters to the T5Model class."""
 
     def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]:
+        global_i = 0
         if hasattr(self, "encoder"):
+            global_i = len(self.encoder.block)
             for i, layer in enumerate(self.encoder.block):
                 yield i, layer
-            for i, layer in enumerate(self.decoder.block, start=len(self.encoder.block)):
-                yield i, layer
-        else:
-            for i, layer in enumerate(self.decoder.block):
+        if hasattr(self, "decoder"):
+            for i, layer in enumerate(self.decoder.block, start=global_i):
                 yield i, layer
 
     def _init_adapter_modules(self):
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index 6eeab16b93..4703920155 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -1831,6 +1831,8 @@ def __init__(self, config: T5Config):
         self.model_parallel = False
         self.device_map = None
 
+        self._init_adapter_modules()
+
    @add_start_docstrings(PARALLELIZE_DOCSTRING)
     def parallelize(self, device_map=None):
         self.device_map = (
@@ -1870,6 +1872,7 @@ class PreTrainedModel
     @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    @ForwardContext.wrap
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
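
For context (not part of the patch): a minimal sketch of how the revised iter_layers is expected to behave on a full encoder-decoder model, assuming the adapter-transformers T5Model mixes in T5ModelAdaptersMixin; the checkpoint name and the block counts in the comments are illustrative assumptions, not taken from the diff.

# Illustrative sketch only, under the assumptions stated above.
from transformers import T5Model

model = T5Model.from_pretrained("t5-small")
for idx, block in model.iter_layers():
    # One continuous index over both stacks: encoder blocks first, then the
    # decoder blocks continue from start=global_i. An encoder-only or
    # decoder-only model simply skips the missing branch.
    print(idx, type(block).__name__)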