Cache: dynamic cache with cross attention and UMT5 Cache support #28185

Closed · wants to merge 3 commits · Changes from 2 commits
15 changes: 15 additions & 0 deletions docs/source/en/internal/generation_utils.md
@@ -373,15 +373,30 @@ A [`Constraint`] can be used to force the generation to include specific tokens

[[autodoc]] Cache
- update
- update_cross_attention
- has_cached_cross_attentions
- get_seq_length
- get_max_length
- get_usable_length

[[autodoc]] DynamicCache
- update
- get_seq_length
- get_max_length
- get_usable_length
- reorder_cache
- to_legacy_cache
- from_legacy_cache

[[autodoc]] DynamicCacheWithCrossAttention
- update_cross_attention
- has_cached_cross_attentions
- to_legacy_cache
- from_legacy_cache

[[autodoc]] SinkCache
- update
- get_seq_length
- get_max_length
- get_usable_length
- reorder_cache
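The two methods added to the base `[[autodoc]] Cache` entry (`update_cross_attention` and `has_cached_cross_attentions`) document default behaviour rather than new functionality: a cache that does not support cross attention reports that nothing is cached and refuses updates. A minimal sketch of that contract as this branch defines it — the `None` arguments are only there to trigger the error path:

```python
from transformers import DynamicCache  # does not override the new base-class hooks

cache = DynamicCache()

# Decoder-only caches inherit the base-class default: no cross attention is ever cached.
assert cache.has_cached_cross_attentions(layer_idx=0) is False

# The base-class `update_cross_attention` is deliberately unimplemented and points
# users toward an encoder-decoder cache subclass.
try:
    cache.update_cross_attention(key_states=None, value_states=None, layer_idx=0)
except NotImplementedError as err:
    print(err)  # "Make sure to implement `update_cross_attention` in a subclass ..."
```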
4 changes: 2 additions & 2 deletions src/transformers/__init__.py
@@ -1308,7 +1308,7 @@
_import_structure["activations"] = []
_import_structure["benchmark.benchmark"] = ["PyTorchBenchmark"]
_import_structure["benchmark.benchmark_args"] = ["PyTorchBenchmarkArguments"]
_import_structure["cache_utils"] = ["Cache", "DynamicCache", "SinkCache"]
_import_structure["cache_utils"] = ["Cache", "DynamicCache", "DynamicCacheWithCrossAttention", "SinkCache"]
_import_structure["data.datasets"] = [
"GlueDataset",
"GlueDataTrainingArguments",
@@ -5966,7 +5966,7 @@
# Benchmarks
from .benchmark.benchmark import PyTorchBenchmark
from .benchmark.benchmark_args import PyTorchBenchmarkArguments
from .cache_utils import Cache, DynamicCache, SinkCache
from .cache_utils import Cache, DynamicCache, DynamicCacheWithCrossAttention, SinkCache
from .data.datasets import (
GlueDataset,
GlueDataTrainingArguments,
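With both the lazy `_import_structure` entry and the eager import updated, the new class becomes part of the public top-level API alongside the existing cache classes. A quick smoke test, assuming transformers is installed from this PR's branch:

```python
# Assumes an install of this PR's branch of transformers.
from transformers import Cache, DynamicCache, DynamicCacheWithCrossAttention, SinkCache

cache = DynamicCacheWithCrossAttention()
print(isinstance(cache, DynamicCache), isinstance(cache, Cache))  # True True
```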
129 changes: 118 additions & 11 deletions src/transformers/cache_utils.py
@@ -34,6 +34,25 @@ def update(
"""
raise NotImplementedError("Make sure to implement `update` in a subclass.")

def update_cross_attention(self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx: int):
"""
Updates the cross attention cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters:
key_states (`torch.Tensor`):
The new key states to cache.
value_states (`torch.Tensor`):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
"""
raise NotImplementedError(
"Make sure to implement `update_cross_attention` in a subclass to use in encoder-decoder models."
)

def has_cached_cross_attentions(self, layer_idx: Optional[int] = 0) -> bool:
"""Returns whether it has cached cross attentions. A layer index can be optionally passed."""
return False

def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
"""Returns the sequence length of the cached states. A layer index can be optionally passed."""
raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.")
@@ -56,7 +75,7 @@ def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -

class DynamicCache(Cache):
"""
A cache that grows dynamically as more tokens are generated. This is the default for generative models.
A [~Cache] that grows dynamically as more tokens are generated. This is the default for generative models.

It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
`[batch_size, num_heads, seq_len, head_dim]`.
@@ -136,27 +155,29 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
return self.key_cache[layer_idx].shape[-2]

def get_max_length(self) -> Optional[int]:
"""Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
"""Returns the maximum sequence length of the cached states. [~DynamicCache] does not have a maximum length."""
return None

def reorder_cache(self, beam_idx: torch.LongTensor):
"""Reorders the cache for beam search, given the selected beam indices."""
"""Reorders the self attention cache for beam search, given the selected beam indices."""
for layer_idx in range(len(self.key_cache)):
device = self.key_cache[layer_idx].device
self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
device = self.value_cache[layer_idx].device
self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))

def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
"""Converts the `DynamicCache` instance into the its equivalent in the legacy cache format."""
def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
"""Converts the [~DynamicCache] instance into the its equivalent in the legacy cache format."""
legacy_cache = ()
for layer_idx in range(len(self)):
legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),)
legacy_cache += (self[layer_idx],)
return legacy_cache

@classmethod
def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
"""Converts a cache in the legacy cache format into an equivalent `DynamicCache`."""
def from_legacy_cache(
cls, past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor]]] = None
) -> "DynamicCache":
"""Converts a cache in the legacy cache format into an equivalent [~DynamicCache]."""
cache = cls()
if past_key_values is not None:
for layer_idx in range(len(past_key_values)):
@@ -165,11 +186,97 @@ def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTens
return cache


class DynamicCacheWithCrossAttention(DynamicCache):
"""
Expands [~DynamicCache] with cross attention. This is the default for encoder-decoder generative models.
It stores the cross-attention Key and Value states as a list of tensors, one for each layer. The expected shape
for each tensor is `[batch_size, num_heads, encoder_sequence_length, embed_size_per_head]`. Please refer to
[~DynamicCache] for documentation and functions related to self-attention.
"""

def __init__(self) -> None:
self.cross_attention_key_cache: List[torch.Tensor] = []
self.cross_attention_value_cache: List[torch.Tensor] = []
super().__init__()

def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
"""
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
sequence length.
"""
if layer_idx < len(self):
return (
self.key_cache[layer_idx],
self.value_cache[layer_idx],
self.cross_attention_key_cache[layer_idx],
self.cross_attention_value_cache[layer_idx],
)
else:
raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")

def __iter__(self):
"""
Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
keys and values
"""
for layer_idx in range(len(self)):
yield (
self.key_cache[layer_idx],
self.value_cache[layer_idx],
self.cross_attention_key_cache[layer_idx],
self.cross_attention_value_cache[layer_idx],
)

def update_cross_attention(self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx: int):
"""
Updates the cross attention cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters:
key_states (`torch.Tensor`):
The new key states to cache.
value_states (`torch.Tensor`):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
"""
# Update the cache
if len(self.cross_attention_key_cache) <= layer_idx:
self.cross_attention_key_cache.append(key_states)
self.cross_attention_value_cache.append(value_states)
else:
raise ValueError(
"Attempted to update the cross attention cache for a layer that already has a cached cross attention."
)

def has_cached_cross_attentions(self, layer_idx: Optional[int] = 0) -> bool:
"""Returns whether it has cached cross attentions. A layer index can be optionally passed."""
return len(self.cross_attention_key_cache) > layer_idx

def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]]:
"""Converts the [~DynamicCacheWithCrossAttention] instance into the its equivalent in the legacy cache format."""
legacy_cache = ()
for layer_idx in range(len(self)):
legacy_cache += (self[layer_idx],)
return legacy_cache

@classmethod
def from_legacy_cache(
cls, past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]]] = None
) -> "DynamicCacheWithCrossAttention":
"""Converts a cache in the legacy cache format into an equivalent [~DynamicCacheWithCrossAttention]."""
cache = cls()
if past_key_values is not None:
for layer_idx in range(len(past_key_values)):
key_states, value_states, cross_attn_key_states, cross_attn_value_states = past_key_values[layer_idx]
cache.update(key_states, value_states, layer_idx)
cache.update_cross_attention(cross_attn_key_states, cross_attn_value_states, layer_idx)
return cache


class SinkCache(Cache):
"""
A cache that as described in the [Attention Sinks paper](https://arxiv.org/abs/2309.17453). It allows the model to
generate beyond the length of its context window, without losing fluency in the conversation. As it discards past
tokens, the model will lose the ability to generate tokens that depend on the context that was discarded.
A [~Cache] as described in the [Attention Sinks paper](https://arxiv.org/abs/2309.17453). It allows the model
to generate beyond the length of its context window, without losing fluency in the conversation. As it discards
past tokens, the model will lose the ability to generate tokens that depend on the context that was discarded.

It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
`[batch_size, num_heads, seq_len, head_dim]`.
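Putting the new class together: cross-attention key/value states are written once per layer (typically on the first decoding step, from the projected encoder output), the inherited self-attention path grows by one position per generated token, and each layer round-trips through the 4-tuple legacy format. A minimal usage sketch under assumed tensor shapes — the two-layer loop and dimensions below are illustrative, not lifted from the UMT5 changes in this PR:

```python
import torch
from transformers import DynamicCacheWithCrossAttention

batch, heads, head_dim, enc_len = 2, 4, 16, 10
cache = DynamicCacheWithCrossAttention()

for step in range(3):           # three decoding steps
    for layer_idx in range(2):  # two decoder layers
        # Cross attention: projected encoder states, cached only on the first step.
        if not cache.has_cached_cross_attentions(layer_idx):
            enc_k = torch.randn(batch, heads, enc_len, head_dim)
            enc_v = torch.randn(batch, heads, enc_len, head_dim)
            cache.update_cross_attention(enc_k, enc_v, layer_idx)

        # Self attention: one new position per step, appended to the running cache.
        new_k = torch.randn(batch, heads, 1, head_dim)
        new_v = torch.randn(batch, heads, 1, head_dim)
        k, v = cache.update(new_k, new_v, layer_idx)

print(cache.get_seq_length())  # 3 cached self-attention positions
print(len(cache[0]))           # 4: (self K, self V, cross K, cross V) per layer

# The legacy tuple-of-4-tuples format round-trips through the new converters.
legacy = cache.to_legacy_cache()
restored = DynamicCacheWithCrossAttention.from_legacy_cache(legacy)
print(restored.get_seq_length() == cache.get_seq_length())  # True
```

Because `update_cross_attention` raises when a layer already holds cross-attention states, the encoder projections stay write-once, which is why the sketch guards the update with `has_cached_cross_attentions`.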
2 changes: 1 addition & 1 deletion src/transformers/models/llama/modeling_llama.py
@@ -291,7 +291,7 @@ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will lead"
"to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
2 changes: 1 addition & 1 deletion src/transformers/models/mistral/modeling_mistral.py
@@ -202,7 +202,7 @@ def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None):
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will lead"
"to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
2 changes: 1 addition & 1 deletion src/transformers/models/mixtral/modeling_mixtral.py
@@ -249,7 +249,7 @@ def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None):
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will lead"
"to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
2 changes: 1 addition & 1 deletion src/transformers/models/persimmon/modeling_persimmon.py
@@ -185,7 +185,7 @@ def __init__(self, config: PersimmonConfig, layer_idx: Optional[int] = None):
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will lead"
"to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
2 changes: 1 addition & 1 deletion src/transformers/models/phi/modeling_phi.py
@@ -224,7 +224,7 @@ def __init__(self, config: PhiConfig, layer_idx: Optional[int] = None):
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will lead"
"to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)