Current implementation for `DynamicNTKScalingRotaryEmbedding` in modeling_llama.py does not update cos, sin correctly. #27226

bzantium · 2023-11-02T06:24:07Z

DynamicNTKScalingRotaryEmbedding was originally designed to update `base` and `inv_freq` using the dynamic length ratio (`seq_len / self.max_position_embeddings`) for every input sequence. However, the current implementation updates `base` and `inv_freq` only when `seq_len > self.max_seq_len_cached`.

class LlamaRotaryEmbedding(nn.Module):
    """Rotary position embedding that serves cos/sin tables from a growable cache.

    The tables are built once in the constructor for `max_position_embeddings`
    positions and rebuilt by `forward` whenever a longer sequence is requested.

    Args:
        dim: Rotary dimension; `inv_freq` is computed over `arange(0, dim, 2)`.
        max_position_embeddings: Number of positions to pre-compute at construction.
        base: Base of the geometric progression used for the inverse frequencies.
        device: Device on which `inv_freq` is created (`None` keeps the default).
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        # Non-persistent: the buffer is derived from the config, so it is not saved in the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build `cos_cached` / `sin_cached` for positions `0 .. seq_len - 1`.

        Args:
            seq_len: Number of positions to cache; stored as `max_seq_len_cached`.
            device: Device on which the position indices are created.
            dtype: dtype the cached tables are cast to.
        """
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        # Outer product position x inverse-frequency; `torch.outer` is a plain
        # element-wise product rather than a contraction dispatched by einsum.
        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the cos/sin tables for the first `seq_len` positions.

        Args:
            x: Tensor of shape [bs, num_attention_heads, seq_len, head_size]; only its
                shape, device and dtype are used.
            seq_len: Number of positions required. When omitted, it is taken from
                the sequence axis of `x` (`x.shape[-2]`).

        Returns:
            A `(cos, sin)` tuple; each is the first `seq_len` rows of the cached
            table, cast to `x.dtype`.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The declared default used to crash on the comparison below
            # (`None > int`); fall back to the sequence axis of `x` instead.
            seq_len = x.shape[-2]

        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Only `_set_cos_sin_cache` is overridden: when the requested length exceeds
    `max_position_embeddings`, the rotary base is enlarged as a function of
    `seq_len` and `scaling_factor`, and `inv_freq` is re-registered before the
    cos/sin tables are rebuilt. `forward` is inherited, so this rebuild happens
    only when `seq_len` exceeds `max_seq_len_cached`.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Assigned before the parent constructor runs, because that constructor
        # already calls `_set_cos_sin_cache`.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cos/sin cache for `seq_len` positions, rescaling the base if needed."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # Growth term of the NTK rule: equals 1 when seq_len == max_position_embeddings.
            stretch = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            scaled_base = self.base * stretch ** (self.dim / (self.dim - 2))
            exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
            self.register_buffer("inv_freq", 1.0 / (scaled_base**exponents), persistent=False)

        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        for buffer_name, values in (("cos_cached", table.cos()), ("sin_cached", table.sin())):
            self.register_buffer(buffer_name, values.to(dtype), persistent=False)

Thus, it needs to be fixed as follows:

class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Unlike the cached parent implementation, `forward` recomputes the scaled
    base, the inverse frequencies and the cos/sin tables on every call, so the
    scaling always reflects the length of the current input.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Assigned before the parent constructor runs, matching the original ordering.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def forward(self, x, seq_len=None):
        """Compute cos/sin tables for `seq_len` positions with a length-dependent base.

        Args:
            x: Input tensor; only its shape, device and dtype are used.
            seq_len: Number of positions required. When omitted, it is taken from
                `x.shape[-2]` (assumes `x` is laid out as
                [bs, num_attention_heads, seq_len, head_size], as the parent
                class's `forward` documents).

        Returns:
            A `(cos, sin)` tuple with `seq_len` rows each, cast to `x.dtype`.
        """
        if seq_len is None:
            # The declared default used to crash on the comparison with
            # `max_position_embeddings`; derive the length from `x` instead.
            seq_len = x.shape[-2]

        # Ratio is clamped to 1 so sequences within the trained window keep the original base.
        dynamic = max(1.0, seq_len / self.max_position_embeddings)
        base = self.base * (
            (self.scaling_factor * dynamic) - (self.scaling_factor - 1)
        ) ** (self.dim / (self.dim - 2))
        inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(x.device) / self.dim))
        t = torch.arange(seq_len, device=x.device, dtype=inv_freq.dtype)
        freqs = torch.outer(t, inv_freq)
        # Both halves carry the same angles, mirroring the parent's cache layout.
        emb = torch.cat((freqs, freqs), dim=-1)
        # `emb` already has exactly `seq_len` rows, so no slicing is required.
        return emb.cos().to(dtype=x.dtype), emb.sin().to(dtype=x.dtype)

to: @ArthurZucker @younesbelkada

Expected behavior

When using DynamicNTK with Llama, cos and sin should be updated correctly.

The text was updated successfully, but these errors were encountered:

amyeroberts · 2023-11-02T11:47:02Z

cc @gante

ArthurZucker · 2023-11-06T13:57:58Z

Seems to be a duplicate of #27003 and was mentioned in #25306.

bzantium mentioned this issue Nov 2, 2023

make LlamaDynamicNTKScalingRotaryEmbedding work correctly #27227

Closed

bzantium closed this as completed Nov 15, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Current implementation for `DynamicNTKScalingRotaryEmbedding` in modeling_llama.py does not update cos, sin correctly. #27226

Current implementation for `DynamicNTKScalingRotaryEmbedding` in modeling_llama.py does not update cos, sin correctly. #27226

bzantium commented Nov 2, 2023 •

edited

Loading

amyeroberts commented Nov 2, 2023

ArthurZucker commented Nov 6, 2023

Current implementation for DynamicNTKScalingRotaryEmbedding in modeling_llama.py does not update cos, sin correctly. #27226

Current implementation for DynamicNTKScalingRotaryEmbedding in modeling_llama.py does not update cos, sin correctly. #27226

Comments

bzantium commented Nov 2, 2023 • edited Loading

Expected behavior

amyeroberts commented Nov 2, 2023

ArthurZucker commented Nov 6, 2023

Current implementation for `DynamicNTKScalingRotaryEmbedding` in modeling_llama.py does not update cos, sin correctly. #27226

Current implementation for `DynamicNTKScalingRotaryEmbedding` in modeling_llama.py does not update cos, sin correctly. #27226

bzantium commented Nov 2, 2023 •

edited

Loading