fix logic and cite partial rotary embeddings from Wang & Komatsuzaki et al
lucidrains committed Apr 14, 2022
1 parent 4f99e31 commit f2d2815
Showing 2 changed files with 9 additions and 3 deletions.
10 changes: 8 additions & 2 deletions retro_pytorch/retro_pytorch.py
@@ -322,7 +322,10 @@ def __init__(
         super().__init__()
         self.layers = nn.ModuleList([])

-        rotary_emb_dim = max(dim_head // 2, MIN_DIM_HEAD)
+        # partial rotary embeddings, which is better than full rotary
+        # Wang and Komatsuzaki et al https://github.com/kingoflolz/mesh-transformer-jax/
+
+        rotary_emb_dim = min(dim_head, MIN_DIM_HEAD)
         self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim)

         wrapper = partial(PreNorm, dim, norm_klass = norm_klass) if not post_norm else partial(PostNorm, dim, scale_residual = scale_residual, norm_klass = norm_klass)
@@ -377,7 +380,10 @@ def __init__(
         super().__init__()
         self.layers = nn.ModuleList([])

-        rotary_emb_dim = max(dim_head // 2, MIN_DIM_HEAD)
+        # partial rotary embeddings, which is better than full rotary
+        # Wang and Komatsuzaki et al https://github.com/kingoflolz/mesh-transformer-jax/
+
+        rotary_emb_dim = min(dim_head, MIN_DIM_HEAD)
         self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim)

         wrapper = partial(PreNorm, dim, norm_klass = norm_klass) if not post_norm else partial(PostNorm, dim, scale_residual = scale_residual, norm_klass = norm_klass)
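For context: partial rotary embeddings rotate only the first rotary_emb_dim channels of each attention head and pass the remaining channels through unchanged, following Wang & Komatsuzaki's GPT-J. With MIN_DIM_HEAD = 32 and, say, dim_head = 64, half of each head is rotated rather than all of it. A minimal sketch of the idea in PyTorch (the helpers rotary_frequencies, rotate_half, and apply_partial_rotary below are illustrative assumptions, not this repository's API):

import torch

def rotary_frequencies(seq_len, rotary_dim):
    # standard rotary frequencies, duplicated so they cover all rotary_dim channels
    inv_freq = 1. / (10000 ** (torch.arange(0, rotary_dim, 2).float() / rotary_dim))
    t = torch.arange(seq_len).float()
    freqs = torch.einsum('i,j->ij', t, inv_freq)   # (seq, rotary_dim // 2)
    return torch.cat((freqs, freqs), dim = -1)     # (seq, rotary_dim)

def rotate_half(x):
    # split the last dimension into two halves and rotate: (x1, x2) -> (-x2, x1)
    x1, x2 = x.chunk(2, dim = -1)
    return torch.cat((-x2, x1), dim = -1)

def apply_partial_rotary(x, freqs, rotary_dim):
    # x: (batch, heads, seq, dim_head) - only the first rotary_dim channels are rotated
    x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
    x_rot = x_rot * freqs.cos() + rotate_half(x_rot) * freqs.sin()
    return torch.cat((x_rot, x_pass), dim = -1)

q = torch.randn(1, 8, 1024, 64)          # (batch, heads, seq, dim_head)
freqs = rotary_frequencies(1024, 32)     # rotary_emb_dim = min(64, 32) = 32
q = apply_partial_rotary(q, freqs, rotary_dim = 32)

As for the "fix logic" part of the message: the old expression max(dim_head // 2, MIN_DIM_HEAD) could exceed the head dimension for small heads (e.g. dim_head = 16 yields 32), whereas min(dim_head, MIN_DIM_HEAD) caps the rotated width at the head dimension itself.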
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'retro-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.3.0',
+  version = '0.3.1',
   license='MIT',
   description = 'RETRO - Retrieval Enhanced Transformer - Pytorch',
   author = 'Phil Wang',
