attention mask & bias #1

Merged: 73 commits, Oct 13, 2022

Commits
8b7b72c
add support for attn mask
robotcator Jul 31, 2022
b8acf76
add mask operation
robotcator Aug 1, 2022
abc409d
add mask operation
robotcator Aug 1, 2022
98b290a
add mask operation
robotcator Aug 1, 2022
ce0aff1
add interface
robotcator Aug 1, 2022
5c023ba
add mask support
robotcator Aug 1, 2022
bc5aa56
add mask support
robotcator Aug 2, 2022
a6f232b
fix up
robotcator Aug 4, 2022
2735ee9
add bias
robotcator Aug 4, 2022
3232d8d
add template
robotcator Aug 4, 2022
f33f2ad
add test
robotcator Aug 4, 2022
0402aa5
clean
robotcator Aug 4, 2022
b4793fa
clean code
robotcator Aug 4, 2022
3659eb0
add mask load
robotcator Aug 23, 2022
4368b1b
add mask test
robotcator Aug 24, 2022
81281c3
fix forward bugs
robotcator Aug 25, 2022
4bbe6b1
add test
robotcator Aug 25, 2022
d2b883b
add mask in backward
robotcator Aug 25, 2022
050107c
add test case
robotcator Aug 25, 2022
6f14cb5
add bias
robotcator Aug 26, 2022
dfcd4a7
add mask
robotcator Aug 27, 2022
ffd07e8
add bias test
robotcator Aug 30, 2022
b824852
fix test case
robotcator Aug 30, 2022
319b940
add without mask test
robotcator Aug 30, 2022
1be4a39
add kernel test
robotcator Aug 30, 2022
14f7e08
add ds save
robotcator Sep 1, 2022
debc046
fix interface
robotcator Sep 1, 2022
b812889
add test
robotcator Sep 1, 2022
62a2f88
fix dbias
robotcator Sep 3, 2022
5eb754a
add bias support
robotcator Sep 4, 2022
baa6d1b
add mask shape
robotcator Sep 5, 2022
84e462f
add test
robotcator Sep 5, 2022
81c0743
add support
robotcator Sep 6, 2022
89e74b9
fix bf16 and mask shape
robotcator Sep 6, 2022
05505f9
fix mask head=1 shape
robotcator Sep 7, 2022
e435fd1
add dump
robotcator Sep 8, 2022
30c29a6
to fix len 512
robotcator Sep 9, 2022
f06016e
add test
robotcator Sep 9, 2022
5ad59e9
fix seqlen greater than 256
robotcator Sep 9, 2022
94597de
fix bias seqlen
robotcator Sep 9, 2022
fb7ef92
add constexpr
robotcator Sep 15, 2022
4efdf9e
add const expr for bwd
robotcator Sep 15, 2022
00d3e03
add benchmark
robotcator Sep 16, 2022
24b55bd
add test tools
robotcator Sep 16, 2022
a0b4891
add script
robotcator Sep 16, 2022
95d0308
add cross attention
robotcator Sep 22, 2022
df852f5
add cross attn
robotcator Sep 22, 2022
f71172d
Merge branch 'main' of github.com:robotcator/flash-attention into att…
robotcator Sep 23, 2022
d59fa76
fix bugs
robotcator Oct 10, 2022
bdc1fb3
remove test tools
robotcator Oct 11, 2022
2543703
clean fmha_api.cpp
robotcator Oct 11, 2022
bf68f90
clean fmha_dgrad_fp16_kernel_loop.sm80.cu
robotcator Oct 11, 2022
4f37437
clean fmha_dgrad_kernel_1xN_loop.h
robotcator Oct 11, 2022
60e27d8
clean fmha_fprop_fp16_kernel.sm80.cu
robotcator Oct 11, 2022
96b83bf
clean fmha_fprop_kernel_1xN.h
robotcator Oct 11, 2022
5c167cf
merge from master
robotcator Oct 11, 2022
9c1cb91
clean gmem_tile.h
robotcator Oct 11, 2022
4d20f59
clean softmax.h
robotcator Oct 11, 2022
ede0a96
restore test_flash_attn.py
robotcator Oct 11, 2022
2993cae
clean gmem_tile.h
robotcator Oct 11, 2022
1be0e94
fix fmha_fprop_kernel_1xN.h
robotcator Oct 12, 2022
32f6cd1
fix fmha_dgrad_kernel_1xN_loop.h
robotcator Oct 12, 2022
30e4253
rename has_attn to has_attn_mask, has_bias to has_attn_bias
robotcator Oct 12, 2022
e8a376e
fix fmha_fprop_kernel_1xN.h
robotcator Oct 12, 2022
806e156
rename has_attn to has_attn_mask, has_bias to has_attn_bias
robotcator Oct 12, 2022
15ade00
remove useless benchmark code
robotcator Oct 12, 2022
d663cf5
add declaration
robotcator Oct 12, 2022
de4f2cc
remove useless comments
robotcator Oct 12, 2022
2957838
remove useless comments
robotcator Oct 12, 2022
39fa9d4
add timeout
robotcator Oct 12, 2022
0bb403e
add default timeout for build wheel
robotcator Oct 12, 2022
184991b
remove timeout
robotcator Oct 12, 2022
3384115
reduce build worker for workflow oom
robotcator Oct 13, 2022
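
For orientation before the diffs: the feature being added is ordinary attention with an additive mask and an additive bias folded into the pre-softmax logits. A minimal PyTorch sketch of that semantics (an illustration, not code from this PR; any 1/sqrt(d) scaling of the query is assumed to happen upstream, as in the reference implementation in the diff):

import torch

def attention_with_mask_bias(q, k, v, mask=None, bias=None):
    # q, k, v: [..., seq, head_dim]; mask and bias broadcast against the
    # [..., seq_q, seq_k] logit matrix.
    logits = torch.matmul(q, k.transpose(-1, -2))
    if mask is not None:
        logits = logits + mask  # additive mask: 0 keeps a position, ~-3e4 drops it
    if bias is not None:
        logits = logits + bias  # e.g. a learned pairwise bias term
    return torch.matmul(logits.softmax(dim=-1), v)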
125 changes: 125 additions & 0 deletions .gitignore
@@ -0,0 +1,125 @@
*.pt
*.tfevents.*
# JetBrains PyCharm IDE
.idea/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# macOS dir files
.DS_Store

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Checkpoints
checkpoints

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mypy
.mypy_cache/

# VSCODE
.vscode/ftp-sync.json
.vscode/settings.json

# too big to git
*.lmdb
*.sto
*.pt
*.pkl

# pytest
.pytest_cache
test/.pytest_cache
/local*
/_*
35 changes: 35 additions & 0 deletions benchmarks/correctness/attention.py
@@ -0,0 +1,35 @@
import torch
from typing import Optional, Callable, List, Tuple, Sequence

from unicore.modules import softmax_dropout


def permute_final_dims(tensor: torch.Tensor, inds: List[int]):
    # Permute only the last len(inds) dimensions, leaving leading
    # (batch-like) dimensions untouched.
    zero_index = -1 * len(inds)
    first_inds = list(range(len(tensor.shape[:zero_index])))
    return tensor.permute(first_inds + [zero_index + i for i in inds])


def _attention(query, key, value, mask=None, bias=None, upcast=False) -> torch.Tensor:
    dtype_og = query.dtype

    # Optionally run the reference computation in fp32 for a tighter
    # numerical baseline.
    if upcast:
        query = query.float()
        key = key.float()
        value = value.float()
        if mask is not None:
            mask = mask.float()
        if bias is not None:
            bias = bias.float()

    # [*, H, C_hidden, K]
    key = permute_final_dims(key, (1, 0))

    # [*, H, Q, K]
    a = torch.matmul(query, key)

    a = softmax_dropout(a, dropout_prob=0, is_training=True, mask=mask, bias=bias)

    # [*, H, Q, C_hidden]
    b = torch.matmul(a, value)

    return b.to(dtype_og)
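
A hypothetical usage sketch for the reference above (my addition, not part of the diff; shapes follow the benchmark script below, and unicore must be importable for softmax_dropout):

import torch
from attention import _attention

bs, seq, head, c_dim = 1, 128, 4, 32
q = k = v = torch.randn(bs, seq, head, seq, c_dim, device="cuda", dtype=torch.bfloat16)
mask = torch.zeros(bs, seq, 1, 1, seq, device="cuda", dtype=torch.bfloat16)  # all-zero additive mask keeps every position
bias = torch.randn(1, 1, head, seq, seq, device="cuda", dtype=torch.bfloat16)

out = _attention(q, k, v, mask=mask, bias=bias, upcast=True)
print(out.shape)  # torch.Size([1, 128, 4, 128, 32])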
128 changes: 128 additions & 0 deletions benchmarks/correctness/benchmark_memory.py
@@ -0,0 +1,128 @@
import torch
import torch.utils.benchmark as benchmark

from flash_attention import _flash_attn
from attention import _attention
from torch_attention import _torch_attention

import argparse

parser = argparse.ArgumentParser()
# argparse's type=bool treats any non-empty string (including "False") as
# True, so these boolean switches are declared as store_true flags.
parser.add_argument("--has_mask_bias", required=False, help="add mask and bias in attention", action="store_true")
parser.add_argument("--eval", required=False, help="run forward only (skip the backward pass)", action="store_true")

args = parser.parse_args()
print(args)


def benchmark_memory(fn, inputs, mask=None, bias=None, grad=None, eval=True, desc='', verbose=False, **kwinputs):
    def fwd(grad, inputs, mask=mask, bias=bias, **kwinputs):
        # Forward only, without autograd bookkeeping.
        with torch.no_grad():
            y = fn(inputs, inputs, inputs, mask=mask, bias=bias, **kwinputs)

    def fwd_bwd(grad, inputs, mask=mask, bias=bias, **kwinputs):
        y = fn(inputs, inputs, inputs, mask=mask, bias=bias, **kwinputs)
        if type(y) is tuple:
            y = y[0]
        if grad is None:
            grad = torch.randn_like(y)
        elif grad.shape != y.shape:
            raise RuntimeError('Grad shape does not match output shape')
        y.backward(grad, retain_graph=False)

    if eval:
        f = fwd
        if verbose:
            print("using fwd func...")
    else:
        f = fwd_bwd
        if verbose:
            print("using fwd and bwd func...")

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

    f(None, inputs, mask, bias)

    torch.cuda.synchronize()
    # Peak allocated memory in GB (MiB * 1000).
    mem = torch.cuda.max_memory_allocated() / ((2 ** 20) * 1000)
    if verbose:
        print(f"{desc} max memory: ", mem)
    torch.cuda.empty_cache()
    return mem


def gen_attn_mask(mask, neg_inf):
    # Turn a 0/1 keep-mask into an additive mask: kept positions stay 0,
    # masked positions become a large negative number.
    assert neg_inf < -1e4
    attn_mask = torch.zeros_like(mask)
    attn_mask[mask == 0] = neg_inf
    return attn_mask


def fun(seqlen=128, verbose=False, has_bias=True, has_mask=True, eval=True):
    bs = 1
    head = 4
    c_dim = 32
    seq_q = seq_k = seq_v = seqlen
    dtype = torch.bfloat16
    device = "cuda"

    # [bs, seq_q, head, seq_k, c_dim]
    inputs = torch.empty((bs, seq_q, head, seq_q, c_dim), dtype=dtype, device=device).normal_(mean=0, std=.5)
    inputs.requires_grad = True
    if verbose:
        print("inputs shape: ", inputs.shape)

    if has_bias:
        # [1, 1, head, seq_q, seq_k], broadcast over batch and the leading
        # sequence dimension.
        bias = torch.randn(
            1, 1, head, seq_q, seq_k, dtype=dtype, device=device
        )
        bias.requires_grad = True
        if verbose:
            print("bias shape: ", bias.shape)
    else:
        bias = None

    if has_mask:
        # [bs, seq_q, 1, 1, seq_k]: keep ~80% of positions, drop the rest.
        mask = gen_attn_mask(
            (
                torch.rand(bs, seq_q, 1, 1, seq_k, dtype=dtype, device=device) > 0.2
            ).type(dtype),
            -3e4,
        )
        if verbose:
            print("mask shape: ", mask.shape)
    else:
        mask = None

    print("processing seq length: {} in eval mode {} ......".format(seqlen, eval))

    try:
        m1 = benchmark_memory(_attention, inputs, mask=mask, bias=bias, eval=eval, desc='Normal Attention forward')
        print(m1)
    except RuntimeError:  # CUDA OOM surfaces as a RuntimeError
        print("Normal Attention OOM")

    try:
        m2 = benchmark_memory(_flash_attn, inputs, mask=mask, bias=bias, eval=eval, desc='Flash Attention forward')
        print(m2)
    except RuntimeError:
        print("Flash Attention OOM")


for seqlen in [2**8, 2**9, 600, 700, 800, 2**10, 1200, 1400, 2**11, 2500, 3000, 3500, 2**12]:
    if args.has_mask_bias:
        fun(seqlen=seqlen, eval=args.eval)
    else:
        fun(seqlen=seqlen, has_bias=False, has_mask=False, eval=args.eval)
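
With the store_true flags above, a typical invocation is assumed to look like python benchmark_memory.py --has_mask_bias --eval for forward-only peak-memory numbers with mask and bias, or with no flags for forward plus backward without them. A quick illustration of the gen_attn_mask helper (hypothetical values, run in the script's namespace):

keep = torch.tensor([1., 0., 1.])
print(gen_attn_mask(keep, -3e4))  # tensor([     0., -30000.,      0.]), added to the logits before softmax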
