Matmul tutorial - cache padding (triton-lang#14)
Adds extra optional padding that can be used to ensure that the input
matrices' strides are non-power-of-two, improving cache behavior.

Currently, it is most useful with DYNAMIC_K_BLOCK enabled.
adam-smnk authored and Devjiu committed Nov 13, 2024
1 parent 44545c6 commit 0e05cae
Showing 1 changed file with 8 additions and 0 deletions.
python/tutorials/03-matrix-multiplication-cpu.py
@@ -165,6 +165,7 @@
 DATA_TYPE = torch.float32
 K_DIM_PADDING = False
 DYNAMIC_K_BLOCK = False
+CACHE_PADDING = False
 
 @triton.jit
 def matmul_kernel(
@@ -322,6 +323,13 @@ def matmul(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor):
     b = torch.nn.functional.pad(b, (0, 0, 0, padding_size), mode='constant', value=0)
     K = a.shape[1]
 
+    # TODO: Check if padding is needed at all.
+    # Currently, cache padding is most useful together with dynamic K blocking
+    # to ensure that stride is non-power-of-two to improve cache behavior.
+    if CACHE_PADDING:
+        a = torch.nn.functional.pad(a, (0, 32, 0, 0), mode='constant', value=0)
+        b = torch.nn.functional.pad(b, (0, 32, 0, 0), mode='constant', value=0)
+
     #TODO: Currently masked load is not supported yet.
     assert (M % BLOCK_SIZE_M == 0) and (N % BLOCK_SIZE_N == 0) and (
         K % k_block == 0), "Masking currently not supported, Matrix dimensions must be multiples of block size"
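
For reference, a minimal standalone sketch (not part of the commit) of why the extra 32-element pad helps: with a power-of-two row stride such as 4096, successive rows of a float32 matrix tend to map to the same cache sets, while padding the trailing dimension shifts the stride to 4128, which is not a power of two. The matrix dimensions below are an arbitrary assumption chosen only for illustration.

import torch

M, K = 4096, 4096
a = torch.randn(M, K, dtype=torch.float32)
print(a.stride(0))          # 4096 -- power-of-two row stride

# Same padding call the tutorial applies under CACHE_PADDING:
# pad the last dimension by 32 columns on the right.
a_padded = torch.nn.functional.pad(a, (0, 32, 0, 0), mode='constant', value=0)
print(a_padded.stride(0))   # 4128 -- no longer a power of two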
