From 5074ee6204f7c33875ee40988f1dc9bb20211a3b Mon Sep 17 00:00:00 2001
From: Brian Popow <popow@gmx.de>
Date: Fri, 12 Nov 2021 13:33:30 +0100
Subject: [PATCH] Refactor: extract horizontal and vertical pass into methods

---
 .../Formats/Webp/Lossy/Vp8Encoding.cs         | 161 +++++++-----------
 1 file changed, 63 insertions(+), 98 deletions(-)
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index bcecdcd757..aa4ab5767b 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -136,61 +136,14 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
 
                 // Vertical pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
-                Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
-                Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
-
-                // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
-                Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
-                Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
-                Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
-                Vector128<short> c4 = Sse2.Subtract(c1, c2);
-                Vector128<short> c = Sse2.Add(c3, c4);
-
-                // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
-                Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
-                Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
-                Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
-                Vector128<short> d4 = Sse2.Add(d1, d2);
-                Vector128<short> d = Sse2.Add(d3, d4);
-
-                // Second pass.
-                Vector128<short> tmp0 = Sse2.Add(a, d);
-                Vector128<short> tmp1 = Sse2.Add(b, c);
-                Vector128<short> tmp2 = Sse2.Subtract(b, c);
-                Vector128<short> tmp3 = Sse2.Subtract(a, d);
+                InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
 
                 // Transpose the two 4x4.
                 LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
 
                 // Horizontal pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
-                Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
-                a = Sse2.Add(dc, t2.AsInt16());
-                b = Sse2.Subtract(dc, t2.AsInt16());
-
-                // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
-                c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
-                c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
-                c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
-                c4 = Sse2.Subtract(c1, c2);
-                c = Sse2.Add(c3, c4);
-
-                // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
-                d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
-                d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
-                d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
-                d4 = Sse2.Add(d1, d2);
-                d = Sse2.Add(d3, d4);
-
-                // Second pass.
-                tmp0 = Sse2.Add(a, d);
-                tmp1 = Sse2.Add(b, c);
-                tmp2 = Sse2.Subtract(b, c);
-                tmp3 = Sse2.Subtract(a, d);
-                Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
-                Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
-                Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
-                Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+                InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
 
                 // Transpose the two 4x4.
                 LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
@@ -266,61 +219,14 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
 
                 // Vertical pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
-                Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
-                Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
-
-                // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
-                Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
-                Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
-                Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
-                Vector128<short> c4 = Sse2.Subtract(c1, c2);
-                Vector128<short> c = Sse2.Add(c3, c4);
-
-                // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
-                Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
-                Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
-                Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
-                Vector128<short> d4 = Sse2.Add(d1, d2);
-                Vector128<short> d = Sse2.Add(d3, d4);
-
-                // Second pass.
-                Vector128<short> tmp0 = Sse2.Add(a, d);
-                Vector128<short> tmp1 = Sse2.Add(b, c);
-                Vector128<short> tmp2 = Sse2.Subtract(b, c);
-                Vector128<short> tmp3 = Sse2.Subtract(a, d);
+                InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
 
                 // Transpose the two 4x4.
                 LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
 
                 // Horizontal pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
-                Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
-                a = Sse2.Add(dc, t2.AsInt16());
-                b = Sse2.Subtract(dc, t2.AsInt16());
-
-                // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
-                c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
-                c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
-                c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
-                c4 = Sse2.Subtract(c1, c2);
-                c = Sse2.Add(c3, c4);
-
-                // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
-                d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
-                d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
-                d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
-                d4 = Sse2.Add(d1, d2);
-                d = Sse2.Add(d3, d4);
-
-                // Second pass.
-                tmp0 = Sse2.Add(a, d);
-                tmp1 = Sse2.Add(b, c);
-                tmp2 = Sse2.Subtract(b, c);
-                tmp3 = Sse2.Subtract(a, d);
-                Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
-                Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
-                Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
-                Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+                InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
 
                 // Transpose the two 4x4.
                 LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
@@ -409,6 +315,65 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
             }
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static void InverseTransformVerticalPass(Vector128<long> in0, Vector128<long> in2, Vector128<long> in1, Vector128<long> in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3)
+        {
+            // Butterfly over four vectors whose lanes are reinterpreted as 16 bit values.
+            // Mind the parameter order (in0, in2, in1, in3): the even pair is passed first, then the odd pair.
+            Vector128<short> even0 = in0.AsInt16();
+            Vector128<short> even1 = in2.AsInt16();
+            Vector128<short> odd0 = in1.AsInt16();
+            Vector128<short> odd1 = in3.AsInt16();
+
+            // c = MUL(in1, K2) - MUL(in3, K1) = (in1 - in3) + (MUL(in1, k2) - MUL(in3, k1))
+            Vector128<short> c = Sse2.Add(Sse2.Subtract(odd0, odd1), Sse2.Subtract(Sse2.MultiplyHigh(odd0, K2), Sse2.MultiplyHigh(odd1, K1)));
+
+            // d = MUL(in1, K1) + MUL(in3, K2) = (in1 + in3) + (MUL(in1, k1) + MUL(in3, k2))
+            Vector128<short> d = Sse2.Add(Sse2.Add(odd0, odd1), Sse2.Add(Sse2.MultiplyHigh(odd0, K1), Sse2.MultiplyHigh(odd1, K2)));
+
+            // Plain sum and difference of in0 and in2.
+            Vector128<short> sum02 = Sse2.Add(even0, even1);
+            Vector128<short> diff02 = Sse2.Subtract(even0, even1);
+
+            // Second pass: combine both halves into the four outputs.
+            tmp0 = Sse2.Add(sum02, d);
+            tmp1 = Sse2.Add(diff02, c);
+            tmp2 = Sse2.Subtract(diff02, c);
+            tmp3 = Sse2.Subtract(sum02, d);
+        }
+
+        private static void InverseTransformHorizontalPass(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
+        {
+            // Mind the parameter order (t0, t2, t1, t3); all lanes are reinterpreted as 16 bit values.
+            Vector128<short> even1 = t2.AsInt16();
+            Vector128<short> odd0 = t1.AsInt16();
+            Vector128<short> odd1 = t3.AsInt16();
+
+            // Four is added to t0 up front, ahead of the final arithmetic right shift by 3.
+            Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
+            Vector128<short> sum02 = Sse2.Add(dc, even1);
+            Vector128<short> diff02 = Sse2.Subtract(dc, even1);
+
+            // c = MUL(T1, K2) - MUL(T3, K1) = (T1 - T3) + (MUL(T1, k2) - MUL(T3, k1))
+            Vector128<short> c = Sse2.Add(Sse2.Subtract(odd0, odd1), Sse2.Subtract(Sse2.MultiplyHigh(odd0, K2), Sse2.MultiplyHigh(odd1, K1)));
+
+            // d = MUL(T1, K1) + MUL(T3, K2) = (T1 + T3) + (MUL(T1, k1) + MUL(T3, k2))
+            Vector128<short> d = Sse2.Add(Sse2.Add(odd0, odd1), Sse2.Add(Sse2.MultiplyHigh(odd0, K1), Sse2.MultiplyHigh(odd1, K2)));
+
+            // Second pass: combine both halves.
+            Vector128<short> out0 = Sse2.Add(sum02, d);
+            Vector128<short> out1 = Sse2.Add(diff02, c);
+            Vector128<short> out2 = Sse2.Subtract(diff02, c);
+            Vector128<short> out3 = Sse2.Subtract(sum02, d);
+
+            // Arithmetic shift right by 3 produces the final values.
+            shifted0 = Sse2.ShiftRightArithmetic(out0, 3);
+            shifted1 = Sse2.ShiftRightArithmetic(out1, 3);
+            shifted2 = Sse2.ShiftRightArithmetic(out2, 3);
+            shifted3 = Sse2.ShiftRightArithmetic(out3, 3);
+        }
+#endif
+
         public static void FTransform2(Span<byte> src, Span<byte> reference, Span<short> output, Span<short> output2, Span<int> scratch)
         {
             FTransform(src, reference, output, scratch);