Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code Generation for Matrix Multiplication #653

Merged
merged 41 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
aa65b45
Adaptations to build code on my laptop
resting-dove Nov 16, 2023
8624b6f
matmul tiled as blog
resting-dove Jan 14, 2024
1a6fefb
Set optimizer level 3
resting-dove Jan 14, 2024
2803904
Hand vectorized the matmul operation
resting-dove Jan 15, 2024
ac73661
SumAll could not be lowered for single floats. Thus rely on daphne ke…
resting-dove Jan 15, 2024
0be2141
Only lower MatMul, when size fits to the vector size
resting-dove Jan 15, 2024
a2ac409
Get vec_size for Matmul from UserConfig.json
resting-dove Jan 15, 2024
c1528eb
Multiple options for MatMul Lowerings
resting-dove Jan 15, 2024
68e7718
Add Lowering options and corresponding ifs inside the match and rewri…
resting-dove Jan 19, 2024
f896955
Added Matmul command line options
resting-dove Jan 20, 2024
acba555
Vector size depending on bit width of matrix type
resting-dove Jan 24, 2024
1749e6b
distinguish between fixed and adaptable tile sizes
resting-dove Jan 24, 2024
ddc0fff
Add command line option for fixed tile sizes
resting-dove Jan 24, 2024
6161030
Some Pass always destroyed the affine loop nest
resting-dove Jan 24, 2024
0dcbc6c
Get cache sizes programmatically
resting-dove Jan 24, 2024
7dc051c
Added some unit tests under the cli directory
resting-dove Jan 25, 2024
8b8155b
Enable lower-mm in daphne-opt
resting-dove Jan 28, 2024
aa4177a
Lit tests for tiled matmul lowering
resting-dove Jan 29, 2024
8167f02
Unit tests for Vectorization
resting-dove Jan 29, 2024
928c76a
Unit tests for Matmul accuracy
resting-dove Jan 29, 2024
3250ce8
MatmulAccuracyTest no longer prints for every entry in output
resting-dove Feb 4, 2024
725e69b
Add separate unroll-jam-factor option
resting-dove Feb 4, 2024
62213ac
Recreated matmul tile tests with current tiling strategy
resting-dove Feb 4, 2024
3b67afd
Remove adaptations for building on my laptop
resting-dove Feb 4, 2024
b4f72aa
Formatted the MatMulLowering
resting-dove Feb 12, 2024
a9a7636
removed reference to x86vector dialect since no effect
resting-dove Feb 15, 2024
70e82e0
removed fma comment and prepared for reenabling int types
resting-dove Feb 15, 2024
8ab1b2b
remove unnecessary scope
resting-dove Feb 15, 2024
a689d53
affine valid loop interchange checking seems to not support loops wit…
resting-dove Feb 15, 2024
9dfb83c
Enable sum() for integer matrices
resting-dove Feb 19, 2024
b23f02c
Fill Memrefs with signed integers of various types
resting-dove Feb 19, 2024
e682dc4
SumAgg tests for integer types
resting-dove Feb 19, 2024
861581f
Enable Matmul lowering for integer valued matrices
resting-dove Feb 19, 2024
a4bde1b
Change from abstract register_size option to more clear num_vec_regis…
resting-dove Feb 19, 2024
b528810
non square matmul tests
resting-dove Feb 19, 2024
6dc4dfa
Update documentation
resting-dove Feb 19, 2024
78e9047
Unroll before loop promotion caused segfaults
resting-dove Feb 22, 2024
6259fef
Added tests for previously breaking sumAll and unroll
resting-dove Feb 23, 2024
8b6f33a
SumAll handles single value type
resting-dove Feb 23, 2024
1b9b935
Invert loops as fall back option
resting-dove Feb 29, 2024
487745b
[minor] warnings, remove code duplication
philipportner Apr 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions UserConfig.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
{
"matmul_vec_size_bits": 0,
"matmul_tile": false,
"matmul_use_fixed_tile_sizes": true,
"matmul_fixed_tile_sizes": [4, 4, 4, 4, 4],
"matmul_unroll_factor": 1,
"matmul_unroll_jam_factor": 4,
"matmul_num_vec_registers": 16,
"use_cuda": false,
"use_vectorized_exec": false,
"use_obj_ref_mgnt": true,
Expand Down
Empty file modified install-ubuntu-packages.sh
100644 → 100755
Empty file.
12 changes: 10 additions & 2 deletions src/api/cli/DaphneUserConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,22 @@ class DaphneLogger;
* Container to pass around user configuration
*/
struct DaphneUserConfig {
// Remember to update UserConfig.json accordingly!

// Remember to update UserConfig.json accordingly!
bool use_cuda = false;
bool use_vectorized_exec = false;
bool use_distributed = false;
bool use_obj_ref_mgnt = true;
bool use_ipa_const_propa = true;
bool use_phy_op_selection = true;
bool use_mlir_codegen = false;
int matmul_vec_size_bits = 0;
bool matmul_tile = false;
int matmul_unroll_factor = 1;
int matmul_unroll_jam_factor=4;
int matmul_num_vec_registers=16;
bool matmul_use_fixed_tile_sizes = false;
std::vector<unsigned> matmul_fixed_tile_sizes = {4, 4};
bool matmul_invert_loops = false;
bool use_mlir_hybrid_codegen = false;
bool cuda_fuse_any = false;
bool vectorized_single_queue = false;
Expand Down Expand Up @@ -74,6 +81,7 @@ struct DaphneUserConfig {
size_t max_distributed_serialization_chunk_size = std::numeric_limits<int>::max() - 1024; // 2GB (-1KB to make up for gRPC headers etc.) - which is the maximum size allowed by gRPC / MPI. TODO: Investigate what might be the optimal.
int numberOfThreads = -1;
int minimumTaskSize = 1;

// minimum considered log level (e.g., no logging below ERROR (essentially suppressing WARN, INFO, DEBUG and TRACE)
spdlog::level::level_enum log_level_limit = spdlog::level::err;
std::vector<LogConfig> loggers;
Expand Down
50 changes: 49 additions & 1 deletion src/api/internal/daphne_internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include "runtime/local/datastructures/IAllocationDescriptor.h"
#include <vector>
#ifdef USE_MPI
#include "runtime/distributed/worker/MPIWorker.h"
#endif
Expand All @@ -31,7 +32,6 @@
#include "mlir/ExecutionEngine/ExecutionEngine.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "llvm/Support/CommandLine.h"

#ifdef USE_CUDA
Expand Down Expand Up @@ -260,10 +260,46 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int
"libdir", cat(daphneOptions),
desc("The directory containing kernel libraries")
);

static opt<bool> mlirCodegen(
"mlir-codegen", cat(daphneOptions),
desc("Enables lowering of certain DaphneIR operations on DenseMatrix to low-level MLIR operations.")
);
static opt<int> matmul_vec_size_bits(
"matmul-vec-size-bits", cat(daphneOptions),
desc("Set the vector size to be used in the lowering of the MatMul operation if possible. Value of 0 is interpreted as off switch."),
init(0)
);
static opt<bool> matmul_tile(
"matmul-tile", cat(daphneOptions),
desc("Enables loop tiling in the lowering of the MatMul operation.")
);
static opt<int> matmul_unroll_factor(
"matmul-unroll-factor", cat(daphneOptions),
desc("Factor by which to unroll the finally resulting inner most loop in the lowered MatMul if tiling is used."),
init(1)
);
static opt<int> matmul_unroll_jam_factor(
"matmul-unroll-jam-factor", cat(daphneOptions),
desc("Factor by which to unroll jam the two inner most loop in the lowered MatMul if tiling is used."),
init(4)
);
static opt<int> matmul_num_vec_registers(
"matmul-num-vec-registers", cat(daphneOptions),
desc("Number of vector registers. Used during automatic tiling in lowering of MatMulOp"),
init(16)
);
static llvm::cl::list<unsigned> matmul_fixed_tile_sizes(
"matmul-fixed-tile-sizes", cat(daphneOptions),
desc("Set fixed tile sizes to be used for the lowering of MatMul if tiling is used. This also enables tiling."),
CommaSeparated
);
static opt<bool> matmul_invert_loops(
"matmul-invert-loops", cat(daphneOptions),
desc("Enable inverting of the inner two loops in the matrix multiplication as a fallback option, if tiling is not possible or deactivated.")
);


static opt<bool> performHybridCodegen(
"mlir-hybrid-codegen", cat(daphneOptions),
desc("Enables prototypical hybrid code generation combining pre-compiled kernels and MLIR code generation.")
Expand Down Expand Up @@ -382,6 +418,18 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int
user_config.use_ipa_const_propa = !noIPAConstPropa;
user_config.use_phy_op_selection = !noPhyOpSelection;
user_config.use_mlir_codegen = mlirCodegen;
user_config.matmul_vec_size_bits = matmul_vec_size_bits;
user_config.matmul_tile = matmul_tile;
user_config.matmul_unroll_factor = matmul_unroll_factor;
user_config.matmul_unroll_jam_factor = matmul_unroll_jam_factor;
user_config.matmul_num_vec_registers = matmul_num_vec_registers;
user_config.matmul_invert_loops = matmul_invert_loops;
if (matmul_fixed_tile_sizes.size() > 0) {
user_config.matmul_use_fixed_tile_sizes = true;
user_config.matmul_fixed_tile_sizes = matmul_fixed_tile_sizes;
// Specifying a fixed tile size will be interpreted as wanting to use tiling.
user_config.matmul_tile = true;
}
user_config.use_mlir_hybrid_codegen = performHybridCodegen;

if(!libDir.getValue().empty())
Expand Down
29 changes: 21 additions & 8 deletions src/compiler/execution/DaphneIrExecutor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <ir/daphneir/Daphne.h>
#include <ir/daphneir/Passes.h>
#include <ir/daphneir/Passes.h.inc>
#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
#include <mlir/Dialect/LLVMIR/Transforms/Passes.h>

Expand All @@ -32,6 +33,7 @@
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
Expand Down Expand Up @@ -298,25 +300,36 @@ void DaphneIrExecutor::buildCodegenPipeline(mlir::PassManager &pm) {
mlir::daphne::createPrintIRPass("IR before codegen pipeline"));

pm.addPass(mlir::daphne::createDaphneOptPass());

if (!userConfig_.use_mlir_hybrid_codegen) {
pm.addPass(mlir::daphne::createMatMulOpLoweringPass());
}

pm.addPass(mlir::daphne::createEwOpLoweringPass());
pm.addPass(mlir::daphne::createAggAllOpLoweringPass());
pm.addPass(mlir::daphne::createMapOpLoweringPass());
pm.addPass(mlir::createInlinerPass());

pm.addPass(mlir::daphne::createEwOpLoweringPass());
pm.addNestedPass<mlir::func::FuncOp>(mlir::createLoopFusionPass());

if (!userConfig_.use_mlir_hybrid_codegen) {
pm.addPass(mlir::daphne::createMatMulOpLoweringPass(
userConfig_.matmul_tile, userConfig_.matmul_vec_size_bits,
userConfig_.matmul_fixed_tile_sizes,
userConfig_.matmul_use_fixed_tile_sizes,
userConfig_.matmul_unroll_factor, userConfig_.matmul_unroll_jam_factor,
userConfig_.matmul_num_vec_registers,
userConfig_.matmul_invert_loops));
if (userConfig_.explain_mlir_codegen)
pm.addPass(
mlir::daphne::createPrintIRPass("IR directly after lowering MatMulOp."));
}

pm.addPass(mlir::createConvertMathToLLVMPass());
pm.addPass(mlir::daphne::createModOpLoweringPass());
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(mlir::createCSEPass());
pm.addNestedPass<mlir::func::FuncOp>(mlir::createLoopFusionPass());
pm.addNestedPass<mlir::func::FuncOp>(
mlir::createAffineScalarReplacementPass());
pm.addPass(mlir::createLowerAffinePass());

mlir::LowerVectorToLLVMOptions lowerVectorToLLVMOptions;
pm.addPass(mlir::createConvertVectorToLLVMPass(lowerVectorToLLVMOptions));

if (userConfig_.explain_mlir_codegen)
pm.addPass(
mlir::daphne::createPrintIRPass("IR after codegen pipeline"));
Expand Down
Loading
Loading