diff --git a/software/.gitignore b/software/.gitignore
index dce9d8683..35dccde4a 100644
--- a/software/.gitignore
+++ b/software/.gitignore
@@ -26,5 +26,5 @@ runtime/arch.ld
 
 # Generated data files
 data.h
-apps/*/*/data*.h
+data/data*.h
 data/__pyc*
diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile
index 14e02456e..c4a2a40a3 100644
--- a/software/apps/baremetal/Makefile
+++ b/software/apps/baremetal/Makefile
@@ -17,7 +17,6 @@ RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime)
 include $(RUNTIME_DIR)/runtime.mk
 
 APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c"))
-DATA := $(patsubst %.args,%.h,$(shell find $(APPS_DIR) -name "data.args"))
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
@@ -33,7 +32,7 @@ all_llvm: $(ALL_LLVM)
 $(APPS): % : $(BIN_DIR)/% $(APPS_DIR)/Makefile $(shell find $(RUNTIME_DIR)/**.{S,c,h,ld} -type f)
 
 .PHONY: $(BINARIES)
-$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) $(DATA) update_opcodes
+$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) data_%.h update_opcodes
 	mkdir -p $(dir $@)
 	$(RISCV_CC) -Iinclude -o $@ $< $(RUNTIME) $(RISCV_LDFLAGS) -T$(RUNTIME_DIR)/link.ld
 	$(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) -D $@ > $@.dump
@@ -49,5 +48,6 @@ clean:
 	rm -vf $(addsuffix /main.c.o,$(APPS))
 	rm -vf $(RUNTIME)
 	rm -vf $(LINKER_SCRIPT)
+	rm -vf $(wildcard $(DATA_DIR)/data_*.h)
 
 .INTERMEDIATE: $(addsuffix /main.c.o,$(APPS))
diff --git a/software/apps/baremetal/axpy_i32/main.c b/software/apps/baremetal/axpy_i32/main.c
index a9354796e..aa91733ea 100644
--- a/software/apps/baremetal/axpy_i32/main.c
+++ b/software/apps/baremetal/axpy_i32/main.c
@@ -5,125 +5,50 @@
 // Author: Yichao Zhang, ETH Zurich
 
 #include <stdint.h>
+#include <stdlib.h>
 #include <string.h>
 
-#include "baremetal/mempool_axpy_i32p.h"
+/* Mempool runtime libraries */
+#include "builtins_v2.h"
+#include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include <stdlib.h>
-
-#if NUM_CORES > 32
-#define size_M 64
-#define size_N 64
-#else
-#define size_M (NUM_CORES)
-#define size_N (NUM_CORES)
-#endif
-
-#define ALPHA 2
 
-#if NUM_CORES > 32
-int32_t data_x[size_M * size_N]
-    __attribute__((aligned(64 * 1024), section(".l1")));
-int32_t data_y[size_M * size_N]
-    __attribute__((aligned(64 * 1024), section(".l1")));
-int32_t data_y_copy[size_M * size_N]
-    __attribute__((aligned(64 * 1024), section(".l1")));
-#else
-int32_t data_x[size_M * size_N] __attribute__((aligned(32), section(".l1")));
-int32_t data_y[size_M * size_N] __attribute__((aligned(32), section(".l1")));
-int32_t data_y_copy[size_M * size_N]
-    __attribute__((aligned(32), section(".l1")));
-#endif
+#include "baremetal/mempool_axpy_i32p.h"
+#include "baremetal/mempool_checks.h"
+#include "data_axpy_i32.h"
 
+int32_t l1_X[array_N]
+    __attribute__((aligned(NUM_CORES * sizeof(uint32_t)), section(".l1")));
+int32_t l1_Y[array_N]
+    __attribute__((aligned(NUM_CORES * sizeof(uint32_t)), section(".l1")));
 int volatile error __attribute__((section(".l1")));
 
-void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
-                 uint32_t num_cores) {
-  // How many rows/columns to split the matrix into
-  uint32_t const split = 8;
-  if (num_columns > num_rows) {
-    // Parallelize over columns
-    uint32_t const c_start = (num_rows / split) * (core_id % split);
-    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
-    for (uint32_t j = (core_id / split); j < num_columns;
-         j += (num_cores / split)) {
-      for (uint32_t i = c_start; i < c_end; ++i) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  } else {
-    // Parallelize over rows
-    uint32_t const c_start = (num_columns / split) * (core_id % split);
-    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
-    for (uint32_t i = (core_id / split); i < num_rows;
-         i += (num_cores / split)) {
-      for (uint32_t j = c_start; j < c_end; ++j) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  }
-}
-
-int verify_axpy(int32_t *matrix_X, int32_t *matrix_Y, int32_t *matrix_Y_COPY,
-                int32_t alpha, uint32_t elements) {
-  for (uint32_t i = 0; i < elements; i++) {
-    if (matrix_Y[i] != matrix_X[i] * alpha + matrix_Y_COPY[i]) {
-      return 1;
-    }
-  }
-  return 0;
-}
-
 int main() {
 
   uint32_t const core_id = mempool_get_core_id();
   uint32_t const num_cores = mempool_get_core_count();
-  uint32_t const total_elements = size_M * size_N;
-
-  // Seed for create element matrix
-  int32_t const A_a = 1;
-  int32_t const A_b = 1;
-  int32_t const A_c = -32;
-  int32_t const B_a = 2;
-  int32_t const B_b = 1;
-  int32_t const B_c = 16;
-
-  // Initialize synchronization variables
   mempool_barrier_init(core_id);
+
+  // Initialize data
   if (core_id == 0) {
-    printf("Initialize %3d cores\n", num_cores);
+    dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t));
     error = 0;
   }
-
-  // init_elements;
-  init_matrix(data_x, size_M, size_N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(data_y, size_M, size_N, B_a, B_b, B_c, core_id, num_cores);
-  init_matrix(data_y_copy, size_M, size_N, B_a, B_b, B_c, core_id, num_cores);
   mempool_barrier(num_cores);
 
-  // start kernel testing
+  // Benchmark
   mempool_start_benchmark();
-  calc_axpy_unloop_x4_localbank(data_x, data_y, ALPHA, total_elements, core_id,
-                                num_cores);
+  calc_axpy_unloop_x4_localbank(l1_X, l1_Y, ALPHA, array_N, core_id, num_cores);
   mempool_barrier(num_cores);
   mempool_stop_benchmark();
-  // end kernel testing
 
   // Verify results
-  if (core_id == 0) {
-    printf("START CHECKING RESULTS\n");
-    if (verify_axpy(data_x, data_y, data_y_copy, ALPHA, total_elements)) {
-      printf("RESULTS ERROR\n");
-      error = 1;
-    } else {
-      printf("RESULTS CORRECT\n");
-    }
-  }
+  mempool_check_q32(l1_Y, l2_Z, array_N, 0, 0);
   mempool_barrier(num_cores);
 
-  return error;
+  return 0;
 }
diff --git a/software/apps/baremetal/cfft_radix2_q16/data.args b/software/apps/baremetal/cfft_radix2_q16/data.args
deleted file mode 100644
index a7d20d682..000000000
--- a/software/apps/baremetal/cfft_radix2_q16/data.args
+++ /dev/null
@@ -1 +0,0 @@
-LEN 64
diff --git a/software/apps/baremetal/cfft_radix2_q16/main.c b/software/apps/baremetal/cfft_radix2_q16/main.c
index 105cf6370..5a66c37d3 100644
--- a/software/apps/baremetal/cfft_radix2_q16/main.c
+++ b/software/apps/baremetal/cfft_radix2_q16/main.c
@@ -19,6 +19,7 @@
 #include "synchronization.h"
 
 #include "data_cfft_radix2_q16.h"
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /* CFFT mempool libraries */
 #include "baremetal/mempool_cfft_q16_bitreversal.h"
diff --git a/software/apps/baremetal/cfft_radix4_q16/data.args b/software/apps/baremetal/cfft_radix4_q16/data.args
deleted file mode 100644
index a7d20d682..000000000
--- a/software/apps/baremetal/cfft_radix4_q16/data.args
+++ /dev/null
@@ -1 +0,0 @@
-LEN 64
diff --git a/software/apps/baremetal/cfft_radix4_q16/main.c b/software/apps/baremetal/cfft_radix4_q16/main.c
index 88d7182fa..a4d9f887d 100644
--- a/software/apps/baremetal/cfft_radix4_q16/main.c
+++ b/software/apps/baremetal/cfft_radix4_q16/main.c
@@ -19,6 +19,8 @@
 
 /* CFFT data libraries */
 #include "data_cfft_radix4_q16.h"
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
 
 /* CHOOSE ONE */
 //#define SINGLE // Single core FFT.
diff --git a/software/apps/baremetal/chest_q16/data.args b/software/apps/baremetal/chest_q16/data.args
deleted file mode 100644
index d8d3acb68..000000000
--- a/software/apps/baremetal/chest_q16/data.args
+++ /dev/null
@@ -1,3 +0,0 @@
-N_TX 4
-N_RX 32
-N_SAMPLES 32
diff --git a/software/apps/baremetal/matmul_f16/main.c b/software/apps/baremetal/matmul_f16/main.c
index b3b474b1d..99a0269cc 100644
--- a/software/apps/baremetal/matmul_f16/main.c
+++ b/software/apps/baremetal/matmul_f16/main.c
@@ -34,8 +34,10 @@ int main() {
 
   // Initialize Matrices 1
   if (core_id == 0) {
-    dma_memcpy_blocking(matrix_a, A, (matrix_M * matrix_N) * sizeof(int16_t));
-    dma_memcpy_blocking(matrix_b, B, (matrix_N * matrix_P) * sizeof(int16_t));
+    dma_memcpy_blocking(matrix_a, l2_A,
+                        (matrix_M * matrix_N) * sizeof(int16_t));
+    dma_memcpy_blocking(matrix_b, l2_B,
+                        (matrix_N * matrix_P) * sizeof(int16_t));
   }
   mempool_barrier(num_cores);
 
@@ -59,7 +61,7 @@ int main() {
   mempool_stop_benchmark();
 #endif
 
-  mempool_check_f16(matrix_c, C, matrix_M * matrix_P, 0.5f, 0);
+  mempool_check_f16(matrix_c, l2_C, matrix_M * matrix_P, 0.5f, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/matmul_f32/main.c b/software/apps/baremetal/matmul_f32/main.c
index bc391200f..d3d7622db 100644
--- a/software/apps/baremetal/matmul_f32/main.c
+++ b/software/apps/baremetal/matmul_f32/main.c
@@ -30,13 +30,14 @@ int main() {
   uint32_t num_cores = mempool_get_core_count();
   mempool_barrier_init(core_id);
 
-  // Initialize Matrices
+  // Initialize data
   if (core_id == 0) {
-    dma_memcpy_blocking(matrix_a, A, matrix_M * matrix_N * sizeof(int32_t));
-    dma_memcpy_blocking(matrix_b, B, matrix_N * matrix_P * sizeof(int32_t));
+    dma_memcpy_blocking(matrix_a, l2_A, matrix_M * matrix_N * sizeof(int32_t));
+    dma_memcpy_blocking(matrix_b, l2_B, matrix_N * matrix_P * sizeof(int32_t));
   }
   mempool_barrier(num_cores);
 
+  // Benchmark
 #if defined(SINGLE)
   if (core_id == 0) {
     // Execute function to test.
@@ -57,7 +58,7 @@ int main() {
   mempool_stop_benchmark();
 #endif
 
-  mempool_check_f32(matrix_c, C, matrix_M * matrix_P, 0.01f, 0);
+  mempool_check_f32(matrix_c, l2_C, matrix_M * matrix_P, 0.01f, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/matmul_i32/main.c b/software/apps/baremetal/matmul_i32/main.c
index 65e2b82f1..94b250306 100644
--- a/software/apps/baremetal/matmul_i32/main.c
+++ b/software/apps/baremetal/matmul_i32/main.c
@@ -7,131 +7,46 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_matmul_i32p.h"
+#include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
 
-// Define Matrix dimensions:
-// C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define matrix_M 64
-#define matrix_N 32
-#define matrix_P 64
-
-int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
-int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
-int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_matmul_i32p.h"
+#include "data_matmul_i32.h"
 
-int volatile error __attribute__((section(".l1")));
+int32_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
+int32_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
+int32_t l1_C[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
 
-void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int32_t a, int32_t b, int32_t c, uint32_t core_id,
-                 uint32_t num_cores) {
-  uint32_t const split = 8; // How many rows/columns to split the matrix into
-  if (num_columns > num_rows) {
-    // Parallelize over columns
-    uint32_t const c_start = (num_rows / split) * (core_id % split);
-    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
-    for (uint32_t j = (core_id / split); j < num_columns;
-         j += (num_cores / split)) {
-      for (uint32_t i = c_start; i < c_end; ++i) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  } else {
-    // Parallelize over rows
-    uint32_t const c_start = (num_columns / split) * (core_id % split);
-    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
-    for (uint32_t i = (core_id / split); i < num_rows;
-         i += (num_cores / split)) {
-      for (uint32_t j = c_start; j < c_end; ++j) {
-        matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c;
-      }
-    }
-  }
-}
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id);
 
-// Initialize the matrices in parallel
-int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                  uint32_t inner_dim, int32_t aa, int32_t ab, int32_t ac,
-                  int32_t ba, int32_t bb, int32_t bc, uint32_t core_id,
-                  uint32_t num_cores) {
-  // Convert to signed
-  int32_t n = (int32_t)inner_dim;
-  // Parallelize over rows
-  for (uint32_t i = core_id; i < num_rows; i += num_cores) {
-    for (uint32_t j = 0; j < num_columns; ++j) {
-      int32_t ii = (int32_t)i;
-      int32_t jj = (int32_t)j;
-      int32_t lin =
-          (aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + ac * bc) * n;
-      int32_t qua =
-          ((aa * ba * ii + ab * bb * jj + ab * bc + ba * ac) * (n * (n - 1))) /
-          2;
-      int32_t cub = ((ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6;
-      int32_t golden = lin + qua + cub;
-      if (matrix[i * num_columns + j] != golden) {
-        return (i + j) == 0 ? -1 : (int)(i * num_columns + j);
-      }
-      matrix[i * num_columns + j] = 0;
-    }
+  // Initialize data
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, matrix_M * matrix_N * sizeof(int32_t));
+    dma_memcpy_blocking(l1_B, l2_B, matrix_N * matrix_P * sizeof(int32_t));
   }
-  return 0;
-}
-
-int test_matrix_multiplication(int32_t *__restrict__ A, int32_t *__restrict__ B,
-                               int32_t *__restrict__ C, uint32_t M, uint32_t N,
-                               uint32_t P, uint32_t core_id,
-                               uint32_t num_cores) {
-  int32_t const A_a = 1;
-  int32_t const A_b = 1;
-  int32_t const A_c = -32;
-  int32_t const B_a = 2;
-  int32_t const B_b = 1;
-  int32_t const B_c = 16;
-
-  // Initialize Matrices
-  init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores);
-  // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
-  // Execute function to test.
-  mempool_start_benchmark();
 
+  // Benchmark
+  mempool_start_benchmark();
 #ifdef __XPULPIMG
-  matmul_unrolled_2x2_parallel_i32_xpulpv2(A, B, C, M, N, P, core_id,
-                                           num_cores);
+  matmul_unrolled_2x2_parallel_i32_xpulpv2(l1_A, l1_B, l1_C, matrix_M, matrix_N,
+                                           matrix_P, core_id, num_cores);
 #else
-  matmul_unrolled_2x2_parallel_i32_rv32im(A, B, C, M, N, P, core_id, num_cores);
+  matmul_unrolled_2x2_parallel_i32_rv32im(l1_A, l1_B, l1_C, matrix_M, matrix_N,
+                                          matrix_P, core_id, num_cores);
 #endif
-
   mempool_stop_benchmark();
-  // Wait at barrier befor checking
   mempool_barrier(num_cores);
-  if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id,
-                    num_cores)) {
-    error = 1;
-    return -1;
-  }
-  return 0;
-}
 
-int main() {
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  // Initialize barrier and synchronize
-  mempool_barrier_init(core_id);
-
-  if (core_id == 0) {
-    error = 0;
-  }
-
-  // Test the Matrix multiplication
-  test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
-                             matrix_P, core_id, num_cores);
-  // wait until all cores have finished
+  // Verify results
+  mempool_check_q32(l1_C, l2_C, matrix_M * matrix_P, 0, 0);
   mempool_barrier(num_cores);
-
-  return error;
+  return 0;
 }
diff --git a/software/apps/baremetal/matmul_i8/main.c b/software/apps/baremetal/matmul_i8/main.c
index 4fb557f2c..63a24418e 100644
--- a/software/apps/baremetal/matmul_i8/main.c
+++ b/software/apps/baremetal/matmul_i8/main.c
@@ -7,137 +7,46 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_matmul_i8p.h"
+#include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
 
-// Define Matrix dimensions:
-// C = AB with A=[MxN], B=[NxP], C=[MxP]
-#define matrix_M 64
-#define matrix_N 64
-#define matrix_P 64
-
-int8_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
-int8_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
-int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_matmul_i8p.h"
+#include "data_matmul_i8.h"
 
-int volatile error __attribute__((section(".l1")));
+int8_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
+int8_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
+int32_t l1_C[matrix_M * matrix_P] __attribute__((section(".l1_prio")));
 
-void init_matrix(int8_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                 int8_t a, int8_t b, int8_t c, uint32_t core_id,
-                 uint32_t num_cores) {
-  uint32_t const split = 8; // How many rows/columns to split the matrix into
-  if (num_columns > num_rows) {
-    // Parallelize over columns
-    uint32_t const c_start = (num_rows / split) * (core_id % split);
-    uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1);
-    for (uint32_t j = (core_id / split); j < num_columns;
-         j += (num_cores / split)) {
-      for (uint32_t i = c_start; i < c_end; ++i) {
-        matrix[i * num_columns + j] =
-            (int8_t)(a * (int8_t)i + b * (int8_t)j + c);
-      }
-    }
-  } else {
-    // Parallelize over rows
-    uint32_t const c_start = (num_columns / split) * (core_id % split);
-    uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1);
-    for (uint32_t i = (core_id / split); i < num_rows;
-         i += (num_cores / split)) {
-      for (uint32_t j = c_start; j < c_end; ++j) {
-        matrix[i * num_columns + j] =
-            (int8_t)(a * (int8_t)i + b * (int8_t)j + c);
-      }
-    }
-  }
-}
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id);
 
-// Initialize the matrices in parallel
-int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
-                  uint32_t inner_dim, int8_t aa, int8_t ab, int8_t ac,
-                  int8_t ba, int8_t bb, int8_t bc, uint32_t core_id,
-                  uint32_t num_cores) {
-  // Convert to signed
-  int32_t n = (int32_t)inner_dim;
-  // Parallelize over rows
-  for (uint32_t i = core_id; i < num_rows; i += num_cores) {
-    for (uint32_t j = 0; j < num_columns; ++j) {
-      int32_t ii = (int32_t)i;
-      int32_t jj = (int32_t)j;
-      int32_t lin = ((int32_t)aa * bb * ii * jj + aa * bc * ii + ac * bb * jj +
-                     (int32_t)ac * bc) *
-                    n;
-      int32_t qua =
-          (((int32_t)aa * ba * ii + ab * bb * jj + ab * bc + (int32_t)ba * ac) *
-           (n * (n - 1))) /
-          2;
-      int32_t cub = (((int32_t)ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6;
-      int32_t golden = lin + qua + cub;
-      if (matrix[i * num_columns + j] != golden) {
-        return (i + j) == 0 ? -1 : (int)(i * num_columns + j);
-      }
-      matrix[i * num_columns + j] = 0;
-    }
+  // Initialize data
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, matrix_M * matrix_N * sizeof(int8_t));
+    dma_memcpy_blocking(l1_B, l2_B, matrix_N * matrix_P * sizeof(int8_t));
   }
-  return 0;
-}
-
-int test_matrix_multiplication(int8_t *__restrict__ A, int8_t *__restrict__ B,
-                               int32_t *__restrict__ C, uint32_t M, uint32_t N,
-                               uint32_t P, uint32_t core_id,
-                               uint32_t num_cores) {
-  int8_t const A_a = 1;
-  int8_t const A_b = 1;
-  int8_t const A_c = -40;
-  int8_t const B_a = 0;
-  int8_t const B_b = 1;
-  int8_t const B_c = 19;
-
-  // Initialize Matrices
-  init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores);
-  init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores);
-  // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
-  // Execute function to test.
-  mempool_start_benchmark();
 
+  // Benchmark
+  mempool_start_benchmark();
 #ifdef __XPULPIMG
-  matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2(A, B, C, M, N, P, core_id,
-                                                    num_cores);
-  // matmul_unrolled_2x4_parallel_i8_xpulpv2(A, B, C, M, N, P, core_id,
-  // num_cores);
+  matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2(
+      l1_A, l1_B, l1_C, matrix_M, matrix_N, matrix_P, core_id, num_cores);
 #else
-  matmul_unrolled_2x2_parallel_i8_rv32im(A, B, C, M, N, P, core_id, num_cores);
+  matmul_unrolled_2x2_parallel_i8_rv32im(l1_A, l1_B, l1_C, matrix_M, matrix_N,
+                                         matrix_P, core_id, num_cores);
 #endif
-
   mempool_stop_benchmark();
-  // Wait at barrier befor checking
   mempool_barrier(num_cores);
-  if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id,
-                    num_cores)) {
-    error = 1;
-    return -1;
-  }
-  return 0;
-}
 
-int main() {
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  // Initialize barrier and synchronize
-  mempool_barrier_init(core_id);
-
-  if (core_id == 0) {
-    error = 0;
-  }
-
-  // Test the Matrix multiplication
-  test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N,
-                             matrix_P, core_id, num_cores);
-  // wait until all cores have finished
+  // Verify results
+  mempool_check_q32(l1_C, l2_C, matrix_M * matrix_P, 0, 0);
   mempool_barrier(num_cores);
-
-  return error;
+  return 0;
 }
diff --git a/software/data/data_matmul_f16.h.tpl b/software/data/data_matmul_f16.h.tpl
deleted file mode 100644
index 96aa738a3..000000000
--- a/software/data/data_matmul_f16.h.tpl
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:.4f}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define matrix_M (${matrix_M})
-#define matrix_N (${matrix_N})
-#define matrix_P (${matrix_P})
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${matrix_M * matrix_N}] = ${array_to_cstr(A)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${matrix_N * matrix_P}] = ${array_to_cstr(B)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${matrix_M * matrix_P}] = ${array_to_cstr(C)};
diff --git a/software/data/data_matmul_f32.h.tpl b/software/data/data_matmul_f32.h.tpl
deleted file mode 100644
index 4e9e6a4d6..000000000
--- a/software/data/data_matmul_f32.h.tpl
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define matrix_M (${matrix_M})
-#define matrix_N (${matrix_N})
-#define matrix_P (${matrix_P})
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${matrix_M * matrix_N}] = ${array_to_cstr(A)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${matrix_N * matrix_P}] = ${array_to_cstr(B)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${matrix_M * matrix_P}] = ${array_to_cstr(C)};
diff --git a/software/data/data_matmulf16.py b/software/data/data_matmulf16.py
deleted file mode 100644
index 2c362208b..000000000
--- a/software/data/data_matmulf16.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 matmul.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_matmul_f16.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-m",
-        "--dim_m",
-        type=int,
-        required=False,
-        default=16,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-n",
-        "--dim_n",
-        type=int,
-        required=False,
-        default=16,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-p",
-        "--dim_p",
-        type=int,
-        required=False,
-        default=16,
-        help='Third dimension.'
-    )
-
-    args = parser.parse_args()
-
-    matrix_M = args.dim_m
-    matrix_N = args.dim_n
-    matrix_P = args.dim_p
-
-    # Create matrix
-    A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(np.float16)
-    B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(np.float16)
-    C = np.matmul(A, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float16)
-    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float16)
-    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float16)
-
-    kwargs = {
-        'name': 'data_matmul_f16',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_matmulf32.py b/software/data/data_matmulf32.py
deleted file mode 100644
index 15086d0fc..000000000
--- a/software/data/data_matmulf32.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp32 matmul.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_matmul_f32.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-
-    parser.add_argument(
-        "-m",
-        "--dim_m",
-        type=int,
-        required=False,
-        default=16,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-n",
-        "--dim_n",
-        type=int,
-        required=False,
-        default=16,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-p",
-        "--dim_p",
-        type=int,
-        required=False,
-        default=16,
-        help='Third dimension.'
-    )
-
-    args = parser.parse_args()
-
-    matrix_M = args.dim_m
-    matrix_N = args.dim_n
-    matrix_P = args.dim_p
-
-    # Create matrix
-    A = np.random.rand(matrix_M, matrix_N)
-    B = np.random.rand(matrix_N, matrix_P)
-    C = np.matmul(A, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float32)
-    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float32)
-    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float32)
-
-    kwargs = {
-        'name': 'data_matmul_f32',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/gendata_header.py b/software/data/gendata_header.py
new file mode 100644
index 000000000..a064ce63b
--- /dev/null
+++ b/software/data/gendata_header.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This script generates data.h files.
+# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
+
+import argparse
+import os
+import math
+import numpy as np
+import hjson
+import ast
+
+import gendatalib_cfft as cfft
+import gendatalib_chest as chest
+import gendatalib_blas as blas
+
+
+header = """\
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// File generated with data/gendata_header.py
+// Author: Marco Bertuletti\n\n
+"""
+
+
+def print_array(arr, typ, name):
+    """Format arr as a C array definition of element type typ named name."""
+    typ_i32b = ["int32_t", "uint32_t"]
+    typ_i16b = ["int16_t", "uint16_t"]
+    typ_i8b = ["int8_t", "uint8_t"]
+
+    output_string = typ
+    output_string += " __attribute__((aligned(sizeof(int32_t)), section(\".l2\"))) "
+    output_string += name + '[{}] = {{\n'.format(arr.size)
+    for (value, count) in zip(arr, range(arr.size)):
+        if typ in typ_i32b:
+            output_string += '({}) 0X{:08X}, '.format(typ, int(value) & 0xffffffff)
+        elif typ in typ_i16b:
+            output_string += '({}) 0X{:04X}, '.format(typ, int(value) & 0x0000ffff)
+        elif typ in typ_i8b:
+            output_string += '({}) 0X{:02X}, '.format(typ, int(value) & 0x000000ff)
+        elif typ == 'float':
+            output_string += '({}) {:+.8f}, '.format(typ, value)
+        elif typ == '__fp16':
+            output_string += '({}) {:+.4f}, '.format(typ, value)
+        else:
+            raise Exception("ERROR: Unsupported data type!!!")
+        if (count + 1) % 4 == 0:
+            output_string += '\n'
+    # Strip the trailing ", " (and newline) without eating the last value
+    output_string = output_string.rstrip().rstrip(',')
+    output_string += "};\n\n"
+    return output_string
+
+
+def print_file(header, defines, arrays, filename):
+    """
+    Writes defines and arrays to a file.
+
+    :param header: Header of the printed file
+    :param defines: Iterable of (define_name, define_value) pairs for #define directives.
+    :param arrays: Iterable of (array_values, array_type, array_name) tuples, one per array.
+    :param filename: The output file to write to.
+    """
+
+    # Initialize the output string
+    output_string = header
+
+    # Write the defines
+    for define_name, define_value in defines:
+        output_string += "#define {} ({})\n".format(define_name, define_value)
+    output_string += "\n"  # Add space between defines and arrays
+
+    # Write the arrays using print_array
+    for array_values, array_type, array_name in arrays:
+        output_string += print_array(array_values, array_type, array_name)
+
+    # Write everything to the file
+    with open(filename, "w") as file:
+        file.write(output_string)
+
+    print("Generate {}".format(filename))
+
+
+if __name__ == '__main__':
+    # Build data_<app_name>.h from the app's entry in the hjson parameter file
+    parser = argparse.ArgumentParser(
+        description='Generate data.h header files.')
+    parser.add_argument('--app_name', type=str, help='Name of the app')
+    parser.add_argument('--params', type=str, help='Path to the hjson parameter file')
+    # Parse the command-line arguments
+    args = parser.parse_args()
+    app_name = args.app_name
+    with open(args.params, 'r') as hjson_file:
+        config_data = hjson.load(hjson_file)
+    # Apps without an entry in the config fall through to the final else branch
+    data_args = config_data.get(app_name) or {}
+    my_type = data_args.get("type")
+    defnes = [ast.literal_eval(d) for d in data_args.get("defines", [])]
+    arrays = [ast.literal_eval(a) for a in data_args.get("arrays", [])]
+
+    # Determine output file name
+    filename = os.path.dirname(os.path.abspath(__file__))
+    filename = os.path.join(filename, "data_{}.h".format(app_name))
+
+    # Generate data header file
+    if app_name == "axpy_i32":
+
+        result = blas.generate_iaxpy(**{name: value for name, value in defnes})
+        arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
+
+    elif app_name == "cfft_radix4_q16":
+        # The fifth result item is a tolerance that is emitted as a define
+        result = cfft.generate_cfft_q16(
+            **{name: value for name, value in defnes})
+        N = defnes[0][1]
+        defnes += [
+            ("LOG2", int(math.log2(N))),
+            ("N_TWIDDLES", 3 * N // 4),
+            ("BITREVINDEXTABLE_LENGTH", len(result[3])),
+            ("TOLERANCE", result[4]),
+        ]
+        result = result[0:4]
+        arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
+
+    elif app_name == "cfft_radix2_q16":
+        # The fifth result item is a tolerance that is emitted as a define
+        result = cfft.generate_cfft_q16(
+            **{name: value for name, value in defnes})
+        N = defnes[0][1]
+        defnes += [
+            ("LOG2", int(math.log2(N))),
+            ("N_TWIDDLES", 3 * N // 4),
+            ("BITREVINDEXTABLE_LENGTH", len(result[3])),
+            ("TOLERANCE", result[4]),
+        ]
+        result = result[0:4]
+        arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
+
+    elif app_name == "chest_q16":
+
+        result = chest.generate_chest_q16(
+            **{name: value for name, value in defnes})
+        arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
+
+    elif app_name == "conv2d_i32":
+
+        result = blas.generate_iconv(
+            **{name: value for name, value in defnes}, my_type=my_type)
+        arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
+
+    elif app_name == "matmul_f16":
+
+        result = blas.generate_fmatmul(
+            **{name: value for name, value in defnes}, my_type=my_type)
+        arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
+
+    elif app_name == "matmul_f32":
+
+        result = blas.generate_fmatmul(
+            **{name: value for name, value in defnes}, my_type=my_type)
+        arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
+
+    elif app_name == "matmul_i32":
+
+        result = blas.generate_imatmul(
+            **{name: value for name, value in defnes}, my_type=my_type)
+        arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
+
+    elif app_name == "matmul_i8":
+
+        result = blas.generate_imatmul(
+            **{name: value for name, value in defnes}, my_type=my_type)
+        arrays = [(result[i], *arrays[i]) for i in range(len(arrays))]
+        print_file(header, defnes, arrays, filename)
+
+    else:
+        print("No need for data generation.")
diff --git a/software/data/gendata_params.hjson b/software/data/gendata_params.hjson
new file mode 100644
index 000000000..5953c9710
--- /dev/null
+++ b/software/data/gendata_params.hjson
@@ -0,0 +1,132 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// This file holds the parameters used to generate the data_*.h files.
+// Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
+
+{
+  "axpy_i32": {
+    "type": "int32",
+    "defines": [
+      ("ALPHA",      6)
+      ("array_N", 1024)
+    ]
+    "arrays": [
+      ("int32_t", "l2_X")
+      ("int32_t", "l2_Y")
+      ("int32_t", "l2_Z")
+    ]
+  },
+
+  "cfft_radix4_q16": {
+    "type": "int16",
+    "defines": [
+      ("N_CSAMPLES", 1024)
+    ]
+    "arrays": [
+      ("int16_t", "l2_pSrc")
+      ("int16_t", "l2_pRes")
+      ("int16_t", "l2_twiddleCoef_q16")
+      ("int16_t", "l2_BitRevIndexTable")
+    ]
+  },
+
+  "cfft_radix2_q16": {
+    "type": "int16",
+    "defines": [
+      ("N_CSAMPLES", 1024)
+    ]
+    "arrays": [
+      ("int16_t", "l2_pSrc")
+      ("int16_t", "l2_pRes")
+      ("int16_t", "l2_twiddleCoef_q16")
+      ("int16_t", "l2_BitRevIndexTable")
+    ]
+  },
+
+  "chest_q16": {
+    "type": "int32",
+    "defines": [
+      ("N_TX",        4)
+      ("N_RX",        4)
+      ("N_SAMPLES", 512)
+    ]
+    "arrays": [
+      ("int16_t", "l2_PilotTX")
+      ("int16_t", "l2_PilotRX")
+      ("int16_t", "l2_HEST")
+    ]
+  },
+
+  "conv2d_i32": {
+    "type": "int32",
+    "defines": [
+      ("matrix_M",   20)
+      ("matrix_N", 1024)
+      ("kernel_N",    3)
+    ]
+    "arrays": [
+      ("int32_t", "l2_X")
+      ("int32_t", "l2_K")
+      ("int32_t", "l2_Y")
+    ]
+  },
+
+  "matmul_f16": {
+    "type": "float16",
+    "defines": [
+      ("matrix_M", 32)
+      ("matrix_N", 32)
+      ("matrix_P", 32)
+    ]
+    "arrays": [
+      ("__fp16", "l2_A")
+      ("__fp16", "l2_B")
+      ("__fp16", "l2_C")
+    ]
+  },
+
+  "matmul_f32": {
+    "type": "float32",
+    "defines": [
+      ("matrix_M", 16)
+      ("matrix_N", 16)
+      ("matrix_P", 16)
+    ]
+    "arrays": [
+      ("float", "l2_A")
+      ("float", "l2_B")
+      ("float", "l2_C")
+    ]
+  }
+
+  "matmul_i32": {
+    "type": "int32",
+    "defines": [
+      ("matrix_M", 32)
+      ("matrix_N", 32)
+      ("matrix_P", 32)
+    ]
+    "arrays": [
+      ("int32_t", "l2_A")
+      ("int32_t", "l2_B")
+      ("int32_t", "l2_C")
+    ]
+  }
+
+  "matmul_i8": {
+    "type": "int8",
+    "defines": [
+      ("matrix_M", 64)
+      ("matrix_N", 64)
+      ("matrix_P", 64)
+    ]
+    "arrays": [
+      ("int8_t", "l2_A")
+      ("int8_t", "l2_B")
+      ("int32_t", "l2_C")
+    ]
+  }
+
+}
diff --git a/software/data/gendatalib_blas.py b/software/data/gendatalib_blas.py
new file mode 100644
index 000000000..0fd1cf780
--- /dev/null
+++ b/software/data/gendatalib_blas.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This library generates data for the matmul, axpy and conv2d kernels.
+# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
+
+import numpy as np
+from scipy import signal
+
+
+def generate_fmatmul(matrix_M=16, matrix_N=16,
+                     matrix_P=16, my_type=np.float32):
+    """Return flattened random operands A, B and their product C."""
+    # Draw both operands, shift them by -0.5 and cast before multiplying
+    lhs = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(my_type)
+    rhs = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(my_type)
+    prod = np.matmul(lhs, rhs)
+    # Flatten everything in row-major (C) order
+    A = lhs.reshape(matrix_M * matrix_N, order='C').astype(my_type)
+    B = rhs.reshape(matrix_N * matrix_P, order='C').astype(my_type)
+    C = prod.reshape(matrix_M * matrix_P, order='C').astype(my_type)
+
+    return A, B, C
+
+
+def generate_imatmul(matrix_M=16, matrix_N=16, matrix_P=16, my_type=np.int32):
+    """Return flattened random integer A (MxN), B (NxP) and C = A @ B."""
+    # Create matrix
+    MAX = 2**6
+    A = np.random.randint(-MAX, MAX - 1, size=(matrix_M, matrix_N))
+    B = np.random.randint(-MAX, MAX - 1, size=(matrix_N, matrix_P))
+    C = np.matmul(A, B)
+    # Operands take my_type; the product is always stored as int32
+    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type)
+    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type)
+    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.int32)
+
+    return A, B, C
+
+
+def generate_iaxpy(ALPHA=6, array_N=1024, my_type=np.int32):
+    """Return random X, Y and the reference Z = Y + ALPHA * X."""
+    # Inputs are drawn from [-bound, bound)
+    bound = 32
+    X = np.random.randint(-bound, bound, size=(array_N)).astype(my_type)
+    Y = np.random.randint(-bound, bound, size=(array_N)).astype(my_type)
+    Z = (Y + X * ALPHA).astype(my_type)
+
+    return X, Y, Z
+
+
+def generate_iconv(matrix_M=32, matrix_N=32, kernel_N=3, my_type=np.int32):
+    """Return flattened X (MxN), K (kernel_N^2) and Y = conv2d(X, K)."""
+    # Create matrix
+    MAX = 32
+    X = np.random.randint(-MAX, MAX, size=(matrix_M, matrix_N)).astype(my_type)
+    K = np.random.randint(-MAX, MAX, size=(kernel_N, kernel_N)).astype(my_type)
+    Y = signal.convolve2d(X, K, mode="same", boundary='fill')
+    # mode="same" keeps Y at the size of X, so X and Y flatten to M * N values
+    X = X.flatten().astype(my_type)
+    K = K.flatten().astype(my_type)
+    Y = Y.flatten().astype(my_type)
+
+    return X, K, Y
diff --git a/software/data/generate_cfft.py b/software/data/gendatalib_cfft.py
similarity index 80%
rename from software/data/generate_cfft.py
rename to software/data/gendatalib_cfft.py
index 584233f03..6916a532d 100644
--- a/software/data/generate_cfft.py
+++ b/software/data/gendatalib_cfft.py
@@ -11,13 +11,46 @@
 import math as M
 from sympy.combinatorics import Permutation
 
-__all__ = [
-    'generate_cfft_q16',
-    'generate_twiddleCoefq15',
-    'generate_bitreversal']
+
+def generate_twiddleCoefq15(N):
+    """Return 3N/4 interleaved (cos, sin) pairs scaled by 2**15 - 1 as int16."""
+    PI = 3.14159265358979
+    scale = 2**15 - 1
+    twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16)
+    for i in range(0, (int)(3 * N / 4)):
+        angle = i * 2 * PI / N
+        twiddleCoefq15[2 * i] = int(round(M.cos(angle) * scale))
+        twiddleCoefq15[2 * i + 1] = int(round(M.sin(angle) * scale))
+    return twiddleCoefq15
+
+
+def generate_bitreversal(N, R):
+    # Decompose N into bit widths: log2(R) per factor of R, plus any remainder
+    logR2 = []
+    idx = N
+    while (idx >= R):
+        logR2.append(int(M.log2(R)))
+        idx = idx // R
+    if (idx > 1):
+        logR2.append(int(M.log2(idx)))
+    # Reverse the order of these bit groups within every index 0..N-1
+    indexes = []
+    for x in range(N):
+        result = 0
+        for bits in logR2:
+            mask = (0xffffffff >> (32 - bits))
+            result = (result << bits) | (x & mask)
+            x = x >> bits
+        indexes.append(result)
+    # List each permutation cycle as swaps against its last element (indices * 8)
+    tps = []
+    for c in Permutation.from_sequence(indexes).cyclic_form:
+        for i in range(len(c) - 1):
+            tps.append([c[i] * 8, c[-1] * 8])
+    return np.ndarray.flatten(np.array(tps))
 
 
-def generate_cfft_q16(N):
+def generate_cfft_q16(N_CSAMPLES):
     # Q16:
     # len=16:    Q1.15 -> Q5.11
     # len=32:    Q1.15 -> Q6.10
@@ -28,8 +61,9 @@ def generate_cfft_q16(N):
     # len=1024:  Q1.15 -> Q11.5
     # len=2048:  Q1.15 -> Q12.4
     # len=4096:  Q1.15 -> Q13.3
-    src = (np.random.randint(-2**(15), 2**(15) - 1,
-           2 * N, dtype=np.int16)).astype(np.int16)
+    MAX = 2**(15)
+    src = (np.random.randint(-MAX, MAX - 1, 2 *
+           N_CSAMPLES, dtype=np.int16)).astype(np.int16)
     tolerance = {
         16: 16,
         32: 20,
@@ -51,54 +85,20 @@ def generate_cfft_q16(N):
         2048: 4,
         4096: 3}
     my_fixpoint = 15
-    dst = np.zeros(2 * N, dtype=np.int16)
-    complex_src = np.zeros(N, dtype=np.csingle)
-    complex_dst = np.zeros(N, dtype=np.csingle)
-    for i in range(N):
+    dst = np.zeros(2 * N_CSAMPLES, dtype=np.int16)
+    complex_src = np.zeros(N_CSAMPLES, dtype=np.csingle)
+    complex_dst = np.zeros(N_CSAMPLES, dtype=np.csingle)
+    for i in range(N_CSAMPLES):
         shift = 2**(my_fixpoint)
         complex_src[i] = (src[2 * i].astype(np.csingle) / shift) + \
             1j * (src[2 * i + 1].astype(np.csingle) / shift)
     complex_dst = np.fft.fft(complex_src)
-    for i in range(N):
-        shift = 2**(bit_shift_dict_q16[N])
+    for i in range(N_CSAMPLES):
+        shift = 2**(bit_shift_dict_q16[N_CSAMPLES])
         dst[2 * i] = (np.real(complex_dst[i]) * shift).astype(np.int16)
         dst[2 * i + 1] = (np.imag(complex_dst[i]) * shift).astype(np.int16)
-    return src, dst, tolerance[N]
 
+    twiddles = generate_twiddleCoefq15(N_CSAMPLES)
+    bitrever = generate_bitreversal(N_CSAMPLES, 2)
 
-def generate_twiddleCoefq15(N):
-    PI = 3.14159265358979
-    twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16)
-    for i in range(0, (int)(3 * N / 4)):
-        twiddleCoefq15_cos = M.cos(i * 2 * PI / N)
-        twiddleCoefq15_sin = M.sin(i * 2 * PI / N)
-        twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1)))
-        twiddleCoefq15[2 * i +
-                       1] = int(round(twiddleCoefq15_sin * (2**15 - 1)))
-    return twiddleCoefq15
-
-
-def generate_bitreversal(N, R):
-    # Decompose
-    logR2 = []
-    idx = N
-    while (idx >= R):
-        logR2.append(int(M.log2(R)))
-        idx = idx // R
-    if (idx > 1):
-        logR2.append(int(M.log2(idx)))
-    # Bitreversal
-    indexes = []
-    for x in range(N):
-        result = 0
-        for bits in logR2:
-            mask = (0xffffffff >> (32 - bits))
-            result = (result << bits) | (x & mask)
-            x = x >> bits
-        indexes.append(result)
-    # Create transpositions table
-    tps = []
-    for c in Permutation.from_sequence(indexes).cyclic_form:
-        for i in range(len(c) - 1):
-            tps.append([c[i] * 8, c[-1] * 8])
-    return np.ndarray.flatten(np.array(tps))
+    return src, dst, twiddles, bitrever, tolerance[N_CSAMPLES]
diff --git a/software/data/generate_chest.py b/software/data/gendatalib_chest.py
similarity index 76%
rename from software/data/generate_chest.py
rename to software/data/gendatalib_chest.py
index d95b9748e..ae197723b 100755
--- a/software/data/generate_chest.py
+++ b/software/data/gendatalib_chest.py
@@ -10,9 +10,6 @@
 import numpy as np
 
 
-__all__ = ['generate_chest_q16']
-
-
 def q_sat(x):
     if x > 2**15 - 1:
         return x - 2**16
@@ -48,19 +45,19 @@ def compute_chest_q16(in_rx, in_tx, p):
     return result
 
 
-def generate_chest_q16(nb_tx, nb_rx, nb_samples):
+def generate_chest_q16(N_TX, N_RX, N_SAMPLES):
     FIXED_POINT = 8
     MAX = 2**7
 
     qvector_pilot_tx = []
     qvector_pilot_rx = []
     qvector_Hest = []
-    for k in range(nb_samples):
+    for k in range(N_SAMPLES):
         # Create pilots
-        pilot_rx = np.random.randint(-MAX, MAX - 1, size=nb_rx) + 1j * \
-            np.random.randint(-MAX, MAX - 1, size=nb_rx)
-        pilot_tx = np.random.randint(-MAX, MAX - 1, size=nb_tx) + 1j * \
-            np.random.randint(-MAX, MAX - 1, size=nb_tx)
+        pilot_rx = np.random.randint(-MAX, MAX - 1, size=N_RX) + 1j * \
+            np.random.randint(-MAX, MAX - 1, size=N_RX)
+        pilot_tx = np.random.randint(-MAX, MAX - 1, size=N_TX) + 1j * \
+            np.random.randint(-MAX, MAX - 1, size=N_TX)
         # Compute Hest
         Hest = compute_chest_q16(pilot_rx, pilot_tx, FIXED_POINT)
 
@@ -74,7 +71,7 @@ def generate_chest_q16(nb_tx, nb_rx, nb_samples):
         qvector_pilot_rx.append(pilot_rx)
         qvector_Hest.append(Hest)
 
-    qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * nb_tx * nb_samples])
-    qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * nb_rx * nb_samples])
-    qvector_Hest = np.reshape(qvector_Hest, [2 * nb_tx * nb_rx * nb_samples])
+    qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * N_TX * N_SAMPLES])
+    qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * N_RX * N_SAMPLES])
+    qvector_Hest = np.reshape(qvector_Hest, [2 * N_TX * N_RX * N_SAMPLES])
     return qvector_pilot_tx, qvector_pilot_rx, qvector_Hest
diff --git a/software/data/print_header.py b/software/data/print_header.py
deleted file mode 100644
index 8d2575d33..000000000
--- a/software/data/print_header.py
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the Channel estimation.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import argparse
-import os
-import math
-import generate_cfft as cfft
-import generate_chest as chest
-
-
-def extract_data_args(filename):
-    # Define a dictionary to store numerical values for each flag
-    args = {}
-
-    # Open the file for reading
-    with open(filename, 'r') as file:
-        # Iterate through each line in the file
-        for line in file:
-            # Split the line into words
-            words = line.split()
-            # Iterate through each word in the line
-            for i in range(len(words)):
-                flag = words[i]  # Get the flag name
-                # Check if the next word exists and is a numerical value
-                if i + 1 < len(words) and words[i + 1].isdigit():
-                    # Convert the numerical value to an integer
-                    numerical_value = int(words[i + 1])
-                    # Store the numerical value in the structure
-                    args[flag] = numerical_value
-
-    # Return the structure containing numerical values for each flag
-    return args
-
-
-class dot_dict:
-    def __init__(self, data):
-        self.data = data
-
-    def __getattr__(self, attr):
-        if attr in self.data:
-            return self.data[attr]
-        else:
-            raise AttributeError(f"Object has no attribute '{attr}'")
-
-
-def print_array(arr, typ, name, str):
-    count = 0
-    output_string = typ
-    output_string += " __attribute__((aligned(sizeof(int32_t)), \
-                       section(\".l2\"))) "
-    output_string += name + '[{}] = {{\n'.format(arr.size)
-    for value in arr:
-        output_string += '(int16_t) 0X{:04X}, '.format(value & 0xffff)
-        count += 1
-        if count % 8 == 0:
-            output_string += '\n'
-    output_string = output_string[:-3]
-    output_string += "};\n"
-    return output_string
-
-
-def print_file(string, filename):
-    with open(filename, "w") as file:
-        # Write the string to the file
-        file.write(string + '\n')
-    return file
-
-
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser(
-        description='Generate data.h header files.')
-    parser.add_argument('--params', type=str, help='Name of the app')
-    # Parse the command-line arguments
-    args = parser.parse_args()
-    params = args.params
-    # Read arguments from data.args file
-    data_args = extract_data_args(params)
-    (app_path, _) = os.path.split(params)
-    (_, app_name) = os.path.split(app_path)
-
-    if data_args != {}:
-        string = "// Copyright 2022 ETH Zurich and University of \
-                 Bologna.\n // Licensed under the Apache License, \
-                 Version 2.0, see LICENSE for details.\n \
-                 // SPDX-License-Identifier: Apache-2.0\n\n \
-                 // File generated with .data/print_header.py\n"
-
-        data_args = dot_dict(data_args)  # Access args with .notation
-
-        if app_name == "cfft_radix4_q16":
-            # cfft_radix4_q16
-            src_cfft_q16, dst_cfft_q16, tolerance_q16 = cfft.generate_cfft_q16(
-                data_args.LEN)
-            brv_cfft_q16 = cfft.generate_bitreversal(data_args.LEN, 2)
-            twi_cfft_q16 = cfft.generate_twiddleCoefq15(data_args.LEN)
-            string += "#define LOG2 ({})\n".format(
-                int(math.log2(data_args.LEN)))
-            string += "#define N_CSAMPLES ({})\n".format(data_args.LEN)
-            string += "#define N_TWIDDLES ({})\n".format(3 *
-                                                         data_args.LEN // 4)
-            string += "#define BITREVINDEXTABLE_LENGTH ({})\n".format(
-                len(brv_cfft_q16))
-            string += "#define TOLERANCE ({})\n".format(tolerance_q16)
-            string += "#define N_BANKS (NUM_CORES * BANKING_FACTOR)\n"
-            string += "#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))\n"
-            string += print_array(src_cfft_q16, "int16_t", "l2_pSrc", string)
-            string += print_array(dst_cfft_q16, "int16_t", "l2_pRes", string)
-            string += print_array(twi_cfft_q16, "int16_t",
-                                  "l2_twiddleCoef_q16", string)
-            string += print_array(brv_cfft_q16, "int16_t",
-                                  "l2_BitRevIndexTable", string)
-            filename = app_path + "/data_cfft_radix4_q16.h"
-
-        elif app_name == "cfft_radix2_q16":
-            # cfft_radix2_q16
-            src_cfft_q16, dst_cfft_q16, tolerance_q16 = cfft.generate_cfft_q16(
-                data_args.LEN)
-            brv_cfft_q16 = cfft.generate_bitreversal(data_args.LEN, 2)
-            twi_cfft_q16 = cfft.generate_twiddleCoefq15(data_args.LEN)
-            string += "#define LOG2 ({})\n".format(
-                int(math.log2(data_args.LEN)))
-            string += "#define N_CSAMPLES ({})\n".format(data_args.LEN)
-            string += "#define N_TWIDDLES ({})\n".format(3 *
-                                                         data_args.LEN // 4)
-            string += "#define BITREVINDEXTABLE_LENGTH ({})\n".format(
-                len(brv_cfft_q16))
-            string += "#define TOLERANCE ({})\n".format(tolerance_q16)
-            string += "#define N_BANKS (NUM_CORES * BANKING_FACTOR)\n"
-            string += print_array(src_cfft_q16, "int16_t", "l2_pSrc", string)
-            string += print_array(dst_cfft_q16, "int16_t", "l2_pRes", string)
-            string += print_array(twi_cfft_q16, "int16_t",
-                                  "l2_twiddleCoef_q16", string)
-            string += print_array(brv_cfft_q16, "int16_t",
-                                  "l2_BitRevIndexTable", string)
-            filename = app_path + "/data_cfft_radix2_q16.h"
-
-        elif app_name == "chest_q16":
-            src1_chest_q16, src2_chest_q16, dst_chest_q16 = \
-                chest.generate_chest_q16(data_args.N_TX, data_args.N_RX,
-                                         data_args.N_SAMPLES)
-            string += "#define N_TX ({})\n".format(data_args.N_TX)
-            string += "#define N_RX ({})\n".format(data_args.N_RX)
-            string += "#define N_SAMPLES ({})\n".format(data_args.N_SAMPLES)
-            string += print_array(src1_chest_q16,
-                                  "int16_t", "l2_PilotTX", string)
-            string += print_array(src2_chest_q16,
-                                  "int16_t", "l2_PilotRX", string)
-            string += print_array(dst_chest_q16, "int16_t", "l2_HEST", string)
-            filename = app_path + "/data_chest_q16.h"
-
-        else:
-            raise Exception("ERROR: no app with such name!!!")
-
-        print_file(string, filename)
diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk
index 82268ba07..52d86c6d1 100644
--- a/software/runtime/runtime.mk
+++ b/software/runtime/runtime.mk
@@ -172,8 +172,8 @@ OMP_RUNTIME := $(addsuffix .o,$(shell find $(OMP_DIR) -name "*.c"))
 %.ld: %.ld.c
 	$(RISCV_CC) -P -E $(DEFINES) $< -o $@
 
-%.h: %.args
-	$(python) $(MEMPOOL_DIR)/software/data/print_header.py --params $<
+data_%.h: $(DATA_DIR)/gendata_params.hjson
+	$(python) $(DATA_DIR)/gendata_header.py --app_name $* --params $<
 
 # Bootrom
 %.elf: %.S $(ROOT_DIR)/bootrom.ld $(LINKER_SCRIPT)