diff --git a/.gitignore b/.gitignore
index ea8c4bf..1b86eab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /target
+*.csv
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index f424226..f9cf57f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,29 +9,30 @@ edition = "2021"
 codegen-units = 1
 lto = "fat"
 panic = "abort"
+debug = true
 
 # FEATURES 
 
 [features]
 threads = ["dep:atomic", "dep:num_cpus"]
-rayon =   ["dep:atomic", "dep:num_cpus", "dep:rayon"]
-gpu =     ["dep:atomic"]
+rayon = ["dep:atomic", "dep:num_cpus", "dep:rayon"]
+gpu = ["dep:atomic"]
 
 # DEPENDENCIES
 
 [dependencies]
-cxx      = "*"
-cfg-if   = "*"
-rayon    = {version = "*", optional=true}
-atomic   = {version = "0.5.3", optional=true}
-num_cpus = {version = "*", optional=true}
+cxx = "*"
+cfg-if = "*"
+rayon = { version = "*", optional = true }
+atomic = { version = "0.5.3", optional = true }
+num_cpus = { version = "*", optional = true }
 #bytemuck = {version = "*", optional=true} # needed for atomic >= 0.6.0
+rand = { version = "*", features = ["small_rng", "alloc"] }
 
 [dev-dependencies]
 criterion = { version = "*", features = ["html_reports"] }
-rand      = { version = "*", features = ["small_rng", "alloc"] }
-atomic   = {version = "0.5.3"}
-rayon    = {version = "*"}
+atomic = { version = "0.5.3" }
+rayon = { version = "*" }
 
 [build-dependencies]
 cxx-build = "*"
@@ -81,4 +82,4 @@ harness = false
 
 [[bench]]
 name = "hardcoded_gemm"
-harness = false
\ No newline at end of file
+harness = false
diff --git a/benches/blas-speedup-kokkos/gemv.cpp b/benches/blas-speedup-kokkos/gemv.cpp
index 18c4113..1952d41 100644
--- a/benches/blas-speedup-kokkos/gemv.cpp
+++ b/benches/blas-speedup-kokkos/gemv.cpp
@@ -14,7 +14,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#define DATA_SIZE 15
+#define DATA_SIZE 12
 #define N_REPEAT 100
 
 
diff --git a/benches/layout/size.rs b/benches/layout/size.rs
index 5241165..979f9fa 100644
--- a/benches/layout/size.rs
+++ b/benches/layout/size.rs
@@ -13,14 +13,16 @@ use rand::{
     SeedableRng,
 };
 
+type FloatType = f64;
+
 // GEMM - usual case layout
 fn f1(
     length: usize,
-    aa_init: Vec<f64>,
-    bb_init: Vec<f64>,
-    cc_init: Vec<f64>,
-    alpha: f64,
-    beta: f64,
+    aa_init: Vec<FloatType>,
+    bb_init: Vec<FloatType>,
+    cc_init: Vec<FloatType>,
+    alpha: FloatType,
+    beta: FloatType,
 ) {
     // best case layout:
     // iterate on lines -> line-major layout   (Right)
@@ -45,8 +47,8 @@ fn f1(
             // cols
             for j in 0..length {
                 // all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft
-                let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
-                let val: f64 = alpha * ab_ij + beta * cc.get([i, j]);
+                let ab_ij: FloatType = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
+                let val: FloatType = alpha * ab_ij + beta * cc.get([i, j]);
                 cc.set([i, j], val);
             }
         }
@@ -60,11 +62,11 @@ fn f1(
 // GEMM - best case layout
 fn f2(
     length: usize,
-    aa_init: Vec<f64>,
-    bb_init: Vec<f64>,
-    cc_init: Vec<f64>,
-    alpha: f64,
-    beta: f64,
+    aa_init: Vec<FloatType>,
+    bb_init: Vec<FloatType>,
+    cc_init: Vec<FloatType>,
+    alpha: FloatType,
+    beta: FloatType,
 ) {
     let mut aa = ViewOwned::new_from_data(aa_init, Layout::Right, [length, length]);
     let mut bb = ViewOwned::new_from_data(bb_init, Layout::Left, [length, length]);
@@ -86,8 +88,8 @@ fn f2(
             // cols
             for j in 0..length {
                 // all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft
-                let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
-                let val: f64 = alpha * ab_ij + beta * cc.get([i, j]);
+                let ab_ij: FloatType = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
+                let val: FloatType = alpha * ab_ij + beta * cc.get([i, j]);
                 cc.set([i, j], val);
             }
         }
@@ -104,18 +106,20 @@ pub fn criterion_benchmark(c: &mut Criterion) {
         let length = 2_usize.pow(data_size);
         let seed: u64 = 9817498146784;
         let mut rng = SmallRng::seed_from_u64(seed);
-        let range: Uniform<f64> = rand::distributions::Uniform::new(0.0, 100.0);
-        let aa_init: Vec<f64> = (0..length * length)
+        let range: Uniform<FloatType> = rand::distributions::Uniform::new(0.0, 100.0);
+        let aa_init: Vec<FloatType> = (0..length * length)
             .map(|_| range.sample(&mut rng))
             .collect();
-        let bb_init: Vec<f64> = (0..length * length)
+        let bb_init: Vec<FloatType> = (0..length * length)
             .map(|_| range.sample(&mut rng))
             .collect();
-        let cc_init: Vec<f64> = (0..length * length)
+        let cc_init: Vec<FloatType> = (0..length * length)
             .map(|_| range.sample(&mut rng))
             .collect();
-        let alpha: f64 = range.sample(&mut rng);
-        let beta: f64 = range.sample(&mut rng);
+        let alpha: FloatType = range.sample(&mut rng);
+        let beta: FloatType = range.sample(&mut rng);
-        // f64 uses 8 bytes
-        group.throughput(Throughput::Bytes((8 * length).pow(2) as u64));
+        // element size is derived from FloatType so the alias is the only place to edit
+        // NOTE(review): pow(2) squares the element size as well as length -- confirm intent
+        let float_size = std::mem::size_of::<FloatType>();
+        group.throughput(Throughput::Bytes((float_size * length).pow(2) as u64));
         group.bench_with_input(
diff --git a/scripts/cache-miss-rates.py b/scripts/cache-miss-rates.py
new file mode 100755
index 0000000..78993e8
--- /dev/null
+++ b/scripts/cache-miss-rates.py
@@ -0,0 +1,49 @@
+# The script expects a single csv file, containing 3 lines:
+# - 1st line: data size (used as the X coordinate)
+# - 2nd line: cache miss-rates using the usual (i.e. naive) layout
+# - 3rd line: cache miss-rates using the ideal layout
+
+import sys
+import csv
+import matplotlib.pyplot as plt
+
+def main():
+    """Plot the cache miss-rates of both GEMM layouts against matrix dimension.
+
+    Reads the csv file named by the first command-line argument (3 lines:
+    sizes, usual-layout rates, best-layout rates) and saves a scatter plot
+    of the raw rates to ``cache-miss-rates.svg`` in the working directory.
+
+    Exits with an error message if the argument or a csv line is missing.
+    """
+    # fail with a usage message instead of a bare IndexError
+    if len(sys.argv) < 2:
+        sys.exit(f"usage: {sys.argv[0]} <csv-file>")
+
+    # read input
+    with open(sys.argv[1], newline='') as csvfile:
+        rows = list(csv.reader(csvfile))
+    if len(rows) < 3:
+        sys.exit("expected 3 csv lines: sizes, usual-layout rates, best-layout rates")
+
+    # parse values
+    sizes = [int(size) for size in rows[0]]
+    usualLayoutRates = [float(rate) for rate in rows[1]]
+    bestLayoutRates = [float(rate) for rate in rows[2]]
+
+    # plot: only the raw rates are drawn, so no relative change is derived
+    # (a best-layout rate of 0 would otherwise be a division by zero)
+    plt.title("GEMM: L1 Cache Miss-Rate Evolution = f(Data Size)")
+    plt.xlabel("Square Matrix Dimension (# of rows/cols)")
+    plt.ylabel("Miss-Rate (%)")
+
+    plt.semilogx(base=2.0)
+    plt.grid(visible=True, axis='y')
+    plt.scatter(sizes, usualLayoutRates, marker='+', color='r', label="usual-layout")
+    plt.scatter(sizes, bestLayoutRates, marker='x', color='b', label="best-layout")
+    plt.legend()
+    plt.savefig(fname="cache-miss-rates.svg", format="svg")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/cache-sizes.py b/scripts/cache-sizes.py
new file mode 100755
index 0000000..08a0f10
--- /dev/null
+++ b/scripts/cache-sizes.py
@@ -0,0 +1,57 @@
+# This script is used to generate a speedup graph from the output
+# of the layout-size benchmark (criterion group gemm-sizes).
+# Additionally, it places vertical bars at the matrix dimensions
+# where the matrix size exceeds a cache-size threshold (L1, L2, L3).
+#
+# The script expects a single csv file, containing 3 lines:
+# - 1st line: data size (used as the X coordinate)
+# - 2nd line: execution times using the usual (i.e. naive) layout
+# - 3rd line: execution times using ideal layout
+
+import sys
+import csv
+import matplotlib.pyplot as plt
+
+def main():
+    """Plot the layout speed gain against matrix dimension, with cache-size bars.
+
+    Reads the 3-line csv file given as first argument; writes gemm-sizes-plot.svg.
+    """
+    if len(sys.argv) < 2:
+        sys.exit(f"usage: {sys.argv[0]} <csv-file>")
+
+    # read input
+    with open(sys.argv[1], newline='') as csvfile:
+        rows = list(csv.reader(csvfile))
+    if len(rows) < 3:
+        sys.exit("expected 3 csv lines: sizes, usual-layout times, best-layout times")
+
+    # parse values; matrix size = dim1 * dim2 * sizeof(double)
+    sizes = [int(size) for size in rows[0]]
+    usualLayoutTimes = [float(time) for time in rows[1]]
+    bestLayoutTimes = [float(time) for time in rows[2]]
+
+    # compute relative change
+    # NOTE(review): percentLonger is a ratio, not a percentage -- confirm the 100.0 term
+    percentsSlower = []
+    for i in range(len(sizes)):
+        percentLonger = (usualLayoutTimes[i] - bestLayoutTimes[i]) / bestLayoutTimes[i]
+        percentsSlower.append(- 100*100 * percentLonger / (100.0 + percentLonger))
+
+    # plot; NOTE(review): bars presumably sit at sqrt(cache bytes / sizeof(double)) -- confirm
+    plt.title("GEMM: Speed Gain = f(Data Size)")
+    plt.xlabel("Square Matrix Dimension (# of rows/cols)")
+    plt.ylabel("Gain (%)")
+    plt.ylim([-175, 10])
+    plt.semilogx(base=2.0)
+    plt.axvline(x=64*6**0.5,  label="Exceed L1 Total Size", color='r', ymax=0.95, ymin=0.05)
+    plt.axvline(x=512*3**0.5, label="Exceed L2 Total Size", color='g', ymax=0.95, ymin=0.05)
+    plt.axvline(x=2048,       label="Exceed L3 Total Size", color='b', ymax=0.95, ymin=0.15)
+    plt.legend(loc="center left")
+    plt.grid(visible=True, axis='y')
+    plt.scatter(sizes, percentsSlower, marker='+', color='r')
+    plt.savefig(fname="gemm-sizes-plot.svg", format="svg")  # NOTE(review): same name as gemm-sizes.py output -- intended?
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/gemm-sizes.py b/scripts/gemm-sizes.py
new file mode 100755
index 0000000..b47b1eb
--- /dev/null
+++ b/scripts/gemm-sizes.py
@@ -0,0 +1,51 @@
+# This script is used to generate a speedup graph from the output
+# of the layout-size benchmark (criterion group gemm-sizes).
+#
+# The script expects a single csv file, containing 3 lines:
+# - 1st line: data size (used as the X coordinate)
+# - 2nd line: execution times using the usual (i.e. naive) layout
+# - 3rd line: execution times using ideal layout
+
+import sys
+import csv
+import matplotlib.pyplot as plt
+
+def main():
+    """Plot the layout speed gain against matrix dimension.
+
+    Reads the 3-line csv file given as first argument; writes gemm-sizes-plot.svg.
+    """
+    if len(sys.argv) < 2:
+        sys.exit(f"usage: {sys.argv[0]} <csv-file>")
+
+    # read input
+    with open(sys.argv[1], newline='') as csvfile:
+        rows = list(csv.reader(csvfile))
+    if len(rows) < 3:
+        sys.exit("expected 3 csv lines: sizes, usual-layout times, best-layout times")
+
+    # parse values
+    sizes = [int(size) for size in rows[0]]
+    usualLayoutTimes = [float(time) for time in rows[1]]
+    bestLayoutTimes = [float(time) for time in rows[2]]
+
+    # compute relative change
+    # NOTE(review): percentLonger is a ratio, not a percentage -- confirm the 100.0 term
+    percentsSlower = []
+    for i in range(len(sizes)):
+        percentLonger = (usualLayoutTimes[i] - bestLayoutTimes[i]) / bestLayoutTimes[i]
+        percentsSlower.append(- 100*100 * percentLonger / (100.0 + percentLonger))
+
+    # plot
+    plt.title("GEMM: Speed Gain = f(Data Size)")
+    plt.xlabel("Square Matrix Dimension (# of rows/cols)")
+    plt.ylabel("Gain (%)")
+    plt.ylim([-175, 10])
+    plt.semilogx(base=2.0)
+    plt.grid(visible=True, axis='y')
+    plt.scatter(sizes, percentsSlower, marker='+', color='r')
+    plt.savefig(fname="gemm-sizes-plot.svg", format="svg")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/main.rs b/src/main.rs
index 00aaf8a..940d1c2 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,7 +1,68 @@
-use poc_kokkos_rs::ffi;
+//use poc_kokkos_rs::ffi;
+
+use std::hint::black_box;
+
+use poc_kokkos_rs::{
+    functor::KernelArgs,
+    routines::{
+        parallel_for,
+        parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule},
+    },
+    view::{parameters::Layout, ViewOwned},
+};
+use rand::{distributions::Uniform, prelude::*, rngs::SmallRng, SeedableRng};
 
 fn main() {
-    ffi::say_hello();
-    println!("Hello from Rust!");
-    ffi::say_many_hello()
+    // Runs a single GEMM (C = alpha * A * B + beta * C) on square matrices of
+    // dimension 2^DATA_SIZE filled with seeded, hence reproducible, random values.
+    // NOTE(review): presumably meant to be profiled as a standalone binary -- confirm.
+
+    // input data: fixed seed, values sampled from Uniform::new(0.0, 100.0)
+    const DATA_SIZE: u32 = 10;
+    let length = 2_usize.pow(DATA_SIZE);
+    let seed: u64 = 9817498146784;
+    let mut rng = SmallRng::seed_from_u64(seed);
+    let range: Uniform<f64> = rand::distributions::Uniform::new(0.0, 100.0);
+    let aa_init: Vec<f64> = (0..length * length)
+        .map(|_| range.sample(&mut rng))
+        .collect();
+    let bb_init: Vec<f64> = (0..length * length)
+        .map(|_| range.sample(&mut rng))
+        .collect();
+    let cc_init: Vec<f64> = (0..length * length)
+        .map(|_| range.sample(&mut rng))
+        .collect();
+    let alpha: f64 = range.sample(&mut rng);
+    let beta: f64 = range.sample(&mut rng);
+
+    // wrap the raw buffers into length x length views
+    let mut aa = ViewOwned::new_from_data(aa_init, Layout::Right, [length, length]);
+    let mut bb = ViewOwned::new_from_data(bb_init, Layout::Left, [length, length]); // optimal layout since the kernel iterates inside the columns of B
+    let mut cc = ViewOwned::new_from_data(cc_init, Layout::Right, [length, length]);
+    black_box(&mut aa); // black_box: keep the optimizer from reasoning about the views
+    black_box(&mut bb);
+    black_box(&mut cc);
+
+    let execp = ExecutionPolicy {
+        space: ExecutionSpace::DeviceCPU,
+        range: RangePolicy::RangePolicy(0..length),
+        schedule: Schedule::Static,
+    };
+
+    // C = alpha * A * B + beta * C
+    let gemm_kernel = |arg: KernelArgs<1>| match arg {
+        // i: row index handed to the kernel
+        KernelArgs::Index1D(i) => {
+            // j: column index, iterated sequentially inside the kernel
+            for j in 0..length {
+                // all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft
+                let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
+                let val: f64 = alpha * ab_ij + beta * cc.get([i, j]);
+                cc.set([i, j], val);
+            }
+        }
+        KernelArgs::IndexND(_) => unimplemented!(),
+        KernelArgs::Handle => unimplemented!(),
+    };
+    parallel_for(execp, gemm_kernel).unwrap(); // panics if parallel_for returns an Err
 }