diff --git a/.gitignore b/.gitignore index ea8c4bf..1b86eab 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +*.csv \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index f424226..f9cf57f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,29 +9,30 @@ edition = "2021" codegen-units = 1 lto = "fat" panic = "abort" +debug = true # FEATURES [features] threads = ["dep:atomic", "dep:num_cpus"] -rayon = ["dep:atomic", "dep:num_cpus", "dep:rayon"] -gpu = ["dep:atomic"] +rayon = ["dep:atomic", "dep:num_cpus", "dep:rayon"] +gpu = ["dep:atomic"] # DEPENDENCIES [dependencies] -cxx = "*" -cfg-if = "*" -rayon = {version = "*", optional=true} -atomic = {version = "0.5.3", optional=true} -num_cpus = {version = "*", optional=true} +cxx = "*" +cfg-if = "*" +rayon = { version = "*", optional = true } +atomic = { version = "0.5.3", optional = true } +num_cpus = { version = "*", optional = true } #bytemuck = {version = "*", optional=true} # needed for atomic >= 0.6.0 +rand = { version = "*", features = ["small_rng", "alloc"] } [dev-dependencies] criterion = { version = "*", features = ["html_reports"] } -rand = { version = "*", features = ["small_rng", "alloc"] } -atomic = {version = "0.5.3"} -rayon = {version = "*"} +atomic = { version = "0.5.3" } +rayon = { version = "*" } [build-dependencies] cxx-build = "*" @@ -81,4 +82,4 @@ harness = false [[bench]] name = "hardcoded_gemm" -harness = false \ No newline at end of file +harness = false diff --git a/benches/blas-speedup-kokkos/gemv.cpp b/benches/blas-speedup-kokkos/gemv.cpp index 18c4113..1952d41 100644 --- a/benches/blas-speedup-kokkos/gemv.cpp +++ b/benches/blas-speedup-kokkos/gemv.cpp @@ -14,7 +14,7 @@ #include -#define DATA_SIZE 15 +#define DATA_SIZE 12 #define N_REPEAT 100 diff --git a/benches/layout/size.rs b/benches/layout/size.rs index 5241165..979f9fa 100644 --- a/benches/layout/size.rs +++ b/benches/layout/size.rs @@ -13,14 +13,16 @@ use rand::{ SeedableRng, }; +type FloatType = f64; + // GEMM - usual case layout fn f1( length: usize, - aa_init: Vec, - bb_init: Vec, - cc_init: Vec, - alpha: f64, - beta: f64, + aa_init: Vec, + bb_init: Vec, + cc_init: Vec, + alpha: FloatType, + beta: FloatType, ) { // best case layout: // iterate on lines -> line-major layout (Right) @@ -45,8 +47,8 @@ fn f1( // cols for j in 0..length { // all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft - let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum(); - let val: f64 = alpha * ab_ij + beta * cc.get([i, j]); + let ab_ij: FloatType = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum(); + let val: FloatType = alpha * ab_ij + beta * cc.get([i, j]); cc.set([i, j], val); } } @@ -60,11 +62,11 @@ fn f1( // GEMM - best case layout fn f2( length: usize, - aa_init: Vec, - bb_init: Vec, - cc_init: Vec, - alpha: f64, - beta: f64, + aa_init: Vec, + bb_init: Vec, + cc_init: Vec, + alpha: FloatType, + beta: FloatType, ) { let mut aa = ViewOwned::new_from_data(aa_init, Layout::Right, [length, length]); let mut bb = ViewOwned::new_from_data(bb_init, Layout::Left, [length, length]); @@ -86,8 +88,8 @@ fn f2( // cols for j in 0..length { // all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft - let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum(); - let val: f64 = alpha * ab_ij + beta * cc.get([i, j]); + let ab_ij: FloatType = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum(); + let val: FloatType = alpha * ab_ij + beta * cc.get([i, j]); cc.set([i, j], val); } } @@ -104,18 +106,18 @@ pub fn criterion_benchmark(c: &mut Criterion) { let length = 2_usize.pow(data_size); let seed: u64 = 9817498146784; let mut rng = SmallRng::seed_from_u64(seed); - let range: Uniform = rand::distributions::Uniform::new(0.0, 100.0); - let aa_init: Vec = (0..length * length) + let range: Uniform = rand::distributions::Uniform::new(0.0, 100.0); + let aa_init: Vec = (0..length * length) .map(|_| range.sample(&mut rng)) .collect(); - let bb_init: Vec = (0..length * length) + let bb_init: Vec = (0..length * length) .map(|_| range.sample(&mut rng)) .collect(); - let cc_init: Vec = (0..length * length) + let cc_init: Vec = (0..length * length) .map(|_| range.sample(&mut rng)) .collect(); - let alpha: f64 = range.sample(&mut rng); - let beta: f64 = range.sample(&mut rng); + let alpha: FloatType = range.sample(&mut rng); + let beta: FloatType = range.sample(&mut rng); // f64 uses 8 bytes group.throughput(Throughput::Bytes((8 * length).pow(2) as u64)); group.bench_with_input( diff --git a/scripts/cache-miss-rates.py b/scripts/cache-miss-rates.py new file mode 100755 index 0000000..78993e8 --- /dev/null +++ b/scripts/cache-miss-rates.py @@ -0,0 +1,49 @@ +# The script expects a single csv file, containing 3 lines: +# - 1st line: data size (used as the X coordinate) +# - 2nd line: cache miss-rates times using the usual (i.e. naive) layout +# - 3rd line: cache miss-rates times using ideal layout + +import sys +import csv +import matplotlib.pyplot as plt + +def main(): + # read input + fileName = sys.argv[1] + tmp = [] + with open(fileName, newline='') as csvfile: + reader = csv.reader(csvfile) + for row in reader: + tmp.append(row) + + # parse values + sizes = [] + usualLayoutRates = [] + bestLayoutRates = [] + for size in tmp[0]: + sizes.append(int(size)) + for time in tmp[1]: + usualLayoutRates.append(float(time)) + for time in tmp[2]: + bestLayoutRates.append(float(time)) + tmp.clear() + + # compute relative change + percentsMore=[] + for i in range(len(sizes)): + percentsMore.append( 100 * (usualLayoutRates[i] - bestLayoutRates[i]) / bestLayoutRates[i]) + + # plot + plt.title("GEMM: L1 Cache Miss-Rate Evolution = f(Data Size)") + plt.xlabel("Square Matrix Dimension (# of rows/cols)") + plt.ylabel("Miss-Rate (%)") + + plt.semilogx(base=2.0) + plt.grid(visible=True, axis='y') + plt.scatter(sizes, usualLayoutRates, marker='+', color='r', label="usual-layout") + plt.scatter(sizes, bestLayoutRates, marker='x', color='b', label="best-layout") + plt.legend() + plt.savefig(fname="cache-miss-rates.svg", format="svg") + + +main() diff --git a/scripts/cache-sizes.py b/scripts/cache-sizes.py new file mode 100755 index 0000000..08a0f10 --- /dev/null +++ b/scripts/cache-sizes.py @@ -0,0 +1,57 @@ +# This script is used to generate a speedup graph from the output +# of the layout-size benchmark (criterion group gemm-sizes). +# Additionally, it places vertical bars corresponding to dimensions +# where matrix size exceed a certain threshold +# +# The script expects a single csv file, containing 3 lines: +# - 1st line: data size (used as the X coordinate) +# - 2nd line: execution times using the usual (i.e. naive) layout +# - 3rd line: execution times using ideal layout + +import sys +import csv +import matplotlib.pyplot as plt + +def main(): + # read input + fileName = sys.argv[1] + tmp = [] + with open(fileName, newline='') as csvfile: + reader = csv.reader(csvfile) + for row in reader: + tmp.append(row) + + # parse values + sizes = [] + usualLayoutTimes = [] + bestLayoutTimes = [] + for size in tmp[0]: + sizes.append(int(size)) # matrix size = dim1 * dim2 * sizeof(double) + for time in tmp[1]: + usualLayoutTimes.append(float(time)) + for time in tmp[2]: + bestLayoutTimes.append(float(time)) + tmp.clear() + + # compute relative change + percentsSlower=[] + for i in range(len(sizes)): + percentLonger = (usualLayoutTimes[i] - bestLayoutTimes[i]) / bestLayoutTimes[i] + percentsSlower.append(- 100*100 * percentLonger / (100.0 + percentLonger)) + + # plot + plt.title("GEMM: Speed Gain = f(Data Size)") + plt.xlabel("Square Matrix Dimension (# of rows/cols)") + plt.ylabel("Gain (%)") + plt.ylim([-175, 10]) + plt.semilogx(base=2.0) + plt.axvline(x=64*6**0.5, label="Exceed L1 Total Size", color='r', ymax=0.95, ymin=0.05) + plt.axvline(x=512*3**0.5, label="Exceed L2 Total Size", color='g', ymax=0.95, ymin=0.05) + plt.axvline(x=2048, label="Exceed L3 Total Size", color='b', ymax=0.95, ymin=0.15) + plt.legend(loc="center left") + plt.grid(visible=True, axis='y') + plt.scatter(sizes, percentsSlower, marker='+', color='r') + plt.savefig(fname="gemm-sizes-plot.svg", format="svg") + + +main() diff --git a/scripts/gemm-sizes.py b/scripts/gemm-sizes.py new file mode 100755 index 0000000..b47b1eb --- /dev/null +++ b/scripts/gemm-sizes.py @@ -0,0 +1,51 @@ +# This script is used to generate a speedup graph from the output +# of the layout-size benchmark (criterion group gemm-sizes). +# +# The script expects a single csv file, containing 3 lines: +# - 1st line: data size (used as the X coordinate) +# - 2nd line: execution times using the usual (i.e. naive) layout +# - 3rd line: execution times using ideal layout + +import sys +import csv +import matplotlib.pyplot as plt + +def main(): + # read input + fileName = sys.argv[1] + tmp = [] + with open(fileName, newline='') as csvfile: + reader = csv.reader(csvfile) + for row in reader: + tmp.append(row) + + # parse values + sizes = [] + usualLayoutTimes = [] + bestLayoutTimes = [] + for size in tmp[0]: + sizes.append(int(size)) + for time in tmp[1]: + usualLayoutTimes.append(float(time)) + for time in tmp[2]: + bestLayoutTimes.append(float(time)) + tmp.clear() + + # compute relative change + percentsSlower=[] + for i in range(len(sizes)): + percentLonger = (usualLayoutTimes[i] - bestLayoutTimes[i]) / bestLayoutTimes[i] + percentsSlower.append(- 100*100 * percentLonger / (100.0 + percentLonger)) + + # plot + plt.title("GEMM: Speed Gain = f(Data Size)") + plt.xlabel("Square Matrix Dimension (# of rows/cols)") + plt.ylabel("Gain (%)") + plt.ylim([-175, 10]) + plt.semilogx(base=2.0) + plt.grid(visible=True, axis='y') + plt.scatter(sizes, percentsSlower, marker='+', color='r') + plt.savefig(fname="gemm-sizes-plot.svg", format="svg") + + +main() diff --git a/src/main.rs b/src/main.rs index 00aaf8a..940d1c2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,68 @@ -use poc_kokkos_rs::ffi; +//use poc_kokkos_rs::ffi; + +use std::hint::black_box; + +use poc_kokkos_rs::{ + functor::KernelArgs, + routines::{ + parallel_for, + parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule}, + }, + view::{parameters::Layout, ViewOwned}, +}; +use rand::{distributions::Uniform, prelude::*, rngs::SmallRng, SeedableRng}; fn main() { - ffi::say_hello(); - println!("Hello from Rust!"); - ffi::say_many_hello() + // ffi::say_hello(); + // println!("Hello from Rust!"); + // ffi::say_many_hello() + + // inits + const DATA_SIZE: u32 = 10; + let length = 2_usize.pow(DATA_SIZE); + let seed: u64 = 9817498146784; + let mut rng = SmallRng::seed_from_u64(seed); + let range: Uniform = rand::distributions::Uniform::new(0.0, 100.0); + let aa_init: Vec = (0..length * length) + .map(|_| range.sample(&mut rng)) + .collect(); + let bb_init: Vec = (0..length * length) + .map(|_| range.sample(&mut rng)) + .collect(); + let cc_init: Vec = (0..length * length) + .map(|_| range.sample(&mut rng)) + .collect(); + let alpha: f64 = range.sample(&mut rng); + let beta: f64 = range.sample(&mut rng); + + // inits again + let mut aa = ViewOwned::new_from_data(aa_init, Layout::Right, [length, length]); + let mut bb = ViewOwned::new_from_data(bb_init, Layout::Left, [length, length]); // optimal layout since we iterate inside columns :) + let mut cc = ViewOwned::new_from_data(cc_init, Layout::Right, [length, length]); + black_box(&mut aa); + black_box(&mut bb); + black_box(&mut cc); + + let execp = ExecutionPolicy { + space: ExecutionSpace::DeviceCPU, + range: RangePolicy::RangePolicy(0..length), + schedule: Schedule::Static, + }; + + // C = alpha * A * B + beta * C + let gemm_kernel = |arg: KernelArgs<1>| match arg { + // lines + KernelArgs::Index1D(i) => { + // cols + for j in 0..length { + // all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft + let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum(); + let val: f64 = alpha * ab_ij + beta * cc.get([i, j]); + cc.set([i, j], val); + } + } + KernelArgs::IndexND(_) => unimplemented!(), + KernelArgs::Handle => unimplemented!(), + }; + parallel_for(execp, gemm_kernel).unwrap(); }