Skip to content

Commit

Permalink
Scripts (#10)
Browse files Browse the repository at this point in the history
* added basic script to plot relative change according to layout

need to add label, title, etc.

* cleaner plot + added a switch to use f32 in layout-size bench

* modified main to measure cache misses according to layout

* changed kokkos gemv to match data size of rust bench + new script to plot
cache miss rates

* updated scripts

* updated description of cache-sizes script
  • Loading branch information
imrn99 authored Dec 8, 2023
1 parent 3a3f285 commit 87ce049
Show file tree
Hide file tree
Showing 8 changed files with 258 additions and 36 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
/target
*.csv
23 changes: 12 additions & 11 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,30 @@ edition = "2021"
codegen-units = 1
lto = "fat"
panic = "abort"
debug = true

# FEATURES

[features]
threads = ["dep:atomic", "dep:num_cpus"]
rayon = ["dep:atomic", "dep:num_cpus", "dep:rayon"]
gpu = ["dep:atomic"]
rayon = ["dep:atomic", "dep:num_cpus", "dep:rayon"]
gpu = ["dep:atomic"]

# DEPENDENCIES

[dependencies]
cxx = "*"
cfg-if = "*"
rayon = {version = "*", optional=true}
atomic = {version = "0.5.3", optional=true}
num_cpus = {version = "*", optional=true}
cxx = "*"
cfg-if = "*"
rayon = { version = "*", optional = true }
atomic = { version = "0.5.3", optional = true }
num_cpus = { version = "*", optional = true }
#bytemuck = {version = "*", optional=true} # needed for atomic >= 0.6.0
rand = { version = "*", features = ["small_rng", "alloc"] }

[dev-dependencies]
criterion = { version = "*", features = ["html_reports"] }
rand = { version = "*", features = ["small_rng", "alloc"] }
atomic = {version = "0.5.3"}
rayon = {version = "*"}
atomic = { version = "0.5.3" }
rayon = { version = "*" }

[build-dependencies]
cxx-build = "*"
Expand Down Expand Up @@ -81,4 +82,4 @@ harness = false

[[bench]]
name = "hardcoded_gemm"
harness = false
harness = false
2 changes: 1 addition & 1 deletion benches/blas-speedup-kokkos/gemv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

#include <Kokkos_Core.hpp>

#define DATA_SIZE 15
#define DATA_SIZE 12
#define N_REPEAT 100


Expand Down
42 changes: 22 additions & 20 deletions benches/layout/size.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,16 @@ use rand::{
SeedableRng,
};

type FloatType = f64;

// GEMM - usual case layout
fn f1(
length: usize,
aa_init: Vec<f64>,
bb_init: Vec<f64>,
cc_init: Vec<f64>,
alpha: f64,
beta: f64,
aa_init: Vec<FloatType>,
bb_init: Vec<FloatType>,
cc_init: Vec<FloatType>,
alpha: FloatType,
beta: FloatType,
) {
// best case layout:
// iterate on lines -> line-major layout (Right)
Expand All @@ -45,8 +47,8 @@ fn f1(
// cols
for j in 0..length {
// all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft
let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
let val: f64 = alpha * ab_ij + beta * cc.get([i, j]);
let ab_ij: FloatType = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
let val: FloatType = alpha * ab_ij + beta * cc.get([i, j]);
cc.set([i, j], val);
}
}
Expand All @@ -60,11 +62,11 @@ fn f1(
// GEMM - best case layout
fn f2(
length: usize,
aa_init: Vec<f64>,
bb_init: Vec<f64>,
cc_init: Vec<f64>,
alpha: f64,
beta: f64,
aa_init: Vec<FloatType>,
bb_init: Vec<FloatType>,
cc_init: Vec<FloatType>,
alpha: FloatType,
beta: FloatType,
) {
let mut aa = ViewOwned::new_from_data(aa_init, Layout::Right, [length, length]);
let mut bb = ViewOwned::new_from_data(bb_init, Layout::Left, [length, length]);
Expand All @@ -86,8 +88,8 @@ fn f2(
// cols
for j in 0..length {
// all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft
let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
let val: f64 = alpha * ab_ij + beta * cc.get([i, j]);
let ab_ij: FloatType = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
let val: FloatType = alpha * ab_ij + beta * cc.get([i, j]);
cc.set([i, j], val);
}
}
Expand All @@ -104,18 +106,18 @@ pub fn criterion_benchmark(c: &mut Criterion) {
let length = 2_usize.pow(data_size);
let seed: u64 = 9817498146784;
let mut rng = SmallRng::seed_from_u64(seed);
let range: Uniform<f64> = rand::distributions::Uniform::new(0.0, 100.0);
let aa_init: Vec<f64> = (0..length * length)
let range: Uniform<FloatType> = rand::distributions::Uniform::new(0.0, 100.0);
let aa_init: Vec<FloatType> = (0..length * length)
.map(|_| range.sample(&mut rng))
.collect();
let bb_init: Vec<f64> = (0..length * length)
let bb_init: Vec<FloatType> = (0..length * length)
.map(|_| range.sample(&mut rng))
.collect();
let cc_init: Vec<f64> = (0..length * length)
let cc_init: Vec<FloatType> = (0..length * length)
.map(|_| range.sample(&mut rng))
.collect();
let alpha: f64 = range.sample(&mut rng);
let beta: f64 = range.sample(&mut rng);
let alpha: FloatType = range.sample(&mut rng);
let beta: FloatType = range.sample(&mut rng);
// f64 uses 8 bytes
group.throughput(Throughput::Bytes((8 * length).pow(2) as u64));
group.bench_with_input(
Expand Down
49 changes: 49 additions & 0 deletions scripts/cache-miss-rates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# The script expects a single csv file, containing 3 lines:
# - 1st line: data size (used as the X coordinate)
# - 2nd line: cache miss-rates using the usual (i.e. naive) layout
# - 3rd line: cache miss-rates using the ideal layout

import sys
import csv
import matplotlib.pyplot as plt

def main():
    """Plot the L1 cache miss-rates of both layouts against the matrix dimension.

    Reads the CSV file whose path is the first command-line argument. Its
    first three rows hold, in order: the matrix dimensions (integers), the
    miss-rates obtained with the usual layout, and the miss-rates obtained
    with the best layout. The scatter plot is saved as "cache-miss-rates.svg"
    in the current working directory.
    """
    # read input
    fileName = sys.argv[1]
    with open(fileName, newline='') as csvfile:
        rows = list(csv.reader(csvfile))

    # parse values: one CSV row per series
    sizes = [int(size) for size in rows[0]]
    usualLayoutRates = [float(rate) for rate in rows[1]]
    bestLayoutRates = [float(rate) for rate in rows[2]]

    # Only the raw rates are plotted; no relative change is derived from them,
    # so a best-layout miss-rate of 0 in the input is handled without error.

    # plot
    plt.title("GEMM: L1 Cache Miss-Rate Evolution = f(Data Size)")
    plt.xlabel("Square Matrix Dimension (# of rows/cols)")
    plt.ylabel("Miss-Rate (%)")

    # matrix dimensions are plotted on a base-2 logarithmic X axis
    plt.semilogx(base=2.0)
    plt.grid(visible=True, axis='y')
    plt.scatter(sizes, usualLayoutRates, marker='+', color='r', label="usual-layout")
    plt.scatter(sizes, bestLayoutRates, marker='x', color='b', label="best-layout")
    plt.legend()
    plt.savefig(fname="cache-miss-rates.svg", format="svg")


main()
57 changes: 57 additions & 0 deletions scripts/cache-sizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# This script is used to generate a speedup graph from the output
# of the layout-size benchmark (criterion group gemm-sizes).
# Additionally, it places vertical bars corresponding to dimensions
# where the matrix size exceeds a certain threshold
#
# The script expects a single csv file, containing 3 lines:
# - 1st line: data size (used as the X coordinate)
# - 2nd line: execution times using the usual (i.e. naive) layout
# - 3rd line: execution times using ideal layout

import sys
import csv
import matplotlib.pyplot as plt

def main():
    """Plot the speed gain of the usual layout relative to the best layout.

    Reads the CSV file whose path is the first command-line argument. Its
    first three rows hold, in order: the matrix dimensions (integers), the
    execution times with the usual layout, and the execution times with the
    best layout. Vertical bars labelled as the L1/L2/L3 cache limits are drawn
    on top of the scatter plot, which is saved as "gemm-sizes-plot.svg" in the
    current working directory.
    """
    # read input
    csvPath = sys.argv[1]
    with open(csvPath, newline='') as handle:
        rows = list(csv.reader(handle))

    # parse values
    # matrix size = dim1 * dim2 * sizeof(double)
    sizes = [int(cell) for cell in rows[0]]
    usualLayoutTimes = [float(cell) for cell in rows[1]]
    bestLayoutTimes = [float(cell) for cell in rows[2]]

    # compute relative change
    # NOTE(review): `slowdown` is a fraction (not a percentage) while the
    # expression below adds it to 100.0 -- the units look inconsistent; confirm
    # the intended formula before relying on the plotted values.
    percentsSlower = []
    for idx in range(len(sizes)):
        slowdown = (usualLayoutTimes[idx] - bestLayoutTimes[idx]) / bestLayoutTimes[idx]
        percentsSlower.append(-100 * 100 * slowdown / (100.0 + slowdown))

    # plot
    plt.title("GEMM: Speed Gain = f(Data Size)")
    plt.xlabel("Square Matrix Dimension (# of rows/cols)")
    plt.ylabel("Gain (%)")
    plt.ylim([-175, 10])
    plt.semilogx(base=2.0)

    # Hard-coded matrix dimensions at which, per the labels, each cache level
    # is exceeded -- presumably specific to the benchmark machine; confirm.
    cacheLimits = [
        (64 * 6 ** 0.5, "Exceed L1 Total Size", 'r', 0.05),
        (512 * 3 ** 0.5, "Exceed L2 Total Size", 'g', 0.05),
        (2048, "Exceed L3 Total Size", 'b', 0.15),
    ]
    for xPos, text, colour, lowest in cacheLimits:
        plt.axvline(x=xPos, label=text, color=colour, ymax=0.95, ymin=lowest)

    plt.legend(loc="center left")
    plt.grid(visible=True, axis='y')
    plt.scatter(sizes, percentsSlower, marker='+', color='r')
    plt.savefig(fname="gemm-sizes-plot.svg", format="svg")


main()
51 changes: 51 additions & 0 deletions scripts/gemm-sizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# This script is used to generate a speedup graph from the output
# of the layout-size benchmark (criterion group gemm-sizes).
#
# The script expects a single csv file, containing 3 lines:
# - 1st line: data size (used as the X coordinate)
# - 2nd line: execution times using the usual (i.e. naive) layout
# - 3rd line: execution times using ideal layout

import sys
import csv
import matplotlib.pyplot as plt

def main():
    """Plot the speed gain of the usual layout relative to the best layout.

    Reads the CSV file whose path is the first command-line argument. Its
    first three rows hold, in order: the matrix dimensions (integers), the
    execution times with the usual layout, and the execution times with the
    best layout. The scatter plot is saved as "gemm-sizes-plot.svg" in the
    current working directory.
    """
    # read input
    csvPath = sys.argv[1]
    with open(csvPath, newline='') as handle:
        rows = list(csv.reader(handle))

    # parse values
    sizes = [int(cell) for cell in rows[0]]
    usualLayoutTimes = [float(cell) for cell in rows[1]]
    bestLayoutTimes = [float(cell) for cell in rows[2]]

    # compute relative change
    # NOTE(review): `slowdown` is a fraction (not a percentage) while the
    # expression below adds it to 100.0 -- the units look inconsistent; confirm
    # the intended formula before relying on the plotted values.
    percentsSlower = []
    for idx in range(len(sizes)):
        slowdown = (usualLayoutTimes[idx] - bestLayoutTimes[idx]) / bestLayoutTimes[idx]
        percentsSlower.append(-100 * 100 * slowdown / (100.0 + slowdown))

    # plot
    plt.title("GEMM: Speed Gain = f(Data Size)")
    plt.xlabel("Square Matrix Dimension (# of rows/cols)")
    plt.ylabel("Gain (%)")
    plt.ylim([-175, 10])
    # matrix dimensions are plotted on a base-2 logarithmic X axis
    plt.semilogx(base=2.0)
    plt.grid(visible=True, axis='y')
    plt.scatter(sizes, percentsSlower, marker='+', color='r')
    plt.savefig(fname="gemm-sizes-plot.svg", format="svg")


main()
69 changes: 65 additions & 4 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,68 @@
use poc_kokkos_rs::ffi;
//use poc_kokkos_rs::ffi;

use std::hint::black_box;

use poc_kokkos_rs::{
functor::KernelArgs,
routines::{
parallel_for,
parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule},
},
view::{parameters::Layout, ViewOwned},
};
use rand::{distributions::Uniform, prelude::*, rngs::SmallRng, SeedableRng};

// Runs a single GEMM (C = alpha * A * B + beta * C) on square matrices of
// dimension 2^DATA_SIZE, dispatched over the row index through `parallel_for`.
// Presumably meant to be run under an external profiler to measure cache
// misses for the chosen layouts (per the commit description) -- confirm.
//
// NOTE(review): the three `ffi` statements right below are repeated in
// commented-out form just after them; this view looks like a diff rendering
// that interleaves removed and added lines -- confirm which version is live.
fn main() {
ffi::say_hello();
println!("Hello from Rust!");
ffi::say_many_hello()
// ffi::say_hello();
// println!("Hello from Rust!");
// ffi::say_many_hello()

// inits
// matrices are length x length, with length = 2^DATA_SIZE
const DATA_SIZE: u32 = 10;
let length = 2_usize.pow(DATA_SIZE);
// fixed seed: the generated inputs are the same on every run
let seed: u64 = 9817498146784;
let mut rng = SmallRng::seed_from_u64(seed);
let range: Uniform<f64> = rand::distributions::Uniform::new(0.0, 100.0);
let aa_init: Vec<f64> = (0..length * length)
.map(|_| range.sample(&mut rng))
.collect();
let bb_init: Vec<f64> = (0..length * length)
.map(|_| range.sample(&mut rng))
.collect();
let cc_init: Vec<f64> = (0..length * length)
.map(|_| range.sample(&mut rng))
.collect();
let alpha: f64 = range.sample(&mut rng);
let beta: f64 = range.sample(&mut rng);

// inits again
// wrap the raw buffers into 2D views; A and C use Layout::Right, B uses Layout::Left
let mut aa = ViewOwned::new_from_data(aa_init, Layout::Right, [length, length]);
let mut bb = ViewOwned::new_from_data(bb_init, Layout::Left, [length, length]); // optimal layout since we iterate inside columns :)
let mut cc = ViewOwned::new_from_data(cc_init, Layout::Right, [length, length]);
// black_box is an optimization hint: the compiler should not assume anything
// about how the views are used
black_box(&mut aa);
black_box(&mut bb);
black_box(&mut cc);

// one kernel invocation per index in 0..length, i.e. one per row of C
let execp = ExecutionPolicy {
space: ExecutionSpace::DeviceCPU,
range: RangePolicy::RangePolicy(0..length),
schedule: Schedule::Static,
};

// C = alpha * A * B + beta * C
let gemm_kernel = |arg: KernelArgs<1>| match arg {
// lines
KernelArgs::Index1D(i) => {
// cols
for j in 0..length {
// all b[k, j] for k values are adjacent in memory thanks to the LayoutLeft
let ab_ij: f64 = (0..length).map(|k| aa.get([i, k]) * bb.get([k, j])).sum();
let val: f64 = alpha * ab_ij + beta * cc.get([i, j]);
cc.set([i, j], val);
}
}
// only the 1D index form is handled; any other argument kind panics
KernelArgs::IndexND(_) => unimplemented!(),
KernelArgs::Handle => unimplemented!(),
};
// panics if the dispatch returns an error
parallel_for(execp, gemm_kernel).unwrap();
}

0 comments on commit 87ce049

Please sign in to comment.