Skip to content

Commit

Permalink
perf/shared_memory: use linear_map for SharedMemory.copies
Browse files Browse the repository at this point in the history
  • Loading branch information
hobofan committed Dec 2, 2015
1 parent 430c4ed commit 44ea377
Show file tree
Hide file tree
Showing 11 changed files with 63 additions and 23 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ enum_primitive = "0.1.0"
byteorder = "0.4"
num = "0.1"
lazy_static = "0.1.15"
linear-map = "0.0.3"

clippy = { version = "0.0.27", optional = true }

Expand Down
41 changes: 39 additions & 2 deletions benches/rblas_overhead.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ use co::framework::IFramework;
use co::shared_memory::SharedMemory;
use co::libraries::blas::IBlas;
use rblas::Dot;
use std::time::Duration;
use std::thread::sleep;

use rand::{thread_rng, Rng};

Expand Down Expand Up @@ -63,3 +61,42 @@ fn bench_1000_dot_100_collenchyma_profile(b: &mut Bencher, backend: &Backend<Nat
}
});
}

/// Benchmarks five consecutive rblas `dot` calls per iteration on two randomly
/// filled `f32` vectors of 20000 elements each.
#[bench]
fn bench_5_dot_20000_rblas(b: &mut Bencher) {
    let mut rng = thread_rng();
    let lhs: Vec<f32> = rng.gen_iter::<f32>().take(20000).collect();
    let rhs: Vec<f32> = rng.gen_iter::<f32>().take(20000).collect();

    b.iter(|| {
        for _ in 0..5 {
            // Hand the result to `black_box` so the optimizer has to keep the computation.
            test::black_box(Dot::dot(&lhs, &rhs));
        }
    });
}

/// Prepares the collenchyma counterpart of the 5x `dot` benchmark.
///
/// Two randomly filled `f32` vectors of 20000 elements are copied into
/// `SharedMemory` allocated on the backend's device, `dot` is invoked once
/// before measuring, and the measured loop itself is delegated to
/// `bench_5_dot_20000_collenchyma_profile`.
#[bench]
fn bench_5_dot_20000_collenchyma(b: &mut Bencher) {
    let mut rng = thread_rng();
    let input_a: Vec<f32> = rng.gen_iter::<f32>().take(20000).collect();
    let input_b: Vec<f32> = rng.gen_iter::<f32>().take(20000).collect();

    let native_backend = backend();
    let mut mem_a = SharedMemory::<f32>::new(native_backend.device(), 20000);
    let mut mem_b = SharedMemory::<f32>::new(native_backend.device(), 20000);
    let mut mem_res = SharedMemory::<f32>::new(native_backend.device(), 20000);

    // Fill the native copy of each input with the generated data.
    mem_a.get_mut(native_backend.device())
         .unwrap()
         .as_mut_native()
         .unwrap()
         .as_mut_slice()
         .clone_from_slice(&input_a);
    mem_b.get_mut(native_backend.device())
         .unwrap()
         .as_mut_native()
         .unwrap()
         .as_mut_slice()
         .clone_from_slice(&input_b);

    // One unmeasured call ahead of the benchmark loop
    // (presumably to keep one-time costs out of the measurement; TODO confirm).
    let _ = native_backend.dot(&mut mem_a, &mut mem_b, &mut mem_res);

    bench_5_dot_20000_collenchyma_profile(b,
                                          &native_backend,
                                          &mut mem_a,
                                          &mut mem_b,
                                          &mut mem_res);
}

/// Measured part of `bench_5_dot_20000_collenchyma`: every benchmark iteration
/// issues five `dot` calls on the given backend and discards their results.
///
/// Kept out of line via `#[inline(never)]`
/// (presumably so it appears under its own name in profiles; TODO confirm).
#[inline(never)]
fn bench_5_dot_20000_collenchyma_profile(b: &mut Bencher,
                                         backend: &Backend<Native>,
                                         shared_a: &mut SharedMemory<f32>,
                                         shared_b: &mut SharedMemory<f32>,
                                         shared_res: &mut SharedMemory<f32>) {
    let five_dots = || {
        for _ in 0..5 {
            // The outcome of `dot` is intentionally ignored here.
            let _ = backend.dot(shared_a, shared_b, shared_res);
        }
    };
    b.iter(five_dots);
}
4 changes: 2 additions & 2 deletions perf/run_perf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ if [ $# -eq 0 ]
fi
benchname=$1
mkdir -p target/perf
perf record -a -g -F 10000 --output target/perf/${benchname}.data target/debug/rblas_overhead-cf1a2670c118749d --bench ${benchname}
perf record -a -g --output target/perf/${benchname}.data target/debug/rblas_overhead-cf1a2670c118749d --bench ${benchname}
perf script -f -i target/perf/${benchname}.data > target/perf/${benchname}.scripted
stackcollapse-perf target/perf/${benchname}.scripted > target/perf/${benchname}.folded
stackcollapse-perf target/perf/${benchname}.scripted | grep ${benchname} > target/perf/${benchname}.folded
flamegraph target/perf/${benchname}.folded > target/perf/${benchname}.svg
14 changes: 7 additions & 7 deletions src/backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ impl<F: IFramework + Clone> Backend<F> {
}

/// Returns the backend framework.
pub fn framework(&self) -> Box<F> {
self.framework.clone()
pub fn framework(&self) -> &Box<F> {
&self.framework
}

/// Returns the backend device.
Expand All @@ -92,8 +92,8 @@ impl<F: IFramework + Clone> Backend<F> {
}

/// Returns the blas binary.
pub fn binary(&self) -> F::B {
self.framework().binary().clone()
pub fn binary(&self) -> &F::B {
self.framework().binary()
}
}

Expand All @@ -120,7 +120,7 @@ impl IBackend for Backend<Cuda> {
impl IBlas<f32> for Backend<OpenCL> {
type B = ::frameworks::opencl::Program;

fn binary(&self) -> Self::B {
fn binary(&self) -> &Self::B {
self.binary()
}

Expand All @@ -132,7 +132,7 @@ impl IBlas<f32> for Backend<OpenCL> {
impl IBlas<f32> for Backend<Native> {
type B = ::frameworks::native::Binary;

fn binary(&self) -> Self::B {
fn binary(&self) -> &Self::B {
self.binary()
}

Expand All @@ -144,7 +144,7 @@ impl IBlas<f32> for Backend<Native> {
impl IBlas<f64> for Backend<Native> {
type B = ::frameworks::native::Binary;

fn binary(&self) -> Self::B {
fn binary(&self) -> &Self::B {
self.binary()
}

Expand Down
2 changes: 1 addition & 1 deletion src/framework.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pub trait IFramework {
fn hardwares(&self) -> Vec<Self::H>;

/// Returns the initialized binary.
fn binary(&self) -> Self::B;
fn binary(&self) -> &Self::B;

/// Initializes a new Device from the provided hardwares.
fn new_device(&self, Vec<Self::H>) -> Result<DeviceType, Error>;
Expand Down
4 changes: 2 additions & 2 deletions src/frameworks/cuda/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ impl IFramework for Cuda {
self.hardwares.clone()
}

fn binary(&self) -> Self::B {
self.binary.clone()
fn binary(&self) -> &Self::B {
&self.binary
}

/// Creates a new Cuda device for computation.
Expand Down
4 changes: 2 additions & 2 deletions src/frameworks/native/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ impl IFramework for Native {
self.hardwares.clone()
}

fn binary(&self) -> Binary {
self.binary.clone()
fn binary(&self) -> &Binary {
&self.binary
}

fn new_device(&self, devices: Vec<Hardware>) -> Result<DeviceType, ::framework::Error> {
Expand Down
4 changes: 2 additions & 2 deletions src/frameworks/opencl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ impl IFramework for OpenCL {
self.hardwares.clone()
}

fn binary(&self) -> Self::B {
self.binary.clone()
fn binary(&self) -> &Self::B {
&self.binary
}

/// Creates a new OpenCL context over one or many devices ready for computation.
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ extern crate enum_primitive;
extern crate lazy_static;
extern crate num;
extern crate byteorder;
extern crate linear_map;
extern crate rblas as blas;

pub mod backend;
Expand Down
2 changes: 1 addition & 1 deletion src/libraries/blas.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ pub trait IBlas<F: Float> {
}

/// Returns the binary representation.
fn binary(&self) -> Self::B;
fn binary(&self) -> &Self::B;

/// Returns the device representation.
fn device(&self) -> &DeviceType;
Expand Down
9 changes: 5 additions & 4 deletions src/shared_memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,19 @@
//! # }
//! ```

use std::collections::HashMap;
use linear_map::LinearMap;
use device::{IDevice, DeviceType};
use memory::MemoryType;
use std::marker::PhantomData;
use std::{fmt, mem, error};

#[derive(Debug)]
// #[derive(Debug)]
/// Container that handles synchronization of [Memory][1] of type `T`.
/// [1]: ../memory/index.html
#[allow(missing_debug_implementations)] // due to LinearMap
pub struct SharedMemory<T> {
latest_location: DeviceType,
copies: HashMap<DeviceType, MemoryType>,
copies: LinearMap<DeviceType, MemoryType>,
cap: usize,
phantom: PhantomData<T>,
}
Expand All @@ -55,7 +56,7 @@ impl<T> SharedMemory<T> {
/// Create new SharedMemory by allocating [Memory][1] on a Device.
/// [1]: ../memory/index.html
pub fn new(dev: &DeviceType, capacity: usize) -> SharedMemory<T> {
let mut copies = HashMap::<DeviceType, MemoryType>::new();
let mut copies = LinearMap::<DeviceType, MemoryType>::new();
let copy: MemoryType;
let alloc_size = mem::size_of::<T>() * capacity;
match *dev {
Expand Down

0 comments on commit 44ea377

Please sign in to comment.