From 44ea377d08da066159a01d646fed65f5f7080f8f Mon Sep 17 00:00:00 2001 From: Hobofan Date: Tue, 1 Dec 2015 23:51:30 +0100 Subject: [PATCH] perf/shared_memory: use linear_map for SharedMemory.copies --- Cargo.toml | 1 + benches/rblas_overhead.rs | 41 ++++++++++++++++++++++++++++++++++-- perf/run_perf.sh | 4 ++-- src/backend.rs | 14 ++++++------ src/framework.rs | 2 +- src/frameworks/cuda/mod.rs | 4 ++-- src/frameworks/native/mod.rs | 4 ++-- src/frameworks/opencl/mod.rs | 4 ++-- src/lib.rs | 1 + src/libraries/blas.rs | 2 +- src/shared_memory.rs | 9 ++++---- 11 files changed, 63 insertions(+), 23 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6247cf6e..8b947682 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ enum_primitive = "0.1.0" byteorder = "0.4" num = "0.1" lazy_static = "0.1.15" +linear-map = "0.0.3" clippy = { version = "0.0.27", optional = true } diff --git a/benches/rblas_overhead.rs b/benches/rblas_overhead.rs index 294ecbab..9b556973 100644 --- a/benches/rblas_overhead.rs +++ b/benches/rblas_overhead.rs @@ -13,8 +13,6 @@ use co::framework::IFramework; use co::shared_memory::SharedMemory; use co::libraries::blas::IBlas; use rblas::Dot; -use std::time::Duration; -use std::thread::sleep; use rand::{thread_rng, Rng}; @@ -63,3 +61,42 @@ fn bench_1000_dot_100_collenchyma_profile(b: &mut Bencher, backend: &Backend().take(20000).collect::>(); + let slice_b = rng.gen_iter::().take(20000).collect::>(); + + b.iter(|| { + for _ in 0..5 { + let res = Dot::dot(&slice_a, &slice_b); + test::black_box(res); + } + }); +} + +#[bench] +fn bench_5_dot_20000_collenchyma(b: &mut Bencher) { + let mut rng = thread_rng(); + let slice_a = rng.gen_iter::().take(20000).collect::>(); + let slice_b = rng.gen_iter::().take(20000).collect::>(); + + let backend = backend(); + let shared_a = &mut SharedMemory::::new(backend.device(), 20000); + let shared_b = &mut SharedMemory::::new(backend.device(), 20000); + let shared_res = &mut SharedMemory::::new(backend.device(), 
20000); + shared_a.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_a); + shared_b.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_b); + let _ = backend.dot(shared_a, shared_b, shared_res); + bench_5_dot_20000_collenchyma_profile(b, &backend, shared_a, shared_b, shared_res); +} + +#[inline(never)] +fn bench_5_dot_20000_collenchyma_profile(b: &mut Bencher, backend: &Backend, shared_a: &mut SharedMemory, shared_b: &mut SharedMemory, shared_res: &mut SharedMemory) { + b.iter(|| { + for _ in 0..5 { + let _ = backend.dot(shared_a, shared_b, shared_res); + } + }); +} diff --git a/perf/run_perf.sh b/perf/run_perf.sh index 9924cfc9..4736c210 100755 --- a/perf/run_perf.sh +++ b/perf/run_perf.sh @@ -6,7 +6,7 @@ if [ $# -eq 0 ] fi benchname=$1 mkdir -p target/perf -perf record -a -g -F 10000 --output target/perf/${benchname}.data target/debug/rblas_overhead-cf1a2670c118749d --bench ${benchname} +perf record -a -g --output target/perf/${benchname}.data target/debug/rblas_overhead-cf1a2670c118749d --bench ${benchname} perf script -f -i target/perf/${benchname}.data > target/perf/${benchname}.scripted -stackcollapse-perf target/perf/${benchname}.scripted > target/perf/${benchname}.folded +stackcollapse-perf target/perf/${benchname}.scripted | grep ${benchname} > target/perf/${benchname}.folded flamegraph target/perf/${benchname}.folded > target/perf/${benchname}.svg diff --git a/src/backend.rs b/src/backend.rs index 2bf6c844..594a1747 100644 --- a/src/backend.rs +++ b/src/backend.rs @@ -82,8 +82,8 @@ impl Backend { } /// Returns the backend framework. - pub fn framework(&self) -> Box { - self.framework.clone() + pub fn framework(&self) -> &Box { + &self.framework } /// Returns the backend device. @@ -92,8 +92,8 @@ impl Backend { } /// Returns the blas binary. 
- pub fn binary(&self) -> F::B { - self.framework().binary().clone() + pub fn binary(&self) -> &F::B { + self.framework().binary() } } @@ -120,7 +120,7 @@ impl IBackend for Backend { impl IBlas for Backend { type B = ::frameworks::opencl::Program; - fn binary(&self) -> Self::B { + fn binary(&self) -> &Self::B { self.binary() } @@ -132,7 +132,7 @@ impl IBlas for Backend { impl IBlas for Backend { type B = ::frameworks::native::Binary; - fn binary(&self) -> Self::B { + fn binary(&self) -> &Self::B { self.binary() } @@ -144,7 +144,7 @@ impl IBlas for Backend { impl IBlas for Backend { type B = ::frameworks::native::Binary; - fn binary(&self) -> Self::B { + fn binary(&self) -> &Self::B { self.binary() } diff --git a/src/framework.rs b/src/framework.rs index ab1391b2..a72c7c6f 100644 --- a/src/framework.rs +++ b/src/framework.rs @@ -53,7 +53,7 @@ pub trait IFramework { fn hardwares(&self) -> Vec; /// Returns the initialized binary. - fn binary(&self) -> Self::B; + fn binary(&self) -> &Self::B; /// Initializes a new Device from the provided hardwares. fn new_device(&self, Vec) -> Result; diff --git a/src/frameworks/cuda/mod.rs b/src/frameworks/cuda/mod.rs index ea0ce7ae..0d8781e6 100644 --- a/src/frameworks/cuda/mod.rs +++ b/src/frameworks/cuda/mod.rs @@ -59,8 +59,8 @@ impl IFramework for Cuda { self.hardwares.clone() } - fn binary(&self) -> Self::B { - self.binary.clone() + fn binary(&self) -> &Self::B { + &self.binary } /// Creates a new Cuda device for computation. 
diff --git a/src/frameworks/native/mod.rs b/src/frameworks/native/mod.rs index 73d4e491..454c11ce 100644 --- a/src/frameworks/native/mod.rs +++ b/src/frameworks/native/mod.rs @@ -65,8 +65,8 @@ impl IFramework for Native { self.hardwares.clone() } - fn binary(&self) -> Binary { - self.binary.clone() + fn binary(&self) -> &Binary { + &self.binary } fn new_device(&self, devices: Vec) -> Result { diff --git a/src/frameworks/opencl/mod.rs b/src/frameworks/opencl/mod.rs index f09f3c1e..3937f3a5 100644 --- a/src/frameworks/opencl/mod.rs +++ b/src/frameworks/opencl/mod.rs @@ -81,8 +81,8 @@ impl IFramework for OpenCL { self.hardwares.clone() } - fn binary(&self) -> Self::B { - self.binary.clone() + fn binary(&self) -> &Self::B { + &self.binary } /// Creates a new OpenCL context over one or many devices ready for computation. diff --git a/src/lib.rs b/src/lib.rs index a643914f..f02c0d02 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -148,6 +148,7 @@ extern crate enum_primitive; extern crate lazy_static; extern crate num; extern crate byteorder; +extern crate linear_map; extern crate rblas as blas; pub mod backend; diff --git a/src/libraries/blas.rs b/src/libraries/blas.rs index f6c7ba53..64095363 100644 --- a/src/libraries/blas.rs +++ b/src/libraries/blas.rs @@ -134,7 +134,7 @@ pub trait IBlas { } /// Returns the binary representation - fn binary(&self) -> Self::B; + fn binary(&self) -> &Self::B; /// Returns the device representation fn device(&self) -> &DeviceType; diff --git a/src/shared_memory.rs b/src/shared_memory.rs index ea8c9737..b08ff617 100644 --- a/src/shared_memory.rs +++ b/src/shared_memory.rs @@ -35,18 +35,19 @@ //! # } //! ``` -use std::collections::HashMap; +use linear_map::LinearMap; use device::{IDevice, DeviceType}; use memory::MemoryType; use std::marker::PhantomData; use std::{fmt, mem, error}; -#[derive(Debug)] +// #[derive(Debug)] /// Container that handles synchronization of [Memory][1] of type `T`. 
/// [1]: ../memory/index.html +#[allow(missing_debug_implementations)] // due to LinearMap pub struct SharedMemory<T> { latest_location: DeviceType, - copies: HashMap<DeviceType, MemoryType>, + copies: LinearMap<DeviceType, MemoryType>, cap: usize, phantom: PhantomData<T>, } @@ -55,7 +56,7 @@ impl<T> SharedMemory<T> { /// Create new SharedMemory by allocating [Memory][1] on a Device. /// [1]: ../memory/index.html pub fn new(dev: &DeviceType, capacity: usize) -> SharedMemory<T> { - let mut copies = HashMap::<DeviceType, MemoryType>::new(); + let mut copies = LinearMap::<DeviceType, MemoryType>::new(); let copy: MemoryType; let alloc_size = mem::size_of::<T>() * capacity; match *dev {