From 607a99d18747479c3f1e64b294972987345e910f Mon Sep 17 00:00:00 2001 From: imrn99 <95699343+imrn99@users.noreply.github.com> Date: Fri, 15 Dec 2023 14:44:06 +0100 Subject: [PATCH] Update documentation & test suite (#11) * update readme * update rustdoc homepage * moved cpp files * detailed KernelArgs doc * updated functor docs * replaced SerialForKernelType in code * fix doc test * replaced ForKernel in code * disptach module doc * routines doc * finished doc of the routines module * view module doc * view parameters doc * misc formatting * update test CI to include features * fix ci? --- .github/workflows/simple-ci.yml | 8 ++ README.md | 27 ++--- build.rs | 9 +- src/{ => cpp}/hello.cpp | 2 +- src/functor.rs | 106 +++++++++++++++++-- {include => src/include}/hello.hpp | 0 src/lib.rs | 68 ++++-------- src/routines/dispatch.rs | 100 ++++++++++++------ src/routines/mod.rs | 147 +++++++++++++++++++++++--- src/routines/parameters.rs | 26 ++++- src/view/mod.rs | 164 ++++++++++++++++++++--------- src/view/parameters.rs | 37 +++++-- 12 files changed, 508 insertions(+), 186 deletions(-) rename src/{ => cpp}/hello.cpp (82%) rename {include => src/include}/hello.hpp (100%) diff --git a/.github/workflows/simple-ci.yml b/.github/workflows/simple-ci.yml index d6b8137..3de7280 100644 --- a/.github/workflows/simple-ci.yml +++ b/.github/workflows/simple-ci.yml @@ -37,6 +37,14 @@ jobs: - uses: actions-rs/cargo@v1 with: command: test + - uses: actions-rs/cargo@v1 + with: + command: test + args: --features rayon + - uses: actions-rs/cargo@v1 + with: + command: test + args: --features threads fmt: name: Rustfmt diff --git a/README.md b/README.md index b428ac0..7bb8dce 100644 --- a/README.md +++ b/README.md @@ -9,17 +9,13 @@ proof and verification of that statement. ## Scope of the Project -~~The main focus of this Proof-of-Concept is the architecture and approach used by -Kokkos for data management. 
While multiple targets support (Serial, [rayon][2], OpenMP) -could be interesting, it is not the priority.~~ - -Rudimentary data structure implementation being done, the goal is now to write a simple -program using a `parallel_for` statement with satisfying portability as defined by Kokkos. - -Additionally, some features of Kokkos are not reproducible in Rust (GPU targetting, -templating); These create limits for the implementation that may or may not be bypassed. -This makes limit-testing an fundamental part of the project. +The goal of this project is not to produce an entire Kokkos implementation nor to +replicate the existing C++ library. While the current C++ source code is interesting +to use as inspiration, the main reference is the model description. +Additionally, because of language specific features (Rust strict compilation rules, +C++ templates), you can expect the underlying implementation of concepts to be +vastly different. ## Quickstart @@ -97,16 +93,9 @@ do. ## References -### View Implementation - -- `ndarray` Rust implementation: [link][NDARRAY] -- Const generics documentation from The Rust Reference: [link][CONSTG] -- `move` keyword semantic & implementation: [link][MOVE] +- The Kokkos Wiki: [link][1] +- `rayon` crate documentation: [link][2] [1]: https://kokkos.github.io/kokkos-core-wiki/index.html [2]: https://docs.rs/rayon/latest/rayon/ - -[NDARRAY]: https://docs.rs/ndarray/latest/ndarray/ -[CONSTG]: https://doc.rust-lang.org/reference/items/generics.html -[MOVE]: https://stackoverflow.com/questions/30288782/what-are-move-semantics-in-rust \ No newline at end of file diff --git a/build.rs b/build.rs index aea57ff..1682a86 100644 --- a/build.rs +++ b/build.rs @@ -17,7 +17,7 @@ fn main() { cxx_build::bridge("src/lib.rs") .compiler(compiler) - .file("src/hello.cpp") + .file("src/cpp/hello.cpp") .flag_if_supported("-std=c++20") .flag(ompflags) // clang .compile("poc-cc"); @@ -36,7 +36,10 @@ fn main() { } _ => unimplemented!(), } + // main 
println!("cargo:rerun-if-changed=src/main.rs"); - println!("cargo:rerun-if-changed=src/hello.cpp"); - println!("cargo:rerun-if-changed=include/hello.hpp"); + // cpp files + println!("cargo:rerun-if-changed=src/cpp/hello.cpp"); + // header files + println!("cargo:rerun-if-changed=src/include/hello.hpp"); } diff --git a/src/hello.cpp b/src/cpp/hello.cpp similarity index 82% rename from src/hello.cpp rename to src/cpp/hello.cpp index be10e21..b7f9c5f 100644 --- a/src/hello.cpp +++ b/src/cpp/hello.cpp @@ -1,4 +1,4 @@ -#include "poc-kokkos-rs/include/hello.hpp" +#include "poc-kokkos-rs/src/include/hello.hpp" #include "omp.h" #include diff --git a/src/functor.rs b/src/functor.rs index d24eda0..9709ef5 100644 --- a/src/functor.rs +++ b/src/functor.rs @@ -3,19 +3,70 @@ //! This module contains all functor and kernel related code. Its content //! is highly dependant on the features enabled since the traits that a //! kernel must satisfy changes totally depending on the backend used. +//! +//! Kernel signatures are handled using `cargo` features. Using conditional +//! compilation, the exact traits kernels must implement are adjusted according +//! to the backend used to dispatch statements. +//! +//! In order to have actual closures match the required trait implementation, +//! the same mechanism is used to define operations on [`Views`][crate::view]. -/// Kernel argument types +#[cfg(doc)] +use crate::routines::parameters::RangePolicy; + +/// Kernel argument enum +/// +/// In the Kokkos library, there is a finite number of kernel signatures. +/// Each is associated to/determined by a given execution policy. +/// In order to have kernel genericity in Rust, without introducing overhead +/// due to downcasting, the solution was to define kernel arguments as a +/// struct-like enum. /// -/// Until some work is done to have a better solution[^sol1][^sol2], this will -/// be an enum and kernels will be written in an idiomatic way. 
+/// ### Example /// -/// [^sol1]: Current tracking issue for upcasting implementation: +/// One-dimensional kernel: +/// ``` +/// // Range is defined in the execution policy +/// use poc_kokkos_rs::functor::KernelArgs; /// -/// [^sol2]: Current tracking issue to allow impl trait usage in types aliases: +/// let kern = |arg: KernelArgs<1>| match arg { +/// KernelArgs::Index1D(i) => { +/// // body of the kernel +/// println!("Hello from iteration {i}") +/// }, +/// KernelArgs::IndexND(_) => unimplemented!(), +/// KernelArgs::Handle => unimplemented!(), +/// }; +/// ``` +/// +/// 3D kernel: +/// ``` +/// use poc_kokkos_rs::functor::KernelArgs; +/// +/// // Use the array +/// let kern = |arg: KernelArgs<3>| match arg { +/// KernelArgs::Index1D(_) => unimplemented!(), +/// KernelArgs::IndexND(idx) => { // idx: [usize; 3] +/// // body of the kernel +/// println!("Hello from iteration {idx:?}") +/// }, +/// KernelArgs::Handle => unimplemented!(), +/// }; +/// +/// // Decompose the array +/// let kern = |arg: KernelArgs<3>| match arg { +/// KernelArgs::Index1D(_) => unimplemented!(), +/// KernelArgs::IndexND([i, j, k]) => { // i,j,k: usize +/// // body of the kernel +/// println!("Hello from iteration {i},{j},{k}"); +/// }, +/// KernelArgs::Handle => unimplemented!(), +/// }; +/// ``` pub enum KernelArgs<const N: usize> { - /// Arguments of a one-dimensionnal kernel (e.g. a RangePolicy). + /// Arguments of a one-dimensional kernel (e.g. a [RangePolicy][RangePolicy::RangePolicy]). Index1D(usize), - /// Arguments of a `N`-dimensionnal kernel (e.g. a MDRangePolicy). + /// Arguments of an `N`-dimensional kernel (e.g. a [MDRangePolicy][RangePolicy::MDRangePolicy]). IndexND([usize; N]), /// Arguments of a team-based kernel. Handle, @@ -23,16 +74,49 @@ pub enum KernelArgs<const N: usize> { cfg_if::cfg_if! { if #[cfg(feature = "rayon")] { - /// `rayon`-specific kernel type. + /// `parallel_for` kernel type. Depends on enabled feature(s). 
+ /// + /// This type alias is configured according to enabled feature in order to adjust + /// the signatures of kernels to match the requirements of the underlying dispatch routines. + /// + /// ### Possible Values + /// - `rayon` feature enabled: `Box) + Send + Sync + 'a>` + /// - `threads` feature enabled: `Box) + Send + 'a>` + /// - no feature enabled: fall back to [`SerialForKernelType`][SerialForKernelType] + /// + /// **Current version**: `rayon` pub type ForKernelType<'a, const N: usize> = Box) + Send + Sync + 'a>; } else if #[cfg(feature = "threads")] { - /// Standard threads specific kernel type. + /// `parallel_for` kernel type. Depends on enabled feature(s). + /// + /// This type alias is configured according to enabled feature in order to adjust + /// the signatures of kernels to match the requirements of the underlying dispatch routines. + /// + /// ### Possible Values + /// - `rayon` feature enabled: `Box) + Send + Sync + 'a>` + /// - `threads` feature enabled: `Box) + Send + 'a>` + /// - no feature enabled: fall back to [`SerialForKernelType`][SerialForKernelType] + /// + /// **Current version**: `threads` pub type ForKernelType<'a, const N: usize> = Box) + Send + 'a>; } else { - /// Fall back kernel type. + /// `parallel_for` kernel type. Depends on enabled feature(s). + /// + /// This type alias is configured according to enabled feature in order to adjust + /// the signatures of kernels to match the requirements of the underlying dispatch routines. + /// + /// ### Possible Values + /// - `rayon` feature enabled: `Box) + Send + Sync + 'a>` + /// - `threads` feature enabled: `Box) + Send + 'a>` + /// - no feature enabled: fall back to [`SerialForKernelType`][SerialForKernelType] + /// + /// **Current version**: no feature pub type ForKernelType<'a, const N: usize> = SerialForKernelType<'a, N>; } } -/// Serial kernel type. +/// Serial kernel type. Does not depend on enabled feature(s). 
+/// +/// This is the minimal required trait implementation for closures passed to a +/// `for_each` statement. pub type SerialForKernelType<'a, const N: usize> = Box) + 'a>; diff --git a/include/hello.hpp b/src/include/hello.hpp similarity index 100% rename from include/hello.hpp rename to src/include/hello.hpp diff --git a/src/lib.rs b/src/lib.rs index 96ca044..16df8d1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,71 +2,39 @@ //! //! ## Scope of the Project //! -//! ~~The main focus of this Proof-of-Concept is the architecture and approach used by -//! [Kokkos][1] for data management. While multiple targets support (Serial, [rayon][2], -//! OpenMP) could be interesting, it is not the priority.~~ +//! The goal of this project is not to produce an entire Kokkos implementation nor to +//! replicate the existing C++ library. While the current C++ source code is interesting +//! to use as inspiration, the main reference is the model description. //! -//! Rudimentary data structure implementation being done, the goal is now to write a simple -//! program using a `parallel_for` statement with satisfying portability as defined by Kokkos. -//! -//! Additionally, some features of Kokkos are not reproducible in Rust (GPU targetting, -//! templating); These create limits for the implementation, hence the existence of this PoC. -//! This makes limit-testing an fundamental part of the project. +//! Additionally, because of language specific features (Rust strict compilation rules, +//! C++ templates), you can expect the underlying implementation of concepts to be +//! vastly different. //! //! //! ## Quickstart //! -//! The PoC itself is a library, but you can run benchmarks and examples out of the box. -//! -//! ### Benchmarks -//! -//! Benchmarks can be run using the following command: +//! The PoC itself is a library, but you can run benchmarks and examples out of the box: //! //! ```bash //! # all benchmarks //! cargo bench //! # a specific benchmark -//! 
cargo bench --bench bench_name +//! cargo bench --bench +//! # a specific example +//! cargo run --example //! ``` //! -//! All results are compiled to the `target/criterion/` folder. The following -//! benchmarks are available: -//! -//! **Layout:** -//! - `layout-comparison`: Bench a Matrix-Matrix product three times, using the worst possible layout, -//! the usual layout, and then the optimal layout for the operation. This shows the importance of layout -//! selection for performances. -//! - `layout-size`: Bench a Matrix-Matrix product using the usual layout and the optimal layout, -//! over a range of sizes for the square matrices. This shows the influence of cache size over -//! layout importance. -//! **Computation:** -//! - `axpy` / `gemv` / `gemm`: Measure speedup on basic BLAS implementations by running the same kernel -//! in serial mode first, then using parallelization on CPU. _Meant to be executed using features_. -//! - `hardcoded_gemm`: Compute the same operations as the `gemm` benchmark, but using a hardcoded implementation -//! instead of methods from the PoC. Used to assess the additional cost induced by the library. -//! **Library overhead:** -//! - `view_init`: Compare initialization performances of regular vectors to [Views][view]; This -//! is used to spot potential scaling issues induced by the more complex structure of Views. -//! - `view_access`: Compare data access performances of regular vectors to [Views][view]; This -//! is used to spot potential scaling issues induced by the more complex structure of Views. -//! -//! Additionally, a kokkos-equivalent of the blas kernels can be found in the `blas-speedup-kokkos/` -//! subdirectory. These are far from being the most optimized implementation, instead they are written -//! as close-ish counterparts to the Rust benchmarks. -//! -//! ### Examples +//! Generate local documentation: //! //! ```bash -//! cargo run --example hello-world +//! cargo doc --no-deps --open //! ``` //! -//! 
The following examples are available: -//! -//! - `hello_world`: ... -//! - `hello_world_omp`: ... +//! Note that some elements of the documentation are feature specific. //! +//! ## Compilation //! -//! ## Features +//! ### Features //! //! Using `features`, the crate can be compiled to use different backend for execution of parallel section. //! These can also be enabled in benchmarks. @@ -81,13 +49,13 @@ //! - `threads` : Uses [`std::thread`] methods to handle parallelization on CPU. //! - `gpu`: Currently used as a way to gate GPU usage as this cannot be done in pure Rust. //! -//! ## Compilation +//! ### C++ Interoperability //! //! The build script will read the `CXX` environment variable to choose which C++ compiler to use //! for Rust/C++ interop. Note that the crate itself does not currently use C++ code, only examples //! do. //! -//! ### Known issues +//! #### Known issues //! //! - On MacOs: Does not work with Apple Clang //! - Solution: Homebrew Clang or tinker with flags to get OpenMP to work @@ -106,7 +74,7 @@ pub mod ffi { // C++ types and signatures exposed to Rust. unsafe extern "C++" { - include!("poc-kokkos-rs/include/hello.hpp"); + include!("poc-kokkos-rs/src/include/hello.hpp"); fn say_hello(); diff --git a/src/routines/dispatch.rs b/src/routines/dispatch.rs index f46335b..7de6018 100644 --- a/src/routines/dispatch.rs +++ b/src/routines/dispatch.rs @@ -3,6 +3,12 @@ //! This module contains all code used to dispatch computational kernels //! onto specified devices. Note that the documentation is feature-specific when the //! items are, i.e. documentation is altered by enabled features. +//! +//! The methods described in this module are not meant to be used directly; they are only +//! building blocks for the parallel statements. 
+ +#[cfg(any(doc, feature = "rayon", feature = "gpu"))] +use crate::functor::ForKernelType; #[cfg(feature = "rayon")] use rayon::prelude::*; @@ -10,7 +16,7 @@ use rayon::prelude::*; use std::{fmt::Display, ops::Range}; use super::parameters::{ExecutionPolicy, RangePolicy}; -use crate::functor::KernelArgs; +use crate::functor::{KernelArgs, SerialForKernelType}; // enums @@ -50,16 +56,13 @@ impl std::error::Error for DispatchError { /// Builds a N-depth nested loop executing a kernel using the N resulting indices. /// Technically, this should be replaced by a tiling function, for both serial and parallel -/// implementations. In practice, the cost of tiling might be too high in a serial context. -fn recursive_loop( - ranges: &[Range; N], - mut kernel: Box)>, -) { +/// implementations. +fn recursive_loop(ranges: &[Range; N], mut kernel: SerialForKernelType) { // handles recursions fn inner( current_depth: usize, ranges: &[Range; N], - kernel: &mut Box)>, + kernel: &mut SerialForKernelType, indices: &mut [usize; N], ) { if current_depth == N { @@ -83,12 +86,14 @@ fn recursive_loop( // serial dispatch -/// Dispatch routine for serial backend. +/// CPU dispatch routine of `for` statements. Does not depend on enabled feature(s). /// -/// This also serve as the fallback CPU dispatch routine in specific cases. +/// The dispatch function executes the kernel according to the directives contained in the +/// execution policy. The kernel signature does not vary according to enabled features as this +/// is the invariant fallback dispatch routine. pub fn serial( execp: ExecutionPolicy, - kernel: Box)>, + kernel: SerialForKernelType, ) -> Result<(), DispatchError> { match execp.range { RangePolicy::RangePolicy(range) => { @@ -151,12 +156,24 @@ pub fn serial( cfg_if::cfg_if! { if #[cfg(feature = "threads")] { - /// Dispatch routine for CPU parallelization. + /// CPU dispatch routine of `for` statements. Implementation depends on enabled feature(s). 
+ /// + /// The dispatch function executes the kernel according to the directives contained in the + /// execution policy. The kernel signature varies according to enabled features. + /// + /// ### Possible Kernel Signatures /// - /// Backend-specific function for [std::thread] usage. + /// - `rayon` feature enabled: [`ForKernelType`] + /// - `threads` feature enabled: `Box) + Send + Sync + 'a + Clone>` + /// - no feature enabled: fall back to [`SerialForKernelType`] + /// + /// The `threads` implementation cannot currently use the generic [`ForKernelType`] because + /// of the Clone requirement. + /// + /// **Current version**: `threads` pub fn cpu<'a, const N: usize>( execp: ExecutionPolicy, - kernel: Box) + Send + Sync + 'a + Clone>, + kernel: Box) + Send + Sync + 'a + Clone>, // cannot be replaced by functor type bc of Clone ) -> Result<(), DispatchError> { match execp.range { RangePolicy::RangePolicy(range) => { @@ -172,7 +189,6 @@ cfg_if::cfg_if! { // use scope to avoid 'static lifetime reqs std::thread::scope(|s| { let handles: Vec<_> = indices.chunks(chunk_size).map(|chunk| { - // rebuild the kernel from the copied raw pointer s.spawn(|| chunk.iter().map(|idx_ref| KernelArgs::Index1D(*idx_ref)).for_each(kernel.clone())) }).collect(); @@ -218,12 +234,24 @@ cfg_if::cfg_if! { Ok(()) } } else if #[cfg(feature = "rayon")] { - /// Dispatch routine for CPU parallelization. + /// CPU dispatch routine of `for` statements. Implementation depends on enabled feature(s). + /// + /// The dispatch function executes the kernel according to the directives contained in the + /// execution policy. The kernel signature varies according to enabled features. + /// + /// ### Possible Kernel Signatures /// - /// Backend-specific function for [rayon](https://docs.rs/rayon/latest/rayon/) usage. 
+ /// - `rayon` feature enabled: [`ForKernelType`] + /// - `threads` feature enabled: `Box) + Send + Sync + 'a + Clone>` + /// - no feature enabled: fall back to [`SerialForKernelType`] + /// + /// The `threads` implementation cannot currently use the generic [`ForKernelType`] because + /// of the Clone requirement. + /// + /// **Current version**: `rayon` pub fn cpu<'a, const N: usize>( execp: ExecutionPolicy, - kernel: Box) + Send + Sync + 'a + Clone>, + kernel: ForKernelType, ) -> Result<(), DispatchError> { match execp.range { RangePolicy::RangePolicy(range) => { @@ -276,12 +304,24 @@ cfg_if::cfg_if! { Ok(()) } } else { - /// Dispatch routine for CPU parallelization. + /// CPU dispatch routine of `for` statements. Implementation depends on enabled feature(s). + /// + /// The dispatch function executes the kernel according to the directives contained in the + /// execution policy. The kernel signature varies according to enabled features. + /// + /// ### Possible Kernel Signatures /// - /// Backend-specific function that falls back to serial execution. + /// - `rayon` feature enabled: [`ForKernelType`] + /// - `threads` feature enabled: `Box) + Send + Sync + 'a + Clone>` + /// - no feature enabled: fall back to [`SerialForKernelType`] + /// + /// The `threads` implementation cannot currently use the generic [`ForKernelType`] because + /// of the Clone requirement. + /// + /// **Current version**: no feature pub fn cpu( execp: ExecutionPolicy, - kernel: Box)>, + kernel: SerialForKernelType, ) -> Result<(), DispatchError> { serial(execp, kernel) } @@ -290,18 +330,18 @@ cfg_if::cfg_if! { cfg_if::cfg_if! { if #[cfg(feature = "gpu")] { - /// Dispatch routine for GPU parallelization. UNIMPLEMENTED + /// GPU Dispatch routine of `for` statements. 
UNIMPLEMENTED pub fn gpu<'a, const N: usize>( - execp: ExecutionPolicy, - kernel: Box) + Send + Sync + 'a + Clone>, + _execp: ExecutionPolicy, + _kernel: ForKernelType, ) -> Result<(), DispatchError> { - serial(execp, kernel) + unimplemented!() } } else { - /// Dispatch routine for GPU parallelization. UNIMPLEMENTED - pub fn gpu< const N: usize>( + /// GPU Dispatch routine of `for` statements. UNIMPLEMENTED + pub fn gpu( execp: ExecutionPolicy, - kernel: Box)>, + kernel: SerialForKernelType, ) -> Result<(), DispatchError> { serial(execp, kernel) } @@ -321,7 +361,7 @@ mod tests { }; // fixes warnings when testing using a parallel feature cfg_if::cfg_if! { - if #[cfg(any(feature = "threads", feature = "rayon"))] { + if #[cfg(any(feature = "threads", feature = "rayon", feature = "gpu"))] { let mat = ViewOwned::new_from_data(vec![0.0; 15], Layout::Right, [15]); } else { let mut mat = ViewOwned::new_from_data(vec![0.0; 15], Layout::Right, [15]); @@ -356,7 +396,7 @@ mod tests { }; // fixes warnings when testing using a parallel feature cfg_if::cfg_if! { - if #[cfg(any(feature = "threads", feature = "rayon"))] { + if #[cfg(any(feature = "threads", feature = "rayon", feature = "gpu"))] { let mat = ViewOwned::new_from_data(vec![0.0; 150], Layout::Right, [10, 15]); } else { let mut mat = ViewOwned::new_from_data(vec![0.0; 150], Layout::Right, [10, 15]); @@ -392,7 +432,7 @@ mod tests { // fixes warnings when testing using a parallel feature cfg_if::cfg_if! { - if #[cfg(any(feature = "threads", feature = "rayon"))] { + if #[cfg(any(feature = "threads", feature = "rayon", feature = "gpu"))] { let mat = ViewOwned::new_from_data(vec![0.0; 15], Layout::Right, [15]); } else { let mut mat = ViewOwned::new_from_data(vec![0.0; 15], Layout::Right, [15]); diff --git a/src/routines/mod.rs b/src/routines/mod.rs index 0d576a5..0f4bf69 100644 --- a/src/routines/mod.rs +++ b/src/routines/mod.rs @@ -1,11 +1,15 @@ //! parallel statement related code //! -//! 
This module contains code used for the implementations of parallel statements, e.g. -//! `parallel_for`, a Kokkos specific implementation of the commonly used pattern. -//! -//! `parallel_for` is currently the only statement considered for implementation; +//! This module contains code used for the implementation of parallel statements, e.g. +//! `parallel_for`, a Kokkos specific implementation of commonly used patterns. //! //! Parameters of aforementionned statements are defined in the [`parameters`] sub-module. +//! +//! Dispatch code is defined in the [`dispatch`] sub-module. +//! +//! Currently implemented statements: +//! +//! - `parallel_for` pub mod dispatch; pub mod parameters; @@ -26,7 +30,7 @@ pub enum StatementError { Dispatch(DispatchError), /// Error raised when parallel hierarchy isn't respected. InconsistentDepth, - /// ... + /// What did I mean by this? InconsistentExecSpace, } @@ -44,7 +48,7 @@ impl Display for StatementError { write!(f, "inconsistent depth & range policy association") } StatementError::InconsistentExecSpace => { - write!(f, "inconsistent depth & range policy association") + write!(f, "?") } } } @@ -65,11 +69,41 @@ impl std::error::Error for StatementError { // All of this would be half as long if impl trait in type aliases was stabilized cfg_if::cfg_if! { - if #[cfg(any(feature = "rayon", feature = "threads", feature = "gpu"))] { - - /// Parallel For statement. The `const` generic argument should - /// be `0`, `1`, or `2` according to its position in a nested structure - /// (`0` being the most outer level, `2` the most inner level). + if #[cfg(feature = "threads")] { + /// Parallel For statement. 
+ /// + /// **Current version**: `threads` + /// + /// ### Example + /// + /// ```rust + /// use poc_kokkos_rs::{ + /// functor::KernelArgs, + /// routines::{ + /// parallel_for, + /// parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule}, + /// }, + /// }; + /// + /// let length: usize = 8; + /// + /// let kern = |arg: KernelArgs<1>| match arg { + /// KernelArgs::Index1D(i) => { + /// // body of the kernel + /// println!("Hello from iteration {i}") + /// }, + /// KernelArgs::IndexND(_) => unimplemented!(), + /// KernelArgs::Handle => unimplemented!(), + /// }; + /// + /// let execp = ExecutionPolicy { + /// space: ExecutionSpace::DeviceCPU, + /// range: RangePolicy::RangePolicy(0..length), + /// schedule: Schedule::Static, + /// }; + /// + /// parallel_for(execp, kern).unwrap(); + /// ``` pub fn parallel_for( execp: ExecutionPolicy, func: impl Fn(KernelArgs) + Send + Sync + Clone, @@ -86,13 +120,98 @@ cfg_if::cfg_if! { parameters::ExecutionSpace::DeviceGPU => dispatch::gpu(execp, kernel), }; + // Ok or converts error + res.map_err(|e| e.into()) + } + } else if #[cfg(feature = "rayon")] { + /// Parallel For statement. 
+ /// + /// **Current version**: `rayon` + /// + /// ### Example + /// + /// ```rust + /// use poc_kokkos_rs::{ + /// functor::KernelArgs, + /// routines::{ + /// parallel_for, + /// parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule}, + /// }, + /// }; + /// + /// let length: usize = 8; + /// + /// let kern = |arg: KernelArgs<1>| match arg { + /// KernelArgs::Index1D(i) => { + /// // body of the kernel + /// println!("Hello from iteration {i}") + /// }, + /// KernelArgs::IndexND(_) => unimplemented!(), + /// KernelArgs::Handle => unimplemented!(), + /// }; + /// + /// let execp = ExecutionPolicy { + /// space: ExecutionSpace::DeviceCPU, + /// range: RangePolicy::RangePolicy(0..length), + /// schedule: Schedule::Static, + /// }; + /// + /// parallel_for(execp, kern).unwrap(); + /// ``` + pub fn parallel_for( + execp: ExecutionPolicy, + func: impl Fn(KernelArgs) + Send + Sync, + ) -> Result<(), StatementError> { + // checks... + + // data prep? + let kernel = Box::new(func); + + // dispatch + let res = match execp.space { + parameters::ExecutionSpace::Serial => dispatch::serial(execp, kernel), + parameters::ExecutionSpace::DeviceCPU => dispatch::cpu(execp, kernel), + parameters::ExecutionSpace::DeviceGPU => dispatch::gpu(execp, kernel), + }; + // Ok or converts error res.map_err(|e| e.into()) } } else { - /// Parallel For statement. The `const` generic argument should - /// be `0`, `1`, or `2` according to its position in a nested structure - /// (`0` being the most outer level, `2` the most inner level). + /// Parallel For statement. 
+ /// + /// **Current version**: no feature + /// + /// ### Example + /// + /// ```rust + /// use poc_kokkos_rs::{ + /// functor::KernelArgs, + /// routines::{ + /// parallel_for, + /// parameters::{ExecutionPolicy, ExecutionSpace, RangePolicy, Schedule}, + /// }, + /// }; + /// + /// let length: usize = 8; + /// + /// let kern = |arg: KernelArgs<1>| match arg { + /// KernelArgs::Index1D(i) => { + /// // body of the kernel + /// println!("Hello from iteration {i}") + /// }, + /// KernelArgs::IndexND(_) => unimplemented!(), + /// KernelArgs::Handle => unimplemented!(), + /// }; + /// + /// let execp = ExecutionPolicy { + /// space: ExecutionSpace::DeviceCPU, + /// range: RangePolicy::RangePolicy(0..length), + /// schedule: Schedule::Static, + /// }; + /// + /// parallel_for(execp, kern).unwrap(); + /// ``` pub fn parallel_for( execp: ExecutionPolicy, func: impl FnMut(KernelArgs), diff --git a/src/routines/parameters.rs b/src/routines/parameters.rs index c257306..ac9a43b 100644 --- a/src/routines/parameters.rs +++ b/src/routines/parameters.rs @@ -28,8 +28,9 @@ pub enum ExecutionSpace { } #[derive(Debug, Clone)] -/// Range Policy enum. This holds information related to the looping structure -/// adopted by the routine. +/// Range Policy enum. +/// +/// This holds information related to the looping structure adopted by the routine. pub enum RangePolicy { // Outer range /// 1D iteration range. @@ -70,7 +71,7 @@ pub enum RangePolicy { ThreadVectorMDRange, } -/// Schedule enum. CURRENTLY IGNORED. +/// Scheduling enum. CURRENTLY IGNORED. /// /// Used to set the workload scheduling policy. Defaults to [Schedule::Static]. #[derive(Debug, Default, Clone)] @@ -86,6 +87,25 @@ pub enum Schedule { #[derive(Debug, Clone)] /// Execution Policy enum. See Kokkos documentation for explanation on their model. 
+/// +/// ### Example +/// +/// ```rust +/// use poc_kokkos_rs::routines::parameters::{ +/// ExecutionPolicy, +/// ExecutionSpace, +/// RangePolicy, +/// Schedule +/// }; +/// +/// let length: usize = 8; +/// +/// let execp = ExecutionPolicy::<1> { +/// space: ExecutionSpace::DeviceCPU, // will try to parallelize code on CPU +/// range: RangePolicy::RangePolicy(0..length), // equivalent to "for i in 0..length" +/// schedule: Schedule::Static, // static division of workload +/// }; +/// ``` pub struct ExecutionPolicy { /// Execution space targetted by the dispatch. pub space: ExecutionSpace, diff --git a/src/view/mod.rs b/src/view/mod.rs index f1e0d48..076831a 100644 --- a/src/view/mod.rs +++ b/src/view/mod.rs @@ -4,19 +4,54 @@ //! defined and used by the Kokkos library. There are different types of views, all //! implemented using the same backend, [ViewBase]. //! +//! Eventually, the different types of Views should be removed and replaced by a single +//! type. The distinction between original and mirrors doesn't seem necessary in a Rust +//! implementation where the ownership system handles all memory transaction. +//! +//! In order to have thread-safe structures to use in parallel statement, the inner data +//! type of views is adjusted implicitly when compiling using parallelization features. +//! To match the adjusted data type, view access is done through `get` and `set` methods, +//! allowing for feature-specific mutability in signatures while keeping a consistent user +//! API. +//! //! Parameters of aforementionned views are defined in the [`parameters`] sub-module. //! +//! ### Example +//! +//! Initialize and fill a 2D matrix: +//! ```rust +//! use poc_kokkos_rs::view::{ +//! parameters::Layout, +//! ViewOwned, +//! }; +//! +//! let mut viewA: ViewOwned<'_, 2, f64> = ViewOwned::new( +//! Layout::Right, // see parameters & Kokkos doc +//! [3, 5], // 3 rows, 5 columns +//! ); +//! +//! for row in 0..3 { +//! for col in 0..5 { +//! 
viewA.set([row, col], row as f64); +//! } +//! } +//! +//! // viewA: +//! // (0.0 0.0 0.0 0.0 0.0) +//! // (1.0 1.0 1.0 1.0 1.0) +//! // (2.0 2.0 2.0 2.0 2.0) +//! ``` pub mod parameters; #[cfg(any(feature = "rayon", feature = "threads", feature = "gpu"))] -use atomic::Atomic; +use atomic::{Atomic, Ordering}; -#[cfg(not(any(feature = "rayon", feature = "threads", feature = "gpu")))] +#[cfg(any(doc, not(any(feature = "rayon", feature = "threads", feature = "gpu"))))] use std::ops::IndexMut; use self::parameters::{compute_stride, DataTraits, DataType, InnerDataType, Layout}; -use std::{fmt::Debug, ops::Index, sync::Arc}; +use std::{fmt::Debug, ops::Index}; #[derive(Debug)] /// Enum used to classify view-related errors. @@ -27,7 +62,7 @@ pub enum ViewError<'a> { DoubleMirroring(&'a str), } -#[derive(Debug)] +#[derive(Debug, PartialEq)] /// Common structure used as the backend of all `View` types. The main differences between /// usable types is the type of the `data` field. pub struct ViewBase<'a, const N: usize, T> @@ -35,8 +70,7 @@ where T: DataTraits, { /// Data container. Depending on the type, it can be a vector (`Owned`), a reference - /// (`ReadOnly`), a mutable reference (`ReadWrite`) or an `Arc<>` pointing on a vector - /// (`Shared`). + /// (`ReadOnly`) or a mutable reference (`ReadWrite`). pub data: DataType<'a, T>, /// Memory layout of the view. Refer to Kokkos documentation for more information. pub layout: Layout, @@ -57,8 +91,7 @@ impl<'a, const N: usize, T> ViewBase<'a, N, T> where T: DataTraits, // fair assumption imo { - /// Constructor used to create owned (and shared?) views. See dedicated methods for - /// others. + /// Constructor used to create owned views. See dedicated methods for others. pub fn new(layout: Layout, dim: [usize; N]) -> Self { // compute stride & capacity let stride = compute_stride(&dim, &layout); @@ -73,8 +106,7 @@ where } } - /// Constructor used to create owned (and shared?) views. See dedicated methods for - /// others. 
+ /// Constructor used to create owned views. See dedicated methods for others. pub fn new_from_data(data: Vec, layout: Layout, dim: [usize; N]) -> Self { // compute stride if necessary let stride = compute_stride(&dim, &layout); @@ -99,8 +131,7 @@ impl<'a, const N: usize, T> ViewBase<'a, N, T> where T: DataTraits, // fair assumption imo { - /// Constructor used to create owned (and shared?) views. See dedicated methods for - /// others. + /// Constructor used to create owned views. See dedicated methods for others. pub fn new(layout: Layout, dim: [usize; N]) -> Self { // compute stride & capacity let stride = compute_stride(&dim, &layout); @@ -115,8 +146,7 @@ where } } - /// Constructor used to create owned (and shared?) views. See dedicated methods for - /// others. + /// Constructor used to create owned views. See dedicated methods for others. pub fn new_from_data(data: Vec, layout: Layout, dim: [usize; N]) -> Self { // compute stride if necessary let stride = compute_stride(&dim, &layout); @@ -135,42 +165,89 @@ where } } +// ~~~~~~~~ Uniform writing interface across all features impl<'a, const N: usize, T> ViewBase<'a, N, T> where T: DataTraits, { - // ~~~~~~~~ Uniform writing interface across all features - #[inline(always)] #[cfg(not(any(feature = "rayon", feature = "threads", feature = "gpu")))] - /// Serial writing interface. Uses mutable indexing implementation. + /// Writing interface. + /// + /// Two different implementations of this method are defined in order to satisfy + /// the (im)mutability requirements when using parallelization features & keep a + /// consistent user API: + /// + /// - any feature enabled: implicitly use an atomic store operation on top of the + /// regular [Index] trait implementation to prevent a mutable borrow. The store + /// currently uses relaxed ordering, this may change. + /// - no feature enabled: uses a regular [IndexMut] trait implementation.
+ /// + /// Note that [Index] is always implemented while [IndexMut] only is when no + /// features are enabled. + /// + /// **Current version**: no feature pub fn set(&mut self, index: [usize; N], val: T) { self[index] = val; } #[inline(always)] #[cfg(any(feature = "rayon", feature = "threads", feature = "gpu"))] - /// Thread-safe writing interface. Uses non-mutable indexing and - /// immutability of atomic type methods. + /// Writing interface. + /// + /// Two different implementations of this method are defined in order to satisfy + /// the (im)mutability requirements when using parallelization features & keep a + /// consistent user API: /// - /// Uses [atomic::Ordering::Relaxed], may be subject to change. + /// - any feature enabled: implicitly use an atomic store operation on top of the + /// regular [Index] trait implementation to prevent a mutable borrow. The store + /// currently uses relaxed ordering, this may change. + /// - no feature enabled: uses a regular [IndexMut] trait implementation. + /// + /// Note that [Index] is always implemented while [IndexMut] only is when no + /// features are enabled. + /// + /// **Current version**: thread-safe pub fn set(&self, index: [usize; N], val: T) { - self[index].store(val, atomic::Ordering::Relaxed); + self[index].store(val, Ordering::Relaxed); } #[inline(always)] #[cfg(not(any(feature = "rayon", feature = "threads", feature = "gpu")))] - /// Serial writing interface. Uses mutable indexing implementation. + /// Reading interface. + /// + /// Two different implementations of this method are defined in order to keep a + /// consistent user API across features: + /// + /// - any feature enabled: implicitly use an atomic load operation on top of the + /// regular [Index] trait implementation. The load currently uses relaxed ordering, + /// this may change. + /// - no feature enabled: uses the regular [Index] trait implementation.
+ /// + /// Note that [Index] is always implemented while [IndexMut] only is when no + /// features are enabled. + /// + /// **Current version**: no feature pub fn get(&self, index: [usize; N]) -> T { self[index] } #[inline(always)] #[cfg(any(feature = "rayon", feature = "threads", feature = "gpu"))] - /// Thread-safe writing interface. Uses non-mutable indexing and - /// immutability of atomic type methods. + /// Reading interface. + /// + /// Two different implementations of this method are defined in order to keep a + /// consistent user API across features: + /// + /// - any feature enabled: implicitly use an atomic load operation on top of the + /// regular [Index] trait implementation. The load currently uses relaxed ordering, + /// this may change. + /// - no feature enabled: uses the regular [Index] trait implementation. + /// + /// Note that [Index] is always implemented while [IndexMut] only is when no + /// features are enabled. /// - /// Uses [atomic::Ordering::Relaxed], may be subject to change. + /// **Current version**: thread-safe pub fn get(&self, index: [usize; N]) -> T { self[index].load(atomic::Ordering::Relaxed) } @@ -234,8 +311,14 @@ where // ~~~~~~~~ Convenience - #[cfg(not(any(feature = "rayon", feature = "threads", feature = "gpu")))] - pub fn raw_val<'b>(self) -> Result>, ViewError<'b>> { + #[cfg(all( + test, + not(any(feature = "rayon", feature = "threads", feature = "gpu")) + ))] + /// Consumes the view to return a `Vec` containing its raw data content.
+ /// + /// This method is meant to be used in tests pub fn raw_val<'b>(self) -> Result, ViewError<'b>> { if let DataType::Owned(v) = self.data { Ok(v.iter() @@ -259,6 +345,7 @@ where } #[inline(always)] + /// Mapping function between N-indices and the flat offset. pub fn flat_idx(&self, index: [usize; N]) -> usize { index .iter() @@ -268,7 +355,7 @@ where } } -/// Read-only access is always implemented. +/// **Read-only access is always implemented.** impl<'a, const N: usize, T> Index<[usize; N]> for ViewBase<'a, N, T> where T: DataTraits, @@ -295,8 +382,7 @@ where } #[cfg(not(any(feature = "rayon", feature = "threads", feature = "gpu")))] -/// Read-write access is implemented using [IndexMut] trait when no parallel -/// features are enabled. +/// **Read-write access is only implemented when no parallel features are enabled.** impl<'a, const N: usize, T> IndexMut<[usize; N]> for ViewBase<'a, N, T> where T: DataTraits, @@ -317,18 +403,6 @@ where } } -impl<'a, const N: usize, T: PartialEq + Debug> PartialEq for ViewBase<'a, N, T> -where - T: DataTraits, -{ - fn eq(&self, other: &Self) -> bool { - self.data == other.data - && self.layout == other.layout - && self.dim == other.dim - && self.stride == other.stride - } -} - /// View type owning the data it yields access to, i.e. "original" view. pub type ViewOwned<'a, const N: usize, T> = ViewBase<'a, N, T>; @@ -339,7 +413,3 @@ pub type ViewRO<'a, const N: usize, T> = ViewBase<'a, N, T>; /// View type owning a mutable borrow to the data it yields access to, i.e. a /// read-write mirror. pub type ViewRW<'a, const N: usize, T> = ViewBase<'a, N, T>; - -/// View type owning a shared reference to the data it yields access to, i.e. a -/// thread-safe read-only mirror. Is this useful ? Shouldn't this be `Arc>` ? 
-pub type ViewShared<'a, const N: usize, T> = ViewBase<'a, N, Arc>; diff --git a/src/view/parameters.rs b/src/view/parameters.rs index 8a88788..5497acb 100644 --- a/src/view/parameters.rs +++ b/src/view/parameters.rs @@ -10,9 +10,7 @@ //! Possible future implementations include: //! //! - Memory space -//! - Memory traits? -//! -//! +//! - Memory traits use std::fmt::Debug; @@ -22,21 +20,47 @@ use atomic::Atomic; /// Maximum possible depth (i.e. number of dimensions) for a view. pub const MAX_VIEW_DEPTH: usize = 8; +/// Supertrait with common trait that elements of a View should implement. pub trait DataTraits: Debug + Clone + Copy + Default {} impl DataTraits for f64 {} impl DataTraits for f32 {} #[cfg(not(any(feature = "rayon", feature = "threads", feature = "gpu")))] +/// Generic alias for elements of type `T` of a View. +/// +/// This alias automatically changes according to features to ensure thread-safety +/// of Views. There are two possible values: +/// +/// - any feature enabled: `InnerDataType = Atomic`. By adding the atomic wrapping, +/// operations on views can be implemented using thread-safe methods. +/// - no feature enabled: `InnerDataType = T`. +/// +/// **Current version**: no feature pub type InnerDataType = T; #[cfg(any(feature = "rayon", feature = "threads", feature = "gpu"))] +/// Generic alias for elements of type `T` of a View. +/// +/// This alias automatically changes according to features to ensure thread-safety +/// of Views. There are two possible values: +/// +/// - any feature enabled: `InnerDataType = Atomic`. By adding the atomic wrapping, +/// operations on views can be implemented using thread-safe methods. +/// - no feature enabled: `InnerDataType = T`. +/// +/// **Current version**: thread-safe pub type InnerDataType = Atomic; #[derive(Debug)] -/// Enum used to identify the type of data the view is holding. See variants for more -/// information. 
The policy used to implement the [PartialEq] trait is based on Kokkos' +/// Enum used to identify the type of data the view is holding. +/// +/// This should eventually be removed. See the [view][crate::view] module documentation +/// for more information. +/// +/// The policy used to implement the [PartialEq] trait is based on Kokkos' /// [`equal` algorithm](https://kokkos.github.io/kokkos-core-wiki/API/algorithms/std-algorithms/all/StdEqual.html). +/// Essentially, it corresponds to equality by reference instead of equality by value. pub enum DataType<'a, T> where T: DataTraits, @@ -49,9 +73,6 @@ where MutBorrowed(&'a mut [InnerDataType]), } -/// Kokkos implements equality check by comparing the pointers, i.e. -/// two views are "equal" if and only if their data field points to the -/// same memory space. impl<'a, T> PartialEq for DataType<'a, T> where T: DataTraits,