diff --git a/.cargo/config.toml b/.cargo/config.toml index d6f8c52..f7772d8 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,6 +1,7 @@ -[build] +# For generic x86_64 architecture. +[x86_64-unknown-linux-gnu] rustflags = ["-C", "target-cpu=native"] -# Can also specify your simd flags like avx512 etc. +# Use +avx512 to enable avx512. # https://rust-lang.github.io/packed_simd/perf-guide/target-feature/rustflags.html # rustflags = ["-C", "target-cpu=native", "-C", "target-feature=+avx,+fma"] diff --git a/Cargo.lock b/Cargo.lock index 7e7833f..038c973 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,7 +4,7 @@ version = 3 [[package]] name = "TheRustBandwidthBenchmark" -version = "0.1.0" +version = "1.0.0" dependencies = [ "clap", "num_cpus", diff --git a/Cargo.toml b/Cargo.toml index c02a5f9..eee5dad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "TheRustBandwidthBenchmark" -version = "0.1.0" +version = "1.0.0" edition = "2021" [dependencies] @@ -15,4 +15,4 @@ path = "src/main.rs" [profile.release] debug = false lto = true -incremental = false +incremental = false \ No newline at end of file diff --git a/bench b/bench index 6490326..79be342 100755 Binary files a/bench and b/bench differ diff --git a/src/kernels/copy.rs b/src/kernels/copy.rs index a84b076..e8c706b 100644 --- a/src/kernels/copy.rs +++ b/src/kernels/copy.rs @@ -1,17 +1,30 @@ use std::time::Instant; -use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator}; +use rayon::{ + iter::ParallelIterator, + slice::{ParallelSlice, ParallelSliceMut}, +}; #[allow(clippy::ptr_arg, clippy::manual_memcpy, unused_variables)] -pub fn copy(c: &mut Vec, a: &Vec, n: usize) -> f64 { - let s = Instant::now(); +#[inline(never)] +pub fn copy(c: &mut [f64], a: &[f64], n: usize) -> f64 { + let c_iter = c.par_chunks_mut(n); + let a_iter = a.par_chunks(n); - c.par_iter_mut().enumerate().for_each(|(i, x)| *x = a[i]); + let s = Instant::now(); // Serial version // for i in 0..n { // c[i] = a[i]; // } + // Parallel version + c_iter.for_each(|c_slice| { + c_slice + .iter_mut() + .enumerate() + .for_each(|(i, val)| *val = a[i]) + }); + s.elapsed().as_secs_f64() } diff --git a/src/kernels/daxpy.rs b/src/kernels/daxpy.rs index 84890ce..c9d4931 100644 --- a/src/kernels/daxpy.rs +++ b/src/kernels/daxpy.rs @@ -1,19 +1,26 @@ use std::time::Instant; -use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator}; +use rayon::{iter::ParallelIterator, slice::ParallelSliceMut}; #[allow(clippy::ptr_arg, unused_variables)] -pub fn daxpy(a: &mut Vec, b: &Vec, scalar: f64, n: usize) -> f64 { - let s = Instant::now(); +#[inline(never)] +pub fn daxpy(a: &mut [f64], b: &[f64], scalar: f64, n: usize) -> f64 { + let a_iter = a.par_chunks_mut(n); - a.par_iter_mut() - .enumerate() - .for_each(|(i, x)| *x += scalar * b[i]); + let s = Instant::now(); // Serial version // for i in 0..n { // a[i] += scalar * b[i]; // } + // Parallel version + a_iter.for_each(|a_slice| { + a_slice + .iter_mut() + .enumerate() + .for_each(|(i, val)| *val = b[i].mul_add(scalar, *val)) + }); + s.elapsed().as_secs_f64() } diff --git a/src/kernels/init.rs b/src/kernels/init.rs index 3d6a984..0ba91bc 100644 --- a/src/kernels/init.rs +++ b/src/kernels/init.rs @@ -1,17 +1,21 @@ use std::time::Instant; -use rayon::iter::{IntoParallelRefMutIterator, ParallelIterator}; +use rayon::{iter::ParallelIterator, slice::ParallelSliceMut}; #[allow(clippy::ptr_arg, unused_variables)] -pub fn init(b: &mut Vec, scalar: f64, n: usize) -> f64 { - let s = Instant::now(); +#[inline(never)] +pub fn init(b: &mut [f64], scalar: f64, n: usize) -> f64 { + let b_iter = b.par_chunks_mut(n); - b.par_iter_mut().for_each(|x| *x = scalar); + let s = Instant::now(); // Serial version // for i in b.iter_mut().take(n) { // *i = scalar; // } + // Parallel version + b_iter.for_each(|b_slice| b_slice.iter_mut().for_each(|val| *val = scalar)); + s.elapsed().as_secs_f64() } diff --git a/src/kernels/sdaxpy.rs b/src/kernels/sdaxpy.rs index 28f8341..f9490ea 100644 --- a/src/kernels/sdaxpy.rs +++ b/src/kernels/sdaxpy.rs @@ -1,19 +1,25 @@ use std::time::Instant; -use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator}; +use rayon::{iter::ParallelIterator, slice::ParallelSliceMut}; #[allow(clippy::ptr_arg, unused_variables)] -pub fn sdaxpy(a: &mut Vec, b: &Vec, c: &Vec, n: usize) -> f64 { - let s = Instant::now(); +pub fn sdaxpy(a: &mut [f64], b: &[f64], c: &[f64], n: usize) -> f64 { + let a_iter = a.par_chunks_mut(n); - a.par_iter_mut() - .enumerate() - .for_each(|(i, x)| *x += b[i] * c[i]); + let s = Instant::now(); // Serial version // for i in 0..n { // a[i] += b[i] * c[i]; // } + // Parallel version + a_iter.for_each(|a_slice| { + a_slice + .iter_mut() + .enumerate() + .for_each(|(i, val)| *val = c[i].mul_add(b[i], *val)) + }); + s.elapsed().as_secs_f64() } diff --git a/src/kernels/striad.rs b/src/kernels/striad.rs index e598b4a..1af52f9 100644 --- a/src/kernels/striad.rs +++ b/src/kernels/striad.rs @@ -1,19 +1,24 @@ use std::time::Instant; -use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator}; +use rayon::{iter::ParallelIterator, slice::ParallelSliceMut}; #[allow(clippy::ptr_arg, unused_variables)] -pub fn striad(a: &mut Vec, b: &Vec, c: &Vec, d: &Vec, n: usize) -> f64 { - let s = Instant::now(); +pub fn striad(a: &mut [f64], b: &[f64], c: &[f64], d: &[f64], n: usize) -> f64 { + let a_iter = a.par_chunks_mut(n); - a.par_iter_mut() - .enumerate() - .for_each(|(i, x)| *x = b[i] + d[i] * c[i]); + let s = Instant::now(); // Serial version // for i in 0..n { // a[i] = b[i] + d[i] * c[i]; // } + // Parallel version + a_iter.for_each(|a_slice| { + a_slice + .iter_mut() + .enumerate() + .for_each(|(i, val)| *val = c[i].mul_add(d[i], b[i])) + }); s.elapsed().as_secs_f64() } diff --git a/src/kernels/sum.rs b/src/kernels/sum.rs index a3f667c..b4bd56e 100644 --- a/src/kernels/sum.rs +++ b/src/kernels/sum.rs @@ -3,17 +3,18 @@ use std::time::Instant; use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; #[allow(clippy::ptr_arg, unused_variables)] -pub fn sum(a: &mut Vec, n: usize) -> f64 { +pub fn sum(a: &mut [f64], n: usize) -> f64 { let s = Instant::now(); - let sum = a.par_iter().sum(); - // Serial version // let mut sum = 0.0; // for i in a.iter().take(n) { // sum += *i; // } + // Parallel sum reduction + let sum = a.par_iter().sum(); + let e = s.elapsed(); a[10] = sum; diff --git a/src/kernels/triad.rs b/src/kernels/triad.rs index 874944b..811f894 100644 --- a/src/kernels/triad.rs +++ b/src/kernels/triad.rs @@ -1,19 +1,26 @@ use std::time::Instant; -use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator}; +use rayon::{iter::ParallelIterator, slice::ParallelSliceMut}; #[allow(clippy::ptr_arg, unused_variables)] -pub fn triad(a: &mut Vec, b: &Vec, c: &Vec, scalar: f64, n: usize) -> f64 { - let s = Instant::now(); +#[inline(never)] +pub fn triad(a: &mut [f64], b: &[f64], c: &[f64], scalar: f64, n: usize) -> f64 { + let a_iter = a.par_chunks_mut(n); - a.par_iter_mut() - .enumerate() - .for_each(|(i, x)| *x = b[i] + scalar * c[i]); + let s = Instant::now(); // Serial version - // for i in 0..n { - // a[i] = b[i] + scalar * c[i]; + // for i in 0..(n * 8) { + // a[i] = c[i].mul_add(scalar, b[i]); // } + // Parallel version + a_iter.for_each(|a_slice| { + a_slice + .iter_mut() + .enumerate() + .for_each(|(i, val)| *val = c[i].mul_add(scalar, b[i])) + }); + s.elapsed().as_secs_f64() } diff --git a/src/kernels/update.rs b/src/kernels/update.rs index fec0872..badd4cd 100644 --- a/src/kernels/update.rs +++ b/src/kernels/update.rs @@ -1,17 +1,20 @@ use std::time::Instant; -use rayon::iter::{IntoParallelRefMutIterator, ParallelIterator}; +use rayon::{iter::ParallelIterator, slice::ParallelSliceMut}; #[allow(clippy::ptr_arg, unused_variables)] -pub fn update(b: &mut Vec, scalar: f64, n: usize) -> f64 { - let s = Instant::now(); +pub fn update(b: &mut [f64], scalar: f64, n: usize) -> f64 { + let b_iter = b.par_chunks_mut(n); - b.par_iter_mut().for_each(|x| *x += scalar); + let s = Instant::now(); // Serial version // for i in b.iter_mut().take(n) { // *i += scalar; // } + // Parallel version + b_iter.for_each(|b_slice| b_slice.iter_mut().for_each(|val| *val += scalar)); + s.elapsed().as_secs_f64() } diff --git a/src/main.rs b/src/main.rs index 8a687b7..beed4e3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -40,6 +40,7 @@ fn main() { const BYTES_PER_WORD: usize = size_of::(); let n = arg_parser.size; let ntimes = arg_parser.ntimes; + let n_chunks = n / arg_parser.n; let num_of_benchmarks = Benchmark::Numbench as usize; @@ -115,50 +116,50 @@ fn main() { for k in 0..ntimes { bench!( Benchmark::Init as usize, - init(b.as_mut(), scalar, n), + init(b.as_mut(), scalar, n_chunks), times, k ); let tmp = a[10]; - bench!(Benchmark::Sum as usize, sum(a.as_mut(), n), times, k); + bench!(Benchmark::Sum as usize, sum(a.as_mut(), n_chunks), times, k); a[10] = tmp; bench!( Benchmark::Copy as usize, - copy(c.as_mut(), a.as_ref(), n), + copy(c.as_mut(), a.as_ref(), n_chunks), times, k ); bench!( Benchmark::Update as usize, - update(a.as_mut(), scalar, n), + update(a.as_mut(), scalar, n_chunks), times, k ); bench!( Benchmark::Triad as usize, - triad(a.as_mut(), b.as_ref(), c.as_ref(), scalar, n), + triad(a.as_mut(), b.as_ref(), c.as_ref(), scalar, n_chunks), times, k ); bench!( Benchmark::Daxpy as usize, - daxpy(a.as_mut(), b.as_ref(), scalar, n), + daxpy(a.as_mut(), b.as_ref(), scalar, n_chunks), times, k ); bench!( Benchmark::Striad as usize, - striad(a.as_mut(), b.as_ref(), c.as_ref(), d.as_ref(), n), + striad(a.as_mut(), b.as_ref(), c.as_ref(), d.as_ref(), n_chunks), times, k ); bench!( Benchmark::Sdaxpy as usize, - sdaxpy(a.as_mut(), b.as_ref(), c.as_ref(), n), + sdaxpy(a.as_mut(), b.as_ref(), c.as_ref(), n_chunks), times, k );