Skip to content

Commit

Permalink
Update or parallel kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
adityauj committed Sep 12, 2024
1 parent 9e449ca commit c781042
Show file tree
Hide file tree
Showing 13 changed files with 102 additions and 54 deletions.
5 changes: 3 additions & 2 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[build]
# For generic x86_64 architecture.
[x86_64-unknown-linux-gnu]
rustflags = ["-C", "target-cpu=native"]

# You can also specify SIMD target features explicitly (e.g. AVX-512).
# Use e.g. +avx512f to enable AVX-512; there is no plain "avx512" target feature.
# https://rust-lang.github.io/packed_simd/perf-guide/target-feature/rustflags.html
# rustflags = ["-C", "target-cpu=native", "-C", "target-feature=+avx,+fma"]
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "TheRustBandwidthBenchmark"
version = "0.1.0"
version = "1.0.0"
edition = "2021"

[dependencies]
Expand All @@ -15,4 +15,4 @@ path = "src/main.rs"
[profile.release]
debug = false
lto = true
incremental = false
incremental = false
Binary file modified bench
Binary file not shown.
21 changes: 17 additions & 4 deletions src/kernels/copy.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,30 @@
use std::time::Instant;

use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};
use rayon::{
iter::ParallelIterator,
slice::{ParallelSlice, ParallelSliceMut},
};

// Review note: this is a rendered diff of `copy`, so removed and added lines are
// interleaved without +/- markers. The `&mut Vec<f64>` signature and the
// `par_iter_mut` line appear to be the pre-commit version; the slice signature and
// the `par_chunks_mut` loop appear to be the post-commit version.
//
// Post-commit behaviour: walks `c` in parallel chunks of at most `n` elements,
// assigns values read from `a`, and returns the elapsed time in seconds.
#[allow(clippy::ptr_arg, clippy::manual_memcpy, unused_variables)]
pub fn copy(c: &mut Vec<f64>, a: &Vec<f64>, n: usize) -> f64 {
let s = Instant::now();
#[inline(never)]
pub fn copy(c: &mut [f64], a: &[f64], n: usize) -> f64 {
// `n` is the chunk length here, not the element count used by the serial loop below.
// NOTE(review): rayon's chunking presumably rejects a chunk length of 0 — confirm.
let c_iter = c.par_chunks_mut(n);
// NOTE(review): `a_iter` is never consumed; the closure below indexes `a` directly
// (the `unused_variables` allow above hides the resulting warning).
let a_iter = a.par_chunks(n);

c.par_iter_mut().enumerate().for_each(|(i, x)| *x = a[i]);
// Timer starts after the chunk iterators are built, so their setup is not measured.
let s = Instant::now();

// Serial version
// for i in 0..n {
// c[i] = a[i];
// }

// Parallel version
// NOTE(review): `i` comes from `enumerate()` on the chunk, so it restarts at 0 for
// every chunk. `a[i]` therefore reads only the first `c_slice.len()` elements of `a`
// instead of the elements aligned with this chunk of `c`. Zipping `c_iter` with
// `a_iter` would pair matching chunks.
c_iter.for_each(|c_slice| {
c_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = a[i])
});

s.elapsed().as_secs_f64()
}
19 changes: 13 additions & 6 deletions src/kernels/daxpy.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
use std::time::Instant;

use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};
use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};

// Review note: this is a rendered diff of `daxpy`, so removed and added lines are
// interleaved without +/- markers. The `&mut Vec<f64>` signature and the
// `par_iter_mut` chain appear to be the pre-commit version; the slice signature and
// the `par_chunks_mut` loop appear to be the post-commit version.
//
// Post-commit behaviour: walks `a` in parallel chunks of at most `n` elements,
// updates each element to `b[i] * scalar + a_elem`, and returns the elapsed time in
// seconds.
#[allow(clippy::ptr_arg, unused_variables)]
pub fn daxpy(a: &mut Vec<f64>, b: &Vec<f64>, scalar: f64, n: usize) -> f64 {
let s = Instant::now();
#[inline(never)]
pub fn daxpy(a: &mut [f64], b: &[f64], scalar: f64, n: usize) -> f64 {
// `n` is the chunk length here, not the element count used by the serial loop below.
let a_iter = a.par_chunks_mut(n);

a.par_iter_mut()
.enumerate()
.for_each(|(i, x)| *x += scalar * b[i]);
// Timer starts after the chunk iterator is built, so its setup is not measured.
let s = Instant::now();

// Serial version
// for i in 0..n {
// a[i] += scalar * b[i];
// }

// Parallel version
// NOTE(review): `i` comes from `enumerate()` on the chunk, so it restarts at 0 for
// every chunk; `b[i]` therefore always reads from the first `a_slice.len()` elements
// of `b` rather than the elements aligned with this chunk of `a`.
// `mul_add` is a fused multiply-add (one rounding), so results may differ in the last
// bit from the `*x += scalar * b[i]` form.
a_iter.for_each(|a_slice| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = b[i].mul_add(scalar, *val))
});

s.elapsed().as_secs_f64()
}
12 changes: 8 additions & 4 deletions src/kernels/init.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
use std::time::Instant;

use rayon::iter::{IntoParallelRefMutIterator, ParallelIterator};
use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};

// Review note: this is a rendered diff of `init`, so removed and added lines are
// interleaved without +/- markers. The `&mut Vec<f64>` signature and the
// `par_iter_mut` line appear to be the pre-commit version; the slice signature and
// the `par_chunks_mut` line appear to be the post-commit version.
//
// Post-commit behaviour: sets every element of `b` to `scalar`, working on parallel
// chunks of at most `n` elements, and returns the elapsed time in seconds.
#[allow(clippy::ptr_arg, unused_variables)]
pub fn init(b: &mut Vec<f64>, scalar: f64, n: usize) -> f64 {
let s = Instant::now();
#[inline(never)]
pub fn init(b: &mut [f64], scalar: f64, n: usize) -> f64 {
// `n` is the chunk length here; the serial loop below used it as an element count.
let b_iter = b.par_chunks_mut(n);

b.par_iter_mut().for_each(|x| *x = scalar);
// Timer starts after the chunk iterator is built, so its setup is not measured.
let s = Instant::now();

// Serial version
// for i in b.iter_mut().take(n) {
// *i = scalar;
// }

// Parallel version
// Each chunk is filled sequentially; the chunks themselves are handed to rayon.
b_iter.for_each(|b_slice| b_slice.iter_mut().for_each(|val| *val = scalar));

s.elapsed().as_secs_f64()
}
18 changes: 12 additions & 6 deletions src/kernels/sdaxpy.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,25 @@
use std::time::Instant;

use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};
use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};

// Review note: this is a rendered diff of `sdaxpy`, so removed and added lines are
// interleaved without +/- markers. The `&mut Vec<f64>` signature and the
// `par_iter_mut` chain appear to be the pre-commit version; the slice signature and
// the `par_chunks_mut` loop appear to be the post-commit version.
//
// Post-commit behaviour: walks `a` in parallel chunks of at most `n` elements,
// updates each element to `c[i] * b[i] + a_elem`, and returns the elapsed time in
// seconds.
// NOTE(review): unlike `copy`, `daxpy`, `init` and `triad`, no `#[inline(never)]` is
// present in this hunk.
#[allow(clippy::ptr_arg, unused_variables)]
pub fn sdaxpy(a: &mut Vec<f64>, b: &Vec<f64>, c: &Vec<f64>, n: usize) -> f64 {
let s = Instant::now();
pub fn sdaxpy(a: &mut [f64], b: &[f64], c: &[f64], n: usize) -> f64 {
// `n` is the chunk length here, not the element count used by the serial loop below.
let a_iter = a.par_chunks_mut(n);

a.par_iter_mut()
.enumerate()
.for_each(|(i, x)| *x += b[i] * c[i]);
// Timer starts after the chunk iterator is built, so its setup is not measured.
let s = Instant::now();

// Serial version
// for i in 0..n {
// a[i] += b[i] * c[i];
// }

// Parallel version
// NOTE(review): `i` comes from `enumerate()` on the chunk, so it restarts at 0 for
// every chunk; `b[i]` and `c[i]` therefore always read from the first
// `a_slice.len()` elements rather than the elements aligned with this chunk of `a`.
// `mul_add` is a fused multiply-add (one rounding), so results may differ in the last
// bit from the `*x += b[i] * c[i]` form.
a_iter.for_each(|a_slice| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = c[i].mul_add(b[i], *val))
});

s.elapsed().as_secs_f64()
}
17 changes: 11 additions & 6 deletions src/kernels/striad.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
use std::time::Instant;

use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};
use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};

// Review note: this is a rendered diff of `striad`, so removed and added lines are
// interleaved without +/- markers. The `&mut Vec<f64>` signature and the
// `par_iter_mut` chain appear to be the pre-commit version; the slice signature and
// the `par_chunks_mut` loop appear to be the post-commit version.
//
// Post-commit behaviour: walks `a` in parallel chunks of at most `n` elements,
// stores `c[i] * d[i] + b[i]` into each element, and returns the elapsed time in
// seconds.
// NOTE(review): unlike `copy`, `daxpy`, `init` and `triad`, no `#[inline(never)]` is
// present in this hunk.
#[allow(clippy::ptr_arg, unused_variables)]
pub fn striad(a: &mut Vec<f64>, b: &Vec<f64>, c: &Vec<f64>, d: &Vec<f64>, n: usize) -> f64 {
let s = Instant::now();
pub fn striad(a: &mut [f64], b: &[f64], c: &[f64], d: &[f64], n: usize) -> f64 {
// `n` is the chunk length here, not the element count used by the serial loop below.
let a_iter = a.par_chunks_mut(n);

a.par_iter_mut()
.enumerate()
.for_each(|(i, x)| *x = b[i] + d[i] * c[i]);
// Timer starts after the chunk iterator is built, so its setup is not measured.
let s = Instant::now();

// Serial version
// for i in 0..n {
// a[i] = b[i] + d[i] * c[i];
// }

// Parallel version
// NOTE(review): `i` comes from `enumerate()` on the chunk, so it restarts at 0 for
// every chunk; `b[i]`, `c[i]` and `d[i]` therefore always read from the first
// `a_slice.len()` elements rather than the elements aligned with this chunk of `a`.
// `mul_add` is a fused multiply-add (one rounding), so results may differ in the last
// bit from the `b[i] + d[i] * c[i]` form.
a_iter.for_each(|a_slice| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = c[i].mul_add(d[i], b[i]))
});
s.elapsed().as_secs_f64()
}
7 changes: 4 additions & 3 deletions src/kernels/sum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@ use std::time::Instant;
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};

#[allow(clippy::ptr_arg, unused_variables)]
pub fn sum(a: &mut Vec<f64>, n: usize) -> f64 {
pub fn sum(a: &mut [f64], n: usize) -> f64 {
let s = Instant::now();

let sum = a.par_iter().sum();

// Serial version
// let mut sum = 0.0;
// for i in a.iter().take(n) {
// sum += *i;
// }

// Parallel sum reduction
let sum = a.par_iter().sum();

let e = s.elapsed();

a[10] = sum;
Expand Down
23 changes: 15 additions & 8 deletions src/kernels/triad.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
use std::time::Instant;

use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};
use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};

// Review note: this is a rendered diff of `triad`, so removed and added lines are
// interleaved without +/- markers (including inside the commented-out serial loop).
// The `&mut Vec<f64>` signature and the `par_iter_mut` chain appear to be the
// pre-commit version; the slice signature and the `par_chunks_mut` loop appear to be
// the post-commit version.
//
// Post-commit behaviour: walks `a` in parallel chunks of at most `n` elements,
// stores `c[i] * scalar + b[i]` into each element, and returns the elapsed time in
// seconds.
#[allow(clippy::ptr_arg, unused_variables)]
pub fn triad(a: &mut Vec<f64>, b: &Vec<f64>, c: &Vec<f64>, scalar: f64, n: usize) -> f64 {
let s = Instant::now();
#[inline(never)]
pub fn triad(a: &mut [f64], b: &[f64], c: &[f64], scalar: f64, n: usize) -> f64 {
// `n` is the chunk length here, not the element count used by the serial loop below.
let a_iter = a.par_chunks_mut(n);

a.par_iter_mut()
.enumerate()
.for_each(|(i, x)| *x = b[i] + scalar * c[i]);
// Timer starts after the chunk iterator is built, so its setup is not measured.
let s = Instant::now();

// Serial version
// for i in 0..n {
// a[i] = b[i] + scalar * c[i];
// for i in 0..(n * 8) {
// a[i] = c[i].mul_add(scalar, b[i]);
// }

// Parallel version
// NOTE(review): `i` comes from `enumerate()` on the chunk, so it restarts at 0 for
// every chunk; `b[i]` and `c[i]` therefore always read from the first
// `a_slice.len()` elements rather than the elements aligned with this chunk of `a`.
// `mul_add` is a fused multiply-add (one rounding), so results may differ in the last
// bit from the `b[i] + scalar * c[i]` form.
a_iter.for_each(|a_slice| {
a_slice
.iter_mut()
.enumerate()
.for_each(|(i, val)| *val = c[i].mul_add(scalar, b[i]))
});

s.elapsed().as_secs_f64()
}
11 changes: 7 additions & 4 deletions src/kernels/update.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
use std::time::Instant;

use rayon::iter::{IntoParallelRefMutIterator, ParallelIterator};
use rayon::{iter::ParallelIterator, slice::ParallelSliceMut};

// Review note: this is a rendered diff of `update`, so removed and added lines are
// interleaved without +/- markers. The `&mut Vec<f64>` signature and the
// `par_iter_mut` line appear to be the pre-commit version; the slice signature and
// the `par_chunks_mut` line appear to be the post-commit version.
//
// Post-commit behaviour: adds `scalar` to every element of `b`, working on parallel
// chunks of at most `n` elements, and returns the elapsed time in seconds.
// NOTE(review): unlike `copy`, `daxpy`, `init` and `triad`, no `#[inline(never)]` is
// present in this hunk.
#[allow(clippy::ptr_arg, unused_variables)]
pub fn update(b: &mut Vec<f64>, scalar: f64, n: usize) -> f64 {
let s = Instant::now();
pub fn update(b: &mut [f64], scalar: f64, n: usize) -> f64 {
// `n` is the chunk length here; the serial loop below used it as an element count.
let b_iter = b.par_chunks_mut(n);

b.par_iter_mut().for_each(|x| *x += scalar);
// Timer starts after the chunk iterator is built, so its setup is not measured.
let s = Instant::now();

// Serial version
// for i in b.iter_mut().take(n) {
// *i += scalar;
// }

// Parallel version
// Each chunk is updated sequentially; the chunks themselves are handed to rayon.
b_iter.for_each(|b_slice| b_slice.iter_mut().for_each(|val| *val += scalar));

s.elapsed().as_secs_f64()
}
17 changes: 9 additions & 8 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ fn main() {
const BYTES_PER_WORD: usize = size_of::<f64>();
let n = arg_parser.size;
let ntimes = arg_parser.ntimes;
let n_chunks = n / arg_parser.n;

let num_of_benchmarks = Benchmark::Numbench as usize;

Expand Down Expand Up @@ -115,50 +116,50 @@ fn main() {
for k in 0..ntimes {
bench!(
Benchmark::Init as usize,
init(b.as_mut(), scalar, n),
init(b.as_mut(), scalar, n_chunks),
times,
k
);

let tmp = a[10];

bench!(Benchmark::Sum as usize, sum(a.as_mut(), n), times, k);
bench!(Benchmark::Sum as usize, sum(a.as_mut(), n_chunks), times, k);

a[10] = tmp;

bench!(
Benchmark::Copy as usize,
copy(c.as_mut(), a.as_ref(), n),
copy(c.as_mut(), a.as_ref(), n_chunks),
times,
k
);
bench!(
Benchmark::Update as usize,
update(a.as_mut(), scalar, n),
update(a.as_mut(), scalar, n_chunks),
times,
k
);
bench!(
Benchmark::Triad as usize,
triad(a.as_mut(), b.as_ref(), c.as_ref(), scalar, n),
triad(a.as_mut(), b.as_ref(), c.as_ref(), scalar, n_chunks),
times,
k
);
bench!(
Benchmark::Daxpy as usize,
daxpy(a.as_mut(), b.as_ref(), scalar, n),
daxpy(a.as_mut(), b.as_ref(), scalar, n_chunks),
times,
k
);
bench!(
Benchmark::Striad as usize,
striad(a.as_mut(), b.as_ref(), c.as_ref(), d.as_ref(), n),
striad(a.as_mut(), b.as_ref(), c.as_ref(), d.as_ref(), n_chunks),
times,
k
);
bench!(
Benchmark::Sdaxpy as usize,
sdaxpy(a.as_mut(), b.as_ref(), c.as_ref(), n),
sdaxpy(a.as_mut(), b.as_ref(), c.as_ref(), n_chunks),
times,
k
);
Expand Down

0 comments on commit c781042

Please sign in to comment.