GPU upgrade

PoC-Consortium · Dec 1, 2018 · 2eeb527 · 2eeb527
1 parent b2c23cd
commit 2eeb527
Show file tree

Hide file tree

Showing 11 changed files with 1,745 additions and 78 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,10 @@
 # Generated by Cargo
 # will have compiled files and executables
 /target/
+/.vs/
+/bin/
+/obj/
+/packages/
 
 # These are backup files generated by rustfmt
 **/*.rs.bk

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "engraver"
-version = "2.0.5"
+version = "2.2.0"
 license = "GPL-3.0"
 authors = ["PoC Consortium <[email protected]>"]
 description = """

diff --git a/src/cpu_hasher.rs b/src/cpu_hasher.rs
@@ -0,0 +1,117 @@
+use libc::{c_void, size_t, uint64_t};
+use std::sync::mpsc::Sender;
+
+extern "C" { // C-ABI nonce generators defined outside this file; all five share one signature.
+    pub fn noncegen( // fallback used by `hash_cpu` when `simd_ext` names no listed extension
+        cache: *mut c_void, // NOTE(review): presumably the output buffer — confirm on the C side
+        cache_size: size_t,
+        chunk_offset: size_t,
+        numeric_ID: uint64_t,
+        local_startnonce: uint64_t,
+        local_nonces: uint64_t,
+    );
+    pub fn noncegen_sse( // used by `hash_cpu` when `simd_ext` is "SSE2"
+        cache: *mut c_void,
+        cache_size: size_t,
+        chunk_offset: size_t,
+        numeric_ID: uint64_t,
+        local_startnonce: uint64_t,
+        local_nonces: uint64_t,
+    );
+    pub fn noncegen_avx( // used by `hash_cpu` when `simd_ext` is "AVX"
+        cache: *mut c_void,
+        cache_size: size_t,
+        chunk_offset: size_t,
+        numeric_ID: uint64_t,
+        local_startnonce: uint64_t,
+        local_nonces: uint64_t,
+    );
+    pub fn noncegen_avx2( // used by `hash_cpu` when `simd_ext` is "AVX2"
+        cache: *mut c_void,
+        cache_size: size_t,
+        chunk_offset: size_t,
+        numeric_ID: uint64_t,
+        local_startnonce: uint64_t,
+        local_nonces: uint64_t,
+    );
+    pub fn noncegen_avx512( // used by `hash_cpu` when `simd_ext` is "AVX512F"
+        cache: *mut c_void,
+        cache_size: size_t,
+        chunk_offset: size_t,
+        numeric_ID: uint64_t,
+        local_startnonce: uint64_t,
+        local_nonces: uint64_t,
+    );
+}
+pub struct SafeCVoid { // wrapper that lets a raw `c_void` pointer travel inside a `CpuTask`
+    pub ptr: *mut c_void, // passed as the `cache` argument of the `noncegen*` functions
+}
+unsafe impl Send for SafeCVoid {} // NOTE(review): hand-written `Send`; soundness rests on callers
+
+pub struct CpuTask { // arguments for one `noncegen*` call; `hash_cpu` forwards them in this order
+    pub cache: SafeCVoid, // raw buffer pointer, wrapped so the task can be moved across threads
+    pub cache_size: size_t,
+    pub chunk_offset: size_t,
+    pub numeric_id: uint64_t,
+    pub local_startnonce: uint64_t,
+    pub local_nonces: uint64_t, // also echoed to the scheduler once the call has returned
+}
+
+/// Builds the closure that runs one CPU hashing task and reports back on `tx`.
+///
+/// When invoked, the closure calls the C nonce generator selected by `simd_ext`
+/// (`"AVX512F"`, `"AVX2"`, `"AVX"` or `"SSE2"`; any other value selects the plain
+/// `noncegen`) with the fields of `hasher_task`. It then sends `(0, 1, 0)` ("hashing
+/// done") followed by `(0, 0, local_nonces)` ("data in host memory").
+///
+/// # Panics
+///
+/// The closure panics if either message cannot be sent on `tx`.
+pub fn hash_cpu(
+    tx: Sender<(u8, u8, u64)>,
+    hasher_task: CpuTask,
+    simd_ext: String,
+) -> impl FnOnce() {
+    // Every generator has the same C signature, so it can be picked as a plain pointer.
+    type NonceGen =
+        unsafe extern "C" fn(*mut c_void, size_t, size_t, uint64_t, uint64_t, uint64_t);
+
+    move || {
+        let generate: NonceGen = match simd_ext.as_str() {
+            "AVX512F" => noncegen_avx512,
+            "AVX2" => noncegen_avx2,
+            "AVX" => noncegen_avx,
+            "SSE2" => noncegen_sse,
+            _ => noncegen,
+        };
+
+        // SAFETY: NOTE(review) — relies on the creator of `hasher_task` having supplied a
+        // pointer and sizes the C routine may use; that cannot be checked from this function.
+        unsafe {
+            generate(
+                hasher_task.cache.ptr,
+                hasher_task.cache_size,
+                hasher_task.chunk_offset,
+                hasher_task.numeric_id,
+                hasher_task.local_startnonce,
+                hasher_task.local_nonces,
+            );
+        }
+
+        // report hashing done
+        let hashing_done = (0u8, 1u8, 0);
+        tx.send(hashing_done)
+            .expect("CPU task can't communicate with scheduler thread.");
+
+        // report data in hostmem
+        let data_ready = (0u8, 0u8, hasher_task.local_nonces);
+        tx.send(data_ready)
+            .expect("CPU task can't communicate with scheduler thread.");
+    }
+}
diff --git a/src/gpu_hasher.rs b/src/gpu_hasher.rs
@@ -0,0 +1,78 @@
+use chan::Receiver;
+use ocl::{gpu_hash, gpu_hash_and_transfer_to_host, gpu_transfer_to_host, GpuContext};
+use std::sync::mpsc::Sender;
+use std::sync::{Arc, Mutex};
+
+pub struct SafePointer { // wrapper that lets a raw byte pointer travel inside a `GpuTask`
+    pub ptr: *mut u8, // NOTE(review): presumably the host-side cache buffer — confirm with callers
+}
+unsafe impl Send for SafePointer {} // NOTE(review): hand-written `Send`; soundness rests on callers
+unsafe impl Sync for SafePointer {} // NOTE(review): hand-written `Sync`; same caveat as `Send`
+
+pub struct GpuTask { // one work item consumed by the closure from `create_gpu_hasher_thread`
+    pub cache: SafePointer, // raw buffer pointer, wrapped so the task can be sent over a channel
+    pub cache_size: u64,
+    pub chunk_offset: u64,
+    pub numeric_id: u64,
+    pub local_startnonce: u64,
+    pub local_nonces: u64, // 0 is special: the hasher thread then only transfers the previous task
+}
+
+/// Builds the worker closure that drives one GPU.
+///
+/// The closure consumes tasks from `rx_hasher_task` until the channel ends or a `None`
+/// (termination request) arrives. It remembers the task submitted last:
+///
+/// * no task pending and `local_nonces != 0`: the task is only hashed (`gpu_hash`);
+/// * a task pending and `local_nonces == 0`: the pending task is only transferred
+///   (`gpu_transfer_to_host`) and the pipeline returns to its initial state;
+/// * a task pending and `local_nonces != 0`: the new task and the pending one are passed
+///   together to `gpu_hash_and_transfer_to_host`.
+///
+/// A task with `local_nonces == 0` that arrives while nothing is pending is ignored.
+///
+/// Messages sent on `tx`: `(gpu_id, 1, 0)` after a task has been submitted for hashing and
+/// `(gpu_id, 0, n)` after the `n` nonces of the previous task have been transferred.
+///
+/// # Panics
+///
+/// The closure panics if a message cannot be sent on `tx`.
+pub fn create_gpu_hasher_thread(
+    gpu_id: u8,
+    gpu_context: Arc<Mutex<GpuContext>>,
+    tx: Sender<(u8, u8, u64)>,
+    rx_hasher_task: Receiver<Option<GpuTask>>,
+) -> impl FnOnce() {
+    move || {
+        // Alternates between 0 and 1 after every hash submission; reset after a flush.
+        let mut buffer_id = 0u8;
+        // The task whose results still have to be transferred. `None` means "first run":
+        // no placeholder task (and no placeholder pointer) is needed for that state.
+        let mut last_task: Option<GpuTask> = None;
+
+        for task in rx_hasher_task {
+            // check if new task or termination
+            let task = match task {
+                Some(task) => task,
+                None => break,
+            };
+
+            match last_task.take() {
+                // first run - just hash
+                None => {
+                    if task.local_nonces != 0 {
+                        gpu_hash(&gpu_context, &task);
+                        buffer_id = 1 - buffer_id;
+                        last_task = Some(task);
+                        tx.send((gpu_id, 1u8, 0))
+                            .expect("GPU task can't communicate with scheduler thread.");
+                    }
+                }
+                // last run - just transfer; `last_task` stays empty afterwards
+                Some(ref previous) if task.local_nonces == 0 => {
+                    gpu_transfer_to_host(&gpu_context, buffer_id, previous);
+                    buffer_id = 0;
+                    tx.send((gpu_id, 0u8, previous.local_nonces))
+                        .expect("GPU task can't communicate with scheduler thread.");
+                }
+                // normal run - hash and transfer async
+                Some(previous) => {
+                    gpu_hash_and_transfer_to_host(
+                        &gpu_context,
+                        buffer_id,
+                        &task,
+                        &previous,
+                    );
+                    buffer_id = 1 - buffer_id;
+                    tx.send((gpu_id, 0u8, previous.local_nonces))
+                        .expect("GPU task can't communicate with scheduler thread.");
+                    last_task = Some(task);
+                    tx.send((gpu_id, 1u8, 0))
+                        .expect("GPU task can't communicate with scheduler thread.");
+                }
+            }
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
@@ -9,8 +9,13 @@ extern crate pbr;
 extern crate stopwatch;
 extern crate sys_info;
 
-mod hasher;
+mod cpu_hasher;
+#[cfg(feature = "opencl")]
+mod gpu_hasher;
+#[cfg(feature = "opencl")]
+mod ocl;
 mod plotter;
+mod scheduler;
 mod utils;
 mod writer;
 
@@ -19,13 +24,10 @@ use clap::AppSettings::{ArgRequiredElseHelp, DeriveDisplayOrder, VersionlessSubc
 use clap::ArgGroup;
 use clap::{App, Arg};
 use plotter::{Plotter, PlotterTask};
+use std::cmp::min;
 use utils::set_low_prio;
 
 fn main() {
-    #[cfg(not(feature = "opencl"))]
-    let _opencl = false;
-    #[cfg(feature = "opencl")]
-    let opencl = true;
     let arg = App::new("Engraver")
         .version(crate_version!())
         .author(crate_authors!())
@@ -60,6 +62,12 @@ fn main() {
                 .long("quiet")
                 .help("Runs engraver in non-verbose mode")
                 .global(true),
+        ).arg(
+            Arg::with_name("benchmark")
+                .short("b")
+                .long("bench")
+                .help("Runs engraver in xPU benchmark mode")
+                .global(true),
         )
         /*
         .subcommand(
@@ -74,23 +82,23 @@ fn main() {
                         .value_name("numeric_ID")
                         .help("your numeric Burst ID")
                         .takes_value(true)
-                        .required(true),
+                        .required_unless("ocl-devices"),
                 ).arg(
                     Arg::with_name("start nonce")
                         .short("s")
                         .long("sn")
                         .value_name("start_nonce")
                         .help("where you want to start plotting")
                         .takes_value(true)
-                        .required(true),
+                        .required_unless("ocl-devices"),
                 ).arg(
                     Arg::with_name("nonces")
                         .short("n")
                         .long("n")
                         .value_name("nonces")
                         .help("how many nonces you want to plot")
                         .takes_value(true)
-                        .required(true),
+                        .required_unless("ocl-devices"),
                 ).arg(
                     Arg::with_name("path")
                         .short("p")
@@ -120,14 +128,13 @@ fn main() {
                         .short("g")
                         .long("gpu")
                         .value_name("platform_id:device_id")
-                        .help("*GPU(s) you want to use for plotting")
+                        .help("GPU(s) you want to use for plotting (optional)")
                         .multiple(true)
                         .takes_value(true),
                 ]).groups(&[#[cfg(feature = "opencl")]
                 ArgGroup::with_name("processing")
                     .args(&["cpu", "gpu"])
-                    .multiple(true)
-                    .required(true)])
+                    .multiple(true)])
                     /*
                     .arg(
                     Arg::with_name("ssd buffer")
@@ -168,14 +175,35 @@ fn main() {
                 
         )*/;
 
+    #[cfg(feature = "opencl")]
+    let arg = arg
+        .arg(
+            Arg::with_name("ocl-devices")
+                .short("o")
+                .long("opencl")
+                .help("Display OpenCL platforms and devices")
+                .global(true),
+        ).arg(
+            Arg::with_name("zero-copy")
+                .short("z")
+                .long("zcb")
+                .help("Enables zero copy buffers for shared mem (integrated) gpus")
+                .global(true),
+        );
     let matches = &arg.get_matches();
 
     if matches.is_present("low priority") {
         set_low_prio();
     }
 
+    if matches.is_present("ocl-devices") {
+        #[cfg(feature = "opencl")]
+        ocl::platform_info();
+        return;
+    }
+
     // plotting
-    /*
+    /* subcommand
     if let Some(matches) = matches.subcommand_matches("plot") {
     */
     let numeric_id = value_t!(matches, "numeric id", u64).unwrap_or_else(|e| e.exit());
@@ -189,8 +217,30 @@ fn main() {
             .unwrap()
     });
     let mem = value_t!(matches, "memory", String).unwrap_or_else(|_| "0B".to_owned());
-    let cpu_threads =
-        value_t!(matches, "cpu", u8).unwrap_or_else(|_| sys_info::cpu_num().unwrap() as u8);
+    let cpu_threads = value_t!(matches, "cpu", u8).unwrap_or(0u8);
+
+    let gpus = if matches.occurrences_of("gpu") > 0 {
+        let gpu = values_t!(matches, "gpu", String);
+        Some(gpu.unwrap())
+    } else {
+        None
+    };
+
+    // work out number of cpu threads to use
+    let cores = sys_info::cpu_num().unwrap() as u8;
+    let cpu_threads = if cpu_threads == 0 {
+        cores
+    } else {
+        min(cores, cpu_threads)
+    };
+
+    // special case: don't use the CPU if only a GPU is defined
+    #[cfg(feature = "opencl")]
+    let cpu_threads = if matches.occurrences_of("gpu") > 0 && matches.occurrences_of("cpu") == 0 {
+        0u8
+    } else {
+        cpu_threads
+    };
 
     let p = Plotter::new();
     p.run(PlotterTask {
@@ -200,8 +250,11 @@ fn main() {
         output_path,
         mem,
         cpu_threads,
+        gpus,
         direct_io: !matches.is_present("disable direct i/o"),
         async_io: !matches.is_present("disable async i/o"),
         quiet: matches.is_present("non-verbosity"),
+        benchmark: matches.is_present("benchmark"),
+        zcb: matches.is_present("zero-copy"),
     });
 }