From 2eeb5278032f2410639c78a53b9860c841aa2db6 Mon Sep 17 00:00:00 2001 From: JohnnyFFM Date: Sat, 1 Dec 2018 08:13:39 +0100 Subject: [PATCH] GPU upgrade --- .gitignore | 4 + Cargo.lock | 2 +- Cargo.toml | 2 +- src/cpu_hasher.rs | 117 +++++++++ src/gpu_hasher.rs | 78 ++++++ src/main.rs | 81 ++++-- src/ocl.rs | 609 ++++++++++++++++++++++++++++++++++++++++++++++ src/ocl/kernel.cl | 514 ++++++++++++++++++++++++++++++++++++++ src/plotter.rs | 111 +++++++-- src/scheduler.rs | 232 ++++++++++++++++++ src/writer.rs | 73 +++--- 11 files changed, 1745 insertions(+), 78 deletions(-) create mode 100644 src/cpu_hasher.rs create mode 100644 src/gpu_hasher.rs create mode 100644 src/ocl.rs create mode 100644 src/ocl/kernel.cl create mode 100644 src/scheduler.rs diff --git a/.gitignore b/.gitignore index 89552ee..6567326 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ # Generated by Cargo # will have compiled files and executables /target/ +/.vs/ +/bin/ +/obj/ +/packages/ # These are backup files generated by rustfmt **/*.rs.bk diff --git a/Cargo.lock b/Cargo.lock index 97a2ecc..6b99bbc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,7 +169,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "engraver" -version = "2.0.5" +version = "2.2.0" dependencies = [ "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", "cfg-if 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/Cargo.toml b/Cargo.toml index f863f88..46ad75f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "engraver" -version = "2.0.5" +version = "2.2.0" license = "GPL-3.0" authors = ["PoC Consortium "] description = """ diff --git a/src/cpu_hasher.rs b/src/cpu_hasher.rs new file mode 100644 index 0000000..ecc2fbb --- /dev/null +++ b/src/cpu_hasher.rs @@ -0,0 +1,117 @@ +use libc::{c_void, size_t, uint64_t}; +use std::sync::mpsc::Sender; + +extern "C" { + pub fn noncegen( + cache: *mut c_void, + cache_size: size_t, + chunk_offset: size_t, + numeric_ID: uint64_t, + local_startnonce: uint64_t, + local_nonces: uint64_t, + ); + pub fn noncegen_sse( + cache: *mut c_void, + cache_size: size_t, + chunk_offset: size_t, + numeric_ID: uint64_t, + local_startnonce: uint64_t, + local_nonces: uint64_t, + ); + pub fn noncegen_avx( + cache: *mut c_void, + cache_size: size_t, + chunk_offset: size_t, + numeric_ID: uint64_t, + local_startnonce: uint64_t, + local_nonces: uint64_t, + ); + pub fn noncegen_avx2( + cache: *mut c_void, + cache_size: size_t, + chunk_offset: size_t, + numeric_ID: uint64_t, + local_startnonce: uint64_t, + local_nonces: uint64_t, + ); + pub fn noncegen_avx512( + cache: *mut c_void, + cache_size: size_t, + chunk_offset: size_t, + numeric_ID: uint64_t, + local_startnonce: uint64_t, + local_nonces: uint64_t, + ); +} +pub struct SafeCVoid { + pub ptr: *mut c_void, +} +unsafe impl Send for SafeCVoid {} + +pub struct CpuTask { + pub cache: SafeCVoid, + pub cache_size: size_t, + pub chunk_offset: size_t, + pub numeric_id: uint64_t, + pub local_startnonce: uint64_t, + pub local_nonces: uint64_t, +} + +pub fn hash_cpu( + tx: Sender<(u8, u8, u64)>, + hasher_task: CpuTask, + simd_ext: String, +) -> impl FnOnce() { + move || { + unsafe { + match &*simd_ext { + "AVX512F" => noncegen_avx512( + hasher_task.cache.ptr, + hasher_task.cache_size, + hasher_task.chunk_offset, + hasher_task.numeric_id, + hasher_task.local_startnonce, + hasher_task.local_nonces, + ), + "AVX2" => noncegen_avx2( + hasher_task.cache.ptr, + hasher_task.cache_size, + 
hasher_task.chunk_offset, + hasher_task.numeric_id, + hasher_task.local_startnonce, + hasher_task.local_nonces, + ), + "AVX" => noncegen_avx( + hasher_task.cache.ptr, + hasher_task.cache_size, + hasher_task.chunk_offset, + hasher_task.numeric_id, + hasher_task.local_startnonce, + hasher_task.local_nonces, + ), + "SSE2" => noncegen_sse( + hasher_task.cache.ptr, + hasher_task.cache_size, + hasher_task.chunk_offset, + hasher_task.numeric_id, + hasher_task.local_startnonce, + hasher_task.local_nonces, + ), + _ => noncegen( + hasher_task.cache.ptr, + hasher_task.cache_size, + hasher_task.chunk_offset, + hasher_task.numeric_id, + hasher_task.local_startnonce, + hasher_task.local_nonces, + ), + } + } + // report hashing done + tx.send((0u8, 1u8, 0)) + .expect("CPU task can't communicate with scheduler thread."); + // report data in hostmem + tx.send((0u8, 0u8, hasher_task.local_nonces)) + .expect("CPU task can't communicate with scheduler thread."); + } +} diff --git a/src/gpu_hasher.rs b/src/gpu_hasher.rs new file mode 100644 index 0000000..46e89b4 --- /dev/null +++ b/src/gpu_hasher.rs @@ -0,0 +1,78 @@ +use chan::Receiver; +use ocl::{gpu_hash, gpu_hash_and_transfer_to_host, gpu_transfer_to_host, GpuContext}; +use std::sync::mpsc::Sender; +use std::sync::{Arc, Mutex}; + +pub struct SafePointer { + pub ptr: *mut u8, +} +unsafe impl Send for SafePointer {} +unsafe impl Sync for SafePointer {} + +pub struct GpuTask { + pub cache: SafePointer, + pub cache_size: u64, + pub chunk_offset: u64, + pub numeric_id: u64, + pub local_startnonce: u64, + pub local_nonces: u64, +} + +pub fn create_gpu_hasher_thread( + gpu_id: u8, + gpu_context: Arc>, + tx: Sender<(u8, u8, u64)>, + rx_hasher_task: Receiver>, +) -> impl FnOnce() { + move || { + let mut first_run = true; + let mut buffer_id = 0u8; + let mut last_task = GpuTask { + cache: SafePointer { ptr: &mut 0u8 }, + cache_size: 0, + chunk_offset: 0, + numeric_id: 0, + local_startnonce: 0, + local_nonces: 0, + }; + for task in rx_hasher_task { + // check if new task or termination + match task { + // new task + Some(task) => { + // first run - just hash + if first_run { + if task.local_nonces != 0 { + first_run = false; + gpu_hash(&gpu_context, &task); + buffer_id = 1 - buffer_id; + last_task = task; + tx.send((gpu_id, 1u8, 0)) + .expect("GPU task can't communicate with scheduler thread."); + } + // last run - just transfer + } else if task.local_nonces == 0 { + gpu_transfer_to_host(&gpu_context, buffer_id, &last_task); + first_run = true; + buffer_id = 0; + tx.send((gpu_id, 0u8, last_task.local_nonces)) + .expect("GPU task can't communicate with scheduler thread."); + // normal run - hash and transfer async + } else { + gpu_hash_and_transfer_to_host(&gpu_context, buffer_id, &task, &last_task); + buffer_id = 1 - buffer_id; + tx.send((gpu_id, 0u8, last_task.local_nonces)) + .expect("GPU task can't communicate with scheduler thread."); + last_task = task; + tx.send((gpu_id, 1u8, 0)) + .expect("GPU task can't communicate with scheduler thread."); + } + } + // termination + None => { + break; + } + } + } + } +} diff --git a/src/main.rs b/src/main.rs index 471e59d..a33bc35 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,8 +9,13 @@ extern crate pbr; extern crate stopwatch; extern crate sys_info; -mod hasher; +mod cpu_hasher; +#[cfg(feature = "opencl")] +mod gpu_hasher; +#[cfg(feature = "opencl")] +mod ocl; mod plotter; +mod scheduler; mod utils; mod writer; @@ -19,13 +24,10 @@ use clap::AppSettings::{ArgRequiredElseHelp, DeriveDisplayOrder, VersionlessSubc use 
clap::ArgGroup; use clap::{App, Arg}; use plotter::{Plotter, PlotterTask}; +use std::cmp::min; use utils::set_low_prio; fn main() { - #[cfg(not(feature = "opencl"))] - let _opencl = false; - #[cfg(feature = "opencl")] - let opencl = true; let arg = App::new("Engraver") .version(crate_version!()) .author(crate_authors!()) @@ -60,6 +62,12 @@ fn main() { .long("quiet") .help("Runs engraver in non-verbose mode") .global(true), + ).arg( + Arg::with_name("benchmark") + .short("b") + .long("bench") + .help("Runs engraver in xPU benchmark mode") + .global(true), ) /* .subcommand( @@ -74,7 +82,7 @@ fn main() { .value_name("numeric_ID") .help("your numeric Burst ID") .takes_value(true) - .required(true), + .required_unless("ocl-devices"), ).arg( Arg::with_name("start nonce") .short("s") @@ -82,7 +90,7 @@ fn main() { .value_name("start_nonce") .help("where you want to start plotting") .takes_value(true) - .required(true), + .required_unless("ocl-devices"), ).arg( Arg::with_name("nonces") .short("n") @@ -90,7 +98,7 @@ fn main() { .value_name("nonces") .help("how many nonces you want to plot") .takes_value(true) - .required(true), + .required_unless("ocl-devices"), ).arg( Arg::with_name("path") .short("p") @@ -120,14 +128,13 @@ fn main() { .short("g") .long("gpu") .value_name("platform_id:device_id") - .help("*GPU(s) you want to use for plotting") + .help("GPU(s) you want to use for plotting (optional)") .multiple(true) .takes_value(true), ]).groups(&[#[cfg(feature = "opencl")] ArgGroup::with_name("processing") .args(&["cpu", "gpu"]) - .multiple(true) - .required(true)]) + .multiple(true)]) /* .arg( Arg::with_name("ssd buffer") @@ -168,14 +175,35 @@ )*/; + #[cfg(feature = "opencl")] + let arg = arg + .arg( + Arg::with_name("ocl-devices") + .short("o") + .long("opencl") + .help("Display OpenCL platforms and devices") + .global(true), + ).arg( + Arg::with_name("zero-copy") + .short("z") + .long("zcb") + .help("Enables zero copy buffers for shared mem (integrated) gpus") + .global(true), + ); let matches = &arg.get_matches(); if matches.is_present("low priority") { set_low_prio(); } + if matches.is_present("ocl-devices") { + #[cfg(feature = "opencl")] + ocl::platform_info(); + return; + } + // plotting - /* + /* subcommand if let Some(matches) = matches.subcommand_matches("plot") { */ let numeric_id = value_t!(matches, "numeric id", u64).unwrap_or_else(|e| e.exit()); @@ -189,8 +217,30 @@ fn main() { .unwrap() }); let mem = value_t!(matches, "memory", String).unwrap_or_else(|_| "0B".to_owned()); - let cpu_threads = - value_t!(matches, "cpu", u8).unwrap_or_else(|_| sys_info::cpu_num().unwrap() as u8); + let cpu_threads = value_t!(matches, "cpu", u8).unwrap_or(0u8); + + let gpus = if matches.occurrences_of("gpu") > 0 { + let gpu = values_t!(matches, "gpu", String); + Some(gpu.unwrap()) + } else { + None + }; + + // work out number of cpu threads to use + let cores = sys_info::cpu_num().unwrap() as u8; + let cpu_threads = if cpu_threads == 0 { + cores + } else { + min(cores, cpu_threads) + }; + + // special case: don't use cpu if only a gpu is defined + #[cfg(feature = "opencl")] + let cpu_threads = if matches.occurrences_of("gpu") > 0 && matches.occurrences_of("cpu") == 0 { + 0u8 + } else { + cpu_threads + }; let p = Plotter::new(); p.run(PlotterTask { @@ -200,8 +250,11 @@ fn main() { output_path, mem, cpu_threads, + gpus, direct_io: !matches.is_present("disable direct i/o"), async_io: !matches.is_present("disable async i/o"), quiet: matches.is_present("non-verbosity"), + benchmark:
matches.is_present("benchmark"), + zcb: matches.is_present("zero-copy"), }); } diff --git a/src/ocl.rs b/src/ocl.rs new file mode 100644 index 0000000..b38ce70 --- /dev/null +++ b/src/ocl.rs @@ -0,0 +1,609 @@ +extern crate ocl_core as core; +extern crate rayon; + +use self::core::{ + ArgVal, ContextProperties, DeviceInfo, Event, KernelWorkGroupInfo, PlatformInfo, Status, +}; +use gpu_hasher::GpuTask; +use ocl::rayon::prelude::*; +use std::cmp::min; +use std::ffi::CString; +use std::process; +use std::slice::{from_raw_parts, from_raw_parts_mut}; +use std::sync::{Arc, Mutex}; +use std::u64; + +static SRC: &'static str = include_str!("ocl/kernel.cl"); + +const NONCE_SIZE: u64 = (2 << 17); +const NUM_SCOOPS: u64 = 4096; +const GPU_HASHES_PER_RUN: usize = 32; +const MSHABAL512_VECTOR_SIZE: u64 = 16; +const SCOOP_SIZE: u64 = 64; + +// convert the info or error to a string for printing: +macro_rules! to_string { + ($expr:expr) => { + match $expr { + Ok(info) => info.to_string(), + Err(err) => match err.api_status() { + Some(Status::CL_KERNEL_ARG_INFO_NOT_AVAILABLE) => "Not available".into(), + _ => err.to_string(), + }, + } + }; +} + +#[allow(dead_code)] +pub struct GpuContext { + queue_a: core::CommandQueue, + queue_b: core::CommandQueue, + kernel: core::Kernel, + ldim1: [usize; 3], + gdim1: [usize; 3], + mapping: bool, + buffer_ptr_host: Option>, + buffer_host: Option, + buffer_gpu_a: core::Mem, + buffer_gpu_b: core::Mem, + pub worksize: usize, +} + +// Ohne Gummi im Bahnhofsviertel... das wird noch Konsequenzen haben +unsafe impl Sync for GpuContext {} + +impl GpuContext { + pub fn new( + gpu_platform: usize, + gpu_id: usize, + cores: usize, + nvidia: bool, + mapping: bool, + ) -> GpuContext { + let platform_ids = core::get_platform_ids().unwrap(); + let platform_id = platform_ids[gpu_platform]; + let device_ids = core::get_device_ids(&platform_id, None, None).unwrap(); + let device_id = device_ids[gpu_id]; + let context_properties = ContextProperties::new().platform(platform_id); + let context = + core::create_context(Some(&context_properties), &[device_id], None, None).unwrap(); + let src_cstring = CString::new(SRC).unwrap(); + let program = core::create_program_with_source(&context, &[src_cstring]).unwrap(); + core::build_program( + &program, + None::<&[()]>, + &CString::new("").unwrap(), + None, + None, + ).unwrap(); + let queue_a = core::create_command_queue(&context, &device_id, None).unwrap(); + let queue_b = core::create_command_queue(&context, &device_id, None).unwrap(); + let kernel = core::create_kernel(&program, "calculate_nonces").unwrap(); + let kernel_workgroup_size = get_kernel_work_group_size(&kernel, device_id); + let workgroup_count = cores; + let worksize = kernel_workgroup_size * workgroup_count; + let gdim1 = [worksize, 1, 1]; + let ldim1 = [kernel_workgroup_size, 1, 1]; + + // create buffers + // mapping = zero copy buffers, no mapping = pinned memory for fast DMA. 
+ if mapping { + let buffer_gpu_a = unsafe { + core::create_buffer::<_, u8>( + &context, + core::MEM_READ_WRITE | core::MEM_ALLOC_HOST_PTR, + (NONCE_SIZE as usize) * worksize, + None, + ).unwrap() + }; + let buffer_gpu_b = unsafe { + core::create_buffer::<_, u8>( + &context, + core::MEM_READ_WRITE | core::MEM_ALLOC_HOST_PTR, + (NONCE_SIZE as usize) * worksize, + None, + ).unwrap() + }; + GpuContext { + queue_a, + queue_b, + kernel, + ldim1, + gdim1, + mapping, + buffer_gpu_a, + buffer_gpu_b, + buffer_ptr_host: None, + buffer_host: None, + worksize, + } + } else { + let buffer_host = unsafe { + core::create_buffer::<_, u8>( + &context, + core::MEM_READ_WRITE | core::MEM_ALLOC_HOST_PTR, + (NONCE_SIZE as usize) * worksize, + None, + ).unwrap() + }; + let buffer_ptr_host = unsafe { + Some( + core::enqueue_map_buffer::( + &queue_b, + &buffer_host, + true, + core::MAP_READ, + 0, + worksize * NONCE_SIZE as usize, + None::, + None::<&mut Event>, + ).unwrap(), + ) + }; + let buffer_gpu_a = if nvidia { + buffer_host.clone() + } else { + unsafe { + core::create_buffer::<_, u8>( + &context, + core::MEM_READ_WRITE, + (NONCE_SIZE as usize) * worksize, + None, + ).unwrap() + } + }; + let buffer_gpu_b = unsafe { + core::create_buffer::<_, u8>( + &context, + core::MEM_READ_WRITE, + (NONCE_SIZE as usize) * worksize, + None, + ).unwrap() + }; + + let buffer_host = if nvidia { None } else { Some(buffer_host) }; + GpuContext { + queue_a, + queue_b, + kernel, + ldim1, + gdim1, + mapping, + buffer_gpu_a, + buffer_gpu_b, + buffer_ptr_host, + buffer_host, + worksize, + } + } + } +} + +pub fn platform_info() { + let platform_ids = core::get_platform_ids().unwrap(); + for (i, platform_id) in platform_ids.iter().enumerate() { + println!( + "OCL: platform {}, {} - {}", + i, + to_string!(core::get_platform_info(&platform_id, PlatformInfo::Name)), + to_string!(core::get_platform_info(&platform_id, PlatformInfo::Version)) + ); + let device_ids = core::get_device_ids(&platform_id, None, None).unwrap(); + for (j, device_id) in device_ids.iter().enumerate() { + println!( + "OCL: device {}, {} - {}", + j, + to_string!(core::get_device_info(device_id, DeviceInfo::Vendor)), + to_string!(core::get_device_info(device_id, DeviceInfo::Name)) + ); + } + } +} + +pub fn gpu_get_info(gpus: &[String], quiet: bool) -> u64 { + let mut total_mem_needed = 0u64; + for gpu in gpus.iter() { + let gpu = gpu.split(':').collect::>(); + let platform_id = gpu[0].parse::().unwrap(); + let gpu_id = gpu[1].parse::().unwrap(); + let gpu_cores = gpu[2].parse::().unwrap(); + + let platform_ids = core::get_platform_ids().unwrap(); + if platform_id >= platform_ids.len() { + println!("Error: Selected OpenCL platform doesn't exist."); + println!("Shutting down..."); + process::exit(0); + } + let platform = platform_ids[platform_id]; + let device_ids = core::get_device_ids(&platform, None, None).unwrap(); + if gpu_id >= device_ids.len() { + println!("Error: Selected OpenCL device doesn't exist"); + println!("Shutting down..."); + process::exit(0); + } + let device = device_ids[gpu_id]; + let max_compute_units = + match core::get_device_info(&device, DeviceInfo::MaxComputeUnits).unwrap() { + core::DeviceInfoResult::MaxComputeUnits(mcu) => mcu, + _ => panic!("Unexpected error. Can't obtain number of GPU cores."), + }; + let mem = match core::get_device_info(&device, DeviceInfo::GlobalMemSize).unwrap() { + core::DeviceInfoResult::GlobalMemSize(gms) => gms, + _ => panic!("Unexpected error. 
Can't obtain GPU memory size."), + }; + + // get work_group_size for kernel + let context_properties = ContextProperties::new().platform(platform); + let context = + core::create_context(Some(&context_properties), &[device], None, None).unwrap(); + let src_cstring = CString::new(SRC).unwrap(); + let program = core::create_program_with_source(&context, &[src_cstring]).unwrap(); + core::build_program( + &program, + None::<&[()]>, + &CString::new("").unwrap(), + None, + None, + ).unwrap(); + let kernel = core::create_kernel(&program, "calculate_nonces").unwrap(); + let kernel_workgroup_size = get_kernel_work_group_size(&kernel, device); + + let gpu_cores = if gpu_cores == 0 { + max_compute_units as usize + } else { + min(gpu_cores, max_compute_units as usize) + }; + let mem_needed = 2 * gpu_cores * kernel_workgroup_size * 256 * 1024; + + if mem_needed > mem as usize { + println!("Error: Not enough GPU-memory. Please reduce number of cores."); + println!("Shutting down..."); + process::exit(0); + } + + if !quiet { + println!( + "GPU: {} - {} [using {} of {} cores]", + to_string!(core::get_device_info(&device, DeviceInfo::Vendor)), + to_string!(core::get_device_info(&device, DeviceInfo::Name)), + gpu_cores, + max_compute_units + ); + } + if !quiet { + println!( + " GPU-RAM: Total={:.2} MiB, Usage={:.2} MiB", + mem / 1024 / 1024, + mem_needed / 1024 / 1024, + ); + } + total_mem_needed += mem_needed as u64; + } + total_mem_needed +} + +pub fn gpu_init(gpus: &[String], zcb: bool) -> Vec>> { + let mut result = Vec::new(); + for gpu in gpus.iter() { + let gpu = gpu.split(':').collect::>(); + let platform_id = gpu[0].parse::().unwrap(); + let gpu_id = gpu[1].parse::().unwrap(); + let gpu_cores = gpu[2].parse::().unwrap(); + let platform_ids = core::get_platform_ids().unwrap(); + if platform_id >= platform_ids.len() { + println!("Error: Selected OpenCL platform doesn't exist."); + println!("Shutting down..."); + process::exit(0); + } + let platform = platform_ids[platform_id]; + let device_ids = core::get_device_ids(&platform, None, None).unwrap(); + if gpu_id >= device_ids.len() { + println!("Error: Selected OpenCL device doesn't exist"); + println!("Shutting down..."); + process::exit(0); + } + let device = device_ids[gpu_id]; + let max_compute_units = + match core::get_device_info(&device, DeviceInfo::MaxComputeUnits).unwrap() { + core::DeviceInfoResult::MaxComputeUnits(mcu) => mcu, + _ => panic!("Unexpected error. 
Can't obtain number of GPU cores."), + }; + let vendor = to_string!(core::get_device_info(&device, DeviceInfo::Vendor)).to_uppercase(); + let nvidia = vendor.contains("NVIDIA"); + let gpu_cores = if gpu_cores == 0 { + max_compute_units as usize + } else { + min(gpu_cores, max_compute_units as usize) + }; + result.push(Arc::new(Mutex::new(GpuContext::new( + platform_id, + gpu_id, + gpu_cores, + nvidia, + zcb, + )))); + } + result +} + +fn get_kernel_work_group_size(x: &core::Kernel, y: core::DeviceId) -> usize { + match core::get_kernel_work_group_info(x, y, KernelWorkGroupInfo::WorkGroupSize).unwrap() { + core::KernelWorkGroupInfoResult::WorkGroupSize(kws) => kws, + _ => panic!("Unexpected error"), + } +} + +pub fn gpu_hash(gpu_context: &Arc>, task: &GpuTask) { + let numeric_id_be: u64 = task.numeric_id.to_be(); + + let mut start; + let mut end; + let gpu_context = gpu_context.lock().unwrap(); + + core::set_kernel_arg( + &gpu_context.kernel, + 0, + ArgVal::mem(&gpu_context.buffer_gpu_a), + ).unwrap(); + core::set_kernel_arg( + &gpu_context.kernel, + 1, + ArgVal::primitive(&task.local_startnonce), + ).unwrap(); + core::set_kernel_arg( + &gpu_context.kernel, + 5, + ArgVal::primitive(&task.local_nonces), + ).unwrap(); + core::set_kernel_arg(&gpu_context.kernel, 2, ArgVal::primitive(&numeric_id_be)).unwrap(); + + for i in (0..8192).step_by(GPU_HASHES_PER_RUN) { + if i + GPU_HASHES_PER_RUN < 8192 { + start = i; + end = i + GPU_HASHES_PER_RUN - 1; + } else { + start = i; + end = i + GPU_HASHES_PER_RUN; + } + + core::set_kernel_arg(&gpu_context.kernel, 3, ArgVal::primitive(&(start as i32))).unwrap(); + core::set_kernel_arg(&gpu_context.kernel, 4, ArgVal::primitive(&(end as i32))).unwrap(); + + unsafe { + core::enqueue_kernel( + &gpu_context.queue_a, + &gpu_context.kernel, + 1, + None, + &gpu_context.gdim1, + Some(gpu_context.ldim1), + None::, + None::<&mut Event>, + ).unwrap(); + } + } + core::finish(&gpu_context.queue_a).unwrap(); +} + +pub fn gpu_transfer_to_host( + gpu_context: &Arc>, + buffer_id: u8, + transfer_task: &GpuTask, +) { + let mut gpu_context = gpu_context.lock().unwrap(); + + // get mem mapping + let map = if gpu_context.mapping { + Some(mem_map_gpu_to_host(buffer_id, &gpu_context)) + } else { + None + }; + + let buffer = if gpu_context.mapping { + // map to host (zero copy buffer) + map.as_ref().unwrap().as_ptr() + } else { + // get pointer + let ptr = gpu_context.buffer_ptr_host.as_mut().unwrap().as_mut_ptr(); + // copy to host + let slice = unsafe { from_raw_parts_mut(ptr, gpu_context.worksize * NONCE_SIZE as usize) }; + mem_transfer_gpu_to_host(buffer_id, &gpu_context, slice); + core::finish(&gpu_context.queue_b).unwrap(); + ptr + }; + unpack_shuffle_scatter(buffer, &gpu_context, &transfer_task); + if gpu_context.mapping { + mem_unmap_gpu_to_host(buffer_id, &gpu_context, map); + core::finish(&gpu_context.queue_a).unwrap(); + } +} + +pub fn gpu_hash_and_transfer_to_host( + gpu_context: &Arc>, + buffer_id: u8, + hasher_task: &GpuTask, + transfer_task: &GpuTask, +) { + let mut gpu_context = gpu_context.lock().unwrap(); + + let map = if gpu_context.mapping { + Some(mem_map_gpu_to_host(buffer_id, &gpu_context)) + } else { + None + }; + + let buffer = if gpu_context.mapping { + // map to host (zero copy buffer) + map.as_ref().unwrap().as_ptr() + } else { + // get pointer + let ptr = gpu_context.buffer_ptr_host.as_mut().unwrap().as_mut_ptr(); + // copy to host + let slice = unsafe { from_raw_parts_mut(ptr, gpu_context.worksize * NONCE_SIZE as usize) }; + 
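// The hash-and-transfer path below overlaps work on the two command queues:
// queue_a computes the current batch into one device buffer while queue_b
// drains the previous batch from the other. gpu_hasher.rs drives this by
// flipping buffer_id every round. A minimal sketch of that ping-pong pattern
// (the stub comments stand in for the real enqueue calls):
//
//     fn ping_pong(batches: &[u64]) {
//         let mut buffer_id = 0u8;
//         let mut last: Option<u64> = None;
//         for &batch in batches {
//             // enqueue hashing of `batch` into buffer `buffer_id` (queue_a)...
//             // ...while reading `last` back from buffer `1 - buffer_id` (queue_b)
//             last = Some(batch);
//             buffer_id = 1 - buffer_id; // swap roles for the next round
//         }
//         // "last run - just transfer": drain `last`, no new hashing
//     }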
mem_transfer_gpu_to_host(buffer_id, &gpu_context, slice); + ptr + }; + + let numeric_id_be: u64 = hasher_task.numeric_id.to_be(); + + let mut start; + let mut end; + + core::set_kernel_arg( + &gpu_context.kernel, + 0, + ArgVal::mem(if buffer_id == 0 { + &gpu_context.buffer_gpu_a + } else { + &gpu_context.buffer_gpu_b + }), + ).unwrap(); + core::set_kernel_arg( + &gpu_context.kernel, + 1, + ArgVal::primitive(&hasher_task.local_startnonce), + ).unwrap(); + core::set_kernel_arg( + &gpu_context.kernel, + 5, + ArgVal::primitive(&hasher_task.local_nonces), + ).unwrap(); + core::set_kernel_arg(&gpu_context.kernel, 2, ArgVal::primitive(&numeric_id_be)).unwrap(); + + for i in (0..8192).step_by(GPU_HASHES_PER_RUN) { + if i + GPU_HASHES_PER_RUN < 8192 { + start = i; + end = i + GPU_HASHES_PER_RUN - 1; + } else { + start = i; + end = i + GPU_HASHES_PER_RUN; + } + core::set_kernel_arg(&gpu_context.kernel, 3, ArgVal::primitive(&(start as i32))).unwrap(); + core::set_kernel_arg(&gpu_context.kernel, 4, ArgVal::primitive(&(end as i32))).unwrap(); + unsafe { + core::enqueue_kernel( + &gpu_context.queue_a, + &gpu_context.kernel, + 1, + None, + &gpu_context.gdim1, + Some(gpu_context.ldim1), + None::, + None::<&mut Event>, + ).unwrap(); + } + } + core::finish(&gpu_context.queue_b).unwrap(); + unpack_shuffle_scatter(buffer, &gpu_context, &transfer_task); + if gpu_context.mapping { + mem_unmap_gpu_to_host(buffer_id, &gpu_context, map); + } + core::finish(&gpu_context.queue_a).unwrap(); +} + +fn mem_map_gpu_to_host(buffer_id: u8, gpu_context: &GpuContext) -> core::MemMap { + unsafe { + if buffer_id == 1 { + core::enqueue_map_buffer::( + &gpu_context.queue_b, + &gpu_context.buffer_gpu_a, + true, + core::MAP_READ, + 0, + gpu_context.gdim1[0] * NONCE_SIZE as usize, + None::, + None::<&mut Event>, + ).unwrap() + } else { + core::enqueue_map_buffer::( + &gpu_context.queue_b, + &gpu_context.buffer_gpu_b, + true, + core::MAP_READ, + 0, + gpu_context.gdim1[0] * NONCE_SIZE as usize, + None::, + None::<&mut Event>, + ).unwrap() + } + } +} + +fn mem_unmap_gpu_to_host(buffer_id: u8, gpu_context: &GpuContext, map: Option>) { + // map to host (zero copy buffer) + if buffer_id == 1 { + core::enqueue_unmap_mem_object( + &gpu_context.queue_a, + &gpu_context.buffer_gpu_a, + &map.unwrap(), + None::, + None::<&mut Event>, + ).unwrap() + } else { + core::enqueue_unmap_mem_object( + &gpu_context.queue_a, + &gpu_context.buffer_gpu_b, + &map.unwrap(), + None::, + None::<&mut Event>, + ).unwrap() + }; +} + +fn mem_transfer_gpu_to_host(buffer_id: u8, gpu_context: &GpuContext, slice: &mut [u8]) { + unsafe { + if buffer_id == 1 { + core::enqueue_read_buffer( + &gpu_context.queue_b, + &gpu_context.buffer_gpu_a, + false, + 0, + slice, + None::, + None::<&mut Event>, + ).unwrap(); + } else { + core::enqueue_read_buffer( + &gpu_context.queue_b, + &gpu_context.buffer_gpu_b, + false, + 0, + slice, + None::, + None::<&mut Event>, + ).unwrap(); + } + } +} + +// simd shabal words unpack + POC Shuffle + scatter nonces into optimised cache +fn unpack_shuffle_scatter(buffer: *const u8, gpu_context: &GpuContext, transfer_task: &GpuTask) { + unsafe { + let buffer = from_raw_parts(buffer, gpu_context.worksize * NONCE_SIZE as usize); + let iter: Vec = (0..transfer_task.local_nonces).step_by(16).collect(); + iter.par_iter().for_each(|n| { + // get global buffer + let data = from_raw_parts_mut( + transfer_task.cache.ptr, + NONCE_SIZE as usize * transfer_task.cache_size as usize, + ); + for i in 0..(NUM_SCOOPS * 2) { + for j in (0..32).step_by(4) { + for k in 
0..MSHABAL512_VECTOR_SIZE { + let data_offset = (((i & 1) * (4095 - (i >> 1)) + ((i + 1) & 1) * (i >> 1)) + * SCOOP_SIZE + * transfer_task.cache_size + + (*n + k + transfer_task.chunk_offset) * SCOOP_SIZE + + (i & 1) * 32 + + j) as usize; + let buffer_offset = (*n * NONCE_SIZE + + (i * 32 + j) * MSHABAL512_VECTOR_SIZE + + k * 4) as usize; + data[data_offset..(data_offset + 4)] + .clone_from_slice(&buffer[buffer_offset..(buffer_offset + 4)]); + } + } + } + }) + } +} diff --git a/src/ocl/kernel.cl b/src/ocl/kernel.cl new file mode 100644 index 0000000..263e91c --- /dev/null +++ b/src/ocl/kernel.cl @@ -0,0 +1,514 @@ +typedef unsigned int sph_u32; + +#define SPH_C32(x) ((sph_u32)(x ## U)) +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +#define SPH_C64(x) ((sph_u64)(x ## UL)) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) +#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) + +/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ +/* + * Shabal implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +/* + * Part of this code was automatically generated (the part between + * the "BEGIN" and "END" markers). + */ + +#define sM 16 + +#define C32 SPH_C32 +#define T32 SPH_T32 + +#define O1 13 +#define O2 9 +#define O3 6 + +/* + * We copy the state into local variables, so that the compiler knows + * that it can optimize them at will. + */ + +/* BEGIN -- automatically generated code. 
*/ + +#define INPUT_BLOCK_ADD do { \ + B0 = T32(B0 + M0); \ + B1 = T32(B1 + M1); \ + B2 = T32(B2 + M2); \ + B3 = T32(B3 + M3); \ + B4 = T32(B4 + M4); \ + B5 = T32(B5 + M5); \ + B6 = T32(B6 + M6); \ + B7 = T32(B7 + M7); \ + B8 = T32(B8 + M8); \ + B9 = T32(B9 + M9); \ + BA = T32(BA + MA); \ + BB = T32(BB + MB); \ + BC = T32(BC + MC); \ + BD = T32(BD + MD); \ + BE = T32(BE + ME); \ + BF = T32(BF + MF); \ + } while (0) + +#define INPUT_BLOCK_SUB do { \ + C0 = T32(C0 - M0); \ + C1 = T32(C1 - M1); \ + C2 = T32(C2 - M2); \ + C3 = T32(C3 - M3); \ + C4 = T32(C4 - M4); \ + C5 = T32(C5 - M5); \ + C6 = T32(C6 - M6); \ + C7 = T32(C7 - M7); \ + C8 = T32(C8 - M8); \ + C9 = T32(C9 - M9); \ + CA = T32(CA - MA); \ + CB = T32(CB - MB); \ + CC = T32(CC - MC); \ + CD = T32(CD - MD); \ + CE = T32(CE - ME); \ + CF = T32(CF - MF); \ + } while (0) + +#define XOR_W do { \ + A00 ^= Wlow; \ + A01 ^= Whigh; \ + } while (0) + +#define SWAP(v1, v2) do { \ + sph_u32 tmp = (v1); \ + (v1) = (v2); \ + (v2) = tmp; \ + } while (0) + +#define SWAP_BC do { \ + SWAP(B0, C0); \ + SWAP(B1, C1); \ + SWAP(B2, C2); \ + SWAP(B3, C3); \ + SWAP(B4, C4); \ + SWAP(B5, C5); \ + SWAP(B6, C6); \ + SWAP(B7, C7); \ + SWAP(B8, C8); \ + SWAP(B9, C9); \ + SWAP(BA, CA); \ + SWAP(BB, CB); \ + SWAP(BC, CC); \ + SWAP(BD, CD); \ + SWAP(BE, CE); \ + SWAP(BF, CF); \ + } while (0) + +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ + xa0 = T32((xa0 \ + ^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \ + ^ xc) * 3U) \ + ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ + xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \ + } while (0) + +#define PERM_STEP_0 do { \ + PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_1 do { \ + PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_2 do { \ + PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A02, A01, B6, B3, BF, BC, 
C2, M6); \ + PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define APPLY_P do { \ + B0 = T32(B0 << 17) | (B0 >> 15); \ + B1 = T32(B1 << 17) | (B1 >> 15); \ + B2 = T32(B2 << 17) | (B2 >> 15); \ + B3 = T32(B3 << 17) | (B3 >> 15); \ + B4 = T32(B4 << 17) | (B4 >> 15); \ + B5 = T32(B5 << 17) | (B5 >> 15); \ + B6 = T32(B6 << 17) | (B6 >> 15); \ + B7 = T32(B7 << 17) | (B7 >> 15); \ + B8 = T32(B8 << 17) | (B8 >> 15); \ + B9 = T32(B9 << 17) | (B9 >> 15); \ + BA = T32(BA << 17) | (BA >> 15); \ + BB = T32(BB << 17) | (BB >> 15); \ + BC = T32(BC << 17) | (BC >> 15); \ + BD = T32(BD << 17) | (BD >> 15); \ + BE = T32(BE << 17) | (BE >> 15); \ + BF = T32(BF << 17) | (BF >> 15); \ + PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + A0B = T32(A0B + C6); \ + A0A = T32(A0A + C5); \ + A09 = T32(A09 + C4); \ + A08 = T32(A08 + C3); \ + A07 = T32(A07 + C2); \ + A06 = T32(A06 + C1); \ + A05 = T32(A05 + C0); \ + A04 = T32(A04 + CF); \ + A03 = T32(A03 + CE); \ + A02 = T32(A02 + CD); \ + A01 = T32(A01 + CC); \ + A00 = T32(A00 + CB); \ + A0B = T32(A0B + CA); \ + A0A = T32(A0A + C9); \ + A09 = T32(A09 + C8); \ + A08 = T32(A08 + C7); \ + A07 = T32(A07 + C6); \ + A06 = T32(A06 + C5); \ + A05 = T32(A05 + C4); \ + A04 = T32(A04 + C3); \ + A03 = T32(A03 + C2); \ + A02 = T32(A02 + C1); \ + A01 = T32(A01 + C0); \ + A00 = T32(A00 + CF); \ + A0B = T32(A0B + CE); \ + A0A = T32(A0A + CD); \ + A09 = T32(A09 + CC); \ + A08 = T32(A08 + CB); \ + A07 = T32(A07 + CA); \ + A06 = T32(A06 + C9); \ + A05 = T32(A05 + C8); \ + A04 = T32(A04 + C7); \ + A03 = T32(A03 + C6); \ + A02 = T32(A02 + C5); \ + A01 = T32(A01 + C4); \ + A00 = T32(A00 + C3); \ + } while (0) + +#define INCR_W do { \ + if ((Wlow = T32(Wlow + 1)) == 0) \ + Whigh = T32(Whigh + 1); \ + } while (0) + +__constant static const sph_u32 A_init_192[] = { + C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E), + C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465), + C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9) +}; + +__constant static const sph_u32 B_init_192[] = { + C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824), + C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7), + C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319), + C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C) +}; + +__constant static const sph_u32 C_init_192[] = { + C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B), + C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640), + C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3), + C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669) +}; + +__constant static const sph_u32 A_init_224[] = { + C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B), + C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F), + C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061) +}; + +__constant static const sph_u32 B_init_224[] = { + C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498), + C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5), + C32(0xFEEE44B6), 
C32(0x8A1E9573), C32(0x7B81111A), C32(0xCBC139F0), + C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C) +}; + +__constant static const sph_u32 C_init_224[] = { + C32(0xE4B573A1), C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD), + C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18), + C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2), + C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83) +}; + +__constant static const sph_u32 A_init_256[] = { + C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191), + C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C), + C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A) +}; + +__constant static const sph_u32 B_init_256[] = { + C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F), + C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002), + C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890), + C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5) +}; + +__constant static const sph_u32 C_init_256[] = { + C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55), + C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433), + C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F), + C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60) +}; + +__constant static const sph_u32 A_init_384[] = { + C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83), + C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF), + C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D) +}; + +__constant static const sph_u32 B_init_384[] = { + C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F), + C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641), + C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8), + C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36) +}; + +__constant static const sph_u32 C_init_384[] = { + C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399), + C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261), + C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C), + C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70) +}; + +__constant static const sph_u32 A_init_512[] = { + C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), + C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), + C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) +}; + +__constant static const sph_u32 B_init_512[] = { + C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), + C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), + C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), + C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) +}; + +__constant static const sph_u32 C_init_512[] = { + C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), + C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), + C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), + C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) +}; + +/* END -- automatically generated code. 
*/ + +#define NONCES_VECTOR 16 +#define NONCES_VECTOR_LOG2 4 +#define MESSAGE_CAP 64 +#define NUM_HASHES 8192 +#define HASH_SIZE_WORDS 8 +#define NONCE_SIZE_WORDS HASH_SIZE_WORDS * NUM_HASHES + +#define EndianSwap(n) (rotate(n & 0x00FF00FF, 24UL)|(rotate(n, 8UL) & 0x00FF00FF)) + +#define EndianSwap64(n) bitselect( \ + bitselect(rotate(n, 24UL), \ + rotate(n, 8UL), 0x000000FF000000FFUL), \ + bitselect(rotate(n, 56UL), \ + rotate(n, 40UL), 0x00FF000000FF0000UL), \ + 0xFFFF0000FFFF0000UL) + +#define Address(nonce,hash,word) ((nonce >> NONCES_VECTOR_LOG2) * NONCES_VECTOR * NONCE_SIZE_WORDS + (hash) * NONCES_VECTOR * HASH_SIZE_WORDS + word * NONCES_VECTOR + (nonce & (NONCES_VECTOR-1))) +//#define Address(nonce,hash,word) (nonce * NONCE_SIZE_WORDS + (hash) * HASH_SIZE_WORDS + word) + +/* Johnny's optimised nonce calculation kernel + * based on the implementation found in BRS + */ +__kernel void calculate_nonces(__global unsigned char* buffer, unsigned long startnonce, unsigned long numeric_id_be, int start, int end, unsigned long nonces) { + //if (gid==0) {printf("\n\nOCL 2 %lu\n\n",startnonce);} DEBUG + int gid = get_global_id(0); + + if (gid >= nonces) + return; + // number of shabal message round + int num; + // buffer for final hash + sph_u32 B8,B9,BA,BB,BC,BD,BE,BF; + // init + unsigned long nonce_be = EndianSwap64(startnonce + gid); + // run 8192 rounds + final round + for (int hash = NUM_HASHES - start; hash > -1 + NUM_HASHES - end; hash -= 1) { + // calculate number of shabal messages excl. final message + num = (NUM_HASHES - hash) >> 1; + if (hash != 0) { + num = (num > MESSAGE_CAP) ? MESSAGE_CAP : num; + } + + // init shabal + sph_u32 + A00 = A_init_256[0], A01 = A_init_256[1], A02 = A_init_256[2], A03 = A_init_256[3], + A04 = A_init_256[4], A05 = A_init_256[5], A06 = A_init_256[6], A07 = A_init_256[7], + A08 = A_init_256[8], A09 = A_init_256[9], A0A = A_init_256[10], A0B = A_init_256[11]; + sph_u32 + B0 = B_init_256[0], B1 = B_init_256[1], B2 = B_init_256[2], B3 = B_init_256[3], + B4 = B_init_256[4], B5 = B_init_256[5], B6 = B_init_256[6], B7 = B_init_256[7]; + B8 = B_init_256[8]; B9 = B_init_256[9]; BA = B_init_256[10]; BB = B_init_256[11]; + BC = B_init_256[12]; BD = B_init_256[13]; BE = B_init_256[14]; BF = B_init_256[15]; + sph_u32 + C0 = C_init_256[0], C1 = C_init_256[1], C2 = C_init_256[2], C3 = C_init_256[3], + C4 = C_init_256[4], C5 = C_init_256[5], C6 = C_init_256[6], C7 = C_init_256[7], + C8 = C_init_256[8], C9 = C_init_256[9], CA = C_init_256[10], CB = C_init_256[11], + CC = C_init_256[12], CD = C_init_256[13], CE = C_init_256[14], CF = C_init_256[15]; + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; + sph_u32 Wlow = 1, Whigh = 0; + + for (int i = 0; i < 2 * num; i+=2){ + M0 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 0)]; + M1 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 1)]; + M2 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 2)]; + M3 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 3)]; + M4 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 4)]; + M5 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 5)]; + M6 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 6)]; + M7 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 7)]; + M8 = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 0)]; + M9 = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 1)]; + MA = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 2)]; + MB = ((__global unsigned 
int*)buffer)[Address(gid, hash + i + 1, 3)]; + MC = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 4)]; + MD = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 5)]; + ME = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 6)]; + MF = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 7)]; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + INCR_W; + } + + // final message determination + if (num == MESSAGE_CAP) { + M0 = 0x80; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + } + else if((hash & 1) == 0) { + M0 = ((unsigned int*)&numeric_id_be)[0]; + M1 = ((unsigned int*)&numeric_id_be)[1]; + M2 = ((unsigned int*)&nonce_be)[0]; + M3 = ((unsigned int*)&nonce_be)[1]; + M4 = 0x80; + M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + } + else if((hash & 1) == 1) { + M0 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 0)]; + M1 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 1)]; + M2 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 2)]; + M3 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 3)]; + M4 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 4)]; + M5 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 5)]; + M6 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 6)]; + M7 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 7)]; + M8 = ((unsigned int*)&numeric_id_be)[0]; + M9 = ((unsigned int*)&numeric_id_be)[1]; + MA = ((unsigned int*)&nonce_be)[0]; + MB = ((unsigned int*)&nonce_be)[1]; + MC = 0x80; + MD = ME = MF = 0; + } + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + for (int i = 0; i < 3; i ++) { + SWAP_BC; + XOR_W; + APPLY_P; + } + + if (hash > 0){ + ((__global unsigned int*)buffer)[Address(gid, hash-1, 0)] = B8; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 1)] = B9; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 2)] = BA; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 3)] = BB; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 4)] = BC; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 5)] = BD; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 6)] = BE; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 7)] = BF; + } + } + + // final xor + if(end==8192){ + for (size_t i = 0; i < NUM_HASHES; i++){ + ((__global unsigned int*)buffer)[Address(gid, i, 0)] ^= B8; + ((__global unsigned int*)buffer)[Address(gid, i, 1)] ^= B9; + ((__global unsigned int*)buffer)[Address(gid, i, 2)] ^= BA; + ((__global unsigned int*)buffer)[Address(gid, i, 3)] ^= BB; + ((__global unsigned int*)buffer)[Address(gid, i, 4)] ^= BC; + ((__global unsigned int*)buffer)[Address(gid, i, 5)] ^= BD; + ((__global unsigned int*)buffer)[Address(gid, i, 6)] ^= BE; + ((__global unsigned int*)buffer)[Address(gid, i, 7)] ^= BF; + } + } +} \ No newline at end of file diff --git a/src/plotter.rs b/src/plotter.rs index 7ee23df..1f142bd 100644 --- a/src/plotter.rs +++ b/src/plotter.rs @@ -9,9 +9,12 @@ use self::pbr::{MultiBar, Units}; use self::raw_cpuid::CpuId; use chan; use core_affinity; -use hasher::create_hasher_task; +#[cfg(feature = "opencl")] +use ocl::gpu_get_info; +use scheduler::create_scheduler_thread; use std::cmp::{max, min}; use std::path::Path; +use std::process; use std::sync::{Arc, Mutex}; use std::thread; use stopwatch::Stopwatch; @@ -20,7 +23,7 @@ use utils::get_sector_size; use utils::preallocate; #[cfg(windows)] use utils::set_thread_ideal_processor; -use 
writer::{create_writer_task, read_resume_info, write_resume_info}; +use writer::{create_writer_thread, read_resume_info, write_resume_info}; const NONCE_SIZE: u64 = (2 << 17); const SCOOP_SIZE: u64 = 64; @@ -42,9 +45,12 @@ pub struct PlotterTask { pub output_path: String, pub mem: String, pub cpu_threads: u8, + pub gpus: Option>, pub direct_io: bool, pub async_io: bool, pub quiet: bool, + pub benchmark: bool, + pub zcb: bool, } pub struct Buffer { @@ -81,6 +87,11 @@ impl Plotter { if !task.quiet { println!("Engraver {} - PoC2 Plotter\n", crate_version!()); } + + if !task.quiet && task.benchmark { + println!("*BENCHMARK MODE*\n"); + } + if !task.quiet { println!( "CPU: {} [using {} of {} cores{}{}]", @@ -92,12 +103,32 @@ impl Plotter { ); } + #[cfg(not(feature = "opencl"))] + let gpu_mem_needed = 0u64; + #[cfg(feature = "opencl")] + let gpu_mem_needed = match &task.gpus { + Some(x) => gpu_get_info(&x, task.quiet), + None => 0, + }; + + #[cfg(feature = "opencl")] + let gpu_mem_needed = if task.zcb { + gpu_mem_needed + } else { + gpu_mem_needed / 2 + }; + // use all available disk space if nonce parameter has been omitted let free_disk_space = free_disk_space(&task.output_path); if task.nonces == 0 { task.nonces = free_disk_space / NONCE_SIZE; } + let gpu = match &task.gpus { + Some(_) => true, + None => false, + }; + // align number of nonces with sector size if direct i/o let mut rounded_nonces_to_sector_size = false; let mut nonces_per_sector = 1; @@ -128,9 +159,9 @@ impl Plotter { } // check available disk space - if free_disk_space < plotsize && !file.exists(){ + if free_disk_space < plotsize && !file.exists() && !task.benchmark { println!( - "Error: insufficient disk space, MiB_required={}, MiB_available={}", + "Error: insufficient disk space, MiB_required={:.2}, MiB_available={:.2}", plotsize as f64 / 1024.0 / 1024.0, free_disk_space as f64 / 1024.0 / 1024.0 ); @@ -139,17 +170,25 @@ impl Plotter { } // calculate memory usage - let mem = match calculate_mem_to_use(&task, &memory, nonces_per_sector){ + let mem = match calculate_mem_to_use(&task, &memory, nonces_per_sector, gpu, gpu_mem_needed) + { Ok(x) => x, - Err(_) => return + Err(_) => return, }; if !task.quiet { println!( - "RAM: Total={:.2} GiB, Free= {:.2} GiB, Usage= {:.2} GiB \n", + "RAM: Total={:.2} GiB, Free={:.2} GiB, Usage={:.2} GiB", memory.total as f64 / 1024.0 / 1024.0, memory.free as f64 / 1024.0 / 1024.0, - mem as f64 / 1024.0 / 1024.0 / 1024.0 + (mem + gpu_mem_needed) as f64 / 1024.0 / 1024.0 / 1024.0 + ); + + #[cfg(feature = "opencl")] + println!( + " HDDcache={:.2} GiB, GPUcache={:.2} GiB,\n", + mem as f64 / 1024.0 / 1024.0 / 1024.0, + gpu_mem_needed as f64 / 1024.0 / 1024.0 / 1024.0 ); println!("Numeric ID: {}", task.numeric_id); @@ -181,7 +220,6 @@ impl Plotter { println!("File is already completed."); println!("Shutting Down..."); return; - } } if !task.quiet { @@ -191,10 +229,10 @@ impl Plotter { if !task.quiet { print!("Fast file pre-allocation..."); } - - preallocate(&file, plotsize, task.direct_io); - write_resume_info(&file, 0u64); - + if !task.benchmark { + preallocate(&file, plotsize, task.direct_io); + write_resume_info(&file, 0u64); + } if !task.quiet { println!("OK"); } @@ -205,7 +243,7 @@ impl Plotter { println!("Starting plotting...\n"); } else { println!("Resuming plotting from nonce offset {}...\n", progress); - } + } } // determine buffer size @@ -261,14 +299,14 @@ impl Plotter { // hi bold! might make this optional in future releases.
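// The thread pinning enabled just below relies on the core_affinity crate
// that plotter.rs already imports: get_core_ids() enumerates the logical
// cores and set_for_current() binds the calling thread to one of them. A
// self-contained sketch (spawn_pinned is a hypothetical helper; the real
// code hands the core ids to the hasher and writer threads instead):
//
//     fn spawn_pinned() -> Vec<std::thread::JoinHandle<()>> {
//         let core_ids = core_affinity::get_core_ids().unwrap();
//         core_ids
//             .into_iter()
//             .map(|core_id| {
//                 std::thread::spawn(move || {
//                     core_affinity::set_for_current(core_id); // pin this thread
//                     // ... hashing or writing work runs here ...
//                 })
//             })
//             .collect()
//     }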
let thread_pinning = true; - let mut core_ids: Vec = Vec::new(); - - if thread_pinning { - core_ids = core_affinity::get_core_ids().unwrap(); - } + let core_ids = if thread_pinning { + core_affinity::get_core_ids().unwrap() + } else { + Vec::new() + }; let hasher = thread::spawn({ - create_hasher_task( + create_scheduler_thread( task.clone(), rayon::ThreadPoolBuilder::new() .num_threads(task.cpu_threads as usize) @@ -292,7 +330,7 @@ }); let writer = thread::spawn({ - create_writer_task( + create_writer_thread( task.clone(), progress, p2x, @@ -330,30 +368,49 @@ fn calculate_mem_to_use( task: &PlotterTask, memory: &sys_info::MemInfo, nonces_per_sector: u64, + gpu: bool, + gpu_mem_needed: u64, ) -> Result { let plotsize = task.nonces * NONCE_SIZE; let mut mem = match task.mem.parse::() { Ok(x) => x.size() as u64, - Err(_) => { println!( + Err(_) => { + println!( "Error: Can't parse memory limit parameter, input={}", task.mem, ); println!("\nPlease specify a number followed by a unit. If no unit is provided, bytes will be assumed."); - println!("Supported units: B, KiB, MiB, GiB, TiB, PiB, EiB, KB, MB, GB, TB, PB, EB"); - println!("Example: --mem 10GiB\n"); + println!("Supported units: B, KiB, MiB, GiB, TiB, PiB, EiB, KB, MB, GB, TB, PB, EB"); + println!("Example: --mem 10GiB\n"); println!("Shutting down..."); return Err("invalid unit"); } }; - + if gpu && mem > 0 && mem < gpu_mem_needed + nonces_per_sector * NONCE_SIZE { + println!("Error: Insufficient host memory for GPU plotting!"); + println!("Shutting down..."); + process::exit(0); + } + + if gpu && mem > 0 { + mem -= gpu_mem_needed; + } + if mem == 0 { mem = plotsize; } - mem = min(mem, plotsize); + mem = min(mem, plotsize + gpu_mem_needed); + + // opencl requires buffer to be a multiple of 16 (data coalescing magic) + let nonces_per_sector = if gpu { + max(16, nonces_per_sector) + } else { + nonces_per_sector + }; // don't exceed free memory and leave some elbow room 1-1000/1024 - mem = min(mem, memory.free * 1000); + mem = min(mem, memory.free * 1000 - gpu_mem_needed); // rounding single/double buffer let num_buffer = if task.async_io { 2 } else { 1 }; diff --git a/src/scheduler.rs b/src/scheduler.rs new file mode 100644 index 0000000..cbf3f9b --- /dev/null +++ b/src/scheduler.rs @@ -0,0 +1,232 @@ +extern crate pbr; +extern crate rayon; + +use chan; +use cpu_hasher::{hash_cpu, CpuTask, SafeCVoid}; +#[cfg(feature = "opencl")] +use gpu_hasher::{create_gpu_hasher_thread, GpuTask, SafePointer}; +use libc::{c_void, size_t}; +#[cfg(feature = "opencl")] +use ocl::gpu_init; +use plotter::{Buffer, PlotterTask}; +use std::cmp::min; +use std::sync::mpsc::channel; +use std::sync::Arc; +#[cfg(feature = "opencl")] +use std::thread; + +const CPU_TASK_SIZE: u64 = 64; +const NONCE_SIZE: u64 = (2 << 17); + +pub fn create_scheduler_thread( + task: Arc, + thread_pool: rayon::ThreadPool, + mut nonces_hashed: u64, + mut pb: Option>, + rx_empty_buffers: chan::Receiver, + tx_buffers_to_writer: chan::Sender, + simd_ext: String, +) -> impl FnOnce() { + move || { + // synchronisation channel for all hashing devices (CPU+GPU) + // message protocol: (hash_device_id: u8, message: u8, nonces processed: u64) + // hash_device_id: 0=CPU, 1=GPU0, 2=GPU1...
+ // message: 0 = data ready to write + // 1 = device ready to compute next hashing batch + // nonces_processed: nonces hashed / nonces written to host buffer + let (tx, rx) = channel(); + + // create gpu threads and channels + #[cfg(feature = "opencl")] + let gpu_contexts = match &task.gpus { + Some(x) => Some(gpu_init(&x, task.zcb)), + None => None, + }; + + #[cfg(feature = "opencl")] + let gpus = match gpu_contexts { + Some(x) => x, + None => Vec::new(), + }; + #[cfg(feature = "opencl")] + let mut gpu_threads = Vec::new(); + #[cfg(feature = "opencl")] + let mut gpu_channels = Vec::new(); + + #[cfg(feature = "opencl")] + for (i, gpu) in gpus.iter().enumerate() { + gpu_channels.push(chan::unbounded()); + gpu_threads.push(thread::spawn({ + create_gpu_hasher_thread( + (i + 1) as u8, + gpu.clone(), + tx.clone(), + gpu_channels.last().unwrap().1.clone(), + ) + })); + } + + for buffer in rx_empty_buffers { + let mut_bs = &buffer.get_buffer(); + let mut bs = mut_bs.lock().unwrap(); + let buffer_size = (*bs).len() as u64; + let nonces_to_hash = min(buffer_size / NONCE_SIZE, task.nonces - nonces_hashed); + + let mut requested = 0u64; + let mut processed = 0u64; + + // kick off first gpu and cpu runs + #[cfg(feature = "opencl")] + for (i, gpu) in gpus.iter().enumerate() { + // schedule next gpu task + let mut gpu = gpu.lock().unwrap(); + let task_size = min(gpu.worksize as u64, nonces_to_hash - requested); + if task_size > 0 { + gpu_channels[i].0.send(Some(GpuTask { + cache: SafePointer { + ptr: bs.as_mut_ptr(), + }, + cache_size: buffer_size / NONCE_SIZE, + chunk_offset: requested, + numeric_id: task.numeric_id, + local_startnonce: task.start_nonce + nonces_hashed + requested, + local_nonces: task_size, + })); + } + requested += task_size; + //println!("Debug: Device: {} started. {} nonces assigned.
Total requested: {}\n\n\n",i+1,task_size,requested); + } + + for _ in 0..task.cpu_threads { + let task_size = min(CPU_TASK_SIZE, nonces_to_hash - requested); + if task_size > 0 { + let task = hash_cpu( + tx.clone(), + CpuTask { + cache: SafeCVoid { + ptr: bs.as_ptr() as *mut c_void, + }, + cache_size: buffer_size / NONCE_SIZE as size_t, + chunk_offset: requested as size_t, + numeric_id: task.numeric_id, + local_startnonce: task.start_nonce + nonces_hashed + requested, + local_nonces: task_size, + }, + simd_ext.clone(), + ); + thread_pool.spawn(task); + } + requested += task_size; + } + + // control loop + let rx = ℞ + for msg in rx { + match msg.1 { + // process a request for work: provide a task or signal completion + 1 => { + let task_size = match msg.0 { + 0 => { + // schedule next cpu task + let task_size = min(CPU_TASK_SIZE, nonces_to_hash - requested); + if task_size > 0 { + let task = hash_cpu( + tx.clone(), + CpuTask { + cache: SafeCVoid { + ptr: bs.as_ptr() as *mut c_void, + }, + cache_size: buffer_size / NONCE_SIZE as size_t, + chunk_offset: requested as size_t, + numeric_id: task.numeric_id, + local_startnonce: task.start_nonce + + nonces_hashed + + requested, + local_nonces: task_size, + }, + simd_ext.clone(), + ); + thread_pool.spawn(task); + } + task_size + } + _ => { + // schedule next gpu task + #[cfg(feature = "opencl")] + let mut gpu = gpus[(msg.0 - 1) as usize].lock().unwrap(); + #[cfg(feature = "opencl")] + let task_size = + min(gpu.worksize as u64, nonces_to_hash - requested); + + // optimisation: leave some work for cpu in dual mode + #[cfg(feature = "opencl")] + let task_size = if task_size < gpu.worksize as u64 + && task.cpu_threads > 0 + && task_size > CPU_TASK_SIZE + { + task_size / 2 + } else { + task_size + }; + + #[cfg(not(feature = "opencl"))] + let task_size = 0; + + #[cfg(feature = "opencl")] + gpu_channels[(msg.0 - 1) as usize].0.send(Some(GpuTask { + cache: SafePointer { + ptr: bs.as_mut_ptr(), + }, + cache_size: buffer_size / NONCE_SIZE, + chunk_offset: requested, + numeric_id: task.numeric_id, + local_startnonce: task.start_nonce + nonces_hashed + requested, + local_nonces: task_size, + })); + task_size + } + }; + + requested += task_size; + //println!("Debug: Device: {} asked for work. {} nonces assigned. 
Total requested: {}\n\n\n",msg.0,task_size,requested); + } + // process work completed message + 0 => { + processed += msg.2; + match &mut pb { + Some(pb) => { + pb.add(msg.2 * NONCE_SIZE); + } + None => (), + } + } + _ => {} + } + if processed == nonces_to_hash { + break; + } + } + + nonces_hashed += nonces_to_hash; + + // queue buffer for writing + tx_buffers_to_writer.send(buffer); + + // thread end + if task.nonces == nonces_hashed { + match &mut pb { + Some(pb) => { + pb.finish_print("Hasher done."); + } + None => (), + } + // shutdown gpu threads + #[cfg(feature = "opencl")] + for gpu in &gpu_channels { + gpu.0.send(None); + } + break; + }; + } + } +} diff --git a/src/writer.rs b/src/writer.rs index ca7b717..cce8a58 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -12,7 +12,7 @@ const TASK_SIZE: u64 = 16384; const SCOOP_SIZE: u64 = 64; const NONCE_SIZE: u64 = 4096 * SCOOP_SIZE; -pub fn create_writer_task( +pub fn create_writer_thread( task: Arc, mut nonces_written: u64, mut pb: Option>, @@ -30,49 +30,50 @@ pub fn create_writer_task( "{}_{}_{}", task.numeric_id, task.start_nonce, task.nonces )); + if !task.benchmark { + let file = if task.direct_io { + open_using_direct_io(&filename) + } else { + open(&filename) + }; - let file = if task.direct_io { - open_using_direct_io(&filename) - } else { - open(&filename) - }; + let mut file = file.unwrap(); - let mut file = file.unwrap(); + for scoop in 0..4096 { + let mut seek_addr = scoop * task.nonces as u64 * SCOOP_SIZE; + seek_addr += nonces_written as u64 * SCOOP_SIZE; - for scoop in 0..4096 { - let mut seek_addr = scoop * task.nonces as u64 * SCOOP_SIZE; - seek_addr += nonces_written as u64 * SCOOP_SIZE; + file.seek(SeekFrom::Start(seek_addr)).unwrap(); - file.seek(SeekFrom::Start(seek_addr)).unwrap(); + let mut local_addr = scoop * buffer_size / NONCE_SIZE * SCOOP_SIZE; + for _ in 0..nonces_to_write / TASK_SIZE { + file.write_all( + &bs[local_addr as usize + ..(local_addr + TASK_SIZE * SCOOP_SIZE) as usize], + ).unwrap(); - let mut local_addr = scoop * buffer_size / NONCE_SIZE * SCOOP_SIZE; - for _ in 0..nonces_to_write / TASK_SIZE { - file.write_all( - &bs[local_addr as usize..(local_addr + TASK_SIZE * SCOOP_SIZE) as usize], - ).unwrap(); - - local_addr += TASK_SIZE * SCOOP_SIZE; - } + local_addr += TASK_SIZE * SCOOP_SIZE; + } - // write remainder - if nonces_to_write % TASK_SIZE > 0 { - file.write_all( - &bs[local_addr as usize - ..(local_addr + (nonces_to_write % TASK_SIZE) * SCOOP_SIZE) - as usize], - ).unwrap(); - } + // write remainder + if nonces_to_write % TASK_SIZE > 0 { + file.write_all( + &bs[local_addr as usize + ..(local_addr + (nonces_to_write % TASK_SIZE) * SCOOP_SIZE) + as usize], + ).unwrap(); + } - if (scoop + 1) % 128 == 0 { - match &mut pb { - Some(pb) => { - pb.add(nonces_to_write * SCOOP_SIZE * 128); + if (scoop + 1) % 128 == 0 { + match &mut pb { + Some(pb) => { + pb.add(nonces_to_write * SCOOP_SIZE * 128); + } + None => (), } - None => (), } } } - nonces_written += nonces_to_write; // thread end @@ -86,13 +87,15 @@ pub fn create_writer_task( break; } - write_resume_info(&filename, nonces_written); + if !task.benchmark { + write_resume_info(&filename, nonces_written); + } tx_empty_buffers.send(buffer); } } } -pub fn read_resume_info(file: &Path) -> Result { +pub fn read_resume_info(file: &Path) -> Result { let mut file = open_r(&file).unwrap(); file.seek(SeekFrom::End(-8)).unwrap();
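A note on the resume format that the hunk above only hints at: the progress marker lives in the last 8 bytes of the preallocated plot file, which is why read_resume_info seeks to SeekFrom::End(-8) before reading. A self-contained sketch of that scheme follows; the helper names and the little-endian encoding are assumptions for illustration, not necessarily the exact on-disk format:

    use std::fs::OpenOptions;
    use std::io::{Read, Seek, SeekFrom, Write};
    use std::path::Path;

    // Store the number of nonces already written in the file's final 8 bytes.
    fn write_marker(path: &Path, nonces_written: u64) -> std::io::Result<()> {
        let mut file = OpenOptions::new().write(true).open(path)?;
        file.seek(SeekFrom::End(-8))?;
        file.write_all(&nonces_written.to_le_bytes())
    }

    // Read the marker back to decide where plotting should resume.
    fn read_marker(path: &Path) -> std::io::Result<u64> {
        let mut file = OpenOptions::new().read(true).open(path)?;
        file.seek(SeekFrom::End(-8))?;
        let mut buf = [0u8; 8];
        file.read_exact(&mut buf)?;
        Ok(u64::from_le_bytes(buf))
    }

This matches plotter.rs seeding the marker with write_resume_info(&file, 0u64) immediately after preallocation, and the writer thread refreshing it after each flushed buffer.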