From 2eeb5278032f2410639c78a53b9860c841aa2db6 Mon Sep 17 00:00:00 2001 From: JohnnyFFM Date: Sat, 1 Dec 2018 08:13:39 +0100 Subject: [PATCH] GPU upgrade --- .gitignore | 4 + Cargo.lock | 2 +- Cargo.toml | 2 +- src/cpu_hasher.rs | 117 +++++++++ src/gpu_hasher.rs | 78 ++++++ src/main.rs | 81 ++++-- src/ocl.rs | 609 ++++++++++++++++++++++++++++++++++++++++++++++ src/ocl/kernel.cl | 514 ++++++++++++++++++++++++++++++++++++++ src/plotter.rs | 111 +++++++-- src/scheduler.rs | 232 ++++++++++++++++++ src/writer.rs | 73 +++--- 11 files changed, 1745 insertions(+), 78 deletions(-) create mode 100644 src/cpu_hasher.rs create mode 100644 src/gpu_hasher.rs create mode 100644 src/ocl.rs create mode 100644 src/ocl/kernel.cl create mode 100644 src/scheduler.rs diff --git a/.gitignore b/.gitignore index 89552ee..6567326 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ # Generated by Cargo # will have compiled files and executables /target/ +/.vs/ +/bin/ +/obj/ +/packages/ # These are backup files generated by rustfmt **/*.rs.bk diff --git a/Cargo.lock b/Cargo.lock index 97a2ecc..6b99bbc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,7 +169,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "engraver" -version = "2.0.5" +version = "2.2.0" dependencies = [ "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", "cfg-if 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/Cargo.toml b/Cargo.toml index f863f88..46ad75f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "engraver" -version = "2.0.5" +version = "2.2.0" license = "GPL-3.0" authors = ["PoC Consortium "] description = """ diff --git a/src/cpu_hasher.rs b/src/cpu_hasher.rs new file mode 100644 index 0000000..ecc2fbb --- /dev/null +++ b/src/cpu_hasher.rs @@ -0,0 +1,117 @@ +use libc::{c_void, size_t, uint64_t}; +use std::sync::mpsc::Sender; + +extern "C" { + pub fn noncegen( + cache: *mut c_void, + cache_size: size_t, + chunk_offset: size_t, + numeric_ID: uint64_t, + local_startnonce: uint64_t, + local_nonces: uint64_t, + ); + pub fn noncegen_sse( + cache: *mut c_void, + cache_size: size_t, + chunk_offset: size_t, + numeric_ID: uint64_t, + local_startnonce: uint64_t, + local_nonces: uint64_t, + ); + pub fn noncegen_avx( + cache: *mut c_void, + cache_size: size_t, + chunk_offset: size_t, + numeric_ID: uint64_t, + local_startnonce: uint64_t, + local_nonces: uint64_t, + ); + pub fn noncegen_avx2( + cache: *mut c_void, + cache_size: size_t, + chunk_offset: size_t, + numeric_ID: uint64_t, + local_startnonce: uint64_t, + local_nonces: uint64_t, + ); + pub fn noncegen_avx512( + cache: *mut c_void, + cache_size: size_t, + chunk_offset: size_t, + numeric_ID: uint64_t, + local_startnonce: uint64_t, + local_nonces: uint64_t, + ); +} +pub struct SafeCVoid { + pub ptr: *mut c_void, +} +unsafe impl Send for SafeCVoid {} + +pub struct CpuTask { + pub cache: SafeCVoid, + pub cache_size: size_t, + pub chunk_offset: size_t, + pub numeric_id: uint64_t, + pub local_startnonce: uint64_t, + pub local_nonces: uint64_t, +} + +pub fn hash_cpu( + tx: Sender<(u8, u8, u64)>, + hasher_task: CpuTask, + simd_ext: String, +) -> impl FnOnce() { + move || { + unsafe { + match &*simd_ext { + "AVX512F" => noncegen_avx512( + hasher_task.cache.ptr, + hasher_task.cache_size, + hasher_task.chunk_offset, + hasher_task.numeric_id, + hasher_task.local_startnonce, + hasher_task.local_nonces, + ), + "AVX2" => noncegen_avx2( + hasher_task.cache.ptr, + hasher_task.cache_size, + 
hasher_task.chunk_offset, + hasher_task.numeric_id, + hasher_task.local_startnonce, + hasher_task.local_nonces, + ), + "AVX" => noncegen_avx( + hasher_task.cache.ptr, + hasher_task.cache_size, + hasher_task.chunk_offset, + hasher_task.numeric_id, + hasher_task.local_startnonce, + hasher_task.local_nonces, + ), + "SSE2" => noncegen_sse( + hasher_task.cache.ptr, + hasher_task.cache_size, + hasher_task.chunk_offset, + hasher_task.numeric_id, + hasher_task.local_startnonce, + hasher_task.local_nonces, + ), + _ => noncegen( + hasher_task.cache.ptr, + hasher_task.cache_size, + hasher_task.chunk_offset, + hasher_task.numeric_id, + hasher_task.local_startnonce, + hasher_task.local_nonces, + ), + } + } + // report hashing done + tx.send((0u8, 1u8, 0)) + .expect("CPU task can't communicate with scheduler thread."); + // report data in hostmem + tx.send((0u8, 0u8, hasher_task.local_nonces)) + .expect("CPU task can't communicate with scheduler thread."); + } +} diff --git a/src/gpu_hasher.rs b/src/gpu_hasher.rs new file mode 100644 index 0000000..46e89b4 --- /dev/null +++ b/src/gpu_hasher.rs @@ -0,0 +1,78 @@ +use chan::Receiver; +use ocl::{gpu_hash, gpu_hash_and_transfer_to_host, gpu_transfer_to_host, GpuContext}; +use std::sync::mpsc::Sender; +use std::sync::{Arc, Mutex}; + +pub struct SafePointer { + pub ptr: *mut u8, +} +unsafe impl Send for SafePointer {} +unsafe impl Sync for SafePointer {} + +pub struct GpuTask { + pub cache: SafePointer, + pub cache_size: u64, + pub chunk_offset: u64, + pub numeric_id: u64, + pub local_startnonce: u64, + pub local_nonces: u64, +} + +pub fn create_gpu_hasher_thread( + gpu_id: u8, + gpu_context: Arc>, + tx: Sender<(u8, u8, u64)>, + rx_hasher_task: Receiver>, +) -> impl FnOnce() { + move || { + let mut first_run = true; + let mut buffer_id = 0u8; + let mut last_task = GpuTask { + cache: SafePointer { ptr: &mut 0u8 }, + cache_size: 0, + chunk_offset: 0, + numeric_id: 0, + local_startnonce: 0, + local_nonces: 0, + }; + for task in rx_hasher_task { + // check if new task or termination + match task { + // new task + Some(task) => { + // first run - just hash + if first_run { + if task.local_nonces != 0 { + first_run = false; + gpu_hash(&gpu_context, &task); + buffer_id = 1 - buffer_id; + last_task = task; + tx.send((gpu_id, 1u8, 0)) + .expect("GPU task can't communicate with scheduler thread."); + } + // last run - just transfer + } else if task.local_nonces == 0 { + gpu_transfer_to_host(&gpu_context, buffer_id, &last_task); + first_run = true; + buffer_id = 0; + tx.send((gpu_id, 0u8, last_task.local_nonces)) + .expect("GPU task can't communicate with scheduler thread."); + // normal run - hash and transfer async + } else { + gpu_hash_and_transfer_to_host(&gpu_context, buffer_id, &task, &last_task); + buffer_id = 1 - buffer_id; + tx.send((gpu_id, 0u8, last_task.local_nonces)) + .expect("GPU task can't communicate with scheduler thread."); + last_task = task; + tx.send((gpu_id, 1u8, 0)) + .expect("GPU task can't communicate with scheduler thread."); + } + } + // termination + None => { + break; + } + } + } + } +} diff --git a/src/main.rs b/src/main.rs index 471e59d..a33bc35 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,8 +9,13 @@ extern crate pbr; extern crate stopwatch; extern crate sys_info; -mod hasher; +mod cpu_hasher; +#[cfg(feature = "opencl")] +mod gpu_hasher; +#[cfg(feature = "opencl")] +mod ocl; mod plotter; +mod scheduler; mod utils; mod writer; @@ -19,13 +24,10 @@ use clap::AppSettings::{ArgRequiredElseHelp, DeriveDisplayOrder, VersionlessSubc use 
clap::ArgGroup; use clap::{App, Arg}; use plotter::{Plotter, PlotterTask}; +use std::cmp::min; use utils::set_low_prio; fn main() { - #[cfg(not(feature = "opencl"))] - let _opencl = false; - #[cfg(feature = "opencl")] - let opencl = true; let arg = App::new("Engraver") .version(crate_version!()) .author(crate_authors!()) @@ -60,6 +62,12 @@ fn main() { .long("quiet") .help("Runs engraver in non-verbose mode") .global(true), + ).arg( + Arg::with_name("benchmark") + .short("b") + .long("bench") + .help("Runs engraver in xPU benchmark mode") + .global(true), ) /* .subcommand( @@ -74,7 +82,7 @@ fn main() { .value_name("numeric_ID") .help("your numeric Burst ID") .takes_value(true) - .required(true), + .required_unless("ocl-devices"), ).arg( Arg::with_name("start nonce") .short("s") @@ -82,7 +90,7 @@ fn main() { .value_name("start_nonce") .help("where you want to start plotting") .takes_value(true) - .required(true), + .required_unless("ocl-devices"), ).arg( Arg::with_name("nonces") .short("n") @@ -90,7 +98,7 @@ fn main() { .value_name("nonces") .help("how many nonces you want to plot") .takes_value(true) - .required(true), + .required_unless("ocl-devices"), ).arg( Arg::with_name("path") .short("p") @@ -120,14 +128,13 @@ fn main() { .short("g") .long("gpu") .value_name("platform_id:device_id") - .help("*GPU(s) you want to use for plotting") + .help("GPU(s) you want to use for plotting (optional)") .multiple(true) .takes_value(true), ]).groups(&[#[cfg(feature = "opencl")] ArgGroup::with_name("processing") .args(&["cpu", "gpu"]) - .multiple(true) - .required(true)]) + .multiple(true)]) /* .arg( Arg::with_name("ssd buffer") @@ -168,14 +175,35 @@ )*/; + #[cfg(feature = "opencl")] + let arg = arg + .arg( + Arg::with_name("ocl-devices") + .short("o") + .long("opencl") + .help("Display OpenCL platforms and devices") + .global(true), + ).arg( + Arg::with_name("zero-copy") + .short("z") + .long("zcb") + .help("Enables zero copy buffers for shared mem (integrated) gpus") + .global(true), + ); let matches = &arg.get_matches(); if matches.is_present("low priority") { set_low_prio(); } + if matches.is_present("ocl-devices") { + #[cfg(feature = "opencl")] + ocl::platform_info(); + return; + } + // plotting - /* + /* subcommand if let Some(matches) = matches.subcommand_matches("plot") { */ let numeric_id = value_t!(matches, "numeric id", u64).unwrap_or_else(|e| e.exit()); @@ -189,8 +217,30 @@ fn main() { .unwrap() }); let mem = value_t!(matches, "memory", String).unwrap_or_else(|_| "0B".to_owned()); - let cpu_threads = - value_t!(matches, "cpu", u8).unwrap_or_else(|_| sys_info::cpu_num().unwrap() as u8); + let cpu_threads = value_t!(matches, "cpu", u8).unwrap_or(0u8); + + let gpus = if matches.occurrences_of("gpu") > 0 { + let gpu = values_t!(matches, "gpu", String); + Some(gpu.unwrap()) + } else { + None + }; + + // work out number of cpu threads to use + let cores = sys_info::cpu_num().unwrap() as u8; + let cpu_threads = if cpu_threads == 0 { + cores + } else { + min(cores, cpu_threads) + }; + + // special case: don't use cpu if only a gpu is defined + #[cfg(feature = "opencl")] + let cpu_threads = if matches.occurrences_of("gpu") > 0 && matches.occurrences_of("cpu") == 0 { + 0u8 + } else { + cpu_threads + }; let p = Plotter::new(); p.run(PlotterTask { @@ -200,8 +250,11 @@ fn main() { output_path, mem, cpu_threads, + gpus, direct_io: !matches.is_present("disable direct i/o"), async_io: !matches.is_present("disable async i/o"), quiet: matches.is_present("non-verbosity"), + benchmark:
matches.is_present("benchmark"), + zcb: matches.is_present("zero-copy"), }); } diff --git a/src/ocl.rs b/src/ocl.rs new file mode 100644 index 0000000..b38ce70 --- /dev/null +++ b/src/ocl.rs @@ -0,0 +1,609 @@ +extern crate ocl_core as core; +extern crate rayon; + +use self::core::{ + ArgVal, ContextProperties, DeviceInfo, Event, KernelWorkGroupInfo, PlatformInfo, Status, +}; +use gpu_hasher::GpuTask; +use ocl::rayon::prelude::*; +use std::cmp::min; +use std::ffi::CString; +use std::process; +use std::slice::{from_raw_parts, from_raw_parts_mut}; +use std::sync::{Arc, Mutex}; +use std::u64; + +static SRC: &'static str = include_str!("ocl/kernel.cl"); + +const NONCE_SIZE: u64 = (2 << 17); +const NUM_SCOOPS: u64 = 4096; +const GPU_HASHES_PER_RUN: usize = 32; +const MSHABAL512_VECTOR_SIZE: u64 = 16; +const SCOOP_SIZE: u64 = 64; + +// convert the info or error to a string for printing: +macro_rules! to_string { + ($expr:expr) => { + match $expr { + Ok(info) => info.to_string(), + Err(err) => match err.api_status() { + Some(Status::CL_KERNEL_ARG_INFO_NOT_AVAILABLE) => "Not available".into(), + _ => err.to_string(), + }, + } + }; +} + +#[allow(dead_code)] +pub struct GpuContext { + queue_a: core::CommandQueue, + queue_b: core::CommandQueue, + kernel: core::Kernel, + ldim1: [usize; 3], + gdim1: [usize; 3], + mapping: bool, + buffer_ptr_host: Option>, + buffer_host: Option, + buffer_gpu_a: core::Mem, + buffer_gpu_b: core::Mem, + pub worksize: usize, +} + +// Ohne Gummi im Bahnhofsviertel... das wird noch Konsequenzen haben +unsafe impl Sync for GpuContext {} + +impl GpuContext { + pub fn new( + gpu_platform: usize, + gpu_id: usize, + cores: usize, + nvidia: bool, + mapping: bool, + ) -> GpuContext { + let platform_ids = core::get_platform_ids().unwrap(); + let platform_id = platform_ids[gpu_platform]; + let device_ids = core::get_device_ids(&platform_id, None, None).unwrap(); + let device_id = device_ids[gpu_id]; + let context_properties = ContextProperties::new().platform(platform_id); + let context = + core::create_context(Some(&context_properties), &[device_id], None, None).unwrap(); + let src_cstring = CString::new(SRC).unwrap(); + let program = core::create_program_with_source(&context, &[src_cstring]).unwrap(); + core::build_program( + &program, + None::<&[()]>, + &CString::new("").unwrap(), + None, + None, + ).unwrap(); + let queue_a = core::create_command_queue(&context, &device_id, None).unwrap(); + let queue_b = core::create_command_queue(&context, &device_id, None).unwrap(); + let kernel = core::create_kernel(&program, "calculate_nonces").unwrap(); + let kernel_workgroup_size = get_kernel_work_group_size(&kernel, device_id); + let workgroup_count = cores; + let worksize = kernel_workgroup_size * workgroup_count; + let gdim1 = [worksize, 1, 1]; + let ldim1 = [kernel_workgroup_size, 1, 1]; + + // create buffers + // mapping = zero copy buffers, no mapping = pinned memory for fast DMA. 
+ if mapping { + let buffer_gpu_a = unsafe { + core::create_buffer::<_, u8>( + &context, + core::MEM_READ_WRITE | core::MEM_ALLOC_HOST_PTR, + (NONCE_SIZE as usize) * worksize, + None, + ).unwrap() + }; + let buffer_gpu_b = unsafe { + core::create_buffer::<_, u8>( + &context, + core::MEM_READ_WRITE | core::MEM_ALLOC_HOST_PTR, + (NONCE_SIZE as usize) * worksize, + None, + ).unwrap() + }; + GpuContext { + queue_a, + queue_b, + kernel, + ldim1, + gdim1, + mapping, + buffer_gpu_a, + buffer_gpu_b, + buffer_ptr_host: None, + buffer_host: None, + worksize, + } + } else { + let buffer_host = unsafe { + core::create_buffer::<_, u8>( + &context, + core::MEM_READ_WRITE | core::MEM_ALLOC_HOST_PTR, + (NONCE_SIZE as usize) * worksize, + None, + ).unwrap() + }; + let buffer_ptr_host = unsafe { + Some( + core::enqueue_map_buffer::( + &queue_b, + &buffer_host, + true, + core::MAP_READ, + 0, + worksize * NONCE_SIZE as usize, + None::, + None::<&mut Event>, + ).unwrap(), + ) + }; + let buffer_gpu_a = if nvidia { + buffer_host.clone() + } else { + unsafe { + core::create_buffer::<_, u8>( + &context, + core::MEM_READ_WRITE, + (NONCE_SIZE as usize) * worksize, + None, + ).unwrap() + } + }; + let buffer_gpu_b = unsafe { + core::create_buffer::<_, u8>( + &context, + core::MEM_READ_WRITE, + (NONCE_SIZE as usize) * worksize, + None, + ).unwrap() + }; + + let buffer_host = if nvidia { None } else { Some(buffer_host) }; + GpuContext { + queue_a, + queue_b, + kernel, + ldim1, + gdim1, + mapping, + buffer_gpu_a, + buffer_gpu_b, + buffer_ptr_host, + buffer_host, + worksize, + } + } + } +} + +pub fn platform_info() { + let platform_ids = core::get_platform_ids().unwrap(); + for (i, platform_id) in platform_ids.iter().enumerate() { + println!( + "OCL: platform {}, {} - {}", + i, + to_string!(core::get_platform_info(&platform_id, PlatformInfo::Name)), + to_string!(core::get_platform_info(&platform_id, PlatformInfo::Version)) + ); + let device_ids = core::get_device_ids(&platform_id, None, None).unwrap(); + for (j, device_id) in device_ids.iter().enumerate() { + println!( + "OCL: device {}, {} - {}", + j, + to_string!(core::get_device_info(device_id, DeviceInfo::Vendor)), + to_string!(core::get_device_info(device_id, DeviceInfo::Name)) + ); + } + } +} + +pub fn gpu_get_info(gpus: &[String], quiet: bool) -> u64 { + let mut total_mem_needed = 0u64; + for gpu in gpus.iter() { + let gpu = gpu.split(':').collect::>(); + let platform_id = gpu[0].parse::().unwrap(); + let gpu_id = gpu[1].parse::().unwrap(); + let gpu_cores = gpu[2].parse::().unwrap(); + + let platform_ids = core::get_platform_ids().unwrap(); + if platform_id >= platform_ids.len() { + println!("Error: Selected OpenCL platform doesn't exist."); + println!("Shutting down..."); + process::exit(0); + } + let platform = platform_ids[platform_id]; + let device_ids = core::get_device_ids(&platform, None, None).unwrap(); + if gpu_id >= device_ids.len() { + println!("Error: Selected OpenCL device doesn't exist"); + println!("Shutting down..."); + process::exit(0); + } + let device = device_ids[gpu_id]; + let max_compute_units = + match core::get_device_info(&device, DeviceInfo::MaxComputeUnits).unwrap() { + core::DeviceInfoResult::MaxComputeUnits(mcu) => mcu, + _ => panic!("Unexpected error. Can't obtain number of GPU cores."), + }; + let mem = match core::get_device_info(&device, DeviceInfo::GlobalMemSize).unwrap() { + core::DeviceInfoResult::GlobalMemSize(gms) => gms, + _ => panic!("Unexpected error. 
Can't obtain GPU memory size."), + }; + + // get work_group_size for kernel + let context_properties = ContextProperties::new().platform(platform); + let context = + core::create_context(Some(&context_properties), &[device], None, None).unwrap(); + let src_cstring = CString::new(SRC).unwrap(); + let program = core::create_program_with_source(&context, &[src_cstring]).unwrap(); + core::build_program( + &program, + None::<&[()]>, + &CString::new("").unwrap(), + None, + None, + ).unwrap(); + let kernel = core::create_kernel(&program, "calculate_nonces").unwrap(); + let kernel_workgroup_size = get_kernel_work_group_size(&kernel, device); + + let gpu_cores = if gpu_cores == 0 { + max_compute_units as usize + } else { + min(gpu_cores, max_compute_units as usize) + }; + let mem_needed = 2 * gpu_cores * kernel_workgroup_size * 256 * 1024; + + if mem_needed > mem as usize { + println!("Error: Not enough GPU-memory. Please reduce number of cores."); + println!("Shutting down..."); + process::exit(0); + } + + if !quiet { + println!( + "GPU: {} - {} [using {} of {} cores]", + to_string!(core::get_device_info(&device, DeviceInfo::Vendor)), + to_string!(core::get_device_info(&device, DeviceInfo::Name)), + gpu_cores, + max_compute_units + ); + } + if !quiet { + println!( + " GPU-RAM: Total={:.2} MiB, Usage={:.2} MiB", + mem / 1024 / 1024, + mem_needed / 1024 / 1024, + ); + } + total_mem_needed += mem_needed as u64; + } + total_mem_needed +} + +pub fn gpu_init(gpus: &[String], zcb: bool) -> Vec>> { + let mut result = Vec::new(); + for gpu in gpus.iter() { + let gpu = gpu.split(':').collect::>(); + let platform_id = gpu[0].parse::().unwrap(); + let gpu_id = gpu[1].parse::().unwrap(); + let gpu_cores = gpu[2].parse::().unwrap(); + let platform_ids = core::get_platform_ids().unwrap(); + if platform_id >= platform_ids.len() { + println!("Error: Selected OpenCL platform doesn't exist."); + println!("Shutting down..."); + process::exit(0); + } + let platform = platform_ids[platform_id]; + let device_ids = core::get_device_ids(&platform, None, None).unwrap(); + if gpu_id >= device_ids.len() { + println!("Error: Selected OpenCL device doesn't exist"); + println!("Shutting down..."); + process::exit(0); + } + let device = device_ids[gpu_id]; + let max_compute_units = + match core::get_device_info(&device, DeviceInfo::MaxComputeUnits).unwrap() { + core::DeviceInfoResult::MaxComputeUnits(mcu) => mcu, + _ => panic!("Unexpected error. 
Can't obtain number of GPU cores."), + }; + let vendor = to_string!(core::get_device_info(&device, DeviceInfo::Vendor)).to_uppercase(); + let nvidia = vendor.contains("NVIDIA"); + let gpu_cores = if gpu_cores == 0 { + max_compute_units as usize + } else { + min(gpu_cores, max_compute_units as usize) + }; + result.push(Arc::new(Mutex::new(GpuContext::new( + platform_id, + gpu_id, + gpu_cores, + nvidia, + zcb, + )))); + } + result +} + +fn get_kernel_work_group_size(x: &core::Kernel, y: core::DeviceId) -> usize { + match core::get_kernel_work_group_info(x, y, KernelWorkGroupInfo::WorkGroupSize).unwrap() { + core::KernelWorkGroupInfoResult::WorkGroupSize(kws) => kws, + _ => panic!("Unexpected error"), + } +} + +pub fn gpu_hash(gpu_context: &Arc>, task: &GpuTask) { + let numeric_id_be: u64 = task.numeric_id.to_be(); + + let mut start; + let mut end; + let gpu_context = gpu_context.lock().unwrap(); + + core::set_kernel_arg( + &gpu_context.kernel, + 0, + ArgVal::mem(&gpu_context.buffer_gpu_a), + ).unwrap(); + core::set_kernel_arg( + &gpu_context.kernel, + 1, + ArgVal::primitive(&task.local_startnonce), + ).unwrap(); + core::set_kernel_arg( + &gpu_context.kernel, + 5, + ArgVal::primitive(&task.local_nonces), + ).unwrap(); + core::set_kernel_arg(&gpu_context.kernel, 2, ArgVal::primitive(&numeric_id_be)).unwrap(); + + for i in (0..8192).step_by(GPU_HASHES_PER_RUN) { + if i + GPU_HASHES_PER_RUN < 8192 { + start = i; + end = i + GPU_HASHES_PER_RUN - 1; + } else { + start = i; + end = i + GPU_HASHES_PER_RUN; + } + + core::set_kernel_arg(&gpu_context.kernel, 3, ArgVal::primitive(&(start as i32))).unwrap(); + core::set_kernel_arg(&gpu_context.kernel, 4, ArgVal::primitive(&(end as i32))).unwrap(); + + unsafe { + core::enqueue_kernel( + &gpu_context.queue_a, + &gpu_context.kernel, + 1, + None, + &gpu_context.gdim1, + Some(gpu_context.ldim1), + None::, + None::<&mut Event>, + ).unwrap(); + } + } + core::finish(&gpu_context.queue_a).unwrap(); +} + +pub fn gpu_transfer_to_host( + gpu_context: &Arc>, + buffer_id: u8, + transfer_task: &GpuTask, +) { + let mut gpu_context = gpu_context.lock().unwrap(); + + // get mem mapping + let map = if gpu_context.mapping { + Some(mem_map_gpu_to_host(buffer_id, &gpu_context)) + } else { + None + }; + + let buffer = if gpu_context.mapping { + // map to host (zero copy buffer) + map.as_ref().unwrap().as_ptr() + } else { + // get pointer + let ptr = gpu_context.buffer_ptr_host.as_mut().unwrap().as_mut_ptr(); + // copy to host + let slice = unsafe { from_raw_parts_mut(ptr, gpu_context.worksize * NONCE_SIZE as usize) }; + mem_transfer_gpu_to_host(buffer_id, &gpu_context, slice); + core::finish(&gpu_context.queue_b).unwrap(); + ptr + }; + unpack_shuffle_scatter(buffer, &gpu_context, &transfer_task); + if gpu_context.mapping { + mem_unmap_gpu_to_host(buffer_id, &gpu_context, map); + core::finish(&gpu_context.queue_a).unwrap(); + } +} + +pub fn gpu_hash_and_transfer_to_host( + gpu_context: &Arc>, + buffer_id: u8, + hasher_task: &GpuTask, + transfer_task: &GpuTask, +) { + let mut gpu_context = gpu_context.lock().unwrap(); + + let map = if gpu_context.mapping { + Some(mem_map_gpu_to_host(buffer_id, &gpu_context)) + } else { + None + }; + + let buffer = if gpu_context.mapping { + // map to host (zero copy buffer) + map.as_ref().unwrap().as_ptr() + } else { + // get pointer + let ptr = gpu_context.buffer_ptr_host.as_mut().unwrap().as_mut_ptr(); + // copy to host + let slice = unsafe { from_raw_parts_mut(ptr, gpu_context.worksize * NONCE_SIZE as usize) }; + 
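// The hash-and-transfer path below overlaps work on the two command queues:
// queue_a computes the current batch into one device buffer while queue_b
// drains the previous batch from the other. gpu_hasher.rs drives this by
// flipping buffer_id every round. A minimal sketch of that ping-pong pattern
// (the stub comments stand in for the real enqueue calls):
//
//     fn ping_pong(batches: &[u64]) {
//         let mut buffer_id = 0u8;
//         let mut last: Option<u64> = None;
//         for &batch in batches {
//             // enqueue hashing of `batch` into buffer `buffer_id` (queue_a)...
//             // ...while reading `last` back from buffer `1 - buffer_id` (queue_b)
//             last = Some(batch);
//             buffer_id = 1 - buffer_id; // swap roles for the next round
//         }
//         // "last run - just transfer": drain `last`, no new hashing
//     }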
mem_transfer_gpu_to_host(buffer_id, &gpu_context, slice); + ptr + }; + + let numeric_id_be: u64 = hasher_task.numeric_id.to_be(); + + let mut start; + let mut end; + + core::set_kernel_arg( + &gpu_context.kernel, + 0, + ArgVal::mem(if buffer_id == 0 { + &gpu_context.buffer_gpu_a + } else { + &gpu_context.buffer_gpu_b + }), + ).unwrap(); + core::set_kernel_arg( + &gpu_context.kernel, + 1, + ArgVal::primitive(&hasher_task.local_startnonce), + ).unwrap(); + core::set_kernel_arg( + &gpu_context.kernel, + 5, + ArgVal::primitive(&hasher_task.local_nonces), + ).unwrap(); + core::set_kernel_arg(&gpu_context.kernel, 2, ArgVal::primitive(&numeric_id_be)).unwrap(); + + for i in (0..8192).step_by(GPU_HASHES_PER_RUN) { + if i + GPU_HASHES_PER_RUN < 8192 { + start = i; + end = i + GPU_HASHES_PER_RUN - 1; + } else { + start = i; + end = i + GPU_HASHES_PER_RUN; + } + core::set_kernel_arg(&gpu_context.kernel, 3, ArgVal::primitive(&(start as i32))).unwrap(); + core::set_kernel_arg(&gpu_context.kernel, 4, ArgVal::primitive(&(end as i32))).unwrap(); + unsafe { + core::enqueue_kernel( + &gpu_context.queue_a, + &gpu_context.kernel, + 1, + None, + &gpu_context.gdim1, + Some(gpu_context.ldim1), + None::, + None::<&mut Event>, + ).unwrap(); + } + } + core::finish(&gpu_context.queue_b).unwrap(); + unpack_shuffle_scatter(buffer, &gpu_context, &transfer_task); + if gpu_context.mapping { + mem_unmap_gpu_to_host(buffer_id, &gpu_context, map); + } + core::finish(&gpu_context.queue_a).unwrap(); +} + +fn mem_map_gpu_to_host(buffer_id: u8, gpu_context: &GpuContext) -> core::MemMap { + unsafe { + if buffer_id == 1 { + core::enqueue_map_buffer::( + &gpu_context.queue_b, + &gpu_context.buffer_gpu_a, + true, + core::MAP_READ, + 0, + gpu_context.gdim1[0] * NONCE_SIZE as usize, + None::, + None::<&mut Event>, + ).unwrap() + } else { + core::enqueue_map_buffer::( + &gpu_context.queue_b, + &gpu_context.buffer_gpu_b, + true, + core::MAP_READ, + 0, + gpu_context.gdim1[0] * NONCE_SIZE as usize, + None::, + None::<&mut Event>, + ).unwrap() + } + } +} + +fn mem_unmap_gpu_to_host(buffer_id: u8, gpu_context: &GpuContext, map: Option>) { + // map to host (zero copy buffer) + if buffer_id == 1 { + core::enqueue_unmap_mem_object( + &gpu_context.queue_a, + &gpu_context.buffer_gpu_a, + &map.unwrap(), + None::, + None::<&mut Event>, + ).unwrap() + } else { + core::enqueue_unmap_mem_object( + &gpu_context.queue_a, + &gpu_context.buffer_gpu_b, + &map.unwrap(), + None::, + None::<&mut Event>, + ).unwrap() + }; +} + +fn mem_transfer_gpu_to_host(buffer_id: u8, gpu_context: &GpuContext, slice: &mut [u8]) { + unsafe { + if buffer_id == 1 { + core::enqueue_read_buffer( + &gpu_context.queue_b, + &gpu_context.buffer_gpu_a, + false, + 0, + slice, + None::, + None::<&mut Event>, + ).unwrap(); + } else { + core::enqueue_read_buffer( + &gpu_context.queue_b, + &gpu_context.buffer_gpu_b, + false, + 0, + slice, + None::, + None::<&mut Event>, + ).unwrap(); + } + } +} + +// simd shabal words unpack + POC Shuffle + scatter nonces into optimised cache +fn unpack_shuffle_scatter(buffer: *const u8, gpu_context: &GpuContext, transfer_task: &GpuTask) { + unsafe { + let buffer = from_raw_parts(buffer, gpu_context.worksize * NONCE_SIZE as usize); + let iter: Vec = (0..transfer_task.local_nonces).step_by(16).collect(); + iter.par_iter().for_each(|n| { + // get global buffer + let data = from_raw_parts_mut( + transfer_task.cache.ptr, + NONCE_SIZE as usize * transfer_task.cache_size as usize, + ); + for i in 0..(NUM_SCOOPS * 2) { + for j in (0..32).step_by(4) { + for k in 
0..MSHABAL512_VECTOR_SIZE { + let data_offset = (((i & 1) * (4095 - (i >> 1)) + ((i + 1) & 1) * (i >> 1)) + * SCOOP_SIZE + * transfer_task.cache_size + + (*n + k + transfer_task.chunk_offset) * SCOOP_SIZE + + (i & 1) * 32 + + j) as usize; + let buffer_offset = (*n * NONCE_SIZE + + (i * 32 + j) * MSHABAL512_VECTOR_SIZE + + k * 4) as usize; + data[data_offset..(data_offset + 4)] + .clone_from_slice(&buffer[buffer_offset..(buffer_offset + 4)]); + } + } + } + }) + } +} diff --git a/src/ocl/kernel.cl b/src/ocl/kernel.cl new file mode 100644 index 0000000..263e91c --- /dev/null +++ b/src/ocl/kernel.cl @@ -0,0 +1,514 @@ +typedef unsigned int sph_u32; + +#define SPH_C32(x) ((sph_u32)(x ## U)) +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +#define SPH_C64(x) ((sph_u64)(x ## UL)) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) +#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) + +/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ +/* + * Shabal implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +/* + * Part of this code was automatically generated (the part between + * the "BEGIN" and "END" markers). + */ + +#define sM 16 + +#define C32 SPH_C32 +#define T32 SPH_T32 + +#define O1 13 +#define O2 9 +#define O3 6 + +/* + * We copy the state into local variables, so that the compiler knows + * that it can optimize them at will. + */ + +/* BEGIN -- automatically generated code. 
*/ + +#define INPUT_BLOCK_ADD do { \ + B0 = T32(B0 + M0); \ + B1 = T32(B1 + M1); \ + B2 = T32(B2 + M2); \ + B3 = T32(B3 + M3); \ + B4 = T32(B4 + M4); \ + B5 = T32(B5 + M5); \ + B6 = T32(B6 + M6); \ + B7 = T32(B7 + M7); \ + B8 = T32(B8 + M8); \ + B9 = T32(B9 + M9); \ + BA = T32(BA + MA); \ + BB = T32(BB + MB); \ + BC = T32(BC + MC); \ + BD = T32(BD + MD); \ + BE = T32(BE + ME); \ + BF = T32(BF + MF); \ + } while (0) + +#define INPUT_BLOCK_SUB do { \ + C0 = T32(C0 - M0); \ + C1 = T32(C1 - M1); \ + C2 = T32(C2 - M2); \ + C3 = T32(C3 - M3); \ + C4 = T32(C4 - M4); \ + C5 = T32(C5 - M5); \ + C6 = T32(C6 - M6); \ + C7 = T32(C7 - M7); \ + C8 = T32(C8 - M8); \ + C9 = T32(C9 - M9); \ + CA = T32(CA - MA); \ + CB = T32(CB - MB); \ + CC = T32(CC - MC); \ + CD = T32(CD - MD); \ + CE = T32(CE - ME); \ + CF = T32(CF - MF); \ + } while (0) + +#define XOR_W do { \ + A00 ^= Wlow; \ + A01 ^= Whigh; \ + } while (0) + +#define SWAP(v1, v2) do { \ + sph_u32 tmp = (v1); \ + (v1) = (v2); \ + (v2) = tmp; \ + } while (0) + +#define SWAP_BC do { \ + SWAP(B0, C0); \ + SWAP(B1, C1); \ + SWAP(B2, C2); \ + SWAP(B3, C3); \ + SWAP(B4, C4); \ + SWAP(B5, C5); \ + SWAP(B6, C6); \ + SWAP(B7, C7); \ + SWAP(B8, C8); \ + SWAP(B9, C9); \ + SWAP(BA, CA); \ + SWAP(BB, CB); \ + SWAP(BC, CC); \ + SWAP(BD, CD); \ + SWAP(BE, CE); \ + SWAP(BF, CF); \ + } while (0) + +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ + xa0 = T32((xa0 \ + ^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \ + ^ xc) * 3U) \ + ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ + xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \ + } while (0) + +#define PERM_STEP_0 do { \ + PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_1 do { \ + PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_2 do { \ + PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A02, A01, B6, B3, BF, BC, 
C2, M6); \ + PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define APPLY_P do { \ + B0 = T32(B0 << 17) | (B0 >> 15); \ + B1 = T32(B1 << 17) | (B1 >> 15); \ + B2 = T32(B2 << 17) | (B2 >> 15); \ + B3 = T32(B3 << 17) | (B3 >> 15); \ + B4 = T32(B4 << 17) | (B4 >> 15); \ + B5 = T32(B5 << 17) | (B5 >> 15); \ + B6 = T32(B6 << 17) | (B6 >> 15); \ + B7 = T32(B7 << 17) | (B7 >> 15); \ + B8 = T32(B8 << 17) | (B8 >> 15); \ + B9 = T32(B9 << 17) | (B9 >> 15); \ + BA = T32(BA << 17) | (BA >> 15); \ + BB = T32(BB << 17) | (BB >> 15); \ + BC = T32(BC << 17) | (BC >> 15); \ + BD = T32(BD << 17) | (BD >> 15); \ + BE = T32(BE << 17) | (BE >> 15); \ + BF = T32(BF << 17) | (BF >> 15); \ + PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + A0B = T32(A0B + C6); \ + A0A = T32(A0A + C5); \ + A09 = T32(A09 + C4); \ + A08 = T32(A08 + C3); \ + A07 = T32(A07 + C2); \ + A06 = T32(A06 + C1); \ + A05 = T32(A05 + C0); \ + A04 = T32(A04 + CF); \ + A03 = T32(A03 + CE); \ + A02 = T32(A02 + CD); \ + A01 = T32(A01 + CC); \ + A00 = T32(A00 + CB); \ + A0B = T32(A0B + CA); \ + A0A = T32(A0A + C9); \ + A09 = T32(A09 + C8); \ + A08 = T32(A08 + C7); \ + A07 = T32(A07 + C6); \ + A06 = T32(A06 + C5); \ + A05 = T32(A05 + C4); \ + A04 = T32(A04 + C3); \ + A03 = T32(A03 + C2); \ + A02 = T32(A02 + C1); \ + A01 = T32(A01 + C0); \ + A00 = T32(A00 + CF); \ + A0B = T32(A0B + CE); \ + A0A = T32(A0A + CD); \ + A09 = T32(A09 + CC); \ + A08 = T32(A08 + CB); \ + A07 = T32(A07 + CA); \ + A06 = T32(A06 + C9); \ + A05 = T32(A05 + C8); \ + A04 = T32(A04 + C7); \ + A03 = T32(A03 + C6); \ + A02 = T32(A02 + C5); \ + A01 = T32(A01 + C4); \ + A00 = T32(A00 + C3); \ + } while (0) + +#define INCR_W do { \ + if ((Wlow = T32(Wlow + 1)) == 0) \ + Whigh = T32(Whigh + 1); \ + } while (0) + +__constant static const sph_u32 A_init_192[] = { + C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E), + C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465), + C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9) +}; + +__constant static const sph_u32 B_init_192[] = { + C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824), + C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7), + C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319), + C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C) +}; + +__constant static const sph_u32 C_init_192[] = { + C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B), + C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640), + C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3), + C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669) +}; + +__constant static const sph_u32 A_init_224[] = { + C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B), + C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F), + C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061) +}; + +__constant static const sph_u32 B_init_224[] = { + C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498), + C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5), + C32(0xFEEE44B6), 
C32(0x8A1E9573), C32(0x7B81111A), C32(0xCBC139F0), + C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C) +}; + +__constant static const sph_u32 C_init_224[] = { + C32(0xE4B573A1), C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD), + C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18), + C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2), + C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83) +}; + +__constant static const sph_u32 A_init_256[] = { + C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191), + C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C), + C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A) +}; + +__constant static const sph_u32 B_init_256[] = { + C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F), + C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002), + C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890), + C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5) +}; + +__constant static const sph_u32 C_init_256[] = { + C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55), + C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433), + C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F), + C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60) +}; + +__constant static const sph_u32 A_init_384[] = { + C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83), + C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF), + C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D) +}; + +__constant static const sph_u32 B_init_384[] = { + C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F), + C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641), + C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8), + C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36) +}; + +__constant static const sph_u32 C_init_384[] = { + C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399), + C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261), + C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C), + C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70) +}; + +__constant static const sph_u32 A_init_512[] = { + C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), + C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), + C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) +}; + +__constant static const sph_u32 B_init_512[] = { + C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), + C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), + C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), + C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) +}; + +__constant static const sph_u32 C_init_512[] = { + C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), + C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), + C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), + C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) +}; + +/* END -- automatically generated code. 
*/ + +#define NONCES_VECTOR 16 +#define NONCES_VECTOR_LOG2 4 +#define MESSAGE_CAP 64 +#define NUM_HASHES 8192 +#define HASH_SIZE_WORDS 8 +#define NONCE_SIZE_WORDS HASH_SIZE_WORDS * NUM_HASHES + +#define EndianSwap(n) (rotate(n & 0x00FF00FF, 24UL)|(rotate(n, 8UL) & 0x00FF00FF)) + +#define EndianSwap64(n) bitselect( \ + bitselect(rotate(n, 24UL), \ + rotate(n, 8UL), 0x000000FF000000FFUL), \ + bitselect(rotate(n, 56UL), \ + rotate(n, 40UL), 0x00FF000000FF0000UL), \ + 0xFFFF0000FFFF0000UL) + +#define Address(nonce,hash,word) ((nonce >> NONCES_VECTOR_LOG2) * NONCES_VECTOR * NONCE_SIZE_WORDS + (hash) * NONCES_VECTOR * HASH_SIZE_WORDS + word * NONCES_VECTOR + (nonce & (NONCES_VECTOR-1))) +//#define Address(nonce,hash,word) (nonce * NONCE_SIZE_WORDS + (hash) * HASH_SIZE_WORDS + word) + +/* Johnny's optimised nonce calculation kernel + * based on the implementation found in BRS + */ +__kernel void calculate_nonces(__global unsigned char* buffer, unsigned long startnonce, unsigned long numeric_id_be, int start, int end, unsigned long nonces) { + //if (gid==0) {printf("\n\nOCL 2 %lu\n\n",startnonce);} DEBUG + int gid = get_global_id(0); + + if (gid >= nonces) + return; + // number of shabal message round + int num; + // buffer for final hash + sph_u32 B8,B9,BA,BB,BC,BD,BE,BF; + // init + unsigned long nonce_be = EndianSwap64(startnonce + gid); + // run 8192 rounds + final round + for (int hash = NUM_HASHES - start; hash > -1 + NUM_HASHES - end; hash -= 1) { + // calculate number of shabal messages excl. final message + num = (NUM_HASHES - hash) >> 1; + if (hash != 0) { + num = (num > MESSAGE_CAP) ? MESSAGE_CAP : num; + } + + // init shabal + sph_u32 + A00 = A_init_256[0], A01 = A_init_256[1], A02 = A_init_256[2], A03 = A_init_256[3], + A04 = A_init_256[4], A05 = A_init_256[5], A06 = A_init_256[6], A07 = A_init_256[7], + A08 = A_init_256[8], A09 = A_init_256[9], A0A = A_init_256[10], A0B = A_init_256[11]; + sph_u32 + B0 = B_init_256[0], B1 = B_init_256[1], B2 = B_init_256[2], B3 = B_init_256[3], + B4 = B_init_256[4], B5 = B_init_256[5], B6 = B_init_256[6], B7 = B_init_256[7]; + B8 = B_init_256[8]; B9 = B_init_256[9]; BA = B_init_256[10]; BB = B_init_256[11]; + BC = B_init_256[12]; BD = B_init_256[13]; BE = B_init_256[14]; BF = B_init_256[15]; + sph_u32 + C0 = C_init_256[0], C1 = C_init_256[1], C2 = C_init_256[2], C3 = C_init_256[3], + C4 = C_init_256[4], C5 = C_init_256[5], C6 = C_init_256[6], C7 = C_init_256[7], + C8 = C_init_256[8], C9 = C_init_256[9], CA = C_init_256[10], CB = C_init_256[11], + CC = C_init_256[12], CD = C_init_256[13], CE = C_init_256[14], CF = C_init_256[15]; + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; + sph_u32 Wlow = 1, Whigh = 0; + + for (int i = 0; i < 2 * num; i+=2){ + M0 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 0)]; + M1 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 1)]; + M2 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 2)]; + M3 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 3)]; + M4 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 4)]; + M5 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 5)]; + M6 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 6)]; + M7 = ((__global unsigned int*)buffer)[Address(gid, hash + i, 7)]; + M8 = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 0)]; + M9 = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 1)]; + MA = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 2)]; + MB = ((__global unsigned 
int*)buffer)[Address(gid, hash + i + 1, 3)]; + MC = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 4)]; + MD = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 5)]; + ME = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 6)]; + MF = ((__global unsigned int*)buffer)[Address(gid, hash + i + 1, 7)]; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + INCR_W; + } + + // final message determination + if (num == MESSAGE_CAP) { + M0 = 0x80; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + } + else if((hash & 1) == 0) { + M0 = ((unsigned int*)&numeric_id_be)[0]; + M1 = ((unsigned int*)&numeric_id_be)[1]; + M2 = ((unsigned int*)&nonce_be)[0]; + M3 = ((unsigned int*)&nonce_be)[1]; + M4 = 0x80; + M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + } + else if((hash & 1) == 1) { + M0 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 0)]; + M1 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 1)]; + M2 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 2)]; + M3 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 3)]; + M4 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 4)]; + M5 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 5)]; + M6 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 6)]; + M7 = ((__global unsigned int*)buffer)[Address(gid, NUM_HASHES-1, 7)]; + M8 = ((unsigned int*)&numeric_id_be)[0]; + M9 = ((unsigned int*)&numeric_id_be)[1]; + MA = ((unsigned int*)&nonce_be)[0]; + MB = ((unsigned int*)&nonce_be)[1]; + MC = 0x80; + MD = ME = MF = 0; + } + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + for (int i = 0; i < 3; i ++) { + SWAP_BC; + XOR_W; + APPLY_P; + } + + if (hash > 0){ + ((__global unsigned int*)buffer)[Address(gid, hash-1, 0)] = B8; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 1)] = B9; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 2)] = BA; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 3)] = BB; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 4)] = BC; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 5)] = BD; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 6)] = BE; + ((__global unsigned int*)buffer)[Address(gid, hash-1, 7)] = BF; + } + } + + // final xor + if(end==8192){ + for (size_t i = 0; i < NUM_HASHES; i++){ + ((__global unsigned int*)buffer)[Address(gid, i, 0)] ^= B8; + ((__global unsigned int*)buffer)[Address(gid, i, 1)] ^= B9; + ((__global unsigned int*)buffer)[Address(gid, i, 2)] ^= BA; + ((__global unsigned int*)buffer)[Address(gid, i, 3)] ^= BB; + ((__global unsigned int*)buffer)[Address(gid, i, 4)] ^= BC; + ((__global unsigned int*)buffer)[Address(gid, i, 5)] ^= BD; + ((__global unsigned int*)buffer)[Address(gid, i, 6)] ^= BE; + ((__global unsigned int*)buffer)[Address(gid, i, 7)] ^= BF; + } + } +} \ No newline at end of file diff --git a/src/plotter.rs b/src/plotter.rs index 7ee23df..1f142bd 100644 --- a/src/plotter.rs +++ b/src/plotter.rs @@ -9,9 +9,12 @@ use self::pbr::{MultiBar, Units}; use self::raw_cpuid::CpuId; use chan; use core_affinity; -use hasher::create_hasher_task; +#[cfg(feature = "opencl")] +use ocl::gpu_get_info; +use scheduler::create_scheduler_thread; use std::cmp::{max, min}; use std::path::Path; +use std::process; use std::sync::{Arc, Mutex}; use std::thread; use stopwatch::Stopwatch; @@ -20,7 +23,7 @@ use utils::get_sector_size; use utils::preallocate; #[cfg(windows)] use utils::set_thread_ideal_processor; -use 
writer::{create_writer_task, read_resume_info, write_resume_info}; +use writer::{create_writer_thread, read_resume_info, write_resume_info}; const NONCE_SIZE: u64 = (2 << 17); const SCOOP_SIZE: u64 = 64; @@ -42,9 +45,12 @@ pub struct PlotterTask { pub output_path: String, pub mem: String, pub cpu_threads: u8, + pub gpus: Option>, pub direct_io: bool, pub async_io: bool, pub quiet: bool, + pub benchmark: bool, + pub zcb: bool, } pub struct Buffer { @@ -81,6 +87,11 @@ impl Plotter { if !task.quiet { println!("Engraver {} - PoC2 Plotter\n", crate_version!()); } + + if !task.quiet && task.benchmark { + println!("*BENCHMARK MODE*\n"); + } + if !task.quiet { println!( "CPU: {} [using {} of {} cores{}{}]", @@ -92,12 +103,32 @@ impl Plotter { ); } + #[cfg(not(feature = "opencl"))] + let gpu_mem_needed = 0u64; + #[cfg(feature = "opencl")] + let gpu_mem_needed = match &task.gpus { + Some(x) => gpu_get_info(&x, task.quiet), + None => 0, + }; + + #[cfg(feature = "opencl")] + let gpu_mem_needed = if task.zcb { + gpu_mem_needed + } else { + gpu_mem_needed / 2 + }; + // use all available disk space if nonce parameter has been omitted let free_disk_space = free_disk_space(&task.output_path); if task.nonces == 0 { task.nonces = free_disk_space / NONCE_SIZE; } + let gpu = match &task.gpus { + Some(_) => true, + None => false, + }; + // align number of nonces with sector size if direct i/o let mut rounded_nonces_to_sector_size = false; let mut nonces_per_sector = 1; @@ -128,9 +159,9 @@ impl Plotter { } // check available disk space - if free_disk_space < plotsize && !file.exists(){ + if free_disk_space < plotsize && !file.exists() && !task.benchmark { println!( - "Error: insufficient disk space, MiB_required={}, MiB_available={}", + "Error: insufficient disk space, MiB_required={:.2}, MiB_available={:.2}", plotsize as f64 / 1024.0 / 1024.0, free_disk_space as f64 / 1024.0 / 1024.0 ); @@ -139,17 +170,25 @@ impl Plotter { } // calculate memory usage - let mem = match calculate_mem_to_use(&task, &memory, nonces_per_sector){ + let mem = match calculate_mem_to_use(&task, &memory, nonces_per_sector, gpu, gpu_mem_needed) + { Ok(x) => x, - Err(_) => return + Err(_) => return, }; if !task.quiet { println!( - "RAM: Total={:.2} GiB, Free= {:.2} GiB, Usage= {:.2} GiB \n", + "RAM: Total={:.2} GiB, Free={:.2} GiB, Usage={:.2} GiB", memory.total as f64 / 1024.0 / 1024.0, memory.free as f64 / 1024.0 / 1024.0, - mem as f64 / 1024.0 / 1024.0 / 1024.0 + (mem + gpu_mem_needed) as f64 / 1024.0 / 1024.0 / 1024.0 + ); + + #[cfg(feature = "opencl")] + println!( + " HDDcache={:.2} GiB, GPUcache={:.2} GiB,\n", + mem as f64 / 1024.0 / 1024.0 / 1024.0, + gpu_mem_needed as f64 / 1024.0 / 1024.0 / 1024.0 ); println!("Numeric ID: {}", task.numeric_id); @@ -181,7 +220,6 @@ impl Plotter { println!("File is already completed."); println!("Shutting Down..."); return; - } } if !task.quiet { @@ -191,10 +229,10 @@ impl Plotter { if !task.quiet { print!("Fast file pre-allocation..."); } - - preallocate(&file, plotsize, task.direct_io); - write_resume_info(&file, 0u64); - + if !task.benchmark { + preallocate(&file, plotsize, task.direct_io); + write_resume_info(&file, 0u64); + } if !task.quiet { println!("OK"); } @@ -205,7 +243,7 @@ impl Plotter { println!("Starting plotting...\n"); } else { println!("Resuming plotting from nonce offset {}...\n", progress); - } + } } // determine buffer size @@ -261,14 +299,14 @@ impl Plotter { // hi bold! might make this optional in future releases.
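// The thread pinning enabled just below relies on the core_affinity crate
// that plotter.rs already imports: get_core_ids() enumerates the logical
// cores and set_for_current() binds the calling thread to one of them. A
// self-contained sketch (spawn_pinned is a hypothetical helper; the real
// code hands the core ids to the hasher and writer threads instead):
//
//     fn spawn_pinned() -> Vec<std::thread::JoinHandle<()>> {
//         let core_ids = core_affinity::get_core_ids().unwrap();
//         core_ids
//             .into_iter()
//             .map(|core_id| {
//                 std::thread::spawn(move || {
//                     core_affinity::set_for_current(core_id); // pin this thread
//                     // ... hashing or writing work runs here ...
//                 })
//             })
//             .collect()
//     }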
let thread_pinning = true; - let mut core_ids: Vec = Vec::new(); - - if thread_pinning { - core_ids = core_affinity::get_core_ids().unwrap(); - } + let core_ids = if thread_pinning { + core_affinity::get_core_ids().unwrap() + } else { + Vec::new() + }; let hasher = thread::spawn({ - create_hasher_task( + create_scheduler_thread( task.clone(), rayon::ThreadPoolBuilder::new() .num_threads(task.cpu_threads as usize) @@ -292,7 +330,7 @@ }); let writer = thread::spawn({ - create_writer_task( + create_writer_thread( task.clone(), progress, p2x, @@ -330,30 +368,49 @@ fn calculate_mem_to_use( task: &PlotterTask, memory: &sys_info::MemInfo, nonces_per_sector: u64, + gpu: bool, + gpu_mem_needed: u64, ) -> Result { let plotsize = task.nonces * NONCE_SIZE; let mut mem = match task.mem.parse::() { Ok(x) => x.size() as u64, - Err(_) => { println!( + Err(_) => { + println!( "Error: Can't parse memory limit parameter, input={}", task.mem, ); println!("\nPlease specify a number followed by a unit. If no unit is provided, bytes will be assumed."); - println!("Supported units: B, KiB, MiB, GiB, TiB, PiB, EiB, KB, MB, GB, TB, PB, EB"); - println!("Example: --mem 10GiB\n"); + println!("Supported units: B, KiB, MiB, GiB, TiB, PiB, EiB, KB, MB, GB, TB, PB, EB"); + println!("Example: --mem 10GiB\n"); println!("Shutting down..."); return Err("invalid unit"); } }; - + if gpu && mem > 0 && mem < gpu_mem_needed + nonces_per_sector * NONCE_SIZE { + println!("Error: Insufficient host memory for GPU plotting!"); + println!("Shutting down..."); + process::exit(0); + } + + if gpu && mem > 0 { + mem -= gpu_mem_needed; + } + if mem == 0 { mem = plotsize; } - mem = min(mem, plotsize); + mem = min(mem, plotsize + gpu_mem_needed); + + // opencl requires buffer to be a multiple of 16 (data coalescing magic) + let nonces_per_sector = if gpu { + max(16, nonces_per_sector) + } else { + nonces_per_sector + }; // don't exceed free memory and leave some elbow room 1-1000/1024 - mem = min(mem, memory.free * 1000); + mem = min(mem, memory.free * 1000 - gpu_mem_needed); // rounding single/double buffer let num_buffer = if task.async_io { 2 } else { 1 }; diff --git a/src/scheduler.rs b/src/scheduler.rs new file mode 100644 index 0000000..cbf3f9b --- /dev/null +++ b/src/scheduler.rs @@ -0,0 +1,232 @@ +extern crate pbr; +extern crate rayon; + +use chan; +use cpu_hasher::{hash_cpu, CpuTask, SafeCVoid}; +#[cfg(feature = "opencl")] +use gpu_hasher::{create_gpu_hasher_thread, GpuTask, SafePointer}; +use libc::{c_void, size_t}; +#[cfg(feature = "opencl")] +use ocl::gpu_init; +use plotter::{Buffer, PlotterTask}; +use std::cmp::min; +use std::sync::mpsc::channel; +use std::sync::Arc; +#[cfg(feature = "opencl")] +use std::thread; + +const CPU_TASK_SIZE: u64 = 64; +const NONCE_SIZE: u64 = (2 << 17); + +pub fn create_scheduler_thread( + task: Arc, + thread_pool: rayon::ThreadPool, + mut nonces_hashed: u64, + mut pb: Option>, + rx_empty_buffers: chan::Receiver, + tx_buffers_to_writer: chan::Sender, + simd_ext: String, +) -> impl FnOnce() { + move || { + // synchronisation channel for all hashing devices (CPU+GPU) + // message protocol: (hash_device_id: u8, message: u8, nonces processed: u64) + // hash_device_id: 0=CPU, 1=GPU0, 2=GPU1...
+ // message: 0 = data ready to write + // 1 = device ready to compute next hashing batch + // nonces_processed: nonces hashed / nonces written to host buffer + let (tx, rx) = channel(); + + // create gpu threads and channels + #[cfg(feature = "opencl")] + let gpu_contexts = match &task.gpus { + Some(x) => Some(gpu_init(&x, task.zcb)), + None => None, + }; + + #[cfg(feature = "opencl")] + let gpus = match gpu_contexts { + Some(x) => x, + None => Vec::new(), + }; + #[cfg(feature = "opencl")] + let mut gpu_threads = Vec::new(); + #[cfg(feature = "opencl")] + let mut gpu_channels = Vec::new(); + + #[cfg(feature = "opencl")] + for (i, gpu) in gpus.iter().enumerate() { + gpu_channels.push(chan::unbounded()); + gpu_threads.push(thread::spawn({ + create_gpu_hasher_thread( + (i + 1) as u8, + gpu.clone(), + tx.clone(), + gpu_channels.last().unwrap().1.clone(), + ) + })); + } + + for buffer in rx_empty_buffers { + let mut_bs = &buffer.get_buffer(); + let mut bs = mut_bs.lock().unwrap(); + let buffer_size = (*bs).len() as u64; + let nonces_to_hash = min(buffer_size / NONCE_SIZE, task.nonces - nonces_hashed); + + let mut requested = 0u64; + let mut processed = 0u64; + + // kick off first gpu and cpu runs + #[cfg(feature = "opencl")] + for (i, gpu) in gpus.iter().enumerate() { + // schedule next gpu task + let mut gpu = gpu.lock().unwrap(); + let task_size = min(gpu.worksize as u64, nonces_to_hash - requested); + if task_size > 0 { + gpu_channels[i].0.send(Some(GpuTask { + cache: SafePointer { + ptr: bs.as_mut_ptr(), + }, + cache_size: buffer_size / NONCE_SIZE, + chunk_offset: requested, + numeric_id: task.numeric_id, + local_startnonce: task.start_nonce + nonces_hashed + requested, + local_nonces: task_size, + })); + } + requested += task_size; + //println!("Debug: Device: {} started. {} nonces assigned.
Total requested: {}\n\n\n",i+1,task_size,requested); + } + + for _ in 0..task.cpu_threads { + let task_size = min(CPU_TASK_SIZE, nonces_to_hash - requested); + if task_size > 0 { + let task = hash_cpu( + tx.clone(), + CpuTask { + cache: SafeCVoid { + ptr: bs.as_ptr() as *mut c_void, + }, + cache_size: buffer_size / NONCE_SIZE as size_t, + chunk_offset: requested as size_t, + numeric_id: task.numeric_id, + local_startnonce: task.start_nonce + nonces_hashed + requested, + local_nonces: task_size, + }, + simd_ext.clone(), + ); + thread_pool.spawn(task); + } + requested += task_size; + } + + // control loop + let rx = ℞ + for msg in rx { + match msg.1 { + // process a request for work: provide a task or signal completion + 1 => { + let task_size = match msg.0 { + 0 => { + // schedule next cpu task + let task_size = min(CPU_TASK_SIZE, nonces_to_hash - requested); + if task_size > 0 { + let task = hash_cpu( + tx.clone(), + CpuTask { + cache: SafeCVoid { + ptr: bs.as_ptr() as *mut c_void, + }, + cache_size: buffer_size / NONCE_SIZE as size_t, + chunk_offset: requested as size_t, + numeric_id: task.numeric_id, + local_startnonce: task.start_nonce + + nonces_hashed + + requested, + local_nonces: task_size, + }, + simd_ext.clone(), + ); + thread_pool.spawn(task); + } + task_size + } + _ => { + // schedule next gpu task + #[cfg(feature = "opencl")] + let mut gpu = gpus[(msg.0 - 1) as usize].lock().unwrap(); + #[cfg(feature = "opencl")] + let task_size = + min(gpu.worksize as u64, nonces_to_hash - requested); + + // optimisation: leave some work for cpu in dual mode + #[cfg(feature = "opencl")] + let task_size = if task_size < gpu.worksize as u64 + && task.cpu_threads > 0 + && task_size > CPU_TASK_SIZE + { + task_size / 2 + } else { + task_size + }; + + #[cfg(not(feature = "opencl"))] + let task_size = 0; + + #[cfg(feature = "opencl")] + gpu_channels[(msg.0 - 1) as usize].0.send(Some(GpuTask { + cache: SafePointer { + ptr: bs.as_mut_ptr(), + }, + cache_size: buffer_size / NONCE_SIZE, + chunk_offset: requested, + numeric_id: task.numeric_id, + local_startnonce: task.start_nonce + nonces_hashed + requested, + local_nonces: task_size, + })); + task_size + } + }; + + requested += task_size; + //println!("Debug: Device: {} asked for work. {} nonces assigned. 
Total requested: {}\n\n\n",msg.0,task_size,requested); + } + // process work completed message + 0 => { + processed += msg.2; + match &mut pb { + Some(pb) => { + pb.add(msg.2 * NONCE_SIZE); + } + None => (), + } + } + _ => {} + } + if processed == nonces_to_hash { + break; + } + } + + nonces_hashed += nonces_to_hash; + + // queue buffer for writing + tx_buffers_to_writer.send(buffer); + + // thread end + if task.nonces == nonces_hashed { + match &mut pb { + Some(pb) => { + pb.finish_print("Hasher done."); + } + None => (), + } + // shutdown gpu threads + #[cfg(feature = "opencl")] + for gpu in &gpu_channels { + gpu.0.send(None); + } + break; + }; + } + } +} diff --git a/src/writer.rs b/src/writer.rs index ca7b717..cce8a58 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -12,7 +12,7 @@ const TASK_SIZE: u64 = 16384; const SCOOP_SIZE: u64 = 64; const NONCE_SIZE: u64 = 4096 * SCOOP_SIZE; -pub fn create_writer_task( +pub fn create_writer_thread( task: Arc, mut nonces_written: u64, mut pb: Option>, @@ -30,49 +30,50 @@ pub fn create_writer_task( "{}_{}_{}", task.numeric_id, task.start_nonce, task.nonces )); + if !task.benchmark { + let file = if task.direct_io { + open_using_direct_io(&filename) + } else { + open(&filename) + }; - let file = if task.direct_io { - open_using_direct_io(&filename) - } else { - open(&filename) - }; + let mut file = file.unwrap(); - let mut file = file.unwrap(); + for scoop in 0..4096 { + let mut seek_addr = scoop * task.nonces as u64 * SCOOP_SIZE; + seek_addr += nonces_written as u64 * SCOOP_SIZE; - for scoop in 0..4096 { - let mut seek_addr = scoop * task.nonces as u64 * SCOOP_SIZE; - seek_addr += nonces_written as u64 * SCOOP_SIZE; + file.seek(SeekFrom::Start(seek_addr)).unwrap(); - file.seek(SeekFrom::Start(seek_addr)).unwrap(); + let mut local_addr = scoop * buffer_size / NONCE_SIZE * SCOOP_SIZE; + for _ in 0..nonces_to_write / TASK_SIZE { + file.write_all( + &bs[local_addr as usize + ..(local_addr + TASK_SIZE * SCOOP_SIZE) as usize], + ).unwrap(); - let mut local_addr = scoop * buffer_size / NONCE_SIZE * SCOOP_SIZE; - for _ in 0..nonces_to_write / TASK_SIZE { - file.write_all( - &bs[local_addr as usize..(local_addr + TASK_SIZE * SCOOP_SIZE) as usize], - ).unwrap(); - - local_addr += TASK_SIZE * SCOOP_SIZE; - } + local_addr += TASK_SIZE * SCOOP_SIZE; + } - // write remainder - if nonces_to_write % TASK_SIZE > 0 { - file.write_all( - &bs[local_addr as usize - ..(local_addr + (nonces_to_write % TASK_SIZE) * SCOOP_SIZE) - as usize], - ).unwrap(); - } + // write remainder + if nonces_to_write % TASK_SIZE > 0 { + file.write_all( + &bs[local_addr as usize + ..(local_addr + (nonces_to_write % TASK_SIZE) * SCOOP_SIZE) + as usize], + ).unwrap(); + } - if (scoop + 1) % 128 == 0 { - match &mut pb { - Some(pb) => { - pb.add(nonces_to_write * SCOOP_SIZE * 128); + if (scoop + 1) % 128 == 0 { + match &mut pb { + Some(pb) => { + pb.add(nonces_to_write * SCOOP_SIZE * 128); + } + None => (), } - None => (), } } } - nonces_written += nonces_to_write; // thread end @@ -86,13 +87,15 @@ pub fn create_writer_task( break; } - write_resume_info(&filename, nonces_written); + if !task.benchmark { + write_resume_info(&filename, nonces_written); + } tx_empty_buffers.send(buffer); } } } -pub fn read_resume_info(file: &Path) -> Result { +pub fn read_resume_info(file: &Path) -> Result { let mut file = open_r(&file).unwrap(); file.seek(SeekFrom::End(-8)).unwrap();
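A note on the resume format that the hunk above only hints at: the progress marker lives in the last 8 bytes of the preallocated plot file, which is why read_resume_info seeks to SeekFrom::End(-8) before reading. A self-contained sketch of that scheme follows; the helper names and the little-endian encoding are assumptions for illustration, not necessarily the exact on-disk format:

    use std::fs::OpenOptions;
    use std::io::{Read, Seek, SeekFrom, Write};
    use std::path::Path;

    // Store the number of nonces already written in the file's final 8 bytes.
    fn write_marker(path: &Path, nonces_written: u64) -> std::io::Result<()> {
        let mut file = OpenOptions::new().write(true).open(path)?;
        file.seek(SeekFrom::End(-8))?;
        file.write_all(&nonces_written.to_le_bytes())
    }

    // Read the marker back to decide where plotting should resume.
    fn read_marker(path: &Path) -> std::io::Result<u64> {
        let mut file = OpenOptions::new().read(true).open(path)?;
        file.seek(SeekFrom::End(-8))?;
        let mut buf = [0u8; 8];
        file.read_exact(&mut buf)?;
        Ok(u64::from_le_bytes(buf))
    }

This matches plotter.rs seeding the marker with write_resume_info(&file, 0u64) immediately after preallocation, and the writer thread refreshing it after each flushed buffer.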