From b2e7c01e819aca9c26c679abf35da0418592df84 Mon Sep 17 00:00:00 2001 From: Nazar Mokrynskyi Date: Mon, 16 Sep 2024 00:08:57 +0300 Subject: [PATCH] ROCm WIP --- Cargo.lock | 3 +- shared/subspace-proof-of-space-gpu/Cargo.toml | 6 +- shared/subspace-proof-of-space-gpu/build.rs | 10 +- shared/subspace-proof-of-space-gpu/src/lib.rs | 2 + .../subspace-proof-of-space-gpu/src/rocm.rs | 190 ++++++++++++++++++ .../src/rocm/tests.rs | 84 ++++++++ 6 files changed, 286 insertions(+), 9 deletions(-) create mode 100644 shared/subspace-proof-of-space-gpu/src/rocm.rs create mode 100644 shared/subspace-proof-of-space-gpu/src/rocm/tests.rs diff --git a/Cargo.lock b/Cargo.lock index ceda2ba65eb..5ae54c55447 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12423,8 +12423,7 @@ dependencies = [ [[package]] name = "sppark" version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c55f3833d30846a26110dccb1d5366314c2c52516a9173b74238c16b24b1a9f9" +source = "git+https://github.com/dot-asm/sppark?rev=fe1237fe9eabb8aeb48a21af4d439fb4ac4f5d5d#fe1237fe9eabb8aeb48a21af4d439fb4ac4f5d5d" dependencies = [ "cc", "which", diff --git a/shared/subspace-proof-of-space-gpu/Cargo.toml b/shared/subspace-proof-of-space-gpu/Cargo.toml index 7456c2a1cb3..87ab98f5cd7 100644 --- a/shared/subspace-proof-of-space-gpu/Cargo.toml +++ b/shared/subspace-proof-of-space-gpu/Cargo.toml @@ -16,8 +16,8 @@ include = [ blst = { version = "0.3.13", optional = true } rust-kzg-blst = { git = "https://github.com/grandinetech/rust-kzg", rev = "6c8fcc623df3d7e8c0f30951a49bfea764f90bf4", default-features = false, optional = true } # TODO: This is `rocm` branch, it is needed for ROCm support -#sppark = { git = "https://github.com/dot-asm/sppark", rev = "8eeafe0f6cc0ca8211b1be93922df1b5a118bbd2", optional = true } -sppark = { version = "0.1.8", optional = true } +sppark = { git = "https://github.com/dot-asm/sppark", rev = "fe1237fe9eabb8aeb48a21af4d439fb4ac4f5d5d", optional = true } +#sppark = { 
version = "0.1.8", optional = true } subspace-core-primitives = { version = "0.1.0", path = "../../crates/subspace-core-primitives", default-features = false, optional = true } [dev-dependencies] @@ -31,7 +31,7 @@ cc = "1.1.15" [features] # Only Volta+ architectures are supported (GeForce RTX 20xx consumer GPUs and newer) cuda = ["_gpu"] -# TODO: ROCm can't be enabled at the same time as `cuda` feature at the moment and is not exposed on library level +# TODO: ROCm can't be enabled at the same time as `cuda` feature at the moment rocm = ["_gpu"] # Internal feature, shouldn't be used directly _gpu = [ diff --git a/shared/subspace-proof-of-space-gpu/build.rs b/shared/subspace-proof-of-space-gpu/build.rs index 0e9d891da89..24b081b66cb 100644 --- a/shared/subspace-proof-of-space-gpu/build.rs +++ b/shared/subspace-proof-of-space-gpu/build.rs @@ -21,11 +21,10 @@ fn main() { hipcc.compiler(env::var("HIPCC").unwrap_or("hipcc".to_string())); hipcc.cpp(true); if cfg!(debug_assertions) { - hipcc.opt_level(1); + hipcc.opt_level(2); } - hipcc.flag("--offload-arch=native,gfx1100,gfx1030,gfx942,gfx90a,gfx908"); - // 6 corresponds to the number of offload-arch - hipcc.flag("-parallel-jobs=6"); + hipcc.flag("--offload-arch=gfx1100,gfx1030,gfx942,gfx90a,gfx908"); + // hipcc.flag("--offload-device-only"); // This controls how error strings get handled in the FFI. When defined error strings get // returned from the FFI, and Rust must then free them. When not defined error strings are // not returned. 
@@ -35,6 +34,9 @@ fn main() { hipcc.flag("-include").flag("util/cuda2hip.hpp"); } hipcc.file("src/subspace_api.cu").compile("subspace_rocm"); + + // Doesn't link otherwise + println!("cargo::rustc-link-lib=amdhip64"); } if cfg!(feature = "cuda") { diff --git a/shared/subspace-proof-of-space-gpu/src/lib.rs b/shared/subspace-proof-of-space-gpu/src/lib.rs index 572db957bf6..0b1269fb893 100644 --- a/shared/subspace-proof-of-space-gpu/src/lib.rs +++ b/shared/subspace-proof-of-space-gpu/src/lib.rs @@ -1,2 +1,4 @@ #[cfg(feature = "cuda")] pub mod cuda; +#[cfg(feature = "rocm")] +pub mod rocm; diff --git a/shared/subspace-proof-of-space-gpu/src/rocm.rs b/shared/subspace-proof-of-space-gpu/src/rocm.rs new file mode 100644 index 00000000000..87b55c442a8 --- /dev/null +++ b/shared/subspace-proof-of-space-gpu/src/rocm.rs @@ -0,0 +1,190 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#[cfg(test)] +mod tests; + +use rust_kzg_blst::types::fr::FsFr; +use std::ops::DerefMut; +use subspace_core_primitives::crypto::Scalar; +use subspace_core_primitives::{PosProof, PosSeed, Record}; + +extern "C" { + /// # Returns + /// * `usize` - The number of available GPUs. + fn gpu_count() -> usize; + + /// # Parameters + /// * `k`: The size parameter for the table. + /// * `seed`: A pointer to the seed data. + /// * `lg_record_size`: The logarithm of the record size. + /// * `challenge_index`: A mutable pointer to store the index of the challenge. + /// * `record`: A pointer to the record data. + /// * `chunks_scratch`: A mutable pointer to a scratch space for chunk data. + /// * `proof_count`: A mutable pointer to store the count of proofs. + /// * `source_record_chunks`: A mutable pointer to the source record chunks. + /// * `parity_record_chunks`: A mutable pointer to the parity record chunks. + /// * `gpu_id`: The ID of the GPU to use. 
+ /// + /// # Returns + /// * `sppark::Error` - An error code indicating the result of the operation. + /// + /// # Assumptions + /// * `seed` must be a valid pointer to a 32-byte array. + /// * `record` must be a valid pointer to the record data (`*const Record`), with a length of `1 << lg_record_size`. + /// * `source_record_chunks` and `parity_record_chunks` must be valid mutable pointers to `Scalar` elements, each with a length of `1 << lg_record_size`. + /// * `chunks_scratch` must be a valid mutable pointer where up to `challenges_count` 32-byte chunks of GPU-calculated data will be written. + /// * `gpu_id` must be a valid identifier of an available GPU. The available GPUs can be determined by using the `gpu_count` function. + fn generate_and_encode_pospace_dispatch( + k: u32, + seed: *const [u8; 32], + lg_record_size: u32, + challenge_index: *mut u32, + record: *const [u8; 32], + chunks_scratch: *mut [u8; 32], + proof_count: *mut u32, + parity_record_chunks: *mut FsFr, + gpu_id: i32, + ) -> sppark::Error; +} + +/// Returns [`RocmDevice`] for each available device +pub fn rocm_devices() -> Vec<RocmDevice> { + let num_devices = unsafe { gpu_count() }; + + (0i32..) + .take(num_devices) + .map(|gpu_id| RocmDevice { gpu_id }) + .collect() +} + +/// Wrapper data structure encapsulating a single ROCm-capable device +#[derive(Debug)] +pub struct RocmDevice { + gpu_id: i32, +} + +impl RocmDevice { + /// ROCm device ID + pub fn id(&self) -> i32 { + self.gpu_id + } + + /// Generates and encodes PoSpace on the GPU. + /// + /// This function performs the generation and encoding of PoSpace + /// on a GPU. It uses the specified parameters to perform the computations and + /// ensures that errors are properly handled by returning a `Result` type. + /// + /// # Parameters + /// + /// ## Input + /// + /// - `k`: The size parameter for the table. + /// - `seed`: A 32-byte seed used for the table generation process. + /// - `record`: A slice of bytes (`&[u8]`). 
These records are the data on which the proof of space will be generated. + /// - `gpu_id`: ID of the GPU to use. This parameter specifies which GPU to use for the computation. + /// + /// ## Output + /// + /// - `source_record_chunks`: A mutable vector of original data chunks of type FsFr, each 32 bytes in size. + /// - `parity_record_chunks`: A mutable vector of parity chunks derived from the source, each 32 bytes in size. + /// - `proof_count`: A mutable reference to the proof count. This value will be updated with the number of proofs generated. + /// - `chunks_scratch`: A mutable vector used to store the processed chunks. This vector holds the final results after combining record chunks and proof hashes. + /// - `challenge_index`: A mutable vector used to map the challenges to specific parts of the data. + pub fn generate_and_encode_pospace( + &self, + seed: &PosSeed, + record: &mut Record, + encoded_chunks_used_output: impl ExactSizeIterator<Item = impl DerefMut<Target = bool>>, + ) -> Result<(), String> { + let record_len = Record::NUM_CHUNKS; + let challenge_len = Record::NUM_S_BUCKETS; + let lg_record_size = record_len.ilog2(); + + if challenge_len > u32::MAX as usize { + return Err(String::from("challenge_len is too large to fit in u32")); + } + + let mut proof_count = 0u32; + let mut chunks_scratch_gpu = Vec::<[u8; Scalar::FULL_BYTES]>::with_capacity(challenge_len); + let mut challenge_index_gpu = Vec::<u32>::with_capacity(challenge_len); + let mut parity_record_chunks = Vec::<Scalar>::with_capacity(Record::NUM_CHUNKS); + + let error = unsafe { + generate_and_encode_pospace_dispatch( + u32::from(PosProof::K), + &**seed, + lg_record_size, + challenge_index_gpu.as_mut_ptr(), + record.as_ptr(), + chunks_scratch_gpu.as_mut_ptr(), + &mut proof_count, + Scalar::slice_mut_to_repr(&mut parity_record_chunks).as_mut_ptr(), + self.gpu_id, + ) + }; + + if error.code != 0 { + return Err(error.to_string()); + } + + let proof_count = proof_count as usize; + unsafe { + chunks_scratch_gpu.set_len(proof_count); + 
challenge_index_gpu.set_len(proof_count); + parity_record_chunks.set_len(Record::NUM_CHUNKS); + } + + let mut encoded_chunks_used = vec![false; challenge_len]; + let source_record_chunks = record.to_vec(); + + let mut chunks_scratch = challenge_index_gpu + .into_iter() + .zip(chunks_scratch_gpu) + .collect::<Vec<_>>(); + + chunks_scratch + .sort_unstable_by(|(a_out_index, _), (b_out_index, _)| a_out_index.cmp(b_out_index)); + + // We don't need all the proofs + chunks_scratch.truncate(proof_count.min(Record::NUM_CHUNKS)); + + for (out_index, _chunk) in &chunks_scratch { + encoded_chunks_used[*out_index as usize] = true; + } + + encoded_chunks_used_output + .zip(&encoded_chunks_used) + .for_each(|(mut output, input)| *output = *input); + + record + .iter_mut() + .zip( + chunks_scratch + .into_iter() + .map(|(_out_index, chunk)| chunk) + .chain( + source_record_chunks + .into_iter() + .zip(parity_record_chunks) + .flat_map(|(a, b)| [a, b.to_bytes()]) + .zip(encoded_chunks_used.iter()) + // Skip chunks that were used previously + .filter_map(|(record_chunk, encoded_chunk_used)| { + if *encoded_chunk_used { + None + } else { + Some(record_chunk) + } + }), + ), + ) + .for_each(|(output_chunk, input_chunk)| { + *output_chunk = input_chunk; + }); + + Ok(()) + } +} diff --git a/shared/subspace-proof-of-space-gpu/src/rocm/tests.rs b/shared/subspace-proof-of-space-gpu/src/rocm/tests.rs new file mode 100644 index 00000000000..8ff4c0aea20 --- /dev/null +++ b/shared/subspace-proof-of-space-gpu/src/rocm/tests.rs @@ -0,0 +1,84 @@ +use crate::rocm::rocm_devices; +use std::num::NonZeroUsize; +use std::slice; +use subspace_core_primitives::crypto::{blake3_254_hash_to_scalar, blake3_hash}; +use subspace_core_primitives::{HistorySize, PieceOffset, Record, SectorId}; +use subspace_erasure_coding::ErasureCoding; +use subspace_farmer_components::plotting::{CpuRecordsEncoder, RecordsEncoder}; +use subspace_farmer_components::sector::SectorContentsMap; +use 
subspace_proof_of_space::chia::ChiaTable; +use subspace_proof_of_space::Table; + +type PosTable = ChiaTable; + +#[test] +fn basic() { + let rocm_device = rocm_devices() + .into_iter() + .next() + .expect("Need ROCm device to run this test"); + + let mut table_generator = PosTable::generator(); + let erasure_coding = ErasureCoding::new( + NonZeroUsize::new(Record::NUM_S_BUCKETS.next_power_of_two().ilog2() as usize) + .expect("Not zero; qed"), + ) + .unwrap(); + let global_mutex = Default::default(); + let mut cpu_records_encoder = CpuRecordsEncoder::<PosTable>::new( + slice::from_mut(&mut table_generator), + &erasure_coding, + &global_mutex, + ); + + let sector_id = SectorId::new(blake3_hash(b"hello"), 500); + let history_size = HistorySize::ONE; + let mut record = Record::new_boxed(); + record.iter_mut().enumerate().for_each(|(index, chunk)| { + *chunk = blake3_254_hash_to_scalar(&index.to_le_bytes()).to_bytes() + }); + + let mut cpu_encoded_records = Record::new_zero_vec(2); + for cpu_encoded_record in &mut cpu_encoded_records { + cpu_encoded_record.clone_from(&record); + } + let cpu_sector_contents_map = cpu_records_encoder + .encode_records( + &sector_id, + &mut cpu_encoded_records, + history_size, + &Default::default(), + ) + .unwrap(); + + let mut gpu_encoded_records = Record::new_zero_vec(2); + for gpu_encoded_record in &mut gpu_encoded_records { + gpu_encoded_record.clone_from(&record); + } + let mut gpu_sector_contents_map = SectorContentsMap::new(2); + rocm_device + .generate_and_encode_pospace( + &sector_id.derive_evaluation_seed(PieceOffset::ZERO, history_size), + &mut gpu_encoded_records[0], + gpu_sector_contents_map + .iter_record_bitfields_mut() + .next() + .unwrap() + .iter_mut(), + ) + .unwrap(); + rocm_device + .generate_and_encode_pospace( + &sector_id.derive_evaluation_seed(PieceOffset::ONE, history_size), + &mut gpu_encoded_records[1], + gpu_sector_contents_map + .iter_record_bitfields_mut() + .nth(1) + .unwrap() + .iter_mut(), + ) + .unwrap(); + + 
assert!(cpu_sector_contents_map == gpu_sector_contents_map); + assert!(cpu_encoded_records == gpu_encoded_records); +}