From b2e7c01e819aca9c26c679abf35da0418592df84 Mon Sep 17 00:00:00 2001
From: Nazar Mokrynskyi <nazar@mokrynskyi.com>
Date: Mon, 16 Sep 2024 00:08:57 +0300
Subject: [PATCH] ROCm WIP

---
 Cargo.lock                                    |   3 +-
 shared/subspace-proof-of-space-gpu/Cargo.toml |   6 +-
 shared/subspace-proof-of-space-gpu/build.rs   |  10 +-
 shared/subspace-proof-of-space-gpu/src/lib.rs |   2 +
 .../subspace-proof-of-space-gpu/src/rocm.rs   | 190 ++++++++++++++++++
 .../src/rocm/tests.rs                         |  84 ++++++++
 6 files changed, 286 insertions(+), 9 deletions(-)
 create mode 100644 shared/subspace-proof-of-space-gpu/src/rocm.rs
 create mode 100644 shared/subspace-proof-of-space-gpu/src/rocm/tests.rs

diff --git a/Cargo.lock b/Cargo.lock
index ceda2ba65eb..5ae54c55447 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -12423,8 +12423,7 @@ dependencies = [
 [[package]]
 name = "sppark"
 version = "0.1.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c55f3833d30846a26110dccb1d5366314c2c52516a9173b74238c16b24b1a9f9"
+source = "git+https://github.com/dot-asm/sppark?rev=fe1237fe9eabb8aeb48a21af4d439fb4ac4f5d5d#fe1237fe9eabb8aeb48a21af4d439fb4ac4f5d5d"
 dependencies = [
  "cc",
  "which",
diff --git a/shared/subspace-proof-of-space-gpu/Cargo.toml b/shared/subspace-proof-of-space-gpu/Cargo.toml
index 7456c2a1cb3..87ab98f5cd7 100644
--- a/shared/subspace-proof-of-space-gpu/Cargo.toml
+++ b/shared/subspace-proof-of-space-gpu/Cargo.toml
@@ -16,8 +16,8 @@ include = [
 blst = { version = "0.3.13", optional = true }
 rust-kzg-blst = { git = "https://github.com/grandinetech/rust-kzg", rev = "6c8fcc623df3d7e8c0f30951a49bfea764f90bf4", default-features = false, optional = true }
 # TODO: This is `rocm` branch, it is needed for ROCm support
-#sppark = { git = "https://github.com/dot-asm/sppark", rev = "8eeafe0f6cc0ca8211b1be93922df1b5a118bbd2", optional = true }
-sppark = { version = "0.1.8", optional = true }
+sppark = { git = "https://github.com/dot-asm/sppark", rev = "fe1237fe9eabb8aeb48a21af4d439fb4ac4f5d5d", optional = true }
+#sppark = { version = "0.1.8", optional = true }
 subspace-core-primitives = { version = "0.1.0", path = "../../crates/subspace-core-primitives", default-features = false, optional = true }
 
 [dev-dependencies]
@@ -31,7 +31,7 @@ cc = "1.1.15"
 [features]
 # Only Volta+ architectures are supported (GeForce RTX 20xx consumer GPUs and newer)
 cuda = ["_gpu"]
-# TODO: ROCm can't be enabled at the same time as `cuda` feature at the moment and is not exposed on library level
+# TODO: ROCm can't be enabled at the same time as `cuda` feature at the moment
 rocm = ["_gpu"]
 # Internal feature, shouldn't be used directly
 _gpu = [
diff --git a/shared/subspace-proof-of-space-gpu/build.rs b/shared/subspace-proof-of-space-gpu/build.rs
index 0e9d891da89..24b081b66cb 100644
--- a/shared/subspace-proof-of-space-gpu/build.rs
+++ b/shared/subspace-proof-of-space-gpu/build.rs
@@ -21,11 +21,10 @@ fn main() {
         hipcc.compiler(env::var("HIPCC").unwrap_or("hipcc".to_string()));
         hipcc.cpp(true);
         if cfg!(debug_assertions) {
-            hipcc.opt_level(1);
+            hipcc.opt_level(2);
         }
-        hipcc.flag("--offload-arch=native,gfx1100,gfx1030,gfx942,gfx90a,gfx908");
-        // 6 corresponds to the number of offload-arch
-        hipcc.flag("-parallel-jobs=6");
+        hipcc.flag("--offload-arch=gfx1100,gfx1030,gfx942,gfx90a,gfx908");
+        // hipcc.flag("--offload-device-only");
         // This controls how error strings get handled in the FFI. When defined error strings get
         // returned from the FFI, and Rust must then free them. When not defined error strings are
         // not returned.
@@ -35,6 +34,9 @@ fn main() {
             hipcc.flag("-include").flag("util/cuda2hip.hpp");
         }
         hipcc.file("src/subspace_api.cu").compile("subspace_rocm");
+
+        // Doesn't link otherwise
+        println!("cargo::rustc-link-lib=amdhip64");
     }
 
     if cfg!(feature = "cuda") {
diff --git a/shared/subspace-proof-of-space-gpu/src/lib.rs b/shared/subspace-proof-of-space-gpu/src/lib.rs
index 572db957bf6..0b1269fb893 100644
--- a/shared/subspace-proof-of-space-gpu/src/lib.rs
+++ b/shared/subspace-proof-of-space-gpu/src/lib.rs
@@ -1,2 +1,4 @@
 #[cfg(feature = "cuda")]
 pub mod cuda;
+#[cfg(feature = "rocm")]
+pub mod rocm;
diff --git a/shared/subspace-proof-of-space-gpu/src/rocm.rs b/shared/subspace-proof-of-space-gpu/src/rocm.rs
new file mode 100644
index 00000000000..87b55c442a8
--- /dev/null
+++ b/shared/subspace-proof-of-space-gpu/src/rocm.rs
@@ -0,0 +1,190 @@
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#[cfg(test)]
+mod tests;
+
+use rust_kzg_blst::types::fr::FsFr;
+use std::ops::DerefMut;
+use subspace_core_primitives::crypto::Scalar;
+use subspace_core_primitives::{PosProof, PosSeed, Record};
+
+extern "C" {
+    /// # Returns
+    /// * `usize` - The number of available GPUs.
+    fn gpu_count() -> usize;
+
+    /// # Parameters
+    /// * `k`: The size parameter for the table.
+    /// * `seed`: A pointer to the seed data.
+    /// * `lg_record_size`: The logarithm of the record size.
+    /// * `challenge_index`: A mutable pointer to store the index of the challenge.
+    /// * `record`: A pointer to the record data.
+    /// * `chunks_scratch`: A mutable pointer to a scratch space for chunk data.
+    /// * `proof_count`: A mutable pointer to store the count of proofs.
+    /// * `parity_record_chunks`: A mutable pointer to the buffer that receives the parity record
+    ///   chunks.
+    /// * `gpu_id`: The ID of the GPU to use.
+    ///
+    /// # Returns
+    /// * `sppark::Error` - An error code indicating the result of the operation.
+    ///
+    /// # Assumptions
+    /// * `seed` must be a valid pointer to a 32-byte seed.
+    /// * `record` must be a valid pointer to the record data (`*const Record`), with a length of `1 << lg_record_size`.
+    /// * `parity_record_chunks` must be a valid mutable pointer to `Scalar` elements, with a length of `1 << lg_record_size`.
+    /// * `chunks_scratch` must be a valid mutable pointer where up to `challenges_count` 32-byte chunks of GPU-calculated data will be written.
+    /// * `gpu_id` must be a valid identifier of an available GPU. The available GPUs can be determined by using the `gpu_count` function.
+    fn generate_and_encode_pospace_dispatch(
+        k: u32,
+        seed: *const [u8; 32],
+        lg_record_size: u32,
+        challenge_index: *mut u32,
+        record: *const [u8; 32],
+        chunks_scratch: *mut [u8; 32],
+        proof_count: *mut u32,
+        parity_record_chunks: *mut FsFr,
+        gpu_id: i32,
+    ) -> sppark::Error;
+}
+
+/// Returns [`RocmDevice`] for each available device
+pub fn rocm_devices() -> Vec<RocmDevice> {
+    let num_devices = unsafe { gpu_count() };
+
+    (0i32..)
+        .take(num_devices)
+        .map(|gpu_id| RocmDevice { gpu_id })
+        .collect()
+}
+
+/// Wrapper data structure encapsulating a single ROCm-capable device
+#[derive(Debug)]
+pub struct RocmDevice {
+    gpu_id: i32,
+}
+
+impl RocmDevice {
+    /// ROCm device ID
+    pub fn id(&self) -> i32 {
+        self.gpu_id
+    }
+
+    /// Generates and encodes PoSpace on the GPU.
+    ///
+    /// This function performs the generation and encoding of PoSpace
+    /// on a GPU. It uses the specified parameters to perform the computations and
+    /// ensures that errors are properly handled by returning a `Result` type.
+    ///
+    /// # Parameters
+    ///
+    /// ## Input
+    ///
+    /// - `seed`: A 32-byte seed used for the table generation process.
+    /// - `record`: The record to encode. Its chunks are the source data for which the proof of
+    ///   space is generated. The table size (`PosProof::K`) and the GPU (this device's ID) are
+    ///   taken implicitly rather than passed as arguments.
+    ///
+    /// ## Output
+    ///
+    /// - `record`: Overwritten in place. GPU-encoded chunks come first, sorted by challenge
+    ///   index, followed by the source/parity chunks whose positions were not encoded.
+    /// - `encoded_chunks_used_output`: Receives one flag per challenge index (up to
+    ///   `Record::NUM_S_BUCKETS`): `true` where a GPU-encoded chunk is used, `false` otherwise.
+    /// - Returns `Err` with a message if the GPU call reports a non-zero error code.
+    pub fn generate_and_encode_pospace(
+        &self,
+        seed: &PosSeed,
+        record: &mut Record,
+        encoded_chunks_used_output: impl ExactSizeIterator<Item = impl DerefMut<Target = bool>>,
+    ) -> Result<(), String> {
+        let record_len = Record::NUM_CHUNKS;
+        let challenge_len = Record::NUM_S_BUCKETS;
+        let lg_record_size = record_len.ilog2();
+
+        if challenge_len > u32::MAX as usize {
+            return Err(String::from("challenge_len is too large to fit in u32"));
+        }
+
+        let mut proof_count = 0u32;
+        let mut chunks_scratch_gpu = Vec::<[u8; Scalar::FULL_BYTES]>::with_capacity(challenge_len);
+        let mut challenge_index_gpu = Vec::<u32>::with_capacity(challenge_len);
+        let mut parity_record_chunks = Vec::<Scalar>::with_capacity(Record::NUM_CHUNKS);
+
+        let error = unsafe {
+            generate_and_encode_pospace_dispatch(
+                u32::from(PosProof::K),
+                &**seed,
+                lg_record_size,
+                challenge_index_gpu.as_mut_ptr(),
+                record.as_ptr(),
+                chunks_scratch_gpu.as_mut_ptr(),
+                &mut proof_count,
+                Scalar::slice_mut_to_repr(&mut parity_record_chunks).as_mut_ptr(),
+                self.gpu_id,
+            )
+        };
+
+        if error.code != 0 {
+            return Err(error.to_string());
+        }
+
+        let proof_count = proof_count as usize;
+        unsafe {
+            chunks_scratch_gpu.set_len(proof_count);
+            challenge_index_gpu.set_len(proof_count);
+            parity_record_chunks.set_len(Record::NUM_CHUNKS);
+        }
+
+        let mut encoded_chunks_used = vec![false; challenge_len];
+        let source_record_chunks = record.to_vec();
+
+        let mut chunks_scratch = challenge_index_gpu
+            .into_iter()
+            .zip(chunks_scratch_gpu)
+            .collect::<Vec<_>>();
+
+        chunks_scratch
+            .sort_unstable_by(|(a_out_index, _), (b_out_index, _)| a_out_index.cmp(b_out_index));
+
+        // We don't need all the proofs
+        chunks_scratch.truncate(proof_count.min(Record::NUM_CHUNKS));
+
+        for (out_index, _chunk) in &chunks_scratch {
+            encoded_chunks_used[*out_index as usize] = true;
+        }
+
+        encoded_chunks_used_output
+            .zip(&encoded_chunks_used)
+            .for_each(|(mut output, input)| *output = *input);
+
+        record
+            .iter_mut()
+            .zip(
+                chunks_scratch
+                    .into_iter()
+                    .map(|(_out_index, chunk)| chunk)
+                    .chain(
+                        source_record_chunks
+                            .into_iter()
+                            .zip(parity_record_chunks)
+                            .flat_map(|(a, b)| [a, b.to_bytes()])
+                            .zip(encoded_chunks_used.iter())
+                            // Skip chunks that were used previously
+                            .filter_map(|(record_chunk, encoded_chunk_used)| {
+                                if *encoded_chunk_used {
+                                    None
+                                } else {
+                                    Some(record_chunk)
+                                }
+                            }),
+                    ),
+            )
+            .for_each(|(output_chunk, input_chunk)| {
+                *output_chunk = input_chunk;
+            });
+
+        Ok(())
+    }
+}
diff --git a/shared/subspace-proof-of-space-gpu/src/rocm/tests.rs b/shared/subspace-proof-of-space-gpu/src/rocm/tests.rs
new file mode 100644
index 00000000000..8ff4c0aea20
--- /dev/null
+++ b/shared/subspace-proof-of-space-gpu/src/rocm/tests.rs
@@ -0,0 +1,84 @@
+use crate::rocm::rocm_devices;
+use std::num::NonZeroUsize;
+use std::slice;
+use subspace_core_primitives::crypto::{blake3_254_hash_to_scalar, blake3_hash};
+use subspace_core_primitives::{HistorySize, PieceOffset, Record, SectorId};
+use subspace_erasure_coding::ErasureCoding;
+use subspace_farmer_components::plotting::{CpuRecordsEncoder, RecordsEncoder};
+use subspace_farmer_components::sector::SectorContentsMap;
+use subspace_proof_of_space::chia::ChiaTable;
+use subspace_proof_of_space::Table;
+
+type PosTable = ChiaTable;
+
+#[test]
+fn basic() {
+    let rocm_device = rocm_devices()
+        .into_iter()
+        .next()
+        .expect("Need ROCm device to run this test");
+
+    let mut table_generator = PosTable::generator();
+    let erasure_coding = ErasureCoding::new(
+        NonZeroUsize::new(Record::NUM_S_BUCKETS.next_power_of_two().ilog2() as usize)
+            .expect("Not zero; qed"),
+    )
+    .unwrap();
+    let global_mutex = Default::default();
+    let mut cpu_records_encoder = CpuRecordsEncoder::<PosTable>::new(
+        slice::from_mut(&mut table_generator),
+        &erasure_coding,
+        &global_mutex,
+    );
+
+    let sector_id = SectorId::new(blake3_hash(b"hello"), 500);
+    let history_size = HistorySize::ONE;
+    let mut record = Record::new_boxed();
+    record.iter_mut().enumerate().for_each(|(index, chunk)| {
+        *chunk = blake3_254_hash_to_scalar(&index.to_le_bytes()).to_bytes()
+    });
+
+    let mut cpu_encoded_records = Record::new_zero_vec(2);
+    for cpu_encoded_record in &mut cpu_encoded_records {
+        cpu_encoded_record.clone_from(&record);
+    }
+    let cpu_sector_contents_map = cpu_records_encoder
+        .encode_records(
+            &sector_id,
+            &mut cpu_encoded_records,
+            history_size,
+            &Default::default(),
+        )
+        .unwrap();
+
+    let mut gpu_encoded_records = Record::new_zero_vec(2);
+    for gpu_encoded_record in &mut gpu_encoded_records {
+        gpu_encoded_record.clone_from(&record);
+    }
+    let mut gpu_sector_contents_map = SectorContentsMap::new(2);
+    rocm_device
+        .generate_and_encode_pospace(
+            &sector_id.derive_evaluation_seed(PieceOffset::ZERO, history_size),
+            &mut gpu_encoded_records[0],
+            gpu_sector_contents_map
+                .iter_record_bitfields_mut()
+                .next()
+                .unwrap()
+                .iter_mut(),
+        )
+        .unwrap();
+    rocm_device
+        .generate_and_encode_pospace(
+            &sector_id.derive_evaluation_seed(PieceOffset::ONE, history_size),
+            &mut gpu_encoded_records[1],
+            gpu_sector_contents_map
+                .iter_record_bitfields_mut()
+                .nth(1)
+                .unwrap()
+                .iter_mut(),
+        )
+        .unwrap();
+
+    assert!(cpu_sector_contents_map == gpu_sector_contents_map);
+    assert!(cpu_encoded_records == gpu_encoded_records);
+}