Skip to content

Commit

Permalink
feat(boojum-cuda,shivini): implement GPU PoW (#54)
Browse files Browse the repository at this point in the history
# What ❔

This PR implements GPU PoW for blake2s and poseidon2-bn254 and modifies
shivini tu use it.

## Why ❔

PoW on GPU allows more aggressive use of PoW thereby reducing the proof
time and/or verification complexity.

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Tests for the changes have been added / updated.
- [x] Code has been formatted via `zk fmt` and `zk lint`.
  • Loading branch information
robik75 authored Nov 20, 2024
1 parent 0128a63 commit 67f4c46
Show file tree
Hide file tree
Showing 15 changed files with 197 additions and 117 deletions.
8 changes: 5 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,15 @@ wrapper-prover = { version = "=0.152.6", path = "crates/wrapper-prover", package
fflonk = { version = "=0.152.6", path = "crates/fflonk", package = "fflonk-cuda" }

# These dependencies should be shared by all the crates.
circuit_definitions = { version = "=0.150.14" }
zkevm_test_harness = { version = "=0.150.14" }
# zksync-crypto repository
boojum = "=0.30.8"
fflonk-cpu = {package = "fflonk", version = "=0.30.8"}
franklin-crypto = "=0.30.8"
rescue_poseidon = "=0.30.8"
snark_wrapper = "=0.30.8"
fflonk-cpu = {package = "fflonk", version = "=0.30.8"}
# zksync-protocol repository
circuit_definitions = { version = "=0.150.15" }
zkevm_test_harness = { version = "=0.150.15" }

[profile.release]
debug = "line-tables-only"
23 changes: 21 additions & 2 deletions crates/boojum-cuda/benches/poseidon2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#![test_runner(criterion::runner)]

use boojum::field::goldilocks::GoldilocksField;
use boojum_cuda::poseidon2::{BNHasher, GLHasher, GpuTreeHasher};
use boojum_cuda::poseidon2::{poseidon2_bn_pow, BNHasher, GLHasher, GpuTreeHasher};
use criterion::{
criterion_group, criterion_main, BenchmarkId, Criterion, SamplingMode, Throughput,
};
Expand Down Expand Up @@ -161,10 +161,29 @@ fn bn_merkle_tree(c: &mut Criterion<CudaMeasurement>) {
bench_merkle_tree::<BNHasher, 47>(c, String::from("bn_merkle_tree")).unwrap();
}

fn bn_pow(c: &mut Criterion<CudaMeasurement>) {
const MIN_BITS_COUNT: u32 = 15;
const MAX_BITS_COUNT: u32 = 26;
let d_seed = DeviceAllocation::alloc(4).unwrap();
let mut d_result = DeviceAllocation::alloc(1).unwrap();
let stream = CudaStream::default();
let mut group = c.benchmark_group("bn_pow");
for bits_count in MIN_BITS_COUNT..=MAX_BITS_COUNT {
let max_nonce = 1 << bits_count;
group.throughput(Throughput::Elements(max_nonce));
group.bench_function(BenchmarkId::from_parameter(bits_count), |b| {
b.iter(|| {
poseidon2_bn_pow(&d_seed, u32::MAX, max_nonce, &mut d_result[0], &stream).unwrap();
})
});
}
group.finish();
}

criterion_group!(
name = bench_poseidon2;
config = Criterion::default().with_measurement::<CudaMeasurement>(CudaMeasurement{});
targets = gl_leafs, bn_leafs, gl_nodes, bn_nodes, gl_merkle_tree, bn_merkle_tree
targets = gl_leafs, bn_leafs, gl_nodes, bn_nodes, gl_merkle_tree, bn_merkle_tree, bn_pow
);

criterion_main!(bench_poseidon2);
3 changes: 2 additions & 1 deletion crates/boojum-cuda/native/blake2s.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ DEVICE_FORCEINLINE uint64_t get_digest(const uint32_t input[10]) {
}

EXTERN __global__ void blake2s_pow_kernel(const uint64_t *seed, const uint32_t bits_count, const uint64_t max_nonce, volatile uint64_t *result) {
const uint32_t digest_mask = (1 << bits_count) - 1;
__align__(8) uint32_t input_u32[10];
auto input_u64 = reinterpret_cast<uint64_t *>(input_u32);
#pragma unroll
Expand All @@ -62,7 +63,7 @@ EXTERN __global__ void blake2s_pow_kernel(const uint64_t *seed, const uint32_t b
for (uint64_t nonce = threadIdx.x + blockIdx.x * blockDim.x; nonce < max_nonce && *result == UINT64_MAX; nonce += blockDim.x * gridDim.x) {
input_u64[4] = nonce;
uint64_t digest = get_digest(input_u32);
if (__clzll((long long)__brevll(digest)) >= bits_count)
if (!(digest & digest_mask))
atomicCAS(reinterpret_cast<unsigned long long *>(const_cast<uint64_t *>(result)), UINT64_MAX, nonce);
}
}
31 changes: 31 additions & 0 deletions crates/boojum-cuda/native/poseidon2/bn/poseidon2_bn_st.cu
Original file line number Diff line number Diff line change
Expand Up @@ -120,4 +120,35 @@ EXTERN __global__ void poseidon2_bn_st_nodes_kernel(const bn *values, bn *result
store_cs(results, state[i]);
}

EXTERN __global__ void poseidon2_bn_pow_kernel(const gl *seed, const uint32_t bits_count, const uint64_t max_nonce, volatile uint64_t *result) {
const unsigned SEED_SIZE = 4;
static_assert(RATE == 2);
static_assert(CAPACITY == 1);
static_assert(CHUNK_BY == 3);
static_assert(SEED_SIZE + 2 == RATE * CHUNK_BY);
const uint32_t digest_mask = (1 << bits_count) - 1;
__align__(8) poseidon_state base_state{};
#pragma unroll
for (unsigned i = 0; i < CHUNK_BY; i++)
reinterpret_cast<gl *>(&base_state[0])[i] = load_ca(seed + i);
base_state[0] = fr::to_montgomery(base_state[0]);
reinterpret_cast<gl *>(&base_state[1])[0] = load_ca(seed + CHUNK_BY);
for (uint64_t nonce = threadIdx.x + blockIdx.x * blockDim.x; nonce < max_nonce && *result == UINT64_MAX; nonce += blockDim.x * gridDim.x) {
poseidon_state state{};
#pragma unroll
for (unsigned i = 0; i < STATE_WIDTH; i++)
state[i] = base_state[i];
state[1].limbs[2] = nonce;
state[1].limbs[4] = nonce >> 32;
state[1] = fr::to_montgomery(state[1]);
permutation(state);
state[0] = fr::get_one();
state[1] = {};
permutation(state);
const uint32_t digest = fr::from_montgomery(state[0]).limbs[0];
if (!(digest & digest_mask))
atomicCAS(reinterpret_cast<unsigned long long *>(const_cast<uint64_t *>(result)), UINT64_MAX, nonce);
}
}

} // namespace poseidon2::bn254
18 changes: 7 additions & 11 deletions crates/boojum-cuda/src/blake2s.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pub fn blake2s_pow(
}
const BLOCK_SIZE: u32 = WARP_SIZE * 4;
let device_id = get_device()?;
let mpc = device_get_attribute(CudaDeviceAttr::MultiProcessorCount, device_id).unwrap();
let mpc = device_get_attribute(CudaDeviceAttr::MultiProcessorCount, device_id)?;
let kernel_function = Blake2SPowFunction::default();
let max_blocks = max_active_blocks_per_multiprocessor(&kernel_function, BLOCK_SIZE as i32, 0)?;
let num_blocks = (mpc * max_blocks) as u32;
Expand All @@ -43,28 +43,24 @@ pub fn blake2s_pow(

#[cfg(test)]
mod tests {
use blake2::{Blake2s256, Digest};
use blake2::Blake2s256;
use boojum::cs::implementations::pow::PoWRunner;
use era_cudart::memory::{memory_copy_async, DeviceAllocation};
use era_cudart::stream::CudaStream;

#[test]
fn blake2s_pow() {
const BITS_COUNT: u32 = 24;
let h_seed = [42u8; 32];
let seed = vec![42u8; 32];
let mut h_result = [0u64; 1];
let mut d_seed = DeviceAllocation::alloc(32).unwrap();
let mut d_result = DeviceAllocation::alloc(1).unwrap();
let stream = CudaStream::default();
memory_copy_async(&mut d_seed, &h_seed, &stream).unwrap();
memory_copy_async(&mut d_seed, &seed, &stream).unwrap();
super::blake2s_pow(&d_seed, BITS_COUNT, u64::MAX, &mut d_result[0], &stream).unwrap();
memory_copy_async(&mut h_result, &d_result, &stream).unwrap();
stream.synchronize().unwrap();
let mut digest = Blake2s256::new();
digest.update(h_seed);
digest.update(h_result[0].to_le_bytes());
let output = digest.finalize();
let mut le_bytes = [0u8; 8];
le_bytes.copy_from_slice(&output[..8]);
assert!(u64::from_le_bytes(le_bytes).trailing_zeros() >= BITS_COUNT);
let challenge = h_result[0];
assert!(Blake2s256::verify_from_bytes(seed, BITS_COUNT, challenge));
}
}
60 changes: 58 additions & 2 deletions crates/boojum-cuda/src/poseidon2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,18 @@ use boojum::algebraic_props::round_function::AbsorptionModeOverwrite;
use boojum::algebraic_props::sponge::GoldilocksPoseidon2Sponge;
use boojum::cs::oracle::TreeHasher;
use boojum::field::goldilocks::GoldilocksField;
use era_cudart::device::{device_get_attribute, get_device};
use era_cudart::execution::{CudaLaunchConfig, Dim3, KernelFunction};
use era_cudart::memory::memory_set_async;
use era_cudart::occupancy::max_active_blocks_per_multiprocessor;
use era_cudart::paste::paste;
use era_cudart::result::CudaResult;
use era_cudart::slice::DeviceSlice;
use era_cudart::slice::{DeviceSlice, DeviceVariable};
use era_cudart::stream::CudaStream;
use era_cudart::{cuda_kernel_declaration, cuda_kernel_signature_arguments_and_function};
use era_cudart::{
cuda_kernel, cuda_kernel_declaration, cuda_kernel_signature_arguments_and_function,
};
use era_cudart_sys::CudaDeviceAttr;
use snark_wrapper::franklin_crypto::bellman::bn256::{Bn256, Fr};
use snark_wrapper::implementations::poseidon2::tree_hasher::AbsorptionModeReplacement;
use snark_wrapper::rescue_poseidon::poseidon2::Poseidon2Sponge;
Expand Down Expand Up @@ -494,12 +500,44 @@ impl GpuTreeHasher for BNHasher {
GatherMerklePathsKernelFunction(poseidon2_bn_gather_merkle_paths_kernel);
}

cuda_kernel!(Poseidon2Pow, poseidon2_bn_pow_kernel(seed: *const GL, bits_count: u32, max_nonce: u64, result: *mut u64));

pub fn poseidon2_bn_pow(
seed: &DeviceSlice<GL>,
bits_count: u32,
max_nonce: u64,
result: &mut DeviceVariable<u64>,
stream: &CudaStream,
) -> CudaResult<()> {
assert_eq!(seed.len(), 4);
unsafe {
memory_set_async(result.transmute_mut(), 0xff, stream)?;
}
const BLOCK_SIZE: u32 = WARP_SIZE * 4;
let device_id = get_device()?;
let mpc = device_get_attribute(CudaDeviceAttr::MultiProcessorCount, device_id)?;
let kernel_function = Poseidon2PowFunction::default();
let max_blocks = max_active_blocks_per_multiprocessor(&kernel_function, BLOCK_SIZE as i32, 0)?;
let num_blocks = (mpc * max_blocks) as u32;
let config = CudaLaunchConfig::basic(num_blocks, BLOCK_SIZE, stream);
let seed = seed.as_ptr();
let result = result.as_mut_ptr();
let args = Poseidon2PowArguments {
seed,
bits_count,
max_nonce,
result,
};
kernel_function.launch(&config, &args)
}

#[cfg(test)]
mod tests {
use super::*;
use crate::device_structures::{DeviceMatrix, DeviceMatrixMut};
use crate::ops_simple::set_to_zero;
use crate::tests_helpers::RandomIterator;
use boojum::cs::implementations::pow::PoWRunner;
use era_cudart::memory::{memory_copy_async, DeviceAllocation};
use itertools::Itertools;
use rand::{thread_rng, Rng};
Expand Down Expand Up @@ -898,4 +936,22 @@ mod tests {
fn bn_gather_merkle_paths() -> CudaResult<()> {
BNHasher::test_gather_merkle_paths()
}

#[test]
fn poseidon2_bn_pow() {
const BITS_COUNT: u32 = 26;
let seed = GL::get_random_iterator().take(4).collect_vec();
let mut h_result = [0u64; 1];
let mut d_seed = DeviceAllocation::alloc(4).unwrap();
let mut d_result = DeviceAllocation::alloc(1).unwrap();
let stream = CudaStream::default();
memory_copy_async(&mut d_seed, &seed, &stream).unwrap();
super::poseidon2_bn_pow(&d_seed, BITS_COUNT, u64::MAX, &mut d_result[0], &stream).unwrap();
memory_copy_async(&mut h_result, &d_result, &stream).unwrap();
stream.synchronize().unwrap();
let challenge = h_result[0];
assert!(BNHasher::verify_from_field_elements(
seed, BITS_COUNT, challenge
));
}
}
10 changes: 7 additions & 3 deletions crates/proof-compression/src/gpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use shivini::cs::GpuSetup;
use shivini::gpu_proof_config::GpuProofConfig;
use shivini::{
gpu_prove_from_external_witness_data_with_cache_strategy, CacheStrategy,
CommitmentCacheStrategy, GpuTreeHasher, PolynomialsCacheStrategy, ProverContext,
CommitmentCacheStrategy, GPUPoWRunner, GpuTreeHasher, PolynomialsCacheStrategy, ProverContext,
ProverContextConfig,
};
use std::alloc::Global;
Expand Down Expand Up @@ -369,7 +369,9 @@ pub fn prove_compression_wrapper_circuit_with_precomputations(
(proof, vk)
}

pub fn inner_prove_compression_layer_circuit<CF: ProofCompressionFunction>(
pub fn inner_prove_compression_layer_circuit<
CF: ProofCompressionFunction<ThisLayerPoW: GPUPoWRunner>,
>(
circuit: CompressionLayerCircuit<CF>,
device_setup: &GpuSetup<CompressionProofsTreeHasher>,
finalization_hint: FinalizationHintsForProver,
Expand Down Expand Up @@ -411,7 +413,9 @@ pub fn inner_prove_compression_layer_circuit<CF: ProofCompressionFunction>(
(proof, vk)
}

pub fn inner_prove_compression_wrapper_circuit<CF: ProofCompressionFunction>(
pub fn inner_prove_compression_wrapper_circuit<
CF: ProofCompressionFunction<ThisLayerPoW: GPUPoWRunner>,
>(
circuit: CompressionLayerCircuit<CF>,
device_setup: &GpuSetup<CompressionTreeHasherForWrapper>,
finalization_hint: FinalizationHintsForProver,
Expand Down
2 changes: 0 additions & 2 deletions crates/shivini/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ smallvec = { version = "1.13", features = [
"const_new",
"serde",
] }
sha2 = "0.10"
blake2 = "0.10"
hex = "0.4"
derivative = "2.2"
bincode = "1.3"
Expand Down
3 changes: 1 addition & 2 deletions crates/shivini/src/data_structures/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ use crate::poly::{CosetEvaluations, LagrangeBasis, MonomialBasis};
use crate::prover::{
compute_quotient_degree, gpu_prove_from_external_witness_data_with_cache_strategy,
};
use boojum::cs::implementations::pow::PoWRunner;
use boojum::cs::implementations::prover::ProofConfig;
use boojum::cs::implementations::transcript::Transcript;
use boojum::cs::implementations::verifier::{VerificationKey, VerificationKeyCircuitGeometry};
Expand Down Expand Up @@ -736,7 +735,7 @@ impl CacheStrategy {
pub(crate) fn get<
TR: Transcript<F, CompatibleCap: Hash>,
H: GpuTreeHasher<Output = TR::CompatibleCap>,
POW: PoWRunner,
POW: GPUPoWRunner,
A: GoodAllocator,
>(
config: &GpuProofConfig,
Expand Down
1 change: 1 addition & 0 deletions crates/shivini/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ pub use context::ProverContextConfig;
pub use data_structures::CacheStrategy;
pub use data_structures::CommitmentCacheStrategy;
pub use data_structures::PolynomialsCacheStrategy;
pub use pow::GPUPoWRunner;
pub use primitives::tree::GpuTreeHasher;
pub use prover::gpu_prove_from_external_witness_data;
pub use prover::gpu_prove_from_external_witness_data_with_cache_strategy;
Loading

0 comments on commit 67f4c46

Please sign in to comment.