From 0e8ded4dc7d99a97b7d386973a6adff52b123fb5 Mon Sep 17 00:00:00 2001 From: qima Date: Sun, 15 Dec 2024 21:32:45 +0800 Subject: [PATCH 1/3] fix(client): carry out retries in case of got less of quotes --- autonomi/src/client/quote.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/autonomi/src/client/quote.rs b/autonomi/src/client/quote.rs index 9794f165d7..38dfd7f6fd 100644 --- a/autonomi/src/client/quote.rs +++ b/autonomi/src/client/quote.rs @@ -11,7 +11,7 @@ use crate::client::rate_limiter::RateLimiter; use ant_evm::payment_vault::get_market_price; use ant_evm::{Amount, EvmNetwork, PaymentQuote, QuotePayment, QuotingMetrics}; use ant_networking::{Network, NetworkError}; -use ant_protocol::{storage::ChunkAddress, NetworkAddress}; +use ant_protocol::{storage::ChunkAddress, NetworkAddress, CLOSE_GROUP_SIZE}; use libp2p::PeerId; use std::collections::HashMap; use xor_name::XorName; @@ -159,6 +159,14 @@ async fn fetch_store_quote_with_retries( loop { match fetch_store_quote(network, content_addr).await { Ok(quote) => { + if quote.len() < CLOSE_GROUP_SIZE { + retries += 1; + error!("Error while fetching store quote: not enough quotes ({}/{CLOSE_GROUP_SIZE}), retry #{retries}, quotes {quote:?}", + quote.len()); + if retries > 2 { + break Err(CostError::CouldNotGetStoreQuote(content_addr)); + } + } break Ok((content_addr, quote)); } Err(err) if retries < 2 => { @@ -172,6 +180,9 @@ async fn fetch_store_quote_with_retries( break Err(CostError::CouldNotGetStoreQuote(content_addr)); } } + // Shall have a sleep between retries to avoid choking the network. + // This shall be rare to happen though. 
+ std::thread::sleep(std::time::Duration::from_secs(5)); } } From 9bea82cdde6964efa3b400a577e9c4b183da0acf Mon Sep 17 00:00:00 2001 From: qima Date: Sun, 15 Dec 2024 22:01:50 +0800 Subject: [PATCH 2/3] fix(client): expand replicator_factor to get more closest_peers --- ant-networking/src/driver.rs | 2 +- ant-networking/src/lib.rs | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ant-networking/src/driver.rs b/ant-networking/src/driver.rs index 4534b49110..bb1637a099 100644 --- a/ant-networking/src/driver.rs +++ b/ant-networking/src/driver.rs @@ -135,7 +135,7 @@ const PERIODIC_KAD_BOOTSTRAP_INTERVAL_MAX_S: u64 = 21600; // Init during compilation, instead of runtime error that should never happen // Option::expect will be stabilised as const in the future (https://github.com/rust-lang/rust/issues/67441) -const REPLICATION_FACTOR: NonZeroUsize = match NonZeroUsize::new(CLOSE_GROUP_SIZE) { +const REPLICATION_FACTOR: NonZeroUsize = match NonZeroUsize::new(CLOSE_GROUP_SIZE + 2) { Some(v) => v, None => panic!("CLOSE_GROUP_SIZE should not be zero"), }; diff --git a/ant-networking/src/lib.rs b/ant-networking/src/lib.rs index 434aa192ad..fca47f18d0 100644 --- a/ant-networking/src/lib.rs +++ b/ant-networking/src/lib.rs @@ -387,6 +387,10 @@ impl Network { .await?; // Filter out results from the ignored peers. close_nodes.retain(|peer_id| !ignore_peers.contains(peer_id)); + info!( + "For record {record_address:?} quoting {} nodes. 
ignore_peers is {ignore_peers:?}", + close_nodes.len() + ); if close_nodes.is_empty() { error!("Can't get store_cost of {record_address:?}, as all close_nodes are ignored"); From 758852dac6bb5a541ac7a8dbdaf7185250769679 Mon Sep 17 00:00:00 2001 From: qima Date: Mon, 16 Dec 2024 19:21:17 +0800 Subject: [PATCH 3/3] fix(client): wait a short while before startup quoting/upload tasks --- .github/workflows/merge.yml | 4 ++-- ant-node/src/node.rs | 6 ++++-- autonomi/src/client/mod.rs | 6 ++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/merge.yml b/.github/workflows/merge.yml index 60faed6af6..0e70de1d94 100644 --- a/.github/workflows/merge.yml +++ b/.github/workflows/merge.yml @@ -404,7 +404,7 @@ jobs: if: matrix.os != 'windows-latest' run: | set -e - for i in {1..100}; do + for i in {1..50}; do dd if=/dev/urandom of=random_file_$i.bin bs=1M count=1 status=none ./target/release/ant --log-output-dest data-dir file upload random_file_$i.bin --public ./target/release/ant --log-output-dest data-dir file upload random_file_$i.bin @@ -419,7 +419,7 @@ jobs: shell: pwsh run: | $ErrorActionPreference = "Stop" - for ($i = 1; $i -le 100; $i++) { + for ($i = 1; $i -le 50; $i++) { $fileName = "random_file_$i.bin" $byteArray = [byte[]]@(0xFF) * (1MB) # Create a 1 MB array filled with 0xFF [System.IO.File]::WriteAllBytes($fileName, $byteArray) diff --git a/ant-node/src/node.rs b/ant-node/src/node.rs index 4908c0bc23..2515af6344 100644 --- a/ant-node/src/node.rs +++ b/ant-node/src/node.rs @@ -16,7 +16,9 @@ use ant_bootstrap::BootstrapCacheStore; use ant_evm::RewardsAddress; #[cfg(feature = "open-metrics")] use ant_networking::MetricsRegistries; -use ant_networking::{Instant, Network, NetworkBuilder, NetworkEvent, NodeIssue, SwarmDriver}; +use ant_networking::{ + target_arch::sleep, Instant, Network, NetworkBuilder, NetworkEvent, NodeIssue, SwarmDriver, +}; use ant_protocol::{ convert_distance_to_u256, error::Error as ProtocolError, @@ -969,7 +971,7 
@@ impl Node { } } // Sleep a short while to avoid causing a spike on resource usage. - std::thread::sleep(std::time::Duration::from_secs(10)); + sleep(std::time::Duration::from_secs(10)).await; } } } diff --git a/autonomi/src/client/mod.rs b/autonomi/src/client/mod.rs index fae0a87ba8..b9fb2008ab 100644 --- a/autonomi/src/client/mod.rs +++ b/autonomi/src/client/mod.rs @@ -120,6 +120,12 @@ impl Client { receiver.await.expect("sender should not close")?; debug!("Client is connected to the network"); + // With the switch to the new bootstrap cache scheme, + // it seems that too many `initial dial`s could result in failure + // when startup quoting/upload tasks are started immediately. + // Hence, put in a forced wait to allow `initial network discovery` to complete. + ant_networking::target_arch::sleep(Duration::from_secs(5)).await; + Ok(Self { network, client_event_sender: Arc::new(None),