refactor: Speed up Historical Backfill process #379

Merged 10 commits on Nov 12, 2023
1 change: 1 addition & 0 deletions indexer/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions indexer/queryapi_coordinator/Cargo.toml
@@ -7,6 +7,7 @@ authors = ["Near Inc <[email protected]>"]
[dependencies]
anyhow = "1.0.57"
actix-web = "=4.0.1"
aws-config = "0.53.0"
borsh = "0.10.2"
cached = "0.23.0"
chrono = "0.4.25"
285 changes: 118 additions & 167 deletions indexer/queryapi_coordinator/src/historical_block_processing.rs
@@ -11,11 +11,8 @@ use near_lake_framework::near_indexer_primitives::types::{BlockHeight, BlockId,
use serde_json::from_str;
use tokio::task::JoinHandle;

pub const INDEXED_DATA_FILES_BUCKET: &str = "near-delta-lake";
pub const LAKE_BUCKET_PREFIX: &str = "near-lake-data-";
pub const INDEXED_ACTIONS_FILES_FOLDER: &str = "silver/accounts/action_receipt_actions/metadata";
pub const MAX_UNINDEXED_BLOCKS_TO_PROCESS: u64 = 7200; // two hours of blocks takes ~14 minutes.
Collaborator

This kind of served as an alarm for when something went wrong with the index file creation process. Since we are removing it, we should have some metric or error that checks whether the latest index file failed to be created (a sketch of such a check follows this thread). It doesn't need to be here or in this PR, though.

Collaborator

+1

Collaborator Author

This was intentional; I forgot to comment on it, sorry.

Now that we're using near-lake-framework, the BPS should be significantly faster. I'm not sure this should be a concern anymore. I'll add a warning log for now, and we can add more later if this becomes a problem :)
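
A minimal sketch of the kind of staleness check described in this thread, reusing `last_indexed_block_from_metadata`, `UNINDEXED_BLOCKS_SOFT_LIMIT`, and the tracing target already present in this file; the function name and threshold choice are illustrative, not part of the PR:

```rust
// Hypothetical staleness check for the index-file creation process. If the
// latest_block.json metadata falls too far behind the chain head, something is
// likely wrong with the index-file creation pipeline.
async fn warn_if_index_files_stale(
    s3_client: &S3Client,
    current_block_height: BlockHeight,
) -> anyhow::Result<()> {
    let last_indexed_block = last_indexed_block_from_metadata(s3_client).await?;
    if current_block_height.saturating_sub(last_indexed_block) > UNINDEXED_BLOCKS_SOFT_LIMIT {
        tracing::warn!(
            target: crate::INDEXER,
            "Index files appear stale: last indexed block {last_indexed_block}, current block {current_block_height}"
        );
    }
    Ok(())
}
```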

pub const MAX_RPC_BLOCKS_TO_PROCESS: u8 = 20;
pub const UNINDEXED_BLOCKS_SOFT_LIMIT: u64 = 7200;

pub struct Task {
handle: JoinHandle<()>,
@@ -53,7 +50,7 @@ impl Streamer {
_ = cancellation_token_clone.cancelled() => {
tracing::info!(
target: crate::INDEXER,
"Cancelling existing historical backfill for indexer: {:?}",
"Cancelling existing historical backfill for indexer: {}",
indexer.get_full_name(),
);
},
@@ -64,7 +61,13 @@
&s3_client,
&chain_id,
&json_rpc_client,
) => { }
) => {
tracing::info!(
target: crate::INDEXER,
"Finished historical backfill for indexer: {}",
indexer.get_full_name(),
);
}
}
});

@@ -132,36 +135,72 @@ pub(crate) async fn process_historical_messages(
let block_difference: i64 = (current_block_height - start_block) as i64;
match block_difference {
i64::MIN..=-1 => {
bail!("Skipping back fill, start_block_height is greater than current block height: {:?} {:?}",
indexer_function.account_id,
indexer_function.function_name);
bail!(
"Skipping back fill, start_block_height is greater than current block height: {}",
indexer_function.get_full_name(),
);
}
0 => {
bail!("Skipping back fill, start_block_height is equal to current block height: {:?} {:?}",
indexer_function.account_id,
indexer_function.function_name);
bail!(
"Skipping back fill, start_block_height is equal to current block height: {}",
indexer_function.get_full_name(),
);
}
1..=i64::MAX => {
tracing::info!(
target: crate::INDEXER,
"Back filling {block_difference} blocks from {start_block} to current block height {current_block_height}: {:?} {:?}",
indexer_function.account_id,
indexer_function.function_name
"Back filling {block_difference} blocks from {start_block} to current block height {current_block_height}: {}",
indexer_function.get_full_name(),
);

storage::del(
Collaborator

I think there might be a bug in the runner-side prefetch code related to this. If a historical process is kicked off and it fails, then the runner-side prefetch continuously pulls from its buffer array rather than the stream (although the stream message is still there). So even if you delete the stream and fill it with a new historical stream, the earlier failed message will continue to block execution. I'll test this and ship a PR to fix it if that's in fact true.

Collaborator Author

Good catch

Maybe we need to think of alternate ways of handling pre-fetch. Perhaps we could:

  • Update/overwrite the existing stream messages rather than maintaining an in-memory queue
  • Set an expirable key in Redis, similar to the real-time caching (see the sketch below)
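
A minimal sketch of the expirable-key idea, assuming the `redis` crate with its async (`tokio-comp`) feature enabled; the helper name, key format, and 300-second TTL are hypothetical, not part of this PR:

```rust
use redis::AsyncCommands;

// Hypothetical helper: mark a historical block as in flight with a TTL so a
// crashed or failed runner cannot block the stream indefinitely.
async fn mark_block_in_flight(
    redis_url: &str,
    indexer_full_name: &str,
    block_height: u64,
) -> redis::RedisResult<()> {
    let client = redis::Client::open(redis_url)?;
    let mut conn = client.get_async_connection().await?;
    let key = format!("{indexer_full_name}:historical:in_flight:{block_height}");

    // SET key value EX 300: the entry expires on its own if it is never acknowledged.
    conn.set_ex(key, block_height, 300).await
}
```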

redis_connection_manager,
storage::generate_historical_stream_key(&indexer_function.get_full_name()),
)
.await?;
storage::sadd(
redis_connection_manager,
storage::STREAMS_SET_KEY,
storage::generate_historical_stream_key(&indexer_function.get_full_name()),
)
.await?;
storage::set(
redis_connection_manager,
storage::generate_historical_storage_key(&indexer_function.get_full_name()),
serde_json::to_string(&indexer_function)?,
None,
)
.await?;

let start_date =
lookup_block_date_or_next_block_date(start_block, json_rpc_client).await?;

let last_indexed_block = last_indexed_block_from_metadata(s3_client).await?;

let mut blocks_from_index = filter_matching_blocks_from_index_files(
let blocks_from_index = filter_matching_blocks_from_index_files(
start_block,
&indexer_function,
s3_client,
start_date,
)
.await?;

tracing::info!(
target: crate::INDEXER,
"Flushing {} block heights from index files to historical Stream for indexer: {}",
blocks_from_index.len(),
indexer_function.get_full_name(),
);

for block in &blocks_from_index {
storage::xadd(
redis_connection_manager,
storage::generate_historical_stream_key(&indexer_function.get_full_name()),
&[("block_height", block)],
)
.await?;
}

// Check for the case where an index file is written right after we get the last_indexed_block metadata
let last_block_in_data = blocks_from_index.last().unwrap_or(&start_block);
let last_indexed_block = if last_block_in_data > &last_indexed_block {
@@ -170,47 +209,65 @@
last_indexed_block
};

let mut blocks_between_indexed_and_current_block: Vec<BlockHeight> =
filter_matching_unindexed_blocks_from_lake(
last_indexed_block,
current_block_height,
&indexer_function,
s3_client,
chain_id,
)
.await?;
let unindexed_block_difference = current_block_height - last_indexed_block;

blocks_from_index.append(&mut blocks_between_indexed_and_current_block);
tracing::info!(
target: crate::INDEXER,
"Filtering {} unindexed blocks from lake: from block {last_indexed_block} to {current_block_height} for indexer: {}",
unindexed_block_difference,
indexer_function.get_full_name(),
);

if !blocks_from_index.is_empty() {
storage::del(
redis_connection_manager,
storage::generate_historical_stream_key(&indexer_function.get_full_name()),
)
.await?;
storage::sadd(
redis_connection_manager,
storage::STREAMS_SET_KEY,
storage::generate_historical_stream_key(&indexer_function.get_full_name()),
)
.await?;
storage::set(
redis_connection_manager,
storage::generate_historical_storage_key(&indexer_function.get_full_name()),
serde_json::to_string(&indexer_function)?,
None,
)
.await?;
if unindexed_block_difference > UNINDEXED_BLOCKS_SOFT_LIMIT {
tracing::warn!(
target: crate::INDEXER,
"Unindexed block difference exceeds soft limit of: {UNINDEXED_BLOCKS_SOFT_LIMIT} for indexer: {}",
indexer_function.get_full_name(),
);
}

for current_block in blocks_from_index {
storage::xadd(
redis_connection_manager,
storage::generate_historical_stream_key(&indexer_function.get_full_name()),
&[("block_height", current_block)],
)
.await?;
let lake_config = match &chain_id {
ChainId::Mainnet => near_lake_framework::LakeConfigBuilder::default().mainnet(),
ChainId::Testnet => near_lake_framework::LakeConfigBuilder::default().testnet(),
}
.start_block_height(last_indexed_block)
.build()
.context("Failed to build lake config")?;
Collaborator

If we fail to build the lake config, should we then purge the historical feed again? I think it could be confusing to the customer if it looks like their historical process ran, only to then have a gap between the block they specified and the blocks that actually ran. On that front, I feel it might be better to construct this earlier. Maybe also add a retry on it, depending on the error type (a retry sketch follows this thread).

Collaborator Author

There are many things that can fail and cause the historical process to exit; it's not limited to the construction of near-lake-framework.

At this point, I think it's worth pushing this out as is so we can test. We can then have a deeper conversation about error handling later.
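
A minimal sketch of the retry idea from this thread, wrapping the `LakeConfigBuilder` construction the PR introduces; it assumes the surrounding module's `ChainId` and `crate::INDEXER`, and the attempt count and backoff are illustrative only:

```rust
use anyhow::Context;
use std::time::Duration;

// Hypothetical retry wrapper around lake config construction. In practice the
// build step mostly fails on validation, so a real version would likely
// inspect the error type before retrying, as suggested above.
async fn build_lake_config_with_retry(
    chain_id: &ChainId,
    start_block_height: u64,
) -> anyhow::Result<near_lake_framework::LakeConfig> {
    let mut attempts: u64 = 0;
    loop {
        let result = match chain_id {
            ChainId::Mainnet => near_lake_framework::LakeConfigBuilder::default().mainnet(),
            ChainId::Testnet => near_lake_framework::LakeConfigBuilder::default().testnet(),
        }
        .start_block_height(start_block_height)
        .build();

        match result {
            Ok(config) => return Ok(config),
            Err(error) if attempts < 3 => {
                attempts += 1;
                tracing::warn!(
                    target: crate::INDEXER,
                    "Failed to build lake config (attempt {attempts}), retrying: {error}"
                );
                tokio::time::sleep(Duration::from_secs(attempts)).await;
            }
            Err(error) => return Err(error).context("Failed to build lake config"),
        }
    }
}
```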


let (sender, mut stream) = near_lake_framework::streamer(lake_config);

let mut filtered_block_count = 0;
while let Some(streamer_message) = stream.recv().await {
let block_height = streamer_message.block.header.height;
if block_height == current_block_height {
break;
}

let matches = indexer_rules_engine::reduce_indexer_rule_matches_sync(
&indexer_function.indexer_rule,
&streamer_message,
chain_id.clone(),
);

if !matches.is_empty() {
filtered_block_count += 1;

storage::xadd(
redis_connection_manager,
storage::generate_historical_stream_key(&indexer_function.get_full_name()),
&[("block_height", block_height)],
)
.await?;
}
}
drop(sender);

tracing::info!(
target: crate::INDEXER,
"Flushed {} unindexed block heights to historical Stream for indexer: {}",
filtered_block_count,
indexer_function.get_full_name(),
);
}
}
Ok(block_difference)
@@ -219,8 +276,13 @@ pub(crate) async fn process_historical_messages(
pub(crate) async fn last_indexed_block_from_metadata(
s3_client: &S3Client,
) -> anyhow::Result<BlockHeight> {
let key = format!("{}/{}", INDEXED_ACTIONS_FILES_FOLDER, "latest_block.json");
let metadata = s3::fetch_text_file_from_s3(INDEXED_DATA_FILES_BUCKET, key, s3_client).await?;
let key = format!(
"{}/{}",
s3::INDEXED_ACTIONS_FILES_FOLDER,
"latest_block.json"
);
let metadata =
s3::fetch_text_file_from_s3(s3::INDEXED_DATA_FILES_BUCKET, key, s3_client).await?;

let metadata: serde_json::Value = serde_json::from_str(&metadata).unwrap();
let last_indexed_block = metadata["last_indexed_block"].clone();
@@ -243,7 +305,7 @@ pub(crate) async fn filter_matching_blocks_from_index_files(
s3_client: &S3Client,
start_date: DateTime<Utc>,
) -> anyhow::Result<Vec<BlockHeight>> {
let s3_bucket = INDEXED_DATA_FILES_BUCKET;
let s3_bucket = s3::INDEXED_DATA_FILES_BUCKET;

let mut needs_dedupe_and_sort = false;
let indexer_rule = &indexer_function.indexer_rule;
@@ -259,7 +321,7 @@ pub(crate) async fn filter_matching_blocks_from_index_files(
s3::fetch_contract_index_files(
s3_client,
s3_bucket,
INDEXED_ACTIONS_FILES_FOLDER,
s3::INDEXED_ACTIONS_FILES_FOLDER,
start_date,
affected_account_id,
)
@@ -333,117 +395,6 @@ fn parse_blocks_from_index_files(
.collect::<Vec<u64>>()
}

async fn filter_matching_unindexed_blocks_from_lake(
last_indexed_block: BlockHeight,
ending_block_height: BlockHeight,
indexer_function: &IndexerFunction,
s3_client: &S3Client,
chain_id: &ChainId,
) -> anyhow::Result<Vec<u64>> {
let lake_bucket = lake_bucket_for_chain(chain_id);

let indexer_rule = &indexer_function.indexer_rule;
let count = ending_block_height - last_indexed_block;
if count > MAX_UNINDEXED_BLOCKS_TO_PROCESS {
bail!(
"Too many unindexed blocks to filter: {count}. Last indexed block is {last_indexed_block} for function {:?} {:?}",
indexer_function.account_id,
indexer_function.function_name,
);
}
tracing::info!(
target: crate::INDEXER,
"Filtering {count} unindexed blocks from lake: from block {last_indexed_block} to {ending_block_height} for function {:?} {:?}",
indexer_function.account_id,
indexer_function.function_name,
);

let mut blocks_to_process: Vec<u64> = vec![];
for current_block in (last_indexed_block + 1)..ending_block_height {
// fetch block file from S3
let key = format!("{}/block.json", normalize_block_height(current_block));
let s3_result = s3::fetch_text_file_from_s3(&lake_bucket, key, s3_client).await;

if s3_result.is_err() {
let error = s3_result.err().unwrap();
if error
.root_cause()
.downcast_ref::<aws_sdk_s3::error::NoSuchKey>()
.is_some()
{
tracing::info!(
target: crate::INDEXER,
"In manual filtering, skipping block number {} which was not found. For function {:?} {:?}",
current_block,
indexer_function.account_id,
indexer_function.function_name,
);
continue;
} else {
bail!(error);
}
}

let block = s3_result.unwrap();
let block_view = serde_json::from_slice::<
near_lake_framework::near_indexer_primitives::views::BlockView,
>(block.as_ref())
.with_context(|| format!("Error parsing block {} from S3", current_block))?;

let mut shards = vec![];
for shard_id in 0..block_view.chunks.len() as u64 {
let key = format!(
"{}/shard_{}.json",
normalize_block_height(current_block),
shard_id
);
let shard = s3::fetch_text_file_from_s3(&lake_bucket, key, s3_client).await?;
match serde_json::from_slice::<near_lake_framework::near_indexer_primitives::IndexerShard>(
shard.as_ref(),
) {
Ok(parsed_shard) => {
shards.push(parsed_shard);
}
Err(e) => {
bail!("Error parsing shard: {}", e.to_string());
}
}
}

let streamer_message = near_lake_framework::near_indexer_primitives::StreamerMessage {
block: block_view,
shards,
};

// filter block
let matches = indexer_rules_engine::reduce_indexer_rule_matches_sync(
indexer_rule,
&streamer_message,
chain_id.clone(),
);
if !matches.is_empty() {
blocks_to_process.push(current_block);
}
}

tracing::info!(
target: crate::INDEXER,
"Found {block_count} unindexed blocks to process for function {:?} {:?}",
indexer_function.account_id,
indexer_function.function_name,
block_count = blocks_to_process.len()
);
Ok(blocks_to_process)
}

fn lake_bucket_for_chain(chain_id: &ChainId) -> String {
format!("{}{}", LAKE_BUCKET_PREFIX, chain_id)
}

fn normalize_block_height(block_height: BlockHeight) -> String {
format!("{:0>12}", block_height)
}

// if block does not exist, try next block, up to MAX_RPC_BLOCKS_TO_PROCESS (20) blocks
pub async fn lookup_block_date_or_next_block_date(
block_height: u64,