From febef043597ba22bbebba77f6d18d79ad093aedf Mon Sep 17 00:00:00 2001 From: snaker Date: Thu, 26 Sep 2024 20:12:21 +0100 Subject: [PATCH 01/24] feat(ScriptData): reduced rpc calls --- scripts/data/generate_data.py | 17 +++++++++++++++-- scripts/data/generate_timestamp_data.py | 18 ++++++++++++------ scripts/data/generate_utxo_data.py | 3 +-- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index ea6daf1f..94c60f65 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -6,6 +6,8 @@ import requests from pathlib import Path from decimal import Decimal, getcontext +from generate_timestamp_data import get_timestamp_data +from generate_utxo_data import get_utxo_set getcontext().prec = 16 @@ -47,6 +49,17 @@ def fetch_chain_state(block_height: int): block_hash = request_rpc("getblockhash", [block_height]) head = request_rpc("getblockheader", [block_hash]) + # If block is downloaded take it localy + data = get_timestamp_data(block_height) + if str(block_height) in data: + data = data[str(block_height)] + head["prev_timestamps"] = data["previous_timestamps"] + if block_height < 2016: + head["epoch_start_time"] = 1231006505 + else: + head["epoch_start_time"] = data["epoch_start_time"] + return head + # In order to init prev_timestamps we need to query 10 previous headers prev_header = head prev_timestamps = [head["time"]] @@ -65,7 +78,6 @@ def fetch_chain_state(block_height: int): head["epoch_start_time"] = 1231006505 else: head["epoch_start_time"] = get_epoch_start_time(block_height) - return head @@ -127,6 +139,7 @@ def bits_to_target(bits: str) -> int: def fetch_block(block_hash: str): """Downloads block with transactions (and referred UTXOs) from RPC given the block hash.""" + block = request_rpc("getblockheader", [block_hash]) block = request_rpc("getblock", [block_hash, 2]) block["data"] = {tx["txid"]: resolve_transaction(tx) for tx in block["tx"]} return block @@ 
-167,7 +180,7 @@ def resolve_outpoint(input: dict): "txid": input["txid"], "vout": input["vout"], "data": format_output(tx["vout"][input["vout"]]), - "block_hash": block["hash"], + "block_hash": tx["blockhash"], "block_height": block["height"], "block_time": block["time"], "is_coinbase": tx["vin"][0].get("coinbase") is not None, diff --git a/scripts/data/generate_timestamp_data.py b/scripts/data/generate_timestamp_data.py index 1f019488..767ce5b8 100644 --- a/scripts/data/generate_timestamp_data.py +++ b/scripts/data/generate_timestamp_data.py @@ -49,12 +49,18 @@ def create_index(folder_path): def list_files_in_gcs(bucket_name: str, prefix: str): """List all files in a GCS bucket under a specific folder (prefix).""" - client = storage.Client() + client = storage.Client.create_anonymous_client() bucket = client.get_bucket(bucket_name) - blobs = bucket.list_blobs(prefix=prefix) - return [ - os.path.basename(blob.name) for blob in blobs if blob.name.endswith(".json") - ] + blobs = bucket.list_blobs() + if not os.path.exists(prefix): + os.makedirs(prefix) + files = [] + for blob in blobs: + if blob.name.endswith(".json") and blob.name.startswith(prefix): + files.append(blob.name) + if not os.path.exists(blob.name): + blob.download_to_filename(blob.name) + return files def index_file_name(key): @@ -89,7 +95,7 @@ def get_timestamp_data(block_number): file_name = index_file_name(int(block_number) // INDEX_SIZE) print(file_name) index = load_index(file_name) - return index[block_number] + return index if __name__ == "__main__": diff --git a/scripts/data/generate_utxo_data.py b/scripts/data/generate_utxo_data.py index a5b4bae8..7b14b68e 100644 --- a/scripts/data/generate_utxo_data.py +++ b/scripts/data/generate_utxo_data.py @@ -118,9 +118,8 @@ def get_utxo_set(block_number: int) -> Dict[str, Any]: with open(BASE_DIR + "/" + chunk_file, "r") as f: for line in f: data = json.loads(line.strip()) - if data["block_number"] == block_number: + if data["block_number"] == 
str(block_number): return data - raise Exception(f"Block {block_number} not found in chunk file {chunk_file}") From d80bddae9968dddcb5e416b2b731eaec69414e6e Mon Sep 17 00:00:00 2001 From: snaker Date: Thu, 26 Sep 2024 20:16:07 +0100 Subject: [PATCH 02/24] fix(ScriptData): fix a few overlooked issues --- scripts/data/generate_data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index 94c60f65..b180b2a4 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -139,7 +139,6 @@ def bits_to_target(bits: str) -> int: def fetch_block(block_hash: str): """Downloads block with transactions (and referred UTXOs) from RPC given the block hash.""" - block = request_rpc("getblockheader", [block_hash]) block = request_rpc("getblock", [block_hash, 2]) block["data"] = {tx["txid"]: resolve_transaction(tx) for tx in block["tx"]} return block From fb92e16dcd2323809476a77d07306704dcb8ccec Mon Sep 17 00:00:00 2001 From: snaker Date: Fri, 27 Sep 2024 10:11:05 +0100 Subject: [PATCH 03/24] fix(ScriptData): typos --- scripts/data/generate_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index b180b2a4..39c60ad0 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -49,7 +49,7 @@ def fetch_chain_state(block_height: int): block_hash = request_rpc("getblockhash", [block_height]) head = request_rpc("getblockheader", [block_hash]) - # If block is downloaded take it localy + # If block is downloaded take it locally data = get_timestamp_data(block_height) if str(block_height) in data: data = data[str(block_height)] From 2fa594ccad9f1edee3041ee0a5d487ea120c70b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Fri, 27 Sep 2024 13:18:47 +0200 Subject: [PATCH 04/24] handle utxo data --- scripts/data/generate_data_fast.py | 329 ++++++++++++++++++++++++ 
scripts/data/generate_timestamp_data.py | 2 - scripts/data/generate_utxo_data.py | 2 +- 3 files changed, 330 insertions(+), 3 deletions(-) create mode 100755 scripts/data/generate_data_fast.py diff --git a/scripts/data/generate_data_fast.py b/scripts/data/generate_data_fast.py new file mode 100755 index 00000000..08a69d67 --- /dev/null +++ b/scripts/data/generate_data_fast.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python + +import sys +import os +import json +import requests +from pathlib import Path +from decimal import Decimal, getcontext +from generate_timestamp_data import get_timestamp_data +from generate_utxo_data import get_utxo_set + +getcontext().prec = 16 + +BITCOIN_RPC = os.getenv("BITCOIN_RPC") +USERPWD = os.getenv("USERPWD") +DEFAULT_URL = "https://bitcoin-mainnet.public.blastapi.io" + + +def request_rpc(method: str, params: list): + """Makes a JSON-RPC call to a Bitcoin API endpoint. + Uses environment variables BITCOIN_RPC and USERPWD + or the default public endpoint if those variables are not set. + + :return: parsed JSON result as Python object + """ + url = BITCOIN_RPC or DEFAULT_URL + auth = tuple(USERPWD.split(":")) if USERPWD else None + headers = {"content-type": "application/json"} + payload = { + "jsonrpc": "2.0", + "method": method, + "params": params, + "id": 0, + } + print(f"Requesting {method} with params {params}") + res = requests.post(url, auth=auth, headers=headers, json=payload) + try: + return res.json()["result"] + except Exception: + raise ConnectionError(f"Unexpected RPC response:\n{res.text}") + + +def fetch_chain_state(block_height: int): + """Fetches chain state at the end of a specific block with given height. 
+ Chain state is a just a block header extended with extra fields: + - prev_timestamps + - epoch_start_time + """ + # Chain state at height H is the state after applying block H + block_hash = request_rpc("getblockhash", [block_height]) + head = request_rpc("getblockheader", [block_hash]) + + # If block is downloaded take it localy + data = get_timestamp_data(block_height)[str(block_height)] + head["prev_timestamps"] = data["previous_timestamps"] + if block_height < 2016: + head["epoch_start_time"] = 1231006505 + else: + head["epoch_start_time"] = data["epoch_start_time"] + return head + +def next_chain_state(head: dict, blocks: list): + """Computes resulting chain state given the initial chain state + and all blocks that were applied to it. + """ + block_height = head["height"] + len(blocks) + next_head = blocks[-1] + + # We need to recalculate the prev_timestamps field given the previous chain state + # and all the blocks we applied to it + prev_timestamps = head["prev_timestamps"] + list(map(lambda x: x["time"], blocks)) + next_head["prev_timestamps"] = prev_timestamps[-11:] + + # Update epoch start time if necessary + if head["height"] // 2016 != block_height // 2016: + next_head["epoch_start_time"] = get_epoch_start_time(block_height) + else: + next_head["epoch_start_time"] = head["epoch_start_time"] + return next_head + + +def get_epoch_start_time(block_height: int) -> int: + """Computes the corresponding epoch start time given the current block height.""" + epoch_start_block_height = (block_height // 2016) * 2016 + epoch_start_block_hash = request_rpc("getblockhash", [epoch_start_block_height]) + epoch_start_header = request_rpc("getblockheader", [epoch_start_block_hash]) + return epoch_start_header["time"] + + +def format_chain_state(head: dict): + """Formats chain state according to the respective Cairo type.""" + return { + "block_height": head["height"], + "total_work": str(int.from_bytes(bytes.fromhex(head["chainwork"]), "big")), + "best_block_hash": 
head["hash"], + "current_target": str(bits_to_target(head["bits"])), + "epoch_start_time": head["epoch_start_time"], + "prev_timestamps": head["prev_timestamps"], + } + + +def bits_to_target(bits: str) -> int: + """Convert difficulty bits (compact target representation) to target. + + :param bits: bits as a hex string (without 0x prefix) + :return: target as integer + """ + exponent = int.from_bytes(bytes.fromhex(bits[:2]), "big") + mantissa = int.from_bytes(bytes.fromhex(bits[2:]), "big") + if exponent == 0: + return mantissa + elif exponent <= 3: + return mantissa >> (8 * (3 - exponent)) + else: + return mantissa << (8 * (exponent - 3)) + + +def fetch_block(block_height: int, block_hash: str): + """Downloads block with transactions (and referred UTXOs) from RPC given the block hash.""" + block = request_rpc("getblock", [block_hash, 2]) + utxos = get_utxo_set(block_height + 1) + block["data"] = {tx["txid"]: resolve_transaction(tx, utxos) for tx in block["tx"]} + return block + + +def resolve_transaction(transaction: dict, previous_outputs): + """Resolves transaction inputs and formats the content according to the Cairo type.""" + return { + "version": transaction["version"], + # Skip the first 4 bytes (version) and take the next 4 bytes (marker + flag) + "is_segwit": transaction["hex"][8:12] == "0001", + "inputs": [resolve_input(input, previous_outputs) for input in transaction["vin"]], + "outputs": [format_output(output) for output in transaction["vout"]], + "lock_time": transaction["locktime"], + } + + +def resolve_input(input: dict, previous_outputs): + """Resolves referenced UTXO and formats the transaction inputs according to the Cairo type.""" + if input.get("coinbase"): + return format_coinbase_input(input) + else: + previous_output = [ + output for output in previous_outputs + if output["txid"] == input["txid"] and int(output["vout"]) == input["vout"] + ][0] + return { + "script": f'0x{input["scriptSig"]["hex"]}', + "sequence": input["sequence"], + 
"previous_output": format_outpoint(previous_output), + "witness": [f"0x{item}" for item in input.get("txinwitness", [])], + } + +def format_outpoint(previous_output): + """Formats output according to the Cairo type.""" + + return { + "txid": previous_output["txid"], + "vout": int(previous_output["vout"]), + "data": { + "value": int(previous_output["value"]), + "pk_script": f'0x{previous_output["pk_script"]}', + "cached": False, + }, + "block_hash": previous_output["block_hash"], + "block_height": int(previous_output["block_height"]), + "block_time": int(previous_output["block_time"]), + "is_coinbase": previous_output["is_coinbase"], + } + + +def format_coinbase_input(input: dict): + """Formats coinbase input according to the Cairo type.""" + return { + "script": f'0x{input["coinbase"]}', + "sequence": input["sequence"], + "previous_output": { + "txid": "0" * 64, + "vout": 0xFFFFFFFF, + "data": { + "value": 0, + "pk_script": "0x", + "cached": False, + }, + "block_hash": "0" * 64, + "block_height": 0, + "block_time": 0, + "is_coinbase": False, + }, + "witness": [ + "0x0000000000000000000000000000000000000000000000000000000000000000" + ], + } + + +def format_output(output: dict): + """Formats transaction output according to the Cairo type.""" + value = (Decimal(str(output["value"])) * Decimal("100000000")).to_integral_value() + return { + "value": int(value), + "pk_script": f'0x{output["scriptPubKey"]["hex"]}', + "cached": False, + } + + +def format_block_with_transactions(block: dict): + """Formats block with transactions according to the respective Cairo type.""" + return { + "header": format_header(block), + "data": { + "variant_id": 1, + "transactions": list(block["data"].values()), + }, + } + + +def fetch_block_header(block_hash: str): + """Downloads block header (without transaction) from RPC given the block hash.""" + return request_rpc("getblockheader", [block_hash]) + + +def format_block(header: dict): + """Formats block (without transactions) according to 
the respective Cairo type. + Note that transaction data uses a verbose format to include information + about the particular enum variant. + + :param header: block header obtained from RPC + """ + return { + "header": format_header(header), + "data": {"variant_id": 0, "merkle_root": header["merkleroot"]}, + } + + +def format_header(header: dict): + """Formats header according to the respective Cairo type. + + :param header: block header obtained from RPC + """ + return { + "hash": header["hash"], + "version": header["version"], + "time": header["time"], + "bits": int.from_bytes(bytes.fromhex(header["bits"]), "big"), + "nonce": header["nonce"], + } + + +def generate_data( + mode: str, initial_height: int, num_blocks: int, include_expected: bool +): + """Generates arguments for Raito program in a human readable form and the expected result. + + :param mode: Validation mode: + "light" — generate block headers with Merkle root only + "full" — generate full blocks with transactions (and referenced UTXOs) + :param initial_height: The block height of the initial chain state (0 means the state after genesis) + :param num_blocks: The number of blocks to apply on top of it (has to be at least 1) + :return: tuple (arguments, expected output) + """ + chain_state = fetch_chain_state(initial_height) + next_block_hash = chain_state["nextblockhash"] + blocks = [] + + # UTXO set to track unspent outputs + utxo_set = {} + + for i in range(num_blocks): + if mode == "light": + block = fetch_block_header(next_block_hash) + elif mode == "full": + block = fetch_block(initial_height + i, next_block_hash) + # Build UTXO set and mark outputs spent within the same block (span). + # Also set "cached" flag for the inputs that spend those UTXOs. 
+ for txid, tx in block["data"].items(): + for tx_input in tx["inputs"]: + outpoint = ( + tx_input["previous_output"]["txid"], + tx_input["previous_output"]["vout"], + ) + if outpoint in utxo_set: + tx_input["previous_output"]["cached"] = True + utxo_set[outpoint]["cached"] = True + + for idx, output in enumerate(tx["outputs"]): + outpoint = (txid, idx) + utxo_set[outpoint] = output + else: + raise NotImplementedError(mode) + next_block_hash = block["nextblockhash"] + blocks.append(block) + + if mode == "full": + # Do another pass to mark UTXOs spent within the same block (span) with "cached" flag. + for block in blocks: + for txid, tx in block["data"].items(): + for idx, output in enumerate(tx["outputs"]): + outpoint = (txid, idx) + if outpoint in utxo_set and utxo_set[outpoint].get("cached", False): + tx["outputs"][idx]["cached"] = True + + block_formatter = ( + format_block if mode == "light" else format_block_with_transactions + ) + result = { + "chain_state": format_chain_state(chain_state), + "blocks": list(map(block_formatter, blocks)), + } + + if include_expected: + result["expected"] = format_chain_state(next_chain_state(chain_state, blocks)) + + return result + + +# Usage: generate_data.py MODE INITIAL_HEIGHT NUM_BLOCKS INCLUDE_EXPECTED OUTPUT_FILE +# Example: generate_data.py 'light' 0 10 false light_0_10.json +if __name__ == "__main__": + if len(sys.argv) != 6: + raise TypeError("Expected five arguments") + + data = generate_data( + mode=sys.argv[1], + initial_height=int(sys.argv[2]), + num_blocks=int(sys.argv[3]), + include_expected=sys.argv[4].lower() == "true", + ) + + Path(sys.argv[5]).write_text(json.dumps(data, indent=2)) diff --git a/scripts/data/generate_timestamp_data.py b/scripts/data/generate_timestamp_data.py index 767ce5b8..ddbfb59e 100644 --- a/scripts/data/generate_timestamp_data.py +++ b/scripts/data/generate_timestamp_data.py @@ -91,9 +91,7 @@ def load_index(file_name): def get_timestamp_data(block_number): """Get the timestamp data for 
a given block number.""" - print(int(block_number) // INDEX_SIZE) file_name = index_file_name(int(block_number) // INDEX_SIZE) - print(file_name) index = load_index(file_name) return index diff --git a/scripts/data/generate_utxo_data.py b/scripts/data/generate_utxo_data.py index 7b14b68e..0e16346c 100644 --- a/scripts/data/generate_utxo_data.py +++ b/scripts/data/generate_utxo_data.py @@ -119,7 +119,7 @@ def get_utxo_set(block_number: int) -> Dict[str, Any]: for line in f: data = json.loads(line.strip()) if data["block_number"] == str(block_number): - return data + return data["outputs"] raise Exception(f"Block {block_number} not found in chunk file {chunk_file}") From 41bd340b6064d8c2afec2f2474cd219edcf4d40d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Fri, 27 Sep 2024 13:41:44 +0200 Subject: [PATCH 05/24] fix timestamps script --- scripts/data/generate_timestamp_data.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/scripts/data/generate_timestamp_data.py b/scripts/data/generate_timestamp_data.py index ddbfb59e..2251a9df 100644 --- a/scripts/data/generate_timestamp_data.py +++ b/scripts/data/generate_timestamp_data.py @@ -29,7 +29,7 @@ def download_timestamp(file_name: str): response = requests.get(url) if response.status_code != 200: - raise Exception(f"Failed to download {file_name}") + raise Exception(f"Failed to download {file_name}", response) with open(file_path, "wb") as f: f.write(response.content) @@ -47,21 +47,16 @@ def create_index(folder_path): return index -def list_files_in_gcs(bucket_name: str, prefix: str): +def list_files_in_gcs(bucket_name: str): """List all files in a GCS bucket under a specific folder (prefix).""" + print("Getting file list from GCS...") client = storage.Client.create_anonymous_client() bucket = client.get_bucket(bucket_name) blobs = bucket.list_blobs() - if not os.path.exists(prefix): - os.makedirs(prefix) - files = [] - for blob in blobs: - if 
blob.name.endswith(".json") and blob.name.startswith(prefix): - files.append(blob.name) - if not os.path.exists(blob.name): - blob.download_to_filename(blob.name) - return files - + + return [ + os.path.basename(blob.name) for blob in blobs if blob.name.endswith(".json") + ] def index_file_name(key): return f"{BASE_DIR}/timestamp_index_{key}.json" @@ -97,7 +92,7 @@ def get_timestamp_data(block_number): if __name__ == "__main__": - file_names = list_files_in_gcs(GCS_BUCKET_NAME, GCS_FOLDER_NAME) + file_names = list_files_in_gcs(GCS_BUCKET_NAME) for file_name in tqdm(file_names, "Downloading files"): download_timestamp(file_name) From 451adb04daef19397e94a2860c13b48705676367 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Fri, 27 Sep 2024 13:57:53 +0200 Subject: [PATCH 06/24] even more fixes --- scripts/data/generate_timestamp_data.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/data/generate_timestamp_data.py b/scripts/data/generate_timestamp_data.py index 2251a9df..ddea3937 100644 --- a/scripts/data/generate_timestamp_data.py +++ b/scripts/data/generate_timestamp_data.py @@ -47,12 +47,12 @@ def create_index(folder_path): return index -def list_files_in_gcs(bucket_name: str): +def list_files_in_gcs(): """List all files in a GCS bucket under a specific folder (prefix).""" - print("Getting file list from GCS...") + print(f"Getting file list from GCS...") client = storage.Client.create_anonymous_client() - bucket = client.get_bucket(bucket_name) - blobs = bucket.list_blobs() + bucket = client.get_bucket(GCS_BUCKET_NAME) + blobs = bucket.list_blobs(prefix=GCS_FOLDER_NAME) return [ os.path.basename(blob.name) for blob in blobs if blob.name.endswith(".json") @@ -92,7 +92,7 @@ def get_timestamp_data(block_number): if __name__ == "__main__": - file_names = list_files_in_gcs(GCS_BUCKET_NAME) + file_names = list_files_in_gcs() for file_name in tqdm(file_names, "Downloading files"): download_timestamp(file_name) 
From f1558f045bc5085bce8e261f194fa410c5b3b0ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 11:57:38 +0200 Subject: [PATCH 07/24] integrate data generation --- scripts/data/generate_data.py | 133 +++++++++--- scripts/data/generate_data_fast.py | 329 ----------------------------- scripts/data/generate_utxo_data.py | 2 +- scripts/data/regenerate_tests.sh | 2 +- 4 files changed, 111 insertions(+), 355 deletions(-) delete mode 100755 scripts/data/generate_data_fast.py diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index 39c60ad0..c9d34dc5 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -4,10 +4,12 @@ import os import json import requests +import argparse from pathlib import Path from decimal import Decimal, getcontext from generate_timestamp_data import get_timestamp_data from generate_utxo_data import get_utxo_set +from tqdm import tqdm getcontext().prec = 16 @@ -15,6 +17,7 @@ USERPWD = os.getenv("USERPWD") DEFAULT_URL = "https://bitcoin-mainnet.public.blastapi.io" +FAST = False def request_rpc(method: str, params: list): """Makes a JSON-RPC call to a Bitcoin API endpoint. @@ -39,6 +42,25 @@ def request_rpc(method: str, params: list): raise ConnectionError(f"Unexpected RPC response:\n{res.text}") +def fetch_chain_state_fast(block_height: int): + """Fetches chain state at the end of a specific block with given height. 
+ Chain state is a just a block header extended with extra fields: + - prev_timestamps + - epoch_start_time + """ + # Chain state at height H is the state after applying block H + block_hash = request_rpc("getblockhash", [block_height]) + head = request_rpc("getblockheader", [block_hash]) + + # If block is downloaded take it localy + data = get_timestamp_data(block_height)[str(block_height)] + head["prev_timestamps"] = data["previous_timestamps"] + if block_height < 2016: + head["epoch_start_time"] = 1231006505 + else: + head["epoch_start_time"] = data["epoch_start_time"] + return head + def fetch_chain_state(block_height: int): """Fetches chain state at the end of a specific block with given height. Chain state is a just a block header extended with extra fields: @@ -137,37 +159,66 @@ def bits_to_target(bits: str) -> int: return mantissa << (8 * (exponent - 3)) -def fetch_block(block_hash: str): +def fetch_block(block_height: int, block_hash: str, fast): """Downloads block with transactions (and referred UTXOs) from RPC given the block hash.""" block = request_rpc("getblock", [block_hash, 2]) - block["data"] = {tx["txid"]: resolve_transaction(tx) for tx in block["tx"]} + previous_outputs = get_utxo_set(block_height + 1) if fast else None + block["data"] = {tx["txid"]: resolve_transaction(tx, previous_outputs) for tx in tqdm(block["tx"], "Resolving transactions")} return block -def resolve_transaction(transaction: dict): +def resolve_transaction(transaction: dict, previous_outputs): """Resolves transaction inputs and formats the content according to the Cairo type.""" return { "version": transaction["version"], # Skip the first 4 bytes (version) and take the next 4 bytes (marker + flag) "is_segwit": transaction["hex"][8:12] == "0001", - "inputs": [resolve_input(input) for input in transaction["vin"]], + "inputs": [resolve_input(input, previous_outputs) for input in transaction["vin"]], "outputs": [format_output(output) for output in transaction["vout"]], 
"lock_time": transaction["locktime"], } -def resolve_input(input: dict): +def resolve_input(input: dict, previous_outputs): """Resolves referenced UTXO and formats the transaction inputs according to the Cairo type.""" if input.get("coinbase"): return format_coinbase_input(input) else: - return { - "script": f'0x{input["scriptSig"]["hex"]}', - "sequence": input["sequence"], - "previous_output": resolve_outpoint(input), - "witness": [f"0x{item}" for item in input.get("txinwitness", [])], - } - + if previous_outputs: + previous_output = [ + output for output in previous_outputs + if output["txid"] == input["txid"] and int(output["vout"]) == input["vout"] + ][0] + return { + "script": f'0x{input["scriptSig"]["hex"]}', + "sequence": input["sequence"], + "previous_output": format_outpoint(previous_output), + "witness": [f"0x{item}" for item in input.get("txinwitness", [])], + } + else: + return { + "script": f'0x{input["scriptSig"]["hex"]}', + "sequence": input["sequence"], + "previous_output": resolve_outpoint(input), + "witness": [f"0x{item}" for item in input.get("txinwitness", [])], + } + +def format_outpoint(previous_output): + """Formats output according to the Cairo type.""" + + return { + "txid": previous_output["txid"], + "vout": int(previous_output["vout"]), + "data": { + "value": int(previous_output["value"]), + "pk_script": f'0x{previous_output["pk_script"]}', + "cached": False, + }, + "block_hash": previous_output["block_hash"], + "block_height": int(previous_output["block_height"]), + "block_time": int(previous_output["block_time"]), + "is_coinbase": previous_output["is_coinbase"], + } def resolve_outpoint(input: dict): """Fetches transaction and block header for the referenced output, @@ -270,11 +321,13 @@ def generate_data( :param mode: Validation mode: "light" — generate block headers with Merkle root only - "full" — generate full blocks with transactions (and referenced UTXOs) + "full, full_fast" — generate full blocks with transactions (and referenced 
UTXOs) :param initial_height: The block height of the initial chain state (0 means the state after genesis) :param num_blocks: The number of blocks to apply on top of it (has to be at least 1) :return: tuple (arguments, expected output) """ + + print("Fetching chain state...") chain_state = fetch_chain_state(initial_height) next_block_hash = chain_state["nextblockhash"] blocks = [] @@ -282,11 +335,12 @@ def generate_data( # UTXO set to track unspent outputs utxo_set = {} - for _ in range(num_blocks): + for i in range(num_blocks): + print(f"Fetching block {initial_height + i}...") if mode == "light": block = fetch_block_header(next_block_hash) - elif mode == "full": - block = fetch_block(next_block_hash) + elif mode == "full" or mode == "full_fast": + block = fetch_block(initial_height + i, next_block_hash, mode == "full_fast") # Build UTXO set and mark outputs spent within the same block (span). # Also set "cached" flag for the inputs that spend those UTXOs. for txid, tx in block["data"].items(): @@ -307,7 +361,7 @@ def generate_data( next_block_hash = block["nextblockhash"] blocks.append(block) - if mode == "full": + if mode == "full" or mode == "full_fast": # Do another pass to mark UTXOs spent within the same block (span) with "cached" flag. 
for block in blocks: for txid, tx in block["data"].items(): @@ -333,14 +387,45 @@ def generate_data( # Usage: generate_data.py MODE INITIAL_HEIGHT NUM_BLOCKS INCLUDE_EXPECTED OUTPUT_FILE # Example: generate_data.py 'light' 0 10 false light_0_10.json if __name__ == "__main__": - if len(sys.argv) != 6: - raise TypeError("Expected five arguments") + + parser = argparse.ArgumentParser(description="Process UTXO files.") + parser.add_argument( + "mode", + choices=['light', 'full', 'full_fast'], + help="Mode", + ) + + parser.add_argument( + "initial_height", + type=int, + help="The block height of the initial chain state", + ) + + parser.add_argument( + "num_blocks", + type=int, + help="The number of blocks", + ) + + parser.add_argument( + "include_expected", + type=bool, + help="Include expected output", + ) + + parser.add_argument( + "output_file", + help="Output file", + ) + + + args = parser.parse_args() data = generate_data( - mode=sys.argv[1], - initial_height=int(sys.argv[2]), - num_blocks=int(sys.argv[3]), - include_expected=sys.argv[4].lower() == "true", + mode=args.mode, + initial_height=args.initial_height, + num_blocks=args.num_blocks, + include_expected=args.include_expected, ) - Path(sys.argv[5]).write_text(json.dumps(data, indent=2)) + Path(args.output_file).write_text(json.dumps(data, indent=2)) diff --git a/scripts/data/generate_data_fast.py b/scripts/data/generate_data_fast.py deleted file mode 100755 index 08a69d67..00000000 --- a/scripts/data/generate_data_fast.py +++ /dev/null @@ -1,329 +0,0 @@ -#!/usr/bin/env python - -import sys -import os -import json -import requests -from pathlib import Path -from decimal import Decimal, getcontext -from generate_timestamp_data import get_timestamp_data -from generate_utxo_data import get_utxo_set - -getcontext().prec = 16 - -BITCOIN_RPC = os.getenv("BITCOIN_RPC") -USERPWD = os.getenv("USERPWD") -DEFAULT_URL = "https://bitcoin-mainnet.public.blastapi.io" - - -def request_rpc(method: str, params: list): - """Makes 
a JSON-RPC call to a Bitcoin API endpoint. - Uses environment variables BITCOIN_RPC and USERPWD - or the default public endpoint if those variables are not set. - - :return: parsed JSON result as Python object - """ - url = BITCOIN_RPC or DEFAULT_URL - auth = tuple(USERPWD.split(":")) if USERPWD else None - headers = {"content-type": "application/json"} - payload = { - "jsonrpc": "2.0", - "method": method, - "params": params, - "id": 0, - } - print(f"Requesting {method} with params {params}") - res = requests.post(url, auth=auth, headers=headers, json=payload) - try: - return res.json()["result"] - except Exception: - raise ConnectionError(f"Unexpected RPC response:\n{res.text}") - - -def fetch_chain_state(block_height: int): - """Fetches chain state at the end of a specific block with given height. - Chain state is a just a block header extended with extra fields: - - prev_timestamps - - epoch_start_time - """ - # Chain state at height H is the state after applying block H - block_hash = request_rpc("getblockhash", [block_height]) - head = request_rpc("getblockheader", [block_hash]) - - # If block is downloaded take it localy - data = get_timestamp_data(block_height)[str(block_height)] - head["prev_timestamps"] = data["previous_timestamps"] - if block_height < 2016: - head["epoch_start_time"] = 1231006505 - else: - head["epoch_start_time"] = data["epoch_start_time"] - return head - -def next_chain_state(head: dict, blocks: list): - """Computes resulting chain state given the initial chain state - and all blocks that were applied to it. 
- """ - block_height = head["height"] + len(blocks) - next_head = blocks[-1] - - # We need to recalculate the prev_timestamps field given the previous chain state - # and all the blocks we applied to it - prev_timestamps = head["prev_timestamps"] + list(map(lambda x: x["time"], blocks)) - next_head["prev_timestamps"] = prev_timestamps[-11:] - - # Update epoch start time if necessary - if head["height"] // 2016 != block_height // 2016: - next_head["epoch_start_time"] = get_epoch_start_time(block_height) - else: - next_head["epoch_start_time"] = head["epoch_start_time"] - return next_head - - -def get_epoch_start_time(block_height: int) -> int: - """Computes the corresponding epoch start time given the current block height.""" - epoch_start_block_height = (block_height // 2016) * 2016 - epoch_start_block_hash = request_rpc("getblockhash", [epoch_start_block_height]) - epoch_start_header = request_rpc("getblockheader", [epoch_start_block_hash]) - return epoch_start_header["time"] - - -def format_chain_state(head: dict): - """Formats chain state according to the respective Cairo type.""" - return { - "block_height": head["height"], - "total_work": str(int.from_bytes(bytes.fromhex(head["chainwork"]), "big")), - "best_block_hash": head["hash"], - "current_target": str(bits_to_target(head["bits"])), - "epoch_start_time": head["epoch_start_time"], - "prev_timestamps": head["prev_timestamps"], - } - - -def bits_to_target(bits: str) -> int: - """Convert difficulty bits (compact target representation) to target. 
- - :param bits: bits as a hex string (without 0x prefix) - :return: target as integer - """ - exponent = int.from_bytes(bytes.fromhex(bits[:2]), "big") - mantissa = int.from_bytes(bytes.fromhex(bits[2:]), "big") - if exponent == 0: - return mantissa - elif exponent <= 3: - return mantissa >> (8 * (3 - exponent)) - else: - return mantissa << (8 * (exponent - 3)) - - -def fetch_block(block_height: int, block_hash: str): - """Downloads block with transactions (and referred UTXOs) from RPC given the block hash.""" - block = request_rpc("getblock", [block_hash, 2]) - utxos = get_utxo_set(block_height + 1) - block["data"] = {tx["txid"]: resolve_transaction(tx, utxos) for tx in block["tx"]} - return block - - -def resolve_transaction(transaction: dict, previous_outputs): - """Resolves transaction inputs and formats the content according to the Cairo type.""" - return { - "version": transaction["version"], - # Skip the first 4 bytes (version) and take the next 4 bytes (marker + flag) - "is_segwit": transaction["hex"][8:12] == "0001", - "inputs": [resolve_input(input, previous_outputs) for input in transaction["vin"]], - "outputs": [format_output(output) for output in transaction["vout"]], - "lock_time": transaction["locktime"], - } - - -def resolve_input(input: dict, previous_outputs): - """Resolves referenced UTXO and formats the transaction inputs according to the Cairo type.""" - if input.get("coinbase"): - return format_coinbase_input(input) - else: - previous_output = [ - output for output in previous_outputs - if output["txid"] == input["txid"] and int(output["vout"]) == input["vout"] - ][0] - return { - "script": f'0x{input["scriptSig"]["hex"]}', - "sequence": input["sequence"], - "previous_output": format_outpoint(previous_output), - "witness": [f"0x{item}" for item in input.get("txinwitness", [])], - } - -def format_outpoint(previous_output): - """Formats output according to the Cairo type.""" - - return { - "txid": previous_output["txid"], - "vout": 
int(previous_output["vout"]), - "data": { - "value": int(previous_output["value"]), - "pk_script": f'0x{previous_output["pk_script"]}', - "cached": False, - }, - "block_hash": previous_output["block_hash"], - "block_height": int(previous_output["block_height"]), - "block_time": int(previous_output["block_time"]), - "is_coinbase": previous_output["is_coinbase"], - } - - -def format_coinbase_input(input: dict): - """Formats coinbase input according to the Cairo type.""" - return { - "script": f'0x{input["coinbase"]}', - "sequence": input["sequence"], - "previous_output": { - "txid": "0" * 64, - "vout": 0xFFFFFFFF, - "data": { - "value": 0, - "pk_script": "0x", - "cached": False, - }, - "block_hash": "0" * 64, - "block_height": 0, - "block_time": 0, - "is_coinbase": False, - }, - "witness": [ - "0x0000000000000000000000000000000000000000000000000000000000000000" - ], - } - - -def format_output(output: dict): - """Formats transaction output according to the Cairo type.""" - value = (Decimal(str(output["value"])) * Decimal("100000000")).to_integral_value() - return { - "value": int(value), - "pk_script": f'0x{output["scriptPubKey"]["hex"]}', - "cached": False, - } - - -def format_block_with_transactions(block: dict): - """Formats block with transactions according to the respective Cairo type.""" - return { - "header": format_header(block), - "data": { - "variant_id": 1, - "transactions": list(block["data"].values()), - }, - } - - -def fetch_block_header(block_hash: str): - """Downloads block header (without transaction) from RPC given the block hash.""" - return request_rpc("getblockheader", [block_hash]) - - -def format_block(header: dict): - """Formats block (without transactions) according to the respective Cairo type. - Note that transaction data uses a verbose format to include information - about the particular enum variant. 
- - :param header: block header obtained from RPC - """ - return { - "header": format_header(header), - "data": {"variant_id": 0, "merkle_root": header["merkleroot"]}, - } - - -def format_header(header: dict): - """Formats header according to the respective Cairo type. - - :param header: block header obtained from RPC - """ - return { - "hash": header["hash"], - "version": header["version"], - "time": header["time"], - "bits": int.from_bytes(bytes.fromhex(header["bits"]), "big"), - "nonce": header["nonce"], - } - - -def generate_data( - mode: str, initial_height: int, num_blocks: int, include_expected: bool -): - """Generates arguments for Raito program in a human readable form and the expected result. - - :param mode: Validation mode: - "light" — generate block headers with Merkle root only - "full" — generate full blocks with transactions (and referenced UTXOs) - :param initial_height: The block height of the initial chain state (0 means the state after genesis) - :param num_blocks: The number of blocks to apply on top of it (has to be at least 1) - :return: tuple (arguments, expected output) - """ - chain_state = fetch_chain_state(initial_height) - next_block_hash = chain_state["nextblockhash"] - blocks = [] - - # UTXO set to track unspent outputs - utxo_set = {} - - for i in range(num_blocks): - if mode == "light": - block = fetch_block_header(next_block_hash) - elif mode == "full": - block = fetch_block(initial_height + i, next_block_hash) - # Build UTXO set and mark outputs spent within the same block (span). - # Also set "cached" flag for the inputs that spend those UTXOs. 
- for txid, tx in block["data"].items(): - for tx_input in tx["inputs"]: - outpoint = ( - tx_input["previous_output"]["txid"], - tx_input["previous_output"]["vout"], - ) - if outpoint in utxo_set: - tx_input["previous_output"]["cached"] = True - utxo_set[outpoint]["cached"] = True - - for idx, output in enumerate(tx["outputs"]): - outpoint = (txid, idx) - utxo_set[outpoint] = output - else: - raise NotImplementedError(mode) - next_block_hash = block["nextblockhash"] - blocks.append(block) - - if mode == "full": - # Do another pass to mark UTXOs spent within the same block (span) with "cached" flag. - for block in blocks: - for txid, tx in block["data"].items(): - for idx, output in enumerate(tx["outputs"]): - outpoint = (txid, idx) - if outpoint in utxo_set and utxo_set[outpoint].get("cached", False): - tx["outputs"][idx]["cached"] = True - - block_formatter = ( - format_block if mode == "light" else format_block_with_transactions - ) - result = { - "chain_state": format_chain_state(chain_state), - "blocks": list(map(block_formatter, blocks)), - } - - if include_expected: - result["expected"] = format_chain_state(next_chain_state(chain_state, blocks)) - - return result - - -# Usage: generate_data.py MODE INITIAL_HEIGHT NUM_BLOCKS INCLUDE_EXPECTED OUTPUT_FILE -# Example: generate_data.py 'light' 0 10 false light_0_10.json -if __name__ == "__main__": - if len(sys.argv) != 6: - raise TypeError("Expected five arguments") - - data = generate_data( - mode=sys.argv[1], - initial_height=int(sys.argv[2]), - num_blocks=int(sys.argv[3]), - include_expected=sys.argv[4].lower() == "true", - ) - - Path(sys.argv[5]).write_text(json.dumps(data, indent=2)) diff --git a/scripts/data/generate_utxo_data.py b/scripts/data/generate_utxo_data.py index 0e16346c..549b01a4 100644 --- a/scripts/data/generate_utxo_data.py +++ b/scripts/data/generate_utxo_data.py @@ -131,7 +131,7 @@ def process_file_range(start_file: str, end_file: str): start_num = int(start_file.split(".")[0]) end_num = 
int(end_file.split(".")[0]) - for file_num in tqdm(range(start_num, end_num + 1), desc="Processing files"): + for file_num in tqdm(range(start_num, end_num + 1), desc="Downloading files"): file_name = f"{file_num:012d}.json" # print(f"\nProcessing file: {file_name}") download_and_split(file_name) diff --git a/scripts/data/regenerate_tests.sh b/scripts/data/regenerate_tests.sh index cfe9fd71..df7e2adf 100755 --- a/scripts/data/regenerate_tests.sh +++ b/scripts/data/regenerate_tests.sh @@ -62,5 +62,5 @@ done for test_case in "${full_test_cases[@]}"; do echo "Generating test data: full mode, chain state @ $test_case, single block" - generate_test "full" $test_case + generate_test "full_fast" $test_case done \ No newline at end of file From 957dd8c58371f055874107643e8d7234433affa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 12:13:31 +0200 Subject: [PATCH 08/24] optimization --- scripts/data/generate_utxo_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/data/generate_utxo_data.py b/scripts/data/generate_utxo_data.py index 549b01a4..9660b5de 100644 --- a/scripts/data/generate_utxo_data.py +++ b/scripts/data/generate_utxo_data.py @@ -117,8 +117,10 @@ def get_utxo_set(block_number: int) -> Dict[str, Any]: # Find and return data for the block with open(BASE_DIR + "/" + chunk_file, "r") as f: for line in f: - data = json.loads(line.strip()) - if data["block_number"] == str(block_number): + # data = json.loads(line.strip()) + # if data["block_number"] == str(block_number): + if line.startswith(f'{{"block_number":{block_number}'): + data = json.loads(line.strip()) return data["outputs"] raise Exception(f"Block {block_number} not found in chunk file {chunk_file}") From 82388b53304152c57a78508aec1adec30f77d9d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 16:00:34 +0200 Subject: [PATCH 09/24] update client script --- .gitignore | 5 ++- 
scripts/data/client.sh | 16 ++++--- scripts/data/generate_data.py | 20 ++++++--- scripts/data/generate_timestamp_data.py | 2 +- scripts/data/generate_utxo_data.py | 57 ++++++++++++++++--------- 5 files changed, 65 insertions(+), 35 deletions(-) mode change 100644 => 100755 scripts/data/client.sh diff --git a/.gitignore b/.gitignore index 2fb090f2..efec7b40 100644 --- a/.gitignore +++ b/.gitignore @@ -25,4 +25,7 @@ Cargo.lock .python-version __pycache__ -.client_cache/ \ No newline at end of file +.client_cache/ +.utxo_data/ +.timestamps_data/ + diff --git a/scripts/data/client.sh b/scripts/data/client.sh old mode 100644 new mode 100755 index 5b39d1a2..e5bddc90 --- a/scripts/data/client.sh +++ b/scripts/data/client.sh @@ -19,21 +19,23 @@ run_client() { first=$((initial_height+1)) second=$((initial_height+num_blocks)) - echo -n "Running $mode client on blocks $first — $second ..." + echo "Running $mode client on blocks $first — $second ..." batch_file=${base_dir}/${mode}_${initial_height}_${num_blocks}.json + arguments_file=${base_dir}/arguments-${mode}_${initial_height}_${num_blocks}.json if [ ! -f "$batch_file" ]; then - python ../../scripts/data/generate_data.py $mode $initial_height $num_blocks true $batch_file + python ../../scripts/data/generate_data.py --fast $mode $initial_height $num_blocks true $batch_file fi - arguments=$(python ../../scripts/data/format_args.py $batch_file) - output=$(scarb cairo-run --no-build --package client --function test "$arguments") - if [[ "$output" == *"FAIL"* ]]; then - echo " fail" + python ../../scripts/data/format_args.py $batch_file > $arguments_file + output=$(scarb cairo-run --no-build --package client --function test --arguments-file $arguments_file) + if [[ $? 
-eq 0 || "$output" == *"FAIL"* || "$output" == *error* ]]; then + echo "fail" echo $output exit 1 else - echo " ok" + echo "ok" + echo $output fi } diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index c9d34dc5..c48ab173 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -315,7 +315,7 @@ def format_header(header: dict): def generate_data( - mode: str, initial_height: int, num_blocks: int, include_expected: bool + mode: str, initial_height: int, num_blocks: int, include_expected: bool, fast: bool ): """Generates arguments for Raito program in a human readable form and the expected result. @@ -327,7 +327,10 @@ def generate_data( :return: tuple (arguments, expected output) """ - print("Fetching chain state...") + if fast: + print("Fetching chain state (fast)...") + else: + print("Fetching chain state...") chain_state = fetch_chain_state(initial_height) next_block_hash = chain_state["nextblockhash"] blocks = [] @@ -339,8 +342,8 @@ def generate_data( print(f"Fetching block {initial_height + i}...") if mode == "light": block = fetch_block_header(next_block_hash) - elif mode == "full" or mode == "full_fast": - block = fetch_block(initial_height + i, next_block_hash, mode == "full_fast") + elif mode == "full": + block = fetch_block(initial_height + i, next_block_hash, fast) # Build UTXO set and mark outputs spent within the same block (span). # Also set "cached" flag for the inputs that spend those UTXOs. 
for txid, tx in block["data"].items(): @@ -391,7 +394,7 @@ def generate_data( parser = argparse.ArgumentParser(description="Process UTXO files.") parser.add_argument( "mode", - choices=['light', 'full', 'full_fast'], + choices=['light', 'full'], help="Mode", ) @@ -418,6 +421,12 @@ def generate_data( help="Output file", ) + parser.add_argument( + "--fast", + dest="fast", + action="store_true", + help="Ending file number (e.g., 000000000050)", + ) args = parser.parse_args() @@ -426,6 +435,7 @@ def generate_data( initial_height=args.initial_height, num_blocks=args.num_blocks, include_expected=args.include_expected, + fast=args.fast ) Path(args.output_file).write_text(json.dumps(data, indent=2)) diff --git a/scripts/data/generate_timestamp_data.py b/scripts/data/generate_timestamp_data.py index ddea3937..316f3bed 100644 --- a/scripts/data/generate_timestamp_data.py +++ b/scripts/data/generate_timestamp_data.py @@ -10,7 +10,7 @@ INDEX_SIZE = 30000 -BASE_DIR = "timestamps_data" +BASE_DIR = ".timestamps_data" GCS_BUCKET_NAME = "shinigami-consensus" GCS_FOLDER_NAME = "timestamps" diff --git a/scripts/data/generate_utxo_data.py b/scripts/data/generate_utxo_data.py index 9660b5de..382c7d1f 100644 --- a/scripts/data/generate_utxo_data.py +++ b/scripts/data/generate_utxo_data.py @@ -7,16 +7,33 @@ import subprocess from typing import Dict, Any import argparse +from google.cloud import storage from tqdm import tqdm from functools import lru_cache from collections import defaultdict # Constants -GCS_BASE_URL = "https://storage.googleapis.com/shinigami-consensus/utxos/" -BASE_DIR = "utxo_data" +BASE_DIR = ".utxo_data" CHUNK_SIZE = 10 INDEX_SIZE = 50000 +GCS_BUCKET_NAME = "shinigami-consensus" +GCS_FOLDER_NAME = "utxos" +GCS_BASE_URL = f"https://storage.googleapis.com/{GCS_BUCKET_NAME}/{GCS_FOLDER_NAME}/" + + +def list_files_in_gcs(): + """List all files in a GCS bucket under a specific folder (prefix).""" + print(f"Getting file list from GCS...") + client = 
storage.Client.create_anonymous_client() + bucket = client.get_bucket(GCS_BUCKET_NAME) + blobs = bucket.list_blobs(prefix=GCS_FOLDER_NAME) + + return [ + os.path.basename(blob.name) for blob in blobs if blob.name.endswith(".json") + ] + + def download_and_split(file_name: str): """Download a file from GCS and split it into chunks.""" @@ -107,34 +124,38 @@ def load_index(file_name): def get_utxo_set(block_number: int) -> Dict[str, Any]: - index = load_index(index_file_name(int(block_number) // INDEX_SIZE)) + index_file = index_file_name(int(block_number) // INDEX_SIZE) + index = load_index(index_file) # Find chunk file chunk_file = index.get(str(block_number)) if not chunk_file: - raise Exception(f"Block number {block_number} not found in index") + raise Exception(f"Block number {block_number} not found in index file: {index_file}") # Find and return data for the block with open(BASE_DIR + "/" + chunk_file, "r") as f: for line in f: # data = json.loads(line.strip()) # if data["block_number"] == str(block_number): - if line.startswith(f'{{"block_number":{block_number}'): + if line.startswith(f'{{"block_number":"{block_number}"'): data = json.loads(line.strip()) return data["outputs"] + + print() raise Exception(f"Block {block_number} not found in chunk file {chunk_file}") -def process_file_range(start_file: str, end_file: str): +def process_files(num_files: int): """Process a range of files from start_file to end_file.""" os.makedirs(BASE_DIR, exist_ok=True) - start_num = int(start_file.split(".")[0]) - end_num = int(end_file.split(".")[0]) + files = list_files_in_gcs() - for file_num in tqdm(range(start_num, end_num + 1), desc="Downloading files"): - file_name = f"{file_num:012d}.json" + if num_files: + files = files[:num_files] + + for file_name in tqdm(files, desc="Downloading files"): # print(f"\nProcessing file: {file_name}") download_and_split(file_name) @@ -145,20 +166,14 @@ def process_file_range(start_file: str, end_file: str): if __name__ == "__main__": 
parser = argparse.ArgumentParser(description="Process UTXO files.") parser.add_argument( - "--from", - dest="start_file", - required=True, - help="Starting file number (e.g., 000000000001)", - ) - parser.add_argument( - "--to", - dest="end_file", - required=True, - help="Ending file number (e.g., 000000000050)", + "--num_files", + dest="num_files", + type=int, + help="Number of files to process, all if not specified", ) args = parser.parse_args() - process_file_range(args.start_file, args.end_file) + process_files(args.num_files) print("All files processed successfully.") From f7331c3d59dc5690c732014c0c29b1778229e741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 17:52:25 +0200 Subject: [PATCH 10/24] fixes after the merge --- scripts/data/generate_data.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index 909cd739..e1f80d02 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -162,7 +162,7 @@ def fetch_block(block_height: int, block_hash: str, include_utreexo_data: bool, return block -def resolve_transaction(transaction: dict, previous_outputs): +def resolve_transaction(transaction: dict, include_utreexo_data, previous_outputs): """Resolves transaction inputs and formats the content according to the Cairo type.""" if include_utreexo_data: return { @@ -343,8 +343,10 @@ def generate_data( if include_utreexo_data: blocks.append( fetch_block( + 0, "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f", include_utreexo_data, + fast ) ) @@ -353,7 +355,7 @@ def generate_data( utxo_set = {} for i in range(num_blocks): - print(f"Fetching block {initial_height + i}/{initial_height + num_blocks}") + print(f"Fetching block {initial_height}| {i+1}/{num_blocks}") if mode == "light": block = fetch_block_header(next_block_hash) elif mode == "full": @@ -401,6 +403,16 @@ def generate_data( 
return result +def str2bool(value): + if isinstance(value, bool): + return value + if value.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif value.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + # Usage: generate_data.py MODE INITIAL_HEIGHT NUM_BLOCKS INCLUDE_EXPECTED OUTPUT_FILE # Example: generate_data.py 'light' 0 10 false light_0_10.json if __name__ == "__main__": @@ -426,13 +438,13 @@ def generate_data( parser.add_argument( "include_expected", - type=bool, + type=str2bool, help="Include expected output", ) parser.add_argument( "include_utreexo_data", - type=bool, + type=str2bool, help="Include utreexo data", ) @@ -446,11 +458,13 @@ def generate_data( "--fast", dest="fast", action="store_true", - help="Ending file number (e.g., 000000000050)", + help="Fast mode", ) args = parser.parse_args() + print(f'Parsing arguments...') + data = generate_data( mode=args.mode, initial_height=args.initial_height, From 971016e178cbf5b0f9c4aa5d4677a4f96f724dcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 18:00:37 +0200 Subject: [PATCH 11/24] more fixes --- scripts/data/client.sh | 2 +- scripts/data/generate_data.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/data/client.sh b/scripts/data/client.sh index e5bddc90..262ea14e 100755 --- a/scripts/data/client.sh +++ b/scripts/data/client.sh @@ -24,7 +24,7 @@ run_client() { batch_file=${base_dir}/${mode}_${initial_height}_${num_blocks}.json arguments_file=${base_dir}/arguments-${mode}_${initial_height}_${num_blocks}.json if [ ! 
-f "$batch_file" ]; then - python ../../scripts/data/generate_data.py --fast $mode $initial_height $num_blocks true $batch_file + python ../../scripts/data/generate_data.py --fast $mode $initial_height $num_blocks true false $batch_file fi python ../../scripts/data/format_args.py $batch_file > $arguments_file diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index e1f80d02..795af541 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -463,8 +463,6 @@ def str2bool(value): args = parser.parse_args() - print(f'Parsing arguments...') - data = generate_data( mode=args.mode, initial_height=args.initial_height, From 80ec12fc4b43f5282f3d1fdd8fd80ae578828d79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 18:50:39 +0200 Subject: [PATCH 12/24] make sure timestamps are ints --- scripts/data/generate_data.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index 795af541..782d068a 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -49,13 +49,14 @@ def fetch_chain_state_fast(block_height: int): # If block is downloaded take it localy data = get_timestamp_data(block_height)[str(block_height)] - head["prev_timestamps"] = data["previous_timestamps"] + head["prev_timestamps"] = [int(t) for t in data["previous_timestamps"]] if block_height < 2016: head["epoch_start_time"] = 1231006505 else: - head["epoch_start_time"] = data["epoch_start_time"] + head["epoch_start_time"] = int(data["epoch_start_time"]) return head + def fetch_chain_state(block_height: int): """Fetches chain state at the end of a specific block with given height. 
Chain state is a just a block header extended with extra fields: @@ -66,20 +67,9 @@ def fetch_chain_state(block_height: int): block_hash = request_rpc("getblockhash", [block_height]) head = request_rpc("getblockheader", [block_hash]) - # If block is downloaded take it locally - data = get_timestamp_data(block_height) - if str(block_height) in data: - data = data[str(block_height)] - head["prev_timestamps"] = data["previous_timestamps"] - if block_height < 2016: - head["epoch_start_time"] = 1231006505 - else: - head["epoch_start_time"] = data["epoch_start_time"] - return head - # In order to init prev_timestamps we need to query 10 previous headers prev_header = head - prev_timestamps = [head["time"]] + prev_timestamps = [int(head["time"])] for _ in range(10): if prev_header["height"] == 0: prev_timestamps.insert(0, 0) @@ -87,16 +77,18 @@ def fetch_chain_state(block_height: int): prev_header = request_rpc( "getblockheader", [prev_header["previousblockhash"]] ) - prev_timestamps.insert(0, prev_header["time"]) + prev_timestamps.insert(0, int(prev_header["time"])) head["prev_timestamps"] = prev_timestamps + print("timestamps", prev_timestamps) + # In order to init epoch start we need to query block header at epoch start if block_height < 2016: head["epoch_start_time"] = 1231006505 else: head["epoch_start_time"] = get_epoch_start_time(block_height) - return head + return head def next_chain_state(head: dict, blocks: list): """Computes resulting chain state given the initial chain state @@ -335,7 +327,9 @@ def generate_data( print("Fetching chain state (fast)...") else: print("Fetching chain state...") - chain_state = fetch_chain_state(initial_height) + + chain_state = fetch_chain_state_fast(initial_height) if fast else fetch_chain_state(initial_height) + next_block_hash = chain_state["nextblockhash"] blocks = [] From 601e39643a7cdc6c1a0f6651d08bb7db7ee35f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 19:03:03 +0200 Subject: 
[PATCH 13/24] fix blocks without outputs --- scripts/data/generate_utxo_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/data/generate_utxo_data.py b/scripts/data/generate_utxo_data.py index 382c7d1f..f90ae3ef 100644 --- a/scripts/data/generate_utxo_data.py +++ b/scripts/data/generate_utxo_data.py @@ -130,7 +130,8 @@ def get_utxo_set(block_number: int) -> Dict[str, Any]: # Find chunk file chunk_file = index.get(str(block_number)) if not chunk_file: - raise Exception(f"Block number {block_number} not found in index file: {index_file}") + return {"block_number":str(block_number), "outputs": []} + # raise Exception(f"Block number {block_number} not found in index file: {index_file}") # Find and return data for the block with open(BASE_DIR + "/" + chunk_file, "r") as f: From 8e64f3539129459548cdb85bacfed2042e0fd0de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 19:05:11 +0200 Subject: [PATCH 14/24] fix error detection --- scripts/data/client.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/data/client.sh b/scripts/data/client.sh index 262ea14e..6c3480fb 100755 --- a/scripts/data/client.sh +++ b/scripts/data/client.sh @@ -29,7 +29,7 @@ run_client() { python ../../scripts/data/format_args.py $batch_file > $arguments_file output=$(scarb cairo-run --no-build --package client --function test --arguments-file $arguments_file) - if [[ $? -eq 0 || "$output" == *"FAIL"* || "$output" == *error* ]]; then + if [[ $? 
-ne 0 || "$output" == *"FAIL"* || "$output" == *error* ]]; then echo "fail" echo $output exit 1 From 663be4594941674c721924567205e5b6920e53fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 19:12:50 +0200 Subject: [PATCH 15/24] fix mesages --- scripts/data/client.sh | 6 +++--- scripts/data/generate_data.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/data/client.sh b/scripts/data/client.sh index 6c3480fb..fc78ad0e 100755 --- a/scripts/data/client.sh +++ b/scripts/data/client.sh @@ -19,17 +19,17 @@ run_client() { first=$((initial_height+1)) second=$((initial_height+num_blocks)) - echo "Running $mode client on blocks $first — $second ..." batch_file=${base_dir}/${mode}_${initial_height}_${num_blocks}.json arguments_file=${base_dir}/arguments-${mode}_${initial_height}_${num_blocks}.json if [ ! -f "$batch_file" ]; then python ../../scripts/data/generate_data.py --fast $mode $initial_height $num_blocks true false $batch_file fi - + + echo -n "Running $mode client on blocks $first — $second " python ../../scripts/data/format_args.py $batch_file > $arguments_file output=$(scarb cairo-run --no-build --package client --function test --arguments-file $arguments_file) - if [[ $? -ne 0 || "$output" == *"FAIL"* || "$output" == *error* ]]; then + if [[ $? 
-ne 0 || "$output" == *"FAIL"* || "$output" == *error* || "$output" == *panicked* ]]; then echo "fail" echo $output exit 1 diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index 782d068a..d0ab9649 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -327,6 +327,8 @@ def generate_data( print("Fetching chain state (fast)...") else: print("Fetching chain state...") + + print(f"blocks: {initial_height} - {initial_height + num_blocks - 1}") chain_state = fetch_chain_state_fast(initial_height) if fast else fetch_chain_state(initial_height) From 0e85082625f60256604fde59f7a724b5a5edd96b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 19:20:40 +0200 Subject: [PATCH 16/24] fixes --- scripts/data/client.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/data/client.sh b/scripts/data/client.sh index fc78ad0e..46b3f8ba 100755 --- a/scripts/data/client.sh +++ b/scripts/data/client.sh @@ -6,12 +6,13 @@ base_dir=".client_cache" start=${1:-0} -end=${2:-100} +no_of_blocks=${2:-100} +end=$(($start+$no_of_blocks)) step=${3:-1} mode=${4:-"light"} strategy=${5:-"sequential"} -mkdir $base_dir || true +mkdir -p $base_dir run_client() { local initial_height=$1 From c4b409b8658caed4b5303d2ce8e0f1479cde3891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Mon, 30 Sep 2024 19:40:41 +0200 Subject: [PATCH 17/24] fmt --- scripts/data/generate_data.py | 71 ++++++++++++++++--------- scripts/data/generate_timestamp_data.py | 3 +- scripts/data/generate_utxo_data.py | 9 ++-- 3 files changed, 51 insertions(+), 32 deletions(-) diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index d0ab9649..c3f632f5 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -19,6 +19,7 @@ FAST = False + def request_rpc(method: str, params: list): """Makes a JSON-RPC call to a Bitcoin API endpoint. 
Uses environment variables BITCOIN_RPC and USERPWD @@ -47,7 +48,7 @@ def fetch_chain_state_fast(block_height: int): block_hash = request_rpc("getblockhash", [block_height]) head = request_rpc("getblockheader", [block_hash]) - # If block is downloaded take it localy + # If block is downloaded take it locally data = get_timestamp_data(block_height)[str(block_height)] head["prev_timestamps"] = [int(t) for t in data["previous_timestamps"]] if block_height < 2016: @@ -90,6 +91,7 @@ def fetch_chain_state(block_height: int): return head + def next_chain_state(head: dict, blocks: list): """Computes resulting chain state given the initial chain state and all blocks that were applied to it. @@ -150,7 +152,10 @@ def fetch_block(block_height: int, block_hash: str, include_utreexo_data: bool, """Downloads block with transactions (and referred UTXOs) from RPC given the block hash.""" block = request_rpc("getblock", [block_hash, 2]) previous_outputs = get_utxo_set(block_height + 1) if fast else None - block["data"] = {tx["txid"]: resolve_transaction(tx, include_utreexo_data, previous_outputs) for tx in tqdm(block["tx"], "Resolving transactions")} + block["data"] = { + tx["txid"]: resolve_transaction(tx, include_utreexo_data, previous_outputs) + for tx in tqdm(block["tx"], "Resolving transactions") + } return block @@ -162,16 +167,20 @@ def resolve_transaction(transaction: dict, include_utreexo_data, previous_output "txid": transaction["txid"], # Skip the first 4 bytes (version) and take the next 4 bytes (marker + flag) "is_segwit": transaction["hex"][8:12] == "0001", - "inputs": [resolve_input(input, previous_outputs) for input in transaction["vin"]], + "inputs": [ + resolve_input(input, previous_outputs) for input in transaction["vin"] + ], "outputs": [format_output(output) for output in transaction["vout"]], "lock_time": transaction["locktime"], } - else: + else: return { "version": transaction["version"], # Skip the first 4 bytes (version) and take the next 4 bytes (marker + 
flag) "is_segwit": transaction["hex"][8:12] == "0001", - "inputs": [resolve_input(input, previous_outputs) for input in transaction["vin"]], + "inputs": [ + resolve_input(input, previous_outputs) for input in transaction["vin"] + ], "outputs": [format_output(output) for output in transaction["vout"]], "lock_time": transaction["locktime"], } @@ -184,8 +193,10 @@ def resolve_input(input: dict, previous_outputs): else: if previous_outputs: previous_output = [ - output for output in previous_outputs - if output["txid"] == input["txid"] and int(output["vout"]) == input["vout"] + output + for output in previous_outputs + if output["txid"] == input["txid"] + and int(output["vout"]) == input["vout"] ][0] return { "script": f'0x{input["scriptSig"]["hex"]}', @@ -201,16 +212,17 @@ def resolve_input(input: dict, previous_outputs): "witness": [f"0x{item}" for item in input.get("txinwitness", [])], } + def format_outpoint(previous_output): """Formats output according to the Cairo type.""" - + return { "txid": previous_output["txid"], "vout": int(previous_output["vout"]), "data": { "value": int(previous_output["value"]), "pk_script": f'0x{previous_output["pk_script"]}', - "cached": False, + "cached": False, }, "block_hash": previous_output["block_hash"], "block_height": int(previous_output["block_height"]), @@ -218,6 +230,7 @@ def format_outpoint(previous_output): "is_coinbase": previous_output["is_coinbase"], } + def resolve_outpoint(input: dict): """Fetches transaction and block header for the referenced output, formats resulting outpoint according to the Cairo type. @@ -311,7 +324,7 @@ def generate_data( num_blocks: int, include_expected: bool, include_utreexo_data: bool, - fast: bool + fast: bool, ): """Generates arguments for Raito program in a human readable form and the expected result. 
@@ -322,16 +335,20 @@ def generate_data( :param num_blocks: The number of blocks to apply on top of it (has to be at least 1) :return: tuple (arguments, expected output) """ - + if fast: print("Fetching chain state (fast)...") else: print("Fetching chain state...") print(f"blocks: {initial_height} - {initial_height + num_blocks - 1}") - - chain_state = fetch_chain_state_fast(initial_height) if fast else fetch_chain_state(initial_height) - + + chain_state = ( + fetch_chain_state_fast(initial_height) + if fast + else fetch_chain_state(initial_height) + ) + next_block_hash = chain_state["nextblockhash"] blocks = [] @@ -342,7 +359,7 @@ def generate_data( 0, "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f", include_utreexo_data, - fast + fast, ) ) @@ -355,7 +372,9 @@ def generate_data( if mode == "light": block = fetch_block_header(next_block_hash) elif mode == "full": - block = fetch_block(initial_height + i, next_block_hash, include_utreexo_data, fast) + block = fetch_block( + initial_height + i, next_block_hash, include_utreexo_data, fast + ) # Build UTXO set and mark outputs spent within the same block (span). # Also set "cached" flag for the inputs that spend those UTXOs. 
for txid, tx in block["data"].items(): @@ -402,24 +421,25 @@ def generate_data( def str2bool(value): if isinstance(value, bool): return value - if value.lower() in ('yes', 'true', 't', 'y', '1'): + if value.lower() in ("yes", "true", "t", "y", "1"): return True - elif value.lower() in ('no', 'false', 'f', 'n', '0'): + elif value.lower() in ("no", "false", "f", "n", "0"): return False else: - raise argparse.ArgumentTypeError('Boolean value expected.') + raise argparse.ArgumentTypeError("Boolean value expected.") + # Usage: generate_data.py MODE INITIAL_HEIGHT NUM_BLOCKS INCLUDE_EXPECTED OUTPUT_FILE # Example: generate_data.py 'light' 0 10 false light_0_10.json if __name__ == "__main__": - + parser = argparse.ArgumentParser(description="Process UTXO files.") parser.add_argument( "mode", - choices=['light', 'full'], + choices=["light", "full"], help="Mode", ) - + parser.add_argument( "initial_height", type=int, @@ -431,7 +451,7 @@ def str2bool(value): type=int, help="The number of blocks", ) - + parser.add_argument( "include_expected", type=str2bool, @@ -444,7 +464,6 @@ def str2bool(value): help="Include utreexo data", ) - parser.add_argument( "output_file", help="Output file", @@ -457,7 +476,7 @@ def str2bool(value): help="Fast mode", ) - args = parser.parse_args() + args = parser.parse_args() data = generate_data( mode=args.mode, @@ -465,7 +484,7 @@ def str2bool(value): num_blocks=args.num_blocks, include_expected=args.include_expected, include_utreexo_data=args.include_utreexo_data, - fast=args.fast + fast=args.fast, ) Path(args.output_file).write_text(json.dumps(data, indent=2)) diff --git a/scripts/data/generate_timestamp_data.py b/scripts/data/generate_timestamp_data.py index 316f3bed..71b74719 100644 --- a/scripts/data/generate_timestamp_data.py +++ b/scripts/data/generate_timestamp_data.py @@ -53,11 +53,12 @@ def list_files_in_gcs(): client = storage.Client.create_anonymous_client() bucket = client.get_bucket(GCS_BUCKET_NAME) blobs = 
bucket.list_blobs(prefix=GCS_FOLDER_NAME) - + return [ os.path.basename(blob.name) for blob in blobs if blob.name.endswith(".json") ] + def index_file_name(key): return f"{BASE_DIR}/timestamp_index_{key}.json" diff --git a/scripts/data/generate_utxo_data.py b/scripts/data/generate_utxo_data.py index f90ae3ef..54086185 100644 --- a/scripts/data/generate_utxo_data.py +++ b/scripts/data/generate_utxo_data.py @@ -28,13 +28,12 @@ def list_files_in_gcs(): client = storage.Client.create_anonymous_client() bucket = client.get_bucket(GCS_BUCKET_NAME) blobs = bucket.list_blobs(prefix=GCS_FOLDER_NAME) - + return [ os.path.basename(blob.name) for blob in blobs if blob.name.endswith(".json") ] - def download_and_split(file_name: str): """Download a file from GCS and split it into chunks.""" os.makedirs(BASE_DIR, exist_ok=True) @@ -124,13 +123,13 @@ def load_index(file_name): def get_utxo_set(block_number: int) -> Dict[str, Any]: - index_file = index_file_name(int(block_number) // INDEX_SIZE) + index_file = index_file_name(int(block_number) // INDEX_SIZE) index = load_index(index_file) # Find chunk file chunk_file = index.get(str(block_number)) if not chunk_file: - return {"block_number":str(block_number), "outputs": []} + return {"block_number": str(block_number), "outputs": []} # raise Exception(f"Block number {block_number} not found in index file: {index_file}") # Find and return data for the block @@ -141,7 +140,7 @@ def get_utxo_set(block_number: int) -> Dict[str, Any]: if line.startswith(f'{{"block_number":"{block_number}"'): data = json.loads(line.strip()) return data["outputs"] - + print() raise Exception(f"Block {block_number} not found in chunk file {chunk_file}") From dab4c6785a8177f7b145579711d6fd7fc9981be6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Tue, 1 Oct 2024 11:52:33 +0200 Subject: [PATCH 18/24] fix review comments --- scripts/data/client.sh | 2 +- scripts/data/generate_data.py | 46 +++++++++++++++++++++-------------- 2 files 
changed, 29 insertions(+), 19 deletions(-) diff --git a/scripts/data/client.sh b/scripts/data/client.sh index 46b3f8ba..c36dea78 100755 --- a/scripts/data/client.sh +++ b/scripts/data/client.sh @@ -24,7 +24,7 @@ run_client() { batch_file=${base_dir}/${mode}_${initial_height}_${num_blocks}.json arguments_file=${base_dir}/arguments-${mode}_${initial_height}_${num_blocks}.json if [ ! -f "$batch_file" ]; then - python ../../scripts/data/generate_data.py --fast $mode $initial_height $num_blocks true false $batch_file + python ../../scripts/data/generate_data.py --fast --mode $mode --height $initial_height --num_blocks $num_blocks --include_expected --output_file $batch_file fi echo -n "Running $mode client on blocks $first — $second " diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index c3f632f5..b8501dd7 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -81,8 +81,6 @@ def fetch_chain_state(block_height: int): prev_timestamps.insert(0, int(prev_header["time"])) head["prev_timestamps"] = prev_timestamps - print("timestamps", prev_timestamps) - # In order to init epoch start we need to query block header at epoch start if block_height < 2016: head["epoch_start_time"] = 1231006505 @@ -151,7 +149,13 @@ def bits_to_target(bits: str) -> int: def fetch_block(block_height: int, block_hash: str, include_utreexo_data: bool, fast): """Downloads block with transactions (and referred UTXOs) from RPC given the block hash.""" block = request_rpc("getblock", [block_hash, 2]) - previous_outputs = get_utxo_set(block_height + 1) if fast else None + + previous_outputs = ( + {(o["txid"], int(o["vout"])): o for o in get_utxo_set(block_height + 1)} + if fast + else None + ) + block["data"] = { tx["txid"]: resolve_transaction(tx, include_utreexo_data, previous_outputs) for tx in tqdm(block["tx"], "Resolving transactions") @@ -192,12 +196,7 @@ def resolve_input(input: dict, previous_outputs): return format_coinbase_input(input) else: 
if previous_outputs: - previous_output = [ - output - for output in previous_outputs - if output["txid"] == input["txid"] - and int(output["vout"]) == input["vout"] - ][0] + previous_output = previous_outputs.get((input["txid"], input["vout"])) return { "script": f'0x{input["scriptSig"]["hex"]}', "sequence": input["sequence"], @@ -435,37 +434,48 @@ def str2bool(value): parser = argparse.ArgumentParser(description="Process UTXO files.") parser.add_argument( - "mode", + "--mode", + dest="mode", + default="full", choices=["light", "full"], help="Mode", ) parser.add_argument( - "initial_height", + "--height", + dest="height", + required=True, type=int, help="The block height of the initial chain state", ) parser.add_argument( - "num_blocks", + "--num_blocks", + dest="num_blocks", + required=True, type=int, help="The number of blocks", ) parser.add_argument( - "include_expected", - type=str2bool, + "--include_expected", + dest="include_expected", + action="store_true", help="Include expected output", ) parser.add_argument( - "include_utreexo_data", - type=str2bool, + "--include_utreexo_data", + dest="include_utreexo_data", + action="store_true", help="Include utreexo data", ) parser.add_argument( - "output_file", + "--output_file", + dest="output_file", + required=True, + type=str, help="Output file", ) @@ -480,7 +490,7 @@ def str2bool(value): data = generate_data( mode=args.mode, - initial_height=args.initial_height, + initial_height=args.height, num_blocks=args.num_blocks, include_expected=args.include_expected, include_utreexo_data=args.include_utreexo_data, From ad0a9499d739debca3bcd76583981e06e1f357b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Tue, 1 Oct 2024 13:56:53 +0200 Subject: [PATCH 19/24] remove cached on previous_output --- packages/client/src/main.cairo | 4 ++-- scripts/data/generate_data.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/client/src/main.cairo b/packages/client/src/main.cairo index 
82dcd3e4..eabd4fb5 100644 --- a/packages/client/src/main.cairo +++ b/packages/client/src/main.cairo @@ -19,8 +19,8 @@ struct Args { /// then validates and applies them one by one. /// Returns new state in case of success, otherwise raises an error. fn main(mut arguments: Span) -> State { - let Args { mut state, blocks, } = Serde::deserialize(ref arguments) - .expect('Failed to deserialize'); + let Args { mut state, blocks, } = Serde::deserialize(ref arguments).unwrap(); + // .expect('Failed to deserialize'); let mut utxo_set: UtxoSet = Default::default(); diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index b8501dd7..e865aca6 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -383,6 +383,7 @@ def generate_data( tx_input["previous_output"]["vout"], ) if outpoint in utxo_set: + # TODO: Check if this is correct tx_input["previous_output"]["cached"] = True utxo_set[outpoint]["cached"] = True From a5a17ca63134ed112faf14536e52c267ab153517 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Tue, 1 Oct 2024 14:04:00 +0200 Subject: [PATCH 20/24] remove cached on previous_output 2 --- scripts/data/generate_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index e865aca6..6cfb44c7 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -384,7 +384,7 @@ def generate_data( ) if outpoint in utxo_set: # TODO: Check if this is correct - tx_input["previous_output"]["cached"] = True + # tx_input["previous_output"]["cached"] = True utxo_set[outpoint]["cached"] = True for idx, output in enumerate(tx["outputs"]): From f880769c0a35a2b87590021a954f56e7d6057daa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Tue, 1 Oct 2024 14:25:58 +0200 Subject: [PATCH 21/24] fix cached on previous_output 3 --- scripts/data/generate_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/scripts/data/generate_data.py b/scripts/data/generate_data.py index 6cfb44c7..a483fec1 100755 --- a/scripts/data/generate_data.py +++ b/scripts/data/generate_data.py @@ -384,7 +384,7 @@ def generate_data( ) if outpoint in utxo_set: # TODO: Check if this is correct - # tx_input["previous_output"]["cached"] = True + tx_input["previous_output"]["data"]["cached"] = True utxo_set[outpoint]["cached"] = True for idx, output in enumerate(tx["outputs"]): From a014ca56b1aeaa4be75664d013f90f0c06ece300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Tue, 1 Oct 2024 14:27:29 +0200 Subject: [PATCH 22/24] fmt --- packages/client/src/main.cairo | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/client/src/main.cairo b/packages/client/src/main.cairo index eabd4fb5..82dcd3e4 100644 --- a/packages/client/src/main.cairo +++ b/packages/client/src/main.cairo @@ -19,8 +19,8 @@ struct Args { /// then validates and applies them one by one. /// Returns new state in case of success, otherwise raises an error. 
fn main(mut arguments: Span) -> State { - let Args { mut state, blocks, } = Serde::deserialize(ref arguments).unwrap(); - // .expect('Failed to deserialize'); + let Args { mut state, blocks, } = Serde::deserialize(ref arguments) + .expect('Failed to deserialize'); let mut utxo_set: UtxoSet = Default::default(); From a247da11588d4505a9ea3d6d7a336fd7e34ea320 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Tue, 1 Oct 2024 14:45:25 +0200 Subject: [PATCH 23/24] fix no outputs edge case --- scripts/data/client.sh | 3 +-- scripts/data/generate_utxo_data.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/data/client.sh b/scripts/data/client.sh index c36dea78..121120f8 100755 --- a/scripts/data/client.sh +++ b/scripts/data/client.sh @@ -19,7 +19,6 @@ run_client() { local num_blocks=$2 first=$((initial_height+1)) - second=$((initial_height+num_blocks)) batch_file=${base_dir}/${mode}_${initial_height}_${num_blocks}.json arguments_file=${base_dir}/arguments-${mode}_${initial_height}_${num_blocks}.json @@ -27,7 +26,7 @@ run_client() { python ../../scripts/data/generate_data.py --fast --mode $mode --height $initial_height --num_blocks $num_blocks --include_expected --output_file $batch_file fi - echo -n "Running $mode client on blocks $first — $second " + echo -n "Running $mode client on block $first" python ../../scripts/data/format_args.py $batch_file > $arguments_file output=$(scarb cairo-run --no-build --package client --function test --arguments-file $arguments_file) if [[ $? 
-ne 0 || "$output" == *"FAIL"* || "$output" == *error* || "$output" == *panicked* ]]; then diff --git a/scripts/data/generate_utxo_data.py b/scripts/data/generate_utxo_data.py index 54086185..456cfa33 100644 --- a/scripts/data/generate_utxo_data.py +++ b/scripts/data/generate_utxo_data.py @@ -129,7 +129,7 @@ def get_utxo_set(block_number: int) -> Dict[str, Any]: # Find chunk file chunk_file = index.get(str(block_number)) if not chunk_file: - return {"block_number": str(block_number), "outputs": []} + return [] # raise Exception(f"Block number {block_number} not found in index file: {index_file}") # Find and return data for the block @@ -141,7 +141,6 @@ def get_utxo_set(block_number: int) -> Dict[str, Any]: data = json.loads(line.strip()) return data["outputs"] - print() raise Exception(f"Block {block_number} not found in chunk file {chunk_file}") From a04201f28b8b944e95e6b7a8316d94e2a2a55ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kami=C5=84ski?= Date: Tue, 1 Oct 2024 14:52:42 +0200 Subject: [PATCH 24/24] fix messages --- scripts/data/client.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/data/client.sh b/scripts/data/client.sh index 121120f8..7296f3c9 100755 --- a/scripts/data/client.sh +++ b/scripts/data/client.sh @@ -19,6 +19,7 @@ run_client() { local num_blocks=$2 first=$((initial_height+1)) + second=$((initial_height+num_blocks)) batch_file=${base_dir}/${mode}_${initial_height}_${num_blocks}.json arguments_file=${base_dir}/arguments-${mode}_${initial_height}_${num_blocks}.json @@ -26,7 +27,7 @@ run_client() { python ../../scripts/data/generate_data.py --fast --mode $mode --height $initial_height --num_blocks $num_blocks --include_expected --output_file $batch_file fi - echo -n "Running $mode client on block $first" + echo -n "Running $mode client on blocks $first - $second " python ../../scripts/data/format_args.py $batch_file > $arguments_file output=$(scarb cairo-run --no-build --package client --function test 
--arguments-file $arguments_file) if [[ $? -ne 0 || "$output" == *"FAIL"* || "$output" == *error* || "$output" == *panicked* ]]; then