From 1367c1be09ff48b82534f6f695c8e94b6019c0bc Mon Sep 17 00:00:00 2001 From: Sid Date: Thu, 19 Dec 2024 08:59:13 -0800 Subject: [PATCH 01/13] Include competition weight as "norm_weight" in uid data. --- neurons/validator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neurons/validator.py b/neurons/validator.py index b492ecb..1219c11 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -1294,6 +1294,9 @@ def log_step( "win_rate": win_rate[uid], "win_total": wins[uid], "weight": self.weights[uid].item(), + "norm_weight": competition_weights[ + uid + ].item(), # Named norm_weight for leaderboard pipeline compatibilty. } for task in eval_tasks: step_log["uid_data"][str(uid)][f"{task.name}.raw_score"] = ( From 552bca2bb136c75b3bae819025ca8772acb70057 Mon Sep 17 00:00:00 2001 From: Sid Date: Thu, 19 Dec 2024 09:00:10 -0800 Subject: [PATCH 02/13] Bump to bittensor 8.5.1. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c65fdbf..10d1377 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -bittensor==8.4.3 +bittensor==8.5.1 huggingface_hub nltk numpy==2.0.2 From f7e2af3111a9a5370bf97941eda17971725fe7a1 Mon Sep 17 00:00:00 2001 From: Sid Date: Sat, 21 Dec 2024 08:14:48 -0800 Subject: [PATCH 03/13] Fix netuid arg type. --- neurons/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neurons/config.py b/neurons/config.py index 63e56df..f41828f 100644 --- a/neurons/config.py +++ b/neurons/config.py @@ -69,7 +69,7 @@ def validator_config(): help="Where to store downloaded models", ) parser.add_argument( - "--netuid", type=str, default=constants.SUBNET_UID, help="The subnet UID." + "--netuid", type=int, default=constants.SUBNET_UID, help="The subnet UID." ) parser.add_argument( "--do_sample", From 13819ef22c327c5dfa7965984701c9dcb74fa80a Mon Sep 17 00:00:00 2001 From: Sid Date: Sat, 21 Dec 2024 08:55:15 -0800 Subject: [PATCH 04/13] Bump Comp 3 to 25%. --- constants/__init__.py | 68 ++++++++----------------------------------- 1 file changed, 12 insertions(+), 56 deletions(-) diff --git a/constants/__init__.py b/constants/__init__.py index d3501c6..da0430e 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -140,56 +140,12 @@ ), } -INSTRUCT_8B_BLOCK = 4_451_695 -IF_EVAL_V2_BLOCK = 4_523_592 +INSTRUCT_8B_TO_25_WEIGHT_BLOCK = 4_552_883 # Schedule of competitions by block. COMPETITION_SCHEDULE_BY_BLOCK: List[Tuple[int, List[Competition]]] = [ ( 0, - [ - Competition( - CompetitionId.B7_MULTI_CHOICE, - MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MULTI_CHOICE], - 1.0, - eval_tasks=[ - EvalTask( - name="SYNTHETIC_MMLU", - method_id=EvalMethodId.MULTIPLE_CHOICE, - dataset_id=DatasetId.SYNTHETIC_MMLU, - normalization_id=NormalizationId.NONE, - weight=0.85, - ), - EvalTask( - name="WORD_SORTING", - method_id=EvalMethodId.REFERENCE_LOSS, - dataset_id=DatasetId.WORD_SORTING, - normalization_id=NormalizationId.INVERSE_EXPONENTIAL, - normalization_kwargs={"ceiling": 40.0}, - weight=0.05, - ), - EvalTask( - name="FINEWEB", - method_id=EvalMethodId.TEXT_LOSS, - dataset_id=DatasetId.FINEWEB, - normalization_id=NormalizationId.INVERSE_EXPONENTIAL, - normalization_kwargs={"ceiling": 20.0}, - weight=0.05, - ), - EvalTask( - name="IF_EVAL_V1", - method_id=EvalMethodId.IF_EVAL, - dataset_id=DatasetId.SYNTHETIC_IF_EVAL, - normalization_id=NormalizationId.NONE, - dataset_kwargs={"if_eval_version": IfEvalVersion.V1}, - weight=0.05, - ), - ], - ), - ], - ), - ( - INSTRUCT_8B_BLOCK, [ Competition( CompetitionId.B7_MULTI_CHOICE, @@ -201,7 +157,7 @@ method_id=EvalMethodId.MULTIPLE_CHOICE, dataset_id=DatasetId.SYNTHETIC_MMLU, normalization_id=NormalizationId.NONE, - weight=0.8, + weight=0.75, ), EvalTask( name="WORD_SORTING", @@ -220,12 +176,12 @@ weight=0.1, ), EvalTask( - name="IF_EVAL_V1", + name="IF_EVAL_V2", method_id=EvalMethodId.IF_EVAL, dataset_id=DatasetId.SYNTHETIC_IF_EVAL, normalization_id=NormalizationId.NONE, - dataset_kwargs={"if_eval_version": IfEvalVersion.V1}, - weight=0.05, + dataset_kwargs={"if_eval_version": IfEvalVersion.V2}, + weight=0.1, ), ], ), @@ -239,7 +195,7 @@ method_id=EvalMethodId.MULTIPLE_CHOICE, dataset_id=DatasetId.SYNTHETIC_MMLU, normalization_id=NormalizationId.NONE, - weight=0.8, + weight=0.75, ), EvalTask( name="WORD_SORTING", @@ -258,24 +214,24 @@ weight=0.1, ), EvalTask( - name="IF_EVAL_V1", + name="IF_EVAL_V2", method_id=EvalMethodId.IF_EVAL, dataset_id=DatasetId.SYNTHETIC_IF_EVAL, normalization_id=NormalizationId.NONE, - dataset_kwargs={"if_eval_version": IfEvalVersion.V1}, - weight=0.05, + dataset_kwargs={"if_eval_version": IfEvalVersion.V2}, + weight=0.1, ), ], ), ], ), ( - IF_EVAL_V2_BLOCK, + INSTRUCT_8B_TO_25_WEIGHT_BLOCK, [ Competition( CompetitionId.B7_MULTI_CHOICE, MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MULTI_CHOICE], - 0.9, + 0.75, eval_tasks=[ EvalTask( name="SYNTHETIC_MMLU", @@ -313,7 +269,7 @@ Competition( CompetitionId.INSTRUCT_8B, MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.INSTRUCT_8B], - 0.1, + 0.25, eval_tasks=[ EvalTask( name="SYNTHETIC_MMLU", From c5216a8843caa5a65ac1a5f2d52785901eb5f1c5 Mon Sep 17 00:00:00 2001 From: Sid Date: Sat, 21 Dec 2024 09:05:00 -0800 Subject: [PATCH 05/13] Do not reset history at this time. --- neurons/validator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neurons/validator.py b/neurons/validator.py index b492ecb..b972e29 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -929,7 +929,8 @@ async def run_step(self): logging.info("Starting evaluation for competition: " + str(competition.id)) # If the competition's eval tasks have changed, make sure all models are re-evaluated. - self._maybe_reset_eval_history(competition) + # Commenting out for now. Churn from IfEval changes should not actual require a reset. + # self._maybe_reset_eval_history(competition) # Add uids with newly updated models to the upcoming batch of evaluations. with self.pending_uids_to_eval_lock: From dd563b2aa08b0396b00e9cccc3647e4957542eb7 Mon Sep 17 00:00:00 2001 From: Sid Date: Wed, 18 Dec 2024 17:23:36 -0800 Subject: [PATCH 06/13] Set weights on a separate thread. --- neurons/validator.py | 123 ++++++++++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 41 deletions(-) diff --git a/neurons/validator.py b/neurons/validator.py index b911eea..85465c9 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -218,9 +218,9 @@ def __init__(self): self._new_wandb_run() # === Running args === + self.weight_lock = threading.RLock() self.weights = torch.zeros_like(torch.from_numpy(self.metagraph.S)) self.global_step = 0 - self.last_epoch = self.metagraph.block.item() self.uids_to_eval: typing.Dict[CompetitionId, typing.Set] = defaultdict(set) @@ -300,6 +300,22 @@ def __init__(self): f"Failed to load competition tracker state. Reason: {e}. Starting from scratch." ) + # Also update our internal weights based on the tracker. + cur_block = self._get_current_block() + + # Get the competition schedule for the current block. + # This is a list of competitions + competition_schedule: typing.List[Competition] = ( + competition_utils.get_competition_schedule_for_block( + block=cur_block, + schedule_by_block=constants.COMPETITION_SCHEDULE_BY_BLOCK, + ) + ) + with self.weight_lock: + self.weights = self.competition_tracker.get_subnet_weights( + competition_schedule + ) + # Initialize the UIDs to eval. if not os.path.exists(self.uids_filepath): logging.warning("No uids state file found. Starting from scratch.") @@ -720,38 +736,72 @@ def clean_models(self): logging.info("Exiting clean models loop.") - async def try_set_weights(self, block: int, ttl: int): + def set_weights(self): + """Set weights on the chain regularly.""" + + # Check that we have some weights internally for startup situations. + all_zero_weights = True + while all_zero_weights is True: + # Technically returns a tensor but it evaluates to true. + with self.weight_lock: + all_zero_weights = torch.all(self.weights == 0) + bt.logging.trace( + "Waiting 60 seconds for internal weights before continuing to try set weights." + ) + time.sleep(60) + + while not self.stop_event.is_set(): + try: + set_weights_success = False + while not set_weights_success: + set_weights_success, _ = asyncio.run(self.try_set_weights(ttl=60)) + # Wait for 60 seconds before we try to set weights again. + if set_weights_success: + bt.logging.info("Successfully set weights.") + else: + time.sleep(60) + except Exception as e: + bt.logging.error(f"Error in set weights: {e}") + + # Only try at most once every 20 minutes + time.sleep(60 * 20) + + bt.logging.info("Exiting set weights loop.") + + async def try_set_weights(self, ttl: int) -> typing.Tuple[bool, str]: """Sets the weights on the chain with ttl, without raising exceptions if it times out.""" - async def _try_set_weights(): + async def _try_set_weights() -> typing.Tuple[bool, str]: with self.metagraph_lock: uids = self.metagraph.uids try: - weight_subtensor = bt.subtensor(config=self.config) - success, message = weight_subtensor.set_weights( + with self.weight_lock: + self.weights.nan_to_num(0.0) + weights_to_set = self.weights + + return self.subtensor.set_weights( netuid=self.config.netuid, wallet=self.wallet, uids=uids, - weights=self.weights.numpy(), + weights=weights_to_set.numpy(), wait_for_inclusion=False, version_key=constants.weights_version_key, + max_retries=1, ) - if not success: - logging.warning( - f"Failed to set weights (will retry later): {message}" - ) - else: - # We only update the last epoch when we successfully set weights. - self.last_epoch = block - except: - logging.warning("Failed to set weights. Trying again later.") + except Exception as e: + bt.logging.warning( + f"Failed to set weights due to {e}. Trying again later." + ) + return (False, str(e)) try: logging.debug(f"Setting weights.") - await asyncio.wait_for(_try_set_weights(), ttl) - logging.debug(f"Finished setting weights.") + status = await asyncio.wait_for(_try_set_weights(), ttl) + bt.logging.debug(f"Finished setting weights with status: {status}.") + return status except asyncio.TimeoutError: - logging.error(f"Failed to set weights after {ttl} seconds") + bt.logging.error(f"Failed to set weights after {ttl} seconds") + return (False, f"Timeout after {ttl} seconds") def _get_current_block(self) -> int: """Returns the current block.""" @@ -1140,10 +1190,11 @@ async def run_step(self): # Align competition_tracker to only track active competitions. self.competition_tracker.reset_competitions(active_competition_ids) # Update self.weights to the merged values across active competitions. - self.weights = self.competition_tracker.get_subnet_weights( - competitions=competition_schedule, - min_comp_weight_threshold=constants.MIN_WEIGHT_THRESHOLD, - ) + with self.weight_lock: + self.weights = self.competition_tracker.get_subnet_weights( + competitions=competition_schedule, + min_comp_weight_threshold=constants.MIN_WEIGHT_THRESHOLD, + ) # Prioritize models for keeping up to the sample_min for the next eval loop. # If the model has any significant weight, prioritize by weight with greater weights being kept first. @@ -1280,6 +1331,11 @@ def log_step( "uids": uids, "uid_data": {}, } + + # Get a copy of weights to print. + with self.weight_lock: + log_weights = self.weights + for uid in uids: step_log["uid_data"][str(uid)] = { "uid": uid, @@ -1294,7 +1350,7 @@ def log_step( ), "win_rate": win_rate[uid], "win_total": wins[uid], - "weight": self.weights[uid].item(), + "weight": log_weights[uid].item(), "norm_weight": competition_weights[ uid ].item(), # Named norm_weight for leaderboard pipeline compatibilty. @@ -1329,7 +1385,7 @@ def log_step( str(round(step_log["uid_data"][str(uid)]["average_loss"], 4)), str(round(step_log["uid_data"][str(uid)]["epsilon_adv"], 4)), str(round(step_log["uid_data"][str(uid)]["win_rate"], 4)), - str(round(self.weights[uid].item(), 4)), + str(round(log_weights[uid].item(), 4)), str(round(competition_weights[uid].item(), 4)), str(step_log["uid_data"][str(uid)]["block"]), str(step_log["uid_data"][str(uid)]["competition_id"]), @@ -1339,7 +1395,7 @@ def log_step( console = Console() console.print(table) - ws, ui = self.weights.topk(len(self.weights)) + ws, ui = log_weights.topk(len(log_weights)) table = Table(title=f"Weights >= {constants.WEIGHT_SYNC_MINER_MIN_PERCENT}") table.add_column("uid", justify="right", style="cyan", no_wrap=True) table.add_column("weight", style="magenta") @@ -1390,7 +1446,7 @@ def log_step( "win_total_data": { str(uid): uid_data[str(uid)]["win_total"] for uid in uids }, - "weight_data": {str(uid): self.weights[uid].item() for uid in uids}, + "weight_data": {str(uid): log_weights[uid].item() for uid in uids}, "competition_weight_data": { str(uid): competition_weights[uid].item() for uid in uids }, @@ -1480,24 +1536,9 @@ async def run(self): while True: try: - - # First run a step. await self.try_run_step(ttl=75 * 60) self.global_step += 1 - block = self._get_current_block() - - # Then check if we should set weights and do so if needed. - if not self.config.offline: - blocks_until_epoch = block - self.last_epoch - - if blocks_until_epoch >= self.config.blocks_per_epoch: - await self.try_set_weights(block=block, ttl=60) - else: - logging.debug( - f"{blocks_until_epoch} / {self.config.blocks_per_epoch} blocks until next epoch." - ) - except KeyboardInterrupt: logging.info( "KeyboardInterrupt caught, gracefully closing the wandb run..." From 7f2dbd119aaf52d89cf2d54c93555b2004942ec1 Mon Sep 17 00:00:00 2001 From: Sid Date: Sat, 21 Dec 2024 08:29:37 -0800 Subject: [PATCH 07/13] Actually start the thread (if not offline). --- neurons/validator.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/neurons/validator.py b/neurons/validator.py index 85465c9..03d0b9d 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -388,6 +388,11 @@ def __init__(self): ) self.clean_thread.start() + # == Initialize the weight setting thread == + if not self.config.offline: + self.weight_thread = threading.Thread(target=self.set_weights, daemon=True) + self.weight_thread.start() + def __del__(self): if hasattr(self, "stop_event"): self.stop_event.set() From 528a01460b23898c01fb098b1a0b36568f6bd239 Mon Sep 17 00:00:00 2001 From: rusticluftig Date: Sat, 21 Dec 2024 09:36:41 -0800 Subject: [PATCH 08/13] Give every thread its own subtensor --- neurons/validator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/neurons/validator.py b/neurons/validator.py index 03d0b9d..b9a2f76 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -159,8 +159,9 @@ def __init__(self): torch.backends.cudnn.benchmark = True # Setup metagraph syncer for the subnet based on config. This is non-lite for getting weights by vali. + syncer_subtensor = bt.subtensor(config=self.config) self.subnet_metagraph_syncer = MetagraphSyncer( - self.subtensor, + syncer_subtensor, config={ self.config.netuid: dt.timedelta(minutes=20).total_seconds(), }, @@ -353,8 +354,9 @@ def __init__(self): self.miner_iterator = MinerIterator(self.metagraph.uids.tolist()) # Setup a ModelMetadataStore + chain_store_subtensor = bt.subtensor(config=self.config) self.metadata_store = ChainModelMetadataStore( - subtensor=self.subtensor, + subtensor=chain_store_subtensor, subnet_uid=self.config.netuid, wallet=self.wallet, ) From 9e903575bae5968b76ef3cae150e2c0724646137 Mon Sep 17 00:00:00 2001 From: rusticluftig Date: Sun, 22 Dec 2024 10:13:13 -0800 Subject: [PATCH 09/13] Use taoverse logger --- neurons/validator.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/neurons/validator.py b/neurons/validator.py index b9a2f76..a06cb09 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -752,7 +752,7 @@ def set_weights(self): # Technically returns a tensor but it evaluates to true. with self.weight_lock: all_zero_weights = torch.all(self.weights == 0) - bt.logging.trace( + logging.trace( "Waiting 60 seconds for internal weights before continuing to try set weights." ) time.sleep(60) @@ -764,16 +764,16 @@ def set_weights(self): set_weights_success, _ = asyncio.run(self.try_set_weights(ttl=60)) # Wait for 60 seconds before we try to set weights again. if set_weights_success: - bt.logging.info("Successfully set weights.") + logging.info("Successfully set weights.") else: time.sleep(60) except Exception as e: - bt.logging.error(f"Error in set weights: {e}") + logging.error(f"Error in set weights: {e}") # Only try at most once every 20 minutes time.sleep(60 * 20) - bt.logging.info("Exiting set weights loop.") + logging.info("Exiting set weights loop.") async def try_set_weights(self, ttl: int) -> typing.Tuple[bool, str]: """Sets the weights on the chain with ttl, without raising exceptions if it times out.""" @@ -796,7 +796,7 @@ async def _try_set_weights() -> typing.Tuple[bool, str]: max_retries=1, ) except Exception as e: - bt.logging.warning( + logging.warning( f"Failed to set weights due to {e}. Trying again later." ) return (False, str(e)) @@ -804,10 +804,10 @@ async def _try_set_weights() -> typing.Tuple[bool, str]: try: logging.debug(f"Setting weights.") status = await asyncio.wait_for(_try_set_weights(), ttl) - bt.logging.debug(f"Finished setting weights with status: {status}.") + logging.debug(f"Finished setting weights with status: {status}.") return status except asyncio.TimeoutError: - bt.logging.error(f"Failed to set weights after {ttl} seconds") + logging.error(f"Failed to set weights after {ttl} seconds") return (False, f"Timeout after {ttl} seconds") def _get_current_block(self) -> int: From b97c8072eb233cb72c6889d38c7181b0bc0f7ee7 Mon Sep 17 00:00:00 2001 From: rusticluftig Date: Sun, 22 Dec 2024 10:24:26 -0800 Subject: [PATCH 10/13] Use a separate subtensor for weight setting --- neurons/validator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neurons/validator.py b/neurons/validator.py index a06cb09..27ae894 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -150,6 +150,7 @@ def __init__(self): # === Bittensor objects ==== self.wallet = bt.wallet(config=self.config) self.subtensor = bt.subtensor(config=self.config) + self.weights_subtensor = bt.subtensor(config=self.config) # If running on testnet, default to using finney for the dataset subtensor. if self.config.using_test_subtensor: self.dataset_subtensor = bt.subtensor() @@ -786,7 +787,7 @@ async def _try_set_weights() -> typing.Tuple[bool, str]: self.weights.nan_to_num(0.0) weights_to_set = self.weights - return self.subtensor.set_weights( + return self.weights_subtensor.set_weights( netuid=self.config.netuid, wallet=self.wallet, uids=uids, From 034c56f52e27af0934240f3f05193ca38e42dbea Mon Sep 17 00:00:00 2001 From: rusticluftig Date: Sun, 22 Dec 2024 10:33:24 -0800 Subject: [PATCH 11/13] Set weights once an hour --- neurons/validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neurons/validator.py b/neurons/validator.py index 27ae894..d4257bf 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -771,8 +771,8 @@ def set_weights(self): except Exception as e: logging.error(f"Error in set weights: {e}") - # Only try at most once every 20 minutes - time.sleep(60 * 20) + # Only set weights once every hour + time.sleep(60 * 60) logging.info("Exiting set weights loop.") From 6cf6436bb528d0105f6ac148b963bc29d07168c0 Mon Sep 17 00:00:00 2001 From: rusticluftig Date: Sun, 22 Dec 2024 12:05:34 -0800 Subject: [PATCH 12/13] Wait for inclusion --- neurons/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neurons/validator.py b/neurons/validator.py index d4257bf..aea4b1e 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -792,7 +792,7 @@ async def _try_set_weights() -> typing.Tuple[bool, str]: wallet=self.wallet, uids=uids, weights=weights_to_set.numpy(), - wait_for_inclusion=False, + wait_for_inclusion=True, version_key=constants.weights_version_key, max_retries=1, ) From 8675ee10ceccd9c12e69b807f0291dcec9b6dfdb Mon Sep 17 00:00:00 2001 From: Sid Date: Mon, 23 Dec 2024 12:38:38 -0800 Subject: [PATCH 13/13] Bump version to 2.7.2. --- constants/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constants/__init__.py b/constants/__init__.py index da0430e..2db6d72 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -33,7 +33,7 @@ # Project Constants. # --------------------------------- -__version__ = "2.7.1" +__version__ = "2.7.2" version_split = __version__.split(".") __spec_version__ = ( (1000 * int(version_split[0]))