Merge pull request #161 from macrocosm-os/dev
Release 2.7.0
RusticLuftig authored Dec 17, 2024
2 parents c6dce9d + 23cad3a commit de1346a
Showing 29 changed files with 1,050 additions and 255 deletions.
91 changes: 88 additions & 3 deletions constants/__init__.py
@@ -22,18 +22,18 @@
MistralForCausalLM,
Phi3ForCausalLM,
PhiForCausalLM,
Qwen2ForCausalLM,
)

from competitions.data import CompetitionId
from finetune.datasets.ids import DatasetId
from finetune.eval.method import EvalMethodId
from finetune.eval.if_eval.version import IfEvalVersion

# ---------------------------------
# Project Constants.
# ---------------------------------

__version__ = "2.6.0"
__version__ = "2.7.0"
version_split = __version__.split(".")
__spec_version__ = (
(1000 * int(version_split[0]))
@@ -141,6 +141,7 @@
}

INSTRUCT_8B_BLOCK = 4_451_695
IF_EVAL_V2_BLOCK = 4_523_592

# Schedule of competitions by block.
COMPETITION_SCHEDULE_BY_BLOCK: List[Tuple[int, List[Competition]]] = [
@@ -180,6 +181,7 @@
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
dataset_kwargs={"if_eval_version": IfEvalVersion.V1},
weight=0.05,
),
],
@@ -222,6 +224,7 @@
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
dataset_kwargs={"if_eval_version": IfEvalVersion.V1},
weight=0.05,
),
],
@@ -259,12 +262,94 @@
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
dataset_kwargs={"if_eval_version": IfEvalVersion.V1},
weight=0.05,
),
],
),
],
),
(
IF_EVAL_V2_BLOCK,
[
Competition(
CompetitionId.B7_MULTI_CHOICE,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MULTI_CHOICE],
0.9,
eval_tasks=[
EvalTask(
name="SYNTHETIC_MMLU",
method_id=EvalMethodId.MULTIPLE_CHOICE,
dataset_id=DatasetId.SYNTHETIC_MMLU,
normalization_id=NormalizationId.NONE,
weight=0.75,
),
EvalTask(
name="WORD_SORTING",
method_id=EvalMethodId.REFERENCE_LOSS,
dataset_id=DatasetId.WORD_SORTING,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 40.0},
weight=0.05,
),
EvalTask(
name="FINEWEB",
method_id=EvalMethodId.TEXT_LOSS,
dataset_id=DatasetId.FINEWEB,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 20.0},
weight=0.1,
),
EvalTask(
name="IF_EVAL_V2",
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
dataset_kwargs={"if_eval_version": IfEvalVersion.V2},
weight=0.1,
),
],
),
Competition(
CompetitionId.INSTRUCT_8B,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.INSTRUCT_8B],
0.1,
eval_tasks=[
EvalTask(
name="SYNTHETIC_MMLU",
method_id=EvalMethodId.MULTIPLE_CHOICE,
dataset_id=DatasetId.SYNTHETIC_MMLU,
normalization_id=NormalizationId.NONE,
weight=0.75,
),
EvalTask(
name="WORD_SORTING",
method_id=EvalMethodId.REFERENCE_LOSS,
dataset_id=DatasetId.WORD_SORTING,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 40.0},
weight=0.05,
),
EvalTask(
name="FINEWEB",
method_id=EvalMethodId.TEXT_LOSS,
dataset_id=DatasetId.FINEWEB,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 20.0},
weight=0.1,
),
EvalTask(
name="IF_EVAL_V2",
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
dataset_kwargs={"if_eval_version": IfEvalVersion.V2},
weight=0.1,
),
],
),
],
),
]

for block_and_competitions in COMPETITION_SCHEDULE_BY_BLOCK:
@@ -287,7 +372,7 @@
# time required between updates to the chain.
chain_update_cadence = dt.timedelta(minutes=20)
# Number of blocks required between retrying evaluation of a model.
model_retry_cadence = 300 # Roughly 1 hour
model_retry_cadence = 1200 # Roughly 4 hours
# How frequently to check the models given weights by other large validators.
scan_top_model_cadence = dt.timedelta(minutes=30)
# validator eval batch min to keep for next loop.
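
As a sanity check on the new retry cadence: assuming Bittensor's roughly 12-second block time (an assumption, not stated in this file), 1200 blocks comes out to about four hours, versus about one hour for the previous 300 blocks.

```python
# Assumed ~12 s average block time; not asserted anywhere in this diff.
BLOCK_TIME_SECONDS = 12
print(300 * BLOCK_TIME_SECONDS / 3600)   # 1.0 hour  (old cadence)
print(1200 * BLOCK_TIME_SECONDS / 3600)  # 4.0 hours (new cadence)
```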
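For readers skimming the schedule above: a minimal sketch of how a block-keyed schedule like `COMPETITION_SCHEDULE_BY_BLOCK` can be resolved. `competitions_at_block` is a hypothetical helper, not part of this commit, and it assumes the entries stay ordered by ascending start block (as laid out above).

```python
# Hypothetical helper, written as if it lived next to the schedule in
# constants/__init__.py so that COMPETITION_SCHEDULE_BY_BLOCK is in scope.
def competitions_at_block(block: int):
    active = COMPETITION_SCHEDULE_BY_BLOCK[0][1]
    for start_block, competitions in COMPETITION_SCHEDULE_BY_BLOCK:
        if block >= start_block:
            active = competitions
        else:
            break
    return active

# Blocks at or after IF_EVAL_V2_BLOCK (4_523_592) pick up the IF_EVAL_V2 task at
# weight 0.1; earlier blocks keep the V1 task at weight 0.05.
```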
2 changes: 1 addition & 1 deletion docs/competitions.md
@@ -30,7 +30,7 @@ The evaluation tasks are the same as the B7_MULTICHOICE competition

### Definitions

TODO: Fill in post check-in
[Code Link](https://github.com/macrocosm-os/finetuning/blob/c6dce9d27d1317b9c543071913ae34df09faddc7/constants/__init__.py#L114)

# Deprecated Competitions

21 changes: 13 additions & 8 deletions finetune/datasets/generated/if_eval_loader.py
@@ -3,7 +3,7 @@
import random
from typing import List, Set

import bittensor as bt
import taoverse.utilities.logging as logging
import torch
from transformers import PreTrainedTokenizerBase

@@ -12,20 +12,21 @@
from finetune.datasets.subnet.prompting_subset_loader import PromptingSubsetLoader
from finetune.eval.if_eval import rule_factory
from finetune.eval.if_eval.sample import IFEvalTokenizedSample
from finetune.eval.if_eval.version import IfEvalVersion


class IFEvalLoader(DatasetLoader):
"""Generates samples for the IfEval task."""

# The min/max number of rules per sample.
MIN_RULES = 1
MAX_RULES = 4
# The min/max number of rules per sample per version.
VERSION_TO_RULE_COUNTS = {IfEvalVersion.V1: (1, 4), IfEvalVersion.V2: (2, 5)}

def __init__(
self,
random_seed: int = None,
max_samples: int = 20,
validator_hotkeys: Set[str] = None,
if_eval_version: IfEvalVersion = IfEvalVersion.V1,
):
if random_seed:
random.seed(random_seed)
@@ -44,14 +45,14 @@ def __init__(
)
)

bt.logging.trace(f"Loaded {len(questions)} raw samples")
logging.trace(f"Loaded {len(questions)} raw samples")

# Parse the question and answer text from the raw text.
parsed_q_and_a = [
extract_q_and_a_text(prompt, answer) for prompt, answer in questions
]
parsed_q_and_a = [qa for qa in parsed_q_and_a if qa is not None]
bt.logging.trace(
logging.trace(
f"Extracted {len(parsed_q_and_a)} questions and answers from raw samples"
)

@@ -69,11 +70,15 @@
):
self.buffer.append(
rule_factory.generate_if_eval_sample(
qa1, qa2, IFEvalLoader.MIN_RULES, IFEvalLoader.MAX_RULES
qa1,
qa2,
IFEvalLoader.VERSION_TO_RULE_COUNTS[if_eval_version][0],
IFEvalLoader.VERSION_TO_RULE_COUNTS[if_eval_version][1],
if_eval_version,
)
)

bt.logging.trace(f"Generated {len(self.buffer)} IFEval samples")
logging.trace(f"Generated {len(self.buffer)} IFEval samples")

def _should_filter_question(self, question: str, answer: str) -> bool:
# For now, just filter out 1 word answers.
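A hedged usage sketch for the updated loader: the hotkey below is a placeholder, and constructing the loader assumes runtime access to the prompting subnet's wandb data. The `if_eval_version` argument selects the rule-count range via `IFEvalLoader.VERSION_TO_RULE_COUNTS` (1-4 rules for V1, 2-5 for V2).

```python
from finetune.datasets.generated.if_eval_loader import IFEvalLoader
from finetune.eval.if_eval.version import IfEvalVersion

# Sketch only: "<validator-hotkey-ss58>" is a placeholder, and the loader pulls
# real samples from validator wandb runs when constructed.
loader = IFEvalLoader(
    random_seed=42,
    max_samples=20,
    validator_hotkeys={"<validator-hotkey-ss58>"},
    if_eval_version=IfEvalVersion.V2,
)
print(f"Generated {len(loader.buffer)} IFEval samples")
```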
18 changes: 6 additions & 12 deletions finetune/datasets/hugging_face/hugging_face_loader.py
@@ -17,14 +17,12 @@
import random
import time
import requests
import taoverse.utilities.logging as logging
import torch

import bittensor as bt
from transformers import PreTrainedTokenizerBase

from finetune.datasets.loader import DatasetLoader


FINEWEB_EDU_SCORE_2_NAME = "HuggingFaceFW/fineweb-edu-score-2"
FALCON_NAME = "tiiuae/falcon-refinedweb"

@@ -97,7 +95,7 @@ def _fetch_data_to_buffer(self, num_pages):
if page in self.pages:
duplicates += 1
if duplicates >= self.duplicate_page_threshold:
bt.logging.debug(
logging.debug(
f"Hit duplicate page threshold of {self.duplicate_page_threshold}. Stopping early at: {len(self.pages)} pages."
)
break
@@ -143,16 +141,14 @@ def _fetch_data_to_buffer(self, num_pages):
except requests.exceptions.RequestException:
response.close()
attempts += 1
bt.logging.warning(
logging.warning(
f"Failed to fetch data, retrying with a newly sampled page. Attempt {attempts}/{self.retry_limit * num_pages}"
)
if attempts < num_pages * self.retry_limit:
pass

else:
bt.logging.error(
"Maximum retry limit reached. Unable to fetch data."
)
logging.error("Maximum retry limit reached. Unable to fetch data.")
raise

def get_random_pages(self, num_pages, initial_offset):
@@ -243,15 +239,13 @@ def fetch_dataset_configs(self) -> typing.Dict[str, typing.Dict]:

except requests.exceptions.RequestException:
attempt += 1
bt.logging.warning(
logging.warning(
f"Failed to fetch dataset configs, retrying. Attempt {attempt}/{self.retry_limit}"
)
if attempt < self.retry_limit:
time.sleep(self.retry_delay) # Wait before the next retry
else:
bt.logging.error(
"Maximum retry limit reached. Unable to fetch data."
)
logging.error("Maximum retry limit reached. Unable to fetch data.")
raise

def tokenize(
17 changes: 8 additions & 9 deletions finetune/datasets/subnet/prompting_subset_loader.py
@@ -22,6 +22,7 @@
import typing

import bittensor as bt
import taoverse.utilities.logging as logging
import torch
import wandb
from transformers import PreTrainedTokenizerBase
@@ -114,11 +115,11 @@ def __init__(
validator_hotkeys, oldest_sample_timestamp, newest_sample_timestamp
)

bt.logging.trace(f"Fetching runs using filters {filters}")
logging.trace(f"Fetching runs using filters {filters}")

# Get the runs, oldest first.
runs = list(api.runs(prompting_project, filters, order="+created_at"))
bt.logging.trace(f"Found {len(runs)} runs")
logging.trace(f"Found {len(runs)} runs")

all_samples: typing.Set[str] = set()
self.buffer: typing.List[typing.Tuple[str, str]] = []
@@ -138,7 +139,7 @@ def _collect_samples(run: wandb.apis.public.Run) -> bool:
hotkey = run.config.get("HOTKEY_SS58", None)
# First check that the hotkey is in fact a desired validator hotkey.
if hotkey not in validator_hotkeys:
bt.logging.trace(
logging.trace(
f"Hotkey: {hotkey} does not match an expected validator for {run.id}."
)
return False
@@ -148,7 +149,7 @@ def _collect_samples(run: wandb.apis.public.Run) -> bool:
if not signature or not bt.Keypair(ss58_address=hotkey).verify(
run.id, bytes.fromhex(signature)
):
bt.logging.trace(
logging.trace(
f"Failed Signature: {signature} is not valid for {run.id}."
)
return False
@@ -233,7 +234,7 @@ def _collect_samples(run: wandb.apis.public.Run) -> bool:
break
except Exception:
attempt += 1
bt.logging.trace(
logging.trace(
f"Failed to fetch data. {traceback.format_exc()}, retrying. Attempt {attempt}/{max_attempts}"
)
if attempt < max_attempts:
@@ -244,11 +245,9 @@ def _collect_samples(run: wandb.apis.public.Run) -> bool:

self.buffer = list(all_samples)
if len(self.buffer) < max_samples:
bt.logging.debug(
f"Did not collect {max_samples}, only got {len(self.buffer)}"
)
logging.debug(f"Did not collect {max_samples}, only got {len(self.buffer)}")
else:
bt.logging.trace(f"Collected {max_samples} samples")
logging.trace(f"Collected {max_samples} samples")

def tokenize(
self, tokenizer: PreTrainedTokenizerBase, sequence_length: int
24 changes: 24 additions & 0 deletions finetune/eval/if_eval/bullet_count.py
@@ -0,0 +1,24 @@
from finetune.eval.if_eval.rule import IFEvalRule, RuleId


class BulletFrequencyRule(IFEvalRule):
"""Rule that enforces an exact amount of * bullet points."""

def __init__(self, count: int):
super().__init__(rule_id=RuleId.BULLET_COUNT_FREQUENCY)

if count < 1:
raise ValueError(
f"BulletFrequencyRule must expect at least 1 bullet point."
)
self.count = count

def get_prompt(self, index: int = -1) -> str:
bullet = "bullet point" if self.count == 1 else "bullet points"
return f"The response must contain exactly {self.count} {bullet} in markdown format."

def matches(self, text: str, index: int = -1) -> bool:
return (
sum(1 for line in text.splitlines() if line.lstrip().startswith("*"))
== self.count
)
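
A quick usage sketch of the new rule (not part of the commit itself), showing the generated prompt text and the exact-count matching behaviour:

```python
from finetune.eval.if_eval.bullet_count import BulletFrequencyRule

rule = BulletFrequencyRule(count=2)
print(rule.get_prompt())
# The response must contain exactly 2 bullet points in markdown format.

response = "Here are my picks:\n* first option\n* second option"
print(rule.matches(response))             # True: exactly two lines start with "*"
print(rule.matches("* only one bullet"))  # False: one bullet, two expected
```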