
Commit

Merge pull request #142 from macrocosm-os/dev
Release 2.6.0
RusticLuftig authored Dec 5, 2024
2 parents 1237391 + 67bde1f commit c6dce9d
Showing 22 changed files with 440 additions and 258 deletions.
2 changes: 2 additions & 0 deletions competitions/data.py
@@ -11,6 +11,8 @@ class CompetitionId(IntEnum):

B7_MULTI_CHOICE = 2

INSTRUCT_8B = 3

# Overwrite the default __repr__, which doesn't work with
# bt.logging for some unknown reason.
def __repr__(self) -> str:
102 changes: 88 additions & 14 deletions constants/__init__.py
@@ -15,11 +15,14 @@
from transformers import (
BartForCausalLM,
FalconForCausalLM,
Gemma2ForCausalLM,
GemmaForCausalLM,
GPTNeoXForCausalLM,
LlamaForCausalLM,
MistralForCausalLM,
Phi3ForCausalLM,
PhiForCausalLM,
Qwen2ForCausalLM,
)

from competitions.data import CompetitionId
@@ -30,7 +33,7 @@
# Project Constants.
# ---------------------------------

__version__ = "2.5.1"
__version__ = "2.6.0"
version_split = __version__.split(".")
__spec_version__ = (
(1000 * int(version_split[0]))
@@ -45,16 +48,16 @@
# Block the subnet was registered.
GENESIS_BLOCK = 3138611
# Define the number of blocks per vali "sync". This cadence is used to align validator behavior for better vtrust.
SYNC_BLOCK_CADENCE = 180
SYNC_BLOCK_CADENCE = 270
# Rough estimate of the number of seconds per block.
SECONDS_PER_BLOCK = 12
# Validator weight moving average term.
# At 0.9 a model will go from 0 -> 0.190 in 2 cycles and from 0 -> 0.83 in 17 cycles.
ALPHA = 0.9
# At 0.85 a model will go from 0 -> 0.278 in 2 cycles and from 0 -> 0.833 in 11 cycles.
ALPHA = 0.85
# Any miners with a combined competition weight below this threshold will instead receive 0 weight.
# This is intended to help vtrust in conjunction with a low alpha by handling the tail ends.
# At 1 eval per 180 blocks, newly winning models will start receiving weight after ~360 blocks.
# Previously winning models will phase out after ~3060 blocks, at which point only the new winner will have weight.
# At 1 eval per 270 blocks, newly winning models will start receiving weight after ~540 blocks.
# Previously winning models will phase out after ~2970 blocks, at which point only the new winner will have weight.
MIN_WEIGHT_THRESHOLD = 0.18
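# Illustrative arithmetic for the figures above (a sketch; it assumes the weight update
# is a plain exponential moving average): a new winner's weight approaches 1 - ALPHA**n
# after n eval cycles, while an old winner's weight decays as ALPHA**n. With ALPHA = 0.85:
#   1 - 0.85**2 ≈ 0.278 -> first above the 0.18 threshold after 2 cycles (~540 blocks).
#   0.85**11 ≈ 0.167 -> first below the threshold after 11 cycles (~2970 blocks).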

# The validator WANDB project.
@@ -75,7 +78,7 @@
WEIGHT_SYNC_VALI_MIN_STAKE = 100_000
# Minimum percent of weight on a vali for a miner to be considered a top miner.
# Since there can be multiple competitions at different reward percentages we can't just check biggest.
WEIGHT_SYNC_MINER_MIN_PERCENT = 0.10
# Since we only set weights per competition with a threshold of 0.18 we can just take any percent here.
WEIGHT_SYNC_MINER_MIN_PERCENT = 0.01
# The root directory of this project.
ROOT_DIR = Path(__file__).parent.parent
# The maximum bytes for the hugging face repo.
@@ -98,7 +102,7 @@
kwargs={
"torch_dtype": torch.bfloat16,
},
eval_block_delay=1200, # ~4 hours.
eval_block_delay=1600, # ~5 hours.
norm_validation_constraints=NormValidationConstraints(
norm_eps_soft=200,
norm_eps_soft_percent_threshold=0.15,
@@ -107,10 +111,36 @@
epsilon_func=LinearDecay(0.05, 0.01, 7200 * 5), # Decay over ~5 days.
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.INSTRUCT_8B: ModelConstraints(
max_model_parameter_size=8_100_000_000,
sequence_length=4096,
allowed_architectures=[
BartForCausalLM,
FalconForCausalLM,
Gemma2ForCausalLM,
GemmaForCausalLM,
GPTNeoXForCausalLM,
LlamaForCausalLM,
MistralForCausalLM,
Phi3ForCausalLM,
PhiForCausalLM,
],
tokenizer=None, # Any tokenizer can be used.
kwargs={
"torch_dtype": torch.bfloat16,
},
eval_block_delay=1600, # ~5 hours.
norm_validation_constraints=NormValidationConstraints(
norm_eps_soft=200,
norm_eps_soft_percent_threshold=0.15,
norm_eps_hard=1000,
),
epsilon_func=LinearDecay(0.05, 0.01, 7200 * 5), # Decay over ~5 days.
max_bytes=20 * (1024**3),
),
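# Rough sizing check (an estimate, not taken from the file): 8.1e9 parameters stored in
# bfloat16 is ~16.2 GB of weights, so the 20 GiB repo cap leaves a few GB of headroom
# for tokenizer and config files.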
}

# Block to start including fineweb data.
IF_EVAL_BLOCK = 4_344_030
INSTRUCT_8B_BLOCK = 4_451_695

# Schedule of competitions by block.
COMPETITION_SCHEDULE_BY_BLOCK: List[Tuple[int, List[Competition]]] = [
@@ -127,7 +157,7 @@
method_id=EvalMethodId.MULTIPLE_CHOICE,
dataset_id=DatasetId.SYNTHETIC_MMLU,
normalization_id=NormalizationId.NONE,
weight=0.9,
weight=0.85,
),
EvalTask(
name="WORD_SORTING",
@@ -145,24 +175,31 @@
normalization_kwargs={"ceiling": 20.0},
weight=0.05,
),
EvalTask(
name="IF_EVAL_V1",
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
weight=0.05,
),
],
),
],
),
(
IF_EVAL_BLOCK,
INSTRUCT_8B_BLOCK,
[
Competition(
CompetitionId.B7_MULTI_CHOICE,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MULTI_CHOICE],
1.0,
0.9,
eval_tasks=[
EvalTask(
name="SYNTHETIC_MMLU",
method_id=EvalMethodId.MULTIPLE_CHOICE,
dataset_id=DatasetId.SYNTHETIC_MMLU,
normalization_id=NormalizationId.NONE,
weight=0.85,
weight=0.8,
),
EvalTask(
name="WORD_SORTING",
@@ -178,8 +215,45 @@
dataset_id=DatasetId.FINEWEB,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 20.0},
weight=0.1,
),
EvalTask(
name="IF_EVAL_V1",
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
weight=0.05,
),
],
),
Competition(
CompetitionId.INSTRUCT_8B,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.INSTRUCT_8B],
0.1,
eval_tasks=[
EvalTask(
name="SYNTHETIC_MMLU",
method_id=EvalMethodId.MULTIPLE_CHOICE,
dataset_id=DatasetId.SYNTHETIC_MMLU,
normalization_id=NormalizationId.NONE,
weight=0.8,
),
EvalTask(
name="WORD_SORTING",
method_id=EvalMethodId.REFERENCE_LOSS,
dataset_id=DatasetId.WORD_SORTING,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 40.0},
weight=0.05,
),
EvalTask(
name="FINEWEB",
method_id=EvalMethodId.TEXT_LOSS,
dataset_id=DatasetId.FINEWEB,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 20.0},
weight=0.1,
),
EvalTask(
name="IF_EVAL_V1",
method_id=EvalMethodId.IF_EVAL,
14 changes: 14 additions & 0 deletions docs/competitions.md
@@ -18,6 +18,20 @@ Models submitted to this competition are evaluated on a set of evaluation tasks,

[Code Link](https://github.com/macrocosm-os/finetuning/blob/94e8fd92ab4158e1e4a425a9562695eebafa27b1/constants/__init__.py#L128)

## Competition INSTRUCT_8B:

### Goal

The goal of this competition is to train a SOTA instruct 8B model. This competition provides more freedom to miners than other competitions: there are no restrictions on the tokenizer used and miners are allowed to use a wider range of architectures.

### Evaluation

The evaluation tasks are the same as for the B7_MULTI_CHOICE competition.

### Definitions

TODO: Fill in post check-in

# Deprecated Competitions

## Competition 1: SN9_MODEL
6 changes: 3 additions & 3 deletions docs/examples.ipynb
@@ -92,8 +92,8 @@
")\n",
"\n",
"# Move the model to the appropriate device and set to eval mode.\n",
"model.to(device)\n",
"model.eval()\n",
"model.pt_model.to(device)\n",
"model.pt_model.eval()\n",
"\n",
"# Load the competition so we can load the right tokenizer.\n",
"metagraph = bt.metagraph(constants.SUBNET_UID)\n",
@@ -134,7 +134,7 @@
" pad_token_id=tokenizer.eos_token_id,\n",
")\n",
"response = ft.eval.method.generate_output(\n",
" model=model,\n",
" model=model.pt_model,\n",
" input_ids=input_ids,\n",
" generation_config=generation_config,\n",
" device=device,\n",
10 changes: 6 additions & 4 deletions finetune/datasets/factory.py
@@ -1,22 +1,24 @@
from typing import Any, Dict, Set

from finetune.datasets.generated.dyck_loader import DyckLoader
from finetune.datasets.generated.if_eval_loader import IFEvalLoader
from finetune.datasets.generated.word_sorting_loader import WordSortingLoader
from finetune.datasets.hugging_face.hugging_face_loader import (
HuggingFaceLoader,
FINEWEB_EDU_SCORE_2_NAME,
HuggingFaceLoader,
)
from finetune.datasets.ids import DatasetId
from typing import Dict, Any, Set
from finetune.datasets.loader import DatasetLoader


class DatasetLoader:
class DatasetLoaderFactory:
@staticmethod
def get_loader(
dataset_id: DatasetId,
dataset_kwargs: Dict[str, Any],
seed: int,
validator_hotkeys: Set[str],
) -> "DatasetLoader":
) -> DatasetLoader:
"""Loads data samples from the appropriate dataset."""

match dataset_id:
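For orientation, a minimal usage sketch of the renamed factory follows; the dataset kwargs, seed, and tokenizer below are illustrative placeholders rather than values taken from this commit:

from transformers import AutoTokenizer

from finetune.datasets.factory import DatasetLoaderFactory
from finetune.datasets.ids import DatasetId

# Build the loader for the word-sorting eval task (placeholder kwargs and seed).
loader = DatasetLoaderFactory.get_loader(
    dataset_id=DatasetId.WORD_SORTING,
    dataset_kwargs={},
    seed=1234,
    validator_hotkeys=set(),
)

# Every loader now implements the shared DatasetLoader interface (see loader.py below),
# so tokenized samples are produced the same way regardless of the dataset.
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
samples = loader.tokenize(tokenizer, sequence_length=4096)
print(len(loader), "raw samples,", len(samples), "tokenized samples")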
4 changes: 3 additions & 1 deletion finetune/datasets/generated/dyck_loader.py
@@ -19,6 +19,8 @@
import torch
from transformers import PreTrainedTokenizerBase

from finetune.datasets.loader import DatasetLoader

# Characters to use in the dycks.
DYCK_CHARACTER_PAIRS = [("<", ">"), ("[", "]"), ("{", "}"), ("(", ")")]
DYCK_ENDING_CHARS = [x[1] for x in DYCK_CHARACTER_PAIRS]
@@ -69,7 +71,7 @@ def generate_dyck(
return dyck_word


class DyckLoader:
class DyckLoader(DatasetLoader):
def __init__(
self,
dyck_character_pairs: typing.List[
3 changes: 2 additions & 1 deletion finetune/datasets/generated/if_eval_loader.py
@@ -8,12 +8,13 @@
from transformers import PreTrainedTokenizerBase

from finetune.datasets.generated.mmlu_parser import extract_q_and_a_text
from finetune.datasets.loader import DatasetLoader
from finetune.datasets.subnet.prompting_subset_loader import PromptingSubsetLoader
from finetune.eval.if_eval import rule_factory
from finetune.eval.if_eval.sample import IFEvalTokenizedSample


class IFEvalLoader:
class IFEvalLoader(DatasetLoader):
"""Generates samples for the IfEval task."""

# The min/max number of rules per sample.
4 changes: 3 additions & 1 deletion finetune/datasets/generated/word_sorting_loader.py
@@ -20,6 +20,8 @@
import torch
from transformers import PreTrainedTokenizerBase

from finetune.datasets.loader import DatasetLoader

try:
from nltk.corpus import words
except:
@@ -28,7 +30,7 @@
WORD_SORTING_CHALLENGE_PROMPT = "Sort the following words alphabetically: "


class WordSortingLoader:
class WordSortingLoader(DatasetLoader):
def __init__(
self,
min_word_count: int = 2,
4 changes: 3 additions & 1 deletion finetune/datasets/hugging_face/hugging_face_loader.py
@@ -22,12 +22,14 @@
import bittensor as bt
from transformers import PreTrainedTokenizerBase

from finetune.datasets.loader import DatasetLoader


FINEWEB_EDU_SCORE_2_NAME = "HuggingFaceFW/fineweb-edu-score-2"
FALCON_NAME = "tiiuae/falcon-refinedweb"


class HuggingFaceLoader:
class HuggingFaceLoader(DatasetLoader):
rows_base_url: str = "https://datasets-server.huggingface.co/rows"
size_base_url: str = "https://datasets-server.huggingface.co/size"

24 changes: 24 additions & 0 deletions finetune/datasets/loader.py
@@ -0,0 +1,24 @@
import abc
from typing import List

from transformers import PreTrainedTokenizerBase

from finetune.eval.sample import EvalSample


class DatasetLoader(abc.ABC):
"""Base class for dataset loaders."""

@abc.abstractmethod
def tokenize(
self, tokenizer: PreTrainedTokenizerBase, sequence_length: int
) -> List[EvalSample]:
pass

@abc.abstractmethod
def __iter__(self):
pass

@abc.abstractmethod
def __len__(self):
pass
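
The loaders elsewhere in this commit (DyckLoader, IFEvalLoader, WordSortingLoader, HuggingFaceLoader, PromptingSubsetLoader) now subclass this interface. A minimal toy implementation might look like the sketch below; since EvalSample is not shown in this diff, the toy tokenize returns raw token-id tensors instead:

from transformers import PreTrainedTokenizerBase

from finetune.datasets.loader import DatasetLoader


class ToyLoader(DatasetLoader):
    """Toy loader serving a fixed list of prompt strings (illustration only)."""

    def __init__(self):
        self.samples = ["Sort the following words alphabetically: pear apple mango"]

    def tokenize(self, tokenizer: PreTrainedTokenizerBase, sequence_length: int):
        # Truncate each prompt to the competition's sequence length.
        return [
            tokenizer(
                text, truncation=True, max_length=sequence_length, return_tensors="pt"
            ).input_ids
            for text in self.samples
        ]

    def __iter__(self):
        return iter(self.samples)

    def __len__(self):
        return len(self.samples)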
3 changes: 2 additions & 1 deletion finetune/datasets/subnet/prompting_subset_loader.py
@@ -27,6 +27,7 @@
from transformers import PreTrainedTokenizerBase

import constants
from finetune.datasets.loader import DatasetLoader
from finetune.datasets.subnet.history_scan import SampledHistoryScan

# Multiple choice answers for the prompting subnet.
@@ -36,7 +37,7 @@
EARLIEST_DATE = dt.datetime(2024, 8, 29, tzinfo=dt.timezone.utc)


class PromptingSubsetLoader:
class PromptingSubsetLoader(DatasetLoader):
@staticmethod
def _get_filters(
validator_hotkeys: typing.List[str],