Merge pull request #161 from macrocosm-os/dev
Release 2.7.0
RusticLuftig authored Dec 17, 2024
2 parents c6dce9d + 23cad3a commit de1346a
Showing 29 changed files with 1,050 additions and 255 deletions.
91 changes: 88 additions & 3 deletions constants/__init__.py
@@ -22,18 +22,18 @@
MistralForCausalLM,
Phi3ForCausalLM,
PhiForCausalLM,
Qwen2ForCausalLM,
)

from competitions.data import CompetitionId
from finetune.datasets.ids import DatasetId
from finetune.eval.method import EvalMethodId
from finetune.eval.if_eval.version import IfEvalVersion

# ---------------------------------
# Project Constants.
# ---------------------------------

__version__ = "2.6.0"
__version__ = "2.7.0"
version_split = __version__.split(".")
__spec_version__ = (
(1000 * int(version_split[0]))
@@ -141,6 +141,7 @@
}

INSTRUCT_8B_BLOCK = 4_451_695
IF_EVAL_V2_BLOCK = 4_523_592

# Schedule of competitions by block.
COMPETITION_SCHEDULE_BY_BLOCK: List[Tuple[int, List[Competition]]] = [
@@ -180,6 +181,7 @@
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
dataset_kwargs={"if_eval_version": IfEvalVersion.V1},
weight=0.05,
),
],
@@ -222,6 +224,7 @@
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
dataset_kwargs={"if_eval_version": IfEvalVersion.V1},
weight=0.05,
),
],
@@ -259,12 +262,94 @@
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
dataset_kwargs={"if_eval_version": IfEvalVersion.V1},
weight=0.05,
),
],
),
],
),
(
IF_EVAL_V2_BLOCK,
[
Competition(
CompetitionId.B7_MULTI_CHOICE,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MULTI_CHOICE],
0.9,
eval_tasks=[
EvalTask(
name="SYNTHETIC_MMLU",
method_id=EvalMethodId.MULTIPLE_CHOICE,
dataset_id=DatasetId.SYNTHETIC_MMLU,
normalization_id=NormalizationId.NONE,
weight=0.75,
),
EvalTask(
name="WORD_SORTING",
method_id=EvalMethodId.REFERENCE_LOSS,
dataset_id=DatasetId.WORD_SORTING,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 40.0},
weight=0.05,
),
EvalTask(
name="FINEWEB",
method_id=EvalMethodId.TEXT_LOSS,
dataset_id=DatasetId.FINEWEB,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 20.0},
weight=0.1,
),
EvalTask(
name="IF_EVAL_V2",
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
dataset_kwargs={"if_eval_version": IfEvalVersion.V2},
weight=0.1,
),
],
),
Competition(
CompetitionId.INSTRUCT_8B,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.INSTRUCT_8B],
0.1,
eval_tasks=[
EvalTask(
name="SYNTHETIC_MMLU",
method_id=EvalMethodId.MULTIPLE_CHOICE,
dataset_id=DatasetId.SYNTHETIC_MMLU,
normalization_id=NormalizationId.NONE,
weight=0.75,
),
EvalTask(
name="WORD_SORTING",
method_id=EvalMethodId.REFERENCE_LOSS,
dataset_id=DatasetId.WORD_SORTING,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 40.0},
weight=0.05,
),
EvalTask(
name="FINEWEB",
method_id=EvalMethodId.TEXT_LOSS,
dataset_id=DatasetId.FINEWEB,
normalization_id=NormalizationId.INVERSE_EXPONENTIAL,
normalization_kwargs={"ceiling": 20.0},
weight=0.1,
),
EvalTask(
name="IF_EVAL_V2",
method_id=EvalMethodId.IF_EVAL,
dataset_id=DatasetId.SYNTHETIC_IF_EVAL,
normalization_id=NormalizationId.NONE,
dataset_kwargs={"if_eval_version": IfEvalVersion.V2},
weight=0.1,
),
],
),
],
),
]

for block_and_competitions in COMPETITION_SCHEDULE_BY_BLOCK:
@@ -287,7 +372,7 @@
# time required between updates to the chain.
chain_update_cadence = dt.timedelta(minutes=20)
# Number of blocks required between retrying evaluation of a model.
model_retry_cadence = 300 # Roughly 1 hour
model_retry_cadence = 1200 # Roughly 4 hours
# How frequently to check the models given weights by other large validators.
scan_top_model_cadence = dt.timedelta(minutes=30)
# validator eval batch min to keep for next loop.
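
As a sanity check on the new retry cadence: assuming Bittensor's roughly 12-second block time (an assumption, not stated in this file), 1200 blocks comes out to about four hours, versus about one hour for the previous 300 blocks.

```python
# Assumed ~12 s average block time; not asserted anywhere in this diff.
BLOCK_TIME_SECONDS = 12
print(300 * BLOCK_TIME_SECONDS / 3600)   # 1.0 hour  (old cadence)
print(1200 * BLOCK_TIME_SECONDS / 3600)  # 4.0 hours (new cadence)
```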
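For readers skimming the schedule above: a minimal sketch of how a block-keyed schedule like `COMPETITION_SCHEDULE_BY_BLOCK` can be resolved. `competitions_at_block` is a hypothetical helper, not part of this commit, and it assumes the entries stay ordered by ascending start block (as laid out above).

```python
# Hypothetical helper, written as if it lived next to the schedule in
# constants/__init__.py so that COMPETITION_SCHEDULE_BY_BLOCK is in scope.
def competitions_at_block(block: int):
    active = COMPETITION_SCHEDULE_BY_BLOCK[0][1]
    for start_block, competitions in COMPETITION_SCHEDULE_BY_BLOCK:
        if block >= start_block:
            active = competitions
        else:
            break
    return active

# Blocks at or after IF_EVAL_V2_BLOCK (4_523_592) pick up the IF_EVAL_V2 task at
# weight 0.1; earlier blocks keep the V1 task at weight 0.05.
```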
2 changes: 1 addition & 1 deletion docs/competitions.md
@@ -30,7 +30,7 @@ The evaluation tasks are the same as the B7_MULTICHOICE competition

### Definitions

TODO: Fill in post check-in
[Code Link](https://github.com/macrocosm-os/finetuning/blob/c6dce9d27d1317b9c543071913ae34df09faddc7/constants/__init__.py#L114)

# Deprecated Competitions

21 changes: 13 additions & 8 deletions finetune/datasets/generated/if_eval_loader.py
@@ -3,7 +3,7 @@
import random
from typing import List, Set

import bittensor as bt
import taoverse.utilities.logging as logging
import torch
from transformers import PreTrainedTokenizerBase

@@ -12,20 +12,21 @@
from finetune.datasets.subnet.prompting_subset_loader import PromptingSubsetLoader
from finetune.eval.if_eval import rule_factory
from finetune.eval.if_eval.sample import IFEvalTokenizedSample
from finetune.eval.if_eval.version import IfEvalVersion


class IFEvalLoader(DatasetLoader):
"""Generates samples for the IfEval task."""

# The min/max number of rules per sample.
MIN_RULES = 1
MAX_RULES = 4
# The min/max number of rules per sample per version.
VERSION_TO_RULE_COUNTS = {IfEvalVersion.V1: (1, 4), IfEvalVersion.V2: (2, 5)}

def __init__(
self,
random_seed: int = None,
max_samples: int = 20,
validator_hotkeys: Set[str] = None,
if_eval_version: IfEvalVersion = IfEvalVersion.V1,
):
if random_seed:
random.seed(random_seed)
@@ -44,14 +45,14 @@ def __init__(
)
)

bt.logging.trace(f"Loaded {len(questions)} raw samples")
logging.trace(f"Loaded {len(questions)} raw samples")

# Parse the question and answer text from the raw text.
parsed_q_and_a = [
extract_q_and_a_text(prompt, answer) for prompt, answer in questions
]
parsed_q_and_a = [qa for qa in parsed_q_and_a if qa is not None]
bt.logging.trace(
logging.trace(
f"Extracted {len(parsed_q_and_a)} questions and answers from raw samples"
)

@@ -69,11 +70,15 @@
):
self.buffer.append(
rule_factory.generate_if_eval_sample(
qa1, qa2, IFEvalLoader.MIN_RULES, IFEvalLoader.MAX_RULES
qa1,
qa2,
IFEvalLoader.VERSION_TO_RULE_COUNTS[if_eval_version][0],
IFEvalLoader.VERSION_TO_RULE_COUNTS[if_eval_version][1],
if_eval_version,
)
)

bt.logging.trace(f"Generated {len(self.buffer)} IFEval samples")
logging.trace(f"Generated {len(self.buffer)} IFEval samples")

def _should_filter_question(self, question: str, answer: str) -> bool:
# For now, just filter out 1 word answers.
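A hedged usage sketch for the updated loader: the hotkey below is a placeholder, and constructing the loader assumes runtime access to the prompting subnet's wandb data. The `if_eval_version` argument selects the rule-count range via `IFEvalLoader.VERSION_TO_RULE_COUNTS` (1-4 rules for V1, 2-5 for V2).

```python
from finetune.datasets.generated.if_eval_loader import IFEvalLoader
from finetune.eval.if_eval.version import IfEvalVersion

# Sketch only: "<validator-hotkey-ss58>" is a placeholder, and the loader pulls
# real samples from validator wandb runs when constructed.
loader = IFEvalLoader(
    random_seed=42,
    max_samples=20,
    validator_hotkeys={"<validator-hotkey-ss58>"},
    if_eval_version=IfEvalVersion.V2,
)
print(f"Generated {len(loader.buffer)} IFEval samples")
```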
18 changes: 6 additions & 12 deletions finetune/datasets/hugging_face/hugging_face_loader.py
@@ -17,14 +17,12 @@
import random
import time
import requests
import taoverse.utilities.logging as logging
import torch

import bittensor as bt
from transformers import PreTrainedTokenizerBase

from finetune.datasets.loader import DatasetLoader


FINEWEB_EDU_SCORE_2_NAME = "HuggingFaceFW/fineweb-edu-score-2"
FALCON_NAME = "tiiuae/falcon-refinedweb"

@@ -97,7 +95,7 @@ def _fetch_data_to_buffer(self, num_pages):
if page in self.pages:
duplicates += 1
if duplicates >= self.duplicate_page_threshold:
bt.logging.debug(
logging.debug(
f"Hit duplicate page threshold of {self.duplicate_page_threshold}. Stopping early at: {len(self.pages)} pages."
)
break
@@ -143,16 +141,14 @@ def _fetch_data_to_buffer(self, num_pages):
except requests.exceptions.RequestException:
response.close()
attempts += 1
bt.logging.warning(
logging.warning(
f"Failed to fetch data, retrying with a newly sampled page. Attempt {attempts}/{self.retry_limit * num_pages}"
)
if attempts < num_pages * self.retry_limit:
pass

else:
bt.logging.error(
"Maximum retry limit reached. Unable to fetch data."
)
logging.error("Maximum retry limit reached. Unable to fetch data.")
raise

def get_random_pages(self, num_pages, initial_offset):
@@ -243,15 +239,13 @@ def fetch_dataset_configs(self) -> typing.Dict[str, typing.Dict]:

except requests.exceptions.RequestException:
attempt += 1
bt.logging.warning(
logging.warning(
f"Failed to fetch dataset configs, retrying. Attempt {attempt}/{self.retry_limit}"
)
if attempt < self.retry_limit:
time.sleep(self.retry_delay) # Wait before the next retry
else:
bt.logging.error(
"Maximum retry limit reached. Unable to fetch data."
)
logging.error("Maximum retry limit reached. Unable to fetch data.")
raise

def tokenize(
17 changes: 8 additions & 9 deletions finetune/datasets/subnet/prompting_subset_loader.py
@@ -22,6 +22,7 @@
import typing

import bittensor as bt
import taoverse.utilities.logging as logging
import torch
import wandb
from transformers import PreTrainedTokenizerBase
@@ -114,11 +115,11 @@ def __init__(
validator_hotkeys, oldest_sample_timestamp, newest_sample_timestamp
)

bt.logging.trace(f"Fetching runs using filters {filters}")
logging.trace(f"Fetching runs using filters {filters}")

# Get the runs, oldest first.
runs = list(api.runs(prompting_project, filters, order="+created_at"))
bt.logging.trace(f"Found {len(runs)} runs")
logging.trace(f"Found {len(runs)} runs")

all_samples: typing.Set[str] = set()
self.buffer: typing.List[typing.Tuple[str, str]] = []
@@ -138,7 +139,7 @@ def _collect_samples(run: wandb.apis.public.Run) -> bool:
hotkey = run.config.get("HOTKEY_SS58", None)
# First check that the hotkey is in fact a desired validator hotkey.
if hotkey not in validator_hotkeys:
bt.logging.trace(
logging.trace(
f"Hotkey: {hotkey} does not match an expected validator for {run.id}."
)
return False
@@ -148,7 +149,7 @@ def _collect_samples(run: wandb.apis.public.Run) -> bool:
if not signature or not bt.Keypair(ss58_address=hotkey).verify(
run.id, bytes.fromhex(signature)
):
bt.logging.trace(
logging.trace(
f"Failed Signature: {signature} is not valid for {run.id}."
)
return False
@@ -233,7 +234,7 @@ def _collect_samples(run: wandb.apis.public.Run) -> bool:
break
except Exception:
attempt += 1
bt.logging.trace(
logging.trace(
f"Failed to fetch data. {traceback.format_exc()}, retrying. Attempt {attempt}/{max_attempts}"
)
if attempt < max_attempts:
@@ -244,11 +245,9 @@ def _collect_samples(run: wandb.apis.public.Run) -> bool:

self.buffer = list(all_samples)
if len(self.buffer) < max_samples:
bt.logging.debug(
f"Did not collect {max_samples}, only got {len(self.buffer)}"
)
logging.debug(f"Did not collect {max_samples}, only got {len(self.buffer)}")
else:
bt.logging.trace(f"Collected {max_samples} samples")
logging.trace(f"Collected {max_samples} samples")

def tokenize(
self, tokenizer: PreTrainedTokenizerBase, sequence_length: int
24 changes: 24 additions & 0 deletions finetune/eval/if_eval/bullet_count.py
@@ -0,0 +1,24 @@
from finetune.eval.if_eval.rule import IFEvalRule, RuleId


class BulletFrequencyRule(IFEvalRule):
"""Rule that enforces an exact amount of * bullet points."""

def __init__(self, count: int):
super().__init__(rule_id=RuleId.BULLET_COUNT_FREQUENCY)

if count < 1:
raise ValueError(
f"BulletFrequencyRule must expect at least 1 bullet point."
)
self.count = count

def get_prompt(self, index: int = -1) -> str:
bullet = "bullet point" if self.count == 1 else "bullet points"
return f"The response must contain exactly {self.count} {bullet} in markdown format."

def matches(self, text: str, index: int = -1) -> bool:
return (
sum(1 for line in text.splitlines() if line.lstrip().startswith("*"))
== self.count
)
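
A quick usage sketch of the new rule (not part of the commit itself), showing the generated prompt text and the exact-count matching behaviour:

```python
from finetune.eval.if_eval.bullet_count import BulletFrequencyRule

rule = BulletFrequencyRule(count=2)
print(rule.get_prompt())
# The response must contain exactly 2 bullet points in markdown format.

response = "Here are my picks:\n* first option\n* second option"
print(rule.matches(response))             # True: exactly two lines start with "*"
print(rule.matches("* only one bullet"))  # False: one bullet, two expected
```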