stanford-crfm · ahmeda14960 · Nov 7, 2024 · Oct 16, 2024 · Oct 16, 2024 · Oct 16, 2024
diff --git a/config/llama_7b_with_olmo_config.yaml b/config/llama_7b_with_olmo_config.yaml
@@ -15,6 +15,10 @@ trainer:
     project: "marin"
     tags: ["dolma", "olmo", "llama"]
 
+  checkpointer:
+    keep:
+      - every: 250
+
   mp: p=f32,c=bfloat16
   train_batch_size: 2048
   num_train_steps: 750000  # 3,000,000,000,000 / 4,000,000 = 750,000
@@ -27,3 +31,5 @@ optimizer:
   weight_decay: 0.1
   min_lr_ratio: 0.1
   warmup: 0.01
+
+data_shuffle: true
diff --git a/examples/alpaca/alpaca.py b/examples/alpaca/alpaca.py
@@ -162,11 +162,13 @@ def _prepare_example(ex: dict) -> LmExample:
         # mask out padding and anything before the start of the target
         Pos = input_ids.resolve_axis("position")
         if config.mask_inputs:
-            loss_mask = hax.arange(Pos) >= ex["source_lens"]
+            loss_mask = hax.arange(Pos) >= ex["source_lens"] - 1 # should be minus 1?
 
             # don't predict the padding
             targets = hax.roll(input_ids, -1, Pos)
             loss_mask = loss_mask & (targets != tokenizer.pad_token_id)
+            # to not predict EOS token since we don't have target!
+            loss_mask = loss_mask & (1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.bool_))
         else:
             loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.float32)
         lm_ex = LmExample.causal(input_ids, loss_mask=loss_mask)

diff --git a/pyproject.toml b/pyproject.toml
@@ -48,7 +48,6 @@ dependencies = [
     "pydantic<3",
     "rich~=13.0",
     "filelock~=3.13",
-    #    "ai2-olmo",
     "async-lru~=2.0",
     "tqdm-loggable>=0.2",
     "deepdiff"

diff --git a/src/levanter/callbacks.py b/src/levanter/callbacks.py
@@ -25,11 +25,55 @@
 from levanter.utils import flop_utils
 from levanter.utils.jax_utils import barrier_sync, jnp_to_python
 from levanter.visualization import compute_and_visualize_log_probs as viz_probs
+from levanter.data.text import TokenSeqEpochDataset
+from concurrent.futures import ThreadPoolExecutor
+
 
 
 logger = pylogging.getLogger(__name__)
 
 
+def log_epoch_progress(total_tokens_future, tokens_per_example, batch_size):
+    """Return a per-step hook that logs "train/current_epoch" (tokens consumed / dataset tokens).
+
+    Tokens consumed is tokens_per_example * batch_size * step; nothing is logged until the future resolves.
+    """
+    total_tokens = None
+
+    def log_epoch(step_info: StepInfo):
+        nonlocal total_tokens
+        if total_tokens is None:
+            if not total_tokens_future.done():
+                return  # We don't have the total tokens yet, so we can't calculate epoch
+            total_tokens = total_tokens_future.result()
+        if not total_tokens:
+            return  # zero-token dataset: the ratio is undefined, so skip instead of dividing by zero
+        processed_tokens = tokens_per_example * batch_size * step_info.step
+        levanter.tracker.log_metrics({"train/current_epoch": processed_tokens / total_tokens}, step=step_info.step)
+
+    return log_epoch
+
+def get_total_dataset_tokens(ds: TokenSeqEpochDataset, seq_length: int):
+    """Start counting the dataset's tokens in a background thread and return the Future.
+
+    The future resolves to `(await ds.async_len()) * seq_length`; that value is also logged
+    to the tracker summary as "dataset/total_tokens" once it is known.
+    """
+    import asyncio
+
+    def log_length():
+        # async_len() is a coroutine, so drive it with a private event loop in this worker thread
+        total_tokens = asyncio.run(ds.async_len()) * seq_length
+        levanter.tracker.log_summary({"dataset/total_tokens": total_tokens})
+        return total_tokens
+
+    # A single-use worker thread, so the caller is not blocked while the length is computed
+    executor = ThreadPoolExecutor(max_workers=1)
+    future = executor.submit(log_length)
+    # wait=False returns immediately; the submitted job still runs and the worker exits afterwards
+    executor.shutdown(wait=False)
+    return future
+
 def eval_loss_loop(loss_fn, model, dataset, max_batches: Optional[int] = None, name: Optional[str] = None):
     total_loss = 0.0
     total_load_time = 0.0

diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py
@@ -63,6 +63,57 @@
 
 DEFAULT_IGNORE_INDEX = -100  # Mirrors pytorch's default ignore index
 
+class TokenSeqEpochDataset(AsyncDataset[np.ndarray]):
+    """seq_len-token windows over a doc cache's "input_ids"; indices past the end wrap around (epochs)."""
+
+    def __init__(self, doc_cache: TreeCache[dict], seq_len: int):
+        self.doc_cache = doc_cache
+        self.seq_len = seq_len
+        self._store: Optional[TreeStore] = None
+        self._cached_len: Optional[int] = None
+        self._final_len: Optional[int] = None  # sequence count once the cache has finished
+
+    async def async_len(self) -> int:
+        """Sequences in one pass over the finished cache; computed once, then reused by get_batch."""
+        if self._final_len is None:
+            await self.doc_cache.finished()
+            self._final_len = (await self._await_token_cache()).data_size // self.seq_len
+        return self._final_len
+
+    async def _await_token_cache(self) -> JaggedArrayStore:
+        if self._store is None:
+            self._store = await self.doc_cache.store_async()
+        return self._store.tree["input_ids"]
+
+    async def final_length_is_known(self) -> bool:
+        return await self.doc_cache.final_length_is_known()
+
+    def is_finite(self) -> bool:
+        return False  # Now infinite due to epoch wrapping
+
+    async def current_len(self) -> Optional[int]:
+        store = await self._await_token_cache()
+        return store.data_size // self.seq_len
+
+    async def get_batch(self, indices: Sequence[int]) -> Sequence[np.ndarray]:
+        """Read one seq_len window per index, taking each index modulo the dataset length."""
+        token_arrays = await self._await_token_cache()
+        dataset_len = await self.async_len()
+        if dataset_len == 0:
+            raise ValueError("dataset has no complete sequences (token count < seq_len)")
+        starts = [(idx % dataset_len) * self.seq_len for idx in indices]
+        with ts.Batch():
+            reads = [token_arrays.data[start : start + self.seq_len].read() for start in starts]
+        return await asyncio.gather(*reads)
+
+    async def wait_until_len_at_least(self, length: int) -> int:
+        # length is brutally slow to compute, so we cache it
+        # NOTE(review): the cached value is returned even when it is smaller than `length` - confirm intended
+        if self._cached_len is not None:
+            return self._cached_len
+        # TODO: would be better to listen for cache updates
+        self._cached_len = await super().wait_until_len_at_least(length)
+        return self._cached_len
 
 class TokenSeqDataset(AsyncDataset[np.ndarray]):
     """
@@ -640,9 +691,15 @@ class LMDatasetConfig(LMDatasetSourceConfig, LMTaskConfig):
     cache_dir: Optional[str] = "cache/"
 
     def train_set(
-        self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True, *, key: Optional[PRNGKeyArray] = None
+        self, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True, *, key: Optional[PRNGKeyArray] = None, epochs: bool = False 
     ) -> AsyncDataset[np.ndarray]:
-        ds = self.token_seq_dataset("train", seq_len, monitors)
+
+        if epochs:
+            ds = self.token_epoch_dataset("train", seq_len, monitors)
+        else:
+            ds = self.token_seq_dataset("train", seq_len, monitors)
+
+        # add epoch flag here.
         if ds is None:
             raise ValueError("No training set!")
 
@@ -693,6 +750,14 @@ def token_seq_dataset(
         if cache is None:
             return None
         return TokenSeqDataset(cache, seq_len)
+
+    def token_epoch_dataset(
+        self, split: str, seq_len: int, monitors: Union[bool, List[MetricsMonitor]] = True
+    ) -> Optional[TokenSeqEpochDataset]:
+        """Get the cache for `split` via build_or_load_cache and wrap it in a TokenSeqEpochDataset."""
+        cache = self.build_or_load_cache(split, monitors=monitors)
+        # a missing cache is passed on to the caller as a missing dataset
+        return None if cache is None else TokenSeqEpochDataset(cache, seq_len)
 
     def build_or_load_cache(
         self, split: str, monitors: Union[bool, List[MetricsMonitor]] = True, logger_name: Optional[str] = None

diff --git a/src/levanter/main/train_lm.py b/src/levanter/main/train_lm.py
@@ -54,6 +54,7 @@ class TrainLmConfig:
     data_seed: Optional[int] = None  # if provided, will override the data seed from the trainer
     initialize_from_checkpoint_path: Optional[str] = None
     # if provided, will initialize from this checkpoint, used for llama style data mixture
+    epoch: bool = False  # if true, will keep epoching over the dataset and track epochs
 
 
 def main(config: TrainLmConfig):
@@ -117,10 +118,17 @@ def main(config: TrainLmConfig):
 
         # TODO: fix this
         tagged_eval_datasets: list = config.data.tagged_eval_sets(Pos.size)
+        # TokenSeqDataset is config.data.train_set(Pos.size, key=data_key)
+
         train_dataset = CausalLmDataset(
-            config.data.train_set(Pos.size, key=data_key), Pos, KeyPos, ignore_index=config.data.ignore_token_id
+            config.data.train_set(Pos.size, key=data_key, epochs=config.epoch), Pos, KeyPos, ignore_index=config.data.ignore_token_id
         )
 
+        if config.epoch:
+            # add epoch logging
+            total_tokens_future = callbacks.get_total_dataset_tokens(train_dataset.dataset, config.model.seq_len)
+            trainer.add_hook(callbacks.log_epoch_progress(total_tokens_future, Pos.size, trainer.config.train_batch_size), every=1)
+
         # to do partitioning, our dimensions have to be divisible by the size of the physical axes they're mapped to
         # For most things, we just insist you specify the config right, but tokenizers often have strange numbers of
         # tokens: gpt-2 has 50257, for example. So we round up.
@@ -236,6 +244,7 @@ def compute_log_probs(model, example):
 
         ## OK, actually run training!
         trainer.train(state, train_loader)
+
         # checkpointer.on_step(last_step, force=True)
 
 

diff --git a/src/levanter/trainer.py b/src/levanter/trainer.py
@@ -376,7 +376,6 @@ def training_steps(self, state: S, train_loader, run_hooks: bool = True) -> typi
         while int(state.step) < self.num_train_steps:
             with capture_time() as loading_time:
                 example = next(iter_data)
-
             info = self.train_step(state, example)
             state = info.state