mosaicml · snarayan21 · Nov 2, 2023 · Aug 17, 2023 · Aug 17, 2023 · Aug 17, 2023
@@ -28,4 +28,7 @@ test:
 web:
 	uvicorn scripts.partition.web:app --port 1337 --reload
 
+simulator:
+	streamlit run simulation/interfaces/sim_ui.py
+
 .PHONY: test lint style
diff --git a/docs/source/_static/images/downloads.png b/docs/source/_static/images/downloads.png
diff --git a/docs/source/_static/images/inputs.png b/docs/source/_static/images/inputs.png
diff --git a/docs/source/_static/images/shuffle_quality_graph.png b/docs/source/_static/images/shuffle_quality_graph.png
diff --git a/docs/source/_static/images/shuffle_quality_toggle.png b/docs/source/_static/images/shuffle_quality_toggle.png
diff --git a/docs/source/_static/images/stats.png b/docs/source/_static/images/stats.png
diff --git a/docs/source/_static/images/throughput.png b/docs/source/_static/images/throughput.png
diff --git a/docs/source/_static/images/yaml_toggle.png b/docs/source/_static/images/yaml_toggle.png
diff --git a/docs/source/fundamentals/simulator.md b/docs/source/fundamentals/simulator.md
@@ -0,0 +1,45 @@
+# Streaming Simulator
+A simulator for throughput, network use, and shuffle quality with MosaicML Streaming. The simulator allows you to:
+- Plan runs and anticipate issues beforehand
+- Find optimal run configurations
+- Debug issues with underperforming runs
+- Better understand the impact of different configurations
+
+## Getting Started
+Run the following to install simulator-specific dependencies, if they don't already exist:
+```
+pip install --upgrade "mosaicml-streaming[simulator]"
+```
+Then, simply run `simulator` in your command line to open the Web UI and get simulating!
+## Key Features
+
+### Throughput
+Throughput is estimated for the duration of the run and is displayed as the simulation progresses. We estimate throughput by iterating over the samples of the dataset in order, and performing shard downloads based on an estimate of network bandwidth. The 10-step rolling average is displayed.
+
+<img src="../_static/images/throughput.png" alt="Throughput Graph" width="500"/>
+
+### Network Downloads
+Cumulative network downloads are also estimated for the run and displayed. It is calculated in conjunction with throughput. If shards are compressed, we assume they are downloaded in compressed form and immediately uncompressed.
+
+<img src="../_static/images/downloads.png" alt="Downloads Graph" width="500"/>
+
+### Simulation Stats
+We also provide various useful statistics from the simulation, such as:
+- Minimum cache limit (i.e., maximum space used by live shards)
+- Steps slowed down by shard downloads
+- Estimated time to first batch
+- Estimated warmup time (i.e., time until throughput maximized)
+
+<img src="../_static/images/stats.png" alt="Simulation Stats" width="500"/>
+
+### Shuffle Quality
+You can choose to evaluate the quality of different shuffling algorithms for your run. We provide an estimate of shuffle quality based on the entropy calculated over the probability distribution of differences between neighboring sample indices and shard indices of the dataset. *These shuffle quality metrics are noisy and may not reflect the true strength of a shuffle.*
+
+<img src="../_static/images/shuffle_quality_toggle.png" alt="Shuffle Quality Toggle" width="300"/>
+
+<img src="../_static/images/shuffle_quality_graph.png" alt="Shuffle Quality Graph" width="500"/>
+
+### Yaml Support
+Yaml files that follow MosaicML conventions can be uploaded and simulated as well. Simply click the toggle, enter any needed additional information, and see your results. Parameters can also be modified to quickly test out configurations.
+
+<img src="../_static/images/yaml_toggle.png" alt="Yaml Quality Toggle" width="300"/>
diff --git a/docs/source/index.md b/docs/source/index.md
@@ -60,6 +60,7 @@ If you have any questions, please feel free to reach out to us on [Twitter](htt
    fundamentals/shuffling.md
    fundamentals/sampling.md
    fundamentals/batching.md
+   fundamentals/simulator.md
 
 .. toctree::
    :hidden:

@@ -93,6 +93,16 @@
     'sphinx-tabs==3.4.1',
 ]
 
+extra_deps['simulator'] = [
+    'sortedcollections>=2.1.0,<3',
+    'streamlit>=1.26.0,<2',
+    'altair>=5.1.1,<6',
+    'omegaconf>=2.3.0,<3',
+    'PyYAML>=6.0,<7',
+    'pandas>=2.0.3,<3',
+    'wandb>=0.15.5,<1',
+]
+
 extra_deps['spark'] = [
     'pyspark>=3,<4',
 ]
@@ -123,6 +133,9 @@
         'streaming': ['py.typed'],
     },
     packages=setuptools.find_packages(exclude=['tests*']),
+    entry_points={
+        'console_scripts': ['simulator = simulation.launcher:launch_simulation_ui',],
+    },
     classifiers=classifiers,
     install_requires=install_requires,
     extras_require=extra_deps,

diff --git a/simulation/README.md b/simulation/README.md
@@ -0,0 +1,48 @@
+# 🤖 Streaming Simulator
+A simulator for throughput, network use, and shuffle quality with MosaicML Streaming. The simulator allows you to:
+- Plan runs and anticipate issues beforehand
+- Find optimal run configurations
+- Debug issues with underperforming runs
+- Better understand the impact of different configurations
+
+## 🚀 Getting Started
+Run the following to install simulator-specific dependencies, if they don't already exist:
+```
+pip install --upgrade "mosaicml-streaming[simulator]"
+```
+Then, simply run `simulator` in your command line to open the Web UI and get simulating!
+## 🔑 Key Features
+
+### Throughput
+Throughput is estimated for the duration of the run and is displayed as the simulation progresses. We estimate throughput by iterating over the samples of the dataset in order, and performing shard downloads based on an estimate of network bandwidth. The 10-step rolling average is displayed.
+
+![Throughput Graph](../docs/source/_static/images/throughput.png)
+
+### Network Downloads
+Cumulative network downloads are also estimated for the run and displayed. It is calculated in conjunction with throughput. If shards are compressed, we assume they are downloaded in compressed form and immediately uncompressed.
+
+![Downloads Graph](../docs/source/_static/images/downloads.png)
+
+### Simulation Stats
+We also provide various useful statistics from the simulation, such as:
+- Minimum cache limit (i.e., maximum space used by live shards)
+- Steps slowed down by shard downloads
+- Estimated time to first batch
+- Estimated warmup time (i.e., time until throughput maximized)
+
+![Simulation Stats](../docs/source/_static/images/stats.png)
+
+### Shuffle Quality
+You can choose to evaluate the quality of different shuffling algorithms for your run. We provide an estimate of shuffle quality based on the entropy calculated over the probability distribution of differences between neighboring sample indices and shard indices of the dataset. *These shuffle quality metrics are noisy and may not reflect the true strength of a shuffle.*
+
+![Shuffle Quality Toggle](../docs/source/_static/images/shuffle_quality_toggle.png)
+
+![Shuffle Quality Graph](../docs/source/_static/images/shuffle_quality_graph.png)
+
+### Yaml Support
+Yaml files that follow MosaicML conventions can be uploaded and simulated as well. Simply click the toggle, enter any needed additional information, and see your results. Parameters can also be modified to quickly test out configurations.
+
+![Yaml Quality Toggle](../docs/source/_static/images/yaml_toggle.png)
+
+## 💬 Contact
+If you have problems, questions, or suggestions, please reach out to the MosaicML team on our [community slack channel](https://mosaicml.me/slack).
diff --git a/simulation/__init__.py b/simulation/__init__.py
@@ -0,0 +1,4 @@
+# Copyright 2023 MosaicML Streaming authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Streaming simulation for throughput, network downloads, and shuffle quality."""
diff --git a/simulation/core/create_index.py b/simulation/core/create_index.py
@@ -0,0 +1,86 @@
+# Copyright 2023 MosaicML Streaming authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Create a dataset index file from input parameters."""
+
+import json
+import logging
+import os
+import random
+import string
+from typing import Optional
+
+from streaming.base.format import get_index_basename
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def get_random_foldername() -> str:
+    """Generate random folder name to store the index file in.
+
+    Returns:
+        str: random alphanumeric folder name.
+    """
+    return ''.join(
+        random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits)
+        for _ in range(16))
+
+
+def create_stream_index(shards: int, samples_per_shard: int, avg_raw_shard_size: int,
+                        avg_zip_shard_size: Optional[int]) -> str:
+    """Create dataset index file from input parameters.
+
+    Args:
+        shards (int): Number of shards.
+        samples_per_shard (int): Number of samples per shard.
+        avg_raw_shard_size (int): Average raw shard size.
+        avg_zip_shard_size (int): Average compressed shard size.
+
+    Returns:
+        local path to created index file for stream.
+    """
+    index_data = {'version': 2, 'shards': []}
+
+    shards_list = []
+    for _ in range(shards):
+        shard_data = {
+            'column_encodings': [],
+            'column_names': [],
+            'column_sizes': [],
+            'format': 'mds',
+            'raw_data': {
+                'basename': '',
+                'bytes': avg_raw_shard_size,
+                'hashes': {}
+            },
+            'hashes': [],
+            'samples': samples_per_shard,
+            'size_limit': avg_raw_shard_size,
+            'version': 2,
+            'zip_data': None,
+            'compression': None
+        }
+        if avg_zip_shard_size is not None:
+            shard_data['zip_data'] = {'basename': '', 'bytes': avg_zip_shard_size, 'hashes': {}}
+            shard_data['compression'] = ''
+        shards_list.append(shard_data)
+
+    index_data['shards'] = shards_list
+
+    # Try making the directory for the stream's index.json file
+    foldername = get_random_foldername() + '_indexcreated'
+    try:
+        os.mkdir(foldername)
+    except FileExistsError:
+        logger.warning(' Folder already exists, trying again...')
+        foldername = get_random_foldername()
+        os.mkdir(foldername)
+
+    index_basename = get_index_basename()
+    index_path = os.path.join(foldername, index_basename)
+
+    with open(index_path, 'w') as f:
+        json.dump(index_data, f)
+
+    return index_path
diff --git a/simulation/core/last_used_ordered_set.py b/simulation/core/last_used_ordered_set.py
@@ -0,0 +1,37 @@
+# Copyright 2023 MosaicML Streaming authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""An ordered set that can be used as an LRU cache."""
+
+from collections import OrderedDict
+from typing import Any
+
+
+class LastUsedOrderedSet(OrderedDict):
+    """An ordered dict that can be used as an LRU cache.
+
+    This is a subclass of OrderedDict, with some LRU-specific functions and all values as ``None``.
+    """
+
+    def setitem(self, key: Any, move_to_end: bool = True):
+        """Set/add an item.
+
+        Args:
+            key (Any): key to be added.
+            move_to_end (bool, optional): whether to move the item to the end, signifying most
+                recent access. Defaults to ``True``.
+        """
+        super().__setitem__(key, None)
+        self.move_to_end(key, last=move_to_end)
+
+    def popLRU(self):
+        """Pop the least recently used item (located at the front)."""
+        return self.popitem(last=False)[0]
+
+    def setuse(self, key: Any):
+        """Mark an item as used, moving it to the end.
+
+        Args:
+            key (Any): key of element to move to the end, signifying most recent access.
+        """
+        self.setitem(key)