From 43297f658bb9ba05e8b41e2d0cc85c91fff242c7 Mon Sep 17 00:00:00 2001 From: Jae-Won Chung Date: Thu, 19 Sep 2024 11:20:58 -0400 Subject: [PATCH] [Misc] Reorganize Zeus NSDI 23 paper artifacts (#126) --- README.md | 7 ++----- docs/research_overview/zeus.md | 2 +- examples/batch_size_optimizer/capriccio/README.md | 6 +++--- .../zeus_nsdi23/README.md | 9 ++++++--- .../zeus_nsdi23/capriccio}/.gitignore | 0 .../zeus_nsdi23/capriccio}/README.md | 0 .../zeus_nsdi23/capriccio}/generate.py | 2 +- .../zeus_nsdi23/capriccio}/requirements.txt | 0 .../zeus_nsdi23/run_alibaba.py | 6 +++--- .../zeus_nsdi23/run_single.py | 4 ++-- .../zeus_nsdi23/trace}/README.md | 0 .../zeus_nsdi23/trace}/alibaba_groups.csv.xz | Bin .../zeus_nsdi23/trace}/summary_power_a40.csv | 0 .../zeus_nsdi23/trace}/summary_power_p100.csv | 0 .../zeus_nsdi23/trace}/summary_power_rtx6000.csv | 0 .../zeus_nsdi23/trace}/summary_power_v100.csv | 0 .../zeus_nsdi23/trace}/summary_train.csv | 0 scripts/lint.sh | 4 ++-- tests/optimizer/batch_size/test_simulator.py | 14 ++++++++++++-- 19 files changed, 32 insertions(+), 22 deletions(-) rename {capriccio => examples/research_reproducibility/zeus_nsdi23/capriccio}/.gitignore (100%) rename {capriccio => examples/research_reproducibility/zeus_nsdi23/capriccio}/README.md (100%) rename {capriccio => examples/research_reproducibility/zeus_nsdi23/capriccio}/generate.py (98%) rename {capriccio => examples/research_reproducibility/zeus_nsdi23/capriccio}/requirements.txt (100%) rename {trace => examples/research_reproducibility/zeus_nsdi23/trace}/README.md (100%) rename {trace => examples/research_reproducibility/zeus_nsdi23/trace}/alibaba_groups.csv.xz (100%) rename {trace => examples/research_reproducibility/zeus_nsdi23/trace}/summary_power_a40.csv (100%) rename {trace => examples/research_reproducibility/zeus_nsdi23/trace}/summary_power_p100.csv (100%) rename {trace => examples/research_reproducibility/zeus_nsdi23/trace}/summary_power_rtx6000.csv (100%) rename {trace => 
examples/research_reproducibility/zeus_nsdi23/trace}/summary_power_v100.csv (100%) rename {trace => examples/research_reproducibility/zeus_nsdi23/trace}/summary_train.csv (100%) diff --git a/README.md b/README.md index 811e4271..8bf76f6f 100644 --- a/README.md +++ b/README.md @@ -36,17 +36,14 @@ zeus/ │ ├── device/ # - Abstraction layer over CPU and GPU devices │ ├── utils/ # - Utility functions and classes │ ├── _legacy/ # - Legacy code to keep our research papers reproducible +│ ├── show_env.py # - Installation & device detection verification script │ └── callback.py # - Base class for callbacks during training │ ├── zeusd # 🌩️ Zeus daemon │ ├── docker/ # 🐳 Dockerfiles and Docker Compose files │ -├── examples/ # 🛠️ Zeus usage examples -│ -├── capriccio/ # 🌊 A drifting sentiment analysis dataset -│ -└── trace/ # 🗃️ Training and energy traces for various GPUs and DNNs +└── examples/ # 🛠️ Zeus usage examples ``` ## Getting Started diff --git a/docs/research_overview/zeus.md b/docs/research_overview/zeus.md index d6d8a160..0787b138 100644 --- a/docs/research_overview/zeus.md +++ b/docs/research_overview/zeus.md @@ -130,7 +130,7 @@ We have our trace-driven simulator open-sourced [here](https://github.com/ml-ene ### Extending the Zeus simulator Users can implement custom policies that optimize batch size and power limit, and plug it into the Zeus simulator. -We have training and energy traces for 6 different DNNs and 4 different NVIDIA GPU microarchitectures [here](https://github.com/ml-energy/zeus/tree/master/trace){.external}, which the simulator runs with. +We have training and energy traces for 6 different DNNs and 4 different NVIDIA GPU microarchitectures [here](https://github.com/ml-energy/zeus/tree/master/examples/research_reproducibility/zeus_nsdi23/trace){.external}, which the simulator runs with. 
Zeus defines two abstract classes [`BatchSizeOptimizer`][zeus._legacy.policy.BatchSizeOptimizer] and [`PowerLimitOptimizer`][zeus._legacy.policy.PowerLimitOptimizer] in [`zeus._legacy.policy.interface`][zeus._legacy.policy.interface]. Each class optimizes the batch size and power limit of a recurring training job respectively. diff --git a/examples/batch_size_optimizer/capriccio/README.md b/examples/batch_size_optimizer/capriccio/README.md index d650ccfd..41395abe 100644 --- a/examples/batch_size_optimizer/capriccio/README.md +++ b/examples/batch_size_optimizer/capriccio/README.md @@ -1,11 +1,11 @@ # Capriccio + BSO -This example will demonstrate how to integrate Zeus with [Capriccio](../../../capriccio), a drifting sentiment analysis dataset. +This example will demonstrate how to integrate Zeus with [Capriccio](../../research_reproducibility/zeus_nsdi23/capriccio), a drifting sentiment analysis dataset. ## Dependencies -1. Generate Capriccio, following the instructions in [Capriccio's README.md](../../../capriccio/). -1. If you're not using our [Docker image](https://ml.energy/zeus/getting_started/environment/), install `zeus` following our [Getting Started](https://ml.energy/zeus/getting_started/) guide. +1. Generate Capriccio, following the instructions in [Capriccio's README](../../research_reproducibility/zeus_nsdi23/capriccio). +1. Either use our Docker images or install `zeus` following our [documentation](https://ml.energy/zeus/getting_started/). 1. 
Install python dependencies for this example: ```sh pip install -r requirements.txt diff --git a/examples/research_reproducibility/zeus_nsdi23/README.md b/examples/research_reproducibility/zeus_nsdi23/README.md index fa8de5b1..2c3209bc 100644 --- a/examples/research_reproducibility/zeus_nsdi23/README.md +++ b/examples/research_reproducibility/zeus_nsdi23/README.md @@ -1,4 +1,7 @@ -# Running Zeus in a Trace-Driven Fashion +# Zeus NSDI 23 paper artifacts + +Two major components of our Zeus NSDI 23 paper were (1) running Zeus in a trace-driven fashion, and (2) Capriccio, a sentiment analysis dataset with data drift. +See the [capriccio](capriccio) directory for more about the Capriccio dataset; read on for trace-driven Zeus. While the existence of recurring jobs in production GPU clusters is clear, it is not really easy to run 50 DNN training jobs sequentially to evaluate energy optimization methods. Thus, Zeus provides a trace-driven simulator that allows users to plug in their own customized batch size optimizer and power limit optimizers and observe gains. @@ -8,7 +11,7 @@ We provide two types of traces. 1. Train trace: We trained six different (model, dataset) pairs with many different batch sizes. And we repeated training at least four times for each triplet with different random seeds. Thus, when we would like to know the result of training a model on a dataset with a certain batch size, we can sample a *training path* from this trace. 2. Power trace: We profiled the duration of one epoch and average power consumption for six (model, dataset) pairs with many different (batch size, power limit) configurations. These results not stochastic, and can be fetched from the trace to construct TTA (time to accuracy) and ETA (energy to accuracy) values. -Refer to the [`trace`](../../trace/) directory for more information about the traces we provide. +Refer to the [`trace`](trace) directory for more information about the traces we provide. 
## Simulating the recurrence of one job @@ -45,7 +48,7 @@ Please refer to our paper for details on how jobs in our train/power traces are ### Dependencies -Install `zeus` following [Installing and Building](https://ml.energy/zeus/getting_started). The power monitor is not needed. +Install `zeus` following [our Getting Started guide](https://ml.energy/zeus/getting_started). All dependencies are already installed you're using our Docker image (`mlenergy/zeus:latest`). diff --git a/capriccio/.gitignore b/examples/research_reproducibility/zeus_nsdi23/capriccio/.gitignore similarity index 100% rename from capriccio/.gitignore rename to examples/research_reproducibility/zeus_nsdi23/capriccio/.gitignore diff --git a/capriccio/README.md b/examples/research_reproducibility/zeus_nsdi23/capriccio/README.md similarity index 100% rename from capriccio/README.md rename to examples/research_reproducibility/zeus_nsdi23/capriccio/README.md diff --git a/capriccio/generate.py b/examples/research_reproducibility/zeus_nsdi23/capriccio/generate.py similarity index 98% rename from capriccio/generate.py rename to examples/research_reproducibility/zeus_nsdi23/capriccio/generate.py index 1342671e..d75a6e09 100644 --- a/capriccio/generate.py +++ b/examples/research_reproducibility/zeus_nsdi23/capriccio/generate.py @@ -6,7 +6,7 @@ (%d_val.json). 
Usage: - python capriccio/generate.py /path/to/sentiment140.json + python generate.py /path/to/sentiment140.json """ import argparse diff --git a/capriccio/requirements.txt b/examples/research_reproducibility/zeus_nsdi23/capriccio/requirements.txt similarity index 100% rename from capriccio/requirements.txt rename to examples/research_reproducibility/zeus_nsdi23/capriccio/requirements.txt diff --git a/examples/research_reproducibility/zeus_nsdi23/run_alibaba.py b/examples/research_reproducibility/zeus_nsdi23/run_alibaba.py index d5c67246..d1e2708b 100644 --- a/examples/research_reproducibility/zeus_nsdi23/run_alibaba.py +++ b/examples/research_reproducibility/zeus_nsdi23/run_alibaba.py @@ -34,7 +34,7 @@ def run_simulator( ) -> list[tuple[str, list[HistoryEntry]]]: """Run the simulator on the given job.""" # Read in the Alibaba trace - alibaba_df = pd.DataFrame(pd.read_csv("../../../trace/alibaba_groups.csv.xz")) + alibaba_df = pd.DataFrame(pd.read_csv("trace/alibaba_groups.csv.xz")) print("Read in the Alibaba trace.") print(f"Number of groups: {alibaba_df.group.nunique()}") @@ -83,13 +83,13 @@ def simulate_group( @lru_cache(maxsize=1) def read_train_trace() -> pd.DataFrame: """Read the train trace file as a Pandas DataFrame.""" - return pd.DataFrame(pd.read_csv("../../../trace/summary_train.csv")) + return pd.DataFrame(pd.read_csv("trace/summary_train.csv")) @lru_cache(maxsize=1) def read_power_trace(gpu: Literal["a40", "v100", "p100", "rtx6000"]) -> pd.DataFrame: """Read the power trace of the given GPU as a Pandas DataFrame.""" - return pd.DataFrame(pd.read_csv(f"../../../trace/summary_power_{gpu}.csv")) + return pd.DataFrame(pd.read_csv(f"trace/summary_power_{gpu}.csv")) def get_job_with_defaults( diff --git a/examples/research_reproducibility/zeus_nsdi23/run_single.py b/examples/research_reproducibility/zeus_nsdi23/run_single.py index 9e7a3bc1..b40add55 100644 --- a/examples/research_reproducibility/zeus_nsdi23/run_single.py +++ 
b/examples/research_reproducibility/zeus_nsdi23/run_single.py @@ -46,8 +46,8 @@ def read_trace( gpu: Literal["a40", "v100", "p100", "rtx6000"] ) -> tuple[pd.DataFrame, pd.DataFrame]: """Read the train and power trace files as Pandas DataFrames.""" - train_df = pd.DataFrame(pd.read_csv("../../trace/summary_train.csv")) - power_df = pd.DataFrame(pd.read_csv(f"../../trace/summary_power_{gpu}.csv")) + train_df = pd.DataFrame(pd.read_csv("trace/summary_train.csv")) + power_df = pd.DataFrame(pd.read_csv(f"trace/summary_power_{gpu}.csv")) return train_df, power_df diff --git a/trace/README.md b/examples/research_reproducibility/zeus_nsdi23/trace/README.md similarity index 100% rename from trace/README.md rename to examples/research_reproducibility/zeus_nsdi23/trace/README.md diff --git a/trace/alibaba_groups.csv.xz b/examples/research_reproducibility/zeus_nsdi23/trace/alibaba_groups.csv.xz similarity index 100% rename from trace/alibaba_groups.csv.xz rename to examples/research_reproducibility/zeus_nsdi23/trace/alibaba_groups.csv.xz diff --git a/trace/summary_power_a40.csv b/examples/research_reproducibility/zeus_nsdi23/trace/summary_power_a40.csv similarity index 100% rename from trace/summary_power_a40.csv rename to examples/research_reproducibility/zeus_nsdi23/trace/summary_power_a40.csv diff --git a/trace/summary_power_p100.csv b/examples/research_reproducibility/zeus_nsdi23/trace/summary_power_p100.csv similarity index 100% rename from trace/summary_power_p100.csv rename to examples/research_reproducibility/zeus_nsdi23/trace/summary_power_p100.csv diff --git a/trace/summary_power_rtx6000.csv b/examples/research_reproducibility/zeus_nsdi23/trace/summary_power_rtx6000.csv similarity index 100% rename from trace/summary_power_rtx6000.csv rename to examples/research_reproducibility/zeus_nsdi23/trace/summary_power_rtx6000.csv diff --git a/trace/summary_power_v100.csv b/examples/research_reproducibility/zeus_nsdi23/trace/summary_power_v100.csv similarity index 100% rename 
from trace/summary_power_v100.csv rename to examples/research_reproducibility/zeus_nsdi23/trace/summary_power_v100.csv diff --git a/trace/summary_train.csv b/examples/research_reproducibility/zeus_nsdi23/trace/summary_train.csv similarity index 100% rename from trace/summary_train.csv rename to examples/research_reproducibility/zeus_nsdi23/trace/summary_train.csv diff --git a/scripts/lint.sh b/scripts/lint.sh index 0c2e568b..3535c48f 100755 --- a/scripts/lint.sh +++ b/scripts/lint.sh @@ -3,9 +3,9 @@ set -ev if [[ -z $GITHUB_ACTION ]]; then - black zeus capriccio tests + black zeus tests else - black --check zeus capriccio tests + black --check zeus tests fi ruff check zeus diff --git a/tests/optimizer/batch_size/test_simulator.py b/tests/optimizer/batch_size/test_simulator.py index d3e7c0fa..f05dcf86 100644 --- a/tests/optimizer/batch_size/test_simulator.py +++ b/tests/optimizer/batch_size/test_simulator.py @@ -55,8 +55,18 @@ def read_trace( ) -> tuple[pd.DataFrame, pd.DataFrame]: """Read the train and power trace files as Pandas DataFrames.""" trace_dir = Path(__file__).resolve(strict=True).parents[3] - train_df = pd.DataFrame(pd.read_csv(trace_dir / "trace/summary_train.csv")) - power_df = pd.DataFrame(pd.read_csv(trace_dir / f"trace/summary_power_{gpu}.csv")) + train_df = pd.DataFrame( + pd.read_csv( + trace_dir + / "examples/research_reproducibility/zeus_nsdi23/trace/summary_train.csv" + ) + ) + power_df = pd.DataFrame( + pd.read_csv( + trace_dir + / f"examples/research_reproducibility/zeus_nsdi23/trace/summary_power_{gpu}.csv" + ) + ) return train_df, power_df