
Commit

Merge branch 'master' into vshekhawat/skip_fused_optim_tests
loadams authored May 15, 2024
2 parents 26c8077 + 488a823 commit 16750e9
Showing 14 changed files with 302 additions and 11 deletions.
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -2,7 +2,7 @@ include *.txt README.md
include deepspeed/inference/v2/kernels/ragged_ops/libs/*.so
include deepspeed/inference/v2/kernels/cutlass_ops/libs/*.so
recursive-include requirements *.txt
recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json
recursive-include deepspeed *.cpp *.h *.hpp *.cu *.hip *.tr *.cuh *.cc *.json
recursive-include csrc *.cpp *.h *.hpp *.cu *.tr *.cuh *.cc
recursive-include op_builder *.py
recursive-include benchmarks *.py
2 changes: 2 additions & 0 deletions deepspeed/module_inject/auto_tp.py
@@ -307,6 +307,8 @@ def tp_parser(model):
# Mixtral-8x7B uses w2*act(w1*w3) linear, so w2 needs to be replaced with LinearAllreduce.
elif 'w2' in layer and 'Mixtral' in str(type(module)):
gem_list = gem_list + [layer]
elif 'self_attn.dense' in layer and 'Phi' in str(type(module)):
gem_list = gem_list + [layer]

layer_list = []
if gem_list != []:
92 changes: 92 additions & 0 deletions deepspeed/monitor/comet.py
@@ -0,0 +1,92 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from typing import TYPE_CHECKING, Any, Tuple, List, Dict, Optional

from .utils import check_comet_availability
from .monitor import Monitor

import deepspeed.comm as dist

if TYPE_CHECKING:
import comet_ml
from .config import CometConfig

Name = str
Value = Any
GlobalSamples = int
Event = Tuple[Name, Value, GlobalSamples]


class CometMonitor(Monitor):

def __init__(self, comet_config: "CometConfig"):
super().__init__(comet_config)
check_comet_availability()
import comet_ml

self.enabled = comet_config.enabled
self._samples_log_interval = comet_config.samples_log_interval
self._experiment: Optional["comet_ml.ExperimentBase"] = None

if self.enabled and dist.get_rank() == 0:
self._experiment = comet_ml.start(
api_key=comet_config.api_key,
project=comet_config.project,
workspace=comet_config.workspace,
experiment_key=comet_config.experiment_key,
mode=comet_config.mode,
online=comet_config.online,
)

if comet_config.experiment_name is not None:
self._experiment.set_name(comet_config.experiment_name)

self._events_log_scheduler = EventsLogScheduler(comet_config.samples_log_interval)

@property
def experiment(self) -> Optional["comet_ml.ExperimentBase"]:
return self._experiment

@property
def samples_log_interval(self) -> int:
return self._samples_log_interval

def write_events(self, event_list: List[Event]) -> None:
if not self.enabled or dist.get_rank() != 0:
return None

for event in event_list:
name = event[0]
value = event[1]
engine_global_samples = event[2]

if self._events_log_scheduler.needs_logging(name, engine_global_samples):
self._experiment.__internal_api__log_metric__(
name=name,
value=value,
step=engine_global_samples,
)


class EventsLogScheduler:

def __init__(self, samples_log_interval: int):
self._samples_log_interval = samples_log_interval
self._last_logged_events_samples: Dict[str, int] = {}

def needs_logging(self, name: str, current_sample: int) -> bool:
if name not in self._last_logged_events_samples:
self._last_logged_events_samples[name] = current_sample
return True

last_logged_sample = self._last_logged_events_samples[name]
samples_delta = current_sample - last_logged_sample

if samples_delta >= self._samples_log_interval:
self._last_logged_events_samples[name] = current_sample
return True

return False
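
The `EventsLogScheduler` above is what throttles how often each metric name is forwarded to Comet. A minimal sketch of that behaviour (illustrative only; assumes DeepSpeed with this commit is installed, and the metric name is arbitrary):

```python
# Illustrative sketch of the per-metric throttle added in comet.py: a metric is
# forwarded only once at least `samples_log_interval` samples have passed since
# it was last logged under that name.
from deepspeed.monitor.comet import EventsLogScheduler

scheduler = EventsLogScheduler(samples_log_interval=100)

print(scheduler.needs_logging("Train/loss", 0))    # True  -- first time the name is seen
print(scheduler.needs_logging("Train/loss", 50))   # False -- only 50 samples since the last log
print(scheduler.needs_logging("Train/loss", 150))  # True  -- 150 >= 100 samples elapsed
```
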
69 changes: 67 additions & 2 deletions deepspeed/monitor/config.py
@@ -3,12 +3,14 @@

# DeepSpeed Team

from typing import Optional

from deepspeed.pydantic_v1 import root_validator
from deepspeed.runtime.config_utils import DeepSpeedConfigModel


def get_monitor_config(param_dict):
monitor_dict = {key: param_dict.get(key, {}) for key in ("tensorboard", "wandb", "csv_monitor")}
monitor_dict = {key: param_dict.get(key, {}) for key in ("tensorboard", "wandb", "csv_monitor", "comet")}
return DeepSpeedMonitorConfig(**monitor_dict)


@@ -60,12 +62,75 @@ class CSVConfig(DeepSpeedConfigModel):
""" Name for the current job. This will become a new directory inside `output_path`. """


class CometConfig(DeepSpeedConfigModel):
"""
Sets parameters for the Comet monitor. For logging data, Comet uses
an Experiment object.
https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment/
"""

enabled: bool = False
""" Whether logging to Comet is enabled. Requires `comet_ml` package is installed. """

samples_log_interval: int = 100
""" Metrics will be submitted to Comet after processing every `samples_log_intervas` samples"""

project: Optional[str] = None
"""
Comet project name. Can be set through .comet.config file or environment variable COMET_PROJECT_NAME
https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options
"""

workspace: Optional[str] = None
"""
Comet workspace name. Can be set through .comet.config file or environment variable COMET_WORKSPACE
https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options
"""

api_key: Optional[str] = None
"""
Comet API key. Can be set through .comet.config file or environment variable COMET_API_KEY
https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options
"""

experiment_name: Optional[str] = None
"""
The name of the Comet experiment to be used for logging.
Can be set through .comet.config file or environment variable COMET_EXPERIMENT_NAME
https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options
"""

experiment_key: Optional[str] = None
"""
The key of the Comet experiment to be used for logging. Must be an alphanumeric string whose length is between 32 and 50 characters.
Can be set through .comet.config or environment variable COMET_EXPERIMENT_KEY
https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options
"""

online: Optional[bool] = None
"""
If True, the data will be logged to the Comet server; otherwise it will be stored locally in an offline experiment.
Defaults to True.
"""

mode: Optional[str] = None
"""
Controls how the Comet experiment is started. Three options are possible:
- "get": Continue logging to an existing experiment identified by the `experiment_key` value.
- "create": Always create a new experiment, useful for HPO sweeps.
- "get_or_create" (default): Start a fresh experiment if required, or persist logging to an existing one.
"""


class DeepSpeedMonitorConfig(DeepSpeedConfigModel):
"""Sets parameters for various monitoring methods."""

tensorboard: TensorBoardConfig = {}
""" TensorBoard monitor, requires `tensorboard` package is installed. """

comet: CometConfig = {}
""" Comet monitor, requires `comet_ml` package is installed """

wandb: WandbConfig = {}
""" WandB monitor, requires `wandb` package is installed. """

@@ -75,5 +140,5 @@ class DeepSpeedMonitorConfig(DeepSpeedConfigModel):
@root_validator
def check_enabled(cls, values):
values["enabled"] = values.get("tensorboard").enabled or values.get("wandb").enabled or values.get(
"csv_monitor").enabled
"csv_monitor").enabled or values.get("comet")
return values
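
A short sketch of how the new `"comet"` key flows through `get_monitor_config` (illustrative only; the field values are arbitrary, and parsing the config does not itself require `comet_ml`):

```python
# Illustrative sketch: parse a DeepSpeed config fragment containing the new "comet" key.
from deepspeed.monitor.config import get_monitor_config

param_dict = {
    "comet": {
        "enabled": True,
        "project": "my_project",        # arbitrary example values
        "samples_log_interval": 50,
    }
}

monitor_config = get_monitor_config(param_dict)
print(monitor_config.comet.enabled)               # True
print(monitor_config.comet.samples_log_interval)  # 50
print(monitor_config.enabled)                     # True -- any enabled backend turns monitoring on
```
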
6 changes: 6 additions & 0 deletions deepspeed/monitor/monitor.py
@@ -24,6 +24,7 @@ def write_events(self, event_list):
from .wandb import WandbMonitor
from .tensorboard import TensorBoardMonitor
from .csv_monitor import csvMonitor
from .comet import CometMonitor


class MonitorMaster(Monitor):
@@ -33,6 +34,7 @@ def __init__(self, monitor_config):
self.tb_monitor = None
self.wandb_monitor = None
self.csv_monitor = None
self.comet_monitor = None
self.enabled = monitor_config.enabled

if dist.get_rank() == 0:
@@ -42,6 +44,8 @@ def __init__(self, monitor_config):
self.wandb_monitor = WandbMonitor(monitor_config.wandb)
if monitor_config.csv_monitor.enabled:
self.csv_monitor = csvMonitor(monitor_config.csv_monitor)
if monitor_config.comet.enabled:
self.comet_monitor = CometMonitor(monitor_config.comet)

def write_events(self, event_list):
if dist.get_rank() == 0:
@@ -51,3 +55,5 @@ def write_events(self, event_list):
self.wandb_monitor.write_events(event_list)
if self.csv_monitor is not None:
self.csv_monitor.write_events(event_list)
if self.comet_monitor is not None:
self.comet_monitor.write_events(event_list)
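
For reference, `write_events` in every backend, including the new `CometMonitor`, consumes `(name, value, global_samples)` tuples, matching the `Event` alias defined in `comet.py`. A hedged sketch of the data shape (the dispatch call itself is left as a comment because it needs a live, rank-0 DeepSpeed engine):

```python
# Illustrative only: the event format fanned out by MonitorMaster.write_events.
# The metric names below are examples.
event_list = [
    ("Train/Samples/train_loss", 2.31, 4096),  # (name, value, global_samples)
    ("Train/Samples/lr", 1e-4, 4096),
]

# Inside a running engine (rank 0) this would reach every enabled backend,
# now including Comet:
#   monitor.write_events(event_list)
```
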
13 changes: 13 additions & 0 deletions deepspeed/monitor/utils.py
@@ -3,6 +3,8 @@

# DeepSpeed Team

from packaging import version as pkg_version


def check_tb_availability():
try:
@@ -22,3 +24,14 @@ def check_wandb_availability():
'If you want to use wandb logging, please `pip install wandb` and follow the instructions at https://docs.wandb.ai/quickstart'
)
raise


def check_comet_availability():
try:
import comet_ml
comet_version = pkg_version.parse(comet_ml.__version__)
if comet_version < pkg_version.Version("3.41.0"):
raise ImportError("`comet_ml` must have at least version 3.41.0")
except ImportError:
print('If you want to use comet logging, please `pip install "comet_ml>=3.41.0"`')
raise
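
A small sketch of the new guard used on its own (illustrative; within DeepSpeed, `CometMonitor.__init__` calls it before importing `comet_ml`):

```python
# Illustrative only: check_comet_availability() raises ImportError when comet_ml
# is missing or older than 3.41.0, after printing an install hint.
from deepspeed.monitor.utils import check_comet_availability

try:
    check_comet_availability()
    print("comet_ml is available; Comet logging can be enabled")
except ImportError:
    print("comet_ml>=3.41.0 is not installed; install it before enabling the Comet monitor")
```
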
2 changes: 1 addition & 1 deletion docs/_data/navigation.yml
@@ -41,7 +41,7 @@ lnav:
- title: 'Flops Profiler'
url: /docs/config-json/#flops-profiler
- title: 'Monitoring'
url: /docs/config-json/#monitoring-module-tensorboard-wandb-csv
url: /docs/config-json/#monitoring-module
- title: 'Communication Logging'
url: /docs/config-json/#communication-logging
- title: 'Model Compression'
37 changes: 34 additions & 3 deletions docs/_pages/config-json.md
@@ -1139,15 +1139,16 @@ DeepSpeed Data Efficiency Library includes two techniques: curriculum learning a
| ---------------------------------------------------------------------------------------------------------------------------- | ------- |
| List of which step to change difficulty level. One of the `schedule_config` when the `fixed_discrete` schedule_type is used. | N/A |

### Monitoring Module (TensorBoard, WandB, CSV)
### Monitoring Module

**Note:** Deepspeed logs to TensorBoard through PyTorch. Logging to TensorBoard requires that the `tensorboard` package is installed (read more in the [PyTorch documentation](https://pytorch.org/docs/1.8.0/tensorboard.html)).
{: .notice--warning}
**Note:** Logging to WandB requires that the `wandb` package is installed (read more in the [WandB documentation](https://docs.wandb.ai/quickstart)).
{: .notice--warning}
**Note:** Logging to Comet requires that the `comet_ml` package is installed (read more in the [Comet documentation](https://www.comet.com/docs/v2/guides/quickstart/#1-install-and-configure-the-comet-ml-sdk)).
{: .notice--warning}


Deepspeed's Monitor module can log training details into a [Tensorboard](https://www.tensorflow.org/tensorboard)-compatible file, to [WandB](https://wandb.ai/site), or to simple CSV files. Below is an overview of what DeepSpeed will log automatically.
Deepspeed's Monitor module can log training details into a [Tensorboard](https://www.tensorflow.org/tensorboard)-compatible file, to [WandB](https://wandb.ai/site), to [Comet](https://www.comet.com/site/?utm_source=deepseed&utm_medium=docs&utm_content=docs), or to simple CSV files. Below is an overview of what DeepSpeed will log automatically.

| Field | Description |Conditions |
| ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- |
@@ -1201,6 +1202,36 @@ Example of <i>**wandb**</i> configuration:
}
```

<i>**comet**</i>: [dictionary]

| Fields | Value | Default |
|--- |--- |--- |
| enabled | Whether logging to [Comet](https://www.comet.com/site/) is enabled. | `false` |
| workspace | Comet workspace name. | `None` |
| project | Comet project name. | `None` |
| samples_log_interval | Metrics will be submitted to Comet after processing every `samples_log_interval` samples. | `100` |
| experiment_name | The name of the Comet experiment to be used for logging. | `None` |
| api_key | Comet API key. It's not recommended to save the Comet API Key in code. | `None` |
| experiment_key | The key of the Comet experiment to be used for logging. Must be an alphanumeric string whose length is between 32 and 50 characters. | `None` |
| online | If True, the data will be logged to the Comet server; otherwise it will be stored locally in an offline experiment. Defaults to `True`. | `None` |
| mode | Controls how the Comet experiment is started. "get": continue logging to an existing experiment identified by the `experiment_key` value. "create": always create a new experiment, useful for HPO sweeps. "get_or_create" (default): start a fresh experiment if required, or persist logging to an existing one. | `None` |


Example of <i>**comet**</i> configuration:

```json
"comet": {
"enabled": true,
"workspace": "my_workspace",
"project": "my_project",
"samples_log_interval": 50,
"experiment_name": "llama-fine-tuning",
"experiment_key": "0c4a1c4a90664f2a8084e600b19a9d7",
"online": false,
"mode": "get",
}
```

<i>**csv_monitor**</i>: [dictionary]

| Fields | Value |Default |
15 changes: 12 additions & 3 deletions docs/_tutorials/monitor.md
@@ -11,7 +11,7 @@ In this tutorial, we introduce the DeepSpeed Monitor and provide examples of its

## Overview

Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), and simple CSV files.
Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), [Comet](https://www.comet.com/site/?utm_source=deepseed&utm_medium=docs&utm_content=tutorial), and simple CSV files.

Below is a live monitoring view for TensorBoard:

@@ -21,16 +21,20 @@ Below is a live monitoring view for WandB:

![WandB Example Output](/assets/images/wandb_monitor.PNG){: .align-center}

Below is a live monitoring view for Comet:

![CometML Example Output](/assets/images/comet_monitor.png){: .align-center}

## Usage

The DeepSpeed Monitor is configured within the deepspeed [configuration file](/docs/config-json/#monitoring-module-tensorboard-wandb-csv). DeepSpeed will automatically monitor key training metrics, including those tracked with the `wall_clock_breakdown` configuration option. In addition, users can log their own custom events and metrics.
The DeepSpeed Monitor is configured within the deepspeed [configuration file](/docs/config-json/#monitoring-module). DeepSpeed will automatically monitor key training metrics, including those tracked with the `wall_clock_breakdown` configuration option. In addition, users can log their own custom events and metrics.

- [Automatic Monitoring](#automatic-monitoring)
- [Custom Monitoring](#custom-monitoring)

### Automatic Monitoring

When using DeepSpeed for model training, the Monitor can be configured in the DeepSpeed [configuration file](/docs/config-json/#monitoring-module-tensorboard-wandb-csv). No explicit API calls are needed to use the Monitor. The Monitor can be enabled by adding the following field to DeepSpeed's configuration json file. Refer to [Monitoring](/docs/config-json/#monitoring-module-tensorboard-wandb-csv) for details.
When using DeepSpeed for model training, the Monitor can be configured in the DeepSpeed [configuration file](/docs/config-json/#monitoring-module). No explicit API calls are needed to use the Monitor. The Monitor can be enabled by adding the following field to DeepSpeed's configuration json file. Refer to [Monitoring](/docs/config-json/#monitoring-module) for details.

```json
{
@@ -45,6 +49,11 @@ When using DeepSpeed for model training, the Monitor can be configured in the De
"group": "my_group",
"project": "my_project"
},
"comet": {
"enabled": true,
"project": "my_project",
"experiment_name": "my_experiment"
},
"csv_monitor": {
"enabled": true,
"output_path": "output/ds_logs/",
Binary file added docs/assets/images/comet_monitor.png
5 changes: 5 additions & 0 deletions docs/code-docs/source/monitor.rst
@@ -29,6 +29,11 @@ WandB
.. _WandbConfig:
.. autopydantic_model:: deepspeed.monitor.config.WandbConfig

Comet
-----
.. _CometConfig:
.. autopydantic_model:: deepspeed.monitor.config.CometConfig

CSV Monitor
-----------
.. _CSVConfig:
1 change: 1 addition & 0 deletions requirements/requirements-dev.txt
@@ -1,5 +1,6 @@
accelerate
clang-format==16.0.2
comet_ml>=3.41.0
deepspeed-kernels ; sys_platform == 'linux'
docutils<0.18
future