Merge branch 'stateful-dataloader' of https://github.com/byi8220/acce…

…lerate into stateful-dataloader
byi8220 · Jul 15, 2024 · d264939 · d264939
2 parents 59738f4 + e4e1cac
commit d264939
Show file tree

Hide file tree

Showing 24 changed files with 213 additions and 20 deletions.
diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
@@ -86,6 +86,7 @@
     is_megatron_lm_available,
     is_mlu_available,
     is_msamp_available,
+    is_musa_available,
     is_npu_available,
     is_torch_version,
     is_torch_xla_available,
@@ -293,6 +294,9 @@ def __init__(
             if is_mlu_available():
                 if compare_versions("deepspeed-mlu", "<", "0.10.1"):
                     raise ImportError("DeepSpeed MLU version must be >= 0.10.1. Please update DeepSpeed MLU.")
+            elif is_musa_available():
+                if compare_versions("deepspeed", ">", "0.14.3"):
+                    raise ImportError("DeepSpeed MUSA version must be <= 0.14.3. Please downgrade DeepSpeed.")
             elif compare_versions("deepspeed", "<", "0.9.3"):
                 raise ImportError("DeepSpeed version must be >= 0.9.3. Please update DeepSpeed.")
 
@@ -461,7 +465,7 @@ def __init__(
             and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
         ):
             self.native_amp = True
-            if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu") or is_torch_xla_available(
+            if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu", "musa") or is_torch_xla_available(
                 check_is_tpu=True
             ):
                 raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
@@ -474,6 +478,8 @@ def __init__(
                 self.scaler = xamp.GradScaler(**kwargs)
             elif is_mlu_available():
                 self.scaler = torch.mlu.amp.GradScaler(**kwargs)
+            elif is_musa_available():
+                self.scalar = torch.musa.amp.GradScaler(**kwargs)
             elif is_npu_available():
                 self.scaler = torch.npu.amp.GradScaler(**kwargs)
             elif is_xpu_available():
@@ -1122,6 +1128,7 @@ def join_uneven_inputs(self, joinables, even_batches=None):
             DistributedType.MULTI_GPU,
             DistributedType.MULTI_NPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_MUSA,
             DistributedType.MULTI_XPU,
         ):
             dl_even_batches_values = []
@@ -1434,6 +1441,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             if self.distributed_type in (
                 DistributedType.MULTI_GPU,
                 DistributedType.MULTI_MLU,
+                DistributedType.MULTI_MUSA,
                 DistributedType.MULTI_NPU,
                 DistributedType.MULTI_XPU,
             ):
@@ -3132,6 +3140,7 @@ def _inner(folder):
             if self.num_processes > 1 and self.distributed_type in (
                 DistributedType.MULTI_GPU,
                 DistributedType.MULTI_MLU,
+                DistributedType.MULTI_MUSA,
                 DistributedType.MULTI_NPU,
             ):
                 map_location = "on_device"

diff --git a/src/accelerate/big_modeling.py b/src/accelerate/big_modeling.py
@@ -38,6 +38,7 @@
     get_balanced_memory,
     infer_auto_device_map,
     is_mlu_available,
+    is_musa_available,
     is_npu_available,
     is_torch_version,
     is_xpu_available,
@@ -463,6 +464,8 @@ def wrapper(*args, **kwargs):
             model.npu = add_warning(model.npu, model)
         elif is_mlu_available():
             model.mlu = add_warning(model.mlu, model)
+        elif is_musa_available():
+            model.musa = add_warning(model.musa, model)
         elif is_xpu_available():
             model.xpu = add_warning(model.xpu, model)
         else:
@@ -483,6 +486,8 @@ def wrapper(*args, **kwargs):
             device = f"npu:{device}"
         elif is_mlu_available() and isinstance(device, int):
             device = f"mlu:{device}"
+        elif is_musa_available() and isinstance(device, int):
+            device = f"musa:{device}"
         elif is_xpu_available() and isinstance(device, int):
             device = f"xpu:{device}"
         if device != "disk":

diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py
@@ -22,6 +22,7 @@
     is_deepspeed_available,
     is_mlu_available,
     is_mps_available,
+    is_musa_available,
     is_npu_available,
     is_transformers_available,
     is_xpu_available,
@@ -49,7 +50,16 @@
 def get_cluster_input():
     distributed_type = _ask_options(
         "Which type of machine are you using?",
-        ["No distributed training", "multi-CPU", "multi-XPU", "multi-GPU", "multi-NPU", "multi-MLU", "TPU"],
+        [
+            "No distributed training",
+            "multi-CPU",
+            "multi-XPU",
+            "multi-GPU",
+            "multi-NPU",
+            "multi-MLU",
+            "multi-MUSA",
+            "TPU",
+        ],
         _convert_distributed_mode,
     )
 
@@ -66,6 +76,7 @@ def get_cluster_input():
     if distributed_type in [
         DistributedType.MULTI_GPU,
         DistributedType.MULTI_MLU,
+        DistributedType.MULTI_MUSA,
         DistributedType.MULTI_NPU,
         DistributedType.MULTI_XPU,
         DistributedType.MULTI_CPU,
@@ -145,7 +156,13 @@ def get_cluster_input():
         not use_cpu
         and is_xpu_available()
         and distributed_type
-        not in [DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.MULTI_MLU, DistributedType.XLA]
+        not in [
+            DistributedType.MULTI_GPU,
+            DistributedType.MULTI_NPU,
+            DistributedType.MULTI_MLU,
+            DistributedType.XLA,
+            DistributedType.MULTI_MUSA,
+        ]
     ):
         ipex_config["use_xpu"] = _ask_field(
             "Do you want to use XPU plugin to speed up training on XPU? [yes/NO]:",
@@ -205,6 +222,7 @@ def get_cluster_input():
             DistributedType.MULTI_XPU,
             DistributedType.MULTI_NPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_MUSA,
             DistributedType.NO,
         ]
         and not use_mps
@@ -358,6 +376,7 @@ def get_cluster_input():
         DistributedType.MULTI_GPU,
         DistributedType.MULTI_NPU,
         DistributedType.MULTI_MLU,
+        DistributedType.MULTI_MUSA,
         DistributedType.MULTI_XPU,
     ]:
         use_fsdp = _ask_field(
@@ -529,6 +548,7 @@ def get_cluster_input():
         DistributedType.MULTI_XPU,
         DistributedType.MULTI_GPU,
         DistributedType.MULTI_MLU,
+        DistributedType.MULTI_MUSA,
         DistributedType.MULTI_NPU,
         DistributedType.XLA,
     ]:
@@ -565,6 +585,7 @@ def get_cluster_input():
         in [
             DistributedType.MULTI_GPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_MUSA,
             DistributedType.MULTI_NPU,
             DistributedType.MULTI_XPU,
             DistributedType.NO,
@@ -576,6 +597,8 @@ def get_cluster_input():
             machine_type = "NPU(s)"
         elif is_mlu_available():
             machine_type = "MLU(s)"
+        elif is_musa_available():
+            machine_type = "MUSA(s)"
         else:
             machine_type = "GPU(s)"
         gpu_ids = _ask_field(

diff --git a/src/accelerate/commands/config/config_utils.py b/src/accelerate/commands/config/config_utils.py
@@ -70,7 +70,9 @@ def _convert_compute_environment(value):
 
 def _convert_distributed_mode(value):
     value = int(value)
-    return DistributedType(["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "MULTI_MLU", "XLA"][value])
+    return DistributedType(
+        ["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "MULTI_MLU", "MULTI_MUSA", "XLA"][value]
+    )
 
 
 def _convert_dynamo_backend(value):

diff --git a/src/accelerate/commands/config/default.py b/src/accelerate/commands/config/default.py
@@ -18,7 +18,7 @@
 
 import torch
 
-from ...utils import is_mlu_available, is_npu_available, is_xpu_available
+from ...utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available
 from .config_args import ClusterConfig, default_json_config_file
 from .config_utils import SubcommandHelpFormatter
 
@@ -65,6 +65,14 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
             config["distributed_type"] = "MULTI_MLU"
         else:
             config["distributed_type"] = "NO"
+    elif is_musa_available():
+        num_musas = torch.musa.device_count()
+        config["num_processes"] = num_musas
+        config["use_cpu"] = False
+        if num_musas > 1:
+            config["distributed_type"] = "MULTI_MUSA"
+        else:
+            config["distributed_type"] = "NO"
     elif torch.cuda.is_available():
         num_gpus = torch.cuda.device_count()
         config["num_processes"] = num_gpus

diff --git a/src/accelerate/commands/env.py b/src/accelerate/commands/env.py
@@ -26,7 +26,7 @@
 from accelerate import __version__ as version
 from accelerate.commands.config import default_config_file, load_config_from_file
 
-from ..utils import is_mlu_available, is_npu_available, is_xpu_available
+from ..utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available
 
 
 def env_command_parser(subparsers=None):
@@ -49,6 +49,7 @@ def env_command(args):
     pt_cuda_available = torch.cuda.is_available()
     pt_xpu_available = is_xpu_available()
     pt_mlu_available = is_mlu_available()
+    pt_musa_available = is_musa_available()
     pt_npu_available = is_npu_available()
 
     accelerate_config = "Not found"
@@ -75,6 +76,7 @@ def env_command(args):
         "PyTorch XPU available": str(pt_xpu_available),
         "PyTorch NPU available": str(pt_npu_available),
         "PyTorch MLU available": str(pt_mlu_available),
+        "PyTorch MUSA available": str(pt_musa_available),
         "System RAM": f"{psutil.virtual_memory().total / 1024 ** 3:.2f} GB",
     }
     if pt_cuda_available:

diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py
@@ -40,6 +40,7 @@
     is_bf16_available,
     is_deepspeed_available,
     is_mlu_available,
+    is_musa_available,
     is_npu_available,
     is_rich_available,
     is_sagemaker_available,
@@ -934,6 +935,7 @@ def _validate_launch_command(args):
                     DistributedType.MULTI_GPU,
                     DistributedType.MULTI_NPU,
                     DistributedType.MULTI_MLU,
+                    DistributedType.MULTI_MUSA,
                     DistributedType.MULTI_XPU,
                 )
                 else False
@@ -1013,18 +1015,25 @@ def _validate_launch_command(args):
                 args.num_processes = torch.xpu.device_count()
             elif is_mlu_available():
                 args.num_processes = torch.mlu.device_count()
+            elif is_musa_available():
+                args.num_processes = torch.musa.device_count()
             elif is_npu_available():
                 args.num_processes = torch.npu.device_count()
             else:
                 args.num_processes = torch.cuda.device_count()
             warned.append(f"\t`--num_processes` was set to a value of `{args.num_processes}`")
         if args.debug is None:
             args.debug = False
-        if not args.multi_gpu and (
-            (args.use_xpu and is_xpu_available() and torch.xpu.device_count() > 1)
-            or (is_mlu_available() and torch.mlu.device_count() > 1)
-            or (is_npu_available() and torch.npu.device_count() > 1)
-            or (torch.cuda.device_count() > 1)
+        if (
+            not args.multi_gpu
+            and args.num_processes > 1
+            and (
+                (args.use_xpu and is_xpu_available() and torch.xpu.device_count() > 1)
+                or (is_mlu_available() and torch.mlu.device_count() > 1)
+                or (is_musa_available() and torch.musa.device_count() > 1)
+                or (is_npu_available() and torch.npu.device_count() > 1)
+                or (torch.cuda.device_count() > 1)
+            )
         ):
             warned.append(
                 "\t\tMore than one GPU was found, enabling multi-GPU training.\n"

diff --git a/src/accelerate/hooks.py b/src/accelerate/hooks.py
@@ -30,7 +30,7 @@
 from .utils.other import recursive_getattr
 
 
-_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu"]
+_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "musa"]
 
 
 class ModelHook:

diff --git a/src/accelerate/local_sgd.py b/src/accelerate/local_sgd.py
@@ -70,6 +70,7 @@ def __init__(self, accelerator: Accelerator, model: torch.nn.Module, local_sgd_s
             DistributedType.MULTI_CPU,
             DistributedType.MULTI_GPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_MUSA,
             DistributedType.MULTI_NPU,
         ]:
             raise NotImplementedError("LocalSGD is supported only for CPUs and GPUs (no DeepSpeed or MegatronLM)")

diff --git a/src/accelerate/state.py b/src/accelerate/state.py
@@ -40,6 +40,7 @@
     is_ipex_available,
     is_mlu_available,
     is_mps_available,
+    is_musa_available,
     is_npu_available,
     is_torch_xla_available,
     is_xpu_available,
@@ -56,6 +57,9 @@
 if is_mlu_available(check_device=False):
     import torch_mlu  # noqa: F401
 
+if is_musa_available(check_device=False):
+    import torch_musa  # noqa: F401
+
 if is_npu_available(check_device=False):
     import torch_npu  # noqa: F401
 
@@ -369,6 +373,7 @@ def wait_for_everyone(self):
         if self.distributed_type in (
             DistributedType.MULTI_GPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_MUSA,
             DistributedType.MULTI_NPU,
             DistributedType.MULTI_XPU,
             DistributedType.MULTI_CPU,
@@ -688,6 +693,7 @@ def default_device(self) -> torch.device:
         - MPS if `torch.backends.mps.is_available()` and `torch.backends.mps.is_built()` both return True.
         - CUDA if `torch.cuda.is_available()`
         - MLU if `is_mlu_available()`
+        - MUSA if `is_musa_available()`
         - NPU if `is_npu_available()`
         - CPU otherwise
         """
@@ -696,6 +702,8 @@ def default_device(self) -> torch.device:
             return torch.device("mps")
         elif is_mlu_available():
             return torch.device("mlu")
+        elif is_musa_available():
+            return torch.device("musa")
         elif torch.cuda.is_available():
             return torch.device("cuda")
         elif is_xpu_available():
@@ -722,6 +730,9 @@ def _prepare_backend(
             if is_mlu_available():
                 backend = "cncl"
                 distributed_type = DistributedType.MULTI_MLU
+            elif is_musa_available():
+                backend = "mccl"
+                distributed_type = DistributedType.MULTI_MUSA
             elif torch.cuda.is_available():
                 if backend is None:
                     backend = "nccl"
@@ -769,7 +780,7 @@ def set_device(self):
             self.device = torch.device("cpu") if self._cpu else self.default_device
             return
         device = str(self.distributed_type).split(".")[-1].replace("MULTI_", "").lower()
-        if device not in ("cpu", "gpu", "mlu", "npu", "xpu", "xla"):
+        if device not in ("cpu", "gpu", "mlu", "musa", "npu", "xpu", "xla"):
             raise ValueError(
                 f"Can't set device for {self.distributed_type} ({device}), verify we should be calling `_set_device()` for it!"
             )
@@ -784,6 +795,8 @@ def set_device(self):
                 torch.xpu.set_device(self.device)
             elif device == "mlu":
                 torch.mlu.set_device(self.device)
+            elif device == "musa":
+                torch.musa.set_device(self.device)
             elif device == "npu":
                 torch.npu.set_device(self.device)
             elif device == "cuda":
@@ -894,6 +907,7 @@ def __init__(
             elif self.distributed_type in [
                 DistributedType.MULTI_GPU,
                 DistributedType.MULTI_MLU,
+                DistributedType.MULTI_MUSA,
                 DistributedType.MULTI_NPU,
                 DistributedType.MULTI_XPU,
             ]:
@@ -920,6 +934,12 @@ def __init__(
                 and self.device.type == "cuda"
             ):
                 torch.backends.cuda.matmul.allow_tf32 = True
+            if (
+                self.dynamo_plugin.backend != DynamoBackend.NO
+                and self._mixed_precision == "no"
+                and self.device.type == "musa"
+            ):
+                torch.backends.musa.matmul.allow_tf32 = True
             PartialState._shared_state["distributed_type"] = self.distributed_type
 
     @property

diff --git a/src/accelerate/test_utils/__init__.py b/src/accelerate/test_utils/__init__.py
@@ -29,6 +29,7 @@
     require_multi_device,
     require_multi_gpu,
     require_multi_xpu,
+    require_musa,
     require_non_cpu,
     require_non_torch_xla,
     require_non_xpu,