Skip to content

Commit

Permalink
Merge branch 'stateful-dataloader' of https://github.com/byi8220/acce…
Browse files Browse the repository at this point in the history
…lerate into stateful-dataloader
  • Loading branch information
byi8220 committed Jul 15, 2024
2 parents 59738f4 + e4e1cac commit d264939
Show file tree
Hide file tree
Showing 24 changed files with 213 additions and 20 deletions.
11 changes: 10 additions & 1 deletion src/accelerate/accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
is_megatron_lm_available,
is_mlu_available,
is_msamp_available,
is_musa_available,
is_npu_available,
is_torch_version,
is_torch_xla_available,
Expand Down Expand Up @@ -293,6 +294,9 @@ def __init__(
if is_mlu_available():
if compare_versions("deepspeed-mlu", "<", "0.10.1"):
raise ImportError("DeepSpeed MLU version must be >= 0.10.1. Please update DeepSpeed MLU.")
elif is_musa_available():
if compare_versions("deepspeed", ">", "0.14.3"):
raise ImportError("DeepSpeed MUSA version must be <= 0.14.3. Please downgrade DeepSpeed.")
elif compare_versions("deepspeed", "<", "0.9.3"):
raise ImportError("DeepSpeed version must be >= 0.9.3. Please update DeepSpeed.")

Expand Down Expand Up @@ -461,7 +465,7 @@ def __init__(
and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
):
self.native_amp = True
if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu") or is_torch_xla_available(
if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu", "musa") or is_torch_xla_available(
check_is_tpu=True
):
raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
Expand All @@ -474,6 +478,8 @@ def __init__(
self.scaler = xamp.GradScaler(**kwargs)
elif is_mlu_available():
self.scaler = torch.mlu.amp.GradScaler(**kwargs)
elif is_musa_available():
self.scalar = torch.musa.amp.GradScaler(**kwargs)
elif is_npu_available():
self.scaler = torch.npu.amp.GradScaler(**kwargs)
elif is_xpu_available():
Expand Down Expand Up @@ -1122,6 +1128,7 @@ def join_uneven_inputs(self, joinables, even_batches=None):
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_XPU,
):
dl_even_batches_values = []
Expand Down Expand Up @@ -1434,6 +1441,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
if self.distributed_type in (
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
):
Expand Down Expand Up @@ -3132,6 +3140,7 @@ def _inner(folder):
if self.num_processes > 1 and self.distributed_type in (
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
):
map_location = "on_device"
Expand Down
5 changes: 5 additions & 0 deletions src/accelerate/big_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
get_balanced_memory,
infer_auto_device_map,
is_mlu_available,
is_musa_available,
is_npu_available,
is_torch_version,
is_xpu_available,
Expand Down Expand Up @@ -463,6 +464,8 @@ def wrapper(*args, **kwargs):
model.npu = add_warning(model.npu, model)
elif is_mlu_available():
model.mlu = add_warning(model.mlu, model)
elif is_musa_available():
model.musa = add_warning(model.musa, model)
elif is_xpu_available():
model.xpu = add_warning(model.xpu, model)
else:
Expand All @@ -483,6 +486,8 @@ def wrapper(*args, **kwargs):
device = f"npu:{device}"
elif is_mlu_available() and isinstance(device, int):
device = f"mlu:{device}"
elif is_musa_available() and isinstance(device, int):
device = f"musa:{device}"
elif is_xpu_available() and isinstance(device, int):
device = f"xpu:{device}"
if device != "disk":
Expand Down
27 changes: 25 additions & 2 deletions src/accelerate/commands/config/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
is_deepspeed_available,
is_mlu_available,
is_mps_available,
is_musa_available,
is_npu_available,
is_transformers_available,
is_xpu_available,
Expand Down Expand Up @@ -49,7 +50,16 @@
def get_cluster_input():
distributed_type = _ask_options(
"Which type of machine are you using?",
["No distributed training", "multi-CPU", "multi-XPU", "multi-GPU", "multi-NPU", "multi-MLU", "TPU"],
[
"No distributed training",
"multi-CPU",
"multi-XPU",
"multi-GPU",
"multi-NPU",
"multi-MLU",
"multi-MUSA",
"TPU",
],
_convert_distributed_mode,
)

Expand All @@ -66,6 +76,7 @@ def get_cluster_input():
if distributed_type in [
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
DistributedType.MULTI_CPU,
Expand Down Expand Up @@ -145,7 +156,13 @@ def get_cluster_input():
not use_cpu
and is_xpu_available()
and distributed_type
not in [DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.MULTI_MLU, DistributedType.XLA]
not in [
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
DistributedType.MULTI_MLU,
DistributedType.XLA,
DistributedType.MULTI_MUSA,
]
):
ipex_config["use_xpu"] = _ask_field(
"Do you want to use XPU plugin to speed up training on XPU? [yes/NO]:",
Expand Down Expand Up @@ -205,6 +222,7 @@ def get_cluster_input():
DistributedType.MULTI_XPU,
DistributedType.MULTI_NPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.NO,
]
and not use_mps
Expand Down Expand Up @@ -358,6 +376,7 @@ def get_cluster_input():
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_XPU,
]:
use_fsdp = _ask_field(
Expand Down Expand Up @@ -529,6 +548,7 @@ def get_cluster_input():
DistributedType.MULTI_XPU,
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.XLA,
]:
Expand Down Expand Up @@ -565,6 +585,7 @@ def get_cluster_input():
in [
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
DistributedType.NO,
Expand All @@ -576,6 +597,8 @@ def get_cluster_input():
machine_type = "NPU(s)"
elif is_mlu_available():
machine_type = "MLU(s)"
elif is_musa_available():
machine_type = "MUSA(s)"
else:
machine_type = "GPU(s)"
gpu_ids = _ask_field(
Expand Down
4 changes: 3 additions & 1 deletion src/accelerate/commands/config/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ def _convert_compute_environment(value):

def _convert_distributed_mode(value):
value = int(value)
return DistributedType(["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "MULTI_MLU", "XLA"][value])
return DistributedType(
["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "MULTI_MLU", "MULTI_MUSA", "XLA"][value]
)


def _convert_dynamo_backend(value):
Expand Down
10 changes: 9 additions & 1 deletion src/accelerate/commands/config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import torch

from ...utils import is_mlu_available, is_npu_available, is_xpu_available
from ...utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available
from .config_args import ClusterConfig, default_json_config_file
from .config_utils import SubcommandHelpFormatter

Expand Down Expand Up @@ -65,6 +65,14 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
config["distributed_type"] = "MULTI_MLU"
else:
config["distributed_type"] = "NO"
elif is_musa_available():
num_musas = torch.musa.device_count()
config["num_processes"] = num_musas
config["use_cpu"] = False
if num_musas > 1:
config["distributed_type"] = "MULTI_MUSA"
else:
config["distributed_type"] = "NO"
elif torch.cuda.is_available():
num_gpus = torch.cuda.device_count()
config["num_processes"] = num_gpus
Expand Down
4 changes: 3 additions & 1 deletion src/accelerate/commands/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from accelerate import __version__ as version
from accelerate.commands.config import default_config_file, load_config_from_file

from ..utils import is_mlu_available, is_npu_available, is_xpu_available
from ..utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available


def env_command_parser(subparsers=None):
Expand All @@ -49,6 +49,7 @@ def env_command(args):
pt_cuda_available = torch.cuda.is_available()
pt_xpu_available = is_xpu_available()
pt_mlu_available = is_mlu_available()
pt_musa_available = is_musa_available()
pt_npu_available = is_npu_available()

accelerate_config = "Not found"
Expand All @@ -75,6 +76,7 @@ def env_command(args):
"PyTorch XPU available": str(pt_xpu_available),
"PyTorch NPU available": str(pt_npu_available),
"PyTorch MLU available": str(pt_mlu_available),
"PyTorch MUSA available": str(pt_musa_available),
"System RAM": f"{psutil.virtual_memory().total / 1024 ** 3:.2f} GB",
}
if pt_cuda_available:
Expand Down
19 changes: 14 additions & 5 deletions src/accelerate/commands/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
is_bf16_available,
is_deepspeed_available,
is_mlu_available,
is_musa_available,
is_npu_available,
is_rich_available,
is_sagemaker_available,
Expand Down Expand Up @@ -934,6 +935,7 @@ def _validate_launch_command(args):
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_XPU,
)
else False
Expand Down Expand Up @@ -1013,18 +1015,25 @@ def _validate_launch_command(args):
args.num_processes = torch.xpu.device_count()
elif is_mlu_available():
args.num_processes = torch.mlu.device_count()
elif is_musa_available():
args.num_processes = torch.musa.device_count()
elif is_npu_available():
args.num_processes = torch.npu.device_count()
else:
args.num_processes = torch.cuda.device_count()
warned.append(f"\t`--num_processes` was set to a value of `{args.num_processes}`")
if args.debug is None:
args.debug = False
if not args.multi_gpu and (
(args.use_xpu and is_xpu_available() and torch.xpu.device_count() > 1)
or (is_mlu_available() and torch.mlu.device_count() > 1)
or (is_npu_available() and torch.npu.device_count() > 1)
or (torch.cuda.device_count() > 1)
if (
not args.multi_gpu
and args.num_processes > 1
and (
(args.use_xpu and is_xpu_available() and torch.xpu.device_count() > 1)
or (is_mlu_available() and torch.mlu.device_count() > 1)
or (is_musa_available() and torch.musa.device_count() > 1)
or (is_npu_available() and torch.npu.device_count() > 1)
or (torch.cuda.device_count() > 1)
)
):
warned.append(
"\t\tMore than one GPU was found, enabling multi-GPU training.\n"
Expand Down
2 changes: 1 addition & 1 deletion src/accelerate/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from .utils.other import recursive_getattr


_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu"]
_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "musa"]


class ModelHook:
Expand Down
1 change: 1 addition & 0 deletions src/accelerate/local_sgd.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def __init__(self, accelerator: Accelerator, model: torch.nn.Module, local_sgd_s
DistributedType.MULTI_CPU,
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
]:
raise NotImplementedError("LocalSGD is supported only for CPUs and GPUs (no DeepSpeed or MegatronLM)")
Expand Down
22 changes: 21 additions & 1 deletion src/accelerate/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
is_ipex_available,
is_mlu_available,
is_mps_available,
is_musa_available,
is_npu_available,
is_torch_xla_available,
is_xpu_available,
Expand All @@ -56,6 +57,9 @@
if is_mlu_available(check_device=False):
import torch_mlu # noqa: F401

if is_musa_available(check_device=False):
import torch_musa # noqa: F401

if is_npu_available(check_device=False):
import torch_npu # noqa: F401

Expand Down Expand Up @@ -369,6 +373,7 @@ def wait_for_everyone(self):
if self.distributed_type in (
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
DistributedType.MULTI_CPU,
Expand Down Expand Up @@ -688,6 +693,7 @@ def default_device(self) -> torch.device:
- MPS if `torch.backends.mps.is_available()` and `torch.backends.mps.is_built()` both return True.
- CUDA if `torch.cuda.is_available()`
- MLU if `is_mlu_available()`
- MUSA if `is_musa_available()`
- NPU if `is_npu_available()`
- CPU otherwise
"""
Expand All @@ -696,6 +702,8 @@ def default_device(self) -> torch.device:
return torch.device("mps")
elif is_mlu_available():
return torch.device("mlu")
elif is_musa_available():
return torch.device("musa")
elif torch.cuda.is_available():
return torch.device("cuda")
elif is_xpu_available():
Expand All @@ -722,6 +730,9 @@ def _prepare_backend(
if is_mlu_available():
backend = "cncl"
distributed_type = DistributedType.MULTI_MLU
elif is_musa_available():
backend = "mccl"
distributed_type = DistributedType.MULTI_MUSA
elif torch.cuda.is_available():
if backend is None:
backend = "nccl"
Expand Down Expand Up @@ -769,7 +780,7 @@ def set_device(self):
self.device = torch.device("cpu") if self._cpu else self.default_device
return
device = str(self.distributed_type).split(".")[-1].replace("MULTI_", "").lower()
if device not in ("cpu", "gpu", "mlu", "npu", "xpu", "xla"):
if device not in ("cpu", "gpu", "mlu", "musa", "npu", "xpu", "xla"):
raise ValueError(
f"Can't set device for {self.distributed_type} ({device}), verify we should be calling `_set_device()` for it!"
)
Expand All @@ -784,6 +795,8 @@ def set_device(self):
torch.xpu.set_device(self.device)
elif device == "mlu":
torch.mlu.set_device(self.device)
elif device == "musa":
torch.musa.set_device(self.device)
elif device == "npu":
torch.npu.set_device(self.device)
elif device == "cuda":
Expand Down Expand Up @@ -894,6 +907,7 @@ def __init__(
elif self.distributed_type in [
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
]:
Expand All @@ -920,6 +934,12 @@ def __init__(
and self.device.type == "cuda"
):
torch.backends.cuda.matmul.allow_tf32 = True
if (
self.dynamo_plugin.backend != DynamoBackend.NO
and self._mixed_precision == "no"
and self.device.type == "musa"
):
torch.backends.musa.matmul.allow_tf32 = True
PartialState._shared_state["distributed_type"] = self.distributed_type

@property
Expand Down
1 change: 1 addition & 0 deletions src/accelerate/test_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
require_multi_device,
require_multi_gpu,
require_multi_xpu,
require_musa,
require_non_cpu,
require_non_torch_xla,
require_non_xpu,
Expand Down
Loading

0 comments on commit d264939

Please sign in to comment.