Commit 050ede6

Enhance testing: Skip fused_optimizer tests if not supported by the accelerator.

Added a condition check to skip fused_optimizer tests when FusedAdam and FusedLamb are not supported by the accelerator. The affected tests are now skipped on hardware configurations that do not provide these fused optimizers, preventing spurious failures.

Details:
- Introduced a condition check on deepspeed.ops.__compatible_ops__ to determine support for FusedAdam and FusedLamb (a minimal sketch of the pattern follows this list).
- If not supported, fused_optimizer tests are skipped to improve test reliability.
- Improved compatibility and stability across different hardware configurations.
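
A minimal sketch of the pattern, assuming only names that appear in the diff below (deepspeed.ops.__compatible_ops__, FusedAdamBuilder, FusedLambBuilder); the test body and test name are placeholders, not code from this commit:

import pytest
import deepspeed
from deepspeed.ops.op_builder import FusedAdamBuilder, FusedLambBuilder

# Module-level guard: skip every test in the file when the fused op was not
# built for this accelerator (pattern already present in test_elastic.py).
if not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]:
    pytest.skip("This op had not been implemented on this system.", allow_module_level=True)


# Per-test guard: the test is collected but reported as skipped when the
# accelerator cannot build FusedLamb (the pattern added throughout this commit).
@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME],
                    reason="lamb is not compatible")
def test_uses_fused_lamb():  # placeholder test
    ...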
vshekhawat-hlab committed Feb 19, 2024
1 parent a37e59b commit 050ede6
Showing 7 changed files with 31 additions and 4 deletions.
4 changes: 4 additions & 0 deletions tests/unit/compression/test_compression.py
@@ -6,6 +6,7 @@
import torch
import pytest
import random
import deepspeed
import numpy as np
from unit.megatron_model import get_gpt2_model
from deepspeed.compression.compress import init_compression
@@ -16,6 +17,7 @@
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.utils import required_torch_version
from unit.common import DistributedTest
from deepspeed.ops.op_builder import FusedLambBuilder

pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5),
reason='Megatron-LM package requires Pytorch version 1.5 or above')
@@ -216,6 +218,7 @@ def get_ds_config(self):

return ds_config_dict

@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
def test_linear_layer_compress(self, tmpdir):
model = create_bert_model()
compressed_model = init_compression(model, self.get_ds_config())
@@ -225,6 +228,7 @@ def test_linear_layer_compress(self, tmpdir):
assert isinstance(compressed_model.layer[0].attention.self.value, LinearLayer_Compress)

@pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.")
@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
def test_mpu_compress(self, tmpdir):
if not required_torch_version(max_version=1.13):
pytest.skip("megatron not compatible with torch >1.13")
5 changes: 4 additions & 1 deletion tests/unit/elasticity/test_elastic.py
@@ -9,7 +9,7 @@
from deepspeed.git_version_info import version as ds_version
import os
from unit.simple_model import SimpleModel
from deepspeed.ops.op_builder import FusedAdamBuilder
from deepspeed.ops.op_builder import FusedAdamBuilder, FusedLambBuilder

if not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]:
pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
@@ -150,6 +150,7 @@ def test_proper_mbsz(ds_config):
class TestNonElasticBatchParams(DistributedTest):
world_size = 2

@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
def test(self):
config_dict = {
"train_batch_size": 2,
@@ -182,6 +183,7 @@ def test(self):
class TestNonElasticBatchParamsWithOverride(DistributedTest):
world_size = 2

@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
def test(self):
config_dict = {
"train_batch_size": 2,
@@ -213,6 +215,7 @@ def test(self):
class TestElasticConfigChanged(DistributedTest):
world_size = 2

@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
def test(self):
config_dict = {
"train_batch_size": 2,
4 changes: 3 additions & 1 deletion tests/unit/ops/adam/test_cpu_adam.py
@@ -11,7 +11,7 @@
import deepspeed
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.adam import FusedAdam
from deepspeed.ops.op_builder import CPUAdamBuilder
from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder
from unit.common import DistributedTest

if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
@@ -62,6 +62,8 @@ class TestCPUAdam(DistributedTest):
set_dist_env = False

@pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.")
@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME],
reason="FusedAdam is not compatible")
def test_fused_adam_equal(self, dtype, model_size):
if ("amd" in pytest.cpu_vendor) and (dtype == torch.half):
pytest.skip("cpu-adam with half precision not supported on AMD CPUs")
4 changes: 3 additions & 1 deletion tests/unit/ops/adam/test_hybrid_adam.py
@@ -12,7 +12,7 @@
import deepspeed
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.adam import FusedAdam, DeepSpeedCPUAdam
from deepspeed.ops.op_builder import CPUAdamBuilder
from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder
from unit.common import DistributedTest

if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
@@ -43,6 +43,8 @@ class TestHybridAdam(DistributedTest):
set_dist_env = False

@pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.")
@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME],
reason="FusedAdam is not compatible")
def test_hybrid_adam_equal(self, dtype, model_size):
if ("amd" in pytest.cpu_vendor) and (dtype == torch.half):
pytest.skip("cpu-adam with half precision not supported on AMD CPUs")
3 changes: 3 additions & 0 deletions tests/unit/runtime/half_precision/test_dynamic_loss_scale.py
@@ -4,10 +4,12 @@
# DeepSpeed Team

import torch
import pytest
import deepspeed
import numpy as np
from unit.common import DistributedTest
from unit.simple_model import SimpleModel
from deepspeed.ops.op_builder import FusedLambBuilder


def run_model_step(model, gradient_list):
@@ -143,6 +145,7 @@ def test_some_overflow(self):
assert optim.cur_iter == expected_iteration


@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
class TestUnfused(DistributedTest):
world_size = 1

9 changes: 8 additions & 1 deletion tests/unit/runtime/half_precision/test_fp16.py
@@ -12,7 +12,7 @@
from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader
from deepspeed.runtime.utils import required_torch_version
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import CPUAdamBuilder
from deepspeed.ops.op_builder import CPUAdamBuilder, FusedLambBuilder

try:
from apex import amp # noqa: F401 # type: ignore
@@ -21,7 +21,11 @@
_amp_available = False
amp_available = pytest.mark.skipif(not _amp_available, reason="apex/amp is not installed")

if torch.half not in get_accelerator().supported_dtypes():
pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True)


@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
class TestLambFP32GradClip(DistributedTest):
world_size = 2

@@ -52,6 +56,7 @@ def test(self):
model.step()


@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
class TestLambFP16(DistributedTest):
world_size = 2

@@ -216,6 +221,7 @@ def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True)
engine.backward(loss)
engine.step()

@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
@pytest.mark.parametrize("fused_lamb_legacy", [(False), (True)])
def test_lamb_gradnorm(self, monkeypatch, fused_lamb_legacy: bool):
if not required_torch_version(min_version=1.8):
@@ -466,6 +472,7 @@ def test_adam_basic(self):
model.backward(loss)
model.step()

@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
def test_lamb_basic(self):
config_dict = {
"train_batch_size": 2,
6 changes: 6 additions & 0 deletions tests/unit/runtime/test_ds_initialize.py
@@ -18,6 +18,7 @@
from deepspeed.runtime.lr_schedules import WARMUP_LR, WarmupLR
from deepspeed.runtime.config import ADAM_OPTIMIZER
from deepspeed.runtime.utils import see_memory_usage, required_torch_version
from deepspeed.ops.op_builder import FusedAdamBuilder


@pytest.mark.parametrize('zero_stage', [0, 3])
@@ -68,6 +69,9 @@ def test(self, optimizer_type):
def _optimizer_callable(params) -> Optimizer:
return AdamW(params=params)

if (optimizer_type is None) and (not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]):
pytest.skip("FusedAdam is not compatible")

hidden_dim = 10
model = SimpleModel(hidden_dim)

Expand Down Expand Up @@ -96,6 +100,8 @@ def _optimizer_callable(params) -> Optimizer:
class TestConfigOptimizer(DistributedTest):
world_size = 1

@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME],
reason="FusedAdam is not compatible")
def test(self, client_parameters):
ds_config = {"train_batch_size": 1, "optimizer": {"type": "Adam", "params": {"lr": 0.001}}}

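Where compatibility only matters for some parameter values, the commit skips at run time inside the test rather than via a decorator, as in tests/unit/runtime/test_ds_initialize.py above. A hedged sketch of that shape, with an illustrative test name and parametrization:

import pytest
import deepspeed
from deepspeed.ops.op_builder import FusedAdamBuilder

@pytest.mark.parametrize("optimizer_type", [None, "adamw"])  # illustrative values
def test_optimizer_selection(optimizer_type):
    # Only the default path (optimizer_type is None) relies on DeepSpeed's
    # FusedAdam, so the skip is conditioned on the parameter instead of being
    # applied to the whole test.
    if optimizer_type is None and not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]:
        pytest.skip("FusedAdam is not compatible")
    ...  # rest of the test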
