Skip to content

Commit

Permalink
Refactor PT 2.1 tests (#3762)
Browse files Browse the repository at this point in the history
* Minimize PT 2.1 test bins

* update config

* stop spurious ec2 benchmark jobs

* skip clarify

* Update dlc_developer_config.toml

* Update dlc_developer_config.toml
  • Loading branch information
arjkesh authored Mar 15, 2024
1 parent d6374f4 commit 3b23162
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 4 deletions.
2 changes: 1 addition & 1 deletion src/start_testbuilds.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def is_test_job_enabled(test_type):
return True
if test_type == constants.EC2_TESTS and config.is_ec2_test_enabled():
return True
if test_type == constants.EC2_BENCHMARK_TESTS and config.is_ec2_benchmark_test_enabled:
if test_type == constants.EC2_BENCHMARK_TESTS and config.is_ec2_benchmark_test_enabled():
return True
if test_type == constants.ECS_TESTS and config.is_ecs_test_enabled():
return True
Expand Down
11 changes: 11 additions & 0 deletions test/dlc_tests/ec2/pytorch/training/common_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
CONTAINER_TESTS_PREFIX, "pytorch_tests", "test_pt_dlc_telemetry_test"
)
PT_TORCHAUDIO_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchaudio")
PT_TORCHDATA_DEV_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdataDev")
PT_TORCHDATA_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdata")

# Instance type filters
PT_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.9xlarge", processor="cpu")
Expand Down Expand Up @@ -240,3 +242,12 @@ def pytorch_telemetry_cpu(pytorch_training, ec2_connection):

def curand_gpu(training, ec2_connection):
    """Execute ``CURAND_CMD`` for the ``training`` image over ``ec2_connection``.

    Thin wrapper around ``execute_ec2_training_test`` so the case can be
    listed alongside the other shared training cases.
    """
    execute_ec2_training_test(
        ec2_connection,
        training,
        CURAND_CMD,
    )


def pytorch_training_torchdata(pytorch_training, ec2_connection):
    """Run the torchdata test command that matches the image's framework version.

    The version is taken from ``get_framework_and_version_from_tag`` applied to
    ``pytorch_training``. Versions within ``>=1.11,<=1.13.1`` run
    ``PT_TORCHDATA_DEV_CMD``; every other version runs ``PT_TORCHDATA_CMD``.
    """
    _, framework_version = get_framework_and_version_from_tag(pytorch_training)
    # HACK including PT 1.13 in this condition because the Torchdata 0.5.0 tag includes old tests data
    needs_dev_cmd = Version(framework_version) in SpecifierSet(">=1.11,<=1.13.1")
    torchdata_cmd = PT_TORCHDATA_DEV_CMD if needs_dev_cmd else PT_TORCHDATA_CMD
    execute_ec2_training_test(ec2_connection, pytorch_training, torchdata_cmd)
28 changes: 28 additions & 0 deletions test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def test_pytorch_train_mlp_neuronx_inf2(pytorch_training_neuronx, ec2_connection
execute_ec2_training_test(ec2_connection, pytorch_training_neuronx, PT_NEURON_MLP_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("pytorch_sanity_test")
Expand Down Expand Up @@ -171,6 +172,7 @@ def test_pytorch_healthcheck_nccl(pytorch_training, ec2_connection, gpu_only, ec
execute_ec2_training_test(ec2_connection, pytorch_training, PT_NCCL_LOCAL_TEST_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("pytorch_sanity_test")
Expand All @@ -181,6 +183,7 @@ def test_pytorch_standalone_cpu(pytorch_training, ec2_connection, cpu_only):
execute_ec2_training_test(ec2_connection, pytorch_training, PT_STANDALONE_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.model("mnist")
Expand All @@ -194,6 +197,7 @@ def test_pytorch_train_mnist_gpu(pytorch_training, ec2_connection, gpu_only, ec2
execute_ec2_training_test(ec2_connection, pytorch_training, PT_MNIST_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.model("mnist")
Expand All @@ -203,6 +207,7 @@ def test_pytorch_train_mnist_cpu(pytorch_training, ec2_connection, cpu_only):
execute_ec2_training_test(ec2_connection, pytorch_training, PT_MNIST_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.model("linear_regression")
Expand All @@ -223,6 +228,7 @@ def test_pytorch_linear_regression_gpu(
execute_ec2_training_test(ec2_connection, pytorch_training, PT_REGRESSION_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.model("linear_regression")
Expand All @@ -232,6 +238,7 @@ def test_pytorch_linear_regression_cpu(pytorch_training, ec2_connection, cpu_onl
execute_ec2_training_test(ec2_connection, pytorch_training, PT_REGRESSION_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.skip_dgl_test
@pytest.mark.usefixtures("sagemaker")
Expand All @@ -258,6 +265,7 @@ def test_pytorch_train_dgl_gpu(
execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.skip_dgl_test
@pytest.mark.usefixtures("sagemaker")
Expand All @@ -270,6 +278,7 @@ def test_pytorch_train_dgl_cpu(pytorch_training, ec2_connection, cpu_only, py3_o
execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("gloo")
Expand All @@ -292,6 +301,7 @@ def test_pytorch_gloo_gpu(pytorch_training, ec2_connection, gpu_only, py3_only,
)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("gloo")
Expand All @@ -318,6 +328,7 @@ def test_pytorch_gloo_inductor_gpu(
)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("gloo")
Expand All @@ -336,6 +347,7 @@ def test_pytorch_gloo_cpu(pytorch_training, ec2_connection, cpu_only, py3_only,
)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("nccl")
Expand All @@ -356,6 +368,7 @@ def test_pytorch_nccl(pytorch_training, ec2_connection, gpu_only, py3_only, ec2_
execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd, large_shm=True)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("nccl")
Expand All @@ -378,6 +391,7 @@ def test_pytorch_nccl_inductor(
execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd, large_shm=True)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("nccl")
Expand Down Expand Up @@ -409,6 +423,7 @@ def test_pytorch_nccl_version(
execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("mpi")
Expand Down Expand Up @@ -439,6 +454,7 @@ def test_pytorch_mpi_gpu(
execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("mpi")
Expand Down Expand Up @@ -471,6 +487,7 @@ def test_pytorch_mpi_inductor_gpu(
execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("mpi")
Expand All @@ -497,6 +514,7 @@ def test_pytorch_mpi_cpu(
execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("nvidia_apex")
Expand All @@ -511,6 +529,7 @@ def test_nvapex(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
execute_ec2_training_test(ec2_connection, pytorch_training, PT_APEX_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("amp")
Expand All @@ -528,6 +547,7 @@ def test_pytorch_amp(
execute_ec2_training_test(ec2_connection, pytorch_training, PT_AMP_CMD, timeout=1500)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("amp")
Expand All @@ -547,6 +567,7 @@ def test_pytorch_amp_inductor(
execute_ec2_training_test(ec2_connection, pytorch_training, PT_AMP_INDUCTOR_CMD, timeout=1500)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("feature_s3_plugin_present")
@pytest.mark.usefixtures("sagemaker")
Expand All @@ -573,6 +594,7 @@ def test_pytorch_s3_plugin_gpu(
execute_ec2_training_test(ec2_connection, pytorch_training, PT_S3_PLUGIN_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("feature_s3_plugin_present")
@pytest.mark.usefixtures("sagemaker")
Expand All @@ -596,6 +618,7 @@ def test_pytorch_s3_plugin_cpu(
execute_ec2_training_test(ec2_connection, pytorch_training, PT_S3_PLUGIN_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("feature_torchaudio_present")
@pytest.mark.usefixtures("sagemaker")
Expand All @@ -613,6 +636,7 @@ def test_pytorch_training_torchaudio_gpu(
execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHAUDIO_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("feature_torchaudio_present")
@pytest.mark.usefixtures("sagemaker")
Expand All @@ -630,6 +654,7 @@ def test_pytorch_training_torchaudio_cpu(
execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHAUDIO_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.skip_torchdata_test
@pytest.mark.usefixtures("feature_torchdata_present")
Expand All @@ -655,6 +680,7 @@ def test_pytorch_training_torchdata_gpu(
execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.skip_torchdata_test
@pytest.mark.usefixtures("feature_torchdata_present")
Expand All @@ -678,6 +704,7 @@ def test_pytorch_training_torchdata_cpu(
execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("feature_aws_framework_present")
@pytest.mark.usefixtures("sagemaker")
Expand Down Expand Up @@ -705,6 +732,7 @@ def test_pytorch_standalone_hpu(
)


@pytest.mark.skip_pt21_test
@pytest.mark.skip_pt22_test
@pytest.mark.usefixtures("feature_aws_framework_present")
@pytest.mark.usefixtures("sagemaker")
Expand Down
104 changes: 104 additions & 0 deletions test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import pytest

import test.test_utils as test_utils

from test.test_utils import ec2

from test.dlc_tests.ec2.pytorch.training import common_cases
from test.dlc_tests.ec2 import smclarify_cases


@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("PT21_general")
@pytest.mark.model("N/A")
@pytest.mark.team("conda")
@pytest.mark.parametrize(
    "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True
)
def test_pytorch_2_1_gpu(
    pytorch_training___2__1, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the shared GPU training cases for a PT 2.1 image in one test.

    The test is skipped when the image is incompatible with the instance
    type. Otherwise an ordered list of ``(callable, args)`` pairs is built and
    handed to ``test_utils.execute_serial_test_cases``. The smclarify case is
    added only for images whose URI contains ``"sagemaker"``, and the AMP case
    only on multi-GPU instance types.
    """
    image = pytorch_training___2__1
    if test_utils.is_image_incompatible_with_instance_type(image, ec2_instance_type):
        pytest.skip(f"Image {image} is incompatible with instance type {ec2_instance_type}")

    # Nearly every case takes the same (image, connection) argument pair.
    base_args = (image, ec2_connection)

    cases = [
        (common_cases.pytorch_standalone, base_args),
        (common_cases.pytorch_train_mnist, base_args),
        (common_cases.pytorch_linear_regression_gpu, base_args),
        (common_cases.pytorch_gloo, base_args),
        (common_cases.pytorch_nccl, base_args),
        (common_cases.pytorch_mpi, base_args),
        (common_cases.nvapex, base_args),
        (common_cases.pytorch_training_torchaudio, base_args),
        # The cuDNN match case is the only one that also receives the region.
        (common_cases.pytorch_cudnn_match_gpu, (image, ec2_connection, region)),
        (common_cases.pytorch_training_torchdata, base_args),
    ]

    if "sagemaker" in image:
        cases += [(smclarify_cases.smclarify_metrics_gpu, base_args)]

    # AMP must be run on multi_gpu
    if ec2.is_instance_multi_gpu(ec2_instance_type):
        cases += [(common_cases.pytorch_amp, base_args)]

    test_utils.execute_serial_test_cases(cases, test_description="PT 2.1 GPU")


@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("inductor")
@pytest.mark.model("N/A")
@pytest.mark.team("training-compiler")
@pytest.mark.parametrize(
    "ec2_instance_type, region",
    common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION,
    indirect=True,
)
def test_pytorch_2_1_gpu_inductor(
    pytorch_training___2__1, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the shared inductor GPU cases for a PT 2.1 image in one test.

    The test is skipped when the image is incompatible with the instance
    type. Otherwise the gloo, MPI, NCCL and AMP inductor cases are passed, in
    that order, to ``test_utils.execute_serial_test_cases``.
    """
    image = pytorch_training___2__1
    if test_utils.is_image_incompatible_with_instance_type(image, ec2_instance_type):
        pytest.skip(f"Image {image} is incompatible with instance type {ec2_instance_type}")

    # Every inductor case takes the same (image, connection) argument pair.
    base_args = (image, ec2_connection)
    inductor_cases = (
        common_cases.pytorch_gloo_inductor_gpu,
        common_cases.pytorch_mpi_inductor_gpu,
        common_cases.pytorch_nccl_inductor,
        common_cases.pytorch_amp_inductor,
    )
    cases = [(case, base_args) for case in inductor_cases]

    test_utils.execute_serial_test_cases(cases, test_description="PT 2.1 GPU Inductor")


@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("pytorch_sanity_test")
@pytest.mark.model("N/A")
@pytest.mark.team("conda")
@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
def test_pytorch_2_1_cpu(pytorch_training___2__1, ec2_connection, cpu_only):
    """Run the shared CPU training cases for a PT 2.1 image in one test.

    An ordered list of ``(callable, args)`` pairs is built and handed to
    ``test_utils.execute_serial_test_cases``. The smclarify case is added only
    for images whose URI contains ``"sagemaker"``.
    """
    image = pytorch_training___2__1

    # Every CPU case takes the same (image, connection) argument pair.
    base_args = (image, ec2_connection)

    cases = [
        (common_cases.pytorch_standalone, base_args),
        (common_cases.pytorch_train_mnist, base_args),
        (common_cases.pytorch_linear_regression_cpu, base_args),
        (common_cases.pytorch_gloo, base_args),
        (common_cases.pytorch_mpi, base_args),
        (common_cases.pytorch_training_torchaudio, base_args),
        (common_cases.pytorch_telemetry_cpu, base_args),
        (common_cases.pytorch_training_torchdata, base_args),
    ]

    if "sagemaker" in image:
        cases.append((smclarify_cases.smclarify_metrics_cpu, base_args))

    test_utils.execute_serial_test_cases(cases, test_description="PT 2.1 CPU")
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@


@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("all PT 2.2 tests")
@pytest.mark.integration("PT22_general")
@pytest.mark.model("N/A")
@pytest.mark.team("conda")
@pytest.mark.parametrize(
Expand Down Expand Up @@ -49,7 +49,7 @@ def test_pytorch_2_2_gpu(


@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("all PT 2.2 tests")
@pytest.mark.integration("inductor")
@pytest.mark.model("N/A")
@pytest.mark.team("training-compiler")
@pytest.mark.parametrize(
Expand All @@ -73,7 +73,7 @@ def test_pytorch_2_2_gpu_inductor(
(common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)),
]

test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.2 GPU")
test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.2 GPU Inductor")


@pytest.mark.usefixtures("sagemaker")
Expand Down
Loading

0 comments on commit 3b23162

Please sign in to comment.