From 3b23162c36308525c207edce1aff5925ce4c5a45 Mon Sep 17 00:00:00 2001 From: arjkesh <33526713+arjkesh@users.noreply.github.com> Date: Thu, 14 Mar 2024 17:07:17 -0700 Subject: [PATCH] Refactor PT 2.1 tests (#3762) * Minimize PT 2.1 test bins * update config * stop spurious ec2 benchmark jobs * skip clarify * Update dlc_developer_config.toml * Update dlc_developer_config.toml --- src/start_testbuilds.py | 2 +- .../ec2/pytorch/training/common_cases.py | 11 ++ .../pytorch/training/test_pytorch_training.py | 28 +++++ .../training/test_pytorch_training_2_1.py | 104 ++++++++++++++++++ .../training/test_pytorch_training_2_2.py | 6 +- test/dlc_tests/ec2/test_smclarify.py | 2 + 6 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_1.py diff --git a/src/start_testbuilds.py b/src/start_testbuilds.py index b72964815843..93c99f2b3c44 100644 --- a/src/start_testbuilds.py +++ b/src/start_testbuilds.py @@ -121,7 +121,7 @@ def is_test_job_enabled(test_type): return True if test_type == constants.EC2_TESTS and config.is_ec2_test_enabled(): return True - if test_type == constants.EC2_BENCHMARK_TESTS and config.is_ec2_benchmark_test_enabled: + if test_type == constants.EC2_BENCHMARK_TESTS and config.is_ec2_benchmark_test_enabled(): return True if test_type == constants.ECS_TESTS and config.is_ecs_test_enabled(): return True diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index a1ee96a4edaa..0e52582357df 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -34,6 +34,8 @@ CONTAINER_TESTS_PREFIX, "pytorch_tests", "test_pt_dlc_telemetry_test" ) PT_TORCHAUDIO_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchaudio") +PT_TORCHDATA_DEV_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdataDev") +PT_TORCHDATA_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdata") # Instance type filters PT_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.9xlarge", processor="cpu") @@ -240,3 +242,12 @@ def pytorch_telemetry_cpu(pytorch_training, ec2_connection): def curand_gpu(training, ec2_connection): execute_ec2_training_test(ec2_connection, training, CURAND_CMD) + + +def pytorch_training_torchdata(pytorch_training, ec2_connection): + _, image_framework_version = get_framework_and_version_from_tag(pytorch_training) + # HACK including PT 1.13 in this condition because the Torchdata 0.5.0 tag includes old tests data + if Version(image_framework_version) in SpecifierSet(">=1.11,<=1.13.1"): + execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_DEV_CMD) + else: + execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD) diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py index 30171249c063..7c3ee3587ffc 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py @@ -123,6 +123,7 @@ def test_pytorch_train_mlp_neuronx_inf2(pytorch_training_neuronx, ec2_connection execute_ec2_training_test(ec2_connection, pytorch_training_neuronx, PT_NEURON_MLP_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("pytorch_sanity_test") @@ -171,6 +172,7 @@ def test_pytorch_healthcheck_nccl(pytorch_training, ec2_connection, gpu_only, ec execute_ec2_training_test(ec2_connection, pytorch_training, PT_NCCL_LOCAL_TEST_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("pytorch_sanity_test") @@ -181,6 +183,7 @@ def test_pytorch_standalone_cpu(pytorch_training, ec2_connection, cpu_only): execute_ec2_training_test(ec2_connection, pytorch_training, PT_STANDALONE_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("mnist") @@ -194,6 +197,7 @@ def test_pytorch_train_mnist_gpu(pytorch_training, ec2_connection, gpu_only, ec2 execute_ec2_training_test(ec2_connection, pytorch_training, PT_MNIST_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("mnist") @@ -203,6 +207,7 @@ def test_pytorch_train_mnist_cpu(pytorch_training, ec2_connection, cpu_only): execute_ec2_training_test(ec2_connection, pytorch_training, PT_MNIST_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("linear_regression") @@ -223,6 +228,7 @@ def test_pytorch_linear_regression_gpu( execute_ec2_training_test(ec2_connection, pytorch_training, PT_REGRESSION_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.model("linear_regression") @@ -232,6 +238,7 @@ def test_pytorch_linear_regression_cpu(pytorch_training, ec2_connection, cpu_onl execute_ec2_training_test(ec2_connection, pytorch_training, PT_REGRESSION_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.skip_dgl_test @pytest.mark.usefixtures("sagemaker") @@ -258,6 +265,7 @@ def test_pytorch_train_dgl_gpu( execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.skip_dgl_test @pytest.mark.usefixtures("sagemaker") @@ -270,6 +278,7 @@ def test_pytorch_train_dgl_cpu(pytorch_training, ec2_connection, cpu_only, py3_o execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("gloo") @@ -292,6 +301,7 @@ def test_pytorch_gloo_gpu(pytorch_training, ec2_connection, gpu_only, py3_only, ) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("gloo") @@ -318,6 +328,7 @@ def test_pytorch_gloo_inductor_gpu( ) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("gloo") @@ -336,6 +347,7 @@ def test_pytorch_gloo_cpu(pytorch_training, ec2_connection, cpu_only, py3_only, ) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("nccl") @@ -356,6 +368,7 @@ def test_pytorch_nccl(pytorch_training, ec2_connection, gpu_only, py3_only, ec2_ execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd, large_shm=True) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("nccl") @@ -378,6 +391,7 @@ def test_pytorch_nccl_inductor( execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd, large_shm=True) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("nccl") @@ -409,6 +423,7 @@ def test_pytorch_nccl_version( execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("mpi") @@ -439,6 +454,7 @@ def test_pytorch_mpi_gpu( execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("mpi") @@ -471,6 +487,7 @@ def test_pytorch_mpi_inductor_gpu( execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("mpi") @@ -497,6 +514,7 @@ def test_pytorch_mpi_cpu( execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("nvidia_apex") @@ -511,6 +529,7 @@ def test_nvapex(pytorch_training, ec2_connection, gpu_only, ec2_instance_type): execute_ec2_training_test(ec2_connection, pytorch_training, PT_APEX_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("amp") @@ -528,6 +547,7 @@ def test_pytorch_amp( execute_ec2_training_test(ec2_connection, pytorch_training, PT_AMP_CMD, timeout=1500) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker") @pytest.mark.integration("amp") @@ -547,6 +567,7 @@ def test_pytorch_amp_inductor( execute_ec2_training_test(ec2_connection, pytorch_training, PT_AMP_INDUCTOR_CMD, timeout=1500) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("feature_s3_plugin_present") @pytest.mark.usefixtures("sagemaker") @@ -573,6 +594,7 @@ def test_pytorch_s3_plugin_gpu( execute_ec2_training_test(ec2_connection, pytorch_training, PT_S3_PLUGIN_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("feature_s3_plugin_present") @pytest.mark.usefixtures("sagemaker") @@ -596,6 +618,7 @@ def test_pytorch_s3_plugin_cpu( execute_ec2_training_test(ec2_connection, pytorch_training, PT_S3_PLUGIN_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("feature_torchaudio_present") @pytest.mark.usefixtures("sagemaker") @@ -613,6 +636,7 @@ def test_pytorch_training_torchaudio_gpu( execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHAUDIO_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("feature_torchaudio_present") @pytest.mark.usefixtures("sagemaker") @@ -630,6 +654,7 @@ def test_pytorch_training_torchaudio_cpu( execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHAUDIO_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.skip_torchdata_test @pytest.mark.usefixtures("feature_torchdata_present") @@ -655,6 +680,7 @@ def test_pytorch_training_torchdata_gpu( execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.skip_torchdata_test @pytest.mark.usefixtures("feature_torchdata_present") @@ -678,6 +704,7 @@ def test_pytorch_training_torchdata_cpu( execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("feature_aws_framework_present") @pytest.mark.usefixtures("sagemaker") @@ -705,6 +732,7 @@ def test_pytorch_standalone_hpu( ) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("feature_aws_framework_present") @pytest.mark.usefixtures("sagemaker") diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_1.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_1.py new file mode 100644 index 000000000000..bba2b0661f9b --- /dev/null +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_1.py @@ -0,0 +1,104 @@ +import pytest + +import test.test_utils as test_utils + +from test.test_utils import ec2 + +from test.dlc_tests.ec2.pytorch.training import common_cases +from test.dlc_tests.ec2 import smclarify_cases + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("PT21_general") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True +) +def test_pytorch_2_1_gpu( + pytorch_training___2__1, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__1 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_train_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.nvapex, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases.append( + (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)), + ) + + # AMP must be run on multi_gpu + if ec2.is_instance_multi_gpu(ec2_instance_type): + test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection))) + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.1 GPU") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("inductor") +@pytest.mark.model("N/A") +@pytest.mark.team("training-compiler") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +def test_pytorch_2_1_gpu_inductor( + pytorch_training___2__1, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__1 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gloo_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)), + (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.1 GPU Inductor") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_sanity_test") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) +def test_pytorch_2_1_cpu(pytorch_training___2__1, ec2_connection, cpu_only): + pytorch_training = pytorch_training___2__1 + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_train_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases += [ + (smclarify_cases.smclarify_metrics_cpu, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.1 CPU") diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py index 8818339cbbfd..6132e37b50b6 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py @@ -9,7 +9,7 @@ @pytest.mark.usefixtures("sagemaker") -@pytest.mark.integration("all PT 2.2 tests") +@pytest.mark.integration("PT22_general") @pytest.mark.model("N/A") @pytest.mark.team("conda") @pytest.mark.parametrize( @@ -49,7 +49,7 @@ def test_pytorch_2_2_gpu( @pytest.mark.usefixtures("sagemaker") -@pytest.mark.integration("all PT 2.2 tests") +@pytest.mark.integration("inductor") @pytest.mark.model("N/A") @pytest.mark.team("training-compiler") @pytest.mark.parametrize( @@ -73,7 +73,7 @@ def test_pytorch_2_2_gpu_inductor( (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)), ] - test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.2 GPU") + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.2 GPU Inductor") @pytest.mark.usefixtures("sagemaker") diff --git a/test/dlc_tests/ec2/test_smclarify.py b/test/dlc_tests/ec2/test_smclarify.py index 38f3640bd055..c09dfd1a3a7a 100644 --- a/test/dlc_tests/ec2/test_smclarify.py +++ b/test/dlc_tests/ec2/test_smclarify.py @@ -16,6 +16,7 @@ # Adding separate tests to run on cpu instance for cpu image and gpu instance for gpu image. # But the test behavior doesn't change for cpu or gpu image type. +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.integration("smclarify_cpu") @@ -35,6 +36,7 @@ def test_smclarify_metrics_cpu( smclarify_cases.run_smclarify_bias_metrics(training, ec2_connection) +@pytest.mark.skip_pt21_test @pytest.mark.skip_pt22_test @pytest.mark.usefixtures("sagemaker_only") @pytest.mark.integration("smclarify_gpu")