From 3b23162c36308525c207edce1aff5925ce4c5a45 Mon Sep 17 00:00:00 2001
From: arjkesh <33526713+arjkesh@users.noreply.github.com>
Date: Thu, 14 Mar 2024 17:07:17 -0700
Subject: [PATCH] Refactor PT 2.1 tests (#3762)

* Minimize PT 2.1 test bins

* update config

* stop spurious ec2 benchmark jobs

* skip clarify

* Update dlc_developer_config.toml

* Update dlc_developer_config.toml
---
 src/start_testbuilds.py                       |   2 +-
 .../ec2/pytorch/training/common_cases.py      |  11 ++
 .../pytorch/training/test_pytorch_training.py |  28 +++++
 .../training/test_pytorch_training_2_1.py     | 104 ++++++++++++++++++
 .../training/test_pytorch_training_2_2.py     |   6 +-
 test/dlc_tests/ec2/test_smclarify.py          |   2 +
 6 files changed, 149 insertions(+), 4 deletions(-)
 create mode 100644 test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_1.py

diff --git a/src/start_testbuilds.py b/src/start_testbuilds.py
index b72964815843..93c99f2b3c44 100644
--- a/src/start_testbuilds.py
+++ b/src/start_testbuilds.py
@@ -121,7 +121,7 @@ def is_test_job_enabled(test_type):
         return True
     if test_type == constants.EC2_TESTS and config.is_ec2_test_enabled():
         return True
-    if test_type == constants.EC2_BENCHMARK_TESTS and config.is_ec2_benchmark_test_enabled:
+    if test_type == constants.EC2_BENCHMARK_TESTS and config.is_ec2_benchmark_test_enabled():
         return True
     if test_type == constants.ECS_TESTS and config.is_ecs_test_enabled():
         return True
diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py
index a1ee96a4edaa..0e52582357df 100644
--- a/test/dlc_tests/ec2/pytorch/training/common_cases.py
+++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py
@@ -34,6 +34,8 @@
     CONTAINER_TESTS_PREFIX, "pytorch_tests", "test_pt_dlc_telemetry_test"
 )
 PT_TORCHAUDIO_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchaudio")
+PT_TORCHDATA_DEV_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdataDev")
+PT_TORCHDATA_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testTorchdata")
 
 # Instance type filters
 PT_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.9xlarge", processor="cpu")
@@ -240,3 +242,12 @@ def pytorch_telemetry_cpu(pytorch_training, ec2_connection):
 
 def curand_gpu(training, ec2_connection):
     execute_ec2_training_test(ec2_connection, training, CURAND_CMD)
+
+
+def pytorch_training_torchdata(pytorch_training, ec2_connection):
+    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
+    # HACK: PT 1.13 is kept in this range because the Torchdata 0.5.0 tag includes old test data
+    if Version(image_framework_version) in SpecifierSet(">=1.11,<=1.13.1"):
+        execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_DEV_CMD)
+    else:
+        execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD)
diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
index 30171249c063..7c3ee3587ffc 100644
--- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
+++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
@@ -123,6 +123,7 @@ def test_pytorch_train_mlp_neuronx_inf2(pytorch_training_neuronx, ec2_connection
     execute_ec2_training_test(ec2_connection, pytorch_training_neuronx, PT_NEURON_MLP_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("pytorch_sanity_test")
@@ -171,6 +172,7 @@ def test_pytorch_healthcheck_nccl(pytorch_training, ec2_connection, gpu_only, ec
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_NCCL_LOCAL_TEST_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("pytorch_sanity_test")
@@ -181,6 +183,7 @@ def test_pytorch_standalone_cpu(pytorch_training, ec2_connection, cpu_only):
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_STANDALONE_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.model("mnist")
@@ -194,6 +197,7 @@ def test_pytorch_train_mnist_gpu(pytorch_training, ec2_connection, gpu_only, ec2
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_MNIST_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.model("mnist")
@@ -203,6 +207,7 @@ def test_pytorch_train_mnist_cpu(pytorch_training, ec2_connection, cpu_only):
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_MNIST_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.model("linear_regression")
@@ -223,6 +228,7 @@ def test_pytorch_linear_regression_gpu(
         execute_ec2_training_test(ec2_connection, pytorch_training, PT_REGRESSION_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.model("linear_regression")
@@ -232,6 +238,7 @@ def test_pytorch_linear_regression_cpu(pytorch_training, ec2_connection, cpu_onl
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_REGRESSION_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.skip_dgl_test
 @pytest.mark.usefixtures("sagemaker")
@@ -258,6 +265,7 @@ def test_pytorch_train_dgl_gpu(
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.skip_dgl_test
 @pytest.mark.usefixtures("sagemaker")
@@ -270,6 +278,7 @@ def test_pytorch_train_dgl_cpu(pytorch_training, ec2_connection, cpu_only, py3_o
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("gloo")
@@ -292,6 +301,7 @@ def test_pytorch_gloo_gpu(pytorch_training, ec2_connection, gpu_only, py3_only,
     )
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("gloo")
@@ -318,6 +328,7 @@ def test_pytorch_gloo_inductor_gpu(
     )
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("gloo")
@@ -336,6 +347,7 @@ def test_pytorch_gloo_cpu(pytorch_training, ec2_connection, cpu_only, py3_only,
     )
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("nccl")
@@ -356,6 +368,7 @@ def test_pytorch_nccl(pytorch_training, ec2_connection, gpu_only, py3_only, ec2_
     execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd, large_shm=True)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("nccl")
@@ -378,6 +391,7 @@ def test_pytorch_nccl_inductor(
     execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd, large_shm=True)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("nccl")
@@ -409,6 +423,7 @@ def test_pytorch_nccl_version(
     execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("mpi")
@@ -439,6 +454,7 @@ def test_pytorch_mpi_gpu(
     execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("mpi")
@@ -471,6 +487,7 @@ def test_pytorch_mpi_inductor_gpu(
     execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("mpi")
@@ -497,6 +514,7 @@ def test_pytorch_mpi_cpu(
     execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("nvidia_apex")
@@ -511,6 +529,7 @@ def test_nvapex(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_APEX_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("amp")
@@ -528,6 +547,7 @@ def test_pytorch_amp(
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_AMP_CMD, timeout=1500)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker")
 @pytest.mark.integration("amp")
@@ -547,6 +567,7 @@ def test_pytorch_amp_inductor(
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_AMP_INDUCTOR_CMD, timeout=1500)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("feature_s3_plugin_present")
 @pytest.mark.usefixtures("sagemaker")
@@ -573,6 +594,7 @@ def test_pytorch_s3_plugin_gpu(
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_S3_PLUGIN_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("feature_s3_plugin_present")
 @pytest.mark.usefixtures("sagemaker")
@@ -596,6 +618,7 @@ def test_pytorch_s3_plugin_cpu(
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_S3_PLUGIN_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("feature_torchaudio_present")
 @pytest.mark.usefixtures("sagemaker")
@@ -613,6 +636,7 @@ def test_pytorch_training_torchaudio_gpu(
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHAUDIO_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("feature_torchaudio_present")
 @pytest.mark.usefixtures("sagemaker")
@@ -630,6 +654,7 @@ def test_pytorch_training_torchaudio_cpu(
     execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHAUDIO_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.skip_torchdata_test
 @pytest.mark.usefixtures("feature_torchdata_present")
@@ -655,6 +680,7 @@ def test_pytorch_training_torchdata_gpu(
         execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.skip_torchdata_test
 @pytest.mark.usefixtures("feature_torchdata_present")
@@ -678,6 +704,7 @@ def test_pytorch_training_torchdata_cpu(
         execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("feature_aws_framework_present")
 @pytest.mark.usefixtures("sagemaker")
@@ -705,6 +732,7 @@ def test_pytorch_standalone_hpu(
     )
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("feature_aws_framework_present")
 @pytest.mark.usefixtures("sagemaker")
diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_1.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_1.py
new file mode 100644
index 000000000000..bba2b0661f9b
--- /dev/null
+++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_1.py
@@ -0,0 +1,104 @@
+import pytest
+
+import test.test_utils as test_utils
+
+from test.test_utils import ec2
+
+from test.dlc_tests.ec2.pytorch.training import common_cases
+from test.dlc_tests.ec2 import smclarify_cases
+
+
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("PT21_general")
+@pytest.mark.model("N/A")
+@pytest.mark.team("conda")
+@pytest.mark.parametrize(
+    "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True
+)
+def test_pytorch_2_1_gpu(
+    pytorch_training___2__1, ec2_connection, region, gpu_only, ec2_instance_type
+):
+    pytorch_training = pytorch_training___2__1
+    if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type):
+        pytest.skip(
+            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
+        )
+
+    test_cases = [
+        (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_train_mnist, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
+        (common_cases.nvapex, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
+        (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
+    ]
+    # The smclarify GPU case is only added when the image string contains "sagemaker"
+    if "sagemaker" in pytorch_training:
+        test_cases.append(
+            (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)),
+        )
+
+    # AMP must run on a multi-GPU instance, so only add it for multi-GPU instance types
+    if ec2.is_instance_multi_gpu(ec2_instance_type):
+        test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection)))
+
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.1 GPU")
+
+
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("inductor")
+@pytest.mark.model("N/A")
+@pytest.mark.team("training-compiler")
+@pytest.mark.parametrize(
+    "ec2_instance_type, region",
+    common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION,
+    indirect=True,
+)
+def test_pytorch_2_1_gpu_inductor(
+    pytorch_training___2__1, ec2_connection, region, gpu_only, ec2_instance_type
+):
+    pytorch_training = pytorch_training___2__1
+    if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type):
+        pytest.skip(
+            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
+        )
+    # Inductor variants of the gloo, MPI, NCCL and AMP cases from common_cases
+    test_cases = [
+        (common_cases.pytorch_gloo_inductor_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)),
+    ]
+
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.1 GPU Inductor")
+
+
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("pytorch_sanity_test")
+@pytest.mark.model("N/A")
+@pytest.mark.team("conda")
+@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
+def test_pytorch_2_1_cpu(pytorch_training___2__1, ec2_connection, cpu_only):
+    pytorch_training = pytorch_training___2__1
+
+    test_cases = [
+        (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_train_mnist, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_cpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
+    ]
+    # The smclarify CPU case is only added when the image string contains "sagemaker"
+    if "sagemaker" in pytorch_training:
+        test_cases += [
+            (smclarify_cases.smclarify_metrics_cpu, (pytorch_training, ec2_connection)),
+        ]
+
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.1 CPU")
diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py
index 8818339cbbfd..6132e37b50b6 100644
--- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py
+++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py
@@ -9,7 +9,7 @@
 
 
 @pytest.mark.usefixtures("sagemaker")
-@pytest.mark.integration("all PT 2.2 tests")
+@pytest.mark.integration("PT22_general")
 @pytest.mark.model("N/A")
 @pytest.mark.team("conda")
 @pytest.mark.parametrize(
@@ -49,7 +49,7 @@ def test_pytorch_2_2_gpu(
 
 
 @pytest.mark.usefixtures("sagemaker")
-@pytest.mark.integration("all PT 2.2 tests")
+@pytest.mark.integration("inductor")
 @pytest.mark.model("N/A")
 @pytest.mark.team("training-compiler")
 @pytest.mark.parametrize(
@@ -73,7 +73,7 @@ def test_pytorch_2_2_gpu_inductor(
         (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)),
     ]
 
-    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.2 GPU")
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.2 GPU Inductor")
 
 
 @pytest.mark.usefixtures("sagemaker")
diff --git a/test/dlc_tests/ec2/test_smclarify.py b/test/dlc_tests/ec2/test_smclarify.py
index 38f3640bd055..c09dfd1a3a7a 100644
--- a/test/dlc_tests/ec2/test_smclarify.py
+++ b/test/dlc_tests/ec2/test_smclarify.py
@@ -16,6 +16,7 @@
 
 # Adding separate tests to run on cpu instance for cpu image and gpu instance for gpu image.
 # But the test behavior doesn't change for cpu or gpu image type.
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker_only")
 @pytest.mark.integration("smclarify_cpu")
@@ -35,6 +36,7 @@ def test_smclarify_metrics_cpu(
     smclarify_cases.run_smclarify_bias_metrics(training, ec2_connection)
 
 
+@pytest.mark.skip_pt21_test
 @pytest.mark.skip_pt22_test
 @pytest.mark.usefixtures("sagemaker_only")
 @pytest.mark.integration("smclarify_gpu")