Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update DLAMI BASE AMI Logic to switch between OSS and Proprietary Nvidia Driver AMI #3760

Merged
merged 51 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from 47 commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
840aef6
Update DLAMI BASE AMI Logic to switch between OSS and Proprietary Nvi…
Mar 8, 2024
95f6e46
update gdrcopy to 2.4
Mar 8, 2024
2e4b09b
formatting
Mar 8, 2024
c28d32b
disable build and fix sm local test instance ami
Mar 11, 2024
61212b4
use proprietary driver dlami as default
Mar 11, 2024
a0f8f82
fix ul20 and aml2 dlami name logic and test only ec2
Mar 11, 2024
ce3d3da
allow test efa
Mar 11, 2024
00fba94
update oss dlami list
Mar 11, 2024
434fbdc
test curand
Mar 11, 2024
115c33c
ensure ec2 instance type fixture is run before ec2 instance ami
Mar 12, 2024
092b14b
alter ami pulling logic
Mar 12, 2024
b75b415
usefixtures
Mar 12, 2024
9dc8fec
use parametrize
Mar 12, 2024
95ddb86
use instance ami in parametrize
Mar 12, 2024
1d9347f
add instance ami as parametrize
Mar 12, 2024
a78962d
Merge branch 'master' into update-ami
sirutBuasai Mar 12, 2024
0a9504d
fix curand test
Mar 13, 2024
c66555e
correct ami name
Mar 13, 2024
e5716bc
correct ami format
Mar 13, 2024
66ce9fc
use proprietary dlami for curand
Mar 13, 2024
68273a4
rebuild
Mar 14, 2024
c70f0e9
logging debug
Mar 14, 2024
75f8e86
remove parametrize ami
Mar 14, 2024
5a99d36
flip logic
Mar 14, 2024
9040c77
formatting
Mar 14, 2024
2f16a5b
print instance ami
Mar 14, 2024
3b15a71
fix typo
Mar 14, 2024
b83eed1
remove parametrize logic and fix proprietary dlami name pattern
Mar 14, 2024
9f3a24d
Merge branch 'master' into update-ami
sirutBuasai Mar 14, 2024
3a78b32
revert gdr copy
Mar 14, 2024
f8af0b0
update test with gdrcopy 2.4
Mar 14, 2024
e32684a
build test pt ec2
Mar 15, 2024
c8b1ce0
build test pt sm
Mar 15, 2024
63d8a31
remove gdrcopy ami
Mar 15, 2024
e40298b
sanity and sm local testonly
Mar 15, 2024
9efd8da
build test pt sm
Mar 15, 2024
f8538bf
Merge branch 'master' into update-ami
sirutBuasai Mar 15, 2024
f9633d6
formatting
Mar 15, 2024
2682dac
test pt sm
Mar 16, 2024
b561099
build test pt sm
Mar 16, 2024
2b52804
disable build
Mar 16, 2024
dd1e2b2
build test pt sm
Mar 16, 2024
e5fe485
use get-login-password
Mar 18, 2024
ac78c1f
remove () from get-login
Mar 18, 2024
4013f89
test tensorflow
Mar 18, 2024
9f74eeb
use login_to_ecr_registry function
Mar 18, 2024
185d4a5
use dict for base dlami logic
Mar 18, 2024
a893035
use image uri instead
Mar 18, 2024
64d9afa
fix aml2 dlami logic
Mar 18, 2024
ef03578
revert toml file
Mar 19, 2024
feab20e
Merge branch 'master' into update-ami
sirutBuasai Mar 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions dlc_developer_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ deep_canary_mode = false
[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
build_frameworks = ["tensorflow"]

# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
build_inference = true
build_inference = false

# Set do_build to "false" to skip builds and test the latest image built by this PR
# Note: at least one build is required to set do_build to "false"
do_build = true
do_build = false
autopatch_build = false

[notify]
Expand All @@ -67,20 +67,20 @@ ec2_benchmark_tests = false
### default. If false, these types of tests will be skipped while other tests will run as usual.
### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
### Off by default (set to false)
ec2_tests_on_heavy_instances = false
ec2_tests_on_heavy_instances = true

### SM specific tests
### Off by default
sagemaker_local_tests = false
sagemaker_local_tests = true

# run standard sagemaker remote tests from test/sagemaker_tests
sagemaker_remote_tests = false
sagemaker_remote_tests = true
# run efa sagemaker tests
sagemaker_efa_tests = false
sagemaker_efa_tests = true
# run release_candidate_integration tests
sagemaker_rc_tests = false
sagemaker_rc_tests = true
# run sagemaker benchmark tests
sagemaker_benchmark_tests = false
sagemaker_benchmark_tests = true

# SM remote EFA test instance type
sagemaker_remote_efa_instance_type = ""
Expand Down
2 changes: 1 addition & 1 deletion pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CUDNN_VERSION=8.9.2.26
ENV NCCL_VERSION=2.19.4
ENV EFA_VERSION=1.30.0
ENV GDRCOPY_VERSION=2.3.1
ENV GDRCOPY_VERSION=2.4.1

ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import time
import pytest
import boto3

from src.benchmark_metrics import (
PYTORCH_INFERENCE_GPU_THRESHOLD,
PYTORCH_INFERENCE_CPU_THRESHOLD,
Expand All @@ -11,6 +13,7 @@
get_framework_and_version_from_tag,
UL20_CPU_ARM64_US_WEST_2,
LOGGER,
login_to_ecr_registry,
)
from test.test_utils.ec2 import (
ec2_performance_upload_result_to_s3_and_validate,
Expand Down Expand Up @@ -85,7 +88,8 @@ def ec2_performance_pytorch_inference(
repo_name, image_tag = image_uri.split("/")[-1].split(":")

# Make sure we are logged into ECR so we can pull the image
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = boto3.client("sts").get_caller_identity()["Account"]
sirutBuasai marked this conversation as resolved.
Show resolved Hide resolved
login_to_ecr_registry(ec2_connection, account_id, region)

ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import time
import pytest
import re
import boto3

from test.test_utils import (
CONTAINER_TESTS_PREFIX,
Expand All @@ -10,6 +11,7 @@
DEFAULT_REGION,
get_framework_and_version_from_tag,
is_pr_context,
login_to_ecr_registry,
)
from test.test_utils.ec2 import (
execute_ec2_training_performance_test,
Expand Down Expand Up @@ -142,7 +144,8 @@ def execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test(
container_name = f"{repo_name}-performance-{image_tag}-ec2"

# Make sure we are logged into ECR so we can pull the image
connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = boto3.client("sts").get_caller_identity()["Account"]
login_to_ecr_registry(connection, account_id, region)
# Do not add -q to docker pull as it leads to a hang for huge images like trcomp
connection.run(f"nvidia-docker pull {ecr_uri}")
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import time
import pytest
import boto3

from packaging.version import Version

Expand All @@ -14,6 +15,7 @@
is_pr_context,
is_tf_version,
UL20_BENCHMARK_CPU_ARM64_US_WEST_2,
login_to_ecr_registry,
)
from test.test_utils.ec2 import (
ec2_performance_upload_result_to_s3_and_validate,
Expand Down Expand Up @@ -83,7 +85,8 @@ def ec2_performance_tensorflow_inference(
num_iterations = 500 if is_pr_context() or is_graviton else 1000

# Make sure we are logged into ECR so we can pull the image
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = boto3.client("sts").get_caller_identity()["Account"]
login_to_ecr_registry(ec2_connection, account_id, region)
ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ")
if is_graviton:
# TF training binary is used that is compatible for graviton instance type
Expand Down
26 changes: 10 additions & 16 deletions test/dlc_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,7 @@
is_nightly_context,
DEFAULT_REGION,
P3DN_REGION,
UBUNTU_20_BASE_DLAMI_US_EAST_1,
UBUNTU_20_BASE_DLAMI_US_WEST_2,
PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1,
AML2_BASE_DLAMI_US_WEST_2,
AML2_BASE_DLAMI_US_EAST_1,
KEYS_TO_DESTROY_FILE,
are_efa_tests_disabled,
get_repository_and_tag_from_image_uri,
Expand Down Expand Up @@ -330,18 +326,11 @@ def ec2_instance_role_name(request):


@pytest.fixture(scope="function")
def ec2_instance_ami(request, region):
def ec2_instance_ami(request, region, ec2_instance_type):
return (
request.param
if hasattr(request, "param")
else UBUNTU_20_BASE_DLAMI_US_EAST_1
if region == "us-east-1"
else UBUNTU_20_BASE_DLAMI_US_WEST_2
if region == "us-west-2"
else test_utils.get_ami_id_boto3(
region_name=region,
ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????",
)
else test_utils.get_instance_type_base_dlami(ec2_instance_type, region)
)


Expand Down Expand Up @@ -564,9 +553,14 @@ def ec2_instance(
)
if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1:
ec2_instance_ami = (
AML2_BASE_DLAMI_US_EAST_1
if ec2_instance_ami == AML2_BASE_DLAMI_US_WEST_2
else UBUNTU_20_BASE_DLAMI_US_EAST_1
test_utils.get_instance_type_base_dlami(
sirutBuasai marked this conversation as resolved.
Show resolved Hide resolved
ec2_instance_type, "us-east-1", linux_dist="AML2"
)
if ec2_instance_ami
== test_utils.get_instance_type_base_dlami(
ec2_instance_type, "us-west-2", linux_dist="AML2"
)
else test_utils.get_instance_type_base_dlami(ec2_instance_type, "us-east-1")
)

ec2_key_name = f"{ec2_key_name}-{str(uuid.uuid4())}"
Expand Down
2 changes: 1 addition & 1 deletion test/dlc_tests/container_tests/bin/efa/testEFASanity
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ lsmod | grep ib_uverbs
ibv_devinfo

# check if gdr device is loaded
grep -e '^1$' /sys/class/infiniband/**/device/gdr
cat /sys/class/infiniband/**/device/p2p | grep 'NVIDIA'
10 changes: 8 additions & 2 deletions test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import os
import pytest
import boto3

import test.test_utils.ec2 as ec2_utils

from test import test_utils
from test.test_utils import CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag
from test.test_utils import (
CONTAINER_TESTS_PREFIX,
get_framework_and_version_from_tag,
login_to_ecr_registry,
)
from test.test_utils.ec2 import (
get_ec2_instance_type,
execute_ec2_inference_test,
Expand Down Expand Up @@ -188,7 +193,8 @@ def run_ec2_mxnet_inference(
f" {image_uri} {mms_inference_cmd}"
)
try:
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = boto3.client("sts").get_caller_identity()["Account"]
login_to_ecr_registry(ec2_connection, account_id, region)
LOGGER.info(docker_run_cmd)
ec2_connection.run(docker_run_cmd, hide=True)
if model_name == SQUEEZENET_MODEL:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import time
import logging
from datetime import date, timedelta, datetime
import boto3

import pytest
from packaging.version import Version
Expand All @@ -15,14 +16,14 @@
get_framework_and_version_from_tag,
get_inference_server_type,
get_cuda_version_from_tag,
login_to_ecr_registry,
)
from test.test_utils.ec2 import (
get_ec2_instance_type,
execute_ec2_inference_test,
get_ec2_accelerator_type,
)
from test.dlc_tests.conftest import LOGGER
import boto3

LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.StreamHandler(sys.stdout))
Expand Down Expand Up @@ -238,7 +239,8 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region):
f" {image_uri} {inference_cmd}"
)
try:
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = boto3.client("sts").get_caller_identity()["Account"]
login_to_ecr_registry(ec2_connection, account_id, region)
LOGGER.info(docker_run_cmd)
ec2_connection.run(docker_run_cmd, hide=True)
server_type = get_inference_server_type(image_uri)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from packaging.version import Version
from packaging.specifiers import SpecifierSet
import pytest
import boto3

from test import test_utils
from test.test_utils import (
CONTAINER_TESTS_PREFIX,
get_framework_and_version_from_tag,
get_inference_server_type,
UL20_CPU_ARM64_US_WEST_2,
login_to_ecr_registry,
)
from test.test_utils.ec2 import (
get_ec2_instance_type,
Expand Down Expand Up @@ -77,7 +79,8 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region):
f" {image_uri} {inference_cmd}"
)
try:
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = boto3.client("sts").get_caller_identity()["Account"]
login_to_ecr_registry(ec2_connection, account_id, region)
LOGGER.info(docker_run_cmd)
ec2_connection.run(docker_run_cmd, hide=True)
server_type = get_inference_server_type(image_uri)
Expand Down
13 changes: 8 additions & 5 deletions test/dlc_tests/ec2/pytorch/training/common_cases.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import boto3

from packaging.version import Version
from packaging.specifiers import SpecifierSet
Expand All @@ -9,6 +10,7 @@
CONTAINER_TESTS_PREFIX,
get_framework_and_version_from_tag,
get_cuda_version_from_tag,
login_to_ecr_registry,
)
from test.test_utils.ec2 import (
execute_ec2_training_test,
Expand Down Expand Up @@ -189,7 +191,8 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region):
PT 2.1 reintroduces a dependency on CUDNN to support NVDA TransformerEngine. This test is to ensure that torch CUDNN matches system CUDNN in the container.
"""
container_name = "pt_cudnn_test"
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = boto3.client("sts").get_caller_identity()["Account"]
login_to_ecr_registry(ec2_connection, account_id, region)
ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True)
ec2_connection.run(
f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True
Expand Down Expand Up @@ -221,6 +224,10 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region):
), f"System CUDNN {system_cudnn} and torch cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with correct CUDNN verson."


def pytorch_curand_gpu(pytorch_training, ec2_connection):
    """Run the CURAND_CMD container test against the given PyTorch training image on the EC2 host."""
    test_cmd = CURAND_CMD
    execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)


def pytorch_linear_regression_cpu(pytorch_training, ec2_connection):
execute_ec2_training_test(
ec2_connection, pytorch_training, PT_REGRESSION_CMD, container_name="pt_reg"
Expand All @@ -240,10 +247,6 @@ def pytorch_telemetry_cpu(pytorch_training, ec2_connection):
)


def curand_gpu(training, ec2_connection):
    """Run the CURAND_CMD container test against the given training image on the EC2 host."""
    test_cmd = CURAND_CMD
    execute_ec2_training_test(ec2_connection, training, test_cmd)


def pytorch_training_torchdata(pytorch_training, ec2_connection):
_, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
# HACK including PT 1.13 in this condition because the Torchdata 0.5.0 tag includes old tests data
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import boto3

from packaging.version import Version
from packaging.specifiers import SpecifierSet
Expand All @@ -12,6 +13,7 @@
UBUNTU_18_HPU_DLAMI_US_WEST_2,
get_framework_and_version_from_tag,
get_cuda_version_from_tag,
login_to_ecr_registry,
)
from test.test_utils.ec2 import (
execute_ec2_training_test,
Expand Down Expand Up @@ -747,7 +749,8 @@ def test_pytorch_cudnn_match_gpu(
PT 2.1 reintroduces a dependency on CUDNN to support NVDA TransformerEngine. This test is to ensure that torch CUDNN matches system CUDNN in the container.
"""
container_name = "pt_cudnn_test"
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = boto3.client("sts").get_caller_identity()["Account"]
login_to_ecr_registry(ec2_connection, account_id, region)
ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True)
ec2_connection.run(
f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def test_pytorch_2_2_gpu(
(common_cases.nvapex, (pytorch_training, ec2_connection)),
(common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
(common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
(common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
sirutBuasai marked this conversation as resolved.
Show resolved Hide resolved
]

if "sagemaker" in pytorch_training:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
from time import sleep
import pytest
import boto3

from packaging.version import Version
from packaging.specifiers import SpecifierSet
Expand Down Expand Up @@ -110,7 +111,8 @@ def test_ec2_tensorflow_inference_gpu_tensorrt(
)

try:
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = boto3.client("sts").get_caller_identity()["Account"]
test_utils.login_to_ecr_registry(ec2_connection, account_id, region)
host_setup_for_tensorflow_inference(serving_folder_path, framework_version, ec2_connection)
sleep(2)

Expand Down Expand Up @@ -268,7 +270,8 @@ def run_ec2_tensorflow_inference(
if not is_neuron:
train_mnist_model(serving_folder_path, ec2_connection)
sleep(10)
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = boto3.client("sts").get_caller_identity()["Account"]
test_utils.login_to_ecr_registry(ec2_connection, account_id, region)
ec2_connection.run(docker_run_cmd, hide=True)
sleep(20)
if is_neuron and str(framework_version).startswith(TENSORFLOW2_VERSION):
Expand Down
3 changes: 0 additions & 3 deletions test/dlc_tests/ec2/test_gdrcopy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@
@pytest.mark.team("conda")
@pytest.mark.integration("gdrcopy")
@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION)
@pytest.mark.parametrize(
"ec2_instance_ami", [test_utils.UBUNTU_20_BASE_DLAMI_US_WEST_2], indirect=True
)
@pytest.mark.skipif(
is_pr_context() and not are_heavy_instance_ec2_tests_enabled(),
reason="Skip GDRCopy test in PR context unless explicitly enabled",
Expand Down
Loading
Loading