From 840aef6ad25000ee139675ecd5c721a04330b0d9 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Fri, 8 Mar 2024 15:44:12 -0800 Subject: [PATCH 01/47] Update DLAMI BASE AMI Logic to switch between OSS and Proprietary Nvidia Driver AMI --- dlc_developer_config.toml | 8 +-- test/dlc_tests/conftest.py | 80 +++++++++++++++++++++++++----- test/dlc_tests/ec2/test_gdrcopy.py | 2 +- test/test_utils/__init__.py | 58 ++++++++++++++++++---- 4 files changed, 119 insertions(+), 29 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 10c783d4eb22..9de117d68e19 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -34,11 +34,11 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -67,7 +67,7 @@ ec2_benchmark_tests = false ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true ### SM specific tests ### Off by default @@ -102,7 +102,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-mxnet-training = "" -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-2-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index ddfc67dd3d8f..a5b45540ad4d 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -31,11 +31,19 @@ is_nightly_context, DEFAULT_REGION, P3DN_REGION, - UBUNTU_20_BASE_DLAMI_US_EAST_1, - UBUNTU_20_BASE_DLAMI_US_WEST_2, + # UBUNTU_20_BASE_DLAMI_US_EAST_1, + # UBUNTU_20_BASE_DLAMI_US_WEST_2, + UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1, + UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, + UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1, + UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2, PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1, - AML2_BASE_DLAMI_US_WEST_2, - AML2_BASE_DLAMI_US_EAST_1, + # AML2_BASE_DLAMI_US_WEST_2, + # AML2_BASE_DLAMI_US_EAST_1, + AML2_BASE_OSS_DLAMI_US_WEST_2, + AML2_BASE_OSS_DLAMI_US_EAST_1, + AML2_BASE_PROPRIETARY_DLAMI_US_WEST_2, + AML2_BASE_PROPRIETARY_DLAMI_US_EAST_1, KEYS_TO_DESTROY_FILE, are_efa_tests_disabled, get_repository_and_tag_from_image_uri, @@ -330,18 +338,11 @@ def ec2_instance_role_name(request): @pytest.fixture(scope="function") -def ec2_instance_ami(request, region): +def ec2_instance_ami(request, region, ec2_instance_type): return ( request.param if hasattr(request, "param") - else UBUNTU_20_BASE_DLAMI_US_EAST_1 - if region == "us-east-1" - else UBUNTU_20_BASE_DLAMI_US_WEST_2 - if region == "us-west-2" - else test_utils.get_ami_id_boto3( - region_name=region, - ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????", - ) + else _get_instance_type_base_dlami(ec2_instance_type, region) ) @@ -715,6 +716,59 @@ def terminate_ec2_instance(): return 
instance_id, key_filename +def _get_instance_type_base_dlami(instance_type, region): + ubuntu_20_base_oss_dlami_instances = ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] + ubuntu_20_base_proprietary_dlami_instances = [ + "p3.2xlarge", + "p3.8xlarge", + "p3.16xlarge", + "p3dn.24xlarge", + "g3s.xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3.16xlarge", + "g4dn.xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.16xlarge", + "g4dn.12xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.xlarge", + "g5.2xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.16xlarge", + "g5.12xlarge", + "g5.24xlarge" + "g5.48xlarge" + ] + + if instance_type in ubuntu_20_base_oss_dlami_instances: + return ( + UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1 + if region == "us-east-1" + else UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 + if region == "us-west-2" + else test_utils.get_ami_id_boto3( + region_name=region, + ami_name_pattern="Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", + ) + ) + elif instance_type in ubuntu_20_base_proprietary_dlami_instances: + return ( + UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1 + if region == "us-east-1" + else UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2 + if region == "us-west-2" + else test_utils.get_ami_id_boto3( + region_name=region, + ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", + ) + ) + + def is_neuron_image(fixtures): """ Returns true if a neuron fixture is present in request.fixturenames diff --git a/test/dlc_tests/ec2/test_gdrcopy.py b/test/dlc_tests/ec2/test_gdrcopy.py index cdacbfac02e5..a879939f00af 100644 --- a/test/dlc_tests/ec2/test_gdrcopy.py +++ b/test/dlc_tests/ec2/test_gdrcopy.py @@ -24,7 +24,7 @@ @pytest.mark.integration("gdrcopy") @pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION) @pytest.mark.parametrize( - "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_DLAMI_US_WEST_2], indirect=True + "ec2_instance_ami", 
[test_utils.UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2], indirect=True ) @pytest.mark.skipif( is_pr_context() and not are_heavy_instance_ec2_tests_enabled(), diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index be8fc7fff395..f98ac8fcf00a 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -78,18 +78,50 @@ def get_ami_id_ssm(region_name, parameter_path): return ami_id -# The Ubuntu 20.04 AMI which adds GDRCopy is used only for GDRCopy feature that is supported on PT1.13 and PT2.0 -UBUNTU_20_BASE_DLAMI_US_WEST_2 = get_ami_id_boto3( - region_name="us-west-2", ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????" +# DLAMI Base is split between OSS Nvidia Driver and Propietary Nvidia Driver. sett https://docs.aws.amazon.com/dlami/latest/devguide/important-changes.html +# UBUNTU_20_BASE_DLAMI_US_WEST_2 = get_ami_id_boto3( +# region_name="us-west-2", ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????" +# ) +# UBUNTU_20_BASE_DLAMI_US_EAST_1 = get_ami_id_boto3( +# region_name="us-east-1", ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????" +# ) +UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 = get_ami_id_boto3( + region_name="us-west-2", + ami_name_pattern="Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", ) -UBUNTU_20_BASE_DLAMI_US_EAST_1 = get_ami_id_boto3( - region_name="us-east-1", ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????" +UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1 = get_ami_id_boto3( + region_name="us-east-1", + ami_name_pattern="Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", ) -AML2_BASE_DLAMI_US_WEST_2 = get_ami_id_boto3( - region_name="us-west-2", ami_name_pattern="Deep Learning Base AMI (Amazon Linux 2) Version ??.?" 
+UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2 = get_ami_id_boto3( + region_name="us-west-2", + ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", ) -AML2_BASE_DLAMI_US_EAST_1 = get_ami_id_boto3( - region_name="us-east-1", ami_name_pattern="Deep Learning Base AMI (Amazon Linux 2) Version ??.?" +UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1 = get_ami_id_boto3( + region_name="us-east-1", + ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", +) +# AML2_BASE_DLAMI_US_WEST_2 = get_ami_id_boto3( +# region_name="us-west-2", ami_name_pattern="Deep Learning Base AMI (Amazon Linux 2) Version ??.?" +# ) +# AML2_BASE_DLAMI_US_EAST_1 = get_ami_id_boto3( +# region_name="us-east-1", ami_name_pattern="Deep Learning Base AMI (Amazon Linux 2) Version ??.?" +# ) +AML2_BASE_OSS_DLAMI_US_WEST_2 = get_ami_id_boto3( + region_name="us-west-2", + ami_name_pattern="Deep Learning Base OSS Nvidia Driver AMI (Amazon Linux 2) Version ??.?", +) +AML2_BASE_OSS_DLAMI_US_EAST_1 = get_ami_id_boto3( + region_name="us-east-1", + ami_name_pattern="Deep Learning Base OSS Nvidia Driver AMI (Amazon Linux 2) Version ??.?", +) +AML2_BASE_PROPRIETARY_DLAMI_US_WEST_2 = get_ami_id_boto3( + region_name="us-west-2", + ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver AMI (Amazon Linux 2) Version ??.?", +) +AML2_BASE_PROPRIETARY_DLAMI_US_EAST_1 = get_ami_id_boto3( + region_name="us-east-1", + ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver AMI (Amazon Linux 2) Version ??.?", ) # We use the following DLAMI for MXNet and TensorFlow tests as well, but this is ok since we use custom DLC Graviton containers on top. We just need an ARM base DLAMI. 
UL20_CPU_ARM64_US_WEST_2 = get_ami_id_boto3( @@ -145,8 +177,12 @@ def get_ami_id_ssm(region_name, parameter_path): UBUNTU_18_HPU_DLAMI_US_WEST_2 = "ami-03cdcfc91a96a8f92" UBUNTU_18_HPU_DLAMI_US_EAST_1 = "ami-0d83d7487f322545a" UL_AMI_LIST = [ - UBUNTU_20_BASE_DLAMI_US_WEST_2, - UBUNTU_20_BASE_DLAMI_US_EAST_1, + # UBUNTU_20_BASE_DLAMI_US_WEST_2, + # UBUNTU_20_BASE_DLAMI_US_EAST_1, + UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, + UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1, + UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2, + UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1, UBUNTU_18_HPU_DLAMI_US_WEST_2, UBUNTU_18_HPU_DLAMI_US_EAST_1, PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1, From 95f6e46bfb198417c0098825b6804201de1e109d Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Fri, 8 Mar 2024 15:44:53 -0800 Subject: [PATCH 02/47] update gdrcopy to 2.4 --- pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu b/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu index fadd4ee396f1..e64e521c601b 100644 --- a/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu +++ b/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu @@ -63,7 +63,7 @@ ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" ENV CUDNN_VERSION=8.9.2.26 ENV NCCL_VERSION=2.19.4 ENV EFA_VERSION=1.30.0 -ENV GDRCOPY_VERSION=2.3.1 +ENV GDRCOPY_VERSION=2.4.1 ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" ENV OPEN_MPI_PATH=/opt/amazon/openmpi From 2e4b09bb7d1f18cb49cafa29103a11aafa0563bf Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Fri, 8 Mar 2024 15:58:41 -0800 Subject: [PATCH 03/47] formatting --- test/dlc_tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index a5b45540ad4d..781b6b525cea 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -741,8 +741,8 @@ def _get_instance_type_base_dlami(instance_type, 
region): "g5.8xlarge", "g5.16xlarge", "g5.12xlarge", - "g5.24xlarge" - "g5.48xlarge" + "g5.24xlarge", + "g5.48xlarge", ] if instance_type in ubuntu_20_base_oss_dlami_instances: From c28d32bbb2b9184c082c2ec530bdbf89836e0ac2 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Sun, 10 Mar 2024 21:54:40 -0700 Subject: [PATCH 04/47] disable buiild and fix sm local test instance ami --- dlc_developer_config.toml | 12 +++---- test/dlc_tests/conftest.py | 55 +------------------------------- test/test_utils/__init__.py | 62 ++++++++++++++++++++++++++++++++++++ test/test_utils/sagemaker.py | 21 ++++++------ 4 files changed, 80 insertions(+), 70 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 9de117d68e19..dc8f1b5f1e03 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false autopatch_build = false [notify] @@ -71,16 +71,16 @@ ec2_tests_on_heavy_instances = true ### SM specific tests ### Off by default -sagemaker_local_tests = false +sagemaker_local_tests = true # run standard sagemaker remote tests from test/sagemaker_tests -sagemaker_remote_tests = false +sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = false +sagemaker_efa_tests = true # run release_candidate_integration tests -sagemaker_rc_tests = false +sagemaker_rc_tests = true # run sagemaker benchmark tests -sagemaker_benchmark_tests = false +sagemaker_benchmark_tests = true # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 781b6b525cea..c3fcd6b165c5 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -342,7 +342,7 @@ def ec2_instance_ami(request, region, ec2_instance_type): return ( 
request.param if hasattr(request, "param") - else _get_instance_type_base_dlami(ec2_instance_type, region) + else test_utils.get_instance_type_base_dlami(ec2_instance_type, region) ) @@ -716,59 +716,6 @@ def terminate_ec2_instance(): return instance_id, key_filename -def _get_instance_type_base_dlami(instance_type, region): - ubuntu_20_base_oss_dlami_instances = ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] - ubuntu_20_base_proprietary_dlami_instances = [ - "p3.2xlarge", - "p3.8xlarge", - "p3.16xlarge", - "p3dn.24xlarge", - "g3s.xlarge", - "g3.4xlarge", - "g3.8xlarge", - "g3.16xlarge", - "g4dn.xlarge", - "g4dn.2xlarge", - "g4dn.4xlarge", - "g4dn.8xlarge", - "g4dn.16xlarge", - "g4dn.12xlarge", - "g4dn.metal", - "g4dn.xlarge", - "g5.xlarge", - "g5.2xlarge", - "g5.4xlarge", - "g5.8xlarge", - "g5.16xlarge", - "g5.12xlarge", - "g5.24xlarge", - "g5.48xlarge", - ] - - if instance_type in ubuntu_20_base_oss_dlami_instances: - return ( - UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1 - if region == "us-east-1" - else UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 - if region == "us-west-2" - else test_utils.get_ami_id_boto3( - region_name=region, - ami_name_pattern="Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", - ) - ) - elif instance_type in ubuntu_20_base_proprietary_dlami_instances: - return ( - UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1 - if region == "us-east-1" - else UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2 - if region == "us-west-2" - else test_utils.get_ami_id_boto3( - region_name=region, - ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", - ) - ) - - def is_neuron_image(fixtures): """ Returns true if a neuron fixture is present in request.fixturenames diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index f98ac8fcf00a..043a5e5f5358 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -309,6 +309,13 @@ class SerialTestCaseExecutorException(Exception): pass +class 
UnsupportedInstanceTypeBaseDLAMI(Exception): + """ + Raise for get_instance_type_base_dlami function for supported Base DLAMI instance types + """ + pass + + class EnhancedJSONEncoder(json.JSONEncoder): """ EnhancedJSONEncoder is required to dump dataclass objects as JSON. @@ -2406,3 +2413,58 @@ def get_image_spec_from_buildspec(image_uri, dlc_folder_path): raise ValueError(f"No corresponding entry found for {image_uri} in {buildspec_path}") return matched_image_spec + + +def get_instance_type_base_dlami(instance_type, region): + base_oss_dlami_instances = ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] + base_proprietary_dlami_instances = [ + "p3.2xlarge", + "p3.8xlarge", + "p3.16xlarge", + "p3dn.24xlarge", + "g3s.xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3.16xlarge", + "g4dn.xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.16xlarge", + "g4dn.12xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.xlarge", + "g5.2xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.16xlarge", + "g5.12xlarge", + "g5.24xlarge", + "g5.48xlarge", + ] + + if instance_type in base_oss_dlami_instances: + return ( + UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1 + if region == "us-east-1" + else UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 + if region == "us-west-2" + else get_ami_id_boto3( + region_name=region, + ami_name_pattern="Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", + ) + ) + elif instance_type in base_proprietary_dlami_instances: + return ( + UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1 + if region == "us-east-1" + else UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2 + if region == "us-west-2" + else get_ami_id_boto3( + region_name=region, + ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", + ) + ) + else: + raise UnsupportedInstanceTypeBaseDLAMI(f"Base DLAMI does not support selected instance type {instance_type}") diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py index 0c533bfd2561..13eefae6961f 
100644 --- a/test/test_utils/sagemaker.py +++ b/test/test_utils/sagemaker.py @@ -25,8 +25,12 @@ SAGEMAKER_EXECUTION_REGIONS, SAGEMAKER_NEURON_EXECUTION_REGIONS, SAGEMAKER_NEURONX_EXECUTION_REGIONS, - UBUNTU_20_BASE_DLAMI_US_EAST_1, - UBUNTU_20_BASE_DLAMI_US_WEST_2, + # UBUNTU_20_BASE_DLAMI_US_EAST_1, + # UBUNTU_20_BASE_DLAMI_US_WEST_2, + UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1, + UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, + UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1, + UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2, UL20_CPU_ARM64_US_EAST_1, UL20_CPU_ARM64_US_WEST_2, SAGEMAKER_LOCAL_TEST_TYPE, @@ -34,6 +38,7 @@ UBUNTU_HOME_DIR, DEFAULT_REGION, is_nightly_context, + get_instance_type_base_dlami, ) from test_utils.pytest_cache import PytestCache @@ -85,7 +90,7 @@ def assign_sagemaker_local_job_instance_type(image): return "p3.8xlarge" if "gpu" in image else "c5.18xlarge" -def assign_sagemaker_local_test_ami(image, region): +def assign_sagemaker_local_test_ami(image, region, instance_type): """ Helper function to get the needed AMI for launching the image. 
Needed to support Graviton(ARM) images @@ -96,13 +101,10 @@ def assign_sagemaker_local_test_ami(image, region): else: return UL20_CPU_ARM64_US_WEST_2 else: - if region == "us-east-1": - return UBUNTU_20_BASE_DLAMI_US_EAST_1 - else: - return UBUNTU_20_BASE_DLAMI_US_WEST_2 + return get_instance_type_base_dlami(instance_type, region) -def launch_sagemaker_local_ec2_instance(image, ami_id, ec2_key_name, region): +def launch_sagemaker_local_ec2_instance(image, ec2_key_name, region): """ Launch Ec2 instance for running sagemaker local tests :param image: str @@ -112,6 +114,7 @@ def launch_sagemaker_local_ec2_instance(image, ami_id, ec2_key_name, region): :return: str, str """ instance_type = assign_sagemaker_local_job_instance_type(image) + ami_id = assign_sagemaker_local_test_ami(image, region, instance_type) instance_name = image.split("/")[-1] instance = ec2_utils.launch_instance( ami_id, @@ -300,7 +303,6 @@ def execute_local_tests(image, pytest_cache_params): random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}") ec2_key_name = f"{job_type}_{tag}_sagemaker_{random.randint(1, 1000)}" region = os.getenv("AWS_REGION", DEFAULT_REGION) - ec2_ami_id = assign_sagemaker_local_test_ami(image, region) sm_tests_tar_name = "sagemaker_tests.tar.gz" ec2_test_report_path = os.path.join(UBUNTU_HOME_DIR, "test", f"{job_type}_{tag}_sm_local.xml") instance_id = "" @@ -310,7 +312,6 @@ def execute_local_tests(image, pytest_cache_params): print(f"Launching new Instance for image: {image}") instance_id, ip_address = launch_sagemaker_local_ec2_instance( image, - ec2_ami_id, ec2_key_name, region, ) From 61212b4ba8613422b5c8cdcc2598a7f28fc7b130 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Sun, 10 Mar 2024 22:29:23 -0700 Subject: [PATCH 05/47] use proprietary drier dlami as default --- test/test_utils/__init__.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 
043a5e5f5358..d825b6de1aba 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -313,6 +313,7 @@ class UnsupportedInstanceTypeBaseDLAMI(Exception): """ Raise for get_instance_type_base_dlami function for supported Base DLAMI instance types """ + pass @@ -2467,4 +2468,18 @@ def get_instance_type_base_dlami(instance_type, region): ) ) else: - raise UnsupportedInstanceTypeBaseDLAMI(f"Base DLAMI does not support selected instance type {instance_type}") + return ( + UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1 + if region == "us-east-1" + else UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2 + if region == "us-west-2" + else get_ami_id_boto3( + region_name=region, + ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", + ) + ) + # raise UnsupportedInstanceTypeBaseDLAMI( + # f"Base DLAMI does not support selected instance type {instance_type}.\n" + # f"Currently supported instance type for OSS Nvidia Driver Base DLAMI: {base_oss_dlami_instances}.\n" + # f"Currently supported instance type for Proprietary Nvidia Driver Base DLAMI: {base_proprietary_dlami_instances}." 
+ # ) From a0f8f8249aef2d3deeea1b1ce08adfe198f30821 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 11 Mar 2024 00:12:51 -0700 Subject: [PATCH 06/47] fix ul20 and aml2 dlami name logic and test only ec2 --- dlc_developer_config.toml | 18 ++--- test/dlc_tests/conftest.py | 11 ++- test/test_utils/__init__.py | 141 ++++++++++++++++++------------------ 3 files changed, 88 insertions(+), 82 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index dc8f1b5f1e03..404b5208f99d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -54,11 +54,11 @@ notify_test_failures = false [test] ### On by default -sanity_tests = true +sanity_tests = false safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = true -eks_tests = true +ecs_tests = false +eks_tests = false ec2_tests = true # Set it to true if you are preparing a Benchmark related PR ec2_benchmark_tests = false @@ -67,20 +67,20 @@ ec2_benchmark_tests = false ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
### Off by default (set to false) -ec2_tests_on_heavy_instances = true +ec2_tests_on_heavy_instances = false ### SM specific tests ### Off by default -sagemaker_local_tests = true +sagemaker_local_tests = false # run standard sagemaker remote tests from test/sagemaker_tests -sagemaker_remote_tests = true +sagemaker_remote_tests = false # run efa sagemaker tests -sagemaker_efa_tests = true +sagemaker_efa_tests = false # run release_candidate_integration tests -sagemaker_rc_tests = true +sagemaker_rc_tests = false # run sagemaker benchmark tests -sagemaker_benchmark_tests = true +sagemaker_benchmark_tests = false # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index c3fcd6b165c5..0a130085166a 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -565,9 +565,14 @@ def ec2_instance( ) if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1: ec2_instance_ami = ( - AML2_BASE_DLAMI_US_EAST_1 - if ec2_instance_ami == AML2_BASE_DLAMI_US_WEST_2 - else UBUNTU_20_BASE_DLAMI_US_EAST_1 + test_utils.get_instance_type_base_dlami( + ec2_instance_type, "us-east-1", linux_dist="AML2" + ) + if ec2_instance_ami + == test_utils.get_instance_type_base_dlami( + ec2_instance_type, "us-west-2", linux_dist="AML2" + ) + else test_utils.get_instance_type_base_dlami(ec2_instance_type, "us-east-1") ) ec2_key_name = f"{ec2_key_name}-{str(uuid.uuid4())}" diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index d825b6de1aba..f6c771108c95 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -78,7 +78,7 @@ def get_ami_id_ssm(region_name, parameter_path): return ami_id -# DLAMI Base is split between OSS Nvidia Driver and Propietary Nvidia Driver. sett https://docs.aws.amazon.com/dlami/latest/devguide/important-changes.html +# DLAMI Base is split between OSS Nvidia Driver and Propietary Nvidia Driver. 
see https://docs.aws.amazon.com/dlami/latest/devguide/important-changes.html # UBUNTU_20_BASE_DLAMI_US_WEST_2 = get_ami_id_boto3( # region_name="us-west-2", ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????" # ) @@ -309,14 +309,6 @@ class SerialTestCaseExecutorException(Exception): pass -class UnsupportedInstanceTypeBaseDLAMI(Exception): - """ - Raise for get_instance_type_base_dlami function for supported Base DLAMI instance types - """ - - pass - - class EnhancedJSONEncoder(json.JSONEncoder): """ EnhancedJSONEncoder is required to dump dataclass objects as JSON. @@ -2416,70 +2408,79 @@ def get_image_spec_from_buildspec(image_uri, dlc_folder_path): return matched_image_spec -def get_instance_type_base_dlami(instance_type, region): +def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): + """ + Get Instance types based on EC2 instance, see https://docs.aws.amazon.com/dlami/latest/devguide/important-changes.html + OSS Nvidia Driver DLAMI supports the following: ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] + Proprietary Nvidia Driver DLAMI supports the following: ["p3.2xlarge", + "p3.8xlarge", + "p3.16xlarge", + "p3dn.24xlarge", + "g3s.xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3.16xlarge", + "g4dn.xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.16xlarge", + "g4dn.12xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.xlarge", + "g5.2xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.16xlarge", + "g5.12xlarge", + "g5.24xlarge", + "g5.48xlarge"] + + Other instances will default to Proprietary Nvidia Driver DLAMI + """ + base_oss_dlami_instances = ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] - base_proprietary_dlami_instances = [ - "p3.2xlarge", - "p3.8xlarge", - "p3.16xlarge", - "p3dn.24xlarge", - "g3s.xlarge", - "g3.4xlarge", - "g3.8xlarge", - "g3.16xlarge", - "g4dn.xlarge", - "g4dn.2xlarge", - "g4dn.4xlarge", - "g4dn.8xlarge", - "g4dn.16xlarge", - "g4dn.12xlarge", - "g4dn.metal", - "g4dn.xlarge", - 
"g5.xlarge", - "g5.2xlarge", - "g5.4xlarge", - "g5.8xlarge", - "g5.16xlarge", - "g5.12xlarge", - "g5.24xlarge", - "g5.48xlarge", - ] - if instance_type in base_oss_dlami_instances: - return ( - UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1 - if region == "us-east-1" - else UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 - if region == "us-west-2" - else get_ami_id_boto3( - region_name=region, - ami_name_pattern="Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", - ) + # set defaults + if linux_dist == "AML2": + oss_dlami_us_east_1 = AML2_BASE_OSS_DLAMI_US_EAST_1 + oss_dlami_us_west_2 = AML2_BASE_OSS_DLAMI_US_WEST_2 + oss_dlami_name_pattern = ( + "Deep Learning Base OSS Nvidia Driver AMI (Amazon Linux 2) Version ??.?" ) - elif instance_type in base_proprietary_dlami_instances: - return ( - UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1 - if region == "us-east-1" - else UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2 - if region == "us-west-2" - else get_ami_id_boto3( - region_name=region, - ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", - ) + + proprietary_dlami_us_east_1 = AML2_BASE_PROPRIETARY_DLAMI_US_EAST_1 + proprietary_dlami_us_west_2 = AML2_BASE_PROPRIETARY_DLAMI_US_WEST_2 + proprietary_dlami_name_pattern = ( + "Deep Learning Base Proprietary Nvidia Driver AMI (Amazon Linux 2) Version ??.?" ) else: - return ( - UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1 - if region == "us-east-1" - else UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2 - if region == "us-west-2" - else get_ami_id_boto3( - region_name=region, - ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", - ) + oss_dlami_us_east_1 = UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1 + oss_dlami_us_west_2 = UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 + oss_dlami_name_pattern = ( + "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????" 
+ ) + + proprietary_dlami_us_east_1 = UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1 + proprietary_dlami_us_west_2 = UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 + proprietary_dlami_name_pattern = ( + "Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????" ) - # raise UnsupportedInstanceTypeBaseDLAMI( - # f"Base DLAMI does not support selected instance type {instance_type}.\n" - # f"Currently supported instance type for OSS Nvidia Driver Base DLAMI: {base_oss_dlami_instances}.\n" - # f"Currently supported instance type for Proprietary Nvidia Driver Base DLAMI: {base_proprietary_dlami_instances}." - # ) + + return ( + oss_dlami_us_east_1 + if region == "us-east-1" and instance_type in base_oss_dlami_instances + else oss_dlami_us_west_2 + if region == "us-west-2" and instance_type in base_oss_dlami_instances + else get_ami_id_boto3( + region_name=region, + ami_name_pattern=oss_dlami_name_pattern, + ) + if instance_type in base_oss_dlami_instances + else proprietary_dlami_us_east_1 + if region == "us-east-1" + else proprietary_dlami_us_west_2 + if region == "us-west-2" + else get_ami_id_boto3(region_name=region, ami_name_pattern=proprietary_dlami_name_pattern) + ) From ce3d3da53ee62faaeb61e71cd637899e5b794d1f Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 11 Mar 2024 11:01:24 -0700 Subject: [PATCH 07/47] allow test efa --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 404b5208f99d..4754047b504c 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -67,7 +67,7 @@ ec2_benchmark_tests = false ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true ### SM specific tests ### Off by default From 00fba940232fc9df474144dc49a92fe3b5e07c4f Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 11 Mar 2024 12:49:26 -0700 Subject: [PATCH 08/47] update oss dlami list --- test/test_utils/__init__.py | 68 ++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index f6c771108c95..fc407cb09159 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -2411,7 +2411,26 @@ def get_image_spec_from_buildspec(image_uri, dlc_folder_path): def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): """ Get Instance types based on EC2 instance, see https://docs.aws.amazon.com/dlami/latest/devguide/important-changes.html - OSS Nvidia Driver DLAMI supports the following: ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] + OSS Nvidia Driver DLAMI supports the following: ["g4dn.xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.16xlarge", + "g4dn.12xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.xlarge", + "g5.2xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.16xlarge", + "g5.12xlarge", + "g5.24xlarge", + "g5.48xlarge", + "p4d.24xlarge", + "p4de.24xlarge", + "p5.48xlarge",] + Proprietary Nvidia Driver DLAMI supports the following: ["p3.2xlarge", "p3.8xlarge", "p3.16xlarge", @@ -2419,28 +2438,21 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): "g3s.xlarge", "g3.4xlarge", "g3.8xlarge", - "g3.16xlarge", - "g4dn.xlarge", - "g4dn.2xlarge", - "g4dn.4xlarge", - "g4dn.8xlarge", - "g4dn.16xlarge", - "g4dn.12xlarge", - "g4dn.metal", - "g4dn.xlarge", - "g5.xlarge", - "g5.2xlarge", - "g5.4xlarge", - "g5.8xlarge", - "g5.16xlarge", - "g5.12xlarge", - "g5.24xlarge", - "g5.48xlarge"] + "g3.16xlarge",] Other instances will default to Proprietary 
Nvidia Driver DLAMI """ - base_oss_dlami_instances = ["p4d.24xlarge", "p4de.24xlarge", "p5.48xlarge"] + base_proprietary_dlami_instances = [ + "p3.2xlarge", + "p3.8xlarge", + "p3.16xlarge", + "p3dn.24xlarge", + "g3s.xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3.16xlarge", + ] # set defaults if linux_dist == "AML2": @@ -2469,18 +2481,18 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): ) return ( - oss_dlami_us_east_1 - if region == "us-east-1" and instance_type in base_oss_dlami_instances - else oss_dlami_us_west_2 - if region == "us-west-2" and instance_type in base_oss_dlami_instances + proprietary_dlami_us_east_1 + if region == "us-east-1" and instance_type in base_proprietary_dlami_instances + else proprietary_dlami_us_west_2 + if region == "us-west-2" and instance_type in base_proprietary_dlami_instances else get_ami_id_boto3( region_name=region, - ami_name_pattern=oss_dlami_name_pattern, + ami_name_pattern=proprietary_dlami_name_pattern, ) - if instance_type in base_oss_dlami_instances - else proprietary_dlami_us_east_1 + if instance_type in base_proprietary_dlami_instances + else oss_dlami_us_east_1 if region == "us-east-1" - else proprietary_dlami_us_west_2 + else oss_dlami_us_west_2 if region == "us-west-2" - else get_ami_id_boto3(region_name=region, ami_name_pattern=proprietary_dlami_name_pattern) + else get_ami_id_boto3(region_name=region, ami_name_pattern=oss_dlami_name_pattern) ) From 434fbdcf30ce49faae67732381f537d38e146472 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 11 Mar 2024 15:59:46 -0700 Subject: [PATCH 09/47] test curand --- test/dlc_tests/ec2/pytorch/training/common_cases.py | 8 ++++---- .../ec2/pytorch/training/test_pytorch_training_2_2.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index a1ee96a4edaa..edd7bfc18412 100644 --- 
a/test/dlc_tests/ec2/pytorch/training/common_cases.py
+++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py
@@ -219,6 +219,10 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region):
     ), f"System CUDNN {system_cudnn} and torch cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with correct CUDNN verson."
 
 
+def pytorch_curand_gpu(pytorch_training, ec2_connection):
+    execute_ec2_training_test(ec2_connection, pytorch_training, CURAND_CMD)
+
+
 def pytorch_linear_regression_cpu(pytorch_training, ec2_connection):
     execute_ec2_training_test(
         ec2_connection, pytorch_training, PT_REGRESSION_CMD, container_name="pt_reg"
@@ -236,7 +240,3 @@ def pytorch_telemetry_cpu(pytorch_training, ec2_connection):
     execute_ec2_training_test(
         ec2_connection, pytorch_training, PT_TELEMETRY_CMD, timeout=900, container_name="telemetry"
     )
-
-
-def curand_gpu(training, ec2_connection):
-    execute_ec2_training_test(ec2_connection, training, CURAND_CMD)
diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py
index 8818339cbbfd..cbb2612c6a8e 100644
--- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py
+++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py
@@ -34,6 +34,8 @@ def test_pytorch_2_2_gpu(
         (common_cases.nvapex, (pytorch_training, ec2_connection)),
         (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
         (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
+        (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
+        (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection, region)),
     ]
 
     if "sagemaker" in pytorch_training:

From 115c33c7f8ff62127e2d8be466620191f3d3d7c2 Mon Sep 17 00:00:00 2001
From: Sirut Buasai 
Date: Mon, 11 Mar 2024 17:16:57 -0700
Subject: [PATCH 10/47] ensure ec2 instance type fixture is run before ec2 instance 
ami --- test/dlc_tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 0a130085166a..812fab7acb9f 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -338,7 +338,7 @@ def ec2_instance_role_name(request): @pytest.fixture(scope="function") -def ec2_instance_ami(request, region, ec2_instance_type): +def ec2_instance_ami(ec2_instance_type, request, region): return ( request.param if hasattr(request, "param") From 092b14b4901ef51bd4d7c3aff86c4502ed45bf71 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Tue, 12 Mar 2024 11:21:30 -0700 Subject: [PATCH 11/47] alter ami pulling logic --- test/dlc_tests/conftest.py | 9 +++++++-- .../ec2/pytorch/training/test_pytorch_training_2_2.py | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 812fab7acb9f..a6f7e89d2fa8 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -338,11 +338,16 @@ def ec2_instance_role_name(request): @pytest.fixture(scope="function") -def ec2_instance_ami(ec2_instance_type, request, region): +def ec2_instance_ami(request, region): + if "ec2_instance_type" in request.fixturenames: + local_instance_type = request.getfixturevalue("ec2_instance_type") + else: + return + return ( request.param if hasattr(request, "param") - else test_utils.get_instance_type_base_dlami(ec2_instance_type, region) + else test_utils.get_instance_type_base_dlami(local_instance_type, region) ) diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py index cbb2612c6a8e..4427c40810c4 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py @@ -34,8 +34,7 @@ def test_pytorch_2_2_gpu( (common_cases.nvapex, (pytorch_training, ec2_connection)), 
(common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), - (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), - (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), ] if "sagemaker" in pytorch_training: From b75b4156e22127f872760630d2ae176afc7a104a Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Tue, 12 Mar 2024 12:40:55 -0700 Subject: [PATCH 12/47] usefixtures --- .../training/test_performance_pytorch_training.py | 2 ++ test/dlc_tests/conftest.py | 9 ++------- test/dlc_tests/ec2/test_curand.py | 1 + 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index dac172061d54..329dbd2a9c82 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -10,6 +10,7 @@ DEFAULT_REGION, get_framework_and_version_from_tag, is_pr_context, + get_instance_type_base_dlami, ) from test.test_utils.ec2 import ( execute_ec2_training_performance_test, @@ -54,6 +55,7 @@ @pytest.mark.model("resnet50") @pytest.mark.parametrize("ec2_instance_type", [PT_EC2_GPU_SYNTHETIC_INSTANCE_TYPE], indirect=True) +@pytest.mark.usefixtures("ec2_instance_ami") @pytest.mark.team("conda") def test_performance_pytorch_gpu_synthetic( pytorch_training, ec2_connection, gpu_only, py3_only, ec2_instance_type diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index a6f7e89d2fa8..0a130085166a 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -338,16 +338,11 @@ def ec2_instance_role_name(request): @pytest.fixture(scope="function") -def 
ec2_instance_ami(request, region): - if "ec2_instance_type" in request.fixturenames: - local_instance_type = request.getfixturevalue("ec2_instance_type") - else: - return - +def ec2_instance_ami(request, region, ec2_instance_type): return ( request.param if hasattr(request, "param") - else test_utils.get_instance_type_base_dlami(local_instance_type, region) + else test_utils.get_instance_type_base_dlami(ec2_instance_type, region) ) diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py index 8b5789900501..b230e5860cda 100644 --- a/test/dlc_tests/ec2/test_curand.py +++ b/test/dlc_tests/ec2/test_curand.py @@ -20,6 +20,7 @@ @pytest.mark.model("N/A") @pytest.mark.team("frameworks") @pytest.mark.parametrize("ec2_instance_type", CURAND_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) +@pytest.mark.usefixtures("ec2_instance_ami") def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type): if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}") From 9dc8fec2ca49d20663c648aa797ca66dfc091332 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Tue, 12 Mar 2024 13:53:43 -0700 Subject: [PATCH 13/47] use parametrize --- .../ec2/pytorch/training/test_performance_pytorch_training.py | 3 ++- test/dlc_tests/ec2/test_curand.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index 329dbd2a9c82..775d1902be53 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -7,6 +7,7 @@ CONTAINER_TESTS_PREFIX, PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_WEST_2, UBUNTU_18_HPU_DLAMI_US_WEST_2, + UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2, 
DEFAULT_REGION, get_framework_and_version_from_tag, is_pr_context, @@ -55,7 +56,7 @@ @pytest.mark.model("resnet50") @pytest.mark.parametrize("ec2_instance_type", [PT_EC2_GPU_SYNTHETIC_INSTANCE_TYPE], indirect=True) -@pytest.mark.usefixtures("ec2_instance_ami") +@pytest.mark.parametrize("ec2_instance_ami", [UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True) @pytest.mark.team("conda") def test_performance_pytorch_gpu_synthetic( pytorch_training, ec2_connection, gpu_only, py3_only, ec2_instance_type diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py index b230e5860cda..994dbcf2493f 100644 --- a/test/dlc_tests/ec2/test_curand.py +++ b/test/dlc_tests/ec2/test_curand.py @@ -20,7 +20,7 @@ @pytest.mark.model("N/A") @pytest.mark.team("frameworks") @pytest.mark.parametrize("ec2_instance_type", CURAND_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) -@pytest.mark.usefixtures("ec2_instance_ami") +@pytest.mark.parametrize("ec2_instance_ami", [test_utils.UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True) def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type): if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}") From 95ddb86d82e243b8b23e33de214f33d50766239c Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Tue, 12 Mar 2024 14:40:21 -0700 Subject: [PATCH 14/47] use instance ami in parametrize --- .../training/test_performance_pytorch_training.py | 5 +++-- .../ec2/pytorch/training/test_pytorch_training_2_2.py | 10 +++++++--- test/dlc_tests/ec2/test_curand.py | 5 +++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index 775d1902be53..f68994684677 100644 --- 
a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -55,8 +55,9 @@ @pytest.mark.model("resnet50") -@pytest.mark.parametrize("ec2_instance_type", [PT_EC2_GPU_SYNTHETIC_INSTANCE_TYPE], indirect=True) -@pytest.mark.parametrize("ec2_instance_ami", [UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True) +@pytest.mark.parametrize( + "ec2_instance_type, ec2_instance_ami", [PT_EC2_GPU_SYNTHETIC_INSTANCE_TYPE], indirect=True +) @pytest.mark.team("conda") def test_performance_pytorch_gpu_synthetic( pytorch_training, ec2_connection, gpu_only, py3_only, ec2_instance_type diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py index 4427c40810c4..bdfc45ab3ee8 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py @@ -13,7 +13,9 @@ @pytest.mark.model("N/A") @pytest.mark.team("conda") @pytest.mark.parametrize( - "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True + "ec2_instance_type, region, ec2_instance_ami", + common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, + indirect=True, ) def test_pytorch_2_2_gpu( pytorch_training___2__2, ec2_connection, region, gpu_only, ec2_instance_type @@ -54,7 +56,7 @@ def test_pytorch_2_2_gpu( @pytest.mark.model("N/A") @pytest.mark.team("training-compiler") @pytest.mark.parametrize( - "ec2_instance_type, region", + "ec2_instance_type, region, ec2_instance_ami", common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION, indirect=True, ) @@ -81,7 +83,9 @@ def test_pytorch_2_2_gpu_inductor( @pytest.mark.integration("pytorch_sanity_test") @pytest.mark.model("N/A") @pytest.mark.team("conda") -@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) 
+@pytest.mark.parametrize(
+    "ec2_instance_type, ec2_instance_ami", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True
+)
 def test_pytorch_2_2_cpu(pytorch_training___2__2, ec2_connection, cpu_only):
     pytorch_training = pytorch_training___2__2
 
diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py
index 994dbcf2493f..f206728b394d 100644
--- a/test/dlc_tests/ec2/test_curand.py
+++ b/test/dlc_tests/ec2/test_curand.py
@@ -19,8 +19,9 @@
 @pytest.mark.integration("curand")
 @pytest.mark.model("N/A")
 @pytest.mark.team("frameworks")
-@pytest.mark.parametrize("ec2_instance_type", CURAND_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
-@pytest.mark.parametrize("ec2_instance_ami", [test_utils.UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True)
+@pytest.mark.parametrize(
+    "ec2_instance_type, ec2_instance_ami", CURAND_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True
+)
 def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type):
     if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type):
         pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}")

From 1d9347f297e1ade17c990744cbc2c655ed91b02d Mon Sep 17 00:00:00 2001
From: Sirut Buasai 
Date: Tue, 12 Mar 2024 15:48:40 -0700
Subject: [PATCH 15/47] add instance ami as parametrize

---
 .../training/test_performance_pytorch_training.py |  3 ++-
 .../ec2/pytorch/training/common_cases.py          | 11 +++++++++++
 .../pytorch/training/test_pytorch_training_2_2.py | 15 ++++++++-------
 test/dlc_tests/ec2/test_curand.py                 |  3 ++-
 test/test_utils/ec2.py                            | 13 +++++++++++--
 5 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py
index f68994684677..f5b7a825a182 100644
--- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py
+++ 
b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -55,8 +55,9 @@ @pytest.mark.model("resnet50") +@pytest.mark.parametrize("ec2_instance_type", [PT_EC2_GPU_SYNTHETIC_INSTANCE_TYPE], indirect=True) @pytest.mark.parametrize( - "ec2_instance_type, ec2_instance_ami", [PT_EC2_GPU_SYNTHETIC_INSTANCE_TYPE], indirect=True + "ec2_instance_ami", [UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True ) @pytest.mark.team("conda") def test_performance_pytorch_gpu_synthetic( diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index edd7bfc18412..abe5ee58bc96 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -7,6 +7,7 @@ from test.test_utils import ( CONTAINER_TESTS_PREFIX, + UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, get_framework_and_version_from_tag, get_cuda_version_from_tag, ) @@ -14,6 +15,7 @@ execute_ec2_training_test, get_ec2_instance_type, get_efa_ec2_instance_type, + get_efa_ec2_instance_ami, ) # Test functions @@ -44,6 +46,15 @@ default="g4dn.12xlarge", filter_function=ec2_utils.filter_non_g3_instance_type ) +# Instance AMI filters +PT_EC2_CPU_INSTANCE_AMI = UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 + +PT_EC2_GPU_INSTANCE_AMI = get_efa_ec2_instance_ami(PT_EC2_GPU_INSTANCE_TYPE_AND_REGION) + +PT_EC2_GPU_INDUCTOR_INSTANCE_AMI = get_efa_ec2_instance_ami( + PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION +) + def pytorch_standalone(pytorch_training, ec2_connection): execute_ec2_training_test( diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py index bdfc45ab3ee8..dc1a5645c5a3 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py @@ -13,10 +13,9 @@ @pytest.mark.model("N/A") @pytest.mark.team("conda") 
@pytest.mark.parametrize( - "ec2_instance_type, region, ec2_instance_ami", - common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, - indirect=True, + "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True ) +@pytest.mark.parametrize("ec2_instance_ami", common_cases.PT_GPU_INSTANCE_AMI, indirect=True) def test_pytorch_2_2_gpu( pytorch_training___2__2, ec2_connection, region, gpu_only, ec2_instance_type ): @@ -56,10 +55,13 @@ def test_pytorch_2_2_gpu( @pytest.mark.model("N/A") @pytest.mark.team("training-compiler") @pytest.mark.parametrize( - "ec2_instance_type, region, ec2_instance_ami", + "ec2_instance_type, region", common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION, indirect=True, ) +@pytest.mark.parametrize( + "ec2_instance_ami", common_cases.PT_GPU_INDUCTOR_INSTANCE_AMI, indirect=True +) def test_pytorch_2_2_gpu_inductor( pytorch_training___2__2, ec2_connection, region, gpu_only, ec2_instance_type ): @@ -83,9 +85,8 @@ def test_pytorch_2_2_gpu_inductor( @pytest.mark.integration("pytorch_sanity_test") @pytest.mark.model("N/A") @pytest.mark.team("conda") -@pytest.mark.parametrize( - "ec2_instance_type, ec2_instance_ami", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True -) +@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) +@pytest.mark.parametrize("ec2_instance_ami", common_cases.PT_CPU_INSTANCE_AMI, indirect=True) def test_pytorch_2_2_cpu(pytorch_training___2__2, ec2_connection, cpu_only): pytorch_training = pytorch_training___2__2 diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py index f206728b394d..62621ca3c7c8 100644 --- a/test/dlc_tests/ec2/test_curand.py +++ b/test/dlc_tests/ec2/test_curand.py @@ -19,8 +19,9 @@ @pytest.mark.integration("curand") @pytest.mark.model("N/A") @pytest.mark.team("frameworks") +@pytest.mark.parametrize("ec2_instance_type", CURAND_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) @pytest.mark.parametrize( - 
"ec2_instance_type, ec2_instance_ami", CURAND_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True + "ec2_instance_ami", test_utils.UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, indirect=True ) def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type): if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py index b435c8c3953b..1fc523a28c7a 100644 --- a/test/test_utils/ec2.py +++ b/test/test_utils/ec2.py @@ -30,6 +30,7 @@ is_pr_context, is_mainline_context, are_heavy_instance_ec2_tests_enabled, + get_instance_type_base_dlami, ) from . import DEFAULT_REGION, P3DN_REGION, P4DE_REGION, UL_AMI_LIST, BENCHMARK_RESULTS_S3_BUCKET @@ -126,6 +127,14 @@ def get_cicd_instance_reserved_region(instance_type): ) +def get_efa_ec2_instance_ami(instance_region_list): + instance_amis = [ + get_instance_type_base_dlami(instance_type, region) + for instance_type, region in instance_region_list + ] + return instance_amis + + def get_efa_ec2_instance_type(default, filter_function=lambda x: x, job_type=""): """ Helper function wrapping around get_ec2_instance_type to parametrize both ec2_instance_type @@ -139,11 +148,11 @@ def get_efa_ec2_instance_type(default, filter_function=lambda x: x, job_type="") a list. 
""" instance_list = get_ec2_instance_type(default, "gpu", filter_function, job_type=job_type) - instance_list = [ + instance_region_list = [ (instance_type, get_cicd_instance_reserved_region(instance_type)) for instance_type in instance_list ] - return instance_list + return instance_region_list def get_ec2_instance_type( From 0a9504de4fefc1f65463434cda33affa1cc22eba Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Wed, 13 Mar 2024 10:39:29 -0700 Subject: [PATCH 16/47] fix curand test --- test/dlc_tests/conftest.py | 12 ------------ test/dlc_tests/ec2/test_curand.py | 2 +- test/test_utils/__init__.py | 14 -------------- test/test_utils/sagemaker.py | 6 ------ 4 files changed, 1 insertion(+), 33 deletions(-) diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 0a130085166a..d308fd1a82e7 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -31,19 +31,7 @@ is_nightly_context, DEFAULT_REGION, P3DN_REGION, - # UBUNTU_20_BASE_DLAMI_US_EAST_1, - # UBUNTU_20_BASE_DLAMI_US_WEST_2, - UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1, - UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, - UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1, - UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2, PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1, - # AML2_BASE_DLAMI_US_WEST_2, - # AML2_BASE_DLAMI_US_EAST_1, - AML2_BASE_OSS_DLAMI_US_WEST_2, - AML2_BASE_OSS_DLAMI_US_EAST_1, - AML2_BASE_PROPRIETARY_DLAMI_US_WEST_2, - AML2_BASE_PROPRIETARY_DLAMI_US_EAST_1, KEYS_TO_DESTROY_FILE, are_efa_tests_disabled, get_repository_and_tag_from_image_uri, diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py index 62621ca3c7c8..1a28f5390c02 100644 --- a/test/dlc_tests/ec2/test_curand.py +++ b/test/dlc_tests/ec2/test_curand.py @@ -21,7 +21,7 @@ @pytest.mark.team("frameworks") @pytest.mark.parametrize("ec2_instance_type", CURAND_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) @pytest.mark.parametrize( - "ec2_instance_ami", test_utils.UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, 
indirect=True + "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2], indirect=True ) def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type): if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index fc407cb09159..e69f1f620d72 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -79,12 +79,6 @@ def get_ami_id_ssm(region_name, parameter_path): # DLAMI Base is split between OSS Nvidia Driver and Propietary Nvidia Driver. see https://docs.aws.amazon.com/dlami/latest/devguide/important-changes.html -# UBUNTU_20_BASE_DLAMI_US_WEST_2 = get_ami_id_boto3( -# region_name="us-west-2", ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????" -# ) -# UBUNTU_20_BASE_DLAMI_US_EAST_1 = get_ami_id_boto3( -# region_name="us-east-1", ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????" -# ) UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 = get_ami_id_boto3( region_name="us-west-2", ami_name_pattern="Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", @@ -101,12 +95,6 @@ def get_ami_id_ssm(region_name, parameter_path): region_name="us-east-1", ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", ) -# AML2_BASE_DLAMI_US_WEST_2 = get_ami_id_boto3( -# region_name="us-west-2", ami_name_pattern="Deep Learning Base AMI (Amazon Linux 2) Version ??.?" -# ) -# AML2_BASE_DLAMI_US_EAST_1 = get_ami_id_boto3( -# region_name="us-east-1", ami_name_pattern="Deep Learning Base AMI (Amazon Linux 2) Version ??.?" 
-# ) AML2_BASE_OSS_DLAMI_US_WEST_2 = get_ami_id_boto3( region_name="us-west-2", ami_name_pattern="Deep Learning Base OSS Nvidia Driver AMI (Amazon Linux 2) Version ??.?", @@ -177,8 +165,6 @@ def get_ami_id_ssm(region_name, parameter_path): UBUNTU_18_HPU_DLAMI_US_WEST_2 = "ami-03cdcfc91a96a8f92" UBUNTU_18_HPU_DLAMI_US_EAST_1 = "ami-0d83d7487f322545a" UL_AMI_LIST = [ - # UBUNTU_20_BASE_DLAMI_US_WEST_2, - # UBUNTU_20_BASE_DLAMI_US_EAST_1, UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1, UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2, diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py index 852ae2336fe6..5713e077ba16 100644 --- a/test/test_utils/sagemaker.py +++ b/test/test_utils/sagemaker.py @@ -25,12 +25,6 @@ SAGEMAKER_EXECUTION_REGIONS, SAGEMAKER_NEURON_EXECUTION_REGIONS, SAGEMAKER_NEURONX_EXECUTION_REGIONS, - # UBUNTU_20_BASE_DLAMI_US_EAST_1, - # UBUNTU_20_BASE_DLAMI_US_WEST_2, - UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1, - UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, - UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1, - UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2, UL20_CPU_ARM64_US_EAST_1, UL20_CPU_ARM64_US_WEST_2, SAGEMAKER_LOCAL_TEST_TYPE, From c66555e8ed31e5fe30cef7b55d703d4c82bc11f4 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Wed, 13 Mar 2024 12:26:33 -0700 Subject: [PATCH 17/47] correct ami name --- .../pytorch/training/test_performance_pytorch_training.py | 1 - .../ec2/pytorch/training/test_pytorch_training_2_2.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index f5b7a825a182..9570205e5e6a 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -11,7 +11,6 @@ DEFAULT_REGION, get_framework_and_version_from_tag, 
is_pr_context, - get_instance_type_base_dlami, ) from test.test_utils.ec2 import ( execute_ec2_training_performance_test, diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py index dc1a5645c5a3..4015e035e0ce 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py @@ -15,7 +15,7 @@ @pytest.mark.parametrize( "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True ) -@pytest.mark.parametrize("ec2_instance_ami", common_cases.PT_GPU_INSTANCE_AMI, indirect=True) +@pytest.mark.parametrize("ec2_instance_ami", common_cases.PT_EC2_GPU_INSTANCE_AMI, indirect=True) def test_pytorch_2_2_gpu( pytorch_training___2__2, ec2_connection, region, gpu_only, ec2_instance_type ): @@ -60,7 +60,7 @@ def test_pytorch_2_2_gpu( indirect=True, ) @pytest.mark.parametrize( - "ec2_instance_ami", common_cases.PT_GPU_INDUCTOR_INSTANCE_AMI, indirect=True + "ec2_instance_ami", common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_AMI, indirect=True ) def test_pytorch_2_2_gpu_inductor( pytorch_training___2__2, ec2_connection, region, gpu_only, ec2_instance_type @@ -86,7 +86,7 @@ def test_pytorch_2_2_gpu_inductor( @pytest.mark.model("N/A") @pytest.mark.team("conda") @pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) -@pytest.mark.parametrize("ec2_instance_ami", common_cases.PT_CPU_INSTANCE_AMI, indirect=True) +@pytest.mark.parametrize("ec2_instance_ami", common_cases.PT_EC2_CPU_INSTANCE_AMI, indirect=True) def test_pytorch_2_2_cpu(pytorch_training___2__2, ec2_connection, cpu_only): pytorch_training = pytorch_training___2__2 From e5716bcb5f9b856d8b3fa7c203d71cb4b0b64c04 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Wed, 13 Mar 2024 12:27:58 -0700 Subject: [PATCH 18/47] correct ami format --- test/dlc_tests/ec2/pytorch/training/common_cases.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index abe5ee58bc96..b97b121b3fef 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -47,7 +47,7 @@ ) # Instance AMI filters -PT_EC2_CPU_INSTANCE_AMI = UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 +PT_EC2_CPU_INSTANCE_AMI = [UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2] PT_EC2_GPU_INSTANCE_AMI = get_efa_ec2_instance_ami(PT_EC2_GPU_INSTANCE_TYPE_AND_REGION) From 66ce9fc8fafa4418c6422b95d464e3d7e668d0af Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Wed, 13 Mar 2024 15:51:19 -0700 Subject: [PATCH 19/47] use proprietary dlami for curand --- test/dlc_tests/ec2/test_curand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py index 1a28f5390c02..6bd6c14934db 100644 --- a/test/dlc_tests/ec2/test_curand.py +++ b/test/dlc_tests/ec2/test_curand.py @@ -21,7 +21,7 @@ @pytest.mark.team("frameworks") @pytest.mark.parametrize("ec2_instance_type", CURAND_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) @pytest.mark.parametrize( - "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2], indirect=True + "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True ) def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type): if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): From 68273a44b70ea08ec17c1e5c631a77a3a0f22cc9 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Wed, 13 Mar 2024 17:40:30 -0700 Subject: [PATCH 20/47] rebuild --- dlc_developer_config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 4754047b504c..3d3bf6423389 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ 
-42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = false +do_build = true autopatch_build = false [notify] @@ -54,7 +54,7 @@ notify_test_failures = false [test] ### On by default -sanity_tests = false +sanity_tests = true safety_check_test = false ecr_scan_allowlist_feature = false ecs_tests = false From c70f0e9c54b536fdf0762a0de3e441a4237c35ac Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 11:19:05 -0700 Subject: [PATCH 21/47] logging debug --- dlc_developer_config.toml | 2 +- test/test_utils/__init__.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 3d3bf6423389..2b669dc795ff 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false autopatch_build = false [notify] diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index e69f1f620d72..bc2e8ddbb9c6 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -2466,6 +2466,8 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): "Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????" 
) + LOGGER.info(f"Instance Type: {instance_type}") + return ( proprietary_dlami_us_east_1 if region == "us-east-1" and instance_type in base_proprietary_dlami_instances From 75f8e8624b5efc574b669bed5aaf3850cf3a4fd5 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 11:45:21 -0700 Subject: [PATCH 22/47] remove parametrize ami --- .../pytorch/training/test_performance_pytorch_training.py | 6 +++--- .../ec2/pytorch/training/test_pytorch_training_2_2.py | 2 +- test/dlc_tests/ec2/test_curand.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index 9570205e5e6a..f3c24e5e7e51 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -55,9 +55,9 @@ @pytest.mark.model("resnet50") @pytest.mark.parametrize("ec2_instance_type", [PT_EC2_GPU_SYNTHETIC_INSTANCE_TYPE], indirect=True) -@pytest.mark.parametrize( - "ec2_instance_ami", [UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True -) +# @pytest.mark.parametrize( +# "ec2_instance_ami", [UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True +# ) @pytest.mark.team("conda") def test_performance_pytorch_gpu_synthetic( pytorch_training, ec2_connection, gpu_only, py3_only, ec2_instance_type diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py index 4015e035e0ce..1b27a2f12a4d 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py @@ -15,7 +15,7 @@ @pytest.mark.parametrize( "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True ) 
-@pytest.mark.parametrize("ec2_instance_ami", common_cases.PT_EC2_GPU_INSTANCE_AMI, indirect=True) +# @pytest.mark.parametrize("ec2_instance_ami", common_cases.PT_EC2_GPU_INSTANCE_AMI, indirect=True) def test_pytorch_2_2_gpu( pytorch_training___2__2, ec2_connection, region, gpu_only, ec2_instance_type ): diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py index 6bd6c14934db..4054a4c7184a 100644 --- a/test/dlc_tests/ec2/test_curand.py +++ b/test/dlc_tests/ec2/test_curand.py @@ -20,9 +20,9 @@ @pytest.mark.model("N/A") @pytest.mark.team("frameworks") @pytest.mark.parametrize("ec2_instance_type", CURAND_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) -@pytest.mark.parametrize( - "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True -) +# @pytest.mark.parametrize( +# "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True +# ) def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type): if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}") From 5a99d362da4150fac3e719619b19a34d7686e633 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 12:35:09 -0700 Subject: [PATCH 23/47] flip logic --- test/test_utils/__init__.py | 54 ++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index bc2e8ddbb9c6..ee311cdad6f6 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -2440,6 +2440,26 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): "g3.16xlarge", ] + base_oss_dlami_instances = ["g4dn.xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.16xlarge", + "g4dn.12xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.xlarge", + "g5.2xlarge", + "g5.4xlarge", + 
"g5.8xlarge", + "g5.16xlarge", + "g5.12xlarge", + "g5.24xlarge", + "g5.48xlarge", + "p4d.24xlarge", + "p4de.24xlarge", + "p5.48xlarge",] + # set defaults if linux_dist == "AML2": oss_dlami_us_east_1 = AML2_BASE_OSS_DLAMI_US_EAST_1 @@ -2468,19 +2488,35 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): LOGGER.info(f"Instance Type: {instance_type}") + # return ( + # proprietary_dlami_us_east_1 + # if region == "us-east-1" and instance_type in base_proprietary_dlami_instances + # else proprietary_dlami_us_west_2 + # if region == "us-west-2" and instance_type in base_proprietary_dlami_instances + # else get_ami_id_boto3( + # region_name=region, + # ami_name_pattern=proprietary_dlami_name_pattern, + # ) + # if instance_type in base_proprietary_dlami_instances + # else oss_dlami_us_east_1 + # if region == "us-east-1" + # else oss_dlami_us_west_2 + # if region == "us-west-2" + # else get_ami_id_boto3(region_name=region, ami_name_pattern=oss_dlami_name_pattern) + # ) return ( - proprietary_dlami_us_east_1 - if region == "us-east-1" and instance_type in base_proprietary_dlami_instances - else proprietary_dlami_us_west_2 - if region == "us-west-2" and instance_type in base_proprietary_dlami_instances + oss_dlami_us_east_1 + if region == "us-east-1" and instance_type in base_oss_dlami_instances + else oss_dlami_us_west_2 + if region == "us-west-2" and instance_type in base_oss_dlami_instances else get_ami_id_boto3( region_name=region, - ami_name_pattern=proprietary_dlami_name_pattern, + ami_name_pattern=oss_dlami_name_pattern, ) - if instance_type in base_proprietary_dlami_instances - else oss_dlami_us_east_1 + if instance_type in base_oss_dlami_instances + else proprietary_dlami_us_east_1 if region == "us-east-1" - else oss_dlami_us_west_2 + else proprietary_dlami_us_west_2 if region == "us-west-2" - else get_ami_id_boto3(region_name=region, ami_name_pattern=oss_dlami_name_pattern) + else get_ami_id_boto3(region_name=region, 
ami_name_pattern=proprietary_dlami_name_pattern) ) From 9040c77b8c177286eeb992c9481f48a43d70c5ad Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 12:35:30 -0700 Subject: [PATCH 24/47] formatting --- test/test_utils/__init__.py | 40 +++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index ee311cdad6f6..bb9a8dd60fa6 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -2440,25 +2440,27 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): "g3.16xlarge", ] - base_oss_dlami_instances = ["g4dn.xlarge", - "g4dn.2xlarge", - "g4dn.4xlarge", - "g4dn.8xlarge", - "g4dn.16xlarge", - "g4dn.12xlarge", - "g4dn.metal", - "g4dn.xlarge", - "g5.xlarge", - "g5.2xlarge", - "g5.4xlarge", - "g5.8xlarge", - "g5.16xlarge", - "g5.12xlarge", - "g5.24xlarge", - "g5.48xlarge", - "p4d.24xlarge", - "p4de.24xlarge", - "p5.48xlarge",] + base_oss_dlami_instances = [ + "g4dn.xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.16xlarge", + "g4dn.12xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.xlarge", + "g5.2xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.16xlarge", + "g5.12xlarge", + "g5.24xlarge", + "g5.48xlarge", + "p4d.24xlarge", + "p4de.24xlarge", + "p5.48xlarge", + ] # set defaults if linux_dist == "AML2": From 2f16a5bd0aa2307b1cb69576192610f6ecd95f37 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 13:11:55 -0700 Subject: [PATCH 25/47] print instance ami --- test/test_utils/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index bb9a8dd60fa6..5842e8662dc1 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -2488,8 +2488,6 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): "Deep Learning Base Proprietary Nvidia Driver GPU AMI 
(Ubuntu 20.04) ????????" ) - LOGGER.info(f"Instance Type: {instance_type}") - # return ( # proprietary_dlami_us_east_1 # if region == "us-east-1" and instance_type in base_proprietary_dlami_instances @@ -2506,7 +2504,7 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): # if region == "us-west-2" # else get_ami_id_boto3(region_name=region, ami_name_pattern=oss_dlami_name_pattern) # ) - return ( + instance_ami = ( oss_dlami_us_east_1 if region == "us-east-1" and instance_type in base_oss_dlami_instances else oss_dlami_us_west_2 @@ -2522,3 +2520,7 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): if region == "us-west-2" else get_ami_id_boto3(region_name=region, ami_name_pattern=proprietary_dlami_name_pattern) ) + LOGGER.info(f"Instance Type: {instance_type}, Instance AMI: {instance_ami}") + + + return instance_ami From 3b15a71f5b71c40be14028a339a0461329bf8e36 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 13:40:00 -0700 Subject: [PATCH 26/47] fix typo --- test/test_utils/__init__.py | 55 ++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 5842e8662dc1..49970e8236a0 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -2482,45 +2482,44 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????" ) - proprietary_dlami_us_east_1 = UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1 - proprietary_dlami_us_west_2 = UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 + proprietary_dlami_us_east_1 = UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1 + proprietary_dlami_us_west_2 = UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2 proprietary_dlami_name_pattern = ( "Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????" 
) - # return ( - # proprietary_dlami_us_east_1 - # if region == "us-east-1" and instance_type in base_proprietary_dlami_instances - # else proprietary_dlami_us_west_2 - # if region == "us-west-2" and instance_type in base_proprietary_dlami_instances - # else get_ami_id_boto3( - # region_name=region, - # ami_name_pattern=proprietary_dlami_name_pattern, - # ) - # if instance_type in base_proprietary_dlami_instances - # else oss_dlami_us_east_1 - # if region == "us-east-1" - # else oss_dlami_us_west_2 - # if region == "us-west-2" - # else get_ami_id_boto3(region_name=region, ami_name_pattern=oss_dlami_name_pattern) - # ) instance_ami = ( - oss_dlami_us_east_1 - if region == "us-east-1" and instance_type in base_oss_dlami_instances - else oss_dlami_us_west_2 - if region == "us-west-2" and instance_type in base_oss_dlami_instances + proprietary_dlami_us_east_1 + if region == "us-east-1" and instance_type in base_proprietary_dlami_instances + else proprietary_dlami_us_west_2 + if region == "us-west-2" and instance_type in base_proprietary_dlami_instances else get_ami_id_boto3( region_name=region, - ami_name_pattern=oss_dlami_name_pattern, + ami_name_pattern=proprietary_dlami_name_pattern, ) - if instance_type in base_oss_dlami_instances - else proprietary_dlami_us_east_1 + if instance_type in base_proprietary_dlami_instances + else oss_dlami_us_east_1 if region == "us-east-1" - else proprietary_dlami_us_west_2 + else oss_dlami_us_west_2 if region == "us-west-2" - else get_ami_id_boto3(region_name=region, ami_name_pattern=proprietary_dlami_name_pattern) + else get_ami_id_boto3(region_name=region, ami_name_pattern=oss_dlami_name_pattern) ) + # instance_ami = ( + # oss_dlami_us_east_1 + # if region == "us-east-1" and instance_type in base_oss_dlami_instances + # else oss_dlami_us_west_2 + # if region == "us-west-2" and instance_type in base_oss_dlami_instances + # else get_ami_id_boto3( + # region_name=region, + # ami_name_pattern=oss_dlami_name_pattern, + # ) + # if 
instance_type in base_oss_dlami_instances + # else proprietary_dlami_us_east_1 + # if region == "us-east-1" + # else proprietary_dlami_us_west_2 + # if region == "us-west-2" + # else get_ami_id_boto3(region_name=region, ami_name_pattern=proprietary_dlami_name_pattern) + # ) LOGGER.info(f"Instance Type: {instance_type}, Instance AMI: {instance_ami}") - return instance_ami From b83eed139502d5d67f818404d4b3817ab648f97d Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 14:00:36 -0700 Subject: [PATCH 27/47] remove parametrize logic and fix proprietary dlami name pattern --- .../test_performance_pytorch_training.py | 4 -- .../ec2/pytorch/training/common_cases.py | 11 ------ .../training/test_pytorch_training_2_2.py | 5 --- test/dlc_tests/ec2/test_curand.py | 3 -- test/dlc_tests/ec2/test_gdrcopy.py | 6 +-- test/test_utils/__init__.py | 39 ------------------- test/test_utils/ec2.py | 9 ----- 7 files changed, 3 insertions(+), 74 deletions(-) diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index f3c24e5e7e51..dac172061d54 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -7,7 +7,6 @@ CONTAINER_TESTS_PREFIX, PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_WEST_2, UBUNTU_18_HPU_DLAMI_US_WEST_2, - UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2, DEFAULT_REGION, get_framework_and_version_from_tag, is_pr_context, @@ -55,9 +54,6 @@ @pytest.mark.model("resnet50") @pytest.mark.parametrize("ec2_instance_type", [PT_EC2_GPU_SYNTHETIC_INSTANCE_TYPE], indirect=True) -# @pytest.mark.parametrize( -# "ec2_instance_ami", [UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True -# ) @pytest.mark.team("conda") def test_performance_pytorch_gpu_synthetic( pytorch_training, ec2_connection, gpu_only, py3_only, ec2_instance_type 
diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index b97b121b3fef..edd7bfc18412 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -7,7 +7,6 @@ from test.test_utils import ( CONTAINER_TESTS_PREFIX, - UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, get_framework_and_version_from_tag, get_cuda_version_from_tag, ) @@ -15,7 +14,6 @@ execute_ec2_training_test, get_ec2_instance_type, get_efa_ec2_instance_type, - get_efa_ec2_instance_ami, ) # Test functions @@ -46,15 +44,6 @@ default="g4dn.12xlarge", filter_function=ec2_utils.filter_non_g3_instance_type ) -# Instance AMI filters -PT_EC2_CPU_INSTANCE_AMI = [UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2] - -PT_EC2_GPU_INSTANCE_AMI = get_efa_ec2_instance_ami(PT_EC2_GPU_INSTANCE_TYPE_AND_REGION) - -PT_EC2_GPU_INDUCTOR_INSTANCE_AMI = get_efa_ec2_instance_ami( - PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION -) - def pytorch_standalone(pytorch_training, ec2_connection): execute_ec2_training_test( diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py index 1b27a2f12a4d..4427c40810c4 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_2.py @@ -15,7 +15,6 @@ @pytest.mark.parametrize( "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True ) -# @pytest.mark.parametrize("ec2_instance_ami", common_cases.PT_EC2_GPU_INSTANCE_AMI, indirect=True) def test_pytorch_2_2_gpu( pytorch_training___2__2, ec2_connection, region, gpu_only, ec2_instance_type ): @@ -59,9 +58,6 @@ def test_pytorch_2_2_gpu( common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION, indirect=True, ) -@pytest.mark.parametrize( - "ec2_instance_ami", common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_AMI, indirect=True -) def 
test_pytorch_2_2_gpu_inductor( pytorch_training___2__2, ec2_connection, region, gpu_only, ec2_instance_type ): @@ -86,7 +82,6 @@ def test_pytorch_2_2_gpu_inductor( @pytest.mark.model("N/A") @pytest.mark.team("conda") @pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) -@pytest.mark.parametrize("ec2_instance_ami", common_cases.PT_EC2_CPU_INSTANCE_AMI, indirect=True) def test_pytorch_2_2_cpu(pytorch_training___2__2, ec2_connection, cpu_only): pytorch_training = pytorch_training___2__2 diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py index 4054a4c7184a..8b5789900501 100644 --- a/test/dlc_tests/ec2/test_curand.py +++ b/test/dlc_tests/ec2/test_curand.py @@ -20,9 +20,6 @@ @pytest.mark.model("N/A") @pytest.mark.team("frameworks") @pytest.mark.parametrize("ec2_instance_type", CURAND_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True) -# @pytest.mark.parametrize( -# "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True -# ) def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type): if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}") diff --git a/test/dlc_tests/ec2/test_gdrcopy.py b/test/dlc_tests/ec2/test_gdrcopy.py index a879939f00af..286a70b4d245 100644 --- a/test/dlc_tests/ec2/test_gdrcopy.py +++ b/test/dlc_tests/ec2/test_gdrcopy.py @@ -23,9 +23,9 @@ @pytest.mark.team("conda") @pytest.mark.integration("gdrcopy") @pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION) -@pytest.mark.parametrize( - "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2], indirect=True -) +# @pytest.mark.parametrize( +# "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2], indirect=True +# ) @pytest.mark.skipif( is_pr_context() and not are_heavy_instance_ec2_tests_enabled(), 
reason="Skip GDRCopy test in PR context unless explicitly enabled", diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 49970e8236a0..deb85d25d551 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -2440,28 +2440,6 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): "g3.16xlarge", ] - base_oss_dlami_instances = [ - "g4dn.xlarge", - "g4dn.2xlarge", - "g4dn.4xlarge", - "g4dn.8xlarge", - "g4dn.16xlarge", - "g4dn.12xlarge", - "g4dn.metal", - "g4dn.xlarge", - "g5.xlarge", - "g5.2xlarge", - "g5.4xlarge", - "g5.8xlarge", - "g5.16xlarge", - "g5.12xlarge", - "g5.24xlarge", - "g5.48xlarge", - "p4d.24xlarge", - "p4de.24xlarge", - "p5.48xlarge", - ] - # set defaults if linux_dist == "AML2": oss_dlami_us_east_1 = AML2_BASE_OSS_DLAMI_US_EAST_1 @@ -2504,22 +2482,5 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): if region == "us-west-2" else get_ami_id_boto3(region_name=region, ami_name_pattern=oss_dlami_name_pattern) ) - # instance_ami = ( - # oss_dlami_us_east_1 - # if region == "us-east-1" and instance_type in base_oss_dlami_instances - # else oss_dlami_us_west_2 - # if region == "us-west-2" and instance_type in base_oss_dlami_instances - # else get_ami_id_boto3( - # region_name=region, - # ami_name_pattern=oss_dlami_name_pattern, - # ) - # if instance_type in base_oss_dlami_instances - # else proprietary_dlami_us_east_1 - # if region == "us-east-1" - # else proprietary_dlami_us_west_2 - # if region == "us-west-2" - # else get_ami_id_boto3(region_name=region, ami_name_pattern=proprietary_dlami_name_pattern) - # ) - LOGGER.info(f"Instance Type: {instance_type}, Instance AMI: {instance_ami}") return instance_ami diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py index cd27ef34e434..f31dfff251f8 100644 --- a/test/test_utils/ec2.py +++ b/test/test_utils/ec2.py @@ -30,7 +30,6 @@ is_pr_context, is_mainline_context, are_heavy_instance_ec2_tests_enabled, - 
get_instance_type_base_dlami, ) from . import DEFAULT_REGION, P3DN_REGION, P4DE_REGION, UL_AMI_LIST, BENCHMARK_RESULTS_S3_BUCKET @@ -127,14 +126,6 @@ def get_cicd_instance_reserved_region(instance_type): ) -def get_efa_ec2_instance_ami(instance_region_list): - instance_amis = [ - get_instance_type_base_dlami(instance_type, region) - for instance_type, region in instance_region_list - ] - return instance_amis - - def get_efa_ec2_instance_type(default, filter_function=lambda x: x, job_type=""): """ Helper function wrapping around get_ec2_instance_type to parametrize both ec2_instance_type From 3a78b32847a4a608c482b0a2650295e71b8b69ea Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 15:39:30 -0700 Subject: [PATCH 28/47] revert gdr copy --- pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu b/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu index e64e521c601b..fadd4ee396f1 100644 --- a/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu +++ b/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu @@ -63,7 +63,7 @@ ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" ENV CUDNN_VERSION=8.9.2.26 ENV NCCL_VERSION=2.19.4 ENV EFA_VERSION=1.30.0 -ENV GDRCOPY_VERSION=2.4.1 +ENV GDRCOPY_VERSION=2.3.1 ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" ENV OPEN_MPI_PATH=/opt/amazon/openmpi From f8af0b04c80c03bda5a3100b390c39fb9525a493 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 15:46:48 -0700 Subject: [PATCH 29/47] update test with gdrcopy 2.4 --- pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu | 2 +- test/dlc_tests/container_tests/bin/efa/testEFASanity | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu b/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu index fadd4ee396f1..e64e521c601b 100644 --- 
a/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu +++ b/pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu @@ -63,7 +63,7 @@ ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" ENV CUDNN_VERSION=8.9.2.26 ENV NCCL_VERSION=2.19.4 ENV EFA_VERSION=1.30.0 -ENV GDRCOPY_VERSION=2.3.1 +ENV GDRCOPY_VERSION=2.4.1 ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" ENV OPEN_MPI_PATH=/opt/amazon/openmpi diff --git a/test/dlc_tests/container_tests/bin/efa/testEFASanity b/test/dlc_tests/container_tests/bin/efa/testEFASanity index 8fc1c428d664..1f350628c668 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFASanity +++ b/test/dlc_tests/container_tests/bin/efa/testEFASanity @@ -24,4 +24,4 @@ lsmod | grep ib_uverbs ibv_devinfo # check if gdr device is loaded -grep -e '^1$' /sys/class/infiniband/**/device/gdr +cat /sys/class/infiniband/**/device/p2p | grep 'NVIDIA' From e32684a77aaf3a07c75a1d36bb4eafedee9967a8 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 17:24:12 -0700 Subject: [PATCH 30/47] build test pt ec2 --- dlc_developer_config.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 2b669dc795ff..9de117d68e19 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = false +do_build = true autopatch_build = false [notify] @@ -57,8 +57,8 @@ notify_test_failures = false sanity_tests = true safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = false -eks_tests = false +ecs_tests = true +eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR ec2_benchmark_tests = false From c8b1ce0eb7c087168847f612935328f6588229f6 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 19:05:00 -0700 
Subject: [PATCH 31/47] build test pt sm --- dlc_developer_config.toml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 9de117d68e19..13fb37cfb3b6 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -71,16 +71,16 @@ ec2_tests_on_heavy_instances = true ### SM specific tests ### Off by default -sagemaker_local_tests = false +sagemaker_local_tests = true # run standard sagemaker remote tests from test/sagemaker_tests -sagemaker_remote_tests = false +sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = false +sagemaker_efa_tests = true # run release_candidate_integration tests -sagemaker_rc_tests = false +sagemaker_rc_tests = true # run sagemaker benchmark tests -sagemaker_benchmark_tests = false +sagemaker_benchmark_tests = true # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -102,7 +102,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-mxnet-training = "" -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-2-ec2.yml" +dlc-pr-pytorch-training = "" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From 63d8a31fb8cee4169c6deab69fd24a91d451ac26 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 22:53:41 -0700 Subject: [PATCH 32/47] remove gdrcopy ami --- test/dlc_tests/ec2/test_gdrcopy.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/dlc_tests/ec2/test_gdrcopy.py b/test/dlc_tests/ec2/test_gdrcopy.py index 286a70b4d245..9788470ead67 100644 --- a/test/dlc_tests/ec2/test_gdrcopy.py +++ b/test/dlc_tests/ec2/test_gdrcopy.py @@ -23,9 +23,6 @@ @pytest.mark.team("conda") @pytest.mark.integration("gdrcopy") @pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION) -# @pytest.mark.parametrize( -# "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2], indirect=True -# ) @pytest.mark.skipif( is_pr_context() and 
not are_heavy_instance_ec2_tests_enabled(), reason="Skip GDRCopy test in PR context unless explicitly enabled", From e40298b0b2c724d70b7ba5b974a11d0580ca87be Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 22:55:38 -0700 Subject: [PATCH 33/47] sanity and sm local test only --- dlc_developer_config.toml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 13fb37cfb3b6..eeb167c0bc36 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false autopatch_build = false [notify] @@ -57,9 +57,9 @@ notify_test_failures = false sanity_tests = true safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = true -eks_tests = true -ec2_tests = true +ecs_tests = false +eks_tests = false +ec2_tests = false # Set it to true if you are preparing a Benchmark related PR ec2_benchmark_tests = false @@ -67,20 +67,20 @@ ec2_benchmark_tests = false ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
### Off by default (set to false) -ec2_tests_on_heavy_instances = true +ec2_tests_on_heavy_instances = false ### SM specific tests ### Off by default sagemaker_local_tests = true # run standard sagemaker remote tests from test/sagemaker_tests -sagemaker_remote_tests = true +sagemaker_remote_tests = false # run efa sagemaker tests -sagemaker_efa_tests = true +sagemaker_efa_tests = false # run release_candidate_integration tests -sagemaker_rc_tests = true +sagemaker_rc_tests = false # run sagemaker benchmark tests -sagemaker_benchmark_tests = true +sagemaker_benchmark_tests = false # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" From 9efd8da273fa27c66f4b11b0f0add23a8a24b2b9 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 23:30:45 -0700 Subject: [PATCH 34/47] build test pt sm --- dlc_developer_config.toml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index eeb167c0bc36..13fb37cfb3b6 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = false +do_build = true autopatch_build = false [notify] @@ -57,9 +57,9 @@ notify_test_failures = false sanity_tests = true safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = false -eks_tests = false -ec2_tests = false +ecs_tests = true +eks_tests = true +ec2_tests = true # Set it to true if you are preparing a Benchmark related PR ec2_benchmark_tests = false @@ -67,20 +67,20 @@ ec2_benchmark_tests = false ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true ### SM specific tests ### Off by default sagemaker_local_tests = true # run standard sagemaker remote tests from test/sagemaker_tests -sagemaker_remote_tests = false +sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = false +sagemaker_efa_tests = true # run release_candidate_integration tests -sagemaker_rc_tests = false +sagemaker_rc_tests = true # run sagemaker benchmark tests -sagemaker_benchmark_tests = false +sagemaker_benchmark_tests = true # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" From f9633d69a307789f3325d5d2b1eaabbe076071a2 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Thu, 14 Mar 2024 23:32:11 -0700 Subject: [PATCH 35/47] formatting --- test/dlc_tests/ec2/pytorch/training/common_cases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 9b70632dee59..9599eacbb2c9 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -250,4 +250,4 @@ def pytorch_training_torchdata(pytorch_training, ec2_connection): if Version(image_framework_version) in SpecifierSet(">=1.11,<=1.13.1"): execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_DEV_CMD) else: - execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD) \ No newline at end of file + execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD) From 2682dacef40eefc93376d2437bbb4c77afbf658b Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Sat, 16 Mar 2024 08:01:25 -0700 Subject: [PATCH 36/47] test pt sm --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 13fb37cfb3b6..88711a5bab4e 100644 --- 
a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false autopatch_build = false [notify] From b561099481b237978ebd7fef7d9a5982af7614ea Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Sat, 16 Mar 2024 08:05:10 -0700 Subject: [PATCH 37/47] build test pt sm --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 88711a5bab4e..13fb37cfb3b6 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = false +do_build = true autopatch_build = false [notify] From 2b528048bc43599cb535ff6f08981121fb8f4462 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Sat, 16 Mar 2024 13:04:16 -0700 Subject: [PATCH 38/47] disable build --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 13fb37cfb3b6..5e7116b01172 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = flase autopatch_build = false [notify] From dd1e2b213fad0c4e72cc25057d3597c635b7d8a1 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Sat, 16 Mar 2024 13:04:31 -0700 Subject: [PATCH 39/47] build test pt sm --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml 
index 5e7116b01172..13fb37cfb3b6 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = flase +do_build = true autopatch_build = false [notify] From e5fe4852b560a6269e0eb1294876efa10c5fc019 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 18 Mar 2024 01:03:14 -0700 Subject: [PATCH 40/47] use get-login-password --- .../test_performance_pytorch_inference.py | 8 ++++- .../test_performance_pytorch_training.py | 7 +++- .../test_performance_tensorflow_inference.py | 7 +++- .../mxnet/inference/test_mxnet_inference.py | 7 +++- .../inference/test_pytorch_inference.py | 8 +++-- .../test_pytorch_inference_inductor.py | 7 +++- .../ec2/pytorch/training/common_cases.py | 7 +++- .../pytorch/training/test_pytorch_training.py | 7 +++- .../inference/test_tensorflow_inference.py | 13 +++++-- test/dlc_tests/ec2/test_smdebug.py | 14 ++++++-- test/test_utils/ec2.py | 34 ++++++++++++++----- test/test_utils/sagemaker.py | 5 ++- 12 files changed, 101 insertions(+), 23 deletions(-) diff --git a/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py b/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py index 2f4ecd201fb1..cfc7157ca729 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py @@ -1,6 +1,8 @@ import os import time import pytest +import boto3 + from src.benchmark_metrics import ( PYTORCH_INFERENCE_GPU_THRESHOLD, PYTORCH_INFERENCE_CPU_THRESHOLD, @@ -85,7 +87,11 @@ def ec2_performance_pytorch_inference( repo_name, image_tag = image_uri.split("/")[-1].split(":") # Make sure we are logged into ECR so we can pull the image - ec2_connection.run(f"$(aws ecr get-login 
--no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ") diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index dac172061d54..4b366258d055 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -2,6 +2,7 @@ import time import pytest import re +import boto3 from test.test_utils import ( CONTAINER_TESTS_PREFIX, @@ -142,7 +143,11 @@ def execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test( container_name = f"{repo_name}-performance-{image_tag}-ec2" # Make sure we are logged into ECR so we can pull the image - connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp connection.run(f"nvidia-docker pull {ecr_uri}") timestamp = time.strftime("%Y-%m-%d-%H-%M-%S") diff --git a/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py b/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py index 4bc20dd9cde1..3134adc009ac 100644 --- a/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py +++ 
b/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py @@ -1,6 +1,7 @@ import os import time import pytest +import boto3 from packaging.version import Version @@ -83,7 +84,11 @@ def ec2_performance_tensorflow_inference( num_iterations = 500 if is_pr_context() or is_graviton else 1000 # Make sure we are logged into ECR so we can pull the image - ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ") if is_graviton: # TF training binary is used that is compatible for graviton instance type diff --git a/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py b/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py index a9eb0eccc277..e2a29aa8df3b 100644 --- a/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py +++ b/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py @@ -1,5 +1,6 @@ import os import pytest +import boto3 import test.test_utils.ec2 as ec2_utils @@ -188,7 +189,11 @@ def run_ec2_mxnet_inference( f" {image_uri} {mms_inference_cmd}" ) try: - ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) LOGGER.info(docker_run_cmd) ec2_connection.run(docker_run_cmd, hide=True) if model_name == SQUEEZENET_MODEL: diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py index c32d3d86f35b..2da5c417a03a 
100644 --- a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py +++ b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py @@ -3,6 +3,7 @@ import time import logging from datetime import date, timedelta, datetime +import boto3 import pytest from packaging.version import Version @@ -22,7 +23,6 @@ get_ec2_accelerator_type, ) from test.dlc_tests.conftest import LOGGER -import boto3 LOGGER = logging.getLogger(__name__) LOGGER.addHandler(logging.StreamHandler(sys.stdout)) @@ -238,7 +238,11 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region): f" {image_uri} {inference_cmd}" ) try: - ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) LOGGER.info(docker_run_cmd) ec2_connection.run(docker_run_cmd, hide=True) server_type = get_inference_server_type(image_uri) diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py index 03c826df3638..19dc2091363f 100644 --- a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py +++ b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py @@ -1,6 +1,7 @@ from packaging.version import Version from packaging.specifiers import SpecifierSet import pytest +import boto3 from test import test_utils from test.test_utils import ( @@ -77,7 +78,11 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region): f" {image_uri} {inference_cmd}" ) try: - ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( + f"$(aws ecr get-login-password --region 
{region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) LOGGER.info(docker_run_cmd) ec2_connection.run(docker_run_cmd, hide=True) server_type = get_inference_server_type(image_uri) diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 9599eacbb2c9..eae9aea7a67c 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -1,4 +1,5 @@ import os +import boto3 from packaging.version import Version from packaging.specifiers import SpecifierSet @@ -189,7 +190,11 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region): PT 2.1 reintroduces a dependency on CUDNN to support NVDA TransformerEngine. This test is to ensure that torch CUDNN matches system CUDNN in the container. """ container_name = "pt_cudnn_test" - ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True) ec2_connection.run( f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py index 7c3ee3587ffc..829d97367b81 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py @@ -1,4 +1,5 @@ import os +import boto3 from packaging.version import Version from packaging.specifiers import SpecifierSet @@ -747,7 +748,11 @@ def test_pytorch_cudnn_match_gpu( PT 2.1 reintroduces a dependency on CUDNN to support NVDA 
TransformerEngine. This test is to ensure that torch CUDNN matches system CUDNN in the container. """ container_name = "pt_cudnn_test" - ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True) ec2_connection.run( f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True diff --git a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py index 22341767f4c8..fa97b6dce4eb 100644 --- a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py +++ b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py @@ -3,6 +3,7 @@ import json from time import sleep import pytest +import boto3 from packaging.version import Version from packaging.specifiers import SpecifierSet @@ -110,7 +111,11 @@ def test_ec2_tensorflow_inference_gpu_tensorrt( ) try: - ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) host_setup_for_tensorflow_inference(serving_folder_path, framework_version, ec2_connection) sleep(2) @@ -268,7 +273,11 @@ def run_ec2_tensorflow_inference( if not is_neuron: train_mnist_model(serving_folder_path, ec2_connection) sleep(10) - ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( 
+ f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) ec2_connection.run(docker_run_cmd, hide=True) sleep(20) if is_neuron and str(framework_version).startswith(TENSORFLOW2_VERSION): diff --git a/test/dlc_tests/ec2/test_smdebug.py b/test/dlc_tests/ec2/test_smdebug.py index 7a6d1acfd914..2637d84b9067 100644 --- a/test/dlc_tests/ec2/test_smdebug.py +++ b/test/dlc_tests/ec2/test_smdebug.py @@ -1,6 +1,6 @@ import os - import pytest +import boto3 import test.test_utils as test_utils @@ -170,7 +170,11 @@ def run_smdebug_test( shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " " framework = get_framework_from_image_uri(image_uri) container_test_local_dir = os.path.join("$HOME", "container_tests") - ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp ec2_connection.run(f"docker pull {image_uri}") @@ -209,7 +213,11 @@ def run_smprofiler_test( shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " " framework = get_framework_from_image_uri(image_uri) container_test_local_dir = os.path.join("$HOME", "container_tests") - ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + ec2_connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) # Do not add -q to docker pull as it leads to a hang for huge images like 
trcomp ec2_connection.run(f"docker pull {image_uri}") diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py index f31dfff251f8..bbddb62b5c62 100644 --- a/test/test_utils/ec2.py +++ b/test/test_utils/ec2.py @@ -1027,7 +1027,11 @@ def execute_ec2_training_test( container_test_local_dir = os.path.join("$HOME", "container_tests") synapseai_version = get_synapseai_version_from_tag(ecr_uri) # Make sure we are logged into ECR so we can pull the image - connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) # Run training command shm_setting = '--shm-size="1g"' if large_shm else "" @@ -1063,9 +1067,7 @@ def execute_ec2_training_test( "tensorflow" if "tensorflow" in ecr_uri else "pytorch" if "pytorch" in ecr_uri else None ) test_type = "ec2" - account_id_prefix = os.getenv( - "ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"] - )[:3] + account_id_prefix = os.getenv("ACCOUNT_ID", account_id)[:3] s3_bucket_for_permanent_logs = f"dlinfra-habana-tests-{account_id_prefix}" s3_uri_permanent_logs = get_s3_uri_for_saving_permanent_logs( framework, s3_bucket=s3_bucket_for_permanent_logs, test_type=test_type @@ -1111,7 +1113,11 @@ def execute_ec2_inference_test(connection, ecr_uri, test_cmd, region=DEFAULT_REG container_test_local_dir = os.path.join("$HOME", "container_tests") # Make sure we are logged into ECR so we can pull the image - connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) # Run training 
command connection.run( @@ -1145,7 +1151,11 @@ def execute_ec2_training_performance_test( log_location = os.path.join(container_test_local_dir, "benchmark", "logs", log_name) # Make sure we are logged into ECR so we can pull the image - connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) connection.run(f"{docker_cmd} pull {ecr_uri}", hide=True) @@ -1186,7 +1196,11 @@ def execute_ec2_habana_training_performance_test( ) synapseai_version = get_synapseai_version_from_tag(ecr_uri) # Make sure we are logged into ECR so we can pull the image - connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) connection.run(f"{docker_cmd} pull -q {ecr_uri}") @@ -1251,7 +1265,11 @@ def execute_ec2_inference_performance_test( f"{data_source}_results_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt" ) # Make sure we are logged into ECR so we can pull the image - connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True) + account_id = boto3.client("sts").get_caller_identity()["Account"] + connection.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) connection.run(f"{docker_cmd} pull -q {ecr_uri}") # Run training command, display benchmark results to console diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py index 5713e077ba16..9c8656859925 100644 --- 
a/test/test_utils/sagemaker.py +++ b/test/test_utils/sagemaker.py @@ -310,7 +310,10 @@ def execute_local_tests(image, pytest_cache_params): ec2_conn = ec2_utils.get_ec2_fabric_connection(instance_id, key_file, region) ec2_conn.put(sm_tests_tar_name, f"{UBUNTU_HOME_DIR}") ec2_utils.install_python_in_instance(ec2_conn, python_version="3.9") - ec2_conn.run(f"$(aws ecr get-login --no-include-email --region {region})") + ec2_conn.run( + f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + hide=True, + ) try: ec2_conn.run(f"docker pull {image}", timeout=600) except invoke.exceptions.CommandTimedOut as e: From ac78c1fe183454d137b18f9a5b22e9d8b7913e7a Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 18 Mar 2024 10:26:01 -0700 Subject: [PATCH 41/47] remove () from get-login --- dlc_developer_config.toml | 2 +- .../inference/test_performance_pytorch_inference.py | 2 +- .../training/test_performance_pytorch_training.py | 2 +- .../inference/test_performance_tensorflow_inference.py | 2 +- .../ec2/mxnet/inference/test_mxnet_inference.py | 2 +- .../ec2/pytorch/inference/test_pytorch_inference.py | 2 +- .../inference/test_pytorch_inference_inductor.py | 2 +- test/dlc_tests/ec2/pytorch/training/common_cases.py | 2 +- .../ec2/pytorch/training/test_pytorch_training.py | 2 +- .../tensorflow/inference/test_tensorflow_inference.py | 4 ++-- test/dlc_tests/ec2/test_smdebug.py | 4 ++-- test/test_utils/ec2.py | 10 +++++----- test/test_utils/sagemaker.py | 2 +- 13 files changed, 19 insertions(+), 19 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 13fb37cfb3b6..88711a5bab4e 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build 
= false autopatch_build = false [notify] diff --git a/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py b/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py index cfc7157ca729..55fbc2317818 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py @@ -89,7 +89,7 @@ def ec2_performance_pytorch_inference( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index 4b366258d055..f8ca58c429fd 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -145,7 +145,7 @@ def execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp diff --git 
a/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py b/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py index 3134adc009ac..f33e893b9cce 100644 --- a/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py +++ b/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py @@ -86,7 +86,7 @@ def ec2_performance_tensorflow_inference( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ") diff --git a/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py b/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py index e2a29aa8df3b..55e77648f745 100644 --- a/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py +++ b/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py @@ -191,7 +191,7 @@ def run_ec2_mxnet_inference( try: account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) LOGGER.info(docker_run_cmd) diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py index 2da5c417a03a..7ba9d8ac4905 100644 --- 
a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py +++ b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py @@ -240,7 +240,7 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region): try: account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) LOGGER.info(docker_run_cmd) diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py index 19dc2091363f..d6a523d7f8e2 100644 --- a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py +++ b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py @@ -80,7 +80,7 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region): try: account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) LOGGER.info(docker_run_cmd) diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index eae9aea7a67c..78c195c0c3be 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -192,7 +192,7 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region): container_name = "pt_cudnn_test" account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - 
f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True) diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py index 829d97367b81..961191a25a72 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py @@ -750,7 +750,7 @@ def test_pytorch_cudnn_match_gpu( container_name = "pt_cudnn_test" account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True) diff --git a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py index fa97b6dce4eb..c15ff65fcfa2 100644 --- a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py +++ b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py @@ -113,7 +113,7 @@ def test_ec2_tensorflow_inference_gpu_tensorrt( try: account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin 
{account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) host_setup_for_tensorflow_inference(serving_folder_path, framework_version, ec2_connection) @@ -275,7 +275,7 @@ def run_ec2_tensorflow_inference( sleep(10) account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) ec2_connection.run(docker_run_cmd, hide=True) diff --git a/test/dlc_tests/ec2/test_smdebug.py b/test/dlc_tests/ec2/test_smdebug.py index 2637d84b9067..1080d034c37d 100644 --- a/test/dlc_tests/ec2/test_smdebug.py +++ b/test/dlc_tests/ec2/test_smdebug.py @@ -172,7 +172,7 @@ def run_smdebug_test( container_test_local_dir = os.path.join("$HOME", "container_tests") account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp @@ -215,7 +215,7 @@ def run_smprofiler_test( container_test_local_dir = os.path.join("$HOME", "container_tests") account_id = boto3.client("sts").get_caller_identity()["Account"] ec2_connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) # Do not add -q to docker pull as it leads to a hang for huge images 
like trcomp diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py index bbddb62b5c62..7db07b3be736 100644 --- a/test/test_utils/ec2.py +++ b/test/test_utils/ec2.py @@ -1029,7 +1029,7 @@ def execute_ec2_training_test( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) @@ -1115,7 +1115,7 @@ def execute_ec2_inference_test(connection, ecr_uri, test_cmd, region=DEFAULT_REG # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) @@ -1153,7 +1153,7 @@ def execute_ec2_training_performance_test( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) @@ -1198,7 +1198,7 @@ def execute_ec2_habana_training_performance_test( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] connection.run( - f"$(aws ecr get-login-password --region {region} | docker login 
--username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) @@ -1267,7 +1267,7 @@ def execute_ec2_inference_performance_test( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] connection.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) connection.run(f"{docker_cmd} pull -q {ecr_uri}") diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py index 9c8656859925..211370348e47 100644 --- a/test/test_utils/sagemaker.py +++ b/test/test_utils/sagemaker.py @@ -311,7 +311,7 @@ def execute_local_tests(image, pytest_cache_params): ec2_conn.put(sm_tests_tar_name, f"{UBUNTU_HOME_DIR}") ec2_utils.install_python_in_instance(ec2_conn, python_version="3.9") ec2_conn.run( - f"$(aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com)", + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", hide=True, ) try: From 4013f890844a676c553b2ec61f8ca5d49f3b4e1f Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 18 Mar 2024 11:43:59 -0700 Subject: [PATCH 42/47] test tensorflow --- dlc_developer_config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 88711a5bab4e..f7b5b161aa70 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -34,7 +34,7 @@ deep_canary_mode = false [build] # Add in frameworks you would 
like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["pytorch"] +build_frameworks = ["tensorflow"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = false +do_build = true autopatch_build = false [notify] From 9f74eebcd1077eafb30cc7c5935fec1b14bf7d11 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 18 Mar 2024 12:54:46 -0700 Subject: [PATCH 43/47] use login_to_ecr_registry function --- dlc_developer_config.toml | 2 +- .../test_performance_pytorch_inference.py | 6 ++-- .../test_performance_pytorch_training.py | 6 ++-- .../test_performance_tensorflow_inference.py | 6 ++-- .../mxnet/inference/test_mxnet_inference.py | 11 +++---- .../inference/test_pytorch_inference.py | 6 ++-- .../test_pytorch_inference_inductor.py | 6 ++-- .../ec2/pytorch/training/common_cases.py | 6 ++-- .../pytorch/training/test_pytorch_training.py | 6 ++-- .../inference/test_tensorflow_inference.py | 10 ++----- test/dlc_tests/ec2/test_smdebug.py | 11 ++----- test/test_utils/ec2.py | 30 +++++-------------- test/test_utils/sagemaker.py | 6 ++-- 13 files changed, 35 insertions(+), 77 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index f7b5b161aa70..4eedf0d476b7 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set 
do_build to "false" -do_build = true +do_build = false autopatch_build = false [notify] diff --git a/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py b/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py index 55fbc2317818..4c32e5f86e3f 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py @@ -13,6 +13,7 @@ get_framework_and_version_from_tag, UL20_CPU_ARM64_US_WEST_2, LOGGER, + login_to_ecr_registry, ) from test.test_utils.ec2 import ( ec2_performance_upload_result_to_s3_and_validate, @@ -88,10 +89,7 @@ def ec2_performance_pytorch_inference( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(ec2_connection, account_id, region) ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ") diff --git a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index f8ca58c429fd..bb2757d51922 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -11,6 +11,7 @@ DEFAULT_REGION, get_framework_and_version_from_tag, is_pr_context, + login_to_ecr_registry, ) from test.test_utils.ec2 import ( execute_ec2_training_performance_test, @@ -144,10 +145,7 @@ def execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] - connection.run( - 
f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(connection, account_id, region) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp connection.run(f"nvidia-docker pull {ecr_uri}") timestamp = time.strftime("%Y-%m-%d-%H-%M-%S") diff --git a/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py b/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py index f33e893b9cce..6ca2184ea693 100644 --- a/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py +++ b/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py @@ -15,6 +15,7 @@ is_pr_context, is_tf_version, UL20_BENCHMARK_CPU_ARM64_US_WEST_2, + login_to_ecr_registry, ) from test.test_utils.ec2 import ( ec2_performance_upload_result_to_s3_and_validate, @@ -85,10 +86,7 @@ def ec2_performance_tensorflow_inference( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(ec2_connection, account_id, region) ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ") if is_graviton: # TF training binary is used that is compatible for graviton instance type diff --git a/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py b/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py index 55e77648f745..2c1a4ffdf139 100644 --- a/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py +++ b/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py @@ -5,7 +5,11 @@ import test.test_utils.ec2 as ec2_utils from test import test_utils -from 
test.test_utils import CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag +from test.test_utils import ( + CONTAINER_TESTS_PREFIX, + get_framework_and_version_from_tag, + login_to_ecr_registry, +) from test.test_utils.ec2 import ( get_ec2_instance_type, execute_ec2_inference_test, @@ -190,10 +194,7 @@ def run_ec2_mxnet_inference( ) try: account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(ec2_connection, account_id, region) LOGGER.info(docker_run_cmd) ec2_connection.run(docker_run_cmd, hide=True) if model_name == SQUEEZENET_MODEL: diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py index 7ba9d8ac4905..f657b1dcd177 100644 --- a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py +++ b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py @@ -16,6 +16,7 @@ get_framework_and_version_from_tag, get_inference_server_type, get_cuda_version_from_tag, + login_to_ecr_registry, ) from test.test_utils.ec2 import ( get_ec2_instance_type, @@ -239,10 +240,7 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region): ) try: account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(ec2_connection, account_id, region) LOGGER.info(docker_run_cmd) ec2_connection.run(docker_run_cmd, hide=True) server_type = get_inference_server_type(image_uri) diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py index d6a523d7f8e2..2165112c9041 
100644 --- a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py +++ b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py @@ -9,6 +9,7 @@ get_framework_and_version_from_tag, get_inference_server_type, UL20_CPU_ARM64_US_WEST_2, + login_to_ecr_registry, ) from test.test_utils.ec2 import ( get_ec2_instance_type, @@ -79,10 +80,7 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region): ) try: account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(ec2_connection, account_id, region) LOGGER.info(docker_run_cmd) ec2_connection.run(docker_run_cmd, hide=True) server_type = get_inference_server_type(image_uri) diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 78c195c0c3be..c1e684c8a962 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -10,6 +10,7 @@ CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag, get_cuda_version_from_tag, + login_to_ecr_registry, ) from test.test_utils.ec2 import ( execute_ec2_training_test, @@ -191,10 +192,7 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region): """ container_name = "pt_cudnn_test" account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(ec2_connection, account_id, region) ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True) ec2_connection.run( f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True diff --git 
a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py index 961191a25a72..a40819904799 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py @@ -13,6 +13,7 @@ UBUNTU_18_HPU_DLAMI_US_WEST_2, get_framework_and_version_from_tag, get_cuda_version_from_tag, + login_to_ecr_registry, ) from test.test_utils.ec2 import ( execute_ec2_training_test, @@ -749,10 +750,7 @@ def test_pytorch_cudnn_match_gpu( """ container_name = "pt_cudnn_test" account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(ec2_connection, account_id, region) ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True) ec2_connection.run( f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True diff --git a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py index c15ff65fcfa2..f52f82b7cf3c 100644 --- a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py +++ b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py @@ -112,10 +112,7 @@ def test_ec2_tensorflow_inference_gpu_tensorrt( try: account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + test_utils.login_to_ecr_registry(ec2_connection, account_id, region) host_setup_for_tensorflow_inference(serving_folder_path, framework_version, ec2_connection) sleep(2) @@ -274,10 +271,7 @@ def run_ec2_tensorflow_inference( train_mnist_model(serving_folder_path, 
ec2_connection) sleep(10) account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + test_utils.login_to_ecr_registry(ec2_connection, account_id, region) ec2_connection.run(docker_run_cmd, hide=True) sleep(20) if is_neuron and str(framework_version).startswith(TENSORFLOW2_VERSION): diff --git a/test/dlc_tests/ec2/test_smdebug.py b/test/dlc_tests/ec2/test_smdebug.py index 1080d034c37d..3f89c3c1b4f1 100644 --- a/test/dlc_tests/ec2/test_smdebug.py +++ b/test/dlc_tests/ec2/test_smdebug.py @@ -13,6 +13,7 @@ is_tf_version, get_framework_and_version_from_tag, is_nightly_context, + login_to_ecr_registry, ) from test.test_utils.ec2 import get_ec2_instance_type @@ -171,10 +172,7 @@ def run_smdebug_test( framework = get_framework_from_image_uri(image_uri) container_test_local_dir = os.path.join("$HOME", "container_tests") account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(ec2_connection, account_id, region) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp ec2_connection.run(f"docker pull {image_uri}") @@ -214,10 +212,7 @@ def run_smprofiler_test( framework = get_framework_from_image_uri(image_uri) container_test_local_dir = os.path.join("$HOME", "container_tests") account_id = boto3.client("sts").get_caller_identity()["Account"] - ec2_connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(ec2_connection, account_id, region) # Do not add -q to docker pull as it leads to a hang for huge images like 
trcomp ec2_connection.run(f"docker pull {image_uri}") diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py index 7db07b3be736..60cffb863d98 100644 --- a/test/test_utils/ec2.py +++ b/test/test_utils/ec2.py @@ -30,6 +30,7 @@ is_pr_context, is_mainline_context, are_heavy_instance_ec2_tests_enabled, + login_to_ecr_registry, ) from . import DEFAULT_REGION, P3DN_REGION, P4DE_REGION, UL_AMI_LIST, BENCHMARK_RESULTS_S3_BUCKET @@ -1028,10 +1029,7 @@ def execute_ec2_training_test( synapseai_version = get_synapseai_version_from_tag(ecr_uri) # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] - connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(connection, account_id, region) # Run training command shm_setting = '--shm-size="1g"' if large_shm else "" @@ -1114,10 +1112,7 @@ def execute_ec2_inference_test(connection, ecr_uri, test_cmd, region=DEFAULT_REG # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] - connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(connection, account_id, region) # Run training command connection.run( @@ -1152,10 +1147,7 @@ def execute_ec2_training_performance_test( # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] - connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(connection, account_id, region) connection.run(f"{docker_cmd} pull {ecr_uri}", hide=True) @@ -1197,10 +1189,7 @@ def 
execute_ec2_habana_training_performance_test( synapseai_version = get_synapseai_version_from_tag(ecr_uri) # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] - connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(connection, account_id, region) connection.run(f"{docker_cmd} pull -q {ecr_uri}") @@ -1222,9 +1211,7 @@ def execute_ec2_habana_training_performance_test( framework = ( "tensorflow" if "tensorflow" in ecr_uri else "pytorch" if "pytorch" in ecr_uri else None ) - account_id_prefix = os.getenv( - "ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"] - )[:3] + account_id_prefix = os.getenv("ACCOUNT_ID", account_id)[:3] s3_bucket_for_permanent_logs = f"dlinfra-habana-tests-{account_id_prefix}" test_type = "benchmark" custom_filename = test_cmd.split(f"{os.sep}")[-1] @@ -1266,10 +1253,7 @@ def execute_ec2_inference_performance_test( ) # Make sure we are logged into ECR so we can pull the image account_id = boto3.client("sts").get_caller_identity()["Account"] - connection.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(connection, account_id, region) connection.run(f"{docker_cmd} pull -q {ecr_uri}") # Run training command, display benchmark results to console diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py index 211370348e47..ab468bdaf992 100644 --- a/test/test_utils/sagemaker.py +++ b/test/test_utils/sagemaker.py @@ -33,6 +33,7 @@ DEFAULT_REGION, is_nightly_context, get_instance_type_base_dlami, + login_to_ecr_registry, ) from test_utils.pytest_cache import PytestCache @@ -310,10 +311,7 @@ def execute_local_tests(image, pytest_cache_params): ec2_conn = 
ec2_utils.get_ec2_fabric_connection(instance_id, key_file, region) ec2_conn.put(sm_tests_tar_name, f"{UBUNTU_HOME_DIR}") ec2_utils.install_python_in_instance(ec2_conn, python_version="3.9") - ec2_conn.run( - f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com", - hide=True, - ) + login_to_ecr_registry(ec2_conn, account_id, region) try: ec2_conn.run(f"docker pull {image}", timeout=600) except invoke.exceptions.CommandTimedOut as e: From 185d4a5f66a55de1500eb4a0a5472b551d7a060a Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 18 Mar 2024 13:31:37 -0700 Subject: [PATCH 44/47] use dict for base dlami logic --- test/test_utils/__init__.py | 73 +++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 40 deletions(-) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index deb85d25d551..59c2daa48c97 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -2397,6 +2397,7 @@ def get_image_spec_from_buildspec(image_uri, dlc_folder_path): def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): """ Get Instance types based on EC2 instance, see https://docs.aws.amazon.com/dlami/latest/devguide/important-changes.html + For all instance names, see https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing OSS Nvidia Driver DLAMI supports the following: ["g4dn.xlarge", "g4dn.2xlarge", "g4dn.4xlarge", @@ -2440,47 +2441,39 @@ def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"): "g3.16xlarge", ] - # set defaults - if linux_dist == "AML2": - oss_dlami_us_east_1 = AML2_BASE_OSS_DLAMI_US_EAST_1 - oss_dlami_us_west_2 = AML2_BASE_OSS_DLAMI_US_WEST_2 - oss_dlami_name_pattern = ( - "Deep Learning Base OSS Nvidia Driver AMI (Amazon Linux 2) Version ??.?" 
- ) - - proprietary_dlami_us_east_1 = AML2_BASE_PROPRIETARY_DLAMI_US_EAST_1 - proprietary_dlami_us_west_2 = AML2_BASE_PROPRIETARY_DLAMI_US_WEST_2 - proprietary_dlami_name_pattern = ( - "Deep Learning Base Proprietary Nvidia Driver AMI (Amazon Linux 2) Version ??.?" - ) - else: - oss_dlami_us_east_1 = UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1 - oss_dlami_us_west_2 = UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 - oss_dlami_name_pattern = ( - "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????" - ) - - proprietary_dlami_us_east_1 = UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1 - proprietary_dlami_us_west_2 = UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2 - proprietary_dlami_name_pattern = ( - "Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????" - ) + ami_patterns = { + "AML2": { + "oss": { + "name_pattern": "Deep Learning Base OSS Nvidia Driver AMI (Amazon Linux 2) Version ??.?", + "us-east-1": AML2_BASE_OSS_DLAMI_US_EAST_1, + "us-west-2": AML2_BASE_OSS_DLAMI_US_WEST_2, + }, + "proprietary": { + "name_pattern": "Deep Learning Base Proprietary Nvidia Driver AMI (Amazon Linux 2) Version ??.?", + "us-east-1": AML2_BASE_PROPRIETARY_DLAMI_US_EAST_1, + "us-west-2": AML2_BASE_PROPRIETARY_DLAMI_US_WEST_2, + }, + }, + "UBUNTU_20": { + "oss": { + "name_pattern": "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", + "us-east-1": UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1, + "us-west-2": UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2, + }, + "proprietary": { + "name_pattern": "Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????", + "us-east-1": UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1, + "us-west-2": UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2, + }, + }, + } - instance_ami = ( - proprietary_dlami_us_east_1 - if region == "us-east-1" and instance_type in base_proprietary_dlami_instances - else proprietary_dlami_us_west_2 - if region == "us-west-2" and instance_type in base_proprietary_dlami_instances - else get_ami_id_boto3( - 
region_name=region, - ami_name_pattern=proprietary_dlami_name_pattern, - ) - if instance_type in base_proprietary_dlami_instances - else oss_dlami_us_east_1 - if region == "us-east-1" - else oss_dlami_us_west_2 - if region == "us-west-2" - else get_ami_id_boto3(region_name=region, ami_name_pattern=oss_dlami_name_pattern) + ami_type = "proprietary" if instance_type in base_proprietary_dlami_instances else "oss" + instance_ami = ami_patterns[linux_dist][ami_type].get( + region, + get_ami_id_boto3( + region_name=region, ami_name_pattern=ami_patterns[linux_dist][ami_type]["name_pattern"] + ), ) return instance_ami From a8930355480a54561e5dedc27b1a65287f1e180f Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 18 Mar 2024 16:27:44 -0700 Subject: [PATCH 45/47] use image uri instead --- dlc_developer_config.toml | 8 ++++---- .../test_performance_pytorch_inference.py | 6 +++--- .../test_performance_pytorch_training.py | 4 ++-- .../test_performance_tensorflow_inference.py | 4 ++-- .../mxnet/inference/test_mxnet_inference.py | 4 ++-- .../inference/test_pytorch_inference.py | 7 ++----- .../test_pytorch_inference_inductor.py | 5 ++--- .../ec2/pytorch/training/common_cases.py | 4 ++-- .../pytorch/training/test_pytorch_training.py | 4 ++-- .../inference/test_tensorflow_inference.py | 6 ++---- test/dlc_tests/ec2/test_smdebug.py | 5 +++-- test/test_utils/ec2.py | 20 +++++++++++-------- 12 files changed, 38 insertions(+), 39 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 4eedf0d476b7..c66a67df0942 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -34,15 +34,15 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. 
# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["tensorflow"] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = true -build_inference = false +build_training = false +build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = false +do_build = true autopatch_build = false [notify] diff --git a/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py b/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py index 4c32e5f86e3f..2fddca843d15 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/inference/test_performance_pytorch_inference.py @@ -1,7 +1,6 @@ import os import time import pytest -import boto3 from src.benchmark_metrics import ( PYTORCH_INFERENCE_GPU_THRESHOLD, @@ -12,8 +11,9 @@ CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag, UL20_CPU_ARM64_US_WEST_2, - LOGGER, login_to_ecr_registry, + get_account_id_from_image_uri, + LOGGER, ) from test.test_utils.ec2 import ( ec2_performance_upload_result_to_s3_and_validate, @@ -88,7 +88,7 @@ def ec2_performance_pytorch_inference( repo_name, image_tag = image_uri.split("/")[-1].split(":") # Make sure we are logged into ECR so we can pull the image - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(image_uri) login_to_ecr_registry(ec2_connection, account_id, region) ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ") diff --git 
a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py index bb2757d51922..8aeec2c27488 100644 --- a/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py +++ b/test/dlc_tests/benchmark/ec2/pytorch/training/test_performance_pytorch_training.py @@ -2,7 +2,6 @@ import time import pytest import re -import boto3 from test.test_utils import ( CONTAINER_TESTS_PREFIX, @@ -12,6 +11,7 @@ get_framework_and_version_from_tag, is_pr_context, login_to_ecr_registry, + get_account_id_from_image_uri, ) from test.test_utils.ec2 import ( execute_ec2_training_performance_test, @@ -144,7 +144,7 @@ def execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test( container_name = f"{repo_name}-performance-{image_tag}-ec2" # Make sure we are logged into ECR so we can pull the image - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(ecr_uri) login_to_ecr_registry(connection, account_id, region) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp connection.run(f"nvidia-docker pull {ecr_uri}") diff --git a/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py b/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py index 6ca2184ea693..a10dcb7dbe8b 100644 --- a/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py +++ b/test/dlc_tests/benchmark/ec2/tensorflow/inference/test_performance_tensorflow_inference.py @@ -1,7 +1,6 @@ import os import time import pytest -import boto3 from packaging.version import Version @@ -16,6 +15,7 @@ is_tf_version, UL20_BENCHMARK_CPU_ARM64_US_WEST_2, login_to_ecr_registry, + get_account_id_from_image_uri, ) from test.test_utils.ec2 import ( ec2_performance_upload_result_to_s3_and_validate, @@ -85,7 +85,7 @@ def 
ec2_performance_tensorflow_inference( num_iterations = 500 if is_pr_context() or is_graviton else 1000 # Make sure we are logged into ECR so we can pull the image - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(image_uri) login_to_ecr_registry(ec2_connection, account_id, region) ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ") if is_graviton: diff --git a/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py b/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py index 2c1a4ffdf139..fff6e577bf34 100644 --- a/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py +++ b/test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py @@ -1,6 +1,5 @@ import os import pytest -import boto3 import test.test_utils.ec2 as ec2_utils @@ -9,6 +8,7 @@ CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag, login_to_ecr_registry, + get_account_id_from_image_uri, ) from test.test_utils.ec2 import ( get_ec2_instance_type, @@ -193,7 +193,7 @@ def run_ec2_mxnet_inference( f" {image_uri} {mms_inference_cmd}" ) try: - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(image_uri) login_to_ecr_registry(ec2_connection, account_id, region) LOGGER.info(docker_run_cmd) ec2_connection.run(docker_run_cmd, hide=True) diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py index f657b1dcd177..98c8d146dbbe 100644 --- a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py +++ b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference.py @@ -1,9 +1,6 @@ import os import sys -import time import logging -from datetime import date, timedelta, datetime -import boto3 import pytest from packaging.version import Version @@ -15,8 +12,8 @@ CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag, get_inference_server_type, - get_cuda_version_from_tag, 
login_to_ecr_registry, + get_account_id_from_image_uri, ) from test.test_utils.ec2 import ( get_ec2_instance_type, @@ -239,7 +236,7 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region): f" {image_uri} {inference_cmd}" ) try: - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(image_uri) login_to_ecr_registry(ec2_connection, account_id, region) LOGGER.info(docker_run_cmd) ec2_connection.run(docker_run_cmd, hide=True) diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py index 2165112c9041..8c772d6b5f50 100644 --- a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py +++ b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py @@ -1,15 +1,14 @@ from packaging.version import Version from packaging.specifiers import SpecifierSet import pytest -import boto3 from test import test_utils from test.test_utils import ( - CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag, get_inference_server_type, UL20_CPU_ARM64_US_WEST_2, login_to_ecr_registry, + get_account_id_from_image_uri, ) from test.test_utils.ec2 import ( get_ec2_instance_type, @@ -79,7 +78,7 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region): f" {image_uri} {inference_cmd}" ) try: - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(image_uri) login_to_ecr_registry(ec2_connection, account_id, region) LOGGER.info(docker_run_cmd) ec2_connection.run(docker_run_cmd, hide=True) diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index c1e684c8a962..67ccf8835324 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -1,5 +1,4 @@ import os -import boto3 from packaging.version 
import Version from packaging.specifiers import SpecifierSet @@ -11,6 +10,7 @@ get_framework_and_version_from_tag, get_cuda_version_from_tag, login_to_ecr_registry, + get_account_id_from_image_uri, ) from test.test_utils.ec2 import ( execute_ec2_training_test, @@ -191,7 +191,7 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region): PT 2.1 reintroduces a dependency on CUDNN to support NVDA TransformerEngine. This test is to ensure that torch CUDNN matches system CUDNN in the container. """ container_name = "pt_cudnn_test" - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(pytorch_training) login_to_ecr_registry(ec2_connection, account_id, region) ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True) ec2_connection.run( diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py index a40819904799..755472282380 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py @@ -1,5 +1,4 @@ import os -import boto3 from packaging.version import Version from packaging.specifiers import SpecifierSet @@ -14,6 +13,7 @@ get_framework_and_version_from_tag, get_cuda_version_from_tag, login_to_ecr_registry, + get_account_id_from_image_uri, ) from test.test_utils.ec2 import ( execute_ec2_training_test, @@ -749,7 +749,7 @@ def test_pytorch_cudnn_match_gpu( PT 2.1 reintroduces a dependency on CUDNN to support NVDA TransformerEngine. This test is to ensure that torch CUDNN matches system CUDNN in the container. 
""" container_name = "pt_cudnn_test" - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(pytorch_training) login_to_ecr_registry(ec2_connection, account_id, region) ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True) ec2_connection.run( diff --git a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py index f52f82b7cf3c..8c5a3f4fce27 100644 --- a/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py +++ b/test/dlc_tests/ec2/tensorflow/inference/test_tensorflow_inference.py @@ -1,9 +1,7 @@ import os import re -import json from time import sleep import pytest -import boto3 from packaging.version import Version from packaging.specifiers import SpecifierSet @@ -111,7 +109,7 @@ def test_ec2_tensorflow_inference_gpu_tensorrt( ) try: - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = test_utils.get_account_id_from_image_uri(tensorflow_inference) test_utils.login_to_ecr_registry(ec2_connection, account_id, region) host_setup_for_tensorflow_inference(serving_folder_path, framework_version, ec2_connection) sleep(2) @@ -270,7 +268,7 @@ def run_ec2_tensorflow_inference( if not is_neuron: train_mnist_model(serving_folder_path, ec2_connection) sleep(10) - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = test_utils.get_account_id_from_image_uri(image_uri) test_utils.login_to_ecr_registry(ec2_connection, account_id, region) ec2_connection.run(docker_run_cmd, hide=True) sleep(20) diff --git a/test/dlc_tests/ec2/test_smdebug.py b/test/dlc_tests/ec2/test_smdebug.py index 3f89c3c1b4f1..be536dc56d7d 100644 --- a/test/dlc_tests/ec2/test_smdebug.py +++ b/test/dlc_tests/ec2/test_smdebug.py @@ -14,6 +14,7 @@ get_framework_and_version_from_tag, is_nightly_context, login_to_ecr_registry, + get_account_id_from_image_uri, ) from test.test_utils.ec2 import 
get_ec2_instance_type @@ -171,7 +172,7 @@ def run_smdebug_test( shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " " framework = get_framework_from_image_uri(image_uri) container_test_local_dir = os.path.join("$HOME", "container_tests") - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(image_uri) login_to_ecr_registry(ec2_connection, account_id, region) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp ec2_connection.run(f"docker pull {image_uri}") @@ -211,7 +212,7 @@ def run_smprofiler_test( shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " " framework = get_framework_from_image_uri(image_uri) container_test_local_dir = os.path.join("$HOME", "container_tests") - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(image_uri) login_to_ecr_registry(ec2_connection, account_id, region) # Do not add -q to docker pull as it leads to a hang for huge images like trcomp ec2_connection.run(f"docker pull {image_uri}") diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py index 60cffb863d98..a3654cb33d43 100644 --- a/test/test_utils/ec2.py +++ b/test/test_utils/ec2.py @@ -8,7 +8,6 @@ from inspect import signature import boto3 -import pytest from fabric import Connection from botocore.config import Config @@ -31,6 +30,7 @@ is_mainline_context, are_heavy_instance_ec2_tests_enabled, login_to_ecr_registry, + get_account_id_from_image_uri, ) from . 
import DEFAULT_REGION, P3DN_REGION, P4DE_REGION, UL_AMI_LIST, BENCHMARK_RESULTS_S3_BUCKET @@ -1028,7 +1028,7 @@ def execute_ec2_training_test( container_test_local_dir = os.path.join("$HOME", "container_tests") synapseai_version = get_synapseai_version_from_tag(ecr_uri) # Make sure we are logged into ECR so we can pull the image - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(ecr_uri) login_to_ecr_registry(connection, account_id, region) # Run training command @@ -1065,7 +1065,9 @@ def execute_ec2_training_test( "tensorflow" if "tensorflow" in ecr_uri else "pytorch" if "pytorch" in ecr_uri else None ) test_type = "ec2" - account_id_prefix = os.getenv("ACCOUNT_ID", account_id)[:3] + account_id_prefix = os.getenv( + "ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"] + )[:3] s3_bucket_for_permanent_logs = f"dlinfra-habana-tests-{account_id_prefix}" s3_uri_permanent_logs = get_s3_uri_for_saving_permanent_logs( framework, s3_bucket=s3_bucket_for_permanent_logs, test_type=test_type @@ -1111,7 +1113,7 @@ def execute_ec2_inference_test(connection, ecr_uri, test_cmd, region=DEFAULT_REG container_test_local_dir = os.path.join("$HOME", "container_tests") # Make sure we are logged into ECR so we can pull the image - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(ecr_uri) login_to_ecr_registry(connection, account_id, region) # Run training command @@ -1146,7 +1148,7 @@ def execute_ec2_training_performance_test( log_location = os.path.join(container_test_local_dir, "benchmark", "logs", log_name) # Make sure we are logged into ECR so we can pull the image - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(ecr_uri) login_to_ecr_registry(connection, account_id, region) connection.run(f"{docker_cmd} pull {ecr_uri}", hide=True) @@ -1188,7 +1190,7 @@ def 
execute_ec2_habana_training_performance_test( ) synapseai_version = get_synapseai_version_from_tag(ecr_uri) # Make sure we are logged into ECR so we can pull the image - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(ecr_uri) login_to_ecr_registry(connection, account_id, region) connection.run(f"{docker_cmd} pull -q {ecr_uri}") @@ -1211,7 +1213,9 @@ def execute_ec2_habana_training_performance_test( framework = ( "tensorflow" if "tensorflow" in ecr_uri else "pytorch" if "pytorch" in ecr_uri else None ) - account_id_prefix = os.getenv("ACCOUNT_ID", account_id)[:3] + account_id_prefix = os.getenv( + "ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"] + )[:3] s3_bucket_for_permanent_logs = f"dlinfra-habana-tests-{account_id_prefix}" test_type = "benchmark" custom_filename = test_cmd.split(f"{os.sep}")[-1] @@ -1252,7 +1256,7 @@ def execute_ec2_inference_performance_test( f"{data_source}_results_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt" ) # Make sure we are logged into ECR so we can pull the image - account_id = boto3.client("sts").get_caller_identity()["Account"] + account_id = get_account_id_from_image_uri(ecr_uri) login_to_ecr_registry(connection, account_id, region) connection.run(f"{docker_cmd} pull -q {ecr_uri}") From 64d9afae35fb67fa4622ec3eceb767aaef216030 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 18 Mar 2024 16:47:17 -0700 Subject: [PATCH 46/47] fix aml2 dlami logic --- test/dlc_tests/conftest.py | 9 ++++++--- .../pytorch/inference/test_pytorch_inference_inductor.py | 2 +- test/dlc_tests/ec2/test_smdebug.py | 1 - 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index d308fd1a82e7..1823c4ebd520 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -544,6 +544,8 @@ def ec2_instance( ): _validate_p4de_usage(request, ec2_instance_type) if ec2_instance_type == 
"p3dn.24xlarge": + # Keep track of initial region to get information about previous AMI + initial_region = region region = P3DN_REGION ec2_client = boto3.client( "ec2", region_name=region, config=Config(retries={"max_attempts": 10}) @@ -552,15 +554,16 @@ def ec2_instance( "ec2", region_name=region, config=Config(retries={"max_attempts": 10}) ) if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1: + # Assign as AML2 if initial AMI is AML2, else use default ec2_instance_ami = ( test_utils.get_instance_type_base_dlami( - ec2_instance_type, "us-east-1", linux_dist="AML2" + ec2_instance_type, region, linux_dist="AML2" ) if ec2_instance_ami == test_utils.get_instance_type_base_dlami( - ec2_instance_type, "us-west-2", linux_dist="AML2" + ec2_instance_type, initial_region, linux_dist="AML2" ) - else test_utils.get_instance_type_base_dlami(ec2_instance_type, "us-east-1") + else test_utils.get_instance_type_base_dlami(ec2_instance_type, region) ) ec2_key_name = f"{ec2_key_name}-{str(uuid.uuid4())}" diff --git a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py index 8c772d6b5f50..fa491ff50414 100644 --- a/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py +++ b/test/dlc_tests/ec2/pytorch/inference/test_pytorch_inference_inductor.py @@ -53,7 +53,7 @@ def test_ec2_pytorch_inference_cpu_compilation(pytorch_inference, ec2_connection @pytest.mark.parametrize("ec2_instance_type", PT_EC2_GRAVITON_INSTANCE_TYPES, indirect=True) @pytest.mark.parametrize("ec2_instance_ami", [UL20_CPU_ARM64_US_WEST_2], indirect=True) @pytest.mark.team("training-compiler") -def test_ec2_pytorch_inference_cpu_compilation( +def test_ec2_pytorch_inference_graviton_compilation( pytorch_inference_graviton, ec2_connection, region, cpu_only ): _, image_framework_version = get_framework_and_version_from_tag(pytorch_inference_graviton) diff --git a/test/dlc_tests/ec2/test_smdebug.py 
b/test/dlc_tests/ec2/test_smdebug.py index be536dc56d7d..927721ed09e1 100644 --- a/test/dlc_tests/ec2/test_smdebug.py +++ b/test/dlc_tests/ec2/test_smdebug.py @@ -1,6 +1,5 @@ import os import pytest -import boto3 import test.test_utils as test_utils From ef0357855a89753ff0730d049d5672be41684dff Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 18 Mar 2024 19:48:41 -0700 Subject: [PATCH 47/47] revert toml file --- dlc_developer_config.toml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index c66a67df0942..10c783d4eb22 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -34,10 +34,10 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["pytorch"] +build_frameworks = [] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = false +build_training = true build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR @@ -67,20 +67,20 @@ ec2_benchmark_tests = false ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
### Off by default (set to false) -ec2_tests_on_heavy_instances = true +ec2_tests_on_heavy_instances = false ### SM specific tests ### Off by default -sagemaker_local_tests = true +sagemaker_local_tests = false # run standard sagemaker remote tests from test/sagemaker_tests -sagemaker_remote_tests = true +sagemaker_remote_tests = false # run efa sagemaker tests -sagemaker_efa_tests = true +sagemaker_efa_tests = false # run release_candidate_integration tests -sagemaker_rc_tests = true +sagemaker_rc_tests = false # run sagemaker benchmark tests -sagemaker_benchmark_tests = true +sagemaker_benchmark_tests = false # SM remote EFA test instance type sagemaker_remote_efa_instance_type = ""