Update DLAMI BASE AMI Logic to switch between OSS and Proprietary Nvidia Driver AMI #3760

Merged (51 commits) on Mar 19, 2024
Changes from 37 commits
Commits (51)
840aef6
Update DLAMI BASE AMI Logic to switch between OSS and Proprietary Nvi…
Mar 8, 2024
95f6e46
update gdrcopy to 2.4
Mar 8, 2024
2e4b09b
formatting
Mar 8, 2024
c28d32b
disable buiild and fix sm local test instance ami
Mar 11, 2024
61212b4
use proprietary drier dlami as default
Mar 11, 2024
a0f8f82
fix ul20 and aml2 dlami name logic and test only ec2
Mar 11, 2024
ce3d3da
allow test efa
Mar 11, 2024
00fba94
update oss dlami list
Mar 11, 2024
434fbdc
test curand
Mar 11, 2024
115c33c
ensure ec2 instance type fixture is ran before ec2 instance ami
Mar 12, 2024
092b14b
alter ami pulling logic
Mar 12, 2024
b75b415
usefixtures
Mar 12, 2024
9dc8fec
use parametrize
Mar 12, 2024
95ddb86
use instance ami in parametrize
Mar 12, 2024
1d9347f
add instace ami ad parametrize
Mar 12, 2024
a78962d
Merge branch 'master' into update-ami
sirutBuasai Mar 12, 2024
0a9504d
fix curand test
Mar 13, 2024
c66555e
correct ami name
Mar 13, 2024
e5716bc
correct ami format
Mar 13, 2024
66ce9fc
use proprietary dlami for curand
Mar 13, 2024
68273a4
rebuild
Mar 14, 2024
c70f0e9
logging debug
Mar 14, 2024
75f8e86
remove parametrize ami
Mar 14, 2024
5a99d36
flip logic
Mar 14, 2024
9040c77
formatting
Mar 14, 2024
2f16a5b
print instance ami
Mar 14, 2024
3b15a71
fix typo
Mar 14, 2024
b83eed1
remove parametrize logic and fix proprietary dlami name pattern
Mar 14, 2024
9f3a24d
Merge branch 'master' into update-ami
sirutBuasai Mar 14, 2024
3a78b32
revert gdr copy
Mar 14, 2024
f8af0b0
update test with gdrcopy 2.4
Mar 14, 2024
e32684a
build test pt ec2
Mar 15, 2024
c8b1ce0
build test pt sm
Mar 15, 2024
63d8a31
remove gdrcopy ami
Mar 15, 2024
e40298b
sanity and sm local testonly
Mar 15, 2024
9efd8da
build test pt sm
Mar 15, 2024
f8538bf
Merge branch 'master' into update-ami
sirutBuasai Mar 15, 2024
f9633d6
formatting
Mar 15, 2024
2682dac
test pt sm
Mar 16, 2024
b561099
build test pt sm
Mar 16, 2024
2b52804
disable build
Mar 16, 2024
dd1e2b2
build test pt sm
Mar 16, 2024
e5fe485
use get-login-password
Mar 18, 2024
ac78c1f
remove () from get-login
Mar 18, 2024
4013f89
test tensorflow
Mar 18, 2024
9f74eeb
use login_to_ecr_registry function
Mar 18, 2024
185d4a5
use dict for base dlami logic
Mar 18, 2024
a893035
use image uri instead
Mar 18, 2024
64d9afa
fix aml2 dlami logic
Mar 18, 2024
ef03578
revert toml file
Mar 19, 2024
feab20e
Merge branch 'master' into update-ami
sirutBuasai Mar 19, 2024
16 changes: 8 additions & 8 deletions dlc_developer_config.toml
@@ -34,11 +34,11 @@ deep_canary_mode = false
[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
build_frameworks = ["pytorch"]

# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
build_inference = true
build_inference = false

# Set do_build to "false" to skip builds and test the latest image built by this PR
# Note: at least one build is required to set do_build to "false"
@@ -67,20 +67,20 @@ ec2_benchmark_tests = false
### default. If false, these types of tests will be skipped while other tests will run as usual.
### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
### Off by default (set to false)
ec2_tests_on_heavy_instances = false
ec2_tests_on_heavy_instances = true

### SM specific tests
### Off by default
sagemaker_local_tests = false
sagemaker_local_tests = true

# run standard sagemaker remote tests from test/sagemaker_tests
sagemaker_remote_tests = false
sagemaker_remote_tests = true
# run efa sagemaker tests
sagemaker_efa_tests = false
sagemaker_efa_tests = true
# run release_candidate_integration tests
sagemaker_rc_tests = false
sagemaker_rc_tests = true
# run sagemaker benchmark tests
sagemaker_benchmark_tests = false
sagemaker_benchmark_tests = true

# SM remote EFA test instance type
sagemaker_remote_efa_instance_type = ""
2 changes: 1 addition & 1 deletion pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu
@@ -63,7 +63,7 @@ ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CUDNN_VERSION=8.9.2.26
ENV NCCL_VERSION=2.19.4
ENV EFA_VERSION=1.30.0
ENV GDRCOPY_VERSION=2.3.1
ENV GDRCOPY_VERSION=2.4.1

ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
26 changes: 10 additions & 16 deletions test/dlc_tests/conftest.py
@@ -31,11 +31,7 @@
is_nightly_context,
DEFAULT_REGION,
P3DN_REGION,
UBUNTU_20_BASE_DLAMI_US_EAST_1,
UBUNTU_20_BASE_DLAMI_US_WEST_2,
PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1,
AML2_BASE_DLAMI_US_WEST_2,
AML2_BASE_DLAMI_US_EAST_1,
KEYS_TO_DESTROY_FILE,
are_efa_tests_disabled,
get_repository_and_tag_from_image_uri,
@@ -330,18 +326,11 @@ def ec2_instance_role_name(request):


@pytest.fixture(scope="function")
def ec2_instance_ami(request, region):
def ec2_instance_ami(request, region, ec2_instance_type):
return (
request.param
if hasattr(request, "param")
else UBUNTU_20_BASE_DLAMI_US_EAST_1
if region == "us-east-1"
else UBUNTU_20_BASE_DLAMI_US_WEST_2
if region == "us-west-2"
else test_utils.get_ami_id_boto3(
region_name=region,
ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????",
)
else test_utils.get_instance_type_base_dlami(ec2_instance_type, region)
)


@@ -564,9 +553,14 @@ def ec2_instance(
)
if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1:
ec2_instance_ami = (
AML2_BASE_DLAMI_US_EAST_1
if ec2_instance_ami == AML2_BASE_DLAMI_US_WEST_2
else UBUNTU_20_BASE_DLAMI_US_EAST_1
test_utils.get_instance_type_base_dlami(
ec2_instance_type, "us-east-1", linux_dist="AML2"
)
if ec2_instance_ami
== test_utils.get_instance_type_base_dlami(
ec2_instance_type, "us-west-2", linux_dist="AML2"
)
else test_utils.get_instance_type_base_dlami(ec2_instance_type, "us-east-1")
)

ec2_key_name = f"{ec2_key_name}-{str(uuid.uuid4())}"
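Note for reviewers: the request.param escape hatch in the ec2_instance_ami fixture is unchanged, so a test that needs a specific AMI can still pin one. A minimal, hypothetical sketch (the test name is illustrative; the constant is one of the new module-level AMI IDs), modeled on the indirect-parametrize pattern that test_gdrcopy.py drops later in this diff:

import pytest
import test_utils  # imported the same way the existing EC2 test modules import it

@pytest.mark.parametrize(
    "ec2_instance_ami", [test_utils.UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2], indirect=True
)
def test_needs_pinned_ami(pytorch_training, ec2_connection):
    # With indirect=True the fixture returns request.param, bypassing
    # get_instance_type_base_dlami() entirely.
    ...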
2 changes: 1 addition & 1 deletion test/dlc_tests/container_tests/bin/efa/testEFASanity
@@ -24,4 +24,4 @@ lsmod | grep ib_uverbs
ibv_devinfo

# check if gdr device is loaded
grep -e '^1$' /sys/class/infiniband/**/device/gdr
cat /sys/class/infiniband/**/device/p2p | grep 'NVIDIA'
10 changes: 5 additions & 5 deletions test/dlc_tests/ec2/pytorch/training/common_cases.py
@@ -221,6 +221,10 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region):
), f"System CUDNN {system_cudnn} and torch cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with correct CUDNN verson."


def pytorch_curand_gpu(pytorch_training, ec2_connection):
execute_ec2_training_test(ec2_connection, pytorch_training, CURAND_CMD)


def pytorch_linear_regression_cpu(pytorch_training, ec2_connection):
execute_ec2_training_test(
ec2_connection, pytorch_training, PT_REGRESSION_CMD, container_name="pt_reg"
@@ -240,14 +244,10 @@ def pytorch_telemetry_cpu(pytorch_training, ec2_connection):
)


def curand_gpu(training, ec2_connection):
execute_ec2_training_test(ec2_connection, training, CURAND_CMD)


def pytorch_training_torchdata(pytorch_training, ec2_connection):
_, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
# HACK including PT 1.13 in this condition because the Torchdata 0.5.0 tag includes old tests data
if Version(image_framework_version) in SpecifierSet(">=1.11,<=1.13.1"):
execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_DEV_CMD)
else:
execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD)
execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHDATA_CMD)
@@ -34,6 +34,7 @@ def test_pytorch_2_2_gpu(
(common_cases.nvapex, (pytorch_training, ec2_connection)),
(common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
(common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
(common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
]

if "sagemaker" in pytorch_training:
3 changes: 0 additions & 3 deletions test/dlc_tests/ec2/test_gdrcopy.py
@@ -23,9 +23,6 @@
@pytest.mark.team("conda")
@pytest.mark.integration("gdrcopy")
@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION)
@pytest.mark.parametrize(
"ec2_instance_ami", [test_utils.UBUNTU_20_BASE_DLAMI_US_WEST_2], indirect=True
)
@pytest.mark.skipif(
is_pr_context() and not are_heavy_instance_ec2_tests_enabled(),
reason="Skip GDRCopy test in PR context unless explicitly enabled",
136 changes: 125 additions & 11 deletions test/test_utils/__init__.py
@@ -78,18 +78,38 @@ def get_ami_id_ssm(region_name, parameter_path):
return ami_id


# The Ubuntu 20.04 AMI which adds GDRCopy is used only for GDRCopy feature that is supported on PT1.13 and PT2.0
UBUNTU_20_BASE_DLAMI_US_WEST_2 = get_ami_id_boto3(
region_name="us-west-2", ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????"
# DLAMI Base is split between OSS Nvidia Driver and Proprietary Nvidia Driver. See https://docs.aws.amazon.com/dlami/latest/devguide/important-changes.html
UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2 = get_ami_id_boto3(
Reviewer comment (Contributor): looked at scope of removing these, and it will over-scope this PR. We can proceed with this for now

region_name="us-west-2",
ami_name_pattern="Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????",
)
UBUNTU_20_BASE_DLAMI_US_EAST_1 = get_ami_id_boto3(
region_name="us-east-1", ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????"
UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1 = get_ami_id_boto3(
region_name="us-east-1",
ami_name_pattern="Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????",
)
AML2_BASE_DLAMI_US_WEST_2 = get_ami_id_boto3(
region_name="us-west-2", ami_name_pattern="Deep Learning Base AMI (Amazon Linux 2) Version ??.?"
UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2 = get_ami_id_boto3(
region_name="us-west-2",
ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????",
)
AML2_BASE_DLAMI_US_EAST_1 = get_ami_id_boto3(
region_name="us-east-1", ami_name_pattern="Deep Learning Base AMI (Amazon Linux 2) Version ??.?"
UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1 = get_ami_id_boto3(
region_name="us-east-1",
ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????",
)
AML2_BASE_OSS_DLAMI_US_WEST_2 = get_ami_id_boto3(
region_name="us-west-2",
ami_name_pattern="Deep Learning Base OSS Nvidia Driver AMI (Amazon Linux 2) Version ??.?",
)
AML2_BASE_OSS_DLAMI_US_EAST_1 = get_ami_id_boto3(
region_name="us-east-1",
ami_name_pattern="Deep Learning Base OSS Nvidia Driver AMI (Amazon Linux 2) Version ??.?",
)
AML2_BASE_PROPRIETARY_DLAMI_US_WEST_2 = get_ami_id_boto3(
region_name="us-west-2",
ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver AMI (Amazon Linux 2) Version ??.?",
)
AML2_BASE_PROPRIETARY_DLAMI_US_EAST_1 = get_ami_id_boto3(
region_name="us-east-1",
ami_name_pattern="Deep Learning Base Proprietary Nvidia Driver AMI (Amazon Linux 2) Version ??.?",
)
# We use the following DLAMI for MXNet and TensorFlow tests as well, but this is ok since we use custom DLC Graviton containers on top. We just need an ARM base DLAMI.
UL20_CPU_ARM64_US_WEST_2 = get_ami_id_boto3(
@@ -145,8 +165,10 @@ def get_ami_id_ssm(region_name, parameter_path):
UBUNTU_18_HPU_DLAMI_US_WEST_2 = "ami-03cdcfc91a96a8f92"
UBUNTU_18_HPU_DLAMI_US_EAST_1 = "ami-0d83d7487f322545a"
UL_AMI_LIST = [
UBUNTU_20_BASE_DLAMI_US_WEST_2,
UBUNTU_20_BASE_DLAMI_US_EAST_1,
UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2,
UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1,
UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2,
UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1,
UBUNTU_18_HPU_DLAMI_US_WEST_2,
UBUNTU_18_HPU_DLAMI_US_EAST_1,
PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1,
@@ -2370,3 +2392,95 @@ def get_image_spec_from_buildspec(image_uri, dlc_folder_path):
raise ValueError(f"No corresponding entry found for {image_uri} in {buildspec_path}")

return matched_image_spec


def get_instance_type_base_dlami(instance_type, region, linux_dist="UBUNTU_20"):
"""
Get the base DLAMI for a given EC2 instance type and region; see https://docs.aws.amazon.com/dlami/latest/devguide/important-changes.html
OSS Nvidia Driver DLAMI supports the following: ["g4dn.xlarge",
"g4dn.2xlarge",
"g4dn.4xlarge",
"g4dn.8xlarge",
"g4dn.16xlarge",
"g4dn.12xlarge",
"g4dn.metal",
"g4dn.xlarge",
"g5.xlarge",
"g5.2xlarge",
"g5.4xlarge",
"g5.8xlarge",
"g5.16xlarge",
"g5.12xlarge",
"g5.24xlarge",
"g5.48xlarge",
"p4d.24xlarge",
"p4de.24xlarge",
"p5.48xlarge",]

Proprietary Nvidia Driver DLAMI supports the following: ["p3.2xlarge",
"p3.8xlarge",
"p3.16xlarge",
"p3dn.24xlarge",
"g3s.xlarge",
"g3.4xlarge",
"g3.8xlarge",
"g3.16xlarge",]

Other instance types default to the OSS Nvidia Driver DLAMI
"""

base_proprietary_dlami_instances = [
"p3.2xlarge",
"p3.8xlarge",
"p3.16xlarge",
"p3dn.24xlarge",
"g3s.xlarge",
"g3.4xlarge",
"g3.8xlarge",
"g3.16xlarge",
]

# set defaults
if linux_dist == "AML2":
oss_dlami_us_east_1 = AML2_BASE_OSS_DLAMI_US_EAST_1
oss_dlami_us_west_2 = AML2_BASE_OSS_DLAMI_US_WEST_2
oss_dlami_name_pattern = (
"Deep Learning Base OSS Nvidia Driver AMI (Amazon Linux 2) Version ??.?"
)

proprietary_dlami_us_east_1 = AML2_BASE_PROPRIETARY_DLAMI_US_EAST_1
proprietary_dlami_us_west_2 = AML2_BASE_PROPRIETARY_DLAMI_US_WEST_2
proprietary_dlami_name_pattern = (
"Deep Learning Base Proprietary Nvidia Driver AMI (Amazon Linux 2) Version ??.?"
)
else:
oss_dlami_us_east_1 = UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1
oss_dlami_us_west_2 = UBUNTU_20_BASE_OSS_DLAMI_US_WEST_2
oss_dlami_name_pattern = (
"Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) ????????"
)

proprietary_dlami_us_east_1 = UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_EAST_1
proprietary_dlami_us_west_2 = UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2
proprietary_dlami_name_pattern = (
"Deep Learning Base Proprietary Nvidia Driver GPU AMI (Ubuntu 20.04) ????????"
)

instance_ami = (
proprietary_dlami_us_east_1
if region == "us-east-1" and instance_type in base_proprietary_dlami_instances
else proprietary_dlami_us_west_2
if region == "us-west-2" and instance_type in base_proprietary_dlami_instances
else get_ami_id_boto3(
region_name=region,
ami_name_pattern=proprietary_dlami_name_pattern,
)
if instance_type in base_proprietary_dlami_instances
else oss_dlami_us_east_1
if region == "us-east-1"
else oss_dlami_us_west_2
if region == "us-west-2"
else get_ami_id_boto3(region_name=region, ami_name_pattern=oss_dlami_name_pattern)
)

return instance_ami
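For context, a rough usage sketch of the new helper (illustrative only: the instance types and regions below are examples, and the returned values are the module-level AMI constants resolved above via get_ami_id_boto3):

from test_utils import get_instance_type_base_dlami

# p3/g3 families are in the proprietary-driver list; everything else gets the OSS-driver DLAMI.
ami_p3 = get_instance_type_base_dlami("p3dn.24xlarge", "us-west-2")
# -> UBUNTU_20_BASE_PROPRIETARY_DLAMI_US_WEST_2
ami_g5 = get_instance_type_base_dlami("g5.12xlarge", "us-east-1")
# -> UBUNTU_20_BASE_OSS_DLAMI_US_EAST_1
ami_aml2 = get_instance_type_base_dlami("g4dn.xlarge", "us-west-2", linux_dist="AML2")
# -> AML2_BASE_OSS_DLAMI_US_WEST_2
# Any other region falls back to an on-demand get_ami_id_boto3 lookup by name pattern.
ami_other = get_instance_type_base_dlami("g5.xlarge", "eu-west-1")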
4 changes: 2 additions & 2 deletions test/test_utils/ec2.py
@@ -139,11 +139,11 @@ def get_efa_ec2_instance_type(default, filter_function=lambda x: x, job_type=""):
a list.
"""
instance_list = get_ec2_instance_type(default, "gpu", filter_function, job_type=job_type)
instance_list = [
instance_region_list = [
(instance_type, get_cicd_instance_reserved_region(instance_type))
for instance_type in instance_list
]
return instance_list
return instance_region_list


def get_ec2_instance_type(
15 changes: 5 additions & 10 deletions test/test_utils/sagemaker.py
@@ -25,15 +25,14 @@
SAGEMAKER_EXECUTION_REGIONS,
SAGEMAKER_NEURON_EXECUTION_REGIONS,
SAGEMAKER_NEURONX_EXECUTION_REGIONS,
UBUNTU_20_BASE_DLAMI_US_EAST_1,
UBUNTU_20_BASE_DLAMI_US_WEST_2,
UL20_CPU_ARM64_US_EAST_1,
UL20_CPU_ARM64_US_WEST_2,
SAGEMAKER_LOCAL_TEST_TYPE,
SAGEMAKER_REMOTE_TEST_TYPE,
UBUNTU_HOME_DIR,
DEFAULT_REGION,
is_nightly_context,
get_instance_type_base_dlami,
)
from test_utils.pytest_cache import PytestCache

@@ -85,7 +84,7 @@ def assign_sagemaker_local_job_instance_type(image):
return "p3.8xlarge" if "gpu" in image else "c5.18xlarge"


def assign_sagemaker_local_test_ami(image, region):
def assign_sagemaker_local_test_ami(image, region, instance_type):
"""
Helper function to get the needed AMI for launching the image.
Needed to support Graviton(ARM) images
@@ -96,13 +95,10 @@ def assign_sagemaker_local_test_ami(image, region):
else:
return UL20_CPU_ARM64_US_WEST_2
else:
if region == "us-east-1":
return UBUNTU_20_BASE_DLAMI_US_EAST_1
else:
return UBUNTU_20_BASE_DLAMI_US_WEST_2
return get_instance_type_base_dlami(instance_type, region)


def launch_sagemaker_local_ec2_instance(image, ami_id, ec2_key_name, region):
def launch_sagemaker_local_ec2_instance(image, ec2_key_name, region):
"""
Launch Ec2 instance for running sagemaker local tests
:param image: str
@@ -112,6 +108,7 @@ def launch_sagemaker_local_ec2_instance(image, ami_id, ec2_key_name, region):
:return: str, str
"""
instance_type = assign_sagemaker_local_job_instance_type(image)
ami_id = assign_sagemaker_local_test_ami(image, region, instance_type)
instance_name = image.split("/")[-1]
instance = ec2_utils.launch_instance(
ami_id,
@@ -298,7 +295,6 @@ def execute_local_tests(image, pytest_cache_params):
random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
ec2_key_name = f"{job_type}_{tag}_sagemaker_{random.randint(1, 1000)}"
region = os.getenv("AWS_REGION", DEFAULT_REGION)
ec2_ami_id = assign_sagemaker_local_test_ami(image, region)
sm_tests_tar_name = "sagemaker_tests.tar.gz"
ec2_test_report_path = os.path.join(UBUNTU_HOME_DIR, "test", f"{job_type}_{tag}_sm_local.xml")
instance_id = ""
@@ -308,7 +304,6 @@ def execute_local_tests(image, pytest_cache_params):
print(f"Launching new Instance for image: {image}")
instance_id, ip_address = launch_sagemaker_local_ec2_instance(
image,
ec2_ami_id,
ec2_key_name,
region,
)
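Net effect on the SageMaker local test flow, condensed from the hunks above (a sketch of the merged logic, not new behavior): callers no longer compute or pass an AMI, and launch_sagemaker_local_ec2_instance derives it from the job instance type.

# Condensed call chain after this change:
#   execute_local_tests(image, ...)                       # no longer computes ec2_ami_id
#     -> launch_sagemaker_local_ec2_instance(image, ec2_key_name, region)
#          instance_type = assign_sagemaker_local_job_instance_type(image)
#          ami_id = assign_sagemaker_local_test_ami(image, region, instance_type)
#            -> get_instance_type_base_dlami(instance_type, region) for non-Graviton images
#          ec2_utils.launch_instance(ami_id, ...)         # unchanged downstream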