aws · arjkesh · Mar 18, 2024 · Mar 18, 2024 · Mar 18, 2024
@@ -34,11 +34,11 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["pytorch"]
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true
-build_inference = true
+build_inference = false
 
 # Set do_build to "false" to skip builds and test the latest image built by this PR
 # Note: at least one build must already have completed for this PR before do_build can be set to "false"
@@ -57,9 +57,9 @@ notify_test_failures = false
 sanity_tests = true
   safety_check_test = false
   ecr_scan_allowlist_feature = false
-ecs_tests = true
-eks_tests = true
-ec2_tests = true
+ecs_tests = false
+eks_tests = false
+ec2_tests = false
 # Set to true if you are preparing a benchmark-related PR
 ec2_benchmark_tests = false
 
@@ -102,7 +102,7 @@ use_scheduler = false
 
 # Standard Framework Training
 dlc-pr-mxnet-training = ""
-dlc-pr-pytorch-training = ""
+dlc-pr-pytorch-training = "pytorch/training/buildspec-2-0-sm.yml"
 dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""
 

@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.0.1
 short_version: &SHORT_VERSION "2.0"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -46,6 +46,7 @@ images:
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                          *DEVICE_TYPE ]
     target: sagemaker
+    build_tag_override: "beta:2.0.1-gpu-py310-cu118-ubuntu20.04-sagemaker-benchmark-tested"
     context:
       <<: *TRAINING_CONTEXT
   # BuildSageMakerCPUPTTrainPy3DockerImage:
@@ -75,5 +76,6 @@ images:
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                          *DEVICE_TYPE ]
     target: sagemaker
+    build_tag_override: "beta:2.0.1-gpu-py310-cu121-ubuntu20.04-sagemaker-benchmark-tested"
     context:
       <<: *TRAINING_CONTEXT
@@ -428,7 +428,6 @@ def launch_efa_instances_with_retry(
     :param fn_name: string - function name for ease of logging
     :return: dict response from ec2_client.run_instances
     """
-    response = None
     region = ec2_client.meta.region_name
     reservations = get_available_reservations(
         ec2_client=ec2_client,
@@ -487,7 +486,7 @@ def launch_efa_instances_with_retry(
             if response and response["Instances"]:
                 break
         except ClientError as e:
-            LOGGER.debug(
+            LOGGER.info(
                 f"Failed to launch in {availability_zone} for {fn_name} due to {e}\n"
                 "Retrying in the next availability zone."
             )