diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 10c783d4eb22..570fd9b036bd 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -34,11 +34,11 @@ deep_canary_mode = false
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
 # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
+build_frameworks = ["pytorch"]
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true
-build_inference = true
+build_inference = false
 
 # Set do_build to "false" to skip builds and test the latest image built by this PR
 # Note: at least one build is required to set do_build to "false"
@@ -57,9 +57,9 @@ notify_test_failures = false
 sanity_tests = true
 safety_check_test = false
 ecr_scan_allowlist_feature = false
-ecs_tests = true
-eks_tests = true
-ec2_tests = true
+ecs_tests = false
+eks_tests = false
+ec2_tests = false
 
 # Set it to true if you are preparing a Benchmark related PR
 ec2_benchmark_tests = false
@@ -102,7 +102,7 @@ use_scheduler = false
 
 # Standard Framework Training
 dlc-pr-mxnet-training = ""
-dlc-pr-pytorch-training = ""
+dlc-pr-pytorch-training = "pytorch/training/buildspec-2-0-sm.yml"
 dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""
 
diff --git a/pytorch/training/buildspec-2-0-sm.yml b/pytorch/training/buildspec-2-0-sm.yml
index 98f001d6c689..05481f2f7e2d 100644
--- a/pytorch/training/buildspec-2-0-sm.yml
+++ b/pytorch/training/buildspec-2-0-sm.yml
@@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch
 version: &VERSION 2.0.1
 short_version: &SHORT_VERSION "2.0"
 arch_type: x86
-autopatch_build: "True"
+# autopatch_build: "True"
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -46,6 +46,7 @@ images:
 
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
+    build_tag_override: "beta:2.0.1-gpu-py310-cu118-ubuntu20.04-sagemaker-benchmark-tested"
     context:
       <<: *TRAINING_CONTEXT
  # BuildSageMakerCPUPTTrainPy3DockerImage:
@@ -75,5 +76,6 @@ images:
 
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: sagemaker
+   build_tag_override: "beta:2.0.1-gpu-py310-cu121-ubuntu20.04-sagemaker-benchmark-tested"
    context:
      <<: *TRAINING_CONTEXT
diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py
index bd013495d424..d086682e6f7c 100644
--- a/test/test_utils/ec2.py
+++ b/test/test_utils/ec2.py
@@ -428,7 +428,6 @@ def launch_efa_instances_with_retry(
     :param fn_name: string - function name for ease of logging
     :return: dict response from ec2_client.run_instances
     """
-    response = None
     region = ec2_client.meta.region_name
     reservations = get_available_reservations(
         ec2_client=ec2_client,
@@ -487,7 +486,7 @@ def launch_efa_instances_with_retry(
             if response and response["Instances"]:
                 break
         except ClientError as e:
-            LOGGER.debug(
+            LOGGER.info(
                 f"Failed to launch in {availability_zone} for {fn_name} due to {e}\n"
                 "Retrying in the next availability zone."
             )