From 64ff5a5d67468362b7704370e1f03c042050a2a7 Mon Sep 17 00:00:00 2001 From: arjkesh <33526713+arjkesh@users.noreply.github.com> Date: Mon, 18 Mar 2024 12:04:31 -0700 Subject: [PATCH 1/3] test PT 2.0 images --- dlc_developer_config.toml | 12 ++++++------ pytorch/training/buildspec-2-0-sm.yml | 4 +++- test/test_utils/ec2.py | 3 +-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 10c783d4eb22..a5966f063cdd 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -34,11 +34,11 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -57,9 +57,9 @@ notify_test_failures = false sanity_tests = true safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = true -eks_tests = true -ec2_tests = true +ecs_tests = false +eks_tests = false +ec2_tests = false # Set it to true if you are preparing a Benchmark related PR ec2_benchmark_tests = false @@ -102,7 +102,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-mxnet-training = "" -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-0.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec-2-0-sm.yml b/pytorch/training/buildspec-2-0-sm.yml index 98f001d6c689..65d7a1ce664e 100644 --- a/pytorch/training/buildspec-2-0-sm.yml +++ b/pytorch/training/buildspec-2-0-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK pytorch version: &VERSION 2.0.1 short_version: &SHORT_VERSION "2.0" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY @@ -47,6 +47,7 @@ images: *DEVICE_TYPE ] target: sagemaker context: + build_tag_override: "beta:2.0.1-gpu-py310-cu118-ubuntu20.04-sagemaker-benchmark-tested" <<: *TRAINING_CONTEXT # BuildSageMakerCPUPTTrainPy3DockerImage: # <<: *TRAINING_REPOSITORY @@ -75,5 +76,6 @@ images: docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker + build_tag_override: "beta:2.0.1-gpu-py310-cu121-ubuntu20.04-sagemaker-benchmark-tested" context: <<: *TRAINING_CONTEXT diff --git a/test/test_utils/ec2.py b/test/test_utils/ec2.py index bd013495d424..d086682e6f7c 100644 --- a/test/test_utils/ec2.py +++ b/test/test_utils/ec2.py @@ -428,7 +428,6 @@ def launch_efa_instances_with_retry( :param fn_name: string - function name for ease of logging :return: dict response from ec2_client.run_instances """ - response = None region = ec2_client.meta.region_name reservations = get_available_reservations( ec2_client=ec2_client, @@ -487,7 +486,7 @@ def launch_efa_instances_with_retry( if response and response["Instances"]: break except ClientError as e: - LOGGER.debug( + LOGGER.info( f"Failed to launch in {availability_zone} for {fn_name} due to {e}\n" "Retrying in the next availability zone." ) From 20d9c027fb3eb78baf3e14ed89162af4ac79c2b0 Mon Sep 17 00:00:00 2001 From: arjkesh <33526713+arjkesh@users.noreply.github.com> Date: Mon, 18 Mar 2024 12:05:31 -0700 Subject: [PATCH 2/3] Update dlc_developer_config.toml --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index a5966f063cdd..570fd9b036bd 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -102,7 +102,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-mxnet-training = "" -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-0.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-0-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" From 1ce8105faf32dfc76727e7041b741b0c92221ff6 Mon Sep 17 00:00:00 2001 From: arjkesh <33526713+arjkesh@users.noreply.github.com> Date: Mon, 18 Mar 2024 12:09:21 -0700 Subject: [PATCH 3/3] Update buildspec-2-0-sm.yml --- pytorch/training/buildspec-2-0-sm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/training/buildspec-2-0-sm.yml b/pytorch/training/buildspec-2-0-sm.yml index 65d7a1ce664e..05481f2f7e2d 100644 --- a/pytorch/training/buildspec-2-0-sm.yml +++ b/pytorch/training/buildspec-2-0-sm.yml @@ -46,8 +46,8 @@ images: docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker - context: build_tag_override: "beta:2.0.1-gpu-py310-cu118-ubuntu20.04-sagemaker-benchmark-tested" + context: <<: *TRAINING_CONTEXT # BuildSageMakerCPUPTTrainPy3DockerImage: # <<: *TRAINING_REPOSITORY