Skip to content

Commit

Permalink
Merge branch 'master' into ap-handle-removal-pr
Browse files Browse the repository at this point in the history
  • Loading branch information
arjkesh authored Mar 13, 2024
2 parents 9fb7b6f + b17f44d commit dd2e186
Show file tree
Hide file tree
Showing 17 changed files with 313 additions and 310 deletions.
104 changes: 30 additions & 74 deletions available_images.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pytorch/training/buildspec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ images:
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: sagemaker
# build_tag_override: "beta:2.2.0-cpu-py310-ubuntu20.04-sagemaker"
context:
<<: *TRAINING_CONTEXT
BuildSageMakerGPUPTTrainPy3DockerImage:
Expand All @@ -60,5 +61,6 @@ images:
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
*DEVICE_TYPE ]
target: sagemaker
# build_tag_override: "beta:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker"
context:
<<: *TRAINING_CONTEXT
13 changes: 13 additions & 0 deletions release_images_inference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,16 @@ release_images:
example: False
disable_sm_tag: False
force_release: False
17:
framework: "pytorch"
version: "2.2.0"
arch_type: "x86"
customer_type: "ec2"
inference:
device_types: [ "cpu", "gpu" ]
python_versions: [ "py310" ]
os_version: "ubuntu20.04"
cuda_version: "cu118"
example: False
disable_sm_tag: False
force_release: False
13 changes: 13 additions & 0 deletions release_images_training.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,16 @@ release_images:
example: False
disable_sm_tag: False
force_release: False
5:
framework: "pytorch"
version: "2.2.0"
arch_type: "x86"
customer_type: "ec2"
training:
device_types: [ "cpu", "gpu" ]
python_versions: [ "py310" ]
os_version: "ubuntu20.04"
cuda_version: "cu121"
example: False
disable_sm_tag: False
force_release: False
9 changes: 3 additions & 6 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,13 @@ def is_autopatch_build_enabled_in_image_buildspec(buildspec_path=None):
image_buildspec_object = Buildspec()
image_buildspec_object.load(buildspec_path)
autopatch_build_flag = image_buildspec_object.get("autopatch_build", "False").lower() == "true"
print(f"Here: {buildspec_path}")
return autopatch_build_flag


def is_autopatch_build_enabled(buildspec_path=None):
return (
parse_dlc_developer_configs("build", "autopatch_build")
or os.getenv("AUTOPATCH")
or is_autopatch_build_enabled_in_image_buildspec(buildspec_path=buildspec_path)
)
return parse_dlc_developer_configs(
"build", "autopatch_build"
) or is_autopatch_build_enabled_in_image_buildspec(buildspec_path=buildspec_path)


def is_ec2_test_enabled():
Expand Down
1 change: 1 addition & 0 deletions src/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ def docker_build(self, fileobj=None, custom_context=False):
:return: int, Build Status
"""
response = [f"Starting the Build Process for {self.repository}:{self.tag}"]

for line in self.client.build(
fileobj=fileobj,
path=self.dockerfile,
Expand Down
36 changes: 29 additions & 7 deletions src/image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import os
import re
import json
import tempfile

from copy import deepcopy

Expand Down Expand Up @@ -140,10 +141,7 @@ def image_builder(buildspec, image_types=[], device_types=[]):
if is_nightly_build_context():
additional_image_tags.append(tag_image_with_date(image_tag))

if build_context != "PR":
image_tag = tag_image_with_datetime(image_tag)
# If build is not enabled, we don't care about the datetime tag
elif is_build_enabled():
if is_build_enabled() or build_context != "PR":
# Order appears to matter in datetime tagging, so tag with no datetime first, then
# set image_tag to have datetime
no_datetime = image_tag
Expand Down Expand Up @@ -211,10 +209,34 @@ def image_builder(buildspec, image_types=[], device_types=[]):
if inference_toolkit_version:
extra_build_args["SM_TOOLKIT_VERSION"] = inference_toolkit_version

tag_override = image_config.get("build_tag_override")
dockerfile = image_config["docker_file"]
target = image_config.get("target")
tag_override_regex = r"^(beta|pr):\S+$"
if tag_override and build_context == "PR":
if is_autopatch_build_enabled(buildspec_path=buildspec):
FORMATTER.print("AUTOPATCH ENABLED IN BUILDSPEC, CANNOT OVERRIDE WITH TAG, SORRY!")
elif not re.match(tag_override_regex, tag_override):
FORMATTER.print(
f"TAG OVERRIDE MUST BE OF FORMAT {tag_override_regex}, but got {tag_override}. Proceeding with regular build."
)
else:
repo_override, t_override = tag_override.split(":")
with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file_handle:
source_uri = (
f"{image_repo_uri.replace('pr-', f'{repo_override}-')}:{t_override}"
)
temp_file_handle.write(
f"FROM {source_uri}\nLABEL dlc.dev.source_uri={source_uri}"
)
dockerfile = temp_file_handle.name
target = None
FORMATTER.print(f"USING TAG OVERRIDE {source_uri}")

ARTIFACTS.update(
{
"dockerfile": {
"source": image_config["docker_file"],
"source": dockerfile,
"target": "Dockerfile",
}
}
Expand Down Expand Up @@ -308,14 +330,14 @@ def image_builder(buildspec, image_types=[], device_types=[]):
# Create pre_push stage docker object
pre_push_stage_image_object = DockerImage(
info=info,
dockerfile=image_config["docker_file"],
dockerfile=dockerfile,
repository=image_repo_uri,
tag=append_tag(image_tag, "pre-push"),
to_build=image_config["build"],
stage=constants.PRE_PUSH_STAGE,
context=context,
additional_tags=additional_image_tags,
target=image_config.get("target"),
target=target,
)

##### Create Common stage docker object #####
Expand Down
48 changes: 34 additions & 14 deletions test/dlc_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,11 @@ def delete_ssh_keypair():
],
}
response = ec2_utils.launch_efa_instances_with_retry(
ec2_client, ec2_instance_type, availability_zone_options, ec2_run_instances_definition
ec2_client,
ec2_instance_type,
availability_zone_options,
ec2_run_instances_definition,
fn_name=request.node.name,
)

instances = response["Instances"]
Expand Down Expand Up @@ -693,6 +697,8 @@ def delete_ssh_keypair():
ec2_resource=ec2_resource,
availability_zone_options=availability_zone_options,
ec2_create_instances_definition=params,
ec2_client=ec2_client,
fn_name=request.node.name,
)
instance_id = instances[0].id

Expand Down Expand Up @@ -956,23 +962,37 @@ def skip_efa_tests(request):

@pytest.fixture(autouse=True)
def skip_p5_tests(request, ec2_instance_type):
allowed_p5_fixtures = (
"gpu",
"image",
"training",
"pytorch_training",
r"pytorch_training___\S+",
)
image_uri = None

if "p5." in ec2_instance_type:
if "gpu" in request.fixturenames:
img_uri = request.getfixturevalue("gpu")
elif "image" in request.fixturenames:
img_uri = request.getfixturevalue("image")
elif "training" in request.fixturenames:
img_uri = request.getfixturevalue("training")
elif "pytorch_training" in request.fixturenames:
img_uri = request.getfixturevalue("pytorch_training")
else:
pytest.skip("Current image doesn't support P5 EC2 instance.")
p5_fixture_stack = list(allowed_p5_fixtures)
while p5_fixture_stack and not image_uri:
fixture_name = p5_fixture_stack.pop()
if fixture_name in request.fixturenames:
image_uri = request.getfixturevalue(fixture_name)
# Handle fixture names that include tag as regex
elif "___" in fixture_name:
regex = re.compile(fixture_name)
matches = list(filter(regex.match, request.fixturenames))
image_uri = request.getfixturevalue(matches[0]) if matches else None

if not image_uri:
pytest.skip(
f"Current image doesn't support P5 EC2 instance. Must be of fixture name {allowed_p5_fixtures}"
)

framework, image_framework_version = get_framework_and_version_from_tag(img_uri)
framework, image_framework_version = get_framework_and_version_from_tag(image_uri)
if "pytorch" not in framework:
pytest.skip("Current image doesn't support P5 EC2 instance.")
image_processor = get_processor_from_image_uri(img_uri)
image_cuda_version = get_cuda_version_from_tag(img_uri)
image_processor = get_processor_from_image_uri(image_uri)
image_cuda_version = get_cuda_version_from_tag(image_uri)
if image_processor != "gpu" or Version(image_cuda_version.strip("cu")) < Version("120"):
pytest.skip("Images using less than CUDA 12.0 doesn't support P5 EC2 instance.")

Expand Down
40 changes: 24 additions & 16 deletions test/dlc_tests/ecs/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from test import test_utils
import test.test_utils.ecs as ecs_utils
import test.test_utils.ec2 as ec2_utils


@pytest.fixture(scope="function")
Expand Down Expand Up @@ -107,7 +108,6 @@ def ecs_container_instance(
ecs_instance_type,
ecs_ami,
region,
ei_accelerator_type,
use_large_storage,
):
"""
Expand Down Expand Up @@ -148,22 +148,30 @@ def ecs_container_instance(
{"DeviceName": "/dev/xvda", "Ebs": {"VolumeSize": 90, "VolumeType": "gp2"}}
]

if ei_accelerator_type:
params["ElasticInferenceAccelerators"] = [{"Type": ei_accelerator_type, "Count": 1}]
availability_zones = {
"us-west-2": ["us-west-2a", "us-west-2b", "us-west-2c"],
"us-east-1": ["us-east-1a", "us-east-1b", "us-east-1c"],
reservations = ec2_utils.get_available_reservations(
ec2_client=ec2_client, instance_type=instance_type, min_availability=params["MinCount"]
)
fn_name = request.node.name
instances = None
while reservations:
reservation = reservations.pop(0)
params["CapacityReservationSpecification"] = {
"CapacityReservationTarget": {
"CapacityReservationId": reservation["CapacityReservationId"]
}
}
for a_zone in availability_zones[region]:
params["Placement"] = {"AvailabilityZone": a_zone}
try:
instances = ec2_client.run_instances(**params)
if instances:
break
except ClientError as e:
print(f"Failed to launch in {a_zone} with Error: {e}")
continue
else:
try:
instances = ec2_client.run_instances(**params)
test_utils.LOGGER.info(
f"Your reservation is ready for {fn_name}, please wait to be seated. Launching..."
)
if test_utils.is_mainline_context():
test_utils.LOGGER.info(f"Launched instance for {fn_name} via {reservation}")
except ClientError as e:
test_utils.LOGGER.error(f"Failed to launch via reservation for {fn_name} - {e}")

if not instances:
params.pop("CapacityReservationSpecification", None)
instances = ec2_client.run_instances(**params)
instance_id = instances.get("Instances")[0].get("InstanceId")

Expand Down
37 changes: 0 additions & 37 deletions test/dlc_tests/ecs/mxnet/inference/test_ecs_mxnet_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,43 +48,6 @@ def __test_ecs_mxnet_inference_cpu(mxnet_inference, ecs_container_instance, regi
)


@pytest.mark.integration("elastic_inference")
@pytest.mark.model("squeezenet")
@pytest.mark.parametrize("ecs_instance_type", ["c5.4xlarge"], indirect=True)
@pytest.mark.parametrize("ecs_ami", [ECS_AML2_CPU_USWEST2], indirect=True)
@pytest.mark.parametrize("ei_accelerator_type", ["eia1.large"], indirect=True)
def test_ecs_mxnet_inference_eia(
    mxnet_inference_eia, ecs_container_instance, ei_accelerator_type, region
):
    """
    Run an MXNet inference request against a model served on an ECS container
    instance with an Elastic Inference accelerator attached.

    Deploys an ECS inference service for the ``mxnet_inference_eia`` image on
    the instance provided by the ``ecs_container_instance`` fixture, sends an
    inference request to the instance's public IP, and asserts a result came
    back. The service is torn down in ``finally`` regardless of outcome.

    :param mxnet_inference_eia: image URI fixture for the MXNet EIA inference image
    :param ecs_container_instance: fixture yielding (worker instance id, ECS cluster ARN)
    :param ei_accelerator_type: EI accelerator type, parametrized to "eia1.large"
    :param region: AWS region fixture
    """
    worker_instance_id, ecs_cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(worker_instance_id, region=region)

    # Default model; framework version 1.5.1 ships a differently named artifact.
    # NOTE(review): the model marker says "squeezenet" but the served model is
    # resnet-152 — presumably an outdated marker; confirm against test intent.
    model_name = "resnet-152-eia"
    image_framework, image_framework_version = get_framework_and_version_from_tag(
        mxnet_inference_eia
    )
    if image_framework_version == "1.5.1":
        model_name = "resnet-152-eia-1-5-1"
    # Pre-initialize so the finally-block teardown is safe even if setup raises
    # before the service is created.
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            mxnet_inference_eia,
            "mxnet",
            ecs_cluster_arn,
            model_name,
            worker_instance_id,
            ei_accelerator_type,
            region=region,
        )
        inference_result = request_mxnet_inference(public_ip_address, model=model_name)
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"

    finally:
        # Always tear the service down; helper is expected to tolerate None
        # identifiers when setup failed early — TODO confirm.
        ecs_utils.tear_down_ecs_inference_service(
            ecs_cluster_arn, service_name, task_family, revision
        )


@pytest.mark.model("mxnet-resnet-neuron")
@pytest.mark.parametrize("ecs_instance_type", ["inf1.2xlarge"], indirect=True)
@pytest.mark.parametrize("ecs_ami", [ECS_AML2_NEURON_USWEST2], indirect=True)
Expand Down
40 changes: 0 additions & 40 deletions test/dlc_tests/ecs/pytorch/inference/test_ecs_pytorch_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,46 +59,6 @@ def __ecs_pytorch_inference_cpu(pytorch_inference, ecs_container_instance, regio
)


@pytest.mark.integration("elastic_inference")
@pytest.mark.model("densenet")
@pytest.mark.parametrize("ecs_instance_type", ["c5.4xlarge"], indirect=True)
@pytest.mark.parametrize("ecs_ami", [ECS_AML2_CPU_USWEST2], indirect=True)
@pytest.mark.parametrize("ei_accelerator_type", ["eia1.large"], indirect=True)
def test_ecs_pytorch_inference_eia(
    pytorch_inference_eia, ecs_container_instance, ei_accelerator_type, region, pt14_and_above_only
):
    """
    Run a PyTorch densenet inference request against a model served on an ECS
    container instance with an Elastic Inference accelerator attached.

    Deploys an ECS inference service for the ``pytorch_inference_eia`` image on
    the instance provided by the ``ecs_container_instance`` fixture, sends a
    densenet inference request to the instance's public IP, and asserts a
    result came back. The service is torn down in ``finally`` regardless of
    outcome.

    :param pytorch_inference_eia: image URI fixture for the PyTorch EIA inference image
    :param ecs_container_instance: fixture yielding (worker instance id, ECS cluster ARN)
    :param ei_accelerator_type: EI accelerator type, parametrized to "eia1.large"
    :param region: AWS region fixture
    :param pt14_and_above_only: gating fixture — presumably skips the test for
        PyTorch versions below 1.4; confirm against conftest.
    """
    worker_instance_id, ecs_cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(worker_instance_id, region=region)

    # Default model; framework version 1.3.1 ships a differently named artifact.
    model_name = "pytorch-densenet"
    image_framework, image_framework_version = get_framework_and_version_from_tag(
        pytorch_inference_eia
    )
    if image_framework_version == "1.3.1":
        model_name = "pytorch-densenet-v1-3-1"
    # Pre-initialize so the finally-block teardown is safe even if setup raises
    # before the service is created.
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            pytorch_inference_eia,
            "pytorch",
            ecs_cluster_arn,
            model_name,
            worker_instance_id,
            ei_accelerator_type,
            region=region,
        )
        # Server type (e.g. TorchServe vs MMS) is derived from the image tag and
        # changes the request path used by the densenet helper.
        server_type = get_inference_server_type(pytorch_inference_eia)
        inference_result = request_pytorch_inference_densenet(
            public_ip_address, model_name=model_name, server_type=server_type
        )
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"

    finally:
        # Always tear the service down; helper is expected to tolerate None
        # identifiers when setup failed early — TODO confirm.
        ecs_utils.tear_down_ecs_inference_service(
            ecs_cluster_arn, service_name, task_family, revision
        )


@pytest.mark.model("resnet")
@pytest.mark.parametrize("ecs_instance_type", ["inf1.2xlarge"], indirect=True)
@pytest.mark.parametrize("ecs_ami", [ECS_AML2_NEURON_USWEST2], indirect=True)
Expand Down
Loading

0 comments on commit dd2e186

Please sign in to comment.