Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update DLAMI BASE AMI Logic to switch between OSS and Proprietary Nvidia Driver AMI #3760

Merged
merged 51 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
840aef6
Update DLAMI BASE AMI Logic to switch between OSS and Proprietary Nvi…
Mar 8, 2024
95f6e46
update gdrcopy to 2.4
Mar 8, 2024
2e4b09b
formatting
Mar 8, 2024
c28d32b
disable build and fix sm local test instance ami
Mar 11, 2024
61212b4
use proprietary driver dlami as default
Mar 11, 2024
a0f8f82
fix ul20 and aml2 dlami name logic and test only ec2
Mar 11, 2024
ce3d3da
allow test efa
Mar 11, 2024
00fba94
update oss dlami list
Mar 11, 2024
434fbdc
test curand
Mar 11, 2024
115c33c
ensure ec2 instance type fixture is run before ec2 instance ami
Mar 12, 2024
092b14b
alter ami pulling logic
Mar 12, 2024
b75b415
usefixtures
Mar 12, 2024
9dc8fec
use parametrize
Mar 12, 2024
95ddb86
use instance ami in parametrize
Mar 12, 2024
1d9347f
add instance ami as parametrize
Mar 12, 2024
a78962d
Merge branch 'master' into update-ami
sirutBuasai Mar 12, 2024
0a9504d
fix curand test
Mar 13, 2024
c66555e
correct ami name
Mar 13, 2024
e5716bc
correct ami format
Mar 13, 2024
66ce9fc
use proprietary dlami for curand
Mar 13, 2024
68273a4
rebuild
Mar 14, 2024
c70f0e9
logging debug
Mar 14, 2024
75f8e86
remove parametrize ami
Mar 14, 2024
5a99d36
flip logic
Mar 14, 2024
9040c77
formatting
Mar 14, 2024
2f16a5b
print instance ami
Mar 14, 2024
3b15a71
fix typo
Mar 14, 2024
b83eed1
remove parametrize logic and fix proprietary dlami name pattern
Mar 14, 2024
9f3a24d
Merge branch 'master' into update-ami
sirutBuasai Mar 14, 2024
3a78b32
revert gdr copy
Mar 14, 2024
f8af0b0
update test with gdrcopy 2.4
Mar 14, 2024
e32684a
build test pt ec2
Mar 15, 2024
c8b1ce0
build test pt sm
Mar 15, 2024
63d8a31
remove gdrcopy ami
Mar 15, 2024
e40298b
sanity and sm local testonly
Mar 15, 2024
9efd8da
build test pt sm
Mar 15, 2024
f8538bf
Merge branch 'master' into update-ami
sirutBuasai Mar 15, 2024
f9633d6
formatting
Mar 15, 2024
2682dac
test pt sm
Mar 16, 2024
b561099
build test pt sm
Mar 16, 2024
2b52804
disable build
Mar 16, 2024
dd1e2b2
build test pt sm
Mar 16, 2024
e5fe485
use get-login-password
Mar 18, 2024
ac78c1f
remove () from get-login
Mar 18, 2024
4013f89
test tensorflow
Mar 18, 2024
9f74eeb
use login_to_ecr_registry function
Mar 18, 2024
185d4a5
use dict for base dlami logic
Mar 18, 2024
a893035
use image uri instead
Mar 18, 2024
64d9afa
fix aml2 dlami logic
Mar 18, 2024
ef03578
revert toml file
Mar 19, 2024
feab20e
Merge branch 'master' into update-ami
sirutBuasai Mar 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pytorch/training/docker/2.2/py3/cu121/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CUDNN_VERSION=8.9.2.26
ENV NCCL_VERSION=2.19.4
ENV EFA_VERSION=1.30.0
ENV GDRCOPY_VERSION=2.3.1
ENV GDRCOPY_VERSION=2.4.1

ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import time
import pytest

from src.benchmark_metrics import (
PYTORCH_INFERENCE_GPU_THRESHOLD,
PYTORCH_INFERENCE_CPU_THRESHOLD,
Expand All @@ -10,6 +11,8 @@
CONTAINER_TESTS_PREFIX,
get_framework_and_version_from_tag,
UL20_CPU_ARM64_US_WEST_2,
login_to_ecr_registry,
get_account_id_from_image_uri,
LOGGER,
)
from test.test_utils.ec2 import (
Expand Down Expand Up @@ -85,7 +88,8 @@ def ec2_performance_pytorch_inference(
repo_name, image_tag = image_uri.split("/")[-1].split(":")

# Make sure we are logged into ECR so we can pull the image
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = get_account_id_from_image_uri(image_uri)
login_to_ecr_registry(ec2_connection, account_id, region)

ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
DEFAULT_REGION,
get_framework_and_version_from_tag,
is_pr_context,
login_to_ecr_registry,
get_account_id_from_image_uri,
)
from test.test_utils.ec2 import (
execute_ec2_training_performance_test,
Expand Down Expand Up @@ -142,7 +144,8 @@ def execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test(
container_name = f"{repo_name}-performance-{image_tag}-ec2"

# Make sure we are logged into ECR so we can pull the image
connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = get_account_id_from_image_uri(ecr_uri)
login_to_ecr_registry(connection, account_id, region)
# Do not add -q to docker pull as it leads to a hang for huge images like trcomp
connection.run(f"nvidia-docker pull {ecr_uri}")
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
is_pr_context,
is_tf_version,
UL20_BENCHMARK_CPU_ARM64_US_WEST_2,
login_to_ecr_registry,
get_account_id_from_image_uri,
)
from test.test_utils.ec2 import (
ec2_performance_upload_result_to_s3_and_validate,
Expand Down Expand Up @@ -83,7 +85,8 @@ def ec2_performance_tensorflow_inference(
num_iterations = 500 if is_pr_context() or is_graviton else 1000

# Make sure we are logged into ECR so we can pull the image
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = get_account_id_from_image_uri(image_uri)
login_to_ecr_registry(ec2_connection, account_id, region)
ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ")
if is_graviton:
# TF training binary is used that is compatible for graviton instance type
Expand Down
29 changes: 13 additions & 16 deletions test/dlc_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,7 @@
is_nightly_context,
DEFAULT_REGION,
P3DN_REGION,
UBUNTU_20_BASE_DLAMI_US_EAST_1,
UBUNTU_20_BASE_DLAMI_US_WEST_2,
PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1,
AML2_BASE_DLAMI_US_WEST_2,
AML2_BASE_DLAMI_US_EAST_1,
KEYS_TO_DESTROY_FILE,
are_efa_tests_disabled,
get_repository_and_tag_from_image_uri,
Expand Down Expand Up @@ -330,18 +326,11 @@ def ec2_instance_role_name(request):


@pytest.fixture(scope="function")
def ec2_instance_ami(request, region):
def ec2_instance_ami(request, region, ec2_instance_type):
return (
request.param
if hasattr(request, "param")
else UBUNTU_20_BASE_DLAMI_US_EAST_1
if region == "us-east-1"
else UBUNTU_20_BASE_DLAMI_US_WEST_2
if region == "us-west-2"
else test_utils.get_ami_id_boto3(
region_name=region,
ami_name_pattern="Deep Learning Base GPU AMI (Ubuntu 20.04) ????????",
)
else test_utils.get_instance_type_base_dlami(ec2_instance_type, region)
)


Expand Down Expand Up @@ -555,6 +544,8 @@ def ec2_instance(
):
_validate_p4de_usage(request, ec2_instance_type)
if ec2_instance_type == "p3dn.24xlarge":
# Keep track of initial region to get information about previous AMI
initial_region = region
region = P3DN_REGION
ec2_client = boto3.client(
"ec2", region_name=region, config=Config(retries={"max_attempts": 10})
Expand All @@ -563,10 +554,16 @@ def ec2_instance(
"ec2", region_name=region, config=Config(retries={"max_attempts": 10})
)
if ec2_instance_ami != PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1:
# Assign as AML2 if initial AMI is AML2, else use default
ec2_instance_ami = (
AML2_BASE_DLAMI_US_EAST_1
if ec2_instance_ami == AML2_BASE_DLAMI_US_WEST_2
else UBUNTU_20_BASE_DLAMI_US_EAST_1
test_utils.get_instance_type_base_dlami(
sirutBuasai marked this conversation as resolved.
Show resolved Hide resolved
ec2_instance_type, region, linux_dist="AML2"
)
if ec2_instance_ami
== test_utils.get_instance_type_base_dlami(
ec2_instance_type, initial_region, linux_dist="AML2"
)
else test_utils.get_instance_type_base_dlami(ec2_instance_type, region)
)

ec2_key_name = f"{ec2_key_name}-{str(uuid.uuid4())}"
Expand Down
2 changes: 1 addition & 1 deletion test/dlc_tests/container_tests/bin/efa/testEFASanity
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ lsmod | grep ib_uverbs
ibv_devinfo

# check if gdr device is loaded
grep -e '^1$' /sys/class/infiniband/**/device/gdr
cat /sys/class/infiniband/**/device/p2p | grep 'NVIDIA'
10 changes: 8 additions & 2 deletions test/dlc_tests/ec2/mxnet/inference/test_mxnet_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
import test.test_utils.ec2 as ec2_utils

from test import test_utils
from test.test_utils import CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag
from test.test_utils import (
CONTAINER_TESTS_PREFIX,
get_framework_and_version_from_tag,
login_to_ecr_registry,
get_account_id_from_image_uri,
)
from test.test_utils.ec2 import (
get_ec2_instance_type,
execute_ec2_inference_test,
Expand Down Expand Up @@ -188,7 +193,8 @@ def run_ec2_mxnet_inference(
f" {image_uri} {mms_inference_cmd}"
)
try:
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = get_account_id_from_image_uri(image_uri)
login_to_ecr_registry(ec2_connection, account_id, region)
LOGGER.info(docker_run_cmd)
ec2_connection.run(docker_run_cmd, hide=True)
if model_name == SQUEEZENET_MODEL:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import os
import sys
import time
import logging
from datetime import date, timedelta, datetime

import pytest
from packaging.version import Version
Expand All @@ -14,15 +12,15 @@
CONTAINER_TESTS_PREFIX,
get_framework_and_version_from_tag,
get_inference_server_type,
get_cuda_version_from_tag,
login_to_ecr_registry,
get_account_id_from_image_uri,
)
from test.test_utils.ec2 import (
get_ec2_instance_type,
execute_ec2_inference_test,
get_ec2_accelerator_type,
)
from test.dlc_tests.conftest import LOGGER
import boto3

LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.StreamHandler(sys.stdout))
Expand Down Expand Up @@ -238,7 +236,8 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region):
f" {image_uri} {inference_cmd}"
)
try:
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = get_account_id_from_image_uri(image_uri)
login_to_ecr_registry(ec2_connection, account_id, region)
LOGGER.info(docker_run_cmd)
ec2_connection.run(docker_run_cmd, hide=True)
server_type = get_inference_server_type(image_uri)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@

from test import test_utils
from test.test_utils import (
CONTAINER_TESTS_PREFIX,
get_framework_and_version_from_tag,
get_inference_server_type,
UL20_CPU_ARM64_US_WEST_2,
login_to_ecr_registry,
get_account_id_from_image_uri,
)
from test.test_utils.ec2 import (
get_ec2_instance_type,
Expand Down Expand Up @@ -52,7 +53,7 @@ def test_ec2_pytorch_inference_cpu_compilation(pytorch_inference, ec2_connection
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GRAVITON_INSTANCE_TYPES, indirect=True)
@pytest.mark.parametrize("ec2_instance_ami", [UL20_CPU_ARM64_US_WEST_2], indirect=True)
@pytest.mark.team("training-compiler")
def test_ec2_pytorch_inference_cpu_compilation(
def test_ec2_pytorch_inference_graviton_compilation(
pytorch_inference_graviton, ec2_connection, region, cpu_only
):
_, image_framework_version = get_framework_and_version_from_tag(pytorch_inference_graviton)
Expand All @@ -77,7 +78,8 @@ def ec2_pytorch_inference(image_uri, processor, ec2_connection, region):
f" {image_uri} {inference_cmd}"
)
try:
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = get_account_id_from_image_uri(image_uri)
login_to_ecr_registry(ec2_connection, account_id, region)
LOGGER.info(docker_run_cmd)
ec2_connection.run(docker_run_cmd, hide=True)
server_type = get_inference_server_type(image_uri)
Expand Down
13 changes: 8 additions & 5 deletions test/dlc_tests/ec2/pytorch/training/common_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
CONTAINER_TESTS_PREFIX,
get_framework_and_version_from_tag,
get_cuda_version_from_tag,
login_to_ecr_registry,
get_account_id_from_image_uri,
)
from test.test_utils.ec2 import (
execute_ec2_training_test,
Expand Down Expand Up @@ -189,7 +191,8 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region):
PT 2.1 reintroduces a dependency on CUDNN to support NVDA TransformerEngine. This test is to ensure that torch CUDNN matches system CUDNN in the container.
"""
container_name = "pt_cudnn_test"
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = get_account_id_from_image_uri(pytorch_training)
login_to_ecr_registry(ec2_connection, account_id, region)
ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True)
ec2_connection.run(
f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True
Expand Down Expand Up @@ -221,6 +224,10 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region):
), f"System CUDNN {system_cudnn} and torch cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with correct CUDNN verson."


def pytorch_curand_gpu(pytorch_training, ec2_connection):
execute_ec2_training_test(ec2_connection, pytorch_training, CURAND_CMD)


def pytorch_linear_regression_cpu(pytorch_training, ec2_connection):
execute_ec2_training_test(
ec2_connection, pytorch_training, PT_REGRESSION_CMD, container_name="pt_reg"
Expand All @@ -240,10 +247,6 @@ def pytorch_telemetry_cpu(pytorch_training, ec2_connection):
)


def curand_gpu(training, ec2_connection):
execute_ec2_training_test(ec2_connection, training, CURAND_CMD)


def pytorch_training_torchdata(pytorch_training, ec2_connection):
_, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
# HACK including PT 1.13 in this condition because the Torchdata 0.5.0 tag includes old tests data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
UBUNTU_18_HPU_DLAMI_US_WEST_2,
get_framework_and_version_from_tag,
get_cuda_version_from_tag,
login_to_ecr_registry,
get_account_id_from_image_uri,
)
from test.test_utils.ec2 import (
execute_ec2_training_test,
Expand Down Expand Up @@ -747,7 +749,8 @@ def test_pytorch_cudnn_match_gpu(
PT 2.1 reintroduces a dependency on CUDNN to support NVDA TransformerEngine. This test is to ensure that torch CUDNN matches system CUDNN in the container.
"""
container_name = "pt_cudnn_test"
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = get_account_id_from_image_uri(pytorch_training)
login_to_ecr_registry(ec2_connection, account_id, region)
ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True)
ec2_connection.run(
f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def test_pytorch_2_2_gpu(
(common_cases.nvapex, (pytorch_training, ec2_connection)),
(common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
(common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
(common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
sirutBuasai marked this conversation as resolved.
Show resolved Hide resolved
]

if "sagemaker" in pytorch_training:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import re
import json
from time import sleep
import pytest

Expand Down Expand Up @@ -110,7 +109,8 @@ def test_ec2_tensorflow_inference_gpu_tensorrt(
)

try:
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = test_utils.get_account_id_from_image_uri(tensorflow_inference)
test_utils.login_to_ecr_registry(ec2_connection, account_id, region)
host_setup_for_tensorflow_inference(serving_folder_path, framework_version, ec2_connection)
sleep(2)

Expand Down Expand Up @@ -268,7 +268,8 @@ def run_ec2_tensorflow_inference(
if not is_neuron:
train_mnist_model(serving_folder_path, ec2_connection)
sleep(10)
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = test_utils.get_account_id_from_image_uri(image_uri)
test_utils.login_to_ecr_registry(ec2_connection, account_id, region)
ec2_connection.run(docker_run_cmd, hide=True)
sleep(20)
if is_neuron and str(framework_version).startswith(TENSORFLOW2_VERSION):
Expand Down
3 changes: 0 additions & 3 deletions test/dlc_tests/ec2/test_gdrcopy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@
@pytest.mark.team("conda")
@pytest.mark.integration("gdrcopy")
@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION)
@pytest.mark.parametrize(
"ec2_instance_ami", [test_utils.UBUNTU_20_BASE_DLAMI_US_WEST_2], indirect=True
)
@pytest.mark.skipif(
is_pr_context() and not are_heavy_instance_ec2_tests_enabled(),
reason="Skip GDRCopy test in PR context unless explicitly enabled",
Expand Down
9 changes: 6 additions & 3 deletions test/dlc_tests/ec2/test_smdebug.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os

import pytest

import test.test_utils as test_utils
Expand All @@ -13,6 +12,8 @@
is_tf_version,
get_framework_and_version_from_tag,
is_nightly_context,
login_to_ecr_registry,
get_account_id_from_image_uri,
)
from test.test_utils.ec2 import get_ec2_instance_type

Expand Down Expand Up @@ -170,7 +171,8 @@ def run_smdebug_test(
shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
framework = get_framework_from_image_uri(image_uri)
container_test_local_dir = os.path.join("$HOME", "container_tests")
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = get_account_id_from_image_uri(image_uri)
login_to_ecr_registry(ec2_connection, account_id, region)
# Do not add -q to docker pull as it leads to a hang for huge images like trcomp
ec2_connection.run(f"docker pull {image_uri}")

Expand Down Expand Up @@ -209,7 +211,8 @@ def run_smprofiler_test(
shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
framework = get_framework_from_image_uri(image_uri)
container_test_local_dir = os.path.join("$HOME", "container_tests")
ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
account_id = get_account_id_from_image_uri(image_uri)
login_to_ecr_registry(ec2_connection, account_id, region)
# Do not add -q to docker pull as it leads to a hang for huge images like trcomp
ec2_connection.run(f"docker pull {image_uri}")

Expand Down
Loading
Loading