Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{AKS} Add cli changes for gpu driver selection feature #7993

Merged
merged 18 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/aks-preview/HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ If there is no rush to release a new version, please just add a description of t

To release a new version, please select a new version number (usually plus 1 to last patch version, X.Y.Z -> Major.Minor.Patch, more details in `\doc <https://semver.org/>`_), and then add a new section named as the new version number in this file, the content should include the new modifications and everything from the *Pending* section. Finally, update the `VERSION` variable in `setup.py` with this new version number.

9.0.0b5
+++++++
* Add `--driver-type` to the `az aks nodepool add` command.

9.0.0b4
+++++++
* Set the --node-vm-size to empty string when the cluster sku name is automatic. The node vm size will pick from the candidate toggle based on the logic in automatic vm selection.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
"test_aks_disable_addon_gitops"
],
"gpu, no quota": [
"test_aks_nodepool_add_with_gpu_instance_profile"
"test_aks_nodepool_add_with_gpu_instance_profile",
"test_aks_gpu_driver_type"
],
"overlay migration, missing toggle": [
"test_aks_azure_cni_overlay_migration"
Expand Down
4 changes: 4 additions & 0 deletions src/aks-preview/azext_aks_preview/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,3 +330,7 @@
# TLS Management Consts
CONST_TLS_MANAGEMENT_MANAGED = "Managed"
CONST_TLS_MANAGEMENT_NONE = "None"

# GPU Driver Type Consts
# These two strings are collected into the `gpu_driver_types` list and offered
# as the enum choices of the `driver_type` argument.
CONST_GPU_DRIVER_TYPE_CUDA = "CUDA"
CONST_GPU_DRIVER_TYPE_GRID = "GRID"
3 changes: 3 additions & 0 deletions src/aks-preview/azext_aks_preview/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -1861,6 +1861,9 @@
- name: --skip-gpu-driver-install
type: bool
short-summary: To skip GPU driver auto installation by AKS on a nodepool using GPU vm size if customers want to manage GPU driver installation by their own. If not specified, the default is false.
- name: --driver-type
type: string
short-summary: Specify the type of GPU driver to install when creating Windows agent pools. Valid values are "GRID" and "CUDA". If not provided, AKS selects the driver based on system compatibility. This option cannot be changed once the AgentPool has been created. The default is system selected.
- name: --ssh-access
type: string
short-summary: Configure SSH setting for the node pool. Use "disabled" to disable SSH access, "localuser" to enable SSH access using private key.
Expand Down
12 changes: 12 additions & 0 deletions src/aks-preview/azext_aks_preview/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@
CONST_APP_ROUTING_NONE_NGINX,
CONST_TLS_MANAGEMENT_MANAGED,
CONST_TLS_MANAGEMENT_NONE,
CONST_GPU_DRIVER_TYPE_CUDA,
CONST_GPU_DRIVER_TYPE_GRID,
)
from azext_aks_preview._validators import (
validate_acr,
Expand Down Expand Up @@ -420,6 +422,11 @@
CONST_TLS_MANAGEMENT_NONE,
]

# Accepted values for the `driver_type` argument; handed to get_enum_type()
# when the argument is registered in load_arguments.
gpu_driver_types = [
CONST_GPU_DRIVER_TYPE_CUDA,
CONST_GPU_DRIVER_TYPE_GRID,
]


def load_arguments(self, _):
acr_arg_type = CLIArgumentType(metavar="ACR_NAME_OR_RESOURCE_ID")
Expand Down Expand Up @@ -1589,6 +1596,11 @@ def load_arguments(self, _):
help="space-separated tags: key[=value] [key[=value] ...].",
)
c.argument('skip_gpu_driver_install', action='store_true', is_preview=True)
c.argument(
"driver_type",
arg_type=get_enum_type(gpu_driver_types),
is_preview=True,
)
# in creation scenario, use "localuser" as default
c.argument(
'ssh_access',
Expand Down
31 changes: 31 additions & 0 deletions src/aks-preview/azext_aks_preview/agentpool_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,24 @@ def get_skip_gpu_driver_install(self) -> bool:

return skip_gpu_driver_install

def get_driver_type(self) -> Union[str, None]:
    """Obtain the value of driver_type.

    The value passed on the command line is used unless, in create mode, the
    attached `agentpool` object already carries a GPU profile with a driver
    type set; in that case the value stored on the object is returned instead.

    :return: str or None
    """
    # value supplied by the command invocation
    requested = self.raw_param.get("driver_type")

    # outside of create mode the raw parameter is authoritative
    if self.decorator_mode != DecoratorMode.CREATE:
        return requested

    # in create mode, prefer a value already present on the `agentpool` object
    pool = self.agentpool
    if not pool:
        return requested

    profile = pool.gpu_profile
    if profile is None or profile.driver_type is None:
        return requested

    return profile.driver_type

def get_enable_secure_boot(self) -> bool:
"""Obtain the value of enable_secure_boot.
:return: bool
Expand Down Expand Up @@ -866,6 +884,17 @@ def set_up_skip_gpu_driver_install(self, agentpool: AgentPool) -> AgentPool:
agentpool.gpu_profile.install_gpu_driver = False
return agentpool

def set_up_driver_type(self, agentpool: AgentPool) -> AgentPool:
    """Set up driver type property for the AgentPool object.

    Reads the driver type from the context and, when one is available, stores
    it on ``agentpool.gpu_profile.driver_type``, creating the GPU profile
    first if the agent pool does not have one yet. When no driver type is
    available the agent pool is returned unchanged.

    :param agentpool: the AgentPool object being assembled
    :return: the same AgentPool object
    """
    self._ensure_agentpool(agentpool)

    driver_type = self.context.get_driver_type()
    if driver_type is not None:
        # Create a GPU profile only when none is attached, so that values
        # already present on an existing profile are preserved.
        if agentpool.gpu_profile is None:
            agentpool.gpu_profile = self.models.AgentPoolGPUProfile()  # pylint: disable=no-member
        agentpool.gpu_profile.driver_type = driver_type
    return agentpool

def set_up_pod_ip_allocation_mode(self, agentpool: AgentPool) -> AgentPool:
"""Set up pod ip allocation mode for the AgentPool object."""
self._ensure_agentpool(agentpool)
Expand Down Expand Up @@ -967,6 +996,8 @@ def construct_agentpool_profile_preview(self) -> AgentPool:
agentpool = self.set_up_artifact_streaming(agentpool)
# set up skip_gpu_driver_install
agentpool = self.set_up_skip_gpu_driver_install(agentpool)
# set up driver_type
agentpool = self.set_up_driver_type(agentpool)
# set up agentpool ssh access
agentpool = self.set_up_ssh_access(agentpool)
# set up agentpool pod ip allocation mode
Expand Down
1 change: 1 addition & 0 deletions src/aks-preview/azext_aks_preview/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -1215,6 +1215,7 @@ def aks_agentpool_add(
node_public_ip_tags=None,
enable_artifact_streaming=False,
skip_gpu_driver_install=False,
driver_type=None,
ssh_access=CONST_SSH_ACCESS_LOCALUSER,
# trusted launch
enable_secure_boot=False,
Expand Down
Loading
Loading