vGPU implementation

- Builds on the changes in #1579 Co-authored-by: Geetika Batra <[email protected]> Signed-off-by: Puneet Katyal <[email protected]>
kubernetes-sigs · Aug 22, 2023 · 9dc76f8 · 9dc76f8
1 parent 9881385
commit 9dc76f8
Show file tree

Hide file tree

Showing 14 changed files with 273 additions and 5 deletions.
diff --git a/Makefile b/Makefile
@@ -310,6 +310,8 @@ generate-e2e-templates-main: $(KUSTOMIZE) ## Generate test templates for the mai
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build $(E2E_TEMPLATE_DIR)/main/topology > $(E2E_TEMPLATE_DIR)/main/cluster-template-topology.yaml
 	# for PCI passthrough template
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build $(E2E_TEMPLATE_DIR)/main/pci > $(E2E_TEMPLATE_DIR)/main/cluster-template-pci.yaml
+	# for vGPU template
+	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build $(E2E_TEMPLATE_DIR)/main/vgpu > $(E2E_TEMPLATE_DIR)/main/cluster-template-vgpu.yaml
 	# for DHCP overrides
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build $(E2E_TEMPLATE_DIR)/main/dhcp-overrides > $(E2E_TEMPLATE_DIR)/main/cluster-template-dhcp-overrides.yaml
 

diff --git a/apis/v1alpha3/zz_generated.conversion.go b/apis/v1alpha3/zz_generated.conversion.go
diff --git a/apis/v1alpha4/zz_generated.conversion.go b/apis/v1alpha4/zz_generated.conversion.go
diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go
@@ -194,6 +194,9 @@ type VirtualMachineCloneSpec struct {
 	// PciDevices is the list of pci devices used by the virtual machine.
 	// +optional
 	PciDevices []PCIDeviceSpec `json:"pciDevices,omitempty"`
+	// VGPUDevices is the list of vGPUs used by the virtual machine.
+	// +optional
+	VGPUDevices []VGPUSpec `json:"vgpuDevices,omitempty"`
 	// OS is the Operating System of the virtual machine
 	// Defaults to Linux
 	// +optional
@@ -261,6 +264,15 @@ type PCIDeviceSpec struct {
 	VendorID *int32 `json:"vendorId,omitempty"`
 }
 
+// VGPUSpec defines virtual machine's VGPU configuration
+type VGPUSpec struct {
+	// ProfileName is the name of the vGPU profile to attach to the virtual
+	// machine, for example "grid_v100d-4c". The value is used as the vGPU
+	// backing of the passthrough device added when the machine is cloned.
+	// +kubebuilder:validation:Required
+	ProfileName string `json:"profileName,omitempty"`
+}
+
 // NetworkSpec defines the virtual machine's network configuration.
 type NetworkSpec struct {
 	// Devices is the list of network devices used by the virtual machine.

diff --git a/apis/v1beta1/zz_generated.deepcopy.go b/apis/v1beta1/zz_generated.deepcopy.go
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
@@ -1274,6 +1274,19 @@ spec:
                   of the communication between Cluster API Provider vSphere and the
                   VMware vCenter server.
                 type: string
+              vgpuDevices:
+                description: VGPUDevices is the list of vGPUs used by the virtual
+                  machine.
+                items:
+                  description: VGPUSpec defines virtual machine's VGPU configuration
+                  properties:
+                    profileName:
+                      description: ProfileName is the ProfileName of a virtual machine's
+                        vGPU, in string. Defaults to the eponymous property value
+                        in the template from which the virtual machine is cloned.
+                      type: string
+                  type: object
+                type: array
             required:
             - network
             - template

diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
@@ -1187,6 +1187,20 @@ spec:
                           TLS certificate validation of the communication between
                           Cluster API Provider vSphere and the VMware vCenter server.
                         type: string
+                      vgpuDevices:
+                        description: VGPUDevices is the list of vGPUs used by the
+                          virtual machine.
+                        items:
+                          description: VGPUSpec defines virtual machine's VGPU configuration
+                          properties:
+                            profileName:
+                              description: ProfileName is the ProfileName of a virtual
+                                machine's vGPU, in string. Defaults to the eponymous
+                                property value in the template from which the virtual
+                                machine is cloned.
+                              type: string
+                          type: object
+                        type: array
                     required:
                     - network
                     - template

diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
@@ -1316,6 +1316,19 @@ spec:
                   of the communication between Cluster API Provider vSphere and the
                   VMware vCenter server.
                 type: string
+              vgpuDevices:
+                description: VGPUDevices is the list of vGPUs used by the virtual
+                  machine.
+                items:
+                  description: VGPUSpec defines virtual machine's VGPU configuration
+                  properties:
+                    profileName:
+                      description: ProfileName is the ProfileName of a virtual machine's
+                        vGPU, in string. Defaults to the eponymous property value
+                        in the template from which the virtual machine is cloned.
+                      type: string
+                  type: object
+                type: array
             required:
             - network
             - template

diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md
@@ -0,0 +1,107 @@
+# GPU enabled clusters using vGPU
+
+## Overview
+
+You can choose to create a cluster with both worker and control plane nodes having vGPU devices attached to them.
+
+Before we begin, a few important things to note:
+
+- [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) is used to expose the GPU PCI devices to the workloads running on the cluster.
+- The OVA templates used for cluster creation should have the VMX version (Virtual Hardware) set to 17 or higher. This is necessary because Dynamic DirectPath I/O was introduced in this version, which enables the Assignable Hardware intelligence for passthrough devices.
+- Since we need the VMX version to be >=17, this way of provisioning clusters with PCI passthrough devices works for vSphere 7.0 and above. This is the ESXi/VMX version [compatibility list](https://kb.vmware.com/s/article/2007240).
+- UEFI boot mode is recommended for the OVAs used for cluster creation.
+- Most of the setup is similar to [GPU enabled clusters via PCI Passthrough](https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/blob/main/docs/gpu-pci.md#create-the-cluster).
+
+## An example GPU enabled cluster
+
+Let's create a CAPV cluster with vGPU enabled nodes.
+
+### Prerequisites
+
+- Refer to the [NVIDIA Virtual GPU Software Quick Start Guide](https://docs.nvidia.com/grid/latest/grid-software-quick-start-guide/index.html) to download and install the vGPU software and configure vGPU licensing.
+
+- Ensure vGPU compatibility for your vSphere installation and the GPU devices using the [VMware Compatibility Guide - Shared Pass-through Graphics](https://www.vmware.com/resources/compatibility/search.php?deviceCategory=vgpu)
+
+- Enable Shared Passthrough for the GPU device on the ESXi Host
+  - Browse to a host in the vSphere Client navigator.
+  - On the **Configure** tab, expand **Hardware** and click **Graphics**.
+  - Under **GRAPHICS DEVICES**, select the GPU device to be used for vGPU, click **EDIT...** and select **Shared Direct**. Repeat this for additional GPU devices as needed.
+  - Select **HOST GRAPHICS**, click **EDIT...**, select **Shared Direct**, and choose a shared passthrough GPU assignment policy, for example **Group VMs on GPU until full (GPU consolidation)**.
+
+- Build an OVA template
+  We can build a custom OVA template using the [image-builder](https://github.com/kubernetes-sigs/image-builder) project. We will build an Ubuntu 20.04 OVA with UEFI boot mode. More documentation on how to use image-builder can be found in the [image-builder book](https://image-builder.sigs.k8s.io/capi/providers/vsphere.html).
+  - Clone the repo locally and go to the `./images/capi/` directory.
+  - Create a `packer-vars.json` file with the following content.
+
+    ```shell
+    $ cat packer-vars.json
+    {
+        "vmx_version": 17
+    }
+    ```
+
+  - Run the make target associated with the Ubuntu 20.04 UEFI OVA as follows:
+
+    ```shell
+    > PACKER_VAR_FILES=packer-vars.json make build-node-ova-vsphere-ubuntu-2004-efi
+    ```
+
+### Source the vGPU profile(s) for the GPU device
+
+See "2. Choosing the vGPU Profile for the Virtual Machine" at [Using GPUs with Virtual Machines on vSphere](https://blogs.vmware.com/apps/2018/09/using-gpus-with-virtual-machines-on-vsphere-part-3-installing-the-nvidia-grid-technology.html) to see what vGPU profiles are available for your GPU device.
+
+We are using NVIDIA Tesla V100 32GB cards for this example and will use the `grid_v100d-4c` vGPU profile for this card that allocates 4GB GPU memory to the worker node's vGPU device. 
+
+### Create the cluster template
+
+```shell
+$ make dev-flavors
+/Applications/Xcode.app/Contents/Developer/usr/bin/make generate-flavors FLAVOR_DIR=/Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0
+go run ./packaging/flavorgen --output-dir /Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0
+```
+
+Edit the generated Cluster template (`cluster-template.yaml`) to set the values for the `vgpuDevices` array. Here we are editing the VSphereMachineTemplate object for the worker nodes. This will create a worker node with a single NVIDIA vGPU device (4GB of GPU memory, as defined by the `grid_v100d-4c` profile) attached to the VM.
+
+```yaml
+---
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: VSphereMachineTemplate
+metadata:
+  name: ${CLUSTER_NAME}-worker
+  namespace: '${NAMESPACE}'
+spec:
+  template:
+    spec:
+      cloneMode: linkedClone
+      datacenter: '${VSPHERE_DATACENTER}'
+      datastore: '${VSPHERE_DATASTORE}'
+      diskGiB: 25
+      folder: '${VSPHERE_FOLDER}'
+      memoryMiB: 8192
+      network:
+        devices:
+        - dhcp4: true
+          networkName: '${VSPHERE_NETWORK}'
+      numCPUs: 2
+      os: Linux
+      powerOffMode: trySoft
+      resourcePool: '${VSPHERE_RESOURCE_POOL}'
+      server: '${VSPHERE_SERVER}'
+      storagePolicyName: '${VSPHERE_STORAGE_POLICY}'
+      template: '${VSPHERE_TEMPLATE}'
+      thumbprint: '${VSPHERE_TLS_THUMBPRINT}'
+      vgpuDevices:
+        - profileName: "grid_v100d-4c"    # <== value from above
+```
+
+Set the required values for the other fields and the cluster template is ready for use. Similar changes can be made to a template generated using the `clusterctl generate cluster` command as well.
+
+### Create the cluster
+
+Set the size of the GPU nodes appropriately, since the Nvidia gpu-operator requires additional CPU and memory to install the device drivers on the VMs.
+
+Note: For GPU nodes (PCI Passthrough or vGPU), all memory of the nodes must be reserved. CAPV will automatically do this for nodes that have a PCI Passthrough GPU or a vGPU device in the spec. See "Memory Reservation" at [Using GPUs with Virtual Machines on vSphere](https://blogs.vmware.com/apps/2018/09/using-gpus-with-virtual-machines-on-vsphere-part-2-vmdirectpath-i-o.html)
+
+Apply the manifest from the previous step to your management cluster to have CAPV create a workload cluster with worker nodes that have vGPUs.
+
+From this point on, the setup is exactly the same as [GPU enabled clusters via PCI Passthrough](https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/blob/main/docs/gpu-pci.md#create-the-cluster). 
diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go
@@ -68,7 +68,7 @@ func Clone(ctx *context.VMContext, bootstrapData []byte, format bootstrapv1.Form
 		}
 	}
 	if ctx.VSphereVM.Spec.CustomVMXKeys != nil {
-		ctx.Logger.Info("applied custom vmx keys o VM clone spec")
+		ctx.Logger.Info("applied custom vmx keys to VM clone spec")
 		if err := extraConfig.SetCustomVMXKeys(ctx.VSphereVM.Spec.CustomVMXKeys); err != nil {
 			return err
 		}
@@ -151,8 +151,22 @@ func Clone(ctx *context.VMContext, bootstrapData []byte, format bootstrapv1.Form
 
 	deviceSpecs = append(deviceSpecs, networkSpecs...)
 
-	if err != nil {
-		return errors.Wrapf(err, "error getting network specs for %q", ctx)
+	if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices) != 0 {
+		gpuSpecs, err := getGpuSpecs(ctx)
+		if err != nil {
+			return errors.Wrapf(err, "error getting gpu specs for %q", ctx)
+		}
+		ctx.Logger.V(4).Info("created gpu devices", "gpu-device-specs", gpuSpecs)
+		deviceSpecs = append(deviceSpecs, gpuSpecs...)
+	}
+
+	if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices) != 0 {
+		vgpuSpecs, err := getVgpuSpecs(ctx)
+		if err != nil {
+			return errors.Wrapf(err, "error getting gpu specs for %q", ctx)
+		}
+		ctx.Logger.V(4).Info("created vgpu devices", "vgpu-device-specs", vgpuSpecs)
+		deviceSpecs = append(deviceSpecs, vgpuSpecs...)
 	}
 
 	numCPUs := ctx.VSphereVM.Spec.NumCPUs
@@ -199,10 +213,10 @@ func Clone(ctx *context.VMContext, bootstrapData []byte, format bootstrapv1.Form
 		Snapshot: snapshotRef,
 	}
 
-	// For PCI devices, the memory for the VM needs to be reserved
+	// For PCI and vGPU devices, the memory for the VM needs to be reserved
 	// We can replace this once we have another way of reserving memory option
 	// exposed via the API types.
-	if len(ctx.VSphereVM.Spec.PciDevices) > 0 {
+	if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices) > 0 || len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices) > 0 {
 		spec.Config.MemoryReservationLockedToMax = pointer.Bool(true)
 	}
 
@@ -453,3 +467,55 @@ func getNetworkSpecs(ctx *context.VMContext, devices object.VirtualDeviceList) (
 
 	return deviceSpecs, nil
 }
+
+// createPCIPassThroughDevice builds a PCI passthrough virtual device that
+// carries the given temporary device key and backing info.
+func createPCIPassThroughDevice(deviceKey int32, backingInfo types.BaseVirtualDeviceBackingInfo) types.BaseVirtualDevice {
+	return &types.VirtualPCIPassthrough{
+		VirtualDevice: types.VirtualDevice{
+			Backing: backingInfo,
+			Key:     deviceKey,
+		},
+	}
+}
+
+// getGpuSpecs returns one "add" device config spec per entry in the VM's
+// PciDevices list. Each entry becomes a PCI passthrough device whose dynamic
+// backing allows the entry's vendor/device ID pair. Temporary device keys are
+// assigned downwards from -200, one per device.
+//
+// An error is returned when an entry is missing its vendor or device ID: both
+// are pointers, and dereferencing a nil one would panic.
+func getGpuSpecs(ctx *context.VMContext) ([]types.BaseVirtualDeviceConfigSpec, error) {
+	pciDevices := ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices
+	deviceSpecs := make([]types.BaseVirtualDeviceConfigSpec, 0, len(pciDevices))
+	deviceKey := int32(-200)
+
+	for i, pciDevice := range pciDevices {
+		// Guard the pointer dereferences below instead of panicking.
+		if pciDevice.VendorID == nil || pciDevice.DeviceID == nil {
+			return nil, errors.Errorf("pci device at index %d must set both vendorId and deviceId", i)
+		}
+		backingInfo := &types.VirtualPCIPassthroughDynamicBackingInfo{
+			AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{
+				{
+					VendorId: *pciDevice.VendorID,
+					DeviceId: *pciDevice.DeviceID,
+				},
+			},
+		}
+		dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo)
+		deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{
+			Device:    dynamicDirectPathDevice,
+			Operation: types.VirtualDeviceConfigSpecOperationAdd,
+		})
+		deviceKey--
+	}
+	return deviceSpecs, nil
+}
+
+// getVgpuSpecs returns one "add" device config spec per entry in the VM's
+// VGPUDevices list. Each entry becomes a PCI passthrough device whose backing
+// names the entry's vGPU profile.
+//
+// Temporary device keys must be unique within a single config spec. The PCI
+// passthrough devices built by getGpuSpecs take keys -200, -201, ... (one per
+// PciDevices entry), so the vGPU keys start right below that range to avoid
+// duplicates when a VM requests both kinds of device.
+func getVgpuSpecs(ctx *context.VMContext) ([]types.BaseVirtualDeviceConfigSpec, error) {
+	cloneSpec := ctx.VSphereVM.Spec.VirtualMachineCloneSpec
+	deviceSpecs := make([]types.BaseVirtualDeviceConfigSpec, 0, len(cloneSpec.VGPUDevices))
+	// Skip the keys already handed out to PCI passthrough devices.
+	deviceKey := int32(-200) - int32(len(cloneSpec.PciDevices))
+
+	for _, vGPUDevice := range cloneSpec.VGPUDevices {
+		backingInfo := &types.VirtualPCIPassthroughVmiopBackingInfo{
+			Vgpu: vGPUDevice.ProfileName,
+		}
+		vgpuDevice := createPCIPassThroughDevice(deviceKey, backingInfo)
+		deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{
+			Device:    vgpuDevice,
+			Operation: types.VirtualDeviceConfigSpecOperationAdd,
+		})
+		ctx.Logger.V(4).Info("created vGPU device", "vgpu-profile", vGPUDevice.ProfileName)
+		deviceKey--
+	}
+	return deviceSpecs, nil
+}
diff --git a/test/e2e/config/vsphere-ci.yaml b/test/e2e/config/vsphere-ci.yaml
@@ -121,6 +121,7 @@ variables:
   # These IDs correspond to Tesla T4s, they are the decimal representation of the hex values.
   DEVICE_ID: 7864
   VENDOR_ID: 4318
+  PROFILE_NAME: grid_v100d-4c
   # CAPV feature flags
   EXP_NODE_ANTI_AFFINITY: "true"
 

diff --git a/test/e2e/config/vsphere-dev.yaml b/test/e2e/config/vsphere-dev.yaml
@@ -135,6 +135,7 @@ variables:
   # These IDs correspond to Tesla T4s, they are the decimal representation of the hex values.
   DEVICE_ID: 7864
   VENDOR_ID: 4318
+  PROFILE_NAME: grid_v100d-4c
   # CAPV feature flags
   EXP_NODE_ANTI_AFFINITY: "true"
   # Following CAPV variables is used for multivc_test.go. This is the second VSphere and should be set if multivc test is enabled.

diff --git a/test/e2e/data/infrastructure-vsphere/main/vgpu/kustomization.yaml b/test/e2e/data/infrastructure-vsphere/main/vgpu/kustomization.yaml
@@ -0,0 +1,6 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ../base
+patchesStrategicMerge:
+  - vgpu-device-template.yaml
diff --git a/test/e2e/data/infrastructure-vsphere/main/vgpu/vgpu-device-template.yaml b/test/e2e/data/infrastructure-vsphere/main/vgpu/vgpu-device-template.yaml
@@ -0,0 +1,11 @@
+---
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: VSphereMachineTemplate
+metadata:
+  name: ${CLUSTER_NAME}-worker
+  namespace: ${NAMESPACE}
+spec:
+  template:
+    spec:
+      vgpuDevices:
+        - profileName: ${PROFILE_NAME}