From 009129131af60ad3bbccdfa929b7160af25262ab Mon Sep 17 00:00:00 2001 From: Puneet Katyal Date: Thu, 14 Jul 2022 13:45:04 +0530 Subject: [PATCH 01/21] vGPU implementation - Builds on the changes in https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/pull/1579 Co-authored-by: Geetika Batra Signed-off-by: Puneet Katyal --- Makefile | 2 + apis/v1alpha3/conversion_test.go | 1 + apis/v1alpha3/zz_generated.conversion.go | 1 + apis/v1alpha4/zz_generated.conversion.go | 1 + apis/v1beta1/types.go | 12 ++ apis/v1beta1/zz_generated.deepcopy.go | 20 +++ ...ture.cluster.x-k8s.io_vspheremachines.yaml | 13 ++ ...ster.x-k8s.io_vspheremachinetemplates.yaml | 14 ++ ...structure.cluster.x-k8s.io_vspherevms.yaml | 13 ++ docs/gpu-vgpu.md | 107 ++++++++++++ pkg/services/govmomi/vcenter/clone.go | 72 +++++++- test/e2e/config/vsphere-dev.yaml | 164 ++++++++++++++++++ test/e2e/config/vsphere.yaml | 1 + .../main/vgpu/kustomization.yaml | 6 + .../main/vgpu/vgpu-device-template.yaml | 11 ++ 15 files changed, 432 insertions(+), 6 deletions(-) create mode 100644 docs/gpu-vgpu.md create mode 100644 test/e2e/config/vsphere-dev.yaml create mode 100644 test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml create mode 100644 test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml diff --git a/Makefile b/Makefile index 2eb00f4911..bb5d78b839 100644 --- a/Makefile +++ b/Makefile @@ -384,6 +384,8 @@ generate-e2e-templates-main: $(KUSTOMIZE) ## Generate test templates for the mai "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/install-on-bootstrap" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-install-on-bootstrap.yaml" # for PCI passthrough template "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/pci" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-pci.yaml" + # for vGPU template + "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build 
"$(E2E_GOVMOMI_TEMPLATE_DIR)/main/vgpu" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-vgpu.yaml" # for DHCP overrides "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/dhcp-overrides" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-dhcp-overrides.yaml" "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/ownerrefs-finalizers" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-ownerrefs-finalizers.yaml" diff --git a/apis/v1alpha3/conversion_test.go b/apis/v1alpha3/conversion_test.go index 732fae6f8b..e956f34da2 100644 --- a/apis/v1alpha3/conversion_test.go +++ b/apis/v1alpha3/conversion_test.go @@ -120,6 +120,7 @@ func CustomSpecNewFieldFuzzer(in *infrav1.VirtualMachineCloneSpec, c fuzz.Contin c.FuzzNoCustom(in) in.PciDevices = nil + in.VGPUDevices = nil in.AdditionalDisksGiB = nil in.OS = "" in.HardwareVersion = "" diff --git a/apis/v1alpha3/zz_generated.conversion.go b/apis/v1alpha3/zz_generated.conversion.go index 966195ccbb..0c8f7f2cb0 100644 --- a/apis/v1alpha3/zz_generated.conversion.go +++ b/apis/v1alpha3/zz_generated.conversion.go @@ -1760,6 +1760,7 @@ func autoConvert_v1beta1_VirtualMachineCloneSpec_To_v1alpha3_VirtualMachineClone out.CustomVMXKeys = *(*map[string]string)(unsafe.Pointer(&in.CustomVMXKeys)) // WARNING: in.TagIDs requires manual conversion: does not exist in peer-type // WARNING: in.PciDevices requires manual conversion: does not exist in peer-type + // WARNING: in.VGPUDevices requires manual conversion: does not exist in peer-type // WARNING: in.OS requires manual conversion: does not exist in peer-type // WARNING: in.HardwareVersion requires manual conversion: does not exist in peer-type return nil diff --git a/apis/v1alpha4/zz_generated.conversion.go b/apis/v1alpha4/zz_generated.conversion.go index 147c1a9894..a18d2ecd3d 100644 --- a/apis/v1alpha4/zz_generated.conversion.go +++ b/apis/v1alpha4/zz_generated.conversion.go @@ -1914,6 +1914,7 @@ 
func autoConvert_v1beta1_VirtualMachineCloneSpec_To_v1alpha4_VirtualMachineClone out.CustomVMXKeys = *(*map[string]string)(unsafe.Pointer(&in.CustomVMXKeys)) // WARNING: in.TagIDs requires manual conversion: does not exist in peer-type // WARNING: in.PciDevices requires manual conversion: does not exist in peer-type + // WARNING: in.VGPUDevices requires manual conversion: does not exist in peer-type // WARNING: in.OS requires manual conversion: does not exist in peer-type // WARNING: in.HardwareVersion requires manual conversion: does not exist in peer-type return nil diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go index 1a12b8de6f..81c5882279 100644 --- a/apis/v1beta1/types.go +++ b/apis/v1beta1/types.go @@ -193,6 +193,9 @@ type VirtualMachineCloneSpec struct { // PciDevices is the list of pci devices used by the virtual machine. // +optional PciDevices []PCIDeviceSpec `json:"pciDevices,omitempty"` + // VGPUDevices is the list of vGPUs used by the virtual machine. + // +optional + VGPUDevices []VGPUSpec `json:"vgpuDevices,omitempty"` // OS is the Operating System of the virtual machine // Defaults to Linux // +optional @@ -265,6 +268,15 @@ type PCIDeviceSpec struct { CustomLabel string `json:"customLabel,omitempty"` } +// VGPUSpec defines virtual machine's VGPU configuration +type VGPUSpec struct { + // ProfileName is the ProfileName of a virtual machine's vGPU, in string. + // Defaults to the eponymous property value in the template from which the + // virtual machine is cloned. + // +kubebuilder:validation:Required + ProfileName string `json:"profileName,omitempty"` +} + // NetworkSpec defines the virtual machine's network configuration. type NetworkSpec struct { // Devices is the list of network devices used by the virtual machine. 
diff --git a/apis/v1beta1/zz_generated.deepcopy.go b/apis/v1beta1/zz_generated.deepcopy.go index 44d12a65fe..c6b26cfb39 100644 --- a/apis/v1beta1/zz_generated.deepcopy.go +++ b/apis/v1beta1/zz_generated.deepcopy.go @@ -403,6 +403,21 @@ func (in *Topology) DeepCopy() *Topology { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *VGPUSpec) DeepCopyInto(out *VGPUSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VGPUSpec. +func (in *VGPUSpec) DeepCopy() *VGPUSpec { + if in == nil { + return nil + } + out := new(VGPUSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *VSphereCluster) DeepCopyInto(out *VSphereCluster) { *out = *in @@ -1321,6 +1336,11 @@ func (in *VirtualMachineCloneSpec) DeepCopyInto(out *VirtualMachineCloneSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.VGPUDevices != nil { + in, out := &in.VGPUDevices, &out.VGPUDevices + *out = make([]VGPUSpec, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineCloneSpec. diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml index bc1ec1541e..eb46f63a03 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml @@ -1448,6 +1448,19 @@ spec: without TLS certificate validation of the communication between Cluster API Provider vSphere and the VMware vCenter server. type: string + vgpuDevices: + description: VGPUDevices is the list of vGPUs used by the virtual + machine. 
+ items: + description: VGPUSpec defines virtual machine's VGPU configuration + properties: + profileName: + description: ProfileName is the ProfileName of a virtual machine's + vGPU, in string. Defaults to the eponymous property value + in the template from which the virtual machine is cloned. + type: string + type: object + type: array required: - network - template diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml index ca4bae3640..de5d4f2c62 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml @@ -1323,6 +1323,20 @@ spec: without TLS certificate validation of the communication between Cluster API Provider vSphere and the VMware vCenter server. type: string + vgpuDevices: + description: VGPUDevices is the list of vGPUs used by the + virtual machine. + items: + description: VGPUSpec defines virtual machine's VGPU configuration + properties: + profileName: + description: ProfileName is the ProfileName of a virtual + machine's vGPU, in string. Defaults to the eponymous + property value in the template from which the virtual + machine is cloned. + type: string + type: object + type: array required: - network - template diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml index 3f42eea904..f6f2f80e71 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml @@ -1531,6 +1531,19 @@ spec: without TLS certificate validation of the communication between Cluster API Provider vSphere and the VMware vCenter server. 
type: string + vgpuDevices: + description: VGPUDevices is the list of vGPUs used by the virtual + machine. + items: + description: VGPUSpec defines virtual machine's VGPU configuration + properties: + profileName: + description: ProfileName is the ProfileName of a virtual machine's + vGPU, in string. Defaults to the eponymous property value + in the template from which the virtual machine is cloned. + type: string + type: object + type: array required: - network - template diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md new file mode 100644 index 0000000000..3c5f546afb --- /dev/null +++ b/docs/gpu-vgpu.md @@ -0,0 +1,107 @@ +# GPU enabled clusters using vGPU + +## Overview + +You can choose to create a cluster with both worker and control plane nodes having vGPU devices attached to them. + +Before we begin, a few important things to note: + +- [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) is used to expose the GPU PCI devices to the workloads running on the cluster. +- The OVA templates used for cluster creation should have the VMX version (Virtual Hardware) set to 17 or higher. This is necessary because Dynamic DirectPath I/O was introduced in this version, which enables the Assignable Hardware intelligence for passthrough devices. +- Since we need the VMX version to be >=17, this way of provisioning clusters with PCI passthrough devices works for vSphere 7.0 and above. This is the ESXi/VMX version [compatibility list](https://kb.vmware.com/s/article/2007240). +- UEFI boot mode is recommended for the OVAs used for cluster creation. +- Most of the setup is similar to [GPU enabled clusters via PCI Passthrough](https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/blob/main/docs/gpu-pci.md#create-the-cluster). + +## An example GPU enabled cluster + +Let's create a CAPV cluster with vGPU enabled nodes. 
+ +### Prerequisites + +- Refer to the [NVIDIA Virtual GPU Software Quick Start Guide](https://docs.nvidia.com/grid/latest/grid-software-quick-start-guide/index.html) to download and install the vGPU software and configure vGPU licensing. + +- Ensure vGPU compatibility for your vSphere installation and the GPU devices using the [VMware Compatibility Guide - Shared Pass-through Graphics](https://www.vmware.com/resources/compatibility/search.php?deviceCategory=vgpu). + +- Enable Shared Passthrough for the GPU device on the ESXi Host + - Browse to a host in the vSphere Client navigator. + - On the **Configure** tab, expand **Hardware** and click **Graphics**. + - Under **GRAPHICS DEVICES**, select the GPU device to be used for vGPU, click **EDIT...** and select **Shared Direct**. Repeat this for additional GPU devices as needed. + - Select **HOST GRAPHICS**, click **EDIT...** and select **Shared Direct** and select a shared passthrough GPU assignment policy, for example **Group VMs on GPU until full (GPU consolidation)**. + +- Build an OVA template + We can build a custom OVA template using the [image-builder](https://github.com/kubernetes-sigs/image-builder) project. We will build an Ubuntu 20.04 OVA with UEFI boot mode. More documentation on how to use image-builder can be found in the [image-builder book](https://image-builder.sigs.k8s.io/capi/providers/vsphere.html). + - Clone the repo locally and go to the `./images/capi/` directory. + - Create a `packer-vars.json` file with the following content. + + ```shell + $ cat packer-vars.json + { + "vmx_version": 17 + } + ``` + + - Run the make target for the Ubuntu 20.04 UEFI OVA as follows: + + ```shell + > PACKER_VAR_FILES=packer-vars.json make build-node-ova-vsphere-ubuntu-2004-efi + ``` + +### Source the vGPU profile(s) for the GPU device + +See "2. 
Choosing the vGPU Profile for the Virtual Machine" at [Using GPUs with Virtual Machines on vSphere](https://blogs.vmware.com/apps/2018/09/using-gpus-with-virtual-machines-on-vsphere-part-3-installing-the-nvidia-grid-technology.html) to see what vGPU profiles are available for your GPU device. + +We are using NVIDIA Tesla V100 32GB cards for this example and will use the `grid_v100d-4c` vGPU profile for this card that allocates 4GB GPU memory to the worker node's vGPU device. + +### Create the cluster template + +```shell +$ make dev-flavors +/Applications/Xcode.app/Contents/Developer/usr/bin/make generate-flavors FLAVOR_DIR=/Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0 +go run ./packaging/flavorgen --output-dir /Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0 +``` + +Edit the generated Cluster template (`cluster-template.yaml`) to set the values for the `vgpuDevices` array. Here we are editing the VSphereMachineTemplate object for the worker nodes. This will create a worker node with a single NVIDIA 4GB vGPU device attached to the VM. + +```yaml +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: VSphereMachineTemplate +metadata: + name: ${CLUSTER_NAME}-worker + namespace: '${NAMESPACE}' +spec: + template: + spec: + cloneMode: linkedClone + datacenter: '${VSPHERE_DATACENTER}' + datastore: '${VSPHERE_DATASTORE}' + diskGiB: 25 + folder: '${VSPHERE_FOLDER}' + memoryMiB: 8192 + network: + devices: + - dhcp4: true + networkName: '${VSPHERE_NETWORK}' + numCPUs: 2 + os: Linux + powerOffMode: trySoft + resourcePool: '${VSPHERE_RESOURCE_POOL}' + server: '${VSPHERE_SERVER}' + storagePolicyName: '${VSPHERE_STORAGE_POLICY}' + template: '${VSPHERE_TEMPLATE}' + thumbprint: '${VSPHERE_TLS_THUMBPRINT}' + vgpuDevices: + - profileName: "grid_v100d-4c" # <============ value from above +``` + +Set the required values for the other fields and the cluster template is ready for use. 
Similar changes can be made to a template generated using the `clusterctl generate cluster` command as well. + +### Create the cluster + +Set the size of the GPU nodes appropriately, since the NVIDIA GPU Operator requires additional CPU and memory to install the device drivers on the VMs. + +Note: For GPU nodes (PCI Passthrough or vGPU), all memory of the nodes must be reserved. CAPV will automatically do this for nodes that have a PCI Passthrough GPU or a vGPU device in the spec. See "Memory Reservation" at [Using GPUs with Virtual Machines on vSphere](https://blogs.vmware.com/apps/2018/09/using-gpus-with-virtual-machines-on-vsphere-part-2-vmdirectpath-i-o.html). + +Apply the manifest from the previous step to your management cluster to have CAPV create a workload cluster with worker nodes that have vGPUs. + +From this point on, the setup is exactly the same as [GPU enabled clusters via PCI Passthrough](https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/blob/main/docs/gpu-pci.md#create-the-cluster). diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go index e216790081..e263ca3412 100644 --- a/pkg/services/govmomi/vcenter/clone.go +++ b/pkg/services/govmomi/vcenter/clone.go @@ -68,9 +68,9 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by extraConfig.SetIgnitionUserData(bootstrapData) } } - if vmCtx.VSphereVM.Spec.CustomVMXKeys != nil { - log.Info("Applied custom vmx keys o VM clone spec") - if err := extraConfig.SetCustomVMXKeys(vmCtx.VSphereVM.Spec.CustomVMXKeys); err != nil { + if ctx.VSphereVM.Spec.CustomVMXKeys != nil { + ctx.Logger.Info("applied custom vmx keys to VM clone spec") + if err := extraConfig.SetCustomVMXKeys(ctx.VSphereVM.Spec.CustomVMXKeys); err != nil { return err } } @@ -152,8 +152,16 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by deviceSpecs = append(deviceSpecs, networkSpecs...) 
- if err != nil { - return errors.Wrapf(err, "error getting network specs for %q", ctx) + if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices) != 0 { + gpuSpecs := getGpuSpecs(ctx) + ctx.Logger.V(4).Info("created gpu devices", "gpu-device-specs", gpuSpecs) + deviceSpecs = append(deviceSpecs, gpuSpecs...) + } + + if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices) != 0 { + vgpuSpecs := getVgpuSpecs(ctx) + ctx.Logger.V(4).Info("created vgpu devices", "vgpu-device-specs", vgpuSpecs) + deviceSpecs = append(deviceSpecs, vgpuSpecs...) } numCPUs := vmCtx.VSphereVM.Spec.NumCPUs @@ -200,7 +208,7 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by Snapshot: snapshotRef, } - // For PCI devices, the memory for the VM needs to be reserved + // For PCI and vGPU devices, the memory for the VM needs to be reserved // We can replace this once we have another way of reserving memory option // exposed via the API types. if len(vmCtx.VSphereVM.Spec.PciDevices) > 0 { @@ -453,3 +461,55 @@ func getNetworkSpecs(ctx context.Context, vmCtx *capvcontext.VMContext, devices return deviceSpecs, nil } + +func createPCIPassThroughDevice(deviceKey int32, backingInfo types.BaseVirtualDeviceBackingInfo) types.BaseVirtualDevice { + device := &types.VirtualPCIPassthrough{ + VirtualDevice: types.VirtualDevice{ + Key: deviceKey, + Backing: backingInfo, + }, + } + return device +} + +func getGpuSpecs(ctx *context.VMContext) []types.BaseVirtualDeviceConfigSpec { + deviceSpecs := []types.BaseVirtualDeviceConfigSpec{} + deviceKey := int32(-200) + + for _, pciDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices { + backingInfo := &types.VirtualPCIPassthroughDynamicBackingInfo{ + AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{ + { + VendorId: *pciDevice.VendorID, + DeviceId: *pciDevice.DeviceID, + }, + }, + } + dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo) + deviceSpecs = append(deviceSpecs, 
&types.VirtualDeviceConfigSpec{ + Device: dynamicDirectPathDevice, + Operation: types.VirtualDeviceConfigSpecOperationAdd, + }) + deviceKey-- + } + return deviceSpecs +} + +func getVgpuSpecs(ctx *context.VMContext) []types.BaseVirtualDeviceConfigSpec { + deviceSpecs := []types.BaseVirtualDeviceConfigSpec{} + deviceKey := int32(-200) + + for _, vGPUDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices { + backingInfo := &types.VirtualPCIPassthroughVmiopBackingInfo{ + Vgpu: vGPUDevice.ProfileName, + } + dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo) + deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{ + Device: dynamicDirectPathDevice, + Operation: types.VirtualDeviceConfigSpecOperationAdd, + }) + ctx.Logger.V(4).Info("created vGPU device", "vgpu-profile", vGPUDevice.ProfileName) + deviceKey-- + } + return deviceSpecs +} diff --git a/test/e2e/config/vsphere-dev.yaml b/test/e2e/config/vsphere-dev.yaml new file mode 100644 index 0000000000..d5a4df0026 --- /dev/null +++ b/test/e2e/config/vsphere-dev.yaml @@ -0,0 +1,164 @@ +--- +# E2E test scenario using local dev images and manifests built from the source tree for following providers: +# - cluster-api +# - bootstrap kubeadm +# - control-plane kubeadm +# - vsphere + +# For creating local dev images built from the source tree; +# - from the CAPI repository root, `make docker-build REGISTRY=gcr.io/k8s-staging-cluster-api` to build the cluster-api, +# bootstrap kubeadm, control-plane kubeadm provider images. This step can be skipped to use upstream images. +# - from the CAPV repository root, `make e2e` to build the vsphere provider image and run e2e tests. 
+ +images: + - name: registry.k8s.io/cluster-api/cluster-api-controller:v1.5.0 + loadBehavior: tryLoad + - name: registry.k8s.io/cluster-api/kubeadm-bootstrap-controller:v1.5.0 + loadBehavior: tryLoad + - name: registry.k8s.io/cluster-api/kubeadm-control-plane-controller:v1.5.0 + loadBehavior: tryLoad + - name: gcr.io/k8s-staging-cluster-api/capv-manager:e2e + loadBehavior: mustLoad + - name: quay.io/jetstack/cert-manager-cainjector:v1.12.2 + loadBehavior: tryLoad + - name: quay.io/jetstack/cert-manager-webhook:v1.12.2 + loadBehavior: tryLoad + - name: quay.io/jetstack/cert-manager-controller:v1.12.2 + loadBehavior: tryLoad + +providers: + + - name: cluster-api + type: CoreProvider + versions: + - name: v1.5.0 + # Use manifest from source files + value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/core-components.yaml" + type: "url" + contract: v1beta1 + files: + - sourcePath: "../data/shared/main/v1beta1/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + + - name: kubeadm + type: BootstrapProvider + versions: + - name: v1.5.0 + # Use manifest from source files + value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/bootstrap-components.yaml" + type: "url" + contract: v1beta1 + files: + - sourcePath: "../data/shared/main/v1beta1/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + + - name: kubeadm + type: ControlPlaneProvider + versions: + - name: v1.5.0 + # Use manifest from source files + value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/control-plane-components.yaml" + type: "url" + contract: v1beta1 + files: + - sourcePath: "../data/shared/main/v1beta1/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + + - name: vsphere + type: InfrastructureProvider + versions: + - name: v1.9.99 + # Use manifest from source files 
+ value: ../../../../cluster-api-provider-vsphere/config/default + contract: v1beta1 + replacements: + - old: gcr.io/cluster-api-provider-vsphere/release/manager:latest + new: gcr.io/k8s-staging-cluster-api/capv-manager:e2e + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + files: + # Add a cluster template + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-conformance.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-hw-upgrade.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-kcp-remediation.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-md-remediation.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-node-drain.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-pci.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-remote-management.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-storage-policy.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-topology.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-dhcp-overrides.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/clusterclass-quick-start.yaml" + - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-ignition.yaml" + - sourcePath: "../data/shared/main/v1beta1_provider/metadata.yaml" + +variables: + KUBERNETES_VERSION: "v1.28.0" + CPI_IMAGE_K8S_VERSION: "v1.27.0" + CNI: "./data/cni/calico/calico.yaml" + EXP_CLUSTER_RESOURCE_SET: "true" + EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION: "true" + CONTROL_PLANE_MACHINE_COUNT: 1 + WORKER_MACHINE_COUNT: 1 + 
+ IP_FAMILY: "IPv4" + CLUSTER_CLASS_NAME: "quick-start" + # The following CAPV variables should be set before testing + VSPHERE_SERVER: "vcenter.vmware.com" + VSPHERE_TLS_THUMBPRINT: "AA:BB:CC:DD:11:22:33:44:EE:FF" + VSPHERE_DATACENTER: "SDDC-Datacenter" + VSPHERE_FOLDER: "FolderName" + VSPHERE_RESOURCE_POOL: "ResourcePool" + VSPHERE_DATASTORE: "WorkloadDatastore" + VSPHERE_STORAGE_POLICY: "Cluster API vSphere Storage Policy" + VSPHERE_NETWORK: "network-1" + VSPHERE_TEMPLATE: "ubuntu-2204-kube-v1.28.0" + FLATCAR_VSPHERE_TEMPLATE: "flatcar-stable-3510.2.6-kube-v1.28.0" + # WORKLOAD_CONTROL_PLANE_ENDPOINT_IP: + # The following variables are also required, but it is recommended to set them via env variables to avoid disclosure of sensitive data + # VSPHERE_SSH_AUTHORIZED_KEY: + # VSPHERE_PASSWORD: + # VSPHERE_USERNAME: + # Dedicated IP to be used by kube-vip + # CONTROL_PLANE_ENDPOINT_IP: + # Sets the insecure-flag for vsphere-csi-controller config + VSPHERE_INSECURE_CSI: "true" + KUBETEST_CONFIGURATION: "./data/kubetest/conformance-fast.yaml" + NODE_DRAIN_TIMEOUT: "60s" + CLUSTER_TOPOLOGY: "true" + # These IDs correspond to Tesla T4s; they are the decimal representation of the hex values. + DEVICE_ID: 7864 + VENDOR_ID: 4318 + PROFILE_NAME: grid_v100d-4c + # CAPV feature flags + EXP_NODE_ANTI_AFFINITY: "true" + # The following CAPV variables are used by multivc_test.go. They describe the second vSphere and should be set if the multivc test is enabled. 
+ VSPHERE2_SERVER: "vcenter2.vmware.com" + VSPHERE2_TLS_THUMBPRINT: "AA:BB:CC:DD:11:22:33:44:EE:FF" + VSPHERE2_RESOURCE_POOL: "ResourcePool" + VSPHERE2_TEMPLATE: "ubuntu-2004-kube-v1.27.3" + # Dedicated IP to be used by kube-vip + VSPHERE2_CONTROL_PLANE_ENDPOINT_IP: + # Following variables are also required and please use env variables to avoid disclosure of sensitive data + VSPHERE2_USERNAME: + VSPHERE2_PASSWORD: + + +intervals: + default/wait-controllers: ["5m", "10s"] + default/wait-cluster: ["5m", "10s"] + default/wait-control-plane: ["20m", "10s"] + default/wait-worker-nodes: ["20m", "10s"] + default/wait-delete-cluster: ["5m", "10s"] + default/wait-machine-upgrade: ["15m", "1m"] + default/wait-machine-remediation: ["15m", "10s"] + mhc-remediation/mhc-remediation: ["30m", "10s"] + node-drain/wait-deployment-available: ["3m", "10s"] + node-drain/wait-machine-deleted: ["2m", "10s"] + anti-affinity/wait-vm-redistribution: ["5m", "10s"] diff --git a/test/e2e/config/vsphere.yaml b/test/e2e/config/vsphere.yaml index 2aeb087684..d0f768d305 100644 --- a/test/e2e/config/vsphere.yaml +++ b/test/e2e/config/vsphere.yaml @@ -279,6 +279,7 @@ variables: # These IDs correspond to Tesla T4s, they are the decimal representation of the hex values. 
DEVICE_ID: 7864 VENDOR_ID: 4318 + PROFILE_NAME: grid_v100d-4c # CAPV feature flags EXP_NODE_ANTI_AFFINITY: "true" CAPI_DIAGNOSTICS_ADDRESS: ":8080" diff --git a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml new file mode 100644 index 0000000000..75b395b27b --- /dev/null +++ b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ../base +patchesStrategicMerge: + - vgpu-device-template.yaml diff --git a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml new file mode 100644 index 0000000000..4404df5f3f --- /dev/null +++ b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml @@ -0,0 +1,11 @@ +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: VSphereMachineTemplate +metadata: + name: ${CLUSTER_NAME}-worker + namespace: ${NAMESPACE} +spec: + template: + spec: + vgpuDevices: + - profileName: ${PROFILE_NAME} \ No newline at end of file From c5d2a78fcff6991d7e76ec4d61169081d7370884 Mon Sep 17 00:00:00 2001 From: Puneet Katyal <1063570+puneetkatyal@users.noreply.github.com> Date: Tue, 29 Aug 2023 13:43:42 -0700 Subject: [PATCH 02/21] Update pkg/services/govmomi/vcenter/clone.go Co-authored-by: Christian Schlotter --- pkg/services/govmomi/vcenter/clone.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go index e263ca3412..04151157a4 100644 --- a/pkg/services/govmomi/vcenter/clone.go +++ b/pkg/services/govmomi/vcenter/clone.go @@ -463,13 +463,12 @@ func getNetworkSpecs(ctx context.Context, vmCtx *capvcontext.VMContext, devices } func createPCIPassThroughDevice(deviceKey int32, backingInfo 
types.BaseVirtualDeviceBackingInfo) types.BaseVirtualDevice { - device := &types.VirtualPCIPassthrough{ + return &types.VirtualPCIPassthrough{ VirtualDevice: types.VirtualDevice{ Key: deviceKey, Backing: backingInfo, }, } - return device } func getGpuSpecs(ctx *context.VMContext) []types.BaseVirtualDeviceConfigSpec { From b85ad404260b41477c0d86d2711897136cc5fb18 Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Fri, 23 Feb 2024 11:01:05 +0100 Subject: [PATCH 03/21] Handle adding vGPU in reconcilePCIDevices instead of in Clone --- pkg/services/govmomi/pci/vgpu.go | 81 +++++++++++++++++++++++++++ pkg/services/govmomi/service.go | 34 +++++++++++ pkg/services/govmomi/vcenter/clone.go | 69 +---------------------- 3 files changed, 118 insertions(+), 66 deletions(-) create mode 100644 pkg/services/govmomi/pci/vgpu.go diff --git a/pkg/services/govmomi/pci/vgpu.go b/pkg/services/govmomi/pci/vgpu.go new file mode 100644 index 0000000000..e4053b54ff --- /dev/null +++ b/pkg/services/govmomi/pci/vgpu.go @@ -0,0 +1,81 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package pci + +import ( + "context" + + "github.com/vmware/govmomi/object" + "github.com/vmware/govmomi/vim25/types" + + infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1" +) + +// CalculateVGPUsToBeAdded calculates the vGPU devices which should be added to the VM. 
+func CalculateVGPUsToBeAdded(ctx context.Context, vm *object.VirtualMachine, deviceSpecs []infrav1.VGPUSpec) ([]infrav1.VGPUSpec, error) { + // store the number of expected devices for each vGPU profile name + deviceVendorIDComboMap := map[string]int{} + for _, spec := range deviceSpecs { + key := spec.ProfileName + if _, ok := deviceVendorIDComboMap[key]; !ok { + deviceVendorIDComboMap[key] = 1 + } else { + deviceVendorIDComboMap[key]++ + } + } + + devices, err := vm.Device(ctx) + if err != nil { + return nil, err + } + + specsToBeAdded := []infrav1.VGPUSpec{} + for _, spec := range deviceSpecs { + key := spec.ProfileName + pciDeviceList := devices.SelectByBackingInfo(createBackingInfoVGPU(spec)) + expectedDeviceLen := deviceVendorIDComboMap[key] + if expectedDeviceLen-len(pciDeviceList) > 0 { + specsToBeAdded = append(specsToBeAdded, spec) + deviceVendorIDComboMap[key]-- + } + } + return specsToBeAdded, nil +} + +// ConstructDeviceSpecsVGPU transforms a list of VGPUSpec into a list of BaseVirtualDevices used by govmomi. 
+func ConstructDeviceSpecsVGPU(vGPUDeviceSpecs []infrav1.VGPUSpec) []types.BaseVirtualDevice { + vGPUDevices := []types.BaseVirtualDevice{} + deviceKey := int32(-200) + + for _, pciDevice := range vGPUDeviceSpecs { + backingInfo := createBackingInfoVGPU(pciDevice) + vGPUDevices = append(vGPUDevices, &types.VirtualPCIPassthrough{ + VirtualDevice: types.VirtualDevice{ + Key: deviceKey, + Backing: backingInfo, + }, + }) + deviceKey-- + } + return vGPUDevices +} + +func createBackingInfoVGPU(spec infrav1.VGPUSpec) *types.VirtualPCIPassthroughVmiopBackingInfo { + return &types.VirtualPCIPassthroughVmiopBackingInfo{ + Vgpu: spec.ProfileName, + } +} diff --git a/pkg/services/govmomi/service.go b/pkg/services/govmomi/service.go index 92256bf2a5..e53358bbc0 100644 --- a/pkg/services/govmomi/service.go +++ b/pkg/services/govmomi/service.go @@ -538,6 +538,40 @@ func (vms *VMService) reconcilePCIDevices(ctx context.Context, virtualMachineCtx return errors.Wrapf(err, "error adding pci devices for %q", ctx) } } + if expectedVGPUs := virtualMachineCtx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices; len(expectedVGPUs) != 0 { + specsToBeAdded, err := pci.CalculateVGPUsToBeAdded(ctx, virtualMachineCtx.Obj, expectedVGPUs) + if err != nil { + return err + } + + if len(specsToBeAdded) == 0 { + if conditions.Has(virtualMachineCtx.VSphereVM, infrav1.PCIDevicesDetachedCondition) { + conditions.Delete(virtualMachineCtx.VSphereVM, infrav1.PCIDevicesDetachedCondition) + } + log.V(5).Info("No new PCI devices to be added") + return nil + } + + powerState, err := virtualMachineCtx.Obj.PowerState(ctx) + if err != nil { + return err + } + if powerState == types.VirtualMachinePowerStatePoweredOn { + // This would arise only when the PCI device is manually removed from + // the VM post creation. 
+ log.Info("vGPU device cannot be attached in powered on state") + conditions.MarkFalse(virtualMachineCtx.VSphereVM, + infrav1.PCIDevicesDetachedCondition, + infrav1.NotFoundReason, + clusterv1.ConditionSeverityWarning, + "vGPU devices removed after VM was powered on") + return errors.Errorf("missing vGPU devices") + } + log.Info("vGPU devices to be added", "number", len(specsToBeAdded)) + if err := virtualMachineCtx.Obj.AddDevice(ctx, pci.ConstructDeviceSpecsVGPU(specsToBeAdded)...); err != nil { + return errors.Wrapf(err, "error adding vGPU devices for %q", ctx) + } + } return nil } diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go index 04151157a4..87a2591cd0 100644 --- a/pkg/services/govmomi/vcenter/clone.go +++ b/pkg/services/govmomi/vcenter/clone.go @@ -68,9 +68,9 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by extraConfig.SetIgnitionUserData(bootstrapData) } } - if ctx.VSphereVM.Spec.CustomVMXKeys != nil { - ctx.Logger.Info("applied custom vmx keys to VM clone spec") - if err := extraConfig.SetCustomVMXKeys(ctx.VSphereVM.Spec.CustomVMXKeys); err != nil { + if vmCtx.VSphereVM.Spec.CustomVMXKeys != nil { + log.Info("applied custom vmx keys to VM clone spec") + if err := extraConfig.SetCustomVMXKeys(vmCtx.VSphereVM.Spec.CustomVMXKeys); err != nil { return err } } @@ -152,18 +152,6 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by deviceSpecs = append(deviceSpecs, networkSpecs...) - if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices) != 0 { - gpuSpecs := getGpuSpecs(ctx) - ctx.Logger.V(4).Info("created gpu devices", "gpu-device-specs", gpuSpecs) - deviceSpecs = append(deviceSpecs, gpuSpecs...) - } - - if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices) != 0 { - vgpuSpecs := getVgpuSpecs(ctx) - ctx.Logger.V(4).Info("created vgpu devices", "vgpu-device-specs", vgpuSpecs) - deviceSpecs = append(deviceSpecs, vgpuSpecs...) 
- } - numCPUs := vmCtx.VSphereVM.Spec.NumCPUs if numCPUs < 2 { numCPUs = 2 @@ -461,54 +449,3 @@ func getNetworkSpecs(ctx context.Context, vmCtx *capvcontext.VMContext, devices return deviceSpecs, nil } - -func createPCIPassThroughDevice(deviceKey int32, backingInfo types.BaseVirtualDeviceBackingInfo) types.BaseVirtualDevice { - return &types.VirtualPCIPassthrough{ - VirtualDevice: types.VirtualDevice{ - Key: deviceKey, - Backing: backingInfo, - }, - } -} - -func getGpuSpecs(ctx *context.VMContext) []types.BaseVirtualDeviceConfigSpec { - deviceSpecs := []types.BaseVirtualDeviceConfigSpec{} - deviceKey := int32(-200) - - for _, pciDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices { - backingInfo := &types.VirtualPCIPassthroughDynamicBackingInfo{ - AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{ - { - VendorId: *pciDevice.VendorID, - DeviceId: *pciDevice.DeviceID, - }, - }, - } - dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo) - deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{ - Device: dynamicDirectPathDevice, - Operation: types.VirtualDeviceConfigSpecOperationAdd, - }) - deviceKey-- - } - return deviceSpecs -} - -func getVgpuSpecs(ctx *context.VMContext) []types.BaseVirtualDeviceConfigSpec { - deviceSpecs := []types.BaseVirtualDeviceConfigSpec{} - deviceKey := int32(-200) - - for _, vGPUDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices { - backingInfo := &types.VirtualPCIPassthroughVmiopBackingInfo{ - Vgpu: vGPUDevice.ProfileName, - } - dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo) - deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{ - Device: dynamicDirectPathDevice, - Operation: types.VirtualDeviceConfigSpecOperationAdd, - }) - ctx.Logger.V(4).Info("created vGPU device", "vgpu-profile", vGPUDevice.ProfileName) - deviceKey-- - } - return deviceSpecs -} From 47d7d72e8d55db166b29370b2977b4033884b8bc Mon Sep 17 00:00:00 2001 
From: Birk Lewin Date: Fri, 23 Feb 2024 17:00:42 +0100 Subject: [PATCH 04/21] Duplicate PCI device test for vGPU test --- docs/gpu-vgpu.md | 3 +- pkg/services/govmomi/pci/vgpu_test.go | 156 ++++++++++++++++++++++++++ pkg/services/govmomi/vcenter/clone.go | 2 +- 3 files changed, 158 insertions(+), 3 deletions(-) create mode 100644 pkg/services/govmomi/pci/vgpu_test.go diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md index 3c5f546afb..4b1dc11d4c 100644 --- a/docs/gpu-vgpu.md +++ b/docs/gpu-vgpu.md @@ -56,7 +56,6 @@ We are using NVIDIA Tesla V100 32GB cards for this example and will use the `gri ```shell $ make dev-flavors -/Applications/Xcode.app/Contents/Developer/usr/bin/make generate-flavors FLAVOR_DIR=/Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0 go run ./packaging/flavorgen --output-dir /Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0 ``` @@ -91,7 +90,7 @@ spec: template: '${VSPHERE_TEMPLATE}' thumbprint: '${VSPHERE_TLS_THUMBPRINT}' vgpuDevices: - - profileName: "grid_v100d-4c" <============ value from above + - profileName: "grid_v100d-4c" # value from above ``` Set the required values for the other fields and the cluster template is ready for use. The similar changes can be made to a template generated using clusterctl generate cluster command as well. diff --git a/pkg/services/govmomi/pci/vgpu_test.go b/pkg/services/govmomi/pci/vgpu_test.go new file mode 100644 index 0000000000..5b4a572040 --- /dev/null +++ b/pkg/services/govmomi/pci/vgpu_test.go @@ -0,0 +1,156 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package pci + +import ( + "context" + "testing" + + "github.com/onsi/gomega" + "github.com/vmware/govmomi/find" + "github.com/vmware/govmomi/simulator" + "github.com/vmware/govmomi/vim25" + + infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1" +) + +func Test_CalculateVGPUsToBeAdded(t *testing.T) { + type input struct { + name string + expectedLen int + existingDeviceSpecIndexes []int + vGPUDeviceSpecs []infrav1.VGPUSpec + assertFunc func(g *gomega.WithT, actual []infrav1.VGPUSpec) + } + + testFunc := func(t *testing.T, i input) { + t.Helper() + t.Run(i.name, func(t *testing.T) { + g := gomega.NewWithT(t) + simulator.Run(func(ctx context.Context, client *vim25.Client) error { + finder := find.NewFinder(client) + vm, err := finder.VirtualMachine(ctx, "DC0_H0_VM0") + if err != nil { + return err + } + + if len(i.existingDeviceSpecIndexes) > 0 { + existingDevices := []infrav1.VGPUSpec{} + for _, idx := range i.existingDeviceSpecIndexes { + existingDevices = append(existingDevices, i.vGPUDeviceSpecs[idx]) + } + g.Expect(vm.AddDevice(ctx, + ConstructDeviceSpecsVGPU(existingDevices)...)).ToNot(gomega.HaveOccurred()) + } + toBeAdded, err := CalculateVGPUsToBeAdded(ctx, vm, i.vGPUDeviceSpecs) + g.Expect(err).ToNot(gomega.HaveOccurred()) + g.Expect(toBeAdded).To(gomega.HaveLen(i.expectedLen)) + if i.assertFunc != nil { + i.assertFunc(g, toBeAdded) + } + return nil + }) + }) + } + + t.Run("when no vGPU devices exist on the VM", func(t *testing.T) { + inputs := []input{ + { + name: "when adding a single vGPU device of each type", + expectedLen: 2, + 
vGPUDeviceSpecs: []infrav1.VGPUSpec{ + {ProfileName: "1234"}, {ProfileName: "4321"}, + }, + assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) { + g.Expect(actual[0].ProfileName).To(gomega.Equal("1234")) + g.Expect(actual[1].ProfileName).To(gomega.Equal("4321")) + }, + }, + { + name: "when adding multiple vGPU devices of a type", + expectedLen: 2, + vGPUDeviceSpecs: []infrav1.VGPUSpec{ + {ProfileName: "1234"}, {ProfileName: "1234"}, + }, + assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) { + g.Expect(actual[0].ProfileName).To(gomega.Equal("1234")) + g.Expect(actual[1].ProfileName).To(gomega.Equal("1234")) + }, + }, + } + for _, tt := range inputs { + testFunc(t, tt) + } + }) + + t.Run("when all vGPU devices exist on the VM", func(t *testing.T) { + inputs := []input{ + { + name: "when adding a single vGPU device of each type", + expectedLen: 0, + vGPUDeviceSpecs: []infrav1.VGPUSpec{ + {ProfileName: "1234"}, {ProfileName: "4321"}, + }, + existingDeviceSpecIndexes: []int{0, 1}, + }, + { + name: "when adding multiple vGPU devices of a type", + expectedLen: 0, + vGPUDeviceSpecs: []infrav1.VGPUSpec{ + {ProfileName: "1234"}, {ProfileName: "1234"}, + }, + existingDeviceSpecIndexes: []int{0, 1}, + }, + } + for _, tt := range inputs { + testFunc(t, tt) + } + }) + + t.Run("when some vGPU devices exist on the VM", func(t *testing.T) { + inputs := []input{ + { + name: "when adding a single vGPU device of each type", + expectedLen: 1, + vGPUDeviceSpecs: []infrav1.VGPUSpec{ + {ProfileName: "1234"}, {ProfileName: "4321"}, + }, + existingDeviceSpecIndexes: []int{0}, + assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) { + g.Expect(actual[0].ProfileName).To(gomega.Equal("4321")) + }, + }, + { + name: "when adding multiple vGPU devices of a type", + expectedLen: 2, + vGPUDeviceSpecs: []infrav1.VGPUSpec{ + {ProfileName: "1234"}, + {ProfileName: "1234"}, + {ProfileName: "4321"}, + }, + existingDeviceSpecIndexes: []int{0}, + assertFunc: func(g 
*gomega.WithT, actual []infrav1.VGPUSpec) { + g.Expect(actual[0].ProfileName).To(gomega.Equal("1234")) + g.Expect(actual[1].ProfileName).To(gomega.Equal("4321")) + }, + }, + } + for _, tt := range inputs { + testFunc(t, tt) + } + }) +} diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go index 87a2591cd0..b3b0f1af29 100644 --- a/pkg/services/govmomi/vcenter/clone.go +++ b/pkg/services/govmomi/vcenter/clone.go @@ -199,7 +199,7 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by // For PCI and vGPU devices, the memory for the VM needs to be reserved // We can replace this once we have another way of reserving memory option // exposed via the API types. - if len(vmCtx.VSphereVM.Spec.PciDevices) > 0 { + if len(vmCtx.VSphereVM.Spec.PciDevices) > 0 || len(vmCtx.VSphereVM.Spec.VGPUDevices) > 0 { spec.Config.MemoryReservationLockedToMax = ptr.To(true) } From 27933fad699bfa6a8c30cb16c25230c57419019c Mon Sep 17 00:00:00 2001 From: Birk Lewin <89076383+birksl@users.noreply.github.com> Date: Thu, 30 May 2024 13:17:20 +0200 Subject: [PATCH 05/21] Update docs/gpu-vgpu.md Co-authored-by: Christian Schlotter --- docs/gpu-vgpu.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md index 4b1dc11d4c..af6901d049 100644 --- a/docs/gpu-vgpu.md +++ b/docs/gpu-vgpu.md @@ -56,7 +56,7 @@ We are using NVIDIA Tesla V100 32GB cards for this example and will use the `gri ```shell $ make dev-flavors -go run ./packaging/flavorgen --output-dir /Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0 +go run ./packaging/flavorgen --output-dir /home/user/.cluster-api/overrides/infrastructure-vsphere/v0.0.0 ``` Edit the generated Cluster template (`cluster-template.yaml`) to set the values for the `vgpuDevices` array. Here we are editing the VSphereMachineTemplate object for the worker nodes. 
This will create a worker node with a single NVIDIA 16GB vGPU device attached to the VM. From 9f16da1beacd4b8ca2bbf8921b3c389e63c05c64 Mon Sep 17 00:00:00 2001 From: Birk Lewin <89076383+birksl@users.noreply.github.com> Date: Thu, 30 May 2024 13:17:32 +0200 Subject: [PATCH 06/21] Update pkg/services/govmomi/vcenter/clone.go Co-authored-by: Christian Schlotter --- pkg/services/govmomi/vcenter/clone.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go index b3b0f1af29..61126d7e43 100644 --- a/pkg/services/govmomi/vcenter/clone.go +++ b/pkg/services/govmomi/vcenter/clone.go @@ -69,7 +69,7 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by } } if vmCtx.VSphereVM.Spec.CustomVMXKeys != nil { - log.Info("applied custom vmx keys to VM clone spec") + log.Info("Applied custom vmx keys to VM clone spec") if err := extraConfig.SetCustomVMXKeys(vmCtx.VSphereVM.Spec.CustomVMXKeys); err != nil { return err } From cd250d7475859f445203ec79756f42514700628e Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Thu, 30 May 2024 15:01:29 +0200 Subject: [PATCH 07/21] Make VGPU directly part of PCI specs. 
--- apis/v1alpha3/conversion_test.go | 1 - apis/v1alpha3/zz_generated.conversion.go | 1 - apis/v1alpha4/zz_generated.conversion.go | 1 - apis/v1beta1/types.go | 17 +- apis/v1beta1/zz_generated.deepcopy.go | 20 --- ...ture.cluster.x-k8s.io_vspheremachines.yaml | 18 +- ...ster.x-k8s.io_vspheremachinetemplates.yaml | 20 +-- ...structure.cluster.x-k8s.io_vspherevms.yaml | 18 +- docs/gpu-vgpu.md | 6 +- pkg/services/govmomi/pci/device.go | 28 +++- pkg/services/govmomi/pci/device_test.go | 25 ++- pkg/services/govmomi/pci/vgpu.go | 81 --------- pkg/services/govmomi/pci/vgpu_test.go | 156 ------------------ pkg/services/govmomi/service.go | 34 ---- pkg/services/govmomi/vcenter/clone.go | 4 +- 15 files changed, 64 insertions(+), 366 deletions(-) delete mode 100644 pkg/services/govmomi/pci/vgpu.go delete mode 100644 pkg/services/govmomi/pci/vgpu_test.go diff --git a/apis/v1alpha3/conversion_test.go b/apis/v1alpha3/conversion_test.go index e956f34da2..732fae6f8b 100644 --- a/apis/v1alpha3/conversion_test.go +++ b/apis/v1alpha3/conversion_test.go @@ -120,7 +120,6 @@ func CustomSpecNewFieldFuzzer(in *infrav1.VirtualMachineCloneSpec, c fuzz.Contin c.FuzzNoCustom(in) in.PciDevices = nil - in.VGPUDevices = nil in.AdditionalDisksGiB = nil in.OS = "" in.HardwareVersion = "" diff --git a/apis/v1alpha3/zz_generated.conversion.go b/apis/v1alpha3/zz_generated.conversion.go index 0c8f7f2cb0..966195ccbb 100644 --- a/apis/v1alpha3/zz_generated.conversion.go +++ b/apis/v1alpha3/zz_generated.conversion.go @@ -1760,7 +1760,6 @@ func autoConvert_v1beta1_VirtualMachineCloneSpec_To_v1alpha3_VirtualMachineClone out.CustomVMXKeys = *(*map[string]string)(unsafe.Pointer(&in.CustomVMXKeys)) // WARNING: in.TagIDs requires manual conversion: does not exist in peer-type // WARNING: in.PciDevices requires manual conversion: does not exist in peer-type - // WARNING: in.VGPUDevices requires manual conversion: does not exist in peer-type // WARNING: in.OS requires manual conversion: does not exist in 
peer-type // WARNING: in.HardwareVersion requires manual conversion: does not exist in peer-type return nil diff --git a/apis/v1alpha4/zz_generated.conversion.go b/apis/v1alpha4/zz_generated.conversion.go index a18d2ecd3d..147c1a9894 100644 --- a/apis/v1alpha4/zz_generated.conversion.go +++ b/apis/v1alpha4/zz_generated.conversion.go @@ -1914,7 +1914,6 @@ func autoConvert_v1beta1_VirtualMachineCloneSpec_To_v1alpha4_VirtualMachineClone out.CustomVMXKeys = *(*map[string]string)(unsafe.Pointer(&in.CustomVMXKeys)) // WARNING: in.TagIDs requires manual conversion: does not exist in peer-type // WARNING: in.PciDevices requires manual conversion: does not exist in peer-type - // WARNING: in.VGPUDevices requires manual conversion: does not exist in peer-type // WARNING: in.OS requires manual conversion: does not exist in peer-type // WARNING: in.HardwareVersion requires manual conversion: does not exist in peer-type return nil diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go index 81c5882279..374084ce5d 100644 --- a/apis/v1beta1/types.go +++ b/apis/v1beta1/types.go @@ -193,9 +193,6 @@ type VirtualMachineCloneSpec struct { // PciDevices is the list of pci devices used by the virtual machine. // +optional PciDevices []PCIDeviceSpec `json:"pciDevices,omitempty"` - // VGPUDevices is the list of vGPUs used by the virtual machine. - // +optional - VGPUDevices []VGPUSpec `json:"vgpuDevices,omitempty"` // OS is the Operating System of the virtual machine // Defaults to Linux // +optional @@ -261,6 +258,11 @@ type PCIDeviceSpec struct { // virtual machine is cloned. // +kubebuilder:validation:Required VendorID *int32 `json:"vendorId,omitempty"` + // VGPUProfile is the profile name of a virtual machine's vGPU, in string. + // Defaults to the eponymous property value in the template from which the + // virtual machine is cloned. 
+ // +kubebuilder:validation:Required + VGPUProfile string `json:"vgpuProfile,omitempty"` // CustomLabel is the hardware label of a virtual machine's PCI device. // Defaults to the eponymous property value in the template from which the // virtual machine is cloned. @@ -268,15 +270,6 @@ type PCIDeviceSpec struct { CustomLabel string `json:"customLabel,omitempty"` } -// VGPUSpec defines virtual machine's VGPU configuration -type VGPUSpec struct { - // ProfileName is the ProfileName of a virtual machine's vGPU, in string. - // Defaults to the eponymous property value in the template from which the - // virtual machine is cloned. - // +kubebuilder:validation:Required - ProfileName string `json:"profileName,omitempty"` -} - // NetworkSpec defines the virtual machine's network configuration. type NetworkSpec struct { // Devices is the list of network devices used by the virtual machine. diff --git a/apis/v1beta1/zz_generated.deepcopy.go b/apis/v1beta1/zz_generated.deepcopy.go index c6b26cfb39..44d12a65fe 100644 --- a/apis/v1beta1/zz_generated.deepcopy.go +++ b/apis/v1beta1/zz_generated.deepcopy.go @@ -403,21 +403,6 @@ func (in *Topology) DeepCopy() *Topology { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *VGPUSpec) DeepCopyInto(out *VGPUSpec) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VGPUSpec. -func (in *VGPUSpec) DeepCopy() *VGPUSpec { - if in == nil { - return nil - } - out := new(VGPUSpec) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *VSphereCluster) DeepCopyInto(out *VSphereCluster) { *out = *in @@ -1336,11 +1321,6 @@ func (in *VirtualMachineCloneSpec) DeepCopyInto(out *VirtualMachineCloneSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - if in.VGPUDevices != nil { - in, out := &in.VGPUDevices, &out.VGPUDevices - *out = make([]VGPUSpec, len(*in)) - copy(*out, *in) - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineCloneSpec. diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml index eb46f63a03..84cd06f334 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml @@ -1379,6 +1379,11 @@ spec: virtual machine is cloned. format: int32 type: integer + vgpuProfile: + description: VGPUProfile is the VGPUProfile of a virtual machine's + vGPU, in string. Defaults to the eponymous property value + in the template from which the virtual machine is cloned. + type: string type: object type: array powerOffMode: @@ -1448,19 +1453,6 @@ spec: without TLS certificate validation of the communication between Cluster API Provider vSphere and the VMware vCenter server. type: string - vgpuDevices: - description: VGPUDevices is the list of vGPUs used by the virtual - machine. - items: - description: VGPUSpec defines virtual machine's VGPU configuration - properties: - profileName: - description: ProfileName is the ProfileName of a virtual machine's - vGPU, in string. Defaults to the eponymous property value - in the template from which the virtual machine is cloned. 
- type: string - type: object - type: array required: - network - template diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml index de5d4f2c62..27c270bffa 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml @@ -1254,6 +1254,12 @@ spec: virtual machine is cloned. format: int32 type: integer + vgpuProfile: + description: VGPUProfile is the VGPUProfile of a virtual + machine's vGPU, in string. Defaults to the eponymous + property value in the template from which the virtual + machine is cloned. + type: string type: object type: array powerOffMode: @@ -1323,20 +1329,6 @@ spec: without TLS certificate validation of the communication between Cluster API Provider vSphere and the VMware vCenter server. type: string - vgpuDevices: - description: VGPUDevices is the list of vGPUs used by the - virtual machine. - items: - description: VGPUSpec defines virtual machine's VGPU configuration - properties: - profileName: - description: ProfileName is the ProfileName of a virtual - machine's vGPU, in string. Defaults to the eponymous - property value in the template from which the virtual - machine is cloned. - type: string - type: object - type: array required: - network - template diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml index f6f2f80e71..96589edfdc 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml @@ -1467,6 +1467,11 @@ spec: virtual machine is cloned. 
format: int32 type: integer + vgpuProfile: + description: VGPUProfile is the VGPUProfile of a virtual machine's + vGPU, in string. Defaults to the eponymous property value + in the template from which the virtual machine is cloned. + type: string type: object type: array powerOffMode: @@ -1531,19 +1536,6 @@ spec: without TLS certificate validation of the communication between Cluster API Provider vSphere and the VMware vCenter server. type: string - vgpuDevices: - description: VGPUDevices is the list of vGPUs used by the virtual - machine. - items: - description: VGPUSpec defines virtual machine's VGPU configuration - properties: - profileName: - description: ProfileName is the ProfileName of a virtual machine's - vGPU, in string. Defaults to the eponymous property value - in the template from which the virtual machine is cloned. - type: string - type: object - type: array required: - network - template diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md index af6901d049..268aab1075 100644 --- a/docs/gpu-vgpu.md +++ b/docs/gpu-vgpu.md @@ -59,7 +59,7 @@ $ make dev-flavors go run ./packaging/flavorgen --output-dir /home/user/.cluster-api/overrides/infrastructure-vsphere/v0.0.0 ``` -Edit the generated Cluster template (`cluster-template.yaml`) to set the values for the `vgpuDevices` array. Here we are editing the VSphereMachineTemplate object for the worker nodes. This will create a worker node with a single NVIDIA 16GB vGPU device attached to the VM. +Edit the generated Cluster template (`cluster-template.yaml`) to set the values for the `pciDevices` array. Here we are editing the VSphereMachineTemplate object for the worker nodes. This will create a worker node with a single NVIDIA 16GB vGPU device attached to the VM. 
```yaml --- @@ -89,8 +89,8 @@ spec: storagePolicyName: '${VSPHERE_STORAGE_POLICY}' template: '${VSPHERE_TEMPLATE}' thumbprint: '${VSPHERE_TLS_THUMBPRINT}' - vgpuDevices: - - profileName: "grid_v100d-4c" # value from above + pciDevices: + - vgpuProfile: "grid_t4-1a" # value from above ``` Set the required values for the other fields and the cluster template is ready for use. The similar changes can be made to a template generated using clusterctl generate cluster command as well. diff --git a/pkg/services/govmomi/pci/device.go b/pkg/services/govmomi/pci/device.go index f92670bf9d..cc75f670bc 100644 --- a/pkg/services/govmomi/pci/device.go +++ b/pkg/services/govmomi/pci/device.go @@ -76,18 +76,28 @@ func ConstructDeviceSpecs(pciDeviceSpecs []infrav1.PCIDeviceSpec) []types.BaseVi return pciDevices } -func createBackingInfo(spec infrav1.PCIDeviceSpec) *types.VirtualPCIPassthroughDynamicBackingInfo { - return &types.VirtualPCIPassthroughDynamicBackingInfo{ - AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{ - { - VendorId: *spec.VendorID, - DeviceId: *spec.DeviceID, +func createBackingInfo(spec infrav1.PCIDeviceSpec) types.BaseVirtualDeviceBackingInfo { + if spec.VGPUProfile == "" { + return &types.VirtualPCIPassthroughDynamicBackingInfo{ + AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{ + { + VendorId: *spec.VendorID, + DeviceId: *spec.DeviceID, + }, }, - }, - CustomLabel: spec.CustomLabel, + CustomLabel: spec.CustomLabel, + } + } + + return &types.VirtualPCIPassthroughVmiopBackingInfo{ + Vgpu: spec.VGPUProfile, } } func constructKey(pciDeviceSpec infrav1.PCIDeviceSpec) string { - return fmt.Sprintf("%d-%d", *pciDeviceSpec.DeviceID, *pciDeviceSpec.VendorID) + if pciDeviceSpec.VGPUProfile == "" { + return fmt.Sprintf("%d-%d", *pciDeviceSpec.DeviceID, *pciDeviceSpec.VendorID) + } + + return pciDeviceSpec.VGPUProfile } diff --git a/pkg/services/govmomi/pci/device_test.go b/pkg/services/govmomi/pci/device_test.go index 74f57245c8..5f62089552 100644 --- 
a/pkg/services/govmomi/pci/device_test.go +++ b/pkg/services/govmomi/pci/device_test.go @@ -72,30 +72,36 @@ func Test_CalculateDevicesToBeAdded(t *testing.T) { inputs := []input{ { name: "when adding a single PCI device of each type", - expectedLen: 2, + expectedLen: 3, pciDeviceSpecs: []infrav1.PCIDeviceSpec{ {DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)}, {DeviceID: ptr.To[int32](4321), VendorID: ptr.To[int32](8765)}, + {VGPUProfile: "grid_t4-1a"}, }, assertFunc: func(g *gomega.WithT, actual []infrav1.PCIDeviceSpec) { g.Expect(*actual[0].DeviceID).To(gomega.Equal(int32(1234))) g.Expect(*actual[0].VendorID).To(gomega.Equal(int32(5678))) g.Expect(*actual[1].DeviceID).To(gomega.Equal(int32(4321))) g.Expect(*actual[1].VendorID).To(gomega.Equal(int32(8765))) + g.Expect(actual[2].VGPUProfile).To(gomega.Equal("grid_t4-1a")) }, }, { name: "when adding multiple PCI devices of a type", - expectedLen: 2, + expectedLen: 4, pciDeviceSpecs: []infrav1.PCIDeviceSpec{ {DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)}, {DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)}, + {VGPUProfile: "grid_t4-1a"}, + {VGPUProfile: "grid_t4-1a"}, }, assertFunc: func(g *gomega.WithT, actual []infrav1.PCIDeviceSpec) { g.Expect(*actual[0].DeviceID).To(gomega.Equal(int32(1234))) g.Expect(*actual[0].VendorID).To(gomega.Equal(int32(5678))) g.Expect(*actual[1].DeviceID).To(gomega.Equal(int32(1234))) g.Expect(*actual[1].VendorID).To(gomega.Equal(int32(5678))) + g.Expect(actual[2].VGPUProfile).To(gomega.Equal("grid_t4-1a")) + g.Expect(actual[3].VGPUProfile).To(gomega.Equal("grid_t4-1a")) }, }, } @@ -112,8 +118,9 @@ func Test_CalculateDevicesToBeAdded(t *testing.T) { pciDeviceSpecs: []infrav1.PCIDeviceSpec{ {DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)}, {DeviceID: ptr.To[int32](4321), VendorID: ptr.To[int32](8765)}, + {VGPUProfile: "grid_t4-1a"}, }, - existingDeviceSpecIndexes: []int{0, 1}, + existingDeviceSpecIndexes: []int{0, 1, 2}, }, { name: 
"when adding multiple PCI devices of a type", @@ -121,8 +128,10 @@ func Test_CalculateDevicesToBeAdded(t *testing.T) { pciDeviceSpecs: []infrav1.PCIDeviceSpec{ {DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)}, {DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)}, + {VGPUProfile: "grid_t4-1a"}, + {VGPUProfile: "grid_t4-1a"}, }, - existingDeviceSpecIndexes: []int{0, 1}, + existingDeviceSpecIndexes: []int{0, 1, 2, 3}, }, } for _, tt := range inputs { @@ -134,24 +143,27 @@ func Test_CalculateDevicesToBeAdded(t *testing.T) { inputs := []input{ { name: "when adding a single PCI device of each type", - expectedLen: 1, + expectedLen: 2, pciDeviceSpecs: []infrav1.PCIDeviceSpec{ {DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)}, {DeviceID: ptr.To[int32](4321), VendorID: ptr.To[int32](8765)}, + {VGPUProfile: "grid_t4-1a"}, }, existingDeviceSpecIndexes: []int{0}, assertFunc: func(g *gomega.WithT, actual []infrav1.PCIDeviceSpec) { g.Expect(*actual[0].DeviceID).To(gomega.Equal(int32(4321))) g.Expect(*actual[0].VendorID).To(gomega.Equal(int32(8765))) + g.Expect(actual[1].VGPUProfile).To(gomega.Equal("grid_t4-1a")) }, }, { name: "when adding multiple PCI devices of a type", - expectedLen: 2, + expectedLen: 3, pciDeviceSpecs: []infrav1.PCIDeviceSpec{ {DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)}, {DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)}, {DeviceID: ptr.To[int32](4321), VendorID: ptr.To[int32](8765)}, + {VGPUProfile: "grid_t4-1a"}, }, existingDeviceSpecIndexes: []int{0}, assertFunc: func(g *gomega.WithT, actual []infrav1.PCIDeviceSpec) { @@ -159,6 +171,7 @@ func Test_CalculateDevicesToBeAdded(t *testing.T) { g.Expect(*actual[0].VendorID).To(gomega.Equal(int32(5678))) g.Expect(*actual[1].DeviceID).To(gomega.Equal(int32(4321))) g.Expect(*actual[1].VendorID).To(gomega.Equal(int32(8765))) + g.Expect(actual[2].VGPUProfile).To(gomega.Equal("grid_t4-1a")) }, }, } diff --git a/pkg/services/govmomi/pci/vgpu.go 
b/pkg/services/govmomi/pci/vgpu.go deleted file mode 100644 index e4053b54ff..0000000000 --- a/pkg/services/govmomi/pci/vgpu.go +++ /dev/null @@ -1,81 +0,0 @@ -/* -Copyright 2023 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package pci - -import ( - "context" - - "github.com/vmware/govmomi/object" - "github.com/vmware/govmomi/vim25/types" - - infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1" -) - -// CalculateVGPUsToBeAdded calculates the vGPU devices which should be added to the VM. 
-func CalculateVGPUsToBeAdded(ctx context.Context, vm *object.VirtualMachine, deviceSpecs []infrav1.VGPUSpec) ([]infrav1.VGPUSpec, error) { - // store the number of expected devices for each deviceID + vendorID combo - deviceVendorIDComboMap := map[string]int{} - for _, spec := range deviceSpecs { - key := spec.ProfileName - if _, ok := deviceVendorIDComboMap[key]; !ok { - deviceVendorIDComboMap[key] = 1 - } else { - deviceVendorIDComboMap[key]++ - } - } - - devices, err := vm.Device(ctx) - if err != nil { - return nil, err - } - - specsToBeAdded := []infrav1.VGPUSpec{} - for _, spec := range deviceSpecs { - key := spec.ProfileName - pciDeviceList := devices.SelectByBackingInfo(createBackingInfoVGPU(spec)) - expectedDeviceLen := deviceVendorIDComboMap[key] - if expectedDeviceLen-len(pciDeviceList) > 0 { - specsToBeAdded = append(specsToBeAdded, spec) - deviceVendorIDComboMap[key]-- - } - } - return specsToBeAdded, nil -} - -// ConstructDeviceSpecsVGPU transforms a list of VGPUSpec into a list of BaseVirutalDevices used by govmomi. -func ConstructDeviceSpecsVGPU(vGPUDeviceSpecs []infrav1.VGPUSpec) []types.BaseVirtualDevice { - vGPUDevices := []types.BaseVirtualDevice{} - deviceKey := int32(-200) - - for _, pciDevice := range vGPUDeviceSpecs { - backingInfo := createBackingInfoVGPU(pciDevice) - vGPUDevices = append(vGPUDevices, &types.VirtualPCIPassthrough{ - VirtualDevice: types.VirtualDevice{ - Key: deviceKey, - Backing: backingInfo, - }, - }) - deviceKey-- - } - return vGPUDevices -} - -func createBackingInfoVGPU(spec infrav1.VGPUSpec) *types.VirtualPCIPassthroughVmiopBackingInfo { - return &types.VirtualPCIPassthroughVmiopBackingInfo{ - Vgpu: spec.ProfileName, - } -} diff --git a/pkg/services/govmomi/pci/vgpu_test.go b/pkg/services/govmomi/pci/vgpu_test.go deleted file mode 100644 index 5b4a572040..0000000000 --- a/pkg/services/govmomi/pci/vgpu_test.go +++ /dev/null @@ -1,156 +0,0 @@ -/* -Copyright 2023 The Kubernetes Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package pci - -import ( - "context" - "testing" - - "github.com/onsi/gomega" - "github.com/vmware/govmomi/find" - "github.com/vmware/govmomi/simulator" - "github.com/vmware/govmomi/vim25" - - infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1" -) - -func Test_CalculateVGPUsToBeAdded(t *testing.T) { - type input struct { - name string - expectedLen int - existingDeviceSpecIndexes []int - vGPUDeviceSpecs []infrav1.VGPUSpec - assertFunc func(g *gomega.WithT, actual []infrav1.VGPUSpec) - } - - testFunc := func(t *testing.T, i input) { - t.Helper() - t.Run(i.name, func(t *testing.T) { - g := gomega.NewWithT(t) - simulator.Run(func(ctx context.Context, client *vim25.Client) error { - finder := find.NewFinder(client) - vm, err := finder.VirtualMachine(ctx, "DC0_H0_VM0") - if err != nil { - return err - } - - if len(i.existingDeviceSpecIndexes) > 0 { - existingDevices := []infrav1.VGPUSpec{} - for _, idx := range i.existingDeviceSpecIndexes { - existingDevices = append(existingDevices, i.vGPUDeviceSpecs[idx]) - } - g.Expect(vm.AddDevice(ctx, - ConstructDeviceSpecsVGPU(existingDevices)...)).ToNot(gomega.HaveOccurred()) - } - toBeAdded, err := CalculateVGPUsToBeAdded(ctx, vm, i.vGPUDeviceSpecs) - g.Expect(err).ToNot(gomega.HaveOccurred()) - g.Expect(toBeAdded).To(gomega.HaveLen(i.expectedLen)) - if i.assertFunc != nil { - i.assertFunc(g, toBeAdded) - } - return nil - }) - }) - } - - t.Run("when no vGPU devices exist on the VM", 
func(t *testing.T) { - inputs := []input{ - { - name: "when adding a single vGPU device of each type", - expectedLen: 2, - vGPUDeviceSpecs: []infrav1.VGPUSpec{ - {ProfileName: "1234"}, {ProfileName: "4321"}, - }, - assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) { - g.Expect(actual[0].ProfileName).To(gomega.Equal("1234")) - g.Expect(actual[1].ProfileName).To(gomega.Equal("4321")) - }, - }, - { - name: "when adding multiple vGPU devices of a type", - expectedLen: 2, - vGPUDeviceSpecs: []infrav1.VGPUSpec{ - {ProfileName: "1234"}, {ProfileName: "1234"}, - }, - assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) { - g.Expect(actual[0].ProfileName).To(gomega.Equal("1234")) - g.Expect(actual[1].ProfileName).To(gomega.Equal("1234")) - }, - }, - } - for _, tt := range inputs { - testFunc(t, tt) - } - }) - - t.Run("when all vGPU devices exist on the VM", func(t *testing.T) { - inputs := []input{ - { - name: "when adding a single vGPU device of each type", - expectedLen: 0, - vGPUDeviceSpecs: []infrav1.VGPUSpec{ - {ProfileName: "1234"}, {ProfileName: "4321"}, - }, - existingDeviceSpecIndexes: []int{0, 1}, - }, - { - name: "when adding multiple vGPU devices of a type", - expectedLen: 0, - vGPUDeviceSpecs: []infrav1.VGPUSpec{ - {ProfileName: "1234"}, {ProfileName: "1234"}, - }, - existingDeviceSpecIndexes: []int{0, 1}, - }, - } - for _, tt := range inputs { - testFunc(t, tt) - } - }) - - t.Run("when some vGPU devices exist on the VM", func(t *testing.T) { - inputs := []input{ - { - name: "when adding a single vGPU device of each type", - expectedLen: 1, - vGPUDeviceSpecs: []infrav1.VGPUSpec{ - {ProfileName: "1234"}, {ProfileName: "4321"}, - }, - existingDeviceSpecIndexes: []int{0}, - assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) { - g.Expect(actual[0].ProfileName).To(gomega.Equal("4321")) - }, - }, - { - name: "when adding multiple vGPU devices of a type", - expectedLen: 2, - vGPUDeviceSpecs: []infrav1.VGPUSpec{ - {ProfileName: "1234"}, - 
{ProfileName: "1234"}, - {ProfileName: "4321"}, - }, - existingDeviceSpecIndexes: []int{0}, - assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) { - g.Expect(actual[0].ProfileName).To(gomega.Equal("1234")) - g.Expect(actual[1].ProfileName).To(gomega.Equal("4321")) - }, - }, - } - for _, tt := range inputs { - testFunc(t, tt) - } - }) -} diff --git a/pkg/services/govmomi/service.go b/pkg/services/govmomi/service.go index e53358bbc0..92256bf2a5 100644 --- a/pkg/services/govmomi/service.go +++ b/pkg/services/govmomi/service.go @@ -538,40 +538,6 @@ func (vms *VMService) reconcilePCIDevices(ctx context.Context, virtualMachineCtx return errors.Wrapf(err, "error adding pci devices for %q", ctx) } } - if expectedVGPUs := virtualMachineCtx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices; len(expectedVGPUs) != 0 { - specsToBeAdded, err := pci.CalculateVGPUsToBeAdded(ctx, virtualMachineCtx.Obj, expectedVGPUs) - if err != nil { - return err - } - - if len(specsToBeAdded) == 0 { - if conditions.Has(virtualMachineCtx.VSphereVM, infrav1.PCIDevicesDetachedCondition) { - conditions.Delete(virtualMachineCtx.VSphereVM, infrav1.PCIDevicesDetachedCondition) - } - log.V(5).Info("No new PCI devices to be added") - return nil - } - - powerState, err := virtualMachineCtx.Obj.PowerState(ctx) - if err != nil { - return err - } - if powerState == types.VirtualMachinePowerStatePoweredOn { - // This would arise only when the PCI device is manually removed from - // the VM post creation. 
- log.Info("vGPU device cannot be attached in powered on state") - conditions.MarkFalse(virtualMachineCtx.VSphereVM, - infrav1.PCIDevicesDetachedCondition, - infrav1.NotFoundReason, - clusterv1.ConditionSeverityWarning, - "vGPU devices removed after VM was powered on") - return errors.Errorf("missing vGPU devices") - } - log.Info("vGPU devices to be added", "number", len(specsToBeAdded)) - if err := virtualMachineCtx.Obj.AddDevice(ctx, pci.ConstructDeviceSpecsVGPU(specsToBeAdded)...); err != nil { - return errors.Wrapf(err, "error adding vGPU devices for %q", ctx) - } - } return nil } diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go index 61126d7e43..0334e247e1 100644 --- a/pkg/services/govmomi/vcenter/clone.go +++ b/pkg/services/govmomi/vcenter/clone.go @@ -196,10 +196,10 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by Snapshot: snapshotRef, } - // For PCI and vGPU devices, the memory for the VM needs to be reserved + // For PCI devices, the memory for the VM needs to be reserved // We can replace this once we have another way of reserving memory option // exposed via the API types. 
- if len(vmCtx.VSphereVM.Spec.PciDevices) > 0 || len(vmCtx.VSphereVM.Spec.VGPUDevices) > 0 { + if len(vmCtx.VSphereVM.Spec.PciDevices) > 0 { spec.Config.MemoryReservationLockedToMax = ptr.To(true) } From 1e9a6e1fecd4835a5321bea5b3d49ad1e8b47066 Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Thu, 30 May 2024 15:11:43 +0200 Subject: [PATCH 08/21] Remove outdated vGPU e2e setup --- Makefile | 2 -- test/e2e/config/vsphere-dev.yaml | 1 - .../main/vgpu/kustomization.yaml | 6 ------ .../main/vgpu/vgpu-device-template.yaml | 11 ----------- 4 files changed, 20 deletions(-) delete mode 100644 test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml delete mode 100644 test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml diff --git a/Makefile b/Makefile index bb5d78b839..2eb00f4911 100644 --- a/Makefile +++ b/Makefile @@ -384,8 +384,6 @@ generate-e2e-templates-main: $(KUSTOMIZE) ## Generate test templates for the mai "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/install-on-bootstrap" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-install-on-bootstrap.yaml" # for PCI passthrough template "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/pci" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-pci.yaml" - # for vGPU template - "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/vgpu" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-vgpu.yaml" # for DHCP overrides "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/dhcp-overrides" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-dhcp-overrides.yaml" "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/ownerrefs-finalizers" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-ownerrefs-finalizers.yaml" diff --git a/test/e2e/config/vsphere-dev.yaml
b/test/e2e/config/vsphere-dev.yaml index d5a4df0026..4a1d89e7f3 100644 --- a/test/e2e/config/vsphere-dev.yaml +++ b/test/e2e/config/vsphere-dev.yaml @@ -1,4 +1,3 @@ ---- # E2E test scenario using local dev images and manifests built from the source tree for following providers: # - cluster-api # - bootstrap kubeadm diff --git a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml deleted file mode 100644 index 75b395b27b..0000000000 --- a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -resources: - - ../base -patchesStrategicMerge: - - vgpu-device-template.yaml diff --git a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml deleted file mode 100644 index 4404df5f3f..0000000000 --- a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml +++ /dev/null @@ -1,11 +0,0 @@ ---- -apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 -kind: VSphereMachineTemplate -metadata: - name: ${CLUSTER_NAME}-worker - namespace: ${NAMESPACE} -spec: - template: - spec: - vgpuDevices: - - profileName: ${PROFILE_NAME} \ No newline at end of file From 6e8d105561d7c51bb0477d8c5ee6351065283eca Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Thu, 30 May 2024 15:34:47 +0200 Subject: [PATCH 09/21] Fix nil-pointer mistake --- pkg/services/govmomi/pci/device.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/services/govmomi/pci/device.go b/pkg/services/govmomi/pci/device.go index cc75f670bc..a6e1a1132b 100644 --- a/pkg/services/govmomi/pci/device.go +++ b/pkg/services/govmomi/pci/device.go @@ -77,7 +77,7 @@ func ConstructDeviceSpecs(pciDeviceSpecs []infrav1.PCIDeviceSpec) []types.BaseVi } func createBackingInfo(spec 
infrav1.PCIDeviceSpec) types.BaseVirtualDeviceBackingInfo { - if spec.VGPUProfile == "" { + if spec.DeviceID != nil && spec.VendorID != nil { return &types.VirtualPCIPassthroughDynamicBackingInfo{ AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{ { @@ -95,7 +95,7 @@ func createBackingInfo(spec infrav1.PCIDeviceSpec) types.BaseVirtualDeviceBackin } func constructKey(pciDeviceSpec infrav1.PCIDeviceSpec) string { - if pciDeviceSpec.VGPUProfile == "" { + if pciDeviceSpec.DeviceID != nil && pciDeviceSpec.VendorID != nil { return fmt.Sprintf("%d-%d", *pciDeviceSpec.DeviceID, *pciDeviceSpec.VendorID) } From c7f4781067b2f356089b3b6dd812588982d23a77 Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Fri, 31 May 2024 15:22:19 +0200 Subject: [PATCH 10/21] Webhook validation for vgpuProfile --- apis/v1beta1/types.go | 3 + ...ture.cluster.x-k8s.io_vspheremachines.yaml | 3 +- ...ster.x-k8s.io_vspheremachinetemplates.yaml | 9 +- ...structure.cluster.x-k8s.io_vspherevms.yaml | 5 +- internal/webhooks/vspheremachinetemplate.go | 7 + test/e2e/config/vsphere-dev.yaml | 163 ------------------ test/e2e/config/vsphere.yaml | 1 - 7 files changed, 21 insertions(+), 170 deletions(-) delete mode 100644 test/e2e/config/vsphere-dev.yaml diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go index 374084ce5d..d542adf722 100644 --- a/apis/v1beta1/types.go +++ b/apis/v1beta1/types.go @@ -251,16 +251,19 @@ type PCIDeviceSpec struct { // DeviceID is the device ID of a virtual machine's PCI, in integer. // Defaults to the eponymous property value in the template from which the // virtual machine is cloned. + // Mutually exclusive with VGPUProfile. // +kubebuilder:validation:Required DeviceID *int32 `json:"deviceId,omitempty"` // VendorId is the vendor ID of a virtual machine's PCI, in integer. // Defaults to the eponymous property value in the template from which the // virtual machine is cloned. + // Mutually exclusive with VGPUProfile. 
// +kubebuilder:validation:Required VendorID *int32 `json:"vendorId,omitempty"` // VGPUProfile is the profile name of a virtual machine's vGPU, in string. // Defaults to the eponymous property value in the template from which the // virtual machine is cloned. + // Mutually exclusive with DeviceID and VendorID. // +kubebuilder:validation:Required VGPUProfile string `json:"vgpuProfile,omitempty"` // CustomLabel is the hardware label of a virtual machine's PCI device. diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml index 84cd06f334..4dc55498e0 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml @@ -1380,9 +1380,10 @@ spec: format: int32 type: integer vgpuProfile: - description: VGPUProfile is the VGPUProfile of a virtual machine's + description: VGPUProfile is the profile name of a virtual machine's vGPU, in string. Defaults to the eponymous property value in the template from which the virtual machine is cloned. + Mutually exclusive with DeviceID and VendorID. type: string type: object type: array diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml index 27c270bffa..bc8d15216d 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml @@ -1244,21 +1244,22 @@ spec: description: |- DeviceID is the device ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the - virtual machine is cloned. + virtual machine is cloned. Mutually exclusive with VGPUProfile. 
format: int32 type: integer vendorId: description: |- VendorId is the vendor ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the - virtual machine is cloned. + virtual machine is cloned. Mutually exclusive with VGPUProfile. format: int32 type: integer vgpuProfile: - description: VGPUProfile is the VGPUProfile of a virtual + description: VGPUProfile is the profile name of a virtual machine's vGPU, in string. Defaults to the eponymous property value in the template from which the virtual - machine is cloned. + machine is cloned. Mutually exclusive with DeviceID + and VendorID. type: string type: object type: array diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml index 96589edfdc..6ce31f5d74 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml @@ -1458,6 +1458,7 @@ spec: DeviceID is the device ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the virtual machine is cloned. + Mutually exclusive with VGPUProfile. format: int32 type: integer vendorId: @@ -1465,12 +1466,14 @@ spec: VendorId is the vendor ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the virtual machine is cloned. + Mutually exclusive with VGPUProfile. format: int32 type: integer vgpuProfile: - description: VGPUProfile is the VGPUProfile of a virtual machine's + description: VGPUProfile is the profile name of a virtual machine's vGPU, in string. Defaults to the eponymous property value in the template from which the virtual machine is cloned. + Mutually exclusive with DeviceID and VendorID. 
type: string type: object type: array diff --git a/internal/webhooks/vspheremachinetemplate.go b/internal/webhooks/vspheremachinetemplate.go index a88af09892..e787f3289a 100644 --- a/internal/webhooks/vspheremachinetemplate.go +++ b/internal/webhooks/vspheremachinetemplate.go @@ -84,6 +84,13 @@ func (webhook *VSphereMachineTemplateWebhook) ValidateCreate(_ context.Context, allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "guestSoftPowerOffTimeout"), spec.GuestSoftPowerOffTimeout, "should be greater than 0")) } } + for _, device := range spec.PciDevices { + hasVGPU := device.VGPUProfile != "" + hasPCI := device.DeviceID != nil && device.VendorID != nil + if (hasPCI && hasVGPU) || (!hasPCI && !hasVGPU) { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices"), spec.PciDevices, "should have either deviceID + vendorID or vgpuProfile")) + } + } return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs) } diff --git a/test/e2e/config/vsphere-dev.yaml b/test/e2e/config/vsphere-dev.yaml deleted file mode 100644 index 4a1d89e7f3..0000000000 --- a/test/e2e/config/vsphere-dev.yaml +++ /dev/null @@ -1,163 +0,0 @@ -# E2E test scenario using local dev images and manifests built from the source tree for following providers: -# - cluster-api -# - bootstrap kubeadm -# - control-plane kubeadm -# - vsphere - -# For creating local dev images built from the source tree; -# - from the CAPI repository root, `make docker-build REGISTRY=gcr.io/k8s-staging-cluster-api` to build the cluster-api, -# bootstrap kubeadm, control-plane kubeadm provider images. This step can be skipped to use upstream images. -# - from the CAPV repository root, `make e2e` to build the vsphere provider image and run e2e tests. 
- -images: - - name: registry.k8s.io/cluster-api/cluster-api-controller:v1.5.0 - loadBehavior: tryLoad - - name: registry.k8s.io/cluster-api/kubeadm-bootstrap-controller:v1.5.0 - loadBehavior: tryLoad - - name: registry.k8s.io/cluster-api/kubeadm-control-plane-controller:v1.5.0 - loadBehavior: tryLoad - - name: gcr.io/k8s-staging-cluster-api/capv-manager:e2e - loadBehavior: mustLoad - - name: quay.io/jetstack/cert-manager-cainjector:v1.12.2 - loadBehavior: tryLoad - - name: quay.io/jetstack/cert-manager-webhook:v1.12.2 - loadBehavior: tryLoad - - name: quay.io/jetstack/cert-manager-controller:v1.12.2 - loadBehavior: tryLoad - -providers: - - - name: cluster-api - type: CoreProvider - versions: - - name: v1.5.0 - # Use manifest from source files - value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/core-components.yaml" - type: "url" - contract: v1beta1 - files: - - sourcePath: "../data/shared/main/v1beta1/metadata.yaml" - replacements: - - old: "imagePullPolicy: Always" - new: "imagePullPolicy: IfNotPresent" - - - name: kubeadm - type: BootstrapProvider - versions: - - name: v1.5.0 - # Use manifest from source files - value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/bootstrap-components.yaml" - type: "url" - contract: v1beta1 - files: - - sourcePath: "../data/shared/main/v1beta1/metadata.yaml" - replacements: - - old: "imagePullPolicy: Always" - new: "imagePullPolicy: IfNotPresent" - - - name: kubeadm - type: ControlPlaneProvider - versions: - - name: v1.5.0 - # Use manifest from source files - value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/control-plane-components.yaml" - type: "url" - contract: v1beta1 - files: - - sourcePath: "../data/shared/main/v1beta1/metadata.yaml" - replacements: - - old: "imagePullPolicy: Always" - new: "imagePullPolicy: IfNotPresent" - - - name: vsphere - type: InfrastructureProvider - versions: - - name: v1.9.99 - # Use manifest from source files 
- value: ../../../../cluster-api-provider-vsphere/config/default - contract: v1beta1 - replacements: - - old: gcr.io/cluster-api-provider-vsphere/release/manager:latest - new: gcr.io/k8s-staging-cluster-api/capv-manager:e2e - - old: "imagePullPolicy: Always" - new: "imagePullPolicy: IfNotPresent" - files: - # Add a cluster template - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-conformance.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-hw-upgrade.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-kcp-remediation.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-md-remediation.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-node-drain.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-pci.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-remote-management.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-storage-policy.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-topology.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-dhcp-overrides.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/clusterclass-quick-start.yaml" - - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-ignition.yaml" - - sourcePath: "../data/shared/main/v1beta1_provider/metadata.yaml" - -variables: - KUBERNETES_VERSION: "v1.28.0" - CPI_IMAGE_K8S_VERSION: "v1.27.0" - CNI: "./data/cni/calico/calico.yaml" - EXP_CLUSTER_RESOURCE_SET: "true" - EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION: "true" - CONTROL_PLANE_MACHINE_COUNT: 1 - WORKER_MACHINE_COUNT: 1 - 
IP_FAMILY: "IPv4" - CLUSTER_CLASS_NAME: "quick-start" - # Following CAPV variables should be set before testing - VSPHERE_SERVER: "vcenter.vmware.com" - VSPHERE_TLS_THUMBPRINT: "AA:BB:CC:DD:11:22:33:44:EE:FF" - VSPHERE_DATACENTER: "SDDC-Datacenter" - VSPHERE_FOLDER: "FolderName" - VSPHERE_RESOURCE_POOL: "ResourcePool" - VSPHERE_DATASTORE: "WorkloadDatastore" - VSPHERE_STORAGE_POLICY: "Cluster API vSphere Storage Policy" - VSPHERE_NETWORK: "network-1" - VSPHERE_TEMPLATE: "ubuntu-2204-kube-v1.28.0" - FLATCAR_VSPHERE_TEMPLATE: "flatcar-stable-3510.2.6-kube-v1.28.0" - # WORKLOAD_CONTROL_PLANE_ENDPOINT_IP: - # Also following variables are required but it is recommended to use env variables to avoid disclosure of sensitive data - # VSPHERE_SSH_AUTHORIZED_KEY: - # VSPHERE_PASSWORD: - # VSPHERE_USERNAME: - # Dedicated IP to be used by kube-vip - # CONTROL_PLANE_ENDPOINT_IP: - # Sets the insecure-flag for vsphere-csi-controller config - VSPHERE_INSECURE_CSI: "true" - KUBETEST_CONFIGURATION: "./data/kubetest/conformance-fast.yaml" - NODE_DRAIN_TIMEOUT: "60s" - CLUSTER_TOPOLOGY: "true" - # These IDs correspond to Tesla T4s, they are the decimal representation of the hex values. - DEVICE_ID: 7864 - VENDOR_ID: 4318 - PROFILE_NAME: grid_v100d-4c - # CAPV feature flags - EXP_NODE_ANTI_AFFINITY: "true" - # Following CAPV variables is used for multivc_test.go. This is the second VSphere and should be set if multivc test is enabled. 
- VSPHERE2_SERVER: "vcenter2.vmware.com" - VSPHERE2_TLS_THUMBPRINT: "AA:BB:CC:DD:11:22:33:44:EE:FF" - VSPHERE2_RESOURCE_POOL: "ResourcePool" - VSPHERE2_TEMPLATE: "ubuntu-2004-kube-v1.27.3" - # Dedicated IP to be used by kube-vip - VSPHERE2_CONTROL_PLANE_ENDPOINT_IP: - # Following variables are also required and please use env variables to avoid disclosure of sensitive data - VSPHERE2_USERNAME: - VSPHERE2_PASSWORD: - - -intervals: - default/wait-controllers: ["5m", "10s"] - default/wait-cluster: ["5m", "10s"] - default/wait-control-plane: ["20m", "10s"] - default/wait-worker-nodes: ["20m", "10s"] - default/wait-delete-cluster: ["5m", "10s"] - default/wait-machine-upgrade: ["15m", "1m"] - default/wait-machine-remediation: ["15m", "10s"] - mhc-remediation/mhc-remediation: ["30m", "10s"] - node-drain/wait-deployment-available: ["3m", "10s"] - node-drain/wait-machine-deleted: ["2m", "10s"] - anti-affinity/wait-vm-redistribution: ["5m", "10s"] diff --git a/test/e2e/config/vsphere.yaml b/test/e2e/config/vsphere.yaml index d0f768d305..2aeb087684 100644 --- a/test/e2e/config/vsphere.yaml +++ b/test/e2e/config/vsphere.yaml @@ -279,7 +279,6 @@ variables: # These IDs correspond to Tesla T4s, they are the decimal representation of the hex values. 
DEVICE_ID: 7864 VENDOR_ID: 4318 - PROFILE_NAME: grid_v100d-4c # CAPV feature flags EXP_NODE_ANTI_AFFINITY: "true" CAPI_DIAGNOSTICS_ADDRESS: ":8080" From 1353b4db60457683e869ee234d70782484f9caf7 Mon Sep 17 00:00:00 2001 From: Birk Lewin <89076383+birksl@users.noreply.github.com> Date: Fri, 31 May 2024 18:15:59 +0200 Subject: [PATCH 11/21] Update internal/webhooks/vspheremachinetemplate.go Co-authored-by: Christian Schlotter --- internal/webhooks/vspheremachinetemplate.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/webhooks/vspheremachinetemplate.go b/internal/webhooks/vspheremachinetemplate.go index e787f3289a..e72aeb5c5e 100644 --- a/internal/webhooks/vspheremachinetemplate.go +++ b/internal/webhooks/vspheremachinetemplate.go @@ -84,11 +84,11 @@ func (webhook *VSphereMachineTemplateWebhook) ValidateCreate(_ context.Context, allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "guestSoftPowerOffTimeout"), spec.GuestSoftPowerOffTimeout, "should be greater than 0")) } } - for _, device := range spec.PciDevices { + for i, device := range spec.PciDevices { hasVGPU := device.VGPUProfile != "" hasPCI := device.DeviceID != nil && device.VendorID != nil if (hasPCI && hasVGPU) || (!hasPCI && !hasVGPU) { - allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices"), spec.PciDevices, "should have either deviceID + vendorID or vgpuProfile")) + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d, i)), device, "should have either deviceID + vendorID or vgpuProfile")) } } return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs) From a6c0fd82d86909ef5cc06638d6ad0889d50052bf Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Fri, 7 Jun 2024 17:53:42 +0200 Subject: [PATCH 12/21] Update validation webhooks --- internal/webhooks/vspheremachine.go | 11 +++ internal/webhooks/vspheremachine_test.go | 95 
+++++++++++++------ internal/webhooks/vspheremachinetemplate.go | 12 ++- .../webhooks/vspheremachinetemplate_test.go | 77 +++++++++++---- pkg/services/govmomi/pci/device.go | 23 +++-- 5 files changed, 157 insertions(+), 61 deletions(-) diff --git a/internal/webhooks/vspheremachine.go b/internal/webhooks/vspheremachine.go index 420df4733d..f183e9a10e 100644 --- a/internal/webhooks/vspheremachine.go +++ b/internal/webhooks/vspheremachine.go @@ -92,6 +92,17 @@ func (webhook *VSphereMachineWebhook) ValidateCreate(_ context.Context, raw runt allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "guestSoftPowerOffTimeout"), spec.GuestSoftPowerOffTimeout, "should be greater than 0")) } } + for i, device := range spec.PciDevices { + if device.VGPUProfile == "" { + if device.DeviceID == nil || device.VendorID == nil { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have both deviceId and vendorId set")) + } + } else { + if device.DeviceID != nil || device.VendorID != nil { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vgpuProfile")) + } + } + } return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs) } diff --git a/internal/webhooks/vspheremachine_test.go b/internal/webhooks/vspheremachine_test.go index b5ef77df7f..8c1fa7857e 100644 --- a/internal/webhooks/vspheremachine_test.go +++ b/internal/webhooks/vspheremachine_test.go @@ -48,52 +48,86 @@ func TestVSphereMachine_ValidateCreate(t *testing.T) { }{ { name: "preferredAPIServerCIDR set on creation ", - vsphereMachine: createVSphereMachine("foo.com", nil, "192.168.0.1/32", []string{}, infrav1.VirtualMachinePowerOpModeTrySoft, nil), + vsphereMachine: createVSphereMachine("foo.com", nil, "192.168.0.1/32", []string{}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, nil), wantErr: 
true, }, { name: "IPs are not in CIDR format", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, nil), wantErr: true, }, { name: "IPs are not valid IPs in CIDR format", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"/32", "192.168.0.644/33"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"/32", "192.168.0.644/33"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, nil), wantErr: true, }, { name: "guestSoftPowerOffTimeout should not be set with powerOffMode set to hard", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}, nil), wantErr: true, }, { name: "guestSoftPowerOffTimeout should not be set with powerOffMode set to soft", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeSoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeSoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}, nil), wantErr: true, }, { name: "guestSoftPowerOffTimeout should not be negative", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, 
infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: -1234}), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: -1234}, nil), + wantErr: true, + }, + + { + name: "empty pciDevice", + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: ""}}), + wantErr: true, + }, + { + name: "incorrect pciDevice", + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32)}}), + wantErr: true, + }, + { + name: "incorrect pciDevice", + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32), VendorID: new(int32)}}), wantErr: true, }, + { + name: "incomplete pciDevice", + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: new(int32)}}), + wantErr: true, + }, + { + name: "incomplete pciDevice", + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VendorID: new(int32)}}), + wantErr: true, + }, + { + name: "successful VSphereMachine creation with PCI device", + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: new(int32), VendorID: new(int32)}}), + }, + { + name: "successful VSphereMachine creation with vgpu", + 
vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu"}}), + }, { name: "successful VSphereMachine creation", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, nil), wantErr: false, }, { name: "successful VSphereMachine creation with powerOffMode set to hard", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, nil), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, nil, nil), wantErr: false, }, { name: "successful VSphereMachine creation with powerOffMode set to soft", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), wantErr: false, }, { name: "successful VSphereMachine creation with powerOffMode set to trySoft and non-default timeout", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: 1234}), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: 1234}, nil), wantErr: false, }, } @@ -121,50 +155,56 @@ func TestVSphereMachine_ValidateUpdate(t *testing.T) { }{ { name: "ProviderID can be updated", - 
oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil), - vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil), + oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), + vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), wantErr: false, }, { name: "updating ips can be done", - oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil), - vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil), + oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), + vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), wantErr: false, }, { name: "updating non-existing IP with invalid ips can not be done", - oldVSphereMachine: createVSphereMachine("foo.com", nil, "", nil, infrav1.VirtualMachinePowerOpModeSoft, nil), - vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"/32", "192.168.0.10/33"}, infrav1.VirtualMachinePowerOpModeSoft, nil), + oldVSphereMachine: createVSphereMachine("foo.com", nil, "", nil, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), + vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"/32", "192.168.0.10/33"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), wantErr: true, }, { name: "updating existing IP with invalid ips can not be done", - oldVSphereMachine: createVSphereMachine("foo.com", nil, "", 
[]string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil), - vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"/32", "192.168.0.10/33"}, infrav1.VirtualMachinePowerOpModeSoft, nil), + oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), + vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"/32", "192.168.0.10/33"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), wantErr: true, }, { name: "updating server cannot be done", - oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil), - vsphereMachine: createVSphereMachine("bar.com", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil), + oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), + vsphereMachine: createVSphereMachine("bar.com", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), + wantErr: true, + }, + { + name: "updating pci devices cannot be done", + oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu"}}), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "new-vgpu"}}), wantErr: true, }, { name: "powerOffMode cannot be updated when new powerOffMode is not valid", - oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil), - vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, 
infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}), + oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, nil), + vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}, nil), wantErr: true, }, { name: "powerOffMode can be updated to hard", - oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}), - vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeHard, nil), + oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}, nil), + vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeHard, nil, nil), wantErr: false, }, { name: "powerOffMode can be updated to soft", - oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}), - vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil), + oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}, nil), + vsphereMachine: createVSphereMachine("foo.com", &someProviderID, "", 
[]string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil), wantErr: false, }, } @@ -181,7 +221,7 @@ func TestVSphereMachine_ValidateUpdate(t *testing.T) { } } -func createVSphereMachine(server string, providerID *string, preferredAPIServerCIDR string, ips []string, powerOffMode infrav1.VirtualMachinePowerOpMode, guestSoftPowerOffTimeout *metav1.Duration) *infrav1.VSphereMachine { +func createVSphereMachine(server string, providerID *string, preferredAPIServerCIDR string, ips []string, powerOffMode infrav1.VirtualMachinePowerOpMode, guestSoftPowerOffTimeout *metav1.Duration, pciDevices []infrav1.PCIDeviceSpec) *infrav1.VSphereMachine { VSphereMachine := &infrav1.VSphereMachine{ Spec: infrav1.VSphereMachineSpec{ VirtualMachineCloneSpec: infrav1.VirtualMachineCloneSpec{ @@ -190,6 +230,7 @@ func createVSphereMachine(server string, providerID *string, preferredAPIServerC PreferredAPIServerCIDR: preferredAPIServerCIDR, Devices: []infrav1.NetworkDeviceSpec{}, }, + PciDevices: pciDevices, }, ProviderID: providerID, PowerOffMode: powerOffMode, diff --git a/internal/webhooks/vspheremachinetemplate.go b/internal/webhooks/vspheremachinetemplate.go index e72aeb5c5e..2497249c83 100644 --- a/internal/webhooks/vspheremachinetemplate.go +++ b/internal/webhooks/vspheremachinetemplate.go @@ -85,10 +85,14 @@ func (webhook *VSphereMachineTemplateWebhook) ValidateCreate(_ context.Context, } } for i, device := range spec.PciDevices { - hasVGPU := device.VGPUProfile != "" - hasPCI := device.DeviceID != nil && device.VendorID != nil - if (hasPCI && hasVGPU) || (!hasPCI && !hasVGPU) { - allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d, i)), device, "should have either deviceID + vendorID or vgpuProfile")) + if device.VGPUProfile == "" { + if device.DeviceID == nil || device.VendorID == nil { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), 
device, "should have both deviceId and vendorId set")) + } + } else { + if device.DeviceID != nil || device.VendorID != nil { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vgpuProfile")) + } } } return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs) diff --git a/internal/webhooks/vspheremachinetemplate_test.go b/internal/webhooks/vspheremachinetemplate_test.go index 20f6eddacf..d1a0714645 100644 --- a/internal/webhooks/vspheremachinetemplate_test.go +++ b/internal/webhooks/vspheremachinetemplate_test.go @@ -37,37 +37,70 @@ func TestVSphereMachineTemplate_ValidateCreate(t *testing.T) { }{ { name: "preferredAPIServerCIDR set on creation ", - vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "192.168.0.1/32", []string{}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "192.168.0.1/32", []string{}, nil), wantErr: true, }, { name: "ProviderID set on creation", - vsphereMachine: createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{}, nil), wantErr: true, }, { name: "IPs are not in CIDR format", - vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32", "192.168.0.3"}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32", "192.168.0.3"}, nil), wantErr: true, }, { name: "successful VSphereMachine creation", - vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, nil), wantErr: true, }, { name: "incomplete hardware version", - vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-", nil, "", 
[]string{"192.168.0.1/32", "192.168.0.3/32"}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, nil), wantErr: true, }, { name: "incorrect hardware version", - vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-0", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-0", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, nil), wantErr: true, }, + { + name: "empty pciDevice", + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: ""}}), + wantErr: true, + }, + { + name: "incorrect pciDevice", + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32)}}), + wantErr: true, + }, + { + name: "incorrect pciDevice", + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32), VendorID: new(int32)}}), + wantErr: true, + }, + { + name: "incomplete pciDevice", + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: new(int32)}}), + wantErr: true, + }, + { + name: "incomplete pciDevice", + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VendorID: new(int32)}}), + wantErr: true, + }, + { + name: "successful VSphereMachine creation with PCI device", + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: new(int32), VendorID: new(int32)}}), + }, + { + name: "successful VSphereMachine creation with vgpu", + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu"}}), + }, { name: "successful 
VSphereMachine creation with hardware version set", - vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, nil), }, } for _, tc := range tests { @@ -94,36 +127,43 @@ func TestVSphereMachineTemplate_ValidateUpdate(t *testing.T) { }{ { name: "ProviderID cannot be updated", - oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}), - vsphereMachine: createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{"192.168.0.1/32"}), + oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}, nil), + vsphereMachine: createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{"192.168.0.1/32"}, nil), req: &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}}, wantErr: true, }, { name: "ip addresses cannot be updated", - oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}), - vsphereMachine: createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}), + oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}, nil), + vsphereMachine: createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, nil), req: &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}}, wantErr: true, }, { name: "server cannot be updated", - oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}), - vsphereMachine: createVSphereMachineTemplate("baz.com", "", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}), + oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}, nil), + vsphereMachine: 
createVSphereMachineTemplate("baz.com", "", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, nil), req: &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}}, wantErr: true, }, { name: "hardware version cannot be updated", - oldVSphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}), - vsphereMachine: createVSphereMachineTemplate("baz.com", "vmx-17", nil, "", []string{"192.168.0.1/32"}), + oldVSphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}, nil), + vsphereMachine: createVSphereMachineTemplate("baz.com", "vmx-17", nil, "", []string{"192.168.0.1/32"}, nil), + req: &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}}, + wantErr: true, + }, + { + name: "pci devices cannot be updated", + oldVSphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu"}}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}, []infrav1.PCIDeviceSpec{{VGPUProfile: "new-vgpu"}}), req: &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}}, wantErr: true, }, { name: "with hardware version set and not updated", - oldVSphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}), - vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}), + oldVSphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}, nil), + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}, nil), req: &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}}, wantErr: false, // explicitly calling out that this is a valid scenario. 
}, @@ -145,7 +185,7 @@ func TestVSphereMachineTemplate_ValidateUpdate(t *testing.T) { } } -func createVSphereMachineTemplate(server, hwVersion string, providerID *string, preferredAPIServerCIDR string, ips []string) *infrav1.VSphereMachineTemplate { +func createVSphereMachineTemplate(server, hwVersion string, providerID *string, preferredAPIServerCIDR string, ips []string, pciDevices []infrav1.PCIDeviceSpec) *infrav1.VSphereMachineTemplate { vsphereMachineTemplate := &infrav1.VSphereMachineTemplate{ Spec: infrav1.VSphereMachineTemplateSpec{ Template: infrav1.VSphereMachineTemplateResource{ @@ -158,6 +198,7 @@ func createVSphereMachineTemplate(server, hwVersion string, providerID *string, Devices: []infrav1.NetworkDeviceSpec{}, }, HardwareVersion: hwVersion, + PciDevices: pciDevices, }, }, }, diff --git a/pkg/services/govmomi/pci/device.go b/pkg/services/govmomi/pci/device.go index a6e1a1132b..eff285859d 100644 --- a/pkg/services/govmomi/pci/device.go +++ b/pkg/services/govmomi/pci/device.go @@ -77,20 +77,19 @@ func ConstructDeviceSpecs(pciDeviceSpecs []infrav1.PCIDeviceSpec) []types.BaseVi } func createBackingInfo(spec infrav1.PCIDeviceSpec) types.BaseVirtualDeviceBackingInfo { - if spec.DeviceID != nil && spec.VendorID != nil { - return &types.VirtualPCIPassthroughDynamicBackingInfo{ - AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{ - { - VendorId: *spec.VendorID, - DeviceId: *spec.DeviceID, - }, - }, - CustomLabel: spec.CustomLabel, + if spec.VGPUProfile != "" { + return &types.VirtualPCIPassthroughVmiopBackingInfo{ + Vgpu: spec.VGPUProfile, } } - - return &types.VirtualPCIPassthroughVmiopBackingInfo{ - Vgpu: spec.VGPUProfile, + return &types.VirtualPCIPassthroughDynamicBackingInfo{ + AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{ + { + VendorId: *spec.VendorID, + DeviceId: *spec.DeviceID, + }, + }, + CustomLabel: spec.CustomLabel, } } From 8a6a558323774a8f3bfd8c79e71005ea3d891bad Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Fri, 7 Jun 
2024 17:59:45 +0200 Subject: [PATCH 13/21] Reorder if-statement --- pkg/services/govmomi/pci/device.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/services/govmomi/pci/device.go b/pkg/services/govmomi/pci/device.go index eff285859d..7c41cb194e 100644 --- a/pkg/services/govmomi/pci/device.go +++ b/pkg/services/govmomi/pci/device.go @@ -94,9 +94,8 @@ func createBackingInfo(spec infrav1.PCIDeviceSpec) types.BaseVirtualDeviceBackin } func constructKey(pciDeviceSpec infrav1.PCIDeviceSpec) string { - if pciDeviceSpec.DeviceID != nil && pciDeviceSpec.VendorID != nil { - return fmt.Sprintf("%d-%d", *pciDeviceSpec.DeviceID, *pciDeviceSpec.VendorID) + if pciDeviceSpec.VGPUProfile != "" { + return pciDeviceSpec.VGPUProfile } - - return pciDeviceSpec.VGPUProfile + return fmt.Sprintf("%d-%d", *pciDeviceSpec.DeviceID, *pciDeviceSpec.VendorID) } From cea00162ab59746157a2986512c699833ad25fe6 Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Wed, 12 Jun 2024 11:00:39 +0200 Subject: [PATCH 14/21] Run make generate --- ...ructure.cluster.x-k8s.io_vspheremachines.yaml | 9 ++++++--- ...cluster.x-k8s.io_vspheremachinetemplates.yaml | 16 +++++++++------- ...frastructure.cluster.x-k8s.io_vspherevms.yaml | 7 ++++--- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml index 4dc55498e0..f775114990 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml @@ -1370,6 +1370,7 @@ spec: DeviceID is the device ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the virtual machine is cloned. + Mutually exclusive with VGPUProfile. 
format: int32 type: integer vendorId: @@ -1377,12 +1378,14 @@ spec: VendorId is the vendor ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the virtual machine is cloned. + Mutually exclusive with VGPUProfile. format: int32 type: integer vgpuProfile: - description: VGPUProfile is the profile name of a virtual machine's - vGPU, in string. Defaults to the eponymous property value - in the template from which the virtual machine is cloned. + description: |- + VGPUProfile is the profile name of a virtual machine's vGPU, in string. + Defaults to the eponymous property value in the template from which the + virtual machine is cloned. Mutually exclusive with DeviceID and VendorID. type: string type: object diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml index bc8d15216d..a9518be232 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml @@ -1244,22 +1244,24 @@ spec: description: |- DeviceID is the device ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the - virtual machine is cloned. Mutually exclusive with VGPUProfile. + virtual machine is cloned. + Mutually exclusive with VGPUProfile. format: int32 type: integer vendorId: description: |- VendorId is the vendor ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the - virtual machine is cloned. Mutually exclusive with VGPUProfile. + virtual machine is cloned. + Mutually exclusive with VGPUProfile. format: int32 type: integer vgpuProfile: - description: VGPUProfile is the profile name of a virtual - machine's vGPU, in string. 
Defaults to the eponymous - property value in the template from which the virtual - machine is cloned. Mutually exclusive with DeviceID - and VendorID. + description: |- + VGPUProfile is the profile name of a virtual machine's vGPU, in string. + Defaults to the eponymous property value in the template from which the + virtual machine is cloned. + Mutually exclusive with DeviceID and VendorID. type: string type: object type: array diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml index 6ce31f5d74..a35692c085 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml @@ -1470,9 +1470,10 @@ spec: format: int32 type: integer vgpuProfile: - description: VGPUProfile is the profile name of a virtual machine's - vGPU, in string. Defaults to the eponymous property value - in the template from which the virtual machine is cloned. + description: |- + VGPUProfile is the profile name of a virtual machine's vGPU, in string. + Defaults to the eponymous property value in the template from which the + virtual machine is cloned. Mutually exclusive with DeviceID and VendorID. 
type: string type: object From 9fc17c97b281e862e48c4e8a7d01a7f88c86d3d8 Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Thu, 20 Jun 2024 09:43:58 +0200 Subject: [PATCH 15/21] Share device validation logic in webhooks --- internal/webhooks/vspheremachine.go | 30 +++++++++++++-------- internal/webhooks/vspheremachinetemplate.go | 14 +++------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/internal/webhooks/vspheremachine.go b/internal/webhooks/vspheremachine.go index f183e9a10e..7fe2abf24c 100644 --- a/internal/webhooks/vspheremachine.go +++ b/internal/webhooks/vspheremachine.go @@ -92,17 +92,8 @@ func (webhook *VSphereMachineWebhook) ValidateCreate(_ context.Context, raw runt allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "guestSoftPowerOffTimeout"), spec.GuestSoftPowerOffTimeout, "should be greater than 0")) } } - for i, device := range spec.PciDevices { - if device.VGPUProfile == "" { - if device.DeviceID == nil || device.VendorID == nil { - allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have both deviceId and vendorId set")) - } - } else { - if device.DeviceID != nil || device.VendorID != nil { - allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vgpuProfile")) - } - } - } + pciErrs := validatePCIDevices(spec.PciDevices) + allErrs = append(allErrs, pciErrs...) 
return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs) } @@ -171,3 +162,20 @@ func (webhook *VSphereMachineWebhook) ValidateUpdate(_ context.Context, oldRaw r func (webhook *VSphereMachineWebhook) ValidateDelete(_ context.Context, _ runtime.Object) (admission.Warnings, error) { return nil, nil } + +func validatePCIDevices(devices []infrav1.PCIDeviceSpec) field.ErrorList { + var allErrs field.ErrorList + + for i, device := range devices { + if device.VGPUProfile != "" && device.DeviceID == nil && device.VendorID == nil { + // Valid case for vGPU. + continue + } + if device.VGPUProfile == "" && device.DeviceID != nil && device.VendorID != nil { + // Valid case for PCI Passthrough. + continue + } + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vgpuProfile set")) + } + return allErrs +} diff --git a/internal/webhooks/vspheremachinetemplate.go b/internal/webhooks/vspheremachinetemplate.go index 2497249c83..8a1c865481 100644 --- a/internal/webhooks/vspheremachinetemplate.go +++ b/internal/webhooks/vspheremachinetemplate.go @@ -84,17 +84,9 @@ func (webhook *VSphereMachineTemplateWebhook) ValidateCreate(_ context.Context, allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "guestSoftPowerOffTimeout"), spec.GuestSoftPowerOffTimeout, "should be greater than 0")) } } - for i, device := range spec.PciDevices { - if device.VGPUProfile == "" { - if device.DeviceID == nil || device.VendorID == nil { - allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have both deviceId and vendorId set")) - } - } else { - if device.DeviceID != nil || device.VendorID != nil { - allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + 
vendorId or vgpuProfile")) - } - } - } + pciErrs := validatePCIDevices(spec.PciDevices) + allErrs = append(allErrs, pciErrs...) + return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs) } From 0e84953eb8b48075a03dfddfc63862986c80f3c0 Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Thu, 20 Jun 2024 09:47:55 +0200 Subject: [PATCH 16/21] Fix empty pointer nit --- internal/webhooks/vspheremachine_test.go | 11 ++++++----- internal/webhooks/vspheremachinetemplate_test.go | 10 +++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/internal/webhooks/vspheremachine_test.go b/internal/webhooks/vspheremachine_test.go index 8c1fa7857e..64b5c327f9 100644 --- a/internal/webhooks/vspheremachine_test.go +++ b/internal/webhooks/vspheremachine_test.go @@ -22,6 +22,7 @@ import ( . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1" ) @@ -84,27 +85,27 @@ func TestVSphereMachine_ValidateCreate(t *testing.T) { }, { name: "incorrect pciDevice", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32)}}), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: ptr.To[int32](1)}}), wantErr: true, }, { name: "incorrect pciDevice", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32), VendorID: new(int32)}}), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, 
[]infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: ptr.To[int32](1), VendorID: ptr.To[int32](1)}}), wantErr: true, }, { name: "incomplete pciDevice", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: new(int32)}}), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: ptr.To[int32](1)}}), wantErr: true, }, { name: "incomplete pciDevice", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VendorID: new(int32)}}), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VendorID: ptr.To[int32](1)}}), wantErr: true, }, { name: "successful VSphereMachine creation with PCI device", - vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: new(int32), VendorID: new(int32)}}), + vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: ptr.To[int32](1), VendorID: ptr.To[int32](1)}}), }, { name: "successful VSphereMachine creation with vgpu", diff --git a/internal/webhooks/vspheremachinetemplate_test.go b/internal/webhooks/vspheremachinetemplate_test.go index d1a0714645..95ccd7042e 100644 --- a/internal/webhooks/vspheremachinetemplate_test.go +++ b/internal/webhooks/vspheremachinetemplate_test.go @@ -72,27 +72,27 @@ func TestVSphereMachineTemplate_ValidateCreate(t *testing.T) { }, { name: "incorrect pciDevice", 
- vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32)}}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: ptr.To[int32](1)}}), wantErr: true, }, { name: "incorrect pciDevice", - vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32), VendorID: new(int32)}}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: ptr.To[int32](1), VendorID: ptr.To[int32](1)}}), wantErr: true, }, { name: "incomplete pciDevice", - vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: new(int32)}}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: ptr.To[int32](1)}}), wantErr: true, }, { name: "incomplete pciDevice", - vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VendorID: new(int32)}}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VendorID: ptr.To[int32](1)}}), wantErr: true, }, { name: "successful VSphereMachine creation with PCI device", - vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: new(int32), VendorID: new(int32)}}), + vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: ptr.To[int32](1), VendorID: ptr.To[int32](1)}}), }, { name: "successful VSphereMachine creation with vgpu", From 1f433ebda9af3f0f6fc94f9cbdc18feabbb5f0db Mon Sep 17 00:00:00 2001 From: Birk Lewin 
<89076383+birksl@users.noreply.github.com> Date: Thu, 20 Jun 2024 09:49:35 +0200 Subject: [PATCH 17/21] Update docs/gpu-vgpu.md Co-authored-by: Christian Schlotter --- docs/gpu-vgpu.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md index 268aab1075..67c482db98 100644 --- a/docs/gpu-vgpu.md +++ b/docs/gpu-vgpu.md @@ -103,4 +103,4 @@ Note: For GPU nodes (PCI Passthrough or vGPU), all memory of the nodes must be r Apply the manifest from the previous step to your management cluster to have CAPV create a workload cluster with worker nodes that have vGPUs. -From this point on, the setup is exactly the same as [GPU enabled clusters via PCI Passthrough](https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/blob/main/docs/gpu-pci.md#create-the-cluster). +From this point on, the setup is exactly the same as [GPU enabled clusters via PCI Passthrough](./gpu-pci.md#create-the-cluster). From 609542bac0384d538a11e537c2172d59f59a6188 Mon Sep 17 00:00:00 2001 From: Birk Lewin <89076383+birksl@users.noreply.github.com> Date: Thu, 20 Jun 2024 12:26:08 +0200 Subject: [PATCH 18/21] Apply suggestions from code review Co-authored-by: Lubomir I. Ivanov --- apis/v1beta1/types.go | 2 +- docs/gpu-vgpu.md | 3 ++- internal/webhooks/vspheremachine.go | 2 +- pkg/services/govmomi/vcenter/clone.go | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go index d542adf722..3372ffa118 100644 --- a/apis/v1beta1/types.go +++ b/apis/v1beta1/types.go @@ -265,7 +265,7 @@ type PCIDeviceSpec struct { // virtual machine is cloned. // Mutually exclusive with DeviceID and VendorID. // +kubebuilder:validation:Required - VGPUProfile string `json:"vgpuProfile,omitempty"` + VGPUProfile string `json:"vGPUProfile,omitempty"` // CustomLabel is the hardware label of a virtual machine's PCI device. 
// Defaults to the eponymous property value in the template from which the // virtual machine is cloned. diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md index 67c482db98..bc37b238aa 100644 --- a/docs/gpu-vgpu.md +++ b/docs/gpu-vgpu.md @@ -93,7 +93,8 @@ spec: - vgpuProfile: "grid_t4-1a" # value from above ``` -Set the required values for the other fields and the cluster template is ready for use. The similar changes can be made to a template generated using clusterctl generate cluster command as well. +Set the required values for the other fields and the cluster template is ready for use. +The similar changes can be made to a template generated using `clusterctl generate cluster` command as well. ### Create the cluster diff --git a/internal/webhooks/vspheremachine.go b/internal/webhooks/vspheremachine.go index 7fe2abf24c..328c7c1361 100644 --- a/internal/webhooks/vspheremachine.go +++ b/internal/webhooks/vspheremachine.go @@ -175,7 +175,7 @@ func validatePCIDevices(devices []infrav1.PCIDeviceSpec) field.ErrorList { // Valid case for PCI Passthrough. 
continue } - allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vgpuProfile set")) + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vGPUProfile set")) } return allErrs } diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go index 0334e247e1..91f64eb28d 100644 --- a/pkg/services/govmomi/vcenter/clone.go +++ b/pkg/services/govmomi/vcenter/clone.go @@ -69,7 +69,7 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by } } if vmCtx.VSphereVM.Spec.CustomVMXKeys != nil { - log.Info("Applied custom vmx keys to VM clone spec") + log.Info("Applied custom VMX keys to VM clone spec") if err := extraConfig.SetCustomVMXKeys(vmCtx.VSphereVM.Spec.CustomVMXKeys); err != nil { return err } From 438c6e5b285056bc17e38c755424709937df4d2a Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Thu, 20 Jun 2024 12:27:28 +0200 Subject: [PATCH 19/21] Generate manifests --- ...structure.cluster.x-k8s.io_vspheremachines.yaml | 14 +++++++------- ...e.cluster.x-k8s.io_vspheremachinetemplates.yaml | 14 +++++++------- ...infrastructure.cluster.x-k8s.io_vspherevms.yaml | 14 +++++++------- docs/gpu-vgpu.md | 2 +- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml index f775114990..36dfb02772 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml @@ -1373,6 +1373,13 @@ spec: Mutually exclusive with VGPUProfile. 
format: int32 type: integer + vGPUProfile: + description: |- + VGPUProfile is the profile name of a virtual machine's vGPU, in string. + Defaults to the eponymous property value in the template from which the + virtual machine is cloned. + Mutually exclusive with DeviceID and VendorID. + type: string vendorId: description: |- VendorId is the vendor ID of a virtual machine's PCI, in integer. @@ -1381,13 +1388,6 @@ spec: Mutually exclusive with VGPUProfile. format: int32 type: integer - vgpuProfile: - description: |- - VGPUProfile is the profile name of a virtual machine's vGPU, in string. - Defaults to the eponymous property value in the template from which the - virtual machine is cloned. - Mutually exclusive with DeviceID and VendorID. - type: string type: object type: array powerOffMode: diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml index a9518be232..54733547a6 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml @@ -1248,6 +1248,13 @@ spec: Mutually exclusive with VGPUProfile. format: int32 type: integer + vGPUProfile: + description: |- + VGPUProfile is the profile name of a virtual machine's vGPU, in string. + Defaults to the eponymous property value in the template from which the + virtual machine is cloned. + Mutually exclusive with DeviceID and VendorID. + type: string vendorId: description: |- VendorId is the vendor ID of a virtual machine's PCI, in integer. @@ -1256,13 +1263,6 @@ spec: Mutually exclusive with VGPUProfile. format: int32 type: integer - vgpuProfile: - description: |- - VGPUProfile is the profile name of a virtual machine's vGPU, in string. - Defaults to the eponymous property value in the template from which the - virtual machine is cloned. 
- Mutually exclusive with DeviceID and VendorID. - type: string type: object type: array powerOffMode: diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml index a35692c085..333df3fe42 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml @@ -1461,6 +1461,13 @@ spec: Mutually exclusive with VGPUProfile. format: int32 type: integer + vGPUProfile: + description: |- + VGPUProfile is the profile name of a virtual machine's vGPU, in string. + Defaults to the eponymous property value in the template from which the + virtual machine is cloned. + Mutually exclusive with DeviceID and VendorID. + type: string vendorId: description: |- VendorId is the vendor ID of a virtual machine's PCI, in integer. @@ -1469,13 +1476,6 @@ spec: Mutually exclusive with VGPUProfile. format: int32 type: integer - vgpuProfile: - description: |- - VGPUProfile is the profile name of a virtual machine's vGPU, in string. - Defaults to the eponymous property value in the template from which the - virtual machine is cloned. - Mutually exclusive with DeviceID and VendorID. - type: string type: object type: array powerOffMode: diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md index bc37b238aa..81d1ea19d1 100644 --- a/docs/gpu-vgpu.md +++ b/docs/gpu-vgpu.md @@ -90,7 +90,7 @@ spec: template: '${VSPHERE_TEMPLATE}' thumbprint: '${VSPHERE_TLS_THUMBPRINT}' pciDevices: - - vgpuProfile: "grid_t4-1a" # value from above + - vGPUProfile: "grid_t4-1a" # value from above ``` Set the required values for the other fields and the cluster template is ready for use. 
From f18ba350b357098d97d7a13c1fd321a2dbb9d015 Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Thu, 20 Jun 2024 12:37:20 +0200 Subject: [PATCH 20/21] Update PCIDevice doc comments --- apis/v1beta1/types.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go index 3372ffa118..728f26a457 100644 --- a/apis/v1beta1/types.go +++ b/apis/v1beta1/types.go @@ -251,19 +251,22 @@ type PCIDeviceSpec struct { // DeviceID is the device ID of a virtual machine's PCI, in integer. // Defaults to the eponymous property value in the template from which the // virtual machine is cloned. - // Mutually exclusive with VGPUProfile. + // Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID + // are two independent ways to define PCI devices. // +kubebuilder:validation:Required DeviceID *int32 `json:"deviceId,omitempty"` // VendorId is the vendor ID of a virtual machine's PCI, in integer. // Defaults to the eponymous property value in the template from which the // virtual machine is cloned. - // Mutually exclusive with VGPUProfile. + // Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID + // are two independent ways to define PCI devices. // +kubebuilder:validation:Required VendorID *int32 `json:"vendorId,omitempty"` // VGPUProfile is the profile name of a virtual machine's vGPU, in string. // Defaults to the eponymous property value in the template from which the // virtual machine is cloned. - // Mutually exclusive with DeviceID and VendorID. + // Mutually exclusive with DeviceID and VendorID as VGPUProfile and DeviceID + VendorID + // are two independent ways to define PCI devices. // +kubebuilder:validation:Required VGPUProfile string `json:"vGPUProfile,omitempty"` // CustomLabel is the hardware label of a virtual machine's PCI device. 
From ee5d3ac355744cfa3f724421bcf6f49c0c179dc6 Mon Sep 17 00:00:00 2001 From: Birk Lewin Date: Thu, 20 Jun 2024 12:43:18 +0200 Subject: [PATCH 21/21] Forgot to run make generate --- .../infrastructure.cluster.x-k8s.io_vspheremachines.yaml | 9 ++++++--- ...ructure.cluster.x-k8s.io_vspheremachinetemplates.yaml | 9 ++++++--- .../infrastructure.cluster.x-k8s.io_vspherevms.yaml | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml index 36dfb02772..3f9b8fb56c 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml @@ -1370,7 +1370,8 @@ spec: DeviceID is the device ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the virtual machine is cloned. - Mutually exclusive with VGPUProfile. + Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID + are two independent ways to define PCI devices. format: int32 type: integer vGPUProfile: @@ -1378,14 +1379,16 @@ spec: VGPUProfile is the profile name of a virtual machine's vGPU, in string. Defaults to the eponymous property value in the template from which the virtual machine is cloned. - Mutually exclusive with DeviceID and VendorID. + Mutually exclusive with DeviceID and VendorID as VGPUProfile and DeviceID + VendorID + are two independent ways to define PCI devices. type: string vendorId: description: |- VendorId is the vendor ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the virtual machine is cloned. - Mutually exclusive with VGPUProfile. + Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID + are two independent ways to define PCI devices. 
format: int32 type: integer type: object diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml index 54733547a6..9d72178886 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml @@ -1245,7 +1245,8 @@ spec: DeviceID is the device ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the virtual machine is cloned. - Mutually exclusive with VGPUProfile. + Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID + are two independent ways to define PCI devices. format: int32 type: integer vGPUProfile: @@ -1253,14 +1254,16 @@ spec: VGPUProfile is the profile name of a virtual machine's vGPU, in string. Defaults to the eponymous property value in the template from which the virtual machine is cloned. - Mutually exclusive with DeviceID and VendorID. + Mutually exclusive with DeviceID and VendorID as VGPUProfile and DeviceID + VendorID + are two independent ways to define PCI devices. type: string vendorId: description: |- VendorId is the vendor ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the virtual machine is cloned. - Mutually exclusive with VGPUProfile. + Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID + are two independent ways to define PCI devices. 
format: int32 type: integer type: object diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml index 333df3fe42..f7c8474262 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml @@ -1458,7 +1458,8 @@ spec: DeviceID is the device ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the virtual machine is cloned. - Mutually exclusive with VGPUProfile. + Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID + are two independent ways to define PCI devices. format: int32 type: integer vGPUProfile: @@ -1466,14 +1467,16 @@ spec: VGPUProfile is the profile name of a virtual machine's vGPU, in string. Defaults to the eponymous property value in the template from which the virtual machine is cloned. - Mutually exclusive with DeviceID and VendorID. + Mutually exclusive with DeviceID and VendorID as VGPUProfile and DeviceID + VendorID + are two independent ways to define PCI devices. type: string vendorId: description: |- VendorId is the vendor ID of a virtual machine's PCI, in integer. Defaults to the eponymous property value in the template from which the virtual machine is cloned. - Mutually exclusive with VGPUProfile. + Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID + are two independent ways to define PCI devices. format: int32 type: integer type: object