From 009129131af60ad3bbccdfa929b7160af25262ab Mon Sep 17 00:00:00 2001
From: Puneet Katyal <pkatyal@vmware.com>
Date: Thu, 14 Jul 2022 13:45:04 +0530
Subject: [PATCH 01/21] vGPU implementation

- Builds on the changes in
  https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/pull/1579

Co-authored-by: Geetika Batra <geetika791@gmail.com>
Signed-off-by: Puneet Katyal <pkatyal@vmware.com>
---
 Makefile                                      |   2 +
 apis/v1alpha3/conversion_test.go              |   1 +
 apis/v1alpha3/zz_generated.conversion.go      |   1 +
 apis/v1alpha4/zz_generated.conversion.go      |   1 +
 apis/v1beta1/types.go                         |  12 ++
 apis/v1beta1/zz_generated.deepcopy.go         |  20 +++
 ...ture.cluster.x-k8s.io_vspheremachines.yaml |  13 ++
 ...ster.x-k8s.io_vspheremachinetemplates.yaml |  14 ++
 ...structure.cluster.x-k8s.io_vspherevms.yaml |  13 ++
 docs/gpu-vgpu.md                              | 107 ++++++++++++
 pkg/services/govmomi/vcenter/clone.go         |  72 +++++++-
 test/e2e/config/vsphere-dev.yaml              | 164 ++++++++++++++++++
 test/e2e/config/vsphere.yaml                  |   1 +
 .../main/vgpu/kustomization.yaml              |   6 +
 .../main/vgpu/vgpu-device-template.yaml       |  11 ++
 15 files changed, 432 insertions(+), 6 deletions(-)
 create mode 100644 docs/gpu-vgpu.md
 create mode 100644 test/e2e/config/vsphere-dev.yaml
 create mode 100644 test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml
 create mode 100644 test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml

diff --git a/Makefile b/Makefile
index 2eb00f4911..bb5d78b839 100644
--- a/Makefile
+++ b/Makefile
@@ -384,6 +384,8 @@ generate-e2e-templates-main: $(KUSTOMIZE) ## Generate test templates for the mai
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/install-on-bootstrap" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-install-on-bootstrap.yaml"
 	# for PCI passthrough template
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/pci" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-pci.yaml"
+	# for vGPU template
+	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/vgpu" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-vgpu.yaml"
 	# for DHCP overrides
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/dhcp-overrides" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-dhcp-overrides.yaml"
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/ownerrefs-finalizers" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-ownerrefs-finalizers.yaml"
diff --git a/apis/v1alpha3/conversion_test.go b/apis/v1alpha3/conversion_test.go
index 732fae6f8b..e956f34da2 100644
--- a/apis/v1alpha3/conversion_test.go
+++ b/apis/v1alpha3/conversion_test.go
@@ -120,6 +120,7 @@ func CustomSpecNewFieldFuzzer(in *infrav1.VirtualMachineCloneSpec, c fuzz.Contin
 	c.FuzzNoCustom(in)
 
 	in.PciDevices = nil
+	in.VGPUDevices = nil
 	in.AdditionalDisksGiB = nil
 	in.OS = ""
 	in.HardwareVersion = ""
diff --git a/apis/v1alpha3/zz_generated.conversion.go b/apis/v1alpha3/zz_generated.conversion.go
index 966195ccbb..0c8f7f2cb0 100644
--- a/apis/v1alpha3/zz_generated.conversion.go
+++ b/apis/v1alpha3/zz_generated.conversion.go
@@ -1760,6 +1760,7 @@ func autoConvert_v1beta1_VirtualMachineCloneSpec_To_v1alpha3_VirtualMachineClone
 	out.CustomVMXKeys = *(*map[string]string)(unsafe.Pointer(&in.CustomVMXKeys))
 	// WARNING: in.TagIDs requires manual conversion: does not exist in peer-type
 	// WARNING: in.PciDevices requires manual conversion: does not exist in peer-type
+	// WARNING: in.VGPUDevices requires manual conversion: does not exist in peer-type
 	// WARNING: in.OS requires manual conversion: does not exist in peer-type
 	// WARNING: in.HardwareVersion requires manual conversion: does not exist in peer-type
 	return nil
diff --git a/apis/v1alpha4/zz_generated.conversion.go b/apis/v1alpha4/zz_generated.conversion.go
index 147c1a9894..a18d2ecd3d 100644
--- a/apis/v1alpha4/zz_generated.conversion.go
+++ b/apis/v1alpha4/zz_generated.conversion.go
@@ -1914,6 +1914,7 @@ func autoConvert_v1beta1_VirtualMachineCloneSpec_To_v1alpha4_VirtualMachineClone
 	out.CustomVMXKeys = *(*map[string]string)(unsafe.Pointer(&in.CustomVMXKeys))
 	// WARNING: in.TagIDs requires manual conversion: does not exist in peer-type
 	// WARNING: in.PciDevices requires manual conversion: does not exist in peer-type
+	// WARNING: in.VGPUDevices requires manual conversion: does not exist in peer-type
 	// WARNING: in.OS requires manual conversion: does not exist in peer-type
 	// WARNING: in.HardwareVersion requires manual conversion: does not exist in peer-type
 	return nil
diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go
index 1a12b8de6f..81c5882279 100644
--- a/apis/v1beta1/types.go
+++ b/apis/v1beta1/types.go
@@ -193,6 +193,9 @@ type VirtualMachineCloneSpec struct {
 	// PciDevices is the list of pci devices used by the virtual machine.
 	// +optional
 	PciDevices []PCIDeviceSpec `json:"pciDevices,omitempty"`
+	// VGPUDevices is the list of vGPUs used by the virtual machine.
+	// +optional
+	VGPUDevices []VGPUSpec `json:"vgpuDevices,omitempty"`
 	// OS is the Operating System of the virtual machine
 	// Defaults to Linux
 	// +optional
@@ -265,6 +268,15 @@ type PCIDeviceSpec struct {
 	CustomLabel string `json:"customLabel,omitempty"`
 }
 
+// VGPUSpec defines virtual machine's VGPU configuration
+type VGPUSpec struct {
+	// ProfileName is the ProfileName of a virtual machine's vGPU, in string.
+	// Defaults to the eponymous property value in the template from which the
+	// virtual machine is cloned.
+	// +kubebuilder:validation:Required
+	ProfileName string `json:"profileName,omitempty"`
+}
+
 // NetworkSpec defines the virtual machine's network configuration.
 type NetworkSpec struct {
 	// Devices is the list of network devices used by the virtual machine.
diff --git a/apis/v1beta1/zz_generated.deepcopy.go b/apis/v1beta1/zz_generated.deepcopy.go
index 44d12a65fe..c6b26cfb39 100644
--- a/apis/v1beta1/zz_generated.deepcopy.go
+++ b/apis/v1beta1/zz_generated.deepcopy.go
@@ -403,6 +403,21 @@ func (in *Topology) DeepCopy() *Topology {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *VGPUSpec) DeepCopyInto(out *VGPUSpec) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VGPUSpec.
+func (in *VGPUSpec) DeepCopy() *VGPUSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(VGPUSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *VSphereCluster) DeepCopyInto(out *VSphereCluster) {
 	*out = *in
@@ -1321,6 +1336,11 @@ func (in *VirtualMachineCloneSpec) DeepCopyInto(out *VirtualMachineCloneSpec) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
+	if in.VGPUDevices != nil {
+		in, out := &in.VGPUDevices, &out.VGPUDevices
+		*out = make([]VGPUSpec, len(*in))
+		copy(*out, *in)
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineCloneSpec.
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
index bc1ec1541e..eb46f63a03 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
@@ -1448,6 +1448,19 @@ spec:
                   without TLS certificate validation of the communication between Cluster API Provider vSphere
                   and the VMware vCenter server.
                 type: string
+              vgpuDevices:
+                description: VGPUDevices is the list of vGPUs used by the virtual
+                  machine.
+                items:
+                  description: VGPUSpec defines virtual machine's VGPU configuration
+                  properties:
+                    profileName:
+                      description: ProfileName is the ProfileName of a virtual machine's
+                        vGPU, in string. Defaults to the eponymous property value
+                        in the template from which the virtual machine is cloned.
+                      type: string
+                  type: object
+                type: array
             required:
             - network
             - template
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
index ca4bae3640..de5d4f2c62 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
@@ -1323,6 +1323,20 @@ spec:
                           without TLS certificate validation of the communication between Cluster API Provider vSphere
                           and the VMware vCenter server.
                         type: string
+                      vgpuDevices:
+                        description: VGPUDevices is the list of vGPUs used by the
+                          virtual machine.
+                        items:
+                          description: VGPUSpec defines virtual machine's VGPU configuration
+                          properties:
+                            profileName:
+                              description: ProfileName is the ProfileName of a virtual
+                                machine's vGPU, in string. Defaults to the eponymous
+                                property value in the template from which the virtual
+                                machine is cloned.
+                              type: string
+                          type: object
+                        type: array
                     required:
                     - network
                     - template
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
index 3f42eea904..f6f2f80e71 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
@@ -1531,6 +1531,19 @@ spec:
                   without TLS certificate validation of the communication between Cluster API Provider vSphere
                   and the VMware vCenter server.
                 type: string
+              vgpuDevices:
+                description: VGPUDevices is the list of vGPUs used by the virtual
+                  machine.
+                items:
+                  description: VGPUSpec defines virtual machine's VGPU configuration
+                  properties:
+                    profileName:
+                      description: ProfileName is the ProfileName of a virtual machine's
+                        vGPU, in string. Defaults to the eponymous property value
+                        in the template from which the virtual machine is cloned.
+                      type: string
+                  type: object
+                type: array
             required:
             - network
             - template
diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md
new file mode 100644
index 0000000000..3c5f546afb
--- /dev/null
+++ b/docs/gpu-vgpu.md
@@ -0,0 +1,107 @@
+# GPU enabled clusters using vGPU
+
+## Overview
+
+You can choose to create a cluster with both worker and control plane nodes having vGPU devices attached to them.
+
+Before we begin, a few important things to note:
+
+- [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) is used to expose the GPU PCI devices to the workloads running on the cluster.
+- The OVA templates used for cluster creation should have the VMX version (Virtual Hardware) set to 17 or higher. This is necessary because Dynamic DirectPath I/O was introduced in this version, which enables the Assignable Hardware intelligence for passthrough devices.
+- Since we need the VMX version to be >=17, this way of provisioning clusters with PCI passthrough devices works for vSphere 7.0 and above. This is the ESXi/VMX version [compatibility list](https://kb.vmware.com/s/article/2007240).
+- UEFI boot mode is recommended for the OVAs used for cluster creation.
+- Most of the setup is similar to [GPU enabled clusters via PCI Passthrough](https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/blob/main/docs/gpu-pci.md#create-the-cluster).
+
+## An example GPU enabled cluster
+
+Let's create a CAPV cluster with vGPU enabled nodes.
+
+### Prerequisites
+
+- Refer to the [NVIDIA Virtual GPU Software Quick Start Guide](https://docs.nvidia.com/grid/latest/grid-software-quick-start-guide/index.html) to download and install the vGPU software and configure vGPU licensing.
+
+- Ensure vGPU compatibility for your vSphere installation and the GPU devices using the [VMware Compatibility Guide - Shared Pass-through Graphics](https://www.vmware.com/resources/compatibility/search.php?deviceCategory=vgpu)
+
+- Enable Shared Passthrough for the GPU device on the ESXi Host
+  - Browse to a host in the vSphere Client navigator.
+  - On the **Configure** tab, expand **Hardware** and click **Graphics**.
+  - Under **GRAPHICS DEVICES**, select the GPU device to be used for vGPU, click **EDIT...** and select **Shared Direct**. Repeat this for additional GPU devices as needed.
+  - Select **HOST GRAPHICS**, click **EDIT...** and select **Shared Direct** and select a shared passthrough GPU assignment policy, for example **Group VMs on GPU until full (GPU consolidation)**.
+
+- Build an OVA template
+  We can build a custom OVA template using the [image-builder](https://github.com/kubernetes-sigs/image-builder) project. We will build an Ubuntu 20.04 OVA with UEFI boot mode. More documentation on how to use image-builder can be found in the [image-builder book](https://image-builder.sigs.k8s.io/capi/providers/vsphere.html).
+  - Clone the repo locally and go to the `./images/capi/` directory.
+  - Create a `packer-vars.json` file with the following content.
+
+    ```shell
+    $ cat packer-vars.json
+    {
+        "vmx_version": 17
+    }
+    ```
+
+  - Run the make target for the Ubuntu 20.04 UEFI OVA as follows:
+
+    ```shell
+    > PACKER_VAR_FILES=packer-vars.json make build-node-ova-vsphere-ubuntu-2004-efi
+    ```
+
+### Source the vGPU profile(s) for the GPU device
+
+See "2. Choosing the vGPU Profile for the Virtual Machine" at [Using GPUs with Virtual Machines on vSphere](https://blogs.vmware.com/apps/2018/09/using-gpus-with-virtual-machines-on-vsphere-part-3-installing-the-nvidia-grid-technology.html) to see what vGPU profiles are available for your GPU device.
+
+We are using NVIDIA Tesla V100 32GB cards for this example and will use the `grid_v100d-4c` vGPU profile for this card that allocates 4GB GPU memory to the worker node's vGPU device. 
+
+### Create the cluster template
+
+```shell
+$ make dev-flavors
+/Applications/Xcode.app/Contents/Developer/usr/bin/make generate-flavors FLAVOR_DIR=/Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0
+go run ./packaging/flavorgen --output-dir /Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0
+```
+
+Edit the generated Cluster template (`cluster-template.yaml`) to set the values for the `vgpuDevices` array. Here we are editing the VSphereMachineTemplate object for the worker nodes. This will create a worker node with a single NVIDIA vGPU device, backed by 4GB of GPU memory, attached to the VM.
+
+```yaml
+---
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: VSphereMachineTemplate
+metadata:
+  name: ${CLUSTER_NAME}-worker
+  namespace: '${NAMESPACE}'
+spec:
+  template:
+    spec:
+      cloneMode: linkedClone
+      datacenter: '${VSPHERE_DATACENTER}'
+      datastore: '${VSPHERE_DATASTORE}'
+      diskGiB: 25
+      folder: '${VSPHERE_FOLDER}'
+      memoryMiB: 8192
+      network:
+        devices:
+        - dhcp4: true
+          networkName: '${VSPHERE_NETWORK}'
+      numCPUs: 2
+      os: Linux
+      powerOffMode: trySoft
+      resourcePool: '${VSPHERE_RESOURCE_POOL}'
+      server: '${VSPHERE_SERVER}'
+      storagePolicyName: '${VSPHERE_STORAGE_POLICY}'
+      template: '${VSPHERE_TEMPLATE}'
+      thumbprint: '${VSPHERE_TLS_THUMBPRINT}'
+      vgpuDevices:
+        - profileName: "grid_v100d-4c"    # <== vGPU profile chosen in the previous section
+```
+
+Set the required values for the other fields and the cluster template is ready for use. Similar changes can be made to a template generated using the `clusterctl generate cluster` command as well.
+
+### Create the cluster
+
+Set the size of the GPU nodes appropriately, since the Nvidia gpu-operator requires additional CPU and memory to install the device drivers on the VMs.
+
+Note: For GPU nodes (PCI Passthrough or vGPU), all memory of the nodes must be reserved. CAPV will automatically do this for nodes that have a PCI Passthrough GPU or a vGPU device in the spec. See "Memory Reservation" at [Using GPUs with Virtual Machines on vSphere](https://blogs.vmware.com/apps/2018/09/using-gpus-with-virtual-machines-on-vsphere-part-2-vmdirectpath-i-o.html)
+
+Apply the manifest from the previous step to your management cluster to have CAPV create a workload cluster with worker nodes that have vGPUs.
+
+From this point on, the setup is exactly the same as [GPU enabled clusters via PCI Passthrough](https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/blob/main/docs/gpu-pci.md#create-the-cluster). 
diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go
index e216790081..e263ca3412 100644
--- a/pkg/services/govmomi/vcenter/clone.go
+++ b/pkg/services/govmomi/vcenter/clone.go
@@ -68,9 +68,9 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by
 			extraConfig.SetIgnitionUserData(bootstrapData)
 		}
 	}
-	if vmCtx.VSphereVM.Spec.CustomVMXKeys != nil {
-		log.Info("Applied custom vmx keys o VM clone spec")
-		if err := extraConfig.SetCustomVMXKeys(vmCtx.VSphereVM.Spec.CustomVMXKeys); err != nil {
+	if ctx.VSphereVM.Spec.CustomVMXKeys != nil {
+		ctx.Logger.Info("applied custom vmx keys to VM clone spec")
+		if err := extraConfig.SetCustomVMXKeys(ctx.VSphereVM.Spec.CustomVMXKeys); err != nil {
 			return err
 		}
 	}
@@ -152,8 +152,16 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by
 
 	deviceSpecs = append(deviceSpecs, networkSpecs...)
 
-	if err != nil {
-		return errors.Wrapf(err, "error getting network specs for %q", ctx)
+	if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices) != 0 {
+		gpuSpecs := getGpuSpecs(ctx)
+		ctx.Logger.V(4).Info("created gpu devices", "gpu-device-specs", gpuSpecs)
+		deviceSpecs = append(deviceSpecs, gpuSpecs...)
+	}
+
+	if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices) != 0 {
+		vgpuSpecs := getVgpuSpecs(ctx)
+		ctx.Logger.V(4).Info("created vgpu devices", "vgpu-device-specs", vgpuSpecs)
+		deviceSpecs = append(deviceSpecs, vgpuSpecs...)
 	}
 
 	numCPUs := vmCtx.VSphereVM.Spec.NumCPUs
@@ -200,7 +208,7 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by
 		Snapshot: snapshotRef,
 	}
 
-	// For PCI devices, the memory for the VM needs to be reserved
+	// For PCI and vGPU devices, the memory for the VM needs to be reserved
 	// We can replace this once we have another way of reserving memory option
 	// exposed via the API types.
 	if len(vmCtx.VSphereVM.Spec.PciDevices) > 0 {
@@ -453,3 +461,55 @@ func getNetworkSpecs(ctx context.Context, vmCtx *capvcontext.VMContext, devices
 
 	return deviceSpecs, nil
 }
+
+func createPCIPassThroughDevice(deviceKey int32, backingInfo types.BaseVirtualDeviceBackingInfo) types.BaseVirtualDevice {
+	device := &types.VirtualPCIPassthrough{
+		VirtualDevice: types.VirtualDevice{
+			Key:     deviceKey,
+			Backing: backingInfo,
+		},
+	}
+	return device
+}
+
+func getGpuSpecs(ctx *context.VMContext) []types.BaseVirtualDeviceConfigSpec {
+	deviceSpecs := []types.BaseVirtualDeviceConfigSpec{}
+	deviceKey := int32(-200)
+
+	for _, pciDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices {
+		backingInfo := &types.VirtualPCIPassthroughDynamicBackingInfo{
+			AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{
+				{
+					VendorId: *pciDevice.VendorID,
+					DeviceId: *pciDevice.DeviceID,
+				},
+			},
+		}
+		dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo)
+		deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{
+			Device:    dynamicDirectPathDevice,
+			Operation: types.VirtualDeviceConfigSpecOperationAdd,
+		})
+		deviceKey--
+	}
+	return deviceSpecs
+}
+
+func getVgpuSpecs(ctx *context.VMContext) []types.BaseVirtualDeviceConfigSpec {
+	deviceSpecs := []types.BaseVirtualDeviceConfigSpec{}
+	deviceKey := int32(-200)
+
+	for _, vGPUDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices {
+		backingInfo := &types.VirtualPCIPassthroughVmiopBackingInfo{
+			Vgpu: vGPUDevice.ProfileName,
+		}
+		dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo)
+		deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{
+			Device:    dynamicDirectPathDevice,
+			Operation: types.VirtualDeviceConfigSpecOperationAdd,
+		})
+		ctx.Logger.V(4).Info("created vGPU device", "vgpu-profile", vGPUDevice.ProfileName)
+		deviceKey--
+	}
+	return deviceSpecs
+}
diff --git a/test/e2e/config/vsphere-dev.yaml b/test/e2e/config/vsphere-dev.yaml
new file mode 100644
index 0000000000..d5a4df0026
--- /dev/null
+++ b/test/e2e/config/vsphere-dev.yaml
@@ -0,0 +1,164 @@
+---
+# E2E test scenario using local dev images and manifests built from the source tree for following providers:
+# - cluster-api
+# - bootstrap kubeadm
+# - control-plane kubeadm
+# - vsphere
+
+# For creating local dev images built from the source tree;
+# - from the CAPI repository root, `make docker-build REGISTRY=gcr.io/k8s-staging-cluster-api` to build the cluster-api,
+#  bootstrap kubeadm, control-plane kubeadm provider images. This step can be skipped to use upstream images.
+# - from the CAPV repository root, `make e2e` to build the vsphere provider image and run e2e tests.
+
+images:
+  - name: registry.k8s.io/cluster-api/cluster-api-controller:v1.5.0
+    loadBehavior: tryLoad
+  - name: registry.k8s.io/cluster-api/kubeadm-bootstrap-controller:v1.5.0
+    loadBehavior: tryLoad
+  - name: registry.k8s.io/cluster-api/kubeadm-control-plane-controller:v1.5.0
+    loadBehavior: tryLoad
+  - name: gcr.io/k8s-staging-cluster-api/capv-manager:e2e
+    loadBehavior: mustLoad
+  - name: quay.io/jetstack/cert-manager-cainjector:v1.12.2
+    loadBehavior: tryLoad
+  - name: quay.io/jetstack/cert-manager-webhook:v1.12.2
+    loadBehavior: tryLoad
+  - name: quay.io/jetstack/cert-manager-controller:v1.12.2
+    loadBehavior: tryLoad
+
+providers:
+
+  - name: cluster-api
+    type: CoreProvider
+    versions:
+      - name: v1.5.0
+        # Use manifest from source files
+        value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/core-components.yaml"
+        type: "url"
+        contract: v1beta1
+        files:
+          - sourcePath: "../data/shared/main/v1beta1/metadata.yaml"
+        replacements:
+          - old: "imagePullPolicy: Always"
+            new: "imagePullPolicy: IfNotPresent"
+
+  - name: kubeadm
+    type: BootstrapProvider
+    versions:
+      - name: v1.5.0
+        # Use manifest from source files
+        value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/bootstrap-components.yaml"
+        type: "url"
+        contract: v1beta1
+        files:
+          - sourcePath: "../data/shared/main/v1beta1/metadata.yaml"
+        replacements:
+          - old: "imagePullPolicy: Always"
+            new: "imagePullPolicy: IfNotPresent"
+
+  - name: kubeadm
+    type: ControlPlaneProvider
+    versions:
+      - name: v1.5.0
+        # Use manifest from source files
+        value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/control-plane-components.yaml"
+        type: "url"
+        contract: v1beta1
+        files:
+          - sourcePath: "../data/shared/main/v1beta1/metadata.yaml"
+        replacements:
+          - old: "imagePullPolicy: Always"
+            new: "imagePullPolicy: IfNotPresent"
+
+  - name: vsphere
+    type: InfrastructureProvider
+    versions:
+      - name: v1.9.99
+        # Use manifest from source files
+        value: ../../../../cluster-api-provider-vsphere/config/default
+        contract: v1beta1
+        replacements:
+          - old: gcr.io/cluster-api-provider-vsphere/release/manager:latest
+            new: gcr.io/k8s-staging-cluster-api/capv-manager:e2e
+          - old: "imagePullPolicy: Always"
+            new: "imagePullPolicy: IfNotPresent"
+        files:
+          # Add a cluster template
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-conformance.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-hw-upgrade.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-kcp-remediation.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-md-remediation.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-node-drain.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-pci.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-remote-management.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-storage-policy.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-topology.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-dhcp-overrides.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/clusterclass-quick-start.yaml"
+          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-ignition.yaml"
+          - sourcePath: "../data/shared/main/v1beta1_provider/metadata.yaml"
+
+variables:
+  KUBERNETES_VERSION: "v1.28.0"
+  CPI_IMAGE_K8S_VERSION: "v1.27.0"
+  CNI: "./data/cni/calico/calico.yaml"
+  EXP_CLUSTER_RESOURCE_SET: "true"
+  EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION: "true"
+  CONTROL_PLANE_MACHINE_COUNT: 1
+  WORKER_MACHINE_COUNT: 1
+  IP_FAMILY: "IPv4"
+  CLUSTER_CLASS_NAME: "quick-start"
+  # Following CAPV variables should be set before testing
+  VSPHERE_SERVER: "vcenter.vmware.com"
+  VSPHERE_TLS_THUMBPRINT: "AA:BB:CC:DD:11:22:33:44:EE:FF"
+  VSPHERE_DATACENTER: "SDDC-Datacenter"
+  VSPHERE_FOLDER: "FolderName"
+  VSPHERE_RESOURCE_POOL: "ResourcePool"
+  VSPHERE_DATASTORE: "WorkloadDatastore"
+  VSPHERE_STORAGE_POLICY: "Cluster API vSphere Storage Policy"
+  VSPHERE_NETWORK: "network-1"
+  VSPHERE_TEMPLATE: "ubuntu-2204-kube-v1.28.0"
+  FLATCAR_VSPHERE_TEMPLATE: "flatcar-stable-3510.2.6-kube-v1.28.0"
+  # WORKLOAD_CONTROL_PLANE_ENDPOINT_IP:
+  # Also following variables are required but it is recommended to use env variables to avoid disclosure of sensitive data
+  # VSPHERE_SSH_AUTHORIZED_KEY:
+  # VSPHERE_PASSWORD:
+  # VSPHERE_USERNAME:
+  # Dedicated IP to be used by kube-vip
+  # CONTROL_PLANE_ENDPOINT_IP:
+  # Sets the insecure-flag for vsphere-csi-controller config
+  VSPHERE_INSECURE_CSI: "true"
+  KUBETEST_CONFIGURATION: "./data/kubetest/conformance-fast.yaml"
+  NODE_DRAIN_TIMEOUT: "60s"
+  CLUSTER_TOPOLOGY: "true"
+  # These IDs correspond to Tesla T4s, they are the decimal representation of the hex values.
+  DEVICE_ID: 7864
+  VENDOR_ID: 4318
+  PROFILE_NAME: grid_v100d-4c
+  # CAPV feature flags
+  EXP_NODE_ANTI_AFFINITY: "true"
+  # The following CAPV variables are used by multivc_test.go. They describe the second vSphere and should be set if the multivc test is enabled.
+  VSPHERE2_SERVER: "vcenter2.vmware.com"
+  VSPHERE2_TLS_THUMBPRINT: "AA:BB:CC:DD:11:22:33:44:EE:FF"
+  VSPHERE2_RESOURCE_POOL: "ResourcePool"
+  VSPHERE2_TEMPLATE: "ubuntu-2004-kube-v1.27.3"
+  # Dedicated IP to be used by kube-vip
+  VSPHERE2_CONTROL_PLANE_ENDPOINT_IP:
+  # Following variables are also required and please use env variables to avoid disclosure of sensitive data
+  VSPHERE2_USERNAME:
+  VSPHERE2_PASSWORD:
+
+
+intervals:
+  default/wait-controllers: ["5m", "10s"]
+  default/wait-cluster: ["5m", "10s"]
+  default/wait-control-plane: ["20m", "10s"]
+  default/wait-worker-nodes: ["20m", "10s"]
+  default/wait-delete-cluster: ["5m", "10s"]
+  default/wait-machine-upgrade: ["15m", "1m"]
+  default/wait-machine-remediation: ["15m", "10s"]
+  mhc-remediation/mhc-remediation: ["30m", "10s"]
+  node-drain/wait-deployment-available: ["3m", "10s"]
+  node-drain/wait-machine-deleted: ["2m", "10s"]
+  anti-affinity/wait-vm-redistribution: ["5m", "10s"]
diff --git a/test/e2e/config/vsphere.yaml b/test/e2e/config/vsphere.yaml
index 2aeb087684..d0f768d305 100644
--- a/test/e2e/config/vsphere.yaml
+++ b/test/e2e/config/vsphere.yaml
@@ -279,6 +279,7 @@ variables:
   # These IDs correspond to Tesla T4s, they are the decimal representation of the hex values.
   DEVICE_ID: 7864
   VENDOR_ID: 4318
+  PROFILE_NAME: grid_v100d-4c
   # CAPV feature flags
   EXP_NODE_ANTI_AFFINITY: "true"
   CAPI_DIAGNOSTICS_ADDRESS: ":8080"
diff --git a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml
new file mode 100644
index 0000000000..75b395b27b
--- /dev/null
+++ b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml
@@ -0,0 +1,6 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ../base
+patchesStrategicMerge:
+  - vgpu-device-template.yaml
diff --git a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml
new file mode 100644
index 0000000000..4404df5f3f
--- /dev/null
+++ b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml
@@ -0,0 +1,11 @@
+---
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: VSphereMachineTemplate
+metadata:
+  name: ${CLUSTER_NAME}-worker
+  namespace: ${NAMESPACE}
+spec:
+  template:
+    spec:
+      vgpuDevices:
+        - profileName: ${PROFILE_NAME}
\ No newline at end of file

From c5d2a78fcff6991d7e76ec4d61169081d7370884 Mon Sep 17 00:00:00 2001
From: Puneet Katyal <1063570+puneetkatyal@users.noreply.github.com>
Date: Tue, 29 Aug 2023 13:43:42 -0700
Subject: [PATCH 02/21] Update pkg/services/govmomi/vcenter/clone.go

Co-authored-by: Christian Schlotter <chrischdi@users.noreply.github.com>
---
 pkg/services/govmomi/vcenter/clone.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go
index e263ca3412..04151157a4 100644
--- a/pkg/services/govmomi/vcenter/clone.go
+++ b/pkg/services/govmomi/vcenter/clone.go
@@ -463,13 +463,12 @@ func getNetworkSpecs(ctx context.Context, vmCtx *capvcontext.VMContext, devices
 }
 
 func createPCIPassThroughDevice(deviceKey int32, backingInfo types.BaseVirtualDeviceBackingInfo) types.BaseVirtualDevice {
-	device := &types.VirtualPCIPassthrough{
+	return &types.VirtualPCIPassthrough{
 		VirtualDevice: types.VirtualDevice{
 			Key:     deviceKey,
 			Backing: backingInfo,
 		},
 	}
-	return device
 }
 
 func getGpuSpecs(ctx *context.VMContext) []types.BaseVirtualDeviceConfigSpec {

From b85ad404260b41477c0d86d2711897136cc5fb18 Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Fri, 23 Feb 2024 11:01:05 +0100
Subject: [PATCH 03/21] Handle adding vGPU in reconcilePCIDevices instead of in
 Clone

---
 pkg/services/govmomi/pci/vgpu.go      | 81 +++++++++++++++++++++++++++
 pkg/services/govmomi/service.go       | 34 +++++++++++
 pkg/services/govmomi/vcenter/clone.go | 69 +----------------------
 3 files changed, 118 insertions(+), 66 deletions(-)
 create mode 100644 pkg/services/govmomi/pci/vgpu.go

diff --git a/pkg/services/govmomi/pci/vgpu.go b/pkg/services/govmomi/pci/vgpu.go
new file mode 100644
index 0000000000..e4053b54ff
--- /dev/null
+++ b/pkg/services/govmomi/pci/vgpu.go
@@ -0,0 +1,81 @@
+/*
+Copyright 2023 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package pci
+
+import (
+	"context"
+
+	"github.com/vmware/govmomi/object"
+	"github.com/vmware/govmomi/vim25/types"
+
+	infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1"
+)
+
+// CalculateVGPUsToBeAdded calculates the vGPU devices which should be added to the VM.
+func CalculateVGPUsToBeAdded(ctx context.Context, vm *object.VirtualMachine, deviceSpecs []infrav1.VGPUSpec) ([]infrav1.VGPUSpec, error) {
+	// store the number of expected devices for each deviceID + vendorID combo
+	deviceVendorIDComboMap := map[string]int{}
+	for _, spec := range deviceSpecs {
+		key := spec.ProfileName
+		if _, ok := deviceVendorIDComboMap[key]; !ok {
+			deviceVendorIDComboMap[key] = 1
+		} else {
+			deviceVendorIDComboMap[key]++
+		}
+	}
+
+	devices, err := vm.Device(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	specsToBeAdded := []infrav1.VGPUSpec{}
+	for _, spec := range deviceSpecs {
+		key := spec.ProfileName
+		pciDeviceList := devices.SelectByBackingInfo(createBackingInfoVGPU(spec))
+		expectedDeviceLen := deviceVendorIDComboMap[key]
+		if expectedDeviceLen-len(pciDeviceList) > 0 {
+			specsToBeAdded = append(specsToBeAdded, spec)
+			deviceVendorIDComboMap[key]--
+		}
+	}
+	return specsToBeAdded, nil
+}
+
+// ConstructDeviceSpecsVGPU transforms a list of VGPUSpec into a list of BaseVirutalDevices used by govmomi.
+func ConstructDeviceSpecsVGPU(vGPUDeviceSpecs []infrav1.VGPUSpec) []types.BaseVirtualDevice {
+	vGPUDevices := []types.BaseVirtualDevice{}
+	deviceKey := int32(-200)
+
+	for _, pciDevice := range vGPUDeviceSpecs {
+		backingInfo := createBackingInfoVGPU(pciDevice)
+		vGPUDevices = append(vGPUDevices, &types.VirtualPCIPassthrough{
+			VirtualDevice: types.VirtualDevice{
+				Key:     deviceKey,
+				Backing: backingInfo,
+			},
+		})
+		deviceKey--
+	}
+	return vGPUDevices
+}
+
+func createBackingInfoVGPU(spec infrav1.VGPUSpec) *types.VirtualPCIPassthroughVmiopBackingInfo {
+	return &types.VirtualPCIPassthroughVmiopBackingInfo{
+		Vgpu: spec.ProfileName,
+	}
+}
diff --git a/pkg/services/govmomi/service.go b/pkg/services/govmomi/service.go
index 92256bf2a5..e53358bbc0 100644
--- a/pkg/services/govmomi/service.go
+++ b/pkg/services/govmomi/service.go
@@ -538,6 +538,40 @@ func (vms *VMService) reconcilePCIDevices(ctx context.Context, virtualMachineCtx
 			return errors.Wrapf(err, "error adding pci devices for %q", ctx)
 		}
 	}
+	if expectedVGPUs := virtualMachineCtx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices; len(expectedVGPUs) != 0 {
+		specsToBeAdded, err := pci.CalculateVGPUsToBeAdded(ctx, virtualMachineCtx.Obj, expectedVGPUs)
+		if err != nil {
+			return err
+		}
+
+		if len(specsToBeAdded) == 0 {
+			if conditions.Has(virtualMachineCtx.VSphereVM, infrav1.PCIDevicesDetachedCondition) {
+				conditions.Delete(virtualMachineCtx.VSphereVM, infrav1.PCIDevicesDetachedCondition)
+			}
+			log.V(5).Info("No new PCI devices to be added")
+			return nil
+		}
+
+		powerState, err := virtualMachineCtx.Obj.PowerState(ctx)
+		if err != nil {
+			return err
+		}
+		if powerState == types.VirtualMachinePowerStatePoweredOn {
+			// This would arise only when the PCI device is manually removed from
+			// the VM post creation.
+			log.Info("vGPU device cannot be attached in powered on state")
+			conditions.MarkFalse(virtualMachineCtx.VSphereVM,
+				infrav1.PCIDevicesDetachedCondition,
+				infrav1.NotFoundReason,
+				clusterv1.ConditionSeverityWarning,
+				"vGPU devices removed after VM was powered on")
+			return errors.Errorf("missing vGPU devices")
+		}
+		log.Info("vGPU devices to be added", "number", len(specsToBeAdded))
+		if err := virtualMachineCtx.Obj.AddDevice(ctx, pci.ConstructDeviceSpecsVGPU(specsToBeAdded)...); err != nil {
+			return errors.Wrapf(err, "error adding vGPU devices for %q", ctx)
+		}
+	}
 	return nil
 }
 
diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go
index 04151157a4..87a2591cd0 100644
--- a/pkg/services/govmomi/vcenter/clone.go
+++ b/pkg/services/govmomi/vcenter/clone.go
@@ -68,9 +68,9 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by
 			extraConfig.SetIgnitionUserData(bootstrapData)
 		}
 	}
-	if ctx.VSphereVM.Spec.CustomVMXKeys != nil {
-		ctx.Logger.Info("applied custom vmx keys to VM clone spec")
-		if err := extraConfig.SetCustomVMXKeys(ctx.VSphereVM.Spec.CustomVMXKeys); err != nil {
+	if vmCtx.VSphereVM.Spec.CustomVMXKeys != nil {
+		log.Info("applied custom vmx keys to VM clone spec")
+		if err := extraConfig.SetCustomVMXKeys(vmCtx.VSphereVM.Spec.CustomVMXKeys); err != nil {
 			return err
 		}
 	}
@@ -152,18 +152,6 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by
 
 	deviceSpecs = append(deviceSpecs, networkSpecs...)
 
-	if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices) != 0 {
-		gpuSpecs := getGpuSpecs(ctx)
-		ctx.Logger.V(4).Info("created gpu devices", "gpu-device-specs", gpuSpecs)
-		deviceSpecs = append(deviceSpecs, gpuSpecs...)
-	}
-
-	if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices) != 0 {
-		vgpuSpecs := getVgpuSpecs(ctx)
-		ctx.Logger.V(4).Info("created vgpu devices", "vgpu-device-specs", vgpuSpecs)
-		deviceSpecs = append(deviceSpecs, vgpuSpecs...)
-	}
-
 	numCPUs := vmCtx.VSphereVM.Spec.NumCPUs
 	if numCPUs < 2 {
 		numCPUs = 2
@@ -461,54 +449,3 @@ func getNetworkSpecs(ctx context.Context, vmCtx *capvcontext.VMContext, devices
 
 	return deviceSpecs, nil
 }
-
-func createPCIPassThroughDevice(deviceKey int32, backingInfo types.BaseVirtualDeviceBackingInfo) types.BaseVirtualDevice {
-	return &types.VirtualPCIPassthrough{
-		VirtualDevice: types.VirtualDevice{
-			Key:     deviceKey,
-			Backing: backingInfo,
-		},
-	}
-}
-
-func getGpuSpecs(ctx *context.VMContext) []types.BaseVirtualDeviceConfigSpec {
-	deviceSpecs := []types.BaseVirtualDeviceConfigSpec{}
-	deviceKey := int32(-200)
-
-	for _, pciDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices {
-		backingInfo := &types.VirtualPCIPassthroughDynamicBackingInfo{
-			AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{
-				{
-					VendorId: *pciDevice.VendorID,
-					DeviceId: *pciDevice.DeviceID,
-				},
-			},
-		}
-		dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo)
-		deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{
-			Device:    dynamicDirectPathDevice,
-			Operation: types.VirtualDeviceConfigSpecOperationAdd,
-		})
-		deviceKey--
-	}
-	return deviceSpecs
-}
-
-func getVgpuSpecs(ctx *context.VMContext) []types.BaseVirtualDeviceConfigSpec {
-	deviceSpecs := []types.BaseVirtualDeviceConfigSpec{}
-	deviceKey := int32(-200)
-
-	for _, vGPUDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices {
-		backingInfo := &types.VirtualPCIPassthroughVmiopBackingInfo{
-			Vgpu: vGPUDevice.ProfileName,
-		}
-		dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo)
-		deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{
-			Device:    dynamicDirectPathDevice,
-			Operation: types.VirtualDeviceConfigSpecOperationAdd,
-		})
-		ctx.Logger.V(4).Info("created vGPU device", "vgpu-profile", vGPUDevice.ProfileName)
-		deviceKey--
-	}
-	return deviceSpecs
-}

From 47d7d72e8d55db166b29370b2977b4033884b8bc Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Fri, 23 Feb 2024 17:00:42 +0100
Subject: [PATCH 04/21] Duplicate PCI device test for vGPU test

---
 docs/gpu-vgpu.md                      |   3 +-
 pkg/services/govmomi/pci/vgpu_test.go | 156 ++++++++++++++++++++++++++
 pkg/services/govmomi/vcenter/clone.go |   2 +-
 3 files changed, 158 insertions(+), 3 deletions(-)
 create mode 100644 pkg/services/govmomi/pci/vgpu_test.go

diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md
index 3c5f546afb..4b1dc11d4c 100644
--- a/docs/gpu-vgpu.md
+++ b/docs/gpu-vgpu.md
@@ -56,7 +56,6 @@ We are using NVIDIA Tesla V100 32GB cards for this example and will use the `gri
 
 ```shell
 $ make dev-flavors
-/Applications/Xcode.app/Contents/Developer/usr/bin/make generate-flavors FLAVOR_DIR=/Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0
 go run ./packaging/flavorgen --output-dir /Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0
 ```
 
@@ -91,7 +90,7 @@ spec:
       template: '${VSPHERE_TEMPLATE}'
       thumbprint: '${VSPHERE_TLS_THUMBPRINT}'
       vgpuDevices:
-        - profileName: "grid_v100d-4c"    <============ value from above
+        - profileName: "grid_v100d-4c"  # value from above
 ```
 
 Set the required values for the other fields and the cluster template is ready for use. The similar changes can be made to a template generated using clusterctl generate cluster command as well.
diff --git a/pkg/services/govmomi/pci/vgpu_test.go b/pkg/services/govmomi/pci/vgpu_test.go
new file mode 100644
index 0000000000..5b4a572040
--- /dev/null
+++ b/pkg/services/govmomi/pci/vgpu_test.go
@@ -0,0 +1,156 @@
+/*
+Copyright 2023 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package pci
+
+import (
+	"context"
+	"testing"
+
+	"github.com/onsi/gomega"
+	"github.com/vmware/govmomi/find"
+	"github.com/vmware/govmomi/simulator"
+	"github.com/vmware/govmomi/vim25"
+
+	infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1"
+)
+
+func Test_CalculateVGPUsToBeAdded(t *testing.T) {
+	type input struct {
+		name                      string
+		expectedLen               int
+		existingDeviceSpecIndexes []int
+		vGPUDeviceSpecs           []infrav1.VGPUSpec
+		assertFunc                func(g *gomega.WithT, actual []infrav1.VGPUSpec)
+	}
+
+	testFunc := func(t *testing.T, i input) {
+		t.Helper()
+		t.Run(i.name, func(t *testing.T) {
+			g := gomega.NewWithT(t)
+			simulator.Run(func(ctx context.Context, client *vim25.Client) error {
+				finder := find.NewFinder(client)
+				vm, err := finder.VirtualMachine(ctx, "DC0_H0_VM0")
+				if err != nil {
+					return err
+				}
+
+				if len(i.existingDeviceSpecIndexes) > 0 {
+					existingDevices := []infrav1.VGPUSpec{}
+					for _, idx := range i.existingDeviceSpecIndexes {
+						existingDevices = append(existingDevices, i.vGPUDeviceSpecs[idx])
+					}
+					g.Expect(vm.AddDevice(ctx,
+						ConstructDeviceSpecsVGPU(existingDevices)...)).ToNot(gomega.HaveOccurred())
+				}
+				toBeAdded, err := CalculateVGPUsToBeAdded(ctx, vm, i.vGPUDeviceSpecs)
+				g.Expect(err).ToNot(gomega.HaveOccurred())
+				g.Expect(toBeAdded).To(gomega.HaveLen(i.expectedLen))
+				if i.assertFunc != nil {
+					i.assertFunc(g, toBeAdded)
+				}
+				return nil
+			})
+		})
+	}
+
+	t.Run("when no vGPU devices exist on the VM", func(t *testing.T) {
+		inputs := []input{
+			{
+				name:        "when adding a single vGPU device of each type",
+				expectedLen: 2,
+				vGPUDeviceSpecs: []infrav1.VGPUSpec{
+					{ProfileName: "1234"}, {ProfileName: "4321"},
+				},
+				assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) {
+					g.Expect(actual[0].ProfileName).To(gomega.Equal("1234"))
+					g.Expect(actual[1].ProfileName).To(gomega.Equal("4321"))
+				},
+			},
+			{
+				name:        "when adding multiple vGPU devices of a type",
+				expectedLen: 2,
+				vGPUDeviceSpecs: []infrav1.VGPUSpec{
+					{ProfileName: "1234"}, {ProfileName: "1234"},
+				},
+				assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) {
+					g.Expect(actual[0].ProfileName).To(gomega.Equal("1234"))
+					g.Expect(actual[1].ProfileName).To(gomega.Equal("1234"))
+				},
+			},
+		}
+		for _, tt := range inputs {
+			testFunc(t, tt)
+		}
+	})
+
+	t.Run("when all vGPU devices exist on the VM", func(t *testing.T) {
+		inputs := []input{
+			{
+				name:        "when adding a single vGPU device of each type",
+				expectedLen: 0,
+				vGPUDeviceSpecs: []infrav1.VGPUSpec{
+					{ProfileName: "1234"}, {ProfileName: "4321"},
+				},
+				existingDeviceSpecIndexes: []int{0, 1},
+			},
+			{
+				name:        "when adding multiple vGPU devices of a type",
+				expectedLen: 0,
+				vGPUDeviceSpecs: []infrav1.VGPUSpec{
+					{ProfileName: "1234"}, {ProfileName: "1234"},
+				},
+				existingDeviceSpecIndexes: []int{0, 1},
+			},
+		}
+		for _, tt := range inputs {
+			testFunc(t, tt)
+		}
+	})
+
+	t.Run("when some vGPU devices exist on the VM", func(t *testing.T) {
+		inputs := []input{
+			{
+				name:        "when adding a single vGPU device of each type",
+				expectedLen: 1,
+				vGPUDeviceSpecs: []infrav1.VGPUSpec{
+					{ProfileName: "1234"}, {ProfileName: "4321"},
+				},
+				existingDeviceSpecIndexes: []int{0},
+				assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) {
+					g.Expect(actual[0].ProfileName).To(gomega.Equal("4321"))
+				},
+			},
+			{
+				name:        "when adding multiple vGPU devices of a type",
+				expectedLen: 2,
+				vGPUDeviceSpecs: []infrav1.VGPUSpec{
+					{ProfileName: "1234"},
+					{ProfileName: "1234"},
+					{ProfileName: "4321"},
+				},
+				existingDeviceSpecIndexes: []int{0},
+				assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) {
+					g.Expect(actual[0].ProfileName).To(gomega.Equal("1234"))
+					g.Expect(actual[1].ProfileName).To(gomega.Equal("4321"))
+				},
+			},
+		}
+		for _, tt := range inputs {
+			testFunc(t, tt)
+		}
+	})
+}
diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go
index 87a2591cd0..b3b0f1af29 100644
--- a/pkg/services/govmomi/vcenter/clone.go
+++ b/pkg/services/govmomi/vcenter/clone.go
@@ -199,7 +199,7 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by
 	// For PCI and vGPU devices, the memory for the VM needs to be reserved
 	// We can replace this once we have another way of reserving memory option
 	// exposed via the API types.
-	if len(vmCtx.VSphereVM.Spec.PciDevices) > 0 {
+	if len(vmCtx.VSphereVM.Spec.PciDevices) > 0 || len(vmCtx.VSphereVM.Spec.VGPUDevices) > 0 {
 		spec.Config.MemoryReservationLockedToMax = ptr.To(true)
 	}
 

From 27933fad699bfa6a8c30cb16c25230c57419019c Mon Sep 17 00:00:00 2001
From: Birk Lewin <89076383+birksl@users.noreply.github.com>
Date: Thu, 30 May 2024 13:17:20 +0200
Subject: [PATCH 05/21] Update docs/gpu-vgpu.md

Co-authored-by: Christian Schlotter <chrischdi@users.noreply.github.com>
---
 docs/gpu-vgpu.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md
index 4b1dc11d4c..af6901d049 100644
--- a/docs/gpu-vgpu.md
+++ b/docs/gpu-vgpu.md
@@ -56,7 +56,7 @@ We are using NVIDIA Tesla V100 32GB cards for this example and will use the `gri
 
 ```shell
 $ make dev-flavors
-go run ./packaging/flavorgen --output-dir /Users/pkatyal/.cluster-api/overrides/infrastructure-vsphere/v0.0.0
+go run ./packaging/flavorgen --output-dir /home/user/.cluster-api/overrides/infrastructure-vsphere/v0.0.0
 ```
 
 Edit the generated Cluster template (`cluster-template.yaml`) to set the values for the `vgpuDevices` array. Here we are editing the VSphereMachineTemplate object for the worker nodes. This will create a worker node with a single NVIDIA 16GB vGPU device attached to the VM.

From 9f16da1beacd4b8ca2bbf8921b3c389e63c05c64 Mon Sep 17 00:00:00 2001
From: Birk Lewin <89076383+birksl@users.noreply.github.com>
Date: Thu, 30 May 2024 13:17:32 +0200
Subject: [PATCH 06/21] Update pkg/services/govmomi/vcenter/clone.go

Co-authored-by: Christian Schlotter <chrischdi@users.noreply.github.com>
---
 pkg/services/govmomi/vcenter/clone.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go
index b3b0f1af29..61126d7e43 100644
--- a/pkg/services/govmomi/vcenter/clone.go
+++ b/pkg/services/govmomi/vcenter/clone.go
@@ -69,7 +69,7 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by
 		}
 	}
 	if vmCtx.VSphereVM.Spec.CustomVMXKeys != nil {
-		log.Info("applied custom vmx keys to VM clone spec")
+		log.Info("Applied custom vmx keys to VM clone spec")
 		if err := extraConfig.SetCustomVMXKeys(vmCtx.VSphereVM.Spec.CustomVMXKeys); err != nil {
 			return err
 		}

From cd250d7475859f445203ec79756f42514700628e Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Thu, 30 May 2024 15:01:29 +0200
Subject: [PATCH 07/21] Make VGPU directly part of PCI specs.

---
 apis/v1alpha3/conversion_test.go              |   1 -
 apis/v1alpha3/zz_generated.conversion.go      |   1 -
 apis/v1alpha4/zz_generated.conversion.go      |   1 -
 apis/v1beta1/types.go                         |  17 +-
 apis/v1beta1/zz_generated.deepcopy.go         |  20 ---
 ...ture.cluster.x-k8s.io_vspheremachines.yaml |  18 +-
 ...ster.x-k8s.io_vspheremachinetemplates.yaml |  20 +--
 ...structure.cluster.x-k8s.io_vspherevms.yaml |  18 +-
 docs/gpu-vgpu.md                              |   6 +-
 pkg/services/govmomi/pci/device.go            |  28 +++-
 pkg/services/govmomi/pci/device_test.go       |  25 ++-
 pkg/services/govmomi/pci/vgpu.go              |  81 ---------
 pkg/services/govmomi/pci/vgpu_test.go         | 156 ------------------
 pkg/services/govmomi/service.go               |  34 ----
 pkg/services/govmomi/vcenter/clone.go         |   4 +-
 15 files changed, 64 insertions(+), 366 deletions(-)
 delete mode 100644 pkg/services/govmomi/pci/vgpu.go
 delete mode 100644 pkg/services/govmomi/pci/vgpu_test.go

diff --git a/apis/v1alpha3/conversion_test.go b/apis/v1alpha3/conversion_test.go
index e956f34da2..732fae6f8b 100644
--- a/apis/v1alpha3/conversion_test.go
+++ b/apis/v1alpha3/conversion_test.go
@@ -120,7 +120,6 @@ func CustomSpecNewFieldFuzzer(in *infrav1.VirtualMachineCloneSpec, c fuzz.Contin
 	c.FuzzNoCustom(in)
 
 	in.PciDevices = nil
-	in.VGPUDevices = nil
 	in.AdditionalDisksGiB = nil
 	in.OS = ""
 	in.HardwareVersion = ""
diff --git a/apis/v1alpha3/zz_generated.conversion.go b/apis/v1alpha3/zz_generated.conversion.go
index 0c8f7f2cb0..966195ccbb 100644
--- a/apis/v1alpha3/zz_generated.conversion.go
+++ b/apis/v1alpha3/zz_generated.conversion.go
@@ -1760,7 +1760,6 @@ func autoConvert_v1beta1_VirtualMachineCloneSpec_To_v1alpha3_VirtualMachineClone
 	out.CustomVMXKeys = *(*map[string]string)(unsafe.Pointer(&in.CustomVMXKeys))
 	// WARNING: in.TagIDs requires manual conversion: does not exist in peer-type
 	// WARNING: in.PciDevices requires manual conversion: does not exist in peer-type
-	// WARNING: in.VGPUDevices requires manual conversion: does not exist in peer-type
 	// WARNING: in.OS requires manual conversion: does not exist in peer-type
 	// WARNING: in.HardwareVersion requires manual conversion: does not exist in peer-type
 	return nil
diff --git a/apis/v1alpha4/zz_generated.conversion.go b/apis/v1alpha4/zz_generated.conversion.go
index a18d2ecd3d..147c1a9894 100644
--- a/apis/v1alpha4/zz_generated.conversion.go
+++ b/apis/v1alpha4/zz_generated.conversion.go
@@ -1914,7 +1914,6 @@ func autoConvert_v1beta1_VirtualMachineCloneSpec_To_v1alpha4_VirtualMachineClone
 	out.CustomVMXKeys = *(*map[string]string)(unsafe.Pointer(&in.CustomVMXKeys))
 	// WARNING: in.TagIDs requires manual conversion: does not exist in peer-type
 	// WARNING: in.PciDevices requires manual conversion: does not exist in peer-type
-	// WARNING: in.VGPUDevices requires manual conversion: does not exist in peer-type
 	// WARNING: in.OS requires manual conversion: does not exist in peer-type
 	// WARNING: in.HardwareVersion requires manual conversion: does not exist in peer-type
 	return nil
diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go
index 81c5882279..374084ce5d 100644
--- a/apis/v1beta1/types.go
+++ b/apis/v1beta1/types.go
@@ -193,9 +193,6 @@ type VirtualMachineCloneSpec struct {
 	// PciDevices is the list of pci devices used by the virtual machine.
 	// +optional
 	PciDevices []PCIDeviceSpec `json:"pciDevices,omitempty"`
-	// VGPUDevices is the list of vGPUs used by the virtual machine.
-	// +optional
-	VGPUDevices []VGPUSpec `json:"vgpuDevices,omitempty"`
 	// OS is the Operating System of the virtual machine
 	// Defaults to Linux
 	// +optional
@@ -261,6 +258,11 @@ type PCIDeviceSpec struct {
 	// virtual machine is cloned.
 	// +kubebuilder:validation:Required
 	VendorID *int32 `json:"vendorId,omitempty"`
+	// VGPUProfile is the profile name of a virtual machine's vGPU, in string.
+	// Defaults to the eponymous property value in the template from which the
+	// virtual machine is cloned.
+	// +kubebuilder:validation:Required
+	VGPUProfile string `json:"vgpuProfile,omitempty"`
 	// CustomLabel is the hardware label of a virtual machine's PCI device.
 	// Defaults to the eponymous property value in the template from which the
 	// virtual machine is cloned.
@@ -268,15 +270,6 @@ type PCIDeviceSpec struct {
 	CustomLabel string `json:"customLabel,omitempty"`
 }
 
-// VGPUSpec defines virtual machine's VGPU configuration
-type VGPUSpec struct {
-	// ProfileName is the ProfileName of a virtual machine's vGPU, in string.
-	// Defaults to the eponymous property value in the template from which the
-	// virtual machine is cloned.
-	// +kubebuilder:validation:Required
-	ProfileName string `json:"profileName,omitempty"`
-}
-
 // NetworkSpec defines the virtual machine's network configuration.
 type NetworkSpec struct {
 	// Devices is the list of network devices used by the virtual machine.
diff --git a/apis/v1beta1/zz_generated.deepcopy.go b/apis/v1beta1/zz_generated.deepcopy.go
index c6b26cfb39..44d12a65fe 100644
--- a/apis/v1beta1/zz_generated.deepcopy.go
+++ b/apis/v1beta1/zz_generated.deepcopy.go
@@ -403,21 +403,6 @@ func (in *Topology) DeepCopy() *Topology {
 	return out
 }
 
-// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *VGPUSpec) DeepCopyInto(out *VGPUSpec) {
-	*out = *in
-}
-
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VGPUSpec.
-func (in *VGPUSpec) DeepCopy() *VGPUSpec {
-	if in == nil {
-		return nil
-	}
-	out := new(VGPUSpec)
-	in.DeepCopyInto(out)
-	return out
-}
-
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *VSphereCluster) DeepCopyInto(out *VSphereCluster) {
 	*out = *in
@@ -1336,11 +1321,6 @@ func (in *VirtualMachineCloneSpec) DeepCopyInto(out *VirtualMachineCloneSpec) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
-	if in.VGPUDevices != nil {
-		in, out := &in.VGPUDevices, &out.VGPUDevices
-		*out = make([]VGPUSpec, len(*in))
-		copy(*out, *in)
-	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineCloneSpec.
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
index eb46f63a03..84cd06f334 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
@@ -1379,6 +1379,11 @@ spec:
                         virtual machine is cloned.
                       format: int32
                       type: integer
+                    vgpuProfile:
+                      description: VGPUProfile is the VGPUProfile of a virtual machine's
+                        vGPU, in string. Defaults to the eponymous property value
+                        in the template from which the virtual machine is cloned.
+                      type: string
                   type: object
                 type: array
               powerOffMode:
@@ -1448,19 +1453,6 @@ spec:
                   without TLS certificate validation of the communication between Cluster API Provider vSphere
                   and the VMware vCenter server.
                 type: string
-              vgpuDevices:
-                description: VGPUDevices is the list of vGPUs used by the virtual
-                  machine.
-                items:
-                  description: VGPUSpec defines virtual machine's VGPU configuration
-                  properties:
-                    profileName:
-                      description: ProfileName is the ProfileName of a virtual machine's
-                        vGPU, in string. Defaults to the eponymous property value
-                        in the template from which the virtual machine is cloned.
-                      type: string
-                  type: object
-                type: array
             required:
             - network
             - template
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
index de5d4f2c62..27c270bffa 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
@@ -1254,6 +1254,12 @@ spec:
                                 virtual machine is cloned.
                               format: int32
                               type: integer
+                            vgpuProfile:
+                              description: VGPUProfile is the VGPUProfile of a virtual
+                                machine's vGPU, in string. Defaults to the eponymous
+                                property value in the template from which the virtual
+                                machine is cloned.
+                              type: string
                           type: object
                         type: array
                       powerOffMode:
@@ -1323,20 +1329,6 @@ spec:
                           without TLS certificate validation of the communication between Cluster API Provider vSphere
                           and the VMware vCenter server.
                         type: string
-                      vgpuDevices:
-                        description: VGPUDevices is the list of vGPUs used by the
-                          virtual machine.
-                        items:
-                          description: VGPUSpec defines virtual machine's VGPU configuration
-                          properties:
-                            profileName:
-                              description: ProfileName is the ProfileName of a virtual
-                                machine's vGPU, in string. Defaults to the eponymous
-                                property value in the template from which the virtual
-                                machine is cloned.
-                              type: string
-                          type: object
-                        type: array
                     required:
                     - network
                     - template
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
index f6f2f80e71..96589edfdc 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
@@ -1467,6 +1467,11 @@ spec:
                         virtual machine is cloned.
                       format: int32
                       type: integer
+                    vgpuProfile:
+                      description: VGPUProfile is the VGPUProfile of a virtual machine's
+                        vGPU, in string. Defaults to the eponymous property value
+                        in the template from which the virtual machine is cloned.
+                      type: string
                   type: object
                 type: array
               powerOffMode:
@@ -1531,19 +1536,6 @@ spec:
                   without TLS certificate validation of the communication between Cluster API Provider vSphere
                   and the VMware vCenter server.
                 type: string
-              vgpuDevices:
-                description: VGPUDevices is the list of vGPUs used by the virtual
-                  machine.
-                items:
-                  description: VGPUSpec defines virtual machine's VGPU configuration
-                  properties:
-                    profileName:
-                      description: ProfileName is the ProfileName of a virtual machine's
-                        vGPU, in string. Defaults to the eponymous property value
-                        in the template from which the virtual machine is cloned.
-                      type: string
-                  type: object
-                type: array
             required:
             - network
             - template
diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md
index af6901d049..268aab1075 100644
--- a/docs/gpu-vgpu.md
+++ b/docs/gpu-vgpu.md
@@ -59,7 +59,7 @@ $ make dev-flavors
 go run ./packaging/flavorgen --output-dir /home/user/.cluster-api/overrides/infrastructure-vsphere/v0.0.0
 ```
 
-Edit the generated Cluster template (`cluster-template.yaml`) to set the values for the `vgpuDevices` array. Here we are editing the VSphereMachineTemplate object for the worker nodes. This will create a worker node with a single NVIDIA 16GB vGPU device attached to the VM.
+Edit the generated Cluster template (`cluster-template.yaml`) to set the values for the `pciDevices` array. Here we are editing the VSphereMachineTemplate object for the worker nodes. This will create a worker node with a single NVIDIA 16GB vGPU device attached to the VM.
 
 ```yaml
 ---
@@ -89,8 +89,8 @@ spec:
       storagePolicyName: '${VSPHERE_STORAGE_POLICY}'
       template: '${VSPHERE_TEMPLATE}'
       thumbprint: '${VSPHERE_TLS_THUMBPRINT}'
-      vgpuDevices:
-        - profileName: "grid_v100d-4c"  # value from above
+      pciDevices:
+        - vgpuProfile: "grid_t4-1a" # value from above
 ```
 
 Set the required values for the other fields and the cluster template is ready for use. The similar changes can be made to a template generated using clusterctl generate cluster command as well.
diff --git a/pkg/services/govmomi/pci/device.go b/pkg/services/govmomi/pci/device.go
index f92670bf9d..cc75f670bc 100644
--- a/pkg/services/govmomi/pci/device.go
+++ b/pkg/services/govmomi/pci/device.go
@@ -76,18 +76,28 @@ func ConstructDeviceSpecs(pciDeviceSpecs []infrav1.PCIDeviceSpec) []types.BaseVi
 	return pciDevices
 }
 
-func createBackingInfo(spec infrav1.PCIDeviceSpec) *types.VirtualPCIPassthroughDynamicBackingInfo {
-	return &types.VirtualPCIPassthroughDynamicBackingInfo{
-		AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{
-			{
-				VendorId: *spec.VendorID,
-				DeviceId: *spec.DeviceID,
+func createBackingInfo(spec infrav1.PCIDeviceSpec) types.BaseVirtualDeviceBackingInfo {
+	if spec.VGPUProfile == "" {
+		return &types.VirtualPCIPassthroughDynamicBackingInfo{
+			AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{
+				{
+					VendorId: *spec.VendorID,
+					DeviceId: *spec.DeviceID,
+				},
 			},
-		},
-		CustomLabel: spec.CustomLabel,
+			CustomLabel: spec.CustomLabel,
+		}
+	}
+
+	return &types.VirtualPCIPassthroughVmiopBackingInfo{
+		Vgpu: spec.VGPUProfile,
 	}
 }
 
 func constructKey(pciDeviceSpec infrav1.PCIDeviceSpec) string {
-	return fmt.Sprintf("%d-%d", *pciDeviceSpec.DeviceID, *pciDeviceSpec.VendorID)
+	if pciDeviceSpec.VGPUProfile == "" {
+		return fmt.Sprintf("%d-%d", *pciDeviceSpec.DeviceID, *pciDeviceSpec.VendorID)
+	}
+
+	return pciDeviceSpec.VGPUProfile
 }
diff --git a/pkg/services/govmomi/pci/device_test.go b/pkg/services/govmomi/pci/device_test.go
index 74f57245c8..5f62089552 100644
--- a/pkg/services/govmomi/pci/device_test.go
+++ b/pkg/services/govmomi/pci/device_test.go
@@ -72,30 +72,36 @@ func Test_CalculateDevicesToBeAdded(t *testing.T) {
 		inputs := []input{
 			{
 				name:        "when adding a single PCI device of each type",
-				expectedLen: 2,
+				expectedLen: 3,
 				pciDeviceSpecs: []infrav1.PCIDeviceSpec{
 					{DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)},
 					{DeviceID: ptr.To[int32](4321), VendorID: ptr.To[int32](8765)},
+					{VGPUProfile: "grid_t4-1a"},
 				},
 				assertFunc: func(g *gomega.WithT, actual []infrav1.PCIDeviceSpec) {
 					g.Expect(*actual[0].DeviceID).To(gomega.Equal(int32(1234)))
 					g.Expect(*actual[0].VendorID).To(gomega.Equal(int32(5678)))
 					g.Expect(*actual[1].DeviceID).To(gomega.Equal(int32(4321)))
 					g.Expect(*actual[1].VendorID).To(gomega.Equal(int32(8765)))
+					g.Expect(actual[2].VGPUProfile).To(gomega.Equal("grid_t4-1a"))
 				},
 			},
 			{
 				name:        "when adding multiple PCI devices of a type",
-				expectedLen: 2,
+				expectedLen: 4,
 				pciDeviceSpecs: []infrav1.PCIDeviceSpec{
 					{DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)},
 					{DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)},
+					{VGPUProfile: "grid_t4-1a"},
+					{VGPUProfile: "grid_t4-1a"},
 				},
 				assertFunc: func(g *gomega.WithT, actual []infrav1.PCIDeviceSpec) {
 					g.Expect(*actual[0].DeviceID).To(gomega.Equal(int32(1234)))
 					g.Expect(*actual[0].VendorID).To(gomega.Equal(int32(5678)))
 					g.Expect(*actual[1].DeviceID).To(gomega.Equal(int32(1234)))
 					g.Expect(*actual[1].VendorID).To(gomega.Equal(int32(5678)))
+					g.Expect(actual[2].VGPUProfile).To(gomega.Equal("grid_t4-1a"))
+					g.Expect(actual[3].VGPUProfile).To(gomega.Equal("grid_t4-1a"))
 				},
 			},
 		}
@@ -112,8 +118,9 @@ func Test_CalculateDevicesToBeAdded(t *testing.T) {
 				pciDeviceSpecs: []infrav1.PCIDeviceSpec{
 					{DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)},
 					{DeviceID: ptr.To[int32](4321), VendorID: ptr.To[int32](8765)},
+					{VGPUProfile: "grid_t4-1a"},
 				},
-				existingDeviceSpecIndexes: []int{0, 1},
+				existingDeviceSpecIndexes: []int{0, 1, 2},
 			},
 			{
 				name:        "when adding multiple PCI devices of a type",
@@ -121,8 +128,10 @@ func Test_CalculateDevicesToBeAdded(t *testing.T) {
 				pciDeviceSpecs: []infrav1.PCIDeviceSpec{
 					{DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)},
 					{DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)},
+					{VGPUProfile: "grid_t4-1a"},
+					{VGPUProfile: "grid_t4-1a"},
 				},
-				existingDeviceSpecIndexes: []int{0, 1},
+				existingDeviceSpecIndexes: []int{0, 1, 2, 3},
 			},
 		}
 		for _, tt := range inputs {
@@ -134,24 +143,27 @@ func Test_CalculateDevicesToBeAdded(t *testing.T) {
 		inputs := []input{
 			{
 				name:        "when adding a single PCI device of each type",
-				expectedLen: 1,
+				expectedLen: 2,
 				pciDeviceSpecs: []infrav1.PCIDeviceSpec{
 					{DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)},
 					{DeviceID: ptr.To[int32](4321), VendorID: ptr.To[int32](8765)},
+					{VGPUProfile: "grid_t4-1a"},
 				},
 				existingDeviceSpecIndexes: []int{0},
 				assertFunc: func(g *gomega.WithT, actual []infrav1.PCIDeviceSpec) {
 					g.Expect(*actual[0].DeviceID).To(gomega.Equal(int32(4321)))
 					g.Expect(*actual[0].VendorID).To(gomega.Equal(int32(8765)))
+					g.Expect(actual[1].VGPUProfile).To(gomega.Equal("grid_t4-1a"))
 				},
 			},
 			{
 				name:        "when adding multiple PCI devices of a type",
-				expectedLen: 2,
+				expectedLen: 3,
 				pciDeviceSpecs: []infrav1.PCIDeviceSpec{
 					{DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)},
 					{DeviceID: ptr.To[int32](1234), VendorID: ptr.To[int32](5678)},
 					{DeviceID: ptr.To[int32](4321), VendorID: ptr.To[int32](8765)},
+					{VGPUProfile: "grid_t4-1a"},
 				},
 				existingDeviceSpecIndexes: []int{0},
 				assertFunc: func(g *gomega.WithT, actual []infrav1.PCIDeviceSpec) {
@@ -159,6 +171,7 @@ func Test_CalculateDevicesToBeAdded(t *testing.T) {
 					g.Expect(*actual[0].VendorID).To(gomega.Equal(int32(5678)))
 					g.Expect(*actual[1].DeviceID).To(gomega.Equal(int32(4321)))
 					g.Expect(*actual[1].VendorID).To(gomega.Equal(int32(8765)))
+					g.Expect(actual[2].VGPUProfile).To(gomega.Equal("grid_t4-1a"))
 				},
 			},
 		}
diff --git a/pkg/services/govmomi/pci/vgpu.go b/pkg/services/govmomi/pci/vgpu.go
deleted file mode 100644
index e4053b54ff..0000000000
--- a/pkg/services/govmomi/pci/vgpu.go
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
-Copyright 2023 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package pci
-
-import (
-	"context"
-
-	"github.com/vmware/govmomi/object"
-	"github.com/vmware/govmomi/vim25/types"
-
-	infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1"
-)
-
-// CalculateVGPUsToBeAdded calculates the vGPU devices which should be added to the VM.
-func CalculateVGPUsToBeAdded(ctx context.Context, vm *object.VirtualMachine, deviceSpecs []infrav1.VGPUSpec) ([]infrav1.VGPUSpec, error) {
-	// store the number of expected devices for each deviceID + vendorID combo
-	deviceVendorIDComboMap := map[string]int{}
-	for _, spec := range deviceSpecs {
-		key := spec.ProfileName
-		if _, ok := deviceVendorIDComboMap[key]; !ok {
-			deviceVendorIDComboMap[key] = 1
-		} else {
-			deviceVendorIDComboMap[key]++
-		}
-	}
-
-	devices, err := vm.Device(ctx)
-	if err != nil {
-		return nil, err
-	}
-
-	specsToBeAdded := []infrav1.VGPUSpec{}
-	for _, spec := range deviceSpecs {
-		key := spec.ProfileName
-		pciDeviceList := devices.SelectByBackingInfo(createBackingInfoVGPU(spec))
-		expectedDeviceLen := deviceVendorIDComboMap[key]
-		if expectedDeviceLen-len(pciDeviceList) > 0 {
-			specsToBeAdded = append(specsToBeAdded, spec)
-			deviceVendorIDComboMap[key]--
-		}
-	}
-	return specsToBeAdded, nil
-}
-
-// ConstructDeviceSpecsVGPU transforms a list of VGPUSpec into a list of BaseVirutalDevices used by govmomi.
-func ConstructDeviceSpecsVGPU(vGPUDeviceSpecs []infrav1.VGPUSpec) []types.BaseVirtualDevice {
-	vGPUDevices := []types.BaseVirtualDevice{}
-	deviceKey := int32(-200)
-
-	for _, pciDevice := range vGPUDeviceSpecs {
-		backingInfo := createBackingInfoVGPU(pciDevice)
-		vGPUDevices = append(vGPUDevices, &types.VirtualPCIPassthrough{
-			VirtualDevice: types.VirtualDevice{
-				Key:     deviceKey,
-				Backing: backingInfo,
-			},
-		})
-		deviceKey--
-	}
-	return vGPUDevices
-}
-
-func createBackingInfoVGPU(spec infrav1.VGPUSpec) *types.VirtualPCIPassthroughVmiopBackingInfo {
-	return &types.VirtualPCIPassthroughVmiopBackingInfo{
-		Vgpu: spec.ProfileName,
-	}
-}
diff --git a/pkg/services/govmomi/pci/vgpu_test.go b/pkg/services/govmomi/pci/vgpu_test.go
deleted file mode 100644
index 5b4a572040..0000000000
--- a/pkg/services/govmomi/pci/vgpu_test.go
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
-Copyright 2023 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package pci
-
-import (
-	"context"
-	"testing"
-
-	"github.com/onsi/gomega"
-	"github.com/vmware/govmomi/find"
-	"github.com/vmware/govmomi/simulator"
-	"github.com/vmware/govmomi/vim25"
-
-	infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1"
-)
-
-func Test_CalculateVGPUsToBeAdded(t *testing.T) {
-	type input struct {
-		name                      string
-		expectedLen               int
-		existingDeviceSpecIndexes []int
-		vGPUDeviceSpecs           []infrav1.VGPUSpec
-		assertFunc                func(g *gomega.WithT, actual []infrav1.VGPUSpec)
-	}
-
-	testFunc := func(t *testing.T, i input) {
-		t.Helper()
-		t.Run(i.name, func(t *testing.T) {
-			g := gomega.NewWithT(t)
-			simulator.Run(func(ctx context.Context, client *vim25.Client) error {
-				finder := find.NewFinder(client)
-				vm, err := finder.VirtualMachine(ctx, "DC0_H0_VM0")
-				if err != nil {
-					return err
-				}
-
-				if len(i.existingDeviceSpecIndexes) > 0 {
-					existingDevices := []infrav1.VGPUSpec{}
-					for _, idx := range i.existingDeviceSpecIndexes {
-						existingDevices = append(existingDevices, i.vGPUDeviceSpecs[idx])
-					}
-					g.Expect(vm.AddDevice(ctx,
-						ConstructDeviceSpecsVGPU(existingDevices)...)).ToNot(gomega.HaveOccurred())
-				}
-				toBeAdded, err := CalculateVGPUsToBeAdded(ctx, vm, i.vGPUDeviceSpecs)
-				g.Expect(err).ToNot(gomega.HaveOccurred())
-				g.Expect(toBeAdded).To(gomega.HaveLen(i.expectedLen))
-				if i.assertFunc != nil {
-					i.assertFunc(g, toBeAdded)
-				}
-				return nil
-			})
-		})
-	}
-
-	t.Run("when no vGPU devices exist on the VM", func(t *testing.T) {
-		inputs := []input{
-			{
-				name:        "when adding a single vGPU device of each type",
-				expectedLen: 2,
-				vGPUDeviceSpecs: []infrav1.VGPUSpec{
-					{ProfileName: "1234"}, {ProfileName: "4321"},
-				},
-				assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) {
-					g.Expect(actual[0].ProfileName).To(gomega.Equal("1234"))
-					g.Expect(actual[1].ProfileName).To(gomega.Equal("4321"))
-				},
-			},
-			{
-				name:        "when adding multiple vGPU devices of a type",
-				expectedLen: 2,
-				vGPUDeviceSpecs: []infrav1.VGPUSpec{
-					{ProfileName: "1234"}, {ProfileName: "1234"},
-				},
-				assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) {
-					g.Expect(actual[0].ProfileName).To(gomega.Equal("1234"))
-					g.Expect(actual[1].ProfileName).To(gomega.Equal("1234"))
-				},
-			},
-		}
-		for _, tt := range inputs {
-			testFunc(t, tt)
-		}
-	})
-
-	t.Run("when all vGPU devices exist on the VM", func(t *testing.T) {
-		inputs := []input{
-			{
-				name:        "when adding a single vGPU device of each type",
-				expectedLen: 0,
-				vGPUDeviceSpecs: []infrav1.VGPUSpec{
-					{ProfileName: "1234"}, {ProfileName: "4321"},
-				},
-				existingDeviceSpecIndexes: []int{0, 1},
-			},
-			{
-				name:        "when adding multiple vGPU devices of a type",
-				expectedLen: 0,
-				vGPUDeviceSpecs: []infrav1.VGPUSpec{
-					{ProfileName: "1234"}, {ProfileName: "1234"},
-				},
-				existingDeviceSpecIndexes: []int{0, 1},
-			},
-		}
-		for _, tt := range inputs {
-			testFunc(t, tt)
-		}
-	})
-
-	t.Run("when some vGPU devices exist on the VM", func(t *testing.T) {
-		inputs := []input{
-			{
-				name:        "when adding a single vGPU device of each type",
-				expectedLen: 1,
-				vGPUDeviceSpecs: []infrav1.VGPUSpec{
-					{ProfileName: "1234"}, {ProfileName: "4321"},
-				},
-				existingDeviceSpecIndexes: []int{0},
-				assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) {
-					g.Expect(actual[0].ProfileName).To(gomega.Equal("4321"))
-				},
-			},
-			{
-				name:        "when adding multiple vGPU devices of a type",
-				expectedLen: 2,
-				vGPUDeviceSpecs: []infrav1.VGPUSpec{
-					{ProfileName: "1234"},
-					{ProfileName: "1234"},
-					{ProfileName: "4321"},
-				},
-				existingDeviceSpecIndexes: []int{0},
-				assertFunc: func(g *gomega.WithT, actual []infrav1.VGPUSpec) {
-					g.Expect(actual[0].ProfileName).To(gomega.Equal("1234"))
-					g.Expect(actual[1].ProfileName).To(gomega.Equal("4321"))
-				},
-			},
-		}
-		for _, tt := range inputs {
-			testFunc(t, tt)
-		}
-	})
-}
diff --git a/pkg/services/govmomi/service.go b/pkg/services/govmomi/service.go
index e53358bbc0..92256bf2a5 100644
--- a/pkg/services/govmomi/service.go
+++ b/pkg/services/govmomi/service.go
@@ -538,40 +538,6 @@ func (vms *VMService) reconcilePCIDevices(ctx context.Context, virtualMachineCtx
 			return errors.Wrapf(err, "error adding pci devices for %q", ctx)
 		}
 	}
-	if expectedVGPUs := virtualMachineCtx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices; len(expectedVGPUs) != 0 {
-		specsToBeAdded, err := pci.CalculateVGPUsToBeAdded(ctx, virtualMachineCtx.Obj, expectedVGPUs)
-		if err != nil {
-			return err
-		}
-
-		if len(specsToBeAdded) == 0 {
-			if conditions.Has(virtualMachineCtx.VSphereVM, infrav1.PCIDevicesDetachedCondition) {
-				conditions.Delete(virtualMachineCtx.VSphereVM, infrav1.PCIDevicesDetachedCondition)
-			}
-			log.V(5).Info("No new PCI devices to be added")
-			return nil
-		}
-
-		powerState, err := virtualMachineCtx.Obj.PowerState(ctx)
-		if err != nil {
-			return err
-		}
-		if powerState == types.VirtualMachinePowerStatePoweredOn {
-			// This would arise only when the PCI device is manually removed from
-			// the VM post creation.
-			log.Info("vGPU device cannot be attached in powered on state")
-			conditions.MarkFalse(virtualMachineCtx.VSphereVM,
-				infrav1.PCIDevicesDetachedCondition,
-				infrav1.NotFoundReason,
-				clusterv1.ConditionSeverityWarning,
-				"vGPU devices removed after VM was powered on")
-			return errors.Errorf("missing vGPU devices")
-		}
-		log.Info("vGPU devices to be added", "number", len(specsToBeAdded))
-		if err := virtualMachineCtx.Obj.AddDevice(ctx, pci.ConstructDeviceSpecsVGPU(specsToBeAdded)...); err != nil {
-			return errors.Wrapf(err, "error adding vGPU devices for %q", ctx)
-		}
-	}
 	return nil
 }
 
diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go
index 61126d7e43..0334e247e1 100644
--- a/pkg/services/govmomi/vcenter/clone.go
+++ b/pkg/services/govmomi/vcenter/clone.go
@@ -196,10 +196,10 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by
 		Snapshot: snapshotRef,
 	}
 
-	// For PCI and vGPU devices, the memory for the VM needs to be reserved
+	// For PCI devices, the memory for the VM needs to be reserved
 	// We can replace this once we have another way of reserving memory option
 	// exposed via the API types.
-	if len(vmCtx.VSphereVM.Spec.PciDevices) > 0 || len(vmCtx.VSphereVM.Spec.VGPUDevices) > 0 {
+	if len(vmCtx.VSphereVM.Spec.PciDevices) > 0 {
 		spec.Config.MemoryReservationLockedToMax = ptr.To(true)
 	}
 

From 1e9a6e1fecd4835a5321bea5b3d49ad1e8b47066 Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Thu, 30 May 2024 15:11:43 +0200
Subject: [PATCH 08/21] Remove outdated vGPU e2e setup

---
 Makefile                                              |  2 --
 test/e2e/config/vsphere-dev.yaml                      |  1 -
 .../main/vgpu/kustomization.yaml                      |  6 ------
 .../main/vgpu/vgpu-device-template.yaml               | 11 -----------
 4 files changed, 20 deletions(-)
 delete mode 100644 test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml
 delete mode 100644 test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml

diff --git a/Makefile b/Makefile
index bb5d78b839..2eb00f4911 100644
--- a/Makefile
+++ b/Makefile
@@ -384,8 +384,6 @@ generate-e2e-templates-main: $(KUSTOMIZE) ## Generate test templates for the mai
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/install-on-bootstrap" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-install-on-bootstrap.yaml"
 	# for PCI passthrough template
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/pci" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-pci.yaml"
-	# for vGPU template
-	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/vgpu" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-vgpu.yaml"
 	# for DHCP overrides
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/dhcp-overrides" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-dhcp-overrides.yaml"
 	"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/ownerrefs-finalizers" > "$(E2E_GOVMOMI_TEMPLATE_DIR)/main/cluster-template-ownerrefs-finalizers.yaml"
diff --git a/test/e2e/config/vsphere-dev.yaml b/test/e2e/config/vsphere-dev.yaml
index d5a4df0026..4a1d89e7f3 100644
--- a/test/e2e/config/vsphere-dev.yaml
+++ b/test/e2e/config/vsphere-dev.yaml
@@ -1,4 +1,3 @@
----
 # E2E test scenario using local dev images and manifests built from the source tree for following providers:
 # - cluster-api
 # - bootstrap kubeadm
diff --git a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml
deleted file mode 100644
index 75b395b27b..0000000000
--- a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/kustomization.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - ../base
-patchesStrategicMerge:
-  - vgpu-device-template.yaml
diff --git a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml b/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml
deleted file mode 100644
index 4404df5f3f..0000000000
--- a/test/e2e/data/infrastructure-vsphere-govmomi/main/vgpu/vgpu-device-template.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
----
-apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
-kind: VSphereMachineTemplate
-metadata:
-  name: ${CLUSTER_NAME}-worker
-  namespace: ${NAMESPACE}
-spec:
-  template:
-    spec:
-      vgpuDevices:
-        - profileName: ${PROFILE_NAME}
\ No newline at end of file

From 6e8d105561d7c51bb0477d8c5ee6351065283eca Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Thu, 30 May 2024 15:34:47 +0200
Subject: [PATCH 09/21] Fix nil-pointer mistake

---
 pkg/services/govmomi/pci/device.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/services/govmomi/pci/device.go b/pkg/services/govmomi/pci/device.go
index cc75f670bc..a6e1a1132b 100644
--- a/pkg/services/govmomi/pci/device.go
+++ b/pkg/services/govmomi/pci/device.go
@@ -77,7 +77,7 @@ func ConstructDeviceSpecs(pciDeviceSpecs []infrav1.PCIDeviceSpec) []types.BaseVi
 }
 
 func createBackingInfo(spec infrav1.PCIDeviceSpec) types.BaseVirtualDeviceBackingInfo {
-	if spec.VGPUProfile == "" {
+	if spec.DeviceID != nil && spec.VendorID != nil {
 		return &types.VirtualPCIPassthroughDynamicBackingInfo{
 			AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{
 				{
@@ -95,7 +95,7 @@ func createBackingInfo(spec infrav1.PCIDeviceSpec) types.BaseVirtualDeviceBackin
 }
 
 func constructKey(pciDeviceSpec infrav1.PCIDeviceSpec) string {
-	if pciDeviceSpec.VGPUProfile == "" {
+	if pciDeviceSpec.DeviceID != nil && pciDeviceSpec.VendorID != nil {
 		return fmt.Sprintf("%d-%d", *pciDeviceSpec.DeviceID, *pciDeviceSpec.VendorID)
 	}
 

From c7f4781067b2f356089b3b6dd812588982d23a77 Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Fri, 31 May 2024 15:22:19 +0200
Subject: [PATCH 10/21] Webhook validation for vgpuProfile

---
 apis/v1beta1/types.go                         |   3 +
 ...ture.cluster.x-k8s.io_vspheremachines.yaml |   3 +-
 ...ster.x-k8s.io_vspheremachinetemplates.yaml |   9 +-
 ...structure.cluster.x-k8s.io_vspherevms.yaml |   5 +-
 internal/webhooks/vspheremachinetemplate.go   |   7 +
 test/e2e/config/vsphere-dev.yaml              | 163 ------------------
 test/e2e/config/vsphere.yaml                  |   1 -
 7 files changed, 21 insertions(+), 170 deletions(-)
 delete mode 100644 test/e2e/config/vsphere-dev.yaml

diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go
index 374084ce5d..d542adf722 100644
--- a/apis/v1beta1/types.go
+++ b/apis/v1beta1/types.go
@@ -251,16 +251,19 @@ type PCIDeviceSpec struct {
 	// DeviceID is the device ID of a virtual machine's PCI, in integer.
 	// Defaults to the eponymous property value in the template from which the
 	// virtual machine is cloned.
+	// Mutually exclusive with VGPUProfile.
 	// +kubebuilder:validation:Required
 	DeviceID *int32 `json:"deviceId,omitempty"`
 	// VendorId is the vendor ID of a virtual machine's PCI, in integer.
 	// Defaults to the eponymous property value in the template from which the
 	// virtual machine is cloned.
+	// Mutually exclusive with VGPUProfile.
 	// +kubebuilder:validation:Required
 	VendorID *int32 `json:"vendorId,omitempty"`
 	// VGPUProfile is the profile name of a virtual machine's vGPU, in string.
 	// Defaults to the eponymous property value in the template from which the
 	// virtual machine is cloned.
+	// Mutually exclusive with DeviceID and VendorID.
 	// +kubebuilder:validation:Required
 	VGPUProfile string `json:"vgpuProfile,omitempty"`
 	// CustomLabel is the hardware label of a virtual machine's PCI device.
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
index 84cd06f334..4dc55498e0 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
@@ -1380,9 +1380,10 @@ spec:
                       format: int32
                       type: integer
                     vgpuProfile:
-                      description: VGPUProfile is the VGPUProfile of a virtual machine's
+                      description: VGPUProfile is the profile name of a virtual machine's
                         vGPU, in string. Defaults to the eponymous property value
                         in the template from which the virtual machine is cloned.
+                        Mutually exclusive with DeviceID and VendorID.
                       type: string
                   type: object
                 type: array
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
index 27c270bffa..bc8d15216d 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
@@ -1244,21 +1244,22 @@ spec:
                               description: |-
                                 DeviceID is the device ID of a virtual machine's PCI, in integer.
                                 Defaults to the eponymous property value in the template from which the
-                                virtual machine is cloned.
+                                virtual machine is cloned. Mutually exclusive with VGPUProfile.
                               format: int32
                               type: integer
                             vendorId:
                               description: |-
                                 VendorId is the vendor ID of a virtual machine's PCI, in integer.
                                 Defaults to the eponymous property value in the template from which the
-                                virtual machine is cloned.
+                                virtual machine is cloned. Mutually exclusive with VGPUProfile.
                               format: int32
                               type: integer
                             vgpuProfile:
-                              description: VGPUProfile is the VGPUProfile of a virtual
+                              description: VGPUProfile is the profile name of a virtual
                                 machine's vGPU, in string. Defaults to the eponymous
                                 property value in the template from which the virtual
-                                machine is cloned.
+                                machine is cloned. Mutually exclusive with DeviceID
+                                and VendorID.
                               type: string
                           type: object
                         type: array
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
index 96589edfdc..6ce31f5d74 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
@@ -1458,6 +1458,7 @@ spec:
                         DeviceID is the device ID of a virtual machine's PCI, in integer.
                         Defaults to the eponymous property value in the template from which the
                         virtual machine is cloned.
+                        Mutually exclusive with VGPUProfile.
                       format: int32
                       type: integer
                     vendorId:
@@ -1465,12 +1466,14 @@ spec:
                         VendorId is the vendor ID of a virtual machine's PCI, in integer.
                         Defaults to the eponymous property value in the template from which the
                         virtual machine is cloned.
+                        Mutually exclusive with VGPUProfile.
                       format: int32
                       type: integer
                     vgpuProfile:
-                      description: VGPUProfile is the VGPUProfile of a virtual machine's
+                      description: VGPUProfile is the profile name of a virtual machine's
                         vGPU, in string. Defaults to the eponymous property value
                         in the template from which the virtual machine is cloned.
+                        Mutually exclusive with DeviceID and VendorID.
                       type: string
                   type: object
                 type: array
diff --git a/internal/webhooks/vspheremachinetemplate.go b/internal/webhooks/vspheremachinetemplate.go
index a88af09892..e787f3289a 100644
--- a/internal/webhooks/vspheremachinetemplate.go
+++ b/internal/webhooks/vspheremachinetemplate.go
@@ -84,6 +84,13 @@ func (webhook *VSphereMachineTemplateWebhook) ValidateCreate(_ context.Context,
 			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "guestSoftPowerOffTimeout"), spec.GuestSoftPowerOffTimeout, "should be greater than 0"))
 		}
 	}
+	for _, device := range spec.PciDevices {
+		hasVGPU := device.VGPUProfile != ""
+		hasPCI := device.DeviceID != nil && device.VendorID != nil
+		if (hasPCI && hasVGPU) || (!hasPCI && !hasVGPU) {
+			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices"), spec.PciDevices, "should have either deviceID + vendorID or vgpuProfile"))
+		}
+	}
 	return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs)
 }
 
diff --git a/test/e2e/config/vsphere-dev.yaml b/test/e2e/config/vsphere-dev.yaml
deleted file mode 100644
index 4a1d89e7f3..0000000000
--- a/test/e2e/config/vsphere-dev.yaml
+++ /dev/null
@@ -1,163 +0,0 @@
-# E2E test scenario using local dev images and manifests built from the source tree for following providers:
-# - cluster-api
-# - bootstrap kubeadm
-# - control-plane kubeadm
-# - vsphere
-
-# For creating local dev images built from the source tree;
-# - from the CAPI repository root, `make docker-build REGISTRY=gcr.io/k8s-staging-cluster-api` to build the cluster-api,
-#  bootstrap kubeadm, control-plane kubeadm provider images. This step can be skipped to use upstream images.
-# - from the CAPV repository root, `make e2e` to build the vsphere provider image and run e2e tests.
-
-images:
-  - name: registry.k8s.io/cluster-api/cluster-api-controller:v1.5.0
-    loadBehavior: tryLoad
-  - name: registry.k8s.io/cluster-api/kubeadm-bootstrap-controller:v1.5.0
-    loadBehavior: tryLoad
-  - name: registry.k8s.io/cluster-api/kubeadm-control-plane-controller:v1.5.0
-    loadBehavior: tryLoad
-  - name: gcr.io/k8s-staging-cluster-api/capv-manager:e2e
-    loadBehavior: mustLoad
-  - name: quay.io/jetstack/cert-manager-cainjector:v1.12.2
-    loadBehavior: tryLoad
-  - name: quay.io/jetstack/cert-manager-webhook:v1.12.2
-    loadBehavior: tryLoad
-  - name: quay.io/jetstack/cert-manager-controller:v1.12.2
-    loadBehavior: tryLoad
-
-providers:
-
-  - name: cluster-api
-    type: CoreProvider
-    versions:
-      - name: v1.5.0
-        # Use manifest from source files
-        value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/core-components.yaml"
-        type: "url"
-        contract: v1beta1
-        files:
-          - sourcePath: "../data/shared/main/v1beta1/metadata.yaml"
-        replacements:
-          - old: "imagePullPolicy: Always"
-            new: "imagePullPolicy: IfNotPresent"
-
-  - name: kubeadm
-    type: BootstrapProvider
-    versions:
-      - name: v1.5.0
-        # Use manifest from source files
-        value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/bootstrap-components.yaml"
-        type: "url"
-        contract: v1beta1
-        files:
-          - sourcePath: "../data/shared/main/v1beta1/metadata.yaml"
-        replacements:
-          - old: "imagePullPolicy: Always"
-            new: "imagePullPolicy: IfNotPresent"
-
-  - name: kubeadm
-    type: ControlPlaneProvider
-    versions:
-      - name: v1.5.0
-        # Use manifest from source files
-        value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/v1.5.0/control-plane-components.yaml"
-        type: "url"
-        contract: v1beta1
-        files:
-          - sourcePath: "../data/shared/main/v1beta1/metadata.yaml"
-        replacements:
-          - old: "imagePullPolicy: Always"
-            new: "imagePullPolicy: IfNotPresent"
-
-  - name: vsphere
-    type: InfrastructureProvider
-    versions:
-      - name: v1.9.99
-        # Use manifest from source files
-        value: ../../../../cluster-api-provider-vsphere/config/default
-        contract: v1beta1
-        replacements:
-          - old: gcr.io/cluster-api-provider-vsphere/release/manager:latest
-            new: gcr.io/k8s-staging-cluster-api/capv-manager:e2e
-          - old: "imagePullPolicy: Always"
-            new: "imagePullPolicy: IfNotPresent"
-        files:
-          # Add a cluster template
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-conformance.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-hw-upgrade.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-kcp-remediation.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-md-remediation.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-node-drain.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-pci.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-remote-management.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-storage-policy.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-topology.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-dhcp-overrides.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/clusterclass-quick-start.yaml"
-          - sourcePath: "../../../test/e2e/data/infrastructure-vsphere/main/cluster-template-ignition.yaml"
-          - sourcePath: "../data/shared/main/v1beta1_provider/metadata.yaml"
-
-variables:
-  KUBERNETES_VERSION: "v1.28.0"
-  CPI_IMAGE_K8S_VERSION: "v1.27.0"
-  CNI: "./data/cni/calico/calico.yaml"
-  EXP_CLUSTER_RESOURCE_SET: "true"
-  EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION: "true"
-  CONTROL_PLANE_MACHINE_COUNT: 1
-  WORKER_MACHINE_COUNT: 1
-  IP_FAMILY: "IPv4"
-  CLUSTER_CLASS_NAME: "quick-start"
-  # Following CAPV variables should be set before testing
-  VSPHERE_SERVER: "vcenter.vmware.com"
-  VSPHERE_TLS_THUMBPRINT: "AA:BB:CC:DD:11:22:33:44:EE:FF"
-  VSPHERE_DATACENTER: "SDDC-Datacenter"
-  VSPHERE_FOLDER: "FolderName"
-  VSPHERE_RESOURCE_POOL: "ResourcePool"
-  VSPHERE_DATASTORE: "WorkloadDatastore"
-  VSPHERE_STORAGE_POLICY: "Cluster API vSphere Storage Policy"
-  VSPHERE_NETWORK: "network-1"
-  VSPHERE_TEMPLATE: "ubuntu-2204-kube-v1.28.0"
-  FLATCAR_VSPHERE_TEMPLATE: "flatcar-stable-3510.2.6-kube-v1.28.0"
-  # WORKLOAD_CONTROL_PLANE_ENDPOINT_IP:
-  # Also following variables are required but it is recommended to use env variables to avoid disclosure of sensitive data
-  # VSPHERE_SSH_AUTHORIZED_KEY:
-  # VSPHERE_PASSWORD:
-  # VSPHERE_USERNAME:
-  # Dedicated IP to be used by kube-vip
-  # CONTROL_PLANE_ENDPOINT_IP:
-  # Sets the insecure-flag for vsphere-csi-controller config
-  VSPHERE_INSECURE_CSI: "true"
-  KUBETEST_CONFIGURATION: "./data/kubetest/conformance-fast.yaml"
-  NODE_DRAIN_TIMEOUT: "60s"
-  CLUSTER_TOPOLOGY: "true"
-  # These IDs correspond to Tesla T4s, they are the decimal representation of the hex values.
-  DEVICE_ID: 7864
-  VENDOR_ID: 4318
-  PROFILE_NAME: grid_v100d-4c
-  # CAPV feature flags
-  EXP_NODE_ANTI_AFFINITY: "true"
-  # Following CAPV variables is used for multivc_test.go. This is the second VSphere and should be set if multivc test is enabled.
-  VSPHERE2_SERVER: "vcenter2.vmware.com"
-  VSPHERE2_TLS_THUMBPRINT: "AA:BB:CC:DD:11:22:33:44:EE:FF"
-  VSPHERE2_RESOURCE_POOL: "ResourcePool"
-  VSPHERE2_TEMPLATE: "ubuntu-2004-kube-v1.27.3"
-  # Dedicated IP to be used by kube-vip
-  VSPHERE2_CONTROL_PLANE_ENDPOINT_IP:
-  # Following variables are also required and please use env variables to avoid disclosure of sensitive data
-  VSPHERE2_USERNAME:
-  VSPHERE2_PASSWORD:
-
-
-intervals:
-  default/wait-controllers: ["5m", "10s"]
-  default/wait-cluster: ["5m", "10s"]
-  default/wait-control-plane: ["20m", "10s"]
-  default/wait-worker-nodes: ["20m", "10s"]
-  default/wait-delete-cluster: ["5m", "10s"]
-  default/wait-machine-upgrade: ["15m", "1m"]
-  default/wait-machine-remediation: ["15m", "10s"]
-  mhc-remediation/mhc-remediation: ["30m", "10s"]
-  node-drain/wait-deployment-available: ["3m", "10s"]
-  node-drain/wait-machine-deleted: ["2m", "10s"]
-  anti-affinity/wait-vm-redistribution: ["5m", "10s"]
diff --git a/test/e2e/config/vsphere.yaml b/test/e2e/config/vsphere.yaml
index d0f768d305..2aeb087684 100644
--- a/test/e2e/config/vsphere.yaml
+++ b/test/e2e/config/vsphere.yaml
@@ -279,7 +279,6 @@ variables:
   # These IDs correspond to Tesla T4s, they are the decimal representation of the hex values.
   DEVICE_ID: 7864
   VENDOR_ID: 4318
-  PROFILE_NAME: grid_v100d-4c
   # CAPV feature flags
   EXP_NODE_ANTI_AFFINITY: "true"
   CAPI_DIAGNOSTICS_ADDRESS: ":8080"

From 1353b4db60457683e869ee234d70782484f9caf7 Mon Sep 17 00:00:00 2001
From: Birk Lewin <89076383+birksl@users.noreply.github.com>
Date: Fri, 31 May 2024 18:15:59 +0200
Subject: [PATCH 11/21] Update internal/webhooks/vspheremachinetemplate.go

Co-authored-by: Christian Schlotter <chrischdi@users.noreply.github.com>
---
 internal/webhooks/vspheremachinetemplate.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/internal/webhooks/vspheremachinetemplate.go b/internal/webhooks/vspheremachinetemplate.go
index e787f3289a..e72aeb5c5e 100644
--- a/internal/webhooks/vspheremachinetemplate.go
+++ b/internal/webhooks/vspheremachinetemplate.go
@@ -84,11 +84,11 @@ func (webhook *VSphereMachineTemplateWebhook) ValidateCreate(_ context.Context,
 			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "guestSoftPowerOffTimeout"), spec.GuestSoftPowerOffTimeout, "should be greater than 0"))
 		}
 	}
-	for _, device := range spec.PciDevices {
+	for i, device := range spec.PciDevices {
 		hasVGPU := device.VGPUProfile != ""
 		hasPCI := device.DeviceID != nil && device.VendorID != nil
 		if (hasPCI && hasVGPU) || (!hasPCI && !hasVGPU) {
-			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices"), spec.PciDevices, "should have either deviceID + vendorID or vgpuProfile"))
+			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d, i)), device, "should have either deviceID + vendorID or vgpuProfile"))
 		}
 	}
 	return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs)

From a6c0fd82d86909ef5cc06638d6ad0889d50052bf Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Fri, 7 Jun 2024 17:53:42 +0200
Subject: [PATCH 12/21] Update validation webhooks

---
 internal/webhooks/vspheremachine.go           | 11 +++
 internal/webhooks/vspheremachine_test.go      | 95 +++++++++++++------
 internal/webhooks/vspheremachinetemplate.go   | 12 ++-
 .../webhooks/vspheremachinetemplate_test.go   | 77 +++++++++++----
 pkg/services/govmomi/pci/device.go            | 23 +++--
 5 files changed, 157 insertions(+), 61 deletions(-)

diff --git a/internal/webhooks/vspheremachine.go b/internal/webhooks/vspheremachine.go
index 420df4733d..f183e9a10e 100644
--- a/internal/webhooks/vspheremachine.go
+++ b/internal/webhooks/vspheremachine.go
@@ -92,6 +92,17 @@ func (webhook *VSphereMachineWebhook) ValidateCreate(_ context.Context, raw runt
 			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "guestSoftPowerOffTimeout"), spec.GuestSoftPowerOffTimeout, "should be greater than 0"))
 		}
 	}
+	// Every PCI device must be either a passthrough device (deviceId + vendorId) or a vGPU (vgpuProfile).
+	for i, device := range spec.PciDevices {
+		// VSphereMachine has no template wrapper, so errors are reported under spec.pciDevices[i].
+		devicePath := field.NewPath("spec", "pciDevices").Index(i)
+		hasDeviceID, hasVendorID := device.DeviceID != nil, device.VendorID != nil
+		if device.VGPUProfile == "" && !(hasDeviceID && hasVendorID) {
+			allErrs = append(allErrs, field.Invalid(devicePath, device, "should have both deviceId and vendorId set"))
+		} else if device.VGPUProfile != "" && (hasDeviceID || hasVendorID) {
+			allErrs = append(allErrs, field.Invalid(devicePath, device, "should have either deviceId + vendorId or vgpuProfile"))
+		}
+	}
 
 	return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs)
 }
diff --git a/internal/webhooks/vspheremachine_test.go b/internal/webhooks/vspheremachine_test.go
index b5ef77df7f..8c1fa7857e 100644
--- a/internal/webhooks/vspheremachine_test.go
+++ b/internal/webhooks/vspheremachine_test.go
@@ -48,52 +48,86 @@ func TestVSphereMachine_ValidateCreate(t *testing.T) {
 	}{
 		{
 			name:           "preferredAPIServerCIDR set on creation ",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "192.168.0.1/32", []string{}, infrav1.VirtualMachinePowerOpModeTrySoft, nil),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "192.168.0.1/32", []string{}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, nil),
 			wantErr:        true,
 		},
 		{
 			name:           "IPs are not in CIDR format",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, nil),
 			wantErr:        true,
 		},
 		{
 			name:           "IPs are not valid IPs in CIDR format",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"<nil>/32", "192.168.0.644/33"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"<nil>/32", "192.168.0.644/33"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, nil),
 			wantErr:        true,
 		},
 		{
 			name:           "guestSoftPowerOffTimeout should not be set with powerOffMode set to hard",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}, nil),
 			wantErr:        true,
 		},
 		{
 			name:           "guestSoftPowerOffTimeout should not be set with powerOffMode set to soft",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeSoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeSoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}, nil),
 			wantErr:        true,
 		},
 		{
 			name:           "guestSoftPowerOffTimeout should not be negative",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: -1234}),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: -1234}, nil),
+			wantErr:        true,
+		},
+
+		{
+			name:           "empty pciDevice",
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: ""}}),
+			wantErr:        true,
+		},
+		{
+			name:           "pciDevice with vgpuProfile and deviceId",
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32)}}),
+			wantErr:        true,
+		},
+		{
+			name:           "pciDevice with vgpuProfile, deviceId and vendorId",
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32), VendorID: new(int32)}}),
 			wantErr:        true,
 		},
+		{
+			name:           "pciDevice with deviceId but no vendorId",
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: new(int32)}}),
+			wantErr:        true,
+		},
+		{
+			name:           "pciDevice with vendorId but no deviceId",
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VendorID: new(int32)}}),
+			wantErr:        true,
+		},
+		{
+			name:           "successful VSphereMachine creation with PCI device",
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: new(int32), VendorID: new(int32)}}),
+		},
+		{
+			name:           "successful VSphereMachine creation with vgpu",
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu"}}),
+		},
 		{
 			name:           "successful VSphereMachine creation",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, nil),
 			wantErr:        false,
 		},
 		{
 			name:           "successful VSphereMachine creation with powerOffMode set to hard",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, nil),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeHard, nil, nil),
 			wantErr:        false,
 		},
 		{
 			name:           "successful VSphereMachine creation with powerOffMode set to soft",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
 			wantErr:        false,
 		},
 		{
 			name:           "successful VSphereMachine creation with powerOffMode set to trySoft and non-default timeout",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: 1234}),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: 1234}, nil),
 			wantErr:        false,
 		},
 	}
@@ -121,50 +155,56 @@ func TestVSphereMachine_ValidateUpdate(t *testing.T) {
 	}{
 		{
 			name:              "ProviderID can be updated",
-			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
-			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
+			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
+			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
 			wantErr:           false,
 		},
 		{
 			name:              "updating ips can be done",
-			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
-			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
+			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
+			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
 			wantErr:           false,
 		},
 		{
 			name:              "updating non-existing IP with invalid ips can not be done",
-			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", nil, infrav1.VirtualMachinePowerOpModeSoft, nil),
-			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"<nil>/32", "192.168.0.10/33"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
+			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", nil, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
+			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"<nil>/32", "192.168.0.10/33"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
 			wantErr:           true,
 		},
 		{
 			name:              "updating existing IP with invalid ips can not be done",
-			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
-			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"<nil>/32", "192.168.0.10/33"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
+			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
+			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"<nil>/32", "192.168.0.10/33"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
 			wantErr:           true,
 		},
 		{
 			name:              "updating server cannot be done",
-			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
-			vsphereMachine:    createVSphereMachine("bar.com", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
+			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
+			vsphereMachine:    createVSphereMachine("bar.com", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
+			wantErr:           true,
+		},
+		{
+			name:              "updating pci devices cannot be done",
+			oldVSphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu"}}),
+			vsphereMachine:    createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "new-vgpu"}}),
 			wantErr:           true,
 		},
 		{
 			name:              "powerOffMode cannot be updated when new powerOffMode is not valid",
-			oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil),
-			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}),
+			oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, nil),
+			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeHard, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}, nil),
 			wantErr:           true,
 		},
 		{
 			name:              "powerOffMode can be updated to hard",
-			oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}),
-			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeHard, nil),
+			oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}, nil),
+			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeHard, nil, nil),
 			wantErr:           false,
 		},
 		{
 			name:              "powerOffMode can be updated to soft",
-			oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}),
-			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil),
+			oldVSphereMachine: createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, &metav1.Duration{Duration: infrav1.GuestSoftPowerOffDefaultTimeout}, nil),
+			vsphereMachine:    createVSphereMachine("foo.com", &someProviderID, "", []string{"192.168.0.1/32"}, infrav1.VirtualMachinePowerOpModeSoft, nil, nil),
 			wantErr:           false,
 		},
 	}
@@ -181,7 +221,7 @@ func TestVSphereMachine_ValidateUpdate(t *testing.T) {
 	}
 }
 
-func createVSphereMachine(server string, providerID *string, preferredAPIServerCIDR string, ips []string, powerOffMode infrav1.VirtualMachinePowerOpMode, guestSoftPowerOffTimeout *metav1.Duration) *infrav1.VSphereMachine {
+func createVSphereMachine(server string, providerID *string, preferredAPIServerCIDR string, ips []string, powerOffMode infrav1.VirtualMachinePowerOpMode, guestSoftPowerOffTimeout *metav1.Duration, pciDevices []infrav1.PCIDeviceSpec) *infrav1.VSphereMachine {
 	VSphereMachine := &infrav1.VSphereMachine{
 		Spec: infrav1.VSphereMachineSpec{
 			VirtualMachineCloneSpec: infrav1.VirtualMachineCloneSpec{
@@ -190,6 +230,7 @@ func createVSphereMachine(server string, providerID *string, preferredAPIServerC
 					PreferredAPIServerCIDR: preferredAPIServerCIDR,
 					Devices:                []infrav1.NetworkDeviceSpec{},
 				},
+				PciDevices: pciDevices,
 			},
 			ProviderID:               providerID,
 			PowerOffMode:             powerOffMode,
diff --git a/internal/webhooks/vspheremachinetemplate.go b/internal/webhooks/vspheremachinetemplate.go
index e72aeb5c5e..2497249c83 100644
--- a/internal/webhooks/vspheremachinetemplate.go
+++ b/internal/webhooks/vspheremachinetemplate.go
@@ -85,10 +85,14 @@ func (webhook *VSphereMachineTemplateWebhook) ValidateCreate(_ context.Context,
 		}
 	}
 	for i, device := range spec.PciDevices {
-		hasVGPU := device.VGPUProfile != ""
-		hasPCI := device.DeviceID != nil && device.VendorID != nil
-		if (hasPCI && hasVGPU) || (!hasPCI && !hasVGPU) {
-			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d, i)), device, "should have either deviceID + vendorID or vgpuProfile"))
+		if device.VGPUProfile == "" {
+			if device.DeviceID == nil || device.VendorID == nil {
+				allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have both deviceId and vendorId set"))
+			}
+		} else {
+			if device.DeviceID != nil || device.VendorID != nil {
+				allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vgpuProfile"))
+			}
 		}
 	}
 	return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs)
diff --git a/internal/webhooks/vspheremachinetemplate_test.go b/internal/webhooks/vspheremachinetemplate_test.go
index 20f6eddacf..d1a0714645 100644
--- a/internal/webhooks/vspheremachinetemplate_test.go
+++ b/internal/webhooks/vspheremachinetemplate_test.go
@@ -37,37 +37,70 @@ func TestVSphereMachineTemplate_ValidateCreate(t *testing.T) {
 	}{
 		{
 			name:           "preferredAPIServerCIDR set on creation ",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "192.168.0.1/32", []string{}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "192.168.0.1/32", []string{}, nil),
 			wantErr:        true,
 		},
 		{
 			name:           "ProviderID set on creation",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{}, nil),
 			wantErr:        true,
 		},
 		{
 			name:           "IPs are not in CIDR format",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32", "192.168.0.3"}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32", "192.168.0.3"}, nil),
 			wantErr:        true,
 		},
 		{
 			name:           "successful VSphereMachine creation",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, nil),
 			wantErr:        true,
 		},
 		{
 			name:           "incomplete hardware version",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, nil),
 			wantErr:        true,
 		},
 		{
 			name:           "incorrect hardware version",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-0", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-0", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, nil),
 			wantErr:        true,
 		},
+		{
+			name:           "empty pciDevice",
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: ""}}),
+			wantErr:        true,
+		},
+		{
+			name:           "pciDevice with vgpuProfile and deviceId",
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32)}}),
+			wantErr:        true,
+		},
+		{
+			name:           "pciDevice with vgpuProfile, deviceId and vendorId",
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32), VendorID: new(int32)}}),
+			wantErr:        true,
+		},
+		{
+			name:           "pciDevice with deviceId but no vendorId",
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: new(int32)}}),
+			wantErr:        true,
+		},
+		{
+			name:           "pciDevice with vendorId but no deviceId",
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VendorID: new(int32)}}),
+			wantErr:        true,
+		},
+		{
+			name:           "successful VSphereMachine creation with PCI device",
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: new(int32), VendorID: new(int32)}}),
+		},
+		{
+			name:           "successful VSphereMachine creation with vgpu",
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu"}}),
+		},
 		{
 			name:           "successful VSphereMachine creation with hardware version set",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, nil),
 		},
 	}
 	for _, tc := range tests {
@@ -94,36 +127,43 @@ func TestVSphereMachineTemplate_ValidateUpdate(t *testing.T) {
 	}{
 		{
 			name:              "ProviderID cannot be updated",
-			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}),
-			vsphereMachine:    createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{"192.168.0.1/32"}),
+			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}, nil),
+			vsphereMachine:    createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{"192.168.0.1/32"}, nil),
 			req:               &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}},
 			wantErr:           true,
 		},
 		{
 			name:              "ip addresses cannot be updated",
-			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}),
-			vsphereMachine:    createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}),
+			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}, nil),
+			vsphereMachine:    createVSphereMachineTemplate("foo.com", "", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, nil),
 			req:               &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}},
 			wantErr:           true,
 		},
 		{
 			name:              "server cannot be updated",
-			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}),
-			vsphereMachine:    createVSphereMachineTemplate("baz.com", "", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}),
+			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "", nil, "", []string{"192.168.0.1/32"}, nil),
+			vsphereMachine:    createVSphereMachineTemplate("baz.com", "", &someProviderID, "", []string{"192.168.0.1/32", "192.168.0.10/32"}, nil),
 			req:               &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}},
 			wantErr:           true,
 		},
 		{
 			name:              "hardware version cannot be updated",
-			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}),
-			vsphereMachine:    createVSphereMachineTemplate("baz.com", "vmx-17", nil, "", []string{"192.168.0.1/32"}),
+			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}, nil),
+			vsphereMachine:    createVSphereMachineTemplate("baz.com", "vmx-17", nil, "", []string{"192.168.0.1/32"}, nil),
+			req:               &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}},
+			wantErr:           true,
+		},
+		{
+			name:              "pci devices cannot be updated",
+			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu"}}),
+			vsphereMachine:    createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}, []infrav1.PCIDeviceSpec{{VGPUProfile: "new-vgpu"}}),
 			req:               &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}},
 			wantErr:           true,
 		},
 		{
 			name:              "with hardware version set and not updated",
-			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}),
-			vsphereMachine:    createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}),
+			oldVSphereMachine: createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}, nil),
+			vsphereMachine:    createVSphereMachineTemplate("foo.com", "vmx-16", nil, "", []string{"192.168.0.1/32"}, nil),
 			req:               &admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{DryRun: ptr.To(false)}},
 			wantErr:           false, // explicitly calling out that this is a valid scenario.
 		},
@@ -145,7 +185,7 @@ func TestVSphereMachineTemplate_ValidateUpdate(t *testing.T) {
 	}
 }
 
-func createVSphereMachineTemplate(server, hwVersion string, providerID *string, preferredAPIServerCIDR string, ips []string) *infrav1.VSphereMachineTemplate {
+func createVSphereMachineTemplate(server, hwVersion string, providerID *string, preferredAPIServerCIDR string, ips []string, pciDevices []infrav1.PCIDeviceSpec) *infrav1.VSphereMachineTemplate {
 	vsphereMachineTemplate := &infrav1.VSphereMachineTemplate{
 		Spec: infrav1.VSphereMachineTemplateSpec{
 			Template: infrav1.VSphereMachineTemplateResource{
@@ -158,6 +198,7 @@ func createVSphereMachineTemplate(server, hwVersion string, providerID *string,
 							Devices:                []infrav1.NetworkDeviceSpec{},
 						},
 						HardwareVersion: hwVersion,
+						PciDevices:      pciDevices,
 					},
 				},
 			},
diff --git a/pkg/services/govmomi/pci/device.go b/pkg/services/govmomi/pci/device.go
index a6e1a1132b..eff285859d 100644
--- a/pkg/services/govmomi/pci/device.go
+++ b/pkg/services/govmomi/pci/device.go
@@ -77,20 +77,19 @@ func ConstructDeviceSpecs(pciDeviceSpecs []infrav1.PCIDeviceSpec) []types.BaseVi
 }
 
 func createBackingInfo(spec infrav1.PCIDeviceSpec) types.BaseVirtualDeviceBackingInfo {
-	if spec.DeviceID != nil && spec.VendorID != nil {
-		return &types.VirtualPCIPassthroughDynamicBackingInfo{
-			AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{
-				{
-					VendorId: *spec.VendorID,
-					DeviceId: *spec.DeviceID,
-				},
-			},
-			CustomLabel: spec.CustomLabel,
+	if spec.VGPUProfile != "" {
+		return &types.VirtualPCIPassthroughVmiopBackingInfo{
+			Vgpu: spec.VGPUProfile,
 		}
 	}
-
-	return &types.VirtualPCIPassthroughVmiopBackingInfo{
-		Vgpu: spec.VGPUProfile,
+	return &types.VirtualPCIPassthroughDynamicBackingInfo{
+		AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{
+			{
+				VendorId: *spec.VendorID,
+				DeviceId: *spec.DeviceID,
+			},
+		},
+		CustomLabel: spec.CustomLabel,
 	}
 }
 

From 8a6a558323774a8f3bfd8c79e71005ea3d891bad Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Fri, 7 Jun 2024 17:59:45 +0200
Subject: [PATCH 13/21] Reorder if-statement

---
 pkg/services/govmomi/pci/device.go | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pkg/services/govmomi/pci/device.go b/pkg/services/govmomi/pci/device.go
index eff285859d..7c41cb194e 100644
--- a/pkg/services/govmomi/pci/device.go
+++ b/pkg/services/govmomi/pci/device.go
@@ -94,9 +94,8 @@ func createBackingInfo(spec infrav1.PCIDeviceSpec) types.BaseVirtualDeviceBackin
 }
 
 func constructKey(pciDeviceSpec infrav1.PCIDeviceSpec) string {
-	if pciDeviceSpec.DeviceID != nil && pciDeviceSpec.VendorID != nil {
-		return fmt.Sprintf("%d-%d", *pciDeviceSpec.DeviceID, *pciDeviceSpec.VendorID)
+	if pciDeviceSpec.VGPUProfile != "" {
+		return pciDeviceSpec.VGPUProfile
 	}
-
-	return pciDeviceSpec.VGPUProfile
+	return fmt.Sprintf("%d-%d", *pciDeviceSpec.DeviceID, *pciDeviceSpec.VendorID)
 }

From cea00162ab59746157a2986512c699833ad25fe6 Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Wed, 12 Jun 2024 11:00:39 +0200
Subject: [PATCH 14/21] Run make generate

---
 ...ructure.cluster.x-k8s.io_vspheremachines.yaml |  9 ++++++---
 ...cluster.x-k8s.io_vspheremachinetemplates.yaml | 16 +++++++++-------
 ...frastructure.cluster.x-k8s.io_vspherevms.yaml |  7 ++++---
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
index 4dc55498e0..f775114990 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
@@ -1370,6 +1370,7 @@ spec:
                         DeviceID is the device ID of a virtual machine's PCI, in integer.
                         Defaults to the eponymous property value in the template from which the
                         virtual machine is cloned.
+                        Mutually exclusive with VGPUProfile.
                       format: int32
                       type: integer
                     vendorId:
@@ -1377,12 +1378,14 @@ spec:
                         VendorId is the vendor ID of a virtual machine's PCI, in integer.
                         Defaults to the eponymous property value in the template from which the
                         virtual machine is cloned.
+                        Mutually exclusive with VGPUProfile.
                       format: int32
                       type: integer
                     vgpuProfile:
-                      description: VGPUProfile is the profile name of a virtual machine's
-                        vGPU, in string. Defaults to the eponymous property value
-                        in the template from which the virtual machine is cloned.
+                      description: |-
+                        VGPUProfile is the profile name of a virtual machine's vGPU, in string.
+                        Defaults to the eponymous property value in the template from which the
+                        virtual machine is cloned.
                         Mutually exclusive with DeviceID and VendorID.
                       type: string
                   type: object
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
index bc8d15216d..a9518be232 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
@@ -1244,22 +1244,24 @@ spec:
                               description: |-
                                 DeviceID is the device ID of a virtual machine's PCI, in integer.
                                 Defaults to the eponymous property value in the template from which the
-                                virtual machine is cloned. Mutually exclusive with VGPUProfile.
+                                virtual machine is cloned.
+                                Mutually exclusive with VGPUProfile.
                               format: int32
                               type: integer
                             vendorId:
                               description: |-
                                 VendorId is the vendor ID of a virtual machine's PCI, in integer.
                                 Defaults to the eponymous property value in the template from which the
-                                virtual machine is cloned. Mutually exclusive with VGPUProfile.
+                                virtual machine is cloned.
+                                Mutually exclusive with VGPUProfile.
                               format: int32
                               type: integer
                             vgpuProfile:
-                              description: VGPUProfile is the profile name of a virtual
-                                machine's vGPU, in string. Defaults to the eponymous
-                                property value in the template from which the virtual
-                                machine is cloned. Mutually exclusive with DeviceID
-                                and VendorID.
+                              description: |-
+                                VGPUProfile is the profile name of a virtual machine's vGPU, in string.
+                                Defaults to the eponymous property value in the template from which the
+                                virtual machine is cloned.
+                                Mutually exclusive with DeviceID and VendorID.
                               type: string
                           type: object
                         type: array
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
index 6ce31f5d74..a35692c085 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
@@ -1470,9 +1470,10 @@ spec:
                       format: int32
                       type: integer
                     vgpuProfile:
-                      description: VGPUProfile is the profile name of a virtual machine's
-                        vGPU, in string. Defaults to the eponymous property value
-                        in the template from which the virtual machine is cloned.
+                      description: |-
+                        VGPUProfile is the profile name of a virtual machine's vGPU, in string.
+                        Defaults to the eponymous property value in the template from which the
+                        virtual machine is cloned.
                         Mutually exclusive with DeviceID and VendorID.
                       type: string
                   type: object

From 9fc17c97b281e862e48c4e8a7d01a7f88c86d3d8 Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Thu, 20 Jun 2024 09:43:58 +0200
Subject: [PATCH 15/21] Share device validation logic in webhooks

---
 internal/webhooks/vspheremachine.go         | 30 +++++++++++++--------
 internal/webhooks/vspheremachinetemplate.go | 14 +++-------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/internal/webhooks/vspheremachine.go b/internal/webhooks/vspheremachine.go
index f183e9a10e..7fe2abf24c 100644
--- a/internal/webhooks/vspheremachine.go
+++ b/internal/webhooks/vspheremachine.go
@@ -92,17 +92,8 @@ func (webhook *VSphereMachineWebhook) ValidateCreate(_ context.Context, raw runt
 			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "guestSoftPowerOffTimeout"), spec.GuestSoftPowerOffTimeout, "should be greater than 0"))
 		}
 	}
-	for i, device := range spec.PciDevices {
-		if device.VGPUProfile == "" {
-			if device.DeviceID == nil || device.VendorID == nil {
-				allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have both deviceId and vendorId set"))
-			}
-		} else {
-			if device.DeviceID != nil || device.VendorID != nil {
-				allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vgpuProfile"))
-			}
-		}
-	}
+	pciErrs := validatePCIDevices(spec.PciDevices)
+	allErrs = append(allErrs, pciErrs...)
 
 	return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs)
 }
@@ -171,3 +162,20 @@ func (webhook *VSphereMachineWebhook) ValidateUpdate(_ context.Context, oldRaw r
 func (webhook *VSphereMachineWebhook) ValidateDelete(_ context.Context, _ runtime.Object) (admission.Warnings, error) {
 	return nil, nil
 }
+
+func validatePCIDevices(devices []infrav1.PCIDeviceSpec) field.ErrorList {
+	var allErrs field.ErrorList
+
+	for i, device := range devices {
+		if device.VGPUProfile != "" && device.DeviceID == nil && device.VendorID == nil {
+			// Valid case for vGPU.
+			continue
+		}
+		if device.VGPUProfile == "" && device.DeviceID != nil && device.VendorID != nil {
+			// Valid case for PCI Passthrough.
+			continue
+		}
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vgpuProfile set"))
+	}
+	return allErrs
+}
diff --git a/internal/webhooks/vspheremachinetemplate.go b/internal/webhooks/vspheremachinetemplate.go
index 2497249c83..8a1c865481 100644
--- a/internal/webhooks/vspheremachinetemplate.go
+++ b/internal/webhooks/vspheremachinetemplate.go
@@ -84,17 +84,9 @@ func (webhook *VSphereMachineTemplateWebhook) ValidateCreate(_ context.Context,
 			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "guestSoftPowerOffTimeout"), spec.GuestSoftPowerOffTimeout, "should be greater than 0"))
 		}
 	}
-	for i, device := range spec.PciDevices {
-		if device.VGPUProfile == "" {
-			if device.DeviceID == nil || device.VendorID == nil {
-				allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have both deviceId and vendorId set"))
-			}
-		} else {
-			if device.DeviceID != nil || device.VendorID != nil {
-				allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vgpuProfile"))
-			}
-		}
-	}
+	pciErrs := validatePCIDevices(spec.PciDevices)
+	allErrs = append(allErrs, pciErrs...)
+
 	return nil, AggregateObjErrors(obj.GroupVersionKind().GroupKind(), obj.Name, allErrs)
 }
 

From 0e84953eb8b48075a03dfddfc63862986c80f3c0 Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Thu, 20 Jun 2024 09:47:55 +0200
Subject: [PATCH 16/21] Fix empty pointer nit

---
 internal/webhooks/vspheremachine_test.go         | 11 ++++++-----
 internal/webhooks/vspheremachinetemplate_test.go | 10 +++++-----
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/internal/webhooks/vspheremachine_test.go b/internal/webhooks/vspheremachine_test.go
index 8c1fa7857e..64b5c327f9 100644
--- a/internal/webhooks/vspheremachine_test.go
+++ b/internal/webhooks/vspheremachine_test.go
@@ -22,6 +22,7 @@ import (
 
 	. "github.com/onsi/gomega"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
 
 	infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1"
 )
@@ -84,27 +85,27 @@ func TestVSphereMachine_ValidateCreate(t *testing.T) {
 		},
 		{
 			name:           "incorrect pciDevice",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32)}}),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: ptr.To[int32](1)}}),
 			wantErr:        true,
 		},
 		{
 			name:           "incorrect pciDevice",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32), VendorID: new(int32)}}),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: ptr.To[int32](1), VendorID: ptr.To[int32](1)}}),
 			wantErr:        true,
 		},
 		{
 			name:           "incomplete pciDevice",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: new(int32)}}),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: ptr.To[int32](1)}}),
 			wantErr:        true,
 		},
 		{
 			name:           "incomplete pciDevice",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VendorID: new(int32)}}),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{VendorID: ptr.To[int32](1)}}),
 			wantErr:        true,
 		},
 		{
 			name:           "successful VSphereMachine creation with PCI device",
-			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: new(int32), VendorID: new(int32)}}),
+			vsphereMachine: createVSphereMachine("foo.com", nil, "", []string{"192.168.0.1/32", "192.168.0.3/32"}, infrav1.VirtualMachinePowerOpModeTrySoft, nil, []infrav1.PCIDeviceSpec{{DeviceID: ptr.To[int32](1), VendorID: ptr.To[int32](1)}}),
 		},
 		{
 			name:           "successful VSphereMachine creation with vgpu",
diff --git a/internal/webhooks/vspheremachinetemplate_test.go b/internal/webhooks/vspheremachinetemplate_test.go
index d1a0714645..95ccd7042e 100644
--- a/internal/webhooks/vspheremachinetemplate_test.go
+++ b/internal/webhooks/vspheremachinetemplate_test.go
@@ -72,27 +72,27 @@ func TestVSphereMachineTemplate_ValidateCreate(t *testing.T) {
 		},
 		{
 			name:           "incorrect pciDevice",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32)}}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: ptr.To[int32](1)}}),
 			wantErr:        true,
 		},
 		{
 			name:           "incorrect pciDevice",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: new(int32), VendorID: new(int32)}}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VGPUProfile: "vgpu", DeviceID: ptr.To[int32](1), VendorID: ptr.To[int32](1)}}),
 			wantErr:        true,
 		},
 		{
 			name:           "incomplete pciDevice",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: new(int32)}}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: ptr.To[int32](1)}}),
 			wantErr:        true,
 		},
 		{
 			name:           "incomplete pciDevice",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VendorID: new(int32)}}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{VendorID: ptr.To[int32](1)}}),
 			wantErr:        true,
 		},
 		{
 			name:           "successful VSphereMachine creation with PCI device",
-			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: new(int32), VendorID: new(int32)}}),
+			vsphereMachine: createVSphereMachineTemplate("foo.com", "vmx-17", nil, "", []string{}, []infrav1.PCIDeviceSpec{{DeviceID: ptr.To[int32](1), VendorID: ptr.To[int32](1)}}),
 		},
 		{
 			name:           "successful VSphereMachine creation with vgpu",

From 1f433ebda9af3f0f6fc94f9cbdc18feabbb5f0db Mon Sep 17 00:00:00 2001
From: Birk Lewin <89076383+birksl@users.noreply.github.com>
Date: Thu, 20 Jun 2024 09:49:35 +0200
Subject: [PATCH 17/21] Update docs/gpu-vgpu.md

Co-authored-by: Christian Schlotter <chrischdi@users.noreply.github.com>
---
 docs/gpu-vgpu.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md
index 268aab1075..67c482db98 100644
--- a/docs/gpu-vgpu.md
+++ b/docs/gpu-vgpu.md
@@ -103,4 +103,4 @@ Note: For GPU nodes (PCI Passthrough or vGPU), all memory of the nodes must be r
 
 Apply the manifest from the previous step to your management cluster to have CAPV create a workload cluster with worker nodes that have vGPUs.
 
-From this point on, the setup is exactly the same as [GPU enabled clusters via PCI Passthrough](https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/blob/main/docs/gpu-pci.md#create-the-cluster). 
+From this point on, the setup is exactly the same as [GPU enabled clusters via PCI Passthrough](./gpu-pci.md#create-the-cluster). 

From 609542bac0384d538a11e537c2172d59f59a6188 Mon Sep 17 00:00:00 2001
From: Birk Lewin <89076383+birksl@users.noreply.github.com>
Date: Thu, 20 Jun 2024 12:26:08 +0200
Subject: [PATCH 18/21] Apply suggestions from code review

Co-authored-by: Lubomir I. Ivanov <neolit123@gmail.com>
---
 apis/v1beta1/types.go                 | 2 +-
 docs/gpu-vgpu.md                      | 3 ++-
 internal/webhooks/vspheremachine.go   | 2 +-
 pkg/services/govmomi/vcenter/clone.go | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go
index d542adf722..3372ffa118 100644
--- a/apis/v1beta1/types.go
+++ b/apis/v1beta1/types.go
@@ -265,7 +265,7 @@ type PCIDeviceSpec struct {
 	// virtual machine is cloned.
 	// Mutually exclusive with DeviceID and VendorID.
 	// +kubebuilder:validation:Required
-	VGPUProfile string `json:"vgpuProfile,omitempty"`
+	VGPUProfile string `json:"vGPUProfile,omitempty"`
 	// CustomLabel is the hardware label of a virtual machine's PCI device.
 	// Defaults to the eponymous property value in the template from which the
 	// virtual machine is cloned.
diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md
index 67c482db98..bc37b238aa 100644
--- a/docs/gpu-vgpu.md
+++ b/docs/gpu-vgpu.md
@@ -93,7 +93,8 @@ spec:
         - vgpuProfile: "grid_t4-1a" # value from above
 ```
 
-Set the required values for the other fields and the cluster template is ready for use. The similar changes can be made to a template generated using clusterctl generate cluster command as well.
+Set the required values for the other fields and the cluster template is ready for use.
+Similar changes can be made to a template generated using the `clusterctl generate cluster` command as well.
 
 ### Create the cluster
 
diff --git a/internal/webhooks/vspheremachine.go b/internal/webhooks/vspheremachine.go
index 7fe2abf24c..328c7c1361 100644
--- a/internal/webhooks/vspheremachine.go
+++ b/internal/webhooks/vspheremachine.go
@@ -175,7 +175,7 @@ func validatePCIDevices(devices []infrav1.PCIDeviceSpec) field.ErrorList {
 			// Valid case for PCI Passthrough.
 			continue
 		}
-		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vgpuProfile set"))
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "pciDevices", fmt.Sprintf("%d", i)), device, "should have either deviceId + vendorId or vGPUProfile set"))
 	}
 	return allErrs
 }
diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go
index 0334e247e1..91f64eb28d 100644
--- a/pkg/services/govmomi/vcenter/clone.go
+++ b/pkg/services/govmomi/vcenter/clone.go
@@ -69,7 +69,7 @@ func Clone(ctx context.Context, vmCtx *capvcontext.VMContext, bootstrapData []by
 		}
 	}
 	if vmCtx.VSphereVM.Spec.CustomVMXKeys != nil {
-		log.Info("Applied custom vmx keys to VM clone spec")
+		log.Info("Applied custom VMX keys to VM clone spec")
 		if err := extraConfig.SetCustomVMXKeys(vmCtx.VSphereVM.Spec.CustomVMXKeys); err != nil {
 			return err
 		}

From 438c6e5b285056bc17e38c755424709937df4d2a Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Thu, 20 Jun 2024 12:27:28 +0200
Subject: [PATCH 19/21] Generate manifests

---
 ...structure.cluster.x-k8s.io_vspheremachines.yaml | 14 +++++++-------
 ...e.cluster.x-k8s.io_vspheremachinetemplates.yaml | 14 +++++++-------
 ...infrastructure.cluster.x-k8s.io_vspherevms.yaml | 14 +++++++-------
 docs/gpu-vgpu.md                                   |  2 +-
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
index f775114990..36dfb02772 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
@@ -1373,6 +1373,13 @@ spec:
                         Mutually exclusive with VGPUProfile.
                       format: int32
                       type: integer
+                    vGPUProfile:
+                      description: |-
+                        VGPUProfile is the profile name of a virtual machine's vGPU, in string.
+                        Defaults to the eponymous property value in the template from which the
+                        virtual machine is cloned.
+                        Mutually exclusive with DeviceID and VendorID.
+                      type: string
                     vendorId:
                       description: |-
                         VendorId is the vendor ID of a virtual machine's PCI, in integer.
@@ -1381,13 +1388,6 @@ spec:
                         Mutually exclusive with VGPUProfile.
                       format: int32
                       type: integer
-                    vgpuProfile:
-                      description: |-
-                        VGPUProfile is the profile name of a virtual machine's vGPU, in string.
-                        Defaults to the eponymous property value in the template from which the
-                        virtual machine is cloned.
-                        Mutually exclusive with DeviceID and VendorID.
-                      type: string
                   type: object
                 type: array
               powerOffMode:
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
index a9518be232..54733547a6 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
@@ -1248,6 +1248,13 @@ spec:
                                 Mutually exclusive with VGPUProfile.
                               format: int32
                               type: integer
+                            vGPUProfile:
+                              description: |-
+                                VGPUProfile is the profile name of a virtual machine's vGPU, in string.
+                                Defaults to the eponymous property value in the template from which the
+                                virtual machine is cloned.
+                                Mutually exclusive with DeviceID and VendorID.
+                              type: string
                             vendorId:
                               description: |-
                                 VendorId is the vendor ID of a virtual machine's PCI, in integer.
@@ -1256,13 +1263,6 @@ spec:
                                 Mutually exclusive with VGPUProfile.
                               format: int32
                               type: integer
-                            vgpuProfile:
-                              description: |-
-                                VGPUProfile is the profile name of a virtual machine's vGPU, in string.
-                                Defaults to the eponymous property value in the template from which the
-                                virtual machine is cloned.
-                                Mutually exclusive with DeviceID and VendorID.
-                              type: string
                           type: object
                         type: array
                       powerOffMode:
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
index a35692c085..333df3fe42 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
@@ -1461,6 +1461,13 @@ spec:
                         Mutually exclusive with VGPUProfile.
                       format: int32
                       type: integer
+                    vGPUProfile:
+                      description: |-
+                        VGPUProfile is the profile name of a virtual machine's vGPU, in string.
+                        Defaults to the eponymous property value in the template from which the
+                        virtual machine is cloned.
+                        Mutually exclusive with DeviceID and VendorID.
+                      type: string
                     vendorId:
                       description: |-
                         VendorId is the vendor ID of a virtual machine's PCI, in integer.
@@ -1469,13 +1476,6 @@ spec:
                         Mutually exclusive with VGPUProfile.
                       format: int32
                       type: integer
-                    vgpuProfile:
-                      description: |-
-                        VGPUProfile is the profile name of a virtual machine's vGPU, in string.
-                        Defaults to the eponymous property value in the template from which the
-                        virtual machine is cloned.
-                        Mutually exclusive with DeviceID and VendorID.
-                      type: string
                   type: object
                 type: array
               powerOffMode:
diff --git a/docs/gpu-vgpu.md b/docs/gpu-vgpu.md
index bc37b238aa..81d1ea19d1 100644
--- a/docs/gpu-vgpu.md
+++ b/docs/gpu-vgpu.md
@@ -90,7 +90,7 @@ spec:
       template: '${VSPHERE_TEMPLATE}'
       thumbprint: '${VSPHERE_TLS_THUMBPRINT}'
       pciDevices:
-        - vgpuProfile: "grid_t4-1a" # value from above
+        - vGPUProfile: "grid_t4-1a" # value from above
 ```
 
 Set the required values for the other fields and the cluster template is ready for use.

From f18ba350b357098d97d7a13c1fd321a2dbb9d015 Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Thu, 20 Jun 2024 12:37:20 +0200
Subject: [PATCH 20/21] Update PCIDevice doc comments

---
 apis/v1beta1/types.go | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go
index 3372ffa118..728f26a457 100644
--- a/apis/v1beta1/types.go
+++ b/apis/v1beta1/types.go
@@ -251,19 +251,22 @@ type PCIDeviceSpec struct {
 	// DeviceID is the device ID of a virtual machine's PCI, in integer.
 	// Defaults to the eponymous property value in the template from which the
 	// virtual machine is cloned.
-	// Mutually exclusive with VGPUProfile.
+	// Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID
+	// are two independent ways to define PCI devices.
 	// +kubebuilder:validation:Required
 	DeviceID *int32 `json:"deviceId,omitempty"`
 	// VendorId is the vendor ID of a virtual machine's PCI, in integer.
 	// Defaults to the eponymous property value in the template from which the
 	// virtual machine is cloned.
-	// Mutually exclusive with VGPUProfile.
+	// Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID
+	// are two independent ways to define PCI devices.
 	// +kubebuilder:validation:Required
 	VendorID *int32 `json:"vendorId,omitempty"`
 	// VGPUProfile is the profile name of a virtual machine's vGPU, in string.
 	// Defaults to the eponymous property value in the template from which the
 	// virtual machine is cloned.
-	// Mutually exclusive with DeviceID and VendorID.
+	// Mutually exclusive with DeviceID and VendorID as VGPUProfile and DeviceID + VendorID
+	// are two independent ways to define PCI devices.
 	// +kubebuilder:validation:Required
 	VGPUProfile string `json:"vGPUProfile,omitempty"`
 	// CustomLabel is the hardware label of a virtual machine's PCI device.

From ee5d3ac355744cfa3f724421bcf6f49c0c179dc6 Mon Sep 17 00:00:00 2001
From: Birk Lewin <birk.lewin@xait.com>
Date: Thu, 20 Jun 2024 12:43:18 +0200
Subject: [PATCH 21/21] Forgot to run make generate

---
 .../infrastructure.cluster.x-k8s.io_vspheremachines.yaml | 9 ++++++---
 ...ructure.cluster.x-k8s.io_vspheremachinetemplates.yaml | 9 ++++++---
 .../infrastructure.cluster.x-k8s.io_vspherevms.yaml      | 9 ++++++---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
index 36dfb02772..3f9b8fb56c 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml
@@ -1370,7 +1370,8 @@ spec:
                         DeviceID is the device ID of a virtual machine's PCI, in integer.
                         Defaults to the eponymous property value in the template from which the
                         virtual machine is cloned.
-                        Mutually exclusive with VGPUProfile.
+                        Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID
+                        are two independent ways to define PCI devices.
                       format: int32
                       type: integer
                     vGPUProfile:
@@ -1378,14 +1379,16 @@ spec:
                         VGPUProfile is the profile name of a virtual machine's vGPU, in string.
                         Defaults to the eponymous property value in the template from which the
                         virtual machine is cloned.
-                        Mutually exclusive with DeviceID and VendorID.
+                        Mutually exclusive with DeviceID and VendorID as VGPUProfile and DeviceID + VendorID
+                        are two independent ways to define PCI devices.
                       type: string
                     vendorId:
                       description: |-
                         VendorId is the vendor ID of a virtual machine's PCI, in integer.
                         Defaults to the eponymous property value in the template from which the
                         virtual machine is cloned.
-                        Mutually exclusive with VGPUProfile.
+                        Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID
+                        are two independent ways to define PCI devices.
                       format: int32
                       type: integer
                   type: object
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
index 54733547a6..9d72178886 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml
@@ -1245,7 +1245,8 @@ spec:
                                 DeviceID is the device ID of a virtual machine's PCI, in integer.
                                 Defaults to the eponymous property value in the template from which the
                                 virtual machine is cloned.
-                                Mutually exclusive with VGPUProfile.
+                                Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID
+                                are two independent ways to define PCI devices.
                               format: int32
                               type: integer
                             vGPUProfile:
@@ -1253,14 +1254,16 @@ spec:
                                 VGPUProfile is the profile name of a virtual machine's vGPU, in string.
                                 Defaults to the eponymous property value in the template from which the
                                 virtual machine is cloned.
-                                Mutually exclusive with DeviceID and VendorID.
+                                Mutually exclusive with DeviceID and VendorID as VGPUProfile and DeviceID + VendorID
+                                are two independent ways to define PCI devices.
                               type: string
                             vendorId:
                               description: |-
                                 VendorId is the vendor ID of a virtual machine's PCI, in integer.
                                 Defaults to the eponymous property value in the template from which the
                                 virtual machine is cloned.
-                                Mutually exclusive with VGPUProfile.
+                                Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID
+                                are two independent ways to define PCI devices.
                               format: int32
                               type: integer
                           type: object
diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
index 333df3fe42..f7c8474262 100644
--- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
+++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml
@@ -1458,7 +1458,8 @@ spec:
                         DeviceID is the device ID of a virtual machine's PCI, in integer.
                         Defaults to the eponymous property value in the template from which the
                         virtual machine is cloned.
-                        Mutually exclusive with VGPUProfile.
+                        Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID
+                        are two independent ways to define PCI devices.
                       format: int32
                       type: integer
                     vGPUProfile:
@@ -1466,14 +1467,16 @@ spec:
                         VGPUProfile is the profile name of a virtual machine's vGPU, in string.
                         Defaults to the eponymous property value in the template from which the
                         virtual machine is cloned.
-                        Mutually exclusive with DeviceID and VendorID.
+                        Mutually exclusive with DeviceID and VendorID as VGPUProfile and DeviceID + VendorID
+                        are two independent ways to define PCI devices.
                       type: string
                     vendorId:
                       description: |-
                         VendorId is the vendor ID of a virtual machine's PCI, in integer.
                         Defaults to the eponymous property value in the template from which the
                         virtual machine is cloned.
-                        Mutually exclusive with VGPUProfile.
+                        Mutually exclusive with VGPUProfile as VGPUProfile and DeviceID + VendorID
+                        are two independent ways to define PCI devices.
                       format: int32
                       type: integer
                   type: object