From 59361c746edd19fcad915247158de4a799d3a4fe Mon Sep 17 00:00:00 2001 From: Puneet Katyal Date: Thu, 14 Jul 2022 13:45:04 +0530 Subject: [PATCH] vGPU implementation - Builds on the changes in https://github.com/kubernetes-sigs/cluster-api-provider-vsphere/pull/1579 Signed-off-by: Puneet Katyal --- Makefile | 2 + apis/v1alpha3/zz_generated.conversion.go | 1 + apis/v1alpha4/zz_generated.conversion.go | 1 + apis/v1beta1/types.go | 12 +++ apis/v1beta1/zz_generated.deepcopy.go | 20 +++++ ...ture.cluster.x-k8s.io_vspheremachines.yaml | 13 ++++ ...ster.x-k8s.io_vspheremachinetemplates.yaml | 14 ++++ ...structure.cluster.x-k8s.io_vspherevms.yaml | 13 ++++ pkg/services/govmomi/vcenter/clone.go | 76 +++++++++++++++++-- test/e2e/config/vsphere-ci.yaml | 1 + test/e2e/config/vsphere-dev.yaml | 1 + .../kustomization/vgpu/kustomization.yaml | 6 ++ .../vgpu/vgpu-device-template.yaml | 11 +++ 13 files changed, 166 insertions(+), 5 deletions(-) create mode 100644 test/e2e/data/infrastructure-vsphere/kustomization/vgpu/kustomization.yaml create mode 100644 test/e2e/data/infrastructure-vsphere/kustomization/vgpu/vgpu-device-template.yaml diff --git a/Makefile b/Makefile index a5663cf92b..4a750e0ef5 100644 --- a/Makefile +++ b/Makefile @@ -158,6 +158,8 @@ e2e-templates: ## Generate e2e cluster templates "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build $(E2E_TEMPLATE_DIR)/kustomization/pci > $(E2E_TEMPLATE_DIR)/cluster-template-pci.yaml # for DHCP overrides "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build $(E2E_TEMPLATE_DIR)/kustomization/dhcp-overrides > $(E2E_TEMPLATE_DIR)/cluster-template-dhcp-overrides.yaml + # for vGPU template + "$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build $(E2E_TEMPLATE_DIR)/kustomization/vgpu > $(E2E_TEMPLATE_DIR)/cluster-template-vgpu.yaml .PHONY: test-integration test-integration: e2e-image diff --git a/apis/v1alpha3/zz_generated.conversion.go b/apis/v1alpha3/zz_generated.conversion.go index 5983bdc72c..1f308d19e0 100644 
--- a/apis/v1alpha3/zz_generated.conversion.go +++ b/apis/v1alpha3/zz_generated.conversion.go @@ -1687,6 +1687,7 @@ func autoConvert_v1beta1_VirtualMachineCloneSpec_To_v1alpha3_VirtualMachineClone out.CustomVMXKeys = *(*map[string]string)(unsafe.Pointer(&in.CustomVMXKeys)) // WARNING: in.TagIDs requires manual conversion: does not exist in peer-type // WARNING: in.PciDevices requires manual conversion: does not exist in peer-type + // WARNING: in.VGPUDevices requires manual conversion: does not exist in peer-type // WARNING: in.OS requires manual conversion: does not exist in peer-type // WARNING: in.HardwareVersion requires manual conversion: does not exist in peer-type return nil diff --git a/apis/v1alpha4/zz_generated.conversion.go b/apis/v1alpha4/zz_generated.conversion.go index 253fa6ff0d..23f3593072 100644 --- a/apis/v1alpha4/zz_generated.conversion.go +++ b/apis/v1alpha4/zz_generated.conversion.go @@ -1845,6 +1845,7 @@ func autoConvert_v1beta1_VirtualMachineCloneSpec_To_v1alpha4_VirtualMachineClone out.CustomVMXKeys = *(*map[string]string)(unsafe.Pointer(&in.CustomVMXKeys)) // WARNING: in.TagIDs requires manual conversion: does not exist in peer-type // WARNING: in.PciDevices requires manual conversion: does not exist in peer-type + // WARNING: in.VGPUDevices requires manual conversion: does not exist in peer-type // WARNING: in.OS requires manual conversion: does not exist in peer-type // WARNING: in.HardwareVersion requires manual conversion: does not exist in peer-type return nil diff --git a/apis/v1beta1/types.go b/apis/v1beta1/types.go index 40b543b1c4..13f3890172 100644 --- a/apis/v1beta1/types.go +++ b/apis/v1beta1/types.go @@ -166,6 +166,9 @@ type VirtualMachineCloneSpec struct { // PciDevices is the list of pci devices used by the virtual machine. // +optional PciDevices []PCIDeviceSpec `json:"pciDevices,omitempty"` + // VGPUDevices is the list of vGPUs used by the virtual machine. 
+ // +optional + VGPUDevices []VGPUSpec `json:"vgpuDevices,omitempty"` // OS is the Operating System of the virtual machine // Defaults to Linux // +optional @@ -233,6 +236,15 @@ type PCIDeviceSpec struct { VendorID *int32 `json:"vendorId,omitempty"` } +// VGPUSpec defines virtual machine's VGPU configuration +type VGPUSpec struct { + // ProfileName is the ProfileName of a virtual machine's vGPU, in string. + // Defaults to the eponymous property value in the template from which the + // virtual machine is cloned. + // +kubebuilder:validation:Required + ProfileName string `json:"profileName,omitempty"` +} + // NetworkSpec defines the virtual machine's network configuration. type NetworkSpec struct { // Devices is the list of network devices used by the virtual machine. diff --git a/apis/v1beta1/zz_generated.deepcopy.go b/apis/v1beta1/zz_generated.deepcopy.go index 15533fcaec..75b7c49337 100644 --- a/apis/v1beta1/zz_generated.deepcopy.go +++ b/apis/v1beta1/zz_generated.deepcopy.go @@ -403,6 +403,21 @@ func (in *Topology) DeepCopy() *Topology { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *VGPUSpec) DeepCopyInto(out *VGPUSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VGPUSpec. +func (in *VGPUSpec) DeepCopy() *VGPUSpec { + if in == nil { + return nil + } + out := new(VGPUSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *VSphereCluster) DeepCopyInto(out *VSphereCluster) { *out = *in @@ -1306,6 +1321,11 @@ func (in *VirtualMachineCloneSpec) DeepCopyInto(out *VirtualMachineCloneSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.VGPUDevices != nil { + in, out := &in.VGPUDevices, &out.VGPUDevices + *out = make([]VGPUSpec, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMachineCloneSpec. diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml index bbfff217c0..b292e05239 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachines.yaml @@ -1246,6 +1246,19 @@ spec: of the communication between Cluster API Provider vSphere and the VMware vCenter server. type: string + vgpuDevices: + description: VGPUDevices is the list of vGPUs used by the virtual + machine. + items: + description: VGPUSpec defines virtual machine's VGPU configuration + properties: + profileName: + description: ProfileName is the ProfileName of a virtual machine's + vGPU, in string. Defaults to the eponymous property value + in the template from which the virtual machine is cloned. + type: string + type: object + type: array required: - network - template diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml index 697563c52e..ed171d1955 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspheremachinetemplates.yaml @@ -1155,6 +1155,20 @@ spec: TLS certificate validation of the communication between Cluster API Provider vSphere and the VMware vCenter server. 
type: string + vgpuDevices: + description: VGPUDevices is the list of vGPUs used by the + virtual machine. + items: + description: VGPUSpec defines virtual machine's VGPU configuration + properties: + profileName: + description: ProfileName is the ProfileName of a virtual + machine's vGPU, in string. Defaults to the eponymous + property value in the template from which the virtual + machine is cloned. + type: string + type: object + type: array required: - network - template diff --git a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml index 6324b355e8..b22c5957bd 100644 --- a/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml +++ b/config/default/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml @@ -1289,6 +1289,19 @@ spec: of the communication between Cluster API Provider vSphere and the VMware vCenter server. type: string + vgpuDevices: + description: VGPUDevices is the list of vGPUs used by the virtual + machine. + items: + description: VGPUSpec defines virtual machine's VGPU configuration + properties: + profileName: + description: ProfileName is the ProfileName of a virtual machine's + vGPU, in string. Defaults to the eponymous property value + in the template from which the virtual machine is cloned. 
+ type: string + type: object + type: array required: - network - template diff --git a/pkg/services/govmomi/vcenter/clone.go b/pkg/services/govmomi/vcenter/clone.go index 999f984603..b0c39f316d 100644 --- a/pkg/services/govmomi/vcenter/clone.go +++ b/pkg/services/govmomi/vcenter/clone.go @@ -67,7 +67,7 @@ func Clone(ctx *context.VMContext, bootstrapData []byte, format bootstrapv1.Form } } if ctx.VSphereVM.Spec.CustomVMXKeys != nil { - ctx.Logger.Info("applied custom vmx keys o VM clone spec") + ctx.Logger.Info("applied custom vmx keys to VM clone spec") if err := extraConfig.SetCustomVMXKeys(ctx.VSphereVM.Spec.CustomVMXKeys); err != nil { return err } @@ -150,8 +150,22 @@ func Clone(ctx *context.VMContext, bootstrapData []byte, format bootstrapv1.Form deviceSpecs = append(deviceSpecs, networkSpecs...) - if err != nil { - return errors.Wrapf(err, "error getting network specs for %q", ctx) + if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices) != 0 { + gpuSpecs, err := getGpuSpecs(ctx) + if err != nil { + return errors.Wrapf(err, "error getting gpu specs for %q", ctx) + } + ctx.Logger.V(4).Info("created gpu devices", "gpu-device-specs", gpuSpecs) + deviceSpecs = append(deviceSpecs, gpuSpecs...) + } + + if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices) != 0 { + vgpuSpecs, err := getVgpuSpecs(ctx) + if err != nil { + return errors.Wrapf(err, "error getting vgpu specs for %q", ctx) + } + ctx.Logger.V(4).Info("created vgpu devices", "vgpu-device-specs", vgpuSpecs) + deviceSpecs = append(deviceSpecs, vgpuSpecs...) } numCPUs := ctx.VSphereVM.Spec.NumCPUs @@ -193,10 +207,10 @@ func Clone(ctx *context.VMContext, bootstrapData []byte, format bootstrapv1.Form Snapshot: snapshotRef, } - // For PCI devices, the memory for the VM needs to be reserved + // For PCI and vGPU devices, the memory for the VM needs to be reserved // We can replace this once we have another way of reserving memory option // exposed via the API types. 
- if len(ctx.VSphereVM.Spec.PciDevices) > 0 { + if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices) > 0 || len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices) > 0 { spec.Config.MemoryReservationLockedToMax = pointer.Bool(true) } @@ -424,3 +438,68 @@ func getNetworkSpecs(ctx *context.VMContext, devices object.VirtualDeviceList) ( return deviceSpecs, nil } + +// createPCIPassThroughDevice returns a PCI passthrough virtual device that +// uses the given device key and backing. +func createPCIPassThroughDevice(deviceKey int32, backingInfo types.BaseVirtualDeviceBackingInfo) types.BaseVirtualDevice { + device := &types.VirtualPCIPassthrough{ + VirtualDevice: types.VirtualDevice{ + Key: deviceKey, + Backing: backingInfo, + }, + } + return device +} + +// getGpuSpecs returns one add-device spec per entry in the VM's PciDevices, +// each backed by dynamic PCI passthrough for that entry's vendor and device +// IDs. It returns an error when an entry leaves either ID unset. +func getGpuSpecs(ctx *context.VMContext) ([]types.BaseVirtualDeviceConfigSpec, error) { + deviceSpecs := []types.BaseVirtualDeviceConfigSpec{} + deviceKey := int32(-200) + + for i, pciDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices { + // Both IDs are pointers; reject unset ones instead of panicking below. + if pciDevice.VendorID == nil || pciDevice.DeviceID == nil { + return nil, errors.Errorf("pciDevices[%d] must set both vendorId and deviceId", i) + } + backingInfo := &types.VirtualPCIPassthroughDynamicBackingInfo{ + AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{ + { + VendorId: *pciDevice.VendorID, + DeviceId: *pciDevice.DeviceID, + }, + }, + } + dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo) + deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{ + Device: dynamicDirectPathDevice, + Operation: types.VirtualDeviceConfigSpecOperationAdd, + }) + deviceKey-- + } + return deviceSpecs, nil +} + +// getVgpuSpecs returns one add-device spec per entry in the VM's VGPUDevices, +// each backed by that entry's vGPU profile name. +func getVgpuSpecs(ctx *context.VMContext) ([]types.BaseVirtualDeviceConfigSpec, error) { + deviceSpecs := []types.BaseVirtualDeviceConfigSpec{} + // Start below the keys getGpuSpecs assigns so the temporary device keys do + // not repeat when a VM requests both PCI and vGPU devices. + deviceKey := int32(-200) - int32(len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices)) + + for _, vGPUDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices { + backingInfo := &types.VirtualPCIPassthroughVmiopBackingInfo{ + Vgpu: vGPUDevice.ProfileName, + } + dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo) + deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{ + Device: dynamicDirectPathDevice, + Operation: 
types.VirtualDeviceConfigSpecOperationAdd, + }) + ctx.Logger.V(4).Info("created vGPU device", "vgpu-profile", vGPUDevice.ProfileName) + deviceKey-- + } + return deviceSpecs, nil +} diff --git a/test/e2e/config/vsphere-ci.yaml b/test/e2e/config/vsphere-ci.yaml index 5bead4fc92..538ee1d7e5 100644 --- a/test/e2e/config/vsphere-ci.yaml +++ b/test/e2e/config/vsphere-ci.yaml @@ -152,6 +152,7 @@ variables: # These IDs correspond to Tesla T4s, they are the decimal representation of the hex values. DEVICE_ID: 7864 VENDOR_ID: 4318 + PROFILE_NAME: grid_v100d-4c # CAPV feature flags EXP_NODE_ANTI_AFFINITY: "true" EXP_NODE_LABELING: "true" diff --git a/test/e2e/config/vsphere-dev.yaml b/test/e2e/config/vsphere-dev.yaml index eebebc42f6..83316091e9 100644 --- a/test/e2e/config/vsphere-dev.yaml +++ b/test/e2e/config/vsphere-dev.yaml @@ -168,6 +168,7 @@ variables: CLUSTER_TOPOLOGY: "true" DEVICE_ID: 7864 VENDOR_ID: 4318 + PROFILE_NAME: grid_v100d-4c # CAPV feature flags EXP_NODE_ANTI_AFFINITY: "true" EXP_NODE_LABELING: "true" diff --git a/test/e2e/data/infrastructure-vsphere/kustomization/vgpu/kustomization.yaml b/test/e2e/data/infrastructure-vsphere/kustomization/vgpu/kustomization.yaml new file mode 100644 index 0000000000..143eb7f2fd --- /dev/null +++ b/test/e2e/data/infrastructure-vsphere/kustomization/vgpu/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ../base +patchesStrategicMerge: + - vgpu-device-template.yaml diff --git a/test/e2e/data/infrastructure-vsphere/kustomization/vgpu/vgpu-device-template.yaml b/test/e2e/data/infrastructure-vsphere/kustomization/vgpu/vgpu-device-template.yaml new file mode 100644 index 0000000000..4404df5f3f --- /dev/null +++ b/test/e2e/data/infrastructure-vsphere/kustomization/vgpu/vgpu-device-template.yaml @@ -0,0 +1,11 @@ +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: VSphereMachineTemplate +metadata: + name: ${CLUSTER_NAME}-worker + namespace: ${NAMESPACE} 
+spec: + template: + spec: + vgpuDevices: + - profileName: ${PROFILE_NAME} \ No newline at end of file