Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ vGPU implementation #1971

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ e2e-templates: ## Generate e2e cluster templates
"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build $(E2E_TEMPLATE_DIR)/kustomization/pci > $(E2E_TEMPLATE_DIR)/cluster-template-pci.yaml
# for DHCP overrides
"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build $(E2E_TEMPLATE_DIR)/kustomization/dhcp-overrides > $(E2E_TEMPLATE_DIR)/cluster-template-dhcp-overrides.yaml
# for vGPU template
"$(KUSTOMIZE)" --load-restrictor LoadRestrictionsNone build $(E2E_TEMPLATE_DIR)/kustomization/vgpu > $(E2E_TEMPLATE_DIR)/cluster-template-vgpu.yaml

.PHONY: test-integration
test-integration: e2e-image
Expand Down
1 change: 1 addition & 0 deletions apis/v1alpha3/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions apis/v1alpha4/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions apis/v1beta1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ type VirtualMachineCloneSpec struct {
// PciDevices is the list of pci devices used by the virtual machine.
// +optional
PciDevices []PCIDeviceSpec `json:"pciDevices,omitempty"`
// VGPUDevices is the list of vGPUs used by the virtual machine.
// +optional
VGPUDevices []VGPUSpec `json:"vgpuDevices,omitempty"`
// OS is the Operating System of the virtual machine
// Defaults to Linux
// +optional
Expand Down Expand Up @@ -233,6 +236,15 @@ type PCIDeviceSpec struct {
VendorID *int32 `json:"vendorId,omitempty"`
}

// VGPUSpec defines virtual machine's VGPU configuration.
// Each entry describes a single vGPU, identified by its profile name.
type VGPUSpec struct {
// ProfileName is the name of the vGPU profile to attach to the virtual
// machine, for example grid_v100d-4c. The value is required.
// +kubebuilder:validation:Required
ProfileName string `json:"profileName,omitempty"`
}

// NetworkSpec defines the virtual machine's network configuration.
type NetworkSpec struct {
// Devices is the list of network devices used by the virtual machine.
Expand Down
20 changes: 20 additions & 0 deletions apis/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -1246,6 +1246,19 @@ spec:
of the communication between Cluster API Provider vSphere and the
VMware vCenter server.
type: string
vgpuDevices:
description: VGPUDevices is the list of vGPUs used by the virtual
machine.
items:
description: VGPUSpec defines virtual machine's VGPU configuration
properties:
profileName:
description: ProfileName is the ProfileName of a virtual machine's
vGPU, in string. Defaults to the eponymous property value
in the template from which the virtual machine is cloned.
type: string
type: object
type: array
required:
- network
- template
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1155,6 +1155,20 @@ spec:
TLS certificate validation of the communication between
Cluster API Provider vSphere and the VMware vCenter server.
type: string
vgpuDevices:
description: VGPUDevices is the list of vGPUs used by the
virtual machine.
items:
description: VGPUSpec defines virtual machine's VGPU configuration
properties:
profileName:
description: ProfileName is the ProfileName of a virtual
machine's vGPU, in string. Defaults to the eponymous
property value in the template from which the virtual
machine is cloned.
type: string
type: object
type: array
required:
- network
- template
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1289,6 +1289,19 @@ spec:
of the communication between Cluster API Provider vSphere and the
VMware vCenter server.
type: string
vgpuDevices:
description: VGPUDevices is the list of vGPUs used by the virtual
machine.
items:
description: VGPUSpec defines virtual machine's VGPU configuration
properties:
profileName:
description: ProfileName is the ProfileName of a virtual machine's
vGPU, in string. Defaults to the eponymous property value
in the template from which the virtual machine is cloned.
type: string
type: object
type: array
required:
- network
- template
Expand Down
76 changes: 71 additions & 5 deletions pkg/services/govmomi/vcenter/clone.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
}
}
if ctx.VSphereVM.Spec.CustomVMXKeys != nil {
ctx.Logger.Info("applied custom vmx keys o VM clone spec")
ctx.Logger.Info("applied custom vmx keys to VM clone spec")
if err := extraConfig.SetCustomVMXKeys(ctx.VSphereVM.Spec.CustomVMXKeys); err != nil {
return err
}
Expand Down Expand Up @@ -150,8 +150,22 @@

deviceSpecs = append(deviceSpecs, networkSpecs...)

if err != nil {
return errors.Wrapf(err, "error getting network specs for %q", ctx)
if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices) != 0 {
gpuSpecs, err := getGpuSpecs(ctx)
if err != nil {
return errors.Wrapf(err, "error getting gpu specs for %q", ctx)
}
ctx.Logger.V(4).Info("created gpu devices", "gpu-device-specs", gpuSpecs)
deviceSpecs = append(deviceSpecs, gpuSpecs...)
}

if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices) != 0 {
vgpuSpecs, err := getVgpuSpecs(ctx)
if err != nil {
return errors.Wrapf(err, "error getting gpu specs for %q", ctx)
}
ctx.Logger.V(4).Info("created vgpu devices", "vgpu-device-specs", vgpuSpecs)
deviceSpecs = append(deviceSpecs, vgpuSpecs...)
}

numCPUs := ctx.VSphereVM.Spec.NumCPUs
Expand Down Expand Up @@ -193,10 +207,10 @@
Snapshot: snapshotRef,
}

// For PCI devices, the memory for the VM needs to be reserved
// For PCI and vGPU devices, the memory for the VM needs to be reserved
// We can replace this once we have another way of reserving memory option
// exposed via the API types.
if len(ctx.VSphereVM.Spec.PciDevices) > 0 {
if len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices) > 0 || len(ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices) > 0 {
spec.Config.MemoryReservationLockedToMax = pointer.Bool(true)
}

Expand Down Expand Up @@ -424,3 +438,55 @@

return deviceSpecs, nil
}

// createPCIPassThroughDevice wraps the given backing in a PCI passthrough
// virtual device that carries the supplied device key.
func createPCIPassThroughDevice(deviceKey int32, backingInfo types.BaseVirtualDeviceBackingInfo) types.BaseVirtualDevice {
	passthrough := types.VirtualDevice{
		Key:     deviceKey,
		Backing: backingInfo,
	}
	return &types.VirtualPCIPassthrough{VirtualDevice: passthrough}
}

func getGpuSpecs(ctx *context.VMContext) ([]types.BaseVirtualDeviceConfigSpec, error) {

Check failure on line 452 in pkg/services/govmomi/vcenter/clone.go

View workflow job for this annotation

GitHub Actions / lint

getGpuSpecs - result 1 (error) is always nil (unparam)

Check failure on line 452 in pkg/services/govmomi/vcenter/clone.go

View workflow job for this annotation

GitHub Actions / lint

getGpuSpecs - result 1 (error) is always nil (unparam)
deviceSpecs := []types.BaseVirtualDeviceConfigSpec{}
deviceKey := int32(-200)

for _, pciDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.PciDevices {
backingInfo := &types.VirtualPCIPassthroughDynamicBackingInfo{
AllowedDevice: []types.VirtualPCIPassthroughAllowedDevice{
{
VendorId: *pciDevice.VendorID,
DeviceId: *pciDevice.DeviceID,
},
},
}
dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo)
deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{
Device: dynamicDirectPathDevice,
Operation: types.VirtualDeviceConfigSpecOperationAdd,
})
deviceKey--
}
return deviceSpecs, nil
}

func getVgpuSpecs(ctx *context.VMContext) ([]types.BaseVirtualDeviceConfigSpec, error) {

Check failure on line 475 in pkg/services/govmomi/vcenter/clone.go

View workflow job for this annotation

GitHub Actions / lint

getVgpuSpecs - result 1 (error) is always nil (unparam)

Check failure on line 475 in pkg/services/govmomi/vcenter/clone.go

View workflow job for this annotation

GitHub Actions / lint

getVgpuSpecs - result 1 (error) is always nil (unparam)
deviceSpecs := []types.BaseVirtualDeviceConfigSpec{}
deviceKey := int32(-200)

for _, vGPUDevice := range ctx.VSphereVM.Spec.VirtualMachineCloneSpec.VGPUDevices {
backingInfo := &types.VirtualPCIPassthroughVmiopBackingInfo{
Vgpu: vGPUDevice.ProfileName,
}
dynamicDirectPathDevice := createPCIPassThroughDevice(deviceKey, backingInfo)
deviceSpecs = append(deviceSpecs, &types.VirtualDeviceConfigSpec{
Device: dynamicDirectPathDevice,
Operation: types.VirtualDeviceConfigSpecOperationAdd,
})
ctx.Logger.V(4).Info("created vGPU device", "vgpu-profile", vGPUDevice.ProfileName)
deviceKey--
}
return deviceSpecs, nil
}
1 change: 1 addition & 0 deletions test/e2e/config/vsphere-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ variables:
# These IDs correspond to Tesla T4s, they are the decimal representation of the hex values.
DEVICE_ID: 7864
VENDOR_ID: 4318
PROFILE_NAME: grid_v100d-4c
# CAPV feature flags
EXP_NODE_ANTI_AFFINITY: "true"
EXP_NODE_LABELING: "true"
Expand Down
1 change: 1 addition & 0 deletions test/e2e/config/vsphere-dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ variables:
CLUSTER_TOPOLOGY: "true"
DEVICE_ID: 7864
VENDOR_ID: 4318
PROFILE_NAME: grid_v100d-4c
# CAPV feature flags
EXP_NODE_ANTI_AFFINITY: "true"
EXP_NODE_LABELING: "true"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Kustomization that applies a strategic-merge patch on top of ../base.
# NOTE(review): the patch below is named pci-device-template.yaml; if this
# overlay is the vGPU one, confirm the file name is intended and not a
# leftover copied from the PCI overlay.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../base
patchesStrategicMerge:
- pci-device-template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# Patch for the ${CLUSTER_NAME}-worker VSphereMachineTemplate: sets
# vgpuDevices to a single entry whose profile name comes from ${PROFILE_NAME}.
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: VSphereMachineTemplate
metadata:
name: ${CLUSTER_NAME}-worker
namespace: ${NAMESPACE}
spec:
template:
spec:
vgpuDevices:
- profileName: ${PROFILE_NAME}
Loading