From 7b8e870899ea1a49b741346cbbc02dc9be5a9a8b Mon Sep 17 00:00:00 2001 From: Jordan May Date: Tue, 15 Oct 2024 13:00:50 -0400 Subject: [PATCH] Add GPU/Accelerator support to VMs --- api/v1beta1/gcpmachine_types.go | 19 ++++++++++++++ api/v1beta1/zz_generated.deepcopy.go | 20 +++++++++++++++ cloud/scope/machine.go | 21 ++++++++++++++++ ...tructure.cluster.x-k8s.io_gcpmachines.yaml | 25 +++++++++++++++++++ ....cluster.x-k8s.io_gcpmachinetemplates.yaml | 25 +++++++++++++++++++ docs/book/src/topics/gpus.md | 22 ++++++++++++++++ 6 files changed, 132 insertions(+) create mode 100644 docs/book/src/topics/gpus.md diff --git a/api/v1beta1/gcpmachine_types.go b/api/v1beta1/gcpmachine_types.go index dc7c3c012..ccbf08a89 100644 --- a/api/v1beta1/gcpmachine_types.go +++ b/api/v1beta1/gcpmachine_types.go @@ -346,6 +346,25 @@ type GCPMachineSpec struct { // RootDiskEncryptionKey defines the KMS key to be used to encrypt the root disk. // +optional RootDiskEncryptionKey *CustomerEncryptionKey `json:"rootDiskEncryptionKey,omitempty"` + + // GuestAccelerators is a list of the type and count of accelerator cards + // attached to the instance. + // +optional + GuestAccelerators []Accelerator `json:"guestAccelerators,omitempty"` +} + +// Accelerator is a specification of the type and number of accelerator +// cards attached to the instance. +type Accelerator struct { + // Count is the number of the guest accelerator cards exposed to this + // instance. + Count int64 `json:"count,omitempty"` + // Type is the full or partial URL of the accelerator type resource to + // attach to this instance. For example: + // projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100 + // If you are creating an instance template, specify only the accelerator name. + // See GPUs on Compute Engine for a full list of accelerator types. + Type string `json:"type,omitempty"` } // MetadataItem defines a single piece of metadata associated with an instance. 
diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 3c8560ad9..203d0d0dc 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -27,6 +27,21 @@ import ( "sigs.k8s.io/cluster-api/errors" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Accelerator) DeepCopyInto(out *Accelerator) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Accelerator. +func (in *Accelerator) DeepCopy() *Accelerator { + if in == nil { + return nil + } + out := new(Accelerator) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AttachedDiskSpec) DeepCopyInto(out *AttachedDiskSpec) { *out = *in @@ -505,6 +520,11 @@ func (in *GCPMachineSpec) DeepCopyInto(out *GCPMachineSpec) { *out = new(CustomerEncryptionKey) (*in).DeepCopyInto(*out) } + if in.GuestAccelerators != nil { + in, out := &in.GuestAccelerators, &out.GuestAccelerators + *out = make([]Accelerator, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GCPMachineSpec. diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index ce5d02d55..9f8d5d86c 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -374,6 +374,22 @@ func (m *MachineScope) InstanceAdditionalMetadataSpec() *compute.Metadata { return metadata } +// InstanceGuestAcceleratorsSpec returns one AcceleratorConfig per Spec.GuestAccelerators entry, or nil if there are none. 
+func (m *MachineScope) InstanceGuestAcceleratorsSpec() []*compute.AcceleratorConfig { + if len(m.GCPMachine.Spec.GuestAccelerators) == 0 { + return nil + } + accelConfigs := make([]*compute.AcceleratorConfig, 0, len(m.GCPMachine.Spec.GuestAccelerators)) + for _, accel := range m.GCPMachine.Spec.GuestAccelerators { + accelConfig := &compute.AcceleratorConfig{ + AcceleratorType: accel.Type, + AcceleratorCount: accel.Count, + } + accelConfigs = append(accelConfigs, accelConfig) + } + return accelConfigs +} + // InstanceSpec returns instance spec. func (m *MachineScope) InstanceSpec(log logr.Logger) *compute.Instance { instance := &compute.Instance{ @@ -457,6 +473,11 @@ func (m *MachineScope) InstanceSpec(log logr.Logger) *compute.Instance { instance.Metadata = m.InstanceAdditionalMetadataSpec() instance.ServiceAccounts = append(instance.ServiceAccounts, m.InstanceServiceAccountsSpec()) instance.NetworkInterfaces = append(instance.NetworkInterfaces, m.InstanceNetworkInterfaceSpec()) + instance.GuestAccelerators = m.InstanceGuestAcceleratorsSpec() + if len(instance.GuestAccelerators) > 0 { + instance.Scheduling.OnHostMaintenance = "TERMINATE" // NOTE(review): presumably because GPU VMs cannot live-migrate; confirm against the GCE GPU docs. + } + return instance } diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml index 5dbe18c8a..19b242a20 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml @@ -200,6 +200,31 @@ spec: - Enabled - Disabled type: string + guestAccelerators: + description: |- + GuestAccelerators is a list of the type and count of accelerator cards + attached to the instance. + items: + description: |- + Accelerator is a specification of the type and number of accelerator + cards attached to the instance. + properties: + count: + description: |- + Count is the number of the guest accelerator cards exposed to this + instance. 
+ format: int64 + type: integer + type: + description: |- + Type is the full or partial URL of the accelerator type resource to + attach to this instance. For example: + projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100 + If you are creating an instance template, specify only the accelerator name. + See GPUs on Compute Engine for a full list of accelerator types. + type: string + type: object + type: array image: description: |- Image is the full reference to a valid image to be used for this machine. diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachinetemplates.yaml index ece5bc8d8..0144c6691 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachinetemplates.yaml @@ -215,6 +215,31 @@ spec: - Enabled - Disabled type: string + guestAccelerators: + description: |- + GuestAccelerators is a list of the type and count of accelerator cards + attached to the instance. + items: + description: |- + Accelerator is a specification of the type and number of accelerator + cards attached to the instance. + properties: + count: + description: |- + Count is the number of the guest accelerator cards exposed to this + instance. + format: int64 + type: integer + type: + description: |- + Type is the full or partial URL of the accelerator type resource to + attach to this instance. For example: + projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100 + If you are creating an instance template, specify only the accelerator name. + See GPUs on Compute Engine for a full list of accelerator types. + type: string + type: object + type: array image: description: |- Image is the full reference to a valid image to be used for this machine. 
diff --git a/docs/book/src/topics/gpus.md b/docs/book/src/topics/gpus.md new file mode 100644 index 000000000..aebb9ac84 --- /dev/null +++ b/docs/book/src/topics/gpus.md @@ -0,0 +1,22 @@ +# GPUs + +Attach GPUs to a machine by setting the `guestAccelerators` field in the `GCPMachineTemplate` spec. Instances created with accelerators have their host-maintenance policy set to `TERMINATE`. + +```yaml +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: GCPMachineTemplate +metadata: + name: mygcpmachinetemplate + namespace: mynamespace +spec: + template: + spec: + image: projects/myproject/global/images/myimage + instanceType: n1-standard-2 + guestAccelerators: + - type: projects/myproject/zones/us-central1-c/acceleratorTypes/nvidia-tesla-t4 + count: 1 +``` + +See [GPUs on Compute Engine](https://cloud.google.com/compute/docs/gpus) for the list of available accelerator types.