From 7b8e870899ea1a49b741346cbbc02dc9be5a9a8b Mon Sep 17 00:00:00 2001
From: Jordan May <jwmay2012@gmail.com>
Date: Tue, 15 Oct 2024 13:00:50 -0400
Subject: [PATCH] Add GPU/Accelerator support to VMs

---
 api/v1beta1/gcpmachine_types.go               | 19 ++++++++++++++
 api/v1beta1/zz_generated.deepcopy.go          | 20 +++++++++++++++
 cloud/scope/machine.go                        | 21 ++++++++++++++++
 ...tructure.cluster.x-k8s.io_gcpmachines.yaml | 25 +++++++++++++++++++
 ....cluster.x-k8s.io_gcpmachinetemplates.yaml | 25 +++++++++++++++++++
 docs/book/src/topics/gpus.md                  | 22 ++++++++++++++++
 6 files changed, 132 insertions(+)
 create mode 100644 docs/book/src/topics/gpus.md

diff --git a/api/v1beta1/gcpmachine_types.go b/api/v1beta1/gcpmachine_types.go
index dc7c3c012..ccbf08a89 100644
--- a/api/v1beta1/gcpmachine_types.go
+++ b/api/v1beta1/gcpmachine_types.go
@@ -346,6 +346,25 @@ type GCPMachineSpec struct {
 	// RootDiskEncryptionKey defines the KMS key to be used to encrypt the root disk.
 	// +optional
 	RootDiskEncryptionKey *CustomerEncryptionKey `json:"rootDiskEncryptionKey,omitempty"`
+
+	// GuestAccelerators is a list of the type and count of accelerator cards
+	// attached to the instance.
+	// +optional
+	GuestAccelerators []Accelerator `json:"guestAccelerators,omitempty"`
+}
+
+// Accelerator is a specification of the type and number of accelerator
+// cards attached to the instance.
+type Accelerator struct {
+	// Count is the number of the guest accelerator cards exposed to this
+	// instance.
+	Count int64 `json:"count,omitempty"`
+	// Type is the full or partial URL of the accelerator type resource to
+	// attach to this instance. For example:
+	// projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100
+	// If you are creating an instance template, specify only the accelerator name.
+	// See GPUs on Compute Engine for a full list of accelerator types.
+	Type string `json:"type,omitempty"`
 }
 
 // MetadataItem defines a single piece of metadata associated with an instance.
diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go
index 3c8560ad9..203d0d0dc 100644
--- a/api/v1beta1/zz_generated.deepcopy.go
+++ b/api/v1beta1/zz_generated.deepcopy.go
@@ -27,6 +27,21 @@ import (
 	"sigs.k8s.io/cluster-api/errors"
 )
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *Accelerator) DeepCopyInto(out *Accelerator) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Accelerator.
+func (in *Accelerator) DeepCopy() *Accelerator {
+	if in == nil {
+		return nil
+	}
+	out := new(Accelerator)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *AttachedDiskSpec) DeepCopyInto(out *AttachedDiskSpec) {
 	*out = *in
@@ -505,6 +520,11 @@ func (in *GCPMachineSpec) DeepCopyInto(out *GCPMachineSpec) {
 		*out = new(CustomerEncryptionKey)
 		(*in).DeepCopyInto(*out)
 	}
+	if in.GuestAccelerators != nil {
+		in, out := &in.GuestAccelerators, &out.GuestAccelerators
+		*out = make([]Accelerator, len(*in))
+		copy(*out, *in)
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GCPMachineSpec.
diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go
index ce5d02d55..9f8d5d86c 100644
--- a/cloud/scope/machine.go
+++ b/cloud/scope/machine.go
@@ -374,6 +374,22 @@ func (m *MachineScope) InstanceAdditionalMetadataSpec() *compute.Metadata {
 	return metadata
 }
 
+// InstanceGuestAcceleratorsSpec maps Spec.GuestAccelerators to compute AcceleratorConfig entries, or nil if none are set.
+func (m *MachineScope) InstanceGuestAcceleratorsSpec() []*compute.AcceleratorConfig {
+	if len(m.GCPMachine.Spec.GuestAccelerators) == 0 {
+		return nil // no accelerators requested on this machine
+	}
+	accelConfigs := make([]*compute.AcceleratorConfig, 0, len(m.GCPMachine.Spec.GuestAccelerators))
+	for _, accel := range m.GCPMachine.Spec.GuestAccelerators {
+		accelConfig := &compute.AcceleratorConfig{
+			AcceleratorType:  accel.Type,
+			AcceleratorCount: accel.Count,
+		}
+		accelConfigs = append(accelConfigs, accelConfig)
+	}
+	return accelConfigs // one entry per spec item, in spec order
+}
+
 // InstanceSpec returns instance spec.
 func (m *MachineScope) InstanceSpec(log logr.Logger) *compute.Instance {
 	instance := &compute.Instance{
@@ -457,6 +473,11 @@ func (m *MachineScope) InstanceSpec(log logr.Logger) *compute.Instance {
 	instance.Metadata = m.InstanceAdditionalMetadataSpec()
 	instance.ServiceAccounts = append(instance.ServiceAccounts, m.InstanceServiceAccountsSpec())
 	instance.NetworkInterfaces = append(instance.NetworkInterfaces, m.InstanceNetworkInterfaceSpec())
+	instance.GuestAccelerators = m.InstanceGuestAcceleratorsSpec()
+	if len(instance.GuestAccelerators) > 0 {
+		instance.Scheduling.OnHostMaintenance = "TERMINATE"
+	}
+
 	return instance
 }
 
diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml
index 5dbe18c8a..19b242a20 100644
--- a/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml
+++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml
@@ -200,6 +200,31 @@ spec:
                 - Enabled
                 - Disabled
                 type: string
+              guestAccelerators:
+                description: |-
+                  GuestAccelerators is a list of the type and count of accelerator cards
+                  attached to the instance.
+                items:
+                  description: |-
+                    Accelerator is a specification of the type and number of accelerator
+                    cards attached to the instance.
+                  properties:
+                    count:
+                      description: |-
+                        Count is the number of the guest accelerator cards exposed to this
+                        instance.
+                      format: int64
+                      type: integer
+                    type:
+                      description: |-
+                        Type is the full or partial URL of the accelerator type resource to
+                        attach to this instance. For example:
+                        projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100
+                        If you are creating an instance template, specify only the accelerator name.
+                        See GPUs on Compute Engine for a full list of accelerator types.
+                      type: string
+                  type: object
+                type: array
               image:
                 description: |-
                   Image is the full reference to a valid image to be used for this machine.
diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachinetemplates.yaml
index ece5bc8d8..0144c6691 100644
--- a/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachinetemplates.yaml
+++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachinetemplates.yaml
@@ -215,6 +215,31 @@ spec:
                         - Enabled
                         - Disabled
                         type: string
+                      guestAccelerators:
+                        description: |-
+                          GuestAccelerators is a list of the type and count of accelerator cards
+                          attached to the instance.
+                        items:
+                          description: |-
+                            Accelerator is a specification of the type and number of accelerator
+                            cards attached to the instance.
+                          properties:
+                            count:
+                              description: |-
+                                Count is the number of the guest accelerator cards exposed to this
+                                instance.
+                              format: int64
+                              type: integer
+                            type:
+                              description: |-
+                                Type is the full or partial URL of the accelerator type resource to
+                                attach to this instance. For example:
+                                projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100
+                                If you are creating an instance template, specify only the accelerator name.
+                                See GPUs on Compute Engine for a full list of accelerator types.
+                              type: string
+                          type: object
+                        type: array
                       image:
                         description: |-
                           Image is the full reference to a valid image to be used for this machine.
diff --git a/docs/book/src/topics/gpus.md b/docs/book/src/topics/gpus.md
new file mode 100644
index 000000000..aebb9ac84
--- /dev/null
+++ b/docs/book/src/topics/gpus.md
@@ -0,0 +1,22 @@
+# GPUs
+
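+Attach GPUs to a machine by setting the `guestAccelerators` field in the `GCPMachineTemplate` spec. When accelerators are set, the instance's `onHostMaintenance` scheduling policy is set to `TERMINATE`.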
+
+```yaml
+---
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: GCPMachineTemplate
+metadata:
+  name: mygcpmachinetemplate
+  namespace: mynamespace
+spec:
+  template:
+    spec:
+      image: projects/myproject/global/images/myimage
+      instanceType: n1-standard-2
+      guestAccelerators:
+      - type: projects/myproject/zones/us-central1-c/acceleratorTypes/nvidia-tesla-t4
+        count: 1
+```
+
+See [GPUs on Compute Engine](https://cloud.google.com/compute/docs/gpus) for a full list of accelerator types.