Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support OFED with DTK #711

Merged
merged 1 commit into from
Jan 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ spec:
fieldPath: metadata.namespace
- name: ENABLE_WEBHOOKS
value: "false"
- name: USE_DTK
value: "true"
securityContext:
allowPrivilegeEscalation: false
livenessProbe:
Expand Down
8 changes: 8 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,14 @@ rules:
- create
- patch
- update
- apiGroups:
- image.openshift.io
resources:
- imagestreams
verbs:
- get
- list
- watch
- apiGroups:
- k8s.cni.cncf.io
resources:
Expand Down
1 change: 1 addition & 0 deletions controllers/nicclusterpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ type NicClusterPolicyReconciler struct {
// +kubebuilder:rbac:groups=nv-ipam.nvidia.com,resources=ippools/status,verbs=get;update;patch;
// +kubebuilder:rbac:groups=cert-manager.io,resources=issuers;certificates,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=admissionregistration.k8s.io,resources=validatingwebhookconfigurations,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=image.openshift.io,resources=imagestreams,verbs=get;list;watch

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
Expand Down
1 change: 1 addition & 0 deletions deployment/network-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,7 @@ parameters.
| `deployCR` | bool | `false` | Deploy `NicClusterPolicy` custom resource according to provided parameters |
| `nodeAffinity` | yaml | `` | Override the node affinity for various Daemonsets deployed by network operator, e.g. whereabouts, multus, cni-plugins. |
| `tolerations` | yaml | `` | Set additional tolerations for various Daemonsets deployed by network operator, e.g. whereabouts, multus, cni-plugins. |
| `useDTK` | bool | `true` | Enable use of the Driver Toolkit (DTK) to compile OFED drivers (OpenShift only). |

#### imagePullSecrets customization

Expand Down
2 changes: 2 additions & 0 deletions deployment/network-operator/templates/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ spec:
value: "network-operator"
- name: ENABLE_WEBHOOKS
value: "{{ .Values.operator.admissionController.enabled }}"
- name: USE_DTK
adrianchiris marked this conversation as resolved.
Show resolved Hide resolved
value: "{{ .Values.operator.useDTK }}"
{{- if .Values.operator.cniBinDirectory }}
- name: CNI_BIN_DIR
value: "{{ .Values.operator.cniBinDirectory }}"
Expand Down
8 changes: 8 additions & 0 deletions deployment/network-operator/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,14 @@ rules:
- create
- patch
- update
- apiGroups:
- image.openshift.io
resources:
- imagestreams
verbs:
- get
- list
- watch
- apiGroups:
- k8s.cni.cncf.io
resources:
Expand Down
1 change: 1 addition & 0 deletions deployment/network-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ operator:
# tag, if defined will use the given image tag, else Chart.AppVersion will be used
# tag
cniBinDirectory: /opt/cni/bin
useDTK: true
admissionController:
enabled: false
useCertManager: true
Expand Down
1 change: 1 addition & 0 deletions hack/templates/values/values.template
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ operator:
# tag, if defined will use the given image tag, else Chart.AppVersion will be used
# tag
cniBinDirectory: /opt/cni/bin
useDTK: true
admissionController:
enabled: false
useCertManager: true
Expand Down
2 changes: 2 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/NVIDIA/k8s-operator-libs/pkg/upgrade"
netattdefv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1"
osconfigv1 "github.com/openshift/api/config/v1"
imagev1 "github.com/openshift/api/image/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
Expand Down Expand Up @@ -59,6 +60,7 @@ func init() {
utilruntime.Must(mellanoxcomv1alpha1.AddToScheme(scheme))
utilruntime.Must(netattdefv1.AddToScheme(scheme))
utilruntime.Must(osconfigv1.AddToScheme(scheme))
utilruntime.Must(imagev1.AddToScheme(scheme))
// +kubebuilder:scaffold:scheme
}

Expand Down
49 changes: 49 additions & 0 deletions manifests/state-ofed-driver/0050_ofed-driver-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@ spec:
seLinuxOptions:
level: "s0"
env:
{{- if .RuntimeSpec.UseDtk }}
- name: DTK_OCP_DRIVER_BUILD
value: "true"
- name: DTK_OCP_NIC_SHARED_DIR
adrianchiris marked this conversation as resolved.
Show resolved Hide resolved
value: "/mnt/shared-doca-driver-toolkit"
{{- end }}
{{- if .CrSpec.Env }}
adrianchiris marked this conversation as resolved.
Show resolved Hide resolved
{{- range .CrSpec.Env }}
{{ . | yaml | nindentPrefix 14 "- " }}
Expand Down Expand Up @@ -111,6 +117,10 @@ spec:
readOnly: {{ .ReadOnly }}
{{- end }}
{{- end }}
{{- if .RuntimeSpec.UseDtk }}
- name: shared-doca-driver-toolkit
mountPath: /mnt/shared-doca-driver-toolkit
{{- end}}
{{- with index .RuntimeSpec.ContainerResources "mofed-container" }}
resources:
{{- if .Requests }}
Expand Down Expand Up @@ -145,6 +155,41 @@ spec:
initialDelaySeconds: {{ .CrSpec.ReadinessProbe.InitialDelaySeconds }}
failureThreshold: 1
periodSeconds: {{ .CrSpec.ReadinessProbe.PeriodSeconds }}
{{- if .RuntimeSpec.UseDtk }}
adrianchiris marked this conversation as resolved.
Show resolved Hide resolved
- image: {{ .RuntimeSpec.DtkImageName }}
imagePullPolicy: IfNotPresent
name: openshift-driver-toolkit-ctr
command: [bash, -xc]
args:
- |
until [ -f /mnt/shared-doca-driver-toolkit/dtk_start_compile ]; do
echo Waiting for mofed-container container to prepare the shared directory
sleep 3
done
exec /mnt/shared-doca-driver-toolkit/dtk_nic_driver_build.sh
env:
adrianchiris marked this conversation as resolved.
Show resolved Hide resolved
- name: DTK_OCP_NIC_SHARED_DIR
value: "/mnt/shared-doca-driver-toolkit"
{{- if .CrSpec.Env }}
{{- range .CrSpec.Env }}
{{ . | yaml | nindentPrefix 14 "- " }}
{{- end }}
{{- end }}
volumeMounts:
- name: shared-doca-driver-toolkit
mountPath: /mnt/shared-doca-driver-toolkit
{{- with index .RuntimeSpec.ContainerResources "openshift-driver-toolkit-ctr" }}
resources:
{{- if .Requests }}
requests:
{{ .Requests | yaml | nindent 14}}
{{- end }}
{{- if .Limits }}
limits:
{{ .Limits | yaml | nindent 14}}
{{- end }}
{{- end }}
{{- end }}
# unloading OFED modules can take more time than default terminationGracePeriod (30 sec)
terminationGracePeriodSeconds: {{ .CrSpec.TerminationGracePeriodSeconds }}
volumes:
Expand Down Expand Up @@ -175,6 +220,10 @@ spec:
{{ . | yaml | nindentPrefix 14 "- " }}
{{- end }}
{{- end }}
{{- if .RuntimeSpec.UseDtk }}
- name: shared-doca-driver-toolkit
emptyDir: {}
{{- end }}
nodeSelector:
feature.node.kubernetes.io/pci-15b3.present: "true"
feature.node.kubernetes.io/system-os_release.ID: {{ .RuntimeSpec.OSName }}
Expand Down
2 changes: 2 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ type OFEDStateConfig struct {
// InitContainerImage is a full image name (registry, image name, tag) for the OFED init container.
// The init container will not be deployed if this variable is empty/not set.
InitContainerImage string `env:"OFED_INIT_CONTAINER_IMAGE"`
// UseDTK enables use of the Driver Toolkit sidecar to compile OFED drivers (relevant for OpenShift only).
UseDTK bool `env:"USE_DTK" envDefault:"true"`
}

// FromEnv pulls the operator configuration from the environment.
Expand Down
4 changes: 4 additions & 0 deletions pkg/nodeinfo/attributes.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ const (
NodeLabelNvGPU = "nvidia.com/gpu.present"
NodeLabelWaitOFED = "network.nvidia.com/operator.mofed.wait"
NodeLabelCudaVersionMajor = "nvidia.com/cuda.driver.major"
NodeLabelOSTreeVersion = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION"
)

// AttributeType categorizes Attributes of the host.
Expand All @@ -51,6 +52,7 @@ const (
AttrTypeOSVer
// optional attrs
AttrTypeCudaVersionMajor
AttrTypeOSTreeVersion

OptionalAttrsStart = AttrTypeCudaVersionMajor
)
Expand All @@ -66,6 +68,8 @@ var attrToLabel = []string{
NodeLabelOSVer,
// AttrTypeCudaVersionMajor
NodeLabelCudaVersionMajor,
// AttrTypeOSTreeVersion
NodeLabelOSTreeVersion,
}

// NodeAttributes provides attributes of a specific node
Expand Down
43 changes: 43 additions & 0 deletions pkg/state/state_ofed.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/NVIDIA/k8s-operator-libs/pkg/upgrade"
"github.com/go-logr/logr"
osconfigv1 "github.com/openshift/api/config/v1"
apiimagev1 "github.com/openshift/api/image/v1"
"github.com/pkg/errors"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -156,6 +157,8 @@ type ofedRuntimeSpec struct {
// is true if cluster type is Openshift
IsOpenshift bool
ContainerResources ContainerResourcesMap
UseDtk bool
DtkImageName string
}

type ofedManifestRenderData struct {
Expand Down Expand Up @@ -394,6 +397,7 @@ func (s *stateOFED) GetManifestObjects(
if clusterInfo == nil {
return nil, errors.New("clusterInfo provider required")
}

attrs := nodeInfo.GetNodesAttributes(
nodeinfo.NewNodeLabelFilterBuilder().WithLabel(nodeinfo.NodeLabelMlnxNIC, "true").Build())
if len(attrs) == 0 {
Expand All @@ -410,6 +414,19 @@ func (s *stateOFED) GetManifestObjects(
}
nodeAttr := attrs[0].Attributes

useDtk := clusterInfo.IsOpenshift() && config.FromEnv().State.OFEDState.UseDTK
var dtkImageName string
if useDtk {
if err := s.checkAttributesExist(attrs[0], nodeinfo.AttrTypeOSTreeVersion); err != nil {
return nil, err
}
dtk, err := s.getOCPDriverToolkitImage(ctx, nodeAttr[nodeinfo.AttrTypeOSTreeVersion])
if err != nil {
return nil, fmt.Errorf("failed to get OpenShift DTK image : %v", err)
}
dtkImageName = dtk
}

setProbesDefaults(cr)

// Update MOFED Env variables with defaults for the cluster
Expand Down Expand Up @@ -441,6 +458,8 @@ func (s *stateOFED) GetManifestObjects(
config.FromEnv().State.OFEDState.InitContainerImage),
IsOpenshift: clusterInfo.IsOpenshift(),
ContainerResources: createContainerResourcesMap(cr.Spec.OFEDDriver.ContainerResources),
UseDtk: useDtk,
DtkImageName: dtkImageName,
},
Tolerations: cr.Spec.Tolerations,
NodeAffinity: cr.Spec.NodeAffinity,
Expand Down Expand Up @@ -735,3 +754,27 @@ func (s *stateOFED) handleRepoConfig(
}
return nil
}

// getOCPDriverToolkitImage fetches the "driver-toolkit" ImageStream from the
// "openshift" namespace and returns the image name referenced by the tag whose
// name equals ostreeVersion.
//
// An error is returned when the ImageStream cannot be retrieved, or when no tag
// carrying an image reference matches ostreeVersion.
func (s *stateOFED) getOCPDriverToolkitImage(ctx context.Context, ostreeVersion string) (string, error) {
	reqLogger := log.FromContext(ctx)
	dtkImageStream := &apiimagev1.ImageStream{}
	name := "driver-toolkit"
	namespace := "openshift"
	err := s.client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, dtkImageStream)
	if err != nil {
		reqLogger.Error(err, "Couldn't get the driver-toolkit imagestream")
		return "", err
	}
	reqLogger.Info("ocpDriverToolkitImages: driver-toolkit imagestream found")

	image, found := "", false
	// Index into the slice instead of ranging by value to avoid copying each tag.
	for i := range dtkImageStream.Spec.Tags {
		tag := &dtkImageStream.Spec.Tags[i]
		if tag.Name != ostreeVersion {
			continue
		}
		// From is an optional pointer on the tag; a tag without it carries no
		// image reference and dereferencing it would panic, so skip it.
		if tag.From == nil {
			continue
		}
		// Keep scanning so that, should several tags share the name, the last
		// one wins.
		image, found = tag.From.Name, true
	}
	if !found {
		return "", fmt.Errorf("failed to find DTK image for RHCOS version: %v", ostreeVersion)
	}
	return image, nil
}
Loading
Loading