From 1f3edadec24a19d07e109787e5d35310af939902 Mon Sep 17 00:00:00 2001 From: Oksana Baranova Date: Thu, 5 Dec 2024 14:50:38 +0200 Subject: [PATCH] add NFD rule to QAT resource driver (#66) * add NFD rule to QAT resource driver Signed-off-by: Oksana Baranova --- charts/intel-qat-resource-driver/Chart.yaml | 8 ++++++ charts/intel-qat-resource-driver/README.md | 20 ++++++++++++++ .../templates/nfd.yaml | 27 +++++++++++++++++++ charts/intel-qat-resource-driver/values.yaml | 18 ++++++++++--- 4 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 charts/intel-qat-resource-driver/templates/nfd.yaml diff --git a/charts/intel-qat-resource-driver/Chart.yaml b/charts/intel-qat-resource-driver/Chart.yaml index 9826339..b838d2b 100644 --- a/charts/intel-qat-resource-driver/Chart.yaml +++ b/charts/intel-qat-resource-driver/Chart.yaml @@ -5,3 +5,11 @@ description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel QAT Reso type: application version: 0.1.0 appVersion: "v0.1.0" +home: https://github.com/intel/helm-charts + +dependencies: + - name: node-feature-discovery + alias: nfd + version: "0.16.6" + condition: nfd.enabled + repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts diff --git a/charts/intel-qat-resource-driver/README.md b/charts/intel-qat-resource-driver/README.md index 31d6aff..ddf69b2 100644 --- a/charts/intel-qat-resource-driver/README.md +++ b/charts/intel-qat-resource-driver/README.md @@ -16,7 +16,9 @@ helm repo update You can execute `helm search repo intel` command to see pulled charts [optional]. 
## Install Helm Chart +When installing, update the dependencies: ``` +helm dependency update helm install intel-qat-resource-driver intel/intel-qat-resource-driver ``` ## Upgrade Chart @@ -46,3 +48,21 @@ You may also run `helm show values` on this chart's dependencies for additional | image.tag | string | `"v0.1.0"` | If you change the image tag to be used in Helm chart deployment, ensure that the version of the container image is consistent with deployment YAMLs - they might change between releases. + + +## Read-only file system error for QAT + +When the following error appears in the logs of the QAT Kubelet plugin: +``` +kubectl logs -n intel-qat-resource-driver intel-qat-resource-driver-kubelet-plugin-ttcs6 +DRA kubelet plugin +In-cluster config +Setting up CDI +failed to create kubelet plugin driver: cannot enable PF device '0000:6b:00.0': open /sysfs/bus/pci/devices/0000:6b:00.0/sriov_numvfs: read-only file system +``` + +Try resetting QAT by reloading its kernel driver: +``` +rmmod qat_4xxx +modprobe qat_4xxx +``` diff --git a/charts/intel-qat-resource-driver/templates/nfd.yaml b/charts/intel-qat-resource-driver/templates/nfd.yaml new file mode 100644 index 0000000..4648073 --- /dev/null +++ b/charts/intel-qat-resource-driver/templates/nfd.yaml @@ -0,0 +1,27 @@ +apiVersion: nfd.k8s-sigs.io/v1alpha1 +kind: NodeFeatureRule +metadata: + name: intel-qat-device-rule +spec: + rules: + - name: "intel.qat" + labels: + feature.node.kubernetes.io/qat: "true" + matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["8086"]} + device: {op: In, value: ["4940", "4941", "4944", "4946"]} + class: {op: In, value: ["0b40"]} + - feature: kernel.loadedmodule + matchExpressions: + intel_qat: {op: Exists} + matchAny: + - matchFeatures: + - feature: kernel.loadedmodule + matchExpressions: + vfio_pci: {op: Exists} + - matchFeatures: + - feature: kernel.enabledmodule + matchExpressions: + vfio-pci: {op: Exists} diff --git 
a/charts/intel-qat-resource-driver/values.yaml b/charts/intel-qat-resource-driver/values.yaml index 439d75a..329f9dc 100644 --- a/charts/intel-qat-resource-driver/values.yaml +++ b/charts/intel-qat-resource-driver/values.yaml @@ -19,6 +19,8 @@ serviceAccount: kubeletPlugin: podAnnotations: {} + nodeSelector: + feature.node.kubernetes.io/qat: "true" tolerations: - key: node-role.kubernetes.io/master operator: Exists @@ -26,7 +28,17 @@ kubeletPlugin: - key: node-role.kubernetes.io/control-plane operator: Exists effect: NoSchedule - nodeSelector: - {} - #node-role.kubernetes.io/control-plane: "" + # Refer to the official documentation for Node Feature Discovery (NFD) + # regarding node tainting: + # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting + - key: "node.kubernetes.io/qat" + operator: "Exists" + effect: "NoSchedule" affinity: {} + +nfd: + enabled: false # change to true to install NFD to the cluster + nameOverride: intel-qat-nfd + # TODO: this deprecated NFD option will be replaced in NFD v0.17 with "featureGates.NodeFeatureAPI" (added in v0.16): + # https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html#general-parameters + enableNodeFeatureApi: true