Skip to content

Commit

Permalink
Merge pull request #1106 from rbaturov/expose-metrics-securely
Browse files Browse the repository at this point in the history
CNF-10142: Enable NROP metrics to be scraped securely by Prometheus
  • Loading branch information
ffromani authored Dec 19, 2024
2 parents 8bd44a4 + 42ae268 commit 6a57ee1
Show file tree
Hide file tree
Showing 11 changed files with 120 additions and 17 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Assuming you can push container images to a container registry and you are in th
1. build and upload the manifest bundle container image: `make bundle bundle-build bundle-push`
1. leverage `operator-sdk` to deploy the container: `operator-sdk run bundle ${REPO}/numaresources-operator-bundle:${VERSION}`. Note the build procedure typically downloads a local copy of `operator-sdk` in `bin/` which you can reuse

Note that when installing the operator with this method, you must add the `openshift.io/cluster-monitoring: "true"` label to the operator namespace to enable Prometheus cluster monitoring.

For further details, please refer to the [operator-sdk documentation](https://sdk.operatorframework.io/docs/olm-integration/tutorial-bundle/)

## roadmap
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
annotations:
service.beta.openshift.io/serving-cert-secret-name: metrics-service-cert
creationTimestamp: null
labels:
control-plane: controller-manager
name: numaresources-controller-manager-metrics-service
spec:
ports:
- name: https
port: 8080
protocol: TCP
targetPort: https
selector:
control-plane: controller-manager
status:
loadBalancer: {}
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ metadata:
}
]
capabilities: Basic Install
createdAt: "2024-11-14T15:38:47Z"
createdAt: "2024-12-18T20:58:55Z"
olm.skipRange: '>=4.18.0 <4.19.0'
operatorframework.io/cluster-monitoring: "true"
operators.operatorframework.io/builder: operator-sdk-v1.36.1
operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
name: numaresources-operator.v4.19.999-snapshot
Expand Down Expand Up @@ -532,6 +533,10 @@ spec:
initialDelaySeconds: 15
periodSeconds: 20
name: manager
ports:
- containerPort: 8080
name: https
protocol: TCP
readinessProbe:
httpGet:
path: /readyz
Expand All @@ -544,6 +549,10 @@ spec:
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
volumeMounts:
- mountPath: /certs
name: metrics-tls
readOnly: true
securityContext:
runAsNonRoot: true
serviceAccountName: numaresources-controller-manager
Expand All @@ -553,6 +562,10 @@ spec:
key: node-role.kubernetes.io/control-plane
- effect: NoSchedule
key: node-role.kubernetes.io/master
volumes:
- name: metrics-tls
secret:
secretName: metrics-service-cert
permissions:
- rules:
- apiGroups:
Expand Down
13 changes: 13 additions & 0 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ metadata:
annotations:
workload.openshift.io/allowed: management
labels:
openshift.io/cluster-monitoring: "true"
control-plane: controller-manager
name: system
---
Expand Down Expand Up @@ -43,15 +44,27 @@ spec:
matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: Exists
volumes:
- name : metrics-tls
secret:
secretName: metrics-service-cert
containers:
- command:
- /bin/numaresources-operator
args:
- -v=4
- --leader-elect
- --enable-scheduler
ports:
- containerPort: 8080
protocol: TCP
name: https
image: controller:latest
name: manager
volumeMounts:
- name: metrics-tls
mountPath: /certs
readOnly: true
securityContext:
allowPrivilegeEscalation: false
env:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ metadata:
alm-examples: '[]'
capabilities: Basic Install
olm.skipRange: '>=4.18.0 <4.19.0'
operatorframework.io/cluster-monitoring: "true"
name: numaresources-operator.v0.0.0
namespace: placeholder
spec:
Expand Down
1 change: 1 addition & 0 deletions config/prometheus/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
resources:
- rbac.yaml
- monitor.yaml
23 changes: 13 additions & 10 deletions config/prometheus/monitor.yaml
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@

# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
control-plane: controller-manager
name: controller-manager-metrics-monitor
name: controller-manager
namespace: system
spec:
endpoints:
- path: /metrics
port: https
scheme: https
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token"
interval: 30s
targetPort: 8080
path: /metrics
scheme: https
tlsConfig:
# The CA file used by Prometheus to verify the server's certificate.
# It's the cluster's CA bundle from the service CA operator.
caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
# The name of the server (CN) in the server's certificate.
serverName: numaresources-controller-manager-metrics-service.numaresources.svc
insecureSkipVerify: false
selector:
matchLabels:
control-plane: controller-manager
31 changes: 31 additions & 0 deletions config/prometheus/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# creates Role and RoleBinding for prometheus-k8s service account to access our namespace
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: system
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: openshift-monitoring
1 change: 1 addition & 0 deletions config/rbac/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ resources:
# runtime. Be sure to update RoleBinding and ClusterRoleBinding
# subjects if changing service account names.
- service_account.yaml
- service.yaml
- role.yaml
- role_binding.yaml
- leader_election_role.yaml
Expand Down
16 changes: 16 additions & 0 deletions config/rbac/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
apiVersion: v1
kind: Service
metadata:
annotations:
service.beta.openshift.io/serving-cert-secret-name: metrics-service-cert
labels:
control-plane: controller-manager
name: controller-manager-metrics-service
spec:
ports:
- name: https
port: 8080
protocol: TCP
targetPort: https
selector:
control-plane: controller-manager
15 changes: 9 additions & 6 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,11 @@ const (
)

const (
defaultWebhookPort = 9443
defaultMetricsAddr = ":8080"
defaultProbeAddr = ":8081"
defaultNamespace = "numaresources-operator"
defaultWebhookPort = 9443
defaultMetricsAddr = ":8080"
defaultMetricsSupport = true
defaultProbeAddr = ":8081"
defaultNamespace = "numaresources-operator"
)

var (
Expand Down Expand Up @@ -130,6 +131,7 @@ func (pa *Params) SetDefaults() {
pa.probeAddr = defaultProbeAddr
pa.render.Namespace = defaultNamespace
pa.enableReplicasDetect = true
pa.enableMetrics = defaultMetricsSupport
}

func (pa *Params) FromFlags() {
Expand Down Expand Up @@ -235,8 +237,9 @@ func main() {
Cache: cache.Options{}, // TODO: restrict namespace here?
Scheme: scheme,
Metrics: metricsserver.Options{
// TODO: secureServing?
BindAddress: params.metricsAddr,
BindAddress: params.metricsAddr,
SecureServing: true,
CertDir: "/certs",
},
WebhookServer: webhook.NewServer(webhook.Options{
Port: params.webhookPort,
Expand Down

0 comments on commit 6a57ee1

Please sign in to comment.