feat(controller): restart pods for replica set automatically
basti1302 committed Jun 20, 2024
1 parent 6d1e93c commit e690c31
Showing 5 changed files with 96 additions and 52 deletions.
16 changes: 16 additions & 0 deletions helm-chart/dash0-operator/templates/operator/cluster-roles.yaml
@@ -3,6 +3,7 @@ kind: ClusterRole
metadata:
name: {{ template "dash0-operator.chartName" . }}-manager-role
rules:

- apiGroups:
- apps
resources:
@@ -16,6 +17,7 @@ rules:
- patch
- update
- watch

- apiGroups:
- batch
resources:
@@ -27,18 +29,30 @@ rules:
- patch
- update
- watch

- apiGroups:
- ""
resources:
- events
verbs:
- create

- apiGroups:
- ""
resources:
- namespaces
verbs:
- get

- apiGroups:
- ""
resources:
- pods
verbs:
- get
- list
- delete

- apiGroups:
- operator.dash0.com
resources:
@@ -52,6 +66,7 @@ rules:
- patch
- update
- watch

- apiGroups:
- operator.dash0.com
resources:
@@ -66,6 +81,7 @@ rules:
- get
- patch
- update

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
@@ -41,6 +41,14 @@ cluster roles should match snapshot:
- namespaces
verbs:
- get
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- list
- delete
- apiGroups:
- operator.dash0.com
resources:
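For context, the rule added above gives the operator get, list, and delete on pods, which the pod-restart logic in the controller relies on. Below is a minimal sketch (not part of this commit) of how that permission could be verified in-cluster with client-go's SelfSubjectAccessReview API, assuming the code runs under the operator's service account; the namespace value is illustrative.

```go
package main

import (
	"context"
	"fmt"

	authorizationv1 "k8s.io/api/authorization/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	// Assumes the binary runs inside the cluster under the operator's service account.
	config, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	clientSet, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}

	// Ask the API server whether the current identity may delete pods in the given namespace.
	review := &authorizationv1.SelfSubjectAccessReview{
		Spec: authorizationv1.SelfSubjectAccessReviewSpec{
			ResourceAttributes: &authorizationv1.ResourceAttributes{
				Namespace: "default", // illustrative namespace
				Verb:      "delete",
				Resource:  "pods",
			},
		},
	}
	result, err := clientSet.AuthorizationV1().SelfSubjectAccessReviews().Create(
		context.Background(), review, metav1.CreateOptions{})
	if err != nil {
		panic(err)
	}
	fmt.Printf("allowed to delete pods: %v\n", result.Status.Allowed)
}
```

With the updated ClusterRole installed, this check should report true for the operator's service account.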
74 changes: 65 additions & 9 deletions internal/controller/dash0_controller.go
@@ -7,11 +7,13 @@ import (
"context"
"errors"
"fmt"
"slices"
"sort"

"github.com/go-logr/logr"
appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -52,6 +54,10 @@ const (
Uninstrumentation ModificationMode = "Uninstrumentation"
)

var (
timeoutForListingPods int64 = 2
)

type ImmutableWorkloadError struct {
workloadType string
workloadName string
@@ -540,14 +546,11 @@ func (r *Dash0Reconciler) instrumentReplicaSet(
replicaSet appsv1.ReplicaSet,
reconcileLogger *logr.Logger,
) {
// Note: ReplicaSet pods are not restarted automatically by Kubernetes when their spec is changed (for other
// resource types like deployments or daemonsets this is managed by Kubernetes automatically). For now, we rely on
// the user to manually restart the pods of their replica sets after they have been instrumented. We could consider
// finding all pods that are owned by the replica set and restarting them automatically.

r.instrumentWorkload(ctx, &replicaSetWorkload{
replicaSet: &replicaSet,
}, reconcileLogger)

r.restartPodsOfReplicaSet(ctx, replicaSet, reconcileLogger)
}

func (r *Dash0Reconciler) findAndInstrumentStatefulSets(
@@ -943,13 +946,11 @@ func (r *Dash0Reconciler) findAndUninstrumentReplicaSets(
}

func (r *Dash0Reconciler) uninstrumentReplicaSet(ctx context.Context, replicaSet appsv1.ReplicaSet, reconcileLogger *logr.Logger) {
// Note: ReplicaSet pods are not restarted automatically by Kubernetes when their spec is changed (for other resource
// types like deployments or daemonsets this is managed by Kubernetes automatically). For now, we rely on the user
// to manually restart the pods of their replica sets after they have been instrumented. We could consider finding
// all pods that are owned by the replica set and restarting them automatically.
r.revertWorkloadInstrumentation(ctx, &replicaSetWorkload{
replicaSet: &replicaSet,
}, reconcileLogger)

r.restartPodsOfReplicaSet(ctx, replicaSet, reconcileLogger)
}

func (r *Dash0Reconciler) findAndUninstrumentStatefulSets(
@@ -1079,3 +1080,58 @@ func (s SortByCreationTimestamp) Less(i, j int) bool {
func (s SortByCreationTimestamp) Less(i, j int) bool {
return s[i].CreationTimestamp.Before(&s[j].CreationTimestamp)
}

func (r *Dash0Reconciler) restartPodsOfReplicaSet(
ctx context.Context,
replicaSet appsv1.ReplicaSet,
logger *logr.Logger,
) {
// Note: ReplicaSet pods are not restarted automatically by Kubernetes when their spec is changed (for other
// resource types like deployments or daemonsets this is managed by Kubernetes automatically). Therefore, we
// find all pods owned by the replica set and explicitly delete them to trigger a restart.
allPodsInNamespace, err :=
r.ClientSet.
CoreV1().
Pods(replicaSet.Namespace).
List(ctx, metav1.ListOptions{
TimeoutSeconds: &timeoutForListingPods,
})
if err != nil {
logger.Error(
err,
fmt.Sprintf(
"Failed to list all pods in the namespaces for the purpose of restarting the pods owned by the "+
"replica set %s/%s (%s), pods will not be restarted automatically.",
replicaSet.Namespace,
replicaSet.Name,
replicaSet.UID,
))
return
}

podsOfReplicaSet := slices.DeleteFunc(allPodsInNamespace.Items, func(pod corev1.Pod) bool {
ownerReferences := pod.GetOwnerReferences()
for _, ownerReference := range ownerReferences {
if ownerReference.Kind == "ReplicaSet" &&
ownerReference.Name == replicaSet.Name &&
ownerReference.UID == replicaSet.UID {
return false
}
}
return true
})

for _, pod := range podsOfReplicaSet {
err := r.Client.Delete(ctx, &pod)
if err != nil {
logger.Info(
fmt.Sprintf(
"Failed to restart pod owned by the replica "+
"set %s/%s (%s), this pod will not be restarted automatically.",
replicaSet.Namespace,
replicaSet.Name,
replicaSet.UID,
))
}
}
}
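To make the filtering in restartPodsOfReplicaSet easier to follow, here is a minimal, self-contained sketch of the owner-reference check applied to each pod; the isOwnedBy helper and the object names are illustrative and not part of the operator code.

```go
package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
)

// isOwnedBy reports whether the pod lists the given replica set (matched by kind,
// name and UID) among its owner references.
func isOwnedBy(pod corev1.Pod, replicaSet appsv1.ReplicaSet) bool {
	for _, ownerReference := range pod.GetOwnerReferences() {
		if ownerReference.Kind == "ReplicaSet" &&
			ownerReference.Name == replicaSet.Name &&
			ownerReference.UID == replicaSet.UID {
			return true
		}
	}
	return false
}

func main() {
	replicaSet := appsv1.ReplicaSet{
		ObjectMeta: metav1.ObjectMeta{Name: "example-replicaset", UID: types.UID("rs-uid-1")},
	}
	pod := corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: "example-replicaset-abcde",
			OwnerReferences: []metav1.OwnerReference{
				{Kind: "ReplicaSet", Name: "example-replicaset", UID: types.UID("rs-uid-1")},
			},
		},
	}
	// Prints "true": the pod is owned by the replica set and would be restarted.
	fmt.Println(isOwnedBy(pod, replicaSet))
}
```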
29 changes: 0 additions & 29 deletions test/e2e/e2e_helpers.go
@@ -668,7 +668,6 @@ func VerifyThatWorkloadHasBeenInstrumented(
workloadType string,
port int,
isBatch bool,
restartPodsManually bool,
instrumentationBy string,
) string {
By(fmt.Sprintf("%s: waiting for the workload to get instrumented (polling its labels and events to check)",
@@ -678,10 +677,6 @@
VerifySuccessfulInstrumentationEvent(g, namespace, workloadType, instrumentationBy)
}, verifyTelemetryTimeout, verifyTelemetryPollingInterval).Should(Succeed())

if restartPodsManually {
restartAllPods(namespace)
}

By(fmt.Sprintf("%s: waiting for spans to be captured", workloadType))
var testId string
if isBatch {
@@ -718,7 +713,6 @@ func VerifyThatInstrumentationHasBeenReverted(
workloadType string,
port int,
isBatch bool,
restartPodsManually bool,
testId string,
instrumentationBy string,
) {
@@ -730,10 +724,6 @@
VerifySuccessfulUninstrumentationEvent(g, namespace, workloadType, instrumentationBy)
}, verifyTelemetryTimeout, verifyTelemetryPollingInterval).Should(Succeed())

if restartPodsManually {
restartAllPods(namespace)
}

// Add some buffer time between the workloads being restarted and verifying that no spans are produced/captured.
time.Sleep(10 * time.Second)

@@ -899,25 +889,6 @@ func verifyEvent(
)))
}

func restartAllPods(namespace string) {
// The pods of replicasets are not restarted automatically when the template changes (in contrast to
// deployments, daemonsets etc.). For now we expect the user to restart the pods of the replicaset manually,
// and we simulate this in the e2e tests.
By("restarting pods manually")
Expect(
RunAndIgnoreOutput(
exec.Command(
"kubectl",
"delete",
"pod",
"--namespace",
namespace,
"--selector",
"app=dash0-operator-nodejs-20-express-test-replicaset-app",
))).To(Succeed())

}

func verifySpans(g Gomega, isBatch bool, workloadType string, port int, httpPathWithQuery string) {
spansFound := sendRequestAndFindMatchingSpans(g, isBatch, workloadType, port, httpPathWithQuery, true, nil)
g.Expect(spansFound).To(BeTrue(),
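The restartAllPods helper removed above shelled out to kubectl to delete the test replica set's pods by label selector; with this commit the controller restarts them itself. For reference only, a hedged sketch of an equivalent programmatic call using client-go's DeleteCollection; the namespace and label selector are illustrative.

```go
package main

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Assumes a local kubeconfig, as is typical in e2e test environments.
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	clientSet, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}

	// Delete all pods matching the label selector in one call; the replica set
	// controller recreates them, which amounts to a restart.
	err = clientSet.CoreV1().Pods("test-namespace").DeleteCollection(
		context.Background(),
		metav1.DeleteOptions{},
		metav1.ListOptions{
			LabelSelector: "app=dash0-operator-nodejs-20-express-test-replicaset-app",
		},
	)
	if err != nil {
		panic(err)
	}
}
```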
21 changes: 7 additions & 14 deletions test/e2e/e2e_test.go
@@ -88,11 +88,10 @@ var _ = Describe("Dash0 Kubernetes Operator", Ordered, func() {
})

type controllerTestWorkloadConfig struct {
workloadType string
port int
installWorkload func(string) error
isBatch bool
restartPodsManually bool
workloadType string
port int
installWorkload func(string) error
isBatch bool
}

workloadConfigs := []controllerTestWorkloadConfig{
@@ -110,10 +109,9 @@
port: 1207,
installWorkload: InstallNodeJsDeployment,
}, {
workloadType: "replicaset",
port: 1209,
installWorkload: InstallNodeJsReplicaSet,
restartPodsManually: true,
workloadType: "replicaset",
port: 1209,
installWorkload: InstallNodeJsReplicaSet,
}, {
workloadType: "statefulset",
port: 1210,
@@ -141,7 +139,6 @@ var _ = Describe("Dash0 Kubernetes Operator", Ordered, func() {
config.workloadType,
config.port,
config.isBatch,
config.restartPodsManually,
"controller",
)
})
@@ -154,7 +151,6 @@
config.workloadType,
config.port,
config.isBatch,
config.restartPodsManually,
testIds[config.workloadType],
"controller",
)
@@ -240,7 +236,6 @@ var _ = Describe("Dash0 Kubernetes Operator", Ordered, func() {
config.workloadType,
config.port,
config.isBatch,
false,
"webhook",
)

@@ -382,7 +377,6 @@ var _ = Describe("Dash0 Kubernetes Operator", Ordered, func() {
config.workloadType,
config.port,
false,
false,
"controller",
)
})
@@ -395,7 +389,6 @@
config.workloadType,
config.port,
false,
false,
testIds[config.workloadType],
"controller",
)
