fix(webhook): endless secret update retries
Introduce leader election so that only one node at a time
initializes its webhook servers.

longhorn/longhorn-10054

Signed-off-by: Chin-Ya Huang <[email protected]>
(cherry picked from commit 7114b69)
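
For readers unfamiliar with client-go leader election, the following is a minimal sketch of the run-once-under-lease pattern this commit applies: each node acquires a Lease, performs its one-time initialization, then cancels the election context so `ReleaseOnCancel` hands the lock to the next node. The package name, the `runOnceWithLease` helper, the `example-init-lock` lease name, and the `initialize` callback are illustrative placeholders, not the commit's actual identifiers; see `startWebhooksByLeaderElection` in the diff below for the real code.

```go
package leaderinit

import (
	"context"
	"time"

	"github.com/sirupsen/logrus"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/leaderelection"
	"k8s.io/client-go/tools/leaderelection/resourcelock"
)

// runOnceWithLease serializes initialize() across nodes: only the current
// lease holder runs it, and the lease is released as soon as it finishes.
func runOnceWithLease(ctx context.Context, namespace, identity string, initialize func(context.Context) error) error {
	config, err := rest.InClusterConfig()
	if err != nil {
		return err
	}
	kubeClient, err := clientset.NewForConfig(config)
	if err != nil {
		return err
	}

	lock := &resourcelock.LeaseLock{
		// Hypothetical lease name; Longhorn uses LeaseLockNameWebhook.
		LeaseMeta:  metav1.ObjectMeta{Name: "example-init-lock", Namespace: namespace},
		Client:     kubeClient.CoordinationV1(),
		LockConfig: resourcelock.ResourceLockConfig{Identity: identity},
	}

	leaderCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	leaderelection.RunOrDie(leaderCtx, leaderelection.LeaderElectionConfig{
		Lock:            lock,
		ReleaseOnCancel: true, // give up the lease as soon as our one-time work is done
		LeaseDuration:   20 * time.Second,
		RenewDeadline:   10 * time.Second,
		RetryPeriod:     2 * time.Second,
		Callbacks: leaderelection.LeaderCallbacks{
			OnStartedLeading: func(context.Context) {
				// Run the serialized work with the parent ctx so it outlives the
				// lease, then cancel leaderCtx to release the lock for the next node.
				if err := initialize(ctx); err != nil {
					logrus.WithError(err).Fatal("initialization failed")
				}
				cancel()
			},
			OnStoppedLeading: func() { logrus.Infof("lost lease: %s", identity) },
			OnNewLeader:      func(id string) { logrus.Infof("current leader: %s", id) },
		},
	})
	return nil
}
```

Each node would call something like `runOnceWithLease(ctx, podNamespace, nodeID, startWebhooks)`; nodes that lose the election simply wait until the leader finishes and releases the lease, then take their own turn, so initialization happens once per node but never concurrently.
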
c3y1huang authored and mergify[bot] committed Dec 25, 2024
1 parent 32592f6 commit 134e682
Showing 1 changed file with 127 additions and 39 deletions.
166 changes: 127 additions & 39 deletions app/daemon.go
@@ -6,14 +6,20 @@ import (
"net/http"
_ "net/http/pprof" // for runtime profiling
"os"
"time"

"github.com/gorilla/handlers"
"github.com/pkg/errors"
"github.com/rancher/wrangler/v3/pkg/signals"
"github.com/sirupsen/logrus"
"github.com/urfave/cli"

"k8s.io/client-go/rest"
"k8s.io/client-go/tools/leaderelection"
"k8s.io/client-go/tools/leaderelection/resourcelock"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clientset "k8s.io/client-go/kubernetes"

"github.com/longhorn/go-iscsi-helper/iscsi"

@@ -49,6 +55,10 @@ const (
FlagUpgradeVersionCheck = "upgrade-version-check"
)

const (
LeaseLockNameWebhook = "longhorn-manager-webhook-lock"
)

func DaemonCmd() cli.Command {
return cli.Command{
Name: "daemon",
@@ -98,6 +108,122 @@ func DaemonCmd() cli.Command {
}
}

// startWebhooksByLeaderElection starts the webhooks by using Kubernetes leader
// election. It ensures that only one webhook server is initializing at a time in
// the cluster.
//
// https://github.com/longhorn/longhorn/issues/10054
//
// Parameters:
// - ctx: The context used to manage the webhook servers' lifecycle.
// - kubeconfigPath: The path to the kubeconfig file.
// - currentNodeID: The ID of the current node attempting to acquire the leadership.
func startWebhooksByLeaderElection(ctx context.Context, kubeconfigPath, currentNodeID string) error {
config, err := rest.InClusterConfig()
if err != nil {
return errors.Wrap(err, "failed to get client config")
}

kubeClient, err := clientset.NewForConfig(config)
if err != nil {
return errors.Wrap(err, "failed to get k8s client")
}

podName, err := util.GetRequiredEnv(types.EnvPodName)
if err != nil {
return fmt.Errorf("failed to detect the manager pod name")
}

podNamespace, err := util.GetRequiredEnv(types.EnvPodNamespace)
if err != nil {
return fmt.Errorf("failed to detect the manager pod namespace")
}

lock := &resourcelock.LeaseLock{
LeaseMeta: metav1.ObjectMeta{
Name: LeaseLockNameWebhook,
Namespace: podNamespace,
},
Client: kubeClient.CoordinationV1(),
LockConfig: resourcelock.ResourceLockConfig{
Identity: currentNodeID,
},
}

fnStartWebhook := func(ctx context.Context) error {
// Conversion webhook needs to be started first since we use its port 9501 as readiness port.
// longhorn-manager pod becomes ready only when conversion webhook is running.
// The services in the longhorn-manager can then start to receive the requests.
// Conversion webhook does not use datastore, since it is a prerequisite for
// datastore operation.
clientsWithoutDatastore, err := client.NewClients(kubeconfigPath, false, ctx.Done())
if err != nil {
return err
}
if err := webhook.StartWebhook(ctx, types.WebhookTypeConversion, clientsWithoutDatastore); err != nil {
return err
}

// This adds the label for the conversion webhook's selector. We do it the hard way without datastore to avoid chicken-and-egg.
pod, err := clientsWithoutDatastore.Clients.K8s.CoreV1().Pods(podNamespace).Get(context.Background(), podName, metav1.GetOptions{})
if err != nil {
return err
}
labels := types.GetConversionWebhookLabel()
for key, value := range labels {
pod.Labels[key] = value
}
_, err = clientsWithoutDatastore.Clients.K8s.CoreV1().Pods(podNamespace).Update(context.Background(), pod, metav1.UpdateOptions{})
if err != nil {
return err
}
if err := webhook.CheckWebhookServiceAvailability(types.WebhookTypeConversion); err != nil {
return err
}

clients, err := client.NewClients(kubeconfigPath, true, ctx.Done())
if err != nil {
return err
}

if err := webhook.StartWebhook(ctx, types.WebhookTypeAdmission, clients); err != nil {
return err
}
return nil
}

leaderCtx, cancel := context.WithCancel(ctx)
defer cancel()

leaderelection.RunOrDie(leaderCtx, leaderelection.LeaderElectionConfig{
Lock: lock,
ReleaseOnCancel: true,
LeaseDuration: 20 * time.Second,
RenewDeadline: 10 * time.Second,
RetryPeriod: 2 * time.Second,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: func(leaderCtx context.Context) {
if err := fnStartWebhook(ctx); err != nil {
logrus.Fatalf("Error starting webhooks: %v", err)
}

cancel()
},
OnStoppedLeading: func() {
logrus.Infof("Webhook leader lost: %s", currentNodeID)
},
OnNewLeader: func(identity string) {
if identity == currentNodeID {
return
}
logrus.Infof("Webhook leader elected: %s", identity)
},
},
})

return nil
}

func startManager(c *cli.Context) error {
var (
err error
@@ -147,58 +273,20 @@ func startManager(c *cli.Context) error {
return fmt.Errorf("failed to detect the node IP")
}

podName, err := util.GetRequiredEnv(types.EnvPodName)
if err != nil {
return fmt.Errorf("failed to detect the manager pod name")
}

podNamespace, err := util.GetRequiredEnv(types.EnvPodNamespace)
if err != nil {
return fmt.Errorf("failed to detect the manager pod namespace")
}

ctx := signals.SetupSignalContext()

logger := logrus.StandardLogger().WithField("node", currentNodeID)

// Conversion webhook needs to be started first since we use its port 9501 as readiness port.
// longhorn-manager pod becomes ready only when conversion webhook is running.
// The services in the longhorn-manager can then start to receive the requests.
// Conversion webhook does not use datastore, since it is a prerequisite for
// datastore operation.
clientsWithoutDatastore, err := client.NewClients(kubeconfigPath, false, ctx.Done())
err = startWebhooksByLeaderElection(ctx, kubeconfigPath, currentNodeID)
if err != nil {
return err
}
if err := webhook.StartWebhook(ctx, types.WebhookTypeConversion, clientsWithoutDatastore); err != nil {
return err
}

// This adds the label for the conversion webhook's selector. We do it the hard way without datastore to avoid chicken-and-egg.
pod, err := clientsWithoutDatastore.Clients.K8s.CoreV1().Pods(podNamespace).Get(context.Background(), podName, metav1.GetOptions{})
if err != nil {
return err
}
labels := types.GetConversionWebhookLabel()
for key, value := range labels {
pod.Labels[key] = value
}
_, err = clientsWithoutDatastore.Clients.K8s.CoreV1().Pods(podNamespace).Update(context.Background(), pod, metav1.UpdateOptions{})
if err != nil {
return err
}
if err := webhook.CheckWebhookServiceAvailability(types.WebhookTypeConversion); err != nil {
return err
}

clients, err := client.NewClients(kubeconfigPath, true, ctx.Done())
if err != nil {
return err
}

if err := webhook.StartWebhook(ctx, types.WebhookTypeAdmission, clients); err != nil {
return err
}
if err := clients.Datastore.AddLabelToManagerPod(currentNodeID, types.GetAdmissionWebhookLabel()); err != nil {
return err
}
