From 1caca2c56df18013682adfd2f89497c569cb8532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=A5=96=E5=BB=BA?= Date: Wed, 18 Dec 2024 17:52:11 +0800 Subject: [PATCH] fix gateway node check for centralized ecmp subnets (#4847) Signed-off-by: zhangzujian --- .gitignore | 1 + Makefile | 4 +- .../kube-ovn/templates/controller-deploy.yaml | 1 + cmd/cmdmain.go | 5 - cmd/controller/cmdmain.go | 92 +++++++++++++++++++ cmd/controller/controller.go | 2 +- cmd/pinger/pinger.go | 4 +- dist/images/Dockerfile | 8 +- dist/images/install.sh | 1 + pkg/controller/controller.go | 2 +- pkg/controller/node.go | 40 ++++---- 11 files changed, 126 insertions(+), 34 deletions(-) create mode 100644 cmd/controller/cmdmain.go diff --git a/.gitignore b/.gitignore index 6c02a74d038..94b7d0e7582 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ dist/images/test-server dist/images/kube-ovn dist/images/kube-ovn-cmd dist/images/kube-ovn-daemon +dist/images/kube-ovn-controller dist/images/kube-ovn-pinger dist/images/kube-ovn-webhook dist/windows/kube-ovn.exe diff --git a/Makefile b/Makefile index f0fb8e4e734..956336747d9 100644 --- a/Makefile +++ b/Makefile @@ -117,7 +117,7 @@ build-go: CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -o $(CURDIR)/dist/images/kube-ovn -v ./cmd/cni CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-cmd -v ./cmd CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-daemon -v ./cmd/daemon - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-pinger -v ./cmd/pinger + CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-controller -v ./cmd/controller CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -o $(CURDIR)/dist/images/test-server -v ./test/server .PHONY: build-go-windows @@ -131,7 +131,7 @@ build-go-arm: CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build $(GO_BUILD_FLAGS) -o $(CURDIR)/dist/images/kube-ovn -v ./cmd/cni CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-cmd -v ./cmd CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-daemon -v ./cmd/daemon - CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-pinger -v ./cmd/pinger + CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-controller -v ./cmd/controller .PHONY: build-kube-ovn build-kube-ovn: build-debug build-go diff --git a/charts/kube-ovn/templates/controller-deploy.yaml b/charts/kube-ovn/templates/controller-deploy.yaml index d18eb54c281..1a7bcc311f8 100644 --- a/charts/kube-ovn/templates/controller-deploy.yaml +++ b/charts/kube-ovn/templates/controller-deploy.yaml @@ -147,6 +147,7 @@ spec: capabilities: add: - NET_BIND_SERVICE + - NET_RAW env: - name: ENABLE_SSL value: "{{ .Values.networking.ENABLE_SSL }}" diff --git a/cmd/cmdmain.go b/cmd/cmdmain.go index a4166f2fb76..064e3057ab2 100644 --- a/cmd/cmdmain.go +++ b/cmd/cmdmain.go @@ -11,7 +11,6 @@ import ( "k8s.io/klog/v2" - "github.com/kubeovn/kube-ovn/cmd/controller" "github.com/kubeovn/kube-ovn/cmd/health_check" "github.com/kubeovn/kube-ovn/cmd/ovn_ic_controller" "github.com/kubeovn/kube-ovn/cmd/ovn_leader_checker" @@ -22,7 +21,6 @@ import ( ) const ( - CmdController = "kube-ovn-controller" CmdMonitor = "kube-ovn-monitor" CmdSpeaker = "kube-ovn-speaker" CmdWebhook = "kube-ovn-webhook" @@ -91,9 +89,6 @@ func dumpProfile() { func main() { cmd := filepath.Base(os.Args[0]) switch cmd { - case CmdController: - dumpProfile() - controller.CmdMain() case CmdMonitor: dumpProfile() ovn_monitor.CmdMain() diff --git a/cmd/controller/cmdmain.go b/cmd/controller/cmdmain.go new file mode 100644 index 00000000000..476264fb46c --- /dev/null +++ b/cmd/controller/cmdmain.go @@ -0,0 +1,92 @@ +package main + +import ( + "fmt" + "os" + "os/signal" + "path/filepath" + "runtime/pprof" + "syscall" + "time" + + "k8s.io/klog/v2" + + "github.com/kubeovn/kube-ovn/cmd/pinger" + "github.com/kubeovn/kube-ovn/pkg/util" +) + +const ( + CmdController = "kube-ovn-controller" + CmdPinger = "kube-ovn-pinger" +) + +const timeFormat = "2006-01-02_15:04:05" + +func dumpProfile() { + ch1 := make(chan os.Signal, 1) + ch2 := make(chan os.Signal, 1) + signal.Notify(ch1, syscall.SIGUSR1) + signal.Notify(ch2, syscall.SIGUSR2) + go func() { + for { + <-ch1 + name := fmt.Sprintf("cpu-profile-%s.pprof", time.Now().Format(timeFormat)) + path := filepath.Join(os.TempDir(), name) + f, err := os.Create(path) // #nosec G303,G304 + if err != nil { + klog.Errorf("failed to create cpu profile file: %v", err) + return + } + if err = pprof.StartCPUProfile(f); err != nil { + klog.Errorf("failed to start cpu profile: %v", err) + if err = f.Close(); err != nil { + klog.Errorf("failed to close file %q: %v", path, err) + } + return + } + time.Sleep(30 * time.Second) + pprof.StopCPUProfile() + if err = f.Close(); err != nil { + klog.Errorf("failed to close file %q: %v", path, err) + return + } + } + }() + go func() { + for { + <-ch2 + name := fmt.Sprintf("mem-profile-%s.pprof", time.Now().Format(timeFormat)) + path := filepath.Join(os.TempDir(), name) + f, err := os.Create(path) // #nosec G303,G304 + if err != nil { + klog.Errorf("failed to create memory profile file: %v", err) + return + } + if err = pprof.WriteHeapProfile(f); err != nil { + klog.Errorf("failed to write memory profile file: %v", err) + if err = f.Close(); err != nil { + klog.Errorf("failed to close file %q: %v", path, err) + } + return + } + if err = f.Close(); err != nil { + klog.Errorf("failed to close file %q: %v", path, err) + return + } + } + }() +} + +func main() { + cmd := filepath.Base(os.Args[0]) + switch cmd { + case CmdController: + dumpProfile() + CmdMain() + case CmdPinger: + dumpProfile() + pinger.CmdMain() + default: + util.LogFatalAndExit(nil, "%s is an unknown command", cmd) + } +} diff --git a/cmd/controller/controller.go b/cmd/controller/controller.go index d8b4d49b855..8b0a3ad3414 100644 --- a/cmd/controller/controller.go +++ b/cmd/controller/controller.go @@ -1,4 +1,4 @@ -package controller +package main import ( "context" diff --git a/cmd/pinger/pinger.go b/cmd/pinger/pinger.go index 1bb77d2a2a2..046a0bebcc3 100644 --- a/cmd/pinger/pinger.go +++ b/cmd/pinger/pinger.go @@ -1,4 +1,4 @@ -package main +package pinger import ( _ "net/http/pprof" // #nosec @@ -14,7 +14,7 @@ import ( "github.com/kubeovn/kube-ovn/versions" ) -func main() { +func CmdMain() { defer klog.Flush() klog.Info(versions.String()) diff --git a/dist/images/Dockerfile b/dist/images/Dockerfile index 5e39f82d8a3..30a89c6b8a4 100644 --- a/dist/images/Dockerfile +++ b/dist/images/Dockerfile @@ -10,16 +10,16 @@ COPY 01-kube-ovn.conflist /kube-ovn/01-kube-ovn.conflist COPY kube-ovn /kube-ovn/kube-ovn COPY kube-ovn-cmd /kube-ovn/kube-ovn-cmd COPY kube-ovn-daemon /kube-ovn/kube-ovn-daemon -COPY kube-ovn-pinger /kube-ovn/kube-ovn-pinger -RUN ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-controller && \ - ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-monitor && \ +COPY kube-ovn-controller /kube-ovn/kube-ovn-controller +RUN ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-monitor && \ ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-speaker && \ ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-webhook && \ ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-healthcheck && \ ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-leader-checker && \ ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-ic-controller && \ + ln -s /kube-ovn/kube-ovn-controller /kube-ovn/kube-ovn-pinger && \ setcap CAP_NET_BIND_SERVICE+eip /kube-ovn/kube-ovn-cmd && \ - setcap CAP_NET_RAW,CAP_NET_BIND_SERVICE+eip /kube-ovn/kube-ovn-pinger && \ + setcap CAP_NET_RAW,CAP_NET_BIND_SERVICE+eip /kube-ovn/kube-ovn-controller && \ setcap CAP_NET_ADMIN,CAP_NET_RAW,CAP_NET_BIND_SERVICE,CAP_SYS_ADMIN+eip /kube-ovn/kube-ovn-daemon FROM kubeovn/kube-ovn-base:$BASE_TAG diff --git a/dist/images/install.sh b/dist/images/install.sh index 527cf1d9954..fc5e2efed02 100755 --- a/dist/images/install.sh +++ b/dist/images/install.sh @@ -4733,6 +4733,7 @@ spec: capabilities: add: - NET_BIND_SERVICE + - NET_RAW env: - name: ENABLE_SSL value: "$ENABLE_SSL" diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index c1ebe4b5fc3..d77e3451c07 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -1263,7 +1263,7 @@ func (c *Controller) startWorkers(ctx context.Context) { go wait.Until(c.resyncProviderNetworkStatus, 30*time.Second, ctx.Done()) go wait.Until(c.exportSubnetMetrics, 30*time.Second, ctx.Done()) - go wait.Until(c.CheckGatewayReady, 5*time.Second, ctx.Done()) + go wait.Until(c.checkSubnetGateway, 5*time.Second, ctx.Done()) go wait.Until(runWorker("add ovn eip", c.addOvnEipQueue, c.handleAddOvnEip), time.Second, ctx.Done()) go wait.Until(runWorker("update ovn eip", c.updateOvnEipQueue, c.handleUpdateOvnEip), time.Second, ctx.Done()) diff --git a/pkg/controller/node.go b/pkg/controller/node.go index 8f5bbb94f69..bc23ab3d8e4 100644 --- a/pkg/controller/node.go +++ b/pkg/controller/node.go @@ -524,27 +524,27 @@ func (c *Controller) handleUpdateNode(key string) error { return nil } -func (c *Controller) CheckGatewayReady() { - if err := c.checkGatewayReady(); err != nil { - klog.Errorf("failed to check gateway ready %v", err) +func (c *Controller) checkSubnetGateway() { + if err := c.checkSubnetGatewayNode(); err != nil { + klog.Errorf("failed to check subnet gateway node: %v", err) } } -func (c *Controller) checkGatewayReady() error { - klog.V(3).Infoln("start to check gateway status") +func (c *Controller) checkSubnetGatewayNode() error { + klog.V(3).Infoln("start to check subnet gateway node") subnetList, err := c.subnetsLister.List(labels.Everything()) if err != nil { - klog.Errorf("failed to list subnets %v", err) + klog.Errorf("failed to list subnets: %v", err) return err } nodes, err := c.nodesLister.List(labels.Everything()) if err != nil { - klog.Errorf("failed to list nodes, %v", err) + klog.Errorf("failed to list nodes: %v", err) return err } for _, subnet := range subnetList { - if (subnet.Spec.Vlan != "" && !subnet.Spec.LogicalGateway) || + if (subnet.Spec.Vlan != "" && (subnet.Spec.U2OInterconnection || !subnet.Spec.LogicalGateway)) || subnet.Spec.GatewayNode == "" || subnet.Spec.GatewayType != kubeovnv1.GWCentralizedType || !subnet.Spec.EnableEcmp { @@ -582,10 +582,9 @@ func (c *Controller) checkGatewayReady() error { pinger.Timeout = time.Duration(count) * time.Second pinger.Interval = 1 * time.Second - success := false - + var pingSucceeded bool pinger.OnRecv = func(_ *goping.Packet) { - success = true + pingSucceeded = true pinger.Stop() } if err = pinger.Run(); err != nil { @@ -593,13 +592,16 @@ func (c *Controller) checkGatewayReady() error { return err } - if !nodeReady(node) { - success = false - } - - if !success { + nodeIsReady := nodeReady(node) + if !pingSucceeded || !nodeIsReady { if exist { - klog.Warningf("failed to ping ovn0 %s or node %s is not ready, delete ecmp policy route for node", ip, node.Name) + if !pingSucceeded { + klog.Warningf("failed to ping ovn0 ip %s on node %s", ip, node.Name) + } + if !nodeIsReady { + klog.Warningf("node %s is not ready", node.Name) + } + klog.Warningf("delete ecmp policy route for node %s ip %s", node.Name, ip) nextHops.Remove(ip) delete(nameIPMap, node.Name) klog.Infof("update policy route for centralized subnet %s, nextHops %s", subnet.Name, nextHops) @@ -609,7 +611,7 @@ func (c *Controller) checkGatewayReady() error { } } } else { - klog.V(3).Infof("succeed to ping gw %s", ip) + klog.V(3).Infof("succeeded to ping ovn0 ip %s on node %s", ip, node.Name) if !exist { nextHops.Add(ip) if nameIPMap == nil { @@ -624,7 +626,7 @@ func (c *Controller) checkGatewayReady() error { } } } else if exist { - klog.Infof("subnet %s gatewayNode does not contains node %v, delete policy route for node ip %s", subnet.Name, node.Name, ip) + klog.Infof("subnet %s gateway nodes does not contain node %s, delete policy route for node ip %s", subnet.Name, node.Name, ip) nextHops.Remove(ip) delete(nameIPMap, node.Name) klog.Infof("update policy route for centralized subnet %s, nextHops %s", subnet.Name, nextHops)