From 3b2875120f9874b302ae7f65e46c1d957afbd105 Mon Sep 17 00:00:00 2001 From: David Finkel Date: Thu, 20 Aug 2020 11:20:36 -0400 Subject: [PATCH 1/4] Fix version update upon resync By assigning to `version` after the switch statement, any new version set within `resync` gets clobbered by the one that wa returned by watch, which is by definition stale if we decided to resync. --- k8s_pod_watcher.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s_pod_watcher.go b/k8s_pod_watcher.go index b8bac9c..336d606 100644 --- a/k8s_pod_watcher.go +++ b/k8s_pod_watcher.go @@ -302,6 +302,7 @@ func (p *PodWatcher) Run(ctx context.Context) error { rv, err := p.watch(ctx, podWatch, version, cbChans) switch err { case ErrResultsClosed: + version = rv case errVersionGone: switch resync() { case resyncReturn: @@ -313,7 +314,6 @@ func (p *PodWatcher) Run(ctx context.Context) error { default: return err } - version = rv // If it's been a while, reset the backoff so we don't wait too // long after things have been humming for an hour. From efa88856a4c331a05325c25726a83a6c59b9119e Mon Sep 17 00:00:00 2001 From: David Finkel Date: Thu, 20 Aug 2020 11:22:01 -0400 Subject: [PATCH 2/4] Use the status code 410 instead of the reason Reproducing the infinite loop with minikube, (k8s version v1.18.3) the status I get back after restarting minikube was different than in GKE with k8s version 1.16. (likely to differences in the reproduction setup) The Reason field was "Expired" rather than "Gone", but the Code field was still 410 (`net/http.StatusGone`). Switch to checking that code instead. --- k8s_pod_watcher.go | 15 +++++++++++---- k8s_pod_watcher_test.go | 3 ++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/k8s_pod_watcher.go b/k8s_pod_watcher.go index 336d606..22603ed 100644 --- a/k8s_pod_watcher.go +++ b/k8s_pod_watcher.go @@ -19,6 +19,7 @@ import ( "errors" "fmt" "net" + "net/http" "sync" "time" @@ -287,7 +288,13 @@ func (p *PodWatcher) Run(ctx context.Context) error { watchOpt.ResourceVersion = version podWatch, watchStartErr := p.cs.CoreV1().Pods(p.k8sNamespace).Watch(watchOpt) if watchStartErr != nil { - if !k8serrors.IsGone(watchStartErr) { + switch t := watchStartErr.(type) { + case k8serrors.APIStatus: + if t.Status().Code != http.StatusGone { + return fmt.Errorf("failed to startup watcher with status code %d: %w", + t.Status().Code, watchStartErr) + } + default: return fmt.Errorf("failed to startup watcher: %w", watchStartErr) } switch resync() { @@ -342,7 +349,7 @@ func errorEventIsGone(ev watch.Event) bool { if !isStatus { return false } - return errObj.Reason == k8smeta.StatusReasonGone + return errObj.Code == http.StatusGone } func (p *PodWatcher) watch(ctx context.Context, podWatch watch.Interface, rv string, cbChans []chan<- PodEvent) (string, error) { @@ -364,8 +371,8 @@ func (p *PodWatcher) watch(ctx context.Context, podWatch watch.Interface, rv str if !ok { if ev.Type == watch.Error { p.logf("received error event: %s", ev) - // if the error has reason "Gone", - // return and let the outr loop + // if the error has status code "Gone" (410), + // return and let the outer loop // reconnect. if errorEventIsGone(ev) { podWatch.Stop() diff --git a/k8s_pod_watcher_test.go b/k8s_pod_watcher_test.go index fe99416..e5d2cf7 100644 --- a/k8s_pod_watcher_test.go +++ b/k8s_pod_watcher_test.go @@ -17,6 +17,7 @@ package k8swatcher import ( "context" "net" + "net/http" "reflect" "sync" "testing" @@ -614,7 +615,7 @@ func (goneErr) Error() string { } func (goneErr) Status() k8smeta.Status { - return k8smeta.Status{Reason: k8smeta.StatusReasonGone} + return k8smeta.Status{Reason: k8smeta.StatusReasonGone, Code: http.StatusGone} } type dummyPods struct { From d6153e5fd96f3ed6718a0c85e079015d9f7d0f42 Mon Sep 17 00:00:00 2001 From: David Finkel Date: Thu, 20 Aug 2020 11:31:29 -0400 Subject: [PATCH 3/4] Add a couple more log-lines around resyncing --- k8s_pod_watcher.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/k8s_pod_watcher.go b/k8s_pod_watcher.go index 22603ed..84b4f6b 100644 --- a/k8s_pod_watcher.go +++ b/k8s_pod_watcher.go @@ -273,11 +273,13 @@ func (p *PodWatcher) Run(ctx context.Context) error { resync := func() resyncAction { newversion, resyncErr := p.resync(ctx, cbChans) if resyncErr != nil { + p.logf("resync failed: %s", resyncErr) if sleepBackoff() { return resyncReturn } return resyncContinueLoop } + p.logf("resync succeeded; new version: %q (old %q)", newversion, version) version = newversion return resyncSuccess } From 027346f4de77cd77734cb89be23403aa922b4de5 Mon Sep 17 00:00:00 2001 From: David Finkel Date: Thu, 20 Aug 2020 12:05:08 -0400 Subject: [PATCH 4/4] github actions: run tests with go1.15 as well --- .github/workflows/go.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 3cc65fb..0e75957 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: os: [macOS-latest, ubuntu-latest] - goversion: [1.13, 1.14] + goversion: [1.13, 1.14, 1.15] steps: - name: Set up Go ${{matrix.goversion}} on ${{matrix.os}}