From 88bc96a26e04a61c9909f6119d643c5ae6e7ceb2 Mon Sep 17 00:00:00 2001 From: zghh <1069308575@qq.com> Date: Tue, 25 Jun 2024 21:46:23 +0800 Subject: [PATCH] fix the problem of configuring multiple gRPC endpoints where the first endpoint times out and cannot switch to the second endpoint --- WORKSPACE | 10 +++++ .../org_golang_google_grpc_clientconn.patch | 41 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 third_party/org_golang_google_grpc_clientconn.patch diff --git a/WORKSPACE b/WORKSPACE index 46e8e7c3c017..30f2b3d09c5a 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -159,6 +159,16 @@ git_repository( # gazelle args: -go_prefix github.com/gogo/protobuf -proto legacy ) +git_repository( + name = "org_golang_google_grpc", + commit = "1055b481ed2204a29d233286b9b50c42b63f8825", + patch_args = ["-p1"], + patches = [ + "//third_party:org_golang_google_grpc_clientconn.patch", + ], + remote = "https://github.com/grpc/grpc-go", +) + load("@rules_oci//oci:pull.bzl", "oci_pull") # A multi-arch base image diff --git a/third_party/org_golang_google_grpc_clientconn.patch b/third_party/org_golang_google_grpc_clientconn.patch new file mode 100644 index 000000000000..f1505c998c73 --- /dev/null +++ b/third_party/org_golang_google_grpc_clientconn.patch @@ -0,0 +1,41 @@ +diff --git a/clientconn.go b/clientconn.go +index 95a7459b..a7acdabc 100644 +--- a/clientconn.go ++++ b/clientconn.go +@@ -1323,18 +1323,11 @@ func (ac *addrConn) resetTransport() { + // Give dial more time as we keep failing to connect. + dialDuration = backoffFor + } +- // We can potentially spend all the time trying the first address, and +- // if the server accepts the connection and then hangs, the following +- // addresses will never be tried. +- // +- // The spec doesn't mention what should be done for multiple addresses. +- // https://github.com/grpc/grpc/blob/master/doc/connection-backoff.md#proposed-backoff-algorithm +- connectDeadline := time.Now().Add(dialDuration) + + ac.updateConnectivityState(connectivity.Connecting, nil) + ac.mu.Unlock() + +- if err := ac.tryAllAddrs(acCtx, addrs, connectDeadline); err != nil { ++ if err := ac.tryAllAddrs(acCtx, addrs, dialDuration); err != nil { + ac.cc.resolveNow(resolver.ResolveNowOptions{}) + // After exhausting all addresses, the addrConn enters + // TRANSIENT_FAILURE. +@@ -1377,7 +1370,7 @@ func (ac *addrConn) resetTransport() { + // tryAllAddrs tries to creates a connection to the addresses, and stop when at + // the first successful one. It returns an error if no address was successfully + // connected, or updates ac appropriately with the new transport. +-func (ac *addrConn) tryAllAddrs(ctx context.Context, addrs []resolver.Address, connectDeadline time.Time) error { ++func (ac *addrConn) tryAllAddrs(ctx context.Context, addrs []resolver.Address, dialDuration time.Duration) error { + var firstConnErr error + for _, addr := range addrs { + if ctx.Err() != nil { +@@ -1397,6 +1390,7 @@ func (ac *addrConn) tryAllAddrs(ctx context.Context, addrs []resolver.Address, c + + channelz.Infof(logger, ac.channelzID, "Subchannel picks a new address %q to connect", addr.Addr) + ++ connectDeadline := time.Now().Add(dialDuration) + err := ac.createTransport(ctx, addr, copts, connectDeadline) + if err == nil { + return nil