Skip to content

Commit

Permalink
fix: delay deleting GSes in error state (#3428)
Browse files Browse the repository at this point in the history
  • Loading branch information
nrwiersma authored Oct 17, 2023
1 parent f9a9f39 commit 93a4c4a
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 1 deletion.
3 changes: 3 additions & 0 deletions pkg/apis/agones/v1/gameserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,9 @@ const (
PodSafeToEvictAnnotation = "cluster-autoscaler.kubernetes.io/safe-to-evict"
// SafeToEvictLabel is a label that, when "false", matches the restrictive PDB agones-gameserver-safe-to-evict-false.
SafeToEvictLabel = agones.GroupName + "/safe-to-evict"
// GameServerErroredAtAnnotation is an annotation that records the timestamp the GameServer entered the
// error state. The timestamp is encoded in RFC3339 format.
GameServerErroredAtAnnotation = agones.GroupName + "/errored-at"

// True is the string "true" to appease the goconst lint.
True = "true"
Expand Down
4 changes: 4 additions & 0 deletions pkg/gameservers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,10 @@ func (c *Controller) syncGameServerShutdownState(ctx context.Context, gs *agones
// moveToErrorState moves the GameServer to the error state
func (c *Controller) moveToErrorState(ctx context.Context, gs *agonesv1.GameServer, msg string) (*agonesv1.GameServer, error) {
gsCopy := gs.DeepCopy()
if gsCopy.Annotations == nil {
gsCopy.Annotations = make(map[string]string, 1)
}
gsCopy.Annotations[agonesv1.GameServerErroredAtAnnotation] = time.Now().Format(time.RFC3339)
gsCopy.Status.State = agonesv1.GameServerStateError

gs, err := c.gameServerGetter.GameServers(gs.ObjectMeta.Namespace).Update(ctx, gsCopy, metav1.UpdateOptions{})
Expand Down
10 changes: 10 additions & 0 deletions pkg/gameservers/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,11 @@ func TestControllerCreateGameServerPod(t *testing.T) {

assert.True(t, podCreated, "attempt should have been made to create a pod")
assert.True(t, gsUpdated, "GameServer should be updated")
if assert.NotEmpty(t, gs.Annotations[agonesv1.GameServerErroredAtAnnotation]) {
gotTime, err := time.Parse(time.RFC3339, gs.Annotations[agonesv1.GameServerErroredAtAnnotation])
require.NoError(t, err)
assert.WithinDuration(t, time.Now(), gotTime, time.Second)
}
assert.Equal(t, agonesv1.GameServerStateError, gs.Status.State)
})

Expand All @@ -1178,6 +1183,11 @@ func TestControllerCreateGameServerPod(t *testing.T) {

assert.True(t, podCreated, "attempt should have been made to create a pod")
assert.True(t, gsUpdated, "GameServer should be updated")
if assert.NotEmpty(t, gs.Annotations[agonesv1.GameServerErroredAtAnnotation]) {
gotTime, err := time.Parse(time.RFC3339, gs.Annotations[agonesv1.GameServerErroredAtAnnotation])
require.NoError(t, err)
assert.WithinDuration(t, time.Now(), gotTime, time.Second)
}
assert.Equal(t, agonesv1.GameServerStateError, gs.Status.State)
})
}
Expand Down
36 changes: 35 additions & 1 deletion pkg/gameserversets/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ const (

// maxPodPendingCount is the maximum number of pending pods per game server set
maxPodPendingCount = 5000

// gameServerErrorDeletionDelay is the minimum amount of time to delay the deletion
// of a GameServer in Error state.
gameServerErrorDeletionDelay = 30 * time.Second
)

// Extensions struct contains what is needed to bind webhook handlers
Expand Down Expand Up @@ -430,7 +434,19 @@ func computeReconciliationAction(strategy apis.SchedulingStrategy, list []*agone

// GameServerStateShutdown - already handled above
// GameServerStateAllocated - already handled above
case agonesv1.GameServerStateError, agonesv1.GameServerStateUnhealthy:
case agonesv1.GameServerStateError:
if !shouldDeleteErroredGameServer(gs) {
// The GameServer is in an Error state and should not be deleted yet.
// To stop an ever-increasing number of GameServers from being created,
// consider the Error state GameServers as up and pending. This stops high
// churn rate that can negatively impact Kubernetes.
podPendingCount++
handleGameServerUp(gs)
} else {
scheduleDeletion(gs)
}

case agonesv1.GameServerStateUnhealthy:
scheduleDeletion(gs)
default:
// unrecognized state, assume it's up.
Expand Down Expand Up @@ -474,6 +490,24 @@ func computeReconciliationAction(strategy apis.SchedulingStrategy, list []*agone
return numServersToAdd, toDelete, partialReconciliation
}

func shouldDeleteErroredGameServer(gs *agonesv1.GameServer) bool {
erroredAtStr := gs.Annotations[agonesv1.GameServerErroredAtAnnotation]
if erroredAtStr == "" {
return true
}

erroredAt, err := time.Parse(time.RFC3339, erroredAtStr)
if err != nil {
// The annotation is in the wrong format, delete the GameServer.
return true
}

if time.Since(erroredAt) >= gameServerErrorDeletionDelay {
return true
}
return false
}

// addMoreGameServers adds diff more GameServers to the set
func (c *Controller) addMoreGameServers(ctx context.Context, gsSet *agonesv1.GameServerSet, count int) error {
loggerForGameServerSet(c.baseLogger, gsSet).WithField("count", count).Debug("Adding more gameservers")
Expand Down
92 changes: 92 additions & 0 deletions pkg/gameserversets/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,98 @@ func TestSyncGameServerSet(t *testing.T) {
assert.True(t, updated, "A game servers should have been updated")
})

t.Run("adding and deleting errored gameservers", func(t *testing.T) {
gsSet := defaultFixture()
list := createGameServers(gsSet, 5)

// make some as unhealthy
list[0].Annotations = map[string]string{agonesv1.GameServerErroredAtAnnotation: time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339)}
list[0].Status.State = agonesv1.GameServerStateError

updated := false
count := 0

c, m := newFakeController()
m.AgonesClient.AddReactor("list", "gameserversets", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &agonesv1.GameServerSetList{Items: []agonesv1.GameServerSet{*gsSet}}, nil
})
m.AgonesClient.AddReactor("list", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &agonesv1.GameServerList{Items: list}, nil
})

m.AgonesClient.AddReactor("update", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
ua := action.(k8stesting.UpdateAction)
gs := ua.GetObject().(*agonesv1.GameServer)
assert.Equal(t, gs.Status.State, agonesv1.GameServerStateShutdown)

updated = true
assert.Equal(t, "test-0", gs.GetName())
return true, nil, nil
})
m.AgonesClient.AddReactor("create", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
ca := action.(k8stesting.CreateAction)
gs := ca.GetObject().(*agonesv1.GameServer)

assert.True(t, metav1.IsControlledBy(gs, gsSet))
count++
return true, gs, nil
})

ctx, cancel := agtesting.StartInformers(m, c.gameServerSetSynced, c.gameServerSynced)
defer cancel()

c.syncGameServerSet(ctx, gsSet.ObjectMeta.Namespace+"/"+gsSet.ObjectMeta.Name) // nolint: errcheck

assert.Equal(t, 6, count)
assert.True(t, updated, "A game servers should have been updated")
})

t.Run("adding and delay deleting errored gameservers", func(t *testing.T) {
gsSet := defaultFixture()
list := createGameServers(gsSet, 5)

// make some as unhealthy
list[0].Annotations = map[string]string{agonesv1.GameServerErroredAtAnnotation: time.Now().UTC().Format(time.RFC3339)}
list[0].Status.State = agonesv1.GameServerStateError

updated := false
count := 0

c, m := newFakeController()
m.AgonesClient.AddReactor("list", "gameserversets", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &agonesv1.GameServerSetList{Items: []agonesv1.GameServerSet{*gsSet}}, nil
})
m.AgonesClient.AddReactor("list", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &agonesv1.GameServerList{Items: list}, nil
})

m.AgonesClient.AddReactor("update", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
ua := action.(k8stesting.UpdateAction)
gs := ua.GetObject().(*agonesv1.GameServer)
assert.Equal(t, gs.Status.State, agonesv1.GameServerStateShutdown)

updated = true
assert.Equal(t, "test-0", gs.GetName())
return true, nil, nil
})
m.AgonesClient.AddReactor("create", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
ca := action.(k8stesting.CreateAction)
gs := ca.GetObject().(*agonesv1.GameServer)

assert.True(t, metav1.IsControlledBy(gs, gsSet))
count++
return true, gs, nil
})

ctx, cancel := agtesting.StartInformers(m, c.gameServerSetSynced, c.gameServerSynced)
defer cancel()

c.syncGameServerSet(ctx, gsSet.ObjectMeta.Namespace+"/"+gsSet.ObjectMeta.Name) // nolint: errcheck

assert.Equal(t, 5, count)
assert.False(t, updated, "A game servers should not have been updated")
})

t.Run("removing gamservers", func(t *testing.T) {
gsSet := defaultFixture()
list := createGameServers(gsSet, 15)
Expand Down

0 comments on commit 93a4c4a

Please sign in to comment.