From c917986748423fbfd228a419e15d185da17652c6 Mon Sep 17 00:00:00 2001 From: Ivy Gooch Date: Thu, 5 Dec 2024 15:28:40 -0800 Subject: [PATCH 1/3] Test fleet, allocator-client, and custom 'players' per game server counter metric to test allocation for replicating issue #3992 --- examples/allocator-client/main.go | 31 ++++++++++++++++++ pkg/metrics/controller.go | 10 +++++- pkg/metrics/controller_metrics.go | 11 ++++++- pkg/metrics/controller_test.go | 52 +++++++++++++++++++++++++++++++ pkg/metrics/util_test.go | 3 +- tmp/allocate.sh | 23 ++++++++++++++ tmp/fleet.yaml | 43 +++++++++++++++++++++++++ 7 files changed, 170 insertions(+), 3 deletions(-) create mode 100644 tmp/allocate.sh create mode 100644 tmp/fleet.yaml diff --git a/examples/allocator-client/main.go b/examples/allocator-client/main.go index 4031d91597..23cb3bf99a 100644 --- a/examples/allocator-client/main.go +++ b/examples/allocator-client/main.go @@ -26,6 +26,7 @@ import ( "github.com/pkg/errors" "google.golang.org/grpc" "google.golang.org/grpc/credentials" + "google.golang.org/protobuf/types/known/wrapperspb" ) func main() { @@ -55,6 +56,36 @@ func main() { request := &pb.AllocationRequest{ Namespace: *namespace, + GameServerSelectors: []*pb.GameServerSelector{ + { + GameServerState: pb.GameServerSelector_ALLOCATED, + MatchLabels: map[string]string{ + "version": "1.2.3", + }, + Counters: map[string]*pb.CounterSelector{ + "players": { + MinAvailable: 1, + }, + }, + }, + { + GameServerState: pb.GameServerSelector_READY, + MatchLabels: map[string]string{ + "version": "1.2.3", + }, + Counters: map[string]*pb.CounterSelector{ + "players": { + MinAvailable: 1, + }, + }, + }, + }, + Counters: map[string]*pb.CounterAction{ + "players": { + Action: wrapperspb.String("Increment"), + Amount: wrapperspb.Int64(1), + }, + }, MultiClusterSetting: &pb.MultiClusterSetting{ Enabled: *multicluster, }, diff --git a/pkg/metrics/controller.go b/pkg/metrics/controller.go index 7b3c7421d1..a9ccb568e0 100644 --- a/pkg/metrics/controller.go +++ b/pkg/metrics/controller.go @@ -62,7 +62,7 @@ const ( var ( // MetricResyncPeriod is the interval to re-synchronize metrics based on indexed cache. - MetricResyncPeriod = time.Second * 15 + MetricResyncPeriod = time.Second * 1 ) func init() { @@ -449,7 +449,15 @@ func (c *Controller) recordGameServerStatusChanges(old, next interface{}) { RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerCapacityTotal.M(newGs.Status.Players.Capacity-newGs.Status.Players.Count)) } + } + if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) && len(newGs.Status.Counters) != 0 { + if counterStatus, ok := newGs.Status.Counters["players"]; ok { + if counterStatus.Count != oldGs.Status.Counters["players"].Count { + RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), + tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServersCountersStats.M(counterStatus.Count)) + } + } } if newGs.Status.State != oldGs.Status.State { diff --git a/pkg/metrics/controller_metrics.go b/pkg/metrics/controller_metrics.go index 22bf46defc..605e8c7a75 100644 --- a/pkg/metrics/controller_metrics.go +++ b/pkg/metrics/controller_metrics.go @@ -31,6 +31,7 @@ const ( fleetAutoscalersLimitedName = "fleet_autoscalers_limited" fleetCountersName = "fleet_counters" fleetListsName = "fleet_lists" + gameServersCountersName = "gameservers_counters" gameServersCountName = "gameservers_count" gameServersTotalName = "gameservers_total" gameServersPlayerConnectedTotalName = "gameserver_player_connected_total" @@ -45,7 +46,7 @@ var ( fleetAutoscalerViews = []string{fleetAutoscalerBufferLimitName, fleetAutoscalterBufferSizeName, fleetAutoscalerCurrentReplicaCountName, fleetAutoscalersDesiredReplicaCountName, fleetAutoscalersAbleToScaleName, fleetAutoscalersLimitedName} // fleetViews are metric views associated with Fleets - fleetViews = append([]string{fleetRolloutPercent, fleetReplicaCountName, gameServersCountName, gameServersTotalName, gameServersPlayerConnectedTotalName, gameServersPlayerCapacityTotalName, gameServerStateDurationName, fleetCountersName, fleetListsName}, fleetAutoscalerViews...) + fleetViews = append([]string{fleetRolloutPercent, fleetReplicaCountName, gameServersCountersName, gameServersCountName, gameServersTotalName, gameServersPlayerConnectedTotalName, gameServersPlayerCapacityTotalName, gameServerStateDurationName, fleetCountersName, fleetListsName}, fleetAutoscalerViews...) stateDurationSeconds = []float64{0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384} fleetRolloutPercentStats = stats.Int64("fleets/rollout_percent", "The current fleet rollout percentage", "1") @@ -58,6 +59,7 @@ var ( fasLimitedStats = stats.Int64("fas/limited", "The fleet autoscaler is capped (0 indicates false, 1 indicates true)", "1") fleetCountersStats = stats.Int64("fleets/counters", "Aggregated Counters counts and capacity across GameServers in the Fleet", "1") fleetListsStats = stats.Int64("fleets/lists", "Aggregated Lists counts and capacity across GameServers in the Fleet", "1") + gameServersCountersStats = stats.Int64("gameservers/counters", "Counters connected to gameservers", "1") gameServerCountStats = stats.Int64("gameservers/count", "The count of gameservers", "1") gameServerTotalStats = stats.Int64("gameservers/total", "The total of gameservers", "1") gameServerPlayerConnectedTotal = stats.Int64("gameservers/player_connected", "The total number of players connected to gameservers", "1") @@ -137,6 +139,13 @@ var ( Aggregation: view.LastValue(), TagKeys: []tag.Key{keyName, keyNamespace, keyType, keyList}, }, + { + Name: gameServersCountersName, + Measure: gameServersCountersStats, + Description: "The current count of counters in gameservers", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyFleetName, keyName, keyNamespace}, + }, { Name: gameServersCountName, Measure: gameServerCountStats, diff --git a/pkg/metrics/controller_test.go b/pkg/metrics/controller_test.go index c16f6c57df..b720364a69 100644 --- a/pkg/metrics/controller_test.go +++ b/pkg/metrics/controller_test.go @@ -158,6 +158,58 @@ func TestControllerGameServerCount(t *testing.T) { }) } +func TestControllerGameServerCountersCount(t *testing.T) { + runtime.FeatureTestMutex.Lock() + defer runtime.FeatureTestMutex.Unlock() + runtime.EnableAllFeatures() + resetMetrics() + exporter := &metricExporter{} + reader := metricexport.NewReader() + + c := newFakeController() + defer c.close() + + gs1 := gameServerWithFleetAndState("test-fleet", agonesv1.GameServerStateReady) + gs1.Status.Counters["players"] = agonesv1.CounterStatus{Count: 0, Capacity: 10} + c.gsWatch.Add(gs1) + gs1 = gs1.DeepCopy() + playerCounter := gs1.Status.Counters["players"] + playerCounter.Count++ + gs1.Status.Counters["players"] = playerCounter + c.gsWatch.Modify(gs1) + + c.run(t) + require.True(t, c.sync()) + require.Eventually(t, func() bool { + gs, err := c.gameServerLister.GameServers(gs1.ObjectMeta.Namespace).Get(gs1.ObjectMeta.Name) + assert.NoError(t, err) + pc := gs.Status.Counters["players"] + return pc.Count == 1 + }, 5*time.Second, time.Second) + c.collect() + + gs1 = gs1.DeepCopy() + playerCounter = gs1.Status.Counters["players"] + playerCounter.Count += 4 + gs1.Status.Counters["players"] = playerCounter + c.gsWatch.Modify(gs1) + + c.run(t) + require.True(t, c.sync()) + require.Eventually(t, func() bool { + gs, err := c.gameServerLister.GameServers(gs1.ObjectMeta.Namespace).Get(gs1.ObjectMeta.Name) + assert.NoError(t, err) + pc := gs.Status.Counters["players"] + return pc.Count == 5 + }, 5*time.Second, time.Second) + c.collect() + + reader.ReadAndExport(exporter) + assertMetricData(t, exporter, gameServersCountersName, []expectedMetricData{ + {labels: []string{"test-fleet", gs1.GetName(), defaultNs}, val: int64(5)}, + }) +} + func TestControllerGameServerPlayerConnectedCount(t *testing.T) { runtime.FeatureTestMutex.Lock() defer runtime.FeatureTestMutex.Unlock() diff --git a/pkg/metrics/util_test.go b/pkg/metrics/util_test.go index f938171197..c668a118d2 100644 --- a/pkg/metrics/util_test.go +++ b/pkg/metrics/util_test.go @@ -140,7 +140,8 @@ func gameServerWithFleetAndState(fleetName string, state agonesv1.GameServerStat Labels: lbs, }, Status: agonesv1.GameServerStatus{ - State: state, + State: state, + Counters: map[string]agonesv1.CounterStatus{}, }, } return gs diff --git a/tmp/allocate.sh b/tmp/allocate.sh new file mode 100644 index 0000000000..0b0d11c3ba --- /dev/null +++ b/tmp/allocate.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Get the key, cert, and tls files from "Send allocation request" instructions +# https://agones.dev/site/docs/advanced/allocator-service/ +NAMESPACE=default +EXTERNAL_IP=$(kubectl get services agones-allocator -n agones-system -o jsonpath='{.status.loadBalancer.ingress[0].ip}') +KEY_FILE=client.key +CERT_FILE=client.crt +TLS_CA_FILE=ca.crt + +echo "Starting go runs" + +for _ in {1..10000}; do + go run ../examples/allocator-client/main.go \ + --ip "${EXTERNAL_IP}" \ + --port 443 \ + --namespace "${NAMESPACE}" \ + --key "${KEY_FILE}" \ + --cert "${CERT_FILE}" \ + --cacert "${TLS_CA_FILE}" +done + +echo "All done" diff --git a/tmp/fleet.yaml b/tmp/fleet.yaml new file mode 100644 index 0000000000..621cdb37a4 --- /dev/null +++ b/tmp/fleet.yaml @@ -0,0 +1,43 @@ +--- +# Copyright 2020 Google LLC All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: agones.dev/v1 +kind: Fleet +metadata: + name: simple-game-server +spec: + replicas: 300 + template: + metadata: + labels: + version: "1.2.3" + spec: + ports: + - name: default + containerPort: 7654 + counters: + players: + capacity: 300 + template: + spec: + containers: + - name: simple-game-server + image: us-docker.pkg.dev/agones-images/examples/simple-game-server:0.35 + resources: + requests: + memory: 32Mi + cpu: 10m + limits: + memory: 32Mi + cpu: 10m From d6bee849956f897fc9465692d660e549806dab04 Mon Sep 17 00:00:00 2001 From: Ivy Gooch Date: Wed, 18 Dec 2024 16:14:07 -0800 Subject: [PATCH 2/3] Removes Player Counter from the example fleet and game server allocation and reates a metric to track the number of times a game server has been allocated. This is to diagnose if the problem is with the allocator, or with the Counter --- examples/allocator-client/main.go | 17 ----------------- pkg/gameserverallocations/allocator.go | 14 ++++++++++++++ pkg/gameserverallocations/metrics.go | 14 ++++++++++++++ pkg/metrics/controller_metrics.go | 10 +++++++--- tmp/allocate.sh | 2 +- tmp/fleet.yaml | 5 +---- 6 files changed, 37 insertions(+), 25 deletions(-) diff --git a/examples/allocator-client/main.go b/examples/allocator-client/main.go index 23cb3bf99a..73512ba600 100644 --- a/examples/allocator-client/main.go +++ b/examples/allocator-client/main.go @@ -26,7 +26,6 @@ import ( "github.com/pkg/errors" "google.golang.org/grpc" "google.golang.org/grpc/credentials" - "google.golang.org/protobuf/types/known/wrapperspb" ) func main() { @@ -62,28 +61,12 @@ func main() { MatchLabels: map[string]string{ "version": "1.2.3", }, - Counters: map[string]*pb.CounterSelector{ - "players": { - MinAvailable: 1, - }, - }, }, { GameServerState: pb.GameServerSelector_READY, MatchLabels: map[string]string{ "version": "1.2.3", }, - Counters: map[string]*pb.CounterSelector{ - "players": { - MinAvailable: 1, - }, - }, - }, - }, - Counters: map[string]*pb.CounterAction{ - "players": { - Action: wrapperspb.String("Increment"), - Amount: wrapperspb.Int64(1), }, }, MultiClusterSetting: &pb.MultiClusterSetting{ diff --git a/pkg/gameserverallocations/allocator.go b/pkg/gameserverallocations/allocator.go index d0699514f0..72567ae72a 100644 --- a/pkg/gameserverallocations/allocator.go +++ b/pkg/gameserverallocations/allocator.go @@ -20,6 +20,7 @@ import ( "crypto/x509" goErrors "errors" "fmt" + "log" "strings" "time" @@ -37,6 +38,7 @@ import ( "agones.dev/agones/pkg/util/runtime" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "go.opencensus.io/stats" "go.opencensus.io/tag" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -464,6 +466,18 @@ func (c *Allocator) allocate(ctx context.Context, gsa *allocationv1.GameServerAl select { case res := <-req.response: // wait for the batch to be completed + if res.err == nil && res.gs != nil { + ctx, err := tag.New( + ctx, + tag.Upsert(keyFleetName, res.gs.GetObjectMeta().GetLabels()[agonesv1.FleetNameLabel]), + tag.Upsert(keyName, res.gs.GetName()), + tag.Upsert(keyNamespace, res.gs.GetNamespace()), + ) + if err != nil { + log.Fatal("Could not create tag", err) + } + stats.Record(ctx, gameServerAllocationsCount.M(1)) + } return res.gs, res.err case <-ctx.Done(): return nil, ErrTotalTimeoutExceeded diff --git a/pkg/gameserverallocations/metrics.go b/pkg/gameserverallocations/metrics.go index 64fb4d50ec..142621c122 100644 --- a/pkg/gameserverallocations/metrics.go +++ b/pkg/gameserverallocations/metrics.go @@ -39,9 +39,16 @@ var ( keyMultiCluster = mt.MustTagKey("is_multicluster") keyStatus = mt.MustTagKey("status") keySchedulingStrategy = mt.MustTagKey("scheduling_strategy") + keyName = mt.MustTagKey("name") + keyNamespace = mt.MustTagKey("namespace") gameServerAllocationsLatency = stats.Float64("gameserver_allocations/latency", "The duration of gameserver allocations", "s") gameServerAllocationsRetryTotal = stats.Int64("gameserver_allocations/errors", "The errors of gameserver allocations", "1") + gameServerAllocationsCount = stats.Int64( + "gameserver_allocations/count", + "The total number of times a Game Server has been allocated.", + "1", + ) ) func init() { @@ -61,6 +68,13 @@ func init() { Aggregation: view.Distribution(1, 2, 3, 4, 5), TagKeys: []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy}, }, + { + Name: "gameserver_allocations_count", + Measure: gameServerAllocationsCount, + Description: "The number of times a game server is allocated", + Aggregation: view.Count(), + TagKeys: []tag.Key{keyFleetName, keyName, keyNamespace}, + }, } for _, v := range stateViews { diff --git a/pkg/metrics/controller_metrics.go b/pkg/metrics/controller_metrics.go index 605e8c7a75..cbaf17093d 100644 --- a/pkg/metrics/controller_metrics.go +++ b/pkg/metrics/controller_metrics.go @@ -43,10 +43,14 @@ const ( var ( // fleetAutoscalerViews are metric views associated with FleetAutoscalers - fleetAutoscalerViews = []string{fleetAutoscalerBufferLimitName, fleetAutoscalterBufferSizeName, fleetAutoscalerCurrentReplicaCountName, - fleetAutoscalersDesiredReplicaCountName, fleetAutoscalersAbleToScaleName, fleetAutoscalersLimitedName} + fleetAutoscalerViews = []string{fleetAutoscalerBufferLimitName, fleetAutoscalterBufferSizeName, + fleetAutoscalerCurrentReplicaCountName, fleetAutoscalersDesiredReplicaCountName, + fleetAutoscalersAbleToScaleName, fleetAutoscalersLimitedName} // fleetViews are metric views associated with Fleets - fleetViews = append([]string{fleetRolloutPercent, fleetReplicaCountName, gameServersCountersName, gameServersCountName, gameServersTotalName, gameServersPlayerConnectedTotalName, gameServersPlayerCapacityTotalName, gameServerStateDurationName, fleetCountersName, fleetListsName}, fleetAutoscalerViews...) + fleetViews = append([]string{fleetRolloutPercent, fleetReplicaCountName, gameServersCountersName, + gameServersCountName, gameServersTotalName, gameServersPlayerConnectedTotalName, + gameServersPlayerCapacityTotalName, gameServerStateDurationName, fleetCountersName, + fleetListsName}, fleetAutoscalerViews...) stateDurationSeconds = []float64{0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384} fleetRolloutPercentStats = stats.Int64("fleets/rollout_percent", "The current fleet rollout percentage", "1") diff --git a/tmp/allocate.sh b/tmp/allocate.sh index 0b0d11c3ba..719179ae28 100644 --- a/tmp/allocate.sh +++ b/tmp/allocate.sh @@ -10,7 +10,7 @@ TLS_CA_FILE=ca.crt echo "Starting go runs" -for _ in {1..10000}; do +for _ in {1..1000}; do go run ../examples/allocator-client/main.go \ --ip "${EXTERNAL_IP}" \ --port 443 \ diff --git a/tmp/fleet.yaml b/tmp/fleet.yaml index 621cdb37a4..3593edaecc 100644 --- a/tmp/fleet.yaml +++ b/tmp/fleet.yaml @@ -17,7 +17,7 @@ kind: Fleet metadata: name: simple-game-server spec: - replicas: 300 + replicas: 100 template: metadata: labels: @@ -26,9 +26,6 @@ spec: ports: - name: default containerPort: 7654 - counters: - players: - capacity: 300 template: spec: containers: From 8a7c819157a4cee33e9a8c26f1846827be821825 Mon Sep 17 00:00:00 2001 From: Ivy Gooch Date: Thu, 19 Dec 2024 14:31:45 -0800 Subject: [PATCH 3/3] Changes allocation script to only create one client and for the client to create multiple allocations --- examples/allocator-client/main.go | 9 ++++++++- tmp/allocate.sh | 6 ++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/allocator-client/main.go b/examples/allocator-client/main.go index 73512ba600..a6f3018c5b 100644 --- a/examples/allocator-client/main.go +++ b/examples/allocator-client/main.go @@ -85,7 +85,14 @@ func main() { defer conn.Close() grpcClient := pb.NewAllocationServiceClient(conn) - response, err := grpcClient.Allocate(context.Background(), request) + + for i := 0; i < 1000; i++ { + allocateGameServer(grpcClient, request) + } +} + +func allocateGameServer(client pb.AllocationServiceClient, request *pb.AllocationRequest) { + response, err := client.Allocate(context.Background(), request) if err != nil { panic(err) } diff --git a/tmp/allocate.sh b/tmp/allocate.sh index 719179ae28..381bf25a0f 100644 --- a/tmp/allocate.sh +++ b/tmp/allocate.sh @@ -10,7 +10,7 @@ TLS_CA_FILE=ca.crt echo "Starting go runs" -for _ in {1..1000}; do +run_allocator_client () { go run ../examples/allocator-client/main.go \ --ip "${EXTERNAL_IP}" \ --port 443 \ @@ -18,6 +18,8 @@ for _ in {1..1000}; do --key "${KEY_FILE}" \ --cert "${CERT_FILE}" \ --cacert "${TLS_CA_FILE}" -done +} +run_allocator_client +wait echo "All done"