diff --git a/README.md b/README.md index eb9ea13..36ca665 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # spot-interruption-exporter -Publishes a prometheus metric `interruption_events_total` whenever a spot instance has been interrupted. +Publishes a prometheus metric `interruption_events_total` that increments by 1 whenever a spot instance has been interrupted. This is a very helpful metric, as it @@ -9,7 +9,7 @@ This is a very helpful metric, as it - can aid in seeing how much more susceptible single-zone clusters are to interruption -- can be used as a signal on whether or not to promote spot instances to other environments +- can be used as a signal on whether to promote spot instances to other environments The app can be expanded to support other cloud providers, but currently is only built for GCP. diff --git a/main.go b/main.go index 1500cdd..415ea9c 100644 --- a/main.go +++ b/main.go @@ -23,8 +23,8 @@ import ( var ( interruptionEvents = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "interruption_events_total", - Help: "The total number of interruption events for a given cluster", - }, []string{"kubernetes_cluster", "resource_id"}) + Help: "The total number of spot interruptions for a given cluster", + }, []string{"kubernetes_cluster"}) ) func main() { @@ -56,14 +56,14 @@ func main() { go n.Receive(ctx, e) c := cache.NewCacheWithTTL(time.Minute * 10) - sugar.Info("listening for preemption events") for event := range e { // this ensures we do not handle a duplicate message in the event pubsub sends it more than once if exists := c.Exists(event.MessageID); exists { continue } c.Insert(event.MessageID) - interruptionEvents.WithLabelValues(cfg.ClusterName, event.ResourceID).Inc() + interruptionEvents.WithLabelValues(cfg.ClusterName).Inc() + sugar.With("resource_id", event.ResourceID).Info("interrupted") } }