-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
🌱 add latency metrics for resource resync. (#76)
* add metrics for resource resync. Signed-off-by: morvencao <[email protected]> * add doc and test. Signed-off-by: morvencao <[email protected]> --------- Signed-off-by: morvencao <[email protected]>
- Loading branch information
Showing
178 changed files
with
31,706 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
package generic | ||
|
||
import ( | ||
"time" | ||
|
||
"github.com/prometheus/client_golang/prometheus" | ||
) | ||
|
||
// metricsSubsystem is the subsystem component of the resync metric names. It is
// set as the Subsystem of every histogram in this file, so it is joined to each
// metric name with an underscore in the exposed, fully-qualified name.
const metricsSubsystem = "resources"

// Names of the labels added to metrics.
const (
	metricsSourceLabel   = "source"
	metricsClusterLabel  = "cluster"
	metricsDataTypeLabel = "type"

	// metrucsDataTypeLabel is the original, misspelled name of
	// metricsDataTypeLabel. It is kept as an alias so that existing references
	// keep compiling; new code should use metricsDataTypeLabel.
	metrucsDataTypeLabel = metricsDataTypeLabel
)

// metricsLabels is the ordered list of label names attached to every resync
// metric. Positional lookups (WithLabelValues) must pass values in this order.
var metricsLabels = []string{
	metricsSourceLabel,   // source
	metricsClusterLabel,  // cluster
	metricsDataTypeLabel, // resource type
}

// Names of the metrics, without the subsystem prefix.
const (
	specResyncDurationMetric   = "spec_resync_duration_seconds"
	statusResyncDurationMetric = "status_resync_duration_seconds"
)
|
||
// The resource spec resync duration metric is a histogram with a base metric name of 'resource_spec_resync_duration_second' | ||
// exposes multiple time series during a scrape: | ||
// 1. cumulative counters for the observation buckets, exposed as 'resource_spec_resync_duration_seconds_bucket{le="<upper inclusive bound>"}' | ||
// 2. the total sum of all observed values, exposed as 'resource_spec_resync_duration_seconds_sum' | ||
// 3. the count of events that have been observed, exposed as 'resource_spec_resync_duration_seconds_count' (identical to 'resource_spec_resync_duration_seconds_bucket{le="+Inf"}' above) | ||
// For example, 2 resource spec resync for manifests type that have been observed, one taking 0.5s and the other taking 0.7s, would result in the following metrics: | ||
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="0.1"} 0 | ||
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="0.2"} 0 | ||
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="0.5"} 1 | ||
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="1.0"} 2 | ||
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="2.0"} 2 | ||
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="10.0"} 2 | ||
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="30.0"} 2 | ||
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="+Inf"} 2 | ||
// resource_spec_resync_duration_seconds_sum{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests"} 1.2 | ||
// resource_spec_resync_duration_seconds_count{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests"} 2 | ||
var resourceSpecResyncDurationMetric = prometheus.NewHistogramVec( | ||
prometheus.HistogramOpts{ | ||
Subsystem: metricsSubsystem, | ||
Name: specResyncDurationMetric, | ||
Help: "The duration of the resource spec resync in seconds.", | ||
Buckets: []float64{ | ||
0.1, | ||
0.2, | ||
0.5, | ||
1.0, | ||
2.0, | ||
10.0, | ||
30.0, | ||
}, | ||
}, | ||
metricsLabels, | ||
) | ||
|
||
// The resource status resync duration metric is a histogram with a base metric name of 'resource_status_resync_duration_second' | ||
// exposes multiple time series during a scrape: | ||
// 1. cumulative counters for the observation buckets, exposed as 'resource_status_resync_duration_seconds_bucket{le="<upper inclusive bound>"}' | ||
// 2. the total sum of all observed values, exposed as 'resource_status_resync_duration_seconds_sum' | ||
// 3. the count of events that have been observed, exposed as 'resource_status_resync_duration_seconds_count' (identical to 'resource_status_resync_duration_seconds_bucket{le="+Inf"}' above) | ||
// For example, 2 resource status resync for manifestbundles type that have been observed, one taking 0.5s and the other taking 1.1s, would result in the following metrics: | ||
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="0.1"} 0 | ||
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="0.2"} 0 | ||
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="0.5"} 1 | ||
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="1.0"} 1 | ||
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="2.0"} 2 | ||
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="10.0"} 2 | ||
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="30.0"} 2 | ||
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="+Inf"} 2 | ||
// resource_status_resync_duration_seconds_sum{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles"} 1.6 | ||
// resource_status_resync_duration_seconds_count{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles"} 2 | ||
var resourceStatusResyncDurationMetric = prometheus.NewHistogramVec( | ||
prometheus.HistogramOpts{ | ||
Subsystem: metricsSubsystem, | ||
Name: statusResyncDurationMetric, | ||
Help: "The duration of the resource status resync in seconds.", | ||
Buckets: []float64{ | ||
0.1, | ||
0.2, | ||
0.5, | ||
1.0, | ||
2.0, | ||
10.0, | ||
30.0, | ||
}, | ||
}, | ||
metricsLabels, | ||
) | ||
|
||
// Register the metrics: | ||
func RegisterResourceResyncMetrics() { | ||
prometheus.MustRegister(resourceSpecResyncDurationMetric) | ||
prometheus.MustRegister(resourceStatusResyncDurationMetric) | ||
} | ||
|
||
// Unregister the metrics: | ||
func UnregisterResourceResyncMetrics() { | ||
prometheus.Unregister(resourceStatusResyncDurationMetric) | ||
prometheus.Unregister(resourceStatusResyncDurationMetric) | ||
} | ||
|
||
// ResetResourceResyncMetricsCollectors resets all collectors | ||
func ResetResourceResyncMetricsCollectors() { | ||
resourceSpecResyncDurationMetric.Reset() | ||
resourceStatusResyncDurationMetric.Reset() | ||
} | ||
|
||
// updateResourceSpecResyncDurationMetric updates the resource spec resync duration metric: | ||
func updateResourceSpecResyncDurationMetric(source, cluster, dataType string, startTime time.Time) { | ||
labels := prometheus.Labels{ | ||
metricsSourceLabel: source, | ||
metricsClusterLabel: cluster, | ||
metrucsDataTypeLabel: dataType, | ||
} | ||
duration := time.Since(startTime) | ||
resourceSpecResyncDurationMetric.With(labels).Observe(duration.Seconds()) | ||
} | ||
|
||
// updateResourceStatusResyncDurationMetric updates the resource status resync duration metric: | ||
func updateResourceStatusResyncDurationMetric(source, cluster, dataType string, startTime time.Time) { | ||
labels := prometheus.Labels{ | ||
metricsSourceLabel: source, | ||
metricsClusterLabel: cluster, | ||
metrucsDataTypeLabel: dataType, | ||
} | ||
duration := time.Since(startTime) | ||
resourceStatusResyncDurationMetric.With(labels).Observe(duration.Seconds()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
package generic | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"testing" | ||
"time" | ||
|
||
cloudevents "github.com/cloudevents/sdk-go/v2" | ||
"github.com/cloudevents/sdk-go/v2/protocol/gochan" | ||
"github.com/prometheus/client_golang/prometheus" | ||
dto "github.com/prometheus/client_model/go" | ||
"github.com/stretchr/testify/require" | ||
"open-cluster-management.io/sdk-go/pkg/cloudevents/generic/options/fake" | ||
"open-cluster-management.io/sdk-go/pkg/cloudevents/generic/payload" | ||
"open-cluster-management.io/sdk-go/pkg/cloudevents/generic/types" | ||
) | ||
|
||
// testResyncType identifies which resync flow a test case exercises.
type testResyncType string

// Resync flows covered by TestResyncMetrics.
const (
	testSpecResync   = testResyncType("spec")
	testStatusResync = testResyncType("status")
)
|
||
func TestResyncMetrics(t *testing.T) { | ||
cases := []struct { | ||
name string | ||
rescType testResyncType | ||
clusterName string | ||
sourceID string | ||
dataType types.CloudEventsDataType | ||
}{ | ||
{ | ||
name: "resync spec", | ||
rescType: testSpecResync, | ||
clusterName: "cluster1", | ||
sourceID: "source1", | ||
dataType: mockEventDataType, | ||
}, | ||
{ | ||
name: "resync status", | ||
rescType: testStatusResync, | ||
clusterName: "cluster1", | ||
sourceID: "source1", | ||
dataType: mockEventDataType, | ||
}, | ||
} | ||
|
||
// register metrics | ||
RegisterResourceResyncMetrics() | ||
// unregister metrics | ||
defer UnregisterResourceResyncMetrics() | ||
for _, c := range cases { | ||
// reset metrics | ||
ResetResourceResyncMetricsCollectors() | ||
// run test | ||
t.Run(c.name, func(t *testing.T) { | ||
ctx, cancel := context.WithCancel(context.Background()) | ||
|
||
if c.rescType == testSpecResync { | ||
sourceOptions := fake.NewSourceOptions(gochan.New(), c.sourceID) | ||
lister := newMockResourceLister([]*mockResource{}...) | ||
source, err := NewCloudEventSourceClient[*mockResource](ctx, sourceOptions, lister, statusHash, newMockResourceCodec()) | ||
require.NoError(t, err) | ||
|
||
eventType := types.CloudEventsType{ | ||
CloudEventsDataType: c.dataType, | ||
SubResource: types.SubResourceSpec, | ||
Action: types.ResyncRequestAction, | ||
} | ||
evt := cloudevents.NewEvent() | ||
evt.SetType(eventType.String()) | ||
evt.SetExtension("clustername", c.clusterName) | ||
if err := evt.SetData(cloudevents.ApplicationJSON, &payload.ResourceStatusHashList{}); err != nil { | ||
t.Errorf("failed to set data for event: %v", err) | ||
} | ||
|
||
// receive resync request and publish associated resources | ||
source.receive(ctx, evt) | ||
// wait 1 seconds to respond to the resync request | ||
time.Sleep(2 * time.Second) | ||
|
||
// check spec resync duration metric as a histogram | ||
h := resourceSpecResyncDurationMetric.WithLabelValues(c.sourceID, c.clusterName, mockEventDataType.String()) | ||
count, sum := toFloat64HistCountAndSum(h) | ||
require.Equal(t, uint64(1), count) | ||
require.Greater(t, sum, 0.0) | ||
require.Less(t, sum, 1.0) | ||
} | ||
|
||
if c.rescType == testStatusResync { | ||
agentOptions := fake.NewAgentOptions(gochan.New(), c.clusterName, testAgentName) | ||
lister := newMockResourceLister([]*mockResource{}...) | ||
agent, err := NewCloudEventAgentClient[*mockResource](ctx, agentOptions, lister, statusHash, newMockResourceCodec()) | ||
require.NoError(t, err) | ||
|
||
eventType := types.CloudEventsType{ | ||
CloudEventsDataType: c.dataType, | ||
SubResource: types.SubResourceStatus, | ||
Action: types.ResyncRequestAction, | ||
} | ||
evt := cloudevents.NewEvent() | ||
evt.SetType(eventType.String()) | ||
evt.SetSource(c.sourceID) | ||
if err := evt.SetData(cloudevents.ApplicationJSON, &payload.ResourceStatusHashList{}); err != nil { | ||
t.Errorf("failed to set data for event: %v", err) | ||
} | ||
|
||
// receive resync request and publish associated resources | ||
agent.receive(ctx, evt) | ||
// wait 1 seconds to respond to the resync request | ||
time.Sleep(1 * time.Second) | ||
|
||
// check status resync duration metric as a histogram | ||
h := resourceStatusResyncDurationMetric.WithLabelValues(c.sourceID, c.clusterName, mockEventDataType.String()) | ||
count, sum := toFloat64HistCountAndSum(h) | ||
require.Equal(t, uint64(1), count) | ||
require.Greater(t, sum, 0.0) | ||
require.Less(t, sum, 1.0) | ||
} | ||
|
||
cancel() | ||
}) | ||
} | ||
} | ||
|
||
// toFloat64HistCountAndSum returns the count and sum of a histogram metric | ||
func toFloat64HistCountAndSum(h prometheus.Observer) (uint64, float64) { | ||
var ( | ||
m prometheus.Metric | ||
mCount int | ||
mChan = make(chan prometheus.Metric) | ||
done = make(chan struct{}) | ||
) | ||
|
||
go func() { | ||
for m = range mChan { | ||
mCount++ | ||
} | ||
close(done) | ||
}() | ||
|
||
c, ok := h.(prometheus.Collector) | ||
if !ok { | ||
panic(fmt.Errorf("observer is not a collector; got: %T", h)) | ||
} | ||
|
||
c.Collect(mChan) | ||
close(mChan) | ||
<-done | ||
|
||
if mCount != 1 { | ||
panic(fmt.Errorf("collected %d metrics instead of exactly 1", mCount)) | ||
} | ||
|
||
pb := &dto.Metric{} | ||
if err := m.Write(pb); err != nil { | ||
panic(fmt.Errorf("metric write failed, err=%v", err)) | ||
} | ||
|
||
if pb.Histogram != nil { | ||
return pb.Histogram.GetSampleCount(), pb.Histogram.GetSampleSum() | ||
} | ||
panic(fmt.Errorf("collected a non-histogram metric: %s", pb)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.