Skip to content

Commit

Permalink
add doc and test.
Browse files Browse the repository at this point in the history
Signed-off-by: morvencao <[email protected]>
  • Loading branch information
morvencao committed Sep 6, 2024
1 parent 2f0a0d4 commit fa553cd
Show file tree
Hide file tree
Showing 3 changed files with 202 additions and 6 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ require (
github.com/openshift/build-machinery-go v0.0.0-20240419090851-af9c868bcf52
github.com/openshift/library-go v0.0.0-20240621150525-4bb4238aef81
github.com/prometheus/client_golang v1.18.0
github.com/prometheus/client_model v0.5.0
github.com/stretchr/testify v1.8.4
golang.org/x/oauth2 v0.16.0
google.golang.org/grpc v1.62.1
Expand Down Expand Up @@ -63,7 +64,6 @@ require (
github.com/nxadm/tail v1.4.8 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/rs/xid v1.4.0 // indirect
Expand Down
40 changes: 35 additions & 5 deletions pkg/cloudevents/generic/metrics_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ const (

// metricsLabels - Array of labels added to metrics:
var metricsLabels = []string{
metricsSourceLabel,
metricsClusterLabel,
metrucsDataTypeLabel,
metricsSourceLabel, // source
metricsClusterLabel, // cluster
metrucsDataTypeLabel, // resource type
}

// Names of the metrics:
Expand All @@ -29,7 +29,22 @@ const (
statusResyncDurationMetric = "status_resync_duration_seconds"
)

// Description of the resource spec resync duration metric:
// The resource spec resync duration metric is a histogram with a base metric name of 'resource_spec_resync_duration_seconds'
// that exposes multiple time series during a scrape:
// 1. cumulative counters for the observation buckets, exposed as 'resource_spec_resync_duration_seconds_bucket{le="<upper inclusive bound>"}'
// 2. the total sum of all observed values, exposed as 'resource_spec_resync_duration_seconds_sum'
// 3. the count of events that have been observed, exposed as 'resource_spec_resync_duration_seconds_count' (identical to 'resource_spec_resync_duration_seconds_bucket{le="+Inf"}' above)
// For example, 2 resource spec resyncs observed for the manifests type, one taking 0.5s and the other taking 0.7s, would result in the following metrics:
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="0.1"} 0
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="0.2"} 0
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="0.5"} 1
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="1.0"} 2
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="2.0"} 2
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="10.0"} 2
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="30.0"} 2
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="+Inf"} 2
// resource_spec_resync_duration_seconds_sum{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests"} 1.2
// resource_spec_resync_duration_seconds_count{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests"} 2
var resourceSpecResyncDurationMetric = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: metricsSubsystem,
Expand All @@ -48,7 +63,22 @@ var resourceSpecResyncDurationMetric = prometheus.NewHistogramVec(
metricsLabels,
)

// Description of the resource status resync duration metric:
// The resource status resync duration metric is a histogram with a base metric name of 'resource_status_resync_duration_seconds'
// that exposes multiple time series during a scrape:
// 1. cumulative counters for the observation buckets, exposed as 'resource_status_resync_duration_seconds_bucket{le="<upper inclusive bound>"}'
// 2. the total sum of all observed values, exposed as 'resource_status_resync_duration_seconds_sum'
// 3. the count of events that have been observed, exposed as 'resource_status_resync_duration_seconds_count' (identical to 'resource_status_resync_duration_seconds_bucket{le="+Inf"}' above)
// For example, 2 resource status resyncs observed for the manifestbundles type, one taking 0.5s and the other taking 1.1s, would result in the following metrics:
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="0.1"} 0
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="0.2"} 0
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="0.5"} 1
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="1.0"} 1
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="2.0"} 2
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="10.0"} 2
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="30.0"} 2
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="+Inf"} 2
// resource_status_resync_duration_seconds_sum{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles"} 1.6
// resource_status_resync_duration_seconds_count{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles"} 2
var resourceStatusResyncDurationMetric = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: metricsSubsystem,
Expand Down
166 changes: 166 additions & 0 deletions pkg/cloudevents/generic/metrics_collector_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
package generic

import (
"context"
"fmt"
"testing"
"time"

cloudevents "github.com/cloudevents/sdk-go/v2"
"github.com/cloudevents/sdk-go/v2/protocol/gochan"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/require"
"open-cluster-management.io/sdk-go/pkg/cloudevents/generic/options/fake"
"open-cluster-management.io/sdk-go/pkg/cloudevents/generic/payload"
"open-cluster-management.io/sdk-go/pkg/cloudevents/generic/types"
)

// testResyncType identifies which resync flow a test case exercises.
type testResyncType string

const (
	// testSpecResync exercises the source-side spec resync flow.
	testSpecResync testResyncType = "spec"
	// testStatusResync exercises the agent-side status resync flow.
	testStatusResync testResyncType = "status"
)

// TestResyncMetrics verifies that handling a spec (source side) or status
// (agent side) resync request records exactly one observation in the
// corresponding resync-duration histogram, labeled by source, cluster, and
// data type.
func TestResyncMetrics(t *testing.T) {
	cases := []struct {
		name        string
		rescType    testResyncType
		clusterName string
		sourceID    string
		dataType    types.CloudEventsDataType
	}{
		{
			name:        "resync spec",
			rescType:    testSpecResync,
			clusterName: "cluster1",
			sourceID:    "source1",
			dataType:    mockEventDataType,
		},
		{
			name:        "resync status",
			rescType:    testStatusResync,
			clusterName: "cluster1",
			sourceID:    "source1",
			dataType:    mockEventDataType,
		},
	}

	// Register the resync metrics collectors once for the whole test,
	// and make sure they are unregistered when the test finishes.
	RegisterResourceResyncMetrics()
	defer UnregisterResourceResyncMetrics()
	for _, c := range cases {
		// Reset the collectors between cases so each case observes
		// exactly one resync sample.
		ResetResourceResyncMetricsCollectors()
		t.Run(c.name, func(t *testing.T) {
			ctx, cancel := context.WithCancel(context.Background())

			if c.rescType == testSpecResync {
				// Build a source client over an in-process (gochan) transport.
				sourceOptions := fake.NewSourceOptions(gochan.New(), c.sourceID)
				lister := newMockResourceLister([]*mockResource{}...)
				source, err := NewCloudEventSourceClient[*mockResource](ctx, sourceOptions, lister, statusHash, newMockResourceCodec())
				require.NoError(t, err)

				// Craft a spec resync request event as the agent would send it.
				eventType := types.CloudEventsType{
					CloudEventsDataType: c.dataType,
					SubResource:         types.SubResourceSpec,
					Action:              types.ResyncRequestAction,
				}
				evt := cloudevents.NewEvent()
				evt.SetType(eventType.String())
				evt.SetExtension("clustername", c.clusterName)
				if err := evt.SetData(cloudevents.ApplicationJSON, &payload.ResourceStatusHashList{}); err != nil {
					t.Errorf("failed to set data for event: %v", err)
				}

				// Deliver the resync request; the client responds asynchronously.
				source.receive(ctx, evt)
				// Wait 2 seconds for the resync request to be handled and the
				// duration to be observed. NOTE(review): a fixed sleep is
				// timing-dependent and may flake on slow machines.
				time.Sleep(2 * time.Second)

				// The spec resync duration histogram should hold exactly one
				// sub-second observation for this (source, cluster, type).
				h := resourceSpecResyncDurationMetric.WithLabelValues(c.sourceID, c.clusterName, mockEventDataType.String())
				count, sum := toFloat64HistCountAndSum(h)
				require.Equal(t, uint64(1), count)
				require.Greater(t, sum, 0.0)
				require.Less(t, sum, 1.0)
			}

			if c.rescType == testStatusResync {
				// Build an agent client over an in-process (gochan) transport.
				agentOptions := fake.NewAgentOptions(gochan.New(), c.clusterName, testAgentName)
				lister := newMockResourceLister([]*mockResource{}...)
				agent, err := NewCloudEventAgentClient[*mockResource](ctx, agentOptions, lister, statusHash, newMockResourceCodec())
				require.NoError(t, err)

				// Craft a status resync request event as the source would send it.
				eventType := types.CloudEventsType{
					CloudEventsDataType: c.dataType,
					SubResource:         types.SubResourceStatus,
					Action:              types.ResyncRequestAction,
				}
				evt := cloudevents.NewEvent()
				evt.SetType(eventType.String())
				evt.SetSource(c.sourceID)
				if err := evt.SetData(cloudevents.ApplicationJSON, &payload.ResourceStatusHashList{}); err != nil {
					t.Errorf("failed to set data for event: %v", err)
				}

				// Deliver the resync request; the client responds asynchronously.
				agent.receive(ctx, evt)
				// Wait 1 second for the resync request to be handled and the
				// duration to be observed. NOTE(review): a fixed sleep is
				// timing-dependent and may flake on slow machines.
				time.Sleep(1 * time.Second)

				// The status resync duration histogram should hold exactly one
				// sub-second observation for this (source, cluster, type).
				h := resourceStatusResyncDurationMetric.WithLabelValues(c.sourceID, c.clusterName, mockEventDataType.String())
				count, sum := toFloat64HistCountAndSum(h)
				require.Equal(t, uint64(1), count)
				require.Greater(t, sum, 0.0)
				require.Less(t, sum, 1.0)
			}

			cancel()
		})
	}
}

// toFloat64HistCountAndSum returns the sample count and sample sum of the
// histogram behind the given observer. It panics if the observer is not a
// collector, if collecting yields anything other than exactly one metric,
// or if that metric is not a histogram — all of which indicate test bugs.
func toFloat64HistCountAndSum(h prometheus.Observer) (uint64, float64) {
	collector, ok := h.(prometheus.Collector)
	if !ok {
		panic(fmt.Errorf("observer is not a collector; got: %T", h))
	}

	metricCh := make(chan prometheus.Metric)
	done := make(chan struct{})

	var (
		collected prometheus.Metric
		total     int
	)
	// Drain the channel in a separate goroutine: Collect blocks on an
	// unbuffered channel, so the receiver must already be running.
	go func() {
		defer close(done)
		for metric := range metricCh {
			collected = metric
			total++
		}
	}()

	collector.Collect(metricCh)
	close(metricCh)
	<-done

	if total != 1 {
		panic(fmt.Errorf("collected %d metrics instead of exactly 1", total))
	}

	out := &dto.Metric{}
	if err := collected.Write(out); err != nil {
		panic(fmt.Errorf("metric write failed, err=%v", err))
	}

	if out.Histogram == nil {
		panic(fmt.Errorf("collected a non-histogram metric: %s", out))
	}
	return out.Histogram.GetSampleCount(), out.Histogram.GetSampleSum()
}

0 comments on commit fa553cd

Please sign in to comment.