Skip to content

Commit

Permalink
add doc and test.
Browse files Browse the repository at this point in the history
Signed-off-by: morvencao <[email protected]>
  • Loading branch information
morvencao committed Sep 6, 2024
1 parent 2f0a0d4 commit fa553cd
Show file tree
Hide file tree
Showing 3 changed files with 202 additions and 6 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ require (
github.com/openshift/build-machinery-go v0.0.0-20240419090851-af9c868bcf52
github.com/openshift/library-go v0.0.0-20240621150525-4bb4238aef81
github.com/prometheus/client_golang v1.18.0
github.com/prometheus/client_model v0.5.0
github.com/stretchr/testify v1.8.4
golang.org/x/oauth2 v0.16.0
google.golang.org/grpc v1.62.1
Expand Down Expand Up @@ -63,7 +64,6 @@ require (
github.com/nxadm/tail v1.4.8 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/rs/xid v1.4.0 // indirect
Expand Down
40 changes: 35 additions & 5 deletions pkg/cloudevents/generic/metrics_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ const (

// metricsLabels - Array of labels added to metrics:
var metricsLabels = []string{
metricsSourceLabel,
metricsClusterLabel,
metrucsDataTypeLabel,
metricsSourceLabel, // source
metricsClusterLabel, // cluster
metrucsDataTypeLabel, // resource type
}

// Names of the metrics:
Expand All @@ -29,7 +29,22 @@ const (
statusResyncDurationMetric = "status_resync_duration_seconds"
)

// Description of the resource spec resync duration metric:
// The resource spec resync duration metric is a histogram with a base metric name of 'resource_spec_resync_duration_seconds'
// that exposes multiple time series during a scrape:
// 1. cumulative counters for the observation buckets, exposed as 'resource_spec_resync_duration_seconds_bucket{le="<upper inclusive bound>"}'
// 2. the total sum of all observed values, exposed as 'resource_spec_resync_duration_seconds_sum'
// 3. the count of events that have been observed, exposed as 'resource_spec_resync_duration_seconds_count' (identical to 'resource_spec_resync_duration_seconds_bucket{le="+Inf"}' above)
// For example, 2 resource spec resyncs observed for the manifests type, one taking 0.5s and the other taking 0.7s, would result in the following metrics:
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="0.1"} 0
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="0.2"} 0
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="0.5"} 1
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="1.0"} 2
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="2.0"} 2
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="10.0"} 2
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="30.0"} 2
// resource_spec_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests",le="+Inf"} 2
// resource_spec_resync_duration_seconds_sum{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests"} 1.2
// resource_spec_resync_duration_seconds_count{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifests"} 2
var resourceSpecResyncDurationMetric = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: metricsSubsystem,
Expand All @@ -48,7 +63,22 @@ var resourceSpecResyncDurationMetric = prometheus.NewHistogramVec(
metricsLabels,
)

// Description of the resource status resync duration metric:
// The resource status resync duration metric is a histogram with a base metric name of 'resource_status_resync_duration_seconds'
// that exposes multiple time series during a scrape:
// 1. cumulative counters for the observation buckets, exposed as 'resource_status_resync_duration_seconds_bucket{le="<upper inclusive bound>"}'
// 2. the total sum of all observed values, exposed as 'resource_status_resync_duration_seconds_sum'
// 3. the count of events that have been observed, exposed as 'resource_status_resync_duration_seconds_count' (identical to 'resource_status_resync_duration_seconds_bucket{le="+Inf"}' above)
// For example, 2 resource status resyncs observed for the manifestbundles type, one taking 0.5s and the other taking 1.1s, would result in the following metrics:
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="0.1"} 0
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="0.2"} 0
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="0.5"} 1
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="1.0"} 1
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="2.0"} 2
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="10.0"} 2
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="30.0"} 2
// resource_status_resync_duration_seconds_bucket{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles",le="+Inf"} 2
// resource_status_resync_duration_seconds_sum{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles"} 1.6
// resource_status_resync_duration_seconds_count{source="source1",cluster="cluster1",type="io.open-cluster-management.works.v1alpha1.manifestbundles"} 2
var resourceStatusResyncDurationMetric = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: metricsSubsystem,
Expand Down
166 changes: 166 additions & 0 deletions pkg/cloudevents/generic/metrics_collector_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
package generic

import (
"context"
"fmt"
"testing"
"time"

cloudevents "github.com/cloudevents/sdk-go/v2"
"github.com/cloudevents/sdk-go/v2/protocol/gochan"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/require"
"open-cluster-management.io/sdk-go/pkg/cloudevents/generic/options/fake"
"open-cluster-management.io/sdk-go/pkg/cloudevents/generic/payload"
"open-cluster-management.io/sdk-go/pkg/cloudevents/generic/types"
)

// testResyncType identifies which resync flow a test case exercises.
type testResyncType string

const (
	// testSpecResync exercises the source-side spec resync flow.
	testSpecResync testResyncType = "spec"
	// testStatusResync exercises the agent-side status resync flow.
	testStatusResync testResyncType = "status"
)

// TestResyncMetrics verifies that handling a spec (source side) or status
// (agent side) resync request records exactly one observation in the
// corresponding resync-duration histogram, labeled by source, cluster, and
// data type.
func TestResyncMetrics(t *testing.T) {
	cases := []struct {
		name        string
		rescType    testResyncType
		clusterName string
		sourceID    string
		dataType    types.CloudEventsDataType
	}{
		{
			name:        "resync spec",
			rescType:    testSpecResync,
			clusterName: "cluster1",
			sourceID:    "source1",
			dataType:    mockEventDataType,
		},
		{
			name:        "resync status",
			rescType:    testStatusResync,
			clusterName: "cluster1",
			sourceID:    "source1",
			dataType:    mockEventDataType,
		},
	}

	// Register the resync metrics collectors once for the whole test,
	// and make sure they are unregistered when the test finishes.
	RegisterResourceResyncMetrics()
	defer UnregisterResourceResyncMetrics()
	for _, c := range cases {
		// Reset the collectors between cases so each case observes
		// exactly one resync sample.
		ResetResourceResyncMetricsCollectors()
		t.Run(c.name, func(t *testing.T) {
			ctx, cancel := context.WithCancel(context.Background())

			if c.rescType == testSpecResync {
				// Build a source client over an in-process (gochan) transport.
				sourceOptions := fake.NewSourceOptions(gochan.New(), c.sourceID)
				lister := newMockResourceLister([]*mockResource{}...)
				source, err := NewCloudEventSourceClient[*mockResource](ctx, sourceOptions, lister, statusHash, newMockResourceCodec())
				require.NoError(t, err)

				// Craft a spec resync request event as the agent would send it.
				eventType := types.CloudEventsType{
					CloudEventsDataType: c.dataType,
					SubResource:         types.SubResourceSpec,
					Action:              types.ResyncRequestAction,
				}
				evt := cloudevents.NewEvent()
				evt.SetType(eventType.String())
				evt.SetExtension("clustername", c.clusterName)
				if err := evt.SetData(cloudevents.ApplicationJSON, &payload.ResourceStatusHashList{}); err != nil {
					t.Errorf("failed to set data for event: %v", err)
				}

				// Deliver the resync request; the client responds asynchronously.
				source.receive(ctx, evt)
				// Wait 2 seconds for the resync request to be handled and the
				// duration to be observed. NOTE(review): a fixed sleep is
				// timing-dependent and may flake on slow machines.
				time.Sleep(2 * time.Second)

				// The spec resync duration histogram should hold exactly one
				// sub-second observation for this (source, cluster, type).
				h := resourceSpecResyncDurationMetric.WithLabelValues(c.sourceID, c.clusterName, mockEventDataType.String())
				count, sum := toFloat64HistCountAndSum(h)
				require.Equal(t, uint64(1), count)
				require.Greater(t, sum, 0.0)
				require.Less(t, sum, 1.0)
			}

			if c.rescType == testStatusResync {
				// Build an agent client over an in-process (gochan) transport.
				agentOptions := fake.NewAgentOptions(gochan.New(), c.clusterName, testAgentName)
				lister := newMockResourceLister([]*mockResource{}...)
				agent, err := NewCloudEventAgentClient[*mockResource](ctx, agentOptions, lister, statusHash, newMockResourceCodec())
				require.NoError(t, err)

				// Craft a status resync request event as the source would send it.
				eventType := types.CloudEventsType{
					CloudEventsDataType: c.dataType,
					SubResource:         types.SubResourceStatus,
					Action:              types.ResyncRequestAction,
				}
				evt := cloudevents.NewEvent()
				evt.SetType(eventType.String())
				evt.SetSource(c.sourceID)
				if err := evt.SetData(cloudevents.ApplicationJSON, &payload.ResourceStatusHashList{}); err != nil {
					t.Errorf("failed to set data for event: %v", err)
				}

				// Deliver the resync request; the client responds asynchronously.
				agent.receive(ctx, evt)
				// Wait 1 second for the resync request to be handled and the
				// duration to be observed. NOTE(review): a fixed sleep is
				// timing-dependent and may flake on slow machines.
				time.Sleep(1 * time.Second)

				// The status resync duration histogram should hold exactly one
				// sub-second observation for this (source, cluster, type).
				h := resourceStatusResyncDurationMetric.WithLabelValues(c.sourceID, c.clusterName, mockEventDataType.String())
				count, sum := toFloat64HistCountAndSum(h)
				require.Equal(t, uint64(1), count)
				require.Greater(t, sum, 0.0)
				require.Less(t, sum, 1.0)
			}

			cancel()
		})
	}
}

// toFloat64HistCountAndSum returns the sample count and sample sum of the
// histogram behind the given observer. It panics if the observer is not a
// collector, if collecting yields anything other than exactly one metric,
// or if that metric is not a histogram — all of which indicate test bugs.
func toFloat64HistCountAndSum(h prometheus.Observer) (uint64, float64) {
	collector, ok := h.(prometheus.Collector)
	if !ok {
		panic(fmt.Errorf("observer is not a collector; got: %T", h))
	}

	metricCh := make(chan prometheus.Metric)
	done := make(chan struct{})

	var (
		collected prometheus.Metric
		total     int
	)
	// Drain the channel in a separate goroutine: Collect blocks on an
	// unbuffered channel, so the receiver must already be running.
	go func() {
		defer close(done)
		for metric := range metricCh {
			collected = metric
			total++
		}
	}()

	collector.Collect(metricCh)
	close(metricCh)
	<-done

	if total != 1 {
		panic(fmt.Errorf("collected %d metrics instead of exactly 1", total))
	}

	out := &dto.Metric{}
	if err := collected.Write(out); err != nil {
		panic(fmt.Errorf("metric write failed, err=%v", err))
	}

	if out.Histogram == nil {
		panic(fmt.Errorf("collected a non-histogram metric: %s", out))
	}
	return out.Histogram.GetSampleCount(), out.Histogram.GetSampleSum()
}

0 comments on commit fa553cd

Please sign in to comment.