diff --git a/controllers/incarnation.go b/controllers/incarnation.go index ae140c41..2ff9515a 100644 --- a/controllers/incarnation.go +++ b/controllers/incarnation.go @@ -426,6 +426,25 @@ func (i *Incarnation) deleteCanaryRules(ctx context.Context) error { }) } +func (i *Incarnation) syncDeploymentRules(ctx context.Context) error { + return i.controller.applyPlan(ctx, "Sync Deployment Rules", &rmplan.SyncDeploymentRules{ + App: i.appName(), + Namespace: i.targetNamespace(), + Tag: i.tag, + Labels: i.defaultLabels(), + ServiceLevelObjectiveLabels: i.target().ServiceLevelObjectiveLabels, + ServiceLevelObjectives: i.target().SlothServiceLevelObjectives, + }) +} + +func (i *Incarnation) deleteDeploymentRules(ctx context.Context) error { + return i.controller.applyPlan(ctx, "Delete Deployment Rules", &rmplan.DeleteDeploymentRules{ + App: i.appName(), + Namespace: i.targetNamespace(), + Tag: i.tag, + }) +} + func (i *Incarnation) syncTaggedServiceLevels(ctx context.Context) error { if i.picchuConfig.ServiceLevelsNamespace != "" { // Account for a fleet other than Delivery (old way of configuring SLOs) and Production (the only other place we ideally want SLOs to go) diff --git a/controllers/mock_deployment.go b/controllers/mock_deployment.go index 3bb379b6..c5a27a07 100644 --- a/controllers/mock_deployment.go +++ b/controllers/mock_deployment.go @@ -77,6 +77,20 @@ func (mr *MockDeploymentMockRecorder) deleteCanaryRules(arg0 interface{}) *gomoc return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "deleteCanaryRules", reflect.TypeOf((*MockDeployment)(nil).deleteCanaryRules), arg0) } +// deleteDeploymentRules mocks base method +func (m *MockDeployment) deleteDeploymentRules(arg0 context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "deleteDeploymentRules", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// deleteDeploymentRules indicates an expected call of deleteDeploymentRules +func (mr *MockDeploymentMockRecorder) deleteDeploymentRules(arg0 
interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "deleteDeploymentRules", reflect.TypeOf((*MockDeployment)(nil).deleteDeploymentRules), arg0) +} + // deleteTaggedServiceLevels mocks base method func (m *MockDeployment) deleteTaggedServiceLevels(arg0 context.Context) error { m.ctrl.T.Helper() @@ -313,6 +327,20 @@ func (mr *MockDeploymentMockRecorder) syncCanaryRules(arg0 interface{}) *gomock. return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "syncCanaryRules", reflect.TypeOf((*MockDeployment)(nil).syncCanaryRules), arg0) } +// syncDeploymentRules mocks base method +func (m *MockDeployment) syncDeploymentRules(arg0 context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "syncDeploymentRules", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// syncDeploymentRules indicates an expected call of syncDeploymentRules +func (mr *MockDeploymentMockRecorder) syncDeploymentRules(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "syncDeploymentRules", reflect.TypeOf((*MockDeployment)(nil).syncDeploymentRules), arg0) +} + // syncTaggedServiceLevels mocks base method func (m *MockDeployment) syncTaggedServiceLevels(arg0 context.Context) error { m.ctrl.T.Helper() diff --git a/controllers/plan/deleteDeploymentRules.go b/controllers/plan/deleteDeploymentRules.go new file mode 100644 index 00000000..b51e5a8b --- /dev/null +++ b/controllers/plan/deleteDeploymentRules.go @@ -0,0 +1,50 @@ +package plan + +import ( + "context" + + "github.com/go-logr/logr" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + picchuv1alpha1 "go.medium.engineering/picchu/api/v1alpha1" + "go.medium.engineering/picchu/plan" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type DeleteDeploymentRules struct { + App string + Namespace string + Tag 
string +} + +func (p *DeleteDeploymentRules) Apply(ctx context.Context, cli client.Client, cluster *picchuv1alpha1.Cluster, log logr.Logger) error { + prlist := &monitoringv1.PrometheusRuleList{} + + opts := &client.ListOptions{ + Namespace: p.Namespace, + LabelSelector: labels.SelectorFromSet(map[string]string{ + picchuv1alpha1.LabelApp: p.App, + picchuv1alpha1.LabelTag: p.Tag, + picchuv1alpha1.LabelRuleType: RuleTypeDeployment, + }), + } + + if err := cli.List(ctx, prlist, opts); err != nil { + log.Error(err, "Failed to delete DeploymentRules") + return err + } + + for _, prometheusRule := range prlist.Items { + err := cli.Delete(ctx, prometheusRule) + if err != nil && !errors.IsNotFound(err) { + plan.LogSync(log, "deleted", err, prometheusRule) + return err + } + if err == nil { + plan.LogSync(log, "deleted", err, prometheusRule) + } + } + + return nil +} diff --git a/controllers/plan/syncCanaryRules.go b/controllers/plan/syncCanaryRules.go index 321780ef..cf285e4b 100644 --- a/controllers/plan/syncCanaryRules.go +++ b/controllers/plan/syncCanaryRules.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "strconv" - "strings" "github.com/go-logr/logr" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" @@ -78,16 +77,6 @@ func (p *SyncCanaryRules) prometheusRules(log logr.Logger) (*monitoringv1.Promet } } - config := SLOConfig{ - App: p.App, - Tag: p.Tag, - Labels: p.ServiceLevelObjectiveLabels, - } - helperRules := config.helperRules(log) - for _, rg := range helperRules { - rule.Spec.Groups = append(rule.Spec.Groups, *rg) - } - prs = append(prs, rule) prl.Items = prs return prl, nil @@ -114,53 +103,6 @@ func (p *SyncCanaryRules) prometheusRule() *monitoringv1.PrometheusRule { } } -// create CrashLoopBackOff and ImagePullBackOff for new deployment pods -func (s *SLOConfig) helperRules(log logr.Logger) []*monitoringv1.RuleGroup { - ruleGroups := []*monitoringv1.RuleGroup{} - - crashLoopLabels := s.canaryRuleLabels() - for k, v := range 
s.Labels.AlertLabels { - crashLoopLabels[k] = v - } - - crashLoopRuleGroup := &monitoringv1.RuleGroup{ - Name: s.crashLoopAlertName(), - Rules: []monitoringv1.Rule{ - { - Alert: s.crashLoopAlertName(), - For: monitoringv1.Duration("1m"), - Expr: intstr.FromString(s.crashLoopQuery(log)), - Labels: crashLoopLabels, - Annotations: s.crashLoopRuleAnnotations(log), - }, - }, - } - - ruleGroups = append(ruleGroups, crashLoopRuleGroup) - - imagePullBackOffLabels := s.canaryRuleLabels() - for k, v := range s.Labels.AlertLabels { - imagePullBackOffLabels[k] = v - } - - imagePullBackOffRuleGroup := &monitoringv1.RuleGroup{ - Name: s.imagePullBackOffAlertName(), - Rules: []monitoringv1.Rule{ - { - Alert: s.imagePullBackOffAlertName(), - For: monitoringv1.Duration("1m"), - Expr: intstr.FromString(s.imagePullBackOffQuery(log)), - Labels: imagePullBackOffLabels, - Annotations: s.imagePullBackOffAnnotations(log), - }, - }, - } - - ruleGroups = append(ruleGroups, imagePullBackOffRuleGroup) - - return ruleGroups -} - func (s *SLOConfig) canaryRules(log logr.Logger) []*monitoringv1.RuleGroup { ruleGroups := []*monitoringv1.RuleGroup{} @@ -209,20 +151,6 @@ func (s *SLOConfig) canaryRuleAnnotations(log logr.Logger) map[string]string { } } -func (s *SLOConfig) crashLoopRuleAnnotations(log logr.Logger) map[string]string { - return map[string]string{ - CanarySummaryAnnotation: fmt.Sprintf("%s - Canary is failing CrashLoopBackOff SLO - there is at least one pod in state `CrashLoopBackOff`", s.App), - CanaryMessageAnnotation: "There is at least one pod in state `CrashLoopBackOff`", - } -} - -func (s *SLOConfig) imagePullBackOffAnnotations(log logr.Logger) map[string]string { - return map[string]string{ - CanarySummaryAnnotation: fmt.Sprintf("%s - Canary is failing ImagePullBackOff SLO - there is at least one pod in state `ImagePullBackOff`", s.App), - CanaryMessageAnnotation: "There is at least one pod in state `ImagePullBackOff`", - } -} - func (p *SyncCanaryRules) canaryRuleName() string 
{ return fmt.Sprintf("%s-canary-%s", p.App, p.Tag) } @@ -231,17 +159,6 @@ func (s *SLOConfig) canaryAlertName() string { return fmt.Sprintf("%s_canary", s.Name) } -func (s *SLOConfig) crashLoopAlertName() string { - // per app not soo - name := strings.Replace(s.App, "-", "_", -1) - return fmt.Sprintf("%s_canary_crashloop", name) -} - -func (s *SLOConfig) imagePullBackOffAlertName() string { - name := strings.Replace(s.App, "-", "_", -1) - return fmt.Sprintf("%s_canary_imagepullbackoff", name) -} - func (s *SLOConfig) canaryQuery(log logr.Logger) string { return fmt.Sprintf("%s{%s=\"%s\"} / %s{%s=\"%s\"} - %v > ignoring(%s) sum(%s) / sum(%s)", s.errorQuery(), s.SLO.ServiceLevelIndicator.TagKey, s.Tag, @@ -250,19 +167,6 @@ func (s *SLOConfig) canaryQuery(log logr.Logger) string { ) } -func (s *SLOConfig) crashLoopQuery(log logr.Logger) string { - // pod=~"main-20240109-181554-cba9e8cbbf-.*" - return fmt.Sprintf("sum by (reason) (kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", container=\"%s\", pod=~\"%s-.*\"}) > 0", - s.App, s.Tag, - ) -} - -func (s *SLOConfig) imagePullBackOffQuery(log logr.Logger) string { - return fmt.Sprintf("sum by (reason) (kube_pod_container_status_waiting_reason{reason=\"ImagePullBackOff\", container=\"%s\", pod=~\"%s-.*\"}) > 0", - s.App, s.Tag, - ) -} - func (s *SLOConfig) formatAllowancePercent(log logr.Logger) string { allowancePercent := s.SLO.ServiceLevelIndicator.Canary.AllowancePercent if s.SLO.ServiceLevelIndicator.Canary.AllowancePercentString != "" { diff --git a/controllers/plan/syncCanaryRules_test.go b/controllers/plan/syncCanaryRules_test.go index 3d4df9b9..0e70dfdc 100644 --- a/controllers/plan/syncCanaryRules_test.go +++ b/controllers/plan/syncCanaryRules_test.go @@ -147,50 +147,6 @@ var ( }, }, }, - { - Name: "test_app_canary_crashloop", - Rules: []monitoringv1.Rule{ - { - Alert: "test_app_canary_crashloop", - Expr: intstr.FromString("sum by (reason) 
(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", container=\"test-app\", pod=~\"tag-.*\"}) > 0"), - For: "1m", - Annotations: map[string]string{ - CanaryMessageAnnotation: "There is at least one pod in state `CrashLoopBackOff`", - CanarySummaryAnnotation: "test-app - Canary is failing CrashLoopBackOff SLO - there is at least one pod in state `CrashLoopBackOff`", - }, - Labels: map[string]string{ - CanaryAppLabel: "test-app", - CanaryTagLabel: "tag", - CanaryLabel: "true", - CanarySLOLabel: "true", - "severity": "test", - "channel": "#eng-releases", - }, - }, - }, - }, - { - Name: "test_app_canary_imagepullbackoff", - Rules: []monitoringv1.Rule{ - { - Alert: "test_app_canary_imagepullbackoff", - Expr: intstr.FromString("sum by (reason) (kube_pod_container_status_waiting_reason{reason=\"ImagePullBackOff\", container=\"test-app\", pod=~\"tag-.*\"}) > 0"), - For: "1m", - Annotations: map[string]string{ - CanaryMessageAnnotation: "There is at least one pod in state `ImagePullBackOff`", - CanarySummaryAnnotation: "test-app - Canary is failing ImagePullBackOff SLO - there is at least one pod in state `ImagePullBackOff`", - }, - Labels: map[string]string{ - CanaryAppLabel: "test-app", - CanaryTagLabel: "tag", - CanaryLabel: "true", - CanarySLOLabel: "true", - "severity": "test", - "channel": "#eng-releases", - }, - }, - }, - }, }, }, }, diff --git a/controllers/plan/syncDeploymentRules.go b/controllers/plan/syncDeploymentRules.go new file mode 100644 index 00000000..842e852b --- /dev/null +++ b/controllers/plan/syncDeploymentRules.go @@ -0,0 +1,198 @@ +package plan + +import ( + "context" + "fmt" + "strings" + + "github.com/go-logr/logr" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + picchuv1alpha1 "go.medium.engineering/picchu/api/v1alpha1" + "go.medium.engineering/picchu/plan" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + 
"sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + DeploymentAppLabel = "app" + DeploymentTagLabel = "tag" + // should be labeled as an deployment slo - not canary + DeploymentSLOLabel = "deployment" + // CanaryLabel = "canary" + DeploymentChannelLabel = "channel" + + DeploymentSummaryAnnotation = "summary" + DeploymentMessageAnnotation = "message" + + // RuleTypeDeployment is the value of the LabelRuleType label for deployment rules. + RuleTypeDeployment = "deployment" +) + +type SyncDeploymentRules struct { + App string + Namespace string + Tag string + Labels map[string]string + ServiceLevelObjectiveLabels picchuv1alpha1.ServiceLevelObjectiveLabels + ServiceLevelObjectives []*picchuv1alpha1.SlothServiceLevelObjective +} + +func (p *SyncDeploymentRules) Apply(ctx context.Context, cli client.Client, cluster *picchuv1alpha1.Cluster, log logr.Logger) error { + prometheusRules, err := p.prometheusRules(log) + if err != nil { + return err + } + + if len(prometheusRules.Items) > 0 { + for _, pr := range prometheusRules.Items { + if err := plan.CreateOrUpdate(ctx, log, cli, pr); err != nil { + return err + } + } + } + + return nil +} + +func (p *SyncDeploymentRules) prometheusRules(log logr.Logger) (*monitoringv1.PrometheusRuleList, error) { + prl := &monitoringv1.PrometheusRuleList{} + prs := []*monitoringv1.PrometheusRule{} + + rule := p.prometheusRule() + // not looking at slos from -slo prometheusRule - creating 2 new slos + config := SLOConfig{ + App: p.App, + Tag: p.Tag, + Labels: p.ServiceLevelObjectiveLabels, + } + helperRules := config.helperRules(log) + for _, rg := range helperRules { + rule.Spec.Groups = append(rule.Spec.Groups, *rg) + } + + prs = append(prs, rule) + prl.Items = prs + return prl, nil +} + +func (p *SyncDeploymentRules) prometheusRule() *monitoringv1.PrometheusRule { + labels := make(map[string]string) + + for k, v := range p.Labels { + labels[k] = v + } + // for k, v := range p.ServiceLevelObjectiveLabels.RuleLabels { + // labels[k] 
= v + // } + + labels[picchuv1alpha1.LabelRuleType] = RuleTypeDeployment + + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: p.deploymentRuleName(), + Namespace: p.Namespace, + Labels: labels, + }, + } +} + +// create CrashLoopBackOff and ImagePullBackOff for new deployment pods +func (s *SLOConfig) helperRules(log logr.Logger) []*monitoringv1.RuleGroup { + ruleGroups := []*monitoringv1.RuleGroup{} + + crashLoopLabels := s.deploymentRuleLabels() + for k, v := range s.Labels.AlertLabels { + crashLoopLabels[k] = v + } + + crashLoopRuleGroup := &monitoringv1.RuleGroup{ + Name: s.crashLoopAlertName(), + Rules: []monitoringv1.Rule{ + { + Alert: s.crashLoopAlertName(), + For: monitoringv1.Duration("1m"), + Expr: intstr.FromString(s.crashLoopQuery(log)), + Labels: crashLoopLabels, + Annotations: s.crashLoopRuleAnnotations(log), + }, + }, + } + + ruleGroups = append(ruleGroups, crashLoopRuleGroup) + + imagePullBackOffLabels := s.deploymentRuleLabels() + for k, v := range s.Labels.AlertLabels { + imagePullBackOffLabels[k] = v + } + + imagePullBackOffRuleGroup := &monitoringv1.RuleGroup{ + Name: s.imagePullBackOffAlertName(), + Rules: []monitoringv1.Rule{ + { + Alert: s.imagePullBackOffAlertName(), + For: monitoringv1.Duration("1m"), + Expr: intstr.FromString(s.imagePullBackOffQuery(log)), + Labels: imagePullBackOffLabels, + Annotations: s.imagePullBackOffAnnotations(log), + }, + }, + } + + ruleGroups = append(ruleGroups, imagePullBackOffRuleGroup) + + return ruleGroups +} + +// this needs to be included in the slos +func (s *SLOConfig) deploymentRuleLabels() map[string]string { + return map[string]string{ + DeploymentAppLabel: s.App, + DeploymentTagLabel: s.Tag, + // CanaryLabel: "true", NO CANARY! 
general slo + DeploymentSLOLabel: "true", + DeploymentChannelLabel: "#eng-releases", + } +} + +func (s *SLOConfig) crashLoopRuleAnnotations(log logr.Logger) map[string]string { + return map[string]string{ + DeploymentSummaryAnnotation: fmt.Sprintf("%s - Deployment is failing CrashLoopBackOff SLO - there is at least one pod in state `CrashLoopBackOff`", s.App), + DeploymentMessageAnnotation: "There is at least one pod in state `CrashLoopBackOff`", + } +} + +func (s *SLOConfig) imagePullBackOffAnnotations(log logr.Logger) map[string]string { + return map[string]string{ + DeploymentSummaryAnnotation: fmt.Sprintf("%s - Deployment is failing ImagePullBackOff SLO - there is at least one pod in state `ImagePullBackOff`", s.App), + DeploymentMessageAnnotation: "There is at least one pod in state `ImagePullBackOff`", + } +} + +func (p *SyncDeploymentRules) deploymentRuleName() string { + return fmt.Sprintf("%s-deployment-%s", p.App, p.Tag) +} + +func (s *SLOConfig) crashLoopAlertName() string { + // per app, not per SLO + name := strings.Replace(s.App, "-", "_", -1) + return fmt.Sprintf("%s_deployment_crashloop", name) +} + +func (s *SLOConfig) imagePullBackOffAlertName() string { + name := strings.Replace(s.App, "-", "_", -1) + return fmt.Sprintf("%s_deployment_imagepullbackoff", name) +} + +func (s *SLOConfig) crashLoopQuery(log logr.Logger) string { + // pod=~"main-20240109-181554-cba9e8cbbf-.*" + return fmt.Sprintf("sum by (reason) (kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", container=\"%s\", pod=~\"%s-.*\"}) > 0", + s.App, s.Tag, + ) +} + +func (s *SLOConfig) imagePullBackOffQuery(log logr.Logger) string { + return fmt.Sprintf("sum by (reason) (kube_pod_container_status_waiting_reason{reason=\"ImagePullBackOff\", container=\"%s\", pod=~\"%s-.*\"}) > 0", + s.App, s.Tag, + ) +} diff --git a/controllers/plan/syncDeploymntRules_test.go b/controllers/plan/syncDeploymntRules_test.go new file mode 100644 index 00000000..ef737eef --- /dev/null +++ 
b/controllers/plan/syncDeploymntRules_test.go @@ -0,0 +1,160 @@ +package plan + +import ( + "context" + _ "runtime" + "testing" + + picchuv1alpha1 "go.medium.engineering/picchu/api/v1alpha1" + "go.medium.engineering/picchu/mocks" + common "go.medium.engineering/picchu/plan/test" + "go.medium.engineering/picchu/test" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/golang/mock/gomock" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/stretchr/testify/assert" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/intstr" +) + +var ( + dpplan = &SyncDeploymentRules{ + App: "test-app", + Namespace: "testnamespace", + Tag: "tag", + Labels: map[string]string{ + picchuv1alpha1.LabelApp: "test-app", + picchuv1alpha1.LabelTag: "v1", + picchuv1alpha1.LabelK8sName: "test-app", + picchuv1alpha1.LabelK8sVersion: "v1", + }, + ServiceLevelObjectiveLabels: picchuv1alpha1.ServiceLevelObjectiveLabels{ + AlertLabels: map[string]string{ + "severity": "test", + }, + }, + ServiceLevelObjectives: []*picchuv1alpha1.SlothServiceLevelObjective{ + { + Enabled: true, + Name: "test-app-availability", + Objective: "99.999", + Description: "Test description", + ServiceLevelIndicator: picchuv1alpha1.ServiceLevelIndicator{ + Canary: picchuv1alpha1.SLICanaryConfig{ + Enabled: true, + AllowancePercent: 1, + FailAfter: "1m", + }, + TagKey: "destination_workload", + AlertAfter: "1m", + ErrorQuery: "sum(rate(test_metric{job=\"test\"}[2m])) by (destination_workload)", + TotalQuery: "sum(rate(test_metric2{job=\"test\"}[2m])) by (destination_workload)", + }, + ServiceLevelObjectiveLabels: picchuv1alpha1.ServiceLevelObjectiveLabels{ + AlertLabels: map[string]string{ + "team": "test", + }, + }, + }, + }, + } + + dpexpected = monitoringv1.PrometheusRuleList{ + Items: []*monitoringv1.PrometheusRule{{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-app-deployment-tag", + Namespace: 
"testnamespace", + Labels: map[string]string{ + picchuv1alpha1.LabelApp: "test-app", + picchuv1alpha1.LabelTag: "v1", + picchuv1alpha1.LabelK8sName: "test-app", + picchuv1alpha1.LabelK8sVersion: "v1", + picchuv1alpha1.LabelRuleType: RuleTypeDeployment, + }, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test_app_deployment_crashloop", + Rules: []monitoringv1.Rule{ + { + Alert: "test_app_deployment_crashloop", + Expr: intstr.FromString("sum by (reason) (kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", container=\"test-app\", pod=~\"tag-.*\"}) > 0"), + For: "1m", + Annotations: map[string]string{ + DeploymentMessageAnnotation: "There is at least one pod in state `CrashLoopBackOff`", + DeploymentSummaryAnnotation: "test-app - Deployment is failing CrashLoopBackOff SLO - there is at least one pod in state `CrashLoopBackOff`", + }, + Labels: map[string]string{ + DeploymentAppLabel: "test-app", + DeploymentTagLabel: "tag", + DeploymentSLOLabel: "true", + "severity": "test", + "channel": "#eng-releases", + }, + }, + }, + }, + { + Name: "test_app_deployment_imagepullbackoff", + Rules: []monitoringv1.Rule{ + { + Alert: "test_app_deployment_imagepullbackoff", + Expr: intstr.FromString("sum by (reason) (kube_pod_container_status_waiting_reason{reason=\"ImagePullBackOff\", container=\"test-app\", pod=~\"tag-.*\"}) > 0"), + For: "1m", + Annotations: map[string]string{ + DeploymentMessageAnnotation: "There is at least one pod in state `ImagePullBackOff`", + DeploymentSummaryAnnotation: "test-app - Deployment is failing ImagePullBackOff SLO - there is at least one pod in state `ImagePullBackOff`", + }, + Labels: map[string]string{ + DeploymentAppLabel: "test-app", + DeploymentTagLabel: "tag", + DeploymentSLOLabel: "true", + "severity": "test", + "channel": "#eng-releases", + }, + }, + }, + }, + }, + }, + }, + }, + } +) + +func TestSyncDeploymentRules(t *testing.T) { + log := test.MustNewLogger() + ctrl := 
gomock.NewController(t) + m := mocks.NewMockClient(ctrl) + defer ctrl.Finish() + + tests := []client.ObjectKey{ + {Name: "test-app-deployment-tag", Namespace: "testnamespace"}, + } + ctx := context.TODO() + + for i := range tests { + m. + EXPECT(). + Get(ctx, mocks.ObjectKey(tests[i]), gomock.Any()). + Return(common.NotFoundError). + Times(1) + } + + for i := range dpexpected.Items { + for _, obj := range []runtime.Object{ + dpexpected.Items[i], + } { + m. + EXPECT(). + Create(ctx, common.K8sEqual(obj)). + Return(nil). + Times(1) + } + } + + assert.NoError(t, dpplan.Apply(ctx, m, cluster, log), "Shouldn't return error.") +} diff --git a/controllers/state.go b/controllers/state.go index 8b6f7805..05ee344f 100644 --- a/controllers/state.go +++ b/controllers/state.go @@ -71,7 +71,9 @@ type Deployment interface { retire(context.Context) error del(context.Context) error syncCanaryRules(context.Context) error + syncDeploymentRules(context.Context) error deleteCanaryRules(context.Context) error + deleteDeploymentRules(context.Context) error syncTaggedServiceLevels(context.Context) error deleteTaggedServiceLevels(context.Context) error hasRevision() bool @@ -182,6 +184,9 @@ func Deploying(ctx context.Context, deployment Deployment, lastUpdated *time.Tim if HasFailed(deployment) { return failing, nil } + if err := deployment.syncDeploymentRules(ctx); err != nil { + return deploying, err + } if err := deployment.sync(ctx); err != nil { return deploying, err } @@ -405,6 +410,10 @@ func Deleting(ctx context.Context, deployment Deployment, lastUpdated *time.Time return deleting, err } + if err := deployment.deleteDeploymentRules(ctx); err != nil { + return deleting, err + } + if err := deployment.deleteTaggedServiceLevels(ctx); err != nil { return deleting, err } @@ -439,6 +448,9 @@ func Failing(ctx context.Context, deployment Deployment, lastUpdated *time.Time) if err := deployment.deleteCanaryRules(ctx); err != nil { return failing, err } + if err := 
deployment.deleteDeploymentRules(ctx); err != nil { + return failing, err + } if err := deployment.deleteTaggedServiceLevels(ctx); err != nil { return failing, err } @@ -490,6 +502,9 @@ func Canaried(ctx context.Context, deployment Deployment, lastUpdated *time.Time if err := deployment.deleteCanaryRules(ctx); err != nil { return canaried, err } + if err := deployment.deleteDeploymentRules(ctx); err != nil { + return canaried, err + } if err := deployment.sync(ctx); err != nil { return canarying, err } diff --git a/controllers/state_test.go b/controllers/state_test.go index 703692bf..826d97d9 100644 --- a/controllers/state_test.go +++ b/controllers/state_test.go @@ -66,10 +66,10 @@ func TestDeploying(t *tt.T) { testcase(failing, m(true, true, false), nil) testcase(failing, m(true, true, true), nil) - testcase(deploying, expectSync(m(true, false, false)), nil) - testcase(timingout, expectSync(m(true, false, false)), &old) + testcase(deploying, expectSync(expectSyncDeploymentRules(m(true, false, false))), nil) + testcase(timingout, expectSync(expectSyncDeploymentRules(m(true, false, false))), &old) - testcase(deployed, expectSync(m(true, false, true)), nil) + testcase(deployed, expectSync(expectSyncDeploymentRules(m(true, false, true))), nil) } func TestDeployed(t *tt.T) { @@ -446,8 +446,8 @@ func TestCanaried(t *tt.T) { testcase(deleting, m(false, false, true)) testcase(deleting, m(false, true, false)) testcase(deleting, m(false, true, true)) - testcase(canaried, expectSync(expectDeleteCanaryRules(m(true, false, false)))) - testcase(pendingrelease, expectSync(expectDeleteCanaryRules(m(true, false, true)))) + testcase(canaried, expectSync(expectDeleteDeploymentules(expectDeleteCanaryRules(m(true, false, false))))) + testcase(pendingrelease, expectSync(expectDeleteDeploymentules(expectDeleteCanaryRules(m(true, false, true))))) testcase(failing, m(true, true, false)) testcase(failing, m(true, true, true)) } @@ -658,7 +658,7 @@ func TestDeleting(t *tt.T) { 
testcase(deleting, m(false, 100)) testcase(deleting, m(false, 1)) - testcase(deleted, expectDeleteCanaryRules(expectDeleteTaggedServiceLevels(expectDelete(m(false, 0))))) + testcase(deleted, expectDeleteCanaryRules(expectDeleteDeploymentules(expectDeleteTaggedServiceLevels(expectDelete(m(false, 0)))))) testcase(deploying, m(true, 0)) testcase(deleting, m(true, 100)) testcase(deleting, m(true, 1)) @@ -715,10 +715,10 @@ func TestFailing(t *tt.T) { testcase(deploying, m(true, false, ExternalTestSucceeded, 0)) testcase(failing, m(true, false, ExternalTestSucceeded, 100)) - testcase(failed, expectDeleteCanaryRules(expectDeleteTaggedServiceLevels(expectRetire(m(true, true, ExternalTestDisabled, 0))))) + testcase(failed, expectDeleteCanaryRules(expectDeleteDeploymentules(expectDeleteTaggedServiceLevels(expectRetire(m(true, true, ExternalTestDisabled, 0)))))) testcase(failing, m(true, true, ExternalTestDisabled, 1)) testcase(failing, m(true, true, ExternalTestDisabled, 100)) - testcase(failed, expectDeleteCanaryRules(expectDeleteTaggedServiceLevels(expectRetire(m(true, false, ExternalTestFailed, 0))))) + testcase(failed, expectDeleteCanaryRules(expectDeleteDeploymentules(expectDeleteTaggedServiceLevels(expectRetire(m(true, false, ExternalTestFailed, 0)))))) testcase(failing, m(true, false, ExternalTestFailed, 1)) testcase(failing, m(true, false, ExternalTestFailed, 100)) } @@ -792,6 +792,8 @@ type responses struct { peakPercent uint32 syncCanaryRules error deleteCanaryRules error + syncDeploymentRules error + deleteDeploymentRules error syncTaggedServiceLevels error deleteTaggedServiceLevels error isTimingOut bool @@ -910,6 +912,24 @@ func expectDeleteCanaryRules(mock *MockDeployment) *MockDeployment { return mock } +func expectSyncDeploymentRules(mock *MockDeployment) *MockDeployment { + mock. + EXPECT(). + syncDeploymentRules(gomock.Any()). + Return(nil). + Times(1) + return mock +} + +func expectDeleteDeploymentules(mock *MockDeployment) *MockDeployment { + mock. 
+ EXPECT(). + deleteDeploymentRules(gomock.Any()). + Return(nil). + Times(1) + return mock +} + func expectSyncTaggedServiceLevels(mock *MockDeployment) *MockDeployment { mock. EXPECT(). diff --git a/prometheus/api.go b/prometheus/api.go index e7aa1016..53af05a4 100644 --- a/prometheus/api.go +++ b/prometheus/api.go @@ -22,6 +22,11 @@ var ( SLOFiringTemplate = template.Must(template. New("sloFiringAlerts"). Parse(`sum by({{.TagLabel}},app,alertname)(ALERTS{slo="true",alertstate="{{.AlertState}}"})`)) + + // this will be an alert - and we can include the deploying tag + DeploymentFiringTemplate = template.Must(template. + New("deploymentFiringAlerts"). + Parse(`sum by({{.TagLabel}},app,alertname)(ALERTS{deployment="true",alertstate="{{.AlertState}}"})`)) log = logf.Log.WithName("prometheus_alerts") ) @@ -112,11 +117,14 @@ func (a API) queryWithCache(ctx context.Context, query string, t time.Time) (mod // TaggedAlerts returns a set of tags that are firing SLO alerts for an app at a given time. func (a API) TaggedAlerts(ctx context.Context, query AlertQuery, t time.Time, canariesOnly bool) (map[string][]string, error) { + // canarying q := bytes.NewBufferString("") var template template.Template if canariesOnly { + // only slos with the canary label created by picchu template = *CanaryFiringTemplate } else { + // include both canary and generic slos template = *SLOFiringTemplate } if err := template.Execute(q, query); err != nil { @@ -146,10 +154,54 @@ func (a API) TaggedAlerts(ctx context.Context, query AlertQuery, t time.Time, ca return tagset, nil } +// TaggedDeploymentAlerts returns a set of tags that are firing SLO alerts for an app at a given time. 
+func (a API) TaggedDeploymentAlerts(ctx context.Context, query AlertQuery, t time.Time) (map[string][]string, error) { + q := bytes.NewBufferString("") + + // check for deployment query + var deployment_template = *DeploymentFiringTemplate + + if err := deployment_template.Execute(q, query); err != nil { + return nil, err + } + val, err_deployment := a.queryWithCache(ctx, q.String(), t) + if err_deployment != nil { + return nil, err_deployment + } + + tagset := map[string][]string{} + switch v := val.(type) { + case model.Vector: + for _, sample := range v { + metricTag := string(sample.Metric["tag"]) + if string(sample.Metric["app"]) == query.App && metricTag != "" { + if tagset[metricTag] == nil { + tagset[metricTag] = []string{} + } + tagset[metricTag] = append(tagset[metricTag], string(sample.Metric["alertname"])) + } + } + default: + return nil, fmt.Errorf("Unexpected prom response: %+v", v) + } + return tagset, nil +} + // IsRevisionTriggered returns the offending alerts if any SLO alerts are currently triggered for the app/tag pair. 
func (a API) IsRevisionTriggered(ctx context.Context, app, tag string, canariesOnly bool) (bool, []string, error) { q := NewAlertQuery(app, tag) + // check if deployment slos triggered + deployment_tags, err := a.TaggedDeploymentAlerts(ctx, q, time.Now()) + if err != nil { + return false, nil, err + } + + if alerts, ok := deployment_tags[tag]; ok && len(alerts) > 0 { + return true, alerts, nil + } + + // check if canary or generic slos triggered tags, err := a.TaggedAlerts(ctx, q, time.Now(), canariesOnly) if err != nil { return false, nil, err diff --git a/prometheus/api_test.go b/prometheus/api_test.go index 8a616d3e..b1560768 100644 --- a/prometheus/api_test.go +++ b/prometheus/api_test.go @@ -19,6 +19,9 @@ func TestPrometheusCache(t *testing.T) { sloTemplate := SLOFiringTemplate testAlertCache(t, *sloTemplate, false) + + deploymentTemplate := DeploymentFiringTemplate + testAlertCacheDeployment(t, *deploymentTemplate, false) } func TestPrometheusAlerts(t *testing.T) { @@ -27,6 +30,9 @@ func TestPrometheusAlerts(t *testing.T) { sloTemplate := SLOFiringTemplate testAlert(t, *sloTemplate, false) + + deploymentTemplate := DeploymentFiringTemplate + testDeploymentAlert(t, *deploymentTemplate, false) } func testAlertCache(t *testing.T, template template.Template, canariesOnly bool) { @@ -59,6 +65,36 @@ func testAlertCache(t *testing.T, template template.Template, canariesOnly bool) assert.Equal(t, map[string][]string{}, r, "Should get no firing alerts") } +func testAlertCacheDeployment(t *testing.T, template template.Template, canariesOnly bool) { + ctrl := gomock.NewController(t) + + defer ctrl.Finish() + + m := mocks.NewMockPromAPI(ctrl) + api := InjectAPI(m, time.Duration(25)*time.Millisecond) + + aq := NewAlertQuery("tutu", "v1") + + q := bytes.NewBufferString("") + assert.Nil(t, template.Execute(q, aq), "Template execute shouldn't fail") + + m. + EXPECT(). + Query(gomock.Any(), gomock.Eq(q.String()), gomock.Any()). + Return(model.Vector{}, nil, nil). 
+ Times(2) + + for i := 0; i < 5; i++ { + r, err := api.TaggedDeploymentAlerts(context.TODO(), aq, time.Now()) + assert.Nil(t, err, "Should succeed in querying alerts") + assert.Equal(t, map[string][]string{}, r, "Should get no firing alerts") + } + time.Sleep(time.Duration(25) * time.Millisecond) + r, err := api.TaggedDeploymentAlerts(context.TODO(), aq, time.Now()) + assert.Nil(t, err, "Should succeed in querying alerts") + assert.Equal(t, map[string][]string{}, r, "Should get no firing alerts") +} + func testAlert(t *testing.T, template template.Template, canariesOnly bool) { ctrl := gomock.NewController(t) @@ -113,3 +149,58 @@ func testAlert(t *testing.T, template template.Template, canariesOnly bool) { assert.Nil(t, err, "Should succeed in querying alerts") assert.Equal(t, map[string][]string{"v1": {"test"}}, r, "Should get 1 firing alerts (v1)") } + +func testDeploymentAlert(t *testing.T, template template.Template, canariesOnly bool) { + ctrl := gomock.NewController(t) + + defer ctrl.Finish() + + m := mocks.NewMockPromAPI(ctrl) + api := InjectAPI(m, time.Duration(25)*time.Millisecond) + + aq := NewAlertQuery("tutu", "v1") + + q := bytes.NewBufferString("") + assert.Nil(t, template.Execute(q, aq), "Template execute shouldn't fail") + + response := model.Vector{ + &model.Sample{ + Metric: map[model.LabelName]model.LabelValue{ + "app": "tutu", + "tag": "v1", + "alertname": "test", + }, + }, + &model.Sample{ + Metric: map[model.LabelName]model.LabelValue{ + "app": "tutu", + "xtag": "v2", + "alertname": "test", + }, + }, + &model.Sample{ + Metric: map[model.LabelName]model.LabelValue{ + "xapp": "tutu", + "tag": "v3", + "alertname": "test", + }, + }, + &model.Sample{ + Metric: map[model.LabelName]model.LabelValue{ + "app": "tutux", + "tag": "v4", + "alertname": "test", + }, + }, + } + + m. + EXPECT(). + Query(gomock.Any(), gomock.Eq(q.String()), gomock.Any()). + Return(response, nil, nil). 
+ Times(1) + + r, err := api.TaggedDeploymentAlerts(context.TODO(), aq, time.Now()) + assert.Nil(t, err, "Should succeed in querying alerts") + assert.Equal(t, map[string][]string{"v1": {"test"}}, r, "Should get 1 firing alerts (v1)") +}