From 9aeb5594a894c3eeac27a265b4a79b15286b57fd Mon Sep 17 00:00:00 2001 From: Duncan Watson Date: Tue, 31 Dec 2024 15:18:12 +0000 Subject: [PATCH 1/2] EES-5687 - removed base alert templates for CPU, memory and response times in favour of reusable configuration and direct calls to rejigged dynamicMetricAlert.bicep template which reduces boilerplate from callsites. --- .../shared/postgreSqlFlexibleServer.bicep | 2 +- .../appGateways/responseTimeAlert.bicep | 19 ---- .../appServicePlans/cpuPercentageAlert.bicep | 19 ---- .../memoryPercentageAlert.bicep | 19 ---- .../alerts/baseCpuPercentageAlert.bicep | 38 -------- .../alerts/baseMemoryPercentageAlert.bicep | 38 -------- .../alerts/baseResponseTimeAlert.bicep | 38 -------- .../public-api/components/alerts/config.bicep | 25 ++++++ .../containerApps/cpuPercentageAlert.bicep | 19 ---- .../containerApps/memoryPercentageAlert.bicep | 19 ---- .../containerApps/responseTimeAlert.bicep | 19 ---- .../alerts/dynamicMetricAlertNew.bicep | 87 +++++++++++++++++++ .../alerts/fileServices/latencyAlert.bicep | 32 ------- .../clientConnectionsWaitingAlert.bicep | 2 +- .../cpuPercentageAlert.bicep | 19 ---- .../components/alerts/resourceMetrics.bicep | 53 +++++++++++ .../alerts/storageAccounts/latencyAlert.bicep | 19 ---- .../public-api/components/alerts/types.bicep | 52 +++++++++++ .../public-api/components/appGateway.bicep | 9 +- .../components/appServicePlan.bicep | 16 +++- .../public-api/components/containerApp.bicep | 23 ++++- .../public-api/components/fileShare.bicep | 12 ++- .../components/postgresqlDatabase.bicep | 36 +++++--- .../components/storageAccount.bicep | 9 +- 24 files changed, 303 insertions(+), 321 deletions(-) delete mode 100644 infrastructure/templates/public-api/components/alerts/appGateways/responseTimeAlert.bicep delete mode 100644 infrastructure/templates/public-api/components/alerts/appServicePlans/cpuPercentageAlert.bicep delete mode 100644 
infrastructure/templates/public-api/components/alerts/appServicePlans/memoryPercentageAlert.bicep delete mode 100644 infrastructure/templates/public-api/components/alerts/baseCpuPercentageAlert.bicep delete mode 100644 infrastructure/templates/public-api/components/alerts/baseMemoryPercentageAlert.bicep delete mode 100644 infrastructure/templates/public-api/components/alerts/baseResponseTimeAlert.bicep create mode 100644 infrastructure/templates/public-api/components/alerts/config.bicep delete mode 100644 infrastructure/templates/public-api/components/alerts/containerApps/cpuPercentageAlert.bicep delete mode 100644 infrastructure/templates/public-api/components/alerts/containerApps/memoryPercentageAlert.bicep delete mode 100644 infrastructure/templates/public-api/components/alerts/containerApps/responseTimeAlert.bicep create mode 100644 infrastructure/templates/public-api/components/alerts/dynamicMetricAlertNew.bicep delete mode 100644 infrastructure/templates/public-api/components/alerts/fileServices/latencyAlert.bicep delete mode 100644 infrastructure/templates/public-api/components/alerts/postgreSqlFlexibleServers/cpuPercentageAlert.bicep create mode 100644 infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep delete mode 100644 infrastructure/templates/public-api/components/alerts/storageAccounts/latencyAlert.bicep diff --git a/infrastructure/templates/public-api/application/shared/postgreSqlFlexibleServer.bicep b/infrastructure/templates/public-api/application/shared/postgreSqlFlexibleServer.bicep index 4d9fddc329..6d8f1e6250 100644 --- a/infrastructure/templates/public-api/application/shared/postgreSqlFlexibleServer.bicep +++ b/infrastructure/templates/public-api/application/shared/postgreSqlFlexibleServer.bicep @@ -67,7 +67,7 @@ module postgreSqlServerModule '../../components/postgresqlDatabase.bicep' = { diskBandwidth: true diskIops: true memoryPercentage: true - alertGroupName: resourceNames.existingResources.alertsGroup + 
alertsGroupName: resourceNames.existingResources.alertsGroup } : null tagValues: tagValues } diff --git a/infrastructure/templates/public-api/components/alerts/appGateways/responseTimeAlert.bicep b/infrastructure/templates/public-api/components/alerts/appGateways/responseTimeAlert.bicep deleted file mode 100644 index b767c7551b..0000000000 --- a/infrastructure/templates/public-api/components/alerts/appGateways/responseTimeAlert.bicep +++ /dev/null @@ -1,19 +0,0 @@ -@description('Name of the resource that these alerts are being applied to.') -param resourceName string - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - -module alerts '../baseResponseTimeAlert.bicep' = { - name: '${resourceName}ResponseTimeAlertModule' - params: { - resourceName: resourceName - resourceType: 'Microsoft.Network/applicationGateways' - metricName: 'ApplicationGatewayTotalTime' - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/appServicePlans/cpuPercentageAlert.bicep b/infrastructure/templates/public-api/components/alerts/appServicePlans/cpuPercentageAlert.bicep deleted file mode 100644 index 63a12bf7e6..0000000000 --- a/infrastructure/templates/public-api/components/alerts/appServicePlans/cpuPercentageAlert.bicep +++ /dev/null @@ -1,19 +0,0 @@ -@description('Name of the resource that these alerts are being applied to.') -param resourceName string - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - -module alerts '../baseCpuPercentageAlert.bicep' = { - name: '${resourceName}CpuPercentageAlertModule' - params: { - resourceName: resourceName - resourceType: 'Microsoft.Web/serverfarms' - metricName: 'CpuPercentage' - 
alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/appServicePlans/memoryPercentageAlert.bicep b/infrastructure/templates/public-api/components/alerts/appServicePlans/memoryPercentageAlert.bicep deleted file mode 100644 index 090f8389f4..0000000000 --- a/infrastructure/templates/public-api/components/alerts/appServicePlans/memoryPercentageAlert.bicep +++ /dev/null @@ -1,19 +0,0 @@ -@description('Names of the resources that these alerts are being applied to.') -param resourceName string - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - -module alerts '../baseMemoryPercentageAlert.bicep' = { - name: '${resourceName}MemoryPercentageAlertModule' - params: { - resourceName: resourceName - resourceType: 'Microsoft.Web/serverfarms' - metricName: 'MemoryPercentage' - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/baseCpuPercentageAlert.bicep b/infrastructure/templates/public-api/components/alerts/baseCpuPercentageAlert.bicep deleted file mode 100644 index 27c45cf942..0000000000 --- a/infrastructure/templates/public-api/components/alerts/baseCpuPercentageAlert.bicep +++ /dev/null @@ -1,38 +0,0 @@ -import { Severity } from 'types.bicep' - -@description('Name of the resource that these alerts are being applied to.') -param resourceName string - -@description('Names of the resources that these alerts are being applied to.') -param resourceType string - -@description('Names of the resources that these alerts are being applied to.') -param metricName string - -@description('The alert severity.') -param severity Severity = 'Warning' - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the 
resource in Azure.') -param tagValues object - -module alerts 'dynamicMetricAlert.bicep' = { - name: '${resourceName}CpuPercentBaseAlertModule' - params: { - alertName: '${resourceName}-cpu-percentage' - resourceIds: [resourceId(resourceType, resourceName)] - resourceType: resourceType - query: { - metric: metricName - aggregation: 'Average' - operator: 'GreaterThan' - } - evaluationFrequency: 'PT5M' - windowSize: 'PT15M' - severity: severity - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/baseMemoryPercentageAlert.bicep b/infrastructure/templates/public-api/components/alerts/baseMemoryPercentageAlert.bicep deleted file mode 100644 index cfb412844f..0000000000 --- a/infrastructure/templates/public-api/components/alerts/baseMemoryPercentageAlert.bicep +++ /dev/null @@ -1,38 +0,0 @@ -import { Severity } from 'types.bicep' - -@description('Name of the resource that these alerts are being applied to.') -param resourceName string - -@description('Names of the resources that these alerts are being applied to.') -param resourceType string - -@description('Names of the resources that these alerts are being applied to.') -param metricName string - -@description('The alert severity.') -param severity Severity = 'Warning' - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - -module alerts 'dynamicMetricAlert.bicep' = { - name: '${resourceName}MemoryPercentBaseAlertModule' - params: { - alertName: '${resourceName}-memory-percentage' - resourceIds: [resourceId(resourceType, resourceName)] - resourceType: resourceType - query: { - metric: metricName - aggregation: 'Average' - operator: 'GreaterThan' - } - evaluationFrequency: 'PT5M' - windowSize: 'PT15M' - severity: severity - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git 
a/infrastructure/templates/public-api/components/alerts/baseResponseTimeAlert.bicep b/infrastructure/templates/public-api/components/alerts/baseResponseTimeAlert.bicep deleted file mode 100644 index 11fbfbe2b7..0000000000 --- a/infrastructure/templates/public-api/components/alerts/baseResponseTimeAlert.bicep +++ /dev/null @@ -1,38 +0,0 @@ -import { Severity } from 'types.bicep' - -@description('Name of the resource that these alerts are being applied to.') -param resourceName string - -@description('Names of the resources that these alerts are being applied to.') -param resourceType string - -@description('Names of the resources that these alerts are being applied to.') -param metricName string - -@description('The alert severity.') -param severity Severity = 'Warning' - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - -module alerts 'dynamicMetricAlert.bicep' = { - name: '${resourceName}ResponseTimeBaseAlertModule' - params: { - alertName: '${resourceName}-response-time' - resourceIds: [resourceId(resourceType, resourceName)] - resourceType: resourceType - query: { - metric: metricName - aggregation: 'Average' - operator: 'GreaterThan' - } - evaluationFrequency: 'PT5M' - windowSize: 'PT15M' - severity: severity - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/config.bicep b/infrastructure/templates/public-api/components/alerts/config.bicep new file mode 100644 index 0000000000..e99796e8b5 --- /dev/null +++ b/infrastructure/templates/public-api/components/alerts/config.bicep @@ -0,0 +1,25 @@ +var defaultDynamicAlertConfig = { + aggregation: 'Average' + operator: 'GreaterThan' + evaluationFrequency: 'PT5M' + windowSize: 'PT15M' + numberOfEvaluationPeriods: 5 + minFailingPeriodsToAlert: 5 + sensitivity: 'Low' + severity: 'Warning' +} + 
+@export() +var cpuPercentageConfig = union(defaultDynamicAlertConfig, { + nameSuffix: 'cpu-percentage' +}) + +@export() +var memoryPercentageConfig = union(defaultDynamicAlertConfig, { + nameSuffix: 'memory-percentage' +}) + +@export() +var responseTimeConfig = union(defaultDynamicAlertConfig, { + nameSuffix: 'response-time' +}) diff --git a/infrastructure/templates/public-api/components/alerts/containerApps/cpuPercentageAlert.bicep b/infrastructure/templates/public-api/components/alerts/containerApps/cpuPercentageAlert.bicep deleted file mode 100644 index 68031359eb..0000000000 --- a/infrastructure/templates/public-api/components/alerts/containerApps/cpuPercentageAlert.bicep +++ /dev/null @@ -1,19 +0,0 @@ -@description('Name of the resource that these alerts are being applied to.') -param resourceName string - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - -module alerts '../baseCpuPercentageAlert.bicep' = { - name: '${resourceName}CpuPercentageAlertModule' - params: { - resourceName: resourceName - resourceType: 'Microsoft.App/containerApps' - metricName: 'CpuPercentage' - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/containerApps/memoryPercentageAlert.bicep b/infrastructure/templates/public-api/components/alerts/containerApps/memoryPercentageAlert.bicep deleted file mode 100644 index 0c20eb1cb8..0000000000 --- a/infrastructure/templates/public-api/components/alerts/containerApps/memoryPercentageAlert.bicep +++ /dev/null @@ -1,19 +0,0 @@ -@description('Name of the resources that these alerts are being applied to.') -param resourceName string - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - 
-module alerts '../baseMemoryPercentageAlert.bicep' = { - name: '${resourceName}MemoryPercentageAlertModule' - params: { - resourceName: resourceName - resourceType: 'Microsoft.App/containerApps' - metricName: 'MemoryPercentage' - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/containerApps/responseTimeAlert.bicep b/infrastructure/templates/public-api/components/alerts/containerApps/responseTimeAlert.bicep deleted file mode 100644 index 979ee0759f..0000000000 --- a/infrastructure/templates/public-api/components/alerts/containerApps/responseTimeAlert.bicep +++ /dev/null @@ -1,19 +0,0 @@ -@description('Name of the resource that these alerts are being applied to.') -param resourceName string - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - -module alerts '../baseResponseTimeAlert.bicep' = { - name: '${resourceName}ResponseTimeAlertModule' - params: { - resourceName: resourceName - resourceType: 'Microsoft.App/containerApps' - metricName: 'ResponseTime' - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/dynamicMetricAlertNew.bicep b/infrastructure/templates/public-api/components/alerts/dynamicMetricAlertNew.bicep new file mode 100644 index 0000000000..5aa81d5caf --- /dev/null +++ b/infrastructure/templates/public-api/components/alerts/dynamicMetricAlertNew.bicep @@ -0,0 +1,87 @@ +import { + EvaluationFrequency + MetricName + DynamicMetricOperator + DynamicAlertConfig + ResourceType + TimeAggregation + WindowSize + Severity + Sensitivity + severityMapping +} from 'types.bicep' + +import { ResourceMetric } from 'resourceMetrics.bicep' + +@description('Name of the resource that this alert is being applied to.') +param resourceName string + +@description(''' +Optional 
id of the resource that this alert is being applied to, +if it cannot be looked up by the combination of resourceName and resourceType. +''') +param id string? + +@description('Resource type and metric name combination.') +param resourceMetric ResourceMetric + +@description('Configuration for this dynamic alert.') +param config DynamicAlertConfig + +@description(''' +An optional date that prevents machine learning algorithms from using metric data prior to this date in order to +calculate its dynamic threshold. +''') +param ignoreDataBefore string? + +@description('Name of the Alerts Group used to send alert messages.') +param alertsGroupName string + +@description('Tags with which to tag the resource in Azure.') +param tagValues object + +var severityLevel = severityMapping[config.severity] + +var resourceIds = [id != null ? id! : resourceId(resourceMetric.resourceType, resourceName)] + +resource alertsActionGroup 'Microsoft.Insights/actionGroups@2023-01-01' existing = { + name: alertsGroupName +} + +resource metricAlertRule 'Microsoft.Insights/metricAlerts@2018-03-01' = { + name: '${resourceName}-${config.nameSuffix}' + location: 'Global' + properties: { + enabled: true + scopes: resourceIds + severity: severityLevel + evaluationFrequency: config.evaluationFrequency + windowSize: config.windowSize + criteria: { + 'odata.type': 'Microsoft.Azure.Monitor.MultipleResourceMultipleMetricCriteria' + allOf: [ + { + criterionType: 'DynamicThresholdCriterion' + name: 'Metric1' + metricName: resourceMetric.metric + metricNamespace: resourceMetric.resourceType + timeAggregation: config.aggregation + operator: config.operator + alertSensitivity: config.sensitivity + skipMetricValidation: false + failingPeriods: { + minFailingPeriodsToAlert: config.minFailingPeriodsToAlert + numberOfEvaluationPeriods: config.numberOfEvaluationPeriods + } + ignoreDataBefore: ignoreDataBefore + } + ] + } + actions: [ + { + actionGroupId: alertsActionGroup.id + } + ] + } + tags: tagValues +} diff 
--git a/infrastructure/templates/public-api/components/alerts/fileServices/latencyAlert.bicep b/infrastructure/templates/public-api/components/alerts/fileServices/latencyAlert.bicep deleted file mode 100644 index 8c7198ead3..0000000000 --- a/infrastructure/templates/public-api/components/alerts/fileServices/latencyAlert.bicep +++ /dev/null @@ -1,32 +0,0 @@ -import { Severity } from '../types.bicep' - -@description('Name of the resource that these alerts are being applied to.') -param resourceName string - -@description('The alert severity.') -param severity Severity = 'Warning' - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - -module alerts '../dynamicMetricAlert.bicep' = { - name: '${resourceName}FsLatencyAlertModule' - params: { - alertName: '${resourceName}-fileservice-latency' - resourceIds: [resourceId('Microsoft.Storage/storageAccounts/fileServices', resourceName, 'default')] - resourceType: 'Microsoft.Storage/storageAccounts/fileServices' - query: { - metric: 'SuccessE2ELatency' - aggregation: 'Average' - operator: 'GreaterThan' - } - evaluationFrequency: 'PT1M' - windowSize: 'PT5M' - severity: severity - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/postgreSqlFlexibleServers/clientConnectionsWaitingAlert.bicep b/infrastructure/templates/public-api/components/alerts/postgreSqlFlexibleServers/clientConnectionsWaitingAlert.bicep index 55372c5cd2..e0049c28d7 100644 --- a/infrastructure/templates/public-api/components/alerts/postgreSqlFlexibleServers/clientConnectionsWaitingAlert.bicep +++ b/infrastructure/templates/public-api/components/alerts/postgreSqlFlexibleServers/clientConnectionsWaitingAlert.bicep @@ -13,7 +13,7 @@ param alertsGroupName string param tagValues object module alerts '../dynamicMetricAlert.bicep' = { - name: 
'${resourceName}ClientConnectionsWaitingAlertModule' + name: '${resourceName}ClientConnectionsAlertModule' params: { alertName: '${resourceName}-query-time' resourceIds: [resourceId('Microsoft.DBforPostgreSQL/flexibleServers', resourceName)] diff --git a/infrastructure/templates/public-api/components/alerts/postgreSqlFlexibleServers/cpuPercentageAlert.bicep b/infrastructure/templates/public-api/components/alerts/postgreSqlFlexibleServers/cpuPercentageAlert.bicep deleted file mode 100644 index 735aea36ba..0000000000 --- a/infrastructure/templates/public-api/components/alerts/postgreSqlFlexibleServers/cpuPercentageAlert.bicep +++ /dev/null @@ -1,19 +0,0 @@ -@description('Name of the resource that these alerts are being applied to.') -param resourceName string - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - -module alerts '../baseCpuPercentageAlert.bicep' = { - name: '${resourceName}CpuPercentageAlertModule' - params: { - resourceName: resourceName - resourceType: 'Microsoft.DBforPostgreSQL/flexibleServers' - metricName: 'cpu_percent' - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep b/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep new file mode 100644 index 0000000000..126aa8c15e --- /dev/null +++ b/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep @@ -0,0 +1,53 @@ +type AppGatewayMetric = { + resourceType: 'Microsoft.Network/applicationGateways' + metric: + | 'ApplicationGatewayTotalTime' +} + +type AppServicePlanMetric = { + resourceType: 'Microsoft.Web/serverfarms' + metric: + | 'CpuPercentage' + | 'MemoryPercentage' +} + +type ContainerAppMetric = { + resourceType: 'Microsoft.App/containerApps' + metric: + | 'CpuPercentage' + | 'MemoryPercentage' + | 
'ResponseTime' + | 'RestartCount' +} + +type FileServiceMetric = { + resourceType: 'Microsoft.Storage/storageAccounts/fileServices' + metric: + | 'SuccessE2ELatency' +} + +type PostgreSqlMetric = { + resourceType: 'Microsoft.DBforPostgreSQL/flexibleServers' + metric: + | 'cpu_percent' + | 'memory_percent' +} + +type StorageAccountMetric = { + resourceType: 'Microsoft.Storage/storageAccounts' + metric: + | 'SuccessE2ELatency' +} + + + +@export() +@discriminator('resourceType') +type ResourceMetric = +| AppServicePlanMetric +| AppGatewayMetric +| ContainerAppMetric +| ContainerAppMetric +| FileServiceMetric +| PostgreSqlMetric +| StorageAccountMetric diff --git a/infrastructure/templates/public-api/components/alerts/storageAccounts/latencyAlert.bicep b/infrastructure/templates/public-api/components/alerts/storageAccounts/latencyAlert.bicep deleted file mode 100644 index 583a5854fa..0000000000 --- a/infrastructure/templates/public-api/components/alerts/storageAccounts/latencyAlert.bicep +++ /dev/null @@ -1,19 +0,0 @@ -@description('Name of the resource that these alerts are being applied to.') -param resourceName string - -@description('Name of the Alerts Group used to send alert messages.') -param alertsGroupName string - -@description('Tags with which to tag the resource in Azure.') -param tagValues object - -module alerts '../baseResponseTimeAlert.bicep' = { - name: '${resourceName}LatencyAlertModule' - params: { - resourceName: resourceName - resourceType: 'Microsoft.Storage/storageAccounts' - metricName: 'SuccessE2ELatency' - alertsGroupName: alertsGroupName - tagValues: tagValues - } -} diff --git a/infrastructure/templates/public-api/components/alerts/types.bicep b/infrastructure/templates/public-api/components/alerts/types.bicep index 9d3c2ce587..87b1f278b7 100644 --- a/infrastructure/templates/public-api/components/alerts/types.bicep +++ b/infrastructure/templates/public-api/components/alerts/types.bicep @@ -93,3 +93,55 @@ type MetricName = | 'RestartCount' | 
'SuccessE2ELatency' | 'UnhealthyHostCount' + +@export() +type DynamicAlertConfig = { + @description('Suffix to append to the resource name in order to create an alert name.') + nameSuffix: string + + @description('The aggregation applied to the metric values within the specified time window.') + aggregation: TimeAggregation + + @description('The operator used to compare the aggregated metrics against the dynamic threshold.') + operator: DynamicMetricOperator + + @description(''' + The frequency with which this alert rule evaluates the metrics against the dynamic thresholds. + For instance, PT1M with a window size of PT5M will evaluate the past 5 minutes' worth of metric data + against the dynamic threshold every minute. + ''') + evaluationFrequency: EvaluationFrequency + + @description(''' + The timespan that is used to calculate the metric's value against the specified time aggregation. + For instance, PT5M with a time aggregation of "Average" will use 5 minutes of metric data to calculate + the average value, which is then compared to the dynamic threshold. + ''') + windowSize: WindowSize + + @description(''' + How many periods to look back over to count failing periods. Used in conjunction with "minFailingPeriodsToAlert". + As an example, if "numberOfEvaluationPeriods" is set to 5 and "evaluationFrequency" is set to every minute, the past + 5 evaluation results (one for each of the last 5 minutes) are looked at and each failure is counted up. + ''') + numberOfEvaluationPeriods: int + + @description(''' + How many of the "numberOfEvaluationPeriods" results need to have failed in order for this rule to fire. + For instance, if this rule is using the past 5 calculations (with "numberOfEvaluationPeriods" set to 5) to evaluate + whether or not to fire, "minFailingPeriodsToAlert" determines how many of those past 5 periods have to have failed + in order for this rule to fire.
If this was set to 3, 3 out of the 5 past calculations will have had to fail in + order for this rule to fire. + ''') + minFailingPeriodsToAlert: int + + @description(''' + How sensitive the alert is if a metric exceeds its dynamic threshold. + Low sensitivity means that this alert will fire only if the metric exceeds the threshold by a high degree. + High sensitivity means that this alert will fire if the metric exceeds the threshold to a much lower degree. + ''') + sensitivity: Sensitivity + + @description('The alert severity.') + severity: Severity +} diff --git a/infrastructure/templates/public-api/components/appGateway.bicep b/infrastructure/templates/public-api/components/appGateway.bicep index abb106a9cd..99856325d0 100644 --- a/infrastructure/templates/public-api/components/appGateway.bicep +++ b/infrastructure/templates/public-api/components/appGateway.bicep @@ -1,3 +1,5 @@ +import { responseTimeConfig } from 'alerts/config.bicep' + import { AppGatewayBackend AppGatewayRewriteSet @@ -292,10 +294,15 @@ module backendPoolsHealthAlert 'alerts/appGateways/backendPoolHealthAlert.bicep' } } -module responseTimeAlert 'alerts/appGateways/responseTimeAlert.bicep' = if (alerts != null && alerts!.responseTime) { +module responseTimeAlert 'alerts/dynamicMetricAlertNew.bicep' = if (alerts != null && alerts!.responseTime) { name: '${appGatewayName}ResponseTimeDeploy' params: { resourceName: appGatewayName + resourceMetric: { + resourceType: 'Microsoft.Network/applicationGateways' + metric: 'ApplicationGatewayTotalTime' + } + config: responseTimeConfig alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } diff --git a/infrastructure/templates/public-api/components/appServicePlan.bicep b/infrastructure/templates/public-api/components/appServicePlan.bicep index 09d2c272a1..f1883d0311 100644 --- a/infrastructure/templates/public-api/components/appServicePlan.bicep +++ b/infrastructure/templates/public-api/components/appServicePlan.bicep @@ -1,3 +1,5 @@ +import { 
cpuPercentageConfig, memoryPercentageConfig } from 'alerts/config.bicep' + @description('Specifies the App Service plan name') param planName string @@ -36,19 +38,29 @@ resource appServicePlan 'Microsoft.Web/serverfarms@2023-12-01' = { tags: tagValues } -module cpuPercentageAlert 'alerts/appServicePlans/cpuPercentageAlert.bicep' = if (alerts != null && alerts!.cpuPercentage) { +module cpuPercentageAlert 'alerts/dynamicMetricAlertNew.bicep' = if (alerts != null && alerts!.cpuPercentage) { name: '${planName}CpuPercentageDeploy' params: { resourceName: planName + resourceMetric: { + resourceType: 'Microsoft.Web/serverfarms' + metric: 'CpuPercentage' + } + config: cpuPercentageConfig alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } -module memoryPercentageAlert 'alerts/appServicePlans/memoryPercentageAlert.bicep' = if (alerts != null && alerts!.memoryPercentage) { +module memoryPercentageAlert 'alerts/dynamicMetricAlertNew.bicep' = if (alerts != null && alerts!.memoryPercentage) { name: '${planName}MemoryPercentageDeploy' params: { resourceName: planName + resourceMetric: { + resourceType: 'Microsoft.Web/serverfarms' + metric: 'MemoryPercentage' + } + config: memoryPercentageConfig alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } diff --git a/infrastructure/templates/public-api/components/containerApp.bicep b/infrastructure/templates/public-api/components/containerApp.bicep index 0d25e18f44..fb3836c5ef 100644 --- a/infrastructure/templates/public-api/components/containerApp.bicep +++ b/infrastructure/templates/public-api/components/containerApp.bicep @@ -1,3 +1,5 @@ +import { cpuPercentageConfig, memoryPercentageConfig, responseTimeConfig } from 'alerts/config.bicep' + import { EntraIdAuthentication } from '../types.bicep' @description('Specifies the location for all resources.') @@ -222,28 +224,43 @@ module containerAppRestartsAlert 'alerts/containerApps/restartsAlert.bicep' = if } } -module responseTimeAlert 
'alerts/containerApps/responseTimeAlert.bicep' = if (alerts != null && alerts!.responseTime) { +module responseTimeAlert 'alerts/dynamicMetricAlertNew.bicep' = if (alerts != null && alerts!.responseTime) { name: '${containerAppName}ResponseTimeDeploy' params: { resourceName: containerAppName + resourceMetric: { + resourceType: 'Microsoft.App/containerApps' + metric: 'ResponseTime' + } + config: responseTimeConfig alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } -module cpuPercentageAlert 'alerts/containerApps/cpuPercentageAlert.bicep' = if (alerts != null && alerts!.cpuPercentage) { +module cpuPercentageAlert 'alerts/dynamicMetricAlertNew.bicep' = if (alerts != null && alerts!.cpuPercentage) { name: '${containerAppName}CpuPercentageDeploy' params: { resourceName: containerAppName + resourceMetric: { + resourceType: 'Microsoft.App/containerApps' + metric: 'CpuPercentage' + } + config: cpuPercentageConfig alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } -module memoryPercentageAlert 'alerts/containerApps/memoryPercentageAlert.bicep' = if (alerts != null && alerts!.memoryPercentage) { +module memoryPercentageAlert 'alerts/dynamicMetricAlertNew.bicep' = if (alerts != null && alerts!.memoryPercentage) { name: '${containerAppName}MemoryPercentageDeploy' params: { resourceName: containerAppName + resourceMetric: { + resourceType: 'Microsoft.App/containerApps' + metric: 'MemoryPercentage' + } + config: memoryPercentageConfig alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } diff --git a/infrastructure/templates/public-api/components/fileShare.bicep b/infrastructure/templates/public-api/components/fileShare.bicep index 058591ea25..f8ebedfc8c 100644 --- a/infrastructure/templates/public-api/components/fileShare.bicep +++ b/infrastructure/templates/public-api/components/fileShare.bicep @@ -1,3 +1,5 @@ +import { responseTimeConfig } from 'alerts/config.bicep' + @description('Size in GB of the file share') param fileShareQuota int 
= 6 @@ -49,10 +51,16 @@ module availabilityAlerts 'alerts/fileServices/availabilityAlert.bicep' = if (al } } -module latencyAlert 'alerts/fileServices/latencyAlert.bicep' = if (alerts != null && alerts!.latency) { +module latencyAlert 'alerts/dynamicMetricAlertNew.bicep' = if (alerts != null && alerts!.latency) { name: '${storageAccountName}FsLatencyDeploy' params: { - resourceName: storageAccountName + resourceName: '${storageAccountName}-fs' + id: resourceId('Microsoft.Storage/storageAccounts/fileServices', storageAccountName, 'default') + resourceMetric: { + resourceType: 'Microsoft.Storage/storageAccounts/fileServices' + metric: 'SuccessE2ELatency' + } + config: responseTimeConfig alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } diff --git a/infrastructure/templates/public-api/components/postgresqlDatabase.bicep b/infrastructure/templates/public-api/components/postgresqlDatabase.bicep index a966d13ca9..ca09708b16 100644 --- a/infrastructure/templates/public-api/components/postgresqlDatabase.bicep +++ b/infrastructure/templates/public-api/components/postgresqlDatabase.bicep @@ -1,3 +1,5 @@ +import { cpuPercentageConfig, memoryPercentageConfig } from 'alerts/config.bicep' + import { IpRange, PrincipalNameAndId } from '../types.bicep' @description('Specifies the location for all resources.') @@ -55,7 +57,7 @@ param alerts { diskBandwidth: bool diskIops: bool memoryPercentage: bool - alertGroupName: string + alertsGroupName: string }? 
@description('A set of tags with which to tag the resource in Azure') @@ -153,7 +155,7 @@ module databaseAliveAlert 'alerts/postgreSqlFlexibleServers/databaseAlive.bicep' name: '${databaseServerName}DbAliveDeploy' params: { resourceName: databaseServerName - alertsGroupName: alerts!.alertGroupName + alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } @@ -162,7 +164,7 @@ module queryTimeAlert 'alerts/postgreSqlFlexibleServers/queryTimeAlert.bicep' = name: '${databaseServerName}QueryTimeDeploy' params: { resourceName: databaseServerName - alertsGroupName: alerts!.alertGroupName + alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } @@ -171,25 +173,30 @@ module transactionTimeAlert 'alerts/postgreSqlFlexibleServers/transactionTimeAle name: '${databaseServerName}TransactionTimeDeploy' params: { resourceName: databaseServerName - alertsGroupName: alerts!.alertGroupName + alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } -module clientConenctionsWaitingAlert 'alerts/postgreSqlFlexibleServers/clientConnectionsWaitingAlert.bicep' = if (alerts != null && alerts!.clientConenctionsWaiting) { +module clientConnectionsWaitingAlert 'alerts/postgreSqlFlexibleServers/clientConnectionsWaitingAlert.bicep' = if (alerts != null && alerts!.clientConenctionsWaiting) { name: '${databaseServerName}ClientConnectionsDeploy' params: { resourceName: databaseServerName - alertsGroupName: alerts!.alertGroupName + alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } -module cpuPercentageAlert 'alerts/postgreSqlFlexibleServers/cpuPercentageAlert.bicep' = if (alerts != null && alerts!.cpuPercentage) { +module cpuPercentageAlert 'alerts/dynamicMetricAlertNew.bicep' = if (alerts != null && alerts!.cpuPercentage) { name: '${databaseServerName}CpuPercentageDeploy' params: { resourceName: databaseServerName - alertsGroupName: alerts!.alertGroupName + resourceMetric: { + resourceType: 'Microsoft.DBforPostgreSQL/flexibleServers' + metric: 
'cpu_percent' + } + config: cpuPercentageConfig + alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } @@ -198,7 +205,7 @@ module diskBandwidthAlert 'alerts/postgreSqlFlexibleServers/diskBandwidthAlert.b name: '${databaseServerName}DiskBandwidthDeploy' params: { resourceName: databaseServerName - alertsGroupName: alerts!.alertGroupName + alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } @@ -207,16 +214,21 @@ module diskIopsAlert 'alerts/postgreSqlFlexibleServers/diskIopsAlert.bicep' = if name: '${databaseServerName}DiskIopsDeploy' params: { resourceName: databaseServerName - alertsGroupName: alerts!.alertGroupName + alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } -module memoryPercentageAlert 'alerts/postgreSqlFlexibleServers/memoryPercentageAlert.bicep' = if (alerts != null && alerts!.memoryPercentage) { +module memoryPercentageAlert 'alerts/dynamicMetricAlertNew.bicep' = if (alerts != null && alerts!.memoryPercentage) { name: '${databaseServerName}MemoryPercentageDeploy' params: { resourceName: databaseServerName - alertsGroupName: alerts!.alertGroupName + resourceMetric: { + resourceType: 'Microsoft.DBforPostgreSQL/flexibleServers' + metric: 'memory_percent' + } + config: memoryPercentageConfig + alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } } diff --git a/infrastructure/templates/public-api/components/storageAccount.bicep b/infrastructure/templates/public-api/components/storageAccount.bicep index 6e0a106e68..c64aa9932f 100644 --- a/infrastructure/templates/public-api/components/storageAccount.bicep +++ b/infrastructure/templates/public-api/components/storageAccount.bicep @@ -1,3 +1,5 @@ +import { responseTimeConfig } from 'alerts/config.bicep' + import { IpRange } from '../types.bicep' @description('Specifies the location for all resources.') @@ -66,10 +68,15 @@ module availabilityAlerts 'alerts/storageAccounts/availabilityAlert.bicep' = if } } -module latencyAlert 
'alerts/storageAccounts/latencyAlert.bicep' = if (alerts != null && alerts!.latency) { +module latencyAlert 'alerts/dynamicMetricAlertNew.bicep' = if (alerts != null && alerts!.latency) { name: '${storageAccountName}LatencyDeploy' params: { resourceName: storageAccountName + resourceMetric: { + resourceType: 'Microsoft.Storage/storageAccounts' + metric: 'SuccessE2ELatency' + } + config: responseTimeConfig alertsGroupName: alerts!.alertsGroupName tagValues: tagValues } From 637b6bfbe4fbf6b18b5fd1ccdb281fd8f2d90f69 Mon Sep 17 00:00:00 2001 From: Duncan Watson Date: Mon, 6 Jan 2025 15:24:04 +0000 Subject: [PATCH 2/2] EES-5687 - responding to PR comments. Renaming variables and some cleanup. --- .../public-api/components/alerts/config.bicep | 4 ++-- .../components/alerts/dynamicMetricAlertNew.bicep | 4 ++-- .../components/alerts/resourceMetrics.bicep | 2 -- .../public-api/components/alerts/types.bicep | 14 +++++++------- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/infrastructure/templates/public-api/components/alerts/config.bicep b/infrastructure/templates/public-api/components/alerts/config.bicep index e99796e8b5..a942c16006 100644 --- a/infrastructure/templates/public-api/components/alerts/config.bicep +++ b/infrastructure/templates/public-api/components/alerts/config.bicep @@ -2,9 +2,9 @@ var defaultDynamicAlertConfig = { aggregation: 'Average' operator: 'GreaterThan' evaluationFrequency: 'PT5M' + evaluationPeriods: 5 + minFailingEvaluationPeriods: 5 windowSize: 'PT15M' - numberOfEvaluationPeriods: 5 - minFailingPeriodsToAlert: 5 sensitivity: 'Low' severity: 'Warning' } diff --git a/infrastructure/templates/public-api/components/alerts/dynamicMetricAlertNew.bicep b/infrastructure/templates/public-api/components/alerts/dynamicMetricAlertNew.bicep index 5aa81d5caf..41f80eb2c7 100644 --- a/infrastructure/templates/public-api/components/alerts/dynamicMetricAlertNew.bicep +++ 
b/infrastructure/templates/public-api/components/alerts/dynamicMetricAlertNew.bicep @@ -70,8 +70,8 @@ resource metricAlertRule 'Microsoft.Insights/metricAlerts@2018-03-01' = { alertSensitivity: config.sensitivity skipMetricValidation: false failingPeriods: { - minFailingPeriodsToAlert: config.minFailingPeriodsToAlert - numberOfEvaluationPeriods: config.numberOfEvaluationPeriods + numberOfEvaluationPeriods: config.evaluationPeriods + minFailingPeriodsToAlert: config.minFailingEvaluationPeriods } ignoreDataBefore: ignoreDataBefore } diff --git a/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep b/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep index 126aa8c15e..089cf4fc45 100644 --- a/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep +++ b/infrastructure/templates/public-api/components/alerts/resourceMetrics.bicep @@ -39,8 +39,6 @@ type StorageAccountMetric = { | 'SuccessE2ELatency' } - - @export() @discriminator('resourceType') type ResourceMetric = diff --git a/infrastructure/templates/public-api/components/alerts/types.bicep b/infrastructure/templates/public-api/components/alerts/types.bicep index 87b1f278b7..134517ce1a 100644 --- a/infrastructure/templates/public-api/components/alerts/types.bicep +++ b/infrastructure/templates/public-api/components/alerts/types.bicep @@ -120,20 +120,20 @@ type DynamicAlertConfig = { windowSize: WindowSize @description(''' - How many periods to look back over to count failing periods. Used in conjunction with "minFailingPeriodsToAlert". - As an example, if "numberOfEvaluationPeriods" is set to 5 and "evaluationFrequency" is set to every minute, the past + How many periods to look back over to count failing periods. Used in conjunction with "minFailingEvaluationPeriods". 
+ As an example, if "evaluationPeriods" is set to 5 and "evaluationFrequency" is set to every minute, the past 5 alerts (one for each of the last 5 minutes) is looked at and each failure is counted up. ''') - numberOfEvaluationPeriods: int + evaluationPeriods: int @description(''' - How many of the "numberOfEvaluationPeriods" results need to have failed in order for this rule to fire. - For instance, if this rule is using the past 5 calculations (with "numberOfEvaluationPeriods" set to 5) to evaluate - whether or not to fire, "minFailingPeriodsToAlert" determines how many of those past 5 periods have to have failed + How many of the "evaluationPeriods" results need to have failed in order for this rule to fire. + For instance, if this rule is using the past 5 calculations (with "evaluationPeriods" set to 5) to evaluate + whether or not to fire, "minFailingEvaluationPeriods" determines how many of those past 5 periods have to have failed in order for this rule to fire. If this was set to 3, 3 out of the 5 past calculations will have had to fail in order for this rule to fire. ''') - minFailingPeriodsToAlert: int + minFailingEvaluationPeriods: int @description(''' How sensitive the alert is if a metric exceeds its dynamic threshold.