Skip to content

Commit

Permalink
feat(loadbalancing): add fail-open monitoring (#537)
Browse files Browse the repository at this point in the history
Fixes #532

This pull request adds fail-open monitoring to our Load Balancing
monitors.

To give context about fail open
([ref](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/target-group-health-checks.html)):
> If a target group contains only unhealthy registered targets, the load
balancer routes requests to all those targets, regardless of their
health status. This means that if all targets fail health checks at the
same time in all enabled Availability Zones, the load balancer fails
open. The effect of the fail-open is to allow traffic to all targets in
all enabled Availability Zones, regardless of their health status, based
on the load balancing algorithm

Adding this metric will give better visibility into whether the Load
Balancer's fail open routing was used during incidents. The metrics
added as part of this pull request are in line with the AWS
documentation:
* `UnhealthyRoutingRequestCount` for ApplicationLoadBalancer with
`LoadBalancer` and `TargetGroup` dimensions
([ref](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-cloudwatch-metrics.html))
* `UnhealthyRoutingFlowCount` for NetworkLoadBalancer with
`LoadBalancer` dimension
([ref](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/load-balancer-cloudwatch-metrics.html))

CloudWatch reports these metrics conditionally, only when they have nonzero
values. Each metric is therefore wrapped in a `FILL(metric, 0)` metric math
expression so that periods without data are shown as zero on the dashboards.

Tested with NetworkLoadBalancer, as that's the setup I have on my
account - the ApplicationLoadBalancer values are based only on the
documentation.

---

_By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache-2.0 license_

Co-authored-by: Milosz Watroba <[email protected]>
  • Loading branch information
2 people authored and echeung-amzn committed Jun 26, 2024
1 parent 318c312 commit 40131ce
Show file tree
Hide file tree
Showing 11 changed files with 1,083 additions and 48 deletions.
43 changes: 43 additions & 0 deletions API.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions lib/monitoring/aws-ecs-patterns/Ec2ServiceMonitoring.ts
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ export class Ec2ServiceMonitoring extends Monitoring {
readonly ephemeralStorageUsageMetric: MetricWithAlarmSupport;
readonly activeTcpFlowCountMetric?: MetricWithAlarmSupport;
readonly newTcpFlowCountMetric?: MetricWithAlarmSupport;
readonly unhealthyRoutingFlowCountMetric?: MetricWithAlarmSupport;
readonly processedBytesMetric?: MetricWithAlarmSupport;

private hasLoadBalancer: boolean;
Expand Down Expand Up @@ -219,6 +220,8 @@ export class Ec2ServiceMonitoring extends Monitoring {
this.loadBalancerMetricFactory.metricActiveConnectionCount();
this.newTcpFlowCountMetric =
this.loadBalancerMetricFactory.metricNewConnectionCount();
this.unhealthyRoutingFlowCountMetric =
this.loadBalancerMetricFactory.metricUnhealthyRoutingCount();
this.processedBytesMetric =
this.loadBalancerMetricFactory.metricProcessedBytesMin();
}
Expand Down Expand Up @@ -451,6 +454,10 @@ export class Ec2ServiceMonitoring extends Monitoring {
left.push(this.newTcpFlowCountMetric);
}

if (this.unhealthyRoutingFlowCountMetric) {
left.push(this.unhealthyRoutingFlowCountMetric);
}

if (this.processedBytesMetric) {
right.push(this.processedBytesMetric);
}
Expand Down
7 changes: 7 additions & 0 deletions lib/monitoring/aws-ecs-patterns/FargateServiceMonitoring.ts
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ export class FargateServiceMonitoring extends Monitoring {
readonly ephemeralStorageUsageMetric: MetricWithAlarmSupport;
readonly activeTcpFlowCountMetric?: MetricWithAlarmSupport;
readonly newTcpFlowCountMetric?: MetricWithAlarmSupport;
readonly unhealthyRoutingFlowCountMetric?: MetricWithAlarmSupport;
readonly processedBytesMetric?: MetricWithAlarmSupport;

private hasLoadBalancer: boolean;
Expand Down Expand Up @@ -222,6 +223,8 @@ export class FargateServiceMonitoring extends Monitoring {
this.loadBalancerMetricFactory.metricActiveConnectionCount();
this.newTcpFlowCountMetric =
this.loadBalancerMetricFactory.metricNewConnectionCount();
this.unhealthyRoutingFlowCountMetric =
this.loadBalancerMetricFactory.metricUnhealthyRoutingCount();
this.processedBytesMetric =
this.loadBalancerMetricFactory.metricProcessedBytesMin();
}
Expand Down Expand Up @@ -455,6 +458,10 @@ export class FargateServiceMonitoring extends Monitoring {
left.push(this.newTcpFlowCountMetric);
}

if (this.unhealthyRoutingFlowCountMetric) {
left.push(this.unhealthyRoutingFlowCountMetric);
}

if (this.processedBytesMetric) {
right.push(this.processedBytesMetric);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,21 @@ export class ApplicationLoadBalancerMetricFactory
})
);
}

/**
 * Builds the "fail open" metric for the application target group.
 *
 * Reads the SUM of `UnhealthyRoutingRequestCount` from the target group and
 * wraps it in a `FILL(..., 0)` metric math expression, so that periods in
 * which the metric is not reported are rendered as zero instead of gaps.
 *
 * @returns the metric math expression labelled "Unhealthy routing (fail open)".
 */
metricUnhealthyRoutingCount() {
  const rawRequestCount = this.applicationTargetGroup.metrics.custom(
    "UnhealthyRoutingRequestCount",
    { statistic: MetricStatistic.SUM }
  );

  // The key must match the identifier referenced inside the FILL expression.
  const expressionInputs = {
    unhealthyRoutingRequestCount: this.metricFactory.adaptMetric(rawRequestCount),
  };

  return this.metricFactory.createMetricMath(
    "FILL(unhealthyRoutingRequestCount, 0)",
    expressionInputs,
    "Unhealthy routing (fail open)"
  );
}
}
2 changes: 2 additions & 0 deletions lib/monitoring/aws-loadbalancing/LoadBalancerMetricFactory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -104,5 +104,7 @@ export interface ILoadBalancerMetricFactory {

metricNewConnectionCount(): MetricWithAlarmSupport;

/**
 * Metric tracking traffic routed to unhealthy targets, i.e. how much the
 * load balancer's fail-open routing was used.
 */
metricUnhealthyRoutingCount(): MetricWithAlarmSupport;

metricProcessedBytesMin(): MetricWithAlarmSupport;
}
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,18 @@ export class NetworkLoadBalancerMetricFactory
})
);
}

/**
 * Builds the "fail open" metric for the network load balancer.
 *
 * Reads the SUM of `UnhealthyRoutingFlowCount` from the load balancer and
 * wraps it in a `FILL(..., 0)` metric math expression, so that periods in
 * which the metric is not reported are rendered as zero instead of gaps.
 *
 * @returns the metric math expression labelled "Unhealthy routing (fail open)".
 */
metricUnhealthyRoutingCount() {
  const rawFlowCount = this.networkLoadBalancer.metrics.custom(
    "UnhealthyRoutingFlowCount",
    { statistic: MetricStatistic.SUM }
  );

  // The key must match the identifier referenced inside the FILL expression.
  const expressionInputs = {
    unhealthyRoutingFlowCount: this.metricFactory.adaptMetric(rawFlowCount),
  };

  return this.metricFactory.createMetricMath(
    "FILL(unhealthyRoutingFlowCount, 0)",
    expressionInputs,
    "Unhealthy routing (fail open)"
  );
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ export class NetworkLoadBalancerMonitoring extends Monitoring {
protected readonly healthyTaskPercentMetric: MetricWithAlarmSupport;
protected readonly activeTcpFlowCountMetric: MetricWithAlarmSupport;
protected readonly newTcpFlowCountMetric: MetricWithAlarmSupport;
protected readonly unhealthyRoutingFlowCountMetric: MetricWithAlarmSupport;
protected readonly processedBytesMetric: MetricWithAlarmSupport;

constructor(
Expand Down Expand Up @@ -89,6 +90,8 @@ export class NetworkLoadBalancerMonitoring extends Monitoring {
this.activeTcpFlowCountMetric =
this.metricFactory.metricActiveConnectionCount();
this.newTcpFlowCountMetric = this.metricFactory.metricNewConnectionCount();
this.unhealthyRoutingFlowCountMetric =
this.metricFactory.metricUnhealthyRoutingCount();
this.processedBytesMetric = this.metricFactory.metricProcessedBytesMin();

const alarmFactory = this.createAlarmFactory(
Expand Down Expand Up @@ -184,7 +187,11 @@ export class NetworkLoadBalancerMonitoring extends Monitoring {
width,
height,
title: "TCP Flows",
left: [this.activeTcpFlowCountMetric, this.newTcpFlowCountMetric],
left: [
this.activeTcpFlowCountMetric,
this.newTcpFlowCountMetric,
this.unhealthyRoutingFlowCountMetric,
],
leftYAxis: CountAxisFromZero,
right: [this.processedBytesMetric],
rightYAxis: SizeAxisBytesFromZero,
Expand Down
Loading

0 comments on commit 40131ce

Please sign in to comment.