Skip to content

Commit

Permalink
feat(ecs): add ability to alarm on ephemeral storage usage (#525)
Browse files Browse the repository at this point in the history
Closes #399

---

_By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache-2.0 license_
  • Loading branch information
echeung-amzn authored May 30, 2024
1 parent fd545df commit e443b40
Show file tree
Hide file tree
Showing 9 changed files with 2,191 additions and 200 deletions.
341 changes: 291 additions & 50 deletions API.md

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions lib/monitoring/aws-ecs-patterns/BaseServiceMetricFactory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,37 @@ export class BaseServiceMetricFactory {
EcsContainerInsightsNamespace
);
}

/**
 * Creates the `EphemeralStorageReserved` metric for this service.
 *
 * The metric is read from the ECS Container Insights namespace with the
 * MAX statistic, scoped by this factory's `dimensionsMap`.
 *
 * NOTE(review): AWS documents this metric as published only for Fargate
 * tasks (Linux platform version 1.4.0 or later) — confirm before relying on
 * it for EC2-backed services.
 *
 * @returns the metric produced by `metricFactory.createMetric`.
 */
metricEphemeralStorageReserved() {
  const metricName = "EphemeralStorageReserved";
  const label = "Ephemeral Storage Reserved";

  return this.metricFactory.createMetric(
    metricName,
    MetricStatistic.MAX,
    label,
    this.dimensionsMap,
    // Fifth positional argument intentionally left unset
    // (presumably the colour override — confirm against createMetric).
    undefined,
    EcsContainerInsightsNamespace
  );
}

/**
 * Creates the `EphemeralStorageUtilized` metric for this service.
 *
 * The metric is read from the ECS Container Insights namespace with the
 * MAX statistic, scoped by this factory's `dimensionsMap`.
 *
 * NOTE(review): AWS documents this metric as published only for Fargate
 * tasks (Linux platform version 1.4.0 or later) — confirm before relying on
 * it for EC2-backed services.
 *
 * @returns the metric produced by `metricFactory.createMetric`.
 */
metricEphemeralStorageUtilized() {
  const metricName = "EphemeralStorageUtilized";
  const label = "Ephemeral Storage Utilized";

  return this.metricFactory.createMetric(
    metricName,
    MetricStatistic.MAX,
    label,
    this.dimensionsMap,
    // Fifth positional argument intentionally left unset
    // (presumably the colour override — confirm against createMetric).
    undefined,
    EcsContainerInsightsNamespace
  );
}

/**
 * Creates a metric-math expression for ephemeral storage usage as a
 * percentage: utilized storage divided by reserved storage, times 100.
 *
 * The identifiers `used` and `total` inside the expression string are bound
 * to the utilized and reserved metrics through the map passed as the second
 * argument, so the expression and the map keys must stay in sync.
 *
 * @returns the math metric produced by `metricFactory.createMetricMath`,
 * labelled "Ephemeral Storage Usage".
 */
metricEphemeralStorageUsageInPercent() {
  const reservedMetric = this.metricEphemeralStorageReserved();
  const utilizedMetric = this.metricEphemeralStorageUtilized();

  // Keys here are the variable names referenced by the expression below.
  const expressionInputs = { used: utilizedMetric, total: reservedMetric };

  return this.metricFactory.createMetricMath(
    "100 * (used/total)",
    expressionInputs,
    "Ephemeral Storage Usage"
  );
}
}
66 changes: 45 additions & 21 deletions lib/monitoring/aws-ecs-patterns/Ec2ServiceMonitoring.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,24 @@ import {

export interface BaseEc2ServiceAlarms {
/**
* minimum number of tasks, as specified in your auto scaling config
* Minimum number of tasks, as specified in your auto scaling config.
*/
readonly minAutoScalingTaskCount?: number;
/**
* maximum number of tasks, as specified in your auto scaling config
* Maximum number of tasks, as specified in your auto scaling config.
*/
readonly maxAutoScalingTaskCount?: number;
readonly addCpuUsageAlarm?: Record<string, UsageThreshold>;
readonly addMemoryUsageAlarm?: Record<string, UsageThreshold>;

/**
* Container Insights needs to be enabled for the cluster for this alarm
* Container Insights needs to be enabled for the cluster for this alarm.
*/
readonly addRunningTaskCountAlarm?: Record<string, RunningTaskCountThreshold>;
readonly addCpuUsageAlarm?: Record<string, UsageThreshold>;
readonly addMemoryUsageAlarm?: Record<string, UsageThreshold>;
/**
* Container Insights needs to be enabled for the cluster for this alarm.
*/
readonly addEphermalStorageUsageAlarm?: Record<string, UsageThreshold>;
}

/**
Expand Down Expand Up @@ -170,9 +175,10 @@ export class Ec2ServiceMonitoring extends Monitoring {
readonly healthyTaskCountMetric?: MetricWithAlarmSupport;
readonly unhealthyTaskCountMetric?: MetricWithAlarmSupport;
readonly healthyTaskPercentMetric?: MetricWithAlarmSupport;
readonly runningTaskCountMetric: MetricWithAlarmSupport;
readonly cpuUtilisationMetric: MetricWithAlarmSupport;
readonly memoryUtilisationMetric: MetricWithAlarmSupport;
readonly runningTaskCountMetric: MetricWithAlarmSupport;
readonly ephemeralStorageUsageMetric: MetricWithAlarmSupport;
readonly activeTcpFlowCountMetric?: MetricWithAlarmSupport;
readonly newTcpFlowCountMetric?: MetricWithAlarmSupport;
readonly processedBytesMetric?: MetricWithAlarmSupport;
Expand Down Expand Up @@ -216,12 +222,14 @@ export class Ec2ServiceMonitoring extends Monitoring {
this.processedBytesMetric =
this.loadBalancerMetricFactory.metricProcessedBytesMin();
}
this.runningTaskCountMetric =
this.baseServiceMetricFactory.metricRunningTaskCount();
this.cpuUtilisationMetric =
this.baseServiceMetricFactory.metricClusterCpuUtilisationInPercent();
this.memoryUtilisationMetric =
this.baseServiceMetricFactory.metricClusterMemoryUtilisationInPercent();
this.runningTaskCountMetric =
this.baseServiceMetricFactory.metricRunningTaskCount();
this.ephemeralStorageUsageMetric =
this.baseServiceMetricFactory.metricEphemeralStorageUsageInPercent();

const alarmFactory = this.createAlarmFactory(
namingStrategy.resolveAlarmFriendlyName()
Expand Down Expand Up @@ -288,17 +296,6 @@ export class Ec2ServiceMonitoring extends Monitoring {
this.addAlarm(createdAlarm);
}
}

for (const disambiguator in props.addRunningTaskCountAlarm) {
const alarmProps = props.addRunningTaskCountAlarm[disambiguator];
const createdAlarm = this.taskHealthAlarmFactory.addRunningTaskCountAlarm(
this.runningTaskCountMetric,
alarmProps,
disambiguator
);
this.taskHealthAnnotations.push(createdAlarm.annotation);
this.addAlarm(createdAlarm);
}
for (const disambiguator in props.addCpuUsageAlarm) {
const alarmProps = props.addCpuUsageAlarm[disambiguator];
const createdAlarm = this.usageAlarmFactory.addMaxCpuUsagePercentAlarm(
Expand All @@ -320,6 +317,26 @@ export class Ec2ServiceMonitoring extends Monitoring {
this.addAlarm(createdAlarm);
}

for (const disambiguator in props.addRunningTaskCountAlarm) {
const alarmProps = props.addRunningTaskCountAlarm[disambiguator];
const createdAlarm = this.taskHealthAlarmFactory.addRunningTaskCountAlarm(
this.runningTaskCountMetric,
alarmProps,
disambiguator
);
this.taskHealthAnnotations.push(createdAlarm.annotation);
this.addAlarm(createdAlarm);
}
for (const disambiguator in props.addEphermalStorageUsageAlarm) {
const alarmProps = props.addEphermalStorageUsageAlarm[disambiguator];
const createdAlarm = this.usageAlarmFactory.addMaxDiskUsagePercentAlarm(
this.ephemeralStorageUsageMetric,
alarmProps,
disambiguator
);
this.addAlarm(createdAlarm);
}

if (this.hasLoadBalancer) {
for (const disambiguator in props.addMinProcessedBytesAlarm) {
const alarmProps = props.addMinProcessedBytesAlarm[disambiguator];
Expand Down Expand Up @@ -362,7 +379,7 @@ export class Ec2ServiceMonitoring extends Monitoring {

if (this.hasLoadBalancer) {
return baseWidget.concat([
this.createTpcFlowsWidget(QuarterWidth, DefaultGraphWidgetHeight),
this.createTcpFlowsWidget(QuarterWidth, DefaultGraphWidgetHeight),
this.createTaskHealthWidget(QuarterWidth, DefaultGraphWidgetHeight),
]);
} else {
Expand Down Expand Up @@ -422,7 +439,7 @@ export class Ec2ServiceMonitoring extends Monitoring {
});
}

createTpcFlowsWidget(width: number, height: number) {
createTcpFlowsWidget(width: number, height: number) {
const left: IMetric[] = [];
const right: IMetric[] = [];

Expand All @@ -448,4 +465,11 @@ export class Ec2ServiceMonitoring extends Monitoring {
rightYAxis: SizeAxisBytesFromZero,
});
}

/**
 * Alias under the previous, misspelled name ("Tpc" instead of "Tcp").
 * Forwards both arguments unchanged to {@link createTcpFlowsWidget} and
 * returns its result.
 *
 * @param width widget width, passed through as-is.
 * @param height widget height, passed through as-is.
 * @deprecated use {@link createTcpFlowsWidget} instead.
 */
createTpcFlowsWidget(width: number, height: number) {
return this.createTcpFlowsWidget(width, height);
}
}
65 changes: 45 additions & 20 deletions lib/monitoring/aws-ecs-patterns/FargateServiceMonitoring.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,24 @@ import {

export interface BaseFargateServiceAlarms {
/**
* minimum number of tasks, as specified in your auto scaling config
* Minimum number of tasks, as specified in your auto scaling config.
*/
readonly minAutoScalingTaskCount?: number;
/**
* maximum number of tasks, as specified in your auto scaling config
* Maximum number of tasks, as specified in your auto scaling config.
*/
readonly maxAutoScalingTaskCount?: number;
readonly addCpuUsageAlarm?: Record<string, UsageThreshold>;
readonly addMemoryUsageAlarm?: Record<string, UsageThreshold>;

/**
* Container Insights needs to be enabled for the cluster for this alarm
* Container Insights needs to be enabled for the cluster for this alarm.
*/
readonly addRunningTaskCountAlarm?: Record<string, RunningTaskCountThreshold>;
readonly addCpuUsageAlarm?: Record<string, UsageThreshold>;
readonly addMemoryUsageAlarm?: Record<string, UsageThreshold>;
/**
* Container Insights needs to be enabled for the cluster for this alarm.
*/
readonly addEphermalStorageUsageAlarm?: Record<string, UsageThreshold>;
}

/**
Expand Down Expand Up @@ -170,9 +175,10 @@ export class FargateServiceMonitoring extends Monitoring {
readonly healthyTaskCountMetric?: MetricWithAlarmSupport;
readonly unhealthyTaskCountMetric?: MetricWithAlarmSupport;
readonly healthyTaskPercentMetric?: MetricWithAlarmSupport;
readonly runningTaskCountMetric: MetricWithAlarmSupport;
readonly cpuUtilisationMetric: MetricWithAlarmSupport;
readonly memoryUtilisationMetric: MetricWithAlarmSupport;
readonly runningTaskCountMetric: MetricWithAlarmSupport;
readonly ephemeralStorageUsageMetric: MetricWithAlarmSupport;
readonly activeTcpFlowCountMetric?: MetricWithAlarmSupport;
readonly newTcpFlowCountMetric?: MetricWithAlarmSupport;
readonly processedBytesMetric?: MetricWithAlarmSupport;
Expand Down Expand Up @@ -219,12 +225,14 @@ export class FargateServiceMonitoring extends Monitoring {
this.processedBytesMetric =
this.loadBalancerMetricFactory.metricProcessedBytesMin();
}
this.runningTaskCountMetric =
this.baseServiceMetricFactory.metricRunningTaskCount();
this.cpuUtilisationMetric =
this.baseServiceMetricFactory.metricClusterCpuUtilisationInPercent();
this.memoryUtilisationMetric =
this.baseServiceMetricFactory.metricClusterMemoryUtilisationInPercent();
this.runningTaskCountMetric =
this.baseServiceMetricFactory.metricRunningTaskCount();
this.ephemeralStorageUsageMetric =
this.baseServiceMetricFactory.metricEphemeralStorageUsageInPercent();

const alarmFactory = this.createAlarmFactory(
namingStrategy.resolveAlarmFriendlyName()
Expand Down Expand Up @@ -292,16 +300,6 @@ export class FargateServiceMonitoring extends Monitoring {
}
}

for (const disambiguator in props.addRunningTaskCountAlarm) {
const alarmProps = props.addRunningTaskCountAlarm[disambiguator];
const createdAlarm = this.taskHealthAlarmFactory.addRunningTaskCountAlarm(
this.runningTaskCountMetric,
alarmProps,
disambiguator
);
this.taskHealthAnnotations.push(createdAlarm.annotation);
this.addAlarm(createdAlarm);
}
for (const disambiguator in props.addCpuUsageAlarm) {
const alarmProps = props.addCpuUsageAlarm[disambiguator];
const createdAlarm = this.usageAlarmFactory.addMaxCpuUsagePercentAlarm(
Expand All @@ -323,6 +321,26 @@ export class FargateServiceMonitoring extends Monitoring {
this.addAlarm(createdAlarm);
}

for (const disambiguator in props.addRunningTaskCountAlarm) {
const alarmProps = props.addRunningTaskCountAlarm[disambiguator];
const createdAlarm = this.taskHealthAlarmFactory.addRunningTaskCountAlarm(
this.runningTaskCountMetric,
alarmProps,
disambiguator
);
this.taskHealthAnnotations.push(createdAlarm.annotation);
this.addAlarm(createdAlarm);
}
for (const disambiguator in props.addEphermalStorageUsageAlarm) {
const alarmProps = props.addEphermalStorageUsageAlarm[disambiguator];
const createdAlarm = this.usageAlarmFactory.addMaxDiskUsagePercentAlarm(
this.ephemeralStorageUsageMetric,
alarmProps,
disambiguator
);
this.addAlarm(createdAlarm);
}

if (this.hasLoadBalancer) {
for (const disambiguator in props.addMinProcessedBytesAlarm) {
const alarmProps = props.addMinProcessedBytesAlarm[disambiguator];
Expand Down Expand Up @@ -365,7 +383,7 @@ export class FargateServiceMonitoring extends Monitoring {

if (this.hasLoadBalancer) {
return baseWidget.concat([
this.createTpcFlowsWidget(QuarterWidth, DefaultGraphWidgetHeight),
this.createTcpFlowsWidget(QuarterWidth, DefaultGraphWidgetHeight),
this.createTaskHealthWidget(QuarterWidth, DefaultGraphWidgetHeight),
]);
} else {
Expand Down Expand Up @@ -425,7 +443,7 @@ export class FargateServiceMonitoring extends Monitoring {
});
}

createTpcFlowsWidget(width: number, height: number) {
createTcpFlowsWidget(width: number, height: number) {
const left: IMetric[] = [];
const right: IMetric[] = [];

Expand All @@ -451,4 +469,11 @@ export class FargateServiceMonitoring extends Monitoring {
rightYAxis: SizeAxisBytesFromZero,
});
}

/**
 * Alias under the previous, misspelled name ("Tpc" instead of "Tcp").
 * Forwards both arguments unchanged to {@link createTcpFlowsWidget} and
 * returns its result.
 *
 * @param width widget width, passed through as-is.
 * @param height widget height, passed through as-is.
 * @deprecated use {@link createTcpFlowsWidget} instead.
 */
createTpcFlowsWidget(width: number, height: number) {
return this.createTcpFlowsWidget(width, height);
}
}
34 changes: 22 additions & 12 deletions test/monitoring/aws-ecs-patterns/Ec2ServiceMonitoring.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,6 @@ import { TestMonitoringScope } from "../TestMonitoringScope";
minHealthyTaskPercent: 75,
},
},
addRunningTaskCountAlarm: {
Warning: {
maxRunningTasks: 5,
},
},
addCpuUsageAlarm: {
Warning: {
maxUsagePercent: 80,
Expand All @@ -114,6 +109,16 @@ import { TestMonitoringScope } from "../TestMonitoringScope";
maxUsagePercent: 80,
},
},
addRunningTaskCountAlarm: {
Warning: {
maxRunningTasks: 5,
},
},
addEphermalStorageUsageAlarm: {
Warning: {
maxUsagePercent: 90,
},
},
addMinProcessedBytesAlarm: {
Warning: {
minProcessedBytes: 1024,
Expand All @@ -129,7 +134,7 @@ import { TestMonitoringScope } from "../TestMonitoringScope";
});

addMonitoringDashboardsToStack(stack, monitoring);
expect(numAlarmsCreated).toStrictEqual(7);
expect(numAlarmsCreated).toStrictEqual(8);
expect(Template.fromStack(stack)).toMatchSnapshot();
});

Expand Down Expand Up @@ -226,11 +231,6 @@ import { TestMonitoringScope } from "../TestMonitoringScope";
maxUnhealthyTasks: 3,
},
},
addRunningTaskCountAlarm: {
Warning: {
maxRunningTasks: 5,
},
},
addCpuUsageAlarm: {
Warning: {
maxUsagePercent: 80,
Expand All @@ -241,6 +241,16 @@ import { TestMonitoringScope } from "../TestMonitoringScope";
maxUsagePercent: 80,
},
},
addRunningTaskCountAlarm: {
Warning: {
maxRunningTasks: 5,
},
},
addEphermalStorageUsageAlarm: {
Warning: {
maxUsagePercent: 90,
},
},
addMinProcessedBytesAlarm: {
Warning: {
minProcessedBytes: 1024,
Expand All @@ -256,7 +266,7 @@ import { TestMonitoringScope } from "../TestMonitoringScope";
});

addMonitoringDashboardsToStack(stack, monitoring);
expect(numAlarmsCreated).toStrictEqual(7);
expect(numAlarmsCreated).toStrictEqual(8);
expect(Template.fromStack(stack)).toMatchSnapshot();
});
}
Expand Down
Loading

0 comments on commit e443b40

Please sign in to comment.