From 5af1cf7cff4241608693595a61427d7ea1ddeded Mon Sep 17 00:00:00 2001 From: Sayali Gaikawad <61760125+gaiksaya@users.noreply.github.com> Date: Mon, 18 Dec 2023 15:33:52 -0800 Subject: [PATCH] Add alarms and monitoring for infra stack (#84) Signed-off-by: Sayali Gaikawad --- README.md | 1 + lib/cloudwatch/metrics-section.ts | 6 +++ lib/infra/infra-stack.ts | 13 ++++- lib/monitoring/alarms.ts | 85 +++++++++++++++++++++++++++++++ lib/os-cluster-entrypoint.ts | 4 ++ test/os-cluster.test.ts | 3 ++ 6 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 lib/monitoring/alarms.ts diff --git a/README.md b/README.md index 1b3060c64e9..a053c60ce55 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ In order to deploy both the stacks the user needs to provide a set of required a | storageVolumeType | Optional | string | EBS volume type for all the nodes (data, ml, cluster manager). Defaults to gp2. See `lib/opensearch-config/node-config.ts` for available options. E.g., `-c storageVolumeType=gp3`. For SSD based instance (i.e. i3 family), it is used for root volume configuration. | | customRoleArn | Optional | string | User provided IAM role arn to be used as ec2 instance profile. `-c customRoleArn=arn:aws:iam:::role/` | | customConfigFiles | Optional | string | You can provide an entire config file to be overwritten or added to OpenSearch and OpenSearch Dashboards. Pass string in the form of JSON with key as local path to the config file to read from and value as file on the server to overwrite/add. Note that the values in the JSON needs to have prefix of `opensearch` or `opensearch-dashboards`. 
Example: `-c customConfigFiles='{"opensearch-config/config.yml": "opensearch/config/opensearch-security/config.yml", "opensearch-config/role_mapping.yml":"opensearch/config/opensearch-security/roles_mapping.yml", "/roles.yml": "opensearch/config/opensearch-security/roles.yml"}'` | +| enableMonitoring | Optional | boolean | Boolean flag to enable monitoring and alarms for Infra Stack. See [InfraStackMonitoring class](./lib/monitoring/alarms.ts) for more details. Defaults to false. E.g., `--context enableMonitoring=true` | * Before starting this step, ensure that your AWS CLI is correctly configured with access credentials. * Also ensure that you're running these commands in the current directory diff --git a/lib/cloudwatch/metrics-section.ts b/lib/cloudwatch/metrics-section.ts index ca4f74d1fe8..0c9ea64a12f 100644 --- a/lib/cloudwatch/metrics-section.ts +++ b/lib/cloudwatch/metrics-section.ts @@ -26,6 +26,12 @@ export interface ProcstatMetricDefinition { } interface EditableCloudwatchMetricsSection { + // eslint-disable-next-line camelcase + namespace?: string; + // eslint-disable-next-line camelcase + append_dimensions?: any; + // eslint-disable-next-line camelcase + aggregation_dimensions?: any; // eslint-disable-next-line camelcase metrics_collected: { procstat?: ProcstatMetricDefinition[], diff --git a/lib/infra/infra-stack.ts b/lib/infra/infra-stack.ts index f0831ab9847..cd7b4c412a1 100644 --- a/lib/infra/infra-stack.ts +++ b/lib/infra/infra-stack.ts @@ -40,6 +40,7 @@ import { dump, load } from 'js-yaml'; import { join } from 'path'; import { CloudwatchAgent } from '../cloudwatch/cloudwatch-agent'; import { ProcstatMetricDefinition } from '../cloudwatch/metrics-section'; +import { InfraStackMonitoring } from '../monitoring/alarms'; import { nodeConfig } from '../opensearch-config/node-config'; import { RemoteStoreResources } from './remote-store-resources'; @@ -72,6 +73,7 @@ export interface infraProps extends StackProps { readonly additionalConfig?: string, 
readonly additionalOsdConfig?: string, readonly customConfigFiles?: string, + readonly enableMonitoring?: boolean, } export class InfraStack extends Stack { @@ -368,10 +370,13 @@ export class InfraStack extends Stack { }); } } - new CfnOutput(this, 'loadbalancer-url', { value: nlb.loadBalancerDnsName, }); + + if (props.enableMonitoring) { + const monitoring = new InfraStackMonitoring(this, props.dashboardsUrl); + } } private static getCfnInitElement(scope: Stack, logGroup: LogGroup, props: infraProps, nodeType?: string): InitElement[] { @@ -406,6 +411,12 @@ export class InfraStack extends Stack { debug: false, }, metrics: { + append_dimensions: { + // eslint-disable-next-line no-template-curly-in-string + InstanceId: '${aws:InstanceId}', + }, + aggregation_dimensions: [[]], // Create rollups without instance id + namespace: `${scope.stackName}/InfraStack`, metrics_collected: { procstat: procstatConfig, cpu: { diff --git a/lib/monitoring/alarms.ts b/lib/monitoring/alarms.ts new file mode 100644 index 00000000000..337d1e7d4b3 --- /dev/null +++ b/lib/monitoring/alarms.ts @@ -0,0 +1,85 @@ +/** + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +import { + Alarm, AlarmWidget, ComparisonOperator, Dashboard, MathExpression, Metric, TreatMissingData, +} from 'aws-cdk-lib/aws-cloudwatch'; +import { InfraStack } from '../infra/infra-stack'; + +export class InfraStackMonitoring { + public readonly alarmMetrics: { + memUsed: Metric | MathExpression, + diskUsed: Metric| MathExpression, + openSearchProcessNotFound: Metric | MathExpression, + openSearchDashboardsProcessNotFound?: Metric | MathExpression, + } + + public readonly alarms: Alarm[] = [] + + constructor(infraStack: InfraStack, dashboardsUrl: string) { + this.alarmMetrics = { + memUsed: new Metric({ + metricName: 'mem_used_percent', + namespace: `${infraStack.stackName}/InfraStack`, + }), + diskUsed: new MathExpression({ + expression: `SELECT AVG(disk_used_percent) FROM "${infraStack.stackName}/InfraStack" WHERE "fstype" = 'xfs'`, + }), + openSearchProcessNotFound: new MathExpression({ + expression: `SELECT AVG(procstat_lookup_pid_count) FROM "${infraStack.stackName}/InfraStack" WHERE "pattern" = '-Dopensearch'`, + }), + openSearchDashboardsProcessNotFound: new MathExpression({ + expression: `SELECT AVG(procstat_lookup_pid_count) FROM "${infraStack.stackName}/InfraStack" WHERE "pattern" = 'opensearch-dashboards'`, + }), + }; + const alarmDashboard = new Dashboard(infraStack, 'AlarmDashboard'); + this.alarms.push(new Alarm(infraStack, 'OpenSearchProcessNotFound', { + alarmDescription: 'OpenSearch Process not found', + metric: this.alarmMetrics.openSearchProcessNotFound.with({ statistic: 'avg' }), + evaluationPeriods: 3, + threshold: 1, + datapointsToAlarm: 3, + comparisonOperator: ComparisonOperator.LESS_THAN_THRESHOLD, + treatMissingData: TreatMissingData.IGNORE, + })); + + if (dashboardsUrl !== 'undefined' && this.alarmMetrics.openSearchDashboardsProcessNotFound !== undefined) { + this.alarms.push(new Alarm(infraStack, 'OpenSearchDashboardsProcessNotFound', { + alarmDescription: 'OpenSearch Dashboards Process not found', + metric: 
this.alarmMetrics.openSearchDashboardsProcessNotFound.with({ statistic: 'avg' }), + evaluationPeriods: 3, + threshold: 1, + datapointsToAlarm: 3, + comparisonOperator: ComparisonOperator.LESS_THAN_THRESHOLD, + treatMissingData: TreatMissingData.IGNORE, + })); + } + + this.alarms.push(new Alarm(infraStack, 'HighMemoryUtilization', { + alarmDescription: 'The process is using more memory than expected', + metric: this.alarmMetrics.memUsed.with({ statistic: 'avg' }), + evaluationPeriods: 5, + threshold: 65, + comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD, + treatMissingData: TreatMissingData.IGNORE, + })); + + this.alarms.push(new Alarm(infraStack, 'HighDiskUtilization', { + alarmDescription: 'High disk utilization found', + metric: this.alarmMetrics.diskUsed.with({ statistic: 'avg' }), + evaluationPeriods: 5, + threshold: 70, + comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD, + treatMissingData: TreatMissingData.IGNORE, + })); + + this.alarms + .map((alarm) => new AlarmWidget({ alarm })) + .forEach((widget) => alarmDashboard.addWidgets(widget)); + } +} diff --git a/lib/os-cluster-entrypoint.ts b/lib/os-cluster-entrypoint.ts index 1a2d8376463..89cd8e8b1a5 100644 --- a/lib/os-cluster-entrypoint.ts +++ b/lib/os-cluster-entrypoint.ts @@ -217,6 +217,9 @@ export class OsClusterEntrypoint { const remoteStore = `${scope.node.tryGetContext('enableRemoteStore')}`; const enableRemoteStore = remoteStore === 'true'; + const monitoringAndAlarms = `${scope.node.tryGetContext('enableMonitoring')}`; + const enableMonitoring = monitoringAndAlarms === 'true'; + const customRoleArn = `${scope.node.tryGetContext('customRoleArn')}`; let networkStackName = 'opensearch-network-stack'; @@ -277,6 +280,7 @@ export class OsClusterEntrypoint { additionalConfig: ymlConfig, additionalOsdConfig: osdYmlConfig, customConfigFiles, + enableMonitoring, ...props, }); diff --git a/test/os-cluster.test.ts b/test/os-cluster.test.ts index 
a40776aa4a3..b986c2f0a4a 100644 --- a/test/os-cluster.test.ts +++ b/test/os-cluster.test.ts @@ -26,6 +26,7 @@ test('Test Resources with security disabled multi-node default instance types', additionalOsdConfig: '{ "something.enabled": "true", "something_else.enabled": "false" }', // eslint-disable-next-line max-len customConfigFiles: '{"test/data/config.yml": "opensearch/config/opensearch-security/config.yml", "test/data/roles.yml": "opensearch/config/opensearch-security/roles.yml"}', + enableMonitoring: true, }, }); @@ -50,6 +51,8 @@ test('Test Resources with security disabled multi-node default instance types', infraTemplate.resourceCountIs('AWS::ElasticLoadBalancingV2::Listener', 2); infraTemplate.resourceCountIs('AWS::ElasticLoadBalancingV2::TargetGroup', 2); infraTemplate.resourceCountIs('AWS::AutoScaling::LaunchConfiguration', 3); + infraTemplate.resourceCountIs('AWS::CloudWatch::Alarm', 4); + infraTemplate.resourceCountIs('AWS::CloudWatch::Dashboard', 1); infraTemplate.hasResourceProperties('AWS::ElasticLoadBalancingV2::Listener', { Port: 80, Protocol: 'TCP',