Skip to content

Commit

Permalink
Add alarms and monitoring for infra stack (#84)
Browse files Browse the repository at this point in the history
Signed-off-by: Sayali Gaikawad <[email protected]>
  • Loading branch information
gaiksaya authored Dec 18, 2023
1 parent 5ea6747 commit 5af1cf7
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ In order to deploy both the stacks the user needs to provide a set of required a
| storageVolumeType | Optional | string | EBS volume type for all the nodes (data, ml, cluster manager). Defaults to gp2. See `lib/opensearch-config/node-config.ts` for available options. E.g., `-c storageVolumeType=gp3`. For SSD based instance (i.e. i3 family), it is used for root volume configuration. |
| customRoleArn | Optional | string | User provided IAM role arn to be used as ec2 instance profile. `-c customRoleArn=arn:aws:iam::<AWS_ACCOUNT_ID>:role/<ROLE_NAME>` |
| customConfigFiles | Optional | string | You can provide an entire config file to be overwritten or added to OpenSearch and OpenSearch Dashboards. Pass string in the form of JSON with key as local path to the config file to read from and value as file on the server to overwrite/add. Note that the values in the JSON needs to have prefix of `opensearch` or `opensearch-dashboards`. Example: `-c customConfigFiles='{"opensearch-config/config.yml": "opensearch/config/opensearch-security/config.yml", "opensearch-config/role_mapping.yml":"opensearch/config/opensearch-security/roles_mapping.yml", "/roles.yml": "opensearch/config/opensearch-security/roles.yml"}'` |
| enableMonitoring | Optional | boolean | Boolean flag to enable monitoring and alarms for Infra Stack. See [InfraStackMonitoring class](./lib/monitoring/alarms.ts) for more details. Defaults to false. E.g., `--context enableMonitoring=true` |

* Before starting this step, ensure that your AWS CLI is correctly configured with access credentials.
* Also ensure that you're running these commands in the current directory
Expand Down
6 changes: 6 additions & 0 deletions lib/cloudwatch/metrics-section.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ export interface ProcstatMetricDefinition {
}

interface EditableCloudwatchMetricsSection {
// eslint-disable-next-line camelcase
namespace?: string;
// eslint-disable-next-line camelcase
append_dimensions?: any;
// eslint-disable-next-line camelcase
aggregation_dimensions?: any;
// eslint-disable-next-line camelcase
metrics_collected: {
procstat?: ProcstatMetricDefinition[],
Expand Down
13 changes: 12 additions & 1 deletion lib/infra/infra-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import { dump, load } from 'js-yaml';
import { join } from 'path';
import { CloudwatchAgent } from '../cloudwatch/cloudwatch-agent';
import { ProcstatMetricDefinition } from '../cloudwatch/metrics-section';
import { InfraStackMonitoring } from '../monitoring/alarms';
import { nodeConfig } from '../opensearch-config/node-config';
import { RemoteStoreResources } from './remote-store-resources';

Expand Down Expand Up @@ -72,6 +73,7 @@ export interface infraProps extends StackProps {
readonly additionalConfig?: string,
readonly additionalOsdConfig?: string,
readonly customConfigFiles?: string,
readonly enableMonitoring?: boolean,
}

export class InfraStack extends Stack {
Expand Down Expand Up @@ -368,10 +370,13 @@ export class InfraStack extends Stack {
});
}
}

new CfnOutput(this, 'loadbalancer-url', {
value: nlb.loadBalancerDnsName,
});

if (props.enableMonitoring) {
const monitoring = new InfraStackMonitoring(this, props.dashboardsUrl);
}
}

private static getCfnInitElement(scope: Stack, logGroup: LogGroup, props: infraProps, nodeType?: string): InitElement[] {
Expand Down Expand Up @@ -406,6 +411,12 @@ export class InfraStack extends Stack {
debug: false,
},
metrics: {
append_dimensions: {
// eslint-disable-next-line no-template-curly-in-string
InstanceId: '${aws:InstanceId}',
},
aggregation_dimensions: [[]], // Create rollups without instance id
namespace: `${scope.stackName}/InfraStack`,
metrics_collected: {
procstat: procstatConfig,
cpu: {
Expand Down
85 changes: 85 additions & 0 deletions lib/monitoring/alarms.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/**
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

import {
Alarm, AlarmWidget, ComparisonOperator, Dashboard, MathExpression, Metric, TreatMissingData,
} from 'aws-cdk-lib/aws-cloudwatch';
import { InfraStack } from '../infra/infra-stack';

/**
 * Creates the CloudWatch alarms for an {@link InfraStack} and places each of them on a
 * single dashboard.
 *
 * Every metric is read from the `<stackName>/InfraStack` namespace. Constructing an
 * instance adds the alarms and a dashboard with id `AlarmDashboard` to the given stack.
 */
export class InfraStackMonitoring {
  /** Metrics and metric expressions that the alarms are evaluated against. */
  public readonly alarmMetrics: {
    memUsed: Metric | MathExpression,
    diskUsed: Metric | MathExpression,
    openSearchProcessNotFound: Metric | MathExpression,
    openSearchDashboardsProcessNotFound?: Metric | MathExpression,
  };

  /** All alarms created by the constructor, in creation order. */
  public readonly alarms: Alarm[] = [];

  /**
   * @param infraStack stack that becomes the scope of the alarms and the dashboard; its
   *   name is also used to build the metric namespace.
   * @param dashboardsUrl when this equals the literal string `'undefined'`, the
   *   OpenSearch Dashboards process alarm is not created.
   */
  constructor(infraStack: InfraStack, dashboardsUrl: string) {
    const metricNamespace = `${infraStack.stackName}/InfraStack`;

    // Average procstat pid count for processes matched by the given pattern.
    const pidCountQuery = (pattern: string): string => (
      `SELECT AVG(procstat_lookup_pid_count) FROM "${metricNamespace}" WHERE "pattern" = '${pattern}'`
    );

    this.alarmMetrics = {
      memUsed: new Metric({
        metricName: 'mem_used_percent',
        namespace: metricNamespace,
      }),
      diskUsed: new MathExpression({
        expression: `SELECT AVG(disk_used_percent) FROM "${metricNamespace}" WHERE "fstype" = 'xfs'`,
      }),
      openSearchProcessNotFound: new MathExpression({
        expression: pidCountQuery('-Dopensearch'),
      }),
      openSearchDashboardsProcessNotFound: new MathExpression({
        expression: pidCountQuery('opensearch-dashboards'),
      }),
    };

    const alarmDashboard = new Dashboard(infraStack, 'AlarmDashboard');

    // Process alarms: fire when the pid count is below 1 for 3 out of 3 periods.
    const processMissingSettings = {
      evaluationPeriods: 3,
      threshold: 1,
      datapointsToAlarm: 3,
      comparisonOperator: ComparisonOperator.LESS_THAN_THRESHOLD,
      treatMissingData: TreatMissingData.IGNORE,
    };

    // Utilization alarms: evaluated over 5 periods against a ">= threshold" comparison.
    const utilizationSettings = {
      evaluationPeriods: 5,
      comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
      treatMissingData: TreatMissingData.IGNORE,
    };

    const openSearchAlarm = new Alarm(infraStack, 'OpenSearchProcessNotFound', {
      alarmDescription: 'OpenSearch Process not found',
      metric: this.alarmMetrics.openSearchProcessNotFound.with({ statistic: 'avg' }),
      ...processMissingSettings,
    });
    this.alarms.push(openSearchAlarm);

    // The Dashboards alarm is only relevant when a dashboards URL was supplied.
    const dashboardsMetric = this.alarmMetrics.openSearchDashboardsProcessNotFound;
    if (dashboardsUrl !== 'undefined' && dashboardsMetric !== undefined) {
      const dashboardsAlarm = new Alarm(infraStack, 'OpenSearchDashboardsProcessNotFound', {
        alarmDescription: 'OpenSearch Dashboards Process not found',
        metric: dashboardsMetric.with({ statistic: 'avg' }),
        ...processMissingSettings,
      });
      this.alarms.push(dashboardsAlarm);
    }

    const memoryAlarm = new Alarm(infraStack, 'HighMemoryUtilization', {
      alarmDescription: 'The process is using more memory than expected',
      metric: this.alarmMetrics.memUsed.with({ statistic: 'avg' }),
      threshold: 65,
      ...utilizationSettings,
    });
    this.alarms.push(memoryAlarm);

    const diskAlarm = new Alarm(infraStack, 'HighDiskUtilization', {
      alarmDescription: 'High disk utilization found',
      metric: this.alarmMetrics.diskUsed.with({ statistic: 'avg' }),
      threshold: 70,
      ...utilizationSettings,
    });
    this.alarms.push(diskAlarm);

    // One widget per alarm, added to the dashboard in alarm creation order.
    for (const alarm of this.alarms) {
      alarmDashboard.addWidgets(new AlarmWidget({ alarm }));
    }
  }
}
4 changes: 4 additions & 0 deletions lib/os-cluster-entrypoint.ts
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,9 @@ export class OsClusterEntrypoint {
const remoteStore = `${scope.node.tryGetContext('enableRemoteStore')}`;
const enableRemoteStore = remoteStore === 'true';

const monitoringAndAlarms = `${scope.node.tryGetContext('enableMonitoring')}`;
const enableMonitoring = monitoringAndAlarms === 'true';

const customRoleArn = `${scope.node.tryGetContext('customRoleArn')}`;

let networkStackName = 'opensearch-network-stack';
Expand Down Expand Up @@ -277,6 +280,7 @@ export class OsClusterEntrypoint {
additionalConfig: ymlConfig,
additionalOsdConfig: osdYmlConfig,
customConfigFiles,
enableMonitoring,
...props,
});

Expand Down
3 changes: 3 additions & 0 deletions test/os-cluster.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ test('Test Resources with security disabled multi-node default instance types',
additionalOsdConfig: '{ "something.enabled": "true", "something_else.enabled": "false" }',
// eslint-disable-next-line max-len
customConfigFiles: '{"test/data/config.yml": "opensearch/config/opensearch-security/config.yml", "test/data/roles.yml": "opensearch/config/opensearch-security/roles.yml"}',
enableMonitoring: true,
},
});

Expand All @@ -50,6 +51,8 @@ test('Test Resources with security disabled multi-node default instance types',
infraTemplate.resourceCountIs('AWS::ElasticLoadBalancingV2::Listener', 2);
infraTemplate.resourceCountIs('AWS::ElasticLoadBalancingV2::TargetGroup', 2);
infraTemplate.resourceCountIs('AWS::AutoScaling::LaunchConfiguration', 3);
infraTemplate.resourceCountIs('AWS::CloudWatch::Alarm', 4);
infraTemplate.resourceCountIs('AWS::CloudWatch::Dashboard', 1);
infraTemplate.hasResourceProperties('AWS::ElasticLoadBalancingV2::Listener', {
Port: 80,
Protocol: 'TCP',
Expand Down

0 comments on commit 5af1cf7

Please sign in to comment.