lambda_function.py
import boto3
import datetime

# TODO -- several of the List* calls below return at most 10 or 100 items per page; use paginators/iterators
EXCLUSION_LIST = []  # names of clusters to skip
DEBUG = True
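
# A minimal sketch for the TODO above, assuming the standard boto3 paginator
# interface (list_clusters, list_services, and list_container_instances all
# support it). The helper name is illustrative and not part of the original file.
def iter_all(client, operation, result_key, **kwargs):
    """Yield every item across all pages of a paginated boto3 List* call."""
    for page in client.get_paginator(operation).paginate(**kwargs):
        for item in page[result_key]:
            yield item
# Usage would look like: list(iter_all(ecs, 'list_services', 'serviceArns', cluster=clusterName))
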
def ecs_cpu_utilization(cluster_name, cw):
    """Return the cluster's average CPU utilization (%) over the last five minutes."""
    response = cw.get_metric_statistics(
        Namespace='AWS/ECS',
        MetricName='CPUUtilization',
        Dimensions=[
            {
                'Name': 'ClusterName',
                'Value': cluster_name,
            },
        ],
        StartTime=datetime.datetime.utcnow() - datetime.timedelta(seconds=300),
        EndTime=datetime.datetime.utcnow(),
        Period=300,
        Statistics=['Average'],
    )
    datapoints = response['Datapoints']
    # a brand-new or idle cluster may have no datapoints yet; avoid an IndexError
    return "{:.2f}".format(datapoints[0]['Average']) if datapoints else "0.00"

def ecs_memory_utilization(cluster_name, cw):
    """Return the cluster's average memory utilization (%) over the last five minutes."""
    response = cw.get_metric_statistics(
        Namespace='AWS/ECS',
        MetricName='MemoryUtilization',
        Dimensions=[
            {
                'Name': 'ClusterName',
                'Value': cluster_name,
            },
        ],
        StartTime=datetime.datetime.utcnow() - datetime.timedelta(seconds=300),
        EndTime=datetime.datetime.utcnow(),
        Period=300,
        Statistics=['Average'],
    )
    datapoints = response['Datapoints']
    # same guard as above: no datapoints means no utilization to report
    return "{:.2f}".format(datapoints[0]['Average']) if datapoints else "0.00"

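# Aside: the two functions above differ only in MetricName, so a single
# parameterized helper like this sketch could replace both (a suggested
# refactor, not part of the original file):
def ecs_metric_average(cluster_name, metric_name, cw):
    """Average of an AWS/ECS metric for a cluster over the last five minutes."""
    now = datetime.datetime.utcnow()
    response = cw.get_metric_statistics(
        Namespace='AWS/ECS',
        MetricName=metric_name,
        Dimensions=[{'Name': 'ClusterName', 'Value': cluster_name}],
        StartTime=now - datetime.timedelta(seconds=300),
        EndTime=now,
        Period=300,
        Statistics=['Average'],
    )
    datapoints = response['Datapoints']
    return "{:.2f}".format(datapoints[0]['Average']) if datapoints else "0.00"
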
def lambda_handler(event, context):
    cw = boto3.client('cloudwatch')
    ecs = boto3.client('ecs')
    asg = boto3.client('autoscaling')
    all_clusters = ecs.list_clusters()
    cluster_names = list()
    for clusterArn in all_clusters['clusterArns']:
        # keep only the name portion of arn:aws:ecs:region:account:cluster/name
        cluster_names.append(clusterArn.split('/')[-1])
    cluster_names.sort()
    for clusterName in cluster_names:
        if clusterName in EXCLUSION_LIST:
            if DEBUG:
                print('Found cluster (%s) in exclusion list, skipping...' % clusterName)
            continue
        cluster_services = ecs.list_services(cluster=clusterName)  # 10 limit
        print('Getting services for cluster (%s)' % clusterName)
        if not cluster_services['serviceArns']:
            # describe_services rejects an empty services list, so skip empty clusters
            continue
        cluster_services_descriptions = ecs.describe_services(cluster=clusterName,
                                                              services=cluster_services['serviceArns'])  # 10 limit
        largest_service = dict(name='', cpu=0, memory=0)
        for service in cluster_services_descriptions['services']:
            service_task_definition = ecs.describe_task_definition(taskDefinition=service['taskDefinition'])
            cpu = memory = 0
            for definition in service_task_definition['taskDefinition']['containerDefinitions']:
                # container-level cpu/memory may be absent from a task definition
                cpu += definition.get('cpu', 0)
                if 'memoryReservation' in definition:
                    memory += definition['memoryReservation']
                else:
                    memory += definition.get('memory', 0)
            if cpu > largest_service['cpu'] or memory > largest_service['memory']:
                largest_service = dict(name=service['serviceName'], cpu=cpu, memory=memory)
            if DEBUG:
                print('Service %s needs %s CPU and %s memory' % (service['serviceName'], cpu, memory))
        if DEBUG:
            print('The largest service on %s is %s requiring %s CPU Shares and %s Memory Shares'
                  % (clusterName, largest_service['name'], largest_service['cpu'], largest_service['memory']))
        cluster_container_instances_list = ecs.list_container_instances(cluster=clusterName, status='ACTIVE')  # 100 limit
        if not cluster_container_instances_list['containerInstanceArns']:
            # describe_container_instances rejects an empty list; nothing to inspect
            print('Cluster (%s) has no active container instances, skipping...' % clusterName)
            continue
        cluster_container_instances = \
            ecs.describe_container_instances(cluster=clusterName,
                                             containerInstances=cluster_container_instances_list['containerInstanceArns'])
        can_support_largest_task = False
        largest_task_support_count = 0
        cluster_instance_count = 0
        consider_scaling = 0  # -1 to scale in, 0 to stay, 1 to scale out
        '''
        # commented out to prevent perceived DoS attacks, we already make a lot of requests to the API
        print('The average CPU Utilization for Cluster ({}) is {}%'
              .format(clusterName, ecs_cpu_utilization(clusterName, cw)))
        print('The average Memory Utilization for Cluster ({}) is {}%'
              .format(clusterName, ecs_memory_utilization(clusterName, cw)))
        '''
        for cluster_instance in cluster_container_instances['containerInstances']:
            # grab the ID of the instance to grab ASG info later
            cluster_random_instance_id = cluster_instance['ec2InstanceId']
            cluster_instance_count += 1
            remaining_resources = {resource['name']: resource for resource in cluster_instance['remainingResources']}
            remaining_cpu = int(remaining_resources['CPU']['integerValue'])
            remaining_ram = int(remaining_resources['MEMORY']['integerValue'])
            if DEBUG:
                print('Cluster instance (%s) has %s CPU Shares and %s RAM Shares remaining'
                      % (cluster_instance['ec2InstanceId'], remaining_cpu, remaining_ram))
            if remaining_cpu >= largest_service['cpu'] and remaining_ram >= largest_service['memory']:
                can_support_largest_task = True
                largest_task_support_count += 1
        # find out which ASG this cluster belongs to (any instance will do, hence
        # the "random" instance ID captured in the loop above)
        res = asg.describe_auto_scaling_instances(InstanceIds=[cluster_random_instance_id])
        asg_name = res['AutoScalingInstances'][0]['AutoScalingGroupName']
        # find out the min/max/desired size for the ASG
        res = asg.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name])
        asg_info = res['AutoScalingGroups'][0]
        cluster_min_size = asg_info['MinSize']
        cluster_max_size = asg_info['MaxSize']
        cluster_desired_size = asg_info['DesiredCapacity']
        if can_support_largest_task:
            print('Cluster (%s) has enough resources to support %s of the largest service (%s)'
                  % (clusterName, largest_task_support_count, largest_service['name']))
        else:
            consider_scaling = 1
            print('Cluster (%s) needs to scale to support the largest service (%s)'
                  % (clusterName, largest_service['name']))
        num_cluster_instances = len(cluster_container_instances_list['containerInstanceArns'])
        print('Cluster (%s) requires %s, desires %s, maxes at %s, has %s instances and can support %s largest tasks'
              % (clusterName, cluster_min_size, cluster_desired_size,
                 cluster_max_size, num_cluster_instances, largest_task_support_count))
        # Check that we don't already need to scale
        # Also check for too many hosts and too much capacity
        if consider_scaling != 1:
            if (cluster_desired_size + 1) > cluster_instance_count > cluster_min_size:
                print('Cluster (%s) currently has %s instances, with a minimum cluster size of %s' %
                      (clusterName, cluster_instance_count, cluster_min_size))
                if largest_task_support_count > 2:
                    consider_scaling = -1
                    print('Cluster (%s) can support %s of the largest tasks; consider scaling in'
                          % (clusterName, largest_task_support_count))
        print('The current scaling metric would be set to %s' % consider_scaling)
        # publish the decision; despite the name, the value also encodes
        # scale-in (-1) and no-op (0), not just scale-out (1)
        cw.put_metric_data(Namespace='ECS',
                           MetricData=[{
                               'MetricName': 'NeedsToScaleOut',
                               'Dimensions': [{
                                   'Name': 'ClusterName',
                                   'Value': clusterName
                               }],
                               'Timestamp': datetime.datetime.utcnow(),
                               'Value': consider_scaling
                           }])
        print('Metric was sent to CloudWatch')
    return {}
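
# Sketch of how the published metric might be consumed: a CloudWatch alarm on
# ECS/NeedsToScaleOut wired to an Auto Scaling policy. The function, alarm
# naming scheme, and policy ARN below are placeholders, not part of this project.
def create_scale_out_alarm(cw, cluster_name, scaling_policy_arn):
    """Create an alarm that fires when the metric asks the cluster to scale out."""
    cw.put_metric_alarm(
        AlarmName='%s-needs-to-scale-out' % cluster_name,  # hypothetical naming scheme
        Namespace='ECS',
        MetricName='NeedsToScaleOut',
        Dimensions=[{'Name': 'ClusterName', 'Value': cluster_name}],
        Statistic='Average',
        Period=300,
        EvaluationPeriods=1,
        Threshold=0.5,
        ComparisonOperator='GreaterThanOrEqualToThreshold',
        AlarmActions=[scaling_policy_arn],  # e.g. an ASG step-scaling policy ARN
    )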