Skip to content

Commit

Permalink
Add deleting stale autoscaling group to cleanup process (#137)
Browse files Browse the repository at this point in the history
*Issue description:*
Canaries may exit unexpectedly with auto-scaling groups for EC2 ASG not
being deleted properly.

*Description of changes:*

This PR scans ASGs that stay longer than 3 hours, and delete them if
they are not EKS node groups. Also update gitignore file to ignore
metadata files on Mac OS, and terraform temporary files.

*Ensure you've run the following tests on your changes and include the
link below:*
To do so, create a `test.yml` file with `name: Test` and workflow
description to test your changes, then remove the file for your PR. Link
your test run in your PR description. This process is a short term
solution while we work on creating a staging environment for testing.

NOTE: TESTS RUNNING ON A SINGLE EKS CLUSTER CANNOT BE RUN IN PARALLEL.
See the
[needs](https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idneeds)
keyword to run tests in succession.
- Run Java EKS on `e2e-playground` in us-east-1 and eu-central-2
- Run Python EKS on `e2e-playground` in us-east-1 and eu-central-2
- Run metric limiter on EKS cluster `e2e-playground` in us-east-1 and
eu-central-2
- Run EC2 tests in all regions
- Run K8s on a separate K8s cluster (check IAD test account for master
node endpoints; these will change as we create and destroy clusters for
OS patching)

By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.
  • Loading branch information
bjrara authored Jul 29, 2024
1 parent de02191 commit 827ac08
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 25 deletions.
108 changes: 86 additions & 22 deletions .github/workflows/util/clean/ec2_instance_cleanup/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,74 @@
# Create an EC2 client
session = boto3.Session()
ec2 = session.client('ec2')
autoscaling = session.client('autoscaling')

# configure logging
logging.basicConfig(level=logging.INFO)

def _get_autoscaling_groups_to_delete():
logging.info("Start scanning autoscaling group...")

current_time = datetime.now(timezone.utc)
time_threshold = current_time - timedelta(hours=3)
groups_to_delete = []

# Initialize the paginator
paginator = autoscaling.get_paginator('describe_auto_scaling_groups')

# Iterate through each page of results
for page in paginator.paginate():
auto_scaling_groups = page['AutoScalingGroups']
for asg in auto_scaling_groups:
asg_name = asg['AutoScalingGroupName']
tags = asg['Tags']

eks_tag_present = any(tag['Key'] == 'eks:cluster-name' for tag in tags)
if eks_tag_present:
logging.info(f"Skipping autoscaling group with 'eks:cluster-name' tag: {asg_name}.")
continue

if not _is_active(asg):
logging.info(f"Skipping autoscaling group {asg_name} with terminating instances.")
continue

logging.info(f"autoscaling group {asg_name} is active.")

creation_time = asg['CreatedTime']
if creation_time < time_threshold:
print(f"Autoscaling group: {asg_name} will be deleted.")
groups_to_delete.append(asg)

logging.info(f"{len(groups_to_delete)} autoscaling groups are active for more than 3 hours.")

return groups_to_delete


def _delete_autoscaling_groups(auto_scaling_groups):
for asg in auto_scaling_groups:
try:
asg_name = asg['AutoScalingGroupName']
response = autoscaling.delete_auto_scaling_group(AutoScalingGroupName=asg_name, ForceDelete=True)
logging.info("===== Response for delete autoscaling group request =====")
logging.info(response)
except Exception as e:
logging.info(f"Error terminating instances: {e}")

def _is_active(asg):
for instance in asg['Instances']:
if instance['LifecycleState'] in [
'Terminating', 'Terminating:Wait', 'Terminating:Proceed'
]:
return False
return True


def _get_instances_to_terminate():
# Get all the running instances
logging.info("Getting all running instances")
logging.info("Start scanning instances")
running_filter = [{'Name': 'instance-state-name', 'Values': [INSTANCE_STATE_RUNNING]}]
running_instances = _get_all_instances_by_filter(filters=running_filter)
logging.info(f"Currently {len(running_instances)} are running.")
logging.info(f"{len(running_instances)} instances are running.")

# Filter instances that have been running for more than 3 hours
logging.info("Filtering instances that have been running for more than 3 hours")
Expand All @@ -42,10 +99,13 @@ def _get_instances_to_terminate():
logging.info("Filtering instances that should not be terminated based on conditions")
instances_to_terminate = []
for instance in instances_running_more_than_3hrs:
if (not _is_eks_cluster_instance(instance)
and not _is_k8s_cluster_instance(instance)
and not _is_tagged_do_not_delete(instance)):
instances_to_terminate.append(instance)
if (not _is_k8s_cluster_instance(instance) and not _is_tagged_do_not_delete(instance)):
group_name = _get_associated_autoscaling_group_name(instance)
if group_name != None:
logging.info(f"Instance {instance['InstanceId']} is associated with autoscaling group {group_name}, skip the termination.")
else:
instances_to_terminate.append(instance)

logging.info(f"{len(instances_to_terminate)} instances will be terminated.")

return instances_to_terminate
Expand All @@ -70,13 +130,6 @@ def _get_all_instances_by_filter(filters: List[dict]):
return filtered_instances


def _is_eks_cluster_instance(instance):
security_groups = instance.get('SecurityGroups', [])
if any(group['GroupName'].startswith(EKS_CLUSTER_SECURITY_GROUP_PREFIX) for group in security_groups):
return True
return False


def _is_k8s_cluster_instance(instance):
tags = instance.get('Tags', [])
if 'Name' in tags and tags['Name'].startswith(K8S_INSTANCE_NAME_PREFIX):
Expand All @@ -92,12 +145,21 @@ def _is_tagged_do_not_delete(instance):
return True
return False


def _prepare_report_and_upload(instances_to_terminate) -> bool:
json_data = json.dumps(instances_to_terminate, default=str)
def _get_associated_autoscaling_group_name(instance):
tags = instance.get('Tags', [])
asg_tag = next((tag for tag in tags if tag['Key'] == 'aws:autoscaling:groupName'), None)
if asg_tag is None:
return None
return asg_tag['Value']

def _prepare_report_and_upload(groups_to_delete, instances_to_terminate) -> bool:
json_data = json.dumps({
"autoscalingGroups": groups_to_delete,
"standaloneInstances": instances_to_terminate
}, default=str)
# save as a json file with timestamp
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
filename = f"report-instances-to-terminate-{timestamp}.json"
filename = f"report-resources-to-clean-{timestamp}.json"
with open(filename, "w") as f:
f.write(json_data)

Expand All @@ -116,24 +178,26 @@ def _prepare_report_and_upload(instances_to_terminate) -> bool:
def _terminate_instances(instances_to_terminate):
# Terminate the instances
instance_ids = [instance['InstanceId'] for instance in instances]
logging.info("Number of instances terminating: " + str(len(instance_ids)))
try:
response = ec2.terminate_instances(InstanceIds=instance_ids)
logging.info("===== Response for terminate request =====")
logging.info("===== Response for terminate instances request =====")
logging.info(response)
except Exception as e:
logging.info(f"Error terminating instances: {e}")


if __name__ == '__main__':
groups = _get_autoscaling_groups_to_delete()
instances = _get_instances_to_terminate()
if len(instances) == 0:
logging.info("No instances to terminate")

if len(groups) == 0 and len(instances) == 0:
logging.info("No resource to terminate")
exit(0)

report_successful = _prepare_report_and_upload(instances)
report_successful = _prepare_report_and_upload(groups, instances)
if not report_successful:
logging.error("Failed to prepare report and upload. Aborting termination of instances.")
exit(1)

_delete_autoscaling_groups(groups)
_terminate_instances(instances)
4 changes: 1 addition & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
.DS_Store
.idea
# Ignore Gradle project-specific cache directory
.gradle

# Ignore Gradle build output directory
build

# Ignore the resource cleanup reports
**/report-*.json
2 changes: 2 additions & 0 deletions terraform/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.terraform*
terraform.tfstate*

0 comments on commit 827ac08

Please sign in to comment.