From 88574e399b239ee774fa14d08bc3c7c47babe6a4 Mon Sep 17 00:00:00 2001 From: Harry Date: Thu, 8 Aug 2024 13:45:35 -0700 Subject: [PATCH] Check if sample app is up (#162) *Issue description:* After removing the public endpoints, we removed the logic for checking if the sample application is up since we are unable to ping the sample app through the workflow. However, recently there have been increasing failures in EC2s due to the sample application not generating telemetry. This PR aims to resolve this issue by adding the logic back. *Description of changes:* Check if the sample application is up after building it in Terraform. If the sample application is unavailable, then throw exit 1 to indicate that the Terraform Deployment was a failure. This will trigger the e2e workflow to re-instantiate another EC2 instance and deploy the sample application again. For ASGs, since the ASG creates the EC2 instance we need a different way to check if the sample application is available or not since throwing exit 1 will not cause a failure. Will push this PR first then apply implementation for ASG if we still see a noticeable improvement. *Rollback procedure:* Revert PR Test run: - [Run](https://github.com/aws-observability/aws-application-signals-test-framework/actions/runs/10291998960) to check if EC2s will pass after new implementation: - [Run](https://github.com/aws-observability/aws-application-signals-test-framework/actions/runs/10291376482/job/28483485792) to check if Terraform will fail and retry if sample application is not available (Did this by changing the port to an invalid one) Test run after updating: https://github.com/aws-observability/aws-application-signals-test-framework/actions/runs/10292445294/job/28486870614 Final Test run: https://github.com/aws-observability/aws-application-signals-test-framework/actions/runs/10309116932/job/28538036319 By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 
--- terraform/java/ec2/asg/main.tf | 16 +++ terraform/java/ec2/default/main.tf | 30 +++++ terraform/python/ec2/asg/main.tf | 16 +++ terraform/python/ec2/default/main.tf | 157 +++++++++++++++++---------- 4 files changed, 160 insertions(+), 59 deletions(-) diff --git a/terraform/java/ec2/asg/main.tf b/terraform/java/ec2/asg/main.tf index 41849a03c..775fa263d 100644 --- a/terraform/java/ec2/asg/main.tf +++ b/terraform/java/ec2/asg/main.tf @@ -152,6 +152,7 @@ resource "aws_autoscaling_group" "asg" { max_size = 1 launch_configuration = aws_launch_configuration.launch_configuration.name vpc_zone_identifier = [data.aws_subnets.default_subnets.ids.0] + health_check_type = "EC2" } resource "aws_instance" "remote_service_instance" { @@ -215,6 +216,21 @@ resource "null_resource" "remote_service_setup" { # The application needs time to come up and reach a steady state, this should not take longer than 30 seconds sleep 30 + # Check if the application is up. If it is not up, then exit 1. + attempt_counter=0 + max_attempts=30 + until $(curl --output /dev/null --silent --head --fail --max-time 5 $(echo "http://localhost:8080/healthcheck" | tr -d '"')); do + if [ $attempt_counter -eq $max_attempts ];then + echo "Failed to connect to endpoint." + exit 1 + fi + echo "Attempting to connect to the remote endpoint. Tried $attempt_counter out of $max_attempts" + attempt_counter=$(($attempt_counter+1)) + sleep 10 + done + + echo "Successfully connected to remote endpoint" + EOF ] } diff --git a/terraform/java/ec2/default/main.tf b/terraform/java/ec2/default/main.tf index ffdc38b58..ba667e631 100644 --- a/terraform/java/ec2/default/main.tf +++ b/terraform/java/ec2/default/main.tf @@ -138,6 +138,21 @@ resource "null_resource" "main_service_setup" { # The application needs time to come up and reach a steady state, this should not take longer than 30 seconds sleep 30 + # Check if the application is up. If it is not up, then exit 1. 
+ attempt_counter=0 + max_attempts=30 + until $(curl --output /dev/null --silent --head --fail --max-time 5 $(echo "http://localhost:8080" | tr -d '"')); do + if [ $attempt_counter -eq $max_attempts ];then + echo "Failed to connect to endpoint." + exit 1 + fi + echo "Attempting to connect to the main endpoint. Tried $attempt_counter out of $max_attempts" + attempt_counter=$(($attempt_counter+1)) + sleep 10 + done + + echo "Successfully connected to main endpoint" + EOF ] } @@ -206,6 +221,21 @@ resource "null_resource" "remote_service_setup" { # The application needs time to come up and reach a steady state, this should not take longer than 30 seconds sleep 30 + # Check if the application is up. If it is not up, then exit 1. + attempt_counter=0 + max_attempts=30 + until $(curl --output /dev/null --silent --head --fail --max-time 5 $(echo "http://localhost:8080/healthcheck" | tr -d '"')); do + if [ $attempt_counter -eq $max_attempts ];then + echo "Failed to connect to endpoint." + exit 1 + fi + echo "Attempting to connect to the remote endpoint. Tried $attempt_counter out of $max_attempts" + attempt_counter=$(($attempt_counter+1)) + sleep 10 + done + + echo "Successfully connected to remote endpoint" + EOF ] } diff --git a/terraform/python/ec2/asg/main.tf b/terraform/python/ec2/asg/main.tf index 96dc8dd0c..bc40a0802 100644 --- a/terraform/python/ec2/asg/main.tf +++ b/terraform/python/ec2/asg/main.tf @@ -166,6 +166,7 @@ resource "aws_autoscaling_group" "asg" { max_size = 1 launch_configuration = aws_launch_configuration.launch_configuration.name vpc_zone_identifier = [data.aws_subnets.default_subnets.ids.0] + health_check_type = "EC2" } resource "aws_instance" "remote_service_instance" { @@ -241,6 +242,21 @@ resource "null_resource" "remote_service_setup" { # The application needs time to come up and reach a steady state, this should not take longer than 30 seconds sleep 30 + # Check if the application is up. If it is not up, then exit 1. 
+ attempt_counter=0 + max_attempts=30 + until $(curl --output /dev/null --silent --head --fail --max-time 5 $(echo "http://localhost:8001/healthcheck" | tr -d '"')); do + if [ $attempt_counter -eq $max_attempts ];then + echo "Failed to connect to endpoint." + exit 1 + fi + echo "Attempting to connect to the remote endpoint. Tried $attempt_counter out of $max_attempts" + attempt_counter=$(($attempt_counter+1)) + sleep 10 + done + + echo "Successfully connected to remote endpoint" + EOF ] } diff --git a/terraform/python/ec2/default/main.tf b/terraform/python/ec2/default/main.tf index 31446114d..8e2ddf821 100644 --- a/terraform/python/ec2/default/main.tf +++ b/terraform/python/ec2/default/main.tf @@ -104,48 +104,68 @@ resource "null_resource" "main_service_setup" { provisioner "remote-exec" { inline = [ + <<-EOF + #!/bin/bash + # Install Python and wget - "sudo yum install wget -y", - "sudo yum install unzip -y", - "sudo dnf install -y python3.9", - "sudo dnf install -y python3.9-pip", + sudo yum install wget -y + sudo yum install unzip -y + sudo dnf install -y python3.9 + sudo dnf install -y python3.9-pip # Copy in CW Agent configuration - "agent_config='${replace(replace(file("./amazon-cloudwatch-agent.json"), "/\\s+/", ""), "$REGION", var.aws_region)}'", - "echo $agent_config > amazon-cloudwatch-agent.json", + agent_config='${replace(replace(file("./amazon-cloudwatch-agent.json"), "/\\s+/", ""), "$REGION", var.aws_region)}' + echo $agent_config > amazon-cloudwatch-agent.json # Get and run CW agent rpm - "${var.get_cw_agent_rpm_command}", - "sudo rpm -U ./cw-agent.rpm", - "sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:./amazon-cloudwatch-agent.json", + ${var.get_cw_agent_rpm_command} + sudo rpm -U ./cw-agent.rpm + sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:./amazon-cloudwatch-agent.json # Get ADOT Wheel and install it - 
"${var.get_adot_wheel_command}", + ${var.get_adot_wheel_command} # Get and run the sample application with configuration - "aws s3 cp ${var.sample_app_zip} ./python-sample-app.zip", - "unzip -o python-sample-app.zip", + aws s3 cp ${var.sample_app_zip} ./python-sample-app.zip + unzip -o python-sample-app.zip # Export environment variables for instrumentation - "cd ./django_frontend_service", - "python3.9 -m pip install -r ec2-requirements.txt", - "export DJANGO_SETTINGS_MODULE=\"django_frontend_service.settings\"", - "export OTEL_PYTHON_DISTRO=\"aws_distro\"", - "export OTEL_PYTHON_CONFIGURATOR=\"aws_configurator\"", - "export OTEL_METRICS_EXPORTER=none", - "export OTEL_TRACES_EXPORTER=otlp", - "export OTEL_AWS_APPLICATION_SIGNALS_ENABLED=true", - "export OTEL_AWS_APPLICATION_SIGNALS_EXPORTER_ENDPOINT=http://localhost:4315", - "export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4315", - "export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=grpc", - "export OTEL_EXPORTER_OTLP_METRICS_PROTOCOL=grpc", - "export OTEL_SERVICE_NAME=python-sample-application-${var.test_id}", - "export OTEL_TRACES_SAMPLER=always_on", - "python3.9 manage.py migrate", - "nohup opentelemetry-instrument python3.9 manage.py runserver 0.0.0.0:8000 --noreload &", + cd ./django_frontend_service + python3.9 -m pip install -r ec2-requirements.txt + export DJANGO_SETTINGS_MODULE="django_frontend_service.settings" + export OTEL_PYTHON_DISTRO="aws_distro" + export OTEL_PYTHON_CONFIGURATOR="aws_configurator" + export OTEL_METRICS_EXPORTER=none + export OTEL_TRACES_EXPORTER=otlp + export OTEL_AWS_APPLICATION_SIGNALS_ENABLED=true + export OTEL_AWS_APPLICATION_SIGNALS_EXPORTER_ENDPOINT=http://localhost:4315 + export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4315 + export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=grpc + export OTEL_EXPORTER_OTLP_METRICS_PROTOCOL=grpc + export OTEL_SERVICE_NAME=python-sample-application-${var.test_id} + export OTEL_TRACES_SAMPLER=always_on + python3.9 manage.py migrate + 
nohup opentelemetry-instrument python3.9 manage.py runserver 0.0.0.0:8000 --noreload & # The application needs time to come up and reach a steady state, this should not take longer than 30 seconds - "sleep 30" + sleep 30 + + # Check if the application is up. If it is not up, then exit 1. + attempt_counter=0 + max_attempts=30 + until $(curl --output /dev/null --silent --head --fail --max-time 5 $(echo "http://localhost:8000" | tr -d '"')); do + if [ $attempt_counter -eq $max_attempts ];then + echo "Failed to connect to endpoint." + exit 1 + fi + echo "Attempting to connect to the main endpoint. Tried $attempt_counter out of $max_attempts" + attempt_counter=$(($attempt_counter+1)) + sleep 10 + done + + echo "Successfully connected to main endpoint" + + EOF ] } @@ -179,49 +199,68 @@ resource "null_resource" "remote_service_setup" { provisioner "remote-exec" { inline = [ + <<-EOF + #!/bin/bash + # Install Python and wget - "sudo yum install wget -y", - "sudo yum install unzip -y", - "sudo dnf install -y python3.9", - "sudo dnf install -y python3.9-pip", + sudo yum install wget -y + sudo yum install unzip -y + sudo dnf install -y python3.9 + sudo dnf install -y python3.9-pip # Copy in CW Agent configuration - "agent_config='${replace(replace(file("./amazon-cloudwatch-agent.json"), "/\\s+/", ""), "$REGION", var.aws_region)}'", - "echo $agent_config > amazon-cloudwatch-agent.json", + agent_config='${replace(replace(file("./amazon-cloudwatch-agent.json"), "/\\s+/", ""), "$REGION", var.aws_region)}' + echo $agent_config > amazon-cloudwatch-agent.json # Get and run CW agent rpm - "${var.get_cw_agent_rpm_command}", - "sudo rpm -U ./cw-agent.rpm", - "sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:./amazon-cloudwatch-agent.json", + ${var.get_cw_agent_rpm_command} + sudo rpm -U ./cw-agent.rpm + sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c 
file:./amazon-cloudwatch-agent.json # Get ADOT Wheel and install it - "${var.get_adot_wheel_command}", + ${var.get_adot_wheel_command} # Get and run the sample application with configuration - "aws s3 cp ${var.sample_app_zip} ./python-sample-app.zip", - "unzip -o python-sample-app.zip", + aws s3 cp ${var.sample_app_zip} ./python-sample-app.zip + unzip -o python-sample-app.zip # Export environment variables for instrumentation - "cd ./django_remote_service", - "export DJANGO_SETTINGS_MODULE=\"django_remote_service.settings\"", - "python3.9 -m pip install -r requirements.txt --force-reinstall", - "export OTEL_PYTHON_DISTRO=\"aws_distro\"", - "export OTEL_PYTHON_CONFIGURATOR=\"aws_configurator\"", - "export OTEL_METRICS_EXPORTER=none", - "export OTEL_TRACES_EXPORTER=otlp", - "export OTEL_AWS_APPLICATION_SIGNALS_ENABLED=true", - "export OTEL_AWS_APPLICATION_SIGNALS_EXPORTER_ENDPOINT=http://localhost:4315", - "export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4315", - "export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=grpc", - "export OTEL_EXPORTER_OTLP_METRICS_PROTOCOL=grpc", - "export OTEL_SERVICE_NAME=python-sample-remote-application-${var.test_id}", - "export OTEL_TRACES_SAMPLER=always_on", - "python3.9 manage.py migrate", - "nohup opentelemetry-instrument python3.9 manage.py runserver 0.0.0.0:8001 --noreload &", - + cd ./django_remote_service + export DJANGO_SETTINGS_MODULE="django_remote_service.settings" + python3.9 -m pip install -r requirements.txt --force-reinstall + export OTEL_PYTHON_DISTRO="aws_distro" + export OTEL_PYTHON_CONFIGURATOR="aws_configurator" + export OTEL_METRICS_EXPORTER=none + export OTEL_TRACES_EXPORTER=otlp + export OTEL_AWS_APPLICATION_SIGNALS_ENABLED=true + export OTEL_AWS_APPLICATION_SIGNALS_EXPORTER_ENDPOINT=http://localhost:4315 + export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4315 + export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=grpc + export OTEL_EXPORTER_OTLP_METRICS_PROTOCOL=grpc + export 
OTEL_SERVICE_NAME=python-sample-remote-application-${var.test_id} + export OTEL_TRACES_SAMPLER=always_on + python3.9 manage.py migrate + nohup opentelemetry-instrument python3.9 manage.py runserver 0.0.0.0:8001 --noreload & # The application needs time to come up and reach a steady state, this should not take longer than 30 seconds - "sleep 30" + sleep 30 + + # Check if the application is up. If it is not up, then exit 1. + attempt_counter=0 + max_attempts=30 + until $(curl --output /dev/null --silent --head --fail --max-time 5 $(echo "http://localhost:8001/healthcheck" | tr -d '"')); do + if [ $attempt_counter -eq $max_attempts ];then + echo "Failed to connect to endpoint." + exit 1 + fi + echo "Attempting to connect to the remote endpoint. Tried $attempt_counter out of $max_attempts" + attempt_counter=$(($attempt_counter+1)) + sleep 10 + done + + echo "Successfully connected to remote endpoint" + + EOF ] }