diff --git a/manifests/modules/troubleshooting/pod/crash/.workshop/cleanup.sh b/manifests/modules/troubleshooting/pod/crash/.workshop/cleanup.sh new file mode 100644 index 000000000..403d252cf --- /dev/null +++ b/manifests/modules/troubleshooting/pod/crash/.workshop/cleanup.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +if kubectl get deployment efs-app -n default > /dev/null 2>&1; then + kubectl delete deployment efs-app -n default +else + echo "Deployment efs-app does not exist." +fi + +if kubectl get pvc efs-claim -n default > /dev/null 2>&1; then + kubectl delete pvc efs-claim -n default +else + echo "PVC efs-claim does not exist." +fi +PV_NAME=$(kubectl get pv -o jsonpath='{.items[?(@.spec.claimRef.name=="efs-claim")].metadata.name}') +if [ -n "$PV_NAME" ]; then + kubectl delete pv "$PV_NAME" +else + echo "No PV associated with efs-claim." +fi + +if kubectl get storageclass efs-sc > /dev/null 2>&1; then + kubectl delete storageclass efs-sc +else + echo "Storage class efs-sc does not exist." +fi diff --git a/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/deployment.yaml.tpl b/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/deployment.yaml.tpl new file mode 100644 index 000000000..609c1f1a4 --- /dev/null +++ b/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/deployment.yaml.tpl @@ -0,0 +1,51 @@ +--- +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: efs-sc +provisioner: efs.csi.aws.com +parameters: + provisioningMode: efs-ap + fileSystemId: ${filesystemid} + directoryPerms: "700" +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: efs-claim +spec: + accessModes: + - ReadWriteMany + storageClassName: efs-sc + resources: + requests: + storage: 20Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: efs-app + labels: + app: efs-app +spec: + replicas: 1 + selector: + matchLabels: + app: efs-app + template: + metadata: + labels: + app: efs-app + spec: + containers: + - name: app + image: centos + command: ["/bin/sh"] + args: ["-c", "while true; do echo $(date -u) >> /example/out.txt; sleep 5; done"] + volumeMounts: + - name: persistent-storage + mountPath: /example + volumes: + - name: persistent-storage + persistentVolumeClaim: + claimName: efs-claim \ No newline at end of file diff --git a/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/main.tf b/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/main.tf new file mode 100644 index 000000000..5237da270 --- /dev/null +++ b/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/main.tf @@ -0,0 +1,124 @@ +terraform { + required_providers { + # kubectl = { + # source = "gavinbunney/kubectl" + # version = ">= 1.14" + # } + } +} + +provider "aws" { + region = "us-east-1" + alias = "virginia" +} + +data "aws_caller_identity" "current" {} + +locals { + account_id = data.aws_caller_identity.current.account_id +} + +data "aws_vpc" "selected" { + tags = { + created-by = "eks-workshop-v2" + env = var.addon_context.eks_cluster_id + } +} + +data "aws_subnets" "public" { + tags = { + created-by = "eks-workshop-v2" + env = var.addon_context.eks_cluster_id + } + + filter { + name = "tag:Name" + values = ["*Public*"] + } +} + +/* +data "aws_eks_cluster" "cluster" { + name = var.eks_cluster_id +} +*/ + +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "1.16.2" + + enable_aws_efs_csi_driver = true + aws_efs_csi_driver = { + wait = true + } + + cluster_name = var.addon_context.eks_cluster_id + 
cluster_endpoint = var.addon_context.aws_eks_cluster_endpoint + cluster_version = var.eks_cluster_version + oidc_provider_arn = var.addon_context.eks_oidc_provider_arn +} + + +resource "aws_efs_file_system" "efs" { + tags = { + Name = "eks-workshop-efs" + } +} + +resource "aws_efs_mount_target" "mount_targets" { + for_each = toset(data.aws_subnets.public.ids) + file_system_id = resource.aws_efs_file_system.efs.id + subnet_id = each.value + security_groups = [resource.aws_security_group.efs_sg.id] +} + +resource "aws_security_group" "efs_sg" { + name = "efs_sg" + description = "Allow tarffic to efs" + vpc_id = data.aws_vpc.selected.id + + tags = { + Name = "efs_sg" + } +} + +resource "aws_vpc_security_group_ingress_rule" "allow_ipv4" { + security_group_id = aws_security_group.efs_sg.id + cidr_ipv4 = data.aws_vpc.selected.cidr_block + from_port = 80 + ip_protocol = "tcp" + to_port = 80 +} + + +resource "aws_vpc_security_group_egress_rule" "allow_all_traffic_ipv4" { + security_group_id = aws_security_group.efs_sg.id + cidr_ipv4 = "0.0.0.0/0" + ip_protocol = "-1" # semantically equivalent to all ports +} + +data "template_file" "deployment_yaml" { + template = file("/home/ec2-user/environment/eks-workshop/modules/troubleshooting/pod/crash/.workshop/terraform/deployment.yaml.tpl") + + vars = { + filesystemid = resource.aws_efs_file_system.efs.id + } +} + +resource "local_file" "deployment_yaml" { + filename = "/home/ec2-user/environment/eks-workshop/modules/troubleshooting/pod/crash/deployment.yaml" + content = data.template_file.deployment_yaml.rendered +} + +resource "null_resource" "kustomize_app" { + triggers = { + always_run = timestamp() + } + + provisioner "local-exec" { + command = "kubectl apply -f ~/environment/eks-workshop/modules/troubleshooting/pod/crash/" + when = create + } + + depends_on = [resource.local_file.deployment_yaml, resource.aws_efs_file_system.efs] +} \ No newline at end of file diff --git a/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/outputs.tf b/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/outputs.tf new file mode 100644 index 000000000..d50ab0d36 --- /dev/null +++ b/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/outputs.tf @@ -0,0 +1,14 @@ +output "account_id" { + value = local.account_id + description = "account id env variable" +} + +output "environment_variables" { + description = "Environment variables to be added to the IDE shell" + value = merge({ + VPC_ID = data.aws_vpc.selected.id + }, { + for index, id in data.aws_subnets.public.ids : "PUBLIC_SUBNET_${index + 1}" => id + } + ) +} \ No newline at end of file diff --git a/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/vars.tf b/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/vars.tf new file mode 100644 index 000000000..7e0b2f7ab --- /dev/null +++ b/manifests/modules/troubleshooting/pod/crash/.workshop/terraform/vars.tf @@ -0,0 +1,36 @@ +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_id" { + description = "EKS cluster name" + type = string + default = "eks-workshop" +} + +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_version" { + description = "EKS cluster version" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "cluster_security_group_id" { + description = "EKS cluster security group ID" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "addon_context" { + description = "Addon context that can be passed directly to 
blueprints addon modules" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "tags" { + description = "Tags to apply to AWS resources" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "resources_precreated" { + description = "Have expensive resources been created already" + type = bool +} \ No newline at end of file diff --git a/manifests/modules/troubleshooting/pod/image/.workshop/cleanup.sh b/manifests/modules/troubleshooting/pod/image/.workshop/cleanup.sh new file mode 100644 index 000000000..37769eb30 --- /dev/null +++ b/manifests/modules/troubleshooting/pod/image/.workshop/cleanup.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +if kubectl get deployment ui-new -n default > /dev/null 2>&1; then + kubectl delete deploy ui-new -n default +else + echo "delpoyment ui-new does not exist" +fi diff --git a/manifests/modules/troubleshooting/pod/image/.workshop/terraform/main.tf b/manifests/modules/troubleshooting/pod/image/.workshop/terraform/main.tf new file mode 100644 index 000000000..cd46cf5a1 --- /dev/null +++ b/manifests/modules/troubleshooting/pod/image/.workshop/terraform/main.tf @@ -0,0 +1,32 @@ +terraform { + required_providers { + # kubectl = { + # source = "gavinbunney/kubectl" + # version = ">= 1.14" + # } + } +} + +provider "aws" { + region = "us-east-1" + alias = "virginia" +} + +data "aws_caller_identity" "current" {} + +locals { + account_id = data.aws_caller_identity.current.account_id +} + + +resource "null_resource" "kustomize_app" { + triggers = { + always_run = timestamp() + } + + provisioner "local-exec" { + command = "kubectl apply -f ~/environment/eks-workshop/modules/troubleshooting/pod/image/" + when = create + } +} + diff --git a/manifests/modules/troubleshooting/pod/image/.workshop/terraform/outputs.tf b/manifests/modules/troubleshooting/pod/image/.workshop/terraform/outputs.tf new file mode 100644 index 000000000..269e7a3b0 --- /dev/null +++ b/manifests/modules/troubleshooting/pod/image/.workshop/terraform/outputs.tf @@ -0,0 +1,4 @@ +output "account_id" { + value = local.account_id + description = "account id env variable" +} \ No newline at end of file diff --git a/manifests/modules/troubleshooting/pod/image/.workshop/terraform/vars.tf b/manifests/modules/troubleshooting/pod/image/.workshop/terraform/vars.tf new file mode 100644 index 000000000..7e0b2f7ab --- /dev/null +++ b/manifests/modules/troubleshooting/pod/image/.workshop/terraform/vars.tf @@ -0,0 +1,36 @@ +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_id" { + description = "EKS cluster name" + type = string + default = "eks-workshop" +} + +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_version" { + description = "EKS cluster version" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "cluster_security_group_id" { + description = "EKS cluster security group ID" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "addon_context" { + description = "Addon context that can be passed directly to blueprints addon modules" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "tags" { + description = "Tags to apply to AWS resources" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "resources_precreated" { + description = "Have expensive resources been created already" + type = bool +} \ No newline at end of file diff --git a/manifests/modules/troubleshooting/pod/image/deployment.yaml 
b/manifests/modules/troubleshooting/pod/image/deployment.yaml new file mode 100644 index 000000000..7e07e3ef8 --- /dev/null +++ b/manifests/modules/troubleshooting/pod/image/deployment.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ui-new + labels: + app: app-new +spec: + replicas: 1 + selector: + matchLabels: + app: app-new + template: + metadata: + annotations: + prometheus.io/path: /actuator/prometheus + prometheus.io/port: "8080" + prometheus.io/scrape: "true" + labels: + app: app-new + spec: + securityContext: + fsGroup: 1000 + containers: + - name: ui + env: + - name: JAVA_OPTS + value: -XX:MaxRAMPercentage=75.0 -Djava.security.egd=file:/dev/urandom + securityContext: + capabilities: + add: + - NET_BIND_SERVICE + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + image: "public.ecr.aws/aws-containers/retailing-store-sample-ui:0.4.0" + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /actuator/health/liveness + port: 8080 + initialDelaySeconds: 45 + periodSeconds: 20 + resources: + limits: + memory: 1.5Gi + requests: + cpu: 250m + memory: 1.5Gi + volumeMounts: + - mountPath: /tmp + name: tmp-volume + volumes: + - name: tmp-volume + emptyDir: + medium: Memory diff --git a/manifests/modules/troubleshooting/pod/permissions/.workshop/cleanup.sh b/manifests/modules/troubleshooting/pod/permissions/.workshop/cleanup.sh new file mode 100644 index 000000000..068f0e83f --- /dev/null +++ b/manifests/modules/troubleshooting/pod/permissions/.workshop/cleanup.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +if kubectl get deployment ui-private -n default > /dev/null 2>&1; then + kubectl delete deploy ui-private -n default +else + echo "delpoyment ui-private does not exist" +fi + diff --git a/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/deployment.yaml.tpl b/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/deployment.yaml.tpl new file mode 100644 index 000000000..906b59f34 --- /dev/null +++ b/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/deployment.yaml.tpl @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ui-private + labels: + app: app-private +spec: + replicas: 1 + selector: + matchLabels: + app: app-private + template: + metadata: + annotations: + prometheus.io/path: /actuator/prometheus + prometheus.io/port: "8080" + prometheus.io/scrape: "true" + labels: + app: app-private + spec: + securityContext: + fsGroup: 1000 + containers: + - name: ui + env: + - name: JAVA_OPTS + value: -XX:MaxRAMPercentage=75.0 -Djava.security.egd=file:/dev/urandom + securityContext: + capabilities: + add: + - NET_BIND_SERVICE + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 1000 + image: ${image} + imagePullPolicy: Always + ports: + - name: http + containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /actuator/health/liveness + port: 8080 + initialDelaySeconds: 45 + periodSeconds: 20 + resources: + limits: + memory: 1.5Gi + requests: + cpu: 250m + memory: 1.5Gi + volumeMounts: + - mountPath: /tmp + name: tmp-volume + volumes: + - name: tmp-volume + emptyDir: + medium: Memory \ No newline at end of file diff --git a/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/main.tf b/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/main.tf new file mode 100644 index 000000000..b8aa2c22a --- /dev/null +++ 
b/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/main.tf @@ -0,0 +1,187 @@ +terraform { + required_providers { + # kubectl = { + # source = "gavinbunney/kubectl" + # version = ">= 1.14" + # } + } +} + +provider "aws" { + region = "us-east-1" + alias = "virginia" +} + +data "aws_caller_identity" "current" {} + +locals { + account_id = data.aws_caller_identity.current.account_id +} + +data "aws_region" "current" {} + +data "aws_eks_cluster" "cluster" { + name = var.eks_cluster_id +} + +/* +data "aws_vpc" "selected" { + tags = { + created-by = "eks-workshop-v2" + env = var.addon_context.eks_cluster_id + } +} +*/ + +data "aws_eks_node_group" "default" { + cluster_name = data.aws_eks_cluster.cluster.id + node_group_name = "default" +} + +data "aws_ssm_parameter" "eks_ami" { + name = "/aws/service/eks/optimized-ami/${var.eks_cluster_version}/amazon-linux-2/recommended/image_id" +} + +data "aws_subnets" "selected" { + tags = { + env = var.addon_context.eks_cluster_id + } +} + +resource "aws_iam_role" "ecr_ec2_role" { + name = "ecr_ec2_role" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = "ec2.amazonaws.com" + } + Action = "sts:AssumeRole" + }, + ] + }) + managed_policy_arns = ["arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess", "arn:aws:iam::aws:policy/AmazonElasticContainerRegistryPublicPowerUser"] +} + +resource "aws_iam_instance_profile" "ecr_ec2" { + name = "ecr_ec2" + role = resource.aws_iam_role.ecr_ec2_role.name +} + +resource "aws_instance" "ui_to_ecr" { + ami = data.aws_ssm_parameter.eks_ami.value + instance_type = "t3.medium" + user_data = <<-EOF + #!/bin/bash + sudo yum update -y + sudo amazon-linux-extras install docker + sudo service docker start + sudo usermod -a -G docker ec2-user + docker pull public.ecr.aws/aws-containers/retail-store-sample-ui:0.4.0 + docker images | grep retail-store | awk '{ print $3 }' | xargs -I {} docker tag {} ${resource.aws_ecr_repository.ui.repository_url}:0.4.0 + aws ecr get-login-password | docker login --username AWS --password-stdin ${data.aws_caller_identity.current.account_id}.dkr.ecr.${data.aws_region.current.id}.amazonaws.com + docker push ${resource.aws_ecr_repository.ui.repository_url}:0.4.0 + EOF + subnet_id = element(data.aws_subnets.selected.ids, 0) + iam_instance_profile = resource.aws_iam_instance_profile.ecr_ec2.name + depends_on = [resource.aws_ecr_repository.ui] +} + +resource "aws_ecr_repository" "ui" { + name = "retail-sample-app-ui" + image_tag_mutability = "MUTABLE" + force_delete = true + +} + +data "aws_iam_policy_document" "private_registry" { + statement { + sid = "new policy" + effect = "Deny" + + principals { + type = "AWS" + identifiers = [data.aws_eks_node_group.default.node_role_arn] + } + + actions = [ + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage", + "ecr:BatchCheckLayerAvailability", + "ecr:PutImage", + "ecr:InitiateLayerUpload", + "ecr:UploadLayerPart", + "ecr:CompleteLayerUpload", + "ecr:DescribeRepositories", + "ecr:GetRepositoryPolicy", + "ecr:ListImages", + "ecr:DeleteRepository", + "ecr:BatchDeleteImage", + "ecr:SetRepositoryPolicy", + "ecr:DeleteRepositoryPolicy", + ] + } +} + + +resource "aws_ecr_repository_policy" "example" { + repository = aws_ecr_repository.ui.name + policy = data.aws_iam_policy_document.private_registry.json + depends_on = [resource.aws_instance.ui_to_ecr] +} + +data "template_file" "deployment_yaml" { + template = 
file("/home/ec2-user/environment/eks-workshop/modules/troubleshooting/pod/permissions/.workshop/terraform/deployment.yaml.tpl") + + vars = { + image = "${resource.aws_ecr_repository.ui.repository_url}:0.4.0" + } +} + + +resource "local_file" "deployment_yaml" { + filename = "/home/ec2-user/environment/eks-workshop/modules/troubleshooting/pod/permissions/deployment.yaml" + content = data.template_file.deployment_yaml.rendered +} + +/* +resource "null_resource" "ui_to_ecr" { + + #provisioner "local-exec" { + # command = "aws ecr get-login-password | docker login --username AWS --password-stdin ${data.aws_caller_identity.current.account_id}.dkr.ecr.${data.aws_region.current.id}.amazonaws.com" + #} + + provisioner "local-exec" { + command = "docker pull public.ecr.aws/aws-containers/retail-store-sample-ui:0.4.0" + } + + provisioner "local-exec" { + command = "docker images | grep retail-store | awk '{ print $3 }' | xargs -I {} docker tag {} ${resource.aws_ecr_repository.ui.repository_url}:0.4.0" + } + + #provisioner "local-exec" { + # command = "sudo docker tag ${tag} ${resource.aws_ecr_repository.ui.repository_url}:0.4.0" + #} + + provisioner "local-exec" { + command = "aws ecr get-login-password | docker login --username AWS --password-stdin ${data.aws_caller_identity.current.account_id}.dkr.ecr.${data.aws_region.current.id}.amazonaws.com && docker push ${resource.aws_ecr_repository.ui.repository_url}:0.4.0" + } + + depends_on = [resource.aws_ecr_repository.ui] +} +*/ + +resource "null_resource" "kustomize_app" { + triggers = { + always_run = timestamp() + } + + provisioner "local-exec" { + command = "kubectl apply -f ~/environment/eks-workshop/modules/troubleshooting/pod/permissions/" + when = create + } + + depends_on = [resource.local_file.deployment_yaml, resource.aws_instance.ui_to_ecr] +} diff --git a/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/outputs.tf b/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/outputs.tf new file mode 100644 index 000000000..269e7a3b0 --- /dev/null +++ b/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/outputs.tf @@ -0,0 +1,4 @@ +output "account_id" { + value = local.account_id + description = "account id env variable" +} \ No newline at end of file diff --git a/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/vars.tf b/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/vars.tf new file mode 100644 index 000000000..7e0b2f7ab --- /dev/null +++ b/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform/vars.tf @@ -0,0 +1,36 @@ +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_id" { + description = "EKS cluster name" + type = string + default = "eks-workshop" +} + +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_version" { + description = "EKS cluster version" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "cluster_security_group_id" { + description = "EKS cluster security group ID" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "addon_context" { + description = "Addon context that can be passed directly to blueprints addon modules" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "tags" { + description = "Tags to apply to AWS resources" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "resources_precreated" { + description = "Have expensive resources been created already" + type = bool 
+} \ No newline at end of file diff --git a/website/docs/troubleshooting/pod/assets/rep-not-found.webp b/website/docs/troubleshooting/pod/assets/rep-not-found.webp new file mode 100644 index 000000000..17bcc8038 Binary files /dev/null and b/website/docs/troubleshooting/pod/assets/rep-not-found.webp differ diff --git a/website/docs/troubleshooting/pod/assets/repo-found.webp b/website/docs/troubleshooting/pod/assets/repo-found.webp new file mode 100644 index 000000000..8b343fd29 Binary files /dev/null and b/website/docs/troubleshooting/pod/assets/repo-found.webp differ diff --git a/website/docs/troubleshooting/pod/image_pull_1.md b/website/docs/troubleshooting/pod/image_pull_1.md new file mode 100644 index 000000000..9a98ee4c7 --- /dev/null +++ b/website/docs/troubleshooting/pod/image_pull_1.md @@ -0,0 +1,119 @@ +--- +title: "ImagePullBackOff - Public Image" +sidebar_position: 41 +chapter: true +sidebar_custom_props: { "module": true } +--- + +In this section we will learn how to troubleshoot the pod ImagePullBackOff error for an ECR public image. + +:::tip Before you start +Prepare your environment for this section: + +```bash timeout=600 wait=300 +$ prepare-environment troubleshooting/pod/image +``` + +The preparation of the lab might take a couple of minutes and it will make the following changes to your lab environment: + +- Create a new deployment named ui-new in the default namespace +- Introduce an issue into the deployment spec so we can learn how to troubleshoot these types of issues + +::: + +You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/troubleshooting/pod/image/.workshop/terraform). + +Now let's verify that the deployment was created, so we can start troubleshooting the scenario. + +```bash +$ kubectl get deployment ui-new -n default +NAME READY UP-TO-DATE AVAILABLE AGE +ui-new 0/1 1 0 75s +``` + +If you get the same output, it means you are ready to start the troubleshooting. + +Your task in this troubleshooting section is to find out why the deployment ui-new is in a 0/1 ready state and to fix it, so that the deployment has one pod ready and running. + +## Let's start the troubleshooting + +### Step 1 + +First, we need to verify the status of our pods. To do so, we will use the `kubectl` tool. + +```bash +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +ui-new-5654dd8969-7w98k 0/1 ImagePullBackOff 0 13s +``` + +### Step 2 + +You can see that the pod status is showing as ImagePullBackOff. Let's describe the pod to see the events.
+ +```bash expectError=true timeout=20 +$ POD=`kubectl get pods -o jsonpath='{.items[*].metadata.name}'` +$ kubectl describe pod $POD | awk '/Events:/,/^$/' +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal Scheduled 48s default-scheduler Successfully assigned default/ui-new-5654dd8969-7w98k to ip-10-42-33-232.us-west-2.compute.internal + Normal BackOff 23s (x2 over 47s) kubelet Back-off pulling image "public.ecr.aws/aws-containers/retailing-store-sample-ui:0.4.0" + Warning Failed 23s (x2 over 47s) kubelet Error: ImagePullBackOff + Normal Pulling 12s (x3 over 47s) kubelet Pulling image "public.ecr.aws/aws-containers/retailing-store-sample-ui:0.4.0" + Warning Failed 12s (x3 over 47s) kubelet Failed to pull image "public.ecr.aws/aws-containers/retailing-store-sample-ui:0.4.0": rpc error: code = NotFound desc = failed to pull and unpack image "public.ecr.aws/aws-containers/retailing-store-sample-ui:0.4.0": failed to resolve reference "public.ecr.aws/aws-containers/retailing-store-sample-ui:0.4.0": public.ecr.aws/aws-containers/retailing-store-sample-ui:0.4.0: not found + Warning Failed 12s (x3 over 47s) kubelet Error: ErrImagePull +``` + +### Step 3 + +From the pod events, we can see the 'Failed to pull image' warning with error code NotFound. This tells us that the image referenced in the pod/deployment spec could not be found at the specified path. Let's check the image used by the pod. + +```bash +$ kubectl get pod $POD -o jsonpath='{.spec.containers[*].image}' +public.ecr.aws/aws-containers/retailing-store-sample-ui:0.4.0 +``` + +### Step 4 + +From the image URI, we can see that the image is referenced from the AWS public ECR registry. Let's check whether an image named retailing-store-sample-ui with tag 0.4.0 exists in the [aws-containers ECR gallery](https://gallery.ecr.aws/aws-containers). Search for "retailing-store-sample-ui" and you will notice that no such image repository shows up. You can also verify whether the image exists in public ECR by opening the image URI in a browser, in our case [image-uri](https://gallery.ecr.aws/aws-containers/retailing-store-sample-ui). Since the image does not exist, we see a "Repository not found" message as shown below. + +![RepoDoesNotExist](./assets/rep-not-found.webp) + +### Step 5 + +To resolve the issue, we have to update the deployment/pod spec with the correct image reference, which in our case is public.ecr.aws/aws-containers/retail-store-sample-ui:0.4.0. Before we update the deployment, let's verify that this image exists using the method mentioned above, i.e. opening the [image-uri](https://gallery.ecr.aws/aws-containers/retail-store-sample-ui). You should see the retail-store-sample-ui image with multiple tags available, of which we are going to use 0.4.0. + +![RepoExist](./assets/repo-found.webp) + +Update the image in the deployment with the correct reference: + +```bash +$ kubectl patch deployment ui-new --patch '{"spec": {"template": {"spec": {"containers": [{"name": "ui", "image": "public.ecr.aws/aws-containers/retail-store-sample-ui:0.4.0"}]}}}}' +deployment.apps/ui-new patched +``` + +### Step 6 + +Check that the new pod is created and running successfully. + +```bash timeout=180 hook=fix-1 hookTimeout=600 +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +ui-new-77856467b-2z2s6 1/1 Running 0 13s +``` + +That concludes the public ECR ImagePullBackOff troubleshooting section.
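+
+:::tip Optional
+You can also check whether an image tag exists in a public registry from the command line instead of the browser. This is a minimal sketch that assumes a recent Docker CLI is available in your environment; it is not a required step of this lab.
+
+```bash expectError=true
+# The misspelled repository fails to resolve
+$ docker manifest inspect public.ecr.aws/aws-containers/retailing-store-sample-ui:0.4.0
+# The correct repository returns an image manifest
+$ docker manifest inspect public.ecr.aws/aws-containers/retail-store-sample-ui:0.4.0
+```
+
+:::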
+ +## Wrapping it up + +The general troubleshooting workflow for a pod in ImagePullBackOff with a public image includes: + +- Check the pod events for a clue about the cause of the issue, such as not found, access denied, or timeout. +- If not found, ensure that the image exists at the referenced path. +- For access denied, check the permissions on the worker node role. +- For timeouts on public ECR images, ensure that the worker node networking is configured to reach the internet via an IGW/TGW/NAT. + +References: + +- [ECR-with-EKS](https://docs.aws.amazon.com/AmazonECR/latest/userguide/ECR_on_EKS.html) diff --git a/website/docs/troubleshooting/pod/image_pull_2.md b/website/docs/troubleshooting/pod/image_pull_2.md new file mode 100644 index 000000000..1c53c11aa --- /dev/null +++ b/website/docs/troubleshooting/pod/image_pull_2.md @@ -0,0 +1,252 @@ +--- +title: "ImagePullBackOff - ECR Private Image" +sidebar_position: 42 +chapter: true +sidebar_custom_props: { "module": true } +--- + +In this section we will learn how to troubleshoot the pod ImagePullBackOff error for an ECR private image. + +:::tip Before you start
Prepare your environment for this section: + +```bash timeout=600 wait=300 +$ prepare-environment troubleshooting/pod/permissions +``` + +The preparation of the lab might take a couple of minutes and it will make the following changes to your lab environment: + +- Create an ECR repository named retail-sample-app-ui. +- Create an EC2 instance and push the retail store sample app image into the ECR repository from the instance with tag 0.4.0 +- Create a new deployment named ui-private in the default namespace +- Introduce an issue into the deployment spec so we can learn how to troubleshoot these types of issues + +::: + +You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/troubleshooting/pod/permissions/.workshop/terraform). + +Now let's verify that the deployment was created, so we can start troubleshooting the scenario. + +```bash +$ kubectl get deploy ui-private -n default +NAME READY UP-TO-DATE AVAILABLE AGE +ui-private 0/1 1 0 4m25s +``` + +If you get the same output, it means you are ready to start the troubleshooting. + +Your task in this troubleshooting section is to find out why the deployment ui-private is in a 0/1 ready state and to fix it, so that the deployment has one pod ready and running. + +## Let's start the troubleshooting + +### Step 1 + +First, we need to verify the status of our pods. + +```bash +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +ui-private-7655bf59b9-jprrj 0/1 ImagePullBackOff 0 4m42s +``` + +### Step 2 + +You can see that the pod status is showing as ImagePullBackOff. Let's describe the pod to see the events.
+ +```bash expectError=true +$ POD=`kubectl get pods -o jsonpath='{.items[*].metadata.name}'` +$ kubectl describe pod $POD | awk '/Events:/,/^$/' +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal Scheduled 5m15s default-scheduler Successfully assigned default/ui-private-7655bf59b9-jprrj to ip-10-42-33-232.us-west-2.compute.internal + Normal Pulling 3m53s (x4 over 5m15s) kubelet Pulling image "1234567890.dkr.ecr.us-west-2.amazonaws.com/retail-sample-app-ui:0.4.0" + Warning Failed 3m53s (x4 over 5m14s) kubelet Failed to pull image "1234567890.dkr.ecr.us-west-2.amazonaws.com/retail-sample-app-ui:0.4.0": failed to pull and unpack image "1234567890.dkr.ecr.us-west-2.amazonaws.com/retail-sample-app-ui:0.4.0": failed to resolve reference "1234567890.dkr.ecr.us-west-2.amazonaws.com/retail-sample-app-ui:0.4.0": unexpected status from HEAD request to https:/"1234567890.dkr.ecr.us-west-2.amazonaws.com/v2/retail-sample-app-ui/manifests/0.4.0: 403 Forbidden + Warning Failed 3m53s (x4 over 5m14s) kubelet Error: ErrImagePull + Warning Failed 3m27s (x6 over 5m14s) kubelet Error: ImagePullBackOff + Normal BackOff 4s (x21 over 5m14s) kubelet Back-off pulling image "1234567890.dkr.ecr.us-west-2.amazonaws.com/retail-sample-app-ui:0.4.0" +``` + +### Step 3 + +From the pod events, we can see the 'Failed to pull image' warning with a 403 Forbidden status. This tells us that the kubelet was denied access while trying to pull the image used in the deployment. Let's get the URI of the image used in the deployment. + +```bash +$ kubectl get deploy ui-private -o jsonpath='{.spec.template.spec.containers[*].image}' +"1234567890.dkr.ecr.us-west-2.amazonaws.com/retail-sample-app-ui:0.4.0" +``` + +### Step 4 + +From the image URI, we can see that the image is referenced from the same account our EKS cluster is in. Let's check the ECR repository to see whether such an image exists. + +```bash +$ aws ecr describe-images --repository-name retail-sample-app-ui --image-ids imageTag=0.4.0 +{ + "imageDetails": [ + { + "registryId": "1234567890", + "repositoryName": "retail-sample-app-ui", + "imageDigest": "sha256:b338785abbf5a5d7e0f6ebeb8b8fc66e2ef08c05b2b48e5dfe89d03710eec2c1", + "imageTags": [ + "0.4.0" + ], + "imageSizeInBytes": 268443135, + "imagePushedAt": "2024-10-11T14:03:01.207000+00:00", + "imageManifestMediaType": "application/vnd.docker.distribution.manifest.v2+json", + "artifactMediaType": "application/vnd.docker.container.image.v1+json" + } + ] +} +``` + +You should see that the image path we have in the deployment, i.e. account_id.dkr.ecr.us-west-2.amazonaws.com/retail-sample-app-ui:0.4.0, has a valid registryId (the account number), a valid repositoryName ("retail-sample-app-ui") and a valid imageTag ("0.4.0"), which confirms that the image path is correct and not a wrong reference. + +:::info +Alternatively, you can check the same in the console. Click the button below to open the ECR console. Then click on the retail-sample-app-ui repository and the image tag 0.4.0; you should see the complete URI of the image, which should match the URI in the deployment spec, i.e. account_id.dkr.ecr.us-west-2.amazonaws.com/retail-sample-app-ui:0.4.0 + +::: + +### Step 5 + +As we have confirmed that the image URI is correct, let's check the permissions of the kubelet and confirm that the permissions required to pull images from ECR exist. + +Get the IAM role attached to the worker nodes in the cluster's managed node group and list the IAM policies attached to it.
+ +```bash +$ ROLE_NAME=`aws eks describe-nodegroup --cluster-name eks-workshop --nodegroup-name default --query 'nodegroup.nodeRole' --output text | cut -d'/' -f2` +$ aws iam list-attached-role-policies --role-name $ROLE_NAME +{ + "AttachedPolicies": [ + { + "PolicyName": "AmazonSSMManagedInstanceCore", + "PolicyArn": "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + }, + { + "PolicyName": "AmazonEC2ContainerRegistryReadOnly", + "PolicyArn": "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + }, + { + "PolicyName": "AmazonEKSWorkerNodePolicy", + "PolicyArn": "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + }, + { + "PolicyName": "AmazonSSMPatchAssociation", + "PolicyArn": "arn:aws:iam::aws:policy/AmazonSSMPatchAssociation" + } + ] +} +``` + +You should see that the AWS managed policy "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" is attached to the worker node role, and this policy should provide enough permissions to pull an image from a private ECR repository. What else could we check now? + +### Step 6 + +Permissions for an ECR repository can be managed at both the identity and the resource level. Identity-level permissions are granted in IAM, while resource-level permissions are granted on the repository itself. As we have confirmed that the identity-based permissions are fine, the issue could be with the resource-level permissions. Let's check the policy of the ECR repository. + +```bash +$ aws ecr get-repository-policy --repository-name retail-sample-app-ui --query policyText --output text | jq . +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "new policy", + "Effect": "Deny", + "Principal": { + "AWS": "arn:aws:iam::1234567890:role/EksNodeGroupRole" + }, + "Action": [ + "ecr:UploadLayerPart", + "ecr:SetRepositoryPolicy", + "ecr:PutImage", + "ecr:ListImages", + "ecr:InitiateLayerUpload", + "ecr:GetRepositoryPolicy", + "ecr:GetDownloadUrlForLayer", + "ecr:DescribeRepositories", + "ecr:DeleteRepositoryPolicy", + "ecr:DeleteRepository", + "ecr:CompleteLayerUpload", + "ecr:BatchGetImage", + "ecr:BatchDeleteImage", + "ecr:BatchCheckLayerAvailability" + ] + } + ] +} +``` + +You should see that the ECR repository policy has its Effect set to Deny with the EKS managed node role as the Principal, which prevents the kubelet from pulling images from this repository. Let's change the effect to Allow and see if the kubelet is able to pull the image. + +We will use the JSON file below to modify the ECR repository permissions. Notice that the Effect is set to Allow for the node IAM role. 
```json {6} +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "new policy", + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::1234567890:role/EksNodeGroupRole" + }, + "Action": [ + "ecr:UploadLayerPart", + "ecr:SetRepositoryPolicy", + "ecr:PutImage", + "ecr:ListImages", + "ecr:InitiateLayerUpload", + "ecr:GetRepositoryPolicy", + "ecr:GetDownloadUrlForLayer", + "ecr:DescribeRepositories", + "ecr:DeleteRepositoryPolicy", + "ecr:DeleteRepository", + "ecr:CompleteLayerUpload", + "ecr:BatchGetImage", + "ecr:BatchDeleteImage", + "ecr:BatchCheckLayerAvailability" + ] + } + ] +} +``` + +```bash +$ export ROLE_ARN=`aws eks describe-nodegroup --cluster-name eks-workshop --nodegroup-name default --query 'nodegroup.nodeRole'` +$ echo '{"Version":"2012-10-17","Statement":[{"Sid":"new policy","Effect":"Allow","Principal":{"AWS":'${ROLE_ARN}'},"Action":["ecr:BatchCheckLayerAvailability","ecr:BatchDeleteImage","ecr:BatchGetImage","ecr:CompleteLayerUpload","ecr:DeleteRepository","ecr:DeleteRepositoryPolicy","ecr:DescribeRepositories","ecr:GetDownloadUrlForLayer","ecr:GetRepositoryPolicy","ecr:InitiateLayerUpload","ecr:ListImages","ecr:PutImage","ecr:SetRepositoryPolicy","ecr:UploadLayerPart"]}]}' > ~/ecr-policy.json +$ aws ecr set-repository-policy --repository-name retail-sample-app-ui --policy-text file://~/ecr-policy.json +``` + +You can confirm that the ECR repository policy was updated successfully by using the get-repository-policy command shown above. + +### Step 7 + +Now, restart the deployment and check that the pods are running. + +```bash timeout=180 hook=fix-2 hookTimeout=600 +$ kubectl rollout restart deploy ui-private +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +ui-private-7655bf59b9-s9pvb 1/1 Running 0 65m +``` + +That concludes the private ECR ImagePullBackOff troubleshooting section. + +## Wrapping it up + +The general troubleshooting workflow for a pod in ImagePullBackOff with a private image includes: + +- Check the pod events for a clue about the cause of the issue, such as not found, access denied, or timeout. +- If not found, ensure that the image exists at the referenced path in the private ECR repository. +- For access denied, check the permissions on the worker node role and the ECR repository policy. +- For timeouts on ECR, ensure that the worker node is configured to reach the ECR endpoint. + +References: + +- [ECR_on_EKS](https://docs.aws.amazon.com/AmazonECR/latest/userguide/ECR_on_EKS.html) +- [ECR_repo_policies](https://docs.aws.amazon.com/AmazonECR/latest/userguide/repository-policies.html) +- [EKS_networking](https://docs.aws.amazon.com/eks/latest/userguide/eks-networking.html) diff --git a/website/docs/troubleshooting/pod/index.md b/website/docs/troubleshooting/pod/index.md new file mode 100644 index 000000000..9ab5da433 --- /dev/null +++ b/website/docs/troubleshooting/pod/index.md @@ -0,0 +1,13 @@ +--- +title: "Pod Issue Scenarios" +sidebar_position: 40 +chapter: true +sidebar_custom_props: { "module": true } +description: "Run deployments with different image paths/sources and persistent volume configurations, and introduce issues related to running those deployments" +--- + +::required-time + +In this section we will learn how to troubleshoot some of the most common pod issues that prevent a containerized application from running inside the EKS cluster, such as ImagePullBackOff and pods stuck in the ContainerCreating state. The quick triage commands below apply to all of these scenarios. + +- Pod issues such as readiness/liveness probe failures and scheduling issues will be added to this section of the troubleshooting module soon. 
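+
+Whichever scenario you are troubleshooting, the first diagnostic steps follow the same pattern. The commands below are a minimal sketch of that workflow; `<pod-name>` is a placeholder for the pod you are investigating.
+
+```bash
+# List pods and spot the ones that are not Ready or not Running
+$ kubectl get pods -n default
+# The Events section usually points at the root cause (image pull, volume mount, scheduling, probes)
+$ kubectl describe pod <pod-name> -n default
+# Recent events for the namespace, newest last
+$ kubectl get events -n default --sort-by=.lastTimestamp
+# Container logs, once the container has started at least once
+$ kubectl logs <pod-name> -n default
+```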
diff --git a/website/docs/troubleshooting/pod/pod_stuck.md b/website/docs/troubleshooting/pod/pod_stuck.md new file mode 100644 index 000000000..c628d72da --- /dev/null +++ b/website/docs/troubleshooting/pod/pod_stuck.md @@ -0,0 +1,217 @@ +--- +title: "PodStuck - ContainerCreating" +sidebar_position: 43 +chapter: true +sidebar_custom_props: { "module": true } +--- + +In this section we will learn how to troubleshoot one of the scenarios where a pod is stuck in the ContainerCreating state. + +:::tip Before you start +Prepare your environment for this section: + +```bash timeout=600 wait=300 +$ prepare-environment troubleshooting/pod/crash +``` + +The preparation of the lab might take a couple of minutes and it will make the following changes to your lab environment: + +- Install the aws-efs-csi-driver addon in the EKS cluster. +- Create an EFS file system and mount targets. +- Create a deployment named efs-app in the default namespace, backed by a persistent volume claim named efs-claim that uses EFS as a persistent volume. + +::: + +You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/troubleshooting/pod/crash/.workshop/terraform). + +Now let's verify that the deployment was created, so we can start troubleshooting the scenario. + +```bash +$ kubectl get deploy efs-app -n default +NAME READY UP-TO-DATE AVAILABLE AGE +efs-app 0/1 1 0 18m +``` + +If you get the same output, it means you are ready to start the troubleshooting. + +Your task in this troubleshooting section is to find out why the deployment efs-app is in a 0/1 ready state and to fix it, so that the deployment has one pod ready and running. + +## Let's start the troubleshooting + +### Step 1 + +First, we need to verify the status of our pods. + +```bash +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +efs-app-5c4df89785-m4qz4 0/1 ContainerCreating 0 19m +``` + +### Step 2 + +You can see that the pod status is showing as ContainerCreating. Let's describe the pod to see the events. + +```bash expectError=true +$ export POD=`kubectl get pods -o jsonpath='{.items[*].metadata.name}'` +$ kubectl describe pod $POD | awk '/Events:/,/^$/' +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Warning FailedMount 26m (x3 over 26m) kubelet MountVolume.SetUp failed for volume "pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0" : rpc error: code = Internal desc = Could not mount "fs-00a4069aec7924c8c:/" at "/var/lib/kubelet/pods/b2db07f9-0bae-4324-98e6-e4c978a0bef5/volumes/kubernetes.io~csi/pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0/mount": mount failed: exit status 1 +Mounting command: mount +Mounting arguments: -t efs -o accesspoint=fsap-0488d7b0bd9c26425,tls fs-00a4069aec7924c8c:/ /var/lib/kubelet/pods/b2db07f9-0bae-4324-98e6-e4c978a0bef5/volumes/kubernetes.io~csi/pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0/mount +Output: Failed to resolve "fs-00a4069aec7924c8c.efs.us-west-2.amazonaws.com". The file system mount target ip address cannot be found, please pass mount target ip address via mount options. +No mount target created for the file system fs-00a4069aec7924c8c is in available state yet, please retry in 5 minutes. +Warning: config file does not have fips_mode_enabled item in section mount.. You should be able to find a new config file in the same folder as current config file /etc/amazon/efs/efs-utils.conf. Consider update the new config file to latest config file. 
Use the default value [fips_mode_enabled = False].Warning: config file does not have fips_mode_enabled item in section mount.. You should be able to find a new config file in the same folder as current config file /etc/amazon/efs/efs-utils.conf. Consider update the new config file to latest config file. Use the default value [fips_mode_enabled = False]. + Warning FailedMount 26m (x3 over 26m) kubelet MountVolume.SetUp failed for volume "pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0" : rpc error: code = Internal desc = Could not mount "fs-00a4069aec7924c8c:/" at "/var/lib/kubelet/pods/b2db07f9-0bae-4324-98e6-e4c978a0bef5/volumes/kubernetes.io~csi/pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0/mount": mount failed: exit status 1 +Mounting command: mount +Mounting arguments: -t efs -o accesspoint=fsap-0488d7b0bd9c26425,tls fs-00a4069aec7924c8c:/ /var/lib/kubelet/pods/b2db07f9-0bae-4324-98e6-e4c978a0bef5/volumes/kubernetes.io~csi/pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0/mount +Output: Failed to resolve "fs-00a4069aec7924c8c.efs.us-west-2.amazonaws.com". Cannot connect to file system mount target ip address 10.42.41.35. +Connection to the mount target IP address 10.42.41.35 timeout. Please retry in 5 minutes if the mount target is newly created. Otherwise check your VPC and security group configuration to ensure your file system is reachable via TCP port 2049 from your instance. +Warning: config file does not have fips_mode_enabled item in section mount.. You should be able to find a new config file in the same folder as current config file /etc/amazon/efs/efs-utils.conf. Consider update the new config file to latest config file. Use the default value [fips_mode_enabled = False].Warning: config file does not have fips_mode_enabled item in section mount.. You should be able to find a new config file in the same folder as current config file /etc/amazon/efs/efs-utils.conf. Consider update the new config file to latest config file. Use the default value [fips_mode_enabled = False]. + Warning FailedMount 19m kubelet MountVolume.SetUp failed for volume "pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0" : rpc error: code = Internal desc = Could not mount "fs-00a4069aec7924c8c:/" at "/var/lib/kubelet/pods/b2db07f9-0bae-4324-98e6-e4c978a0bef5/volumes/kubernetes.io~csi/pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0/mount": mount failed: exit status 32 +Mounting command: mount +Mounting arguments: -t efs -o accesspoint=fsap-0488d7b0bd9c26425,tls fs-00a4069aec7924c8c:/ /var/lib/kubelet/pods/b2db07f9-0bae-4324-98e6-e4c978a0bef5/volumes/kubernetes.io~csi/pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0/mount +Output: Could not start amazon-efs-mount-watchdog, unrecognized init system "aws-efs-csi-dri" +Mount attempt 1/3 failed due to timeout after 15 sec, wait 0 sec before next attempt. +Mount attempt 2/3 failed due to timeout after 15 sec, wait 0 sec before next attempt. +b'mount.nfs4: mount point /var/lib/kubelet/pods/b2db07f9-0bae-4324-98e6-e4c978a0bef5/volumes/kubernetes.io~csi/pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0/mount does not exist' +Warning: config file does not have fips_mode_enabled item in section mount.. You should be able to find a new config file in the same folder as current config file /etc/amazon/efs/efs-utils.conf. Consider update the new config file to latest config file. Use the default value [fips_mode_enabled = False].Warning: config file does not have retry_nfs_mount_command item in section mount.. 
You should be able to find a new config file in the same folder as current config file /etc/amazon/efs/efs-utils.conf. Consider update the new config file to latest config file. Use the default value [retry_nfs_mount_command = True]. + Warning FailedMount 3m33s (x6 over 23m) kubelet MountVolume.SetUp failed for volume "pvc-719c8ef2-5bdb-4638-b4db-7d59b53d21f0" : rpc error: code = DeadlineExceeded desc = context deadline exceeded +``` + +### Step 3 + +From the pod events, we can see the message 'Cannot connect to file system mount target ip address x.x.x.x. Connection to the mount target IP address x.x.x.x timeout.'. This tells us that the EFS file system failed to mount as the pod's persistent volume, without which the pod cannot move to the Running state. Let's check the networking configuration of the node on which the pod is scheduled to run. + +In the commands below, we get the instance ID of the node where the pod is scheduled, fetch the security groups attached to that node, and then check the egress rules to see whether there are any restrictions on the destination. + +```bash +$ export NODE=`kubectl get pod $POD -o jsonpath='{.spec.nodeName}'` +$ export INSTANCE=`kubectl get node $NODE -o jsonpath='{.spec.providerID}' | cut -d'/' -f5` +$ export SG=`aws ec2 describe-instances --instance-ids $INSTANCE --query "Reservations[].Instances[].SecurityGroups[].GroupId" --output text` +$ aws ec2 describe-security-groups --group-ids $SG --query "SecurityGroups[].IpPermissionsEgress[]" +[ + { + "IpProtocol": "-1", + "UserIdGroupPairs": [], + "IpRanges": [ + { + "CidrIp": "0.0.0.0/0" + } + ], + "Ipv6Ranges": [], + "PrefixListIds": [] + } +] +``` + +You can see that the egress rules have no limitations. IpProtocol -1 indicates all protocols, and the CidrIp indicates the destination 0.0.0.0/0. So communication from the worker node is not restricted, and the node should be able to reach the EFS mount target. + +:::info +Alternatively, click the button below to open the EKS console. Navigate to the eks-workshop cluster, open Resources, and select the Pods workload type. Select default instead of All namespaces in the namespace drop-down. Then click on the efs-app pod, and from there click on Node and then on Instance, which takes you to the EC2 console. Click on Security and you should see the security groups attached to the worker node and their outbound rules. + +::: + +### Step 4 + +Now, let's check the EFS file system networking configuration. + +In the commands below, we are: + +- Retrieving the Availability Zone of the worker node using the instance ID. +- Retrieving the EFS file system ID from the persistent volume claim. +- Retrieving the mount target ENI for the Availability Zone corresponding to the instance ID. 
- Retrieving the security groups attached to the mount target ENI and checking the inbound rules of the security group. + +```bash +$ export AZ=`aws ec2 describe-instances --instance-ids $INSTANCE --query "Reservations[*].Instances[*].[Placement.AvailabilityZone]" --output text` +$ export EFS=`kubectl get pv $(kubectl get pvc efs-claim -o jsonpath='{.spec.volumeName}') -o jsonpath='{.spec.csi.volumeHandle}' | cut -d':' -f1` +$ export MT_ENI=`aws efs describe-mount-targets --file-system-id $EFS --query "MountTargets[?AvailabilityZoneName=='$AZ'].[NetworkInterfaceId]" --output text` +$ export MT_SG=`aws ec2 describe-network-interfaces --network-interface-ids $MT_ENI --query "NetworkInterfaces[*].[Groups[*].GroupId]" --output text` +$ aws ec2 describe-security-groups --group-ids $MT_SG --query "SecurityGroups[].IpPermissions[]" +[ + { + "IpProtocol": "tcp", + "FromPort": 80, + "ToPort": 80, + "UserIdGroupPairs": [], + "IpRanges": [ + { + "CidrIp": "10.42.0.0/16" + } + ], + "Ipv6Ranges": [], + "PrefixListIds": [] + } +] +``` + +You should see that the security group attached to the EFS mount target only allows inbound traffic on port 80 from the VPC CIDR. However, for the mount to succeed, the mount target's security group must allow traffic on port 2049, which is why the mount request from the EKS worker node is timing out. + +:::info +Alternatively, you can check the same in the console. Click the button below to open the EFS console. Then click on the EFS file system ID named eks-workshop-efs. Then click on Network to view the mount targets for all Availability Zones and the security groups attached to each mount target. + + +::: + +### Step 5 + +Let's add an inbound rule to the EFS mount target security group to allow NFS traffic on port 2049 from the VPC CIDR of the EKS cluster. + +In the commands below, we are: + +- Retrieving the VPC of the EKS cluster and the CIDR of the VPC. +- Adding an inbound rule to the mount target security group allowing traffic on port 2049 from the VPC CIDR. + +```bash +$ export VPC_ID=`aws eks describe-cluster --name eks-workshop --query "cluster.resourcesVpcConfig.vpcId" --output text` +$ export CIDR=`aws ec2 describe-vpcs --vpc-ids $VPC_ID --query "Vpcs[*].CidrBlock" --output text` +$ aws ec2 authorize-security-group-ingress --group-id $MT_SG --protocol tcp --port 2049 --cidr $CIDR +{ + "Return": true, + "SecurityGroupRules": [ + { + "SecurityGroupRuleId": "sgr-05ae66b3cfaf2b03c", + "GroupId": "sg-0d69452207db88cde", + "GroupOwnerId": "682844965773", + "IsEgress": false, + "IpProtocol": "tcp", + "FromPort": 2049, + "ToPort": 2049, + "CidrIpv4": "10.42.0.0/16" + } + ] +} +``` + +After 3-4 minutes, you should notice that the pod in the default namespace is in the Running state. + +```bash timeout=180 hook=fix-3 hookTimeout=600 +$ kubectl get pods $POD +NAME READY STATUS RESTARTS AGE +efs-app-5c4df89785-m4qz4 1/1 Running 0 102m +``` + +Since the EFS mount target security group now allows traffic on port 2049, the worker nodes were able to communicate with the mount targets and complete the EFS mount for the pod. + +This concludes the pod stuck in ContainerCreating troubleshooting section. + +## Wrapping it up + +The general troubleshooting workflow for pods stuck in the ContainerCreating state is to check the pod events, and for volume mount issues: + +- Check the volume claim used by the pod and identify the type of volume used. 
- Then check the CSI driver used for that volume and review the requirements for using that volume type in EKS pods at [EKS Storage](https://docs.aws.amazon.com/eks/latest/userguide/storage.html) +- Confirm that all the requirements mentioned in the corresponding storage type document are met. +- Also check whether a troubleshooting guide exists for that CSI driver. For example, for EFS mount issues with EKS there is a troubleshooting guide at [EFS CSI Driver](https://repost.aws/knowledge-center/eks-troubleshoot-efs-volume-mount-issues) diff --git a/website/docs/troubleshooting/pod/tests/hook-fix-1.sh b/website/docs/troubleshooting/pod/tests/hook-fix-1.sh new file mode 100644 index 000000000..1c86ed001 --- /dev/null +++ b/website/docs/troubleshooting/pod/tests/hook-fix-1.sh @@ -0,0 +1,21 @@ +set -Eeuo pipefail + +before() { + echo "noop" +} + +after() { + sleep 120 + + if kubectl get pods --selector="app=app-new" 2>&1 | grep -q "Running"; then + echo "Success: The pod is now in running state" + exit 0 + fi + + >&2 echo "pod is not in running state, when expected to be running" + exit 1 +} + + + +"$@" \ No newline at end of file diff --git a/website/docs/troubleshooting/pod/tests/hook-fix-2.sh b/website/docs/troubleshooting/pod/tests/hook-fix-2.sh new file mode 100644 index 000000000..9c3ad3ba8 --- /dev/null +++ b/website/docs/troubleshooting/pod/tests/hook-fix-2.sh @@ -0,0 +1,21 @@ +set -Eeuo pipefail + +before() { + echo "noop" +} + +after() { + sleep 180 + + if kubectl get pods --selector="app=app-private" 2>&1 | grep -q "Running"; then + echo "Success: The pod is now in running state" + exit 0 + fi + + >&2 echo "pod is not in running state, when expected to be running" + exit 1 +} + + + +"$@" \ No newline at end of file diff --git a/website/docs/troubleshooting/pod/tests/hook-fix-3.sh b/website/docs/troubleshooting/pod/tests/hook-fix-3.sh new file mode 100644 index 000000000..bbd797ead --- /dev/null +++ b/website/docs/troubleshooting/pod/tests/hook-fix-3.sh @@ -0,0 +1,21 @@ +set -Eeuo pipefail + +before() { + echo "noop" +} + +after() { + sleep 240 + + if kubectl get pods --selector="app=efs-app" 2>&1 | grep -q "Running"; then + echo "Success: The pod is now in running state" + exit 0 + fi + + >&2 echo "pod is not in running state, when expected to be running" + exit 1 +} + + + +"$@" \ No newline at end of file diff --git a/website/docs/troubleshooting/pod/tests/hook-suite.sh b/website/docs/troubleshooting/pod/tests/hook-suite.sh new file mode 100644 index 000000000..36ca4218e --- /dev/null +++ b/website/docs/troubleshooting/pod/tests/hook-suite.sh @@ -0,0 +1,11 @@ +set -e + +before() { + echo "noop" +} + +after() { + prepare-environment +} + +"$@" \ No newline at end of file diff --git a/website/test-durations.json b/website/test-durations.json index 30575364c..949335ff4 100644 --- a/website/test-durations.json +++ b/website/test-durations.json @@ -191,5 +191,8 @@ "/security/secrets-management/secrets-manager/external-secrets.md": 14963, "/security/secrets-management/secrets-manager/index.md": 281009, "/security/secrets-management/secrets-manager/mounting-secrets.md": 16049, - "/troubleshooting/alb/index.md": 16049 + "/troubleshooting/alb/index.md": 16049, + "/troubleshooting/pod/image_pull_1.md": 16049, + "/troubleshooting/pod/image_pull_2.md": 16049, + "/troubleshooting/pod/pod_stuck.md": 16049 }