One scaling policy gets stuck often #879
Comments
Hi @DTTerastar 👋 Thank you for the detailed report! Looking at the logs, it seems like one node is not being marked as having completed its drain.
If you look carefully you will notice that this node also never receives the drain-complete message.
A node not draining can't be a fatal event for the autoscaler, right? I'm on Windows, where things only work 99% of the time. I thought that after the drain timeout it would just forcibly stop the node and move on.
That's right. We have a drain deadline that was supposed to configure the Nomad drain with a timeout value (see `nomad-autoscaler/sdk/helper/scaleutils/node_drain.go`, lines 82 to 84 at commit `ccd5f7c`).
But for some reason the drain didn't seem to complete, or maybe it did complete and the bug may be in the Nomad SDK not unblocking callers. So that's why I'm trying to understand the details of what you've observed. Do you remember if the drain deadline was reached?
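For context, here is a minimal sketch of how a drain with a deadline is typically started and monitored through the Nomad Go API. This is an illustration under my assumptions, not the autoscaler's actual code; the function name and node ID are placeholders.

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/hashicorp/nomad/api"
)

// drainNode starts a drain with a deadline and then waits for the SDK's
// drain monitor to report completion.
func drainNode(nodeID string, deadline time.Duration) error {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		return err
	}

	// The deadline tells Nomad to force-stop any remaining allocations
	// once it expires, so the drain should never run indefinitely.
	spec := &api.DrainSpec{
		Deadline:         deadline,
		IgnoreSystemJobs: false,
	}
	resp, err := client.Nodes().UpdateDrain(nodeID, spec, false, nil)
	if err != nil {
		return fmt.Errorf("failed to start drain: %w", err)
	}

	// The monitor channel is expected to close once the drain finishes
	// (or the context is canceled). If it never closes, the caller blocks
	// here forever, which is what the stack traces in this issue suggest.
	for msg := range client.Nodes().MonitorDrain(context.Background(), nodeID, resp.LastIndex, false) {
		log.Println(msg.String())
	}
	return nil
}

func main() {
	// Placeholder node ID and a 15m deadline (the value reported later in this thread).
	if err := drainNode("REPLACE-WITH-NODE-ID", 15*time.Minute); err != nil {
		log.Fatal(err)
	}
}
```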
I can confirm we are experiencing the same issue described above, and yes, the drain deadline was far surpassed. Ours is set to 15m and we've had cases where we were stuck for 18-24 hours. Debug logs in the autoscaler show it's not even evaluating the "stuck" policy when this situation occurs. Nodes in that scaling group constantly receive drain messages. We have thousands of nodes, so it's very difficult to tell which one (or more) it's getting stuck on. I can provide logs if they would be of assistance.
@lgfa29 Pinging for visibility.
Hi @gibbonsjohnm 👋 Thank you for the extra information. Yes, any logs you can provide could be useful. You can send them via email to [email protected], referencing this issue ID in the subject, if they contain sensitive data.

As I mentioned in my previous message, I suspect the issue is that the Nomad SDK is not unblocking callers when the drain is complete. More specifically, looking at the stack traces provided by @DTTerastar we can see several calls blocked in the drain monitoring code.

So another thing to look for is the status of the node that is being drained. Does it still have allocations in the running state?

One last thing: I no longer work for HashiCorp, so I won't be following this issue anymore. I think we have enough information for someone else on the team to start investigating it further.
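As a rough way to check that, a small program against the Nomad Go API can list what the cluster still thinks is running on a node. This is a hypothetical helper for illustration, not autoscaler code; the node ID is taken from the command line.

```go
package main

import (
	"fmt"
	"log"
	"os"

	"github.com/hashicorp/nomad/api"
)

// Lists allocations placed on a node and prints any that are still in the
// "running" client status. Usage (illustrative): go run main.go <node-id>
func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	allocs, _, err := client.Nodes().Allocations(os.Args[1], nil)
	if err != nil {
		log.Fatal(err)
	}

	for _, alloc := range allocs {
		if alloc.ClientStatus == api.AllocClientStatusRunning {
			fmt.Printf("still running: %s (%s) job=%s\n", alloc.Name, alloc.ID, alloc.JobID)
		}
	}
}
```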
Yes, one of the nodes does in fact still think it has a running system allocation, even though the Docker container has been stopped and removed. I seem unable to remove it.
Even making the node eligible again, scheduling new workloads, and initiating a manual drain does not clean up the ghost allocation. |
Note that I manually terminated the machine with the ghost allocation; scale-in started after it disconnected from the cluster and I restarted the autoscaler.
Thanks for the extra info @gibbonsjohnm.
This makes me think of hashicorp/nomad#20116 🤔 But nevertheless, the Nomad SDK drain monitoring function should not block forever because drains have deadlines. Once the deadline is reached the function should return control to the caller, even if there are still (or the client thinks there are still) allocations running.
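Assuming the hang really is the monitor channel never closing, a defensive wrapper on the caller side would at least bound the wait. This is a sketch of that idea, not a proposed patch; the package name, `monitorWithDeadline`, and the grace period are made up for illustration.

```go
package drainguard

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/hashicorp/nomad/api"
)

// monitorWithDeadline consumes the drain monitor channel but gives up once
// the drain deadline (plus a grace period) has passed, so the caller can
// never block forever even if the channel is never closed.
func monitorWithDeadline(client *api.Client, nodeID string, index uint64, deadline time.Duration) error {
	ctx, cancel := context.WithTimeout(context.Background(), deadline+5*time.Minute)
	defer cancel()

	ch := client.Nodes().MonitorDrain(ctx, nodeID, index, false)
	for {
		select {
		case msg, ok := <-ch:
			if !ok {
				// Channel closed: the SDK considers the drain finished.
				return nil
			}
			log.Println(msg.String())
		case <-ctx.Done():
			return fmt.Errorf("gave up waiting for drain of node %s: %w", nodeID, ctx.Err())
		}
	}
}
```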
It's specifically my Windows ASG policy.
Version: v0.4.2 (I see 0.4.3 was released, will test that!)
See attached debug log:
https://gist.github.com/DTTerastar/9bf09f78ce247da5325900652ce2cc53
As well as the sigabrt:
https://gist.github.com/DTTerastar/b67c8f4289af99e5ad54bf05214d80ba
This is the config: