Improve worker load threshold detection

With 03b301d we have a worker load limit which helps but there can still be cases like happened on a PowerNV machine with the load going way above the configured load limit. The reason was that when the worker was idle within a short time frame of roughly one minute multiple jobs were assigned to individual worker instances on the machine. As we only looked at load15 which was still low at that time all jobs were picked up by the machine leading to overload only about one minute later. Further reducing the load limit would not realistically prevent this situation but only delay until load15 decays sufficiently enough so that new jobs will be picked up again. Instead this commit changes the evaluation to look at all three system load values, load1, load5 and load15, but considering the load evolution over time to react quickly enough if the load rises but still accept a falling edge to allow to pick up jobs again when the load decays. Related progress issue: https://progress.opensuse.org/issues/168244
os-autoinst · Oct 21, 2024 · f8b5424 · f8b5424
1 parent 0030e84
commit f8b5424
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 14 deletions.
diff --git a/lib/OpenQA/Worker.pm b/lib/OpenQA/Worker.pm
@@ -34,6 +34,7 @@ BEGIN {
 use Fcntl;
 use File::Path qw(make_path remove_tree);
 use File::Spec::Functions 'catdir';
+use List::Util qw(all max min);
 use Mojo::IOLoop;
 use Mojo::File 'path';
 use POSIX;
@@ -750,18 +751,24 @@ sub _handle_job_status_changed ($self, $job, $event_data) {
     }
 }
 
-sub _load_avg ($field = 2) {
-    my $value = eval { (split(' ', path($ENV{OPENQA_LOAD_AVG_FILE} // '/proc/loadavg')->slurp))[$field] };
+sub _load_avg ($path = $ENV{OPENQA_LOAD_AVG_FILE} // '/proc/loadavg') {
+    my @load = eval { split(' ', path($path)->slurp) };
     log_warning "Unable to determine average load: $@" if $@;
-    return looks_like_number($value) ? $value : undef;
-}
-
-sub _check_system_utilization ($self) {
-    my $settings = $self->settings->global_settings;
-    return undef unless my $threshold = $settings->{CRITICAL_LOAD_AVG_THRESHOLD};
-    my $load_avg = _load_avg;
-    return "The average load $load_avg is exceeding the configured threshold of $threshold."
-      if defined $load_avg && $load_avg >= $threshold;
+    splice @load, 3;    # remove non-load numbers
+    log_error "Unable to parse system load from file '$path'" and return [] unless all { looks_like_number $_ } @load;
+    return \@load;
+}
+
+sub _check_system_utilization (
+    $self,
+    $threshold = $self->settings->global_settings->{CRITICAL_LOAD_AVG_THRESHOLD},
+    $load = _load_avg())
+{
+    return undef unless $threshold && @$load >= 3;
+    # look at the load evolution over time to react quick enough if the load
+    # rises but accept a falling edge
+    return "The average load (@$load) is exceeding the configured threshold of $threshold."
+      if max(@$load) > $threshold && ($load->[0] > $load->[1] || $load->[0] > $load->[2] || min(@$load) > $threshold);
     return undef;
 }
 

diff --git a/t/24-worker-overall.t b/t/24-worker-overall.t
@@ -30,8 +30,6 @@ $ENV{OPENQA_CONFIG} = "$FindBin::Bin/data/24-worker-overall";
 #       file specified via OPENQA_LOGFILE instead of stdout/stderr.
 $ENV{OPENQA_LOGFILE} = undef;
 
-my $load_avg_file = simulate_load('0.93 0.95 10.25 2/2207 1212', 'worker-overall-load-avg');
-
 # define fake isotovideo
 {
     package Test::FakeProcess;    # uncoverable statement count:1
@@ -90,6 +88,7 @@ my $dbus_mock = Test::MockModule->new('Net::DBus', no_auto => 1);
 $dbus_mock->define(system => sub (@) { Test::FakeDBus->new });
 my $cache_service_client_mock = Test::MockModule->new('OpenQA::CacheService::Client');
 $cache_service_client_mock->redefine(info => sub { Test::FakeCacheServiceClientInfo->new });
+my $load_avg_file = simulate_load('10.93 10.91 10.25 2/2207 1212', 'worker-overall-load-avg');
 
 like(
     exception {
@@ -120,6 +119,21 @@ combined_like { $worker->log_setup_info }
 qr/.*http:\/\/localhost:9527,https:\/\/remotehost.*qemu_i386,qemu_x86_64.*Errors occurred.*foo.*bar.*/s,
   'setup info with parse errors';
 
+subtest 'worker load' => sub {
+    my $load = OpenQA::Worker::_load_avg();
+    is scalar @$load, 3, 'expected number of load values';
+    is $load->[0], 10.93, 'expected load';
+    is_deeply $load, [10.93, 10.91, 10.25], 'expected computed system load, rising flank';
+    is_deeply OpenQA::Worker::_load_avg(path($ENV{OPENQA_CONFIG}, 'invalid_loadavg')), [], 'error on invalid load';
+    ok !$worker->_check_system_utilization, 'default threshold not exceeded';
+    ok $worker->_check_system_utilization(10), 'stricter threshold exceeded by load';
+    ok !$worker->_check_system_utilization(10, [3, 9, 11]), 'load ok on falling flank';
+    ok $worker->_check_system_utilization(10, [12, 9, 3]), 'load exceeded on rising flank';
+    ok $worker->_check_system_utilization(10, [12, 3, 9]), 'load exceeded on rising flank and old load';
+    ok $worker->_check_system_utilization(10, [11, 13, 12]), 'load still exceeded on short load dip';
+    ok $worker->_check_system_utilization(10, [11, 12, 13]), 'load still exceeded on falling flank but high';
+};
+
 subtest 'delay and exec' => sub {
     my $worker_mock = Test::MockModule->new('OpenQA::Worker');
     $worker_mock->redefine(init => 42);
@@ -854,7 +868,7 @@ qr/Job 42 from some-host finished - reason: done.*A QEMU instance using.*Skippin
             $worker_mock->unmock('is_qemu_running');
             $worker->settings->global_settings->{CRITICAL_LOAD_AVG_THRESHOLD} = '10';
             is $worker->status->{status}, 'broken', 'worker considered broken when average load exceeds threshold';
-            like $worker->current_error, qr/load 10\.25.*exceeding.*10/, 'error shows current load and threshold';
+            like $worker->current_error, qr/load \(.*10\.25.*exceeding.*10/, 'error shows current load and threshold';
 
             # assume the error is gone
             $load_avg_file->remove;