Skip to content

Commit

Permalink
Improve worker load threshold detection
Browse files Browse the repository at this point in the history
With 03b301d we have a worker load limit which helps, but there can still
be cases, like one that happened on a PowerNV machine, where the load goes
way above the configured load limit. The reason was that while the worker
host was idle, multiple jobs were assigned to individual worker instances
on the machine within a short time frame of roughly one minute. As we only
looked at load15, which was still low at that time, all jobs were picked
up by the machine, leading to overload only about one minute later.
Further reducing the load limit would not realistically prevent this
situation but only delay the pickup of new jobs until load15 has decayed
sufficiently. Instead, this commit changes the evaluation to look at all
three system load values, load1, load5 and load15, while considering the
load evolution over time: react quickly enough if the load rises, but
still accept a falling edge so that jobs can be picked up again when the
load decays.

Related progress issue: https://progress.opensuse.org/issues/168244
  • Loading branch information
okurz committed Oct 21, 2024
1 parent 0030e84 commit f8b5424
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 14 deletions.
29 changes: 18 additions & 11 deletions lib/OpenQA/Worker.pm
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ BEGIN {
use Fcntl;
use File::Path qw(make_path remove_tree);
use File::Spec::Functions 'catdir';
use List::Util qw(all max min);
use Mojo::IOLoop;
use Mojo::File 'path';
use POSIX;
Expand Down Expand Up @@ -750,18 +751,24 @@ sub _handle_job_status_changed ($self, $job, $event_data) {
}
}

sub _load_avg ($field = 2) {
my $value = eval { (split(' ', path($ENV{OPENQA_LOAD_AVG_FILE} // '/proc/loadavg')->slurp))[$field] };
sub _load_avg ($path = $ENV{OPENQA_LOAD_AVG_FILE} // '/proc/loadavg') {
my @load = eval { split(' ', path($path)->slurp) };
log_warning "Unable to determine average load: $@" if $@;
return looks_like_number($value) ? $value : undef;
}

sub _check_system_utilization ($self) {
my $settings = $self->settings->global_settings;
return undef unless my $threshold = $settings->{CRITICAL_LOAD_AVG_THRESHOLD};
my $load_avg = _load_avg;
return "The average load $load_avg is exceeding the configured threshold of $threshold."
if defined $load_avg && $load_avg >= $threshold;
splice @load, 3; # remove non-load numbers
log_error "Unable to parse system load from file '$path'" and return [] unless all { looks_like_number $_ } @load;
return \@load;
}

# Checks whether the system load is critically high.
#
# Parameters:
#   $threshold - load limit; defaults to the global setting CRITICAL_LOAD_AVG_THRESHOLD
#   $load      - array ref of load values (load1, load5, load15); defaults to _load_avg()
#
# Returns an error message if the load is considered too high, undef otherwise
# (also undef if no threshold is set or fewer than three load values are given).
sub _check_system_utilization (
    $self,
    $threshold = $self->settings->global_settings->{CRITICAL_LOAD_AVG_THRESHOLD},
    $load = _load_avg())
{
    # nothing to check without a threshold or without a full set of load values
    return undef unless $threshold;
    return undef if @$load < 3;

    # no load value above the threshold at all: the system is fine
    return undef unless max(@$load) > $threshold;

    # the most recent load being above one of the longer-term averages means
    # the load is rising; a falling load is only tolerated while at least one
    # of the values is still within the threshold
    my ($load1, $load5, $load15) = @$load;
    my $is_rising = $load1 > $load5 || $load1 > $load15;
    return undef unless $is_rising || min(@$load) > $threshold;

    return "The average load (@$load) is exceeding the configured threshold of $threshold.";
}

Expand Down
20 changes: 17 additions & 3 deletions t/24-worker-overall.t
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ $ENV{OPENQA_CONFIG} = "$FindBin::Bin/data/24-worker-overall";
# file specified via OPENQA_LOGFILE instead of stdout/stderr.
$ENV{OPENQA_LOGFILE} = undef;

my $load_avg_file = simulate_load('0.93 0.95 10.25 2/2207 1212', 'worker-overall-load-avg');

# define fake isotovideo
{
package Test::FakeProcess; # uncoverable statement count:1
Expand Down Expand Up @@ -90,6 +88,7 @@ my $dbus_mock = Test::MockModule->new('Net::DBus', no_auto => 1);
$dbus_mock->define(system => sub (@) { Test::FakeDBus->new });
my $cache_service_client_mock = Test::MockModule->new('OpenQA::CacheService::Client');
$cache_service_client_mock->redefine(info => sub { Test::FakeCacheServiceClientInfo->new });
my $load_avg_file = simulate_load('10.93 10.91 10.25 2/2207 1212', 'worker-overall-load-avg');

like(
exception {
Expand Down Expand Up @@ -120,6 +119,21 @@ combined_like { $worker->log_setup_info }
qr/.*http:\/\/localhost:9527,https:\/\/remotehost.*qemu_i386,qemu_x86_64.*Errors occurred.*foo.*bar.*/s,
'setup info with parse errors';

# Covers parsing of the system load via OpenQA::Worker::_load_avg and the
# threshold evaluation in _check_system_utilization.
# NOTE(review): the values expected without explicit arguments match the
# simulate_load('10.93 10.91 10.25 ...') call earlier in this test file;
# presumably that helper points the worker at the simulated file - confirm.
subtest 'worker load' => sub {
# parsing: only the three leading load values are expected to be returned
my $load = OpenQA::Worker::_load_avg();
is scalar @$load, 3, 'expected number of load values';
is $load->[0], 10.93, 'expected load';
is_deeply $load, [10.93, 10.91, 10.25], 'expected computed system load, rising flank';
# an unparsable load file is expected to yield an empty array ref
is_deeply OpenQA::Worker::_load_avg(path($ENV{OPENQA_CONFIG}, 'invalid_loadavg')), [], 'error on invalid load';
# evaluation with the threshold from the worker settings and the parsed load
# NOTE(review): the configured default threshold is not visible here
ok !$worker->_check_system_utilization, 'default threshold not exceeded';
ok $worker->_check_system_utilization(10), 'stricter threshold exceeded by load';
# evaluation with explicit load triples, passed as [load1, load5, load15]
ok !$worker->_check_system_utilization(10, [3, 9, 11]), 'load ok on falling flank';
ok $worker->_check_system_utilization(10, [12, 9, 3]), 'load exceeded on rising flank';
ok $worker->_check_system_utilization(10, [12, 3, 9]), 'load exceeded on rising flank and old load';
# all three values above the threshold count as exceeded regardless of the trend
ok $worker->_check_system_utilization(10, [11, 13, 12]), 'load still exceeded on short load dip';
ok $worker->_check_system_utilization(10, [11, 12, 13]), 'load still exceeded on falling flank but high';
};

subtest 'delay and exec' => sub {
my $worker_mock = Test::MockModule->new('OpenQA::Worker');
$worker_mock->redefine(init => 42);
Expand Down Expand Up @@ -854,7 +868,7 @@ qr/Job 42 from some-host finished - reason: done.*A QEMU instance using.*Skippin
$worker_mock->unmock('is_qemu_running');
$worker->settings->global_settings->{CRITICAL_LOAD_AVG_THRESHOLD} = '10';
is $worker->status->{status}, 'broken', 'worker considered broken when average load exceeds threshold';
like $worker->current_error, qr/load 10\.25.*exceeding.*10/, 'error shows current load and threshold';
like $worker->current_error, qr/load \(.*10\.25.*exceeding.*10/, 'error shows current load and threshold';

# assume the error is gone
$load_avg_file->remove;
Expand Down

0 comments on commit f8b5424

Please sign in to comment.