Skip to content

Commit

Permalink
Make it less likely for jobs incompleting with Cache … queue … full
Browse files Browse the repository at this point in the history
* Introduce a hard-limit for inactive Minion jobs
* Make the hard-limit slightly higher than the normal limit
* Enforce only the hard-limit on openQA job setup; so when the normal
  Minion queue limit was reached after a job had already been picked-up it
  can still continue to create Minion jobs instead of ending up as
  incomplete
* See https://progress.opensuse.org/issues/128276
  • Loading branch information
Martchus committed Mar 5, 2024
1 parent 060eef3 commit 34a0b55
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 12 deletions.
7 changes: 7 additions & 0 deletions lib/OpenQA/CacheService/Response.pm
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,15 @@ package OpenQA::CacheService::Response;
use Mojo::Base -base, -signatures;

has [qw(data error)];

# define soft-limit for inactive Minion jobs (enforced when determining idle worker slots availability)
has max_inactive_jobs => $ENV{OPENQA_CACHE_MAX_INACTIVE_JOBS} // 5;

# define hard-limit for inactive Minion jobs (enforced on setup of already started openQA job)
has max_inactive_jobs_hard_limit => sub ($self) {
$ENV{OPENQA_CACHE_MAX_INACTIVE_JOBS_HARD_LIMIT} // ($self->max_inactive_jobs + 10);
};

sub has_error ($self) { !!$self->error }

1;
11 changes: 6 additions & 5 deletions lib/OpenQA/CacheService/Response/Info.pm
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,18 @@ sub available_workers ($self) {
return $data->{active_workers} != 0 || $data->{inactive_workers} != 0;
}

sub inactive_jobs_exceeded ($self) {
return undef if $self->max_inactive_jobs < 0;
sub inactive_jobs_exceeded ($self, $options = {}) {
my $limit = $options->{hard_limit} ? $self->max_inactive_jobs_hard_limit : $self->max_inactive_jobs;
return undef if $limit < 0;
return undef unless my $data = $self->data;
return undef unless $data->{inactive_jobs};
return $data->{inactive_jobs} > $self->max_inactive_jobs;
return $data->{inactive_jobs} > $limit;
}

sub availability_error ($self) {
sub availability_error ($self, $options = {}) {
return $self->error if $self->has_error;
return 'No workers active in the cache service' unless $self->available_workers;
return 'Cache service queue already full (' . $self->max_inactive_jobs . ')' if $self->inactive_jobs_exceeded;
return 'Cache service queue already full (' . $self->max_inactive_jobs . ')' if $self->inactive_jobs_exceeded($options);
return undef;
}

Expand Down
4 changes: 2 additions & 2 deletions lib/OpenQA/Worker/Engines/isotovideo.pm
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ sub cache_assets ($cache_client, $job, $vars, $assets_to_cache, $assetkeys, $web
my $asset_uri = trim($asset_value);
return cache_assets($cache_client, $job, $vars, $assets_to_cache, $assetkeys, $webui_host, $pooldir, $callback)
if ($this_asset eq 'UEFI_PFLASH_VARS' && !$vars->{UEFI});
# check cache availability
my $error = $cache_client->info->availability_error;
# check cache availability but only fail if the hard limit is exceeded
my $error = $cache_client->info->availability_error({hard_limit => 1});
return $callback->({error => $error}) if $error;
log_debug("Found $this_asset, caching $vars->{$this_asset}", channels => 'autoinst');

Expand Down
29 changes: 24 additions & 5 deletions t/24-worker-engine.t
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ use lib "$FindBin::Bin/lib", "$FindBin::Bin/../external/os-autoinst-common/lib";
use OpenQA::Test::TimeLimit '10';
use Mojo::Base -signatures;

BEGIN { $ENV{OPENQA_CACHE_SERVICE_POLL_DELAY} = 0 }
BEGIN {
$ENV{OPENQA_CACHE_SERVICE_POLL_DELAY} = 0;
delete $ENV{OPENQA_CACHE_MAX_INACTIVE_JOBS};
delete $ENV{OPENQA_CACHE_MAX_INACTIVE_JOBS_HARD_LIMIT};
}

use File::Spec::Functions qw(abs2rel catdir);
use OpenQA::Constants 'WORKER_EC_ASSET_FAILURE';
Expand Down Expand Up @@ -188,16 +192,31 @@ subtest 'asset caching' => sub {
is $error, $test_dir, 'Cache directory updated';
};

sub _mock_cache_service_client ($status_data, $error = undef) {
sub _mock_cache_service_client ($status_data, $info_data = undef, $error = undef) {
my $cache_client_mock = Test::MockModule->new('OpenQA::CacheService::Client');
$info_data //= {active_workers => 1};
$cache_client_mock->redefine(enqueue => 'some enqueue error');
$cache_client_mock->redefine(
info => OpenQA::CacheService::Response::Info->new(data => {active_workers => 1}, error => undef));
$cache_client_mock->redefine(info => OpenQA::CacheService::Response::Info->new(data => $info_data, error => undef));
$cache_client_mock->redefine(
status => OpenQA::CacheService::Response::Status->new(data => $status_data, error => $error));
return $cache_client_mock;
}

subtest 'handling availability error' => sub {
my %fake_status = (active_workers => 1, inactive_jobs => 16);
my $cache_client_mock = _mock_cache_service_client {}, \%fake_status;
my $cache_client = OpenQA::CacheService::Client->new;
my $cb_res;
my $cb = sub ($res) { $cb_res = $res };
my @args = ($cache_client, Test::FakeJob->new, {a => 1}, [('a') x 2], {}, 'webuihost', undef, $cb);
OpenQA::Worker::Engines::isotovideo::cache_assets @args;
like $cb_res->{error}, qr/Cache service queue already full/, 'error when hard-limit exceeded';

$fake_status{inactive_jobs} = 15;
OpenQA::Worker::Engines::isotovideo::cache_assets @args;
like $cb_res->{error}, qr/Failed to send asset request/, 'attempt to cache when below hard-limit';
};

subtest 'problems and special cases when caching assets' => sub {
my %fake_status = (status => 'processed', output => 'Download of "FOO" failed: 404 Not Found');
my $cache_client_mock = _mock_cache_service_client \%fake_status;
Expand All @@ -223,7 +242,7 @@ subtest 'problems and special cases when caching assets' => sub {
is $error->{error}, 'Failed to download FOO to some/path', 'asset not found';
is $error->{category}, WORKER_EC_ASSET_FAILURE, 'category set so problem is treated as asset failure';

$cache_client_mock = _mock_cache_service_client {}, 'some severe error';
$cache_client_mock = _mock_cache_service_client {}, undef, 'some severe error';
$cache_client_mock->redefine(asset_request => Test::FakeRequest->new);
$cache_client_mock->redefine(enqueue => 0);
@assets = ('ISO_1');
Expand Down

0 comments on commit 34a0b55

Please sign in to comment.