Skip to content

Commit

Permalink
Limit number of auto_clone restarts
Browse files Browse the repository at this point in the history
It can happen that a job consistently fails with the same error.
We want to prevent an endless cloning loop here.

Issue: https://progress.opensuse.org/issues/152569
  • Loading branch information
perlpunk committed Jan 4, 2024
1 parent 9c8a390 commit 99045fb
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 22 deletions.
2 changes: 2 additions & 0 deletions etc/openqa/openqa.ini
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@
## Causes jobs reported as incomplete by the worker to be cloned automatically when the
## reason matches; set to 0 to disable
#auto_clone_regex = ^(cache failure: |terminated prematurely: |api failure: Failed to register .* 503|backend died: .*VNC.*(timeout|timed out|refused)|QEMU terminated: Failed to allocate KVM HPT of order 25.* Cannot allocate memory)
## The maximum amount of times a job will be restarted if the auto_clone_regex matches
#auto_clone_limit = 20

## A regex pattern that a "force_result" label description in job comments
## must match to be accepted. If undefined and by default no rules are applied
Expand Down
78 changes: 60 additions & 18 deletions lib/OpenQA/Schema/Result/Jobs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1955,6 +1955,29 @@ sub descendants ($self, $limit = -1) {
$self->{_descendants} = $sth->fetchrow_array;
}

sub incomplete_ancestors ($self, $limit = -1) {
my $ancestors = $self->{_incomplete_ancestors};
return $ancestors if defined $ancestors;
my $sth = $self->result_source->schema->storage->dbh->prepare(<<~'EOM');
with recursive orig_id as (
select ? as orig_id, 0 as level, ? as orig_result, ? as orig_reason
union all
select id as orig_id, orig_id.level + 1 as level, result as orig_result, reason as orig_reason
from jobs join orig_id on orig_id.orig_id = jobs.clone_id
where result = 'incomplete' and (? < 0 or level < ?))
select orig_id, level, orig_reason from orig_id order by level asc;
EOM
$sth->bind_param(1, $self->id, SQL_BIGINT);
$sth->bind_param(2, $self->result, SQL_VARCHAR);
$sth->bind_param(3, $self->reason, SQL_VARCHAR);
$sth->bind_param(4, $limit, SQL_BIGINT);
$sth->bind_param(5, $limit, SQL_BIGINT);
$sth->execute;
$ancestors = $sth->fetchall_arrayref;
shift @$ancestors;
$self->{_incomplete_ancestors} = $ancestors;
}

sub latest_job ($self) {
return $self unless my $clone = $self->clone;
return $clone->latest_job;
Expand Down Expand Up @@ -2065,28 +2088,12 @@ sub done ($self, %args) {
$worker->update({job_id => undef});
}

my %new_val = (state => DONE);
# update result unless already known (it is already known for CANCELLED jobs)
# update the reason if updating the result or if there is no reason yet
my $reason = $args{reason};
my $result_unknown = $self->result eq NONE;
my $reason_unknown = !$self->reason;
my %new_val = (state => DONE);
my $restart = 0;
$new_val{result} = $result if $result_unknown;
if (($result_unknown || $reason_unknown) && defined $reason) {
# restart incompletes when the reason matches the configured regex
my $auto_clone_regex = OpenQA::App->singleton->config->{global}->{auto_clone_regex};
$restart = 1 if $result eq INCOMPLETE && $auto_clone_regex && $reason =~ $auto_clone_regex;

# limit length of the reason
# note: The reason can be anything the worker picked up as useful information so better cut it at a
# reasonable, human-readable length. This also avoids growing the database too big.
$reason = substr($reason, 0, 300) . '' if defined $reason && length $reason > 300;
$new_val{reason} = $reason;
}
elsif ($reason_unknown && !defined $reason && $result eq INCOMPLETE) {
$new_val{reason} = 'no test modules scheduled/uploaded';
}
$self->_reason_and_result(\%new_val, $result, $reason, \$restart);
$self->update(\%new_val);
$self->unblock;
my %finalize_opts = (lax => 1);
Expand All @@ -2109,6 +2116,41 @@ sub done ($self, %args) {
return $new_val{result} // $self->result;
}

sub _reason_and_result ($self, $new_val, $result, $reason, $restart) {
my $result_unknown = $self->result eq NONE;
my $reason_unknown = !$self->reason;
$new_val->{result} = $result if $result_unknown;
if (($result_unknown || $reason_unknown) && defined $reason) {
# restart incompletes when the reason matches the configured regex
my $append_reason = '';
my $auto_clone_regex = OpenQA::App->singleton->config->{global}->{auto_clone_regex};
if ($result eq INCOMPLETE and $auto_clone_regex and $reason =~ $auto_clone_regex) {
my $limit = OpenQA::App->singleton->config->{global}{auto_clone_limit};
my $ancestors = $self->incomplete_ancestors($limit + 1);
# how many of those incomplete ancestors had a reason not matching auto_clone?
my $unrelated = grep { $_->[2] !~ m/$auto_clone_regex/ } @$ancestors;
if (@$ancestors < $limit || $unrelated > 0) {
$append_reason = ' [Auto-restarting because reason matches auto_clone_regex]';
$$restart = 1;
}
else {
$append_reason
= ' [Not restarting because job has been restarted already auto_clone_limit times and failed with /$auto_clone_regex/]';
}
}

# limit length of the reason
# note: The reason can be anything the worker picked up as useful information so better cut it at a
# reasonable, human-readable length. This also avoids growing the database too big.
$reason = substr($reason, 0, 300) . '' if length $reason > 300;
$reason .= $append_reason;
$new_val->{reason} = $reason;
}
elsif ($reason_unknown && !defined $reason && $result eq INCOMPLETE) {
$new_val->{reason} = 'no test modules scheduled/uploaded';
}
}

sub cancel ($self, $result, $reason = undef) {
return undef if $self->result ne NONE;
my %data = (state => CANCELLED, result => $result);
Expand Down
1 change: 1 addition & 0 deletions lib/OpenQA/Setup.pm
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ sub read_config ($app) {
search_results_limit => 50000,
auto_clone_regex =>
'^(cache failure: |terminated prematurely: |api failure: Failed to register .* 503|backend died: .*VNC.*(timeout|timed out|refused)|QEMU terminated: Failed to allocate KVM HPT of order 25.* Cannot allocate memory)',
auto_clone_limit => 20,
force_result_regex => '',
parallel_children_collapsable_results => join(' ', OK_RESULTS),
},
Expand Down
58 changes: 58 additions & 0 deletions t/09-job_clone.t
Original file line number Diff line number Diff line change
Expand Up @@ -113,4 +113,62 @@ subtest 'get job with verbose output' => sub {
combined_like { clone_job_get_job($job_id, $url_handler, \%options) } qr/"id" : $job_id/, 'Job settings logged';
};

subtest 'auto_clone limits' => sub {
$schema->txn_begin;
my $limit = 3;
local $t->app->config->{global}{auto_clone_limit} = $limit;
my @jobs;
my $backend_reason = 'backend died: VNC Connection refused';
for my $i (reverse 10 .. 15) {
my $clone_id = $i < 15 ? $i + 1 : undef;
my $new = $jobs->create(
{
id => $i,
clone_id => $clone_id,
state => DONE,
result => INCOMPLETE,
TEST => 'foo',
reason => $backend_reason,
});
push @jobs, $new;
}
my $last = $jobs[0];
$last->update({reason => undef});
my ($last_incompletes, $restart, %new);

subtest 'more than auto_clone_limit incompletes - all matching auto_clone_regex' => sub {
$last_incompletes = $last->incomplete_ancestors($limit);
is scalar @$last_incompletes, $limit, "incomplete_ancestors returns $limit incompletes";
$restart = 0;
$last->_reason_and_result(\%new, 'incomplete', $backend_reason, \$restart);
is $restart, 0, "restart false";
like $new{reason}, qr{Not restarting because.*auto_clone_limit}, '$reason is modified';
};

subtest 'more than auto_clone_limit incompletes - but less matching auto_clone_regex' => sub {
$jobs[3]->update({reason => 'unrelated'});
delete $last->{_incomplete_ancestors};
$last_incompletes = $last->incomplete_ancestors($limit);
is scalar @$last_incompletes, $limit, "incomplete_ancestors returns $limit incompletes";
$restart = 0;
%new = ();
$last->_reason_and_result(\%new, 'incomplete', $backend_reason, \$restart);
is $restart, 1, "restart true";
like $new{reason}, qr{Auto-restarting because.*auto_clone_regex}, '$reason is modified';
};

subtest 'less than auto_clone_limit incompletes' => sub {
$jobs[3]->update({result => 'failed'});
delete $last->{_incomplete_ancestors};
$last_incompletes = $last->incomplete_ancestors($limit);
is scalar @$last_incompletes, 2, 'incomplete_ancestors returns a row of 2 incompletes';
$restart = 0;
%new = ();
$last->_reason_and_result(\%new, 'incomplete', $backend_reason, \$restart);
is $restart, 1, "restart true";
like $new{reason}, qr{Auto-restarting because.*auto_clone_regex}, '$reason is modified';
};
$schema->txn_rollback;
};

done_testing();
9 changes: 5 additions & 4 deletions t/api/04-jobs.t
Original file line number Diff line number Diff line change
Expand Up @@ -1480,16 +1480,17 @@ subtest 'marking job as done' => sub {
perform_minion_jobs($t->app->minion);
$t->get_ok('/api/v1/jobs/99961')->status_is(200);
$t->json_is('/job/result' => INCOMPLETE, 'result set');
$t->json_is('/job/reason' => $cache_failure_reason, 'reason set');
$t->json_like('/job/reason' => qr{\Q$cache_failure_reason}, 'reason set');
$t->json_is('/job/state' => DONE, 'state set');
$t->json_like('/job/clone_id' => qr/\d+/, 'job cloned when reason does matches configured regex');
$t->json_like('/job/clone_id' => qr/\d+/, 'job cloned when reason does match configured regex');

};
subtest 'job is already done with reason, not overriding existing result and reason' => sub {
$t->post_ok('/api/v1/jobs/99961/set_done?result=passed&reason=foo')->status_is(200);
$t->json_is('/result' => INCOMPLETE, 'post yields result (old result as not overridden)');
$t->get_ok('/api/v1/jobs/99961')->status_is(200);
$t->json_is('/job/result' => INCOMPLETE, 'result unchanged');
$t->json_is('/job/reason' => $cache_failure_reason, 'reason unchanged');
$t->json_like('/job/reason' => qr{\Q$cache_failure_reason}, 'reason unchanged');
};
my $reason_cutted = join('', map { 'ä' } (1 .. 300));
my $reason = $reason_cutted . ' additional characters';
Expand Down Expand Up @@ -1518,7 +1519,7 @@ subtest 'marking job as done' => sub {
perform_minion_jobs($t->app->minion);
$t->get_ok('/api/v1/jobs/99961')->status_is(200);
$t->json_is('/job/result' => INCOMPLETE, 'the job status is correct');
$t->json_is('/job/reason' => $incomplete_reason, 'the incomplete reason is correct');
$t->json_like('/job/reason' => qr{\Q$incomplete_reason}, 'the incomplete reason is correct');
$t->json_like('/job/clone_id' => qr/\d+/, 'job was cloned when reason matches the configured regex');
};
};
Expand Down
1 change: 1 addition & 0 deletions t/config.t
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ subtest 'Test configuration default modes' => sub {
worker_timeout => DEFAULT_WORKER_TIMEOUT,
force_result_regex => '',
parallel_children_collapsable_results_sel => ' .status:not(.result_passed):not(.result_softfailed)',
auto_clone_limit => 20,
},
rate_limits => {
search => 5,
Expand Down

0 comments on commit 99045fb

Please sign in to comment.