From 99045fbed4d3f94051382a444f5047db175b6581 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tina=20M=C3=BCller?= Date: Thu, 14 Dec 2023 13:40:34 +0100 Subject: [PATCH] Limit number of auto_clone restarts It can happen that a job consistently fails with the same error. We want to prevent an endless cloning loop here. Issue: https://progress.opensuse.org/issues/152569 --- etc/openqa/openqa.ini | 2 + lib/OpenQA/Schema/Result/Jobs.pm | 78 ++++++++++++++++++++++++-------- lib/OpenQA/Setup.pm | 1 + t/09-job_clone.t | 58 ++++++++++++++++++++++++ t/api/04-jobs.t | 9 ++-- t/config.t | 1 + 6 files changed, 127 insertions(+), 22 deletions(-) diff --git a/etc/openqa/openqa.ini b/etc/openqa/openqa.ini index 7b52a90f1b46..b030564d9684 100644 --- a/etc/openqa/openqa.ini +++ b/etc/openqa/openqa.ini @@ -78,6 +78,8 @@ ## Causes jobs reported as incomplete by the worker to be cloned automatically when the ## reason matches; set to 0 to disable #auto_clone_regex = ^(cache failure: |terminated prematurely: |api failure: Failed to register .* 503|backend died: .*VNC.*(timeout|timed out|refused)|QEMU terminated: Failed to allocate KVM HPT of order 25.* Cannot allocate memory) +## The maximum amount of times a job will be restarted if the auto_clone_regex matches +#auto_clone_limit = 20 ## A regex pattern that a "force_result" label description in job comments ## must match to be accepted. If undefined and by default no rules are applied diff --git a/lib/OpenQA/Schema/Result/Jobs.pm b/lib/OpenQA/Schema/Result/Jobs.pm index 62ae6a389ae4..42513c63f320 100644 --- a/lib/OpenQA/Schema/Result/Jobs.pm +++ b/lib/OpenQA/Schema/Result/Jobs.pm @@ -1955,6 +1955,29 @@ sub descendants ($self, $limit = -1) { $self->{_descendants} = $sth->fetchrow_array; } +sub incomplete_ancestors ($self, $limit = -1) { + my $ancestors = $self->{_incomplete_ancestors}; + return $ancestors if defined $ancestors; + my $sth = $self->result_source->schema->storage->dbh->prepare(<<~'EOM'); + with recursive orig_id as ( + select ? as orig_id, 0 as level, ? as orig_result, ? as orig_reason + union all + select id as orig_id, orig_id.level + 1 as level, result as orig_result, reason as orig_reason + from jobs join orig_id on orig_id.orig_id = jobs.clone_id + where result = 'incomplete' and (? < 0 or level < ?)) + select orig_id, level, orig_reason from orig_id order by level asc; + EOM + $sth->bind_param(1, $self->id, SQL_BIGINT); + $sth->bind_param(2, $self->result, SQL_VARCHAR); + $sth->bind_param(3, $self->reason, SQL_VARCHAR); + $sth->bind_param(4, $limit, SQL_BIGINT); + $sth->bind_param(5, $limit, SQL_BIGINT); + $sth->execute; + $ancestors = $sth->fetchall_arrayref; + shift @$ancestors; + $self->{_incomplete_ancestors} = $ancestors; +} + sub latest_job ($self) { return $self unless my $clone = $self->clone; return $clone->latest_job; @@ -2065,28 +2088,12 @@ sub done ($self, %args) { $worker->update({job_id => undef}); } + my %new_val = (state => DONE); # update result unless already known (it is already known for CANCELLED jobs) # update the reason if updating the result or if there is no reason yet my $reason = $args{reason}; - my $result_unknown = $self->result eq NONE; - my $reason_unknown = !$self->reason; - my %new_val = (state => DONE); my $restart = 0; - $new_val{result} = $result if $result_unknown; - if (($result_unknown || $reason_unknown) && defined $reason) { - # restart incompletes when the reason matches the configured regex - my $auto_clone_regex = OpenQA::App->singleton->config->{global}->{auto_clone_regex}; - $restart = 1 if $result eq INCOMPLETE && $auto_clone_regex && $reason =~ $auto_clone_regex; - - # limit length of the reason - # note: The reason can be anything the worker picked up as useful information so better cut it at a - # reasonable, human-readable length. This also avoids growing the database too big. - $reason = substr($reason, 0, 300) . '…' if defined $reason && length $reason > 300; - $new_val{reason} = $reason; - } - elsif ($reason_unknown && !defined $reason && $result eq INCOMPLETE) { - $new_val{reason} = 'no test modules scheduled/uploaded'; - } + $self->_reason_and_result(\%new_val, $result, $reason, \$restart); $self->update(\%new_val); $self->unblock; my %finalize_opts = (lax => 1); @@ -2109,6 +2116,41 @@ sub done ($self, %args) { return $new_val{result} // $self->result; } +sub _reason_and_result ($self, $new_val, $result, $reason, $restart) { + my $result_unknown = $self->result eq NONE; + my $reason_unknown = !$self->reason; + $new_val->{result} = $result if $result_unknown; + if (($result_unknown || $reason_unknown) && defined $reason) { + # restart incompletes when the reason matches the configured regex + my $append_reason = ''; + my $auto_clone_regex = OpenQA::App->singleton->config->{global}->{auto_clone_regex}; + if ($result eq INCOMPLETE and $auto_clone_regex and $reason =~ $auto_clone_regex) { + my $limit = OpenQA::App->singleton->config->{global}{auto_clone_limit}; + my $ancestors = $self->incomplete_ancestors($limit + 1); + # how many of those incomplete ancestors had a reason not matching auto_clone? + my $unrelated = grep { $_->[2] !~ m/$auto_clone_regex/ } @$ancestors; + if (@$ancestors < $limit || $unrelated > 0) { + $append_reason = ' [Auto-restarting because reason matches auto_clone_regex]'; + $$restart = 1; + } + else { + $append_reason + = ' [Not restarting because job has been restarted already auto_clone_limit times and failed with /$auto_clone_regex/]'; + } + } + + # limit length of the reason + # note: The reason can be anything the worker picked up as useful information so better cut it at a + # reasonable, human-readable length. This also avoids growing the database too big. + $reason = substr($reason, 0, 300) . '…' if length $reason > 300; + $reason .= $append_reason; + $new_val->{reason} = $reason; + } + elsif ($reason_unknown && !defined $reason && $result eq INCOMPLETE) { + $new_val->{reason} = 'no test modules scheduled/uploaded'; + } +} + sub cancel ($self, $result, $reason = undef) { return undef if $self->result ne NONE; my %data = (state => CANCELLED, result => $result); diff --git a/lib/OpenQA/Setup.pm b/lib/OpenQA/Setup.pm index 8dfb4f6071bf..a82ba395ee8b 100644 --- a/lib/OpenQA/Setup.pm +++ b/lib/OpenQA/Setup.pm @@ -49,6 +49,7 @@ sub read_config ($app) { search_results_limit => 50000, auto_clone_regex => '^(cache failure: |terminated prematurely: |api failure: Failed to register .* 503|backend died: .*VNC.*(timeout|timed out|refused)|QEMU terminated: Failed to allocate KVM HPT of order 25.* Cannot allocate memory)', + auto_clone_limit => 20, force_result_regex => '', parallel_children_collapsable_results => join(' ', OK_RESULTS), }, diff --git a/t/09-job_clone.t b/t/09-job_clone.t index 72d98ad57895..ec4f2a4cb718 100644 --- a/t/09-job_clone.t +++ b/t/09-job_clone.t @@ -113,4 +113,62 @@ subtest 'get job with verbose output' => sub { combined_like { clone_job_get_job($job_id, $url_handler, \%options) } qr/"id" : $job_id/, 'Job settings logged'; }; +subtest 'auto_clone limits' => sub { + $schema->txn_begin; + my $limit = 3; + local $t->app->config->{global}{auto_clone_limit} = $limit; + my @jobs; + my $backend_reason = 'backend died: VNC Connection refused'; + for my $i (reverse 10 .. 15) { + my $clone_id = $i < 15 ? $i + 1 : undef; + my $new = $jobs->create( + { + id => $i, + clone_id => $clone_id, + state => DONE, + result => INCOMPLETE, + TEST => 'foo', + reason => $backend_reason, + }); + push @jobs, $new; + } + my $last = $jobs[0]; + $last->update({reason => undef}); + my ($last_incompletes, $restart, %new); + + subtest 'more than auto_clone_limit incompletes - all matching auto_clone_regex' => sub { + $last_incompletes = $last->incomplete_ancestors($limit); + is scalar @$last_incompletes, $limit, "incomplete_ancestors returns $limit incompletes"; + $restart = 0; + $last->_reason_and_result(\%new, 'incomplete', $backend_reason, \$restart); + is $restart, 0, "restart false"; + like $new{reason}, qr{Not restarting because.*auto_clone_limit}, '$reason is modified'; + }; + + subtest 'more than auto_clone_limit incompletes - but less matching auto_clone_regex' => sub { + $jobs[3]->update({reason => 'unrelated'}); + delete $last->{_incomplete_ancestors}; + $last_incompletes = $last->incomplete_ancestors($limit); + is scalar @$last_incompletes, $limit, "incomplete_ancestors returns $limit incompletes"; + $restart = 0; + %new = (); + $last->_reason_and_result(\%new, 'incomplete', $backend_reason, \$restart); + is $restart, 1, "restart true"; + like $new{reason}, qr{Auto-restarting because.*auto_clone_regex}, '$reason is modified'; + }; + + subtest 'less than auto_clone_limit incompletes' => sub { + $jobs[3]->update({result => 'failed'}); + delete $last->{_incomplete_ancestors}; + $last_incompletes = $last->incomplete_ancestors($limit); + is scalar @$last_incompletes, 2, 'incomplete_ancestors returns a row of 2 incompletes'; + $restart = 0; + %new = (); + $last->_reason_and_result(\%new, 'incomplete', $backend_reason, \$restart); + is $restart, 1, "restart true"; + like $new{reason}, qr{Auto-restarting because.*auto_clone_regex}, '$reason is modified'; + }; + $schema->txn_rollback; +}; + done_testing(); diff --git a/t/api/04-jobs.t b/t/api/04-jobs.t index ee42e341908d..bd16b7615092 100644 --- a/t/api/04-jobs.t +++ b/t/api/04-jobs.t @@ -1480,16 +1480,17 @@ subtest 'marking job as done' => sub { perform_minion_jobs($t->app->minion); $t->get_ok('/api/v1/jobs/99961')->status_is(200); $t->json_is('/job/result' => INCOMPLETE, 'result set'); - $t->json_is('/job/reason' => $cache_failure_reason, 'reason set'); + $t->json_like('/job/reason' => qr{\Q$cache_failure_reason}, 'reason set'); $t->json_is('/job/state' => DONE, 'state set'); - $t->json_like('/job/clone_id' => qr/\d+/, 'job cloned when reason does matches configured regex'); + $t->json_like('/job/clone_id' => qr/\d+/, 'job cloned when reason does match configured regex'); + }; subtest 'job is already done with reason, not overriding existing result and reason' => sub { $t->post_ok('/api/v1/jobs/99961/set_done?result=passed&reason=foo')->status_is(200); $t->json_is('/result' => INCOMPLETE, 'post yields result (old result as not overridden)'); $t->get_ok('/api/v1/jobs/99961')->status_is(200); $t->json_is('/job/result' => INCOMPLETE, 'result unchanged'); - $t->json_is('/job/reason' => $cache_failure_reason, 'reason unchanged'); + $t->json_like('/job/reason' => qr{\Q$cache_failure_reason}, 'reason unchanged'); }; my $reason_cutted = join('', map { 'ä' } (1 .. 300)); my $reason = $reason_cutted . ' additional characters'; @@ -1518,7 +1519,7 @@ subtest 'marking job as done' => sub { perform_minion_jobs($t->app->minion); $t->get_ok('/api/v1/jobs/99961')->status_is(200); $t->json_is('/job/result' => INCOMPLETE, 'the job status is correct'); - $t->json_is('/job/reason' => $incomplete_reason, 'the incomplete reason is correct'); + $t->json_like('/job/reason' => qr{\Q$incomplete_reason}, 'the incomplete reason is correct'); $t->json_like('/job/clone_id' => qr/\d+/, 'job was cloned when reason matches the configured regex'); }; }; diff --git a/t/config.t b/t/config.t index a81c2cdaa5c2..0cbbfd0ad9bc 100644 --- a/t/config.t +++ b/t/config.t @@ -51,6 +51,7 @@ subtest 'Test configuration default modes' => sub { worker_timeout => DEFAULT_WORKER_TIMEOUT, force_result_regex => '', parallel_children_collapsable_results_sel => ' .status:not(.result_passed):not(.result_softfailed)', + auto_clone_limit => 20, }, rate_limits => { search => 5,