From eaf3f187a03545ff4c2614282e96986fd45988bb Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 22 May 2024 16:21:44 +0200 Subject: [PATCH] Support for error report --- PILOTVERSION | 2 +- pilot/control/payload.py | 45 ++++++++++++++++++++++++++++------------ pilot/util/constants.py | 2 +- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index d366ecd9..3695e848 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.6.6 \ No newline at end of file +3.7.6.7 \ No newline at end of file diff --git a/pilot/control/payload.py b/pilot/control/payload.py index f55c9f6d..af8839b9 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -33,6 +33,11 @@ from re import findall, split from typing import Any, TextIO +from pilot.common.errorcodes import ErrorCodes +from pilot.common.exception import ( + ExcThread, + PilotException +) from pilot.control.payloads import ( generic, eventservice, @@ -41,22 +46,20 @@ from pilot.control.job import send_state from pilot.util.auxiliary import set_pilot_state from pilot.util.container import execute -from pilot.util.processes import get_cpu_consumption_time from pilot.util.config import config from pilot.util.filehandling import ( - read_file, - remove_core_dumps, - get_guid, extract_lines_from_file, - find_file + find_file, + get_guid, + read_file, + read_json, + remove_core_dumps ) -from pilot.util.processes import threads_aborted -from pilot.util.queuehandling import put_in_queue -from pilot.common.errorcodes import ErrorCodes -from pilot.common.exception import ( - ExcThread, - PilotException +from pilot.util.processes import ( + get_cpu_consumption_time, + threads_aborted ) +from pilot.util.queuehandling import put_in_queue from pilot.util.realtimelogger import get_realtime_logger logger = logging.getLogger(__name__) @@ -616,6 +619,23 @@ def perform_initial_payload_error_analysis(job: Any, exit_code: int): if exit_code != 0: logger.warning(f'main payload execution returned non-zero exit code: {exit_code}') + # check if the transform has produced an error report + path = os.path.join(job.workdir, config.Payload.error_report) + if os.path.exists(path): + error_report = read_json(path) + error_code = error_report.get('error_code') + error_diag = error_report.get('error_diag') + if error_code: + logger.warning(f'{config.Payload.error_report} contained error code: {error_code}') + logger.warning(f'{config.Payload.error_report} contained error diag: {error_diag}') + job.exeerrorcode = error_code + job.exeerrordiag = error_report.get('error_diag') + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXECUTIONFAILURE, msg=error_diag) + return + logger.info(f'{config.Payload.error_report} exists but did not contain any non-zero error code') + else: + logger.debug(f'{config.Payload.error_report} does not exist') + # look for singularity/apptainer errors (the exit code can be zero in this case) path = os.path.join(job.workdir, config.Payload.payloadstderr) if os.path.exists(path): @@ -664,9 +684,8 @@ def perform_initial_payload_error_analysis(job: Any, exit_code: int): else: logger.info('main payload execution returned zero exit code') - # check if core dumps exist, if so remove them and return True + # check if core dumps exist, if so remove them if not job.debug: # do not shorten these if-statements - # only return True if found core dump belongs to payload if remove_core_dumps(job.workdir, pid=job.pid): # COREDUMP error will only be set if the core dump belongs to the payload (ie 'core.') logger.warning('setting COREDUMP error') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index f2728005..e00b5ea3 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '6' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '6' # build number should be reset to '1' for every new development cycle +BUILD = '7' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1