Skip to content

Commit

Permalink
Support for error report
Browse files Browse the repository at this point in the history
  • Loading branch information
Paul Nilsson committed May 22, 2024
1 parent 5fd9db3 commit eaf3f18
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 15 deletions.
2 changes: 1 addition & 1 deletion PILOTVERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.7.6.6
3.7.6.7
45 changes: 32 additions & 13 deletions pilot/control/payload.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
from re import findall, split
from typing import Any, TextIO

from pilot.common.errorcodes import ErrorCodes
from pilot.common.exception import (
ExcThread,
PilotException
)
from pilot.control.payloads import (
generic,
eventservice,
Expand All @@ -41,22 +46,20 @@
from pilot.control.job import send_state
from pilot.util.auxiliary import set_pilot_state
from pilot.util.container import execute
from pilot.util.processes import get_cpu_consumption_time
from pilot.util.config import config
from pilot.util.filehandling import (
read_file,
remove_core_dumps,
get_guid,
extract_lines_from_file,
find_file
find_file,
get_guid,
read_file,
read_json,
remove_core_dumps
)
from pilot.util.processes import threads_aborted
from pilot.util.queuehandling import put_in_queue
from pilot.common.errorcodes import ErrorCodes
from pilot.common.exception import (
ExcThread,
PilotException
from pilot.util.processes import (
get_cpu_consumption_time,
threads_aborted
)
from pilot.util.queuehandling import put_in_queue
from pilot.util.realtimelogger import get_realtime_logger

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -616,6 +619,23 @@ def perform_initial_payload_error_analysis(job: Any, exit_code: int):
if exit_code != 0:
logger.warning(f'main payload execution returned non-zero exit code: {exit_code}')

# check if the transform has produced an error report
path = os.path.join(job.workdir, config.Payload.error_report)
if os.path.exists(path):
error_report = read_json(path)
error_code = error_report.get('error_code')
error_diag = error_report.get('error_diag')
if error_code:
logger.warning(f'{config.Payload.error_report} contained error code: {error_code}')
logger.warning(f'{config.Payload.error_report} contained error diag: {error_diag}')
job.exeerrorcode = error_code
job.exeerrordiag = error_report.get('error_diag')
job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXECUTIONFAILURE, msg=error_diag)
return
logger.info(f'{config.Payload.error_report} exists but did not contain any non-zero error code')
else:
logger.debug(f'{config.Payload.error_report} does not exist')

# look for singularity/apptainer errors (the exit code can be zero in this case)
path = os.path.join(job.workdir, config.Payload.payloadstderr)
if os.path.exists(path):
Expand Down Expand Up @@ -664,9 +684,8 @@ def perform_initial_payload_error_analysis(job: Any, exit_code: int):
else:
logger.info('main payload execution returned zero exit code')

# check if core dumps exist, if so remove them and return True
# check if core dumps exist, if so remove them
if not job.debug: # do not shorten these if-statements
# only return True if found core dump belongs to payload
if remove_core_dumps(job.workdir, pid=job.pid):
# COREDUMP error will only be set if the core dump belongs to the payload (ie 'core.<payload pid>')
logger.warning('setting COREDUMP error')
Expand Down
2 changes: 1 addition & 1 deletion pilot/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
RELEASE = '3' # release number should be fixed at 3 for Pilot 3
VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates
REVISION = '6' # revision number should be reset to '0' for every new version release, increased for small updates
BUILD = '6' # build number should be reset to '1' for every new development cycle
BUILD = '7' # build number should be reset to '1' for every new development cycle

SUCCESS = 0
FAILURE = 1
Expand Down

0 comments on commit eaf3f18

Please sign in to comment.