diff --git a/.github/workflows/flake8-workflow.yml b/.github/workflows/flake8-workflow.yml index a4aa52789..979be048b 100644 --- a/.github/workflows/flake8-workflow.yml +++ b/.github/workflows/flake8-workflow.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11"] env: FLAKE8_VERSION: "==6.1.0" FLAKE8_CONFIG: ".flake8" diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 88d5d9322..3bff6d38c 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -11,7 +11,7 @@ jobs: continue-on-error: true strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.9', '3.10', '3.11'] steps: - name: Checkout Pilot3 repo uses: actions/checkout@v3 diff --git a/PILOTVERSION b/PILOTVERSION index b53bc7318..66566d18d 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.5.4 \ No newline at end of file +3.7.6.8 \ No newline at end of file diff --git a/doc/components/api/benchmark.rst b/doc/components/api/benchmark.rst deleted file mode 100644 index e96256a6a..000000000 --- a/doc/components/api/benchmark.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. - Pilot 2 pilot.api.benchmark doc file - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - Paul Nilsson, paul.nilsson@cern.ch, 2018 - -benchmark -========= - -.. automodule:: pilot.api.benchmark - :members: - :private-members: - :special-members: - :undoc-members: diff --git a/doc/components/api/index.rst b/doc/components/api/index.rst index 6812f8936..6851eab54 100644 --- a/doc/components/api/index.rst +++ b/doc/components/api/index.rst @@ -7,7 +7,7 @@ http://www.apache.org/licenses/LICENSE-2.0 Authors: - - Paul Nilsson, paul.nilsson@cern.ch, 2017-2018 + - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 api components ============== @@ -16,7 +16,6 @@ api components :maxdepth: 2 analytics - benchmark data memorymonitor services diff --git a/pilot.py b/pilot.py index ebd356b34..20a3c47d0 100755 --- a/pilot.py +++ b/pilot.py @@ -17,9 +17,9 @@ # under the License. # # Authors: -# - Mario Lassnig, mario.lassnig@cern.ch, 2016-17 +# - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 """This is the entry point for the PanDA Pilot, executed with 'python3 pilot.py '.""" @@ -121,19 +121,9 @@ def main() -> int: ) # note: assuming IPv6, fallback in place # check cvmfs if available - is_available = is_cvmfs_available() - if is_available is None: - pass # ignore this case - elif is_available is True: - timestamp = get_last_update() - if timestamp and timestamp > 0: - logger.info('CVMFS has been validated') - else: - logger.warning('CVMFS is not responding - aborting pilot') - return errors.CVMFSISNOTALIVE - else: - logger.warning('CVMFS is not alive - aborting pilot') - return errors.CVMFSISNOTALIVE + ec = check_cvmfs(logger) + if ec: + return ec if not args.rucio_host: args.rucio_host = config.Rucio.host @@ -202,6 +192,35 @@ def main() -> int: return exitcode +def check_cvmfs(logger: Any) -> int: + """ + Check if cvmfs is available. + + :param logger: logging object. + :return: exit code (int). 
+ """ + # skip all tests if required + if os.environ.get("NO_CVMFS_OK", False): + logger.info("skipping cvmfs checks") + return 0 + + is_available = is_cvmfs_available() + if is_available is None: + pass # ignore this case + elif is_available is True: + timestamp = get_last_update() + if timestamp and timestamp > 0: + logger.info('CVMFS has been validated') + else: + logger.warning('CVMFS is not responding - aborting pilot') + return errors.CVMFSISNOTALIVE + else: + logger.warning('CVMFS is not alive - aborting pilot') + return errors.CVMFSISNOTALIVE + + return 0 + + def str2bool(var: str) -> bool: """ Convert string to bool. @@ -638,7 +657,7 @@ def create_main_work_dir() -> (int, str): f"failed to create workdir at {_mainworkdir} -- aborting: {error}", file=sys.stderr, ) - exitcode = shell_exit_code(error._errorCode) + exitcode = shell_exit_code(error._error_code) else: _mainworkdir = getcwd() diff --git a/pilot/api/analytics.py b/pilot/api/analytics.py index 34ca9ef6d..5561b2684 100644 --- a/pilot/api/analytics.py +++ b/pilot/api/analytics.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """Functions for performing analytics including fitting of data.""" @@ -379,8 +379,6 @@ def chi2(self): def set_slope(self): """ Calculate and set the slope of the linear fit. - - :return: """ if self._ss2 and self._ss and self._ss != 0: self._slope = self._ss2 / float(self._ss) @@ -398,8 +396,6 @@ def slope(self): def set_intersect(self): """ Calculate and set the intersect of the linear fit. - - :return: """ if self._ym and self._slope and self._xm: self._intersect = self._ym - self._slope * self._xm diff --git a/pilot/api/benchmark.py b/pilot/api/benchmark.py deleted file mode 100644 index cd55e68ff..000000000 --- a/pilot/api/benchmark.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 - -"""Benchmark prototype.""" - -from .services import Services - -import logging -logger = logging.getLogger(__name__) - - -class Benchmark(Services): - """Benchmark service class.""" - - def __init__(self, *args): - """ - Init function. 
- - :param args: - """ - pass diff --git a/pilot/api/data.py b/pilot/api/data.py index b62df2568..02785845e 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -18,7 +18,7 @@ # # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2018-2019 @@ -29,7 +29,7 @@ import logging import time from functools import reduce - +from typing import Any try: import requests except ImportError: @@ -56,7 +56,7 @@ from pilot.util.tracereport import TraceReport -class StagingClient(object): +class StagingClient: """Base Staging Client.""" ipv = "IPv6" @@ -80,18 +80,23 @@ class StagingClient(object): # list of allowed schemas to be used for transfers from REMOTE sites remoteinput_allowed_schemas = ['root', 'gsiftp', 'dcap', 'srm', 'storm', 'https'] - def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_copytools='rucio', trace_report=None, ipv='IPv6', workdir=''): + def __init__(self, infosys_instance: Any = None, acopytools: dict = None, logger: Any = None, + default_copytools: str = 'rucio', trace_report: dict = None, ipv: str = 'IPv6', workdir: str = ""): """ Set default/init values. - If `acopytools` is not specified then it will be automatically resolved via infosys. In this case `infosys` requires initialization. + If `acopytools` is not specified then it will be automatically resolved via infosys. In this case `infosys` + requires initialization. + :param infosys_instance: infosys instance to be used for data resolution (Any) :param acopytools: copytool names per activity to be used for transfers. Accepts also list of names or string value without activity passed (dict) - :param logger: logging.Logger object to use for logging (None means no logging) + :param logger: logging.Logger object to use for logging (None means no logging) (Any) :param default_copytools: copytool name(s) to be used in case of unknown activity passed. Accepts either list of names or single string value (str) - :param ipv: internet protocol version (str). + :param trace_report: trace report object (dict) + :param ipv: internet protocol version (str) + :param workdir: working directory (str). """ - super(StagingClient, self).__init__() + super().__init__() if not logger: logger = logging.getLogger(__name__ + '.null') @@ -127,80 +132,74 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ raise PilotException("failed to resolve acopytools settings") logger.info('configured copytools per activity: acopytools=%s', self.acopytools) - def allow_mvfinaldest(self, catchall): + def allow_mvfinaldest(self, catchall: str): """ Check if there is an override in catchall to allow mv to final destination. :param catchall: catchall from queuedata (str) - :return: True if 'mv_final_destination' is present in catchall, otherwise False (bool) + :return: True if 'mv_final_destination' is present in catchall, otherwise False (bool). 
""" - return True if catchall and 'mv_final_destination' in catchall else False + return catchall and 'mv_final_destination' in catchall def set_acopytools(self): """Set the internal acopytools.""" if not self.acopytools: # resolve from queuedata.acopytools using infosys self.acopytools = (self.infosys.queuedata.acopytools or {}).copy() if not self.acopytools: # resolve from queuedata.copytools using infosys - self.acopytools = dict(default=list((self.infosys.queuedata.copytools or {}).keys())) + self.acopytools = {"default": list((self.infosys.queuedata.copytools or {}).keys())} @staticmethod - def get_default_copytools(default_copytools): + def get_default_copytools(default_copytools: str): """ - Get the default copytools. + Get the default copytools as a list. :param default_copytools: default copytools (str) :return: default copytools (str). """ if isinstance(default_copytools, str): default_copytools = [default_copytools] if default_copytools else [] + return default_copytools @classmethod - def get_preferred_replica(self, replicas, allowed_schemas): + def get_preferred_replica(cls, replicas: list, allowed_schemas: list) -> Any or None: """ Get preferred replica from the `replicas` list suitable for `allowed_schemas`. - :return: first matched replica or None if not found. + :param replicas: list of replicas (list) + :param allowed_schemas: list of allowed schemas (list) + :return: first matched replica or None if not found (Any or None). """ for replica in replicas: pfn = replica.get('pfn') for schema in allowed_schemas: if pfn and (not schema or pfn.startswith(f'{schema}://')): return replica + return None - def prepare_sources(self, files, activities=None): + def prepare_sources(self, files: list, activities: Any = None) -> None: """ Prepare sources. - Customize/prepare source data for each entry in `files` optionally checking data for requested `activities` - (custom StageClient could extend the logic if need) + Customize/prepare source data for each entry in `files` optionally checking data for requested `activities`. + (custom StageClient could extend the logic if needed). - :param files: list of `FileSpec` objects to be processed - :param activities: string or ordered list of activities to resolve `astorages` (optional) - :return: None. + :param files: list of `FileSpec` objects to be processed (list) + :param activities: string or ordered list of activities to resolve `astorages` (optional) (Any) + :return: (None) """ return None - def prepare_inputddms(self, files, activities=None): + def prepare_inputddms(self, files: list): """ Prepare input DDMs. - Populates filespec.inputddms for each entry from `files` list + Populates filespec.inputddms for each entry from `files` list. :param files: list of `FileSpec` objects - :param activities: sting or ordered list of activities to resolve astorages (optional). 
""" - activities = activities or 'read_lan' - if isinstance(activities, str): - activities = [activities] - astorages = self.infosys.queuedata.astorages if self.infosys and self.infosys.queuedata else {} - - storages = [] - for a in activities: - storages = astorages.get(a, []) - if storages: - break + storages = astorages.get('read_lan', []) #activity = activities[0] #if not storages: ## ignore empty astorages @@ -213,7 +212,7 @@ def prepare_inputddms(self, files, activities=None): if not fdat.inputddms and fdat.ddmendpoint: fdat.inputddms = [fdat.ddmendpoint] - def print_replicas(self, replicas, label='unsorted'): + def print_replicas(self, replicas: list, label: str = 'unsorted'): """ Print replicas. @@ -233,15 +232,15 @@ def print_replicas(self, replicas, label='unsorted'): break @classmethod - def sort_replicas(self, replicas, inputddms): + def sort_replicas(cls, replicas: list, inputddms: list) -> list: """ Sort input replicas. Consider first affected replicas from inputddms. - :param replicas: Prioritized list of replicas [(pfn, dat)] - :param inputddms: preferred list of ddmebdpoint - :return: sorted `replicas`. + :param replicas: Prioritized list of replicas [(pfn, dat)] (list) + :param inputddms: preferred list of ddmebdpoint (list) + :return: sorted list of `replicas` (list). """ if not inputddms: return replicas @@ -263,21 +262,22 @@ def sort_replicas(self, replicas, inputddms): return xreplicas - def resolve_replicas(self, files, use_vp=False): + def resolve_replicas(self, files: list, use_vp: bool = False) -> list: """ Populate filespec.replicas for each entry from `files` list. fdat.replicas = [{'ddmendpoint':'ddmendpoint', 'pfn':'replica', 'domain':'domain value'}] - :param files: list of `FileSpec` objects - :param use_vp: True for VP jobs (boolean) - :return: files object. + :param files: list of `FileSpec` objects (list) + :param use_vp: True for VP jobs (bool) + :raise: Exception in case of list_replicas() failure + :return: list of files (list). """ logger = self.logger xfiles = [] for fdat in files: - # skip fdat if need for further workflow (e.g. to properly handle OS ddms) + # skip fdat if needed (e.g. to properly handle OS ddms) xfiles.append(fdat) if not xfiles: # no files for replica look-up @@ -311,7 +311,7 @@ def resolve_replicas(self, files, use_vp=False): fdat.filesize = replica['bytes'] logger.warning("Filesize value for input file=%s is not defined, assigning info from Rucio replica: filesize=%s", fdat.lfn, replica['bytes']) - for ctype in ['adler32', 'md5']: + for ctype in ('adler32', 'md5'): if fdat.checksum.get(ctype) != replica[ctype] and replica[ctype]: logger.warning("Checksum value of input file=%s mismatched with info got from Rucio replica: checksum=%s, replica.checksum=%s, fdat=%s", fdat.lfn, fdat.checksum, replica[ctype], fdat) @@ -325,19 +325,19 @@ def resolve_replicas(self, files, use_vp=False): self.trace_report.update(clientState="DONE") logger.info('Number of resolved replicas:\n' + - '\n'.join(["lfn=%s: replicas=%s, is_directaccess=%s" - % (f.lfn, len(f.replicas or []), f.is_directaccess(ensure_replica=False)) for f in files])) + '\n'.join([f"lfn={f.lfn}: nr replicas={len(f.replicas or [])}, is_directaccess={f.is_directaccess(ensure_replica=False)}" for f in files])) return files - def list_replicas(self, xfiles, use_vp): + def list_replicas(self, xfiles: list, use_vp: bool) -> list: """ List Rucio replicas. 
Wrapper around rucio_client.list_replicas() - :param xfiles: files object + :param xfiles: list of files objects (list) :param use_vp: True for VP jobs (bool) + :raise: PilotException in case of list_replicas() failure :return: replicas (list). """ # load replicas from Rucio @@ -350,7 +350,7 @@ def list_replicas(self, xfiles, use_vp): query = { 'schemes': ['srm', 'root', 'davs', 'gsiftp', 'https', 'storm', 'file'], - 'dids': [dict(scope=e.scope, name=e.lfn) for e in xfiles], + 'dids': [{"scope": e.scope, "name": e.lfn} for e in xfiles], } query.update(sort='geoip', client_location=location) # reset the schemas for VP jobs @@ -367,20 +367,20 @@ def list_replicas(self, xfiles, use_vp): try: replicas = rucio_client.list_replicas(**query) except Exception as exc: - raise PilotException(f"Failed to get replicas from Rucio: {exc}", code=ErrorCodes.RUCIOLISTREPLICASFAILED) + raise PilotException(f"Failed to get replicas from Rucio: {exc}", code=ErrorCodes.RUCIOLISTREPLICASFAILED) from exc replicas = list(replicas) self.logger.debug(f"replicas received from Rucio: {replicas}") return replicas - def add_replicas(self, fdat, replica): + def add_replicas(self, fdat: Any, replica: Any) -> Any: """ Add the replicas to the fdat structure. - :param fdat: fdat object - :param replica: replica object - :return: updated fdat object. + :param fdat: fdat object (Any) + :param replica: replica object (Any) + :return: updated fdat object (Any). """ fdat.replicas = [] # reset replicas list @@ -439,9 +439,16 @@ def detect_client_location(self, use_vp: bool = False) -> dict: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(("8.8.8.8", 80)) ip = s.getsockname()[0] - except Exception as exc: - diagnostics = f'failed to get socket info: {exc}' + except socket.gaierror as e: + diagnostics = f'failed to get socket info due to address-related error: {e}' + self.logger.warning(diagnostics) + except socket.timeout as e: + diagnostics = f'failed to get socket info due to timeout: {e}' self.logger.warning(diagnostics) + except socket.error as e: + diagnostics = f'failed to get socket info due to general socket error: {e}' + self.logger.warning(diagnostics) + client_location['ip'] = ip site = os.environ.get('PILOT_RUCIO_SITENAME', 'unknown') client_location['site'] = site @@ -465,38 +472,59 @@ def detect_client_location(self, use_vp: bool = False) -> dict: client_location = response.json() # put back the site client_location['site'] = site - except Exception as exc: - diagnostics = f'requests.post failed: {exc}' + except requests.exceptions.Timeout as exc: + diagnostics = f'requests.post timed out: {exc}' + self.logger.warning(diagnostics) + except requests.exceptions.RequestException as exc: + diagnostics = f'requests.post failed with general exception: {exc}' self.logger.warning(diagnostics) self.logger.debug(f'will use client_location={client_location}') return client_location, diagnostics - def transfer_files(self, copytool, files, **kwargs): + def resolve_surl(self, fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> dict: + """ + Resolve SURL. + + Only needed in StageOutClient. + + Get final destination SURL for file to be transferred. + Can be customized at the level of specific copytool. + + :param fspec: `FileSpec` object (Any) + :param protocol: suggested protocol (dict) + :param ddmconf: full ddmconf data (dict) + :param kwargs: extra kwargs (dict) + :return: dictionary with keys ('pfn', 'ddmendpoint') (dict). 
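
# Sketch of the narrowed error handling that detect_client_location() now uses:
# requests.exceptions.Timeout is a subclass of RequestException, so it must be
# caught first for timeouts to be reported distinctly. The URL below is a
# placeholder, not the real location-service endpoint.
import requests


def post_json(url: str, payload: dict, timeout: int = 10) -> dict:
    """Return the decoded JSON response, or {} after printing a diagnostic."""
    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.Timeout as exc:
        print(f"requests.post timed out: {exc}")
    except requests.exceptions.RequestException as exc:  # base class for requests errors
        print(f"requests.post failed: {exc}")
    return {}


print(post_json("https://example.invalid/location", {"site": "unknown"}))  # -> {}
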
+ """ + raise NotImplementedError() + + def transfer_files(self, copytool: Any, files: list, activity: list, **kwargs: dict) -> list: """ Transfer the files. Apply transfer of given `files` using passed `copytool` module. Should be implemented by custom Staging Client. - :param copytool: copytool module - :param files: list of `FileSpec` objects - :param kwargs: extra kwargs to be passed to copytool transfer handler + :param copytool: copytool module (Any) + :param files: list of `FileSpec` objects (list) + :param activity: list of activity names used to determine appropriate copytool (list) + :param kwargs: extra kwargs to be passed to copytool transfer handler (dict) :raise: PilotException in case of controlled error. """ raise NotImplementedError() - def transfer(self, files, activity='default', **kwargs): # noqa: C901 + def transfer(self, files: list, activity: list or str = 'default', **kwargs: dict) -> list: # noqa: C901 """ Perform file transfer. Automatically stage passed files using copy tools related to given `activity`. - :param files: list of `FileSpec` objects - :param activity: list of activity names used to determine appropriate copytool (prioritized list) - :param kwargs: extra kwargs to be passed to copytool transfer handler + :param files: list of `FileSpec` objects (list) + :param activity: list of activity names used to determine appropriate copytool (list or str) + :param kwargs: extra kwargs to be passed to copytool transfer handler (dict) :raise: PilotException in case of controlled error - :return: list of processed `FileSpec` objects. + :return: list of processed `FileSpec` objects (list). """ self.trace_report.update(relativeStart=time.time(), transferStart=time.time()) @@ -512,8 +540,7 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 break if not copytools: - raise PilotException('failed to resolve copytool by preferred activities=%s, acopytools=%s' % - (activity, self.acopytools)) + raise PilotException(f'failed to resolve copytool by preferred activities={activity}, acopytools={self.acopytools}') # populate inputddms if needed self.prepare_inputddms(files) @@ -547,30 +574,30 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 code=ErrorCodes.UNKNOWNCOPYTOOL) module = self.copytool_modules[name]['module_name'] - self.logger.info('trying to use copytool=%s for activity=%s', name, activity) + self.logger.info(f'trying to use copytool={name} for activity={activity}') copytool = __import__(f'pilot.copytool.{module}', globals(), locals(), [module], 0) #self.trace_report.update(protocol=name) except PilotException as exc: caught_errors.append(exc) - self.logger.debug('error: %s', exc) + self.logger.debug(f'error: {exc}') continue except Exception as exc: - self.logger.warning('failed to import copytool module=%s, error=%s', module, exc) + self.logger.warning(f'failed to import copytool module={module}, error={exc}') continue try: result = self.transfer_files(copytool, remain_files, activity, **kwargs) - self.logger.debug('transfer_files() using copytool=%s completed with result=%s', copytool, str(result)) + self.logger.debug(f'transfer_files() using copytool={copytool} completed with result={result}') break except PilotException as exc: - self.logger.warning('failed to transfer_files() using copytool=%s .. skipped; error=%s', copytool, exc) + self.logger.warning(f'failed to transfer_files() using copytool={copytool} .. 
skipped; error={exc}') caught_errors.append(exc) except TimeoutException as exc: - self.logger.warning('function timed out: %s', exc) + self.logger.warning(f'function timed out: {exc}') caught_errors.append(exc) except Exception as exc: - self.logger.warning('failed to transfer files using copytool=%s .. skipped; error=%s', copytool, exc) + self.logger.warning(f'failed to transfer files using copytool={copytool} .. skipped; error={exc}') caught_errors.append(exc) import traceback self.logger.error(traceback.format_exc()) @@ -583,17 +610,17 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 if remain_files: # failed or incomplete transfer # propagate message from first error back up - errmsg = str(caught_errors[0]) if caught_errors else '' + # errmsg = str(caught_errors[0]) if caught_errors else '' if caught_errors and "Cannot authenticate" in str(caught_errors): code = ErrorCodes.STAGEINAUTHENTICATIONFAILURE elif caught_errors and "bad queue configuration" in str(caught_errors): code = ErrorCodes.BADQUEUECONFIGURATION elif caught_errors and isinstance(caught_errors[0], PilotException): code = caught_errors[0].get_error_code() - errmsg = caught_errors[0].get_last_error() + # errmsg = caught_errors[0].get_last_error() elif caught_errors and isinstance(caught_errors[0], TimeoutException): code = ErrorCodes.STAGEINTIMEOUT if self.mode == 'stage-in' else ErrorCodes.STAGEOUTTIMEOUT # is it stage-in/out? - self.logger.warning('caught time-out exception: %s', caught_errors[0]) + self.logger.warning(f'caught time-out exception: {caught_errors[0]}') else: code = ErrorCodes.STAGEINFAILED if self.mode == 'stage-in' else ErrorCodes.STAGEOUTFAILED # is it stage-in/out? details = str(caught_errors) + ":" + f'failed to transfer files using copytools={copytools}' @@ -602,14 +629,16 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 return files - def require_protocols(self, files, copytool, activity, local_dir=''): + def require_protocols(self, files: list, copytool: Any, activity: list or str, local_dir: str = ''): """ Require protocols. Populates fspec.protocols and fspec.turl for each entry in `files` according to preferred fspec.ddm_activity - :param files: list of `FileSpec` objects - :param activity: str or ordered list of transfer activity names to resolve acopytools related data. + :param files: list of `FileSpec` objects (list) + :param copytool: copytool module (Any) + :param activity: list of activity names used to determine appropriate copytool (list or str) + :param local_dir: local directory (str). """ allowed_schemas = getattr(copytool, 'allowed_schemas', None) @@ -632,13 +661,13 @@ def require_protocols(self, files, copytool, activity, local_dir=''): protocols = self.resolve_protocol(fspec, allowed_schemas) if not protocols and 'mv' not in self.infosys.queuedata.copytools: # no protocols found error = f'Failed to resolve protocol for file={fspec.lfn}, allowed_schemas={allowed_schemas}, fspec={fspec}' - self.logger.error("resolve_protocol: %s", error) + self.logger.error(f"resolve_protocol: {error}") raise PilotException(error, code=ErrorCodes.NOSTORAGEPROTOCOL) # take first available protocol for copytool: FIX ME LATER if need (do iterate over all allowed protocols?) 
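
# Minimal sketch of the copytool fallback pattern used by transfer() above: try
# each candidate module in order, collect the errors, and only give up when all
# copytools have failed. The pilot.copytool.<name> layout and the copy_in()
# entry point are taken from the diff; the loop works with any module exposing it.
import importlib


def transfer_with_fallback(copytool_names: list, files: list) -> list:
    caught_errors = []
    for name in copytool_names:
        try:
            copytool = importlib.import_module(f"pilot.copytool.{name}")
        except ImportError as exc:
            caught_errors.append(exc)
            continue
        try:
            return copytool.copy_in(files)  # assumed entry point, as in the diff
        except Exception as exc:  # mirror the deliberately broad catch in transfer()
            caught_errors.append(exc)
    raise RuntimeError(f"all copytools failed: {caught_errors}")
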
protocol = protocols[0] - self.logger.info("Resolved protocol to be used for transfer: \'%s\': lfn=\'%s\'", protocol, fspec.lfn) + self.logger.info(f"Resolved protocol to be used for transfer: \'{protocol}\': lfn=\'{fspec.lfn}\'") resolve_surl = getattr(copytool, 'resolve_surl', None) if not callable(resolve_surl): @@ -651,14 +680,16 @@ def require_protocols(self, files, copytool, activity, local_dir=''): if r.get('ddmendpoint'): fspec.ddmendpoint = r['ddmendpoint'] - def resolve_protocols(self, files): + def resolve_protocols(self, files: list) -> list: """ Resolve protocols. Populates filespec.protocols for each entry from `files` according to preferred `fspec.ddm_activity` value - :param files: list of `FileSpec` objects + fdat.protocols = [dict(endpoint, path, flavour), ..] - :return: `files` object. + + :param files: list of `FileSpec` objects (list) + :return: list of `files` object (list). """ ddmconf = self.infosys.resolve_storage_data() @@ -666,7 +697,7 @@ def resolve_protocols(self, files): ddm = ddmconf.get(fdat.ddmendpoint) if not ddm: error = f'Failed to resolve output ddmendpoint by name={fdat.ddmendpoint} (from PanDA), please check configuration.' - self.logger.error("resolve_protocols: %s, fspec=%s", error, fdat) + self.logger.error(f"resolve_protocols: {error}, fspec={fdat}") raise PilotException(error, code=ErrorCodes.NOSTORAGE) protocols = [] @@ -680,15 +711,15 @@ def resolve_protocols(self, files): return files @classmethod - def resolve_protocol(self, fspec, allowed_schemas=None): + def resolve_protocol(cls, fspec: Any, allowed_schemas: Any = None) -> list: """ Resolve protocol. Resolve protocol according to allowed schema - :param fspec: `FileSpec` instance - :param allowed_schemas: list of allowed schemas or any if None - :return: list of dict(endpoint, path, flavour). + :param fspec: `FileSpec` instance (list) + :param allowed_schemas: list of allowed schemas or any if None (Any) + :return: list of dict(endpoint, path, flavour) (list). """ if not fspec.protocols: return [] @@ -709,7 +740,7 @@ class StageInClient(StagingClient): mode = "stage-in" - def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None, domain=None): + def resolve_replica(self, fspec: Any, primary_schemas: Any = None, allowed_schemas: Any = None, domain: Any = None) -> dict or None: """ Resolve replica. @@ -718,9 +749,11 @@ def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None, dom Primary schemas ignore replica priority (used to resolve direct access replica, which could be not with top priority set). - :param fspec: input `FileSpec` objects - :param allowed_schemas: list of allowed schemas or any if None - :return: dict(surl, ddmendpoint, pfn, domain) or None if replica not found. + :param fspec: input `FileSpec` objects (Any) + :param primary_schemas: list of primary schemas or any if None (Any) + :param allowed_schemas: list of allowed schemas or any if None (Any) + :param domain: domain value to match (Any) + :return: dict(surl, ddmendpoint, pfn, domain) or None if replica not found (dict or None). """ if not fspec.replicas: self.logger.warning('resolve_replica() received no fspec.replicas') @@ -759,16 +792,33 @@ def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None, dom # prefer SRM protocol for surl -- to be verified, can it be deprecated? 
rse_replicas = replicas.get(replica['ddmendpoint'], []) surl = self.get_preferred_replica(rse_replicas, ['srm']) or rse_replicas[0] - self.logger.info("[stage-in] surl (srm replica) from Rucio: pfn=%s, ddmendpoint=%s", surl['pfn'], surl['ddmendpoint']) + self.logger.info(f"[stage-in] surl (srm replica) from Rucio: pfn={surl['pfn']}, ddmendpoint={surl['ddmendpoint']}") return {'surl': surl['pfn'], 'ddmendpoint': replica['ddmendpoint'], 'pfn': replica['pfn'], 'domain': replica['domain']} - def get_direct_access_variables(self, job): + def resolve_surl(self, fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> dict: + """ + Resolve SURL. + + Only needed in StageOutClient. + + Get final destination SURL for file to be transferred. + Can be customized at the level of specific copytool. + + :param fspec: `FileSpec` object (Any) + :param protocol: suggested protocol (dict) + :param ddmconf: full ddmconf data (dict) + :param kwargs: extra kwargs (dict) + :return: dictionary with keys ('pfn', 'ddmendpoint') (dict). + """ + raise NotImplementedError() + + def get_direct_access_variables(self, job: Any) -> (bool, str): """ Return the direct access settings for the PQ. - :param job: job object. - :return: allow_direct_access (bool), direct_access_type (string). + :param job: job object (Any) + :return: allow_direct_access (bool), direct_access_type (str). """ allow_direct_access, direct_access_type = False, '' if self.infosys.queuedata: # infosys is initialized @@ -782,19 +832,19 @@ def get_direct_access_variables(self, job): if job and not job.is_analysis() and job.transfertype != 'direct': # task forbids direct access allow_direct_access = False - self.logger.info('switched off direct access mode for production job since transfertype=%s', job.transfertype) + self.logger.info(f'switched off direct access mode for production job since transfertype={job.transfertype}') return allow_direct_access, direct_access_type - def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C901 + def transfer_files(self, copytool: Any, files: list, activity: list = None, **kwargs: dict) -> list: # noqa: C901 """ Automatically stage in files using the selected copy tool module. - :param copytool: copytool module - :param files: list of `FileSpec` objects - :param kwargs: extra kwargs to be passed to copytool transfer handler - - :return: list of processed `FileSpec` objects + :param copytool: copytool module (Any) + :param files: list of `FileSpec` objects (list) + :param activity: list of activity names used to determine appropriate copytool (list or None) + :param kwargs: extra kwargs to be passed to copytool transfer handler (dict) + :return: list of processed `FileSpec` objects (list) :raise: PilotException in case of controlled error. """ if getattr(copytool, 'require_replicas', False) and files: @@ -906,14 +956,14 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 # return copytool.copy_in_bulk(remain_files, **kwargs) return copytool.copy_in(remain_files, **kwargs) - def set_status_for_direct_access(self, files, workdir): + def set_status_for_direct_access(self, files: list, workdir: str): """ Update the FileSpec status with 'remote_io' for direct access mode. Should be called only once since the function sends traces. - :param files: list of FileSpec objects. - :param workdir: work directory (string). + :param files: list of FileSpec objects (list) + :param workdir: work directory (str). 
""" for fspec in files: direct_lan = (fspec.domain == 'lan' and fspec.direct_access_lan and @@ -969,26 +1019,25 @@ def set_status_for_direct_access(self, files, workdir): else: self.trace_report.send() - def check_availablespace(self, files): + def check_availablespace(self, files: list): """ Verify that enough local space is available to stage in and run the job. - :param files: list of FileSpec objects. + :param files: list of FileSpec objects (list) :raise: PilotException in case of not enough space or total input size too large. """ for f in files: - self.logger.debug('lfn=%s filesize=%d accessmode=%s', f.lfn, f.filesize, f.accessmode) + self.logger.debug(f'lfn={f.lfn} filesize={f.filesize} accessmode={f.accessmode}') maxinputsize = convert_mb_to_b(get_maximum_input_sizes()) totalsize = reduce(lambda x, y: x + y.filesize, files, 0) # verify total filesize if maxinputsize and totalsize > maxinputsize: - error = "too many/too large input files (%s). total file size=%s B > maxinputsize=%s B" % \ - (len(files), totalsize, maxinputsize) + error = f"too many/too large input files ({len(files)}). total file size={totalsize} B > maxinputsize={maxinputsize} B" raise SizeTooLarge(error) - self.logger.info("total input file size=%s B within allowed limit=%s B (zero value means unlimited)", totalsize, maxinputsize) + self.logger.info(f"total input file size={totalsize} B within allowed limit={maxinputsize} B (zero value means unlimited)") # get available space try: @@ -999,12 +1048,11 @@ def check_availablespace(self, files): else: if disk_space: available_space = convert_mb_to_b(disk_space) - self.logger.info("locally available space: %d B", available_space) + self.logger.info("locally available space: {available_space} B") # are we within the limit? if totalsize > available_space: - error = "not enough local space for staging input files and run the job (need %d B, but only have %d B)" % \ - (totalsize, available_space) + error = f"not enough local space for staging input files and run the job (need {totalsize} B, but only have {available_space} B)" raise NoLocalSpace(error) else: self.logger.warning('get_local_disk_space() returned None') @@ -1015,15 +1063,15 @@ class StageOutClient(StagingClient): mode = "stage-out" - def prepare_destinations(self, files, activities): + def prepare_destinations(self, files: list, activities: list or str) -> list: """ Resolve destination RSE (filespec.ddmendpoint) for each entry from `files` according to requested `activities`. Apply Pilot-side logic to choose proper destination. - :param files: list of FileSpec objects to be processed - :param activities: ordered list of activities to be used to resolve astorages - :return: updated fspec entries. + :param files: list of FileSpec objects to be processed (list) + :param activities: ordered list of activities to be used to resolve astorages (list or str) + :return: updated fspec entries (list). 
""" if not self.infosys.queuedata: # infosys is not initialized: not able to fix destination if need, nothing to do return files @@ -1036,7 +1084,6 @@ def prepare_destinations(self, files, activities): code=ErrorCodes.INTERNALPILOTPROBLEM, state='INTERNAL_ERROR') astorages = self.infosys.queuedata.astorages or {} - storages = None activity = activities[0] for a in activities: @@ -1047,15 +1094,16 @@ def prepare_destinations(self, files, activities): if not storages: if 'mv' in self.infosys.queuedata.copytools: return files - else: - raise PilotException("Failed to resolve destination: no associated storages defined for activity=%s (%s)" - % (activity, ','.join(activities)), code=ErrorCodes.NOSTORAGE, state='NO_ASTORAGES_DEFINED') + + act = ','.join(activities) + raise PilotException(f"Failed to resolve destination: no associated storages defined for activity={activity} ({act})", + code=ErrorCodes.NOSTORAGE, state='NO_ASTORAGES_DEFINED') # take the fist choice for now, extend the logic later if need ddm = storages[0] - self.logger.info("[prepare_destinations][%s]: allowed (local) destinations: %s", activity, storages) - self.logger.info("[prepare_destinations][%s]: resolved default destination ddm=%s", activity, ddm) + self.logger.info(f"[prepare_destinations][{activity}]: allowed (local) destinations: {storages}") + self.logger.info(f"[prepare_destinations][{activity}]: resolved default destination ddm={ddm}") for e in files: if not e.ddmendpoint: # no preferences => use default destination @@ -1071,7 +1119,7 @@ def prepare_destinations(self, files, activities): return files @classmethod - def get_path(self, scope, lfn, prefix='rucio'): + def get_path(cls, scope: str, lfn: str) -> str: """ Construct a partial Rucio PFN using the scope and the LFN. @@ -1079,29 +1127,29 @@ def get_path(self, scope, lfn, prefix='rucio'): :param scope: replica scope (str) :param lfn: repliva LFN (str) - :param prefix: prefix (str). + :return: constructed path (str). """ s = f'{scope}:{lfn}' hash_hex = hashlib.md5(s.encode('utf-8')).hexdigest() - #paths = [prefix] + scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn] # exclude prefix from the path: this should be properly considered in protocol/AGIS for today paths = scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn] paths = [_f for _f in paths if _f] # remove empty parts to avoid double /-chars return '/'.join(paths) - def resolve_surl(self, fspec, protocol, ddmconf, **kwargs): + def resolve_surl(self, fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> dict: """ Resolve SURL. Get final destination SURL for file to be transferred. Can be customized at the level of specific copytool. - :param protocol: suggested protocol - :param ddmconf: full ddmconf data - :param activity: ordered list of preferred activity names to resolve SE protocols - :return: dict with keys ('pfn', 'ddmendpoint'). + :param fspec: `FileSpec` object (Any) + :param protocol: suggested protocol (dict) + :param ddmconf: full ddmconf data (dict) + :param kwargs: extra kwargs (dict) + :return: dictionary with keys ('pfn', 'ddmendpoint') (dict). 
""" local_dir = kwargs.get('local_dir', '') if not local_dir: @@ -1113,23 +1161,24 @@ def resolve_surl(self, fspec, protocol, ddmconf, **kwargs): # path = protocol.get('path', '').rstrip('/') # if not (ddm.is_deterministic or (path and path.endswith('/rucio'))): if not ddm.is_deterministic: - raise PilotException('resolve_surl(): Failed to construct SURL for non deterministic ddm=%s: ' - 'NOT IMPLEMENTED' % fspec.ddmendpoint, code=ErrorCodes.NONDETERMINISTICDDM) + raise PilotException(f'resolve_surl(): Failed to construct SURL for non deterministic ' + f'ddm={fspec.ddmendpoint}: NOT IMPLEMENTED', code=ErrorCodes.NONDETERMINISTICDDM) surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), self.get_path(fspec.scope, fspec.lfn)) + return {'surl': surl} - def transfer_files(self, copytool, files, activity, **kwargs): + def transfer_files(self, copytool: Any, files: list, activity: list, **kwargs: dict) -> list: """ Transfer files. Automatically stage out files using the selected copy tool module. - :param copytool: copytool module - :param files: list of `FileSpec` objects - :param activity: ordered list of preferred activity names to resolve SE protocols - :param kwargs: extra kwargs to be passed to copytool transfer handler - :return: the output of the copytool transfer operation + :param copytool: copytool module (Any) + :param files: list of `FileSpec` objects (list) + :param activity: ordered list of preferred activity names to resolve SE protocols (list) + :param kwargs: extra kwargs to be passed to copytool transfer handler (dict) + :return: the output of the copytool transfer operation (list) :raise: PilotException in case of controlled error. """ # check if files exist before actual processing @@ -1163,7 +1212,9 @@ def transfer_files(self, copytool, files, activity, **kwargs): try: fspec.checksum[config.File.checksum_type] = calculate_checksum(pfn, algorithm=config.File.checksum_type) - except (FileHandlingFailure, NotImplementedError, Exception) as exc: + except (FileHandlingFailure, NotImplementedError) as exc: + raise exc + except Exception as exc: raise exc # prepare files (resolve protocol/transfer url) diff --git a/pilot/api/es_data.py b/pilot/api/es_data.py index 64a8af2a0..adc28c807 100644 --- a/pilot/api/es_data.py +++ b/pilot/api/es_data.py @@ -19,11 +19,12 @@ # Authors: # - Wen Guan, wen.guan@cern,ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2019 -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 """API for event service data transfers.""" import logging +from typing import Any from pilot.api.data import StageInClient, StageOutClient @@ -33,14 +34,14 @@ class StageInESClient(StageInClient): """Stage-in client.""" - def __init__(self, *argc, **kwargs): + def __init__(self, **kwargs: dict): """Set default/init values.""" - super(StageInESClient, self).__init__(*argc, **kwargs) + super().__init__(**kwargs) self.copytool_modules.setdefault('objectstore', {'module_name': 'objectstore'}) self.acopytools.setdefault('es_events_read', ['objectstore']) - def prepare_sources(self, files, activities=None): + def prepare_sources(self, files: list, activities: Any = None): """ Prepare sources. @@ -49,13 +50,15 @@ def prepare_sources(self, files, activities=None): If storage_id is specified, replace ddmendpoint by parsing storage_id. - :param files: list of `FileSpec` objects to be processed - :param activities: string or ordered list of activities to resolve `astorages` (optional) - :return: None. 
+        :param files: list of `FileSpec` objects to be processed (list)
+        :param activities: string or ordered list of activities to resolve `astorages` (optional) (list or str)
         """
         if not self.infosys:
             self.logger.warning('infosys instance is not initialized: skip calling prepare_sources()')
-            return None
+            return
+
+        if activities:
+            pass  # to bypass pylint complaint about activities not used (it cannot be removed)
 
         for fspec in files:
             if fspec.storage_token:  ## FIX ME LATER: no need to parse each time storage_id, all this staff should be applied in FileSpec clean method
@@ -65,15 +68,14 @@ def prepare_sources(self, files, activities=None):
                 if storage_id:
                     fspec.ddmendpoint = self.infosys.get_ddmendpoint(storage_id)
                 logger.info(f"Processed file with storage id: {fspec}")
-        return None
 
 
 class StageOutESClient(StageOutClient):
     """Stage-out client."""
 
-    def __init__(self, *argc, **kwargs):
+    def __init__(self, **kwargs: dict):
         """Set default/init values."""
-        super(StageOutESClient, self).__init__(*argc, **kwargs)
+        super().__init__(**kwargs)
 
         self.copytool_modules.setdefault('objectstore', {'module_name': 'objectstore'})
         self.acopytools.setdefault('es_events', ['objectstore'])
diff --git a/pilot/api/memorymonitor.py b/pilot/api/memorymonitor.py
index deed32825..87e08c153 100644
--- a/pilot/api/memorymonitor.py
+++ b/pilot/api/memorymonitor.py
@@ -17,14 +17,15 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023
+# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024
 
 """API for memory monitoring."""
 
+import logging
 from os import getcwd
+
 from .services import Services
 
-import logging
 logger = logging.getLogger(__name__)
 
 
@@ -36,14 +37,14 @@ class MemoryMonitoring(Services):
     workdir = ""   # Job work directory
     _cmd = ""      # Memory monitoring command (full path, all options)
 
-    def __init__(self, **kwargs):
+    def __init__(self, **kwargs: dict):
         """
         Init function.
 
-        :param kwargs: kwargs dictionary.
+        :param kwargs: kwargs dictionary (dict).
         """
-        for key in kwargs:
-            setattr(self, key, kwargs[key])
+        for key, value in kwargs.items():
+            setattr(self, key, value)
 
         if not self.workdir:
             self.workdir = getcwd()
@@ -52,7 +53,7 @@ def __init__(self, **kwargs):
         user_utility = __import__(f'pilot.user.{self.user}.utilities', globals(), locals(), [self.user], 0)  # Python 2/3
         self._cmd = user_utility.get_memory_monitor_setup(self.pid, self.workdir)
 
-    def get_command(self):
+    def get_command(self) -> str:
         """
         Return the full command for the memory monitor.
 
@@ -64,15 +65,15 @@ def execute(self):
         """
         Execute the memory monitor command.
 
         :return: process (currently None).
         """
         return None
 
-    def get_filename(self):
+    def get_filename(self) -> str:
         """
         Return the filename from the memory monitor tool.
 
-        :return: fiename (str).
+        :return: filename (str).
         """
         return ""
 
@@ -80,6 +81,6 @@ def get_results(self):
         """
         Return the results from the memory monitoring.
 
         :return: results (currently None).
         """
         return None
diff --git a/pilot/api/services.py b/pilot/api/services.py
index 3e7e08d18..1aa352fd5 100644
--- a/pilot/api/services.py
+++ b/pilot/api/services.py
@@ -17,9 +17,9 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2023
+# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024
 
-"""High-level base class for Benchmark(), MemoryMonitoring() and Analytics() classes."""
+"""High-level base class for MemoryMonitoring and Analytics classes."""
 
 # from pilot.util.container import execute
 
@@ -28,4 +28,4 @@
 
 class Services:
-    """High-level base class for Benchmark(), MemoryMonitoring() and Analytics() classes."""
+    """High-level base class for MemoryMonitoring and Analytics classes."""
diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py
index 4c84c3736..890763a6b 100644
--- a/pilot/common/errorcodes.py
+++ b/pilot/common/errorcodes.py
@@ -17,13 +17,13 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023
+# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024
 # - Wen Guan, wen.guan@cern.ch, 2018
 
 """Error codes set by the pilot."""
 
 import re
-from typing import Any, Tuple, List
+from typing import Any
 
 
 class ErrorCodes:
@@ -36,8 +36,8 @@ class ErrorCodes:
     """
 
     # global variables shared by all modules/jobs
-    pilot_error_codes: List[int] = []
-    pilot_error_diags: List[str] = []
+    pilot_error_codes: list[int] = []
+    pilot_error_diags: list[str] = []
 
     # Error code constants (from Pilot 1)
     GENERALERROR = 1008
@@ -319,7 +319,7 @@ class ErrorCodes:
         LEASETIME: "Lease time is up",  # internal use only
         LOGCREATIONTIMEOUT: "Log file creation timed out",
         CVMFSISNOTALIVE: "CVMFS is not responding",
-        LSETUPTIMEDOUT: "Lsetup command timed out during remote file open"
+        LSETUPTIMEDOUT: "Lsetup command timed out during remote file open",
     }
 
     put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181]
@@ -337,12 +337,14 @@ def get_kill_signal_error_code(self, signal: str) -> int:
         :param signal: signal name (str).
         :return: Pilot error code (int).
         """
-        signals_dictionary = {'SIGTERM': self.SIGTERM,
-                              'SIGQUIT': self.SIGQUIT,
-                              'SIGSEGV': self.SIGSEGV,
-                              'SIGXCPU': self.SIGXCPU,
-                              'SIGUSR1': self.SIGUSR1,
-                              'SIGBUS': self.SIGBUS}
+        signals_dictionary = {
+            "SIGTERM": self.SIGTERM,
+            "SIGQUIT": self.SIGQUIT,
+            "SIGSEGV": self.SIGSEGV,
+            "SIGXCPU": self.SIGXCPU,
+            "SIGUSR1": self.SIGUSR1,
+            "SIGBUS": self.SIGBUS,
+        }
 
         return signals_dictionary.get(signal, self.KILLSIGNAL)
 
@@ -355,7 +357,9 @@ def get_error_message(self, errorcode: int) -> str:
         """
         return self._error_messages.get(errorcode, f"unknown error code: {errorcode}")
 
-    def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None) -> Tuple[list, list]:
+    def add_error_code(
+        self, errorcode: int, priority: bool = False, msg: Any = None
+    ) -> tuple[list, list]:
         """
         Add pilot error code to list of error codes.
 
@@ -365,10 +369,8 @@ def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None
         The function also sets the corresponding error message.
 
         :param errorcode: pilot error code (int)
-        :param pilot_error_codes: list of pilot error codes (list of integers)
-        :param pilot_error_diags: list of pilot error diags (list of strings)
         :param priority: if set to True, the new errorcode will be added to the error code list first (highest priority) (bool)
-        :param msg: error message (more detailed) to overwrite standard error message (str).
+        :param msg: error message (more detailed) to overwrite standard error message (str)
         :return: pilot_error_codes (list), pilot_error_diags (list).
""" # do nothing if the error code has already been added @@ -385,7 +387,7 @@ def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None return pilot_error_codes, pilot_error_diags - def remove_error_code(self, errorcode: int) -> Tuple[list, list]: + def remove_error_code(self, errorcode: int) -> tuple[list, list]: """ Silently remove an error code and its diagnostics from the internal error lists. @@ -419,7 +421,7 @@ def report_errors(self) -> str: counter = 0 pilot_error_codes = ErrorCodes.pilot_error_codes pilot_error_diags = ErrorCodes.pilot_error_diags - if pilot_error_codes == []: + if not pilot_error_codes: report = "no pilot errors were reported" else: report = "Nr.\tError code\tError diagnostics" @@ -447,7 +449,7 @@ def resolve_transform_error(self, exit_code: int, stderr: str) -> int: "Singularity is not installed": self.SINGULARITYNOTINSTALLED, "Apptainer is not installed": self.APPTAINERNOTINSTALLED, "cannot create directory": self.MKDIR, - "General payload setup verification error": self.SETUPFAILURE + "General payload setup verification error": self.SETUPFAILURE, } # Check if stderr contains any known error messages @@ -458,15 +460,15 @@ def resolve_transform_error(self, exit_code: int, stderr: str) -> int: # Handle specific exit codes if exit_code == 2: return self.LSETUPTIMEDOUT - elif exit_code == 3: + if exit_code == 3: return self.REMOTEFILEOPENTIMEDOUT - elif exit_code == 251: + if exit_code == 251: return self.UNKNOWNTRFFAILURE - elif exit_code == -1: + if exit_code == -1: return self.UNKNOWNTRFFAILURE - elif exit_code == self.COMMANDTIMEDOUT: + if exit_code == self.COMMANDTIMEDOUT: return exit_code - elif exit_code != 0: + if exit_code != 0: return self.PAYLOADEXECUTIONFAILURE return exit_code # Return original exit code if no specific error is found @@ -482,7 +484,9 @@ def extract_stderr_error(self, stderr: str) -> str: if "command not found" in stderr: msg = stderr else: - msg = self.get_message_for_pattern([r"ERROR\s*:\s*(.*)", r"Error\s*:\s*(.*)", r"error\s*:\s*(.*)"], stderr) + msg = self.get_message_for_pattern( + [r"ERROR\s*:\s*(.*)", r"Error\s*:\s*(.*)", r"error\s*:\s*(.*)"], stderr + ) return msg def extract_stderr_warning(self, stderr: str) -> str: @@ -492,7 +496,10 @@ def extract_stderr_warning(self, stderr: str) -> str: :param stderr: stderr (str) :return: warning message (str). 
""" - return self.get_message_for_pattern([r"WARNING\s*:\s*(.*)", r"Warning\s*:\s*(.*)", r"warning\s*:\s*(.*)"], stderr) + return self.get_message_for_pattern( + [r"WARNING\s*:\s*(.*)", r"Warning\s*:\s*(.*)", r"warning\s*:\s*(.*)"], + stderr, + ) def get_message_for_pattern(self, patterns: list, stderr: str) -> str: """ @@ -524,19 +531,19 @@ def format_diagnostics(self, code: int, diag: str) -> str: max_message_length = 256 try: standard_message = self._error_messages[code] + ":" - except Exception: + except KeyError: standard_message = "" # extract the relevant info for reporting exceptions if "Traceback" in diag: - pattern = 'details:(.+)' + pattern = "details:(.+)" found = re.findall(pattern, diag) if found: diag = found[0] - diag = re.sub(r'\[?PilotException\(\"?\'?', r'', diag) - diag = re.sub(r'\[?StageInFailure\(\"?\'?', r'', diag) - diag = re.sub(r'\[?StageOutFailure\(\"?\'?', r'', diag) - diag = re.sub(' +', ' ', diag) + diag = re.sub(r"\[?PilotException\(\"?\'?", r"", diag) + diag = re.sub(r"\[?StageInFailure\(\"?\'?", r"", diag) + diag = re.sub(r"\[?StageOutFailure\(\"?\'?", r"", diag) + diag = re.sub(" +", " ", diag) try: if diag: @@ -545,21 +552,29 @@ def format_diagnostics(self, code: int, diag: str) -> str: # e.g. "Failed to stage-in file:abcdefghijklmnopqrstuvwxyz0123456789" if standard_message in diag: if len(diag) > max_message_length: - error_message = standard_message + diag[-(max_message_length - len(standard_message)):] + error_message = ( + standard_message + + diag[-(max_message_length - len(standard_message)):] + ) else: - error_message = standard_message + diag[len(standard_message):][-max_message_length:] + error_message = ( + standard_message + + diag[len(standard_message):][-max_message_length:] + ) + elif len(diag) + len(standard_message) > max_message_length: + error_message = ( + standard_message + + diag[:(max_message_length + len(standard_message))] + ) else: - if len(diag) + len(standard_message) > max_message_length: - error_message = standard_message + diag[:(max_message_length + len(standard_message))] - else: - error_message = standard_message + diag + error_message = standard_message + diag - if '::' in error_message: - error_message = re.sub(':+', ':', error_message) + if "::" in error_message: + error_message = re.sub(":+", ":", error_message) else: error_message = standard_message - except Exception: + except (TypeError, IndexError): error_message = diag return error_message diff --git a/pilot/common/exception.py b/pilot/common/exception.py index 373e4aa53..a731df736 100644 --- a/pilot/common/exception.py +++ b/pilot/common/exception.py @@ -18,44 +18,53 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2017-2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 """Exceptions set by the pilot.""" import time import threading import traceback +from collections.abc import Callable from sys import exc_info -from typing import Callable, Any, Dict +from typing import Any from .errorcodes import ErrorCodes + errors = ErrorCodes() class PilotException(Exception): """Pilot exceptions class.""" + _error_code = None + _message = None + def __init__(self, *args, **kwargs): """Set default or initial values.""" super().__init__(args, kwargs) self.args = args self.kwargs = kwargs - code = self.kwargs.get('code', None) + code = self.kwargs.get("code", None) if code: - self._errorCode = code + self._error_code = code else: - self._errorCode = errors.UNKNOWNEXCEPTION - self._message = 
errors.get_error_message(self._errorCode) + self._error_code = errors.UNKNOWNEXCEPTION + self._message = errors.get_error_message(self._error_code) self._error_string = None - self._stack_trace = f'{traceback.format_exc()}' + self._stack_trace = f"{traceback.format_exc()}" def __str__(self): """Set and return the error string for string representation of the class instance.""" try: - self._error_string = f"error code: {self._errorCode}, message: {self._message % self.kwargs}" + self._error_string = ( + f"error code: {self._error_code}, message: {self._message % self.kwargs}" + ) except TypeError: # at least get the core message out if something happened - self._error_string = f"error code: {self._errorCode}, message: {self._message}" + self._error_string = ( + f"error code: {self._error_code}, message: {self._message}" + ) if len(self.args) > 0: # If there is a non-kwarg parameter, assume it's the error @@ -63,25 +72,29 @@ def __str__(self): # of the exception message # Convert all arguments into their string representations... try: - args = [f'{arg}' for arg in self.args if arg] + args = [f"{arg}" for arg in self.args if arg] except TypeError: - args = [f'{self.args}'] - self._error_string = (self._error_string + "\ndetails: %s" % '\n'.join(args)) + args = [f"{self.args}"] + self._error_string = self._error_string + "\ndetails: %s" % "\n".join(args) return self._error_string.strip() def get_detail(self): """Set and return the error string with the exception details.""" try: - self._error_string = f"error code: {self._errorCode}, message: {self._message % self.kwargs}" + self._error_string = ( + f"error code: {self._error_code}, message: {self._message % self.kwargs}" + ) except TypeError: # at least get the core message out if something happened - self._error_string = f"error code: {self._errorCode}, message: {self._message}" + self._error_string = ( + f"error code: {self._error_code}, message: {self._message}" + ) return self._error_string + f"\nstacktrace: {self._stack_trace}" def get_error_code(self): """Return the error code.""" - return self._errorCode + return self._error_code def get_last_error(self): """Return the last error message.""" @@ -90,14 +103,14 @@ def get_last_error(self): return self._message -#class NotImplementedError(PilotException): +# class NotImplementedError(PilotException): # """ # Not implemented exception. 
# """ # def __init__(self, *args, **kwargs): # super().__init__(args, kwargs) -# self._errorCode = errors.NOTIMPLEMENTED -# self._message = errors.get_error_message(self._errorCode) +# self._error_code = errors.NOTIMPLEMENTED +# self._message = errors.get_error_message(self._error_code) class UnknownException(PilotException): @@ -106,8 +119,8 @@ class UnknownException(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.UNKNOWNEXCEPTION - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.UNKNOWNEXCEPTION + self._message = errors.get_error_message(self._error_code) class NoLocalSpace(PilotException): @@ -116,8 +129,8 @@ class NoLocalSpace(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOLOCALSPACE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOLOCALSPACE + self._message = errors.get_error_message(self._error_code) class SizeTooLarge(PilotException): @@ -126,8 +139,8 @@ class SizeTooLarge(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.SIZETOOLARGE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.SIZETOOLARGE + self._message = errors.get_error_message(self._error_code) class StageInFailure(PilotException): @@ -136,8 +149,8 @@ class StageInFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.STAGEINFAILED - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.STAGEINFAILED + self._message = errors.get_error_message(self._error_code) class StageOutFailure(PilotException): @@ -146,8 +159,8 @@ class StageOutFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.STAGEOUTFAILED - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.STAGEOUTFAILED + self._message = errors.get_error_message(self._error_code) class SetupFailure(PilotException): @@ -156,8 +169,8 @@ class SetupFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.SETUPFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.SETUPFAILURE + self._message = errors.get_error_message(self._error_code) class RunPayloadFailure(PilotException): @@ -166,8 +179,8 @@ class RunPayloadFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.PAYLOADEXECUTIONFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.PAYLOADEXECUTIONFAILURE + self._message = errors.get_error_message(self._error_code) class MessageFailure(PilotException): @@ -176,8 +189,8 @@ class MessageFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.MESSAGEHANDLINGFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.MESSAGEHANDLINGFAILURE + self._message = 
errors.get_error_message(self._error_code) class CommunicationFailure(PilotException): @@ -186,8 +199,8 @@ class CommunicationFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.COMMUNICATIONFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.COMMUNICATIONFAILURE + self._message = errors.get_error_message(self._error_code) class FileHandlingFailure(PilotException): @@ -196,8 +209,8 @@ class FileHandlingFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.FILEHANDLINGFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.FILEHANDLINGFAILURE + self._message = errors.get_error_message(self._error_code) class NoSuchFile(PilotException): @@ -206,8 +219,8 @@ class NoSuchFile(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOSUCHFILE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOSUCHFILE + self._message = errors.get_error_message(self._error_code) class ConversionFailure(PilotException): @@ -216,8 +229,8 @@ class ConversionFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.CONVERSIONFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.CONVERSIONFAILURE + self._message = errors.get_error_message(self._error_code) class MKDirFailure(PilotException): @@ -226,8 +239,8 @@ class MKDirFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.MKDIR - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.MKDIR + self._message = errors.get_error_message(self._error_code) class NoGridProxy(PilotException): @@ -236,8 +249,8 @@ class NoGridProxy(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOPROXY - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOPROXY + self._message = errors.get_error_message(self._error_code) class NoVomsProxy(PilotException): @@ -246,8 +259,8 @@ class NoVomsProxy(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOVOMSPROXY - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOVOMSPROXY + self._message = errors.get_error_message(self._error_code) class TrfDownloadFailure(PilotException): @@ -256,8 +269,8 @@ class TrfDownloadFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.TRFDOWNLOADFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.TRFDOWNLOADFAILURE + self._message = errors.get_error_message(self._error_code) class NotDefined(PilotException): @@ -266,8 +279,8 @@ class NotDefined(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOTDEFINED - 
self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOTDEFINED + self._message = errors.get_error_message(self._error_code) class NotSameLength(PilotException): @@ -276,8 +289,8 @@ class NotSameLength(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOTSAMELENGTH - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOTSAMELENGTH + self._message = errors.get_error_message(self._error_code) class ESRecoverable(PilotException): @@ -286,8 +299,8 @@ class ESRecoverable(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.ESRECOVERABLE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.ESRECOVERABLE + self._message = errors.get_error_message(self._error_code) class ESFatal(PilotException): @@ -296,8 +309,8 @@ class ESFatal(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.ESFATAL - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.ESFATAL + self._message = errors.get_error_message(self._error_code) class ExecutedCloneJob(PilotException): @@ -306,8 +319,8 @@ class ExecutedCloneJob(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.EXECUTEDCLONEJOB - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.EXECUTEDCLONEJOB + self._message = errors.get_error_message(self._error_code) class ESNoEvents(PilotException): @@ -316,8 +329,8 @@ class ESNoEvents(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.ESNOEVENTS - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.ESNOEVENTS + self._message = errors.get_error_message(self._error_code) class ExceededMaxWaitTime(PilotException): @@ -326,8 +339,8 @@ class ExceededMaxWaitTime(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.EXCEEDEDMAXWAITTIME - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.EXCEEDEDMAXWAITTIME + self._message = errors.get_error_message(self._error_code) class BadXML(PilotException): @@ -336,8 +349,8 @@ class BadXML(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.BADXML - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.BADXML + self._message = errors.get_error_message(self._error_code) class NoSoftwareDir(PilotException): @@ -346,8 +359,8 @@ class NoSoftwareDir(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOSOFTWAREDIR - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOSOFTWAREDIR + self._message = errors.get_error_message(self._error_code) class LogFileCreationFailure(PilotException): @@ -356,8 +369,8 @@ class LogFileCreationFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" 
super().__init__(args, kwargs) - self._errorCode = errors.LOGFILECREATIONFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.LOGFILECREATIONFAILURE + self._message = errors.get_error_message(self._error_code) class QueuedataFailure(PilotException): @@ -366,8 +379,8 @@ class QueuedataFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.QUEUEDATA - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.QUEUEDATA + self._message = errors.get_error_message(self._error_code) class QueuedataNotOK(PilotException): @@ -376,8 +389,8 @@ class QueuedataNotOK(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.QUEUEDATANOTOK - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.QUEUEDATANOTOK + self._message = errors.get_error_message(self._error_code) class ReplicasNotFound(PilotException): @@ -386,8 +399,8 @@ class ReplicasNotFound(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOREPLICAS - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOREPLICAS + self._message = errors.get_error_message(self._error_code) class MiddlewareImportFailure(PilotException): @@ -396,8 +409,8 @@ class MiddlewareImportFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.MIDDLEWAREIMPORTFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.MIDDLEWAREIMPORTFAILURE + self._message = errors.get_error_message(self._error_code) class JobAlreadyRunning(PilotException): @@ -406,8 +419,8 @@ class JobAlreadyRunning(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.JOBALREADYRUNNING - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.JOBALREADYRUNNING + self._message = errors.get_error_message(self._error_code) def __str__(self): """Return string for exception object.""" @@ -418,7 +431,9 @@ def __str__(self): class ExcThread(threading.Thread): """Support class that allows for catching exceptions in threads.""" - def __init__(self, bucket: Any, target: Callable, kwargs: Dict[str, Any], name: str): + def __init__( + self, bucket: Any, target: Callable, kwargs: dict[str, Any], name: str + ): """ Set data members. @@ -446,30 +461,45 @@ def run(self): by the run() function and placed in the bucket belonging to the retrieve thread. The bucket is emptied in job.control(). 
""" + # pylint: disable=broad-except try: - self.target(**self.kwargs) + self._target(**self.kwargs) + except ValueError: + print(f"ValueError caught by thread run() function: {exc_info()}") + print(traceback.format_exc()) + print(traceback.print_tb(exc_info()[2])) + self.bucket.put(exc_info()) + print( + f"exception has been put in bucket queue belonging to thread '{self.name}'" + ) + args = self._kwargs.get("args", None) + if args: + print( + "setting graceful stop in 10 s since there is no point in continuing" + ) + time.sleep(10) + args.graceful_stop.set() except Exception: # logger object can't be used here for some reason: # IOError: [Errno 2] No such file or directory: '/state/partition1/scratch/PanDA_Pilot2_*/pilotlog.txt' - print(f'exception caught by thread run() function: {exc_info()}') + print(f"unexpected exception caught by thread run() function: {exc_info()}") print(traceback.format_exc()) print(traceback.print_tb(exc_info()[2])) self.bucket.put(exc_info()) - print(f"exception has been put in bucket queue belonging to thread \'{self.name}\'") - args = self._kwargs.get('args', None) + print( + f"exception has been put in bucket queue belonging to thread '{self.name}'" + ) + args = self._kwargs.get("args", None) if args: # the sleep is needed to allow the threads to catch up - print('setting graceful stop in 10 s since there is no point in continuing') + print( + "setting graceful stop in 10 s since there is no point in continuing" + ) time.sleep(10) args.graceful_stop.set() @property - def target(self) -> Callable: - """Help Pyright understand the type for self._target.""" - return self._target - - @property - def kwargs(self) -> Dict[str, Any]: + def kwargs(self) -> dict[str, Any]: """Help Pyright understand the type for self._kwargs.""" return self._kwargs diff --git a/pilot/common/pluginfactory.py b/pilot/common/pluginfactory.py index 934ad56ad..389ee62b7 100644 --- a/pilot/common/pluginfactory.py +++ b/pilot/common/pluginfactory.py @@ -22,40 +22,43 @@ """A factory to manage plugins.""" -from typing import Any, Dict +from typing import Any import logging + logger = logging.getLogger(__name__) class PluginFactory: """Plugin factory class.""" - def __init__(self, *args: Any, **kwargs: Any): + def __init__(self, **kwargs: dict): """Set initial values.""" - self.classMap: Dict[str, Any] = {} + if kwargs: # to avoid pylint complaint + pass + self.classMap: dict[str, Any] = {} def get_plugin(self, confs: dict) -> dict: """ Load plugin class. - :param confs: a dict of configurations. - :return: plugin class. + :param confs: a dict of configurations (dict) + :return: plugin class (dict). 
""" - class_name = confs['class'] + class_name = confs["class"] if class_name is None: logger.error(f"class is not defined in confs: {confs}") return {} if class_name not in self.classMap: logger.info(f"trying to import {class_name}") - components = class_name.split('.') - mod = __import__('.'.join(components[:-1])) + components = class_name.split(".") + mod = __import__(".".join(components[:-1])) for comp in components[1:]: mod = getattr(mod, comp) self.classMap[class_name] = mod args = {} - excluded_keys = {'class'} # Use a set to store keys to exclude + excluded_keys = {"class"} # Use a set to store keys to exclude for key in confs: if key in excluded_keys: continue diff --git a/pilot/control/data.py b/pilot/control/data.py index 9c8577ffc..12d6a33f7 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -25,8 +25,10 @@ """Control interface to data API.""" +import logging import os import time +import traceback import queue from typing import Any from pathlib import Path @@ -75,6 +77,10 @@ find_files_with_pattern, rename_xrdlog ) +from pilot.util.middleware import ( + containerise_middleware, + use_middleware_script +) from pilot.util.processes import threads_aborted from pilot.util.queuehandling import ( declare_failed_by_kill, @@ -82,9 +88,7 @@ ) from pilot.util.timing import add_to_pilot_timing from pilot.util.tracereport import TraceReport -import pilot.util.middleware -import logging logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -102,7 +106,7 @@ def control(queues: Any, traces: Any, args: Any): threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, name=name) for name, target in list(targets.items())] # Python 2/3 - [thread.start() for thread in threads] + _ = [thread.start() for thread in threads] # if an exception is thrown, the graceful_stop will be set by the ExcThread class run() function try: @@ -114,7 +118,8 @@ def control(queues: Any, traces: Any, args: Any): except queue.Empty: pass else: - exc_type, exc_obj, exc_trace = exc + # exc_type, exc_obj, exc_trace = exc + _, exc_obj, _ = exc logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception @@ -260,18 +265,17 @@ def _stage_in(args: Any, job: Any) -> bool: label = 'stage-in' # should stage-in be done by a script (for containerisation) or by invoking the API (ie classic mode)? 
- use_container = pilot.util.middleware.use_middleware_script(job.infosys.queuedata.container_type.get("middleware")) + use_container = use_middleware_script(job.infosys.queuedata.container_type.get("middleware")) if use_container: logger.info('stage-in will be done in a container') try: eventtype, localsite, remotesite = get_trace_report_variables(job, label=label) - pilot.util.middleware.containerise_middleware(job, args, job.indata, eventtype, localsite, remotesite, - job.infosys.queuedata.container_options, label=label, - container_type=job.infosys.queuedata.container_type.get("middleware")) + containerise_middleware(job, args, job.indata, eventtype, localsite, remotesite, + job.infosys.queuedata.container_options, label=label, + container_type=job.infosys.queuedata.container_type.get("middleware")) except PilotException as error: logger.warning('stage-in containerisation threw a pilot exception: %s', error) except Exception as error: - import traceback logger.warning('stage-in containerisation threw an exception: %s', error) logger.error(traceback.format_exc()) else: @@ -298,7 +302,6 @@ def _stage_in(args: Any, job: Any) -> bool: client.prepare_sources(job.indata) client.transfer(job.indata, activity=activity, **kwargs) except PilotException as error: - import traceback error_msg = traceback.format_exc() logger.error(error_msg) msg = errors.format_diagnostics(error.get_error_code(), error_msg) @@ -322,7 +325,10 @@ def _stage_in(args: Any, job: Any) -> bool: add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args) remain_files = [infile for infile in job.indata if infile.status not in ['remote_io', 'transferred', 'no_transfer']] - logger.info("stage-in finished") if not remain_files else logger.info("stage-in failed") + if not remain_files: + logger.info("stage-in finished") + else: + logger.info("stage-in failed") os.environ['PILOT_JOB_STATE'] = 'stageincompleted' return not remain_files @@ -698,8 +704,22 @@ def get_input_file_dictionary(indata: list) -> dict: return ret -def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool, input_files: list = [], - output_files: list = [], piloterrors: list = [], debugmode: bool = False): +def remove_input_output_files(workdir: str, files: list): + """ + Remove input/output files. + + :param workdir: work directory (str) + :param files: list of files to remove (list). + """ + for fname in files: + path = os.path.join(workdir, fname) + if os.path.exists(path): + logger.info(f'removing file: {path}') + remove(path) + + +def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool, # noqa: C901 + input_files: list = None, output_files: list = None, piloterrors: list = None, debugmode: bool = False): # noqa: C901 """ Create the tarball for the job. @@ -713,8 +733,13 @@ def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool :param debugmode: True if debug mode has been switched on (bool) :raises LogFileCreationFailure: in case of log file creation problem. 
""" + if input_files is None: + input_files = [] + if output_files is None: + output_files = [] + if piloterrors is None: + piloterrors = [] logger.debug(f'preparing to create log file (debug mode={debugmode})') - logger.debug(f'workdir: {workdir}') # PILOT_HOME is the launch directory of the pilot (or the one specified in pilot options as pilot workdir) pilot_home = os.environ.get('PILOT_HOME', os.getcwd()) @@ -722,8 +747,6 @@ def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool if pilot_home != current_dir: os.chdir(pilot_home) - logger.debug(f'current_dir: {current_dir}') - # copy special files if they exist (could be made experiment specific if there's a need for it) copy_special_files(workdir) @@ -734,26 +757,18 @@ def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool user.remove_redundant_files(workdir, piloterrors=piloterrors, debugmode=debugmode) # remove any present input/output files before tarring up workdir - for fname in input_files + output_files: - path = os.path.join(workdir, fname) - if os.path.exists(path): - logger.info(f'removing file: {path}') - remove(path) + remove_input_output_files(workdir, input_files + output_files) if logfile_name is None or len(logfile_name.strip('/ ')) == 0: logger.info('skipping tarball creation, since the logfile_name is empty') return - # rename the workdir for the tarball creation newworkdir = os.path.join(os.path.dirname(workdir), tarball_name) orgworkdir = workdir os.rename(workdir, newworkdir) workdir = newworkdir - - # get the size of the workdir dirsize = get_directory_size(workdir) timeout = get_tar_timeout(dirsize) - fullpath = os.path.join(current_dir, logfile_name) # /some/path/to/dirname/log.tgz logger.info(f'will create archive {fullpath} using timeout={timeout} s for directory size={dirsize} MB') @@ -762,11 +777,11 @@ def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool cmd = f"pwd;tar cvfz {fullpath} {tarball_name} --dereference --one-file-system; echo $?" exit_code, stdout, stderr = execute(cmd, timeout=timeout) except Exception as error: - raise LogFileCreationFailure(error) - else: - if pilot_home != current_dir: - os.chdir(pilot_home) - logger.debug(f'stdout: {stdout}') + raise LogFileCreationFailure(error) from error + if pilot_home != current_dir: + os.chdir(pilot_home) + logger.debug(f'stdout: {stdout}') + try: os.rename(workdir, orgworkdir) except OSError as error: @@ -861,7 +876,7 @@ def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, i label = 'stage-out' # should stage-in be done by a script (for containerisation) or by invoking the API (ie classic mode)? 
- use_container = pilot.util.middleware.use_middleware_script(job.infosys.queuedata.container_type.get("middleware")) + use_container = use_middleware_script(job.infosys.queuedata.container_type.get("middleware")) # switch the X509_USER_PROXY on unified dispatch queues (restore later in this function) x509_unified_dispatch = os.environ.get('X509_UNIFIED_DISPATCH', '') @@ -878,9 +893,9 @@ def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, i logger.info('stage-out will be done in a container') try: eventtype, localsite, remotesite = get_trace_report_variables(job, label=label) - pilot.util.middleware.containerise_middleware(job, args, xdata, eventtype, localsite, remotesite, - job.infosys.queuedata.container_options, label=label, - container_type=job.infosys.queuedata.container_type.get("middleware")) + containerise_middleware(job, args, xdata, eventtype, localsite, remotesite, + job.infosys.queuedata.container_options, label=label, + container_type=job.infosys.queuedata.container_type.get("middleware")) except PilotException as error: logger.warning('stage-out containerisation threw a pilot exception: %s', error) except Exception as error: @@ -893,20 +908,19 @@ def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, i trace_report = create_trace_report(job, label=label) client = StageOutClient(job.infosys, logger=logger, trace_report=trace_report, ipv=ipv, workdir=job.workdir) - kwargs = dict(workdir=job.workdir, cwd=job.workdir, usecontainer=False, job=job, output_dir=args.output_dir, - catchall=job.infosys.queuedata.catchall, rucio_host=args.rucio_host) #, mode='stage-out') + kwargs = {'workdir': job.workdir, 'cwd': job.workdir, 'usecontainer': False, 'job': job, + 'output_dir': args.output_dir, 'catchall': job.infosys.queuedata.catchall, + 'rucio_host': args.rucio_host} #, mode='stage-out') # prod analy unification: use destination preferences from PanDA server for unified queues if job.infosys.queuedata.type != 'unified': client.prepare_destinations(xdata, activity) ## FIX ME LATER: split activities: for astorages and for copytools (to unify with ES workflow) client.transfer(xdata, activity, **kwargs) except PilotException as error: - import traceback error_msg = traceback.format_exc() logger.error(error_msg) msg = errors.format_diagnostics(error.get_error_code(), error_msg) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code(), msg=msg) except Exception: - import traceback logger.error(traceback.format_exc()) # do not raise the exception since that will prevent also the log from being staged out # error = PilotException("stageOut failed with error=%s" % e, code=ErrorCodes.STAGEOUTFAILED) @@ -961,7 +975,7 @@ def _stage_out_new(job: Any, args: Any) -> bool: is_success = False logger.warning('transfer of output file(s) failed') - if job.stageout in ['log', 'all'] and job.logdata: ## do stage-out log files + if job.stageout in {'log', 'all'} and job.logdata: ## do stage-out log files # prepare log file, consider only 1st available log file status = job.get_status('LOG_TRANSFER') if status != LOG_TRANSFER_NOT_DONE: @@ -1044,7 +1058,7 @@ def generate_fileinfo(job: Any) -> dict: fileinfo = {} checksum_type = config.File.checksum_type if config.File.checksum_type == 'adler32' else 'md5sum' for iofile in job.outdata + job.logdata: - if iofile.status in ['transferred']: + if iofile.status in {'transferred'}: fileinfo[iofile.lfn] = {'guid': iofile.guid, 'fsize': iofile.filesize, f'{checksum_type}': 
iofile.checksum.get(config.File.checksum_type), diff --git a/pilot/control/interceptor.py b/pilot/control/interceptor.py index 401e942fa..bf1ee766f 100644 --- a/pilot/control/interceptor.py +++ b/pilot/control/interceptor.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 # Note: leave this module for now - the code might be useful for reuse @@ -46,7 +46,7 @@ def run(args: Any): threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'args': args}, name=name) for name, target in list(targets.items())] # Python 2/3 - [thread.start() for thread in threads] + _ = [thread.start() for thread in threads] # if an exception is thrown, the graceful_stop will be set by the ExcThread class run() function while not args.graceful_stop.is_set(): @@ -57,7 +57,7 @@ def run(args: Any): except queue.Empty: pass else: - exc_type, exc_obj, exc_trace = exc + _, exc_obj, _ = exc logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception diff --git a/pilot/control/job.py b/pilot/control/job.py index 7a42206da..c996fad51 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -17,15 +17,13 @@ # under the License. # # Authors: -# - Mario Lassnig, mario.lassnig@cern.ch, 2016-17 +# - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Wen Guan, wen.guan@cern.ch, 2018 """Job module with functions for job handling.""" -from __future__ import print_function # Python 2 - import os import time import hashlib @@ -2896,7 +2894,7 @@ def job_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 # by turning on debug mode, ie we need to get the heartbeat period in case it has changed) try: update_time = send_heartbeat_if_time(jobs[i], args, update_time) - except Exception as exc: + except (ValueError, TypeError) as exc: logger.warning(f'exception caught during send_heartbeat_if_time: {exc}') # note: when sending a state change to the server, the server might respond with 'tobekilled' @@ -3000,7 +2998,7 @@ def job_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 # has expired in the meantime) try: _job = jobs[i] - except Exception: + except (IndexError, TypeError): logger.info('aborting job monitoring since job object (job id=%s) has expired', current_id) break @@ -3008,7 +3006,7 @@ def job_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 # by turning on debug mode, ie we need to get the heartbeat period in case it has changed) try: update_time = send_heartbeat_if_time(_job, args, update_time) - except Exception as error: + except (ValueError, TypeError) as error: logger.warning('exception caught: %s (job id=%s)', error, current_id) break else: @@ -3147,12 +3145,12 @@ def send_heartbeat_if_time(job: Any, args: Any, update_time: float) -> int: # check for state==running here, and send explicit 'running' in send_state, rather than sending job.state # since the job state can actually change in the meantime by another thread # job.completed will anyway be checked in https::send_update() - if job.serverstate != 'finished' and job.serverstate != 'failed' and job.state == 'running': - logger.info('will send heartbeat for job in \'running\' state') - send_state(job, args, 'running') + if job.serverstate not in {'finished', 'failed'}: + logger.info(f'will send heartbeat for job in 
\'{job.state}\' state') + send_state(job, args, job.state) update_time = time.time() else: - logger.info('will not send any job update') + logger.debug('will not send any job update') return int(update_time) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 4f4c89bb5..66ab1840a 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -18,7 +18,7 @@ # # Authors: # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # NOTE: this module should deal with non-job related monitoring, such as thread monitoring. Job monitoring is # a task for the job_monitor thread in the Job component. @@ -29,7 +29,7 @@ import threading import time import re -from os import environ, getpid, getuid +from os import environ, getuid from subprocess import Popen, PIPE from typing import Any @@ -64,8 +64,8 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 threadchecktime = int(config.Pilot.thread_check) # for CPU usage debugging - cpuchecktime = int(config.Pilot.cpu_check) - tcpu = t_0 + # cpuchecktime = int(config.Pilot.cpu_check) + # tcpu = t_0 last_minute_check = t_0 queuedata = get_queuedata_from_job(queues) @@ -110,9 +110,8 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 f'exceeded - time to abort pilot') reached_maxtime_abort(args) break - else: - if niter % 60 == 0: - logger.info(f'{time_since_start}s have passed since pilot start') + if niter % 60 == 0: + logger.info(f'{time_since_start}s have passed since pilot start') # every minute run the following check if is_pilot_check(check='machinefeatures'): @@ -127,15 +126,15 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 time.sleep(1) # time to check the CPU usage? - if is_pilot_check(check='cpu_usage'): - if int(time.time() - tcpu) > cpuchecktime and False: # for testing only - processes = get_process_info('python3 pilot3/pilot.py', pid=getpid()) - if processes: - logger.info(f'PID={getpid()} has CPU usage={processes[0]}% CMD={processes[2]}') - nproc = processes[3] - if nproc > 1: - logger.info(f'.. there are {nproc} such processes running') - tcpu = time.time() + # if is_pilot_check(check='cpu_usage'): + # if int(time.time() - tcpu) > cpuchecktime: + # processes = get_process_info('python3 pilot3/pilot.py', pid=getpid()) + # if processes: + # logger.info(f'PID={getpid()} has CPU usage={processes[0]}% CMD={processes[2]}') + # nproc = processes[3] + # if nproc > 1: + # logger.info(f'.. 
there are {nproc} such processes running') + # tcpu = time.time() # proceed with running the other checks run_checks(queues, args) @@ -154,7 +153,7 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 except Exception as error: print((f"monitor: exception caught: {error}")) - raise PilotException(error) + raise PilotException(error) from error logger.info('[monitor] control thread has ended') @@ -185,8 +184,7 @@ def run_shutdowntime_minute_check(time_since_start: int) -> bool: except (TypeError, ValueError): # as exc: #logger.debug(f'failed to convert shutdowntime: {exc}') return False # will be ignored - else: - logger.debug(f'machinefeatures shutdowntime={shutdowntime} - now={now}') + logger.debug(f'machinefeatures shutdowntime={shutdowntime} - now={now}') if not shutdowntime: logger.debug('ignoring shutdowntime since it is not set') return False # will be ignored @@ -268,19 +266,19 @@ def get_process_info(cmd: str, user: str = "", args: str = 'aufx', pid: int = 0) pattern = re.compile(r"\S+|[-+]?\d*\.\d+|\d+") arguments = ['ps', '-u', user, args, '--no-headers'] - process = Popen(arguments, stdout=PIPE, stderr=PIPE, encoding='utf-8') - stdout, _ = process.communicate() - for line in stdout.splitlines(): - found = re.findall(pattern, line) - if found is not None: - processid = found[1] - cpu = found[2] - mem = found[3] - command = ' '.join(found[10:]) - if cmd in command: - num += 1 - if processid == str(pid): - processes = [cpu, mem, command] + with Popen(arguments, stdout=PIPE, stderr=PIPE, encoding='utf-8') as process: + stdout, _ = process.communicate() + for line in stdout.splitlines(): + found = re.findall(pattern, line) + if found is not None: + processid = found[1] + cpu = found[2] + mem = found[3] + command = ' '.join(found[10:]) + if cmd in command: + num += 1 + if processid == str(pid): + processes = [cpu, mem, command] if processes: processes.append(num) @@ -327,7 +325,7 @@ def run_checks(queues: Any, args: Any) -> None: _pilot_heartbeat = get_proper_pilot_heartbeat() if last_heartbeat > _pilot_heartbeat: - detected_job_suspension = True if last_heartbeat > 10 * 60 else False + detected_job_suspension = last_heartbeat > 10 * 60 if detected_job_suspension: logger.warning(f'detected job suspension (last heartbeat was updated more than 10 minutes ago: {last_heartbeat} s)') else: @@ -427,8 +425,7 @@ def get_max_running_time(lifetime: int, queuedata: Any, queues: Any, push: bool, logger.warning(f'failed to convert maxtime from queuedata, will use default value for max running time ' f'({max_running_time}s)') else: - if max_running_time == 0: - max_running_time = lifetime # fallback to default value + max_running_time = lifetime if max_running_time == 0 else max_running_time # logger.debug(f'will use default value for max running time: {max_running_time}s') #else: # logger.debug(f'will use queuedata.maxtime value for max running time: {max_running_time}s') diff --git a/pilot/control/payload.py b/pilot/control/payload.py index ad39cbb81..8ef1c4538 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -20,7 +20,7 @@ # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Wen Guan, wen.guan@cern.ch, 2017-2018 """Functions for handling the payload.""" @@ -33,6 +33,11 @@ from re import findall, split from typing import Any, TextIO +from pilot.common.errorcodes 
import ErrorCodes +from pilot.common.exception import ( + ExcThread, + PilotException +) from pilot.control.payloads import ( generic, eventservice, @@ -41,22 +46,20 @@ from pilot.control.job import send_state from pilot.util.auxiliary import set_pilot_state from pilot.util.container import execute -from pilot.util.processes import get_cpu_consumption_time from pilot.util.config import config from pilot.util.filehandling import ( - read_file, - remove_core_dumps, - get_guid, extract_lines_from_file, - find_file + find_file, + get_guid, + read_file, + read_json, + remove_core_dumps ) -from pilot.util.processes import threads_aborted -from pilot.util.queuehandling import put_in_queue -from pilot.common.errorcodes import ErrorCodes -from pilot.common.exception import ( - ExcThread, - PilotException +from pilot.util.processes import ( + get_cpu_consumption_time, + threads_aborted ) +from pilot.util.queuehandling import put_in_queue from pilot.util.realtimelogger import get_realtime_logger logger = logging.getLogger(__name__) @@ -76,7 +79,7 @@ def control(queues: Any, traces: Any, args: Any): threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, name=name) for name, target in list(targets.items())] - [thread.start() for thread in threads] + _ = [thread.start() for thread in threads] # if an exception is thrown, the graceful_stop will be set by the ExcThread class run() function try: @@ -88,7 +91,8 @@ def control(queues: Any, traces: Any, args: Any): except queue.Empty: pass else: - exc_type, exc_obj, exc_trace = exc + # exc_type, exc_obj, exc_trace = exc + _, exc_obj, _ = exc logger.warning(f"thread \'{thread.name}\' received an exception from bucket: {exc_obj}") # deal with the exception @@ -293,10 +297,10 @@ def execute_payloads(queues: Any, traces: Any, args: Any): # noqa: C901 user = __import__(f'pilot.user.{pilot_user}.diagnose', globals(), locals(), [pilot_user], 0) try: exit_code_interpret = user.interpret(job) - except Exception as error: + except (Exception, PilotException) as error: logger.warning(f'exception caught: {error}') if isinstance(error, PilotException): - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error._errorCode, msg=error._message) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error._error_code, msg=error._message) elif isinstance(error, str): if 'error code:' in error and 'message:' in error: error_code, diagnostics = extract_error_info(error) @@ -439,11 +443,11 @@ def get_logging_info(job: Any, args: Any) -> dict: except IndexError as exc: logger.warning(f'exception caught: {exc}') return {} - else: - # find the log file to tail - path = find_log_to_tail(job.debug_command, job.workdir, args, job.is_analysis()) - logger.info(f'using {path} for real-time logging') - info_dic['logfiles'] = [path] + + # find the log file to tail + path = find_log_to_tail(job.debug_command, job.workdir, args, job.is_analysis()) + logger.info(f'using {path} for real-time logging') + info_dic['logfiles'] = [path] if 'logfiles' not in info_dic: # Rubin @@ -529,7 +533,7 @@ def run_realtimelog(queues: Any, traces: Any, args: Any): # noqa: C901 f'(job.debug={job.debug}, job.debug_command={job.debug_command})') first1 = False break - if job.state == 'stageout' or job.state == 'failed' or job.state == 'holding': + if job.state in {'stageout', 'failed', 'holding'}: if first2: logger.debug(f'job is in state {job.state}, continue to next job or abort (wait for graceful stop)') first1 = True @@ 
-578,9 +582,8 @@ def run_realtimelog(queues: Any, traces: Any, args: Any): # noqa: C901 if realtime_logger is None: logger.debug('realtime logger was not found, waiting ..') continue - else: - logger.debug('realtime logger was found') + logger.debug('realtime logger was found') realtime_logger.sending_logs(args, job) # proceed to set the job_aborted flag? @@ -616,6 +619,23 @@ def perform_initial_payload_error_analysis(job: Any, exit_code: int): if exit_code != 0: logger.warning(f'main payload execution returned non-zero exit code: {exit_code}') + # check if the transform has produced an error report + path = os.path.join(job.workdir, config.Payload.error_report) + if os.path.exists(path): + error_report = read_json(path) + error_code = error_report.get('error_code') + error_diag = error_report.get('error_diag') + if error_code: + logger.warning(f'{config.Payload.error_report} contained error code: {error_code}') + logger.warning(f'{config.Payload.error_report} contained error diag: {error_diag}') + job.exeerrorcode = error_code + job.exeerrordiag = error_report.get('error_diag') + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXECUTIONFAILURE, msg=error_diag) + return + logger.info(f'{config.Payload.error_report} exists but did not contain any non-zero error code') + else: + logger.debug(f'{config.Payload.error_report} does not exist') + # look for singularity/apptainer errors (the exit code can be zero in this case) path = os.path.join(job.workdir, config.Payload.payloadstderr) if os.path.exists(path): @@ -661,28 +681,11 @@ def perform_initial_payload_error_analysis(job: Any, exit_code: int): msg = errors.format_diagnostics(exit_code, msg) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg) - - ''' - if exit_code != 0: - if msg: - msg = errors.format_diagnostics(exit_code, msg) - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg) - else: - if job.piloterrorcodes: - logger.warning('error code(s) already set: %s', str(job.piloterrorcodes)) - else: - # check if core dumps exist, if so remove them and return True - if remove_core_dumps(job.workdir) and not job.debug: - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) - else: - logger.warning('initial error analysis did not resolve the issue (and core dumps were not found)') - ''' else: logger.info('main payload execution returned zero exit code') - # check if core dumps exist, if so remove them and return True + # check if core dumps exist, if so remove them if not job.debug: # do not shorten these if-statements - # only return True if found core dump belongs to payload if remove_core_dumps(job.workdir, pid=job.pid): # COREDUMP error will only be set if the core dump belongs to the payload (ie 'core.') logger.warning('setting COREDUMP error') diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index e1c61cce2..ebff6c7a0 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -18,7 +18,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2017-2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 """Executor module for event service payloads.""" @@ -37,17 +37,19 @@ class Executor(generic.Executor): """Executor class for event service payloads.""" - def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): - """ - Set initial values. 
- - :param args: args object (Any) - :param job: job object (Any) - :param out: stdout file object (TextIO) - :param err: stderr file object (TextIO) - :param traces: traces object (Any). - """ - super().__init__(args, job, out, err, traces) + # only define the __init__ function if it actually does anything - otherwise it can be omitted since the + # parent __init__ function will be called automatically + # def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): + # """ + # Set initial values. + # + # :param args: args object (Any) + # :param job: job object (Any) + # :param out: stdout file object (TextIO) + # :param err: stderr file object (TextIO) + # :param traces: traces object (Any). + # """ + # super().__init__(args, job, out, err, traces) def run_payload(self, job: Any, cmd: str, out: TextIO, err: TextIO) -> Any: """ @@ -62,8 +64,10 @@ def run_payload(self, job: Any, cmd: str, out: TextIO, err: TextIO) -> Any: self.pre_setup(job) # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'atlas').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "atlas").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) self.post_setup(job) @@ -74,13 +78,19 @@ def run_payload(self, job: Any, cmd: str, out: TextIO, err: TextIO) -> Any: try: executable = user.get_payload_command(job) except exception.PilotException: - logger.fatal('could not define payload command') + logger.fatal("could not define payload command") return None logger.info(f"payload execution command: {executable}") try: - payload = {'executable': executable, 'workdir': job.workdir, 'output_file': out, 'error_file': err, 'job': job} + payload = { + "executable": executable, + "workdir": job.workdir, + "output_file": out, + "error_file": err, + "job": job, + } logger.debug("payload: %s", payload) logger.info("starting EventService WorkExecutor") @@ -97,7 +107,7 @@ def run_payload(self, job: Any, cmd: str, out: TextIO, err: TextIO) -> Any: self.utility_after_payload_started(job) except Exception as error: - logger.error(f'could not execute: {error}') + logger.error(f"could not execute: {error}") return None return executor @@ -113,7 +123,7 @@ def get_executor_type(self) -> dict: """ # executor_type = 'hpo' if job.is_hpo else os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') # return {'executor_type': executor_type} - return {'executor_type': os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic')} + return {"executor_type": os.environ.get("PILOT_ES_EXECUTOR_TYPE", "generic")} def wait_graceful(self, args: Any, proc: Any) -> int: """ diff --git a/pilot/control/payloads/eventservicemerge.py b/pilot/control/payloads/eventservicemerge.py index 05fb44b03..bd3be12b8 100644 --- a/pilot/control/payloads/eventservicemerge.py +++ b/pilot/control/payloads/eventservicemerge.py @@ -18,13 +18,13 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 """Executor module for event service merge payloads.""" import logging import os -from typing import Any, TextIO +from typing import Any # , TextIO from pilot.control.payloads import generic from pilot.util.container import execute @@ -35,17 +35,19 @@ class Executor(generic.Executor): """Executor class for event service merge payloads.""" - def __init__(self, args: Any, job: Any, out: TextIO, err: 
TextIO, traces: Any): - """ - Set initial values. - - :param args: args object (Any) - :param job: job object (Any) - :param out: stdout file object (TextIO) - :param err: stderr file object (TextIO) - :param traces: traces object (Any). - """ - super().__init__(args, job, out, err, traces) + # only define the __init__ function if it actually does anything - otherwise it can be omitted since the + # parent __init__ function will be called automatically + # def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): + # """ + # Set initial values. + # + # :param args: args object (Any) + # :param job: job object (Any) + # :param out: stdout file object (TextIO) + # :param err: stderr file object (TextIO) + # :param traces: traces object (Any). + # """ + # super().__init__(args, job, out, err, traces) def untar_file(self, lfn: str, workdir: str): """ diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index c46ef8682..91d98268f 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -20,7 +20,7 @@ # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Wen Guan, wen.guan@cern.ch, 2018 """Executor module for generic payloads.""" @@ -29,12 +29,13 @@ import os import signal import time +import traceback from subprocess import PIPE from typing import Any, TextIO from pilot.common.errorcodes import ErrorCodes from pilot.control.job import send_state -from pilot.util.auxiliary import set_pilot_state #, show_memory_usage +from pilot.util.auxiliary import set_pilot_state # , show_memory_usage from pilot.util.config import config from pilot.util.container import execute from pilot.util.constants import ( @@ -47,24 +48,18 @@ PILOT_PRE_PAYLOAD, PILOT_POST_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED2, - UTILITY_AFTER_PAYLOAD_FINISHED2 -) -from pilot.util.filehandling import ( - write_file, - read_file + UTILITY_AFTER_PAYLOAD_FINISHED2, ) +from pilot.util.filehandling import write_file, read_file from pilot.util.processes import kill_processes -from pilot.util.timing import ( - add_to_pilot_timing, - get_time_measurement -) +from pilot.util.timing import add_to_pilot_timing, get_time_measurement from pilot.common.exception import PilotException logger = logging.getLogger(__name__) errors = ErrorCodes() -class Executor(): +class Executor: """Executor class for generic payloads.""" def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): @@ -82,12 +77,13 @@ def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): self.__out = out # payload stdout file object self.__err = err # payload stderr file object self.__traces = traces - self.__preprocess_stdout_name = '' - self.__preprocess_stderr_name = '' - self.__coprocess_stdout_name = 'coprocess_stdout.txt' - self.__coprocess_stderr_name = 'coprocess_stderr.txt' - self.__postprocess_stdout_name = '' - self.__postprocess_stderr_name = '' + # self.__preprocess_stdout_name = '' + # self.__preprocess_stderr_name = '' + self.__coprocess_stdout_name = "coprocess_stdout.txt" + self.__coprocess_stderr_name = "coprocess_stderr.txt" + + # self.__postprocess_stdout_name = '' + # self.__postprocess_stderr_name = '' def get_job(self): """ @@ -105,8 +101,8 @@ def pre_setup(self, job: Any): """ # write time stamps to pilot timing file update_time = time.time() 
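+ # note: the PILOT_PRE_SETUP time stamp is presumably paired with PILOT_POST_SETUP (see pilot.util.timing) to derive the payload setup duration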
- logger.debug(f'setting pre-setup time to {update_time} s') - logger.debug(f'gmtime is {time.gmtime(update_time)}') + logger.debug(f"setting pre-setup time to {update_time} s") + logger.debug(f"gmtime is {time.gmtime(update_time)}") add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, update_time, self.__args) def post_setup(self, job: Any, update_time: bool = None): @@ -119,8 +115,8 @@ def post_setup(self, job: Any, update_time: bool = None): # write time stamps to pilot timing file if not update_time: update_time = time.time() - logger.debug(f'setting post-setup time to {update_time} s') - logger.debug(f'gmtime is {time.gmtime(update_time)}') + logger.debug(f"setting post-setup time to {update_time} s") + logger.debug(f"gmtime is {time.gmtime(update_time)}") add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, update_time, self.__args) def improve_post_setup(self): @@ -135,22 +131,32 @@ def improve_post_setup(self): if not os.path.exists(path): return - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.setup', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.setup", globals(), locals(), [pilot_user], 0 + ) try: end_setup_time = user.get_end_setup_time(path) # since epoch except Exception as exc: - logger.debug(f'caught exception: {exc} (will not update setup time)') + logger.debug(f"caught exception: {exc} (will not update setup time)") end_setup_time = None if end_setup_time: # get the currently stored post-setup time time_measurement_dictionary = self.__args.timing.get(self.__job.jobid, None) - current_post_setup = get_time_measurement(PILOT_POST_SETUP, time_measurement_dictionary, self.__args.timing) + current_post_setup = get_time_measurement( + PILOT_POST_SETUP, time_measurement_dictionary, self.__args.timing + ) if current_post_setup: - logger.info(f'existing post-setup time: {current_post_setup} s (since epoch) (current time: {time.time()})') - logger.debug(f'extracted end time from payload stdout: {end_setup_time} s') + logger.info( + f"existing post-setup time: {current_post_setup} s (since epoch) (current time: {time.time()})" + ) + logger.debug( + f"extracted end time from payload stdout: {end_setup_time} s" + ) diff = end_setup_time - current_post_setup - logger.info(f'payload setup finished {diff} s later than previously recorded') + logger.info( + f"payload setup finished {diff} s later than previously recorded" + ) self.post_setup(self.__job, update_time=end_setup_time) def utility_before_payload(self, job: Any) -> str: @@ -168,15 +174,21 @@ def utility_before_payload(self, job: Any) -> str: cmd = "" # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should we run any additional commands? (e.g. 
special monitoring commands) - cmd_dictionary = user.get_utility_commands(order=UTILITY_BEFORE_PAYLOAD, job=job) + cmd_dictionary = user.get_utility_commands( + order=UTILITY_BEFORE_PAYLOAD, job=job + ) if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - _label = cmd_dictionary.get('label', 'utility') - logger.info(f'utility command (\'{_label}\') to be executed before the payload: {cmd}') + _label = cmd_dictionary.get("label", "utility") + logger.info( + f"utility command ('{_label}') to be executed before the payload: {cmd}" + ) return cmd @@ -192,15 +204,19 @@ def utility_with_payload(self, job: Any) -> str: cmd = "" # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should any additional commands be prepended to the payload execution string? cmd_dictionary = user.get_utility_commands(order=UTILITY_WITH_PAYLOAD, job=job) if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - _label = cmd_dictionary.get('label', 'utility') - logger.info(f'utility command (\'{_label}\') to be executed with the payload: {cmd}') + _label = cmd_dictionary.get("label", "utility") + logger.info( + f"utility command ('{_label}') to be executed with the payload: {cmd}" + ) return cmd @@ -217,15 +233,19 @@ def get_utility_command(self, order: str = "") -> str: cmd = "" # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should any additional commands be executed after the payload? cmd_dictionary = user.get_utility_commands(order=order, job=self.__job) if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - _label = cmd_dictionary.get('label', 'utility') - logger.info(f'utility command (\'{_label}\') to be executed after the payload: {cmd}') + _label = cmd_dictionary.get("label", "utility") + logger.info( + f"utility command ('{_label}') to be executed after the payload: {cmd}" + ) return cmd @@ -236,37 +256,55 @@ def utility_after_payload_started(self, job: Any): :param job: job object (Any). """ # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should any additional commands be executed after the payload? 
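+ # note: such a command is launched once the payload has started and runs alongside it, e.g. a resource monitor like prmon (cf. the commented-out block below)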
- cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job) + cmd_dictionary = user.get_utility_commands( + order=UTILITY_AFTER_PAYLOAD_STARTED, job=job + ) if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - logger.info(f'utility command to be executed after the payload: {cmd}') + logger.info(f"utility command to be executed after the payload: {cmd}") # how should this command be executed? - utilitycommand = user.get_utility_command_setup(cmd_dictionary.get('command'), job) + utilitycommand = user.get_utility_command_setup( + cmd_dictionary.get("command"), job + ) if not utilitycommand: - logger.warning('empty utility command - nothing to run') + logger.warning("empty utility command - nothing to run") return try: - proc1 = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, - stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) + proc1 = execute( + utilitycommand, + workdir=job.workdir, + returnproc=True, + usecontainer=False, + stdout=PIPE, + stderr=PIPE, + cwd=job.workdir, + job=job, + ) except Exception as error: - logger.error(f'could not execute: {error}') + logger.error(f"could not execute: {error}") else: # store process handle in job object, and keep track on how many times the command has been launched # also store the full command in case it needs to be restarted later (by the job_monitor() thread) - logger.debug(f'storing process for {utilitycommand}') - job.utilities[cmd_dictionary.get('command')] = [proc1, 1, utilitycommand] + logger.debug(f"storing process for {utilitycommand}") + job.utilities[cmd_dictionary.get("command")] = [ + proc1, + 1, + utilitycommand, + ] # wait for command to start - #time.sleep(1) - #cmd = utilitycommand.split(';')[-1] - #prmon = f'prmon --pid {job.pid}' - #pid = None - #if prmon in cmd: + # time.sleep(1) + # cmd = utilitycommand.split(';')[-1] + # prmon = f'prmon --pid {job.pid}' + # pid = None + # if prmon in cmd: # import subprocess # import re # ps = subprocess.run(['ps', 'aux', str(os.getpid())], stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -278,10 +316,10 @@ def utility_after_payload_started(self, job: Any): # if matches: # pid = matches[0] # break - #if pid: + # if pid: # logger.info(f'{prmon} command has pid={pid} (appending to cmd dictionary)') # job.utilities[cmd_dictionary.get('command')].append(pid) - #else: + # else: # logger.info(f'could not extract any pid from ps for cmd={cmd}') def utility_after_payload_started_new(self, job: Any) -> str: @@ -296,31 +334,35 @@ def utility_after_payload_started_new(self, job: Any) -> str: cmd = "" # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should any additional commands be executed after the payload? - cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job) + cmd_dictionary = user.get_utility_commands( + order=UTILITY_AFTER_PAYLOAD_STARTED, job=job + ) if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - logger.info(f'utility command to be executed after the payload: {cmd}') + logger.info(f"utility command to be executed after the payload: {cmd}") return cmd -# # how should this command be executed? 
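# [Editor's note] The live code above registers each started utility as
# job.utilities[command] = [process handle, launch count, full command],
# precisely so that a monitoring thread can spot a dead utility and
# relaunch it. A hedged sketch of such a restart step (restart_utility()
# is an editorial helper, not pilot API; the real logic lives in the
# pilot's job_monitor thread):

from subprocess import PIPE, Popen
from typing import Any


def restart_utility(job: Any, name: str) -> None:
    """Relaunch a registered utility command if its process has exited."""
    proc, launches, full_cmd = job.utilities[name]
    if proc.poll() is not None:  # a return code means the process died
        new_proc = Popen(full_cmd, shell=True, stdout=PIPE, stderr=PIPE,
                         cwd=job.workdir)
        job.utilities[name] = [new_proc, launches + 1, full_cmd]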
-# utilitycommand = user.get_utility_command_setup(cmd_dictionary.get('command'), job) -# if not utilitycommand: -# logger.warning('empty utility command - nothing to run') -# return -# try: -# proc = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, -# stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) -# except Exception as error: -# logger.error('could not execute: %s', error) -# else: -# # store process handle in job object, and keep track on how many times the command has been launched -# # also store the full command in case it needs to be restarted later (by the job_monitor() thread) -# job.utilities[cmd_dictionary.get('command')] = [proc, 1, utilitycommand] + # # how should this command be executed? + # utilitycommand = user.get_utility_command_setup(cmd_dictionary.get('command'), job) + # if not utilitycommand: + # logger.warning('empty utility command - nothing to run') + # return + # try: + # proc = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, + # stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) + # except Exception as error: + # logger.error('could not execute: %s', error) + # else: + # # store process handle in job object, and keep track on how many times the command has been launched + # # also store the full command in case it needs to be restarted later (by the job_monitor() thread) + # job.utilities[cmd_dictionary.get('command')] = [proc, 1, utilitycommand] def utility_after_payload_finished(self, job: Any, order: str) -> (str, str, bool): """ @@ -337,17 +379,23 @@ def utility_after_payload_finished(self, job: Any, order: str) -> (str, str, boo cmd = "" # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should any additional commands be prepended to the payload execution string? cmd_dictionary = user.get_utility_commands(order=order, job=job) - label = cmd_dictionary.get('label') if cmd_dictionary else 'unknown' + label = cmd_dictionary.get("label") if cmd_dictionary else "unknown" if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - logger.info(f'utility command (\'{label}\') to be executed after the payload has finished: {cmd}') + logger.info( + f"utility command ('{label}') to be executed after the payload has finished: {cmd}" + ) - ignore_failure = cmd_dictionary.get('ignore_failure') if cmd_dictionary else False + ignore_failure = ( + cmd_dictionary.get("ignore_failure") if cmd_dictionary else False + ) return cmd, label, ignore_failure def execute_utility_command(self, cmd: str, job: Any, label: str) -> int: @@ -359,20 +407,24 @@ def execute_utility_command(self, cmd: str, job: Any, label: str) -> int: :param label: command label (str) :return: exit code (int). 
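# [Editor's note] execute_utility_command() below gives exit codes 160-162
# special treatment: they signal "no more hyper-parameter points" (or a
# reached loop limit) rather than a genuine failure. A condensed sketch of
# the classification its body implements (the helper name and string
# labels are editorial; the pilot records real error codes through
# errors.add_error_code()):

IGNORED_HPO_EXIT_CODES = {160, 161, 162}  # benign end-of-loop signals


def classify_utility_exit(label: str, exit_code: int) -> str:
    """Classify a utility exit code as 'ok', 'ignored' or 'failed'."""
    if exit_code == 0:
        return "ok"
    if exit_code in IGNORED_HPO_EXIT_CODES:
        return "ignored"  # recorded as job.transexitcode, job not failed
    if label in ("preprocess", "postprocess"):
        return "failed"  # mapped to PRE-/POSTPROCESSFAILURE
    return "ignored"  # other labels do not fail the job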
""" - exit_code, stdout, stderr = execute(cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False) + exit_code, stdout, stderr = execute( + cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False + ) if exit_code: ignored_exit_codes = [160, 161, 162] logger.warning( - f'command returned non-zero exit code: {cmd} (exit code = {exit_code}) - see utility logs for details' + f"command returned non-zero exit code: {cmd} (exit code = {exit_code}) - see utility logs for details" ) - if label == 'preprocess': + if label == "preprocess": err = errors.PREPROCESSFAILURE - elif label == 'postprocess': + elif label == "postprocess": err = errors.POSTPROCESSFAILURE else: err = 0 # ie ignore exit_code = 0 - if err and exit_code not in ignored_exit_codes: # ignore no-more-data-points exit codes + if ( + err and exit_code not in ignored_exit_codes + ): # ignore no-more-data-points exit codes job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(err) if exit_code in ignored_exit_codes: job.transexitcode = exit_code @@ -396,28 +448,28 @@ def write_utility_output(self, workdir: str, step: str, stdout: str, stderr: str """ # dump to file try: - name_stdout = step + '_stdout.txt' - name_stderr = step + '_stderr.txt' - if step == 'preprocess': - self.__preprocess_stdout_name = name_stdout - self.__preprocess_stderr_name = name_stderr - elif step == 'postprocess': - self.__postprocess_stdout_name = name_stdout - self.__postprocess_stderr_name = name_stderr - name = os.path.join(workdir, step + '_stdout.txt') + # name_stdout = step + '_stdout.txt' + # name_stderr = step + '_stderr.txt' + # if step == 'preprocess': + # self.__preprocess_stdout_name = name_stdout + # self.__preprocess_stderr_name = name_stderr + # elif step == 'postprocess': + # self.__postprocess_stdout_name = name_stdout + # self.__postprocess_stderr_name = name_stderr + name = os.path.join(workdir, step + "_stdout.txt") write_file(name, stdout, unique=True) except PilotException as error: - logger.warning(f'failed to write utility stdout to file: {error}, {stdout}') + logger.warning(f"failed to write utility stdout to file: {error}, {stdout}") else: - logger.debug(f'wrote {name}') + logger.debug(f"wrote {name}") try: - name = os.path.join(workdir, step + '_stderr.txt') + name = os.path.join(workdir, step + "_stderr.txt") write_file(name, stderr, unique=True) except PilotException as error: - logger.warning(f'failed to write utility stderr to file: {error}, {stderr}') + logger.warning(f"failed to write utility stderr to file: {error}, {stderr}") else: - logger.debug(f'wrote {name}') + logger.debug(f"wrote {name}") def pre_payload(self, job: Any): """ @@ -429,8 +481,8 @@ def pre_payload(self, job: Any): """ # write time stamps to pilot timing file update_time = time.time() - logger.debug(f'setting pre-payload time to {update_time} s') - logger.debug(f'gmtime is {time.gmtime(update_time)}') + logger.debug(f"setting pre-payload time to {update_time} s") + logger.debug(f"gmtime is {time.gmtime(update_time)}") add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, update_time, self.__args) def post_payload(self, job: Any): @@ -443,8 +495,8 @@ def post_payload(self, job: Any): """ # write time stamps to pilot timing file update_time = time.time() - logger.debug(f'setting post-payload time to {update_time} s') - logger.debug(f'gmtime is {time.gmtime(update_time)}') + logger.debug(f"setting post-payload time to {update_time} s") + logger.debug(f"gmtime is {time.gmtime(update_time)}") add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, 
update_time, self.__args) def run_command(self, cmd: str, label: str = "") -> Any: @@ -456,29 +508,41 @@ def run_command(self, cmd: str, label: str = "") -> Any: :return: subprocess object (Any). """ if label: - logger.info(f'\n\n{label}:\n\n{cmd}\n') - if label == 'coprocess': + logger.info(f"\n\n{label}:\n\n{cmd}\n") + if label == "coprocess": try: - out = open(os.path.join(self.__job.workdir, self.__coprocess_stdout_name), 'wb') - err = open(os.path.join(self.__job.workdir, self.__coprocess_stderr_name), 'wb') + out = open( + os.path.join(self.__job.workdir, self.__coprocess_stdout_name), "wb" + ) + err = open( + os.path.join(self.__job.workdir, self.__coprocess_stderr_name), "wb" + ) except IOError as error: - logger.warning(f'failed to open coprocess stdout/err: {error}') + logger.warning(f"failed to open coprocess stdout/err: {error}") out = None err = None else: out = None err = None try: - proc = execute(cmd, workdir=self.__job.workdir, returnproc=True, stdout=out, stderr=err, - usecontainer=False, cwd=self.__job.workdir, job=self.__job) + proc = execute( + cmd, + workdir=self.__job.workdir, + returnproc=True, + stdout=out, + stderr=err, + usecontainer=False, + cwd=self.__job.workdir, + job=self.__job, + ) except Exception as error: - logger.error(f'could not execute: {error}') + logger.error(f"could not execute: {error}") return None if isinstance(proc, tuple) and not proc[0]: - logger.error('failed to execute command') + logger.error("failed to execute command") return None - logger.info(f'started {label} -- pid={proc.pid} executable={cmd}') + logger.info(f"started {label} -- pid={proc.pid} executable={cmd}") return proc @@ -501,21 +565,29 @@ def run_payload(self, job: Any, cmd: str, out: Any, err: Any) -> Any: logger.info(f"\n\npayload execution command:\n\n{cmd}\n") try: - proc = execute(cmd, workdir=job.workdir, returnproc=True, - usecontainer=True, stdout=out, stderr=err, cwd=job.workdir, job=job) + proc = execute( + cmd, + workdir=job.workdir, + returnproc=True, + usecontainer=True, + stdout=out, + stderr=err, + cwd=job.workdir, + job=job, + ) except Exception as error: - logger.error(f'could not execute: {error}') + logger.error(f"could not execute: {error}") return None if isinstance(proc, tuple) and not proc[0]: - logger.error('failed to execute payload') + logger.error("failed to execute payload") return None - logger.info(f'started -- pid={proc.pid} executable={cmd}') + logger.info(f"started -- pid={proc.pid} executable={cmd}") job.pid = proc.pid job.pgrp = os.getpgid(job.pid) set_pilot_state(job=job, state="running") - #_cmd = self.utility_with_payload(job) + # _cmd = self.utility_with_payload(job) self.utility_after_payload_started(job) @@ -530,37 +602,33 @@ def extract_setup(self, cmd: str) -> str: :param cmd: payload command (str) :return: updated secondary command (str). """ - def cut_str_from(_cmd: str, _str: str) -> str: - """ - Cut the string from the position of the given _cmd. - :param _cmd: command (str) - :param _str: substring (str) - :return: cut command (str). - """ - return _cmd[:_cmd.find(_str)] + def cut_str_from(_cmd: str, _str: str) -> str: + """Cut the string from the position of the given _cmd.""" + return _cmd[: _cmd.find(_str)] def cut_str_from_last_semicolon(_cmd: str) -> str: """ Cut the string from the last semicolon. NOTE: this will not work if jobParams also contain ; - - :param _cmd: command (str) - :return: cut command (str). 
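# [Editor's note] A worked example of the two helpers above, assuming the
# payload command has the usual "setup; setup; trf" shape (the command
# text itself is illustrative only):
#
#   cmd = "export X=1; source setup.sh; ./runHPO-00-00-01 --foo"
#   cut_str_from(cmd, "./runHPO-00-00-01")
#       -> "export X=1; source setup.sh; "
#   cut_str_from_last_semicolon(cmd)
#       -> "export X=1; source setup.sh; "  (last ';'-separated bit dropped)
#
# As the NOTE above warns, the semicolon variant breaks if the job
# parameters themselves contain ';'.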
""" # remove any trailing spaces and ;-signs _cmd = _cmd.strip() - _cmd = _cmd[:-1] if _cmd.endswith(';') else _cmd - last_bit = _cmd.split(';')[-1] - return _cmd.replace(last_bit.strip(), '') - - if '/' in self.__job.transformation: # e.g. http://pandaserver.cern.ch:25080/trf/user/runHPO-00-00-01 - trfname = self.__job.transformation[self.__job.transformation.rfind('/') + 1:] # runHPO-00-00-01 - _trf = './' + trfname + _cmd = _cmd[:-1] if _cmd.endswith(";") else _cmd + last_bit = _cmd.split(";")[-1] + return _cmd.replace(last_bit.strip(), "") + + if ( + "/" in self.__job.transformation + ): # e.g. http://pandaserver.cern.ch:25080/trf/user/runHPO-00-00-01 + trfname = self.__job.transformation[ + self.__job.transformation.rfind("/") + 1: + ] # runHPO-00-00-01 + _trf = "./" + trfname else: trfname = self.__job.transformation - _trf = './' + self.__job.transformation + _trf = "./" + self.__job.transformation if _trf in cmd: setup = cut_str_from(cmd, _trf) @@ -589,7 +657,7 @@ def wait_graceful(self, args: Any, proc: Any) -> int: for _ in range(60): if args.graceful_stop.is_set(): breaker = True - logger.info(f'breaking -- sending SIGTERM to pid={proc.pid}') + logger.info(f"breaking -- sending SIGTERM to pid={proc.pid}") os.killpg(os.getpgid(proc.pid), signal.SIGTERM) break exit_code = proc.poll() @@ -597,7 +665,9 @@ def wait_graceful(self, args: Any, proc: Any) -> int: break time.sleep(1) if breaker: - logger.info(f'breaking -- sleep 10 s before sending SIGKILL pid={proc.pid}') + logger.info( + f"breaking -- sleep 10 s before sending SIGKILL pid={proc.pid}" + ) time.sleep(10) proc.kill() break @@ -605,11 +675,12 @@ def wait_graceful(self, args: Any, proc: Any) -> int: exit_code = proc.poll() if iteration % 10 == 0: - logger.info(f'running: iteration={iteration} pid={proc.pid} exit_code={exit_code}') + logger.info( + f"running: iteration={iteration} pid={proc.pid} exit_code={exit_code}" + ) if exit_code is not None: break - else: - continue + continue return exit_code @@ -623,15 +694,18 @@ def get_payload_command(self, job: Any) -> str: cmd = "" # for testing looping job: cmd = user.get_payload_command(job) + ';sleep 240' try: - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) - cmd = user.get_payload_command(job) #+ 'sleep 900' # to test looping jobs + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) + cmd = user.get_payload_command(job) # + 'sleep 900' # to test looping jobs except PilotException as error: self.post_setup(job) - import traceback logger.error(traceback.format_exc()) - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) - self.__traces.pilot['error_code'] = job.piloterrorcodes[0] + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code( + error.get_error_code() + ) + self.__traces.pilot["error_code"] = job.piloterrorcodes[0] logger.fatal( f"could not define payload command (traces error set to: {self.__traces.pilot['error_code']})" ) @@ -658,20 +732,33 @@ def run_preprocess(self, job: Any): if cmd_before_payload: cmd_before_payload = job.setup + cmd_before_payload logger.info(f"\n\npreprocess execution command:\n\n{cmd_before_payload}\n") - exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') + exit_code = self.execute_utility_command( + cmd_before_payload, job, "preprocess" + ) if exit_code 
== 160: - logger.warning('no more HP points - time to abort processing loop') + logger.warning("no more HP points - time to abort processing loop") elif exit_code == 161: - logger.warning('no more HP points but at least one point was processed - time to abort processing loop') + logger.warning( + "no more HP points but at least one point was processed - time to abort processing loop" + ) elif exit_code == 162: - logger.warning('loop count reached the limit - time to abort processing loop') + logger.warning( + "loop count reached the limit - time to abort processing loop" + ) elif exit_code: # set error code - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PREPROCESSFAILURE) - logger.fatal(f'cannot continue since preprocess failed: exit_code={exit_code}') + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code( + errors.PREPROCESSFAILURE + ) + logger.fatal( + f"cannot continue since preprocess failed: exit_code={exit_code}" + ) else: # in case the preprocess produced a command, chmod it - path = os.path.join(job.workdir, job.containeroptions.get('containerExec', 'does_not_exist')) + path = os.path.join( + job.workdir, + job.containeroptions.get("containerExec", "does_not_exist"), + ) if os.path.exists(path): os.chmod(path, 0o755) @@ -683,8 +770,10 @@ def should_verify_setup(self): :return: should verify setup (bool). """ - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.setup', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.setup", globals(), locals(), [pilot_user], 0 + ) return user.should_verify_setup(self.__job) def run(self) -> (int, str): # noqa: C901 @@ -696,7 +785,7 @@ def run(self) -> (int, str): # noqa: C901 :return: exit code (int), diagnostics (str). """ - diagnostics = '' + diagnostics = "" # get the payload command from the user specific code self.pre_setup(self.__job) @@ -704,46 +793,58 @@ def run(self) -> (int, str): # noqa: C901 # get the user defined payload command cmd = self.get_payload_command(self.__job) if not cmd: - logger.warning('aborting run() since payload command could not be defined') - return errors.UNKNOWNPAYLOADFAILURE, 'undefined payload command' + logger.warning("aborting run() since payload command could not be defined") + return errors.UNKNOWNPAYLOADFAILURE, "undefined payload command" # extract the setup in case the preprocess command needs it self.__job.setup = self.extract_setup(cmd) # should the setup be verified? 
(user defined)
         verify_setup = self.should_verify_setup()
         if verify_setup:
-            logger.debug(f'extracted setup to be verified:\n\n{self.__job.setup}')
+            logger.debug(f"extracted setup to be verified:\n\n{self.__job.setup}")
             try:
                 _cmd = self.__job.setup
                 stdout_filename = os.path.join(self.__job.workdir, "setup.stdout")
                 stderr_filename = os.path.join(self.__job.workdir, "setup.stderr")
-                out = open(stdout_filename, 'wb')
-                err = open(stderr_filename, 'wb')
+                out = open(stdout_filename, "wb")
+                err = open(stderr_filename, "wb")
                 # remove any trailing spaces and ;-signs
                 _cmd = _cmd.strip()
-                trail = ';' if not _cmd.endswith(';') else ''
-                _cmd += trail + 'echo \"Done.\"'
-                exit_code, stdout, stderr = execute(_cmd, workdir=self.__job.workdir, returnproc=False, usecontainer=True,
-                                                    stdout=out, stderr=err, cwd=self.__job.workdir, job=self.__job)
+                trail = ";" if not _cmd.endswith(";") else ""
+                _cmd += trail + 'echo "Done."'
+                exit_code, stdout, stderr = execute(
+                    _cmd,
+                    workdir=self.__job.workdir,
+                    returnproc=False,
+                    usecontainer=True,
+                    stdout=out,
+                    stderr=err,
+                    cwd=self.__job.workdir,
+                    job=self.__job,
+                )
                 if exit_code:
-                    logger.warning(f'setup returned exit code={exit_code}')
-                    diagnostics = stderr + stdout if stdout and stderr else ''
+                    logger.warning(f"setup returned exit code={exit_code}")
+                    diagnostics = stderr + stdout if stdout and stderr else ""
                     if not diagnostics:
                         stdout = read_file(stdout_filename)
                         stderr = read_file(stderr_filename)
-                        diagnostics = stderr + stdout if stdout and stderr else 'General payload setup verification error (check setup logs)'
+                        diagnostics = (
+                            stderr + stdout
+                            if stdout and stderr
+                            else "General payload setup verification error (check setup logs)"
+                        )
                     # check for special errors in the output
                     exit_code = errors.resolve_transform_error(exit_code, diagnostics)
                     diagnostics = errors.format_diagnostics(exit_code, diagnostics)
                     return exit_code, diagnostics
                 if out:
                     out.close()
-                    logger.debug(f'closed {stdout_filename}')
+                    logger.debug(f"closed {stdout_filename}")
                 if err:
                     err.close()
-                    logger.debug(f'closed {stderr_filename}')
+                    logger.debug(f"closed {stderr_filename}")
             except Exception as error:
-                diagnostics = f'could not execute: {error}'
+                diagnostics = f"could not execute: {error}"
                 logger.error(diagnostics)
                 return errors.PAYLOADEXECUTIONEXCEPTION, diagnostics
@@ -753,10 +854,9 @@ def run(self) -> (int, str):  # noqa: C901
         # abort when nothing more to run, or when the preprocess returns a special exit code
         iteration = 0
         while True:
-
-            logger.info(f'payload iteration loop #{iteration + 1}')
-            os.environ['PILOT_EXEC_ITERATION_COUNT'] = f'{iteration}'
-            #if self.__args.debug:
+            logger.info(f"payload iteration loop #{iteration + 1}")
+            os.environ["PILOT_EXEC_ITERATION_COUNT"] = f"{iteration}"
+            # if self.__args.debug:
             #    show_memory_usage()

             # first run the preprocess (if necessary) - note: this might update jobparams -> must update cmd
@@ -768,22 +868,24 @@ def run(self) -> (int, str):  # noqa: C901
                     exit_code = 0
                     # wipe the output file list since there won't be any new files
                     # any output files from previous iterations, should have been transferred already
-                    logger.debug('reset outdata since further output should not be expected after preprocess exit')
+                    logger.debug(
+                        "reset outdata since further output should not be expected after preprocess exit"
+                    )
                     self.__job.outdata = []
                     break
                 if jobparams_pre != jobparams_post:
-                    logger.debug('jobparams were updated by utility_before_payload()')
+                    logger.debug("jobparams were updated by utility_before_payload()")
                     # must
update cmd cmd = cmd.replace(jobparams_pre, jobparams_post) # now run the main payload, when it finishes, run the postprocess (if necessary) # note: no need to run any main payload in HPO Horovod jobs on Kubernetes - if os.environ.get('HARVESTER_HOROVOD', '') == '': + if os.environ.get("HARVESTER_HOROVOD", "") == "": proc = self.run_payload(self.__job, cmd, self.__out, self.__err) if not proc: # something went wrong, abort exit_code = errors.UNKNOWNEXCEPTION - diagnostics = 'command execution failed, check log for clues' + diagnostics = "command execution failed, check log for clues" break else: proc = None @@ -791,9 +893,11 @@ def run(self) -> (int, str): # noqa: C901 proc_co = None if proc is None: # run the post-process command even if there was no main payload - if os.environ.get('HARVESTER_HOROVOD', '') != '': - logger.info('no need to execute any main payload') - exit_code = self.run_utility_after_payload_finished(exit_code, True, UTILITY_AFTER_PAYLOAD_FINISHED2) + if os.environ.get("HARVESTER_HOROVOD", "") != "": + logger.info("no need to execute any main payload") + exit_code = self.run_utility_after_payload_finished( + exit_code, True, UTILITY_AFTER_PAYLOAD_FINISHED2 + ) self.post_payload(self.__job) else: break @@ -803,44 +907,60 @@ def run(self) -> (int, str): # noqa: C901 send_state(self.__job, self.__args, self.__job.state) # note: when sending a state change to the server, the server might respond with 'tobekilled' - if self.__job.state == 'failed': - logger.warning('job state is \'failed\' - abort payload and run()') + if self.__job.state == "failed": + logger.warning("job state is 'failed' - abort payload and run()") kill_processes(proc.pid) break # allow for a secondary command to be started after the payload (e.g. a coprocess) - utility_cmd = self.get_utility_command(order=UTILITY_AFTER_PAYLOAD_STARTED2) + utility_cmd = self.get_utility_command( + order=UTILITY_AFTER_PAYLOAD_STARTED2 + ) if utility_cmd: - logger.debug(f'starting utility command: {utility_cmd}') - label = 'coprocess' if 'coprocess' in utility_cmd else None + logger.debug(f"starting utility command: {utility_cmd}") + label = "coprocess" if "coprocess" in utility_cmd else None proc_co = self.run_command(utility_cmd, label=label) - logger.info('will wait for graceful exit') + logger.info("will wait for graceful exit") exit_code = self.wait_graceful(self.__args, proc) # reset error if Raythena decided to kill payload (no error) if errors.KILLPAYLOAD in self.__job.piloterrorcodes: - logger.debug('ignoring KILLPAYLOAD error') - self.__job.piloterrorcodes, self.__job.piloterrordiags = errors.remove_error_code(errors.KILLPAYLOAD, - pilot_error_codes=self.__job.piloterrorcodes, - pilot_error_diags=self.__job.piloterrordiags) + logger.debug("ignoring KILLPAYLOAD error") + ( + self.__job.piloterrorcodes, + self.__job.piloterrordiags, + ) = errors.remove_error_code( + errors.KILLPAYLOAD, + pilot_error_codes=self.__job.piloterrorcodes, + pilot_error_diags=self.__job.piloterrordiags, + ) exit_code = 0 - state = 'finished' + state = "finished" else: - state = 'finished' if exit_code == 0 else 'failed' + state = "finished" if exit_code == 0 else "failed" set_pilot_state(job=self.__job, state=state) - logger.info(f'\n\nfinished pid={proc.pid} exit_code={exit_code} state={self.__job.state}\n') + logger.info( + f"\n\nfinished pid={proc.pid} exit_code={exit_code} state={self.__job.state}\n" + ) # stop the utility command (e.g. 
a coprocess if necessary if proc_co: - logger.debug(f'stopping utility command: {utility_cmd}') + logger.debug(f"stopping utility command: {utility_cmd}") kill_processes(proc_co.pid) if exit_code is None: - logger.warning('detected unset exit_code from wait_graceful - reset to -1') + logger.warning( + "detected unset exit_code from wait_graceful - reset to -1" + ) exit_code = -1 - for order in (UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2): - exit_code = self.run_utility_after_payload_finished(exit_code, state, order) + for order in ( + UTILITY_AFTER_PAYLOAD_FINISHED, + UTILITY_AFTER_PAYLOAD_FINISHED2, + ): + exit_code = self.run_utility_after_payload_finished( + exit_code, state, order + ) # keep track of post-payload timing self.post_payload(self.__job) @@ -853,16 +973,18 @@ def run(self) -> (int, str): # noqa: C901 if self.__job.utilities != {}: self.stop_utilities() - if self.__job.is_hpo and state != 'failed': + if self.__job.is_hpo and state != "failed": # in case there are more hyper-parameter points, move away the previous log files - #self.rename_log_files(iteration) + # self.rename_log_files(iteration) iteration += 1 else: break return exit_code, diagnostics - def run_utility_after_payload_finished(self, exit_code: int, state: str, order: str) -> int: + def run_utility_after_payload_finished( + self, exit_code: int, state: str, order: str + ) -> int: """ Run utility command after the main payload has finished. @@ -876,18 +998,30 @@ def run_utility_after_payload_finished(self, exit_code: int, state: str, order: """ _exit_code = 0 try: - cmd_after_payload, label, ignore_failure = self.utility_after_payload_finished(self.__job, order) + ( + cmd_after_payload, + label, + ignore_failure, + ) = self.utility_after_payload_finished(self.__job, order) except Exception as error: logger.error(error) ignore_failure = False else: - if cmd_after_payload and self.__job.postprocess and state != 'failed': + if cmd_after_payload and self.__job.postprocess and state != "failed": cmd_after_payload = self.__job.setup + cmd_after_payload - logger.info(f"\n\npostprocess execution command:\n\n{cmd_after_payload}\n") - _exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) + logger.info( + f"\n\npostprocess execution command:\n\n{cmd_after_payload}\n" + ) + _exit_code = self.execute_utility_command( + cmd_after_payload, self.__job, label + ) elif cmd_after_payload: - logger.info(f"\n\npostprocess execution command:\n\n{cmd_after_payload}\n") - _exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label) + logger.info( + f"\n\npostprocess execution command:\n\n{cmd_after_payload}\n" + ) + _exit_code = self.execute_utility_command( + cmd_after_payload, self.__job, label + ) # only set a new non-zero exit code if exit_code was not already set and ignore_failure is False # (e.g. 
any Xcache failure should be ignored to prevent job from failing since exit_code might get updated) @@ -898,22 +1032,28 @@ def run_utility_after_payload_finished(self, exit_code: int, state: str, order: def stop_utilities(self): """Stop any running utilities.""" - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) for utcmd in list(self.__job.utilities.keys()): utproc = self.__job.utilities[utcmd][0] if utproc: # get the pid for prmon - _list = self.__job.utilities.get('MemoryMonitor') + _list = self.__job.utilities.get("MemoryMonitor") if _list: pid = int(_list[-1]) if len(_list) == 4 else utproc.pid - logger.info(f'using pid={pid} to kill prmon') + logger.info(f"using pid={pid} to kill prmon") else: - logger.warning(f'did not find the pid for the memory monitor in the utilities list: {self.__job.utilities}') + logger.warning( + f"did not find the pid for the memory monitor in the utilities list: {self.__job.utilities}" + ) pid = utproc.pid status = self.kill_and_wait_for_process(pid, user, utcmd) - logger.info(f'utility process {utproc.pid} cleanup finished with status={status}') + logger.info( + f"utility process {utproc.pid} cleanup finished with status={status}" + ) user.post_utility_command_action(utcmd, self.__job) def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: @@ -926,7 +1066,7 @@ def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: :return: process exit status (int or None). """ sig = user.get_utility_command_kill_signal(utcmd) - logger.info(f"stopping utility process \'{utcmd}\' with signal {sig}") + logger.info(f"stopping utility process '{utcmd}' with signal {sig}") try: # Send SIGUSR1 signal to the process @@ -939,22 +1079,23 @@ def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: # Check the exit status of the process if os.WIFEXITED(status): - logger.debug('normal exit') + logger.debug("normal exit") return os.WEXITSTATUS(status) - else: - # Handle abnormal termination if needed - logger.warning('abnormal termination') - return None - else: - # Process doesn't exist - ignore - logger.info(f'process {pid} no longer exists') - return True + + # Handle abnormal termination if needed + logger.warning("abnormal termination") + return None + + # Process doesn't exist - ignore + logger.info(f"process {pid} no longer exists") + return True except OSError as exc: # Handle errors, such as process not found logger.warning(f"Error sending signal to/waiting for process {pid}: {exc}") return None + # try: # # Send SIGUSR1 signal to the process # os.kill(pid, sig) @@ -998,16 +1139,16 @@ def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: # logger.warning(f"exception caught: {exc}") # return None - def rename_log_files(self, iteration: int): - """ - Rename log files. - - :param iteration: iteration (int). - """ - names = [self.__preprocess_stdout_name, self.__preprocess_stderr_name, - self.__postprocess_stdout_name, self.__postprocess_stderr_name] - for name in names: - if os.path.exists(name): - os.rename(name, name + f'{iteration}') - else: - logger.warning(f'cannot rename {name} since it does not exist') +# def rename_log_files(self, iteration: int): +# """ +# Rename log files. +# +# :param iteration: iteration (int). 
+# """ +# names = [self.__preprocess_stdout_name, self.__preprocess_stderr_name, +# self.__postprocess_stdout_name, self.__postprocess_stderr_name] +# for name in names: +# if os.path.exists(name): +# os.rename(name, name + f'{iteration}') +# else: +# logger.warning(f'cannot rename {name} since it does not exist') diff --git a/pilot/copytool/gfal.py b/pilot/copytool/gfal.py index acf01a182..7fb34301f 100644 --- a/pilot/copytool/gfal.py +++ b/pilot/copytool/gfal.py @@ -19,7 +19,7 @@ # Authors: # - Pavlo Svirin, pavlo.svirin@cern.ch, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 # - Alexey Anisenkov, anisyonk@cern.ch, 2018 """GFAL2 copy tool.""" @@ -29,10 +29,18 @@ import os from time import time -from .common import resolve_common_transfer_errors, get_timeout -from pilot.common.exception import PilotException, ErrorCodes, StageInFailure, StageOutFailure +from pilot.common.exception import ( + PilotException, + ErrorCodes, + StageInFailure, + StageOutFailure +) from pilot.util.container import execute #from pilot.util.timer import timeout +from .common import ( + resolve_common_transfer_errors, + get_timeout +) logger = logging.getLogger(__name__) @@ -148,8 +156,9 @@ def copy_out(files: list, **kwargs: dict) -> list: Upload given files using gfal command. :param files: Files to upload (files) - :raises: PilotException in case of errors - :return: updated files (list). + :param kwargs: kwargs dictionary (dict) + :return: updated files (list) + :raises: PilotException in case of errors. """ if not check_for_gfal(): raise StageOutFailure("No GFAL2 tools found") @@ -222,7 +231,7 @@ def move_all_files_in(files: list, nretries: int = 1) -> (int, str, str): ### exit_code, stdout, stderr = move(source, destination, entry.get('recursive', False)) if exit_code != 0: - if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: + if ((exit_code not in (errno.ETIMEDOUT, errno.ETIME)) or ((retry + 1) == nretries)): logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful @@ -236,6 +245,7 @@ def move_all_files_out(files: list, nretries: int = 1) -> (int, str, str): ### Move all output files. :param files: list of FileSpec objects (list) + :param nretries: number of retries; sometimes there can be a timeout copying, but the next attempt may succeed (int) :return: exit_code (int), stdout (str), stderr (str). """ exit_code = 0 @@ -253,7 +263,7 @@ def move_all_files_out(files: list, nretries: int = 1) -> (int, str, str): ### exit_code, stdout, stderr = move(source, destination) if exit_code != 0: - if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: + if ((exit_code not in (errno.ETIMEDOUT, errno.ETIME)) or ((retry + 1) == nretries)): logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index 2d54a1b48..b0506468f 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -17,7 +17,7 @@ # under the License. 
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 # - Shuwei Ye, yesw@bnl.gov, 2021 """GS copy tool.""" @@ -30,19 +30,18 @@ from glob import glob from typing import Any -from pilot.info import infosys - try: from google.cloud import storage -except Exception: +except ImportError: storage_client = None else: storage_client = storage.Client() -from .common import resolve_common_transfer_errors from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException +from pilot.info import infosys from pilot.util.config import config +from .common import resolve_common_transfer_errors logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -98,7 +97,7 @@ def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> d """ try: pandaqueue = infosys.pandaqueue - except Exception: + except AttributeError: pandaqueue = "" if pandaqueue is None: pandaqueue = "" @@ -128,8 +127,9 @@ def copy_in(files: list, **kwargs: dict) -> list: Download given files from a GCS bucket. :param files: list of `FileSpec` objects (list) - :raise: PilotException in case of controlled error - :return: updated files (list). + :param kwargs: kwargs dictionary (dict) + :return: updated files (list) + :raise: PilotException in case of controlled error. """ for fspec in files: @@ -180,8 +180,9 @@ def copy_out(files: list, **kwargs: dict): Upload given files to GS storage. :param files: list of `FileSpec` objects (list) - :raise: PilotException in case of controlled error - :return: updated files (list). + :param kwargs: kwargs dictionary (dict) + :return: updated files (list) + :raise: PilotException in case of controlled error. """ workdir = kwargs.pop('workdir') @@ -215,7 +216,7 @@ def copy_out(files: list, **kwargs: dict): for path in logfiles: logfile = os.path.basename(path) if os.path.exists(path): - if logfile == config.Pilot.pilotlog or logfile == config.Payload.payloadstdout or logfile == config.Payload.payloadstderr: + if logfile in (config.Pilot.pilotlog, config.Payload.payloadstdout, config.Payload.payloadstderr): content_type = "text/plain" logger.debug(f'change the file {logfile} content-type to text/plain') else: @@ -225,7 +226,7 @@ def copy_out(files: list, **kwargs: dict): if not isinstance(result, str): result = result.decode('utf-8') if result.find(';') > 0: - content_type = result.split(';')[0] + content_type = result.split(';', maxsplit=1)[0] logger.debug(f'change the file {logfile} content-type to {content_type}') except Exception: pass diff --git a/pilot/copytool/lsm.py b/pilot/copytool/lsm.py index 9b182ea43..c8eca99ff 100644 --- a/pilot/copytool/lsm.py +++ b/pilot/copytool/lsm.py @@ -19,7 +19,7 @@ # Authors: # - Pavlo Svirin, pavlo.svirin@cern.ch, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """Local site mover copy tool.""" @@ -28,17 +28,17 @@ import os from time import time +from pilot.common.exception import ( + ErrorCodes, + PilotException +) +from pilot.util.container import execute +#from pilot.util.timer import timeout from .common import ( get_copysetup, verify_catalog_checksum, resolve_common_transfer_errors #, get_timeout ) -from pilot.common.exception import ( - PilotException, - ErrorCodes -) -from pilot.util.container import execute -#from pilot.util.timer import timeout logger = logging.getLogger(__name__) @@ -54,7 +54,7 @@ def is_valid_for_copy_in(files: list) -> bool: 
Placeholder. - :param files: list of FileSpec objects (list). + :param files: list of FileSpec objects (list) :return: always True (for now) (bool). """ # for f in files: @@ -69,7 +69,7 @@ def is_valid_for_copy_out(files: list) -> bool: Placeholder. - :param files: list of FileSpec objects (list). + :param files: list of FileSpec objects (list) :return: always True (for now) (bool). """ # for f in files: @@ -84,8 +84,8 @@ def copy_in(files: list, **kwargs: dict) -> list: :param files: list of `FileSpec` objects (list) :param kwargs: kwargs dictionary (dict) - :raises: PilotException in case of controlled error - :return: updated files (list). + :return: updated files (list) + :raises: PilotException in case of controlled error. """ copytools = kwargs.get('copytools') or [] copysetup = get_copysetup(copytools, 'lsm') @@ -157,8 +157,9 @@ def copy_out(files: list, **kwargs: dict) -> list: Upload given files using lsm copytool. :param files: list of `FileSpec` objects (list) - :raise: PilotException in case of controlled error - :return: updated files (list). + :param kwargs: kwargs dictionary (dict) + :return: updated files (list) + :raises: PilotException in case of controlled error. """ copytools = kwargs.get('copytools') or [] copysetup = get_copysetup(copytools, 'lsm') @@ -194,16 +195,16 @@ def copy_out(files: list, **kwargs: dict) -> list: checksum = f"adler32:{fspec.checksum.get('adler32')}" # define the command options - opts = {'--size': fspec.filesize, - '-t': token, - '--checksum': checksum, - '--guid': fspec.guid} - opts = " ".join([f"{k} {v}" for (k, v) in list(opts.items())]) + _opts = {'--size': fspec.filesize, + '-t': token, + '--checksum': checksum, + '--guid': fspec.guid} + opts = " ".join([f"{k} {v}" for (k, v) in list(_opts.items())]) logger.info(f"transferring file {fspec.lfn} from {source} to {destination}") nretries = 1 # input parameter to function? 
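# [Editor's note] The loop below, like move_all_files_in/out in this file
# and in gfal.py, retries a transfer only when the exit code is a timeout
# (errno.ETIMEDOUT or errno.ETIME) and attempts remain - the condition
# this diff rewrites as "exit_code not in (errno.ETIMEDOUT, errno.ETIME)".
# A generic sketch of the same semantics (retry_move() is an editorial
# helper, not part of the pilot):

import errno
from typing import Callable, Tuple


def retry_move(mover: Callable[[], Tuple[int, str, str]],
               nretries: int = 1) -> Tuple[int, str, str]:
    """Run a transfer callable, retrying only on timeout exit codes."""
    exit_code, stdout, stderr = -1, "", ""
    for attempt in range(nretries):
        exit_code, stdout, stderr = mover()
        if exit_code == 0:
            break  # transfer succeeded
        if exit_code not in (errno.ETIMEDOUT, errno.ETIME):
            break  # non-timeout errors are not retried
        if attempt + 1 == nretries:
            break  # timeout on the final attempt
    return exit_code, stdout, stderr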
- for retry in range(nretries): + for _ in range(nretries): exit_code, stdout, stderr = move(source, destination, dst_in=False, copysetup=copysetup, options=opts) if exit_code != 0: @@ -217,9 +218,9 @@ def copy_out(files: list, **kwargs: dict) -> list: timeEnd=time()) trace_report.send() raise PilotException(error.get('error'), code=error.get('exit_code'), state=error.get('state')) - else: # all successful - logger.info('all successful') - break + + logger.info('all successful') + break fspec.status_code = 0 fspec.status = 'transferred' @@ -250,7 +251,7 @@ def move_all_files_in(files: list, nretries: int = 1) -> (int, str, str): exit_code, stdout, stderr = move(source, destination, dst_in=True) if exit_code != 0: - if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: + if ((exit_code not in (errno.ETIMEDOUT, errno.ETIME)) or ((retry + 1) == nretries)): logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful @@ -280,7 +281,7 @@ def move_all_files_out(files: list, nretries: int = 1) -> (int, str, str): exit_code, stdout, stderr = move(source, destination, dst_in=False) if exit_code != 0: - if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: + if ((exit_code not in (errno.ETIMEDOUT, errno.ETIME)) or ((retry + 1) == nretries)): logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful @@ -297,6 +298,8 @@ def move(source: str, destination: str, dst_in: bool = True, copysetup: str = "" :param source: path to source (str) :param destination: path to destination (str) :param dst_in: True for stage-in, False for stage-out (bool) + :param copysetup: path to copysetup (str) + :param options: additional options (str) :return: exit code (int), stdout (str), stderr (str). """ # copysetup = '/osg/mwt2/app/atlas_app/atlaswn/setup.sh' @@ -321,7 +324,7 @@ def move(source: str, destination: str, dst_in: bool = True, copysetup: str = "" exit_code = ErrorCodes.STAGEINFAILED else: exit_code = ErrorCodes.STAGEOUTFAILED - stdout = 'exception caught: e' % error + stdout = f'exception caught: {error}' stderr = '' logger.warning(stdout) diff --git a/pilot/copytool/mv.py b/pilot/copytool/mv.py index 62f04e731..7c31b6fc9 100644 --- a/pilot/copytool/mv.py +++ b/pilot/copytool/mv.py @@ -45,6 +45,8 @@ def is_valid_for_copy_in(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -60,6 +62,8 @@ def is_valid_for_copy_out(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). 
""" + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -90,7 +94,7 @@ def create_output_list(files: list, init_dir: str): logger.info(f'adding to output.list: {fspec.lfn} {arcturl}') # Write output.list - with open(os.path.join(init_dir, 'output.list'), 'a') as f: + with open(os.path.join(init_dir, 'output.list'), 'a', encoding='utf-8') as f: f.write(f'{fspec.lfn} {arcturl}\n') @@ -128,8 +132,12 @@ def build_final_path(turl: str, prefix: str = 'file://localhost') -> (int, str, except FileExistsError: # ignore if sub dirs already exist pass - except (IsADirectoryError, OSError) as exc: - diagnostics = f'caught exception: {exc}' + except IsADirectoryError as exc: + diagnostics = f'caught IsADirectoryError exception: {exc}' + logger.warning(diagnostics) + return ErrorCodes.MKDIR, diagnostics, path + except OSError as exc: + diagnostics = f'caught OSError exception: {exc}' logger.warning(diagnostics) return ErrorCodes.MKDIR, diagnostics, path else: @@ -168,11 +176,11 @@ def copy_in(files: list, copy_type: str = "symlink", **kwargs: dict) -> list: logger.debug(f"workdir={kwargs.get('workdir')}") logger.debug(f"jobworkdir={kwargs.get('jobworkdir')}") - exit_code, stdout, stderr = move_all_files(files, - copy_type, - kwargs.get('workdir'), - kwargs.get('jobworkdir'), - mvfinaldest=kwargs.get('mvfinaldest', False)) + exit_code, stdout, _ = move_all_files(files, + copy_type, + kwargs.get('workdir'), + kwargs.get('jobworkdir'), + mvfinaldest=kwargs.get('mvfinaldest', False)) if exit_code != 0: # raise failure raise StageInFailure(stdout) @@ -187,8 +195,8 @@ def copy_out(files: list, copy_type: str = "mv", **kwargs: dict) -> list: :param files: list of `FileSpec` objects (list) :param copy_type: copy type (str) :param kwargs: kwargs dictionary (dict) - :raises PilotException: StageOutFailure, MKDirFailure - :return: updated files (list). + :return: updated files (list) + :raises PilotException: StageOutFailure, MKDirFailure. """ if copy_type not in ["cp", "mv"]: raise StageOutFailure("incorrect method for copy out") @@ -205,8 +213,7 @@ def copy_out(files: list, copy_type: str = "mv", **kwargs: dict) -> list: # raise failure if exit_code == ErrorCodes.MKDIR: raise MKDirFailure(stdout) - else: - raise StageOutFailure(stdout) + raise StageOutFailure(stdout) # Create output list for ARC CE if necessary logger.debug(f"init_dir for output.list={os.path.dirname(kwargs.get('workdir'))}") @@ -282,9 +289,9 @@ def move_all_files(files: list, copy_type: str, workdir: str, jobworkdir: str, else: fspec.status_code = ErrorCodes.STAGEOUTFAILED break - else: - fspec.status_code = 0 - fspec.status = 'transferred' + + fspec.status_code = 0 + fspec.status = 'transferred' return exit_code, stdout, stderr diff --git a/pilot/copytool/objectstore.py b/pilot/copytool/objectstore.py index 13589fd38..568499843 100644 --- a/pilot/copytool/objectstore.py +++ b/pilot/copytool/objectstore.py @@ -19,7 +19,7 @@ # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2019 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 """Objectstore copy tool.""" @@ -57,6 +57,8 @@ def is_valid_for_copy_in(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). 
""" + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -72,6 +74,8 @@ def is_valid_for_copy_out(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -87,15 +91,18 @@ def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> d :param fspec: file spec data (Any) :param protocol: suggested protocol (dict) :param ddmconf: full ddm storage data (dict) + :param kwargs: kwargs dictionary (dict) :return: SURL dictionary {'surl': surl} (dict). """ + if kwargs: # to get rid of pylint warning + pass ddm = ddmconf.get(fspec.ddmendpoint) if not ddm: raise PilotException(f'failed to resolve ddmendpoint by name={fspec.ddmendpoint}') if ddm.is_deterministic: surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), get_rucio_path(fspec.scope, fspec.lfn)) - elif ddm.type in ['OS_ES', 'OS_LOGS']: + elif ddm.type in {'OS_ES', 'OS_LOGS'}: surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), fspec.lfn) fspec.protocol_id = protocol.get('id') else: @@ -109,8 +116,9 @@ def copy_in(files: list, **kwargs: dict) -> list: Download given files using rucio copytool. :param files: list of `FileSpec` objects (list) - :raise: PilotException in case of controlled error - :return: updated list of files (list). + :param kwargs: kwargs dictionary (dict) + :return: updated list of files (list) + :raises: PilotException in case of controlled error. """ # don't spoil the output, we depend on stderr parsing os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' @@ -166,7 +174,7 @@ def is_new_rucio_version() -> bool: :return: True if new rucio version (bool). """ _, stdout, _ = execute('rucio download -h') - return True if '--rses RSES' in stdout else False + return '--rses RSES' in stdout def copy_out(files: list, **kwargs: dict) -> list: diff --git a/pilot/copytool/rucio.py b/pilot/copytool/rucio.py index 8cd2dcdff..e526c4150 100644 --- a/pilot/copytool/rucio.py +++ b/pilot/copytool/rucio.py @@ -27,8 +27,6 @@ """Rucio copy tool.""" -from __future__ import absolute_import # Python 2 (2to3 complains about this) - import json import logging import os @@ -44,13 +42,13 @@ timeout, TimedThread, ) +from pilot.util.config import config +from pilot.util.filehandling import rename_xrdlog from .common import ( resolve_common_transfer_errors, verify_catalog_checksum, get_timeout ) -from pilot.util.config import config -from pilot.util.filehandling import rename_xrdlog logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -70,6 +68,8 @@ def is_valid_for_copy_in(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -85,6 +85,8 @@ def is_valid_for_copy_out(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). 
""" + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -98,8 +100,8 @@ def copy_in(files: list, **kwargs: dict) -> list: :param files: list of `FileSpec` objects (list) :param kwargs: kwargs dictionary (dict) - :raises: PilotException in case of controlled error - :return: updated files (list). + :return: updated files (list) + :raises: PilotException in case of controlled error. """ ignore_errors = kwargs.get('ignore_errors') trace_report = kwargs.get('trace_report') @@ -151,7 +153,7 @@ def copy_in(files: list, **kwargs: dict) -> list: if not ignore_errors: trace_report.send() msg = f"{fspec.scope}:{fspec.lfn} from {fspec.ddmendpoint}, {error_details.get('error')}" - raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) + raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) from error else: protocol = get_protocol(trace_report_out) trace_report.update(protocol=protocol) @@ -253,10 +255,10 @@ def copy_in_bulk(files: list, **kwargs: dict) -> list: """ Download given files using rucio copytool. - :param files: list of `FileSpec` objects - :param ignore_errors: boolean, if specified then transfer failures will be ignored - :raise: PilotException in case of controlled error - :return: list of done files (list). + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) + :return: list of done files (list) + :raises: PilotException in case of controlled error. """ #allow_direct_access = kwargs.get('allow_direct_access') ignore_errors = kwargs.get('ignore_errors') @@ -282,6 +284,7 @@ def copy_in_bulk(files: list, **kwargs: dict) -> list: trace_report = deepcopy(trace_common_fields) localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) diagnostics = f'none of the traces received from rucio. response from rucio: {error_msg}' + code = 0 for fspec in files: localsite = localsite if localsite else fspec.ddmendpoint trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize) @@ -289,8 +292,9 @@ def copy_in_bulk(files: list, **kwargs: dict) -> list: trace_report.update(scope=fspec.scope, dataset=fspec.dataset) trace_report.update('STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics, timeEnd=time()) trace_report.send() + code = fspec.status_code # just get one of them for later logger.error(diagnostics) - raise PilotException(diagnostics, code=fspec.status_code, state='STAGEIN_ATTEMPT_FAILED') + raise PilotException(diagnostics, code=code, state='STAGEIN_ATTEMPT_FAILED') from error # VALIDATION AND TERMINATION files_done = [] @@ -360,17 +364,13 @@ def _get_trace(fspec: Any, traces: list) -> list: :return: trace_candidates that correspond to the given file (list). 
""" try: - try: - trace_candidates = list(filter(lambda t: t['filename'] == fspec.lfn and t['scope'] == fspec.scope, traces)) # Python 2 - except Exception: - trace_candidates = list([t for t in traces if t['filename'] == fspec.lfn and t['scope'] == fspec.scope]) # Python 3 + trace_candidates = list(t for t in traces if t['filename'] == fspec.lfn and t['scope'] == fspec.scope) if trace_candidates: return trace_candidates - else: - logger.warning(f'file does not match to any trace received from Rucio: {fspec.lfn} {fspec.scope}') + logger.warning(f'file does not match to any trace received from Rucio: {fspec.lfn} {fspec.scope}') except Exception as error: logger.warning(f'traces from pilot and rucio could not be merged: {error}') - return [] + return [] #@timeout(seconds=10800) @@ -379,9 +379,9 @@ def copy_out(files: list, **kwargs: dict) -> list: # noqa: C901 Upload given files using rucio copytool. :param files: list of `FileSpec` objects (list) - :param ignore_errors: boolean, if specified then transfer failures will be ignored (bool) - :raise: PilotException in case of controlled error - :return: updated files list (list). + :param kwargs: kwargs dictionary (dict) + :return: updated files list (list) + :raises: PilotException in case of controlled error. """ # don't spoil the output, we depend on stderr parsing os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' @@ -420,23 +420,21 @@ def copy_out(files: list, **kwargs: dict) -> list: # noqa: C901 rucio_host) #_stage_out_api(fspec, summary_file_path, trace_report, trace_report_out) except PilotException as error: - error_msg = str(error) - error_details = handle_rucio_error(error_msg, trace_report, trace_report_out, fspec, stagein=False) + error_details = handle_rucio_error(str(error), trace_report, trace_report_out, fspec, stagein=False) protocol = get_protocol(trace_report_out) trace_report.update(protocol=protocol) if not ignore_errors: trace_report.send() msg = f" {fspec.scope}:{fspec.lfn} to {fspec.ddmendpoint}, {error_details.get('error')}" - raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) + raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) from error except Exception as error: - error_msg = str(error) - error_details = handle_rucio_error(error_msg, trace_report, trace_report_out, fspec, stagein=False) + error_details = handle_rucio_error(str(error), trace_report, trace_report_out, fspec, stagein=False) protocol = get_protocol(trace_report_out) trace_report.update(protocol=protocol) if not ignore_errors: trace_report.send() msg = f" {fspec.scope}:{fspec.lfn} to {fspec.ddmendpoint}, {error_details.get('error')}" - raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) + raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) from error else: protocol = get_protocol(trace_report_out) trace_report.update(protocol=protocol) @@ -477,12 +475,11 @@ def copy_out(files: list, **kwargs: dict) -> list: # noqa: C901 if not ignore_errors: raise PilotException("Failed to stageout: CRC mismatched", code=fspec.status_code, state=state) + elif local_checksum and checksum and local_checksum == checksum: + logger.info(f'local checksum ({local_checksum}) = remote checksum ({checksum})') else: - if local_checksum and checksum and local_checksum == checksum: - logger.info(f'local checksum ({local_checksum}) = remote checksum ({checksum})') - else: - 
logger.warning(f'checksum could not be verified: local checksum ({local_checksum}), ' - f'remote checksum ({checksum})') + logger.warning(f'checksum could not be verified: local checksum ({local_checksum}), ' + f'remote checksum ({checksum})') if not fspec.status_code: fspec.status_code = 0 fspec.status = 'transferred' @@ -573,7 +570,7 @@ def _stage_in_api(dst: str, fspec: Any, trace_report: dict, trace_report_out: li return ec, trace_report_out -def _stage_in_bulk(dst: str, files: list, trace_report_out: list = [], trace_common_fields: dict = {}, +def _stage_in_bulk(dst: str, files: list, trace_report_out: list = None, trace_common_fields: dict = None, rucio_host: str = ''): """ Stage-in files in bulk using the Rucio API. @@ -585,6 +582,10 @@ def _stage_in_bulk(dst: str, files: list, trace_report_out: list = [], trace_com :param rucio_host: optional rucio host (string). :raises Exception: download_client.download_pfns exception. """ + if trace_report_out is None: + trace_report_out = [] + if trace_common_fields is None: + trace_common_fields = {} # init. download client from rucio.client import Client from rucio.client.downloadclient import DownloadClient @@ -707,6 +708,9 @@ def _stage_out_api(fspec: Any, summary_file_path: str, trace_report: dict, trace logger.debug(f'summary_file_path={summary_file_path}') logger.debug(f'trace_report_out={trace_report_out}') result = upload_client.upload([_file], summary_file_path=summary_file_path, traces_copy_out=trace_report_out, ignore_availability=True) + except UnboundLocalError: + logger.warning('*** rucio API upload client failed ***') + logger.warning('rucio still needs a bug fix of the summary in the uploadclient') except Exception as error: logger.warning('*** rucio API upload client failed ***') logger.warning(f'caught exception: {error}') @@ -718,9 +722,6 @@ def _stage_out_api(fspec: Any, summary_file_path: str, trace_report: dict, trace if not trace_report_out[0].get('stateReason'): raise error ec = -1 - except UnboundLocalError: - logger.warning('*** rucio API upload client failed ***') - logger.warning('rucio still needs a bug fix of the summary in the uploadclient') else: logger.warning('*** rucio API upload client finished ***') logger.debug(f'client returned {result}') diff --git a/pilot/copytool/s3.py b/pilot/copytool/s3.py index 8a7e908d6..eaf4d8356 100644 --- a/pilot/copytool/s3.py +++ b/pilot/copytool/s3.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 """S3 copy tool.""" @@ -33,12 +33,12 @@ except Exception: pass -from .common import resolve_common_transfer_errors from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException from pilot.info import infosys from pilot.util.config import config from pilot.util.ruciopath import get_rucio_path +from .common import resolve_common_transfer_errors logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -56,9 +56,11 @@ def is_valid_for_copy_in(files: list) -> bool: Placeholder. - :param files: list of FileSpec objects (list). + :param files: list of FileSpec objects (list) :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -71,9 +73,11 @@ def is_valid_for_copy_out(files: list) -> bool: Placeholder. - :param files: list of FileSpec objects (list). 
+ :param files: list of FileSpec objects (list) :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -130,6 +134,8 @@ def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> d :param kwargs: kwargs dictionary (dict) :return: SURL dictionary {'surl': surl} (dict). """ + if kwargs: # to get rid of pylint warning + pass try: pandaqueue = infosys.pandaqueue except Exception: @@ -143,10 +149,10 @@ def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> d if ddm.is_deterministic: surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), get_rucio_path(fspec.scope, fspec.lfn)) - elif ddm.type in ['OS_ES', 'OS_LOGS']: + elif ddm.type in {'OS_ES', 'OS_LOGS'}: try: pandaqueue = infosys.pandaqueue - except Exception: + except AttributeError: pandaqueue = "" if pandaqueue is None: pandaqueue = "" @@ -327,13 +333,12 @@ def copy_out(files: list, **kwargs: dict) -> list: return files -def upload_file(file_name: str, full_url: str, object_name: str = None) -> (bool, str): +def upload_file(file_name: str, full_url: str) -> (bool, str): """ Upload a file to an S3 bucket. :param file_name: file to upload (str) - :param turl: target url to upload to (str) - :param object_name: S3 object name. If not specified then file_name is used (str) + :param full_url: full URL to upload to (str) :return: True if file was uploaded - otherwise False (bool), diagnostics (str). """ # upload the file diff --git a/pilot/copytool/xrdcp.py b/pilot/copytool/xrdcp.py index 7a4a6c3bd..148974217 100644 --- a/pilot/copytool/xrdcp.py +++ b/pilot/copytool/xrdcp.py @@ -18,7 +18,7 @@ # # Authors: # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Alexey Anisenkov, anisyonk@cern.ch, 2017 """Xrdcp copy tool.""" @@ -28,10 +28,10 @@ import re from time import time -from .common import resolve_common_transfer_errors, verify_catalog_checksum #, get_timeout from pilot.util.container import execute from pilot.common.exception import PilotException, ErrorCodes #from pilot.util.timer import timeout +from .common import resolve_common_transfer_errors, verify_catalog_checksum #, get_timeout logger = logging.getLogger(__name__) @@ -53,6 +53,8 @@ def is_valid_for_copy_in(files: list) -> bool: # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False + if files: # to get rid of pylint warning + pass return True ## FIX ME LATER @@ -68,10 +70,12 @@ def is_valid_for_copy_out(files: list) -> bool: # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False + if files: # to get rid of pylint warning + pass return True ## FIX ME LATER -def _resolve_checksum_option(setup: str, **kwargs) -> str: +def _resolve_checksum_option(setup: str, **kwargs: dict) -> str: """ Resolve which checksum option to use. @@ -79,13 +83,6 @@ def _resolve_checksum_option(setup: str, **kwargs) -> str: :param kwargs: kwargs dictionary (dict) :return: option (str). 
""" - cmd = f"{copy_command} --version" - if setup: - cmd = f"source {setup}; {cmd}" - - logger.info(f"execute command ({cmd}) to check xrdcp client version") - - rcode, stdout, stderr = execute(cmd, **kwargs) cmd = f"{copy_command} -h" if setup: cmd = f"source {setup}; {cmd}" @@ -100,13 +97,12 @@ def _resolve_checksum_option(setup: str, **kwargs) -> str: if rcode: logger.error(f'FAILED to execute command={cmd}: {output}') - else: - if "--cksum" in output: - coption = f"--cksum {checksum_type}:print" - elif "-adler" in output and checksum_type == 'adler32': - coption = "-adler" - elif "-md5" in output and checksum_type == 'md5': - coption = "-md5" + elif "--cksum" in output: + coption = f"--cksum {checksum_type}:print" + elif "-adler" in output and checksum_type == 'adler32': + coption = "-adler" + elif "-md5" in output and checksum_type == 'md5': + coption = "-md5" if coption: logger.info(f"use {coption} option to get the checksum for {copy_command} command") @@ -130,6 +126,8 @@ def _stagefile(coption: str, source: str, destination: str, filesize: int, is_st :raises: PilotException in case of controlled error :return: destination file details - file size (int) checksum (str), checksum_type (str). """ + if filesize: # to get rid of pylint warning - could be useful + pass filesize_cmd, checksum_cmd, checksum_type = None, None, None cmd = f'{copy_command} -np -f {coption} {source} {destination}' @@ -173,8 +171,8 @@ def copy_in(files: list, **kwargs: dict) -> list: :param files: list of `FileSpec` objects (list) :param kwargs: kwargs dictionary (dict) - :raises: PilotException in case of controlled error - :return: updated list of files (list). + :return: updated list of files (list) + :raises: PilotException in case of controlled error. """ #allow_direct_access = kwargs.get('allow_direct_access') or False setup = kwargs.pop('copytools', {}).get('xrdcp', {}).get('setup') @@ -214,16 +212,16 @@ def copy_in(files: list, **kwargs: dict) -> list: state = 'STAGEIN_ATTEMPT_FAILED' trace_report.update(clientState=state, stateReason=diagnostics, timeEnd=time()) trace_report.send() + raise PilotException(diagnostics, code=fspec.status_code, state=state) from error + + # compare checksums + fspec.checksum[checksum_type] = checksum_cmd # remote checksum + state, diagnostics = verify_catalog_checksum(fspec, destination) + if diagnostics != "": + trace_report.update(clientState=state or 'STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics, + timeEnd=time()) + trace_report.send() raise PilotException(diagnostics, code=fspec.status_code, state=state) - else: - # compare checksums - fspec.checksum[checksum_type] = checksum_cmd # remote checksum - state, diagnostics = verify_catalog_checksum(fspec, destination) - if diagnostics != "": - trace_report.update(clientState=state or 'STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics, - timeEnd=time()) - trace_report.send() - raise PilotException(diagnostics, code=fspec.status_code, state=state) trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time()) trace_report.send() @@ -263,16 +261,16 @@ def copy_out(files: list, **kwargs: dict) -> list: diagnostics = error.get_detail() trace_report.update(clientState=state, stateReason=diagnostics, timeEnd=time()) trace_report.send() + raise PilotException(diagnostics, code=fspec.status_code, state=state) from error + + # compare checksums + fspec.checksum[checksum_type] = checksum_cmd # remote checksum + state, diagnostics = verify_catalog_checksum(fspec, fspec.surl) + if diagnostics != "": + 
trace_report.update(clientState=state or 'STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics, + timeEnd=time()) + trace_report.send() raise PilotException(diagnostics, code=fspec.status_code, state=state) - else: - # compare checksums - fspec.checksum[checksum_type] = checksum_cmd # remote checksum - state, diagnostics = verify_catalog_checksum(fspec, fspec.surl) - if diagnostics != "": - trace_report.update(clientState=state or 'STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics, - timeEnd=time()) - trace_report.send() - raise PilotException(diagnostics, code=fspec.status_code, state=state) return files diff --git a/pilot/eventservice/communicationmanager/communicationmanager.py b/pilot/eventservice/communicationmanager/communicationmanager.py index a7087ba90..61f47bb94 100644 --- a/pilot/eventservice/communicationmanager/communicationmanager.py +++ b/pilot/eventservice/communicationmanager/communicationmanager.py @@ -18,7 +18,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2023-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2023-2024 """Main classes to manage the messages between ES and harvester/ACT/Panda.""" @@ -320,8 +320,8 @@ def update_events(self, update_events: Any, post_hook: Any = None) -> Any: :param update_events: update events (Any) :param post_hook: post hook function (Any) - :raises: Exception caught when updating event ranges - :return: status of updating event ranges + :return: status of updating event ranges (Any) + :raises: Exception caught when updating event ranges. """ if self.is_stop(): return None @@ -333,7 +333,7 @@ def update_events(self, update_events: Any, post_hook: Any = None) -> Any: self.queues['update_events'].put(req) if req.post_hook: - return + return None while req.response is None: time.sleep(1) @@ -341,8 +341,8 @@ def update_events(self, update_events: Any, post_hook: Any = None) -> Any: raise req.response.exception if req.response.status is False: return None - else: - return req.response.content + + return req.response.content def get_plugin_confs(self) -> dict: """ diff --git a/pilot/eventservice/communicationmanager/plugins/basecommunicator.py b/pilot/eventservice/communicationmanager/plugins/basecommunicator.py index 49ce8f556..d668dc5ba 100644 --- a/pilot/eventservice/communicationmanager/plugins/basecommunicator.py +++ b/pilot/eventservice/communicationmanager/plugins/basecommunicator.py @@ -18,7 +18,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 """Base communicator.""" @@ -53,6 +53,8 @@ def __init__(self, *args: Any, **kwargs: dict): :param args: args object (Any) :param kwargs: kwargs dictionary (dict) """ + if args: # to get rid of pylint warning + pass super().__init__() for key, value in kwargs.items(): setattr(self, key, value) diff --git a/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py b/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py index 6b7531f0a..65f37d0a9 100644 --- a/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py +++ b/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py @@ -18,7 +18,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 """PanDA communicator.""" diff --git a/pilot/info/basedata.py b/pilot/info/basedata.py index 81676f4cf..519b0a1e2 100644 --- a/pilot/info/basedata.py +++ 
b/pilot/info/basedata.py @@ -17,7 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 """ Base data class. @@ -54,7 +54,7 @@ class BaseData: _keys = {} - def _load_data(self, data: dict, kmap: dict = {}, validators: dict = None): + def _load_data(self, data: dict, kmap: dict = None, validators: dict = None): """ Construct and initialize data from ext source. @@ -62,6 +62,8 @@ def _load_data(self, data: dict, kmap: dict = {}, validators: dict = None): :param kmap: the translation map of data attributes from external format to internal schema (dict) :param validators: map of validation handlers to be applied (dict). """ + if kmap is None: + kmap = {} # the translation map of the queue data attributes from external data to internal schema # 'internal_name':('ext_name1', 'extname2_if_any') # 'internal_name2':'ext_name3' diff --git a/pilot/info/dataloader.py b/pilot/info/dataloader.py index 9acfc7530..530b84800 100644 --- a/pilot/info/dataloader.py +++ b/pilot/info/dataloader.py @@ -17,7 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 """ Base loader class to retrieve data from Ext sources (file, url). @@ -218,7 +218,7 @@ def jsonparser(c): return None -def merge_dict_data(dic1: dict, dic2: dict, keys: list = [], common: bool = True, left: bool = True, +def merge_dict_data(dic1: dict, dic2: dict, keys: list = None, common: bool = True, left: bool = True, right: bool = True, rec: bool = False) -> dict: """ Recursively merge two dictionary objects. @@ -234,6 +234,8 @@ def merge_dict_data(dic1: dict, dic2: dict, keys: list = [], common: bool = True :param rec: if True then merge recursively (bool) :return: merged dictionary (dict). """ + if keys is None: + keys = [] ### TODO: verify and configure logic later if not (isinstance(dic1, dict) and isinstance(dic2, dict)): diff --git a/pilot/info/extinfo.py b/pilot/info/extinfo.py index fda96cb6f..beb4fc157 100644 --- a/pilot/info/extinfo.py +++ b/pilot/info/extinfo.py @@ -16,8 +16,8 @@ # under the License. # # Authors: -# - Alexey Anisenkov, anisyonk@cern.ch, 2018-21 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Alexey Anisenkov, anisyonk@cern.ch, 2018-2021 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """ Information provider from external source(s). @@ -59,7 +59,7 @@ def __init__(self, cache_time: int = 60): self.cache_time = cache_time @classmethod - def load_schedconfig_data(cls, pandaqueues: list = [], priority: list = [], cache_time: int = 60) -> dict: + def load_schedconfig_data(cls, pandaqueues: list = None, priority: list = None, cache_time: int = 60) -> dict: """ Download the (CRIC-extended) data associated to PandaQueue from various sources (prioritized). @@ -71,6 +71,10 @@ def load_schedconfig_data(cls, pandaqueues: list = [], priority: list = [], cach :param cache_time: default cache time in seconds (int). :return: dict of schedconfig settings by PandaQueue name as a key (dict). 
""" + if pandaqueues is None: + pandaqueues = [] + if priority is None: + priority = [] pandaqueues = sorted(set(pandaqueues)) cache_dir = config.Information.cache_dir @@ -121,7 +125,7 @@ def get_cvmfs_path(url: str, fname: str) -> str: return cvmfs_path @classmethod - def load_queuedata(cls, pandaqueue: str, priority: list = [], cache_time: int = 60) -> dict: + def load_queuedata(cls, pandaqueue: str, priority: list = None, cache_time: int = 60) -> dict: """ Download the queuedata from various sources (prioritized). @@ -135,6 +139,8 @@ def load_queuedata(cls, pandaqueue: str, priority: list = [], cache_time: int = :return: dict of queuedata settings by PandaQueue name as a key (dict) :raises PilotException: in case of error. """ + if priority is None: + priority = [] if not pandaqueue: raise PilotException('load_queuedata(): pandaqueue name is not specififed', code=ErrorCodes.QUEUEDATA) @@ -195,7 +201,7 @@ def jsonparser_panda(dat: Any) -> dict: return cls.load_data(sources, priority, cache_time) @classmethod - def load_storage_data(cls, ddmendpoints: list = [], priority: list = [], cache_time: int = 60) -> dict: + def load_storage_data(cls, ddmendpoints: list = None, priority: list = None, cache_time: int = 60) -> dict: """ Download DDM Storages details by given name (DDMEndpoint) from various sources (prioritized). @@ -206,6 +212,10 @@ def load_storage_data(cls, ddmendpoints: list = [], priority: list = [], cache_t :param cache_time: default cache time in seconds (int) :return: dictionary of DDMEndpoint settings by DDMendpoint name as a key (dict). """ + if ddmendpoints is None: + ddmendpoints = [] + if priority is None: + priority = [] ddmendpoints = sorted(set(ddmendpoints)) cache_dir = config.Information.cache_dir @@ -268,12 +278,14 @@ def resolve_queuedata(self, pandaqueue: str, schedconf_priority: list = None) -> # merge return merge_dict_data(r, master_data) - def resolve_storage_data(self, ddmendpoints: list = []) -> dict: + def resolve_storage_data(self, ddmendpoints: list = None) -> dict: """ Resolve final DDM Storages details by given names (DDMEndpoint). :param ddmendpoints: list of ddmendpoint names (list) :return: dictionary of settings for given DDMEndpoint as a key (dict). """ + if ddmendpoints is None: + ddmendpoints = [] # load ddmconf settings return self.load_storage_data(ddmendpoints, cache_time=self.cache_time) ## use default priority diff --git a/pilot/info/infoservice.py b/pilot/info/infoservice.py index b9c7dd72d..5162f6fd9 100644 --- a/pilot/info/infoservice.py +++ b/pilot/info/infoservice.py @@ -17,8 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 - +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 """ Info Service module. @@ -141,7 +140,7 @@ def whoami(cls): return inspect.stack()[1][3] @classmethod - def _resolve_data(cls, fname: Any, providers: list = [], args: list = [], kwargs: dict = {}, merge: bool = False) -> Any: + def _resolve_data(cls, fname: Any, providers: Any = None, args: list = None, kwargs: dict = None, merge: bool = False) -> Any: """ Resolve data by calling function `fname` of passed provider objects. 
@@ -150,12 +149,18 @@ def _resolve_data(cls, fname: Any, providers: list = [], args: list = [], kwargs and resolve data by execution function `fname` with passed arguments `args` and `kwargs` :param fname: name of function to be called (Any) - :param providers: list of provider objects (list) + :param providers: list or tuple of provider objects (Any) :param args: list of arguments to be passed to function (list) :param kwargs: list of keyword arguments to be passed to function (dict) :param merge: if True then merge data from all providers (bool) :return: The result of first successful execution will be returned (Any). """ + if providers is None: + providers = [] + if args is None: + args = [] + if kwargs is None: + kwargs = {} ret = None if merge: providers = list(providers) @@ -196,7 +201,7 @@ def resolve_queuedata(self, pandaqueue: str) -> Any: ## high level API return cache.get(pandaqueue) #@require_init - def resolve_storage_data(self, ddmendpoints: list = []) -> dict: ## high level API + def resolve_storage_data(self, ddmendpoints: list = None) -> dict: ## high level API """ Resolve final full storage data details. @@ -204,6 +209,8 @@ def resolve_storage_data(self, ddmendpoints: list = []) -> dict: ## high level :return: dictionary of DDMEndpoint settings by DDMEndpoint name as a key (dict) :raises PilotException: in case of error. """ + if ddmendpoints is None: + ddmendpoints = [] if isinstance(ddmendpoints, str): ddmendpoints = [ddmendpoints] @@ -252,12 +259,14 @@ def resolve_schedconf_sources(self) -> Any: ## high level API # # look up priority order: either from job, local config, extinfo provider # return self._resolve_data(self.whoami(), providers=(self.confinfo, self.jobinfo, self.extinfo), args=[name]) - def resolve_ddmendpoint_storageid(self, ddmendpoint: list = []): + def resolve_ddmendpoint_storageid(self, ddmendpoint: list = None): """ Resolve the map between ddmendpoint and storage_id. :param ddmendpoint: ddmendpoint name (list). """ + if ddmendpoint is None: + ddmendpoint = [] if not ddmendpoint or ddmendpoint not in self.ddmendpoint2storage_id: storages = self.resolve_storage_data(ddmendpoint) for storage_name, storage in storages.items(): diff --git a/pilot/info/jobinfo.py b/pilot/info/jobinfo.py index 1f37a6c4d..af9562c96 100644 --- a/pilot/info/jobinfo.py +++ b/pilot/info/jobinfo.py @@ -17,7 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 """ @@ -33,7 +33,7 @@ logger = logging.getLogger(__name__) -class JobInfoProvider(object): +class JobInfoProvider: """ Job info provider which is used to extract settings specific for given Job and overwrite general configuration used by Information Service @@ -85,12 +85,13 @@ def resolve_queuedata(self, pandaqueue, **kwargs): return {pandaqueue: data} - def resolve_storage_data(self, ddmendpoints=[], **kwargs): + def resolve_storage_data(self, ddmendpoints: list = None, **kwargs: dict) -> dict: """ Resolve Job specific settings for storage data (including data passed via --overwriteStorageData) :return: dict of settings for requested DDMEndpoints with ddmendpoin as a key """ - + if ddmendpoints is None: + ddmendpoints = [] data = {} ## use job.overwrite_storagedata as a master source diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 2bd8a56e3..a09fbfc93 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -17,7 +17,7 @@ # under the License. 
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Wen Guan, wen.guan@cern.ch, 2018 """Common functions for ATLAS.""" @@ -101,7 +101,6 @@ post_memory_monitor_action, get_memory_monitor_summary_filename, get_prefetcher_setup, - get_benchmark_setup, get_memory_monitor_output_filename, get_metadata_dict_from_txt, ) @@ -2078,7 +2077,7 @@ def remove_special_files(workdir: str, dir_list: list): remove_dir_tree(item) -def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = [], debugmode: bool = False): +def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = None, debugmode: bool = False): """ Remove redundant files and directories prior to creating the log file. @@ -2091,7 +2090,8 @@ def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: """ if outputfiles is None: outputfiles = [] - + if piloterrors is None: + piloterrors = [] logger.debug("removing redundant files prior to log creation") workdir = os.path.abspath(workdir) @@ -2481,9 +2481,6 @@ def get_utility_command_setup(name: str, job: Any, setup: str = None) -> str: if name == 'Prefetcher': return get_prefetcher_setup(job) - if name == 'Benchmark': - return get_benchmark_setup(job) - return "" diff --git a/pilot/user/atlas/cpu.py b/pilot/user/atlas/cpu.py index ac2e8c7fc..25e7e6c77 100644 --- a/pilot/user/atlas/cpu.py +++ b/pilot/user/atlas/cpu.py @@ -17,26 +17,29 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 + +""" CPU related functionality.""" -import os import logging +import os +from typing import Any # from .utilities import get_memory_values #from pilot.util.container import execute from pilot.util.math import float_to_rounded_string from .utilities import get_memory_values + logger = logging.getLogger(__name__) -def get_core_count(job): +def get_core_count(job: Any) -> int: """ Return the core count from ATHENA_PROC_NUMBER. - :param job: job object. + :param job: job object (Any) :return: core count (int). """ - if "HPC_HPC" in job.infosys.queuedata.catchall: if job.corecount is None: job.corecount = 0 @@ -58,30 +61,27 @@ def get_core_count(job): return job.corecount -def add_core_count(corecount, core_counts=[]): +def add_core_count(corecount: int, core_counts: list = None) -> list: """ Add a core count measurement to the list of core counts. - :param corecount: current actual core count (int). - :param core_counts: list of core counts (list). + :param corecount: current actual core count (int) + :param core_counts: list of core counts (list) :return: updated list of core counts (list). """ - - if core_counts is None: # protection + if core_counts is None: core_counts = [] core_counts.append(corecount) return core_counts -def set_core_counts(**kwargs): +def set_core_counts(**kwargs: dict): """ Set the number of used cores. - :param kwargs: kwargs (dictionary). - :return: + :param kwargs: kwargs (dict). """ - # something like this could be used if prmon also gave info about ncores # (change nprocs -> ncores and add ncores to list in utilities module, get_average_summary_dictionary_prmon()) diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py index 6115ecdc7..4b0301833 100644 --- a/pilot/user/atlas/jobmetrics.py +++ b/pilot/user/atlas/jobmetrics.py @@ -17,7 +17,7 @@ # under the License. 
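For context on the job-metrics builders that follow: the server expects a flat, space-separated key=value string (the "Format: nEvents= nEventsW= ..." layout quoted in the docstrings), so these functions mostly concatenate optional fragments and skip empty values. A minimal sketch of that assembly, using a hypothetical helper that is not part of the pilot API:

    def build_job_metrics(entries: dict) -> str:
        # keep only entries that have a value; the server format is 'key=value'
        # pairs joined by single spaces, e.g. 'nEvents=100 coreCount=8'
        return ' '.join(f'{key}={value}' for key, value in entries.items() if value is not None)

    print(build_job_metrics({'nEvents': 100, 'coreCount': 8, 'hs06': None}))
    # -> 'nEvents=100 coreCount=8'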
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """Functions for building job metrics.""" @@ -49,7 +49,7 @@ logger = logging.getLogger(__name__) -def get_job_metrics_string(job: Any, extra: dict = {}) -> str: +def get_job_metrics_string(job: Any, extra: dict = None) -> str: """ Get the job metrics string. @@ -57,6 +57,8 @@ def get_job_metrics_string(job: Any, extra: dict = {}) -> str: :param extra: any extra information to be added (dict) :return: job metrics (str). """ + if extra is None: + extra = {} job_metrics = "" # report core count (will also set corecount in job object) @@ -146,7 +148,7 @@ def get_trace_exit_code(workdir: str) -> str: return trace_exit_code -def add_features(job_metrics: str, corecount: int, add: list = []) -> str: +def add_features(job_metrics: str, corecount: int, add: list = None) -> str: """ Add job and machine feature data to the job metrics if available. @@ -157,13 +159,18 @@ def add_features(job_metrics: str, corecount: int, add: list = []) -> str: :param add: features to be added (list) :return: updated job metrics (str). """ + if add is None: + add = [] if job_metrics and not job_metrics.endswith(' '): job_metrics += ' ' - def add_sub_features(features_dic, add=[]): + def add_sub_features(features_dic: dict, _add: list = None): + """Helper function.""" + if _add is None: + _add = [] features_str = '' for key in features_dic.keys(): - if add and key not in add: + if _add and key not in _add: continue value = features_dic.get(key, None) if value: @@ -185,7 +192,7 @@ def add_sub_features(features_dic, add=[]): logger.warning(f'cannot process hs06 machine feature: {exc} (hs06={hs06}, total_cpu={total_cpu}, corecount={corecount})') features_list = [machinefeatures, jobfeatures] for feature_item in features_list: - features_str = add_sub_features(feature_item, add=add) + features_str = add_sub_features(feature_item, _add=add) if features_str: job_metrics += features_str @@ -241,9 +248,10 @@ def add_event_number(job_metrics: str, workdir: str) -> str: return job_metrics -def get_job_metrics(job: Any, extra: dict = {}) -> str: +def get_job_metrics(job: Any, extra: dict = None) -> str: """ Return a properly formatted job metrics string. + The format of the job metrics string is defined by the server. It will be reported to the server during updateJob. Example of job metrics: @@ -251,11 +259,12 @@ def get_job_metrics(job: Any, extra: dict = {}) -> str: Format: nEvents= nEventsW= vmPeakMax= vmPeakMean= RSSMean= hs06= shutdownTime= cpuFactor= cpuLimit= diskLimit= jobStart= memLimit= runLimit= - :param job: job object + :param job: job object (Any) :param extra: any extra information to be added (dict) - :return: job metrics (string). + :return: job metrics (str). """ - + if extra is None: + extra = {} # get job metrics string job_metrics = get_job_metrics_string(job, extra=extra) diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index d5cea6eae..ba47ba9de 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -17,11 +17,15 @@ # under the License. 
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 +"""Functions related to memory monitoring and other utilities.""" + +import logging import os import time from re import search +from typing import Any # from pilot.info import infosys from .setup import get_asetup @@ -31,47 +35,33 @@ from pilot.util.processes import is_process_running from pilot.util.psutils import get_command_by_pid -import logging logger = logging.getLogger(__name__) -def get_benchmark_setup(job): - """ - Return the proper setup for the benchmark command. - - :param job: job object. - :return: setup string for the benchmark command. - """ - - return '' - - -def get_prefetcher_setup(job): +def get_prefetcher_setup(job: Any) -> str: """ Return the proper setup for the Prefetcher. + Prefetcher is a tool used with the Event Streaming Service. - :param job: job object. - :return: setup string for the Prefetcher command. + :param job: job object (Any) + :return: setup string for the Prefetcher command (str). """ - - # add code here .. - - return '' + return "" -def get_network_monitor_setup(setup, job): +def get_network_monitor_setup(setup: str, job: Any) -> str: """ Return the proper setup for the network monitor. + The network monitor is currently set up together with the payload and is started before it. The payload setup should therefore be provided. The network monitor setup is prepended to it. - :param setup: payload setup string. - :param job: job object. - :return: network monitor setup string. + :param setup: payload setup string (str) + :param job: job object (Any) + :return: network monitor setup string (str). """ - - return '' + return "" def get_memory_monitor_summary_filename(selector=None): @@ -811,14 +801,12 @@ def get_memory_values(workdir, name=""): return summary_dictionary -def post_memory_monitor_action(job): +def post_memory_monitor_action(job: Any): """ Perform post action items for memory monitor. - :param job: job object. - :return: + :param job: job object (Any). """ - nap = 3 path1 = os.path.join(job.workdir, get_memory_monitor_summary_filename()) path2 = os.environ.get('PILOT_HOME') @@ -841,10 +829,7 @@ def precleanup(): """ Pre-cleanup at the beginning of the job to remove any pre-existing files from previous jobs in the main work dir. - - :return: """ - logger.debug('performing pre-cleanup of potentially pre-existing files from earlier job in main work dir') path = os.path.join(os.environ.get('PILOT_HOME'), get_memory_monitor_summary_filename()) if os.path.exists(path): @@ -852,16 +837,15 @@ def precleanup(): remove(path) -def get_cpu_arch(): +def get_cpu_arch() -> str: """ Return the CPU architecture string. The CPU architecture string is determined by a script (cpu_arch.py), run by the pilot but set up with lsetup. For details about this script, see: https://its.cern.ch/jira/browse/ATLINFR-4844 - :return: CPU arch (string). + :return: CPU arch (str).
""" - cpu_arch = '' def filter_output(stdout): @@ -883,7 +867,8 @@ def filter_output(stdout): # CPU arch script has now been copied, time to execute it # (reset irrelevant stderr) ec, stdout, stderr = execute(cmd) - if ec == 0 and 'RHEL9 and clone support is relatively new' in stderr: + if ec == 0 and ('RHEL9 and clone support is relatively new' in stderr or + 'RHEL8 and clones are not supported for users' in stderr): stderr = '' if ec or stderr: logger.warning(f'ec={ec}, stdout={stdout}, stderr={stderr}') diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index a76c9ee52..ec1d3212e 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -17,10 +17,14 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 +"""Generic user specific functionality.""" + +import logging import os from signal import SIGTERM +from typing import Any from pilot.common.exception import TrfDownloadFailure from pilot.util.config import config @@ -28,44 +32,42 @@ from pilot.util.filehandling import read_file from .setup import get_analysis_trf -import logging logger = logging.getLogger(__name__) -def sanity_check(): +def sanity_check() -> int: """ Perform an initial sanity check before doing anything else in a given workflow. + This function can be used to verify importing of modules that are otherwise used much later, but it is better to abort the pilot if a problem is discovered early. - :return: exit code (0 if all is ok, otherwise non-zero exit code). + :return: exit code (0 if all is ok, otherwise non-zero exit code) (int). """ - return 0 -def validate(job): +def validate(job: Any) -> bool: """ Perform user specific payload/job validation. - :param job: job object. - :return: Boolean (True if validation is successful). + :param job: job object (Any) + :return: True if validation is successful (bool). """ - return True -def get_payload_command(job): +def get_payload_command(job: Any) -> str: """ - Return the full command for executing the payload, including the sourcing of all setup files and setting of - environment variables. + Return the full command for executing the payload + The returned command string includes the sourcing of all setup files and setting of + environment variables. By default, the full payload command is assumed to be in the job.jobparams. - :param job: job object - :return: command (string) + :param job: job object (Any) + :return: command (str). """ - # Try to download the trf # if job.imagename != "" or "--containerImage" in job.jobparams: # job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer") @@ -79,17 +81,16 @@ def get_payload_command(job): return get_analysis_run_command(job, trf_name) -def get_analysis_run_command(job, trf_name): +def get_analysis_run_command(job: Any, trf_name: str) -> str: """ Return the proper run command for the user job. Example output: export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn - :param job: job object. - :param trf_name: name of the transform that will run the job (string). Used when containers are not used. - :return: command (string). + :param job: job object (Any) + :param trf_name: name of the transform that will run the job (string). Used when containers are not used (str) + :return: command (str). 
""" - cmd = "" # add the user proxy @@ -108,35 +109,36 @@ def get_analysis_run_command(job, trf_name): return cmd -def update_job_data(job): +def update_job_data(job: Any): """ This function can be used to update/add data to the job object. E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information is extracted from the metaData field and added to other job object fields. - :param job: job object - :return: + :param job: job object (Any) """ - pass -def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode=False): +def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = None, debugmode: bool = False): """ Remove redundant files and directories prior to creating the log file. - :param workdir: working directory (string). - :param outputfiles: list of output files. + :param workdir: working directory (str) + :param outputfiles: list of output files (list) :param piloterrors: list of Pilot assigned error codes (list). - :return: """ - + #if outputfiles is None: + # outputfiles = [] + #if piloterrors is None: + # piloterrors = [] pass -def get_utility_commands(order=None, job=None): +def get_utility_commands(order: int = None, job: Any = None) -> dict: """ Return a dictionary of utility commands and arguments to be executed in parallel with the payload. + This could e.g. be memory and network monitor commands. A separate function can be used to determine the corresponding command setups using the utility command name. If the optional order parameter is set, the function should return the list of corresponding commands. @@ -147,34 +149,31 @@ def get_utility_commands(order=None, job=None): FORMAT: {'command': , 'args': } - :param order: optional sorting order (see pilot.util.constants) - :param job: optional job object. - :return: dictionary of utilities to be executed in parallel with the payload. + :param order: optional sorting order (see pilot.util.constants) (int) + :param job: optional job object (Any) + :return: dictionary of utilities to be executed in parallel with the payload (dict). """ - return {} -def get_utility_command_setup(name, job, setup=None): +def get_utility_command_setup(name: str, job: Any, setup: str = None) -> str: """ Return the proper setup for the given utility command. If a payload setup is specified - :param name: - :param setup: - :return: + :param name: name of utility command (str) + :param setup: setup string (str) + :return: full setup string of the utility command (str). """ - - pass + return "" -def get_utility_command_execution_order(name): +def get_utility_command_execution_order(name: str) -> int: """ Should the given utility command be executed before or after the payload? - :param name: utility name (string). - :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD_STARTED) + :param name: utility name (str) + :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD_STARTED) (int). """ - # example implementation if name == 'monitor': return UTILITY_BEFORE_PAYLOAD @@ -182,139 +181,128 @@ def get_utility_command_execution_order(name): return UTILITY_AFTER_PAYLOAD_STARTED -def post_utility_command_action(name, job): +def post_utility_command_action(name: str, job: Any): """ Perform post action for given utility command. - :param name: name of utility command (string). - :param job: job object. - :return: + :param name: name of utility command (str) + :param job: job object (Any). 
""" - pass -def get_utility_command_kill_signal(name): +def get_utility_command_kill_signal(name: str) -> int: """ Return the proper kill signal used to stop the utility command. - :param name: - :return: kill signal + :param name: utility command name (str) + :return: kill signal (int). """ - return SIGTERM -def get_utility_command_output_filename(name, selector=None): +def get_utility_command_output_filename(name: str, selector: bool = None) -> str: """ Return the filename to the output of the utility command. - :param name: utility name (string). - :param selector: optional special conditions flag (boolean). - :return: filename (string). + :param name: utility name (str) + :param selector: optional special conditions flag (bool) + :return: filename (str). """ - return "" -def verify_job(job): +def verify_job(job: Any) -> bool: """ Verify job parameters for specific errors. + Note: in case of problem, the function should set the corresponding pilot error code using job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) - :param job: job object - :return: Boolean. + :param job: job object (Any) + :return: True if job is verified (bool). """ - return True -def update_stagein(job): +def update_stagein(job: Any): """ In case special files need to be skipped during stage-in, the job.indata list can be updated here. + See ATLAS code for an example. - :param job: job object. - :return: + :param job: job object (Any). """ - pass -def get_metadata(workdir): +def get_metadata(workdir: str) -> str: """ Return the metadata from file. - :param workdir: work directory (string) - :return: + :param workdir: work directory (str) + :return: metadata (str). """ - path = os.path.join(workdir, config.Payload.jobreport) metadata = read_file(path) if os.path.exists(path) else None return metadata -def update_server(job): +def update_server(job: Any): """ Perform any user specific server actions. E.g. this can be used to send special information to a logstash. - :param job: job object. - :return: + :param job: job object (Any) """ - pass -def post_prestagein_utility_command(**kwargs): +def post_prestagein_utility_command(**kwargs: dict): """ Execute any post pre-stage-in utility commands. - :param kwargs: kwargs (dictionary). - :return: + :param kwargs: kwargs (dict). """ - # label = kwargs.get('label', 'unknown_label') # stdout = kwargs.get('output', None) - pass -def process_debug_command(debug_command, pandaid): +def process_debug_command(debug_command: str, pandaid: str) -> str: """ In debug mode, the server can send a special debug command to the pilot via the updateJob backchannel. + This function can be used to process that command, i.e. to identify a proper pid to debug (which is unknown to the server). - :param debug_command: debug command (string), payload pid (int). - :param pandaid: PanDA id (string). - :return: updated debug command (string) + :param debug_command: debug command (str) + :param pandaid: PanDA id (str) + :return: updated debug command (str). """ - return debug_command -def allow_timefloor(submitmode): +def allow_timefloor(submitmode: str) -> bool: """ Should the timefloor mechanism (multi-jobs) be allowed for the given submit mode? - :param submitmode: submit mode (string). + :param submitmode: submit mode (str) + :return: True if timefloor is allowed (bool). """ - return True -def get_pilot_id(jobid): +def get_pilot_id(jobid: int) -> str: """ Get the pilot id from the environment variable GTAG. 
+ Update if necessary (do not use if you want the same pilot id for all multi-jobs). - :param jobid: PanDA job id - UNUSED (int). - :return: pilot id (string). + :param jobid: PanDA job id - UNUSED (int) + :return: pilot id (str). """ - return os.environ.get("GTAG", "unknown") diff --git a/pilot/user/generic/cpu.py b/pilot/user/generic/cpu.py index c6ed1b4bc..03300de37 100644 --- a/pilot/user/generic/cpu.py +++ b/pilot/user/generic/cpu.py @@ -17,33 +17,38 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 -from pilot.util.container import execute +""" CPU related functionality.""" import logging +from typing import Any + +from pilot.util.container import execute + logger = logging.getLogger(__name__) -def get_core_count(job): +def get_core_count(job: Any) -> int: """ Return the core count. - :param job: job object. + :param job: job object (Any) :return: core count (int). """ - return 0 -def add_core_count(corecount, core_counts=[]): +def add_core_count(corecount: int, core_counts: list = None) -> list: """ Add a core count measurement to the list of core counts. - :param corecount: current actual core count (int). - :param core_counts: list of core counts (list). + :param corecount: current actual core count (int) + :param core_counts: list of core counts (list) :return: updated list of core counts (list). """ + if core_counts is None: + core_counts = [] - return core_counts.append(corecount) + core_counts.append(corecount) + return core_counts diff --git a/pilot/user/generic/jobmetrics.py b/pilot/user/generic/jobmetrics.py index 864983d48..b24739ceb 100644 --- a/pilot/user/generic/jobmetrics.py +++ b/pilot/user/generic/jobmetrics.py @@ -17,17 +17,20 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 # from pilot.util.jobmetrics import get_job_metrics_entry import logging +from typing import Any + logger = logging.getLogger(__name__) -def get_job_metrics(job, extra={}): +def get_job_metrics(job: Any, extra: dict = None) -> str: """ Return a properly formatted job metrics string. + The format of the job metrics string is defined by the server. It will be reported to the server during updateJob. Example of job metrics: @@ -35,9 +38,10 @@ def get_job_metrics(job, extra={}): Format: nEvents= nEventsW= vmPeakMax= vmPeakMean= RSSMean= hs06= shutdownTime= cpuFactor= cpuLimit= diskLimit= jobStart= memLimit= runLimit= - :param job: job object + :param job: job object (Any) :param extra: any extra information to be added (dict) - :return: job metrics (string). + :return: job metrics (str). """ - + #if extra is None: + # extra = {} return "" diff --git a/pilot/user/rubin/common.py b/pilot/user/rubin/common.py index 8f736223c..b68aa6c12 100644 --- a/pilot/user/rubin/common.py +++ b/pilot/user/rubin/common.py @@ -17,11 +17,15 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 +"""Common functions for Rubin.""" + +import logging import os from re import findall from signal import SIGTERM +from typing import Any from pilot.common.exception import TrfDownloadFailure from pilot.util.config import config @@ -29,44 +33,41 @@ from pilot.util.filehandling import read_file from .setup import get_analysis_trf -import logging logger = logging.getLogger(__name__) -def sanity_check(): +def sanity_check() -> int: """ Perform an initial sanity check before doing anything else in a given workflow.
+ This function can be used to verify importing of modules that are otherwise used much later, but it is better to abort the pilot if a problem is discovered early. - :return: exit code (0 if all is ok, otherwise non-zero exit code). + :return: exit code (0 if all is ok, otherwise non-zero exit code) (int). """ - return 0 -def validate(job): +def validate(job: Any) -> bool: """ Perform user specific payload/job validation. - :param job: job object. - :return: Boolean (True if validation is successful). + :param job: job object (Any) + :return: True if validation is successful (bool) """ - return True -def get_payload_command(job): +def get_payload_command(job: Any): """ - Return the full command for executing the payload, including the sourcing of all setup files and setting of - environment variables. + Return the full command for executing the payload. + The returned string includes the sourcing of all setup files and setting of environment variables. By default, the full payload command is assumed to be in the job.jobparams. - :param job: job object - :return: command (string) + :param job: job object (Any) + :return: command (str). """ - # Try to download the trf # if job.imagename != "" or "--containerImage" in job.jobparams: # job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer") @@ -80,17 +81,16 @@ def get_payload_command(job): return get_analysis_run_command(job, trf_name) -def get_analysis_run_command(job, trf_name): +def get_analysis_run_command(job: Any, trf_name: str) -> str: """ Return the proper run command for the user job. Example output: export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn - :param job: job object. - :param trf_name: name of the transform that will run the job (string). Used when containers are not used. - :return: command (string). + :param job: job object (Any) + :param trf_name: name of the transform that will run the job (string). Used when containers are not used (str) + :return: command (str). """ - cmd = "" # add the user proxy @@ -109,36 +109,38 @@ def get_analysis_run_command(job, trf_name): return cmd -def update_job_data(job): +def update_job_data(job: Any): """ This function can be used to update/add data to the job object. + E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information is extracted from the metaData field and added to other job object fields. - :param job: job object - :return: + :param job: job object (Any) """ - pass -def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode=False): +def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = None, debugmode: bool = False): """ Remove redundant files and directories prior to creating the log file. - :param workdir: working directory (string). - :param outputfiles: list of output files. - :param piloterrors: list of Pilot assigned error codes (list). - :param debugmode: True if debug mode has been switched on (Boolean). - :return: + :param workdir: working directory (str) + :param outputfiles: list of output files (list) + :param piloterrors: list of Pilot assigned error codes (list) + :param debugmode: True if debug mode has been switched on (bool). 
""" - + #if outputfiles is None: + # outputfiles = [] + #if piloterrors is None: + # piloterrors = [] pass -def get_utility_commands(order=None, job=None): +def get_utility_commands(order: int = None, job: Any = None) -> dict: """ Return a dictionary of utility commands and arguments to be executed in parallel with the payload. + This could e.g. be memory and network monitor commands. A separate function can be used to determine the corresponding command setups using the utility command name. If the optional order parameter is set, the function should return the list of corresponding commands. @@ -149,11 +151,10 @@ def get_utility_commands(order=None, job=None): FORMAT: {'command': , 'args': } - :param order: optional sorting order (see pilot.util.constants) - :param job: optional job object. - :return: dictionary of utilities to be executed in parallel with the payload. + :param order: optional sorting order (see pilot.util.constants) (int) + :param job: optional job object (Any) + :return: dictionary of utilities to be executed in parallel with the payload (dict). """ - return {} diff --git a/pilot/user/rubin/cpu.py b/pilot/user/rubin/cpu.py index 4873673b8..db2674cc6 100644 --- a/pilot/user/rubin/cpu.py +++ b/pilot/user/rubin/cpu.py @@ -17,45 +17,47 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 -from pilot.util.container import execute +""" CPU related functionality.""" import logging +from typing import Any +from pilot.util.container import execute + logger = logging.getLogger(__name__) -def get_core_count(job): +def get_core_count(job: Any) -> int: """ Return the core count. - :param job: job object. + :param job: job object (Any) :return: core count (int). """ - return 0 -def add_core_count(corecount, core_counts=[]): +def add_core_count(corecount: int, core_counts: list = None): """ Add a core count measurement to the list of core counts. - :param corecount: current actual core count (int). - :param core_counts: list of core counts (list). + :param corecount: current actual core count (int) + :param core_counts: list of core counts (list) :return: updated list of core counts (list). """ + if core_counts is None: + core_counts = [] return core_counts.append(corecount) -def set_core_counts(**kwargs): +def set_core_counts(**kwargs: dict): """ Set the number of used cores. - :param kwargs: kwargs (dictionary). - :return: + :param kwargs: kwargs (dict). """ - job = kwargs.get('job', None) if job and job.pgrp: cmd = f"ps axo pgid,psr | sort | grep {job.pgrp} | uniq | awk '{{print $1}}' | grep -x {job.pgrp} | wc -l" diff --git a/pilot/user/rubin/jobmetrics.py b/pilot/user/rubin/jobmetrics.py index 864983d48..b517bbdc1 100644 --- a/pilot/user/rubin/jobmetrics.py +++ b/pilot/user/rubin/jobmetrics.py @@ -17,17 +17,22 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 + +"""Functions for building job metrics.""" # from pilot.util.jobmetrics import get_job_metrics_entry import logging +from typing import Any + logger = logging.getLogger(__name__) -def get_job_metrics(job, extra={}): +def get_job_metrics(job: Any, extra: dict = None) -> str: """ Return a properly formatted job metrics string. + The format of the job metrics string is defined by the server. It will be reported to the server during updateJob. 
Example of job metrics: @@ -35,9 +40,10 @@ def get_job_metrics(job, extra={}): Format: nEvents= nEventsW= vmPeakMax= vmPeakMean= RSSMean= hs06= shutdownTime= cpuFactor= cpuLimit= diskLimit= jobStart= memLimit= runLimit= - :param job: job object + :param job: job object (Any) :param extra: any extra information to be added (dict) - :return: job metrics (string). + :return: job metrics (str). """ - + #if extra is None: + # extra = {} return "" diff --git a/pilot/user/sphenix/common.py b/pilot/user/sphenix/common.py index 4d100aae1..657a180dd 100644 --- a/pilot/user/sphenix/common.py +++ b/pilot/user/sphenix/common.py @@ -17,11 +17,13 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 +import logging import os import re from signal import SIGTERM +from typing import Any from pilot.common.exception import ( TrfDownloadFailure, @@ -46,11 +48,10 @@ from pilot.util.filehandling import read_file from .setup import get_analysis_trf -import logging logger = logging.getLogger(__name__) -def sanity_check(): +def sanity_check() -> int: """ Perform an initial sanity check before doing anything else in a given workflow. This function can be used to verify importing of modules that are otherwise used much later, but it is better to abort @@ -61,27 +62,26 @@ def sanity_check(): return 0 -def validate(job): +def validate(job: Any) -> bool: """ Perform user specific payload/job validation. - :param job: job object. - :return: Boolean (True if validation is successful). + :param job: job object (Any) + :return: True if validation is successful (bool). """ return True -def get_payload_command(job): +def get_payload_command(job: Any) -> str: """ Return the full command for executing the payload, including the sourcing of all setup files and setting of environment variables. By default, the full payload command is assumed to be in the job.jobparams. - :param job: job object - :return: command (string) + :param job: job object (Any) + :return: command (str). """ - # Try to download the trf # if job.imagename != "" or "--containerImage" in job.jobparams: # job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer") @@ -95,7 +95,7 @@ def get_payload_command(job): return get_analysis_run_command(job, trf_name) -def get_analysis_run_command(job, trf_name): +def get_analysis_run_command(job: Any, trf_name: str) -> str: """ Return the proper run command for the user job. @@ -123,14 +123,13 @@ def get_analysis_run_command(job, trf_name): return cmd -def update_job_data(job): +def update_job_data(job: Any): """ This function can be used to update/add data to the job object. E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information is extracted from the metaData field and added to other job object fields. - :param job: job object - :return: + :param job: job object (Any). """ # in case the job was created with --outputs="regex|DST_.*\.root", we can now look for the corresponding # output files and add them to the output file list @@ -174,19 +173,18 @@ def update_job_data(job): logger.debug('no regex found in outdata file list') -def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode=False): +def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = None, debugmode: bool = False): """ Remove redundant files and directories prior to creating the log file. :param workdir: working directory (string). 
diff --git a/pilot/user/sphenix/common.py b/pilot/user/sphenix/common.py
index 4d100aae1..657a180dd 100644
--- a/pilot/user/sphenix/common.py
+++ b/pilot/user/sphenix/common.py
@@ -17,11 +17,13 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024

+import logging
 import os
 import re
 from signal import SIGTERM
+from typing import Any

 from pilot.common.exception import (
     TrfDownloadFailure,
@@ -46,11 +48,10 @@
 from pilot.util.filehandling import read_file
 from .setup import get_analysis_trf

-import logging
 logger = logging.getLogger(__name__)


-def sanity_check():
+def sanity_check() -> int:
     """
     Perform an initial sanity check before doing anything else in a given workflow.
     This function can be used to verify importing of modules that are otherwise used much later, but it is better to abort
@@ -61,27 +62,26 @@
     return 0


-def validate(job):
+def validate(job: Any) -> bool:
     """
     Perform user specific payload/job validation.

-    :param job: job object.
-    :return: Boolean (True if validation is successful).
+    :param job: job object (Any)
+    :return: True if validation is successful (bool).
     """
     return True


-def get_payload_command(job):
+def get_payload_command(job: Any) -> str:
     """
     Return the full command for executing the payload, including the sourcing of all setup files and setting of
     environment variables.

     By default, the full payload command is assumed to be in the job.jobparams.

-    :param job: job object
-    :return: command (string)
+    :param job: job object (Any)
+    :return: command (str).
     """
-
     # Try to download the trf
     # if job.imagename != "" or "--containerImage" in job.jobparams:
     #     job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer")
@@ -95,7 +95,7 @@
     return get_analysis_run_command(job, trf_name)


-def get_analysis_run_command(job, trf_name):
+def get_analysis_run_command(job: Any, trf_name: str) -> str:
     """
     Return the proper run command for the user job.

@@ -123,14 +123,13 @@
     return cmd


-def update_job_data(job):
+def update_job_data(job: Any):
     """
     This function can be used to update/add data to the job object.
     E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information
     is extracted from the metaData field and added to other job object fields.

-    :param job: job object
-    :return:
+    :param job: job object (Any).
     """
     # in case the job was created with --outputs="regex|DST_.*\.root", we can now look for the corresponding
     # output files and add them to the output file list
@@ -174,19 +173,18 @@
     logger.debug('no regex found in outdata file list')


-def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode=False):
+def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = None, debugmode: bool = False):
     """
     Remove redundant files and directories prior to creating the log file.

     :param workdir: working directory (string).
     :param outputfiles: list of output files.
     :param piloterrors: list of Pilot assigned error codes (list).
-    :return: None
     """
-    return
+    pass


-def get_utility_commands(order=None, job=None):
+def get_utility_commands(order: int = None, job: Any = None) -> dict:
     """
     Return a dictionary of utility commands and arguments to be executed
     in parallel with the payload. This could e.g. be memory and network
diff --git a/pilot/user/sphenix/cpu.py b/pilot/user/sphenix/cpu.py
index 7f7c967fc..2126ab0b6 100644
--- a/pilot/user/sphenix/cpu.py
+++ b/pilot/user/sphenix/cpu.py
@@ -17,7 +17,7 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024

 from typing import Any
 from pilot.util.container import execute
@@ -36,7 +36,7 @@ def get_core_count(job: Any) -> int:
     return 0


-def add_core_count(corecount: int, core_counts: list = []) -> list:
+def add_core_count(corecount: int, core_counts: list = None) -> list:
     """
     Add a core count measurement to the list of core counts.

@@ -44,6 +44,8 @@ def add_core_count(corecount: int, core_counts: list = []) -> list:
     :param core_counts: list of core counts (list).
     :return: updated list of core counts (list).
     """
+    if core_counts is None:
+        core_counts = []
     return core_counts.append(corecount)
diff --git a/pilot/user/sphenix/jobmetrics.py b/pilot/user/sphenix/jobmetrics.py
index 01255974a..b24739ceb 100644
--- a/pilot/user/sphenix/jobmetrics.py
+++ b/pilot/user/sphenix/jobmetrics.py
@@ -17,7 +17,7 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024

 # from pilot.util.jobmetrics import get_job_metrics_entry

@@ -27,7 +27,7 @@
 logger = logging.getLogger(__name__)


-def get_job_metrics(job: Any, extra: dict = {}) -> str:
+def get_job_metrics(job: Any, extra: dict = None) -> str:
     """
     Return a properly formatted job metrics string.

@@ -42,4 +42,6 @@ def get_job_metrics(job: Any, extra: dict = {}) -> str:
     :param extra: any extra information to be added (dict)
     :return: job metrics (str).
     """
+    #if extra is None:
+    #    extra = {}
     return ""
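The update_job_data hook above looks for output files declared via a pattern such as --outputs="regex|DST_.*\.root". A minimal sketch of that idea, under the assumption that the part after "regex|" is a Python regular expression matched against file names in the work directory (expand_output_regex is an illustrative helper, not a pilot function):

    import os
    import re


    def expand_output_regex(workdir: str, pattern: str) -> list:
        # return the names of all files in workdir that match the regex
        compiled = re.compile(pattern)
        return [name for name in os.listdir(workdir) if compiled.match(name)]


    # a job created with --outputs="regex|DST_.*\.root" would yield the pattern
    # below; any matching files are then added to the output file list
    print(expand_output_regex('.', r'DST_.*\.root'))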
""" + if setup: # to get rid of pylint warning, setup is not used for this user + pass + if outdata is None: + outdata = [] # try to get the pid from a pid.txt file which might be created by a container_script pid = get_proper_pid(pid, pgrp, jobid, command=command, transformation=transformation, outdata=outdata, use_container=use_container, dump_ps=dump_ps) if pid == -1: diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index 616c0964e..a254baabc 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -776,7 +776,7 @@ def __init__(self, message: str, timeout: int = None, *args: Any): """Initialize variables.""" self.timeout = timeout self.message = message - self._errorCode = 1334 + self._error_code = 1334 super(TimeoutException, self).__init__(*args) def __str__(self): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6053694d6..528081f72 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -18,7 +18,7 @@ # # Authors # - Mario Lassnig, mario.lassnig@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """Constamts.""" @@ -27,8 +27,8 @@ # Pilot version RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates -REVISION = '5' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '4' # build number should be reset to '1' for every new development cycle +REVISION = '6' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '8' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index dd09b570c..63f715f93 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -17,7 +17,7 @@ # # Authors: # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 ################################ @@ -227,6 +227,10 @@ payloadstderr: payload.stderr # looping = looping job check checks: looping +# The payload may produce an error report with a specifiec error_code and error_diag. +# If the file exists, the pilot will use it to report the error. +error_report: payload_error_report.json + ################################ # Container parameters diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 307bce9a6..4a0deb2c5 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -405,7 +405,9 @@ def write_json(filename: str, data: Union[dict, list], sort_keys: bool = True, i with open(filename, 'w', encoding='utf-8') as _fh: dumpjson(data, _fh, sort_keys=sort_keys, indent=indent, separators=separators) except (IOError, TypeError) as exc: - logger.warning(f'exception caught in write_json: {exc}') + logger.warning(f'exception caught (1) in write_json: {exc}') + except Exception as exc: + logger.warning(f'exception caught (2) in write_json: {exc}') else: status = True diff --git a/pilot/util/filestate.py b/pilot/util/filestate.py index a2445f0db..75c0035d8 100644 --- a/pilot/util/filestate.py +++ b/pilot/util/filestate.py @@ -17,7 +17,7 @@ # under the License. 
diff --git a/pilot/util/filestate.py b/pilot/util/filestate.py
index a2445f0db..75c0035d8 100644
--- a/pilot/util/filestate.py
+++ b/pilot/util/filestate.py
@@ -17,7 +17,7 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2022-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2022-2024

 """Handling of file states."""

@@ -39,12 +39,14 @@ class FileState(object):
     _lfns = []
     _state_list = ['NOT_YET_TRANSFERRED', 'TRANSFER_IN_PROGRESS', 'TRANSFERRED', 'TRANSFER_FAILED']

-    def __init__(self, file_states={}):
+    def __init__(self, file_states: dict = None):
         """
         Initialize variables.

         :param file_states: file states (dict).
         """
+        if file_states is None:
+            file_states = {}
         self._lfns = file_states.get('lfns', [])
         self.set_initial_list()

@@ -61,7 +63,7 @@ def get_file_states(self) -> dict:
         """
         return self._file_states

-    def update(self, lfn='', state=''):
+    def update(self, lfn: str = "", state: str = ""):
         """
         Update the state for a given LFN.
diff --git a/pilot/util/harvester.py b/pilot/util/harvester.py
index 9b59a796d..2ed9bef95 100644
--- a/pilot/util/harvester.py
+++ b/pilot/util/harvester.py
@@ -17,10 +17,11 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024

 """Functions for interacting with Harvester."""

+import logging
 import os
 import os.path
 import socket
@@ -31,7 +32,6 @@
 from pilot.util.filehandling import write_json, touch, remove, read_json, get_checksum_value
 from pilot.util.timing import time_stamp

-import logging
 logger = logging.getLogger(__name__)


@@ -262,7 +262,7 @@ def publish_stageout_files(job: Any, event_status_file: str) -> bool:
     return False


-def publish_work_report(work_report: dict = {}, worker_attributes_file: str = "worker_attributes.json") -> bool:
+def publish_work_report(work_report: dict = None, worker_attributes_file: str = "worker_attributes.json") -> bool:
     """
     Publish the work report.

@@ -273,6 +273,8 @@ def publish_work_report(work_report: dict = {}, worker_attributes_file: str = "w
     :raises FileHandlingFailure: in case of IOError
     :return: True if successfully published, False otherwise (bool).
     """
+    if work_report is None:
+        work_report = {}
     if work_report:
         work_report['timestamp'] = time_stamp()
         if "outputfiles" in work_report:
@@ -282,8 +284,8 @@ def publish_work_report(work_report: dict = {}, worker_attributes_file: str = "w
         if "xml" in work_report:
             del (work_report["xml"])

-        ec = write_json(worker_attributes_file, work_report)
-        if ec:
+        status = write_json(worker_attributes_file, work_report)
+        if not status:
             logger.error(f"work report publish failed: {work_report}")
             return False
         else:
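The publish_work_report hunk above also fixes inverted logic: write_json returns True on success, so the old `if ec:` branch logged a failure precisely when the write succeeded. A self-contained illustration of the corrected boolean contract (the local write_json below mimics the pilot helper with a reduced signature):

    import json


    def write_json(filename: str, data: dict) -> bool:
        # mimic of pilot.util.filehandling.write_json: True means the write succeeded
        try:
            with open(filename, 'w', encoding='utf-8') as fh:
                json.dump(data, fh, sort_keys=True)
        except (IOError, TypeError) as exc:
            print(f'exception caught in write_json: {exc}')
            return False
        return True


    status = write_json('worker_attributes.json', {'timestamp': '2024-01-01T00:00:00'})
    if not status:  # False signals failure - the old 'if ec:' had this backwards
        print('work report publish failed')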
diff --git a/pilot/util/https.py b/pilot/util/https.py
index 0f9d15065..5613043ca 100644
--- a/pilot/util/https.py
+++ b/pilot/util/https.py
@@ -19,7 +19,7 @@
 # Authors:
 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017
 # - Mario Lassnig, mario.lassnig@cern.ch, 2017
-# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24
+# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024

 """Functions for https interactions."""

@@ -180,7 +180,7 @@ def https_setup(args: Any = None, version: str = ""):
         logger.warning(f'Failed to initialize SSL context .. skipped, error: {exc}')


-def request(url: str, data: dict = {}, plain: bool = False, secure: bool = True, ipv: str = 'IPv6') -> Any:
+def request(url: str, data: dict = None, plain: bool = False, secure: bool = True, ipv: str = 'IPv6') -> Any:
     """
     Send a request using HTTPS.

@@ -204,12 +204,14 @@ def request(url: str, data: dict = {}, plain: bool = False, secure: bool = True,
     :param data: data to send (dict)
     :param plain: if true, treats the response as a plain text (bool)
     :param secure: default: True, i.e. use certificates (bool)
-    :param ipv: internet protocol version (str).
+    :param ipv: internet protocol version (str)
     :returns:
         - :keyword:`dict` -- if everything went OK
         - `str` -- if ``plain`` parameter is `True`
-        - `None` -- if something went wrong
+        - `None` -- if something went wrong.
     """
+    if data is None:
+        data = {}
     _ctx.ssl_context = None  # certificates are not available on the grid, use curl
     # note that X509_USER_PROXY might change during running (in the case of proxy downloads), so
@@ -658,7 +660,7 @@ def get_server_command(url: str, port: str, cmd: str = 'getJob') -> str:
     return f'{url}/server/panda/{cmd}'


-def request2(url: str = "", data: dict = {}, secure: bool = True, compressed: bool = True) -> str or dict:
+def request2(url: str = "", data: dict = None, secure: bool = True, compressed: bool = True) -> str or dict:
     """
     Send a request using HTTPS (using urllib module).

@@ -668,6 +670,8 @@ def request2(url: str = "", data: dict = {}, secure: bool = True, compressed: bo
     :param compressed: compress data (bool)
     :return: server response (str or dict).
     """
+    if data is None:
+        data = {}
     # https might not have been set up if running in a [middleware] container
     if not _ctx.cacert:
         logger.debug('setting up unset https')
@@ -754,7 +758,7 @@
     return ret


-def request3(url: str, data: dict = {}) -> str:
+def request3(url: str, data: dict = None) -> str:
     """
     Send a request using HTTPS (using requests module).

@@ -762,6 +766,8 @@ def request3(url: str, data: dict = {}) -> str:
     :param data: data to send (dict)
     :return: server response (str).
     """
+    if data is None:
+        data = {}
     if not requests:
         logger.warning('cannot use requests module (not available)')
         return ""
diff --git a/pilot/util/jobmetrics.py b/pilot/util/jobmetrics.py
index 2eda49bf4..124703457 100644
--- a/pilot/util/jobmetrics.py
+++ b/pilot/util/jobmetrics.py
@@ -17,14 +17,14 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024

 """Functions for building job metrics."""

+import logging
 from os import environ
 from typing import Any

-import logging
 logger = logging.getLogger(__name__)


@@ -45,7 +45,7 @@ def get_job_metrics_entry(name: str, value: str) -> str:
     return job_metrics_entry


-def get_job_metrics(job: Any, extra: dict = {}) -> str:
+def get_job_metrics(job: Any, extra: dict = None) -> str:
     """
     Return a properly formatted job metrics string.

@@ -62,6 +62,8 @@ def get_job_metrics(job: Any, extra: dict = {}) -> str:
     :param extra: any extra information to be added (dict)
     :return: job metrics (str).
     """
+    if extra is None:
+        extra = {}
     user = environ.get('PILOT_USER', 'generic').lower()  # TODO: replace with singleton
     try:
         job_metrics_module = __import__(f'pilot.user.{user}.jobmetrics', globals(), locals(), [user], 0)
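get_job_metrics above dispatches to a per-experiment module with __import__, keyed on the PILOT_USER environment variable. A minimal sketch of the same pattern using importlib, assuming the pilot.user.<experiment> package layout shown throughout this diff (the usage lines are commented out since they need the pilot package on sys.path):

    import importlib
    import os


    def get_user_module(component: str) -> object:
        # load pilot.user.<user>.<component> for the experiment named in PILOT_USER
        user = os.environ.get('PILOT_USER', 'generic').lower()
        return importlib.import_module(f'pilot.user.{user}.{component}')


    # usage (requires the pilot package on sys.path); every experiment exposes the
    # same function names, so the caller never needs to know which one it runs for:
    # jobmetrics = get_user_module('jobmetrics')
    # print(jobmetrics.get_job_metrics(job=None))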