diff --git a/pandaserver/daemons/scripts/datasetManager.py b/pandaserver/daemons/scripts/datasetManager.py
index 00d14b3d7..9d0c5b830 100644
--- a/pandaserver/daemons/scripts/datasetManager.py
+++ b/pandaserver/daemons/scripts/datasetManager.py
@@ -31,7 +31,7 @@ def main(tbuf=None, **kwargs):
     # memory checker
     def _memoryCheck(str):
         try:
-            proc_status = "/proc/%d/status" % os.getpid()
+            proc_status = f"/proc/{os.getpid()}/status"
             procfile = open(proc_status)
             name = ""
             vmSize = ""
diff --git a/pandaserver/server/panda.py b/pandaserver/server/panda.py
index 73e6a2ee7..a7e70da17 100755
--- a/pandaserver/server/panda.py
+++ b/pandaserver/server/panda.py
@@ -81,12 +81,10 @@
 from pandaserver.userinterface.UserIF import (
     addHarvesterDialogs,
     avalancheTask,
-    changeJobPriorities,
     changeTaskAttributePanda,
     changeTaskModTimePanda,
     changeTaskPriority,
     changeTaskSplitRulePanda,
-    checkMergeGenerationStatus,
     checkSandboxFile,
     enableJumboJobs,
     execute_idds_workflow_command,
@@ -95,38 +93,19 @@
     get_files_in_datasets,
     get_job_statistics_per_site_label_resource,
     get_user_secrets,
-    getActiveDatasets,
-    getCloudSpecs,
-    getDisInUseForAnal,
-    getFilesInUseForAnal,
     getFullJobStatus,
-    getGShareStatus,
-    getHighestPrioJobStat,
     getJediTaskDetails,
     getJediTasksInTimeRange,
-    getJobIDsInTimeRange,
     getJobStatistics,
     getJobStatisticsForBamboo,
     getJobStatisticsPerSite,
     getJobStatisticsPerSiteResource,
-    getJobStatisticsPerUserSite,
-    getJobStatisticsWithLabel,
     getJobStatus,
-    getJobsToBeUpdated,
     getJumboJobDatasets,
-    getLFNsInUseForAnal,
-    getNumPilots,
     getPandaClientVer,
-    getPandaIDsSite,
     getPandaIDsWithTaskID,
-    getPandaIDwithJobExeID,
-    getPandIDsWithJobID,
-    getQueuedAnalJobs,
-    getRetryHistory,
     getScriptOfflineRunning,
-    getSerialNumberForGroupJob,
     getSiteSpecs,
-    getSlimmedFileInfoPandaIDs,
     getTaskParamsMap,
     getTaskStatus,
     getUserJobMetadata,
@@ -138,11 +117,7 @@
     killJobs,
     killTask,
     killUnfinishedJobs,
-    listTasksInShare,
     pauseTask,
-    queryJobInfoPerCloud,
-    queryLastFilesInDataset,
-    queryPandaIDs,
     reactivateTask,
     reassignJobs,
     reassignShare,
@@ -152,9 +127,7 @@
     reloadInput,
     reportWorkerStats,
     reportWorkerStats_jobtype,
-    resubmitJobs,
     resumeTask,
-    retryFailedJobsInActive,
     retryTask,
     send_command_to_job,
     set_user_secret,
@@ -162,7 +135,6 @@
     setNumSlotsForWP,
     submitJobs,
     sweepPQ,
-    updateProdDBUpdateTimes,
     updateServiceMetrics,
     updateWorkers,
     userIF,
diff --git a/pandaserver/srvcore/allowed_methods.py b/pandaserver/srvcore/allowed_methods.py
index fbb5df273..dbfaa2197 100644
--- a/pandaserver/srvcore/allowed_methods.py
+++ b/pandaserver/srvcore/allowed_methods.py
@@ -48,50 +48,24 @@
 allowed_methods += [
     "submitJobs",
     "getJobStatus",
-    "queryPandaIDs",
     "killJobs",
     "reassignJobs",
     "getJobStatistics",
     "getJobStatisticsPerSite",
-    "resubmitJobs",
-    "queryLastFilesInDataset",
-    "getPandaIDsSite",
-    "getJobsToBeUpdated",
-    "updateProdDBUpdateTimes",
     "getSiteSpecs",
-    "getCloudSpecs",
-    "queryJobInfoPerCloud",
-    "getJobIDsInTimeRange",
-    "getPandIDsWithJobID",
     "getFullJobStatus",
     "getJobStatisticsForBamboo",
-    "getFilesInUseForAnal",
     "getPandaClientVer",
-    "getSlimmedFileInfoPandaIDs",
-    "getQueuedAnalJobs",
-    "getHighestPrioJobStat",
-    "getActiveDatasets",
-    "getSerialNumberForGroupJob",
-    "checkMergeGenerationStatus",
-    "getNumPilots",
-    "retryFailedJobsInActive",
-    "getJobStatisticsWithLabel",
-    "getPandaIDwithJobExeID",
-    "getJobStatisticsPerUserSite",
-    "getDisInUseForAnal",
-    "getLFNsInUseForAnal",
     "getScriptOfflineRunning",
     "setDebugMode",
     "insertSandboxFileInfo",
     "checkSandboxFile",
"changeJobPriorities", "insertTaskParams", "killTask", "finishTask", "getJediTasksInTimeRange", "getJediTaskDetails", "retryTask", - "getRetryHistory", "changeTaskPriority", "reassignTask", "changeTaskAttributePanda", @@ -106,7 +80,6 @@ "reactivateTask", "getTaskStatus", "reassignShare", - "listTasksInShare", "getTaskParamsMap", "updateWorkers", "harvesterIsAlive", @@ -121,7 +94,6 @@ "updateServiceMetrics", "getUserJobMetadata", "getJumboJobDatasets", - "getGShareStatus", "sweepPQ", "get_job_statistics_per_site_label_resource", "relay_idds_command", diff --git a/pandaserver/taskbuffer/OraDBProxy.py b/pandaserver/taskbuffer/OraDBProxy.py index 73a17e33a..33ba6a6b0 100644 --- a/pandaserver/taskbuffer/OraDBProxy.py +++ b/pandaserver/taskbuffer/OraDBProxy.py @@ -4768,45 +4768,6 @@ def peekJob(self, pandaID, fromDefined, fromActive, fromArchived, fromWaiting, f job.jobStatus = "unknown" return job - # get PandaID with jobexeID - def getPandaIDwithJobExeID(self, jobexeID): - comment = " /* DBProxy.getPandaIDwithJobExeID */" - _logger.debug(f"getPandaIDwithJobExeID : {jobexeID}") - failedRetVal = (None, None, "") - # return for wrong jobexeID - if jobexeID in ["NULL", "", "None", None]: - return failedRetVal - # SQL - sql = "SELECT PandaID,jobDefinitionID,jobName FROM ATLAS_PANDA.jobsWaiting4 " - sql += "WHERE jobExecutionID=:jobexeID AND prodSourceLabel=:prodSourceLabel " - sql += "AND jobStatus=:jobStatus " - varMap = {} - varMap[":jobexeID"] = jobexeID - varMap[":jobStatus"] = "pending" - varMap[":prodSourceLabel"] = "managed" - try: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10 - self.cur.execute(sql + comment, varMap) - res = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # not found - if res is None: - _logger.debug(f"getPandaIDwithJobExeID : jobexeID {jobexeID} not found") - return failedRetVal - _logger.debug(f"getPandaIDwithJobExeID : {jobexeID} -> {str(res)}") - return res - except Exception: - # roll back - self._rollback() - errtype, errvalue = sys.exc_info()[:2] - _logger.error(f"getPandaIDwithJobExeID : {jobexeID} {errtype} {errvalue}") - return failedRetVal - # get PandaIDs with TaskID def getPandaIDsWithTaskID(self, jediTaskID): comment = " /* DBProxy.getPandaIDsWithTaskID */" @@ -5376,191 +5337,6 @@ def getNumWaitingJobsWithOutDS(self, outputDSs): # return empty list return False, {} - # get slimmed file info with PandaIDs - def getSlimmedFileInfoPandaIDs(self, pandaIDs): - comment = " /* DBProxy.getSlimmedFileInfoPandaIDs */" - _logger.debug(f"getSlimmedFileInfoPandaIDs : {pandaIDs[0]} len={len(pandaIDs)}") - try: - sqlL = "SELECT lfn,type,dataset FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID" - sqlA = "SELECT /*+ INDEX(tab FILES_ARCH_PANDAID_IDX)*/ lfn,type,dataset FROM ATLAS_PANDAARCH.filesTable_ARCH tab " - sqlA += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-60)" - retMap = {"inDS": [], "outDS": []} - # start transaction - self.conn.begin() - # select - for pandaID in pandaIDs: - # make sql - varMap = {} - varMap[":PandaID"] = pandaID - # select - self.cur.arraysize = 10000 - self.cur.execute(sqlL + comment, varMap) - resList = self.cur.fetchall() - # try archived if not found in filesTable4 - if len(resList) == 0: - self.cur.execute(sqlA + comment, varMap) - resList = self.cur.fetchall() - # append - for tmp_lfn, tmp_type, tmp_dataset in resList: - # skip lib.tgz - if tmp_lfn.endswith(".lib.tgz"): - continue - if tmp_type == "input": - if tmp_dataset not in retMap["inDS"]: - 
retMap["inDS"].append(tmp_dataset) - elif tmp_type == "output": - if tmp_dataset not in retMap["outDS"]: - retMap["outDS"].append(tmp_dataset) - # commit - if not self._commit(): - raise RuntimeError("Commit error") - _logger.debug(f"getSlimmedFileInfoPandaIDs : {str(retMap)}") - return retMap - except Exception: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error(f"getSlimmedFileInfoPandaIDs : {type} {value}") - # return empty list - return {} - - # get JobIDs in a time range - def getJobIDsInTimeRange(self, dn, timeRange, retJobIDs): - comment = " /* DBProxy.getJobIDsInTimeRange */" - _logger.debug(f"getJobIDsInTimeRange : {dn} {timeRange.strftime('%Y-%m-%d %H:%M:%S')}") - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ["", "NULL", None]: - compactDN = dn - tables = [ - "ATLAS_PANDA.jobsArchived4", - "ATLAS_PANDA.jobsActive4", - "ATLAS_PANDA.jobsWaiting4", - "ATLAS_PANDA.jobsDefined4", - ] - # select - for table in tables: - # make sql - if table == "ATLAS_PANDA.jobsArchived4": - sql = ( - 'SELECT /*+ INDEX_RS_ASC(TAB("JOBSARCHIVED4"."PRODUSERNAME")) NO_INDEX(TAB("JOBSARCHIVED4"."MODIFICATIONTIME")) */ jobDefinitionID FROM %s tab ' - % table - ) - elif table == "ATLAS_PANDA.jobsActive4": - sql = ( - 'SELECT /*+ INDEX_RS_ASC(TAB("JOBSACTIVE4"."PRODUSERNAME")) NO_INDEX(TAB("JOBSACTIVE4"."MODIFICATIONTIME")) */ jobDefinitionID FROM %s tab ' - % table - ) - else: - sql = f"SELECT jobDefinitionID FROM {table} " - sql += "WHERE prodUserName=:prodUserName AND modificationTime>:modificationTime " - sql += "AND prodSourceLabel=:prodSourceLabel AND lockedBy<>:ngLock GROUP BY jobDefinitionID" - varMap = {} - varMap[":prodUserName"] = compactDN - varMap[":prodSourceLabel"] = "user" - varMap[":ngLock"] = "jedi" - varMap[":modificationTime"] = timeRange - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - _logger.debug(sql + comment + str(varMap)) - self.cur.execute(sql + comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # append - for (tmpID,) in resList: - if tmpID not in retJobIDs: - retJobIDs.append(tmpID) - _logger.debug(f"getJobIDsInTimeRange : {str(retJobIDs)}") - return retJobIDs - except Exception: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error(f"getJobIDsInTimeRange : {type} {value}") - # return empty list - return [] - - # get PandaIDs for a JobID - def getPandIDsWithJobID(self, dn, jobID, idStatus, nJobs): - comment = " /* DBProxy.getPandIDsWithJobID */" - _logger.debug(f"getPandIDsWithJobID : {dn} {jobID}") - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ["", "NULL", None]: - compactDN = dn - tables = [ - "ATLAS_PANDA.jobsDefined4", - "ATLAS_PANDA.jobsActive4", - "ATLAS_PANDA.jobsWaiting4", - "ATLAS_PANDA.jobsArchived4", - ] - buildJobID = None - # select - for table in tables: - # skip if all jobs have already been gotten - if nJobs > 0 and len(idStatus) >= nJobs: - continue - # make sql - sql = f"SELECT PandaID,jobStatus,commandToPilot,prodSourceLabel,taskBufferErrorCode FROM {table} " - sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND prodSourceLabel in (:prodSourceLabel1,:prodSourceLabel2)" - varMap = {} - varMap[":prodUserName"] = compactDN - varMap[":jobDefinitionID"] = jobID - varMap[":prodSourceLabel1"] = "user" - varMap[":prodSourceLabel2"] = "panda" - # start transaction - self.conn.begin() 
- # select - self.cur.arraysize = 10000 - # select - _logger.debug(sql + comment + str(varMap)) - self.cur.execute(sql + comment, varMap) - resList = self.cur.fetchall() - # append - for ( - tmpID, - tmpStatus, - tmpCommand, - tmpProdSourceLabel, - tmpTaskBufferErrorCode, - ) in resList: - # ignore jobs retried by pilot since they have new PandaIDs with the same jobsetID/jobdefID - if tmpTaskBufferErrorCode in [ErrorCode.EC_PilotRetried]: - continue - # ignore old buildJob which was replaced by rebrokerage - if tmpProdSourceLabel == "panda": - if buildJobID is None: - # first buildJob - buildJobID = tmpID - elif buildJobID >= tmpID: - # don't append old one - continue - else: - # delete old one - del idStatus[buildJobID] - buildJobID = tmpID - # append - idStatus[tmpID] = (tmpStatus, tmpCommand) - # commit - if not self._commit(): - raise RuntimeError("Commit error") - _logger.debug(f"getPandIDsWithJobID : {str(idStatus)}") - return idStatus, buildJobID - except Exception: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error(f"getPandIDsWithJobID : {type} {value}") - # return empty list - return {}, None - # lock jobs for reassign def lockJobsForReassign( self, @@ -5828,254 +5604,6 @@ def queryPandaID(self, jobDefID): self._rollback() return None - # query job info per cloud - def queryJobInfoPerCloud(self, cloud, schedulerID=None): - comment = " /* DBProxy.queryJobInfoPerCloud */" - _logger.debug(f"queryJobInfoPerCloud : {cloud} {schedulerID}") - attrs = ["PandaID", "jobStatus", "jobName"] - sql0 = "SELECT " - for attr in attrs: - sql0 += f"{attr}," - sql0 = f"{sql0[:-1]} " - sql0 += "FROM %s " - sql0 += "WHERE cloud=:cloud " - varMap = {} - varMap[":cloud"] = cloud - if schedulerID is not None: - sql0 += "AND schedulerID=:schedulerID " - varMap[":schedulerID"] = schedulerID - try: - ids = [] - returnList = [] - # select - for table in [ - "ATLAS_PANDA.jobsActive4", - "ATLAS_PANDA.jobsWaiting4", - "ATLAS_PANDA.jobsDefined4", - ]: - # start transaction - self.conn.begin() - # select - sql = sql0 % table - self.cur.arraysize = 10000 - self.cur.execute(sql + comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # loop over all - for res in resList: - valMap = {} - # skip if already in the list - PandaID = res[0] - if PandaID in ids: - continue - # convert to map - for idx, attr in enumerate(attrs): - valMap[attr] = res[idx] - # append to list - ids.append(PandaID) - returnList.append(valMap) - # return - return returnList - except Exception: - type, value, traceBack = sys.exc_info() - _logger.error(f"queryJobInfoPerCloud : {type} {value}") - # roll back - self._rollback() - return None - - # get PandaIDs at Site - def getPandaIDsSite(self, site, status, limit): - comment = " /* DBProxy.getPandaIDsSite */" - _logger.debug(f"getPandaIDsSite : {site} {status} {limit}") - try: - ids = [] - # find table - if status in ["defined", "assigned"]: - table = "ATLAS_PANDA.jobsDefined4" - elif status in ["activated", "running", "holding", "transferring"]: - table = "ATLAS_PANDA.jobsActive4" - elif status in ["waiting"]: - table = "ATLAS_PANDA.jobsWaiting4" - elif status in ["finished", "failed"]: - table = "ATLAS_PANDA.jobsArchived4" - else: - _logger.error(f"unknown status:{status}") - return ids - # limit - limit = int(limit) - # SQL - sql = f"SELECT PandaID FROM {table} " - sql += "WHERE computingSite=:computingSite AND jobStatus=:jobStatus AND prodSourceLabel=:prodSourceLabel " - sql += "AND 
rownum<=:limit" - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[":computingSite"] = site - varMap[":jobStatus"] = status - varMap[":limit"] = limit - varMap[":prodSourceLabel"] = "managed" - self.cur.arraysize = limit - self.cur.execute(sql + comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # convert to list - for (id,) in res: - ids.append(id) - return ids - except Exception: - type, value, traceBack = sys.exc_info() - _logger.error(f"getPandaIDsSite : {type} {value}") - # roll back - self._rollback() - return [] - - # get PandaIDs to be updated in prodDB - def getPandaIDsForProdDB(self, limit, lockedby): - comment = " /* DBProxy.getPandaIDsForProdDB */" - _logger.debug(f"getPandaIDsForProdDB {limit}") - varMap = {} - varMap[":lockedby"] = lockedby - varMap[":limit"] = limit - varMap[":prodSourceLabel1"] = "managed" - sql0 = "PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID FROM %s " - sqlW = "WHERE prodSourceLabel IN (:prodSourceLabel1," - for tmpLabel in JobUtils.list_ptest_prod_sources: - tmpKey = f":prodSourceLabel_{tmpLabel}" - sqlW += tmpKey - sqlW += "," - varMap[tmpKey] = tmpLabel - sqlW = sqlW[:-1] - sqlW += ") AND lockedby=:lockedby " - sqlX = "AND stateChangeTime>prodDBUpdateTime " - sqlA = "AND (CASE WHEN stateChangeTime>prodDBUpdateTime THEN 1 ELSE null END) = 1 " - sql1 = "AND rownum<=:limit " - try: - retMap = {} - totalIDs = 0 - # select - for table in [ - "ATLAS_PANDA.jobsArchived4", - "ATLAS_PANDA.jobsActive4", - "ATLAS_PANDA.jobsWaiting4", - "ATLAS_PANDA.jobsDefined4", - ]: - # start transaction - self.conn.begin() - # select - sql = sql0 % table - if table in ["ATLAS_PANDA.jobsArchived4"]: - sql = "SELECT /*+ INDEX_RS_ASC(tab JOBSARCHIVED4_CHANGETIME) NO_INDEX(tab(PRODSOURCELABEL))*/ " + sql + " tab " + sqlW + sqlA - else: - sql = "SELECT " + sql + sqlW + sqlX - sql += sql1 - self.cur.arraysize = limit - _logger.debug(f"getPandaIDsForProdDB {sql + comment} {str(varMap)}") - self.cur.execute(sql + comment, varMap) - res = self.cur.fetchall() - _logger.debug(f"getPandaIDsForProdDB got {len(res)}") - # commit - if not self._commit(): - raise RuntimeError("Commit error") - for ( - PandaID, - jobStatus, - stateChangeTime, - attemptNr, - jobDefinitionID, - jobExecutionID, - ) in res: - # ignore dummy jobs in jobsDefined4 - if table == "ATLAS_PANDA.jobsDefined4" and (jobStatus not in ["defined", "assigned"]): - continue - # add status - if jobStatus not in retMap: - retMap[jobStatus] = [] - # append - retMap[jobStatus].append( - { - "PandaID": PandaID, - "attemptNr": attemptNr, - "stateChangeTime": stateChangeTime.strftime("%Y-%m-%d %H:%M:%S"), - "jobDefinitionID": jobDefinitionID, - "jobExecutionID": jobExecutionID, - } - ) - totalIDs += 1 - # limit - if totalIDs > limit: - break - _logger.debug(f"getPandaIDsForProdDB {limit} ret->{totalIDs}") - return retMap - except Exception: - type, value, traceBack = sys.exc_info() - _logger.error(f"getPandaIDsForProdDB : {type} {value}") - # roll back - self._rollback() - return {} - - # update prodDBUpdateTime - def updateProdDBUpdateTime(self, param): - comment = " /* DBProxy.updateProdDBUpdateTime */" - _logger.debug(f"updateProdDBUpdateTime {str(param)}") - sql0 = "UPDATE %s " - sql0 += "SET prodDBUpdateTime=TO_TIMESTAMP(:prodDBUpdateTime,'YYYY-MM-DD HH24:MI:SS') " - sql0 += "WHERE PandaID=:PandaID AND jobStatus=:jobStatus AND stateChangeTime=TO_TIMESTAMP(:stateChangeTime,'YYYY-MM-DD HH24:MI:SS') " - varMap = 
{} - varMap[":prodDBUpdateTime"] = param["stateChangeTime"] - varMap[":PandaID"] = param["PandaID"] - varMap[":jobStatus"] = param["jobStatus"] - varMap[":stateChangeTime"] = param["stateChangeTime"] - try: - # convert to string - if isinstance(varMap[":prodDBUpdateTime"], datetime.datetime): - varMap[":prodDBUpdateTime"] = varMap[":prodDBUpdateTime"].strftime("%Y-%m-%d %H:%M:%S") - if isinstance(varMap[":stateChangeTime"], datetime.datetime): - varMap[":stateChangeTime"] = varMap[":stateChangeTime"].strftime("%Y-%m-%d %H:%M:%S") - # set table - if param["jobStatus"] in ["defined", "assigned"]: - table = "ATLAS_PANDA.jobsDefined4" - elif param["jobStatus"] in ["waiting", "pending"]: - table = "ATLAS_PANDA.jobsWaiting4" - elif param["jobStatus"] in [ - "activated", - "sent", - "starting", - "running", - "holding", - "transferring", - ]: - table = "ATLAS_PANDA.jobsActive4" - elif param["jobStatus"] in ["finished", "failed", "cancelled", "closed"]: - table = "ATLAS_PANDA.jobsArchived4" - else: - _logger.error(f"invalid status {param['jobStatus']}") - return False - # set transaction - self.conn.begin() - # update - sql = sql0 % table - _logger.debug(sql + comment + str(varMap)) - self.cur.execute(sql + comment, varMap) - retU = self.cur.rowcount - # commit - if not self._commit(): - raise RuntimeError("Commit error") - _logger.debug(f"updateProdDBUpdateTime {param['PandaID']} ret={retU}") - if retU == 1: - return True - return False - except Exception: - type, value, traceBack = sys.exc_info() - _logger.error(f"updateProdDBUpdateTime : {type} {value}") - # roll back - self._rollback() - return False - # add metadata def addMetadata(self, pandaID, metadata, newStatus): comment = " /* DBProxy.addMetaData */" @@ -6661,98 +6189,15 @@ def getSerialNumber(self, datasetname, definedFreshFlag=None): _logger.error(f"getSerialNumber() : {type} {value}") return (-1, False) - # get serial number for group job - def getSerialNumberForGroupJob(self, name): - comment = " /* DBProxy.getSerialNumberForGroupJob */" - retVal = {"sn": "", "status": False} - try: - _logger.debug(f"getSerialNumberForGroupJob({name})") - # start transaction - self.conn.begin() - # get serial number - if self.backend == "oracle": - sql = "SELECT ATLAS_PANDA.GROUP_JOBID_SEQ.nextval FROM dual" - self.cur.execute(sql + comment, {}) - (sn,) = self.cur.fetchone() - else: - # panda_config.backend == 'mysql' - # fake sequence - sql = " INSERT INTO ATLAS_PANDA.GROUP_JOBID_SEQ (col) VALUES (NULL) " - self.cur.arraysize = 100 - self.cur.execute(sql + comment, {}) - sql2 = """ SELECT LAST_INSERT_ID() """ - self.cur.execute(sql2 + comment, {}) - (sn,) = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # return - retVal["sn"] = sn - retVal["status"] = True - _logger.debug(f"getSerialNumberForGroupJob : {name} {str(retVal)}") - return retVal - except Exception: - # roll back - self._rollback() - # error - errtype, errvalue = sys.exc_info()[:2] - _logger.error(f"getSerialNumberForGroupJob : {errtype} {errvalue}") - retVal["status"] = False - return retVal - - # change job priorities - def changeJobPriorities(self, newPrioMap): - comment = " /* DBProxy.changeJobPriorities */" - try: - _logger.debug("changeJobPriorities start") - sql = "UPDATE %s SET currentPriority=:currentPriority,assignedPriority=:assignedPriority " - sql += "WHERE PandaID=:PandaID" - # loop over all PandaIDs - for pandaID in newPrioMap: - newPrio = newPrioMap[pandaID] - varMap = {} - varMap[":PandaID"] = pandaID - 
varMap[":currentPriority"] = newPrio - varMap[":assignedPriority"] = newPrio - _logger.debug(f"changeJobPriorities PandaID={pandaID} -> prio={newPrio}") - # start transaction - self.conn.begin() - # try active tables - retU = None - for tableName in [ - "ATLAS_PANDA.jobsActive4", - "ATLAS_PANDA.jobsDefined4", - "ATLAS_PANDA.jobsWaiting4", - ]: - # execute - self.cur.execute((sql % tableName) + comment, varMap) - retU = self.cur.rowcount - if retU > 0: - break - # commit - if not self._commit(): - raise RuntimeError("Commit error") - _logger.debug(f"changeJobPriorities PandaID={pandaID} retU={retU}") - # return - _logger.debug("changeJobPriorities done") - return True, "" - except Exception: - # roll back - self._rollback() - # error - errtype, errvalue = sys.exc_info()[:2] - _logger.error(f"changeJobPriorities : {errtype} {errvalue}") - return False, "database error" - - # query files with map - def queryFilesWithMap(self, map): - comment = " /* DBProxy.queryFilesWithMap */" - _logger.debug("queryFilesWithMap()") - sql1 = f"SELECT PandaID,{FileSpec.columnNames()} FROM ATLAS_PANDA.filesTable4" - varMap = {} - for key in map: - if len(varMap) == 0: - sql1 += f" WHERE {key}=:{key}" + # query files with map + def queryFilesWithMap(self, map): + comment = " /* DBProxy.queryFilesWithMap */" + _logger.debug("queryFilesWithMap()") + sql1 = f"SELECT PandaID,{FileSpec.columnNames()} FROM ATLAS_PANDA.filesTable4" + varMap = {} + for key in map: + if len(varMap) == 0: + sql1 += f" WHERE {key}=:{key}" else: sql1 += f" AND {key}=:{key}" varMap[f":{key}"] = map[key] @@ -6933,325 +6378,6 @@ def getDatasetWithFile(self, lfn, jobPrioity=0): _logger.error(f"getDatasetWithFile : {lfn} : {errType} {errValue}") return {} - # get input files currently in use for analysis - def getFilesInUseForAnal(self, outDataset): - comment = " /* DBProxy.getFilesInUseForAnal */" - sqlSub = "SELECT destinationDBlock,PandaID FROM ATLAS_PANDA.filesTable4 " - sqlSub += "WHERE dataset=:dataset AND type IN (:type1,:type2) GROUP BY destinationDBlock,PandaID" - sqlPaA = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsDefined4 " - sqlPaA += "WHERE PandaID=:PandaID " - sqlPaA += "UNION " - sqlPaA += "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsActive4 " - sqlPaA += "WHERE PandaID=:PandaID " - sqlPan = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsArchived4 " - sqlPan += "WHERE PandaID=:PandaID AND modificationTime<=CURRENT_DATE " - sqlPan += "UNION " - sqlPan += "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDAARCH.jobsArchived " - sqlPan += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)" - sqlIdA = "SELECT PandaID,jobStatus FROM ATLAS_PANDA.jobsArchived4 " - sqlIdA += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sqlIdA += "AND prodSourceLabel=:prodSourceLabel1 " - sqlIdL = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ " - sqlIdL += "PandaID,jobStatus FROM ATLAS_PANDAARCH.jobsArchived tab " - sqlIdL += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sqlIdL += "AND prodSourceLabel=:prodSourceLabel1 AND modificationTime>(CURRENT_DATE-30) " - sqlDis = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 " - sqlDis += "WHERE PandaID=:PandaID AND type=:type AND dispatchDBlock IS NOT NULL AND modificationTime <= CURRENT_DATE" - sqlLfn = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ lfn,PandaID FROM ATLAS_PANDA.filesTable4 tab " - sqlLfn += "WHERE 
dispatchDBlock=:dispatchDBlock AND type=:type " - sqlLfn += "AND (destinationDBlockToken IS NULL OR destinationDBlockToken<>:noshadow) AND modificationTime<=CURRENT_DATE" - nTry = 3 - for iTry in range(nTry): - inputFilesList = [] - try: - # start transaction - self.conn.begin() - # get sub datasets - varMap = {} - varMap[":dataset"] = outDataset - varMap[":type1"] = "output" - varMap[":type2"] = "log" - _logger.debug(f"getFilesInUseForAnal : {sqlSub} {str(varMap)}") - self.cur.arraysize = 100000 - retS = self.cur.execute(sqlSub + comment, varMap) - res = self.cur.fetchall() - subDSpandaIDmap = {} - checkedPandaIDs = {} - for subDataset, pandaID in res: - # avoid redundunt lookup - if pandaID in checkedPandaIDs: - continue - if subDataset in subDSpandaIDmap: - # append jobs as running since they are not in archived tables - if pandaID not in subDSpandaIDmap[subDataset]: - checkedPandaIDs[pandaID] = "running" - subDSpandaIDmap[subDataset].append(pandaID) - continue - # look for jobdefID and userName - varMap = {} - varMap[":PandaID"] = pandaID - _logger.debug(f"getFilesInUseForAnal : {sqlPaA} {str(varMap)}") - retP = self.cur.execute(sqlPaA + comment, varMap) - resP = self.cur.fetchall() - if len(resP) != 0: - jobDefinitionID, prodUserName = resP[0] - else: - _logger.debug(f"getFilesInUseForAnal : {sqlPan} {str(varMap)}") - retP = self.cur.execute(sqlPan + comment, varMap) - resP = self.cur.fetchall() - if len(resP) != 0: - jobDefinitionID, prodUserName = resP[0] - else: - continue - # get PandaIDs with obdefID and userName - tmpPandaIDs = [] - varMap = {} - varMap[":prodUserName"] = prodUserName - varMap[":jobDefinitionID"] = jobDefinitionID - varMap[":prodSourceLabel1"] = "user" - _logger.debug(f"getFilesInUseForAnal : {sqlIdA} {str(varMap)}") - retID = self.cur.execute(sqlIdA + comment, varMap) - resID = self.cur.fetchall() - for tmpPandaID, tmpJobStatus in resID: - checkedPandaIDs[tmpPandaID] = tmpJobStatus - tmpPandaIDs.append(tmpPandaID) - _logger.debug(f"getFilesInUseForAnal : {sqlIdL} {str(varMap)}") - retID = self.cur.execute(sqlIdL + comment, varMap) - resID = self.cur.fetchall() - for tmpPandaID, tmpJobStatus in resID: - if tmpPandaID not in tmpPandaIDs: - checkedPandaIDs[tmpPandaID] = tmpJobStatus - tmpPandaIDs.append(tmpPandaID) - # append - if subDataset not in subDSpandaIDmap: - subDSpandaIDmap[subDataset] = [] - for tmpPandaID in tmpPandaIDs: - # reuse failed files if jobs are in Archived since they cannot change back to active - if checkedPandaIDs[tmpPandaID] in [ - "failed", - "cancelled", - "closed", - ]: - continue - # collect PandaIDs - subDSpandaIDmap[subDataset].append(tmpPandaID) - # loop over all sub datasets - for subDataset in subDSpandaIDmap: - activePandaIDs = subDSpandaIDmap[subDataset] - # skip empty - if activePandaIDs == []: - continue - # get dispatchDBlocks - pandaID = activePandaIDs[0] - varMap = {} - varMap[":PandaID"] = pandaID - varMap[":type"] = "input" - _logger.debug(f"getFilesInUseForAnal : {sqlDis} {str(varMap)}") - self.cur.arraysize = 10000 - retD = self.cur.execute(sqlDis + comment, varMap) - resD = self.cur.fetchall() - # get LFNs - for (disDataset,) in resD: - # use new style only - if not disDataset.startswith("user_disp."): - continue - varMap = {} - varMap[":dispatchDBlock"] = disDataset - varMap[":type"] = "input" - varMap[":noshadow"] = "noshadow" - _logger.debug(f"getFilesInUseForAnal : {sqlLfn} {str(varMap)}") - self.cur.arraysize = 100000 - retL = self.cur.execute(sqlLfn + comment, varMap) - resL = self.cur.fetchall() - # append - for 
lfn, filePandaID in resL: - # skip files used by archived failed or cancelled jobs - if filePandaID in activePandaIDs and lfn not in inputFilesList: - inputFilesList.append(lfn) - # commit - if not self._commit(): - RuntimeError("Commit error") - _logger.debug(f"getFilesInUseForAnal : {len(inputFilesList)}") - return inputFilesList - except Exception: - # roll back - self._rollback() - if iTry + 1 < nTry: - _logger.debug(f"inputFilesList retry : {iTry}") - time.sleep(random.randint(10, 20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error(f"inputFilesList({outDataset}) : {type} {value}") - return [] - - # get list of dis dataset to get input files in shadow - def getDisInUseForAnal(self, outDataset): - comment = " /* DBProxy.getDisInUseForAnal */" - sqlSub = "SELECT destinationDBlock,PandaID,status FROM ATLAS_PANDA.filesTable4 " - sqlSub += "WHERE dataset=:dataset AND type=:type1 GROUP BY destinationDBlock,PandaID,status" - sqlPaA = "SELECT jobStatus FROM ATLAS_PANDA.jobsDefined4 " - sqlPaA += "WHERE PandaID=:PandaID " - sqlPaA += "UNION " - sqlPaA += "SELECT jobStatus FROM ATLAS_PANDA.jobsActive4 " - sqlPaA += "WHERE PandaID=:PandaID " - sqlPan = "SELECT jobStatus FROM ATLAS_PANDA.jobsArchived4 " - sqlPan += "WHERE PandaID=:PandaID AND modificationTime<=CURRENT_DATE " - sqlPan += "UNION " - sqlPan += "SELECT jobStatus FROM ATLAS_PANDAARCH.jobsArchived " - sqlPan += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)" - sqlDis = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 " - sqlDis += "WHERE PandaID=:PandaID AND type=:type AND dispatchDBlock IS NOT NULL AND modificationTime <= CURRENT_DATE" - inputDisList = [] - try: - timeStart = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - _logger.debug(f"getDisInUseForAnal start for {outDataset}") - # start transaction - self.conn.begin() - # get sub datasets - varMap = {} - varMap[":dataset"] = outDataset - varMap[":type1"] = "log" - _logger.debug(f"getDisInUseForAnal : {sqlSub} {str(varMap)}") - self.cur.arraysize = 100000 - retS = self.cur.execute(sqlSub + comment, varMap) - res = self.cur.fetchall() - subDSpandaIDmap = {} - checkedPandaIDs = {} - for subDataset, pandaID, fileStatus in res: - # add map - if subDataset not in subDSpandaIDmap: - subDSpandaIDmap[subDataset] = [] - # check job status - if fileStatus != "ready": - varMap = {} - varMap[":PandaID"] = pandaID - _logger.debug(f"getDisInUseForAnal : {sqlPaA} {str(varMap)}") - retP = self.cur.execute(sqlPaA + comment, varMap) - resP = self.cur.fetchall() - if len(resP) != 0: - # append jobs as running since they are not in archived tables yet - checkedPandaIDs[pandaID] = "running" - subDSpandaIDmap[subDataset].append(pandaID) - else: - _logger.debug(f"getDisInUseForAnal : {sqlPan} {str(varMap)}") - retP = self.cur.execute(sqlPan + comment, varMap) - resP = self.cur.fetchall() - if len(resP) != 0: - (checkedPandaIDs[pandaID],) = resP[0] - # reuse failed files if jobs are in Archived since they cannot change back to active - if checkedPandaIDs[pandaID] in [ - "failed", - "cancelled", - "closed", - ]: - continue - # collect PandaIDs - subDSpandaIDmap[subDataset].append(pandaID) - else: - # not found - continue - else: - # no job lookup since file was sucessfully finished - checkedPandaIDs[pandaID] = "finished" - # collect PandaIDs - subDSpandaIDmap[subDataset].append(pandaID) - # loop over all sub datasets - for subDataset in subDSpandaIDmap: - activePandaIDs = subDSpandaIDmap[subDataset] - # skip empty - if activePandaIDs == 
[]: - continue - resDisList = [] - # get dispatchDBlocks - pandaID = activePandaIDs[0] - varMap = {} - varMap[":PandaID"] = pandaID - varMap[":type"] = "input" - _logger.debug(f"getDisInUseForAnal : {sqlDis} {str(varMap)}") - self.cur.arraysize = 10000 - retD = self.cur.execute(sqlDis + comment, varMap) - resD = self.cur.fetchall() - # get shadow dis - for (disDataset,) in resD: - # use new style only - if not disDataset.startswith("user_disp."): - continue - if disDataset not in resDisList: - resDisList.append(disDataset) - # append - if resDisList != []: - inputDisList.append((resDisList, activePandaIDs)) - # commit - if not self._commit(): - raise RuntimeError("Commit error") - timeDelta = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - timeStart - _logger.debug(f"getDisInUseForAnal end for {outDataset} len={len(inputDisList)} time={timeDelta.seconds}sec") - return inputDisList - except Exception: - # roll back - self._rollback() - errtype, errvalue = sys.exc_info()[:2] - _logger.error(f"getDisInUseForAnal({outDataset}) : {errtype} {errvalue}") - return None - - # get input LFNs currently in use for analysis with shadow dis - def getLFNsInUseForAnal(self, inputDisList): - comment = " /* DBProxy.getLFNsInUseForAnal */" - sqlLfn = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ lfn,PandaID FROM ATLAS_PANDA.filesTable4 tab " - sqlLfn += "WHERE dispatchDBlock=:dispatchDBlock AND type=:type " - sqlLfn += "AND (destinationDBlockToken IS NULL OR destinationDBlockToken<>:noshadow) AND modificationTime<=CURRENT_DATE" - inputFilesList = [] - try: - token = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat("/") - # loop over all shadow dis datasets - pandaIdLfnMap = {} - for disDatasetList, activePandaIDs in inputDisList: - for disDataset in disDatasetList: - # use new style only - if not disDataset.startswith("user_disp."): - continue - # read LFNs and PandaIDs - if disDataset not in pandaIdLfnMap: - # start transaction - self.conn.begin() - varMap = {} - varMap[":dispatchDBlock"] = disDataset - varMap[":type"] = "input" - varMap[":noshadow"] = "noshadow" - _logger.debug(f"getLFNsInUseForAnal : <{token}> {sqlLfn} {str(varMap)}") - timeStart = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - self.cur.arraysize = 100000 - retL = self.cur.execute(sqlLfn + comment, varMap) - resL = self.cur.fetchall() - # commit - timeDelta = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - timeStart - _logger.debug(f"getLFNsInUseForAnal : <{token}> {disDataset} time={timeDelta.seconds}sec commit") - if not self._commit(): - raise RuntimeError("Commit error") - # make map - pandaIdLfnMap[disDataset] = {} - for lfn, filePandaID in resL: - if filePandaID not in pandaIdLfnMap[disDataset]: - pandaIdLfnMap[disDataset][filePandaID] = [] - pandaIdLfnMap[disDataset][filePandaID].append(lfn) - _logger.debug(f"getLFNsInUseForAnal : <{token}> {disDataset} map made with len={len(resL)}") - # append - for disDataset in disDatasetList: - _logger.debug(f"getLFNsInUseForAnal : <{token}> {disDataset} list making pandaIDs={len(activePandaIDs)} fileLen={len(inputFilesList)}") - for activePandaID in activePandaIDs: - # skip files used by archived failed or cancelled jobs - if activePandaID in pandaIdLfnMap[disDataset]: - inputFilesList += pandaIdLfnMap[disDataset][activePandaID] - _logger.debug(f"getLFNsInUseForAnal : <{token}> {disDataset} done") - _logger.debug(f"getLFNsInUseForAnal : <{token}> {len(inputFilesList)}") - return inputFilesList - 
except Exception: - # roll back - self._rollback() - errtype, errvalue = sys.exc_info()[:2] - _logger.error(f"getLFNsInUseForAnal({str(inputDisList)}) : {errtype} {errvalue}") - return None - # update input files and return corresponding PandaIDs def updateInFilesReturnPandaIDs(self, dataset, status, fileLFN=""): comment = " /* DBProxy.updateInFilesReturnPandaIDs */" @@ -7517,94 +6643,6 @@ def queryPandaIDwithDataset(self, datasets): _logger.error(f"queryPandaIDwithDataset : {type} {value}") return [] - # query last files in datasets - def queryLastFilesInDataset(self, datasets): - comment = " /* DBProxy.queryLastFilesInDataset */" - _logger.debug(f"queryLastFilesInDataset({datasets})") - if len(datasets) == 0: - return [] - # make SQL query - sql1 = "SELECT lfn,PandaID FROM ATLAS_PANDA.filesTable4 WHERE dataset=:dataset AND type=:type ORDER BY lfn DESC" - sqlL = "SELECT processingType FROM %s WHERE PandaID=:PandaID " - sqlA = "UNION SELECT processingType FROM ATLAS_PANDAARCH.jobsArchived WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)" - sql2 = "SELECT lfn FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type" - # execute - try: - retMap = {} - for dataset in datasets: - # start transaction - self.conn.begin() - # select max LFN - varMap = {} - varMap[":type"] = "output" - varMap[":dataset"] = dataset - self.cur.arraysize = 100000 - self.cur.execute(sql1 + comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # found - retList = [] - for tmpLFN, pandaID in res: - # skip log.tgz - if re.search("\.log\.tgz(\.\d+)*$", tmpLFN) is not None: - continue - # start transaction - self.conn.begin() - self.cur.arraysize = 10 - # check processingType - processingType = None - for tmpTable in [ - "ATLAS_PANDA.jobsDefined4", - "ATLAS_PANDA.jobsActive4", - "ATLAS_PANDA.jobsArchived4", - ]: - varMap = {} - varMap[":PandaID"] = pandaID - if tmpTable == "ATLAS_PANDA.jobsArchived4": - self.cur.execute((sqlL % tmpTable) + sqlA + comment, varMap) - else: - self.cur.execute((sqlL % tmpTable) + comment, varMap) - resP = self.cur.fetchone() - if resP is not None: - processingType = resP[0] - break - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # job not found - if processingType is None: - continue - - # start transaction - self.conn.begin() - # select LFNs - varMap = {} - varMap[":PandaID"] = pandaID - varMap[":type"] = "output" - self.cur.arraysize = 1000 - self.cur.execute(sql2 + comment, varMap) - res = self.cur.fetchall() - for r in res: - retList.append(r[0]) - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # get only the largest one - break - # append - retMap[dataset] = retList - # return - _logger.debug(f"queryLastFilesInDataset : {str(retMap)}") - return retMap - except Exception: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error(f"queryLastFilesInDataset : {type} {value}") - return {} - # query PandaID with filenames def queryPandaIDwithLFN(self, vlfns): comment = " /* DBProxy.queryPandaIDwithLFN */" @@ -7799,71 +6837,6 @@ def getJobStatistics( _logger.error(f"getJobStatistics : {type} {value}") return {} - # get job statistics with label - def getJobStatisticsWithLabel(self, siteStr=""): - comment = " /* DBProxy.getJobStatisticsWithLabel */" - _logger.debug(f"getJobStatisticsWithLabel({siteStr})") - sql0 = "SELECT computingSite,prodSourceLabel,jobStatus,COUNT(*) FROM %s " - # site - tmpSiteMap = {} - if siteStr != 
"": - sql0 += "WHERE computingSite IN (" - # loop over all sites - idxSite = 1 - for tmpSite in siteStr.split(","): - tmpSiteKey = f":site{idxSite}" - sql0 += f"{tmpSiteKey}," - tmpSiteMap[tmpSiteKey] = tmpSite - idxSite += 1 - sql0 = sql0[:-1] + ") " - sql0 += "GROUP BY computingSite,prodSourceLabel,jobStatus " - sqlMV = re.sub("COUNT\(\*\)", "SUM(num_of_jobs)", sql0) - sqlMV = re.sub("SELECT ", "SELECT /*+ RESULT_CACHE */ ", sqlMV) - tables = ["ATLAS_PANDA.jobsActive4", "ATLAS_PANDA.jobsDefined4"] - returnMap = {} - try: - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - self.cur.arraysize = 10000 - if table == "ATLAS_PANDA.jobsActive4": - sqlExeTmp = (sqlMV + comment) % "ATLAS_PANDA.MV_JOBSACTIVE4_STATS" - else: - sqlExeTmp = (sql0 + comment) % table - self.cur.execute(sqlExeTmp, tmpSiteMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # create map - for computingSite, prodSourceLabel, jobStatus, nCount in res: - # FIXME - # ignore some job status since they break APF - if jobStatus in ["merging"]: - continue - # add site - if computingSite not in returnMap: - returnMap[computingSite] = {} - # add SourceLabel - if prodSourceLabel not in returnMap[computingSite]: - returnMap[computingSite][prodSourceLabel] = {} - # add jobstatus - if jobStatus not in returnMap[computingSite][prodSourceLabel]: - returnMap[computingSite][prodSourceLabel][jobStatus] = 0 - # add - returnMap[computingSite][prodSourceLabel][jobStatus] += nCount - # return - _logger.debug(f"getJobStatisticsWithLabel() : {str(returnMap)}") - return returnMap - except Exception: - # roll back - self._rollback() - errType, errValue = sys.exc_info()[:2] - _logger.error(f"getJobStatisticsWithLabel : {errType} {errValue}") - return {} - # get job statistics for brokerage def getJobStatisticsBrokerage(self, minPriority=None, maxPriority=None): comment = " /* DBProxy.getJobStatisticsBrokerage */" @@ -7985,314 +6958,71 @@ def getJobStatisticsAnalBrokerage(self, minPriority=None): sql0 += "GROUP BY cloud,computingSite,jobStatus,processingType" # sql for materialized view sqlMV = re.sub("COUNT\(\*\)", "SUM(num_of_jobs)", sql0) - sqlMV = re.sub(":minPriority", "TRUNC(:minPriority,-1)", sqlMV) - sqlMV = re.sub("SELECT ", "SELECT /*+ RESULT_CACHE */ ", sqlMV) - tables = ["ATLAS_PANDA.jobsActive4", "ATLAS_PANDA.jobsDefined4"] - ret = {} - nTry = 3 - for iTry in range(nTry): - try: - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[":prodSourceLabel1"] = "user" - varMap[":prodSourceLabel2"] = "panda" - if minPriority is not None: - varMap[":minPriority"] = minPriority - self.cur.arraysize = 10000 - if table == "ATLAS_PANDA.jobsActive4": - self.cur.execute( - (sqlMV + comment) % "ATLAS_PANDA.MV_JOBSACTIVE4_STATS", - varMap, - ) - else: - self.cur.execute((sql0 + comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # create map - for computingSite, jobStatus, processingType, count in res: - # add site - if computingSite not in ret: - ret[computingSite] = {} - # add processingType - if processingType not in ret[computingSite]: - ret[computingSite][processingType] = {} - # add jobStatus - if jobStatus not in ret[computingSite][processingType]: - ret[computingSite][processingType][jobStatus] = count - # for zero - for site in ret: - siteVal = ret[site] - for pType in siteVal: - typeVal = siteVal[pType] - for stateItem in [ - 
"defined", - "assigned", - "activated", - "running", - ]: - if stateItem not in typeVal: - typeVal[stateItem] = 0 - # return - _logger.debug(f"getJobStatisticsAnalBrokerage -> {str(ret)}") - return ret - except Exception: - # roll back - self._rollback() - if iTry + 1 < nTry: - _logger.debug(f"getJobStatisticsAnalBrokerage retry : {iTry}") - time.sleep(2) - continue - type, value, traceBack = sys.exc_info() - _logger.error(f"getJobStatisticsAnalBrokerage : {type} {value}") - return {} - - # get highest prio jobs - def getHighestPrioJobStat(self): - comment = " /* DBProxy.getHighestPrioJobStat */" - _logger.debug("getHighestPrioJobStat()") - sql0 = "SELECT cloud,max(currentPriority) FROM %s WHERE " - sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY cloud" - sqlC = "SELECT COUNT(*) FROM %s WHERE " - sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " - sqlC += "cloud=:cloud AND currentPriority=:currentPriority" - tables = ["ATLAS_PANDA.jobsActive4", "ATLAS_PANDA.jobsDefined4"] - ret = {} - try: - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[":prodSourceLabel"] = "managed" - if table == "ATLAS_PANDA.jobsActive4": - varMap[":jobStatus1"] = "activated" - varMap[":jobStatus2"] = "dummy" - else: - varMap[":jobStatus1"] = "defined" - varMap[":jobStatus2"] = "assigned" - self.cur.arraysize = 100 - _logger.debug((sql0 + comment) % table) - self.cur.execute((sql0 + comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # create map - for cloud, maxPriority in res: - # add cloud - if cloud not in ret: - ret[cloud] = {} - # add max priority - prioKey = "highestPrio" - nNotRunKey = "nNotRun" - getNumber = False - if prioKey not in ret[cloud]: - ret[cloud][prioKey] = maxPriority - ret[cloud][nNotRunKey] = 0 - getNumber = True - else: - # use highest one - if ret[cloud][prioKey] < maxPriority: - ret[cloud][prioKey] = maxPriority - # reset - ret[cloud][nNotRunKey] = 0 - getNumber = True - elif ret[cloud][prioKey] == maxPriority: - getNumber = True - # get number of jobs with highest prio - if getNumber: - varMap[":cloud"] = cloud - varMap[":currentPriority"] = maxPriority - self.cur.arraysize = 10 - _logger.debug((sqlC + comment) % table) - self.cur.execute((sqlC + comment) % table, varMap) - resC = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - ret[cloud][nNotRunKey] += resC[0] - # return - return ret - except Exception: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error(f"getHighestPrioJobStat : {type} {value}") - return {} - - # get highest prio jobs per process group - def getHighestPrioJobStatPerPG(self, useMorePG=False): - comment = " /* DBProxy.getHighestPrioJobStatPerPG */" - _logger.debug("getHighestPrioJobStatPerPG()") - if useMorePG is False: - sql0 = "SELECT cloud,max(currentPriority),processingType FROM %s WHERE " - sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY cloud,processingType" - sqlC = "SELECT COUNT(*) FROM %s WHERE " - sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " - sqlC += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType" - else: - sql0 = "SELECT cloud,max(currentPriority),processingType,coreCount,workingGroup FROM %s WHERE " - sql0 += 
"prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) " - sql0 += "GROUP BY cloud,processingType,coreCount,workingGroup" - sqlC = "SELECT COUNT(*) FROM %s WHERE " - sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " - sqlC += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType AND " - sqlC += "coreCount=:coreCount AND workingGroup=:workingGroup" - sqlCN = "SELECT COUNT(*) FROM %s WHERE " - sqlCN += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " - sqlCN += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType AND " - sqlCN += "coreCount IS NULL AND workingGroup=:workingGroup" - tables = ["ATLAS_PANDA.jobsActive4", "ATLAS_PANDA.jobsDefined4"] - ret = {} - try: - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[":prodSourceLabel"] = "managed" - if table == "ATLAS_PANDA.jobsActive4": - varMap[":jobStatus1"] = "activated" - varMap[":jobStatus2"] = "dummy" - else: - varMap[":jobStatus1"] = "defined" - varMap[":jobStatus2"] = "assigned" - self.cur.arraysize = 100 - _logger.debug((sql0 + comment) % table + str(varMap)) - self.cur.execute((sql0 + comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # create map - for tmpItem in res: - if useMorePG is False: - cloud, maxPriority, processingType = tmpItem - origCloud = cloud - origProcessingType = processingType - else: - ( - origCloud, - maxPriority, - origProcessingType, - coreCount, - workingGroup, - ) = tmpItem - # convert cloud and processingType for extended process group - if useMorePG == ProcessGroups.extensionLevel_1: - # extension level 1 - cloud, processingType = ProcessGroups.converCPTforEPG(origCloud, origProcessingType, coreCount) - else: - # extension level 2 - cloud, processingType = ProcessGroups.converCPTforEPG(origCloud, origProcessingType, coreCount, workingGroup) - # add cloud - if cloud not in ret: - ret[cloud] = {} - # get process group - processGroup = ProcessGroups.getProcessGroup(processingType) - # add process group - if processGroup not in ret[cloud]: - ret[cloud][processGroup] = {} - # add max priority - prioKey = "highestPrio" - nNotRunKey = "nNotRun" - getNumber = False - if prioKey not in ret[cloud][processGroup]: - ret[cloud][processGroup][prioKey] = maxPriority - ret[cloud][processGroup][nNotRunKey] = 0 - getNumber = True - else: - # use highest one - if ret[cloud][processGroup][prioKey] < maxPriority: - ret[cloud][processGroup][prioKey] = maxPriority - # reset - ret[cloud][processGroup][nNotRunKey] = 0 - getNumber = True - elif ret[cloud][processGroup][prioKey] == maxPriority: - getNumber = True - # get number of jobs with highest prio - if getNumber: - varMap[":cloud"] = origCloud - varMap[":currentPriority"] = maxPriority - varMap[":processingType"] = origProcessingType - if useMorePG is not False: - varMap[":workingGroup"] = workingGroup - if coreCount is not None: - varMap[":coreCount"] = coreCount - self.cur.arraysize = 10 - _logger.debug((sqlC + comment) % table + str(varMap)) - self.cur.execute((sqlC + comment) % table, varMap) - resC = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - ret[cloud][processGroup][nNotRunKey] += resC[0] - # return - _logger.debug(f"getHighestPrioJobStatPerPG -> {ret}") - return ret - except Exception: - # roll back - self._rollback() - type, value, 
traceBack = sys.exc_info() - _logger.error(f"getHighestPrioJobStatPerPG : {type} {value}") - return {} - - # get queued analysis jobs at a site - def getQueuedAnalJobs(self, site, dn): - comment = " /* DBProxy.getQueuedAnalJobs */" - _logger.debug(f"getQueuedAnalJobs({site},{dn})") - sql0 = "SELECT COUNT(*),jobStatus FROM %s WHERE " - sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) " - sql0 += "AND computingSite=:computingSite AND prodUserName != :prodUserName " - sql0 += "GROUP BY jobStatus " - tables = ["ATLAS_PANDA.jobsActive4", "ATLAS_PANDA.jobsDefined4"] - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ["", "NULL", None]: - compactDN = dn - nQueued = 0 - nRunning = 0 - # loop over all tables - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[":prodSourceLabel"] = "user" - varMap[":computingSite"] = site - varMap[":prodUserName"] = compactDN - if table == "ATLAS_PANDA.jobsActive4": - varMap[":jobStatus1"] = "activated" - varMap[":jobStatus2"] = "running" - else: - varMap[":jobStatus1"] = "defined" - varMap[":jobStatus2"] = "assigned" - self.cur.arraysize = 10 - self.cur.execute((sql0 + comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # sum - for cnt, jobStatus in res: - if jobStatus == "running": - nRunning += cnt + sqlMV = re.sub(":minPriority", "TRUNC(:minPriority,-1)", sqlMV) + sqlMV = re.sub("SELECT ", "SELECT /*+ RESULT_CACHE */ ", sqlMV) + tables = ["ATLAS_PANDA.jobsActive4", "ATLAS_PANDA.jobsDefined4"] + ret = {} + nTry = 3 + for iTry in range(nTry): + try: + for table in tables: + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[":prodSourceLabel1"] = "user" + varMap[":prodSourceLabel2"] = "panda" + if minPriority is not None: + varMap[":minPriority"] = minPriority + self.cur.arraysize = 10000 + if table == "ATLAS_PANDA.jobsActive4": + self.cur.execute( + (sqlMV + comment) % "ATLAS_PANDA.MV_JOBSACTIVE4_STATS", + varMap, + ) else: - nQueued += cnt - # return - return {"queued": nQueued, "running": nRunning} - except Exception: - # roll back - self._rollback() - errType, errValue = sys.exc_info()[:2] - _logger.error(f"getQueuedAnalJobs : {errType} {errValue}") - return {} + self.cur.execute((sql0 + comment) % table, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError("Commit error") + # create map + for computingSite, jobStatus, processingType, count in res: + # add site + if computingSite not in ret: + ret[computingSite] = {} + # add processingType + if processingType not in ret[computingSite]: + ret[computingSite][processingType] = {} + # add jobStatus + if jobStatus not in ret[computingSite][processingType]: + ret[computingSite][processingType][jobStatus] = count + # for zero + for site in ret: + siteVal = ret[site] + for pType in siteVal: + typeVal = siteVal[pType] + for stateItem in [ + "defined", + "assigned", + "activated", + "running", + ]: + if stateItem not in typeVal: + typeVal[stateItem] = 0 + # return + _logger.debug(f"getJobStatisticsAnalBrokerage -> {str(ret)}") + return ret + except Exception: + # roll back + self._rollback() + if iTry + 1 < nTry: + _logger.debug(f"getJobStatisticsAnalBrokerage retry : {iTry}") + time.sleep(2) + continue + type, value, traceBack = sys.exc_info() + _logger.error(f"getJobStatisticsAnalBrokerage : {type} {value}") + return {} # get computingSite and destinationSE for a dataset 
def getDestSE(self, dsname, fromArch=False): @@ -8659,57 +7389,6 @@ def getJobStatisticsPerProcessingType(self, useMorePG=False): _logger.error(f"getJobStatisticsPerProcessingType : {type} {value}") return {} - # get the number of waiting jobs per site and user - def getJobStatisticsPerUserSite(self): - comment = " /* DBProxy.getJobStatisticsPerUserSite */" - _logger.debug("getJobStatisticsPerUserSite()") - sqlN = "SELECT COUNT(*),prodUserID,computingSite FROM %s " - sqlN += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus=:jobStatus GROUP BY prodUserID,computingSite" - ret = {} - try: - for table in ("ATLAS_PANDA.jobsActive4", "ATLAS_PANDA.jobsDefined4"): - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 100000 - # select - if table == "ATLAS_PANDA.jobsActive4": - jobStatus = "activated" - else: - jobStatus = "assigned" - varMap = {} - varMap[":prodSourceLabel1"] = "user" - varMap[":prodSourceLabel2"] = "panda" - varMap[":jobStatus"] = jobStatus - self.cur.execute((sqlN + comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # create map - for cnt, prodUserName, computingSite in res: - # add site - if computingSite not in ret: - ret[computingSite] = {} - # add user - if prodUserName not in ret[computingSite]: - ret[computingSite][prodUserName] = { - "assigned": 0, - "activated": 0, - } - # add info - ret[computingSite][prodUserName][jobStatus] = cnt - # return - _logger.debug(f"getJobStatisticsPerUserSite -> {str(ret)}") - return ret - except Exception: - # roll back - self._rollback() - # error - errtype, errvalue = sys.exc_info()[:2] - _logger.error(f"getJobStatisticsPerUserSite : {errtype} {errvalue}") - return {} - # get number of activated analysis jobs def getNAnalysisJobs(self, nProcesses): comment = " /* DBProxy.getNAnalysisJobs */" @@ -10197,173 +8876,6 @@ def getArchiveTables(self): # return return ["ATLAS_PANDAARCH.jobsArchived"] - # get JobIDs in a time range - def getJobIDsInTimeRangeLog(self, dn, timeRange, retJobIDs): - comment = " /* DBProxy.getJobIDsInTimeRangeLog */" - _logger.debug(f"getJobIDsInTimeRangeLog : {dn} {timeRange.strftime('%Y-%m-%d %H:%M:%S')}") - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ["", "NULL", None]: - compactDN = dn - # get list of archived tables - tables = self.getArchiveTables() - # select - for table in tables: - # make sql - sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODSOURCELABEL_IDX JOBS_PRODUSERNAME_IDX) */ " - sql += f"jobDefinitionID FROM {table} tab " - sql += "WHERE prodUserName=:prodUserName AND modificationTime>:modificationTime " - sql += "AND prodSourceLabel=:prodSourceLabel AND lockedBy<>:ngLock GROUP BY jobDefinitionID" - varMap = {} - varMap[":prodUserName"] = compactDN - varMap[":prodSourceLabel"] = "user" - varMap[":ngLock"] = "jedi" - varMap[":modificationTime"] = timeRange - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - _logger.debug(sql + comment + str(varMap)) - self.cur.execute(sql + comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # append - for (tmpID,) in resList: - if tmpID not in retJobIDs: - retJobIDs.append(tmpID) - _logger.debug(f"getJobIDsInTimeRangeLog : {str(retJobIDs)}") - return retJobIDs - except Exception: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - 
_logger.error(f"getJobIDsInTimeRangeLog : {type} {value}") - # return empty list - return retJobIDs - - # get PandaIDs for a JobID - def getPandIDsWithJobIDLog(self, dn, jobID, idStatus, nJobs, buildJobID=None): - comment = " /* Proxy.getPandIDsWithJobIDLog */" - _logger.debug(f"getPandIDsWithJobIDLog : {dn} {jobID}") - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ["", "NULL", None]: - compactDN = dn - # get list of archived tables - tables = self.getArchiveTables() - # select - for table in tables: - # skip if all jobs have already been gotten - if nJobs > 0 and len(idStatus) >= nJobs: - continue - # make sql - sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ " - sql += f"PandaID,jobStatus,commandToPilot,prodSourceLabel,taskBufferErrorCode FROM {table} tab " - sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>(CURRENT_DATE-30) " - varMap = {} - varMap[":prodUserName"] = compactDN - varMap[":jobDefinitionID"] = jobID - varMap[":prodSourceLabel1"] = "user" - varMap[":prodSourceLabel2"] = "panda" - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - # select - _logger.debug(sql + comment + str(varMap)) - self.cur.execute(sql + comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # append - for ( - tmpID, - tmpStatus, - tmpCommand, - tmpProdSourceLabel, - tmpTaskBufferErrorCode, - ) in resList: - # ignore jobs retried by pilot since they have new PandaIDs with the same jobsetID/jobdefID - if tmpTaskBufferErrorCode in [ErrorCode.EC_PilotRetried]: - continue - # ignore old buildJob which was replaced by rebrokerage - if tmpProdSourceLabel == "panda": - if buildJobID is None: - # first buildJob - buildJobID = tmpID - elif buildJobID >= tmpID: - # don't append old one - continue - else: - # delete old one - del idStatus[buildJobID] - buildJobID = tmpID - # append - if tmpID not in idStatus: - idStatus[tmpID] = (tmpStatus, tmpCommand) - _logger.debug(f"getPandIDsWithJobIDLog : {str(idStatus)}") - return idStatus - except Exception: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error(f"getPandIDsWithJobIDLog : {type} {value}") - # return empty list - return {} - - # get PandaIDs for a JobsetID or JobdefID in jobsArchived - def getPandIDsWithIdInArch(self, prodUserName, id, isJobset): - comment = " /* Proxy.getPandIDsWithIdInArch */" - _logger.debug(f"getPandIDsWithIdInArch : {prodUserName} {id} {isJobset}") - try: - # make sql - if isJobset: - sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBSETID_IDX) */ " - else: - sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ " - sql += "PandaID FROM ATLAS_PANDAARCH.jobsArchived tab " - sql += "WHERE prodUserName=:prodUserName " - sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>(CURRENT_DATE-30) " - if isJobset: - sql += "AND jobsetID=:jobID " - else: - sql += "AND jobDefinitionID=:jobID " - varMap = {} - varMap[":prodUserName"] = prodUserName - varMap[":jobID"] = id - varMap[":prodSourceLabel1"] = "user" - varMap[":prodSourceLabel2"] = "panda" - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 1000000 - # select - _logger.debug(sql + 
comment + str(varMap)) - self.cur.execute(sql + comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # append - pandaIDs = [] - for (tmpID,) in resList: - pandaIDs.append(tmpID) - _logger.debug(f"getPandIDsWithIdInArch : {prodUserName} {id} -> {str(pandaIDs)}") - return pandaIDs - except Exception: - # roll back - self._rollback() - errType, errValue = sys.exc_info()[:2] - _logger.error(f"getPandIDsWithIdInArch : {errType} {errValue}") - # return empty list - return [] - # peek at job def peekJobLog(self, pandaID, days=None): comment = " /* DBProxy.peekJobLog */" @@ -10472,50 +8984,6 @@ def peekJobLog(self, pandaID, days=None): # return None return None - # get active datasets - def getActiveDatasets(self, computingSite, prodSourceLabel): - comment = " /* DBProxy.getActiveDatasets */" - _logger.debug(f"getActiveDatasets({computingSite},{prodSourceLabel})") - varMap = {} - varMap[":computingSite"] = computingSite - varMap[":jobStatus1"] = "assigned" - varMap[":jobStatus2"] = "activated" - varMap[":jobStatus3"] = "waiting" - varMap[":prodSourceLabel"] = prodSourceLabel - try: - retList = [] - for table in ["jobsActive4", "jobsDefined4", "jobsWaiting4"]: - if table == "jobsActive4": - sql0 = f"SELECT distinct prodDBlock FROM ATLAS_PANDA.{table} " - else: - sql0 = f"SELECT distinct prodDBlock FROM ATLAS_PANDA.{table} " - sql0 += "WHERE computingSite=:computingSite AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3) " - sql0 += "AND prodSourceLabel=:prodSourceLabel" - # start transaction - self.conn.begin() - # select - self.cur.execute(sql0 + comment, varMap) - resSs = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError("Commit error") - # append - for (prodDBlock,) in resSs: - if prodDBlock not in retList: - retList.append(prodDBlock) - # make string - retStr = "" - for tmpItem in retList: - retStr += f"{tmpItem}," - retStr = retStr[:-1] - return retStr - except Exception: - # roll back - self._rollback() - errType, errValue = sys.exc_info()[:2] - _logger.error(f"getActiveDatasets : {errType} {errValue}") - return "" - # check status of all sub datasets to trigger Notifier def checkDatasetStatusForNotifier(self, jobsetID, jobDefinitionID, prodUserName): comment = " /* DBProxy.checkDatasetStatusForNotifier */" @@ -11527,46 +9995,6 @@ def getJobsetIDforPandaID(self, pandaID, jediTaskID): return resT[0] return None - # get retry history - def getRetryHistoryJEDI(self, jediTaskID): - comment = " /* DBProxy.getRetryHistoryJEDI */" - methodName = comment.split(" ")[-2].split(".")[-1] - methodName += f" " - _logger.debug(f"{methodName} start") - try: - # set autocommit on - self.conn.begin() - self.cur.arraysize = 1000000 - # get - varMap = {} - varMap[":jediTaskID"] = jediTaskID - sql = f"SELECT oldPandaID,newPandaID FROM {panda_config.schemaJEDI}.JEDI_Job_Retry_History " - sql += "WHERE jediTaskID=:jediTaskID GROUP BY oldPandaID,newPandaID " - for tmpType in EventServiceUtils.relationTypesForJS: - tmpKey = f":{tmpType}" - sql += f"{tmpKey}," - varMap[tmpKey] = tmpType - sql = sql[:-1] - sql += ")) " - self.cur.execute(sql + comment, varMap) - resG = self.cur.fetchall() - retMap = {} - for oldPandaID, newPandaID in resG: - if oldPandaID not in retMap: - retMap[oldPandaID] = [] - retMap[oldPandaID].append(newPandaID) - # commit - if not self._commit(): - raise RuntimeError("Commit error") - _logger.debug(f"{methodName} return len={len(retMap)}") - return retMap - except Exception: - # roll back - 
self._rollback() - # error - self.dumpErrorMessage(_logger, methodName) - return None - # get original consumers def getOriginalConsumers(self, jediTaskID, jobsetID, pandaID): comment = " /* DBProxy.getOriginalConsumers */" @@ -19899,39 +18327,6 @@ def reassignShare(self, jedi_task_ids, gshare, reassign_running): tmp_log.error(f"reassignShare : {type} {value}") return -1, None - def listTasksInShare(self, gshare, status): - """ - Lists all task ids corresponding to share and in specified status - @param gshare: global share - @param status: status - """ - - comment = " /* DBProxy.listTasksInShare */" - method_name = comment.split(" ")[-2].split(".")[-1] - tmp_log = LogWrapper(_logger, method_name) - tmp_log.debug("start") - - try: - # Prepare the bindings and var map - var_map = {":gshare": gshare, "status": status} - - sql = """ - SELECT jeditaskid FROM ATLAS_PANDA.jedi_tasks - WHERE gshare=:gshare AND status=:status - """ - - self.cur.execute(sql + comment, var_map) - jedi_task_ids = [entry[0] for entry in self.cur.fetchall()] - - tmp_log.debug("done") - return 0, jedi_task_ids - - except Exception: - type, value, traceBack = sys.exc_info() - _logger.error(f"{comment}: {sql} {var_map}") - _logger.error(f"{comment}: {type} {value}") - return -1, None - def getCommands(self, harvester_id, n_commands): """ Gets n commands in status 'new' for a particular harvester instance and updates their status to 'retrieved' diff --git a/pandaserver/taskbuffer/TaskBuffer.py b/pandaserver/taskbuffer/TaskBuffer.py index c9be112d5..b2d8b97e0 100755 --- a/pandaserver/taskbuffer/TaskBuffer.py +++ b/pandaserver/taskbuffer/TaskBuffer.py @@ -204,6 +204,7 @@ def storeJobs( userCountry = None useExpress = False nExpressJobs = 0 + groupJobSerialNum = 0 # group jobs are not in use anymore, setting a 0 default useDebugMode = False siteMapper = self.get_site_mapper() @@ -313,17 +314,6 @@ def storeJobs( # get DB proxy proxy = self.proxyPool.getProxy() tmpLog.debug(f"got proxy") - # get group job serial number - groupJobSerialNum = 0 - if len(jobs) > 0 and (jobs[0].prodSourceLabel in JobUtils.analy_sources) and (not jobs[0].processingType in ["merge", "unmerge"]): - for tmpFile in jobs[-1].Files: - if tmpFile.type in ["output", "log"] and "$GROUPJOBSN" in tmpFile.lfn: - tmpLog.debug(f"getting group job serial number") - tmpSnRet = proxy.getSerialNumberForGroupJob(user) - if tmpSnRet["status"]: - groupJobSerialNum = tmpSnRet["sn"] - break - tmpLog.debug(f"got group job serial number") # get total number of files totalNumFiles = 0 for job in jobs: @@ -665,23 +655,6 @@ def getNumWaitingJobsWithOutDS(self, outputDSs): return res - # resubmit jobs - def resubmitJobs(self, jobIDs): - # get DB proxy - proxy = self.proxyPool.getProxy() - jobs = [] - # get jobs - for jobID in jobIDs: - res = proxy.peekJob(jobID, True, False, False, False) - if res: - jobs.append(res) - # release DB proxy - self.proxyPool.putProxy(proxy) - # set up dataset - if len(jobs) > 0: - Setupper(self, jobs).start() - return True - # update overall job information def updateJobs( self, @@ -981,66 +954,6 @@ def getJobs( return jobs + [nSent, {}, secrets_map] - # check merge job generation status - def checkMergeGenerationStatus(self, dn, jobID): - # return for NA - retNA = {"status": "NA", "mergeIDs": []} - try: - # get at most 2 PandaIDs - idStatus = self.getPandIDsWithJobID(dn, jobID, 2) - if idStatus == {}: - return retNA - # use larger PandaID which corresponds to runXYZ - tmpKeys = sorted(idStatus) - pandaID = tmpKeys[-1] - # get job - tmpJobs = 
self.getFullJobStatus([pandaID]) - if tmpJobs == [] or tmpJobs[0] is None: - return retNA - pandaJob = tmpJobs[0] - # non-merge job - if "--mergeOutput" not in pandaJob.jobParameters: - return retNA - # loop over all sub datasets - subDsList = [] - mergeStatus = None - mergeIDs = [] - for tmpFile in pandaJob.Files: - if tmpFile.type in ["output", "log"]: - if tmpFile.destinationDBlock not in subDsList: - subDsList.append(tmpFile.destinationDBlock) - # get dataset - tmpDsSpec = self.queryDatasetWithMap({"name": tmpFile.destinationDBlock}) - if tmpDsSpec is not None: - if tmpDsSpec.status in ["tobemerged"]: - # going to be merged - mergeStatus = "generating" - mergeIDs = [] - elif tmpDsSpec.status in [ - "tobeclosed", - "closed", - "completed", - ]: - # another dataset from --individualOutDS is waiting for Merger - if mergeStatus == "generating": - continue - # set status - mergeStatus = "generated" - # collect JobIDs of merge jobs - tmpMergeID = tmpDsSpec.MoverID - if tmpMergeID not in [0, None, "NULL"] + mergeIDs: - mergeIDs.append(tmpMergeID) - # no merger most likely because jobs were killed - if mergeStatus == "generated" and mergeIDs == []: - mergeStatus = "aborted" - # jobs are still runnign - if mergeStatus is None: - mergeStatus = "standby" - - return {"status": mergeStatus, "mergeIDs": mergeIDs} - except Exception: - return retNA - # get job status def getJobStatus( self, @@ -1094,20 +1007,6 @@ def peekJobs( return retJobs - # get PandaID with jobexeID - def getPandaIDwithJobExeID(self, jobexeIDs): - # get DBproxy - proxy = self.proxyPool.getProxy() - retJobs = [] - # peek at job - for jobexeID in jobexeIDs: - res = proxy.getPandaIDwithJobExeID(jobexeID) - retJobs.append(res) - # release proxy - self.proxyPool.putProxy(proxy) - - return retJobs - # get PandaIDs with TaskID def getPandaIDsWithTaskID(self, jediTaskID): # get DBproxy @@ -1119,109 +1018,6 @@ def getPandaIDsWithTaskID(self, jediTaskID): return retJobs - # get slimmed file info with PandaIDs - def getSlimmedFileInfoPandaIDs(self, pandaIDs): - iPandaID = 0 - nPandaID = 100 - retInfo = {} - while iPandaID < len(pandaIDs): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - tmpRetInfo = proxy.getSlimmedFileInfoPandaIDs(pandaIDs[iPandaID : iPandaID + nPandaID]) - # release proxy - self.proxyPool.putProxy(proxy) - iPandaID += nPandaID - if retInfo == {}: - retInfo = tmpRetInfo - else: - for outKey in tmpRetInfo: - if outKey not in retInfo: - retInfo[outKey] = [] - # append - for tmpItemRetInfo in tmpRetInfo[outKey]: - if tmpItemRetInfo not in retInfo[outKey]: - retInfo[outKey].append(tmpItemRetInfo) - - return retInfo - - # get JobIDs in a time range - def getJobIDsInTimeRange(self, dn, timeRangeStr): - # check DN - if dn in ["NULL", "", "None", None]: - return [] - # check timeRange - match = re.match("^(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)$", timeRangeStr) - if match is None: - return [] - timeRange = datetime.datetime( - year=int(match.group(1)), - month=int(match.group(2)), - day=int(match.group(3)), - hour=int(match.group(4)), - minute=int(match.group(5)), - second=int(match.group(6)), - ) - # max range is 3 months - maxRange = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - datetime.timedelta(days=30) - if timeRange < maxRange: - timeRange = maxRange - retJobIDs = [] - # get DBproxy - proxy = self.proxyPool.getProxy() - # get JobIDs - retJobIDs = proxy.getJobIDsInTimeRange(dn, timeRange, retJobIDs) - # release proxy - self.proxyPool.putProxy(proxy) - # read ARCH when time window is more than 
3days (- 3 hours as a margin) - if timeRange < datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - datetime.timedelta(days=2, hours=21): - # get ArchiveDBproxy - proxy = self.proxyPool.getProxy() - # get JobIDs - retJobIDs = proxy.getJobIDsInTimeRangeLog(dn, timeRange, retJobIDs) - # release proxy - self.proxyPool.putProxy(proxy) - - return retJobIDs - - # get PandaIDs for a JobID - def getPandIDsWithJobID(self, dn, jobID, nJobs): - idStatus = {} - # check DN - if dn in ["NULL", "", "None", None]: - return idStatus - # check JobID - try: - jobID = int(jobID) - nJobs = int(nJobs) - except Exception: - return idStatus - # get DBproxy - proxy = self.proxyPool.getProxy() - # get IDs - idStatus, buildJobID = proxy.getPandIDsWithJobID(dn, jobID, idStatus, nJobs) - # release proxy - self.proxyPool.putProxy(proxy) - # get ArchiveDBproxy - proxy = self.proxyPool.getProxy() - # get IDs - idStatus = proxy.getPandIDsWithJobIDLog(dn, jobID, idStatus, nJobs, buildJobID) - # release proxy - self.proxyPool.putProxy(proxy) - - return idStatus - - # get PandaIDs for a JobsetID or JobdefID in jobsArchived - def getPandIDsWithIdInArch(self, prodUserName, id, isJobset): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getPandIDsWithIdInArch(prodUserName, id, isJobset) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - # get full job status def getFullJobStatus( self, @@ -1554,101 +1350,6 @@ def awakeJobs(self, ids): return True - # query PandaIDs - def queryPandaIDs(self, jobDefIDs): - # get DBproxy - proxy = self.proxyPool.getProxy() - pandaIDs = [] - # query PandaID - for jobDefID in jobDefIDs: - id = proxy.queryPandaID(jobDefID) - pandaIDs.append(id) - # release proxy - self.proxyPool.putProxy(proxy) - - return pandaIDs - - # query job info per cloud - def queryJobInfoPerCloud(self, cloud, schedulerID=None): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query job info - ret = proxy.queryJobInfoPerCloud(cloud, schedulerID) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - - # get PandaIDs to be updated in prodDB - def getPandaIDsForProdDB(self, limit, lockedby): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query PandaID - ret = proxy.getPandaIDsForProdDB(limit, lockedby) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - - # update prodDBUpdateTime - def updateProdDBUpdateTimes(self, paramList): - retList = [] - # get DBproxy - proxy = self.proxyPool.getProxy() - # update - for param in paramList: - ret = proxy.updateProdDBUpdateTime(param) - retList.append(ret) - # release proxy - self.proxyPool.putProxy(proxy) - - return retList - - # get PandaIDs at Site - def getPandaIDsSite(self, site, status, limit): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query PandaID - ids = proxy.getPandaIDsSite(site, status, limit) - # release proxy - self.proxyPool.putProxy(proxy) - - return ids - - # get input files currently in used for analysis - def getFilesInUseForAnal(self, outDataset): - # get DBproxy - proxy = self.proxyPool.getProxy() - retList = [] - # query LFNs - retList = proxy.getFilesInUseForAnal(outDataset) - # release proxy - self.proxyPool.putProxy(proxy) - - return retList - - # get list of dis dataset to get input files in shadow - def getDisInUseForAnal(self, outDataset): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query dis - retList = proxy.getDisInUseForAnal(outDataset) - # release proxy - self.proxyPool.putProxy(proxy) - - return retList - - # get input LFNs 
currently in use for analysis with shadow dis - def getLFNsInUseForAnal(self, inputDisList): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query dis - retList = proxy.getLFNsInUseForAnal(inputDisList) - # release proxy - self.proxyPool.putProxy(proxy) - - return retList - # update input files and return corresponding PandaIDs def updateInFilesReturnPandaIDs(self, dataset, status, fileLFN=""): # get DBproxy @@ -1776,17 +1477,6 @@ def queryDatasetWithMap(self, map): return ret - # query last files in a dataset - def queryLastFilesInDataset(self, datasets): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query files - ret = proxy.queryLastFilesInDataset(datasets) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - # set GUIDs def setGUIDs(self, files): # get DBproxy @@ -1896,17 +1586,6 @@ def getSerialNumber(self, datasetname, definedFreshFlag=None): return ret - # get serial number for group job - def getSerialNumberForGroupJob(self, name): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.getSerialNumberForGroupJob(name) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - # add metadata def addMetadata(self, ids, metadataList, newStatusList): # get DBproxy @@ -1956,17 +1635,6 @@ def extractScope(self, name): return ret - # change job priorities - def changeJobPriorities(self, newPrioMap): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.changeJobPriorities(newPrioMap) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - # get destinationDBlockToken for a dataset def getDestTokens(self, dsname): # get DBproxy @@ -2017,17 +1685,6 @@ def getJobStatistics( return ret - # get job statistics with label - def getJobStatisticsWithLabel(self, siteStr=""): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.getJobStatisticsWithLabel(siteStr) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - # get job statistics for brokerage def getJobStatisticsBrokerage(self, minPrio=None, maxPrio=None): # get DBproxy @@ -2054,42 +1711,6 @@ def getJobStatisticsAnalBrokerage(self, minPriority=None): return conRet - # get the number of waiting jobs per site and user - def getJobStatisticsPerUserSite(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get stat - ret = proxy.getJobStatisticsPerUserSite() - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - - # get highest prio jobs - def getHighestPrioJobStat(self, perPG=False, useMorePG=False): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get stat - if not perPG: - ret = proxy.getHighestPrioJobStat() - else: - ret = proxy.getHighestPrioJobStatPerPG(useMorePG) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - - # get queued analysis jobs at a site - def getQueuedAnalJobs(self, site, dn): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get stat - ret = proxy.getQueuedAnalJobs(site, dn) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - # get job statistics for ExtIF def getJobStatisticsForExtIF(self, sourcetype=None): # get DBproxy @@ -2310,17 +1931,6 @@ def getJobIdUser(self, dn): return ret - # get active datasets - def getActiveDatasets(self, computingSite, prodSourceLabel): - # query an SQL return Status - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getActiveDatasets(computingSite, prodSourceLabel) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - # check status of all sub 
datasets to trigger Notifier def checkDatasetStatusForNotifier(self, jobsetID, jobDefinitionID, prodUserName): # query an SQL return Status @@ -2504,17 +2114,6 @@ def updateEventRanges(self, eventRanges, version=0): return ret return json.dumps(ret[0]), json.dumps(ret[1]) - # get retry history - def getRetryHistoryJEDI(self, jediTaskID): - # get proxy - proxy = self.proxyPool.getProxy() - # exec - ret = proxy.getRetryHistoryJEDI(jediTaskID) - # release proxy - self.proxyPool.putProxy(proxy) - - return ret - # change task priority def changeTaskPriorityPanda(self, jediTaskID, newPriority): # get proxy @@ -3054,17 +2653,6 @@ def reassignShare(self, jedi_task_ids, share_dest, reassign_running): return res - # list tasks in share - def listTasksInShare(self, gshare, status): - # get DB proxy - proxy = self.proxyPool.getProxy() - # exec - res = proxy.listTasksInShare(gshare, status) - # release DB proxy - self.proxyPool.putProxy(proxy) - - return res - def is_valid_share(self, share_name): """ Checks whether the share is a valid leave share diff --git a/pandaserver/test/killProdJobs.py b/pandaserver/test/killProdJobs.py index 81edc5b95..fa56696a0 100755 --- a/pandaserver/test/killProdJobs.py +++ b/pandaserver/test/killProdJobs.py @@ -12,8 +12,8 @@ sys.exit(1) jobDefIDs = range(startID, endID + 1) -# quesry PandaID -status, ids = Client.queryPandaIDs(jobDefIDs) +# query PandaID - this UserIF method is obsolete +# status, ids = Client.queryPandaIDs(jobDefIDs) if status != 0: sys.exit(0) diff --git a/pandaserver/userinterface/Client.py b/pandaserver/userinterface/Client.py index cacd3cc10..812bb13fa 100755 --- a/pandaserver/userinterface/Client.py +++ b/pandaserver/userinterface/Client.py @@ -1,68 +1,33 @@ """ client methods - """ -import getpass import gzip import json import os import pickle -import re import socket import sys import tempfile -import uuid -from urllib.parse import urlencode +import requests from pandacommon.pandautils.net_utils import replace_hostname_in_url_randomly -from pandaserver.srvcore.CoreUtils import commands_get_status_output - -# configuration -try: - baseURL = os.environ["PANDA_URL"] -except Exception: - baseURL = "http://pandaserver.cern.ch:25080/server/panda" -try: - baseURLSSL = os.environ["PANDA_URL_SSL"] -except Exception: - baseURLSSL = "https://pandaserver.cern.ch:25443/server/panda" - +# PanDA server configuration +baseURL = os.environ.get("PANDA_URL", "http://pandaserver.cern.ch:25080/server/panda") +baseURLSSL = os.environ.get("PANDA_URL_SSL", "https://pandaserver.cern.ch:25443/server/panda") # exit code EC_Failed = 255 -# panda server URLs -if "PANDA_URL_MAP" in os.environ: - serverURLs = { - "default": {"URL": baseURL, "URLSSL": baseURLSSL}, - } - # decode envvar to map - try: - for tmpCompStr in os.environ["PANDA_URL_MAP"].split("|"): - tmpKey, tmpURL, tmpURLSSL = tmpCompStr.split(",") - # append - serverURLs[tmpKey] = {"URL": tmpURL, "URLSSL": tmpURLSSL} - except Exception: - pass -else: - # default - serverURLs = { - "default": {"URL": baseURL, "URLSSL": baseURLSSL}, - "CERN": { - "URL": "http://pandaserver.cern.ch:25080/server/panda", - "URLSSL": "https://pandaserver.cern.ch:25443/server/panda", - }, - } - -# bamboo -baseURLBAMBOO = "http://pandabamboo.cern.ch:25070/bamboo/bamboo" +def is_https(url): + # check if https is used + return url.startswith("https://") -# wrapper for pickle with python 3 def pickle_dumps(obj): + # wrapper for pickle with python 3 return pickle.dumps(obj, protocol=0) @@ -73,276 +38,133 @@ def pickle_loads(obj_string): return 
pickle.loads(obj_string) -# get URL -def _getURL(type, srvID=None): - if srvID in serverURLs: - urls = serverURLs[srvID] - else: - urls = serverURLs["default"] - return urls[type] - - -# get Panda srvIDs -def getPandas(): - srvs = list(serverURLs) - # remove 'default' - try: - srvs.remove("default") - except Exception: - pass - return srvs - - -# look for a grid proxy certificate -def _x509(): - # see X509_USER_PROXY - try: - return os.environ["X509_USER_PROXY"] - except Exception: - pass - # see the default place - x509 = f"/tmp/x509up_u{os.getuid()}" - if os.access(x509, os.R_OK): - return x509 - # no valid proxy certificate - # FIXME - print("No valid grid proxy certificate found") - return "" - - -# check if https -def is_https(url): - return url.startswith("https://") - - -# curl class -class _Curl: - # constructor +class HttpClient: def __init__(self): - # path to curl - self.path = "curl" # verification of the host certificate if "PANDA_VERIFY_HOST" in os.environ and os.environ["PANDA_VERIFY_HOST"] == "off": self.verifyHost = False else: self.verifyHost = True + # request a compressed response self.compress = True + # SSL cert/key - self.sslCert = "" - self.sslKey = "" - # verbose - self.verbose = False - # use json + self.ssl_certificate = self._x509() + self.ssl_key = self._x509() + self.use_json = False + # OIDC - if "PANDA_AUTH" in os.environ and os.environ["PANDA_AUTH"] == "oidc": - self.oidc = True - if "PANDA_AUTH_VO" in os.environ: - self.authVO = os.environ["PANDA_AUTH_VO"] - else: - self.authVO = None - if "PANDA_AUTH_ID_TOKEN" in os.environ: - self.idToken = os.environ["PANDA_AUTH_ID_TOKEN"] - else: - self.idToken = None - else: - self.oidc = False + self.oidc = os.getenv("PANDA_AUTH") == "oidc" + self.auth_vo = os.getenv("PANDA_AUTH_VO") if self.oidc else None + self.id_token = os.getenv("PANDA_AUTH_ID_TOKEN") if self.oidc else None - # GET method - def get(self, url, data): - use_https = is_https(url) - url = replace_hostname_in_url_randomly(url) - # make command - com = f"{self.path} --silent --get" - if not self.verifyHost: - com += " --insecure" - elif "X509_CERT_DIR" in os.environ: - com += f" --capath {os.environ['X509_CERT_DIR']}" - elif os.path.exists("/etc/grid-security/certificates"): - com += " --capath /etc/grid-security/certificates" - if self.compress: - com += " --compressed" - if self.oidc: - com += f' -H "Authorization: Bearer {self.idToken}"' - com += f' -H "Origin: {self.authVO}"' - elif use_https: - if not self.sslCert: - self.sslCert = _x509() - com += f" --cert {self.sslCert}" - com += f" --cacert {self.sslCert}" - if not self.sslKey: - self.sslKey = _x509() - com += f" --key {self.sslKey}" - # timeout - com += " -m 600" - # json - if self.use_json: - com += ' -H "Accept: application/json"' - # data - strData = "" - for key in data: - strData += f'data="{urlencode({key: data[key]})}"\n' - # write data to temporary config file + def _x509(self): + # retrieve the X509_USER_PROXY from the environment variables try: - tmpName = os.environ["PANDA_TMP"] + return os.environ["X509_USER_PROXY"] except Exception: - tmpName = "/tmp" - tmpName += f"/{getpass.getuser()}_{str(uuid.uuid4())}" - tmpFile = open(tmpName, "w") - tmpFile.write(strData) - tmpFile.close() - com += f" --config {tmpName}" - com += f" {url}" - # execute - if self.verbose: - print(com) - print(strData) - ret = commands_get_status_output(com) - # remove temporary file - os.remove(tmpName) - if ret[0] != 0: - ret = (ret[0] % 255, ret[1]) - if self.verbose: - print(ret) - return ret - - # POST 
method - def post(self, url, data, via_file=False): + pass + + # look for the default place + x509 = f"/tmp/x509up_u{os.getuid()}" + if os.access(x509, os.R_OK): + return x509 + + # no valid proxy certificate + print("No valid grid proxy certificate found") + return "" + + def _prepare_url(self, url): + """Modify URL with HTTPS check and hostname replacement.""" use_https = is_https(url) - url = replace_hostname_in_url_randomly(url) - # make command - com = f"{self.path} --silent" - if not self.verifyHost: - com += " --insecure" - elif "X509_CERT_DIR" in os.environ: - com += f" --capath {os.environ['X509_CERT_DIR']}" - elif os.path.exists("/etc/grid-security/certificates"): - com += " --capath /etc/grid-security/certificates" - if self.compress: - com += " --compressed" + modified_url = replace_hostname_in_url_randomly(url) + return modified_url, use_https + + def _prepare_headers(self): + """Prepare headers based on authentication and JSON settings.""" + headers = {} + if self.oidc: - com += f' -H "Authorization: Bearer {self.idToken}"' - com += f' -H "Origin: {self.authVO}"' - elif use_https: - if not self.sslCert: - self.sslCert = _x509() - com += f" --cert {self.sslCert}" - com += f" --cacert {self.sslCert}" - if not self.sslKey: - self.sslKey = _x509() - com += f" --key {self.sslKey}" - # timeout - com += " -m 600" - # json + headers["Authorization"] = f"Bearer {self.id_token}" + headers["Origin"] = self.auth_vo + if self.use_json: - com += ' -H "Accept: application/json"' - # data - strData = "" - for key in data: - strData += f'data="{urlencode({key: data[key]})}"\n' - # write data to temporary config file + headers["Accept"] = "application/json" + + return headers + + def _prepare_ssl(self, use_https): + """Prepare SSL configuration based on HTTPS usage and verification settings.""" + cert = None + verify = True + if use_https: + cert = (self.ssl_certificate, self.ssl_key) + + if not self.verifyHost: + verify = False + elif "X509_CERT_DIR" in os.environ: + verify = os.environ["X509_CERT_DIR"] + elif os.path.exists("/etc/grid-security/certificates"): + verify = "/etc/grid-security/certificates" + + return cert, verify + + def get(self, url, data): + url, use_https = self._prepare_url(url) + headers = self._prepare_headers() + cert, verify = self._prepare_ssl(use_https) + try: - tmpName = os.environ["PANDA_TMP"] - except Exception: - tmpName = "/tmp" - tmpName += f"/{getpass.getuser()}_{str(uuid.uuid4())}" - tmpNameOut = f"{tmpName}.out" - tmpFile = open(tmpName, "w") - tmpFile.write(strData) - tmpFile.close() - com += f" --config {tmpName}" - if via_file: - com += f" -o {tmpNameOut}" - com += f" {url}" - # execute - if self.verbose: - print(com) - print(strData) - s, o = commands_get_status_output(com) - if via_file: - with open(tmpNameOut, "rb") as f: - ret = (s, f.read()) - os.remove(tmpNameOut) - else: - ret = (s, o) - # remove temporary file - os.remove(tmpName) - if ret[0] != 0: - ret = (ret[0] % 255, ret[1]) - if self.verbose: - print(ret) - return ret - - # PUT method - def put(self, url, data): - use_https = is_https(url) - url = replace_hostname_in_url_randomly(url) - # make command - com = f"{self.path} --silent" - if not self.verifyHost: - com += " --insecure" - elif "X509_CERT_DIR" in os.environ: - com += f" --capath {os.environ['X509_CERT_DIR']}" - elif os.path.exists("/etc/grid-security/certificates"): - com += " --capath /etc/grid-security/certificates" - if self.compress: - com += " --compressed" - if self.oidc: - com += f' -H "Authorization: Bearer {self.idToken}"' 
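# The _Curl.put method being removed in this hunk emulated an HTTP PUT by attaching each
# file with curl's -F option. A minimal sketch, for comparison, of the equivalent multipart
# upload with the requests library that the new HttpClient.post_files in this patch relies on.
# The URL is the module's default baseURLSSL plus the existing /putFile endpoint; the proxy
# path and file name are illustrative only, not taken from this patch:
import requests

proxy = "/tmp/x509up_u1000"  # example grid proxy location; the real path comes from _x509()
with open("/tmp/sandbox.tar.gz", "rb") as sandbox:
    response = requests.post(
        "https://pandaserver.cern.ch:25443/server/panda/putFile",
        files={"file": sandbox},                   # one multipart part per dict entry
        cert=(proxy, proxy),                       # proxy serves as both certificate and key
        verify="/etc/grid-security/certificates",  # CA directory, as in _prepare_ssl
        timeout=600,
    )
print(response.status_code)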
- com += f' -H "Origin: {self.authVO}"' - elif use_https: - if not self.sslCert: - self.sslCert = _x509() - com += f" --cert {self.sslCert}" - com += f" --cacert {self.sslCert}" - if not self.sslKey: - self.sslKey = _x509() - com += f" --key {self.sslKey}" - # emulate PUT - for key in data: - com += f' -F "{key}=@{data[key]}"' - com += f" {url}" - # execute - if self.verbose: - print(com) - ret = commands_get_status_output(com) - if ret[0] != 0: - ret = (ret[0] % 255, ret[1]) - if self.verbose: - print(ret) - return ret + response = requests.get(url, headers=headers, params=data, timeout=600, cert=cert, verify=verify) + response.raise_for_status() + return 0, response.text + except requests.RequestException as e: + return 255, str(e) + + def post(self, url, data): + url, use_https = self._prepare_url(url) + headers = self._prepare_headers() + cert, verify = self._prepare_ssl(use_https) + + try: + response = requests.post(url, headers=headers, data=data, timeout=600, cert=cert, verify=verify) + response.raise_for_status() + return 0, response.text + except requests.RequestException as e: + return 255, str(e) + + def post_files(self, url, data): + url, use_https = self._prepare_url(url) + headers = self._prepare_headers() + cert, verify = self._prepare_ssl(use_https) + + files = {} + try: + files = {key: open(value, "rb") for key, value in data.items()} + print(f"cert: {cert}, verify: {verify}") + response = requests.post(url, headers=headers, files=files, timeout=600, cert=cert, verify=verify) + response.raise_for_status() + return 0, response.text + except requests.RequestException as e: + return 255, str(e) + finally: + for file in files.values(): + file.close() """ Client API - """ -# use web cache -def useWebCache(): - """Switch to use web cache for some read-only requests so that the number - of hits to the back-end database is reduced. 
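# A minimal usage sketch of the new HttpClient wrapper defined above, following the call
# pattern used by the Client API functions in this module. It assumes the module context
# (baseURL, json, HttpClient); the endpoint choice and the time window value are illustrative:
http_client = HttpClient()
url = f"{baseURL}/getJobStatisticsPerSiteResource"
status, output = http_client.get(url, {"timeWindow": 720})
if status == 0:
    stats = json.loads(output)  # this endpoint serves JSON; most others still return pickled data
else:
    print(f"request failed: {output}")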
- - args: - returns: +def submitJobs(jobs, toPending=False): """ - global baseURL - baseURL = re.sub("25080", "25085", baseURL) - global serverURLs - for tmpKey in serverURLs: - tmpVal = serverURLs[tmpKey] - tmpVal["URL"] = baseURL - - -# submit jobs -def submitJobs(jobs, srvID=None, toPending=False): - """Submit jobs + Submit jobs args: jobs: the list of JobSpecs - srvID: obsolete toPending: set True if jobs need to be pending state for the two-staged submission mechanism returns: @@ -358,32 +180,30 @@ def submitJobs(jobs, srvID=None, toPending=False): for job in jobs: job.creationHost = hostname # serialize - strJobs = pickle_dumps(jobs) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = _getURL("URLSSL", srvID) + "/submitJobs" - data = {"jobs": strJobs} + str_jobs = pickle_dumps(jobs) + + http_client = HttpClient() + + url = f"{baseURLSSL}/submitJobs" + data = {"jobs": str_jobs} if toPending: data["toPending"] = True - status, output = curl.post(url, data) + status, output = http_client.post(url, data) if status != 0: print(output) return status, output try: return status, pickle_loads(output) except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR submitJobs : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + err_type, err_value, _ = sys.exc_info() + err_str = f"ERROR submitJobs : {err_type} {err_value}" + print(err_str) + return EC_Failed, f"{output}\n{err_str}" -# get job status -def getJobStatus(ids, use_json=False): - """Get job status +def getJobStatus(panda_ids): + """ + Get job status args: ids: the list of PandaIDs @@ -394,68 +214,33 @@ def getJobStatus(ids, use_json=False): 255: communication failure the list of JobSpecs (or Nones for non-existing PandaIDs) """ - # serialize - if use_json: - strIDs = json.dumps(ids) - else: - strIDs = pickle_dumps(ids) - # instantiate curl - curl = _Curl() - curl.use_json = use_json - # execute - url = _getURL("URL") + "/getJobStatus" - data = {"ids": strIDs} - status, output = curl.post(url, data, via_file=True) - try: - if use_json: - return status, json.loads(output) - return status, pickle_loads(output) - except Exception as e: - errStr = f"ERROR getJobStatus : {str(e)}" - print(errStr) - return EC_Failed, output + "\n" + errStr + # Serialize the panda IDs + str_ids = json.dumps(panda_ids) + http_client = HttpClient() + http_client.use_json = True -# get PandaID with jobexeID -def getPandaIDwithJobExeID(ids): - """Get the list of PandaIDs corresponding to a given jobExecutionIDs - - args: - ids: list of jobExecutionIDs - returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - the list of PandaIDs (or Nones for non-existing IDs) - """ - # serialize - strIDs = pickle_dumps(ids) - # instantiate curl - curl = _Curl() - # execute - url = _getURL("URL") + "/getPandaIDwithJobExeID" - data = {"ids": strIDs} - status, output = curl.post(url, data) + # Execute + url = f"{baseURL}/getJobStatus" + data = {"ids": str_ids} + status, output = http_client.post(url, data) try: - return status, pickle_loads(output) - except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getPandaIDwithJobExeID : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + return status, json.loads(output) + except Exception as e: + err_str = f"ERROR getJobStatus: {str(e)}" + print(err_str) + return EC_Failed, f"{output}\n{err_str}" -# kill jobs def killJobs( ids, code=None, - 
verbose=False, - srvID=None, useMailAsID=False, keepUnmerged=False, jobSubStatus=None, ): - """Kill jobs. Normal users can kill only their own jobs. + """ + Kill jobs. Normal users can kill only their own jobs. People with production VOMS role can kill any jobs. Running jobs are killed when next heartbeat comes from the pilot. Set code=9 if running jobs need to be killed immediately. @@ -467,13 +252,11 @@ def killJobs( 3: aborted 4: expire in waiting 7: retry by server - 8: rebrokerage + 8: re-brokerage 9: force kill - 10: fast rebrokerage on overloaded PQs + 10: fast re-brokerage on overloaded PQs 50: kill by JEDI 91: kill user jobs with prod role - verbose: set True to see what's going on - srvID: obsolete useMailAsID: obsolete keepUnmerged: set True not to cancel unmerged jobs when pmerge is killed. jobSubStatus: set job sub status if any @@ -484,34 +267,32 @@ def killJobs( the list of clouds (or Nones if tasks are not yet assigned) """ # serialize - strIDs = pickle_dumps(ids) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - curl.verbose = verbose + str_panda_ids = pickle_dumps(ids) + + http_client = HttpClient() + # execute - url = _getURL("URLSSL", srvID) + "/killJobs" - data = {"ids": strIDs, "code": code, "useMailAsID": useMailAsID} - killOpts = "" + url = f"{baseURLSSL}/killJobs" + data = {"ids": str_panda_ids, "code": code, "useMailAsID": useMailAsID} + kill_options = "" if keepUnmerged: - killOpts += "keepUnmerged," + kill_options += "keepUnmerged," if jobSubStatus is not None: - killOpts += f"jobSubStatus={jobSubStatus}," - data["killOpts"] = killOpts[:-1] - status, output = curl.post(url, data) + kill_options += f"jobSubStatus={jobSubStatus}," + data["killOpts"] = kill_options[:-1] + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR killJobs : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR killJobs : {error_type} {error_value}" + print(error_str) + return EC_Failed, f"{output}\n{error_str}" -# reassign jobs def reassignJobs(ids, forPending=False, firstSubmission=None): - """Triggers reassignment of jobs. This is not effective if jobs were preassigned to sites before being submitted. + """ + Triggers reassignment of jobs. This is not effective if jobs were preassigned to sites before being submitted. 
args: ids: the list of taskIDs @@ -527,69 +308,30 @@ def reassignJobs(ids, forPending=False, firstSubmission=None): """ # serialize - strIDs = pickle_dumps(ids) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + str_task_ids = pickle_dumps(ids) + + http_client = HttpClient() + # execute - url = baseURLSSL + "/reassignJobs" - data = {"ids": strIDs} + url = f"{baseURLSSL}/reassignJobs" + data = {"ids": str_task_ids} if forPending: data["forPending"] = True if firstSubmission is not None: data["firstSubmission"] = firstSubmission - status, output = curl.post(url, data) - try: - return status, pickle_loads(output) - except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR reassignJobs : {type} {value}" - print(errStr) - return EC_Failed, f"stat={status} err={output} {errStr}" - - -# query PandaIDs (obsolete) -def queryPandaIDs(ids): - # serialize - strIDs = pickle_dumps(ids) - # instantiate curl - curl = _Curl() - # execute - url = baseURL + "/queryPandaIDs" - data = {"ids": strIDs} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR queryPandaIDs : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR reassignJobs : {error_type} {error_value}" + print(error_str) + return EC_Failed, f"stat={status} err={output} {error_str}" -# query job info per cloud (obsolete) -def queryJobInfoPerCloud(cloud, schedulerID=None): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + "/queryJobInfoPerCloud" - data = {"cloud": cloud} - if schedulerID is not None: - data["schedulerID"] = schedulerID - status, output = curl.post(url, data) - try: - return status, pickle_loads(output) - except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR queryJobInfoPerCloud : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - - -# get job statistics def getJobStatistics(sourcetype=None): - """Get job statistics + """ + Get job statistics args: sourcetype: type of jobs @@ -603,229 +345,98 @@ def getJobStatistics(sourcetype=None): map of the number jobs per job status in each site """ - # instantiate curl - curl = _Curl() - # execute - ret = {} - for srvID in getPandas(): - url = _getURL("URL", srvID) + "/getJobStatistics" - data = {} - if sourcetype is not None: - data["sourcetype"] = sourcetype - status, output = curl.get(url, data) - try: - tmpRet = status, pickle_loads(output) - if status != 0: - return tmpRet - except Exception: - print(output) - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getJobStatistics : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - # gather - for tmpCloud in tmpRet[1]: - tmpVal = tmpRet[1][tmpCloud] - if tmpCloud not in ret: - # append cloud values - ret[tmpCloud] = tmpVal - else: - # sum statistics - for tmpStatus in tmpVal: - tmpCount = tmpVal[tmpStatus] - if tmpStatus in ret[tmpCloud]: - ret[tmpCloud][tmpStatus] += tmpCount - else: - ret[tmpCloud][tmpStatus] = tmpCount - return 0, ret - -# get job statistics for Bamboo -def getJobStatisticsForBamboo(useMorePG=False): - """Get job statistics for Bamboo - - args: - useMorePG: set True if fine-grained classification is required - returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - map of the 
number jobs per job status in each site - - """ - # instantiate curl - curl = _Curl() + http_client = HttpClient() # execute ret = {} - for srvID in getPandas(): - url = _getURL("URL", srvID) + "/getJobStatisticsForBamboo" - data = {} - if useMorePG is not False: - data["useMorePG"] = useMorePG - status, output = curl.get(url, data) - try: - tmpRet = status, pickle_loads(output) - if status != 0: - return tmpRet - except Exception: - print(output) - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getJobStatisticsForBamboo : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - # gather - for tmpCloud in tmpRet[1]: - tmpMap = tmpRet[1][tmpCloud] - if tmpCloud not in ret: - # append cloud values - ret[tmpCloud] = tmpMap - else: - # sum statistics - for tmpPType in tmpMap: - tmpVal = tmpMap[tmpPType] - if tmpPType not in ret[tmpCloud]: - ret[tmpCloud][tmpPType] = tmpVal - else: - for tmpStatus in tmpVal: - tmpCount = tmpVal[tmpStatus] - if tmpStatus in ret[tmpCloud][tmpPType]: - ret[tmpCloud][tmpPType][tmpStatus] += tmpCount - else: - ret[tmpCloud][tmpPType][tmpStatus] = tmpCount - return 0, ret - -# get highest prio jobs -def getHighestPrioJobStat(perPG=False, useMorePG=False): - """Get the number of jobs with the highest priorities in each combination of cloud and processingType - - args: - perPG: set True if grouped by processingGroup instead of processingType - useMorePG: set True if fine-grained classification is required - returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - map of the number jobs and priorities in each combination of cloud and processingType (or processingGroup) - - """ - # instantiate curl - curl = _Curl() - # execute - ret = {} - url = baseURL + "/getHighestPrioJobStat" - data = {"perPG": perPG} - if useMorePG is not False: - data["useMorePG"] = useMorePG - status, output = curl.get(url, data) + url = f"{baseURL}/getJobStatistics" + data = {} + if sourcetype is not None: + data["sourcetype"] = sourcetype + status, output = http_client.get(url, data) try: - return status, pickle_loads(output) + tmp_return = status, pickle_loads(output) + if status != 0: + return tmp_return except Exception: print(output) - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getHighestPrioJobStat : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR getJobStatistics : {error_type} {error_value}" + print(error_str) + return EC_Failed, f"{output}\n{error_str}" + # gather + for tmpCloud in tmp_return[1]: + tmpVal = tmp_return[1][tmpCloud] + if tmpCloud not in ret: + # append cloud values + ret[tmpCloud] = tmpVal + else: + # sum statistics + for tmpStatus in tmpVal: + tmpCount = tmpVal[tmpStatus] + if tmpStatus in ret[tmpCloud]: + ret[tmpCloud][tmpStatus] += tmpCount + else: + ret[tmpCloud][tmpStatus] = tmpCount -# get jobs updated recently -def getJobsToBeUpdated(limit=5000, lockedby="", srvID=None): - """Get the list of jobs which have been recently updated. 
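# The "gather" block in the reworked getJobStatistics above merges per-cloud status counts
# into a single map. A small self-contained illustration of that merge logic with made-up
# numbers (cloud names and job statuses are examples only):
ret = {"CERN": {"activated": 3, "running": 7}}
tmp = {"CERN": {"running": 2, "failed": 1}, "US": {"defined": 4}}
for cloud, counts in tmp.items():
    if cloud not in ret:
        # first time this cloud is seen: take its counts as-is
        ret[cloud] = counts
    else:
        # otherwise add the counts status by status
        for status, count in counts.items():
            if status in ret[cloud]:
                ret[cloud][status] += count
            else:
                ret[cloud][status] = count
assert ret == {"CERN": {"activated": 3, "running": 9, "failed": 1}, "US": {"defined": 4}}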
+ return 0, ret - args: - limit: the maximum number of jobs - lockedby: name of the machinery which submitted jobs - srvID: obsolete - returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - the lit of PandaIDs +def getJobStatisticsForBamboo(useMorePG=False): """ - # instantiate curl - curl = _Curl() - # execute - url = _getURL("URL", srvID) + "/getJobsToBeUpdated" - status, output = curl.get(url, {"limit": limit, "lockedby": lockedby}) - try: - return status, pickle_loads(output) - except Exception: - print(output) - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getJobsToBeUpdated : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - - -# update prodDBUpdateTimes -def updateProdDBUpdateTimes(params, verbose=False, srvID=None): - """Update timestamp of jobs when update info is propagated to another database + Get job statistics for Bamboo (used by TRIUMF panglia monitoring) args: - params: map of PandaID and jobStatus and timestamp - verbose: set True to see what's going on - srvID: obsolete + useMorePG: set True if fine-grained classification is required returns: status code 0: communication succeeded to the panda server 255: communication failure - return code - True: request is processed - False: not processed + map of the number jobs per job status in each site """ - # serialize - strPar = pickle_dumps(params) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - curl.verbose = verbose - # execute - url = _getURL("URLSSL", srvID) + "/updateProdDBUpdateTimes" - data = {"params": strPar} - status, output = curl.post(url, data) - try: - return status, pickle_loads(output) - except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR updateProdDBUpdateTimes : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - - -# get PandaID at site -def getPandaIDsSite(site, status, limit=500): - """Get the list of jobs in a job status at at a site - - args: - site: site name - status: job status - limit: maximum number of jobs - returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - the list of PandaIDs - """ - # instantiate curl - curl = _Curl() + http_client = HttpClient() # execute - url = baseURL + "/getPandaIDsSite" - status, output = curl.get(url, {"site": site, "status": status, "limit": limit}) + ret = {} + url = f"{baseURL}/getJobStatisticsForBamboo" + data = {} + if useMorePG is not False: + data["useMorePG"] = useMorePG + status, output = http_client.get(url, data) try: - return status, pickle_loads(output) + tmp_return = status, pickle_loads(output) + if status != 0: + return tmp_return except Exception: print(output) - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getPandaIDsSite : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR getJobStatisticsForBamboo : {error_type} {error_value}" + print(error_str) + return EC_Failed, f"{output}\n{error_str}" + # gather + for tmpCloud in tmp_return[1]: + tmpMap = tmp_return[1][tmpCloud] + if tmpCloud not in ret: + # append cloud values + ret[tmpCloud] = tmpMap + else: + # sum statistics + for tmpPType in tmpMap: + tmpVal = tmpMap[tmpPType] + if tmpPType not in ret[tmpCloud]: + ret[tmpCloud][tmpPType] = tmpVal + else: + for tmpStatus in tmpVal: + tmpCount = tmpVal[tmpStatus] + if tmpStatus in ret[tmpCloud][tmpPType]: + 
ret[tmpCloud][tmpPType][tmpStatus] += tmpCount + else: + ret[tmpCloud][tmpPType][tmpStatus] = tmpCount + return 0, ret -# get job statistics per site def getJobStatisticsPerSite( predefined=False, workingGroup="", @@ -834,12 +445,13 @@ def getJobStatisticsPerSite( minPriority=None, readArchived=None, ): - """Get job statistics with job attributes + """ + Get job statistics with job attributes args: - predefined: get jobs which are assiggned to sites before being submitted - workingGroup: commna-separated list of workingGroups - countryGroup: commna-separated list of countryGroups + predefined: get jobs which are assigned to sites before being submitted + workingGroup: comma-separated list of workingGroups + countryGroup: comma-separated list of countryGroups jobType: type of jobs all: all jobs analysis: analysis jobs @@ -853,103 +465,50 @@ def getJobStatisticsPerSite( map of the number jobs per job status in each site """ - # instantiate curl - curl = _Curl() - # execute - ret = {} - for srvID in getPandas(): - url = _getURL("URL", srvID) + "/getJobStatisticsPerSite" - data = {"predefined": predefined} - if workingGroup not in ["", None]: - data["workingGroup"] = workingGroup - if countryGroup not in ["", None]: - data["countryGroup"] = countryGroup - if jobType not in ["", None]: - data["jobType"] = jobType - if minPriority not in ["", None]: - data["minPriority"] = minPriority - if readArchived not in ["", None]: - data["readArchived"] = readArchived - status, output = curl.get(url, data) - try: - tmpRet = status, pickle_loads(output) - if status != 0: - return tmpRet - except Exception: - print(output) - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getJobStatisticsPerSite : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - # gather - for tmpSite in tmpRet[1]: - tmpVal = tmpRet[1][tmpSite] - if tmpSite not in ret: - # append site values - ret[tmpSite] = tmpVal - else: - # sum statistics - for tmpStatus in tmpVal: - tmpCount = tmpVal[tmpStatus] - if tmpStatus in ret[tmpSite]: - ret[tmpSite][tmpStatus] += tmpCount - else: - ret[tmpSite][tmpStatus] = tmpCount - return 0, ret - -# get job statistics per site with label -def getJobStatisticsWithLabel(site=""): - """Get job statistics per prodSourceLabel - - args: - site: commna-separated list of sites. An empty string for all sites. 
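# A quick usage sketch of the reworked getJobStatisticsPerSite above. The filter values are
# illustrative; the returned structure is a per-site map of job status counts, e.g.
# {"CERN-PROD": {"activated": 12, "running": 30}} (site name and numbers are examples only):
status, site_stats = getJobStatisticsPerSite(jobType="analysis", minPriority=100)
if status == 0:
    for site, counts in site_stats.items():
        print(site, counts.get("running", 0))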
- returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - map of the number jobs per job status and prodSourceLabel in each site - - """ - # instantiate curl - curl = _Curl() + http_client = HttpClient() # execute - url = baseURL + "/getJobStatisticsWithLabel" - data = {} - if site not in ["", None]: - data["site"] = site - status, output = curl.get(url, data) + ret = {} + url = f"{baseURL}/getJobStatisticsPerSite" + data = {"predefined": predefined} + if workingGroup not in ["", None]: + data["workingGroup"] = workingGroup + if countryGroup not in ["", None]: + data["countryGroup"] = countryGroup + if jobType not in ["", None]: + data["jobType"] = jobType + if minPriority not in ["", None]: + data["minPriority"] = minPriority + if readArchived not in ["", None]: + data["readArchived"] = readArchived + status, output = http_client.get(url, data) try: - return status, pickle_loads(output) + tmp_return = status, pickle_loads(output) + if status != 0: + return tmp_return except Exception: print(output) - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getJobStatisticsWithLabel : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR getJobStatisticsPerSite : {error_type} {error_value}" + print(error_str) + return EC_Failed, f"{output}\n{error_str}" + + # gather + for tmp_site, tmp_value in tmp_return[1].items(): + if tmp_site not in ret: + # append site values + ret[tmp_site] = tmp_value + else: + # sum statistics + for tmp_status, tmp_count in tmp_value.items(): + ret[tmp_site][tmp_status] = ret[tmp_site].get(tmp_status, 0) + tmp_count -# get the number of waiting jobs per site and user (obsolete) -def getJobStatisticsPerUserSite(): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + "/getJobStatisticsPerUserSite" - data = {} - status, output = curl.get(url, data) - try: - return status, pickle_loads(output) - except Exception: - print(output) - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getJobStatisticsPerUserSite : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + return 0, ret -# get job statistics per site and resource def getJobStatisticsPerSiteResource(timeWindow=None): - """Get job statistics with job attributes + """ + Get job statistics per site and resource. This is used by panglia (TRIUMF monitoring) args: timeWindow: to count number of jobs that finish/failed/cancelled for last N minutes. 
12*60 by default @@ -960,27 +519,27 @@ def getJobStatisticsPerSiteResource(timeWindow=None): map of the number jobs per job status in each site and resource """ - # instantiate curl - curl = _Curl() + + http_client = HttpClient() # execute - url = baseURL + "/getJobStatisticsPerSiteResource" + url = f"{baseURL}/getJobStatisticsPerSiteResource" data = {} if timeWindow is not None: data["timeWindow"] = timeWindow - status, output = curl.get(url, data) + status, output = http_client.get(url, data) try: return status, json.loads(output) except Exception: print(output) - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getJobStatisticsPerSiteResource : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR getJobStatisticsPerSiteResource : {error_type} {error_value}" + print(error_str) + return EC_Failed, f"{output}\n{error_str}" -# get job statistics per site, label, and resource def get_job_statistics_per_site_label_resource(time_window=None): - """Get job statistics per site, label, and resource + """ + Get job statistics per site, label, and resource args: timeWindow: to count number of jobs that finish/failed/cancelled for last N minutes. 12*60 by default @@ -991,87 +550,55 @@ def get_job_statistics_per_site_label_resource(time_window=None): map of the number jobs per job status in each site and resource """ - # instantiate curl - curl = _Curl() + + http_client = HttpClient() # execute - url = baseURL + "/get_job_statistics_per_site_label_resource" + url = f"{baseURL}/get_job_statistics_per_site_label_resource" data = {} if time_window is not None: data["time_window"] = time_window - status, output = curl.get(url, data) + status, output = http_client.get(url, data) try: return status, json.loads(output) except Exception as e: print(output) - errStr = f"ERROR get_job_statistics_per_site_label_resource : {str(e)}" - print(errStr) - return EC_Failed, output + "\n" + errStr - + error_str = f"ERROR get_job_statistics_per_site_label_resource : {str(e)}" + print(error_str) + return EC_Failed, f"{output}\n{error_str}" -# query last files in datasets -def queryLastFilesInDataset(datasets): - """Get names of files which have the largest serial number in each dataset - - args: - datasets: the list of dataset names - returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - map of the dataset name and the file name +def insertSandboxFileInfo(userName, fileName, fileSize, checkSum): """ - # serialize - strDSs = pickle_dumps(datasets) - # instantiate curl - curl = _Curl() - # execute - url = baseURL + "/queryLastFilesInDataset" - data = {"datasets": strDSs} - status, output = curl.post(url, data) - try: - return status, pickle_loads(output) - except Exception: - type, value, traceBack = sys.exc_info() - print(f"ERROR queryLastFilesInDataset : {type} {value}") - return EC_Failed, None - - -# insert sandbox file info -def insertSandboxFileInfo(userName, fileName, fileSize, checkSum, verbose=False): - """Insert infomation of input sandbox + Insert information of input sandbox args: userName: the name of the user fileName: the file name fileSize: the file size fileSize: md5sum of the file - verbose: set True to see what's going on returns: status code 0: communication succeeded to the panda server else: communication failure """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - curl.verbose = verbose + + http_client = 
HttpClient() + # execute - url = baseURLSSL + "/insertSandboxFileInfo" + url = f"{baseURLSSL}/insertSandboxFileInfo" data = { "userName": userName, "fileName": fileName, "fileSize": fileSize, "checkSum": checkSum, } - return curl.post(url, data) + return http_client.post(url, data) -# upload input sandbox file def putFile(file): - """Upload input sandbox + """ + Upload input sandbox args: file: the file name @@ -1081,43 +608,40 @@ def putFile(file): else: communication failure """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/putFile" + url = f"{baseURLSSL}/putFile" data = {"file": file} - return curl.put(url, data) + return http_client.post_files(url, data) # delete file (obsolete) +# TODO: is this really obsolete? I think it's used in panda cache def deleteFile(file): - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + http_client = HttpClient() + # execute - url = baseURLSSL + "/deleteFile" + url = f"{baseURLSSL}/deleteFile" data = {"file": file} - return curl.post(url, data) + return http_client.post(url, data) # touch file (obsolete) -def touchFile(sourceURL, filename): - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() +# TODO: is this really obsolete? I think it's used in panda cache +def touchFile(source_url, filename): + http_client = HttpClient() + # execute - url = sourceURL + "/server/panda/touchFile" + url = f"{source_url}/server/panda/touchFile" data = {"filename": filename} - return curl.post(url, data) + return http_client.post(url, data) -# get site specs def getSiteSpecs(siteType=None): - """Get list of site specifications + """ + Get list of site specifications args: siteType: type of sites @@ -1131,144 +655,26 @@ def getSiteSpecs(siteType=None): map of site and attributes """ - # instantiate curl - curl = _Curl() + + http_client = HttpClient() # execute - url = baseURL + "/getSiteSpecs" + url = f"{baseURL}/getSiteSpecs" data = {} if siteType is not None: data = {"siteType": siteType} - status, output = curl.get(url, data) - try: - return status, pickle_loads(output) - except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getSiteSpecs : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - - -# get cloud specs -def getCloudSpecs(): - """Get list of cloud specifications - - args: - returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - map of cloud and attributes - - """ - # instantiate curl - curl = _Curl() - # execute - url = baseURL + "/getCloudSpecs" - status, output = curl.get(url, {}) - try: - return status, pickle_loads(output) - except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getCloudSpecs : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - - -# get nPilots (obsolete) -def getNumPilots(): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + "/getNumPilots" - status, output = curl.get(url, {}) - try: - return status, pickle_loads(output) - except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getNumPilots : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr - - -# run brokerage -def runBrokerage(sites, atlasRelease, cmtConfig=None): - """Run brokerage - - args: - sites: the list of candidate sites - atlasRelease: version number of SW release - cmtConfig: cmt config - 
returns: - status code - 0: communication succeeded to the panda server - else: communication failure - the name of the selected site - - """ - # serialize - strSites = pickle_dumps(sites) - # instantiate curl - curl = _Curl() - # execute - url = baseURL + "/runBrokerage" - data = {"sites": strSites, "atlasRelease": atlasRelease} - if cmtConfig is not None: - data["cmtConfig"] = cmtConfig - return curl.get(url, data) - - -# get RW -def getRW(priority=0): - """Get the amount of workload queued in each cloud - - args: - priority: workload with higher priorities than this value - returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - map of cloud and the amount of workload - - """ - # instantiate curl - curl = _Curl() - # execute - url = baseURLBAMBOO + "/getRW" - # get RWs for high priority tasks - data = {"priority": priority} - status, output = curl.get(url, data) + status, output = http_client.get(url, data) try: return status, pickle_loads(output) except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getRW : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR getSiteSpecs : {error_type} {error_value}" + print(error_str) + return EC_Failed, f"{output}\n{error_str}" -# change job priorities (obsolete) -def changeJobPriorities(newPrioMap): - # serialize - newPrioMapStr = pickle_dumps(newPrioMap) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = baseURLSSL + "/changeJobPriorities" - data = {"newPrioMap": newPrioMapStr} - status, output = curl.post(url, data) - try: - return status, pickle_loads(output) - except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR changeJobPriorities : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr - - -# insert task params def insertTaskParams(taskParams): - """Insert task parameters + """ + Insert task parameters args: taskParams: a dictionary of task parameters @@ -1282,25 +688,24 @@ def insertTaskParams(taskParams): """ # serialize taskParamsStr = json.dumps(taskParams) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/insertTaskParams" + url = f"{baseURLSSL}/insertTaskParams" data = {"taskParams": taskParamsStr} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR insertTaskParams : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR insertTaskParams : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# kill task def killTask(jediTaskID, broadcast=False): - """Kill a task + """ + Kill a task args: jediTaskID: jediTaskID of the task to be killed @@ -1318,27 +723,24 @@ def killTask(jediTaskID, broadcast=False): 100: non SSL connection 101: irrelevant taskID """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/killTask" - data = {"jediTaskID": jediTaskID} - data["properErrorCode"] = True - data["broadcast"] = broadcast - status, output = curl.post(url, data) + url = f"{baseURLSSL}/killTask" + data = {"jediTaskID": jediTaskID, "properErrorCode": 
True, "broadcast": broadcast} + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR killTask : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR killTask : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# finish task def finishTask(jediTaskID, soft=False, broadcast=False): - """Finish a task + """ + Finish a task args: jediTaskID: jediTaskID of the task to be finished @@ -1360,29 +762,26 @@ def finishTask(jediTaskID, soft=False, broadcast=False): 100: non SSL connection 101: irrelevant taskID """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/finishTask" - data = {"jediTaskID": jediTaskID} - data["properErrorCode"] = True - data["broadcast"] = broadcast + url = f"{baseURLSSL}/finishTask" + data = {"jediTaskID": jediTaskID, "properErrorCode": True, "broadcast": broadcast} if soft: data["soft"] = True - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR finishTask : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR finishTask : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# reassign task to a site def reassignTaskToSite(jediTaskID, site, mode=None): - """Reassign a task to a site. Existing jobs are killed and new jobs are generated at the site + """ + Reassign a task to a site. Existing jobs are killed and new jobs are generated at the site args: jediTaskID: jediTaskID of the task to be reassigned @@ -1404,27 +803,26 @@ def reassignTaskToSite(jediTaskID, site, mode=None): maxSite = 60 if site is not None and len(site) > maxSite: return EC_Failed, f"site parameter is too long > {maxSite}chars" - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/reassignTask" + url = f"{baseURLSSL}/reassignTask" data = {"jediTaskID": jediTaskID, "site": site} if mode is not None: data["mode"] = mode - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR reassignTaskToSite : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR reassignTaskToSite : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# reassign task to a cloud def reassignTaskToCloud(jediTaskID, cloud, mode=None): - """Reassign a task to a cloud. Existing jobs are killed and new jobs are generated in the cloud + """ + Reassign a task to a cloud. 
Existing jobs are killed and new jobs are generated in the cloud

     args:
         jediTaskID: jediTaskID of the task to be reassigned
@@ -1443,27 +841,26 @@ def reassignTaskToCloud(jediTaskID, cloud, mode=None):
         100: non SSL connection
         101: irrelevant taskID
     """
-    # instantiate curl
-    curl = _Curl()
-    curl.sslCert = _x509()
-    curl.sslKey = _x509()
+
+    http_client = HttpClient()
+
     # execute
-    url = baseURLSSL + "/reassignTask"
+    url = f"{baseURLSSL}/reassignTask"
     data = {"jediTaskID": jediTaskID, "cloud": cloud}
     if mode is not None:
         data["mode"] = mode
-    status, output = curl.post(url, data)
+    status, output = http_client.post(url, data)
     try:
         return status, pickle_loads(output)
     except Exception:
-        errtype, errvalue = sys.exc_info()[:2]
-        errStr = f"ERROR reassignTaskToCloud : {errtype} {errvalue}"
-        return EC_Failed, output + "\n" + errStr
+        error_type, error_value = sys.exc_info()[:2]
+        error_str = f"ERROR reassignTaskToCloud : {error_type} {error_value}"
+        return EC_Failed, f"{output}\n{error_str}"


-# reassign task to a nucleus
 def reassignTaskToNucleus(jediTaskID, nucleus, mode=None):
-    """Reassign a task to a nucleus. Existing jobs are killed and new jobs are generated in the cloud
+    """
+    Reassign a task to a nucleus. Existing jobs are killed and new jobs are generated in the cloud

     args:
         jediTaskID: jediTaskID of the task to be reassigned
@@ -1482,27 +879,26 @@ def reassignTaskToNucleus(jediTaskID, nucleus, mode=None):
         100: non SSL connection
         101: irrelevant taskID
     """
-    # instantiate curl
-    curl = _Curl()
-    curl.sslCert = _x509()
-    curl.sslKey = _x509()
+
+    http_client = HttpClient()
+
     # execute
-    url = baseURLSSL + "/reassignTask"
+    url = f"{baseURLSSL}/reassignTask"
     data = {"jediTaskID": jediTaskID, "nucleus": nucleus}
     if mode is not None:
         data["mode"] = mode
-    status, output = curl.post(url, data)
+    status, output = http_client.post(url, data)
     try:
         return status, pickle_loads(output)
     except Exception:
-        errtype, errvalue = sys.exc_info()[:2]
-        errStr = f"ERROR reassignTaskToCloud : {errtype} {errvalue}"
-        return EC_Failed, output + "\n" + errStr
+        error_type, error_value = sys.exc_info()[:2]
+        error_str = f"ERROR reassignTaskToNucleus : {error_type} {error_value}"
+        return EC_Failed, f"{output}\n{error_str}"


-# upload log
 def uploadLog(logStr, logFileName):
-    """Upload sandbox
+    """
+    Upload log

     args:
         logStr: log message
@@ -1513,10 +909,9 @@ def uploadLog(logStr, logFileName):
         else: communication failure

     """
-    # instantiate curl
-    curl = _Curl()
-    curl.sslCert = _x509()
-    curl.sslKey = _x509()
+
+    http_client = HttpClient()
+
     # write log to a tmp file
     fh = tempfile.NamedTemporaryFile(delete=False)
     gfh = gzip.open(fh.name, mode="wb")
@@ -1525,16 +920,16 @@
     gfh.write(logStr)
     gfh.close()
     # execute
-    url = baseURLSSL + "/uploadLog"
+    url = f"{baseURLSSL}/uploadLog"
     data = {"file": f"{fh.name};filename={logFileName}"}
-    retVal = curl.put(url, data)
+    retVal = http_client.post_files(url, data)
     os.unlink(fh.name)
     return retVal


-# change task priority
 def changeTaskPriority(jediTaskID, newPriority):
-    """Change task priority
+    """
+    Change the task priority

     args:
         jediTaskID: jediTaskID of the task to change the priority
@@ -1548,25 +943,24 @@ def changeTaskPriority(jediTaskID, newPriority):
         1: succeeded
         None: database error
     """
-    # instantiate curl
-    curl = _Curl()
-    curl.sslCert = _x509()
-    curl.sslKey = _x509()
+
+    http_client = HttpClient()
+
     # execute
-    url = baseURLSSL + "/changeTaskPriority"
+    url = f"{baseURLSSL}/changeTaskPriority"
     data = {"jediTaskID": 
jediTaskID, "newPriority": newPriority} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR changeTaskPriority : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR changeTaskPriority : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# set debug mode def setDebugMode(pandaID, modeOn): - """Turn debug mode on/off for a job + """ + Turn debug mode for a job on/off args: pandaID: PandaID of the job @@ -1577,19 +971,18 @@ def setDebugMode(pandaID, modeOn): another: communication failure error message """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/setDebugMode" + url = f"{baseURLSSL}/setDebugMode" data = {"pandaID": pandaID, "modeOn": modeOn} - return curl.post(url, data) + return http_client.post(url, data) -# retry task -def retryTask(jediTaskID, verbose=False, noChildRetry=False, discardEvents=False, disable_staging_mode=False, keep_gshare_priority=False): - """Retry task +def retryTask(jediTaskID, noChildRetry=False, discardEvents=False, disable_staging_mode=False, keep_gshare_priority=False): + """ + Retry a task args: jediTaskID: jediTaskID of the task to retry @@ -1610,15 +1003,12 @@ def retryTask(jediTaskID, verbose=False, noChildRetry=False, discardEvents=False 100: non SSL connection 101: irrelevant taskID """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - curl.verbose = verbose + + http_client = HttpClient() + # execute - url = baseURLSSL + "/retryTask" - data = {"jediTaskID": jediTaskID} - data["properErrorCode"] = True + url = f"{baseURLSSL}/retryTask" + data = {"jediTaskID": jediTaskID, "properErrorCode": True} if noChildRetry: data["noChildRetry"] = True if discardEvents: @@ -1627,18 +1017,18 @@ def retryTask(jediTaskID, verbose=False, noChildRetry=False, discardEvents=False data["disable_staging_mode"] = True if keep_gshare_priority: data["keep_gshare_priority"] = True - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR retryTask : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR retryTask : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# reload input -def reloadInput(jediTaskID, verbose=False): - """Retry task +def reloadInput(jediTaskID): + """ + Reload the input for a task args: jediTaskID: jediTaskID of the task to retry @@ -1655,26 +1045,24 @@ def reloadInput(jediTaskID, verbose=False): 100: non SSL connection 101: irrelevant taskID """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - curl.verbose = verbose + + http_client = HttpClient() + # execute - url = baseURLSSL + "/reloadInput" + url = f"{baseURLSSL}/reloadInput" data = {"jediTaskID": jediTaskID} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR reloadInput : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = 
sys.exc_info()[:2] + error_str = f"ERROR reloadInput : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# change task walltime def changeTaskWalltime(jediTaskID, wallTime): - """Change task priority + """ + Change task walltime args: jediTaskID: jediTaskID of the task to change the priority @@ -1688,25 +1076,24 @@ def changeTaskWalltime(jediTaskID, wallTime): 1: succeeded None: database error """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/changeTaskAttributePanda" + url = f"{baseURLSSL}/changeTaskAttributePanda" data = {"jediTaskID": jediTaskID, "attrName": "wallTime", "attrValue": wallTime} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR changeTaskWalltime : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR changeTaskWalltime : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# change task cputime def changeTaskCputime(jediTaskID, cpuTime): - """Change task cpuTime + """ + Change task CPU time args: jediTaskID: jediTaskID of the task to change the priority @@ -1720,25 +1107,24 @@ def changeTaskCputime(jediTaskID, cpuTime): 1: succeeded None: database error """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/changeTaskAttributePanda" + url = f"{baseURLSSL}/changeTaskAttributePanda" data = {"jediTaskID": jediTaskID, "attrName": "cpuTime", "attrValue": cpuTime} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR changeTaskCputime : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR changeTaskCputime : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# change task RAM count def changeTaskRamCount(jediTaskID, ramCount): - """Change task priority + """ + Change task RAM count args: jediTaskID: jediTaskID of the task to change the priority @@ -1752,25 +1138,24 @@ def changeTaskRamCount(jediTaskID, ramCount): 1: succeeded None: database error """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/changeTaskAttributePanda" + url = f"{baseURLSSL}/changeTaskAttributePanda" data = {"jediTaskID": jediTaskID, "attrName": "ramCount", "attrValue": ramCount} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR changeTaskRamCount : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR changeTaskRamCount : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# change task attribute def changeTaskAttribute(jediTaskID, attrName, attrValue): - """Change task attribute + """ + Change task attribute args: jediTaskID: jediTaskID of the task to change the attribute @@ -1780,31 +1165,30 @@ def 
changeTaskAttribute(jediTaskID, attrName, attrValue):
         status code
           0: communication succeeded to the panda server
           255: communication failure
-        return: a tupple of return code and message
+        return: a tuple of return code and message
           0: unknown task
           1: succeeded
           2: disallowed to update the attribute
           None: database error
     """
-    # instantiate curl
-    curl = _Curl()
-    curl.sslCert = _x509()
-    curl.sslKey = _x509()
+
+    http_client = HttpClient()
+
     # execute
-    url = baseURLSSL + "/changeTaskAttributePanda"
+    url = f"{baseURLSSL}/changeTaskAttributePanda"
     data = {"jediTaskID": jediTaskID, "attrName": attrName, "attrValue": attrValue}
-    status, output = curl.post(url, data)
+    status, output = http_client.post(url, data)
     try:
         return status, pickle_loads(output)
     except Exception:
-        errtype, errvalue = sys.exc_info()[:2]
-        errStr = f"ERROR changeTaskAttributePanda : {errtype} {errvalue}"
-        return EC_Failed, output + "\n" + errStr
+        error_type, error_value = sys.exc_info()[:2]
+        error_str = f"ERROR changeTaskAttributePanda : {error_type} {error_value}"
+        return EC_Failed, f"{output}\n{error_str}"


-# change split rule for task
 def changeTaskSplitRule(jediTaskID, ruleName, ruleValue):
-    """Change split rule fo task
+    """
+    Change split rule for task

     args:
         jediTaskID: jediTaskID of the task to change the rule
@@ -1814,31 +1198,30 @@ def changeTaskSplitRule(jediTaskID, ruleName, ruleValue):
         status code
           0: communication succeeded to the panda server
           255: communication failure
-        return: a tupple of return code and message
+        return: a tuple of return code and message
           0: unknown task
           1: succeeded
           2: disallowed to update the attribute
           None: database error
     """
-    # instantiate curl
-    curl = _Curl()
-    curl.sslCert = _x509()
-    curl.sslKey = _x509()
+
+    http_client = HttpClient()
+
     # execute
-    url = baseURLSSL + "/changeTaskSplitRulePanda"
+    url = f"{baseURLSSL}/changeTaskSplitRulePanda"
     data = {"jediTaskID": jediTaskID, "attrName": ruleName, "attrValue": ruleValue}
-    status, output = curl.post(url, data)
+    status, output = http_client.post(url, data)
     try:
         return status, pickle_loads(output)
     except Exception:
-        errtype, errvalue = sys.exc_info()[:2]
-        errStr = f"ERROR changeTaskSplitRule : {errtype} {errvalue}"
-        return EC_Failed, output + "\n" + errStr
+        error_type, error_value = sys.exc_info()[:2]
+        error_str = f"ERROR changeTaskSplitRule : {error_type} {error_value}"
+        return EC_Failed, f"{output}\n{error_str}"


-# pause task
-def pauseTask(jediTaskID, verbose=False):
-    """Pause task
+def pauseTask(jediTaskID):
+    """
+    Pause task

     args:
         jediTaskID: jediTaskID of the task to pause
@@ -1855,26 +1238,24 @@ def pauseTask(jediTaskID, verbose=False):
         100: non SSL connection
         101: irrelevant taskID
     """
-    # instantiate curl
-    curl = _Curl()
-    curl.sslCert = _x509()
-    curl.sslKey = _x509()
-    curl.verbose = verbose
+
+    http_client = HttpClient()
+
     # execute
-    url = baseURLSSL + "/pauseTask"
+    url = f"{baseURLSSL}/pauseTask"
     data = {"jediTaskID": jediTaskID}
-    status, output = curl.post(url, data)
+    status, output = http_client.post(url, data)
     try:
         return status, pickle_loads(output)
     except Exception:
-        errtype, errvalue = sys.exc_info()[:2]
-        errStr = f"ERROR pauseTask : {errtype} {errvalue}"
-        return EC_Failed, output + "\n" + errStr
+        error_type, error_value = sys.exc_info()[:2]
+        error_str = f"ERROR pauseTask : {error_type} {error_value}"
+        return EC_Failed, f"{output}\n{error_str}"


-# resume task
-def resumeTask(jediTaskID, verbose=False):
-    """Resume task
+def resumeTask(jediTaskID):
+    """
+    Resume task

     args:
         jediTaskID: jediTaskID of the task to release
@@ -1891,26 +1272,24 @@ def resumeTask(jediTaskID, verbose=False):
         100: non SSL connection
         101: irrelevant taskID
     """
-    # instantiate curl
-    curl = _Curl()
-    curl.sslCert = _x509()
-    curl.sslKey = _x509()
-    curl.verbose = verbose
+
+    http_client = HttpClient()
+
     # execute
-    url = baseURLSSL + "/resumeTask"
+    url = f"{baseURLSSL}/resumeTask"
     data = {"jediTaskID": jediTaskID}
-    status, output = curl.post(url, data)
+    status, output = http_client.post(url, data)
     try:
         return status, pickle_loads(output)
     except Exception:
-        errtype, errvalue = sys.exc_info()[:2]
-        errStr = f"ERROR resumeTask : {errtype} {errvalue}"
-        return EC_Failed, output + "\n" + errStr
+        error_type, error_value = sys.exc_info()[:2]
+        error_str = f"ERROR resumeTask : {error_type} {error_value}"
+        return EC_Failed, f"{output}\n{error_str}"


-# avalanche task
-def avalancheTask(jediTaskID, verbose=False):
-    """force avalanche for task
+def avalancheTask(jediTaskID):
+    """
+    Force avalanche for task

     args:
         jediTaskID: jediTaskID of the task to avalanche
@@ -1927,26 +1306,24 @@ def avalancheTask(jediTaskID, verbose=False):
         100: non SSL connection
         101: irrelevant taskID
     """
-    # instantiate curl
-    curl = _Curl()
-    curl.sslCert = _x509()
-    curl.sslKey = _x509()
-    curl.verbose = verbose
+
+    http_client = HttpClient()
+
     # execute
-    url = baseURLSSL + "/avalancheTask"
+    url = f"{baseURLSSL}/avalancheTask"
     data = {"jediTaskID": jediTaskID}
-    status, output = curl.post(url, data)
+    status, output = http_client.post(url, data)
     try:
         return status, pickle_loads(output)
     except Exception:
-        errtype, errvalue = sys.exc_info()[:2]
-        errStr = f"ERROR resumeTask : {errtype} {errvalue}"
-        return EC_Failed, output + "\n" + errStr
+        error_type, error_value = sys.exc_info()[:2]
+        error_str = f"ERROR avalancheTask : {error_type} {error_value}"
+        return EC_Failed, f"{output}\n{error_str}"


-# increase attempt number for unprocessed files
 def increaseAttemptNr(jediTaskID, increase):
-    """Change task priority
+    """
+    Increase attempt numbers for unprocessed files

     args:
         jediTaskID: jediTaskID of the task to increase attempt numbers
@@ -1963,25 +1340,24 @@ def increaseAttemptNr(jediTaskID, increase):
         4: wrong parameter
         None: database error
     """
-    # instantiate curl
-    curl = _Curl()
-    curl.sslCert = _x509()
-    curl.sslKey = _x509()
+
+    http_client = HttpClient()
+
     # execute
-    url = baseURLSSL + "/increaseAttemptNrPanda"
+    url = f"{baseURLSSL}/increaseAttemptNrPanda"
     data = {"jediTaskID": jediTaskID, "increasedNr": increase}
-    status, output = curl.post(url, data)
+    status, output = http_client.post(url, data)
     try:
         return status, pickle_loads(output)
     except Exception:
-        errtype, errvalue = sys.exc_info()[:2]
-        errStr = f"ERROR increaseAttemptNr : {errtype} {errvalue}"
-        return EC_Failed, output + "\n" + errStr
+        error_type, error_value = sys.exc_info()[:2]
+        error_str = f"ERROR increaseAttemptNr : {error_type} {error_value}"
+        return EC_Failed, f"{output}\n{error_str}"


-# kill unfinished jobs
-def killUnfinishedJobs(jediTaskID, code=None, verbose=False, srvID=None, useMailAsID=False):
-    """Kill unfinished jobs in a task. Normal users can kill only their own jobs.
+def killUnfinishedJobs(jediTaskID, code=None, useMailAsID=False):
+    """
+    Kill unfinished jobs in a task. Normal users can kill only their own jobs.
     People with production VOMS role can kill any jobs.
     Running jobs are killed when next heartbeat comes from the pilot.
     Set code=9 if running jobs need to be killed immediately.
@@ -1993,12 +1369,10 @@ def killUnfinishedJobs(jediTaskID, code=None, verbose=False, srvID=None, useMail 3: aborted 4: expire in waiting 7: retry by server - 8: rebrokerage + 8: re-brokerage 9: force kill 50: kill by JEDI 91: kill user jobs with prod role - verbose: set True to see what's going on - srvID: obsolete useMailAsID: obsolete returns: status code @@ -2006,27 +1380,25 @@ def killUnfinishedJobs(jediTaskID, code=None, verbose=False, srvID=None, useMail 255: communication failure the list of clouds (or Nones if tasks are not yet assigned) """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - curl.verbose = verbose + + http_client = HttpClient() + # execute - url = _getURL("URLSSL", srvID) + "/killUnfinishedJobs" + url = f"{baseURLSSL}/killUnfinishedJobs" data = {"jediTaskID": jediTaskID, "code": code, "useMailAsID": useMailAsID} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR killUnfinishedJobs : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR killUnfinishedJobs : {error_type} {error_value}" + print(error_str) + return EC_Failed, f"{output}\n{error_str}" -# trigger task brokerage def triggerTaskBrokerage(jediTaskID): - """Trigger task brokerge + """ + Trigger task brokerage args: jediTaskID: jediTaskID of the task to change the attribute @@ -2034,30 +1406,29 @@ def triggerTaskBrokerage(jediTaskID): status code 0: communication succeeded to the panda server 255: communication failure - return: a tupple of return code and message + return: a tuple of return code and message 0: unknown task 1: succeeded None: database error """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/changeTaskModTimePanda" + url = f"{baseURLSSL}/changeTaskModTimePanda" data = {"jediTaskID": jediTaskID, "diffValue": -12} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR triggerTaskBrokerage : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR triggerTaskBrokerage : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# get PanDA IDs with TaskID def getPandaIDsWithTaskID(jediTaskID): - """Get PanDA IDs with TaskID + """ + Get PanDA IDs with TaskID args: jediTaskID: jediTaskID of the task to get lit of PanDA IDs @@ -2067,24 +1438,24 @@ def getPandaIDsWithTaskID(jediTaskID): 255: communication failure the list of PanDA IDs """ - # instantiate curl - curl = _Curl() + + http_client = HttpClient() # execute - url = baseURL + "/getPandaIDsWithTaskID" + url = f"{baseURL}/getPandaIDsWithTaskID" data = {"jediTaskID": jediTaskID} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getPandaIDsWithTaskID : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR getPandaIDsWithTaskID : {error_type} {error_value}" + print(error_str) + return 
EC_Failed, f"{output}\n{error_str}" -# reactivate task def reactivateTask(jediTaskID, keep_attempt_nr=False, trigger_job_generation=False): - """Reactivate task + """ + Reactivate task args: jediTaskID: jediTaskID of the task to be reactivated @@ -2094,34 +1465,33 @@ def reactivateTask(jediTaskID, keep_attempt_nr=False, trigger_job_generation=Fal status code 0: communication succeeded to the panda server 255: communication failure - return: a tupple of return code and message + return: a tuple of return code and message 0: unknown task 1: succeeded None: database error """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/reactivateTask" + url = f"{baseURLSSL}/reactivateTask" data = {"jediTaskID": jediTaskID} if keep_attempt_nr: data["keep_attempt_nr"] = True if trigger_job_generation: data["trigger_job_generation"] = True - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR reactivateTask : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR reactivateTask : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# get task status TaskID def getTaskStatus(jediTaskID): - """Get task status + """ + Get task status for a particular task ID args: jediTaskID: jediTaskID of the task to get lit of PanDA IDs @@ -2131,27 +1501,28 @@ def getTaskStatus(jediTaskID): 255: communication failure the status string """ - # instantiate curl - curl = _Curl() + + http_client = HttpClient() # execute - url = baseURL + "/getTaskStatus" + url = f"{baseURL}/getTaskStatus" data = {"jediTaskID": jediTaskID} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getTaskStatus : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR getTaskStatus : {error_type} {error_value}" + print(error_str) + return EC_Failed, f"{output}\n{error_str}" -# reassign specified tasks (and their jobs) to a new share def reassignShare(jedi_task_ids, share, reassign_running=False): """ + Reassign specified tasks (and their jobs) to a new share + args: jedi_task_ids: task ids to act on - share: share to be applied to jeditaskids + share: share to be applied to jedi task ids returns: status code 0: communication succeeded to the panda server @@ -2161,66 +1532,31 @@ def reassignShare(jedi_task_ids, share, reassign_running=False): 0: success None: database error """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() jedi_task_ids_pickle = pickle_dumps(jedi_task_ids) change_running_pickle = pickle_dumps(reassign_running) # execute - url = baseURLSSL + "/reassignShare" + url = f"{baseURLSSL}/reassignShare" data = { "jedi_task_ids_pickle": jedi_task_ids_pickle, "share": share, "reassign_running": change_running_pickle, } - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - err_type, err_value = sys.exc_info()[:2] - err_str = f"ERROR reassignShare : {err_type} {err_value}" - return EC_Failed, 
f"{output}\n{err_str}" + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR reassignShare : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# list tasks in a particular share and optionally status -def listTasksInShare(gshare, status="running"): - """ - args: - gshare: global share - status: task status, running by default - returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - return: a tuple of return code and jedi_task_ids - 1: logical error - 0: success - None: database error - """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - - # execute - url = baseURLSSL + "/listTasksInShare" - data = {"gshare": gshare, "status": status} - status, output = curl.post(url, data) - - try: - return status, pickle_loads(output) - except Exception: - err_type, err_value = sys.exc_info()[:2] - err_str = f"ERROR listTasksInShare : {err_type} {err_value}" - return EC_Failed, f"{output}\n{err_str}" - - -# get taskParamsMap with TaskID def getTaskParamsMap(jediTaskID): - """Get task status + """ + Get task parameter map for a certain task ID args: jediTaskID: jediTaskID of the task to get taskParamsMap @@ -2233,24 +1569,24 @@ def getTaskParamsMap(jediTaskID): 0: success None: database error """ - # instantiate curl - curl = _Curl() + + http_client = HttpClient() # execute - url = baseURL + "/getTaskParamsMap" + url = f"{baseURL}/getTaskParamsMap" data = {"jediTaskID": jediTaskID} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, pickle_loads(output) except Exception: - type, value, traceBack = sys.exc_info() - errStr = f"ERROR getTaskParamsMap : {type} {value}" - print(errStr) - return EC_Failed, output + "\n" + errStr + error_type, error_value, _ = sys.exc_info() + error_str = f"ERROR getTaskParamsMap : {error_type} {error_value}" + print(error_str) + return EC_Failed, f"{output}\n{error_str}" -# set num slots for workload provisioning def setNumSlotsForWP(pandaQueueName, numSlots, gshare=None, resourceType=None, validPeriod=None): - """Set num slots for workload provisioning + """ + Set num slots for workload provisioning args: pandaQueueName: Panda Queue name @@ -2269,12 +1605,11 @@ def setNumSlotsForWP(pandaQueueName, numSlots, gshare=None, resourceType=None, v 101: missing production role 102: type error for some parameters """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/setNumSlotsForWP" + url = f"{baseURLSSL}/setNumSlotsForWP" data = {"pandaQueueName": pandaQueueName, "numSlots": numSlots} if gshare is not None: data["gshare"] = gshare @@ -2282,18 +1617,19 @@ def setNumSlotsForWP(pandaQueueName, numSlots, gshare=None, resourceType=None, v data["resourceType"] = resourceType if validPeriod is not None: data["validPeriod"] = validPeriod - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, json.loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR setNumSlotsForWP : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR setNumSlotsForWP : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" # enable jumbo jobs def enableJumboJobs(jediTaskID, totalJumboJobs=1, nJumboPerSite=1): - """Enable jumbo jobs + """ + Enable jumbo jobs 
for a task args: jediTaskID: jediTaskID of the task @@ -2310,60 +1646,29 @@ def enableJumboJobs(jediTaskID, totalJumboJobs=1, nJumboPerSite=1): 101: missing production role 102: type error for some parameters """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() + # execute - url = baseURLSSL + "/enableJumboJobs" + url = f"{baseURLSSL}/enableJumboJobs" data = { "jediTaskID": jediTaskID, "nJumboJobs": totalJumboJobs, "nJumboPerSite": nJumboPerSite, } - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, json.loads(output) except Exception: - errtype, errvalue = sys.exc_info()[:2] - errStr = f"ERROR /enableJumboJobs : {errtype} {errvalue}" - return EC_Failed, output + "\n" + errStr + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR /enableJumboJobs : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# get Global Share status -def getGShareStatus(): - """ - - returns: - status code - 0: communication succeeded to the panda server - 255: communication failure - tuple of return code and diagnostic message - 0: succeeded - 1: server error - 100: non SSL connection - 101: missing production role - 102: type error for some parameters - """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = baseURLSSL + "/getGShareStatus" - - status, output = curl.post(url, {}) - try: - return status, json.loads(output) - except Exception: - err_type, err_value = sys.exc_info()[:2] - err_str = f"ERROR /getGShareStatus : {err_type} {err_value}" - return EC_Failed, output + "\n" + err_str - - -# send a harvester command to panda server in order sweep a panda queue def sweepPQ(panda_queue, status_list, ce_list, submission_host_list): """ + Send a harvester command to panda server in order sweep a panda queue + args: panda_queue: panda queue name status_list: list with statuses to sweep, e.g. ['submitted'] @@ -2377,10 +1682,8 @@ def sweepPQ(panda_queue, status_list, ce_list, submission_host_list): False: logical error True: success """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() panda_queue_json = json.dumps(panda_queue) status_list_json = json.dumps(status_list) @@ -2388,26 +1691,27 @@ def sweepPQ(panda_queue, status_list, ce_list, submission_host_list): submission_host_list_json = json.dumps(submission_host_list) # execute - url = baseURLSSL + "/sweepPQ" + url = f"{baseURLSSL}/sweepPQ" data = { "panda_queue": panda_queue_json, "status_list": status_list_json, "ce_list": ce_list_json, "submission_host_list": submission_host_list_json, } - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, json.loads(output) except Exception: - err_type, err_value = sys.exc_info()[:2] - err_str = f"ERROR sweepPQ : {err_type} {err_value}" - return EC_Failed, f"{output}\n{err_str}" + error_type, error_value = sys.exc_info()[:2] + error_str = f"ERROR sweepPQ : {error_type} {error_value}" + return EC_Failed, f"{output}\n{error_str}" -# send a command to a job def send_command_to_job(panda_id, com): """ + Send a command to a job + args: panda_id: PandaID of the job com: a command string passed to the pilot. 
max 250 chars @@ -2419,29 +1723,25 @@ def send_command_to_job(panda_id, com): False: failed True: the command received """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() + + http_client = HttpClient() # execute - url = baseURLSSL + "/send_command_to_job" + url = f"{baseURLSSL}/send_command_to_job" data = {"panda_id": panda_id, "com": com} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, json.loads(output) except Exception as e: - err_str = f"ERROR send_command_to_job : {str(e)}" - return EC_Failed, f"{output}\n{err_str}" + error_str = f"ERROR send_command_to_job : {str(e)}" + return EC_Failed, f"{output}\n{error_str}" -# get ban list -def get_ban_users(verbose=False): - """Get ban user list +def get_ban_users(): + """ + Get list of banned users - args: - verbose: set True to see what's going on returns: status code True: communication succeeded to the panda server @@ -2449,14 +1749,14 @@ def get_ban_users(verbose=False): """ - # instantiate curl - curl = _Curl() - curl.verbose = verbose + + http_client = HttpClient() + # execute - url = baseURL + "/get_ban_users" + url = f"{baseURL}/get_ban_users" output = None try: - status, output = curl.post(url, {}) + status, output = http_client.post(url, {}) if status == 0: return json.loads(output) else: @@ -2465,8 +1765,9 @@ def get_ban_users(verbose=False): return False, f"broken response: {output}" -def release_task(jedi_task_id, verbose=False): - """release task from staging +def release_task(jedi_task_id): + """ + Release task from staging args: jedi_task_id: jediTaskID of the task to avalanche @@ -2483,17 +1784,15 @@ def release_task(jedi_task_id, verbose=False): 100: non SSL connection 101: irrelevant taskID """ - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - curl.verbose = verbose + + http_client = HttpClient() + # execute - url = baseURLSSL + "/release_task" + url = f"{baseURLSSL}/release_task" data = {"jedi_task_id": jedi_task_id} - status, output = curl.post(url, data) + status, output = http_client.post(url, data) try: return status, json.loads(output) except Exception as e: - err_str = f"ERROR release_task : failed with {str(e)}" - return EC_Failed, output + "\n" + err_str + error_str = f"ERROR release_task : failed with {str(e)}" + return EC_Failed, f"{output}\n{error_str}" diff --git a/pandaserver/userinterface/UserIF.py b/pandaserver/userinterface/UserIF.py index 082aa2e86..f96cd239a 100644 --- a/pandaserver/userinterface/UserIF.py +++ b/pandaserver/userinterface/UserIF.py @@ -31,9 +31,28 @@ except ImportError: pass +MESSAGE_SSL = "SSL secure connection is required" +MESSAGE_PROD_ROLE = "production or pilot role required" +MESSAGE_TASK_ID = "jediTaskID must be an integer" +MESSAGE_DATABASE = "database error in the PanDA server" +MESSAGE_JSON = "failed to load JSON" + +CODE_SSL = 100 +CODE_LOGIC = 101 +CODE_OTHER_PARAMS = 102 + + _logger = PandaLogger().getLogger("UserIF") +def resolve_true(variable): + return variable == "True" + + +def resolve_false(variable): + return variable != "False" + + # main class class UserIF: # constructor @@ -76,8 +95,8 @@ def submitJobs(self, jobsStr, user, host, userFQANs, prodRole=False, toPending=F good_labels_message = f"submitJobs {user} wrong job_label={tmpJob.job_label}" break except Exception: - errType, errValue = sys.exc_info()[:2] - _logger.error(f"submitJobs : checking good_labels {errType} {errValue}") + err_type, err_value = sys.exc_info()[:2] + 
_logger.error(f"submitJobs : checking good_labels {err_type} {err_value}") good_labels = False # reject injection for error with the labels @@ -107,7 +126,7 @@ def submitJobs(self, jobsStr, user, host, userFQANs, prodRole=False, toPending=F if job0.prodUserName and job0.prodUserName.lower() != "none": user = job0.prodUserName except AttributeError: - _logger.error("submitJobs : checking username for userVO[%s]: username not found, defaulting to %s. %s %s" % (userVO, user)) + _logger.error(f"submitJobs : checking username for userVO {userVO}: username not found, defaulting to {user}.") # store jobs ret = self.taskBuffer.storeJobs( @@ -123,46 +142,6 @@ def submitJobs(self, jobsStr, user, host, userFQANs, prodRole=False, toPending=F # serialize return WrappedPickle.dumps(ret) - # get serial number for group job - def getSerialNumberForGroupJob(self, name): - # get - ret = self.taskBuffer.getSerialNumberForGroupJob(name) - # serialize - return WrappedPickle.dumps(ret) - - # change job priorities - def changeJobPriorities(self, user, prodRole, newPrioMapStr): - # check production role - if not prodRole: - return False, "production role is required" - try: - # deserialize map - newPrioMap = WrappedPickle.loads(newPrioMapStr) - _logger.debug(f"changeJobPriorities {user} : {str(newPrioMap)}") - # change - ret = self.taskBuffer.changeJobPriorities(newPrioMap) - except Exception: - errType, errValue = sys.exc_info()[:2] - _logger.error(f"changeJobPriorities : {errType} {errValue}") - return False, "internal server error" - # serialize - return ret - - # retry failed subjobs in running job - def retryFailedJobsInActive(self, dn, jobID): - returnVal = False - try: - _logger.debug(f"retryFailedJobsInActive {dn} JobID:{jobID}") - cUID = self.taskBuffer.cleanUserID(dn) - tmpRet = self.taskBuffer.finalizePendingJobs(cUID, jobID) - returnVal = True - except Exception: - errType, errValue = sys.exc_info()[:2] - _logger.error(f"retryFailedJobsInActive: {errType} {errValue}") - returnVal = "ERROR: server side crash" - # return - return returnVal - # set debug mode def setDebugMode(self, dn, pandaID, prodManager, modeOn, workingGroup): ret = self.taskBuffer.setDebugMode(dn, pandaID, prodManager, modeOn, workingGroup) @@ -209,27 +188,6 @@ def getJobStatus(self, idsStr, use_json, no_pickle=False): return JobUtils.dump_jobs_json(ret) return WrappedPickle.dumps(ret) - # get PandaID with jobexeID - def getPandaIDwithJobExeID(self, idsStr): - try: - # deserialize jobspecs - ids = WrappedPickle.loads(idsStr) - _logger.debug(f"getPandaIDwithJobExeID len : {len(ids)}") - maxIDs = 5500 - if len(ids) > maxIDs: - _logger.error(f"too long ID list more than {maxIDs}") - ids = ids[:maxIDs] - except Exception: - errtype, errvalue = sys.exc_info()[:2] - _logger.error(f"getPandaIDwithJobExeID : {errtype} {errvalue}") - ids = [] - _logger.debug(f"getPandaIDwithJobExeID start : {ids}") - # peek jobs - ret = self.taskBuffer.getPandaIDwithJobExeID(ids) - _logger.debug("getPandaIDwithJobExeID end") - # serialize - return WrappedPickle.dumps(ret) - # get PandaIDs with TaskID def getPandaIDsWithTaskID(self, jediTaskID): # get PandaIDs @@ -237,39 +195,15 @@ def getPandaIDsWithTaskID(self, jediTaskID): # serialize return WrappedPickle.dumps(ret) - # get active datasets - def getActiveDatasets(self, computingSite, prodSourceLabel): - # run - ret = self.taskBuffer.getActiveDatasets(computingSite, prodSourceLabel) - # return - return ret - # get job statistics def getJobStatistics(self, sourcetype=None): # get job statistics ret = 
self.taskBuffer.getJobStatisticsForExtIF(sourcetype) - # serialize - return WrappedPickle.dumps(ret) - - # get highest prio jobs - def getHighestPrioJobStat(self, perPG=False, useMorePG=False): - # get job statistics - ret = self.taskBuffer.getHighestPrioJobStat(perPG, useMorePG) - # serialize - return WrappedPickle.dumps(ret) - - # get queued analysis jobs at a site - def getQueuedAnalJobs(self, site, dn): - # get job statistics - ret = self.taskBuffer.getQueuedAnalJobs(site, dn) - # serialize return WrappedPickle.dumps(ret) # get job statistics for Bamboo def getJobStatisticsForBamboo(self, useMorePG=False): - # get job statistics ret = self.taskBuffer.getJobStatisticsForBamboo(useMorePG) - # serialize return WrappedPickle.dumps(ret) # get job statistics per site @@ -291,115 +225,18 @@ def getJobStatisticsPerSite( jobType, minPriority=minPriority, ) - # serialize return WrappedPickle.dumps(ret, convert_to_safe=True) # get job statistics per site and resource def getJobStatisticsPerSiteResource(self, timeWindow): - # get job statistics ret = self.taskBuffer.getJobStatisticsPerSiteResource(timeWindow) - # serialize return json.dumps(ret) # get job statistics per site, source label, and resource type def get_job_statistics_per_site_label_resource(self, time_window): - # get job statistics ret = self.taskBuffer.get_job_statistics_per_site_label_resource(time_window) - # serialize return json.dumps(ret) - # get the number of waiting jobs per site and use - def getJobStatisticsPerUserSite(self): - # get job statistics - ret = self.taskBuffer.getJobStatisticsPerUserSite() - # serialize - return WrappedPickle.dumps(ret) - - # get job statistics per site with label - def getJobStatisticsWithLabel(self, site): - # get job statistics - ret = self.taskBuffer.getJobStatisticsWithLabel(site) - # serialize - return WrappedPickle.dumps(ret) - - # query PandaIDs - def queryPandaIDs(self, idsStr): - # deserialize IDs - ids = WrappedPickle.loads(idsStr) - # query PandaIDs - ret = self.taskBuffer.queryPandaIDs(ids) - # serialize - return WrappedPickle.dumps(ret) - - # get number of analysis jobs per user - def getNUserJobs(self, siteName): - # get - ret = self.taskBuffer.getNUserJobs(siteName) - # serialize - return WrappedPickle.dumps(ret) - - # query job info per cloud - def queryJobInfoPerCloud(self, cloud, schedulerID): - # query PandaIDs - ret = self.taskBuffer.queryJobInfoPerCloud(cloud, schedulerID) - # serialize - return WrappedPickle.dumps(ret) - - # query PandaIDs at site - def getPandaIDsSite(self, site, status, limit): - # query PandaIDs - ret = self.taskBuffer.getPandaIDsSite(site, status, limit) - # serialize - return WrappedPickle.dumps(ret) - - # get PandaIDs to be updated in prodDB - def getJobsToBeUpdated(self, limit, lockedby): - # query PandaIDs - ret = self.taskBuffer.getPandaIDsForProdDB(limit, lockedby) - # serialize - return WrappedPickle.dumps(ret) - - # update prodDBUpdateTimes - def updateProdDBUpdateTimes(self, paramsStr): - # deserialize IDs - params = WrappedPickle.loads(paramsStr) - # get jobs - ret = self.taskBuffer.updateProdDBUpdateTimes(params) - # serialize - return WrappedPickle.dumps(True) - - # query last files in datasets - def queryLastFilesInDataset(self, datasetStr): - # deserialize names - datasets = WrappedPickle.loads(datasetStr) - # get files - ret = self.taskBuffer.queryLastFilesInDataset(datasets) - # serialize - return WrappedPickle.dumps(ret) - - # get input files currently in used for analysis - def getFilesInUseForAnal(self, outDataset): - # get files - 
ret = self.taskBuffer.getFilesInUseForAnal(outDataset) - # serialize - return WrappedPickle.dumps(ret) - - # get list of dis dataset to get input files in shadow - def getDisInUseForAnal(self, outDataset): - # get files - ret = self.taskBuffer.getDisInUseForAnal(outDataset) - # serialize - return WrappedPickle.dumps(ret) - - # get input LFNs currently in use for analysis with shadow dis - def getLFNsInUseForAnal(self, inputDisListStr): - # deserialize IDs - inputDisList = WrappedPickle.loads(inputDisListStr) - # get files - ret = self.taskBuffer.getLFNsInUseForAnal(inputDisList) - # serialize - return WrappedPickle.dumps(ret) - # kill jobs def killJobs(self, idsStr, user, host, code, prodManager, useMailAsID, fqans, killOpts=[]): # deserialize IDs @@ -419,8 +256,8 @@ def killJobs(self, idsStr, user, host, code, prodManager, useMailAsID, fqans, ki break time.sleep(1) except Exception: - errType, errValue = sys.exc_info()[:2] - _logger.error(f"killJob : failed to convert email address {user} : {errType} {errValue}") + err_type, err_value = sys.exc_info()[:2] + _logger.error(f"killJob : failed to convert email address {user} : {err_type} {err_value}") # get working groups with prod role wgProdRole = [] for fqan in fqans: @@ -434,7 +271,6 @@ def killJobs(self, idsStr, user, host, code, prodManager, useMailAsID, fqans, ki wgProdRole.append(f"gr_{tmpWG}") # kill jobs ret = self.taskBuffer.killJobs(ids, user, code, prodManager, wgProdRole, killOpts) - # serialize return WrappedPickle.dumps(ret) # reassign jobs @@ -447,16 +283,6 @@ def reassignJobs(self, idsStr, user, host, forPending, firstSubmission): forPending=forPending, firstSubmission=firstSubmission, ) - # serialize - return WrappedPickle.dumps(ret) - - # resubmit jobs - def resubmitJobs(self, idsStr): - # deserialize IDs - ids = WrappedPickle.loads(idsStr) - # kill jobs - ret = self.taskBuffer.resubmitJobs(ids) - # serialize return WrappedPickle.dumps(ret) # get list of site spec @@ -481,106 +307,31 @@ def getSiteSpecs(self, siteType="analysis"): continue tmpSpec[attr] = getattr(spec, attr) specList[id] = tmpSpec - # serialize return WrappedPickle.dumps(specList) - # get list of cloud spec - def getCloudSpecs(self): - # get cloud list - siteMapper = SiteMapper(self.taskBuffer) - # serialize - return WrappedPickle.dumps(siteMapper.cloudSpec) - - # get nPilots - def getNumPilots(self): - # get nPilots - ret = self.taskBuffer.getCurrentSiteData() - numMap = {} - for siteID in ret: - siteNumMap = ret[siteID] - nPilots = 0 - # nPilots = getJob+updateJob - if "getJob" in siteNumMap: - nPilots += siteNumMap["getJob"] - if "updateJob" in siteNumMap: - nPilots += siteNumMap["updateJob"] - # append - numMap[siteID] = {"nPilots": nPilots} - # serialize - return WrappedPickle.dumps(numMap) - # get script for offline running def getScriptOfflineRunning(self, pandaID, days=None): - # register ret = self.taskBuffer.getScriptOfflineRunning(pandaID, days) - # return return ret # get ban users def get_ban_users(self): ret = self.taskBuffer.get_ban_users() - # return return json.dumps(ret) # get client version def getPandaClientVer(self): - # get ret = self.taskBuffer.getPandaClientVer() - # return return ret - # get slimmed file info with PandaIDs - def getSlimmedFileInfoPandaIDs(self, pandaIDsStr, dn): - try: - # deserialize IDs - pandaIDs = WrappedPickle.loads(pandaIDsStr) - # truncate - maxIDs = 5500 - if len(pandaIDs) > maxIDs: - _logger.error(f"getSlimmedFileInfoPandaIDs: too long ID list more than {maxIDs}") - pandaIDs = pandaIDs[:maxIDs] - # get - 
_logger.debug(f"getSlimmedFileInfoPandaIDs start : {dn} {len(pandaIDs)}") - ret = self.taskBuffer.getSlimmedFileInfoPandaIDs(pandaIDs) - _logger.debug("getSlimmedFileInfoPandaIDs end") - except Exception: - ret = {} - # serialize - return WrappedPickle.dumps(ret) - - # get JobIDs in a time range - def getJobIDsInTimeRange(self, dn, timeRange): - # get IDs - ret = self.taskBuffer.getJobIDsInTimeRange(dn, timeRange) - # serialize - return WrappedPickle.dumps(ret) - # get active JediTasks in a time range def getJediTasksInTimeRange(self, dn, timeRange, fullFlag, minTaskID, task_type): - # get IDs ret = self.taskBuffer.getJediTasksInTimeRange(dn, timeRange, fullFlag, minTaskID, task_type) - # serialize return WrappedPickle.dumps(ret) # get details of JediTask def getJediTaskDetails(self, jediTaskID, fullFlag, withTaskInfo): - # get IDs ret = self.taskBuffer.getJediTaskDetails(jediTaskID, fullFlag, withTaskInfo) - # serialize - return WrappedPickle.dumps(ret) - - # get PandaIDs for a JobID - def getPandIDsWithJobID(self, dn, jobID, nJobs): - # get IDs - ret = self.taskBuffer.getPandIDsWithJobID(dn, jobID, nJobs) - # serialize - return WrappedPickle.dumps(ret) - - # check merge job generation status - def checkMergeGenerationStatus(self, dn, jobID): - # check - ret = self.taskBuffer.checkMergeGenerationStatus(dn, jobID) - # serialize return WrappedPickle.dumps(ret) # get full job status @@ -601,12 +352,10 @@ def getFullJobStatus(self, idsStr, dn): # peek jobs ret = self.taskBuffer.getFullJobStatus(ids) _logger.debug("getFullJobStatus end") - # serialize return WrappedPickle.dumps(ret) # insert task params def insertTaskParams(self, taskParams, user, prodRole, fqans, properErrorCode, parent_tid): - # register ret = self.taskBuffer.insertTaskParamsPanda( taskParams, user, @@ -615,12 +364,10 @@ def insertTaskParams(self, taskParams, user, prodRole, fqans, properErrorCode, p properErrorCode=properErrorCode, parent_tid=parent_tid, ) - # return return ret # kill task def killTask(self, jediTaskID, user, prodRole, properErrorCode, broadcast): - # kill ret = self.taskBuffer.sendCommandTaskPanda( jediTaskID, user, @@ -629,12 +376,10 @@ def killTask(self, jediTaskID, user, prodRole, properErrorCode, broadcast): properErrorCode=properErrorCode, broadcast=broadcast, ) - # return return ret # finish task def finishTask(self, jediTaskID, user, prodRole, properErrorCode, qualifier, broadcast): - # kill ret = self.taskBuffer.sendCommandTaskPanda( jediTaskID, user, @@ -644,14 +389,11 @@ def finishTask(self, jediTaskID, user, prodRole, properErrorCode, qualifier, bro comQualifier=qualifier, broadcast=broadcast, ) - # return return ret # reload input def reloadInput(self, jediTaskID, user, prodRole): - # kill ret = self.taskBuffer.sendCommandTaskPanda(jediTaskID, user, prodRole, "incexec", comComment="{}", properErrorCode=True) - # return return ret # retry task @@ -690,8 +432,8 @@ def retryTask( allowActiveTask=True, ) except Exception: - errType, errValue = sys.exc_info()[:2] - ret = 1, f"server error with {errType}:{errValue}" + err_type, err_value = sys.exc_info()[:2] + ret = 1, f"server error with {err_type}:{err_value}" else: com_qualifier = "" for com_key, com_param in [ @@ -719,18 +461,15 @@ def retryTask( for jobID in jobdefList: self.taskBuffer.finalizePendingJobs(cUID, jobID) self.taskBuffer.increaseAttemptNrPanda(jediTaskID, 5) - retStr = "retry has been triggered for failed jobs " - retStr += f"while the task is still {ret[1]}" + return_str = f"retry has been triggered for failed jobs while the task 
is still {ret[1]}" if newParams is None: - ret = 0, retStr + ret = 0, return_str else: - ret = 3, retStr - # return + ret = 3, return_str return ret # reassign task def reassignTask(self, jediTaskID, user, prodRole, comComment): - # reassign ret = self.taskBuffer.sendCommandTaskPanda( jediTaskID, user, @@ -739,83 +478,55 @@ def reassignTask(self, jediTaskID, user, prodRole, comComment): comComment=comComment, properErrorCode=True, ) - # return return ret # pause task def pauseTask(self, jediTaskID, user, prodRole): - # exec ret = self.taskBuffer.sendCommandTaskPanda(jediTaskID, user, prodRole, "pause", properErrorCode=True) - # return return ret # resume task def resumeTask(self, jediTaskID, user, prodRole): - # exec ret = self.taskBuffer.sendCommandTaskPanda(jediTaskID, user, prodRole, "resume", properErrorCode=True) - # return return ret # force avalanche for task def avalancheTask(self, jediTaskID, user, prodRole): - # exec ret = self.taskBuffer.sendCommandTaskPanda(jediTaskID, user, prodRole, "avalanche", properErrorCode=True) - # return return ret # send command to task def send_command_to_task(self, jedi_task_id, user, prod_role, command_string): - # exec ret = self.taskBuffer.sendCommandTaskPanda(jedi_task_id, user, prod_role, command_string, properErrorCode=True) - # return - return ret - - # get retry history - def getRetryHistory(self, jediTaskID, user): - # get - _logger.debug(f"getRetryHistory jediTaskID={jediTaskID} start {user}") - ret = self.taskBuffer.getRetryHistoryJEDI(jediTaskID) - _logger.debug(f"getRetryHistory jediTaskID={jediTaskID} done") - # return return ret # change task priority def changeTaskPriority(self, jediTaskID, newPriority): - # kill ret = self.taskBuffer.changeTaskPriorityPanda(jediTaskID, newPriority) - # return return ret # increase attempt number for unprocessed files def increaseAttemptNrPanda(self, jediTaskID, increasedNr): - # exec ret = self.taskBuffer.increaseAttemptNrPanda(jediTaskID, increasedNr) - # return return ret # change task attribute def changeTaskAttributePanda(self, jediTaskID, attrName, attrValue): - # kill ret = self.taskBuffer.changeTaskAttributePanda(jediTaskID, attrName, attrValue) - # return return ret # change split rule for task def changeTaskSplitRulePanda(self, jediTaskID, attrName, attrValue): - # exec ret = self.taskBuffer.changeTaskSplitRulePanda(jediTaskID, attrName, attrValue) - # return return ret # reactivate task def reactivateTask(self, jediTaskID, keep_attempt_nr, trigger_job_generation): - # update datasets and task status ret = self.taskBuffer.reactivateTask(jediTaskID, keep_attempt_nr, trigger_job_generation) return ret # get task status def getTaskStatus(self, jediTaskID): - # update task status ret = self.taskBuffer.getTaskStatus(jediTaskID) return ret[0] @@ -823,14 +534,6 @@ def getTaskStatus(self, jediTaskID): def reassignShare(self, jedi_task_ids, share_dest, reassign_running): return self.taskBuffer.reassignShare(jedi_task_ids, share_dest, reassign_running) - # get global share status overview of the grid - def getGShareStatus(self): - return self.taskBuffer.getGShareStatus() - - # list tasks in share - def listTasksInShare(self, gshare, status): - return self.taskBuffer.listTasksInShare(gshare, status) - # get taskParamsMap def getTaskParamsMap(self, jediTaskID): # get taskParamsMap @@ -841,41 +544,37 @@ def getTaskParamsMap(self, jediTaskID): def updateWorkers(self, user, host, harvesterID, data): ret = self.taskBuffer.updateWorkers(harvesterID, data) if ret is None: - retVal = (False, "database error 
in the panda server") + return_value = (False, MESSAGE_DATABASE) else: - retVal = (True, ret) - # serialize - return json.dumps(retVal) + return_value = (True, ret) + return json.dumps(return_value) # update workers def updateServiceMetrics(self, user, host, harvesterID, data): ret = self.taskBuffer.updateServiceMetrics(harvesterID, data) if ret is None: - retVal = (False, "database error in the panda server") + return_value = (False, MESSAGE_DATABASE) else: - retVal = (True, ret) - # serialize - return json.dumps(retVal) + return_value = (True, ret) + return json.dumps(return_value) # add harvester dialog messages def addHarvesterDialogs(self, user, harvesterID, dialogs): ret = self.taskBuffer.addHarvesterDialogs(harvesterID, dialogs) if not ret: - retVal = (False, "database error in the panda server") + return_value = (False, MESSAGE_DATABASE) else: - retVal = (True, "") - # serialize - return json.dumps(retVal) + return_value = (True, "") + return json.dumps(return_value) # heartbeat for harvester def harvesterIsAlive(self, user, host, harvesterID, data): ret = self.taskBuffer.harvesterIsAlive(user, host, harvesterID, data) if ret is None: - retVal = (False, "database error") + return_value = (False, MESSAGE_DATABASE) else: - retVal = (True, ret) - # serialize - return json.dumps(retVal) + return_value = (True, ret) + return json.dumps(return_value) # get stats of workers def getWorkerStats(self): @@ -891,33 +590,29 @@ def reportWorkerStats_jobtype(self, harvesterID, siteName, paramsList): # set num slots for workload provisioning def setNumSlotsForWP(self, pandaQueueName, numSlots, gshare, resourceType, validPeriod): - retVal = self.taskBuffer.setNumSlotsForWP(pandaQueueName, numSlots, gshare, resourceType, validPeriod) - # serialize - return json.dumps(retVal) + return_value = self.taskBuffer.setNumSlotsForWP(pandaQueueName, numSlots, gshare, resourceType, validPeriod) + return json.dumps(return_value) # enable jumbo jobs def enableJumboJobs(self, jediTaskID, totalJumboJobs, nJumboPerSite): - retVal = self.taskBuffer.enableJumboJobs(jediTaskID, totalJumboJobs, nJumboPerSite) - if totalJumboJobs > 0 and retVal[0] == 0: + return_value = self.taskBuffer.enableJumboJobs(jediTaskID, totalJumboJobs, nJumboPerSite) + if totalJumboJobs > 0 and return_value[0] == 0: self.avalancheTask(jediTaskID, "panda", True) - # serialize - return json.dumps(retVal) + return json.dumps(return_value) # get user job metadata def getUserJobMetadata(self, jediTaskID): - retVal = self.taskBuffer.getUserJobMetadata(jediTaskID) - # serialize - return json.dumps(retVal) + return_value = self.taskBuffer.getUserJobMetadata(jediTaskID) + return json.dumps(return_value) # get jumbo job datasets def getJumboJobDatasets(self, n_days, grace_period): - retVal = self.taskBuffer.getJumboJobDatasets(n_days, grace_period) + return_value = self.taskBuffer.getJumboJobDatasets(n_days, grace_period) # serialize - return json.dumps(retVal) + return json.dumps(return_value) # sweep panda queue def sweepPQ(self, panda_queue, status_list, ce_list, submission_host_list): - # deserialize variables try: panda_queue_des = json.loads(panda_queue) status_list_des = json.loads(status_list) @@ -926,33 +621,27 @@ def sweepPQ(self, panda_queue, status_list, ce_list, submission_host_list): except Exception: _logger.error("Problem deserializing variables") - # reassign jobs ret = self.taskBuffer.sweepPQ(panda_queue_des, status_list_des, ce_list_des, submission_host_list_des) - # serialize return WrappedPickle.dumps(ret) # send command to a job 
def send_command_to_job(self, panda_id, com): ret = self.taskBuffer.send_command_to_job(panda_id, com) - # return return ret # set user secret def set_user_secret(self, owner, key, value): ret = self.taskBuffer.set_user_secret(owner, key, value) - # return return ret # get user secrets def get_user_secrets(self, owner, keys, get_json): ret = self.taskBuffer.get_user_secrets(owner, keys, get_json) - # return return ret # get files in datasets def get_files_in_datasets(self, task_id, dataset_types): ret = self.taskBuffer.get_files_in_datasets(task_id, dataset_types) - # return return ret @@ -964,50 +653,34 @@ def get_files_in_datasets(self, task_id, dataset_types): # get FQANs def _getFQAN(req): fqans = [] - for tmpKey in req.subprocess_env: - tmpVal = req.subprocess_env[tmpKey] - # compact credentials - if tmpKey.startswith("GRST_CRED_"): - # VOMS attribute - if tmpVal.startswith("VOMS"): - # FQAN - fqan = tmpVal.split()[-1] - # append - fqans.append(fqan) + for tmp_key in req.subprocess_env: + tmp_value = req.subprocess_env[tmp_key] + # Scan VOMS attributes + # compact style + if tmp_key.startswith("GRST_CRED_") and tmp_value.startswith("VOMS"): + fqan = tmp_value.split()[-1] + fqans.append(fqan) + # old style - elif tmpKey.startswith("GRST_CONN_"): - tmpItems = tmpVal.split(":") - # FQAN - if len(tmpItems) == 2 and tmpItems[0] == "fqan": - fqans.append(tmpItems[-1]) - # return + elif tmp_key.startswith("GRST_CONN_"): + tmp_items = tmp_value.split(":") + if len(tmp_items) == 2 and tmp_items[0] == "fqan": + fqans.append(tmp_items[-1]) + return fqans # get DN def _getDN(req): - realDN = "" + real_dn = "" if "SSL_CLIENT_S_DN" in req.subprocess_env: # remove redundant CN - realDN = CoreUtils.get_bare_dn(req.subprocess_env["SSL_CLIENT_S_DN"], keep_proxy=True) - return realDN - - -# get VOMS attributes -def _get_grst_attr(req): - vomsAttrs = [] - for tmpKey in req.subprocess_env: - tmpVal = req.subprocess_env[tmpKey] - vomsAttrs.append(f"{tmpKey} : {tmpVal}\n") - vomsAttrs.sort() - retStr = "" - for tmpStr in vomsAttrs: - retStr += tmpStr - return retStr + real_dn = CoreUtils.get_bare_dn(req.subprocess_env["SSL_CLIENT_S_DN"], keep_proxy=True) + return real_dn # check role -def _hasProdRole(req): +def _has_production_role(req): # check DN user = _getDN(req) for sdn in panda_config.production_dns: @@ -1078,13 +751,11 @@ def submitJobs(req, jobs, toPending=None): # hostname host = req.get_remote_host() # production Role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # to pending - if toPending == "True": - toPending = True - else: - toPending = False - return userIF.submitJobs(jobs, user, host, fqans, prodRole, toPending) + toPending = resolve_true(toPending) + + return userIF.submitJobs(jobs, user, host, fqans, is_production_role, toPending) # get job status @@ -1092,68 +763,43 @@ def getJobStatus(req, ids, no_pickle=None): return userIF.getJobStatus(ids, req.acceptJson(), no_pickle) -# get PandaID with jobexeID -def getPandaIDwithJobExeID(req, ids): - return userIF.getPandaIDwithJobExeID(ids) - - -# get queued analysis jobs at a site -def getQueuedAnalJobs(req, site): - # check security - if not isSecure(req): - return "ERROR: SSL is required" - # get DN - user = None - if "SSL_CLIENT_S_DN" in req.subprocess_env: - user = _getDN(req) - return userIF.getQueuedAnalJobs(site, user) - - -# get active datasets -def getActiveDatasets(req, computingSite, prodSourceLabel="managed"): - return userIF.getActiveDatasets(computingSite, prodSourceLabel) - - # set debug mode def 
setDebugMode(req, pandaID, modeOn): - tmpLog = LogWrapper(_logger, f"setDebugMode {pandaID} {modeOn}") + tmp_log = LogWrapper(_logger, f"setDebugMode {pandaID} {modeOn}") # get DN if "SSL_CLIENT_S_DN" not in req.subprocess_env: - errStr = "SSL connection is required" - tmpLog.error(errStr) - return "ERROR: " + errStr + error_str = MESSAGE_SSL + tmp_log.error(error_str) + return f"ERROR: {error_str}" user = _getDN(req) # check role - prodManager = _hasProdRole(req) + is_production_manager = _has_production_role(req) fqans = _getFQAN(req) - grst = _get_grst_attr(req) # mode - if modeOn == "True": - modeOn = True - else: - modeOn = False + modeOn = resolve_true(modeOn) + # get the primary working group with prod role - workingGroup = _getWGwithPR(req) - tmpLog.error(f"user={user} mgr={prodManager} wg={workingGroup} fqans={str(fqans)} grst={grst}") + working_group = _getWGwithPR(req) + tmp_log.debug(f"user={user} mgr={is_production_manager} wg={working_group} fqans={str(fqans)}") # exec - return userIF.setDebugMode(user, pandaID, prodManager, modeOn, workingGroup) + return userIF.setDebugMode(user, pandaID, is_production_manager, modeOn, working_group) # insert sandbox file info def insertSandboxFileInfo(req, userName, fileName, fileSize, checkSum): - tmpLog = LogWrapper(_logger, f"insertSandboxFileInfo {userName} {fileName}") + tmp_log = LogWrapper(_logger, f"insertSandboxFileInfo {userName} {fileName}") # get DN if "SSL_CLIENT_S_DN" not in req.subprocess_env: - errStr = "SSL connection is required" - tmpLog.error(errStr) - return "ERROR: " + errStr - user = _getDN(req) + error_str = MESSAGE_SSL + tmp_log.error(error_str) + return f"ERROR: {error_str}" + # check role - prodManager = _hasProdRole(req) - if not prodManager: - errStr = "missing role" - tmpLog.error(errStr) - return "ERROR: " + errStr + is_production_manager = _has_production_role(req) + if not is_production_manager: + tmp_log.error(MESSAGE_PROD_ROLE) + return f"ERROR: {MESSAGE_PROD_ROLE}" + # hostname if hasattr(panda_config, "sandboxHostname") and panda_config.sandboxHostname: hostName = panda_config.sandboxHostname @@ -1167,64 +813,17 @@ def insertSandboxFileInfo(req, userName, fileName, fileSize, checkSum): def checkSandboxFile(req, fileSize, checkSum): # get DN if "SSL_CLIENT_S_DN" not in req.subprocess_env: - return "ERROR: SSL connection is required" + return f"ERROR: {MESSAGE_SSL}" user = _getDN(req) # exec return userIF.checkSandboxFile(user, fileSize, checkSum) -# query PandaIDs -def queryPandaIDs(req, ids): - return userIF.queryPandaIDs(ids) - - -# query job info per cloud -def queryJobInfoPerCloud(req, cloud, schedulerID=None): - return userIF.queryJobInfoPerCloud(cloud, schedulerID) - - -# get PandaIDs at site -def getPandaIDsSite(req, site, status, limit=500): - return userIF.getPandaIDsSite(site, status, limit) - - -# get PandaIDs to be updated in prodDB -def getJobsToBeUpdated(req, limit=5000, lockedby=""): - limit = int(limit) - return userIF.getJobsToBeUpdated(limit, lockedby) - - -# update prodDBUpdateTimes -def updateProdDBUpdateTimes(req, params): - # check security - if not isSecure(req): - return False - return userIF.updateProdDBUpdateTimes(params) - - # get job statistics def getJobStatistics(req, sourcetype=None): return userIF.getJobStatistics(sourcetype) -# get highest prio jobs -def getHighestPrioJobStat(req, perPG=None, useMorePG=None): - if perPG == "True": - perPG = True - else: - perPG = False - if useMorePG == "True": - useMorePG = pandaserver.taskbuffer.ProcessGroups.extensionLevel_1 - elif 
useMorePG in ["False", None]: - useMorePG = False - else: - try: - useMorePG = int(useMorePG) - except Exception: - useMorePG = False - return userIF.getHighestPrioJobStat(perPG, useMorePG) - - # get job statistics for Babmoo def getJobStatisticsForBamboo(req, useMorePG=None): if useMorePG == "True": @@ -1239,11 +838,6 @@ def getJobStatisticsForBamboo(req, useMorePG=None): return userIF.getJobStatisticsForBamboo(useMorePG) -# get the number of waiting jobs per site and user -def getJobStatisticsPerUserSite(req): - return userIF.getJobStatisticsPerUserSite() - - # get job statistics per site and resource def getJobStatisticsPerSiteResource(req, timeWindow=None): return userIF.getJobStatisticsPerSiteResource(timeWindow) @@ -1264,15 +858,14 @@ def getJobStatisticsPerSite( minPriority=None, readArchived=None, ): - if predefined == "True": - predefined = True - else: - predefined = False + predefined = resolve_true(predefined) + if minPriority is not None: try: minPriority = int(minPriority) except Exception: minPriority = None + if readArchived == "True": readArchived = True elif readArchived == "False": @@ -1287,31 +880,6 @@ def getJobStatisticsPerSite( return userIF.getJobStatisticsPerSite(predefined, workingGroup, countryGroup, jobType, minPriority, readArchived) -# get job statistics per site with label -def getJobStatisticsWithLabel(req, site=""): - return userIF.getJobStatisticsWithLabel(site) - - -# query last files in datasets -def queryLastFilesInDataset(req, datasets): - return userIF.queryLastFilesInDataset(datasets) - - -# get input files currently in used for analysis -def getFilesInUseForAnal(req, outDataset): - return userIF.getFilesInUseForAnal(outDataset) - - -# get list of dis dataset to get input files in shadow -def getDisInUseForAnal(req, outDataset): - return userIF.getDisInUseForAnal(outDataset) - - -# get input LFNs currently in use for analysis with shadow dis -def getLFNsInUseForAnal(req, inputDisList): - return userIF.getLFNsInUseForAnal(inputDisList) - - # kill jobs def killJobs(req, ids, code=None, useMailAsID=None, killOpts=None): # check security @@ -1322,14 +890,12 @@ def killJobs(req, ids, code=None, useMailAsID=None, killOpts=None): if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prodManager = _hasProdRole(req) + is_production_manager = _has_production_role(req) # get FQANs fqans = _getFQAN(req) # use email address as ID - if useMailAsID == "True": - useMailAsID = True - else: - useMailAsID = False + useMailAsID = resolve_true(useMailAsID) + # hostname host = req.get_remote_host() # options @@ -1337,7 +903,7 @@ def killJobs(req, ids, code=None, useMailAsID=None, killOpts=None): killOpts = [] else: killOpts = killOpts.split(",") - return userIF.killJobs(ids, user, host, code, prodManager, useMailAsID, fqans, killOpts) + return userIF.killJobs(ids, user, host, code, is_production_manager, useMailAsID, fqans, killOpts) # reassign jobs @@ -1352,39 +918,12 @@ def reassignJobs(req, ids, forPending=None, firstSubmission=None): # hostname host = req.get_remote_host() # for pending - if forPending == "True": - forPending = True - else: - forPending = False - # first submission - if firstSubmission == "False": - firstSubmission = False - else: - firstSubmission = True - return userIF.reassignJobs(ids, user, host, forPending, firstSubmission) - - -# resubmit jobs -def resubmitJobs(req, ids): - # check security - if not isSecure(req): - return False - return userIF.resubmitJobs(ids) + forPending = resolve_true(forPending) + # first submission 
+ firstSubmission = resolve_false(firstSubmission) -# change job priorities -def changeJobPriorities(req, newPrioMap=None): - # check security - if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) - # get DN - user = None - if "SSL_CLIENT_S_DN" in req.subprocess_env: - user = _getDN(req) - # check role - prodRole = _hasProdRole(req) - ret = userIF.changeJobPriorities(user, prodRole, newPrioMap) - return WrappedPickle.dumps(ret) + return userIF.reassignJobs(ids, user, host, forPending, firstSubmission) # get list of site spec @@ -1395,11 +934,6 @@ def getSiteSpecs(req, siteType=None): return userIF.getSiteSpecs() -# get list of cloud spec -def getCloudSpecs(req): - return userIF.getCloudSpecs() - - # get ban users def get_ban_users(req): return userIF.get_ban_users() @@ -1410,40 +944,6 @@ def getPandaClientVer(req): return userIF.getPandaClientVer() -# get nPilots -def getNumPilots(req): - return userIF.getNumPilots() - - -# retry failed subjobs in running job -def retryFailedJobsInActive(req, jobID): - # check SSL - if "SSL_CLIENT_S_DN" not in req.subprocess_env: - return "ERROR: SSL connection is required" - # get DN - dn = _getDN(req) - if dn == "": - return "ERROR: could not get DN" - # convert jobID to long - try: - jobID = int(jobID) - except Exception: - return "ERROR: jobID is not an integer" - return userIF.retryFailedJobsInActive(dn, jobID) - - -# get serial number for group job -def getSerialNumberForGroupJob(req): - # check SSL - if "SSL_CLIENT_S_DN" not in req.subprocess_env: - return "ERROR: SSL connection is required" - # get DN - dn = _getDN(req) - if dn == "": - return "ERROR: could not get DN" - return userIF.getSerialNumberForGroupJob(dn) - - # get script for offline running def getScriptOfflineRunning(req, pandaID, days=None): try: @@ -1454,21 +954,6 @@ def getScriptOfflineRunning(req, pandaID, days=None): return userIF.getScriptOfflineRunning(pandaID, days) -# get JobIDs in a time range -def getJobIDsInTimeRange(req, timeRange, dn=None): - # check security - if not isSecure(req): - return False - # get DN - if "SSL_CLIENT_S_DN" not in req.subprocess_env: - return False - if dn is None: - dn = _getDN(req) - _logger.debug(f"getJobIDsInTimeRange {dn} {timeRange}") - # execute - return userIF.getJobIDsInTimeRange(dn, timeRange) - - # get active JediTasks in a time range def getJediTasksInTimeRange(req, timeRange, dn=None, fullFlag=None, minTaskID=None, task_type="user"): # check security @@ -1479,10 +964,8 @@ def getJediTasksInTimeRange(req, timeRange, dn=None, fullFlag=None, minTaskID=No return False if dn is None: dn = _getDN(req) - if fullFlag == "True": - fullFlag = True - else: - fullFlag = False + fullFlag = resolve_true(fullFlag) + try: minTaskID = int(minTaskID) except Exception: @@ -1501,61 +984,14 @@ def getJediTaskDetails(req, jediTaskID, fullFlag, withTaskInfo): if "SSL_CLIENT_S_DN" not in req.subprocess_env: return False # option - if fullFlag == "True": - fullFlag = True - else: - fullFlag = False - if withTaskInfo == "True": - withTaskInfo = True - else: - withTaskInfo = False + fullFlag = resolve_true(fullFlag) + withTaskInfo = resolve_true(withTaskInfo) + _logger.debug(f"getJediTaskDetails {jediTaskID} {fullFlag} {withTaskInfo}") # execute return userIF.getJediTaskDetails(jediTaskID, fullFlag, withTaskInfo) -# get PandaIDs for a JobID -def getPandIDsWithJobID(req, jobID, nJobs, dn=None): - # check security - if not isSecure(req): - return False - # get DN - if "SSL_CLIENT_S_DN" not in req.subprocess_env: - return 
False - if dn is None: - dn = _getDN(req) - _logger.debug(f"getPandIDsWithJobID {dn} JobID={jobID} nJobs={nJobs}") - # execute - return userIF.getPandIDsWithJobID(dn, jobID, nJobs) - - -# check merge job generation status -def checkMergeGenerationStatus(req, jobID, dn=None): - # check security - if not isSecure(req): - return False - # get DN - if "SSL_CLIENT_S_DN" not in req.subprocess_env: - return False - if dn is None: - dn = _getDN(req) - _logger.debug(f"checkMergeGenerationStatus {dn} JobID={jobID}") - # execute - return userIF.checkMergeGenerationStatus(dn, jobID) - - -# get slimmed file info with PandaIDs -def getSlimmedFileInfoPandaIDs(req, ids): - # check security - if not isSecure(req): - return False - # get DN - if "SSL_CLIENT_S_DN" not in req.subprocess_env: - return False - dn = _getDN(req) - return userIF.getSlimmedFileInfoPandaIDs(ids, dn) - - # get full job status def getFullJobStatus(req, ids): # check security @@ -1568,49 +1004,16 @@ def getFullJobStatus(req, ids): return userIF.getFullJobStatus(ids, dn) -# get a list of DN/myproxy pass phrase/queued job count at a site -def getNUserJobs(req, siteName): - # check security - prodManager = False - if not isSecure(req): - return "Failed : HTTPS connection is required" - # get FQANs - fqans = _getFQAN(req) - # loop over all FQANs - for fqan in fqans: - # check production role - for rolePat in [ - "/atlas/usatlas/Role=production", - "/atlas/Role=production", - "/atlas/usatlas/Role=pilot", - "/atlas/Role=pilot", - ]: - if fqan.startswith(rolePat): - prodManager = True - break - # escape - if prodManager: - break - # only prod managers can use this method - if not prodManager: - return "Failed : VOMS authorization failure. production or pilot role required" - # execute - return userIF.getNUserJobs(siteName) - - # insert task params def insertTaskParams(req, taskParams=None, properErrorCode=None, parent_tid=None): - tmpLog = LogWrapper(_logger, f"insertTaskParams-{datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat('/')}") - tmpLog.debug("start") - if properErrorCode == "True": - properErrorCode = True - else: - properErrorCode = False + tmp_log = LogWrapper(_logger, f"insertTaskParams-{datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat('/')}") + tmp_log.debug("start") + properErrorCode = resolve_true(properErrorCode) + # check security if not isSecure(req): - tmpMsg = "secure connection is required" - tmpLog.debug(tmpMsg) - return WrappedPickle.dumps((False, tmpMsg)) + tmp_log.debug(MESSAGE_SSL) + return WrappedPickle.dumps((False, MESSAGE_SSL)) # get DN user = None if "SSL_CLIENT_S_DN" in req.subprocess_env: @@ -1620,18 +1023,17 @@ def insertTaskParams(req, taskParams=None, properErrorCode=None, parent_tid=None try: json.loads(taskParams) except Exception: - tmpMsg = "failed to decode json" - tmpLog.debug(tmpMsg) - return WrappedPickle.dumps((False, tmpMsg)) + tmp_log.debug(MESSAGE_JSON) + return WrappedPickle.dumps((False, MESSAGE_JSON)) # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # get FQANs fqans = _getFQAN(req) - tmpLog.debug(f"user={user} prodRole={prodRole} FQAN:{str(fqans)} parent_tid={parent_tid}") - ret = userIF.insertTaskParams(taskParams, user, prodRole, fqans, properErrorCode, parent_tid) + tmp_log.debug(f"user={user} prodRole={is_production_role} FQAN:{str(fqans)} parent_tid={parent_tid}") + ret = userIF.insertTaskParams(taskParams, user, is_production_role, fqans, properErrorCode, parent_tid) try: - 
tmpLog.debug(ret[1]) + tmp_log.debug(ret[1]) except Exception: pass return WrappedPickle.dumps(ret) @@ -1639,35 +1041,28 @@ def insertTaskParams(req, taskParams=None, properErrorCode=None, parent_tid=None # kill task def killTask(req, jediTaskID=None, properErrorCode=None, broadcast=None): - if properErrorCode == "True": - properErrorCode = True - else: - properErrorCode = False - if broadcast == "True": - broadcast = True - else: - broadcast = False + properErrorCode = resolve_true(properErrorCode) + broadcast = resolve_true(broadcast) + # check security if not isSecure(req): - if properErrorCode: - return WrappedPickle.dumps((100, "secure connection is required")) - else: - return WrappedPickle.dumps((False, "secure connection is required")) + error_code = CODE_SSL if properErrorCode else False + return WrappedPickle.dumps((error_code, MESSAGE_SSL)) + # get DN user = None if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - if properErrorCode: - return WrappedPickle.dumps((101, "jediTaskID must be an integer")) - else: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) - ret = userIF.killTask(jediTaskID, user, prodRole, properErrorCode, broadcast) + error_code = CODE_LOGIC if properErrorCode else False + return WrappedPickle.dumps((error_code, MESSAGE_TASK_ID)) + + ret = userIF.killTask(jediTaskID, user, is_production_role, properErrorCode, broadcast) return WrappedPickle.dumps(ret) @@ -1682,50 +1077,34 @@ def retryTask( disable_staging_mode=None, keep_gshare_priority=None, ): - if properErrorCode == "True": - properErrorCode = True - else: - properErrorCode = False - if noChildRetry == "True": - noChildRetry = True - else: - noChildRetry = False - if discardEvents == "True": - discardEvents = True - else: - discardEvents = False - if disable_staging_mode == "True": - disable_staging_mode = True - else: - disable_staging_mode = False - if keep_gshare_priority == "True": - keep_gshare_priority = True - else: - keep_gshare_priority = False + properErrorCode = resolve_true(properErrorCode) + noChildRetry = resolve_true(noChildRetry) + discardEvents = resolve_true(discardEvents) + disable_staging_mode = resolve_true(disable_staging_mode) + keep_gshare_priority = resolve_true(keep_gshare_priority) + # check security if not isSecure(req): - if properErrorCode: - return WrappedPickle.dumps((100, "secure connection is required")) - else: - return WrappedPickle.dumps((False, "secure connection is required")) + error_code = CODE_SSL if properErrorCode else False + return WrappedPickle.dumps((error_code, MESSAGE_SSL)) + # get DN user = None if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - if properErrorCode: - return WrappedPickle.dumps((101, "jediTaskID must be an integer")) - else: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) + error_code = CODE_LOGIC if properErrorCode else False + return WrappedPickle.dumps((error_code, MESSAGE_TASK_ID)) + ret = userIF.retryTask( jediTaskID, user, - prodRole, + is_production_role, properErrorCode, newParams, noChildRetry, @@ -1740,18 +1119,18 @@ def retryTask( def reassignTask(req, jediTaskID, site=None, cloud=None, nucleus=None, soft=None, mode=None): # check 
security if not isSecure(req): - return WrappedPickle.dumps((100, "secure connection is required")) + return WrappedPickle.dumps((CODE_SSL, MESSAGE_SSL)) # get DN user = None if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((101, "jediTaskID must be an integer")) + return WrappedPickle.dumps((CODE_LOGIC, MESSAGE_TASK_ID)) # site or cloud if site is not None: # set 'y' to go back to oldStatus immediately @@ -1764,91 +1143,62 @@ def reassignTask(req, jediTaskID, site=None, cloud=None, nucleus=None, soft=None comComment += ":nokill reassign" elif mode == "soft" or soft == "True": comComment += ":soft reassign" - ret = userIF.reassignTask(jediTaskID, user, prodRole, comComment) + ret = userIF.reassignTask(jediTaskID, user, is_production_role, comComment) return WrappedPickle.dumps(ret) # finish task def finishTask(req, jediTaskID=None, properErrorCode=None, soft=None, broadcast=None): - if properErrorCode == "True": - properErrorCode = True - else: - properErrorCode = False + properErrorCode = resolve_true(properErrorCode) + broadcast = resolve_true(broadcast) qualifier = None if soft == "True": qualifier = "soft" - if broadcast == "True": - broadcast = True - else: - broadcast = False + # check security if not isSecure(req): - if properErrorCode: - return WrappedPickle.dumps((100, "secure connection is required")) - else: - return WrappedPickle.dumps((False, "secure connection is required")) + error_code = CODE_SSL if properErrorCode else False + return WrappedPickle.dumps((error_code, MESSAGE_SSL)) + # get DN user = None if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - if properErrorCode: - return WrappedPickle.dumps((101, "jediTaskID must be an integer")) - else: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) - ret = userIF.finishTask(jediTaskID, user, prodRole, properErrorCode, qualifier, broadcast) + error_code = CODE_LOGIC if properErrorCode else False + return WrappedPickle.dumps((error_code, MESSAGE_TASK_ID)) + + ret = userIF.finishTask(jediTaskID, user, is_production_role, properErrorCode, qualifier, broadcast) return WrappedPickle.dumps(ret) # reload input def reloadInput(req, jediTaskID, properErrorCode=None): - if properErrorCode == "True": - properErrorCode = True - else: - properErrorCode = False + properErrorCode = resolve_true(properErrorCode) # check security if not isSecure(req): - if properErrorCode: - return WrappedPickle.dumps((100, "secure connection is required")) - else: - return WrappedPickle.dumps((False, "secure connection is required")) + error_code = CODE_SSL if properErrorCode else False + return WrappedPickle.dumps((error_code, MESSAGE_SSL)) + # get DN user = None if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - if properErrorCode: - return WrappedPickle.dumps((101, "jediTaskID must be an integer")) - else: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) - ret = userIF.reloadInput(jediTaskID, user, prodRole) - return WrappedPickle.dumps(ret) - + error_code = 
CODE_LOGIC if properErrorCode else False + return WrappedPickle.dumps((error_code, MESSAGE_TASK_ID)) -# get retry history -def getRetryHistory(req, jediTaskID=None): - # check security - if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) - # get DN - user = None - if "SSL_CLIENT_S_DN" in req.subprocess_env: - user = _getDN(req) - try: - jediTaskID = int(jediTaskID) - except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) - ret = userIF.getRetryHistory(jediTaskID, user) + ret = userIF.reloadInput(jediTaskID, user, is_production_role) return WrappedPickle.dumps(ret) @@ -1856,21 +1206,17 @@ def getRetryHistory(req, jediTaskID=None): def changeTaskPriority(req, jediTaskID=None, newPriority=None): # check security if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) - # get DN - user = None - if "SSL_CLIENT_S_DN" in req.subprocess_env: - user = _getDN(req) + return WrappedPickle.dumps((False, MESSAGE_SSL)) # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # only prod managers can use this method - if not prodRole: + if not is_production_role: return "Failed : production or pilot role required" # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) # check priority try: newPriority = int(newPriority) @@ -1884,57 +1230,47 @@ def changeTaskPriority(req, jediTaskID=None, newPriority=None): def increaseAttemptNrPanda(req, jediTaskID, increasedNr): # check security if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) - # get DN - user = None - if "SSL_CLIENT_S_DN" in req.subprocess_env: - user = _getDN(req) + return WrappedPickle.dumps((False, MESSAGE_SSL)) + # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # only prod managers can use this method - ret = None - if not prodRole: - ret = 3, "production or pilot role required" + if not is_production_role: + return WrappedPickle.dumps((3, MESSAGE_PROD_ROLE)) + # check jediTaskID - if ret is None: - try: - jediTaskID = int(jediTaskID) - except Exception: - ret = 4, "jediTaskID must be an integer" - # check increase - if ret is None: - wrongNr = False - try: - increasedNr = int(increasedNr) - except Exception: - wrongNr = True - if wrongNr or increasedNr < 0: - ret = 4, "increase must be a positive integer" - # exec - if ret is None: - ret = userIF.increaseAttemptNrPanda(jediTaskID, increasedNr) - return WrappedPickle.dumps(ret) + try: + jediTaskID = int(jediTaskID) + except Exception: + return WrappedPickle.dumps((4, MESSAGE_TASK_ID)) + + # check value for increase + try: + increasedNr = int(increasedNr) + except Exception: + increasedNr = -1 + if increasedNr < 0: + return WrappedPickle.dumps((4, "increase must be a positive integer")) + + return userIF.increaseAttemptNrPanda(jediTaskID, increasedNr) # change task attribute def changeTaskAttributePanda(req, jediTaskID, attrName, attrValue): # check security if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) - # get DN - user = None - if "SSL_CLIENT_S_DN" in req.subprocess_env: - user = _getDN(req) + return WrappedPickle.dumps((False, MESSAGE_SSL)) + # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # only prod managers can use this method - if 
not prodRole: - return WrappedPickle.dumps((False, "production or pilot role required")) + if not is_production_role: + return WrappedPickle.dumps((False, MESSAGE_PROD_ROLE)) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) # check attribute if attrName not in ["ramCount", "wallTime", "cpuTime", "coreCount"]: return WrappedPickle.dumps((2, f"disallowed to update {attrName}")) @@ -1946,21 +1282,18 @@ def changeTaskAttributePanda(req, jediTaskID, attrName, attrValue): def changeTaskSplitRulePanda(req, jediTaskID, attrName, attrValue): # check security if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) - # get DN - user = None - if "SSL_CLIENT_S_DN" in req.subprocess_env: - user = _getDN(req) + return WrappedPickle.dumps((False, MESSAGE_SSL)) + # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # only prod managers can use this method - if not prodRole: - return WrappedPickle.dumps((False, "production or pilot role required")) + if not is_production_role: + return WrappedPickle.dumps((False, MESSAGE_PROD_ROLE)) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) # check attribute if attrName not in [ "AI", @@ -1989,22 +1322,22 @@ def changeTaskSplitRulePanda(req, jediTaskID, attrName, attrValue): def pauseTask(req, jediTaskID): # check security if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) + return WrappedPickle.dumps((False, MESSAGE_SSL)) # get DN user = None if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # only prod managers can use this method - if not prodRole: - return WrappedPickle.dumps((False, "production role required")) + if not is_production_role: + return WrappedPickle.dumps((False, MESSAGE_PROD_ROLE)) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) - ret = userIF.pauseTask(jediTaskID, user, prodRole) + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) + ret = userIF.pauseTask(jediTaskID, user, is_production_role) return WrappedPickle.dumps(ret) @@ -2012,22 +1345,22 @@ def pauseTask(req, jediTaskID): def resumeTask(req, jediTaskID): # check security if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) + return WrappedPickle.dumps((False, MESSAGE_SSL)) # get DN user = None if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # only prod managers can use this method - if not prodRole: + if not is_production_role: return WrappedPickle.dumps((False, "production role required")) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) - ret = userIF.resumeTask(jediTaskID, user, prodRole) + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) + ret = userIF.resumeTask(jediTaskID, user, is_production_role) return WrappedPickle.dumps(ret) @@ -2035,22 +1368,22 @@ def resumeTask(req, jediTaskID): def avalancheTask(req, jediTaskID): # check security 
if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) + return WrappedPickle.dumps((False, MESSAGE_SSL)) # get DN user = None if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # only prod managers can use this method - if not prodRole: + if not is_production_role: return WrappedPickle.dumps((False, "production role required")) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) - ret = userIF.avalancheTask(jediTaskID, user, prodRole) + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) + ret = userIF.avalancheTask(jediTaskID, user, is_production_role) return WrappedPickle.dumps(ret) @@ -2058,13 +1391,13 @@ def avalancheTask(req, jediTaskID): def release_task(req, jedi_task_id): # check security if not isSecure(req): - return json.dumps((False, "secure connection is required")) + return json.dumps((False, MESSAGE_SSL)) # get DN user = None if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prod_role = _hasProdRole(req) + prod_role = _has_production_role(req) # only prod managers can use this method if not prod_role: return json.dumps((False, "production role required")) @@ -2072,7 +1405,7 @@ def release_task(req, jedi_task_id): try: jedi_task_id = int(jedi_task_id) except Exception: - return json.dumps((False, "jediTaskID must be an integer")) + return json.dumps((False, MESSAGE_TASK_ID)) ret = userIF.send_command_to_task(jedi_task_id, user, prod_role, "release") return json.dumps(ret) @@ -2087,7 +1420,7 @@ def killUnfinishedJobs(req, jediTaskID, code=None, useMailAsID=None): if "SSL_CLIENT_S_DN" in req.subprocess_env: user = _getDN(req) # check role - prodManager = False + is_production_manager = False # get FQANs fqans = _getFQAN(req) # loop over all FQANs @@ -2095,43 +1428,37 @@ def killUnfinishedJobs(req, jediTaskID, code=None, useMailAsID=None): # check production role for rolePat in ["/atlas/usatlas/Role=production", "/atlas/Role=production"]: if fqan.startswith(rolePat): - prodManager = True + is_production_manager = True break # escape - if prodManager: + if is_production_manager: break # use email address as ID - if useMailAsID == "True": - useMailAsID = True - else: - useMailAsID = False + useMailAsID = resolve_true(useMailAsID) # hostname host = req.get_remote_host() # get PandaIDs ids = userIF.getPandaIDsWithTaskID(jediTaskID) # kill - return userIF.killJobs(ids, user, host, code, prodManager, useMailAsID, fqans) + return userIF.killJobs(ids, user, host, code, is_production_manager, useMailAsID, fqans) # change modificationTime for task def changeTaskModTimePanda(req, jediTaskID, diffValue): # check security if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) - # get DN - user = None - if "SSL_CLIENT_S_DN" in req.subprocess_env: - user = _getDN(req) + return WrappedPickle.dumps((False, MESSAGE_SSL)) + # check role - prodRole = _hasProdRole(req) + is_production_role = _has_production_role(req) # only prod managers can use this method - if not prodRole: - return WrappedPickle.dumps((False, "production or pilot role required")) + if not is_production_role: + return WrappedPickle.dumps((False, MESSAGE_PROD_ROLE)) # check jediTaskID try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) + return 
WrappedPickle.dumps((False, MESSAGE_TASK_ID)) try: diffValue = int(diffValue) attrValue = datetime.datetime.now() + datetime.timedelta(hours=diffValue) @@ -2146,7 +1473,7 @@ def getPandaIDsWithTaskID(req, jediTaskID): try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) idsStr = userIF.getPandaIDsWithTaskID(jediTaskID) # deserialize ids = WrappedPickle.loads(idsStr) @@ -2158,25 +1485,20 @@ def getPandaIDsWithTaskID(req, jediTaskID): def reactivateTask(req, jediTaskID, keep_attempt_nr=None, trigger_job_generation=None): # check security if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) + return WrappedPickle.dumps((False, MESSAGE_SSL)) # check role - prodManager = _hasProdRole(req) - if not prodManager: + is_production_manager = _has_production_role(req) + if not is_production_manager: msg = "production role is required" - _logger.error("reactivateTask: " + msg) + _logger.error(f"reactivateTask: {msg}") return WrappedPickle.dumps((False, msg)) try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) - if keep_attempt_nr == "True": - keep_attempt_nr = True - else: - keep_attempt_nr = False - if trigger_job_generation == "True": - trigger_job_generation = True - else: - trigger_job_generation = False + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) + keep_attempt_nr = resolve_true(keep_attempt_nr) + trigger_job_generation = resolve_true(trigger_job_generation) + ret = userIF.reactivateTask(jediTaskID, keep_attempt_nr, trigger_job_generation) return WrappedPickle.dumps(ret) @@ -2187,7 +1509,7 @@ def getTaskStatus(req, jediTaskID): try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) ret = userIF.getTaskStatus(jediTaskID) return WrappedPickle.dumps(ret) @@ -2196,15 +1518,12 @@ def getTaskStatus(req, jediTaskID): def reassignShare(req, jedi_task_ids_pickle, share, reassign_running): # check security if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) - # get DN - user = None - if "SSL_CLIENT_S_DN" in req.subprocess_env: - user = _getDN(req) + return WrappedPickle.dumps((False, MESSAGE_SSL)) + # check role - prod_role = _hasProdRole(req) + prod_role = _has_production_role(req) if not prod_role: - return WrappedPickle.dumps((False, "production or pilot role required")) + return WrappedPickle.dumps((False, MESSAGE_PROD_ROLE)) jedi_task_ids = WrappedPickle.loads(jedi_task_ids_pickle) _logger.debug(f"reassignShare: jedi_task_ids: {jedi_task_ids}, share: {share}, reassign_running: {reassign_running}") @@ -2216,35 +1535,12 @@ def reassignShare(req, jedi_task_ids_pickle, share, reassign_running): return WrappedPickle.dumps(ret) -# list tasks in share -def listTasksInShare(req, gshare, status): - # check security - if not isSecure(req): - return WrappedPickle.dumps((False, "secure connection is required")) - # get DN - user = None - if "SSL_CLIENT_S_DN" in req.subprocess_env: - user = _getDN(req) - # check role - prod_role = _hasProdRole(req) - if not prod_role: - return WrappedPickle.dumps((False, "production or pilot role required")) - - _logger.debug(f"listTasksInShare: gshare: {gshare}, status: {status}") - - if not ((isinstance(gshare, str) and isinstance(status, str))): - return 
WrappedPickle.dumps((False, "gshare and status must be of type string")) - - ret = userIF.listTasksInShare(gshare, status) - return WrappedPickle.dumps(ret) - - # get taskParamsMap with TaskID def getTaskParamsMap(req, jediTaskID): try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) ret = userIF.getTaskParamsMap(jediTaskID) return WrappedPickle.dumps(ret) @@ -2253,66 +1549,67 @@ def getTaskParamsMap(req, jediTaskID): def updateWorkers(req, harvesterID, workers): # check security if not isSecure(req): - return json.dumps((False, "SSL is required")) + return json.dumps((False, MESSAGE_SSL)) # get DN user = _getDN(req) # hostname host = req.get_remote_host() - retVal = None + return_value = None tStart = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) # convert try: data = json.loads(workers) except Exception: - retVal = json.dumps((False, "failed to load JSON")) + return_value = json.dumps((False, MESSAGE_JSON)) # update - if retVal is None: - retVal = userIF.updateWorkers(user, host, harvesterID, data) + if return_value is None: + return_value = userIF.updateWorkers(user, host, harvesterID, data) tDelta = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - tStart - _logger.debug("updateWorkers %s took %s.%03d sec" % (harvesterID, tDelta.seconds, tDelta.microseconds / 1000)) - return retVal + _logger.debug(f"updateWorkers {harvesterID} took {tDelta.seconds}.{tDelta.microseconds // 1000:03d} sec") + + return return_value # update workers def updateServiceMetrics(req, harvesterID, metrics): # check security if not isSecure(req): - return json.dumps((False, "SSL is required")) + return json.dumps((False, MESSAGE_SSL)) user = _getDN(req) host = req.get_remote_host() - retVal = None + return_value = None tStart = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) # convert try: data = json.loads(metrics) except Exception: - retVal = json.dumps((False, "failed to load JSON")) + return_value = json.dumps((False, MESSAGE_JSON)) # update - if retVal is None: - retVal = userIF.updateServiceMetrics(user, host, harvesterID, data) + if return_value is None: + return_value = userIF.updateServiceMetrics(user, host, harvesterID, data) tDelta = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - tStart + _logger.debug(f"updateServiceMetrics {harvesterID} took {tDelta.seconds}.{tDelta.microseconds // 1000:03d} sec") - _logger.debug("updateServiceMetrics %s took %s.%03d sec" % (harvesterID, tDelta.seconds, tDelta.microseconds / 1000)) - return retVal + return return_value # add harvester dialog messages def addHarvesterDialogs(req, harvesterID, dialogs): # check security if not isSecure(req): - return json.dumps((False, "SSL is required")) + return json.dumps((False, MESSAGE_SSL)) # get DN user = _getDN(req) # convert try: data = json.loads(dialogs) except Exception: - return json.dumps((False, "failed to load JSON")) + return json.dumps((False, MESSAGE_JSON)) # update return userIF.addHarvesterDialogs(user, harvesterID, data) @@ -2321,7 +1618,7 @@ def addHarvesterDialogs(req, harvesterID, dialogs): def harvesterIsAlive(req, harvesterID, data=None): # check security if not isSecure(req): - return json.dumps((False, "SSL is required")) + return json.dumps((False, MESSAGE_SSL)) # get DN user = _getDN(req) # hostname @@ -2333,7 +1630,7 @@ def harvesterIsAlive(req, harvesterID, data=None): else: data = dict() except Exception: 
- return json.dumps((False, "failed to load JSON")) + return json.dumps((False, MESSAGE_JSON)) # update return userIF.harvesterIsAlive(user, host, harvesterID, data) @@ -2349,7 +1646,7 @@ def getWorkerStats(req): def reportWorkerStats(req, harvesterID, siteName, paramsList): # check security if not isSecure(req): - return json.dumps((False, "SSL is required")) + return json.dumps((False, MESSAGE_SSL)) # update ret = userIF.reportWorkerStats(harvesterID, siteName, paramsList) return json.dumps(ret) @@ -2359,7 +1656,7 @@ def reportWorkerStats(req, harvesterID, siteName, paramsList): def reportWorkerStats_jobtype(req, harvesterID, siteName, paramsList): # check security if not isSecure(req): - return json.dumps((False, "SSL is required")) + return json.dumps((False, MESSAGE_SSL)) # update ret = userIF.reportWorkerStats_jobtype(harvesterID, siteName, paramsList) return json.dumps(ret) @@ -2369,15 +1666,15 @@ def reportWorkerStats_jobtype(req, harvesterID, siteName, paramsList): def setNumSlotsForWP(req, pandaQueueName, numSlots, gshare=None, resourceType=None, validPeriod=None): # check security if not isSecure(req): - return json.dumps((100, "SSL is required")) + return json.dumps((CODE_SSL, MESSAGE_SSL)) # check role - if not _hasProdRole(req): - return json.dumps((101, "production role is required in the certificate")) + if not _has_production_role(req): + return json.dumps((CODE_LOGIC, "production role is required in the certificate")) # convert try: numSlots = int(numSlots) except Exception: - return json.dumps((102, "numSlots must be int")) + return json.dumps((CODE_OTHER_PARAMS, "numSlots must be int")) # execute return userIF.setNumSlotsForWP(pandaQueueName, numSlots, gshare, resourceType, validPeriod) @@ -2386,15 +1683,15 @@ def setNumSlotsForWP(req, pandaQueueName, numSlots, gshare=None, resourceType=No def enableJumboJobs(req, jediTaskID, nJumboJobs, nJumboPerSite=None): # check security if not isSecure(req): - return json.dumps((100, "SSL is required")) + return json.dumps((CODE_SSL, MESSAGE_SSL)) # check role - if not _hasProdRole(req): - return json.dumps((101, "production role is required in the certificate")) + if not _has_production_role(req): + return json.dumps((CODE_LOGIC, "production role is required in the certificate")) # convert try: nJumboJobs = int(nJumboJobs) except Exception: - return json.dumps((102, "nJumboJobs must be int")) + return json.dumps((CODE_OTHER_PARAMS, "nJumboJobs must be int")) try: nJumboPerSite = int(nJumboPerSite) except Exception: @@ -2408,7 +1705,7 @@ def getUserJobMetadata(req, jediTaskID): try: jediTaskID = int(jediTaskID) except Exception: - return WrappedPickle.dumps((False, "jediTaskID must be an integer")) + return WrappedPickle.dumps((False, MESSAGE_TASK_ID)) return userIF.getUserJobMetadata(jediTaskID) @@ -2425,24 +1722,15 @@ def getJumboJobDatasets(req, n_days, grace_period=0): return userIF.getJumboJobDatasets(n_days, grace_period) -# get Global Share overview -def getGShareStatus(req): - # check security - if not isSecure(req): - return json.dumps((False, "SSL is required")) - ret = userIF.getGShareStatus() - return json.dumps(ret) - - # send Harvester the command to clean up the workers for a panda queue def sweepPQ(req, panda_queue, status_list, ce_list, submission_host_list): # check security if not isSecure(req): - return json.dumps((False, "SSL is required")) + return json.dumps((False, MESSAGE_SSL)) # check role - prod_role = _hasProdRole(req) + prod_role = _has_production_role(req) if not prod_role: - return json.dumps((False, 
"production or pilot role required")) + return json.dumps((False, MESSAGE_PROD_ROLE)) return json.dumps((True, userIF.sweepPQ(panda_queue, status_list, ce_list, submission_host_list))) @@ -2461,15 +1749,14 @@ def decode_idds_enum(d): # relay iDDS command def relay_idds_command(req, command_name, args=None, kwargs=None, manager=None, json_outputs=None): - tmpLog = LogWrapper( + tmp_log = LogWrapper( _logger, f"relay_idds_command-{datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat('/')}", ) # check security if not isSecure(req): - tmpStr = "SSL is required" - tmpLog.error(tmpStr) - return json.dumps((False, tmpStr)) + tmp_log.error(MESSAGE_SSL) + return json.dumps((False, MESSAGE_SSL)) try: manager = resolve_bool(manager) if not manager: @@ -2483,14 +1770,14 @@ def relay_idds_command(req, command_name, args=None, kwargs=None, manager=None, else: c = iDDS_Client(idds_host) if not hasattr(c, command_name): - tmpStr = f"{command_name} is not a command of iDDS {c.__class__.__name__}" - tmpLog.error(tmpStr) - return json.dumps((False, tmpStr)) + tmp_str = f"{command_name} is not a command of iDDS {c.__class__.__name__}" + tmp_log.error(tmp_str) + return json.dumps((False, tmp_str)) if args: try: args = idds.common.utils.json_loads(args) except Exception as e: - tmpLog.warning(f"failed to load args json with {str(e)}") + tmp_log.warning(f"failed to load args json with {str(e)}") args = json.loads(args, object_hook=decode_idds_enum) else: args = [] @@ -2498,7 +1785,7 @@ def relay_idds_command(req, command_name, args=None, kwargs=None, manager=None, try: kwargs = idds.common.utils.json_loads(kwargs) except Exception as e: - tmpLog.warning(f"failed to load kwargs json with {str(e)}") + tmp_log.warning(f"failed to load kwargs json with {str(e)}") kwargs = json.loads(kwargs, object_hook=decode_idds_enum) else: kwargs = {} @@ -2509,23 +1796,23 @@ def relay_idds_command(req, command_name, args=None, kwargs=None, manager=None, dn = req.subprocess_env.get("SSL_CLIENT_S_DN") if dn: c.set_original_user(user_name=clean_user_id(dn)) - tmpLog.debug(f"execute: class={c.__class__.__name__} com={command_name} host={idds_host} args={str(args)[:200]} kwargs={str(kwargs)[:200]}") + tmp_log.debug(f"execute: class={c.__class__.__name__} com={command_name} host={idds_host} args={str(args)[:200]} kwargs={str(kwargs)[:200]}") ret = getattr(c, command_name)(*args, **kwargs) - tmpLog.debug(f"ret: {str(ret)[:200]}") + tmp_log.debug(f"ret: {str(ret)[:200]}") try: return json.dumps((True, ret)) except Exception: return idds.common.utils.json_dumps((True, ret)) except Exception as e: - tmpStr = f"failed to execute command with {str(e)}" - tmpLog.error(f"{tmpStr} {traceback.format_exc()}") - return json.dumps((False, tmpStr)) + tmp_str = f"failed to execute command with {str(e)}" + tmp_log.error(f"{tmp_str} {traceback.format_exc()}") + return json.dumps((False, tmp_str)) # relay iDDS workflow command with ownership check def execute_idds_workflow_command(req, command_name, kwargs=None, json_outputs=None): try: - tmpLog = LogWrapper( + tmp_log = LogWrapper( _logger, f"execute_idds_workflow_command-{datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat('/')}", ) @@ -2546,9 +1833,9 @@ def execute_idds_workflow_command(req, command_name, kwargs=None, json_outputs=N elif command_name in ["abort", "suspend", "resume", "retry", "finish"]: check_owner = True else: - tmpMsg = f"{command_name} is unsupported" - tmpLog.error(tmpMsg) - return json.dumps((False, tmpMsg)) + tmp_message = 
f"{command_name} is unsupported" + tmp_log.error(tmp_message) + return json.dumps((False, tmp_message)) # check owner c = iDDS_ClientManager(idds_host) if json_outputs: @@ -2557,39 +1844,39 @@ def execute_idds_workflow_command(req, command_name, kwargs=None, json_outputs=N if check_owner: # requester if not dn: - tmpMsg = "SSL_CLIENT_S_DN is missing in HTTP request" - tmpLog.error(tmpMsg) - return json.dumps((False, tmpMsg)) + tmp_message = "SSL_CLIENT_S_DN is missing in HTTP request" + tmp_log.error(tmp_message) + return json.dumps((False, tmp_message)) requester = clean_user_id(dn) # get request_id request_id = kwargs.get("request_id") if request_id is None: - tmpMsg = "request_id is missing" - tmpLog.error(tmpMsg) - return json.dumps((False, tmpMsg)) + tmp_message = "request_id is missing" + tmp_log.error(tmp_message) + return json.dumps((False, tmp_message)) # get request req = c.get_requests(request_id=request_id) if not req: - tmpMsg = f"request {request_id} is not found" - tmpLog.error(tmpMsg) - return json.dumps((False, tmpMsg)) + tmp_message = f"request {request_id} is not found" + tmp_log.error(tmp_message) + return json.dumps((False, tmp_message)) user_name = req[0].get("username") if user_name and user_name != requester: - tmpMsg = f"request {request_id} is not owned by {requester}" - tmpLog.error(tmpMsg) - return json.dumps((False, tmpMsg)) + tmp_message = f"request {request_id} is not owned by {requester}" + tmp_log.error(tmp_message) + return json.dumps((False, tmp_message)) # set original username if dn: c.set_original_user(user_name=clean_user_id(dn)) # execute command - tmpLog.debug(f"com={command_name} host={idds_host} kwargs={str(kwargs)}") + tmp_log.debug(f"com={command_name} host={idds_host} kwargs={str(kwargs)}") ret = getattr(c, command_name)(**kwargs) - tmpLog.debug(str(ret)) + tmp_log.debug(str(ret)) if isinstance(ret, dict) and "message" in ret: return json.dumps((True, [ret["status"], ret["message"]])) return json.dumps((True, ret)) except Exception as e: - tmpLog.error(f"failed with {str(e)} {traceback.format_exc()}") + tmp_log.error(f"failed with {str(e)} {traceback.format_exc()}") return json.dumps((False, f"server failed with {str(e)}")) @@ -2597,40 +1884,38 @@ def execute_idds_workflow_command(req, command_name, kwargs=None, json_outputs=N def send_command_to_job(req, panda_id, com): # check security if not isSecure(req): - return json.dumps((False, "SSL is required")) + return json.dumps((False, MESSAGE_SSL)) # check role - prod_role = _hasProdRole(req) + prod_role = _has_production_role(req) if not prod_role: - return json.dumps((False, "production or pilot role required")) + return json.dumps((False, MESSAGE_PROD_ROLE)) return json.dumps(userIF.send_command_to_job(panda_id, com)) # set user secret def set_user_secret(req, key=None, value=None): - tmpLog = LogWrapper(_logger, f"set_user_secret-{datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat('/')}") + tmp_log = LogWrapper(_logger, f"set_user_secret-{datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat('/')}") # get owner dn = req.subprocess_env.get("SSL_CLIENT_S_DN") if not dn: - tmpMsg = "SSL_CLIENT_S_DN is missing in HTTP request" - tmpLog.error(tmpMsg) - return json.dumps((False, tmpMsg)) + tmp_message = "SSL_CLIENT_S_DN is missing in HTTP request" + tmp_log.error(tmp_message) + return json.dumps((False, tmp_message)) owner = clean_user_id(dn) return json.dumps(userIF.set_user_secret(owner, key, value)) # get user secrets def get_user_secrets(req, 
keys=None, get_json=None): - tmpLog = LogWrapper(_logger, f"get_user_secrets-{datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat('/')}") + tmp_log = LogWrapper(_logger, f"get_user_secrets-{datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat('/')}") # get owner dn = req.subprocess_env.get("SSL_CLIENT_S_DN") - if get_json == "True": - get_json = True - else: - get_json = False + get_json = resolve_true(get_json) + if not dn: - tmpMsg = "SSL_CLIENT_S_DN is missing in HTTP request" - tmpLog.error(tmpMsg) - return json.dumps((False, tmpMsg)) + tmp_message = "SSL_CLIENT_S_DN is missing in HTTP request" + tmp_log.error(tmp_message) + return json.dumps((False, tmp_message)) owner = clean_user_id(dn) return json.dumps(userIF.get_user_secrets(owner, keys, get_json)) @@ -2639,5 +1924,5 @@ def get_user_secrets(req, keys=None, get_json=None): def get_files_in_datasets(req, task_id, dataset_types="input,pseudo_input"): # check security if not isSecure(req): - return json.dumps((False, "SSL is required")) + return json.dumps((False, MESSAGE_SSL)) return json.dumps(userIF.get_files_in_datasets(task_id, dataset_types))
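
Reviewer note: the refactored handlers in this patch call boolean-parsing helpers (resolve_true, resolve_false) and shared constants (MESSAGE_SSL, MESSAGE_PROD_ROLE, MESSAGE_TASK_ID, MESSAGE_DATABASE, MESSAGE_JSON, CODE_SSL, CODE_LOGIC, CODE_OTHER_PARAMS) whose definitions are not part of this excerpt. The sketch below only illustrates the behaviour the call sites appear to assume, inferred from the string-flag and error-tuple patterns they replace; the actual definitions live elsewhere in the patch and their exact wording or placement may differ.

# Hypothetical sketch -- not the actual definitions from this patch.

# resolve_true mirrors the old `x = True if x == "True" else False` pattern
# used for HTTP parameters that arrive as strings.
def resolve_true(var):
    return var == "True" or var is True

# resolve_false mirrors the old `x = False if x == "False" else True` pattern,
# i.e. the flag defaults to True unless explicitly set to "False".
def resolve_false(var):
    return not (var == "False" or var is False)

# Error codes inferred from the literals (100/101/102) they replace.
CODE_SSL = 100
CODE_LOGIC = 101
CODE_OTHER_PARAMS = 102

# Messages inferred from the strings they replace; the original code mixed
# several variants ("SSL is required", "secure connection is required", ...),
# so the consolidated wording here is an assumption.
MESSAGE_SSL = "SSL secure connection is required"
MESSAGE_PROD_ROLE = "production or pilot role required"
MESSAGE_TASK_ID = "jediTaskID must be an integer"
MESSAGE_DATABASE = "database error in the panda server"
MESSAGE_JSON = "failed to load JSON"

# Example of the pattern used throughout the patch:
#     toPending = resolve_true(toPending)          # "True" -> True, anything else -> False
#     firstSubmission = resolve_false(firstSubmission)  # "False" -> False, anything else -> True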