import atexit
import time
import os
import commands
import sys
import pUtil
import signal
from shutil import copy, copy2
from random import shuffle
from glob import glob
from JobRecovery import JobRecovery
from processes import killProcesses, checkProcesses, killOrphans, getMaxMemoryUsageFromCGroups, isCGROUPSSite
from PilotErrors import PilotErrors
from FileStateClient import createFileStates, dumpFileStates, getFileState
from WatchDog import WatchDog
from PilotTCPServer import PilotTCPServer
from UpdateHandler import UpdateHandler
from RunJobFactory import RunJobFactory
from FileHandling import updatePilotErrorReport
import inspect
def lineno():
""" Returns the current line number in our program """
return inspect.currentframe().f_back.f_lineno
globalSite = None
class Monitor:
__skip = False
__localsizelimit_stdout = 2*1024**2 # size limit of the payload stdout during running, in kB (i.e. 2 GB)
def __init__(self, env):
self.__error = PilotErrors()
self.__env = env
#self.__env['pilot_startup'] = None
self.__env['create_softlink'] = True
self.__env['return_code'] = None
self.__env['curtime_sp'] = int(time.time())
self.__env['lastTimeFilesWereModified'] = {}
self.__wdog = WatchDog()
self.__runJob = None # Remember the RunJob instance
# register cleanup function
atexit.register(pUtil.cleanup, self.__wdog, self.__env['pilot_initdir'], self.__env['wrapperFlag'], self.__env['rmwkdir'])
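# forward termination and fault signals (SIGTERM, SIGQUIT, SIGSEGV, SIGXCPU, SIGBUS, SIGUSR1) to the common pUtil.sig2exc handler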
signal.signal(signal.SIGTERM, pUtil.sig2exc)
signal.signal(signal.SIGQUIT, pUtil.sig2exc)
signal.signal(signal.SIGSEGV, pUtil.sig2exc)
signal.signal(signal.SIGXCPU, pUtil.sig2exc)
signal.signal(signal.SIGBUS, pUtil.sig2exc)
signal.signal(signal.SIGUSR1, pUtil.sig2exc)
def __allowLoopingJobKiller(self):
""" Should the looping job killer be run? """
# This decision is not made at the Experiment level; it depends only on the type of subprocess that is requested
# E.g. the looping job killer should normally be run for a normal ATLAS job, but not on an HPC. Therefore, the
# decision is made inside the subprocess class (RunJob, RunJobHPC, ..)
allow = True
# Which RunJob class do we need to ask?
if not self.__runJob:
# First get an Experiment object, which will tell us which subprocess to ask
thisExperiment = pUtil.getExperiment(self.__env['experiment'])
subprocessName = thisExperiment.getSubprocessName(self.__env['jobDic']["prod"][1].eventService)
pUtil.tolog("subprocessName = %s" % (subprocessName))
# Now get an instance of the corresponding class from the RunJobFactory
factory = RunJobFactory()
_rJ = factory.newRunJob(subprocessName)
self.__runJob = _rJ()
if self.__runJob:
# Is the looping job killer allowed by the subprocess?
allow = self.__runJob.allowLoopingJobKiller()
name = os.path.splitext(self.__runJob.getRunJobFileName())[0]
if allow:
pUtil.tolog("Looping job killer is allowed by subprocess %s" % (name))
else:
pUtil.tolog("Looping job killer is not allowed by subprocess %s" % (name))
else:
pUtil.tolog("!!WARNING!!2121!! Could not get an instance of a RunJob* class (cannot decide about looping job killer)")
allow = False
return allow
def __checkPayloadStdout(self):
""" Check the size of the payload stdout """
# loop over all parallel jobs
# (after multitasking was removed from the pilot, there is actually only one job)
for k in self.__env['jobDic'].keys():
# get list of log files
fileList = glob("%s/log.*" % (self.__env['jobDic'][k][1].workdir))
# is this a multi-trf job?
nJobs = self.__env['jobDic'][k][1].jobPars.count("\n") + 1
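# (jobPars separates the command lines of a multi-trf job with newlines, so the newline count + 1 gives the number of trf steps)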
for _i in range(nJobs):
# get name of payload stdout file created by the pilot
_stdout = self.__env['jobDic'][k][1].stdout
if nJobs > 1:
_stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))
filename = "%s/%s" % (self.__env['jobDic'][k][1].workdir, _stdout)
# add the primary stdout file to the fileList
fileList.append(filename)
# now loop over all files and check each individually (any large enough file will fail the job)
for filename in fileList:
if os.path.exists(filename):
try:
# get file size in bytes
fsize = os.path.getsize(filename)
except Exception, e:
pUtil.tolog("!!WARNING!!1999!! Could not read file size of %s: %s" % (filename, str(e)))
else:
# is the file too big?
if fsize > self.__env['localsizelimit_stdout'] * 1024: # convert to bytes
pilotErrorDiag = "Payload stdout file too big: %d B (larger than limit %d B)" % (fsize, self.__env['localsizelimit_stdout'] * 1024)
pUtil.tolog("!!FAILED!!1999!! %s" % (pilotErrorDiag))
# kill the job
#pUtil.tolog("Going to kill pid %d" %lineno())
killProcesses(self.__env['jobDic'][k][0], self.__env['jobDic'][k][1].pgrp)
self.__env['jobDic'][k][1].result[0] = "failed"
self.__env['jobDic'][k][1].currentState = self.__env['jobDic'][k][1].result[0]
self.__env['jobDic'][k][1].result[2] = self.__error.ERR_STDOUTTOOBIG
self.__env['jobDic'][k][1].pilotErrorDiag = pilotErrorDiag
self.__skip = True
# store the error info
updatePilotErrorReport(self.__env['jobDic'][k][1].result[2], pilotErrorDiag, "1", self.__env['jobDic'][k][1].jobId, self.__env['pilot_initdir'])
# remove the payload stdout file after the log extracts have been created
# remove any lingering input files from the work dir
if self.__env['jobDic'][k][1].inFiles:
if len(self.__env['jobDic'][k][1].inFiles) > 0:
ec = pUtil.removeFiles(self.__env['jobDic'][k][1].workdir, self.__env['jobDic'][k][1].inFiles)
else:
pUtil.tolog("Payload stdout (%s) within allowed size limit (%d B): %d B" % (_stdout, self.__env['localsizelimit_stdout']*1024, fsize))
else:
pUtil.tolog("(Skipping file size check of payload stdout file (%s) since it has not been created yet)" % (_stdout))
def __getMaxWorkDirSize(self):
"""
Return the maximum allowed size of the work directory for user jobs
"""
try:
maxwdirsize = int(pUtil.readpar('maxwdir'))*1024**2 # from MB to B, e.g. 16336 MB -> 17,129,537,536 B
except:
maxInputSize = pUtil.getMaxInputSize()
maxwdirsize = maxInputSize + self.__env['localsizelimit_stdout']*1024
pUtil.tolog("Work directory size check will use %d B as a max limit (maxinputsize [%d B] + local size limit for stdout [%d B])" %\
(maxwdirsize, maxInputSize, self.__env['localsizelimit_stdout']*1024))
else:
pUtil.tolog("Work directory size check will use %d B as a max limit (maxwdirsize)" % (maxwdirsize))
return maxwdirsize
def __checkWorkDir(self):
"""
Check the size of the pilot work dir for user jobs
"""
# get the limit of the workdir
maxwdirsize = self.__getMaxWorkDirSize()
# after multitasking was removed from the pilot, there is actually only one job
for k in self.__env['jobDic'].keys():
# get size of workDir
workDir = "%s" % (self.__env['jobDic'][k][1].workdir)
if os.path.exists(workDir):
try:
# get the size in kB
size_str = commands.getoutput("du -sk %s" % (workDir))
except Exception, e:
pUtil.tolog("Warning: failed to check remaining space: %s, %s" % (workDir, str(e)))
else:
# e.g., size_str = "900\t/scratch-local/nilsson/pilot3z"
try:
# remove tab and path, and convert to int (and B)
size = int(size_str.split("\t")[0])*1024
except Exception, e:
pUtil.tolog("Warning: failed to convert to int: %s" % str(e))
else:
# is user dir within allowed size limit?
if size > maxwdirsize:
pilotErrorDiag = "Work directory (%s) too large: %d B (must be < %d B)" %\
(workDir, size, maxwdirsize)
pUtil.tolog("!!FAILED!!1999!! %s" % (pilotErrorDiag))
# kill the job
killProcesses(self.__env['jobDic'][k][0], self.__env['jobDic'][k][1].pgrp)
self.__env['jobDic'][k][1].result[0] = "failed"
self.__env['jobDic'][k][1].currentState = self.__env['jobDic'][k][1].result[0]
self.__env['jobDic'][k][1].result[2] = self.__error.ERR_USERDIRTOOLARGE
self.__env['jobDic'][k][1].pilotErrorDiag = pilotErrorDiag
self.__skip = True
# store the error info
updatePilotErrorReport(self.__env['jobDic'][k][1].result[2], pilotErrorDiag, "1", self.__env['jobDic'][k][1].jobId, self.__env['pilot_initdir'])
# remove any lingering input files from the work dir
if self.__env['jobDic'][k][1].inFiles:
if len(self.__env['jobDic'][k][1].inFiles) > 0:
ec = pUtil.removeFiles(self.__env['jobDic'][k][1].workdir, self.__env['jobDic'][k][1].inFiles)
else:
pUtil.tolog("Checked size of user analysis work directory %s: %d B (within %d B limit)"
%(workDir, size, maxwdirsize))
else:
pUtil.tolog("(Skipping size check of workDir since it has not been created yet)")
def __checkLocalSpace(self, disk):
""" check the remaining local disk space during running """
spaceleft = int(disk)*1024**2 # B (node.disk is in MB)
_localspacelimit = self.__env['localspacelimit'] * 1024 # B
# do we have enough local disk space to continue running the job?
if spaceleft < _localspacelimit:
pilotErrorDiag = "Too little space left on local disk to run job: %d B (need > %d B)" % (spaceleft, _localspacelimit)
pUtil.tolog("!!FAILED!!1999!! %s" % (pilotErrorDiag))
for k in self.__env['jobDic'].keys():
# kill the job
#pUtil.tolog("Going to kill pid %d" %lineno())
killProcesses(self.__env['jobDic'][k][0], self.__env['jobDic'][k][1].pgrp)
self.__env['jobDic'][k][1].result[0] = "failed"
self.__env['jobDic'][k][1].currentState = self.__env['jobDic'][k][1].result[0]
self.__env['jobDic'][k][1].result[2] = self.__error.ERR_NOLOCALSPACE
self.__env['jobDic'][k][1].pilotErrorDiag = pilotErrorDiag
self.__skip = True
# store the error info
updatePilotErrorReport(self.__env['jobDic'][k][1].result[2], pilotErrorDiag, "1", self.__env['jobDic'][k][1].jobId, self.__env['pilot_initdir'])
# remove any lingering input files from the work dir
if self.__env['jobDic'][k][1].inFiles:
if len(self.__env['jobDic'][k][1].inFiles) > 0:
ec = pUtil.removeFiles(self.__env['jobDic'][k][1].workdir, self.__env['jobDic'][k][1].inFiles)
else:
pUtil.tolog("Remaining local disk space: %d B" % (spaceleft))
def __check_remaining_space(self):
"""
every ten minutes, check the remaining disk space and size of user workDir
and the size of the payload stdout file
"""
if (int(time.time()) - self.__env['curtime_sp']) > self.__env['update_freq_space']:
# check the size of the payload stdout
self.__skip = self.__checkPayloadStdout()
# update the worker node info (i.e. get the remaining disk space)
self.__env['workerNode'].collectWNInfo(self.__env['thisSite'].workdir)
self.__skip = self.__checkLocalSpace(self.__env['workerNode'].disk)
# check the size of the workdir for user jobs
self.__skip = self.__checkWorkDir()
# update the time for checking disk space
self.__env['curtime_sp'] = int(time.time())
def __createSoftLink(self):
""" create a soft link to the athena stdout in the site work dir """
# create_softlink is mutable
# will only point to the first stdout file currently
for k in self.__env['jobDic'].keys():
# is this a multi-trf job?
nJobs = self.__env['jobDic'][k][1].jobPars.count("\n") + 1
for _i in range(nJobs):
_stdout = self.__env['jobDic'][k][1].stdout
if nJobs > 1:
_stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))
filename = os.path.join(self.__env['jobDic'][k][1].workdir, _stdout)
# create the soft link only if the stdout has been created
if os.path.exists(filename):
lnfilename = os.path.join(self.__env['thisSite'].workdir, _stdout)
# only create the soft link once..
if not os.path.exists(lnfilename):
# ..and only if the size of stdout is > 0
if os.path.getsize(filename) > 0:
ec, rs = commands.getstatusoutput("ln -s %s %s" % (filename, lnfilename))
if ec == 0:
pUtil.tolog("Created soft link to %s in sitedir: %s" % (_stdout, lnfilename))
else:
pUtil.tolog("!!WARNING!!1999!! Could not create soft link: %d, %s" % (ec, rs))
self.__env['create_softlink'] = False
else:
self.__env['create_softlink'] = False
else:
pUtil.tolog("(%s has not been created yet)" % (_stdout))
def create_softlink(self):
"""
create a soft link in the site workdir to the payload stdout
"""
if self.__env['create_softlink']:
try:
self.__createSoftLink()
except Exception, e:
pUtil.tolog("!!WARNING!!1999!! Caught an exception during soft link creation: %s" % str(e))
self.__env['create_softlink'] = False
def __failMaxTimeJob(self):
"""
Reached maximum batch system time limit, fail the job
"""
pUtil.tolog("!!WARNING!!1999!! The pilot has decided to kill the job since there is less than 10 minutes of the allowed batch system running time")
pilotErrorDiag = "Reached maximum batch system time limit"
pUtil.tolog("!!FAILED!!1999!! %s" % (pilotErrorDiag))
# after multitasking was removed from the pilot, there is actually only one job
for k in self.__env['jobDic'].keys():
# kill the job
#pUtil.tolog("Going to kill pid %d" %lineno())
killProcesses(self.__env['jobDic'][k][0], self.__env['jobDic'][k][1].pgrp)
self.__env['jobDic'][k][1].result[0] = "failed"
self.__env['jobDic'][k][1].currentState = self.__env['jobDic'][k][1].result[0]
self.__env['jobDic'][k][1].result[2] = self.__error.ERR_REACHEDMAXTIME
self.__env['jobDic'][k][1].pilotErrorDiag = pilotErrorDiag
# store the error info
updatePilotErrorReport(self.__env['jobDic'][k][1].result[2], pilotErrorDiag, "1", self.__env['jobDic'][k][1].jobId, self.__env['pilot_initdir'])
def __monitor_processes(self):
# monitor the number of running processes and the pilot running time
if (int(time.time()) - self.__env['curtime_proc']) > self.__env['update_freq_proc']:
# check the number of running processes
nProc = checkProcesses(self.__env['jobDic']["prod"][0])
if nProc > self.__env['maxNProc']:
self.__env['maxNProc'] = nProc
# monitor the pilot running time (once every five minutes = update_freq_proc)
time_passed_since_pilot_startup = int(time.time()) - self.__env['pilot_startup']
pUtil.tolog("Time passed since pilot startup = %d s (maximum allowed batch system time = %d s)"
% (time_passed_since_pilot_startup, self.__env['maxtime']))
if (self.__env['maxtime'] - time_passed_since_pilot_startup) < 10*60 and not self.__env['stageout']:
# reached maximum batch system time limit
if not "NO_PILOT_TIME_LIMIT_KILL" in pUtil.readpar('catchall'):
self.__failMaxTimeJob()
self.__skip = True
else:
pUtil.tolog("Max allowed batch system time passed (%d s) - but pilot will not kill job since NO_PILOT_TIME_LIMIT_KILL is present in catchall field" % (time_passed_since_pilot_startup))
# update the time for checking processes
self.__env['curtime_proc'] = int(time.time())
def __verifyOutputFileSizes(self):
""" Verify output file sizes """
pilotErrorDiag = ""
job_index = 0
rc = 0
pUtil.tolog("Verifying output file sizes")
for k in self.__env['jobDic'].keys():
if len(self.__env['jobDic'][k][1].outFiles) > 0:
for file_name in self.__env['jobDic'][k][1].outFiles:
findFlag = False
# locate the file first
out = commands.getoutput("find %s -name %s" % (self.__env['jobDic'][k][1].workdir, file_name))
if out != "":
for line in out.split('\n'):
try:
file_size = os.path.getsize(line)
findFlag = True
if file_size > self.__env['outputlimit']:
pilotErrorDiag = 'File: \"%s\" is too large (%d > %d B)' % (line, file_size, self.__env['outputlimit'])
pUtil.tolog('!!WARNING!!2999!!%s' % (pilotErrorDiag))
job_index = k
rc = self.__error.ERR_OUTPUTFILETOOLARGE
else:
pUtil.tolog('File: \"%s\" currently has size %d B (< %d B)' % (line, file_size, self.__env['outputlimit']))
except:
pass
if not findFlag and file_name != self.__env['jobDic'][k][1].logFile:
# if not findFlag and not ".log." in file_name:
pUtil.tolog("Could not access file %s: %s" % (file_name, out))
return rc, pilotErrorDiag, job_index
def __checkOutputFileSizes(self):
""" check that the output file sizes are within the limit """
# return True for too large files, to skip looping test and normal server update
# verify the file sizes
rc, pilotErrorDiag, job_index = self.__verifyOutputFileSizes()
if rc == 0:
pUtil.tolog("Completed output file size verification")
else:
# found too large output file, stop the job
self.__env['jobDic'][job_index][1].result[0] = "failed"
self.__env['jobDic'][job_index][1].currentState = self.__env['jobDic'][job_index][1].result[0]
self.__env['jobDic'][job_index][1].result[2] = rc
self.__env['jobDic'][job_index][1].pilotErrorDiag = pilotErrorDiag
self.__skip = True
# store the error info
updatePilotErrorReport(self.__env['jobDic'][job_index][1].result[2], pilotErrorDiag, "1", self.__env['jobDic'][job_index][1].jobId, self.__env['pilot_initdir'])
def __verify_output_sizes(self):
# verify output file sizes every ten minutes
if (int(time.time()) - self.__env['curtime_of']) > self.__env['update_freq_space']:
# check the output file sizes
self.__skip = self.__checkOutputFileSizes()
# update the time for checking output file sizes
self.__env['curtime_of'] = int(time.time())
# FOR TESTING ONLY
# def __verify_memory_limits(self):
# # verify output file sizes every five minutes
# if (int(time.time()) - self.__env['curtime_mem']) > 1*60: #self.__env['update_freq_mem']:
# # check the CGROUPS memory
# max_memory = getMaxMemoryUsageFromCGroups()
# if max_memory:
# pUtil.tolog("cgroups max_memory = %s" % (max_memory))
# else:
# pUtil.tolog("cgroups max_memory not defined")
#
# # update the time for checking memory
# self.__env['curtime_mem'] = int(time.time())
def __killLoopingJob(self, job, pid, setStageout=False):
""" kill the looping job """
# the child process is looping, kill it
pilotErrorDiag = "Pilot has decided to kill looping job %s at %s" %\
(job.jobId, pUtil.timeStamp())
pUtil.tolog("!!FAILED!!1999!! %s" % (pilotErrorDiag))
cmd = 'ps -fwu %s' % (commands.getoutput("whoami"))
pUtil.tolog("%s: %s" % (cmd + '\n', commands.getoutput(cmd)))
cmd = 'ls -ltr %s' % (job.workdir)
pUtil.tolog("%s: %s" % (cmd + '\n', commands.getoutput(cmd)))
cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % (commands.getoutput("whoami"))
pUtil.tolog("%s: %s" % (cmd + '\n', commands.getoutput(cmd)))
killProcesses(pid, job.pgrp)
if self.__env['stagein']:
pilotErrorDiag += " (Job stuck in stage-in state)"
pUtil.tolog("!!FAILED!!1999!! Job stuck in stage-in state: file copy time-out")
job.result[2] = self.__error.ERR_GETTIMEOUT
elif setStageout:
pilotErrorDiag += " (Job stuck in stage-out state)"
pUtil.tolog("!!FAILED!!1999!! Job stuck in stage-out state: file copy time-out")
job.result[2] = self.__error.ERR_PUTTIMEOUT
else:
job.result[2] = self.__error.ERR_LOOPINGJOB
job.result[0] = "failed"
job.currentState = job.result[0]
job.pilotErrorDiag = pilotErrorDiag
# store the error info
updatePilotErrorReport(job.result[2], pilotErrorDiag, "1", job.jobId, self.__env['pilot_initdir'])
# remove any lingering input files from the work dir
if job.inFiles:
if len(job.inFiles) > 0:
ec = pUtil.removeFiles(job.workdir, job.inFiles)
return job
def __updateJobs(self):
""" Make final server update for all ended jobs"""
# get the stdout tails
stdout_dictionary = pUtil.getStdoutDictionary(self.__env['jobDic'])
# loop over all parallel jobs, update server, kill job if necessary
# (after multitasking was removed from the pilot, there is actually only one job)
for k in self.__env['jobDic'].keys():
tmp = self.__env['jobDic'][k][1].result[0]
if tmp != "finished" and tmp != "failed" and tmp != "holding":
# get the tail if possible
try:
self.__env['stdout_tail'] = stdout_dictionary[self.__env['jobDic'][k][1].jobId]
index = "path-%s" % (self.__env['jobDic'][k][1].jobId)
self.__env['stdout_path'] = stdout_dictionary[index]
pUtil.tolog("stdout_path=%s at index=%s" % (self.__env['stdout_path'], index))
except Exception, e:
self.__env['stdout_tail'] = "(stdout tail not available)"
self.__env['stdout_path'] = ""
pUtil.tolog("no stdout_path: %s" % (e))
# update the panda server
ret, retNode = pUtil.updatePandaServer(self.__env['jobDic'][k][1], stdout_tail = self.__env['stdout_tail'], stdout_path = self.__env['stdout_path'])
if ret == 0:
pUtil.tolog("Successfully updated panda server at %s" % pUtil.timeStamp())
else:
pUtil.tolog("!!WARNING!!1999!! updatePandaServer returned a %d" % (ret))
# kill a job if signaled from panda server
if "tobekilled" in self.__env['jobDic'][k][1].action:
pilotErrorDiag = "Pilot received a panda server signal to kill job %s at %s" %\
(self.__env['jobDic'][k][1].jobId, pUtil.timeStamp())
pUtil.tolog("!!FAILED!!1999!! %s" % (pilotErrorDiag))
if self.__env['jobrec']:
self.__env['jobrec'] = False
pUtil.tolog("Switching off job recovery")
# kill the real job process(es)
#pUtil.tolog("Going to kill pid %d" %lineno())
killProcesses(self.__env['jobDic'][k][0], self.__env['jobDic'][k][1].pgrp)
self.__env['jobDic'][k][1].result[0] = "failed"
self.__env['jobDic'][k][1].currentState = self.__env['jobDic'][k][1].result[0]
self.__env['jobDic'][k][1].result[2] = self.__error.ERR_PANDAKILL
self.__env['jobDic'][k][1].pilotErrorDiag = pilotErrorDiag
# store the error info
updatePilotErrorReport(self.__env['jobDic'][k][1].result[2], pilotErrorDiag, "1", self.__env['jobDic'][k][1].jobId, self.__env['pilot_initdir'])
# did we receive a command to turn on debug mode?
if "debug" in self.__env['jobDic'][k][1].action.lower():
pUtil.tolog("Pilot received a command to turn on debug mode from the server")
# self.__env['update_freq_server'] = 2*60
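# a shorter server update interval while in debug mode lets the stdout tail reach the server more often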
self.__env['update_freq_server'] = 5*60
pUtil.tolog("Server update frequency lowered to %d s" % (self.__env['update_freq_server']))
self.__env['jobDic'][k][1].debug = "True"
# did we receive a command to turn off debug mode?
if "debugoff" in self.__env['jobDic'][k][1].action.lower():
pUtil.tolog("Pilot received a command to turn off debug mode from the server")
self.__env['update_freq_server'] = 30*60
pUtil.tolog("Server update frequency increased to %d s" % (self.__env['update_freq_server']))
self.__env['jobDic'][k][1].debug = "False"
def __loopingJobKiller(self):
""" Look for looping job """
pUtil.tolog("Checking for looping job")
for k in self.__env['jobDic'].keys():
# if current run state is "stageout", just make sure that the stage out copy command is not hanging
if self.__env['stageout']:
pUtil.tolog("Making sure that the stage out copy command is not hanging")
if not self.__env['stageoutStartTime']:
pUtil.tolog("!!WARNING!!1700!! Stage-out start time not set")
else:
# how much time has passed since stage-out began?
time_passed = int(time.time()) - self.__env['stageoutStartTime']
# find out the timeout limit for the relevant site mover
from SiteMoverFarm import getSiteMover
sitemover = getSiteMover(pUtil.readpar('copytool'), "")
timeout = sitemover.get_timeout()
pUtil.tolog("Stage-out with %s site mover began at %s (%d s ago, site mover time-out: %d s)"
%(pUtil.readpar('copytool'), time.strftime("%H:%M:%S", time.gmtime(self.__env['stageoutStartTime'])), time_passed, timeout))
grace_time = 5*60
if time_passed > timeout:
pUtil.tolog("Adding a grace time of %d s in case copy command has been aborted but file removal is not complete" % (grace_time))
if time_passed > timeout + grace_time:
self.__env['jobDic'][k][1] = self.__killLoopingJob(self.__env['jobDic'][k][1], self.__env['jobDic'][k][0], setStageout = True)
elif len(self.__env['jobDic'][k][1].outFiles) > 0: # only check for looping job for jobs with output files
# loop over all job output files and find the one with the latest modification time
# note that some files might not have been created at this point (e.g. RDO built from HITS)
# locate all files that were modified the last N minutes
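# any recently modified file that survives the exclusion list below counts as payload activity;
# if none is found, lastTimeFilesWereModified is not refreshed and the looping check further down may trigger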
cmd = "find %s -mmin -%d" % (self.__env['jobDic'][k][1].workdir, int(self.__env['loopingLimit']/60))
pUtil.tolog("Executing command: %s" % (cmd))
out = commands.getoutput(cmd)
if out != "":
files = out.split("\n")
if len(files) > 0:
# remove unwanted list items (*.py, *.pyc, workdir, ...)
_files = []
for _file in files:
if not (self.__env['jobDic'][k][1].workdir == _file or
".lib.tgz" in _file or
".py" in _file or
"PoolFileCatalog" in _file or
"setup.sh" in _file or
"jobState" in _file or
"pandaJob" in _file or
"runjob" in _file or
"matched_replicas" in _file or
"DBRelease-" in _file):
_files.append(_file)
# else:
# pUtil.tolog("Ignored file: %s" % (_file))
if _files != []:
pUtil.tolog("Found %d files that were recently updated (e.g. file %s)" % (len(_files), _files[0]))
# s = ""
# for _file in _files:
# s += _file + ", "
# pUtil.tolog(s)
# get the current system time
self.__env['lastTimeFilesWereModified'][k] = int(time.time())
else:
pUtil.tolog("WARNING: found no recently updated files!")
else:
pUtil.tolog("WARNING: found no recently updated files")
else:
pUtil.tolog("WARNING: Found no recently updated files")
# check if the last modification time happened long ago
# (the process is considered to be looping if its files have not been modified within loopingLimit time)
pUtil.tolog("int(time.time())=%d"%int(time.time()))
pUtil.tolog("lastTimeFilesWereModified=%d"%self.__env['lastTimeFilesWereModified'][k])
pUtil.tolog("loopingLimit=%d"%self.__env['loopingLimit'])
if (int(time.time()) - self.__env['lastTimeFilesWereModified'][k]) > self.__env['loopingLimit']:
self.__env['jobDic'][k][1] = self.__killLoopingJob(self.__env['jobDic'][k][1], self.__env['jobDic'][k][0])
def __check_looping_jobs(self):
# every 30 minutes, look for looping jobs
if (int(time.time()) - self.__env['curtime']) > self.__env['update_freq_server'] and not self.__skip: # 30 minutes
# check when the workdir files were last updated or that the stageout command is not hanging
if self.__allowLoopingJobKiller():
self.__loopingJobKiller()
# make final server update for all ended jobs
self.__updateJobs()
# update the time for checking looping jobs
self.__env['curtime'] = int(time.time())
def __set_outputs(self):
# all output will be written to the pilot log as well as to stdout [with the pUtil.tolog() function]
pUtil.setPilotlogFilename("%s/pilotlog.txt" % (self.__env['thisSite'].workdir))
# redirect stderr
pUtil.setPilotstderrFilename("%s/pilot.stderr" % (self.__env['thisSite'].workdir))
sys.stderr = open(pUtil.getPilotstderrFilename(), 'w')
def __verify_permissions(self):
# verify permissions
cmd = "stat %s" % (self.__env['thisSite'].workdir)
pUtil.tolog("(1b) Executing command: %s" % (cmd))
rc, rs = commands.getstatusoutput(cmd)
pUtil.tolog("\n%s" % (rs))
def __getsetWNMem(self):
""" Get the memory limit from queuedata or from the -k pilot option and set it """
wn_mem = 0
# Get the memory limit primarily from queuedata
# Note: memory will soon be changed to maxmemory
_maxmemory = pUtil.readpar('maxmemory')
if _maxmemory == "":
_maxmemory = pUtil.readpar('memory')
if _maxmemory != "":
try:
maxmemory = int(_maxmemory) # Should already be an int
except Exception, e:
pUtil.tolog("Could not convert maxmemory to an int: %s" % (e))
maxmemory = -1
else:
pUtil.tolog("Got max memory limit: %d MB (from queuedata)" % (maxmemory))
else:
maxmemory = -1
# Get the max memory limit from the -k pilot option if specified
if maxmemory == -1 and self.__env['memory']:
try:
maxmemory = int(self.__env['memory'])
except Exception, e:
pUtil.tolog("Could not convert memory to an int: %s" % (e))
maxmemory = -1
else:
pUtil.tolog("Got max memory limit: %d MB (from pilot option -k)" % (maxmemory))
# Set the memory limit
if maxmemory > 0:
# Convert MB to Bytes for the setrlimit function
_maxmemory = maxmemory*1024**2
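# e.g. maxmemory = 16000 MB -> _maxmemory = 16,777,216,000 B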
max_memory = getMaxMemoryUsageFromCGroups()
if max_memory:
pUtil.tolog("cgroups max_memory = %s" % (max_memory))
else:
pUtil.tolog("cgroups max_memory not defined")
# Only proceed if not a CGROUPS site
if not isCGROUPSSite():
pUtil.tolog("Not a CGROUPS site, proceeding with setting the memory limit")
try:
import resource
resource.setrlimit(resource.RLIMIT_AS, [_maxmemory, _maxmemory])
except Exception, e:
pUtil.tolog("!!WARNING!!3333!! resource.setrlimit failed: %s" % (e))
else:
pUtil.tolog("Max memory limit set to: %d B" % (_maxmemory))
else:
pUtil.tolog("Detected a CGROUPS site, will not set the memory limit")
cmd = "ulimit -a"
pUtil.tolog("Executing command: %s" % (cmd))
out = commands.getoutput(cmd)
pUtil.tolog("\n%s" % (out))
else:
pUtil.tolog("Max memory will not be set")
return maxmemory
def __checkLocalDiskSpace(self, disk):
""" Do we have enough local disk space left to run the job? """
ec = 0
# convert local space to B and compare with the space limit
spaceleft = int(disk)*1024**2 # B (node.disk is in MB)
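# e.g. disk = 5000 MB -> spaceleft = 5,242,880,000 B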
_localspacelimit = self.__env['localspacelimit0'] * 1024 # B
pUtil.tolog("Local space limit: %d B" % (_localspacelimit))
if spaceleft < _localspacelimit:
pUtil.tolog("!!FAILED!!1999!! Too little space left on local disk to run job: %d B (need > %d B)" % (spaceleft, _localspacelimit))
ec = self.__error.ERR_NOLOCALSPACE
else:
pUtil.tolog("Remaining local disk space: %d B" % (spaceleft))
return ec
def __getLoopingLimit(self, maxCpuCount, jobPars, sitename):
""" Get the looping time limit for the current job (in seconds) """
# start with the default looping time limit, use maxCpuCount if necessary
if "ANALY_" in sitename:
loopingLimit = self.__env['loopingLimitDefaultUser']
else:
loopingLimit = self.__env['loopingLimitDefaultProd']
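# a maxCpuCount at or above loopingLimitMinDefault overrides the default; otherwise the larger of the default and maxCpuCount is used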
if maxCpuCount >= self.__env['loopingLimitMinDefault']:
_loopingLimit = max(self.__env['loopingLimitMinDefault'], maxCpuCount)
else:
_loopingLimit = max(loopingLimit, maxCpuCount)
if _loopingLimit != loopingLimit:
pUtil.tolog("Task request: Updated looping job limit from %d s to %d s using maxCpuCount" % \
(loopingLimit, _loopingLimit))
loopingLimit = _loopingLimit
else:
# increase the looping job limit for multi-trf jobs
if jobPars.find("\n") >= 0:
pUtil.tolog("Multi-trf job encountered: will double the looping job limit")
loopingLimit = 2*loopingLimit
pUtil.tolog("Using looping job limit: %d s" % (loopingLimit))
pUtil.tolog("maxCpuCount: %d s" % (maxCpuCount))
return loopingLimit
def __storePilotInitdir(self, targetdir, pilot_initdir):
""" Store the pilot launch directory in a file used by environment.py """
# This function is used to store the location of the init directory in the init directory itself as well as in the
# site work directory. The location file is used by environment.py to set the global env['pilot_initdir'] used
# by the pilot and the Monitor
# This function must be called before the global env variable is instantiated in the pilot
path = os.path.join(targetdir, "PILOT_INITDIR")
pUtil.tolog("Creating file %s with content %s" % (path, pilot_initdir))
pUtil.writeToFile(path, pilot_initdir)
# def __copyPilotVersion(self):
# """ Copy the PILOTVERSION file into the jobs' work directory """
def __createJobWorkdir(self, job, stderr):
""" Attempt to create the job workdir """
ec, errorText = job.mkJobWorkdir(self.__env['thisSite'].workdir)
if ec != 0:
job.setState(["failed", 0, self.__error.ERR_MKDIRWORKDIR])
ret, retNode = pUtil.updatePandaServer(job)
if ret == 0:
pUtil.tolog("Successfully updated panda server at %s" % pUtil.timeStamp())
else:
pUtil.tolog("!!WARNING!!1999!! updatePandaServer returned a %d" % (ret))
# send to stderr
print >> stderr, errorText
# remove the site workdir before exiting
pUtil.writeToFile(os.path.join(self.__env['thisSite'].workdir, "EXITCODE"), str(self.__error.ERR_GENERALERROR))
else:
pUtil.tolog("Created job workdir at %s" % (job.workdir))
# copy the job def file into job workdir
copy("%s/Job_%s.py" % (os.getcwd(), job.jobId), "%s/newJobDef.py" % job.workdir)
# also copy the time stamp file if it exists
_path = os.path.join(self.__env['pilot_initdir'], 'START_TIME_%s' % (job.jobId))
if os.path.exists(_path):
copy(_path, job.workdir)
pUtil.tolog("File %s copied to workdir" % (_path))
self.__storePilotInitdir(self.__env['job'].workdir, self.__env['pilot_initdir'])
return ec, job
def __throttleJob(self):
""" short delay requested by the server """
# dictionary for throttling job startup (setting running state)
throttleDic = {'CA':6, 'CERN':1.33, 'DE':6, 'ES':6, 'FR':6, 'IT':6, 'ND':0, 'NDGF':0, 'NL':6, 'TW':6, 'UK':6, 'US':6}
if self.__env['nSent'] > 0:
try:
_M = throttleDic[pUtil.readpar('cloud')]
except Exception, e:
pUtil.tolog("Warning: %s (using default value 6 as multiplier for throttling)" % str(e))
_M = 6
_t = (self.__env['nSent'] + 1)*_M
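# e.g. nSent = 3 with multiplier _M = 6 gives _t = 24 s (capped at max_sleep below)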
# protect for potential unreasonably high sleeping times
max_sleep = 60
if _t > max_sleep:
_t = max_sleep
pUtil.tolog("!!WARNING!!1111!! Throttle time out of bounds, reset to %d s (nSent = %d)" % (max_sleep, self.__env['nSent']))
pUtil.tolog("Throttle sleep: %d s" % (_t))
time.sleep(_t)
def __backupJobDef(self):
""" Backup job definition """
# note: the log messages here only appears in pilotlog.txt and not in the batch log since they are
# written by the forked child process
if os.path.exists(self.__env['pandaJobDataFileName']):
pUtil.tolog("Copying job definition (%s) to %s" % (self.__env['pandaJobDataFileName'], self.__env['jobDic']["prod"][1].workdir))
try:
copy2(self.__env['pandaJobDataFileName'], self.__env['jobDic']["prod"][1].workdir)
except Exception, e:
pUtil.tolog("!!WARNING!!1999!! Could not backup job definition: %s" % str(e))
else:
pandaJobDataFileName_i = self.__env['pandaJobDataFileName'].replace(".out", "_%d.out" % (self.__env['number_of_jobs']))
_path = os.path.join(self.__env['pilot_initdir'], pandaJobDataFileName_i)
pUtil.tolog("Copying job definition (%s) to %s" % (self.__env['pandaJobDataFileName'], _path))
try:
copy2(self.__env['pandaJobDataFileName'], _path)
except Exception, e:
pUtil.tolog("!!WARNING!!1999!! Could not backup job definition: %s" % str(e))
else:
pUtil.tolog("!!WARNING!!1999!! Could not backup job definition since file %s does not exist" % (self.__env['pandaJobDataFileName']))
def __cleanUpEndedJobs(self):
""" clean up the ended jobs (if there are any) """
# after multitasking was removed from the pilot, there is actually only one job
first = True
for k in self.__env['jobDic'].keys():
perr = self.__env['jobDic'][k][1].result[2]
terr = self.__env['jobDic'][k][1].result[1]
tmp = self.__env['jobDic'][k][1].result[0]
if tmp == "finished" or tmp == "failed" or tmp == "holding":
pUtil.tolog("Clean up the ended job: %s" % str(self.__env['jobDic'][k]))
# do not put the getStdoutDictionary() call outside the loop since cleanUpEndedJobs() is called every minute
# only call getStdoutDictionary() once
if first:
# get the stdout tails
pUtil.tolog("Refreshing tail stdout dictinary prior to finishing the job(s)")
stdout_dictionary = pUtil.getStdoutDictionary(self.__env['jobDic'])
first = False
# refresh the stdout tail if necessary
# get the tail if possible
try:
self.__env['stdout_tail'] = stdout_dictionary[self.__env['jobDic'][k][1].jobId]
index = "path-%s" % (self.__env['jobDic'][k][1].jobId)
self.__env['stdout_path'] = stdout_dictionary[index]
except:
self.__env['stdout_tail'] = "(stdout tail not available)"
self.__env['stdout_path'] = ""
# cleanup the job workdir, save/send the job tarball to DDM, and update
# panda server with the final job state
pUtil.postJobTask(self.__env['jobDic'][k][1], self.__env['thisSite'], self.__env['workerNode'],
self.__env['experiment'], jr = False, stdout_tail = self.__env['stdout_tail'], stdout_path = self.__env['stdout_path'])
if k == "prod":
prodJobDone = True
# for NG write the error code, if any
if 'Nordugrid_pilot' in os.environ and (perr != 0 or terr != 0):
if perr != 0:
ec = perr
else:
ec = terr
pUtil.writeToFile(os.path.join(self.__env['thisSite'].workdir, "EXITCODE"), str(ec))
# move this job from env['jobDic'] to zombieJobList for later collection
self.__env['zombieJobList'].append(self.__env['jobDic'][k][0]) # only needs pid of this job for cleanup
# athena processes can loop indefinitely (e.g. pool utils), so kill all subprocesses just in case
pUtil.tolog("Killing remaining subprocesses (if any)")
if self.__env['jobDic'][k][1].result[2] == self.__error.ERR_OUTPUTFILETOOLARGE:
killOrphans()
#pUtil.tolog("Going to kill pid %d" %lineno())
killProcesses(self.__env['jobDic'][k][0], self.__env['jobDic'][k][1].pgrp)
if self.__env['jobDic'][k][1].result[2] == self.__error.ERR_OUTPUTFILETOOLARGE:
killOrphans()
# remove the process id file to prevent cleanup from trying to kill the remaining processes another time
# (should only be necessary for jobs killed by the batch system)
if os.path.exists(os.path.join(self.__env['thisSite'].workdir, "PROCESSID")):
try:
os.remove(os.path.join(self.__env['thisSite'].workdir, "PROCESSID"))
except Exception, e:
pUtil.tolog("!!WARNING!!2999!! Could not remove process id file: %s" % str(e))
else:
pUtil.tolog("Process id file removed")
# ready with this object, delete it
del self.__env['jobDic'][k]
def monitor_job(self):
""" Main monitoring loop launched from the pilot module """
try:
# multi-job variables
maxFailedMultiJobs = 3
multiJobTimeDelays = range(2, maxFailedMultiJobs+2) # [2,3,4]
shuffle(multiJobTimeDelays)
number_of_failed_jobs = 0
self.__set_outputs()
self.__verify_permissions()
# PN
#pUtil.tolog("Faking bad TCP server")
#pUtil.tolog("!!WARNING!!1234!! Failed to open TCP connection to localhost (worker node network problem), cannot continue")
#pUtil.fastCleanup(self.__env['thisSite'].workdir, self.__env['pilot_initdir'], self.__env['rmwkdir'])
#self.__env['return'] = self.__error.ERR_NOTCPCONNECTION
#return
# start the monitor and watchdog process
monthread = PilotTCPServer(UpdateHandler)
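# the TCP server listens on localhost; the job subprocess uses it to send state updates back to the pilot (handled by UpdateHandler)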
if not monthread.port:
pUtil.tolog("!!WARNING!!1234!! Failed to open TCP connection to localhost (worker node network problem), cannot continue")
pUtil.fastCleanup(self.__env['thisSite'].workdir, self.__env['pilot_initdir'], self.__env['rmwkdir'])
self.__env['return'] = self.__error.ERR_NOTCPCONNECTION
return
else:
pUtil.tolog("Pilot TCP server will use port: %d" % (monthread.port))
monthread.start()
# dump some pilot info, version id, etc (to the log file this time)
pUtil.tolog("\n\nEntered multi-job loop. Current work dir: %s\n" % (os.getcwd()))
pUtil.dumpPilotInfo(self.__env['version'], self.__env['pilot_version_tag'], self.__env['pilotId'],
self.__env['jobSchedulerId'], self.__env['pilot_initdir'], tofile = True)
if self.__env['timefloor'] != 0:
pUtil.tolog("Entering main pilot loop: multi job enabled (number of processed jobs: %d)" % (self.__env['number_of_jobs']))
self.__env['hasMultiJob'] = True
else:
pUtil.tolog("Entering main pilot loop: multi job disabled")
# do not reset hasMultiJob
# local checks begin here..................................................................................
# collect WN info again to avoid getting wrong disk info from gram dir which might differ from the payload workdir
pUtil.tolog("Collecting WN info from: %s (again)" % (self.__env['thisSite'].workdir))
self.__env['workerNode'].collectWNInfo(self.__env['thisSite'].workdir)
# overwrite mem since this should come from either pilot argument or queuedata
self.__env['workerNode'].mem = self.__getsetWNMem()
# update the globals used in the exception handler
globalSite = self.__env['thisSite']
globalWorkNode = self.__env['workerNode']
# get the experiment object
thisExperiment = pUtil.getExperiment(self.__env['experiment'])
# do we have a valid proxy?
if self.__env['proxycheckFlag']:
ec, pilotErrorDiag = thisExperiment.verifyProxy(envsetup="")
if ec != 0:
self.__env['return'] = ec
return
# do we have enough local disk space to run the job?
ec = self.__checkLocalDiskSpace(self.__env['workerNode'].disk)
if ec != 0:
pUtil.tolog("Pilot was executed on host: %s" % (self.__env['workerNode'].nodename))
pUtil.fastCleanup(self.__env['thisSite'].workdir, self.__env['pilot_initdir'], self.__env['rmwkdir'])
self.__env['return'] = ec
# store the error info
updatePilotErrorReport(ec, "Too little space left on local disk to run job", "1", self.__env['job'].jobId, self.__env['pilot_initdir'])
return
# make sure the pilot TCP server is still running
pUtil.tolog("Verifying that pilot TCP server is still alive...")
if pUtil.isPilotTCPServerAlive('localhost', monthread.port):
pUtil.tolog("...Pilot TCP server is still running")
else: