1 |
spiga |
1.1 |
from Actor import *
|
2 |
|
|
from crab_util import *
|
3 |
|
|
import common
|
4 |
|
|
from ApmonIf import ApmonIf
|
5 |
|
|
import Statistic
|
6 |
|
|
import time
|
7 |
|
|
from ProgressBar import ProgressBar
|
8 |
|
|
from TerminalController import TerminalController
|
9 |
|
|
|
10 |
|
|
import xml.dom.minidom
|
11 |
|
|
import xml.dom.ext
|
12 |
|
|
import TaskDB
|
13 |
|
|
|
14 |
|
|
class StatusServer(Actor):
    """
    Actor that retrieves the XML status report of a task from the CRAB
    server, mirrors the reported job states into the local job DB and
    prints a per-job table followed by a summary.

    The per-status counters (count*) are filled by run() and printed by
    PrintReport_().
    """

    # Server-side status label -> one-letter status stored in the local job DB.
    _STATE_CONVERTING = {
        'Running': 'R', 'Aborted': 'A', 'Done': 'D', 'Done (Failed)': 'D',
        'Cleared': 'D', 'Cancelled': 'K', 'Killed': 'K', 'NotSubmitted': 'C',
        'Retrieving by the server': 'R',
    }

    # Displayed status label -> name of the counter attribute it increments.
    # NOTE(review): 'NotSubmitted' is tallied under countSubmitting here while
    # the task-wide "all" entry uses countNotSubmit - kept as found, confirm.
    _STATUS_COUNTERS = {
        'Running': 'countRun',
        'Aborted': 'countAbort',
        'Done': 'countDone',
        'Done (Failed)': 'countDone',
        'Cancelled': 'countCancel',
        'Submitted': 'countSubmit',
        'Submitting': 'countSubmitting',
        'Ready': 'countReady',
        'Scheduled': 'countSched',
        'Cleared': 'countCleared',
        'NotSubmitted': 'countSubmitting',
        'Waiting': 'countWait',
        'Retrieving by the server': 'countRet',
        'Killed': 'countKilled',
    }

    # Status of a task-wide ("all") report entry -> (counter attribute, job DB status).
    _TASK_WIDE = {
        'Submitted': ('countSubmitting', 'S'),
        'Killed': ('countKilled', 'K'),
        'NotSubmitted': ('countNotSubmit', 'C'),
    }

    # Statuses for which the exit-code columns of the table are filled in.
    _FINAL_STATES = ('Done', 'Cleared', 'Aborted', 'Done (Failed)')

    # Layout shared by the table header and every job row.
    _ROW_FORMAT = "%-10s %-23s %-20s %-20s %-18s %-20s"

    def __init__(self, cfg_params):
        """
        Store the configuration, reset every status counter and read the
        server name.

        cfg_params -- mapping of crab cfg parameters; must hold the key
                      'CRAB.server_name'.

        Raises CrabException when 'CRAB.server_name' is missing.
        """
        self.cfg_params = cfg_params

        self.countNotSubmit = 0
        self.countSubmit = 0
        self.countSubmitting = 0
        self.countWait = 0
        self.countDone = 0
        self.countReady = 0
        self.countSched = 0
        self.countRun = 0
        self.countAbort = 0
        self.countCancel = 0
        self.countRet = 0
        self.countKilled = 0
        self.countCleared = 0
        self.countToTjob = 0

        try:
            self.server_name = self.cfg_params['CRAB.server_name'] # gsiftp://pcpg01.cern.ch/data/SEDir/
        except KeyError:
            msg = 'No server selected ...'
            msg = msg + 'Please specify a server in the crab cfg file'
            raise CrabException(msg)

    def translateStatus(self, status):
        """
        Return the one-letter job DB status for a server status label, or
        None when the label has no job DB counterpart (e.g. 'Submitted').
        """
        return self._STATE_CONVERTING.get(status)

    def run(self):
        """
        The main method of the class: check the status of the task.

        Returns early (after two log messages) when no job in the local job
        DB is in state 'S'. Otherwise fetches the server report, updates the
        job DB from it, saves the job DB and prints the summary.

        Raises CrabException when the report cannot be fetched or parsed.
        """
        common.logger.debug(5, "status server::run() called")

        submitted = False
        for nj in range(common.jobDB.nJobs()):
            if common.jobDB.status(nj) == 'S':
                submitted = True
                break

        if not submitted:
            common.logger.message("Not Submitted jobs!")
            common.logger.message("Before checking the status submit your jobs with the command: crab -submit all -c\n")
            return

        common.scheduler.checkProxy()

        doc = self._fetchReport()

        ### <Job status='Submitted' job_exit='NULL' id='1' exe_exit='NULL'/>
        root = doc.childNodes[0]
        self.countToTjob = int(root.childNodes[1].getAttribute("totJob"))

        common.jobDB.load()

        firstEntry = root.childNodes[3]
        if firstEntry.getAttribute("id") == "all":
            self._applyTaskWideStatus(firstEntry.getAttribute("status"))
        else:
            print(self._ROW_FORMAT % ('JOBID','STATUS','SITE','JOB_EXIT_STATUS','EXE_EXIT_CODE','RESUBMIT'))
            print('---------------------------------------------------------------------------------------------------------')

            for job in range(self.countToTjob):
                # Job entries are read from every second child starting at
                # index 3 (3, 5, 7, ...); presumably the children in between
                # are whitespace text nodes.
                self._reportJob(root.childNodes[3 + 2 * job])

        # NOTE(review): placed after both branches because both modify the
        # job DB; the nesting was reconstructed from a listing that had lost
        # its indentation.
        common.jobDB.save()

        self.PrintReport_()

    def _fetchReport(self):
        """
        Copy xmlReportFile.xml of this task from the server into the local
        res directory and return it as a parsed minidom document.

        Raises CrabException when the copy step raises or when the local
        file cannot be read or parsed.
        """
        notReady = "The server is managing your task."
        notReady += "\n A detailed report will be ready soon.\n"

        common.taskDB.load()
        WorkDirName = os.path.basename(os.path.split(common.work_space.topDir())[0])
        projectUniqName = 'crab_'+str(WorkDirName)+'_'+common.taskDB.dict('TasKUUID')
        reportFile = common.work_space.resDir()+'xmlReportFile.xml'

        try:
            common.logger.message("Checking the status...\n")
            cmd = 'lcg-cp --vo cms gsiftp://' + str(self.server_name) + str(projectUniqName)+'/res/xmlReportFile.xml file://'+reportFile
            common.logger.debug(6, cmd)
            # NOTE(review): os.system() reports failure through its return
            # value, which is ignored here; a failed copy is only noticed
            # below if no local report file exists at all, so a report left
            # over from an earlier run would be shown instead.
            os.system(cmd +' >& /dev/null')
        except Exception:
            raise CrabException(notReady)

        try:
            # parse() opens the path itself, so a missing or unreadable file
            # surfaces here as well; no separate open() is needed.
            return xml.dom.minidom.parse(reportFile)
        except Exception:
            raise CrabException(notReady)

    def _applyTaskWideStatus(self, status):
        """
        Handle a report whose first entry has id "all": give every job in
        the local job DB the state matching `status` and set the matching
        counter to the number of jobs. Unknown statuses change no job.
        """
        nJobs = common.jobDB.nJobs()

        if status in self._TASK_WIDE:
            counter, dbStatus = self._TASK_WIDE[status]
            setattr(self, counter, nJobs)
            for nj in range(nJobs):
                common.jobDB.setStatus(nj, dbStatus)

        # NOTE(review): applied to the whole "all" branch so that the total
        # is expressed in the same unit as the counters set above; the
        # original nesting of this assignment could not be recovered.
        self.countToTjob = nJobs

    def _reportJob(self, node):
        """
        Process one job entry of the report: update the job DB, print the
        table row for the job and increment the counter of its status.

        node -- element carrying the attributes id, status, job_exit,
                exe_exit, cleared and (in newer reports) site and resubmit.
        """
        idJob = node.getAttribute("id")
        stato = node.getAttribute("status")
        job_exit = node.getAttribute("job_exit")
        exe_exit = node.getAttribute("exe_exit")
        cleared = node.getAttribute("cleared")

        # Defaults keep the row printable even if the lookup below fails.
        site = ''
        resub = ''
        try:
            site = node.getAttribute("site")
            resub = node.getAttribute("resubmit")
            if site in ("none", "NULL", "None"):
                site = ''
            if resub in ("none", "None", "0") or stato == "Killed":
                resub = ''
        except Exception:
            # sys.exc_info() is used to obtain the exception object because
            # it is spelled the same way on every Python version.
            import sys
            ex = sys.exc_info()[1]
            common.logger.message ("Problem reading report file: are you using the latest crab version?")
            common.logger.debug( 1 , str(ex) )

        # NOTE(review): the nesting of this update was reconstructed from a
        # listing without indentation - confirm against the repository copy.
        jobDbStatus = self.translateStatus(stato)
        if jobDbStatus is not None:
            common.logger.debug(5, '*** Updating jobdb for job %s ***' %idJob)
            # Report ids are one higher than the job DB index used here.
            dbIndex = str(int(idJob)-1)
            if common.jobDB.status(dbIndex) != "Y":
                # NOTE(review): int() raises ValueError when the "cleared"
                # attribute is absent (getAttribute then returns '').
                if jobDbStatus == 'D' and int(cleared) != 1:
                    ## 'Done' but not yet cleared (server side)
                    stato = 'Retrieving by the server' ## changed - is this user friendly?
                    jobDbStatus = 'R'
                common.jobDB.setStatus(dbIndex, jobDbStatus)
            else:
                # The local job DB already marks this job 'Y': show it as cleared.
                stato = "Cleared"
            # The value stored is the report's "exe_exit" attribute.
            common.jobDB.setExitStatus(dbIndex, exe_exit)

        if stato in self._FINAL_STATES:
            row = (idJob, stato, site, job_exit, exe_exit, resub)
        else:
            # Exit codes are only shown once the job has reached a final state.
            row = (idJob, stato, site, '', '', resub)
        print(self._ROW_FORMAT % row)

        counter = self._STATUS_COUNTERS.get(stato)
        if counter is not None:
            setattr(self, counter, getattr(self, counter) + 1)

    def PrintReport_(self):
        """ Report #jobs for each status """
        print('')
        print(">>>>>>>>> %i Total Jobs " % (self.countToTjob))
        print('')

        # (count, label, optional hint line) in display order.
        summary = (
            (self.countSubmitting, "Jobs Submitting by the server", None),
            (self.countNotSubmit, "Jobs Not Submitted to the grid", None),
            (self.countSubmit, "Jobs Submitted", None),
            (self.countWait, "Jobs Waiting", None),
            (self.countReady, "Jobs Ready", None),
            (self.countSched, "Jobs Scheduled", None),
            (self.countRun, "Jobs Running", None),
            (self.countRet, "Jobs Retrieving by the server", None),
            (self.countDone, "Jobs Done", " Retrieve them with: crab -getoutput -continue"),
            (self.countKilled, "Jobs Killed", None),
            (self.countAbort, "Jobs Aborted", None),
            (self.countCleared, "Jobs Cleared", None),
        )
        for count, label, hint in summary:
            if count != 0:
                print(">>>>>>>>> %i %s" % (count, label))
                if hint is not None:
                    print(hint)

        # Whatever is not covered by the counters above is reported as still
        # being handled by the server.
        # NOTE(review): countCancel is tallied in run() but neither printed
        # nor subtracted here, so cancelled jobs end up in this remainder.
        accounted = 0
        for count, label, hint in summary:
            accounted += count
        countUnderMngmt = self.countToTjob - accounted
        if (countUnderMngmt != 0):
            print(">>>>>>>>> %i Jobs Waiting or Under Server Management" % (countUnderMngmt))

        print('')
|
248 |
spiga |
1.1 |
|