1 |
nsmirnov |
1.1 |
from Actor import *
|
2 |
nsmirnov |
1.6 |
from crab_util import *
|
3 |
nsmirnov |
1.2 |
import common
|
4 |
corvo |
1.9 |
from ApmonIf import ApmonIf
|
5 |
slacapra |
1.60 |
#from random import random
|
6 |
corvo |
1.30 |
import time
|
7 |
ewv |
1.128 |
import sha
|
8 |
ewv |
1.139 |
import socket
|
9 |
ewv |
1.140 |
import Scram
|
10 |
slacapra |
1.60 |
from ProgressBar import ProgressBar
|
11 |
|
|
from TerminalController import TerminalController
|
12 |
nsmirnov |
1.1 |
|
13 |
|
|
class Submitter(Actor):
|
14 |
slacapra |
1.84 |
def __init__(self, cfg_params, parsed_range, val):
    """
    Store the configuration and decode the user's submission request.

    cfg_params   -- configuration mapping; 'CMSSW.datasetpath' is read
                    unconditionally, 'GRID.se_white_list' and
                    'GRID.se_black_list' are optional (default: []).
    parsed_range -- job list already parsed by the caller; it is used
                    whenever val selects explicit jobs.
    val          -- submission option:
                      empty / None or 'all' : no limit, all jobs considered
                      'range'               : use parsed_range (Resubmitter)
                      positive integer N    : submit at most N jobs
                      tuple or negative int : use parsed_range and limit
                                              the submission to its length

    Raises CrabException when val is none of the above.
    """
    self.cfg_params = cfg_params

    # get user request
    self.nsjobs = -1             # -1: no limit on the number of jobs
    self.chosenJobsList = None   # None: BuildJobList looks at every job
    if val:
        if val == 'range':  # for Resubmitter
            self.chosenJobsList = parsed_range
        elif val == 'all':
            pass
        else:
            # NOTE(review): eval() executes whatever expression the
            # submission option contains. Kept for compatibility with
            # the accepted syntax; consider a literal-only parser if
            # val can ever come from an untrusted source.
            request = eval(val)
            if type(request) is int and request > 0:
                # positive number
                self.nsjobs = request
            elif type(request) is tuple or (type(request) is int and request < 0):
                self.chosenJobsList = parsed_range
                # The original read the undefined local 'chosenJobsList'
                # here, which raised NameError on this branch.
                self.nsjobs = len(self.chosenJobsList)
            else:
                msg = 'Bad submission option <'+str(val)+'>\n'
                msg += ' Must be an integer or "all"'
                msg += ' Generic range is not allowed"'
                raise CrabException(msg)

    self.seWhiteList = cfg_params.get('GRID.se_white_list', [])
    self.seBlackList = cfg_params.get('GRID.se_black_list', [])
    # A dataset path spelled 'none' (any case) is stored as None.
    self.datasetPath = self.cfg_params['CMSSW.datasetpath']
    if self.datasetPath.lower() == 'none':
        self.datasetPath = None
    self.scram = Scram.Scram(cfg_params)
    return
|
45 |
|
|
|
46 |
|
|
def BuildJobList(self):
    """
    Select the jobs to submit and store their ids in self.nj_list.

    Candidates are self.chosenJobsList when it is set, otherwise the
    full list returned by common._db.nJobs('list'). A job is selected
    when its destination list survives the SE black/white list cleaning
    (or no dataset path is set) and its running state is 'Created'.
    Jobs whose destination list is emptied by the cleaning are reported
    as skipped. The scan stops once self.nsjobs jobs were selected
    (only when self.nsjobs is positive).
    """
    from WMCore.SiteScreening.BlackWhiteListParser import SEBlackWhiteListParser

    self.blackWhiteListParser = SEBlackWhiteListParser(self.seWhiteList, self.seBlackList, common.logger())
    common.logger.debug('nsjobs '+str(self.nsjobs))

    # full list of jobs known to the task
    self.complete_List = common._db.nJobs('list')
    common.logger.debug('Total jobs '+str(len(self.complete_List)))

    if self.chosenJobsList is not None:
        candidates = self.chosenJobsList
    else:
        candidates = self.complete_List

    selected = []   # ids of the jobs that will be submitted
    skipped = []    # ids of the jobs left without any usable destination
    for job in common._db.getTask(candidates).jobs:
        destinations = self.blackWhiteListParser.cleanForBlackWhiteList(job['dlsDestination'])
        if (destinations != '') or (self.datasetPath is None):
            if job.runningJob['state'] not in ['Created']:
                # not in a submittable state: nothing to record
                continue
            selected.append(job['id'])
        else:
            skipped.append(job['id'])
        if self.nsjobs > 0 and self.nsjobs == len(selected):
            break

    if self.nsjobs > len(selected):
        common.logger.info('asking to submit '+str(self.nsjobs)+' jobs, but only '+
                           str(len(selected))+' left: submitting those')

    if len(skipped) > 0:
        # every id is followed by a comma, including the last one
        mess = "".join([str(job_id) + "," for job_id in skipped])
        common.logger.info("Jobs: " +str(mess) + "\n\tskipped because no sites are hosting this data\n")
        self.submissionError()

    common.logger.debug('nj_list '+str(selected))
    self.nj_list = selected
    return
|
89 |
ewv |
1.92 |
|
90 |
nsmirnov |
1.1 |
def run(self):
    """
    The main method of the class: build self.nj_list and submit the
    jobs it contains.

    Nothing is submitted when checkIfCreate() reports that the task has
    no job in state 'Created'. Otherwise the pre-submission report is
    sent, the jobs are matched and submitted, and a summary is logged;
    submissionError() is called when fewer jobs than requested were
    submitted or when nothing was requested at all.
    """
    common.logger.debug("Submitter::run() called")

    t_begin = time.time()

    self.BuildJobList()

    if self.checkIfCreate() != 0:
        # no job to submit: checkIfCreate() already told the user
        return

    self.SendMLpre()

    matched_blocks, task = self.performMatch()
    submitted = self.perfromSubmission(matched_blocks, task)

    t_end = time.time()
    common.logger.debug("Submission Time: "+str(t_end - t_begin))

    requested = len(self.nj_list)
    summary = 'Total of %d jobs submitted'%submitted
    if submitted == requested:
        summary += '.'
    else:
        summary += ' (from %d requested).'%(requested)
    common.logger.info(summary)

    if requested == 0 or submitted < requested:
        self.submissionError()
|
120 |
|
|
|
121 |
|
|
|
122 |
ewv |
1.128 |
def checkIfCreate(self):
    """
    Check that the task holds at least one job in state 'Created'.

    Returns 0 when such a job exists. Otherwise logs a hint for the
    user and returns 1.
    """
    task = common._db.getTask()
    created = [job for job in task.jobs if job.runningJob['state'] == 'Created']

    if len(created) != 0:
        return 0

    common.logger.info("No jobs to be submitted: first create them")
    return 1
|
135 |
ewv |
1.92 |
|
136 |
gutsche |
1.70 |
|
137 |
ewv |
1.128 |
def performMatch(self):
    """
    Group the selected jobs by destination and look for matching sites.

    self.sub_jobs is rebuilt as a list of job-id lists, one per distinct
    'dlsDestination' found among self.nj_list. The scheduler is then
    asked, once per group, whether that destination has compatible
    resources; groups without a match are reported through
    submissionError().

    Returns (matched, task):
      matched -- indices into self.sub_jobs of the groups with at least
                 one compatible site
      task    -- the object returned by common._db.getTask()
    """
    common.logger.info("Checking available resources...")

    ### list of distinct destination site lists among the selected jobs
    distinct_dests = common._db.queryDistJob_Attr('dlsDestination', 'jobId', self.nj_list)

    ### list of job ids for each distinct list of sites
    self.sub_jobs = []   # list of job-id lists to submit
    block_dests = []     # destination of self.sub_jobs[k], same index k
    for distDest in distinct_dests:
        jobs_at_dest = common._db.queryAttrJob({'dlsDestination': distDest}, 'jobId')
        block = [job_id for job_id in self.nj_list if job_id in jobs_at_dest]
        if len(block) > 0:
            # The destination is recorded together with its block. The
            # original indexed self.sub_jobs and distinct_dests with
            # separate counters, which drift apart (IndexError or wrong
            # destination) as soon as one destination yields no job.
            self.sub_jobs.append(block)
            block_dests.append(distDest)

    matched = []

    task = common._db.getTask()
    for sel, dest in enumerate(block_dests):
        # the first job of the block stands for the whole block in the log
        id_job = self.sub_jobs[sel][0]
        match = common.scheduler.listMatch(dest, False)
        if len(match) > 0:
            common.logger.info("Found compatible site(s) for job "+str(id_job))
            matched.append(sel)
        else:
            common.logger.info("No compatible site found, will not submit jobs "+str(self.sub_jobs[sel]))
            self.submissionError()

    return matched, task
|
174 |
spiga |
1.112 |
|
175 |
|
|
def perfromSubmission(self,matched,task):
    """
    Submit every matched block of jobs and record the submitted ones.

    matched -- indices into self.sub_jobs of the blocks to submit
    task    -- task object handed over to common.scheduler.submit()

    For each block, the jobs that have a non-empty 'schedulerId' after
    the submission get their run status set to 'S', are passed to
    stateChange(..., "SubSuccess") and counted. A post-submission
    report is sent for the whole block.

    Returns the number of jobs counted as submitted.
    Raises CrabException("Job not submitted") when the scheduler raises
    a CrabException.
    """
    ### Progress Bar indicator, deactivate for debug
    if common.debugLevel == 0:
        term = TerminalController()

    if len(matched) == 0:
        common.logger.info("The whole task doesn't found compatible site ")
        return 0

    submitted = 0
    common.logger.info(str(len(matched))+" blocks of jobs will be submitted")
    for ii in matched:
        block = self.sub_jobs[ii]
        common.logger.debug('Submitting jobs '+str(block))

        try:
            common.scheduler.submit(block, task)
        except CrabException:
            raise CrabException("Job not submitted")

        if common.debugLevel == 0:
            # the progress bar is best effort: any failure just disables it
            try:
                pbar = ProgressBar(term, 'Submitting '+str(len(block))+' jobs')
            except:
                pbar = None
            if pbar:
                pbar.update(float(ii+1)/float(len(self.sub_jobs)),'please wait')

        ### check if the submission succeeded; maybe not needed or at least simplified
        sched_Id = common._db.queryRunJob('schedulerId', block)
        run_jobToSave = {'status' :'S'}
        listId = []
        listRunField = []
        for pos, job_id in enumerate(block):
            if str(sched_Id[pos]) == '':
                # no scheduler id: the job is not counted as submitted
                continue
            listId.append(job_id)
            listRunField.append(run_jobToSave)
            common.logger.debug("Submitted job # "+ str(job_id))
            submitted += 1

        common._db.updateRunJob_(listId, listRunField)
        self.stateChange(listId,"SubSuccess")
        self.SendMLpost(block)

    return submitted
|
218 |
spiga |
1.99 |
|
219 |
|
|
def submissionError(self):
    """
    Log the site requirements used for the submission.

    Called when a submission is incomplete: lists the SE/CE white and
    black lists found in self.cfg_params, notes the default T1 blacklist
    unless 'GRID.remove_default_blacklist' is set to something other
    than 0, and adds hints about what to check at the chosen sites.
    """
    ## add some more verbose message in case submission is not complete
    msg = 'Submission performed using the Requirements: \n'
    ### TODO_ DS--BL
    #msg += common.taskDB.dict("jobtype")+' version: '+common.taskDB.dict("codeVersion")+'\n'
    #msg += '(Hint: please check if '+common.taskDB.dict("jobtype")+' is available at the Sites)\n'
    site_lists = [
        ('GRID.se_white_list', '\tSE White List: '),
        ('GRID.se_black_list', '\tSE Black List: '),
        ('GRID.ce_white_list', '\tCE White List: '),
        ('GRID.ce_black_list', '\tCE Black List: '),
    ]
    for key, label in site_lists:
        if key in self.cfg_params:
            msg += label+self.cfg_params[key]+'\n'

    # The default is the integer 0 while a configured value is compared
    # as the string '0'. Comparing through str() makes the note appear
    # in the default (option not set) case too, which the original
    # int-vs-string comparison never did.
    removeDefBL = self.cfg_params.get('GRID.remove_default_blacklist', 0)
    if str(removeDefBL) == '0':
        msg += '\tNote: All CMS T1s are BlackListed by default \n'
    msg += '\t(Hint: By whitelisting you force the job to run at this particular site(s).\n'
    msg += '\tPlease check if :\n'
    msg += '\t\t -- the dataset is available at this site!\n'
    msg += '\t\t -- the CMSSW version is available at this site!)\n'
    common.logger.info(msg)

    return
|
243 |
spiga |
1.99 |
|
244 |
spiga |
1.112 |
def collect_MLInfo(self):
    """
    Prepare DashBoard information

    Also sets self.executable from 'CMSSW.executable' (default 'cmsRun').
    Returns a new dictionary with the task-level monitoring fields.
    """
    task_name = common._db.queryTask('name')
    grid_user = common.scheduler.userName().strip()
    common.logger.debug("GRIDNAME: %s "%grid_user)

    self.executable = self.cfg_params.get('CMSSW.executable','cmsRun')
    virtual_org = self.cfg_params.get('GRID.virtual_organization','cms')

    # filled key by key, in the same order as the values are gathered
    params = {}
    params['tool'] = common.prog_name
    params['SubmissionType'] = 'direct'
    params['JSToolVersion'] = common.prog_version_str
    params['tool_ui'] = os.environ.get('HOSTNAME','')
    params['scheduler'] = common.scheduler.name()
    params['GridName'] = grid_user
    params['ApplicationVersion'] = self.scram.getSWVersion()
    params['taskType'] = 'analysis'
    params['vo'] = virtual_org
    params['CMSUser'] = getUserName()
    params['user'] = getUserName()
    params['taskId'] = str(task_name)
    params['datasetFull'] = self.datasetPath
    params['exe'] = self.executable

    return params
|
273 |
ewv |
1.128 |
|
274 |
spiga |
1.112 |
def SendMLpre(self):
    """
    Send Pre info to ML

    The task-level report from collect_MLInfo() is tagged with the
    job id 'TaskMeta', sent through common.apmon and logged at debug
    level.
    """
    report = self.collect_MLInfo()
    report.update({'jobId': 'TaskMeta'})

    common.apmon.sendToML(report)

    common.logger.debug('Submission DashBoard Pre-Submission report: %s'%str(report))
|
287 |
ewv |
1.92 |
|
288 |
spiga |
1.112 |
def SendMLpost(self,allList):
    """
    Send post-submission info to ML

    allList -- ids of the jobs to report; they are loaded with
               common._db.getTask(allList).

    One report per job is sent through common.apmon. It is the
    task-level report from collect_MLInfo() completed with job fields
    whose format depends on the scheduler name. All messages are
    logged together at the end.
    """
    task = common._db.getTask(allList)

    # one dictionary reused for every job: the job fields are overwritten
    params = {}
    params.update(self.collect_MLInfo())

    msg = ''
    Sub_Type = 'Direct'
    for job in task.jobs:
        jj = job['jobId']
        jid = str(job.runningJob['schedulerId'])
        localId = ''
        sched_name = common.scheduler.name()
        sched_kind = sched_name.upper()

        if sched_kind in ['CONDOR_G','GLIDEIN']:
            rb = 'OSG'
            taskHash = sha.new(common._db.queryTask('name')).hexdigest()
            jobId = '_https://'.join([str(jj), sched_name]) + '/' + taskHash + '/' + str(jj)
            msg += ('JobID for ML monitoring is created for CONDOR_G scheduler: %s \n'%str(jobId))
        elif sched_kind in ['LSF', 'CAF']:
            task_tag = str(task['name']).replace("_","-")
            jobId = str(jj) + "_https://" + sched_name + ":/" + jid + "-" + task_tag
            msg += ('JobID for ML monitoring is created for LSF scheduler: %s\n'%str(jobId))
            rb = sched_name
            localId = jid
        elif sched_kind in ['CONDOR']:
            taskHash = sha.new(common._db.queryTask('name')).hexdigest()
            jobId = '_https://'.join([str(jj), socket.gethostname()]) + '/' + taskHash + '/' + str(jj)
            msg += ('JobID for ML monitoring is created for CONDOR scheduler: %s\n'%str(jobId))
            rb = sched_name
        elif sched_kind in ['ARC']:
            taskHash = sha.new(common._db.queryTask('name')).hexdigest()
            jobId = '_https://'.join([str(jj), socket.gethostname()]) + '/' + taskHash + '/' + str(jj)
            msg += ('JobID for ML monitoring is created for ARC scheduler: %s\n'%str(jobId))
            rb = 'ARC'
        else:
            jobId = str(jj) + '_' + str(jid)
            msg += ('JobID for ML monitoring is created for gLite scheduler %s\n'%str(jobId))
            rb = str(job.runningJob['service'])

        # one or two destinations are listed, more are only counted
        dlsDest = job['dlsDestination']
        n_dest = len(dlsDest)
        if n_dest in (1, 2):
            T_SE = ','.join([str(dlsDest[k]) for k in range(n_dest)])
        else:
            T_SE = str(n_dest)+'_Selected_SE'

        params.update({
            'jobId': jobId,
            'sid': jid,
            'broker': rb,
            'bossId': jj,
            'SubmissionType': Sub_Type,
            'TargetSE': T_SE,
            'localId' : localId,
        })

        msg +=('Submission DashBoard report: %s\n'%str(params))
        common.apmon.sendToML(params)

    # numeric level 9; NOTE(review): presumably meant as "one below
    # DEBUG" in stdlib logging terms - confirm against common.logger
    common.logger.log(10-1,msg)
    return
|
354 |
spiga |
1.112 |
|
355 |
|
|
|