ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/CRAB/python/SchedulerEdg.py
Revision: 1.29
Committed: Fri Nov 18 15:50:23 2005 UTC (19 years, 5 months ago) by fanzago
Content type: text/x-python
Branch: MAIN
Changes since 1.28: +0 -1 lines
Log Message:
removed a print

File Contents

# Content
1 from Scheduler import Scheduler
2 from crab_logger import Logger
3 from crab_exceptions import *
4 from crab_util import *
5 import common
6
7 import os, sys, time
8
class SchedulerEdg(Scheduler):
    """
    Scheduler implementation that drives the EDG/LCG command line tools
    (edg-job-submit, edg-job-list-match, edg-job-get-output, ...) and
    generates the JDL file and the job-script fragments they need.

    The leading whitespace inside the generated shell fragments is purely
    cosmetic; the shell text itself is kept as found in this revision.
    """

    def __init__(self):
        """Register the scheduler as "EDG" and define the status field names."""
        Scheduler.__init__(self,"EDG")
        # Field names, in the order used to index the list returned by the
        # LB API 'loadStatus' call (see getStatusAttribute_).
        self.states = [ "Acl", "cancelReason", "cancelling","ce_node","children", \
          "children_hist","children_num","children_states","condorId","condor_jdl", \
          "cpuTime","destination", "done_code","exit_code","expectFrom", \
          "expectUpdate","globusId","jdl","jobId","jobtype", \
          "lastUpdateTime","localId","location", "matched_jdl","network_server", \
          "owner","parent_job", "reason","resubmitted","rsl","seed",\
          "stateEnterTime","stateEnterTimes","subjob_failed", \
          "user tags" , "status" , "status_code","hierarchy"]
        # Defined here as well so that checkProxy() works even when it is
        # called before configure().
        self.proxyValid = 0
        return

    def cfgValue_(self, cfg_params, key, default=''):
        """Return cfg_params[key], or 'default' when the key is missing."""
        try:
            return cfg_params[key]
        except KeyError:
            return default

    def configure(self, cfg_params):
        """
        Read the scheduler settings from cfg_params and add the EDG python
        libraries to sys.path.

        Raises CrabException when 'USER.copy_data' is given without
        'USER.storage_element'/'USER.storage_path', when 'USER.register_data'
        is given without 'USER.lfn_dir', or when the EDG_WL_LOCATION
        environment variable is not set.
        """

        # Optional settings with their defaults
        self.edg_config = self.cfgValue_(cfg_params, "EDG.config")
        self.edg_config_vo = self.cfgValue_(cfg_params, "EDG.config_vo")
        self.LCG_version = self.cfgValue_(cfg_params, "EDG.lcg_version", '2')
        self.EDG_requirements = self.cfgValue_(cfg_params, 'EDG.requirements')
        self.EDG_retry_count = self.cfgValue_(cfg_params, 'EDG.retry_count')
        self.EDG_clock_time = self.cfgValue_(cfg_params, 'EDG.max_wall_clock_time')
        self.EDG_cpu_time = self.cfgValue_(cfg_params, 'EDG.max_cpu_time')
        self.VO = self.cfgValue_(cfg_params, 'EDG.virtual_organization', 'cms')
        self.return_data = self.cfgValue_(cfg_params, 'USER.return_data')

        # Always defined, so that later attribute reads cannot fail
        self.SE = ''
        self.SE_PATH = ''
        self.LFN = ''

        # Copying the output requires a storage element and a path on it.
        # The check is triggered by the presence of the key, not its value.
        try:
            self.copy_data = cfg_params["USER.copy_data"]
        except KeyError:
            self.copy_data = ''
        else:
            try:
                self.SE = cfg_params['USER.storage_element']
                self.SE_PATH = cfg_params['USER.storage_path']
            except KeyError:
                msg = "Error. The [USER] section does not have 'storage_element'"
                msg = msg + " and/or 'storage_path' entries, necessary to copy the output"
                common.logger.message(msg)
                raise CrabException(msg)

        # Registering the output requires the LFN directory
        try:
            self.register_data = cfg_params["USER.register_data"]
        except KeyError:
            self.register_data = ''
        else:
            try:
                self.LFN = cfg_params['USER.lfn_dir']
            except KeyError:
                msg = "Error. The [USER] section does not have 'lfn_dir' value"
                msg = msg + " it's necessary for RLS registration"
                common.logger.message(msg)
                raise CrabException(msg)

        # Add EDG_WL_LOCATION to the python path
        try:
            path = os.environ['EDG_WL_LOCATION']
        except KeyError:
            msg = "Error: the EDG_WL_LOCATION variable is not set."
            raise CrabException(msg)

        for libPath in (os.path.join(path, "lib"),
                        os.path.join(path, "lib", "python")):
            # do not pile up duplicates when configure() is called again
            if libPath not in sys.path:
                sys.path.append(libPath)

        self.proxyValid=0
        return

    def sched_parameter(self):
        """
        Write the file with scheduler-specific parameters into the share
        directory. Returns 1 when the file was written, 0 when the RB
        configuration files are not both configured.
        """

        if not (self.edg_config and (self.edg_config_vo != '')):
            return 0

        self.param='sched_param.clad'
        param_file = open(common.work_space.shareDir()+'/'+self.param, 'w')
        try:
            param_file.write('RBconfig = "'+self.edg_config+'";\n')
            param_file.write('RBconfigVO = "'+self.edg_config_vo+'";')
        finally:
            # close the file even when a write fails
            param_file.close()
        return 1

    def wsSetupEnvironment(self):
        """
        Returns part of a job script which does scheduler-specific work:
        exports SE/SE_PATH (when copying output), VO/LFN (when registering
        output) and determines the CE the job runs on.
        """

        txt = ''
        if self.copy_data:
            if self.SE:
                txt += 'export SE='+self.SE+'\n'
                txt += 'echo "SE = $SE"\n'
            if self.SE_PATH:
                # the stored path is normalised to end with '/'
                if ( self.SE_PATH[-1] != '/' ) : self.SE_PATH = self.SE_PATH + '/'
                txt += 'export SE_PATH='+self.SE_PATH+'\n'
                txt += 'echo "SE_PATH = $SE_PATH"\n'

        if self.register_data:
            if self.VO:
                txt += 'export VO='+self.VO+'\n'
            if self.LFN:
                txt += 'export LFN='+self.LFN+'\n'
            txt += '\n'

        # NOTE(review): the nesting of the following lines was not
        # recoverable from the source; they are assumed to be unconditional.
        txt += 'CloseCEs=`edg-brokerinfo getCE`\n'
        txt += 'echo "CloseCEs = $CloseCEs"\n'
        txt += 'CE=`echo $CloseCEs | sed -e "s/:.*//"`\n'
        txt += 'echo "CE = $CE"\n'
        return txt

    def wsCopyOutput(self):
        """
        Write a CopyResults part of a job script, e.g.
        to copy produced output into a storage element.
        Returns an empty string when output copying is not requested.
        """
        txt = ''
        if self.copy_data:
            copy = 'globus-url-copy file://`pwd`/$out_file gsiftp://${SE}${SE_PATH}$out_file'
            txt += '#\n'
            txt += '# Copy output to SE = $SE\n'
            txt += '#\n'
            txt += 'if [ $exe_result -eq 0 ]; then\n'
            txt += ' for out_file in $file_list ; do\n'
            txt += ' echo "Trying to copy output file to $SE "\n'
            txt += ' echo "'+copy+'"\n'
            txt += ' '+copy+' 2>&1\n'
            txt += ' copy_exit_status=$?\n'
            txt += ' echo "COPY_EXIT_STATUS = $copy_exit_status"\n'
            txt += ' echo "STAGE_OUT = $copy_exit_status"\n'
            txt += ' if [ $copy_exit_status -ne 0 ]; then \n'
            txt += ' echo "Problems with SE= $SE" \n'
            txt += ' else \n'
            txt += ' echo "output copied into $SE/$SE_PATH directory"\n'
            txt += ' fi \n'
            txt += ' done\n'
            txt += 'fi \n'
        return txt

    def wsRegisterOutput(self):
        """
        Returns the part of a job script which registers the output into
        RLS: first with the sfn protocol, then with srm; when the copy to
        the SE failed, the files are copied and registered to a close SE.
        Returns an empty string when registration is not requested.
        """

        txt = ''
        if self.register_data:
            txt += '#\n'
            txt += '# Register output to RLS\n'
            txt += '#\n'
            txt += 'if [[ $exe_result -eq 0 && $copy_exit_status -eq 0 ]]; then\n'
            txt += ' for out_file in $file_list ; do\n'
            txt += ' echo "Trying to register the output file into RLS"\n'
            txt += ' echo "lcg-rf -l $LFN/$out_file --vo $VO sfn://$SE$SE_PATH/$out_file"\n'
            txt += ' lcg-rf -l $LFN/$out_file --vo $VO sfn://$SE$SE_PATH/$out_file 2>&1 \n'
            txt += ' register_exit_status=$?\n'
            txt += ' echo "REGISTER_EXIT_STATUS = $register_exit_status"\n'
            txt += ' echo "STAGE_OUT = $register_exit_status"\n'
            txt += ' if [ $register_exit_status -ne 0 ]; then \n'
            txt += ' echo "Problems with the registration to RLS" \n'
            txt += ' echo "Try with srm protocol" \n'
            txt += ' echo "lcg-rf -l $LFN/$out_file --vo $VO srm://$SE$SE_PATH/$out_file"\n'
            txt += ' lcg-rf -l $LFN/$out_file --vo $VO srm://$SE$SE_PATH/$out_file 2>&1 \n'
            txt += ' register_exit_status=$?\n'
            txt += ' echo "REGISTER_EXIT_STATUS = $register_exit_status"\n'
            txt += ' echo "STAGE_OUT = $register_exit_status"\n'
            txt += ' if [ $register_exit_status -ne 0 ]; then \n'
            txt += ' echo "Problems with the registration into RLS" \n'
            txt += ' fi \n'
            txt += ' else \n'
            txt += ' echo "output registered to RLS"\n'
            txt += ' fi \n'
            txt += ' done\n'
            txt += 'elif [[ $exe_result -eq 0 && $copy_exit_status -ne 0 ]]; then \n'
            txt += ' echo "Trying to copy output file to CloseSE"\n'
            txt += ' CLOSE_SE=`edg-brokerinfo getCloseSEs | head -1`\n'
            txt += ' for out_file in $file_list ; do\n'
            txt += ' echo "lcg-cr -v -l lfn:${LFN}/$out_file -d $CLOSE_SE -P $LFN/$out_file --vo $VO file://`pwd`/$out_file" \n'
            txt += ' lcg-cr -v -l lfn:${LFN}/$out_file -d $CLOSE_SE -P $LFN/$out_file --vo $VO file://`pwd`/$out_file 2>&1 \n'
            txt += ' register_exit_status=$?\n'
            txt += ' echo "REGISTER_EXIT_STATUS = $register_exit_status"\n'
            txt += ' echo "STAGE_OUT = $register_exit_status"\n'
            txt += ' if [ $register_exit_status -ne 0 ]; then \n'
            txt += ' echo "Problems with CloseSE" \n'
            txt += ' else \n'
            txt += ' echo "The program was successfully executed"\n'
            txt += ' echo "SE = $CLOSE_SE"\n'
            txt += ' echo "LFN for the file is LFN=${LFN}/$out_file"\n'
            txt += ' fi \n'
            txt += ' done\n'
            txt += 'else\n'
            txt += ' echo "Problem with the executable"\n'
            txt += 'fi \n'
        return txt

    def loggingInfo(self, id):
        """
        Retrieve the logging info from logging and bookkeeping.
        Returns the open pipe of the 'edg-job-get-logging-info' command;
        reading and closing it is left to the caller.
        """
        self.checkProxy()
        cmd = 'edg-job-get-logging-info -v 2 ' + id
        return os.popen(cmd)

    def listMatch(self, nj):
        """
        Check the compatibility of available resources for job number nj.
        Returns the number of distinct matching CEs; CrabException is
        raised (by noMatchFound_) when no resource matches.
        """
        self.checkProxy()
        jdl = common.job_list[nj].jdlFilename()
        cmd = 'edg-job-list-match ' + self.configOpt_() + jdl
        myCmd = os.popen(cmd)
        try:
            cmd_out = myCmd.readlines()
        finally:
            # release the pipe even when reading fails
            myCmd.close()
        return self.parseListMatch_(cmd_out, jdl)

    def parseListMatch_(self, out, jdl):
        """
        Parse the lines printed by edg-job-list-match.

        out -- list of output lines
        jdl -- name of the JDL file, used to report the requirements when
               no match is found

        Returns the number of distinct CEs listed in the output.
        """

        reComment = re.compile( r'^\**$' )
        reEmptyLine = re.compile( r'^$' )
        reVO = re.compile( r'Selected Virtual Organisation name.*' )
        reCE = re.compile( r'CEId' )
        reNO = re.compile( r'No Computing Element matching' )
        reRB = re.compile( r'Connecting to host' )
        inCEList = 0   # set after the 'CEId' header, cleared by a '****' line
        CEs=[]
        Match=0

        for line in out:
            line = line.strip()
            # NB: this pattern also matches an empty line
            if reComment.match( line ):
                inCEList = 0
                continue
            if reEmptyLine.match(line):
                continue
            if reVO.match( line ):
                VO =line.split()[-1]
                common.logger.debug(5, 'VO :'+VO)
            if reRB.match( line ):
                RB =line.split()[3]
                common.logger.debug(5, 'Using RB :'+RB)
            if reCE.search( line ):
                inCEList = 1
                continue
            if inCEList:
                # keep the host name only and count each CE once
                CE=line.split(':')[0]
                if CE not in CEs:
                    CEs.append(CE)
                    Match=Match+1
                    common.logger.debug(5, 'Matched CE :'+CE)
            if reNO.match( line ):
                common.logger.debug(5,line)
                # noMatchFound_ raises CrabException
                self.noMatchFound_(jdl)
                Match=0
        return Match

    def noMatchFound_(self, jdl):
        """
        Report the requirements found in the JDL file and raise
        CrabException("No compatible resources found!").
        """
        reReq = re.compile( r'Requirements' )
        reString = re.compile( r'"\S*"' )
        f = open(jdl,'r')
        try:
            lines = f.readlines()
        finally:
            # the file used to be left open
            f.close()

        for line in lines:
            line= line.strip()
            if not reReq.match(line):
                continue
            # classify every quoted string of the Requirements line
            for req in reString.findall(line):
                if re.search("VO",req):
                    common.logger.message( "SW required: "+req)
                    continue
                if re.search(r'"\d+',req):
                    common.logger.message("Other req : "+req)
                    continue
                common.logger.message( "CE required: "+req)
            # only the first Requirements line is reported
            break
        raise CrabException("No compatible resources found!")

    def submit(self, nj):
        """
        Submit one EDG job.
        Returns the job id (the https URL printed by edg-job-submit), or
        None when the command gave no output or no job id could be found.
        """

        self.checkProxy()
        jid = None
        jdl = common.job_list[nj].jdlFilename()

        cmd = 'edg-job-submit ' + self.configOpt_() + jdl
        cmd_out = runCommand(cmd)
        if cmd_out is not None:
            reSid = re.compile( r'https.+' )
            found = reSid.search(cmd_out)
            # output without a job URL used to end in an AttributeError
            if found:
                jid = found.group()
        return jid

    def getExitStatus(self, id):
        """Return the 'exit_code' status attribute of the job with id."""
        return self.getStatusAttribute_(id, 'exit_code')

    def queryStatus(self, id):
        """Return the 'status' status attribute of the job with id."""
        return self.getStatusAttribute_(id, 'status')

    def queryDest(self, id):
        """Return the 'destination' status attribute of the job with id."""
        return self.getStatusAttribute_(id, 'destination')

    def getStatusAttribute_(self, id, attr):
        """
        Query a status attribute of the job with id.

        attr must be one of the names in self.states. Returns None when
        the LB API reports an error.
        """

        self.checkProxy()
        Status = importName('edg_wl_userinterface_common_LbWrapper', 'Status')
        # Bypass edg-job-status interfacing directly to C++ API
        level = 0
        # Instance of the Status class provided by LB API
        jobStat = Status()
        st = 0
        jobStat.getStatus(id, level)
        err, apiMsg = jobStat.get_error()
        if err:
            common.logger.debug(5,'Error caught' + apiMsg)
            return None
        # The status list is fetched once; it used to be reloaded for every
        # known state only to fill a table that was never read.
        # NOTE(review): assumes loadStatus() is a plain getter -- confirm.
        return jobStat.loadStatus(st)[ self.states.index(attr) ]

    def queryDetailedStatus(self, id):
        """ Query a detailed status of the job with id """
        cmd = 'edg-job-status '+id
        cmd_out = runCommand(cmd)
        return cmd_out

    def getOutput(self, id):
        """
        Get output for a finished job with id.
        Returns the name of directory with results.
        """

        self.checkProxy()
        resDir = common.work_space.resDir()
        cmd = 'edg-job-get-output --dir ' + resDir + ' ' + id
        runCommand(cmd)

        # os.getlogin() fails without a controlling terminal (e.g. when
        # started from cron); fall back to the password database then.
        try:
            user = os.getlogin()
        except OSError:
            import pwd
            user = pwd.getpwuid(os.getuid())[0]

        # Determine the output directory name
        return resDir + user + '_' + os.path.basename(id)

    def cancel(self, id):
        """ Cancel the EDG job with id """
        self.checkProxy()
        cmd = 'edg-job-cancel --noint ' + id
        cmd_out = runCommand(cmd)
        return cmd_out

    def quoteList_(self, names):
        """Return the names double-quoted and comma separated, JDL style."""
        return ', '.join(['"' + name + '"' for name in names])

    def createSchScript(self, nj):
        """
        Create a JDL-file for EDG for job number nj.

        The complete JDL text is built first and written in one go, so a
        failure while collecting the job information no longer leaves a
        truncated JDL file behind.
        """

        job = common.job_list[nj]
        jbt = job.type()
        inp_sandbox = jbt.inputSandbox(nj)
        out_sandbox = jbt.outputSandbox(nj)
        script = job.scriptFilename()

        text = []
        text.append('# This JDL was generated by '+\
                    common.prog_name+' (version '+common.prog_version_str+')\n')
        text.append('Executable = "' + os.path.basename(script) +'";\n')

        ### only one .sh JDL has arguments:
        firstEvent = common.jobDB.firstEvent(nj)
        maxEvents = common.jobDB.maxEvents(nj)
        text.append('Arguments = "' + str(nj+1)+' '+str(firstEvent)+' '+str(maxEvents)+'";\n')

        # Input sandbox: job script, job-type files, additional files
        inp_files = [script]
        if inp_sandbox is not None:
            for fl in inp_sandbox:
                inp_files.append(fl)
        for addFile in jbt.additional_inbox_files:
            inp_files.append(os.path.abspath(addFile))
        text.append('InputSandbox = { ' + self.quoteList_(inp_files) + ' };\n')

        stdout = job.stdout()
        stderr = job.stderr()
        text.append('StdOutput = "' + stdout + '";\n')
        text.append('StdError = "' + stderr + '";\n')

        # Output sandbox: stdout/stderr (listed once when they are the same
        # file), the broker info and, on request, the job output files
        out_files = [stdout]
        if stdout != stderr:
            out_files.append(stderr)
        out_files.append('.BrokerInfo')
        if self.return_data and out_sandbox is not None:
            for fl in out_sandbox:
                out_files.append(fl)
        text.append('OutputSandbox = { ' + self.quoteList_(out_files) + ' };\n')

        # Requirements: the clauses are collected and joined with '&&'.
        # The expression used to be built on a variable that only existed
        # when a software version was set (NameError otherwise).
        reqs = []
        sites = common.analisys_common_info['sites']
        ### if at least a CE exists ...
        # NOTE(review): the software clause is assumed to be nested under
        # the sites check; the original nesting was not recoverable.
        if sites:
            sw_version = common.analisys_common_info['sw_version']
            if sw_version:
                reqs.append('Member("VO-cms-' + sw_version + \
                    '", other.GlueHostApplicationSoftwareRunTimeEnvironment)')
            site_reqs = ['other.GlueCEInfoHostName == "' + site + '"'
                         for site in sites]
            reqs.append('(' + ' || '.join(site_reqs) + ')')

        #### and USER REQUIREMENT
        if self.EDG_requirements:
            reqs.append(self.EDG_requirements)
        if self.EDG_clock_time:
            reqs.append('other.GlueCEPolicyMaxWallClockTime>='+str(self.EDG_clock_time))
        if self.EDG_cpu_time:
            reqs.append('other.GlueCEPolicyMaxCPUTime>='+str(self.EDG_cpu_time))
        if reqs:
            text.append('Requirements = ' + ' && '.join(reqs) + ';\n')

        text.append('VirtualOrganisation = "' + self.VO + '";\n')

        if ( self.EDG_retry_count ):
            text.append('RetryCount = '+str(self.EDG_retry_count)+';\n')

        jdl = open(job.jdlFilename(), 'w')
        try:
            jdl.writelines(text)
        finally:
            jdl.close()
        return

    def checkProxy(self):
        """
        Function to check the Globus proxy.

        When no proxy valid for at least 10 hours is found, a new one is
        created with grid-proxy-init. The check is done only once per
        instance (until configure() resets it). Raises CrabException when
        the proxy cannot be created.
        """
        if (self.proxyValid): return
        minTimeLeft=10 # in hours
        cmd = 'grid-proxy-info -e -v '+str(minTimeLeft)+':00'
        try:
            cmd_out = runCommand(cmd,0)
        except Exception:
            # The handler used to print a variable that is unbound at this
            # point; a failed check is now treated as "no valid proxy".
            common.logger.debug(5, 'Proxy check failed: '+cmd)
            cmd_out = None

        if (cmd_out is None or cmd_out=='1'):
            common.logger.message( "No valid proxy found or timeleft too short!\n Creating a user proxy with default length of 100h\n")
            msg = "Unable to create a valid proxy!\n"
            cmd = 'grid-proxy-init -valid 100:00'
            try:
                out = os.system(cmd)
            except Exception:
                raise CrabException(msg)
            if out != 0:
                raise CrabException(msg)
            cmd = 'grid-proxy-info -timeleft'
            runCommand(cmd,0)

        self.proxyValid=1
        return

    def configOpt_(self):
        """
        Return the command line options selecting the configured RB
        configuration files (a single blank when none is configured).
        """
        edg_ui_cfg_opt = ' '
        if self.edg_config:
            edg_ui_cfg_opt = ' -c ' + self.edg_config + ' '
        if self.edg_config_vo:
            edg_ui_cfg_opt += ' --config-vo ' + self.edg_config_vo + ' '
        return edg_ui_cfg_opt