ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/CRAB/python/Splitter.py
(Generate patch)

Comparing COMP/CRAB/python/Splitter.py (file contents):
Revision 1.4 by spiga, Mon Feb 9 18:20:20 2009 UTC vs.
Revision 1.17 by spiga, Fri Jun 5 14:08:46 2009 UTC

# Line 1 | Line 1
1   import common
2 from crab_logger import Logger
2   from crab_exceptions import *
3   from crab_util import *
4   from WMCore.SiteScreening.BlackWhiteListParser import SEBlackWhiteListParser
# Line 10 | Line 9 | class JobSplitter:
9          self.args=args
10          #self.maxEvents
11          # init BlackWhiteListParser
12 <        seWhiteList = cfg_params.get('EDG.se_white_list',[])
13 <        seBlackList = cfg_params.get('EDG.se_black_list',[])
12 >        seWhiteList = cfg_params.get('GRID.se_white_list',[])
13 >        seBlackList = cfg_params.get('GRID.se_black_list',[])
14          self.blackWhiteListParser = SEBlackWhiteListParser(seWhiteList, seBlackList, common.logger)
15  
16  
# Line 92 | Line 91 | class JobSplitter:
91          # If user requested more events than are in the dataset
92          elif (totalEventsRequested > self.maxEvents):
93              eventsRemaining = self.maxEvents
94 <            common.logger.message("Requested "+str(self.total_number_of_events)+ " events, but only "+str(self.maxEvents)+" events are available.")
94 >            common.logger.info("Requested "+str(self.total_number_of_events)+ " events, but only "+str(self.maxEvents)+" events are available.")
95          # If user requested less events than are in the dataset
96          else:
97              eventsRemaining = totalEventsRequested
# Line 108 | Line 107 | class JobSplitter:
107              eventsPerJobRequested = int(eventsRemaining/self.theNumberOfJobs)
108  
109          if (self.selectNumberOfJobs):
110 <            common.logger.message("May not create the exact number_of_jobs requested.")
110 >            common.logger.info("May not create the exact number_of_jobs requested.")
111  
112          # old... to remove Daniele
113          totalNumberOfJobs = 999999999
# Line 125 | Line 124 | class JobSplitter:
124          jobsOfBlock = {}
125  
126          parString = ""
127 +        pString = ""
128          filesEventCount = 0
129  
130          # ---- Iterate over the blocks in the dataset until ---- #
# Line 137 | Line 137 | class JobSplitter:
137  
138              if self.eventsbyblock.has_key(block) :
139                  numEventsInBlock = self.eventsbyblock[block]
140 <                common.logger.debug(5,'Events in Block File '+str(numEventsInBlock))
140 >                common.logger.debug('Events in Block File '+str(numEventsInBlock))
141  
142                  files = filesbyblock[block]
143                  numFilesInBlock = len(files)
# Line 147 | Line 147 | class JobSplitter:
147                  if noBboundary == 0: # DD
148                      # ---- New block => New job ---- #
149                      parString = ""
150 +                    pString=""
151                      # counter for number of events in files currently worked on
152                      filesEventCount = 0
153                  # flag if next while loop should touch new file
# Line 156 | Line 157 | class JobSplitter:
157  
158                  # ---- Iterate over the files in the block until we've met the requested ---- #
159                  # ---- total # of events or we've gone over all the files in this block  ---- #
160 <                pString=''
160 >                msg='\n'
161                  while ( (eventsRemaining > 0) and (fileCount < numFilesInBlock) and (jobCount < totalNumberOfJobs) ):
162                      file = files[fileCount]
163                      if self.useParent==1:
164                          parent = self.parentFiles[file]
165 <                        for f in parent :
165 <                            pString += '\\\"' + f + '\\\"\,'
166 <                        common.logger.debug(6, "File "+str(file)+" has the following parents: "+str(parent))
167 <                        common.logger.write("File "+str(file)+" has the following parents: "+str(parent))
165 >                        common.logger.log(10-1, "File "+str(file)+" has the following parents: "+str(parent))
166                      if newFile :
167                          try:
168                              numEventsInFile = self.eventsbyfile[file]
169 <                            common.logger.debug(6, "File "+str(file)+" has "+str(numEventsInFile)+" events")
169 >                            common.logger.log(10-1, "File "+str(file)+" has "+str(numEventsInFile)+" events")
170                              # increase filesEventCount
171                              filesEventCount += numEventsInFile
172                              # Add file to current job
173 <                            parString += '\\\"' + file + '\\\"\,'
173 >                            parString +=  file + ','
174 >                            if self.useParent==1:
175 >                                for f in parent :
176 >                                    pString += f  + ','
177                              newFile = 0
178                          except KeyError:
179 <                            common.logger.message("File "+str(file)+" has unknown number of events: skipping")
179 >                            common.logger.info("File "+str(file)+" has unknown number of events: skipping")
180  
181                      eventsPerJobRequested = min(eventsPerJobRequested, eventsRemaining)
182                      # if less events in file remain than eventsPerJobRequested
# Line 188 | Line 189 | class JobSplitter:
189                              if ( fileCount == numFilesInBlock-1 ) :
190                                  # end job using last file, use remaining events in block
191                                  # close job and touch new file
192 <                                fullString = parString[:-2]
192 >                                fullString = parString[:-1]
193                                  if self.useParent==1:
194 <                                    fullParentString = pString[:-2]
194 >                                    fullParentString = pString[:-1]
195                                      list_of_lists.append([fullString,fullParentString,str(-1),str(jobSkipEventCount)])
196                                  else:
197                                      list_of_lists.append([fullString,str(-1),str(jobSkipEventCount)])
198 <                                common.logger.debug(3,"Job "+str(jobCount+1)+" can run over "+str(filesEventCount - jobSkipEventCount)+" events (last file in block).")
198 >                                msg += "Job %s can run over %s  events (last file in block).\n"%(str(jobCount+1), str(filesEventCount - jobSkipEventCount))
199                                  jobDestination.append(blockSites[block])
200 <                                common.logger.debug(5,"Job "+str(jobCount+1)+" Destination: "+str(jobDestination[jobCount]))
200 >                                msg += "Job %s Destination: %s\n"%(str(jobCount+1),str(jobDestination[jobCount]))
201                                  # fill jobs of block dictionary
202                                  jobsOfBlock[block].append(jobCount+1)
203                                  # reset counter
# Line 217 | Line 218 | class JobSplitter:
218                      # if events in file equal to eventsPerJobRequested
219                      elif ( filesEventCount - jobSkipEventCount == eventsPerJobRequested ) :
220                          # close job and touch new file
221 <                        fullString = parString[:-2]
221 >                        fullString = parString[:-1]
222                          if self.useParent==1:
223 <                            fullParentString = pString[:-2]
223 >                            fullParentString = pString[:-1]
224                              list_of_lists.append([fullString,fullParentString,str(eventsPerJobRequested),str(jobSkipEventCount)])
225                          else:
226                              list_of_lists.append([fullString,str(eventsPerJobRequested),str(jobSkipEventCount)])
227 <                        common.logger.debug(3,"Job "+str(jobCount+1)+" can run over "+str(eventsPerJobRequested)+" events.")
227 >                        msg += "Job %s can run over %s events.\n"%(str(jobCount+1),str(eventsPerJobRequested))
228                          jobDestination.append(blockSites[block])
229 <                        common.logger.debug(5,"Job "+str(jobCount+1)+" Destination: "+str(jobDestination[jobCount]))
229 >                        msg+= "Job %s Destination: %s\n"%(str(jobCount+1),str(jobDestination[jobCount]))
230                          jobsOfBlock[block].append(jobCount+1)
231                          # reset counter
232                          jobCount = jobCount + 1
# Line 242 | Line 243 | class JobSplitter:
243                      # if more events in file remain than eventsPerJobRequested
244                      else :
245                          # close job but don't touch new file
246 <                        fullString = parString[:-2]
246 >                        fullString = parString[:-1]
247                          if self.useParent==1:
248 <                            fullParentString = pString[:-2]
248 >                            fullParentString = pString[:-1]
249                              list_of_lists.append([fullString,fullParentString,str(eventsPerJobRequested),str(jobSkipEventCount)])
250                          else:
251                              list_of_lists.append([fullString,str(eventsPerJobRequested),str(jobSkipEventCount)])
252 <                        common.logger.debug(3,"Job "+str(jobCount+1)+" can run over "+str(eventsPerJobRequested)+" events.")
252 >                        msg += "Job %s can run over %s events.\n"%(str(jobCount+1),str(eventsPerJobRequested))
253                          jobDestination.append(blockSites[block])
254 <                        common.logger.debug(5,"Job "+str(jobCount+1)+" Destination: "+str(jobDestination[jobCount]))
254 >                        msg+= "Job %s Destination: %s\n"%(str(jobCount+1),str(jobDestination[jobCount]))
255                          jobsOfBlock[block].append(jobCount+1)
256                          # increase counter
257                          jobCount = jobCount + 1
# Line 261 | Line 262 | class JobSplitter:
262                          jobSkipEventCount = eventsPerJobRequested - (filesEventCount - jobSkipEventCount - self.eventsbyfile[file])
263                          # remove all but the last file
264                          filesEventCount = self.eventsbyfile[file]
265 +                        pString_tmp=''
266                          if self.useParent==1:
267 <                            for f in parent : pString += '\\\"' + f + '\\\"\,'
268 <                        parString = '\\\"' + file + '\\\"\,'
267 >                            for f in parent : pString_tmp +=  f + ','
268 >                        pString =  pString_tmp
269 >                        parString =  file + ','
270                      pass # END if
271                  pass # END while (iterate over files in the block)
272          pass # END while (iterate over blocks in the dataset)
273 +        common.logger.debug(msg)
274          self.ncjobs = self.total_number_of_jobs = jobCount
275          if (eventsRemaining > 0 and jobCount < totalNumberOfJobs ):
276 <            common.logger.message("Could not run on all requested events because some blocks not hosted at allowed sites.")
277 <        common.logger.message(str(jobCount)+" job(s) can run on "+str(totalEventCount)+" events.\n")
276 >            common.logger.info("Could not run on all requested events because some blocks not hosted at allowed sites.")
277 >        common.logger.info(str(jobCount)+" job(s) can run on "+str(totalEventCount)+" events.\n")
278  
279          # skip check on  block with no sites  DD
280 <        if noBboundary == 0 : self.checkBlockNoSite(blocks,jobsOfBlock)
280 >        if noBboundary == 0 : self.checkBlockNoSite(blocks,jobsOfBlock,blockSites)
281  
282         # prepare dict output
283          dictOut = {}
284 +        dictOut['params']= ['InputFiles','MaxEvents','SkipEvents']
285 +        if self.useParent: dictOut['params']= ['InputFiles','ParentFiles','MaxEvents','SkipEvents']
286          dictOut['args'] = list_of_lists
287          dictOut['jobDestination'] = jobDestination
288          dictOut['njobs']=self.total_number_of_jobs
# Line 285 | Line 291 | class JobSplitter:
291  
292          # keep trace of block with no sites to print a warning at the end
293  
294 <    def checkBlockNoSite(self,blocks,jobsOfBlock):  
294 >    def checkBlockNoSite(self,blocks,jobsOfBlock,blockSites):  
295          # screen output
296          screenOutput = "List of jobs and available destination sites:\n\n"
297          noSiteBlock = []
298          bloskNoSite = []
299 +        allBlock = []
300  
301          blockCounter = 0
302          for block in blocks:
303              if block in jobsOfBlock.keys() :
304                  blockCounter += 1
305 +                allBlock.append( blockCounter )
306                  screenOutput += "Block %5i: jobs %20s: sites: %s\n" % (blockCounter,spanRanges(jobsOfBlock[block]),
307                      ','.join(self.blackWhiteListParser.checkWhiteList(self.blackWhiteListParser.checkBlackList(blockSites[block],[block]),[block])))
308                  if len(self.blackWhiteListParser.checkWhiteList(self.blackWhiteListParser.checkBlackList(blockSites[block],[block]),[block])) == 0:
309                      noSiteBlock.append( spanRanges(jobsOfBlock[block]) )
310                      bloskNoSite.append( blockCounter )
311  
312 <        common.logger.message(screenOutput)
312 >        common.logger.info(screenOutput)
313          if len(noSiteBlock) > 0 and len(bloskNoSite) > 0:
314              msg = 'WARNING: No sites are hosting any part of data for block:\n                '
315              virgola = ""
# Line 316 | Line 324 | class JobSplitter:
324              for range_jobs in noSiteBlock:
325                  msg += str(range_jobs) + virgola
326              msg += '\n               will not be submitted and this block of data can not be analyzed!\n'
327 <            if self.cfg_params.has_key('EDG.se_white_list'):
328 <                msg += 'WARNING: SE White List: '+self.cfg_params['EDG.se_white_list']+'\n'
327 >            if self.cfg_params.has_key('GRID.se_white_list'):
328 >                msg += 'WARNING: SE White List: '+self.cfg_params['GRID.se_white_list']+'\n'
329                  msg += '(Hint: By whitelisting you force the job to run at this particular site(s).\n'
330                  msg += 'Please check if the dataset is available at this site!)\n'
331 <            if self.cfg_params.has_key('EDG.ce_white_list'):
332 <                msg += 'WARNING: CE White List: '+self.cfg_params['EDG.ce_white_list']+'\n'
331 >            if self.cfg_params.has_key('GRID.ce_white_list'):
332 >                msg += 'WARNING: CE White List: '+self.cfg_params['GRID.ce_white_list']+'\n'
333                  msg += '(Hint: By whitelisting you force the job to run at this particular site(s).\n'
334                  msg += 'Please check if the dataset is available at this site!)\n'
335  
336 <            common.logger.message(msg)
336 >            common.logger.info(msg)
337 >
338 >        if bloskNoSite == allBlock:
339 >            raise CrabException('No jobs created')
340  
341          return
342  
# Line 354 | Line 365 | class JobSplitter:
365          thefiles = Fileset(name='FilesToSplit')
366          fileList = pubdata.getListFiles()
367          for f in fileList:
357           # print f
368              block = f['Block']['Name']
359          #  if not blocks.has_key(block):
360          #      blocks[block] = reader.listFileBlockLocation(block)
369              try:
370                  f['Block']['StorageElementList'].extend(blockSites[block])
371              except:
# Line 386 | Line 394 | class JobSplitter:
394          set = Set(runList)
395          list_of_lists = []
396          jobDestination = []
389
397          count = 0
398 <        for i in list(set):
398 >        for jobGroup in  jobfactory():
399              if count <  self.theNumberOfJobs:
400 <                res = self.getJobInfo(jobfactory())
400 >                res = self.getJobInfo(jobGroup)
401                  parString = ''
402                  for file in res['lfns']:
403 <                    parString += '\\\"' + file + '\\\"\,'
404 <                fullString = parString[:-2]
403 >                    parString += file + ','
404 >                fullString = parString[:-1]
405                  list_of_lists.append([fullString,str(-1),str(0)])    
406                  #need to check single file location
407                  jobDestination.append(res['locations'])  
408                  count +=1
402        #print jobDestination
409         # prepare dict output
410          dictOut = {}
411 +        dictOut['params']= ['InputFiles','MaxEvents','SkipEvents']
412          dictOut['args'] = list_of_lists
413          dictOut['jobDestination'] = jobDestination
414          dictOut['njobs']=count
# Line 419 | Line 426 | class JobSplitter:
426                  for loc in file['locations']:
427                      if tmp_check < 1 :
428                          locations.append(loc)
429 <                    tmp_check = tmp_check + 1
429 >                tmp_check = tmp_check + 1
430                  ### qui va messo il check per la locations
431          res['lfns'] = lfns
432          res['locations'] = locations
# Line 430 | Line 437 | class JobSplitter:
437          """
438          Perform job splitting based on number of event per job
439          """
440 <        common.logger.debug(5,'Splitting per events')
440 >        common.logger.debug('Splitting per events')
441          self.checkUserSettings()
442          jobDestination=[]
443 <        if (self.selectNumberOfJobs == 0):
444 <            msg = 'Must specify  number_of_jobs.'
443 >        if ( (self.selectTotalNumberEvents + self.selectEventsPerJob + self.selectNumberOfJobs) != 2 ):
444 >            msg = 'Must define exactly two of total_number_of_events, events_per_job, or number_of_jobs.'
445              raise CrabException(msg)
446  
447          managedGenerators =self.args['managedGenerators']
# Line 442 | Line 449 | class JobSplitter:
449          firstRun = self.cfg_params.get('CMSSW.first_run',None)
450  
451          if (self.selectEventsPerJob):
452 <            common.logger.message('Required '+str(self.eventsPerJob)+' events per job ')
452 >            common.logger.info('Required '+str(self.eventsPerJob)+' events per job ')
453          if (self.selectNumberOfJobs):
454 <            common.logger.message('Required '+str(self.theNumberOfJobs)+' jobs in total ')
454 >            common.logger.info('Required '+str(self.theNumberOfJobs)+' jobs in total ')
455          if (self.selectTotalNumberEvents):
456 <            common.logger.message('Required '+str(self.total_number_of_events)+' events in total ')
456 >            common.logger.info('Required '+str(self.total_number_of_events)+' events in total ')
457  
458          if (self.total_number_of_events < 0):
459              msg='Cannot split jobs per Events with "-1" as total number of events'
# Line 463 | Line 470 | class JobSplitter:
470              self.total_number_of_jobs = self.theNumberOfJobs
471              self.eventsPerJob = int(self.total_number_of_events/self.total_number_of_jobs)
472  
473 <        common.logger.debug(5,'N jobs  '+str(self.total_number_of_jobs))
473 >        common.logger.debug('N jobs  '+str(self.total_number_of_jobs))
474  
475          # is there any remainder?
476          check = int(self.total_number_of_events) - (int(self.total_number_of_jobs)*self.eventsPerJob)
477  
478 <        common.logger.debug(5,'Check  '+str(check))
478 >        common.logger.debug('Check  '+str(check))
479  
480 <        common.logger.message(str(self.total_number_of_jobs)+' jobs can be created, each for '+str(self.eventsPerJob)+' for a total of '+str(self.total_number_of_jobs*self.eventsPerJob)+' events')
480 >        common.logger.info(str(self.total_number_of_jobs)+' jobs can be created, each for '+str(self.eventsPerJob)+' for a total of '+str(self.total_number_of_jobs*self.eventsPerJob)+' events')
481          if check > 0:
482 <            common.logger.message('Warning: asked '+str(self.total_number_of_events)+' but can do only '+str(int(self.total_number_of_jobs)*self.eventsPerJob))
482 >            common.logger.info('Warning: asked '+str(self.total_number_of_events)+' but can do only '+str(int(self.total_number_of_jobs)*self.eventsPerJob))
483  
484          # argument is seed number.$i
485          self.list_of_args = []
# Line 489 | Line 496 | class JobSplitter:
496                      args.append('1')
497                  else:
498                      args.append(str(i*self.eventsPerJob))
499 +            args.append(str(self.eventsPerJob))
500              self.list_of_args.append(args)
501         # prepare dict output
502 +
503          dictOut = {}
504 +        dictOut['params'] = ['MaxEvents']
505 +        if (firstRun):
506 +            dictOut['params'] = ['FirstRun','MaxEvents']
507 +            if ( generator in managedGenerators ) : dictOut['params'] = ['FirstRun', 'FirstEvent', 'MaxEvents']
508 +        else:  
509 +            if (generator in managedGenerators) : dictOut['params'] = ['FirstEvent', 'MaxEvents']
510          dictOut['args'] = self.list_of_args
511          dictOut['jobDestination'] = jobDestination
512          dictOut['njobs']=self.total_number_of_jobs
# Line 508 | Line 523 | class JobSplitter:
523              msg = 'must specify  number_of_jobs.'
524              raise crabexception(msg)
525          jobDestination = []
526 <        common.logger.debug(5,'Splitting per job')
527 <        common.logger.message('Required '+str(self.theNumberOfJobs)+' jobs in total ')
526 >        common.logger.debug('Splitting per job')
527 >        common.logger.info('Required '+str(self.theNumberOfJobs)+' jobs in total ')
528  
529          self.total_number_of_jobs = self.theNumberOfJobs
530  
531 <        common.logger.debug(5,'N jobs  '+str(self.total_number_of_jobs))
531 >        common.logger.debug('N jobs  '+str(self.total_number_of_jobs))
532  
533 <        common.logger.message(str(self.total_number_of_jobs)+' jobs can be created')
533 >        common.logger.info(str(self.total_number_of_jobs)+' jobs can be created')
534  
535          # argument is seed number.$i
536 <        self.list_of_args = []
536 >        #self.list_of_args = []
537          for i in range(self.total_number_of_jobs):
538              jobDestination.append([""])
539 <            self.list_of_args.append([str(i)])
539 >        #   self.list_of_args.append([str(i)])
540  
541         # prepare dict output
542          dictOut = {}
543 <        dictOut['args'] = self.list_of_args
543 >        dictOut['args'] = [] # self.list_of_args
544          dictOut['jobDestination'] = jobDestination
545          dictOut['njobs']=self.total_number_of_jobs
546          return dictOut

Diff Legend

- Removed lines
+ Added lines
< Changed lines (as in revision 1.4, the older revision)
> Changed lines (as in revision 1.17, the newer revision)