ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/CRAB/python/Splitter.py
(Generate patch)

Comparing COMP/CRAB/python/Splitter.py (file contents):
Revision 1.14 by spiga, Tue May 26 16:53:23 2009 UTC vs.
Revision 1.24 by spiga, Tue Jul 21 16:18:09 2009 UTC

# Line 9 | Line 9 | class JobSplitter:
9          self.args=args
10          #self.maxEvents
11          # init BlackWhiteListParser
12 <        seWhiteList = cfg_params.get('GRID.se_white_list',[])
12 >        self.seWhiteList = cfg_params.get('GRID.se_white_list',[])
13          seBlackList = cfg_params.get('GRID.se_black_list',[])
14 <        self.blackWhiteListParser = SEBlackWhiteListParser(seWhiteList, seBlackList, common.logger)
14 >        self.blackWhiteListParser = SEBlackWhiteListParser(self.seWhiteList, seBlackList, common.logger())
15  
16  
17      def checkUserSettings(self):
# Line 43 | Line 43 | class JobSplitter:
43              self.selectTotalNumberEvents = 0
44  
45  
46 +    def ComputeSubBlockSites( self, blockSites ):
47 +        """
48 +        """
49 +        sub_blockSites = {}
50 +        for k,v in blockSites.iteritems():
51 +            sites=self.blackWhiteListParser.checkWhiteList(v)
52 +            if sites : sub_blockSites[k]=v
53 +        if len(sub_blockSites) < 1:
54 +            msg = 'WARNING: the sites %s is not hosting any part of data.'%self.seWhiteList
55 +            raise CrabException(msg)
56 +        return sub_blockSites
57 +
58   ########################################################################
59      def jobSplittingByEvent( self ):
60          """
# Line 57 | Line 69 | class JobSplitter:
69                self.list_of_args - File(s) job will run on (a list of lists)
70          """
71  
72 <        jobDestination=[]  
72 >        jobDestination=[]
73          self.checkUserSettings()
74          if ( (self.selectTotalNumberEvents + self.selectEventsPerJob + self.selectNumberOfJobs) != 2 ):
75              msg = 'Must define exactly two of total_number_of_events, events_per_job, or number_of_jobs.'
76              raise CrabException(msg)
77 <
78 <        blockSites = self.args['blockSites']
77 >
78 >        blockSites = self.args['blockSites']
79          pubdata = self.args['pubdata']
80          filesbyblock=pubdata.getFiles()
81  
# Line 77 | Line 89 | class JobSplitter:
89          self.useParent = int(self.cfg_params.get('CMSSW.use_parent',0))
90          noBboundary = int(self.cfg_params.get('CMSSW.no_block_boundary',0))
91  
92 +        if noBboundary == 1:
93 +            if self.total_number_of_events== -1:
94 +                msg = 'You are selecting no_block_boundary=1 which does not allow to set total_number_of_events=-1\n'
95 +                msg +='\tYou shoud get the number of event from DBS web interface and use it for your configuration.'                    
96 +                raise CrabException(msg)
97 +            if len(self.seWhiteList.split(',')) != 1:
98 +                msg = 'You are selecting no_block_boundary=1 which requires to choose one and only one site.\n'
99 +                msg += "\tPlease set se_white_list with the site's storage element name."
100 +                raise  CrabException(msg)  
101 +            blockSites = self.ComputeSubBlockSites(blockSites)    
102 +
103          # ---- Handle the possible job splitting configurations ---- #
104          if (self.selectTotalNumberEvents):
105              totalEventsRequested = self.total_number_of_events
# Line 124 | Line 147 | class JobSplitter:
147          jobsOfBlock = {}
148  
149          parString = ""
150 +        pString = ""
151          filesEventCount = 0
152 +        msg=''
153  
154          # ---- Iterate over the blocks in the dataset until ---- #
155          # ---- we've met the requested total # of events    ---- #
# Line 146 | Line 171 | class JobSplitter:
171                  if noBboundary == 0: # DD
172                      # ---- New block => New job ---- #
173                      parString = ""
174 +                    pString=""
175                      # counter for number of events in files currently worked on
176                      filesEventCount = 0
177                  # flag if next while loop should touch new file
# Line 155 | Line 181 | class JobSplitter:
181  
182                  # ---- Iterate over the files in the block until we've met the requested ---- #
183                  # ---- total # of events or we've gone over all the files in this block  ---- #
184 <                pString=''
184 >                msg='\n'
185                  while ( (eventsRemaining > 0) and (fileCount < numFilesInBlock) and (jobCount < totalNumberOfJobs) ):
186                      file = files[fileCount]
187                      if self.useParent==1:
188                          parent = self.parentFiles[file]
163                        for f in parent :
164                            pString +=  f + ','
189                          common.logger.log(10-1, "File "+str(file)+" has the following parents: "+str(parent))
190                      if newFile :
191                          try:
# Line 171 | Line 195 | class JobSplitter:
195                              filesEventCount += numEventsInFile
196                              # Add file to current job
197                              parString +=  file + ','
198 +                            if self.useParent==1:
199 +                                for f in parent :
200 +                                    pString += f  + ','
201                              newFile = 0
202                          except KeyError:
203                              common.logger.info("File "+str(file)+" has unknown number of events: skipping")
# Line 192 | Line 219 | class JobSplitter:
219                                      list_of_lists.append([fullString,fullParentString,str(-1),str(jobSkipEventCount)])
220                                  else:
221                                      list_of_lists.append([fullString,str(-1),str(jobSkipEventCount)])
222 <                                common.logger.debug("Job "+str(jobCount+1)+" can run over "+str(filesEventCount - jobSkipEventCount)+" events (last file in block).")
222 >                                msg += "Job %s can run over %s  events (last file in block).\n"%(str(jobCount+1), str(filesEventCount - jobSkipEventCount))
223                                  jobDestination.append(blockSites[block])
224 <                                common.logger.debug("Job "+str(jobCount+1)+" Destination: "+str(jobDestination[jobCount]))
224 >                                msg += "Job %s Destination: %s\n"%(str(jobCount+1),str(SE2CMS(jobDestination[jobCount])))
225                                  # fill jobs of block dictionary
226                                  jobsOfBlock[block].append(jobCount+1)
227                                  # reset counter
# Line 221 | Line 248 | class JobSplitter:
248                              list_of_lists.append([fullString,fullParentString,str(eventsPerJobRequested),str(jobSkipEventCount)])
249                          else:
250                              list_of_lists.append([fullString,str(eventsPerJobRequested),str(jobSkipEventCount)])
251 <                        common.logger.debug("Job "+str(jobCount+1)+" can run over "+str(eventsPerJobRequested)+" events.")
251 >                        msg += "Job %s can run over %s events.\n"%(str(jobCount+1),str(eventsPerJobRequested))
252                          jobDestination.append(blockSites[block])
253 <                        common.logger.debug("Job "+str(jobCount+1)+" Destination: "+str(jobDestination[jobCount]))
253 >                        msg+= "Job %s Destination: %s\n"%(str(jobCount+1),str(SE2CMS(jobDestination[jobCount])))
254                          jobsOfBlock[block].append(jobCount+1)
255                          # reset counter
256                          jobCount = jobCount + 1
# Line 246 | Line 273 | class JobSplitter:
273                              list_of_lists.append([fullString,fullParentString,str(eventsPerJobRequested),str(jobSkipEventCount)])
274                          else:
275                              list_of_lists.append([fullString,str(eventsPerJobRequested),str(jobSkipEventCount)])
276 <                        common.logger.debug("Job "+str(jobCount+1)+" can run over "+str(eventsPerJobRequested)+" events.")
276 >                        msg += "Job %s can run over %s events.\n"%(str(jobCount+1),str(eventsPerJobRequested))
277                          jobDestination.append(blockSites[block])
278 <                        common.logger.debug("Job "+str(jobCount+1)+" Destination: "+str(jobDestination[jobCount]))
278 >                        msg+= "Job %s Destination: %s\n"%(str(jobCount+1),str(SE2CMS(jobDestination[jobCount])))
279                          jobsOfBlock[block].append(jobCount+1)
280                          # increase counter
281                          jobCount = jobCount + 1
# Line 259 | Line 286 | class JobSplitter:
286                          jobSkipEventCount = eventsPerJobRequested - (filesEventCount - jobSkipEventCount - self.eventsbyfile[file])
287                          # remove all but the last file
288                          filesEventCount = self.eventsbyfile[file]
289 +                        pString_tmp=''
290                          if self.useParent==1:
291 <                            for f in parent : pString +=  f + ','
291 >                            for f in parent : pString_tmp +=  f + ','
292 >                        pString =  pString_tmp
293                          parString =  file + ','
294                      pass # END if
295                  pass # END while (iterate over files in the block)
296          pass # END while (iterate over blocks in the dataset)
297 +        common.logger.debug(msg)
298          self.ncjobs = self.total_number_of_jobs = jobCount
299          if (eventsRemaining > 0 and jobCount < totalNumberOfJobs ):
300              common.logger.info("Could not run on all requested events because some blocks not hosted at allowed sites.")
301          common.logger.info(str(jobCount)+" job(s) can run on "+str(totalEventCount)+" events.\n")
302 <
302 >
303          # skip check on  block with no sites  DD
304          if noBboundary == 0 : self.checkBlockNoSite(blocks,jobsOfBlock,blockSites)
305  
# Line 285 | Line 315 | class JobSplitter:
315  
316          # keep trace of block with no sites to print a warning at the end
317  
318 <    def checkBlockNoSite(self,blocks,jobsOfBlock,blockSites):  
318 >    def checkBlockNoSite(self,blocks,jobsOfBlock,blockSites):
319          # screen output
320          screenOutput = "List of jobs and available destination sites:\n\n"
321          noSiteBlock = []
# Line 297 | Line 327 | class JobSplitter:
327              if block in jobsOfBlock.keys() :
328                  blockCounter += 1
329                  allBlock.append( blockCounter )
330 +                sites=self.blackWhiteListParser.checkWhiteList(self.blackWhiteListParser.checkBlackList(blockSites[block],[block]),[block])
331                  screenOutput += "Block %5i: jobs %20s: sites: %s\n" % (blockCounter,spanRanges(jobsOfBlock[block]),
332 <                    ','.join(self.blackWhiteListParser.checkWhiteList(self.blackWhiteListParser.checkBlackList(blockSites[block],[block]),[block])))
333 <                if len(self.blackWhiteListParser.checkWhiteList(self.blackWhiteListParser.checkBlackList(blockSites[block],[block]),[block])) == 0:
332 >                    ', '.join(SE2CMS(sites)))
333 >                if len(sites) == 0:
334                      noSiteBlock.append( spanRanges(jobsOfBlock[block]) )
335                      bloskNoSite.append( blockCounter )
336  
# Line 330 | Line 361 | class JobSplitter:
361              common.logger.info(msg)
362  
363          if bloskNoSite == allBlock:
364 <            raise CrabException('No jobs created')
364 >            raise CrabException('No jobs created')
365  
366          return
367  
368  
369   ########################################################################
370 <    def jobSplittingByRun(self):
370 >    def jobSplittingByRun(self):
371          """
372          """
373 <        from sets import Set  
373 >        from sets import Set
374          from WMCore.JobSplitting.RunBased import RunBased
375          from WMCore.DataStructs.Workflow import Workflow
376          from WMCore.DataStructs.File import File
377          from WMCore.DataStructs.Fileset import Fileset
378          from WMCore.DataStructs.Subscription import Subscription
379          from WMCore.JobSplitting.SplitterFactory import SplitterFactory
380 <        from WMCore.DataStructs.Run import Run
380 >        from WMCore.DataStructs.Run import Run
381  
382          self.checkUserSettings()
383 <        blockSites = self.args['blockSites']
383 >        blockSites = self.args['blockSites']
384          pubdata = self.args['pubdata']
385  
386          if self.selectNumberOfJobs == 0 :
387              self.theNumberOfJobs = 9999999
388          blocks = {}
389 <        runList = []
389 >        runList = []
390          thefiles = Fileset(name='FilesToSplit')
391          fileList = pubdata.getListFiles()
392          for f in fileList:
393              block = f['Block']['Name']
394 <            try:
394 >            try:
395                  f['Block']['StorageElementList'].extend(blockSites[block])
396              except:
397                  continue
# Line 368 | Line 399 | class JobSplitter:
399              [ wmbsFile['locations'].add(x) for x in blockSites[block] ]
400              wmbsFile['block'] = block
401              runNum = f['RunsList'][0]['RunNumber']
402 <            runList.append(runNum)
402 >            runList.append(runNum)
403              myRun = Run(runNumber=runNum)
404              wmbsFile.addRun( myRun )
405              thefiles.addFile(
406                  wmbsFile
407                  )
408 <
408 >
409          work = Workflow()
410          subs = Subscription(
411          fileset = thefiles,
# Line 383 | Line 414 | class JobSplitter:
414          type = "Processing")
415          splitter = SplitterFactory()
416          jobfactory = splitter(subs)
417 <        
418 <        #loop over all runs
417 >
418 >        #loop over all runs
419          set = Set(runList)
420          list_of_lists = []
421          jobDestination = []
391
422          count = 0
423 <        for i in list(set):
423 >        for jobGroup in  jobfactory():
424              if count <  self.theNumberOfJobs:
425 <                res = self.getJobInfo(jobfactory())
426 <                parString = ''
425 >                res = self.getJobInfo(jobGroup)
426 >                parString = ''
427                  for file in res['lfns']:
428                      parString += file + ','
429                  fullString = parString[:-1]
430 <                list_of_lists.append([fullString,str(-1),str(0)])    
430 >                list_of_lists.append([fullString,str(-1),str(0)])
431                  #need to check single file location
432 <                jobDestination.append(res['locations'])  
432 >                jobDestination.append(res['locations'])
433                  count +=1
434         # prepare dict output
435          dictOut = {}
# Line 412 | Line 442 | class JobSplitter:
442  
443      def getJobInfo( self,jobGroup ):
444          res = {}
445 <        lfns = []        
446 <        locations = []        
445 >        lfns = []
446 >        locations = []
447          tmp_check=0
448          for job in jobGroup.jobs:
449              for file in job.getFiles():
450 <                lfns.append(file['lfn'])
450 >                lfns.append(file['lfn'])
451                  for loc in file['locations']:
452                      if tmp_check < 1 :
453                          locations.append(loc)
454 <                tmp_check = tmp_check + 1
455 <                ### qui va messo il check per la locations
456 <        res['lfns'] = lfns
457 <        res['locations'] = locations
458 <        return res                
459 <      
454 >                tmp_check = tmp_check + 1
455 >                ### qui va messo il check per la locations
456 >        res['lfns'] = lfns
457 >        res['locations'] = locations
458 >        return res
459 >
460   ########################################################################
461 <    def jobSplittingNoInput(self):
461 >    def prepareSplittingNoInput(self):
462          """
433        Perform job splitting based on number of event per job
463          """
435        common.logger.debug('Splitting per events')
436        self.checkUserSettings()
437        jobDestination=[]
438        if ( (self.selectTotalNumberEvents + self.selectEventsPerJob + self.selectNumberOfJobs) != 2 ):
439            msg = 'Must define exactly two of total_number_of_events, events_per_job, or number_of_jobs.'
440            raise CrabException(msg)
441
442        managedGenerators =self.args['managedGenerators']
443        generator = self.args['generator']
444        firstRun = self.cfg_params.get('CMSSW.first_run',None)
445
464          if (self.selectEventsPerJob):
465              common.logger.info('Required '+str(self.eventsPerJob)+' events per job ')
466          if (self.selectNumberOfJobs):
# Line 465 | Line 483 | class JobSplitter:
483              self.total_number_of_jobs = self.theNumberOfJobs
484              self.eventsPerJob = int(self.total_number_of_events/self.total_number_of_jobs)
485  
486 +
487 +    def jobSplittingNoInput(self):
488 +        """
489 +        Perform job splitting based on number of event per job
490 +        """
491 +        common.logger.debug('Splitting per events')
492 +        self.checkUserSettings()
493 +        jobDestination=[]
494 +        if ( (self.selectTotalNumberEvents + self.selectEventsPerJob + self.selectNumberOfJobs) != 2 ):
495 +            msg = 'Must define exactly two of total_number_of_events, events_per_job, or number_of_jobs.'
496 +            raise CrabException(msg)
497 +
498 +        managedGenerators =self.args['managedGenerators']
499 +        generator = self.args['generator']
500 +        firstRun = self.cfg_params.get('CMSSW.first_run',None)
501 +
502 +        self.prepareSplittingNoInput()
503 +
504          common.logger.debug('N jobs  '+str(self.total_number_of_jobs))
505  
506          # is there any remainder?
# Line 486 | Line 522 | class JobSplitter:
522                  ## pythia first run
523                  args.append(str(firstRun)+str(i))
524              if (generator in managedGenerators):
525 <                if (generator == 'comphep' and i == 0):
525 >               args.append(generator)
526 >               if (generator == 'comphep' and i == 0):
527                      # COMPHEP is brain-dead and wants event #'s like 1,100,200,300
528                      args.append('1')
529 <                else:
529 >               else:
530                      args.append(str(i*self.eventsPerJob))
531              args.append(str(self.eventsPerJob))
532              self.list_of_args.append(args)
# Line 499 | Line 536 | class JobSplitter:
536          dictOut['params'] = ['MaxEvents']
537          if (firstRun):
538              dictOut['params'] = ['FirstRun','MaxEvents']
539 <            if ( generator in managedGenerators ) : dictOut['params'] = ['FirstRun', 'FirstEvent', 'MaxEvents']
540 <        else:  
541 <            if (generator in managedGenerators) : dictOut['params'] = ['FirstEvent', 'MaxEvents']
539 >            if ( generator in managedGenerators ) :
540 >                dictOut['params'] = ['FirstRun', 'Generator', 'FirstEvent', 'MaxEvents']
541 >        else:
542 >            if (generator in managedGenerators) :
543 >                dictOut['params'] = ['Generator', 'FirstEvent', 'MaxEvents']
544          dictOut['args'] = self.list_of_args
545          dictOut['jobDestination'] = jobDestination
546          dictOut['njobs']=self.total_number_of_jobs
# Line 521 | Line 560 | class JobSplitter:
560          common.logger.debug('Splitting per job')
561          common.logger.info('Required '+str(self.theNumberOfJobs)+' jobs in total ')
562  
563 <        self.total_number_of_jobs = self.theNumberOfJobs
563 > #        self.total_number_of_jobs = self.theNumberOfJobs
564 >
565 >        self.prepareSplittingNoInput()
566  
567          common.logger.debug('N jobs  '+str(self.total_number_of_jobs))
568  
569          common.logger.info(str(self.total_number_of_jobs)+' jobs can be created')
570  
571          # argument is seed number.$i
572 <        #self.list_of_args = []
572 >        self.list_of_args = []
573          for i in range(self.total_number_of_jobs):
574 +            args=[]
575              jobDestination.append([""])
576 <        #   self.list_of_args.append([str(i)])
576 >            if self.eventsPerJob != 0 :
577 >                args.append(str(self.eventsPerJob))
578 >                self.list_of_args.append(args)
579  
580         # prepare dict output
581          dictOut = {}
582 <        dictOut['args'] = [] # self.list_of_args
582 >        dictOut['params'] = ['MaxEvents']
583 >        dictOut['args'] =  self.list_of_args
584          dictOut['jobDestination'] = jobDestination
585          dictOut['njobs']=self.total_number_of_jobs
586          return dictOut
542
587  
588 <    def jobSplittingByLumi(self):
588 >
589 >    def jobSplittingByLumi(self):
590          """
591          """
592          return
# Line 549 | Line 594 | class JobSplitter:
594          """
595          Define key splittingType matrix
596          """
597 <        SplitAlogs = {
598 <                     'EventBased'           : self.jobSplittingByEvent,
597 >        SplitAlogs = {
598 >                     'EventBased'           : self.jobSplittingByEvent,
599                       'RunBased'             : self.jobSplittingByRun,
600 <                     'LumiBased'            : self.jobSplittingByLumi,
601 <                     'NoInput'              : self.jobSplittingNoInput,
600 >                     'LumiBased'            : self.jobSplittingByLumi,
601 >                     'NoInput'              : self.jobSplittingNoInput,
602                       'ForScript'            : self.jobSplittingForScript
603 <                     }  
603 >                     }
604          return SplitAlogs
605  

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines