  import common
< from crab_logger import Logger
> from sets import Set
  from crab_exceptions import *
  from crab_util import *
+
+ from WMCore.DataStructs.File import File
+ from WMCore.DataStructs.Fileset import Fileset
+ from WMCore.DataStructs.Run import Run
+ from WMCore.DataStructs.Subscription import Subscription
+ from WMCore.DataStructs.Workflow import Workflow
+ from WMCore.JobSplitting.SplitterFactory import SplitterFactory
  from WMCore.SiteScreening.BlackWhiteListParser import SEBlackWhiteListParser

  class JobSplitter:
      def __init__( self, cfg_params, args ):
          self.cfg_params = cfg_params
<         self.blockSites = args['blockSites']
<         self.pubdata = args['pubdata']
>         self.args=args
          #self.maxEvents
-         self.jobDestination=[]  # Site destination(s) for each job (list of lists)
          # init BlackWhiteListParser
<         seWhiteList = cfg_params.get('EDG.se_white_list',[])
<         seBlackList = cfg_params.get('EDG.se_black_list',[])
<         self.blackWhiteListParser = SEBlackWhiteListParser(seWhiteList, seBlackList, common.logger)
>         self.seWhiteList = cfg_params.get('GRID.se_white_list',[])
>         seBlackList = cfg_params.get('GRID.se_black_list',[])
>         self.blackWhiteListParser = SEBlackWhiteListParser(self.seWhiteList, seBlackList, common.logger())


      def checkUserSettings(self):
          self.selectTotalNumberEvents = 0


+     def ComputeSubBlockSites( self, blockSites ):
+         """
+         """
+         sub_blockSites = {}
+         for k,v in blockSites.iteritems():
+             sites=self.blackWhiteListParser.checkWhiteList(v)
+             if sites : sub_blockSites[k]=v
+         if len(sub_blockSites) < 1:
+             msg = 'WARNING: the sites %s are not hosting any part of the data.'%self.seWhiteList
+             raise CrabException(msg)
+         return sub_blockSites
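As a rough illustration of the filtering ComputeSubBlockSites performs (the real method delegates the white-list check to SEBlackWhiteListParser; the helper name and sample values below are made up for the example):

def filter_blocks_by_se_whitelist(block_sites, se_white_list):
    # Keep only the blocks whose storage-element list matches the white list.
    sub_block_sites = {}
    for block, sites in block_sites.items():
        allowed = [se for se in sites if any(pattern in se for pattern in se_white_list)]
        if allowed:
            sub_block_sites[block] = sites
    if not sub_block_sites:
        raise RuntimeError('No block is hosted at the white-listed sites %s' % se_white_list)
    return sub_block_sites

# filter_blocks_by_se_whitelist({'/A#1': ['se1.example.ch'], '/B#2': ['se2.example.fr']}, ['se1'])
# -> {'/A#1': ['se1.example.ch']}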
+
    ########################################################################
      def jobSplittingByEvent( self ):
          """
          REQUIRES: self.selectTotalNumberEvents, self.selectEventsPerJob, self.selectNumberofJobs,
                    self.total_number_of_events, self.eventsPerJob, self.theNumberOfJobs,
                    self.maxEvents, self.filesbyblock
<         SETS: self.jobDestination - Site destination(s) for each job (a list of lists)
>         SETS: jobDestination - Site destination(s) for each job (a list of lists)
                self.total_number_of_jobs - Total # of jobs
                self.list_of_args - File(s) job will run on (a list of lists)
          """

+         jobDestination=[]
          self.checkUserSettings()
          if ( (self.selectTotalNumberEvents + self.selectEventsPerJob + self.selectNumberOfJobs) != 2 ):
              msg = 'Must define exactly two of total_number_of_events, events_per_job, or number_of_jobs.'
              raise CrabException(msg)
-
-         self.filesbyblock=self.pubdata.getFiles()

<         self.eventsbyblock=self.pubdata.getEventsPerBlock()
<         self.eventsbyfile=self.pubdata.getEventsPerFile()
<         self.parentFiles=self.pubdata.getParent()
>         blockSites = self.args['blockSites']
>         pubdata = self.args['pubdata']
>         filesbyblock=pubdata.getFiles()
>
>         self.eventsbyblock=pubdata.getEventsPerBlock()
>         self.eventsbyfile=pubdata.getEventsPerFile()
>         self.parentFiles=pubdata.getParent()

          ## get max number of events
<         self.maxEvents=self.pubdata.getMaxEvents()
>         self.maxEvents=pubdata.getMaxEvents()

          self.useParent = int(self.cfg_params.get('CMSSW.use_parent',0))
          noBboundary = int(self.cfg_params.get('CMSSW.no_block_boundary',0))

+         if noBboundary == 1:
+             if self.total_number_of_events== -1:
+                 msg = 'You are selecting no_block_boundary=1 which does not allow setting total_number_of_events=-1\n'
+                 msg +='\tYou should get the number of events from the DBS web interface and use it for your configuration.'
+                 raise CrabException(msg)
+             if len(self.seWhiteList) == 0 or len(self.seWhiteList.split(',')) != 1:
+                 msg = 'You are selecting no_block_boundary=1 which requires choosing one and only one site.\n'
+                 msg += "\tPlease set se_white_list with the site's storage element name."
+                 raise CrabException(msg)
+             blockSites = self.ComputeSubBlockSites(blockSites)
+
          # ---- Handle the possible job splitting configurations ---- #
          if (self.selectTotalNumberEvents):
              totalEventsRequested = self.total_number_of_events
          # If user requested more events than are in the dataset
          elif (totalEventsRequested > self.maxEvents):
              eventsRemaining = self.maxEvents
<             common.logger.message("Requested "+str(self.total_number_of_events)+ " events, but only "+str(self.maxEvents)+" events are available.")
>             common.logger.info("Requested "+str(self.total_number_of_events)+ " events, but only "+str(self.maxEvents)+" events are available.")
          # If user requested less events than are in the dataset
          else:
              eventsRemaining = totalEventsRequested
              eventsPerJobRequested = int(eventsRemaining/self.theNumberOfJobs)

          if (self.selectNumberOfJobs):
<             common.logger.message("May not create the exact number_of_jobs requested.")
>             common.logger.info("May not create the exact number_of_jobs requested.")

          # old... to remove Daniele
          totalNumberOfJobs = 999999999

<         blocks = self.blockSites.keys()
>         blocks = blockSites.keys()
          blockCount = 0
          # Backup variable in case self.maxEvents counted events in a non-included block
          numBlocksInDataset = len(blocks)
          jobsOfBlock = {}

          parString = ""
+         pString = ""
          filesEventCount = 0
+         msg=''

          # ---- Iterate over the blocks in the dataset until ---- #
          # ---- we've met the requested total # of events    ---- #

              if self.eventsbyblock.has_key(block) :
                  numEventsInBlock = self.eventsbyblock[block]
<                 common.logger.debug(5,'Events in Block File '+str(numEventsInBlock))
>                 common.logger.debug('Events in Block File '+str(numEventsInBlock))

<                 files = self.filesbyblock[block]
>                 files = filesbyblock[block]
                  numFilesInBlock = len(files)
                  if (numFilesInBlock <= 0):
                      continue
                  if noBboundary == 0: # DD
                      # ---- New block => New job ---- #
                      parString = ""
+                     pString=""
                      # counter for number of events in files currently worked on
                      filesEventCount = 0
                  # flag if next while loop should touch new file

                  # ---- Iterate over the files in the block until we've met the requested ---- #
                  # ---- total # of events or we've gone over all the files in this block  ---- #
<                 pString=''
>                 msg='\n'
                  while ( (eventsRemaining > 0) and (fileCount < numFilesInBlock) and (jobCount < totalNumberOfJobs) ):
                      file = files[fileCount]
                      if self.useParent==1:
                          parent = self.parentFiles[file]
<                         for f in parent :
<                             pString += '\\\"' + f + '\\\"\,'
<                         common.logger.debug(6, "File "+str(file)+" has the following parents: "+str(parent))
<                         common.logger.write("File "+str(file)+" has the following parents: "+str(parent))
>                         common.logger.log(10-1, "File "+str(file)+" has the following parents: "+str(parent))
                      if newFile :
                          try:
                              numEventsInFile = self.eventsbyfile[file]
<                             common.logger.debug(6, "File "+str(file)+" has "+str(numEventsInFile)+" events")
>                             common.logger.log(10-1, "File "+str(file)+" has "+str(numEventsInFile)+" events")
                              # increase filesEventCount
                              filesEventCount += numEventsInFile
                              # Add file to current job
<                             parString += '\\\"' + file + '\\\"\,'
>                             parString += file + ','
>                             if self.useParent==1:
>                                 for f in parent :
>                                     pString += f + ','
                              newFile = 0
                          except KeyError:
<                             common.logger.message("File "+str(file)+" has unknown number of events: skipping")
>                             common.logger.info("File "+str(file)+" has unknown number of events: skipping")

                      eventsPerJobRequested = min(eventsPerJobRequested, eventsRemaining)
                      # if less events in file remain than eventsPerJobRequested
                          if ( fileCount == numFilesInBlock-1 ) :
                              # end job using last file, use remaining events in block
                              # close job and touch new file
<                             fullString = parString[:-2]
>                             fullString = parString[:-1]
                              if self.useParent==1:
<                                 fullParentString = pString[:-2]
>                                 fullParentString = pString[:-1]
                                  list_of_lists.append([fullString,fullParentString,str(-1),str(jobSkipEventCount)])
                              else:
                                  list_of_lists.append([fullString,str(-1),str(jobSkipEventCount)])
<                             common.logger.debug(3,"Job "+str(jobCount+1)+" can run over "+str(filesEventCount - jobSkipEventCount)+" events (last file in block).")
<                             self.jobDestination.append(self.blockSites[block])
<                             common.logger.debug(5,"Job "+str(jobCount+1)+" Destination: "+str(self.jobDestination[jobCount]))
>                             msg += "Job %s can run over %s events (last file in block).\n"%(str(jobCount+1), str(filesEventCount - jobSkipEventCount))
>                             jobDestination.append(blockSites[block])
>                             msg += "Job %s Destination: %s\n"%(str(jobCount+1),str(SE2CMS(jobDestination[jobCount])))
                              # fill jobs of block dictionary
                              jobsOfBlock[block].append(jobCount+1)
                              # reset counter
                      # if events in file equal to eventsPerJobRequested
                      elif ( filesEventCount - jobSkipEventCount == eventsPerJobRequested ) :
                          # close job and touch new file
<                         fullString = parString[:-2]
>                         fullString = parString[:-1]
                          if self.useParent==1:
<                             fullParentString = pString[:-2]
>                             fullParentString = pString[:-1]
                              list_of_lists.append([fullString,fullParentString,str(eventsPerJobRequested),str(jobSkipEventCount)])
                          else:
                              list_of_lists.append([fullString,str(eventsPerJobRequested),str(jobSkipEventCount)])
<                         common.logger.debug(3,"Job "+str(jobCount+1)+" can run over "+str(eventsPerJobRequested)+" events.")
<                         self.jobDestination.append(self.blockSites[block])
<                         common.logger.debug(5,"Job "+str(jobCount+1)+" Destination: "+str(self.jobDestination[jobCount]))
>                         msg += "Job %s can run over %s events.\n"%(str(jobCount+1),str(eventsPerJobRequested))
>                         jobDestination.append(blockSites[block])
>                         msg+= "Job %s Destination: %s\n"%(str(jobCount+1),str(SE2CMS(jobDestination[jobCount])))
                          jobsOfBlock[block].append(jobCount+1)
                          # reset counter
                          jobCount = jobCount + 1
                      # if more events in file remain than eventsPerJobRequested
                      else :
                          # close job but don't touch new file
<                         fullString = parString[:-2]
>                         fullString = parString[:-1]
                          if self.useParent==1:
<                             fullParentString = pString[:-2]
>                             fullParentString = pString[:-1]
                              list_of_lists.append([fullString,fullParentString,str(eventsPerJobRequested),str(jobSkipEventCount)])
                          else:
                              list_of_lists.append([fullString,str(eventsPerJobRequested),str(jobSkipEventCount)])
<                         common.logger.debug(3,"Job "+str(jobCount+1)+" can run over "+str(eventsPerJobRequested)+" events.")
<                         self.jobDestination.append(self.blockSites[block])
<                         common.logger.debug(5,"Job "+str(jobCount+1)+" Destination: "+str(self.jobDestination[jobCount]))
>                         msg += "Job %s can run over %s events.\n"%(str(jobCount+1),str(eventsPerJobRequested))
>                         jobDestination.append(blockSites[block])
>                         msg+= "Job %s Destination: %s\n"%(str(jobCount+1),str(SE2CMS(jobDestination[jobCount])))
                          jobsOfBlock[block].append(jobCount+1)
                          # increase counter
                          jobCount = jobCount + 1
                          jobSkipEventCount = eventsPerJobRequested - (filesEventCount - jobSkipEventCount - self.eventsbyfile[file])
                          # remove all but the last file
                          filesEventCount = self.eventsbyfile[file]
+                         pString_tmp=''
                          if self.useParent==1:
<                             for f in parent : pString += '\\\"' + f + '\\\"\,'
<                         parString = '\\\"' + file + '\\\"\,'
>                             for f in parent : pString_tmp += f + ','
>                         pString = pString_tmp
>                         parString = file + ','
                      pass # END if
                  pass # END while (iterate over files in the block)
              pass # END while (iterate over blocks in the dataset)
+         common.logger.debug(msg)
          self.ncjobs = self.total_number_of_jobs = jobCount
          if (eventsRemaining > 0 and jobCount < totalNumberOfJobs ):
<             common.logger.message("Could not run on all requested events because some blocks not hosted at allowed sites.")
<         common.logger.message(str(jobCount)+" job(s) can run on "+str(totalEventCount)+" events.\n")
<
>             common.logger.info("Could not run on all requested events because some blocks not hosted at allowed sites.")
>         common.logger.info(str(jobCount)+" job(s) can run on "+str(totalEventCount)+" events.\n")
>
          # skip check on block with no sites DD
<         if noBboundary == 0 : self.checkBlockNoSite(blocks,jobsOfBlock)
>         if noBboundary == 0 : self.checkBlockNoSite(blocks,jobsOfBlock,blockSites)

          # prepare dict output
          dictOut = {}
+         dictOut['params']= ['InputFiles','MaxEvents','SkipEvents']
+         if self.useParent: dictOut['params']= ['InputFiles','ParentFiles','MaxEvents','SkipEvents']
          dictOut['args'] = list_of_lists
<         dictOut['jobDestination'] = self.jobDestination
>         dictOut['jobDestination'] = jobDestination
          dictOut['njobs']=self.total_number_of_jobs

          return dictOut
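For orientation, every splitting method hands back the same kind of dictionary. A sketch of what jobSplittingByEvent might return for a two-job split (all values below are invented for illustration):

dictOut = {
    'params': ['InputFiles', 'MaxEvents', 'SkipEvents'],
    'args': [['fileA.root,fileB.root', '500', '0'],
             ['fileB.root', '500', '300']],
    'jobDestination': [['se1.example.ch'], ['se1.example.ch']],
    'njobs': 2,
}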

      # keep trace of block with no sites to print a warning at the end

<     def checkBlockNoSite(self,blocks,jobsOfBlock):
>     def checkBlockNoSite(self,blocks,jobsOfBlock,blockSites):
          # screen output
          screenOutput = "List of jobs and available destination sites:\n\n"
          noSiteBlock = []
          bloskNoSite = []
+         allBlock = []

          blockCounter = 0
          for block in blocks:
              if block in jobsOfBlock.keys() :
                  blockCounter += 1
+                 allBlock.append( blockCounter )
+                 sites=self.blackWhiteListParser.checkWhiteList(self.blackWhiteListParser.checkBlackList(blockSites[block],[block]),[block])
                  screenOutput += "Block %5i: jobs %20s: sites: %s\n" % (blockCounter,spanRanges(jobsOfBlock[block]),
<                     ','.join(self.blackWhiteListParser.checkWhiteList(self.blackWhiteListParser.checkBlackList(self.blockSites[block],block),block)))
<                 if len(self.blackWhiteListParser.checkWhiteList(self.blackWhiteListParser.checkBlackList(self.blockSites[block],block),block)) == 0:
>                     ', '.join(SE2CMS(sites)))
>                 if len(sites) == 0:
                      noSiteBlock.append( spanRanges(jobsOfBlock[block]) )
                      bloskNoSite.append( blockCounter )

<         common.logger.message(screenOutput)
>         common.logger.info(screenOutput)
          if len(noSiteBlock) > 0 and len(bloskNoSite) > 0:
              msg = 'WARNING: No sites are hosting any part of data for block:\n '
              virgola = ""
                  virgola = ","
              for block in bloskNoSite:
                  msg += ' ' + str(block) + virgola
<             msg += '\n Related jobs:\n '
>             msg += '\n\t\tRelated jobs:\n '
              virgola = ""
              if len(noSiteBlock) > 1:
                  virgola = ","
              for range_jobs in noSiteBlock:
                  msg += str(range_jobs) + virgola
<             msg += '\n will not be submitted and this block of data can not be analyzed!\n'
<             if self.cfg_params.has_key('EDG.se_white_list'):
<                 msg += 'WARNING: SE White List: '+self.cfg_params['EDG.se_white_list']+'\n'
<                 msg += '(Hint: By whitelisting you force the job to run at this particular site(s).\n'
<                 msg += 'Please check if the dataset is available at this site!)\n'
<             if self.cfg_params.has_key('EDG.ce_white_list'):
<                 msg += 'WARNING: CE White List: '+self.cfg_params['EDG.ce_white_list']+'\n'
<                 msg += '(Hint: By whitelisting you force the job to run at this particular site(s).\n'
<                 msg += 'Please check if the dataset is available at this site!)\n'
>             msg += '\n\t\twill not be submitted and this block of data can not be analyzed!\n'
>             if self.cfg_params.has_key('GRID.se_white_list'):
>                 msg += '\tWARNING: SE White List: '+self.cfg_params['GRID.se_white_list']+'\n'
>                 msg += '\t(Hint: By whitelisting you force the job to run at this particular site(s).\n'
>                 msg += '\tPlease check if the dataset is available at this site!)'
>             if self.cfg_params.has_key('GRID.ce_white_list'):
>                 msg += '\tWARNING: CE White List: '+self.cfg_params['GRID.ce_white_list']+'\n'
>                 msg += '\t(Hint: By whitelisting you force the job to run at this particular site(s).\n'
>                 msg += '\tPlease check if the dataset is available at this site!)\n'
>
>             common.logger.info(msg)

<         common.logger.message(msg)
>         if bloskNoSite == allBlock:
>             raise CrabException('No jobs created')

          return


    ########################################################################
<     def jobSplittingByRun(self):
>     def jobSplittingByRun(self):
          """
          """
-         from sets import Set
-         from WMCore.JobSplitting.RunBased import RunBased
-         from WMCore.DataStructs.Workflow import Workflow
-         from WMCore.DataStructs.File import File
-         from WMCore.DataStructs.Fileset import Fileset
-         from WMCore.DataStructs.Subscription import Subscription
-         from WMCore.JobSplitting.SplitterFactory import SplitterFactory
-         from WMCore.DataStructs.Run import Run

          self.checkUserSettings()
+         blockSites = self.args['blockSites']
+         pubdata = self.args['pubdata']

          if self.selectNumberOfJobs == 0 :
              self.theNumberOfJobs = 9999999
          blocks = {}
<         runList = []
>         runList = []
          thefiles = Fileset(name='FilesToSplit')
<         fileList = self.pubdata.getListFiles()
>         fileList = pubdata.getListFiles()
          for f in fileList:
-             # print f
              block = f['Block']['Name']
<             # if not blocks.has_key(block):
<             #     blocks[block] = reader.listFileBlockLocation(block)
<             try:
<                 f['Block']['StorageElementList'].extend(self.blockSites[block])
>             try:
>                 f['Block']['StorageElementList'].extend(blockSites[block])
              except:
                  continue
              wmbsFile = File(f['LogicalFileName'])
<             [ wmbsFile['locations'].add(x) for x in self.blockSites[block] ]
>             [ wmbsFile['locations'].add(x) for x in blockSites[block] ]
              wmbsFile['block'] = block
              runNum = f['RunsList'][0]['RunNumber']
<             runList.append(runNum)
>             runList.append(runNum)
              myRun = Run(runNumber=runNum)
              wmbsFile.addRun( myRun )
              thefiles.addFile(
                  wmbsFile
                  )
<
>
          work = Workflow()
          subs = Subscription(
              fileset = thefiles,
              type = "Processing")
          splitter = SplitterFactory()
          jobfactory = splitter(subs)
<
<         #loop over all runs
>
>         #loop over all runs
          set = Set(runList)
          list_of_lists = []
          jobDestination = []
-
          count = 0
<         for i in list(set):
>         for jobGroup in jobfactory():
              if count < self.theNumberOfJobs:
<                 res = self.getJobInfo(jobfactory())
<                 parString = ''
>                 res = self.getJobInfo(jobGroup)
>                 parString = ''
                  for file in res['lfns']:
<                     parString += '\\\"' + file + '\\\"\,'
<                 fullString = parString[:-2]
<                 list_of_lists.append([fullString,str(-1),str(0)])
>                     parString += file + ','
>                 fullString = parString[:-1]
>                 list_of_lists.append([fullString,str(-1),str(0)])
                  #need to check single file location
<                 jobDestination.append(res['locations'])
>                 jobDestination.append(res['locations'])
                  count +=1
-         #print jobDestination
          # prepare dict output
          dictOut = {}
+         dictOut['params']= ['InputFiles','MaxEvents','SkipEvents']
          dictOut['args'] = list_of_lists
          dictOut['jobDestination'] = jobDestination
          dictOut['njobs']=count

      def getJobInfo( self,jobGroup ):
          res = {}
<         lfns = []
<         locations = []
>         lfns = []
>         locations = []
          tmp_check=0
          for job in jobGroup.jobs:
              for file in job.getFiles():
<                 lfns.append(file['lfn'])
>                 lfns.append(file['lfn'])
                  for loc in file['locations']:
                      if tmp_check < 1 :
                          locations.append(loc)
<                 tmp_check = tmp_check + 1
<         ### the check on the locations should go here
<         res['lfns'] = lfns
<         res['locations'] = locations
<         return res
<
>                 tmp_check = tmp_check + 1
>         ### the check on the locations should go here
>         res['lfns'] = lfns
>         res['locations'] = locations
>         return res
>
> |
|
460 |
|
######################################################################## |
461 |
< |
def jobSplittingNoInput(self): |
461 |
> |
def prepareSplittingNoInput(self): |
462 |
|
""" |
428 |
– |
Perform job splitting based on number of event per job |
463 |
|
""" |
430 |
– |
common.logger.debug(5,'Splitting per events') |
431 |
– |
|
464 |
|
if (self.selectEventsPerJob): |
465 |
< |
common.logger.message('Required '+str(self.eventsPerJob)+' events per job ') |
465 |
> |
common.logger.info('Required '+str(self.eventsPerJob)+' events per job ') |
466 |
|
if (self.selectNumberOfJobs): |
467 |
< |
common.logger.message('Required '+str(self.theNumberOfJobs)+' jobs in total ') |
467 |
> |
common.logger.info('Required '+str(self.theNumberOfJobs)+' jobs in total ') |
468 |
|
if (self.selectTotalNumberEvents): |
469 |
< |
common.logger.message('Required '+str(self.total_number_of_events)+' events in total ') |
469 |
> |
common.logger.info('Required '+str(self.total_number_of_events)+' events in total ') |
470 |
|
|
471 |
|
if (self.total_number_of_events < 0): |
472 |
|
msg='Cannot split jobs per Events with "-1" as total number of events' |
483 |
|
self.total_number_of_jobs = self.theNumberOfJobs |
484 |
|
self.eventsPerJob = int(self.total_number_of_events/self.total_number_of_jobs) |
485 |
|
|
486 |
< |
common.logger.debug(5,'N jobs '+str(self.total_number_of_jobs)) |
486 |
> |
|
487 |
> |
def jobSplittingNoInput(self): |
488 |
> |
""" |
489 |
> |
Perform job splitting based on number of event per job |
490 |
> |
""" |
491 |
> |
common.logger.debug('Splitting per events') |
492 |
> |
self.checkUserSettings() |
493 |
> |
jobDestination=[] |
494 |
> |
if ( (self.selectTotalNumberEvents + self.selectEventsPerJob + self.selectNumberOfJobs) != 2 ): |
495 |
> |
msg = 'Must define exactly two of total_number_of_events, events_per_job, or number_of_jobs.' |
496 |
> |
raise CrabException(msg) |
497 |
> |
|
498 |
> |
managedGenerators =self.args['managedGenerators'] |
499 |
> |
generator = self.args['generator'] |
500 |
> |
firstRun = self.cfg_params.get('CMSSW.first_run',None) |
501 |
> |
|
502 |
> |
self.prepareSplittingNoInput() |
503 |
> |
|
504 |
> |
common.logger.debug('N jobs '+str(self.total_number_of_jobs)) |
505 |
|
|
506 |
|
# is there any remainder? |
507 |
|
check = int(self.total_number_of_events) - (int(self.total_number_of_jobs)*self.eventsPerJob) |
508 |
|
|
509 |
< |
common.logger.debug(5,'Check '+str(check)) |
509 |
> |
common.logger.debug('Check '+str(check)) |
510 |
|
|
511 |
< |
common.logger.message(str(self.total_number_of_jobs)+' jobs can be created, each for '+str(self.eventsPerJob)+' for a total of '+str(self.total_number_of_jobs*self.eventsPerJob)+' events') |
511 |
> |
common.logger.info(str(self.total_number_of_jobs)+' jobs can be created, each for '+str(self.eventsPerJob)+' for a total of '+str(self.total_number_of_jobs*self.eventsPerJob)+' events') |
512 |
|
if check > 0: |
513 |
< |
common.logger.message('Warning: asked '+str(self.total_number_of_events)+' but can do only '+str(int(self.total_number_of_jobs)*self.eventsPerJob)) |
513 |
> |
common.logger.info('Warning: asked '+str(self.total_number_of_events)+' but can do only '+str(int(self.total_number_of_jobs)*self.eventsPerJob)) |
514 |
|
|
515 |
|
# argument is seed number.$i |
516 |
|
self.list_of_args = [] |
517 |
|
for i in range(self.total_number_of_jobs): |
518 |
|
## Since there is no input, any site is good |
519 |
< |
self.jobDestination.append([""]) #must be empty to write correctly the xml |
519 |
> |
jobDestination.append([""]) #must be empty to write correctly the xml |
520 |
|
args=[] |
521 |
< |
if (self.firstRun): |
521 |
> |
if (firstRun): |
522 |
|
## pythia first run |
523 |
< |
args.append(str(self.firstRun)+str(i)) |
524 |
< |
if (self.generator in self.managedGenerators): |
525 |
< |
if (self.generator == 'comphep' and i == 0): |
523 |
> |
args.append(str(firstRun)+str(i)) |
524 |
> |
if (generator in managedGenerators): |
525 |
> |
args.append(generator) |
526 |
> |
if (generator == 'comphep' and i == 0): |
527 |
|
# COMPHEP is brain-dead and wants event #'s like 1,100,200,300 |
528 |
|
args.append('1') |
529 |
< |
else: |
529 |
> |
else: |
530 |
|
args.append(str(i*self.eventsPerJob)) |
531 |
+ |
args.append(str(self.eventsPerJob)) |
532 |
|
self.list_of_args.append(args) |
533 |
|
# prepare dict output |
534 |
+ |
|
535 |
|
dictOut = {} |
536 |
+ |
dictOut['params'] = ['MaxEvents'] |
537 |
+ |
if (firstRun): |
538 |
+ |
dictOut['params'] = ['FirstRun','MaxEvents'] |
539 |
+ |
if ( generator in managedGenerators ) : |
540 |
+ |
dictOut['params'] = ['FirstRun', 'Generator', 'FirstEvent', 'MaxEvents'] |
541 |
+ |
else: |
542 |
+ |
if (generator in managedGenerators) : |
543 |
+ |
dictOut['params'] = ['Generator', 'FirstEvent', 'MaxEvents'] |
544 |
|
dictOut['args'] = self.list_of_args |
545 |
< |
dictOut['jobDestination'] = self.jobDestination |
545 |
> |
dictOut['jobDestination'] = jobDestination |
546 |
|
dictOut['njobs']=self.total_number_of_jobs |
547 |
|
|
548 |
|
return dictOut |
553 |
|
Perform job splitting based on number of job |
554 |
|
""" |
555 |
|
self.checkUserSettings() |
556 |
< |
if (self.selectnumberofjobs == 0): |
556 |
> |
if (self.selectNumberOfJobs == 0): |
557 |
|
msg = 'must specify number_of_jobs.' |
558 |
|
raise crabexception(msg) |
559 |
+ |
jobDestination = [] |
560 |
+ |
common.logger.debug('Splitting per job') |
561 |
+ |
common.logger.info('Required '+str(self.theNumberOfJobs)+' jobs in total ') |
562 |
|
|
563 |
< |
common.logger.debug(5,'Splitting per job') |
500 |
< |
common.logger.message('Required '+str(self.theNumberOfJobs)+' jobs in total ') |
563 |
> |
# self.total_number_of_jobs = self.theNumberOfJobs |
564 |
|
|
565 |
< |
self.total_number_of_jobs = self.theNumberOfJobs |
565 |
> |
self.prepareSplittingNoInput() |
566 |
|
|
567 |
< |
common.logger.debug(5,'N jobs '+str(self.total_number_of_jobs)) |
567 |
> |
common.logger.debug('N jobs '+str(self.total_number_of_jobs)) |
568 |
|
|
569 |
< |
common.logger.message(str(self.total_number_of_jobs)+' jobs can be created') |
569 |
> |
common.logger.info(str(self.total_number_of_jobs)+' jobs can be created') |
570 |
|
|
571 |
|
# argument is seed number.$i |
572 |
|
self.list_of_args = [] |
573 |
|
for i in range(self.total_number_of_jobs): |
574 |
< |
self.jobDestination.append([""]) |
575 |
< |
self.list_of_args.append([str(i)]) |
574 |
> |
args=[] |
575 |
> |
jobDestination.append([""]) |
576 |
> |
if self.eventsPerJob != 0 : |
577 |
> |
args.append(str(self.eventsPerJob)) |
578 |
> |
self.list_of_args.append(args) |
579 |
|
|
580 |
|
# prepare dict output |
581 |
|
dictOut = {} |
582 |
< |
dictOut['args'] = self.list_of_args |
583 |
< |
dictOut['jobDestination'] = [] |
582 |
> |
dictOut['params'] = ['MaxEvents'] |
583 |
> |
dictOut['args'] = self.list_of_args |
584 |
> |
dictOut['jobDestination'] = jobDestination |
585 |
|
dictOut['njobs']=self.total_number_of_jobs |
586 |
|
return dictOut |
520 |
– |
|
587 |
|
|
<     def jobSplittingByLumi(self):
>
>     def jobSplittingByLumi(self):
          """
+         Split task into jobs by Lumi section paying attention to which
+         lumis should be run (according to the analysis dataset).
+         This uses WMBS job splitting which does not split files over jobs
+         so the job will have AT LEAST as many lumis as requested, perhaps
+         more.
          """
<         return
>
>         common.logger.debug('Splitting by Lumi')
>         self.checkUserSettings() # FIXME need one for lumis
>
>         blockSites = self.args['blockSites']
>         pubdata = self.args['pubdata']
>
>         if self.selectNumberOfJobs == 0 :
>             self.theNumberOfJobs = 9999999
>         lumisPerFile = pubdata.getLumis()
>
>         # Make the list of WMBS files for job splitter
>         fileList = pubdata.getListFiles()
>         thefiles = Fileset(name='FilesToSplit')
>         for jobFile in fileList:
>             block = jobFile['Block']['Name']
>             try:
>                 jobFile['Block']['StorageElementList'].extend(blockSites[block])
>             except:
>                 continue
>             wmbsFile = File(jobFile['LogicalFileName'])
>             [ wmbsFile['locations'].add(x) for x in blockSites[block] ]
>             wmbsFile['block'] = block
>             for lumi in lumisPerFile[jobFile['LogicalFileName']]:
>                 wmbsFile.addRun(Run(lumi[0], lumi[1]))
>             thefiles.addFile(wmbsFile)
>
>         work = Workflow()
>         subs = Subscription(fileset = thefiles, workflow = work,
>                             split_algo = 'LumiBased', type = "Processing")
>         splitter = SplitterFactory()
>         jobFactory = splitter(subs)
>
>         list_of_lists = []
>         jobDestination = []
>         jobCount = 0
>         self.theNumberOfJobs = 20 #FIXME
>         for jobGroup in jobFactory(lumis_per_job = 50): #FIXME
>             for job in jobGroup.jobs:
>                 if jobCount < self.theNumberOfJobs:
>                     lumis = []
>                     lfns = []
>                     locations = []
>                     firstFile = True
>                     # Collect information from all the files
>                     for jobFile in job.getFiles():
>                         if firstFile: # Get locations from first file in the job
>                             for loc in jobFile['locations']:
>                                 locations.append(loc)
>                             firstFile = False
>                         # Accumulate Lumis from all files
>                         for lumiList in jobFile['runs']:
>                             theRun = lumiList.run
>                             for theLumi in list(lumiList):
>                                 lumis.append( (theRun, theLumi) )
>
>                         lfns.append(jobFile['lfn'])
>                     fileString = ','.join(lfns)
>                     lumiString = compressLumiString(lumis)
>                     common.logger.debug('Job %s will run on %s files and %s lumis '
>                                         % (jobCount, len(lfns), len(lumis) ))
>                     list_of_lists.append([fileString, str(-1), str(0), lumiString])
>
>                     jobDestination.append(locations)
>                     jobCount += 1
>         # Prepare dict output matching back to non-WMBS job creation
>         dictOut = {}
>         dictOut['params'] = ['InputFiles', 'MaxEvents', 'SkipEvents', 'Lumis']
>         dictOut['args'] = list_of_lists
>         dictOut['jobDestination'] = jobDestination
>         dictOut['njobs'] = jobCount
>
>         return dictOut
>
>
      def Algos(self):
          """
          Define key splittingType matrix
          """
<         SplitAlogs = {
<                      'EventBased' : self.jobSplittingByEvent,
>         SplitAlogs = {
>                      'EventBased' : self.jobSplittingByEvent,
                       'RunBased' : self.jobSplittingByRun,
<                      'LumiBased' : self.jobSplittingByLumi,
<                      'NoInput' : self.jobSplittingNoInput,
>                      'LumiBased' : self.jobSplittingByLumi,
>                      'NoInput' : self.jobSplittingNoInput,
                       'ForScript' : self.jobSplittingForScript
<                      }
>                      }
          return SplitAlogs
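The matrix returned by Algos() is what the rest of CRAB uses to pick the splitting routine; a rough dispatch sketch (the splitting key and the way cfg_params and args are prepared are illustrative, not taken from the diff):

splitter = JobSplitter(cfg_params, args)
algos = splitter.Algos()
splittingType = 'EventBased'         # e.g. derived from the user's crab.cfg
dictOut = algos[splittingType]()     # -> {'params': ..., 'args': ..., 'jobDestination': ..., 'njobs': ...}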

+
+
+ def compressLumiString(lumis):
+     """
+     Turn a list of 2-tuples of run/lumi numbers into a list of the format
+     R1:L1,R2:L2-R3:L3 which is acceptable to CMSSW LumiBlockRange variable
+     """
+
+     lumis.sort()
+     parts = []
+     startRange = None
+     endRange = None
+
+     for lumiBlock in lumis:
+         if not startRange: # This is the first one
+             startRange = lumiBlock
+             endRange = lumiBlock
+         elif lumiBlock == endRange: # Same Lumi (different files?)
+             pass
+         elif lumiBlock[0] == endRange[0] and lumiBlock[1] == endRange[1] + 1: # This is a continuation
+             endRange = lumiBlock
+         else: # This is the start of a new range
+             part = ':'.join(map(str, startRange))
+             if startRange != endRange:
+                 part += '-' + ':'.join(map(str, endRange))
+             parts.append(part)
+             startRange = lumiBlock
+             endRange = lumiBlock
+
+     # Put out what's left
+     if startRange:
+         part = ':'.join(map(str, startRange))
+         if startRange != endRange:
+             part += '-' + ':'.join(map(str, endRange))
+         parts.append(part)
+
+     output = ','.join(parts)
+     return output
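A quick worked example of the format compressLumiString produces (the input values are invented for illustration):

lumis = [(1, 1), (1, 2), (1, 3), (1, 5), (2, 10)]
result = compressLumiString(lumis)
# consecutive lumis within the same run collapse into a range:
# result == '1:1-1:3,1:5,2:10'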