CRAB/python/DataDiscovery.py

#!/usr/bin/env python

__revision__ = "$Id: DataDiscovery.py,v 1.49 2010/08/30 10:36:33 ewv Exp $"
__version__ = "$Revision: 1.49 $"

import exceptions
import DBSAPI.dbsApi
from DBSAPI.dbsApiException import *
import common
from crab_util import *
try: # Can remove when CMSSW 3.7 and earlier are dropped
    from FWCore.PythonUtilities.LumiList import LumiList
except ImportError:
    from LumiList import LumiList

import os


class DBSError(Exception):
    """
    Error reported while talking to DBS.

    The exception carries a single formatted message that combines the
    error name and the error message given to the constructor.
    """
    def __init__(self, errorName, errorMessage):
        """
        errorName    -- short name of the DBS error
        errorMessage -- detailed description of the error
        """
        # The builtin Exception is the very same class as
        # exceptions.Exception, so no extra module is needed here.
        message = '\nERROR DBS %s : %s \n' % (errorName, errorMessage)
        Exception.__init__(self, message)

    def getErrorMessage(self):
        """ Return error message """
        # args is the 1-tuple set by __init__; format its only element
        # explicitly instead of relying on tuple-style % formatting.
        return "%s" % (self.args[0],)


class DBSInvalidDataTierError(Exception):
    """
    Error raised for an invalid data tier in a DBS request.

    The exception carries a single formatted message that combines the
    error name and the error message given to the constructor.
    """
    def __init__(self, errorName, errorMessage):
        """
        errorName    -- short name of the DBS error
        errorMessage -- detailed description of the error
        """
        # The builtin Exception is the very same class as
        # exceptions.Exception, so no extra module is needed here.
        message = '\nERROR DBS %s : %s \n' % (errorName, errorMessage)
        Exception.__init__(self, message)

    def getErrorMessage(self):
        """ Return error message """
        # args is the 1-tuple set by __init__; format its only element
        # explicitly instead of relying on tuple-style % formatting.
        return "%s" % (self.args[0],)


class DBSInfoError:
    """
    Reports a failure to access a DBS url.

    Creating an instance prints the offending url on standard output;
    nothing is stored on the instance.
    """
    def __init__(self, url):
        notice = '\nERROR accessing DBS url : ' + url + '\n'
        # A parenthesised single expression prints exactly like the bare
        # Python 2 print statement.
        print(notice)


class DataDiscoveryError(Exception):
    """
    Generic failure of the data discovery step.
    """
    def __init__(self, errorMessage):
        """
        errorMessage -- description of the failure (a string, or another
                        exception whose text should be reported)
        """
        # Do NOT assign errorMessage to self.args directly: the args
        # setter converts any sequence to a tuple, so a string would be
        # split into single characters and the message would be lost.
        Exception.__init__(self, errorMessage)

    def getErrorMessage(self):
        """ Return exception error """
        return "%s" % (self.args[0],)


class NotExistingDatasetError(Exception):
    """
    Raised when no data can be found for the requested dataset.
    """
    def __init__(self, errorMessage):
        """
        errorMessage -- description of the failure
        """
        # Do NOT assign errorMessage to self.args directly: the args
        # setter converts any sequence to a tuple, so a string would be
        # split into single characters and the message would be lost.
        Exception.__init__(self, errorMessage)

    def getErrorMessage(self):
        """ Return exception error """
        return "%s" % (self.args[0],)


class NoDataTierinProvenanceError(Exception):
    """
    Raised when a data tier is missing from the dataset provenance.
    """
    def __init__(self, errorMessage):
        """
        errorMessage -- description of the failure
        """
        # Do NOT assign errorMessage to self.args directly: the args
        # setter converts any sequence to a tuple, so a string would be
        # split into single characters and the message would be lost.
        Exception.__init__(self, errorMessage)

    def getErrorMessage(self):
        """ Return exception error """
        return "%s" % (self.args[0],)


class DataDiscovery:
    """
    Class to find and extract info from published data
    """
    def __init__(self, datasetPath, cfg_params, skipAnBlocks):

        #       Attributes
        self.datasetPath = datasetPath
        # Analysis dataset is primary/processed/tier/definition
        self.ads = len(self.datasetPath.split("/")) > 4
        self.cfg_params = cfg_params
        self.skipBlocks = skipAnBlocks

        self.eventsPerBlock = {}  # DBS output: map fileblocks-events for collection
        self.eventsPerFile = {}   # DBS output: map files-events
#         self.lumisPerBlock = {}   # DBS output: number of lumis in each block
#         self.lumisPerFile = {}    # DBS output: number of lumis in each file
        self.blocksinfo = {}      # DBS output: map fileblocks-files
        self.maxEvents = 0        # DBS output: max events
        self.maxLumis = 0         # DBS output: total number of lumis
        self.parent = {}          # DBS output: parents of each file
        self.lumis = {}           # DBS output: lumis in each file
        self.lumiMask = None
        self.splitByLumi = False
        self.splitDataByEvent = 0

    def fetchDBSInfo(self):
        """
        Contact DBS and fill the per-block and per-file maps.

        Reads the CMSSW.* options from self.cfg_params, retrieves the file
        records through queryDbs() and fills self.files, self.parent,
        self.lumis, self.eventsPerBlock, self.eventsPerFile,
        self.blocksinfo, self.maxEvents and self.maxLumis.

        Raises CrabException when split_by_run is requested without a
        usable runselection, when a dataset of type 'data' is not split by
        run, lumi or event, or when skipBlocks is set and no new file
        block is left.
        Raises NotExistingDatasetError when DBS returns no primary dataset
        for the path or when no file with events is found.
        """
        ## get DBS URL
        global_url="http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
        dbs_url = self.cfg_params.get('CMSSW.dbs_url', global_url)
        common.logger.info("Accessing DBS at: "+dbs_url)

        ## check if runs are selected
        runselection = []
        if 'CMSSW.runselection' in self.cfg_params:
            runselection = parseRange2(self.cfg_params['CMSSW.runselection'])
            # protect against a huge runselection range: it is dropped
            # rather than expanded into per-run work
            if len(runselection) > 1000000:
                common.logger.info("ERROR: runselection range has more then 1M numbers")
                common.logger.info("ERROR: Too large. runselection is ignored")
                runselection = []

        ## check if various lumi parameters are set
        self.lumiMask = self.cfg_params.get('CMSSW.lumi_mask',None)
        self.lumiParams = self.cfg_params.get('CMSSW.total_number_of_lumis',None) or \
                          self.cfg_params.get('CMSSW.lumis_per_job',None)

        # Filters are only built (and only used below) when the matching
        # option is set.
        lumiList = None
        runList = None
        if self.lumiMask:
            lumiList = LumiList(filename=self.lumiMask)
        if runselection:
            runList = LumiList(runs = runselection)

        self.splitByRun = int(self.cfg_params.get('CMSSW.split_by_run', 0))
        self.splitDataByEvent = int(self.cfg_params.get('CMSSW.split_by_event', 0))
        common.logger.log(10-1,"runselection is: %s"%runselection)

        # splitByLumi keeps its previous value when splitting by run
        if not self.splitByRun:
            self.splitByLumi = self.lumiMask or self.lumiParams or self.ads

        if self.splitByRun and not runselection:
            msg = "Error: split_by_run must be combined with a runselection"
            raise CrabException(msg)

        ## service API
        args = {}
        args['url']     = dbs_url
        args['level']   = 'CRITICAL'

        ## check if has been requested to use the parent info
        useparent = int(self.cfg_params.get('CMSSW.use_parent',0))

        ## check if has been asked for a non default file to store/read analyzed fileBlocks
        defaultName = common.work_space.shareDir()+'AnalyzedBlocks.txt'
        fileBlocks_FileName = os.path.abspath(self.cfg_params.get('CMSSW.fileblocks_file',defaultName))

        api = DBSAPI.dbsApi.DbsApi(args)
        self.files = self.queryDbs(api,path=self.datasetPath,runselection=runselection,useParent=useparent)

        # Check to see what the dataset is
        pdsName = self.datasetPath.split("/")[1]
        primDSs = api.listPrimaryDatasets(pdsName)
        if not primDSs:
            # Without this guard an empty answer surfaces as a bare
            # IndexError instead of a message the user can act on.
            raise NotExistingDatasetError(("\nNo data for %s in DBS\nPlease check"
                                            + " dataset path variables in crab.cfg")
                                            % self.datasetPath)
        dataType = primDSs[0]['Type']
        common.logger.debug("Datatype is %s" % dataType)
        if dataType == 'data' and not \
            (self.splitByRun or self.splitByLumi or self.splitDataByEvent):
            msg = 'Data must be split by lumi or by run. ' \
                  'Please see crab -help for the correct settings'
            raise  CrabException(msg)

        anFileBlocks = []
        if self.skipBlocks: anFileBlocks = readTXTfile(self, fileBlocks_FileName)

        # parse files and fill arrays
        for dbsFile in self.files:
            fileblock = dbsFile['Block']['Name']
            # skip already analyzed blocks
            if fileblock in anFileBlocks:
                continue

            filename = dbsFile['LogicalFileName']

            # parents are only retrieved when explicitly requested
            parList = []
            if useparent == 1:
                parList = [x['LogicalFileName'] for x in dbsFile['ParentList']]
            self.parent[filename] = parList

            fileLumis = [] # List of (run, lumi section) tuples
            if self.splitByLumi:
                fileLumis = [ (x['RunNumber'], x['LumiSectionNumber'])
                              for x in dbsFile['LumiList'] ]

            # For LumiMask, intersection of two lists.
            if self.lumiMask and runselection:
                self.lumis[filename] = runList.filterLumis(lumiList.filterLumis(fileLumis))
            elif runselection:
                self.lumis[filename] = runList.filterLumis(fileLumis)
            elif self.lumiMask:
                self.lumis[filename] = lumiList.filterLumis(fileLumis)
            else:
                self.lumis[filename] = fileLumis

            # files whose name contains '.dat' are not counted
            if '.dat' in filename:
                continue

            events = dbsFile['NumberOfEvents']

            # Count number of events per block (direct dict membership:
            # no temporary key list is built for every file)
            if fileblock in self.eventsPerBlock:
                self.eventsPerBlock[fileblock] += events
            else:
                self.eventsPerBlock[fileblock] = events

            # Number of events per file
            self.eventsPerFile[filename] = events

            # List of files per block
            if fileblock in self.blocksinfo:
                self.blocksinfo[fileblock].append(filename)
            else:
                self.blocksinfo[fileblock] = [filename]

            # total number of events and lumis
            self.maxEvents += events
            self.maxLumis  += len(self.lumis[filename])

        if self.skipBlocks and not self.eventsPerBlock:
            msg = "No new fileblocks available for dataset: "+str(self.datasetPath)
            raise  CrabException(msg)

        if not self.eventsPerBlock:
            raise NotExistingDatasetError(("\nNo data for %s in DBS\nPlease check"
                                            + " dataset path variables in crab.cfg")
                                            % self.datasetPath)


    def queryDbs(self,api,path=None,runselection=None,useParent=None):


        allowedRetriveValue = []
        if self.splitByLumi or self.splitByRun or useParent == 1:
            allowedRetriveValue.extend(['retrive_block', 'retrive_run'])
        if self.splitByLumi:
            allowedRetriveValue.append('retrive_lumi')
        if useParent == 1:
            allowedRetriveValue.append('retrive_parent')
        common.logger.debug("Set of input parameters used for DBS query: %s" % allowedRetriveValue)
        try:
            if self.splitByRun:
                files = []
                for arun in runselection:
                    try:
                        if self.ads:
                            filesinrun = api.listFiles(analysisDataset=path,retriveList=allowedRetriveValue,runNumber=arun)
                        else:
                            filesinrun = api.listFiles(path=path,retriveList=allowedRetriveValue,runNumber=arun)
                        files.extend(filesinrun)
                    except:
                        msg="WARNING: problem extracting info from DBS for run %s "%arun
                        common.logger.info(msg)
                        pass

            else:
                if allowedRetriveValue:
                    if self.ads:
                        files = api.listFiles(analysisDataset=path, retriveList=allowedRetriveValue)
                    else :
                        files = api.listFiles(path=path, retriveList=allowedRetriveValue)
                else:
                    files = api.listDatasetFiles(self.datasetPath)

        except DbsBadRequest, msg:
            raise DataDiscoveryError(msg)
        except DBSError, msg:
            raise DataDiscoveryError(msg)

        return files


    def getMaxEvents(self):
        """
        Return the total number of events.

        This is the sum of 'NumberOfEvents' over the files counted by
        fetchDBSInfo; it is 0 until fetchDBSInfo has run.
        """
        return self.maxEvents


    def getMaxLumis(self):
        """
        Return the number of lumis in the dataset.

        This is the sum, over the files counted by fetchDBSInfo, of the
        length of each file's entry in self.lumis (i.e. after the run
        selection and lumi mask were applied); it is 0 until
        fetchDBSInfo has run.
        """
        return self.maxLumis


    def getEventsPerBlock(self):
        """
        Return the number of events per file block.

        The result is the dictionary filled by fetchDBSInfo: file block
        name -> summed number of events of the counted files in it.
        """
        return self.eventsPerBlock


    def getEventsPerFile(self):
        """
        Return the number of events per file.

        The result is the dictionary filled by fetchDBSInfo: logical
        file name -> number of events of that file.
        """
        return self.eventsPerFile


    def getFiles(self):
        """
        Return the files grouped by file block.

        The result is the dictionary filled by fetchDBSInfo: file block
        name -> list of logical file names in that block.
        """
        return self.blocksinfo


    def getParent(self):
        """
        Return the parent files grouped by file.

        The result is the dictionary filled by fetchDBSInfo: logical
        file name -> list of parent logical file names. The list is
        empty unless CMSSW.use_parent was set to 1.
        """
        return self.parent


    def getLumis(self):
        """
        Return the lumi sections grouped by file.

        The result is the dictionary filled by fetchDBSInfo: logical
        file name -> lumis of that file. Without run selection and lumi
        mask each entry is a list of (run, lumi section) tuples;
        otherwise it is whatever LumiList.filterLumis returns for that
        list (presumably the matching subset -- confirm against LumiList).
        """
        return self.lumis


    def getListFiles(self):
        """
        Return the raw file records of the DBS query.

        This is the value returned by queryDbs() and stored in
        self.files by fetchDBSInfo; the attribute is only assigned
        there.
        """
        return self.files
Revision:	1.50
Committed:	Thu Sep 12 13:45:22 2013 UTC (11 years, 7 months ago) by belforte
Content type:	text/x-python
Branch:	MAIN
CVS Tags:	CRAB_2_9_1, CRAB_2_9_1_pre2, HEAD
Changes since 1.49:	+6 -2 lines
Error occurred while calculating annotation data.
Log Message:	protect against huge runselection range: https://savannah.cern.ch/bugs/index.php?95734
#	Content
1	#!/usr/bin/env python
2
3	__revision__ = "$Id: DataDiscovery.py,v 1.49 2010/08/30 10:36:33 ewv Exp $"
4	__version__ = "$Revision: 1.49 $"
5
6	import exceptions
7	import DBSAPI.dbsApi
8	from DBSAPI.dbsApiException import *
9	import common
10	from crab_util import *
11	try: # Can remove when CMSSW 3.7 and earlier are dropped
12	from FWCore.PythonUtilities.LumiList import LumiList
13	except ImportError:
14	from LumiList import LumiList
15
16	import os
17
18
19
20	class DBSError(exceptions.Exception):
21	def __init__(self, errorName, errorMessage):
22	args='\nERROR DBS %s : %s \n'%(errorName,errorMessage)
23	exceptions.Exception.__init__(self, args)
24	pass
25
26	def getErrorMessage(self):
27	""" Return error message """
28	return "%s" % (self.args)
29
30
31
32	class DBSInvalidDataTierError(exceptions.Exception):
33	def __init__(self, errorName, errorMessage):
34	args='\nERROR DBS %s : %s \n'%(errorName,errorMessage)
35	exceptions.Exception.__init__(self, args)
36	pass
37
38	def getErrorMessage(self):
39	""" Return error message """
40	return "%s" % (self.args)
41
42
43
44	class DBSInfoError:
45	def __init__(self, url):
46	print '\nERROR accessing DBS url : '+url+'\n'
47	pass
48
49
50
51	class DataDiscoveryError(exceptions.Exception):
52	def __init__(self, errorMessage):
53	self.args=errorMessage
54	exceptions.Exception.__init__(self, self.args)
55	pass
56
57	def getErrorMessage(self):
58	""" Return exception error """
59	return "%s" % (self.args)
60
61
62
63	class NotExistingDatasetError(exceptions.Exception):
64	def __init__(self, errorMessage):
65	self.args=errorMessage
66	exceptions.Exception.__init__(self, self.args)
67	pass
68
69	def getErrorMessage(self):
70	""" Return exception error """
71	return "%s" % (self.args)
72
73
74
75	class NoDataTierinProvenanceError(exceptions.Exception):
76	def __init__(self, errorMessage):
77	self.args=errorMessage
78	exceptions.Exception.__init__(self, self.args)
79	pass
80
81	def getErrorMessage(self):
82	""" Return exception error """
83	return "%s" % (self.args)
84
85
86
87	class DataDiscovery:
88	"""
89	Class to find and extact info from published data
90	"""
91	def __init__(self, datasetPath, cfg_params, skipAnBlocks):
92
93	# Attributes
94	self.datasetPath = datasetPath
95	# Analysis dataset is primary/processed/tier/definition
96	self.ads = len(self.datasetPath.split("/")) > 4
97	self.cfg_params = cfg_params
98	self.skipBlocks = skipAnBlocks
99
100	self.eventsPerBlock = {} # DBS output: map fileblocks-events for collection
101	self.eventsPerFile = {} # DBS output: map files-events
102	# self.lumisPerBlock = {} # DBS output: number of lumis in each block
103	# self.lumisPerFile = {} # DBS output: number of lumis in each file
104	self.blocksinfo = {} # DBS output: map fileblocks-files
105	self.maxEvents = 0 # DBS output: max events
106	self.maxLumis = 0 # DBS output: total number of lumis
107	self.parent = {} # DBS output: parents of each file
108	self.lumis = {} # DBS output: lumis in each file
109	self.lumiMask = None
110	self.splitByLumi = False
111	self.splitDataByEvent = 0
112
113	def fetchDBSInfo(self):
114	"""
115	Contact DBS
116	"""
117	## get DBS URL
118	global_url="http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
119	dbs_url= self.cfg_params.get('CMSSW.dbs_url', global_url)
120	common.logger.info("Accessing DBS at: "+dbs_url)
121
122	## check if runs are selected
123	runselection = []
124	if (self.cfg_params.has_key('CMSSW.runselection')):
125	runselection = parseRange2(self.cfg_params['CMSSW.runselection'])
126	if len(runselection)>1000000:
127	common.logger.info("ERROR: runselection range has more then 1M numbers")
128	common.logger.info("ERROR: Too large. runselection is ignored")
129	runselection=[]
130
131	## check if various lumi parameters are set
132	self.lumiMask = self.cfg_params.get('CMSSW.lumi_mask',None)
133	self.lumiParams = self.cfg_params.get('CMSSW.total_number_of_lumis',None) or \
134	self.cfg_params.get('CMSSW.lumis_per_job',None)
135
136	lumiList = None
137	if self.lumiMask:
138	lumiList = LumiList(filename=self.lumiMask)
139	if runselection:
140	runList = LumiList(runs = runselection)
141
142	self.splitByRun = int(self.cfg_params.get('CMSSW.split_by_run', 0))
143	self.splitDataByEvent = int(self.cfg_params.get('CMSSW.split_by_event', 0))
144	common.logger.log(10-1,"runselection is: %s"%runselection)
145
146	if not self.splitByRun:
147	self.splitByLumi = self.lumiMask or self.lumiParams or self.ads
148
149	if self.splitByRun and not runselection:
150	msg = "Error: split_by_run must be combined with a runselection"
151	raise CrabException(msg)
152
153	## service API
154	args = {}
155	args['url'] = dbs_url
156	args['level'] = 'CRITICAL'
157
158	## check if has been requested to use the parent info
159	useparent = int(self.cfg_params.get('CMSSW.use_parent',0))
160
161	## check if has been asked for a non default file to store/read analyzed fileBlocks
162	defaultName = common.work_space.shareDir()+'AnalyzedBlocks.txt'
163	fileBlocks_FileName = os.path.abspath(self.cfg_params.get('CMSSW.fileblocks_file',defaultName))
164
165	api = DBSAPI.dbsApi.DbsApi(args)
166	self.files = self.queryDbs(api,path=self.datasetPath,runselection=runselection,useParent=useparent)
167
168	# Check to see what the dataset is
169	pdsName = self.datasetPath.split("/")[1]
170	primDSs = api.listPrimaryDatasets(pdsName)
171	dataType = primDSs[0]['Type']
172	common.logger.debug("Datatype is %s" % dataType)
173	if dataType == 'data' and not \
174	(self.splitByRun or self.splitByLumi or self.splitDataByEvent):
175	msg = 'Data must be split by lumi or by run. ' \
176	'Please see crab -help for the correct settings'
177	raise CrabException(msg)
178
179
180
181	anFileBlocks = []
182	if self.skipBlocks: anFileBlocks = readTXTfile(self, fileBlocks_FileName)
183
184	# parse files and fill arrays
185	for file in self.files :
186	parList = []
187	fileLumis = [] # List of tuples
188	# skip already analyzed blocks
189	fileblock = file['Block']['Name']
190	if fileblock not in anFileBlocks :
191	filename = file['LogicalFileName']
192	# asked retry the list of parent for the given child
193	if useparent==1:
194	parList = [x['LogicalFileName'] for x in file['ParentList']]
195	if self.splitByLumi:
196	fileLumis = [ (x['RunNumber'], x['LumiSectionNumber'])
197	for x in file['LumiList'] ]
198	self.parent[filename] = parList
199	# For LumiMask, intersection of two lists.
200	if self.lumiMask and runselection:
201	self.lumis[filename] = runList.filterLumis(lumiList.filterLumis(fileLumis))
202	elif runselection:
203	self.lumis[filename] = runList.filterLumis(fileLumis)
204	elif self.lumiMask:
205	self.lumis[filename] = lumiList.filterLumis(fileLumis)
206	else:
207	self.lumis[filename] = fileLumis
208	if filename.find('.dat') < 0 :
209	events = file['NumberOfEvents']
210	# Count number of events and lumis per block
211	if fileblock in self.eventsPerBlock.keys() :
212	self.eventsPerBlock[fileblock] += events
213	else :
214	self.eventsPerBlock[fileblock] = events
215	# Number of events per file
216	self.eventsPerFile[filename] = events
217
218	# List of files per block
219	if fileblock in self.blocksinfo.keys() :
220	self.blocksinfo[fileblock].append(filename)
221	else :
222	self.blocksinfo[fileblock] = [filename]
223
224	# total number of events
225	self.maxEvents += events
226	self.maxLumis += len(self.lumis[filename])
227
228	if self.skipBlocks and len(self.eventsPerBlock.keys()) == 0:
229	msg = "No new fileblocks available for dataset: "+str(self.datasetPath)
230	raise CrabException(msg)
231
232
233	if len(self.eventsPerBlock) <= 0:
234	raise NotExistingDatasetError(("\nNo data for %s in DBS\nPlease check"
235	+ " dataset path variables in crab.cfg")
236	% self.datasetPath)
237
238
239	def queryDbs(self,api,path=None,runselection=None,useParent=None):
240
241
242	allowedRetriveValue = []
243	if self.splitByLumi or self.splitByRun or useParent == 1:
244	allowedRetriveValue.extend(['retrive_block', 'retrive_run'])
245	if self.splitByLumi:
246	allowedRetriveValue.append('retrive_lumi')
247	if useParent == 1:
248	allowedRetriveValue.append('retrive_parent')
249	common.logger.debug("Set of input parameters used for DBS query: %s" % allowedRetriveValue)
250	try:
251	if self.splitByRun:
252	files = []
253	for arun in runselection:
254	try:
255	if self.ads:
256	filesinrun = api.listFiles(analysisDataset=path,retriveList=allowedRetriveValue,runNumber=arun)
257	else:
258	filesinrun = api.listFiles(path=path,retriveList=allowedRetriveValue,runNumber=arun)
259	files.extend(filesinrun)
260	except:
261	msg="WARNING: problem extracting info from DBS for run %s "%arun
262	common.logger.info(msg)
263	pass
264
265	else:
266	if allowedRetriveValue:
267	if self.ads:
268	files = api.listFiles(analysisDataset=path, retriveList=allowedRetriveValue)
269	else :
270	files = api.listFiles(path=path, retriveList=allowedRetriveValue)
271	else:
272	files = api.listDatasetFiles(self.datasetPath)
273
274	except DbsBadRequest, msg:
275	raise DataDiscoveryError(msg)
276	except DBSError, msg:
277	raise DataDiscoveryError(msg)
278
279	return files
280
281
282	def getMaxEvents(self):
283	"""
284	max events
285	"""
286	return self.maxEvents
287
288
289	def getMaxLumis(self):
290	"""
291	Return the number of lumis in the dataset
292	"""
293	return self.maxLumis
294
295
296	def getEventsPerBlock(self):
297	"""
298	list the event collections structure by fileblock
299	"""
300	return self.eventsPerBlock
301
302
303	def getEventsPerFile(self):
304	"""
305	list the event collections structure by file
306	"""
307	return self.eventsPerFile
308
309
310	def getFiles(self):
311	"""
312	return files grouped by fileblock
313	"""
314	return self.blocksinfo
315
316
317	def getParent(self):
318	"""
319	return parent grouped by file
320	"""
321	return self.parent
322
323
324	def getLumis(self):
325	"""
326	return lumi sections grouped by file
327	"""
328	return self.lumis
329
330
331	def getListFiles(self):
332	"""
333	return parent grouped by file
334	"""
335	return self.files