CRAB/python/PubDB.py

#!/usr/bin/env python2
import sys, os, string, re
import urllib, urllister
import urllib2
import common
from RefDBInfo import RefDBInfo

# ####################################
class PubDBError(Exception):
    """
    Raised when a PubDB site cannot be contacted.

    Building the error prints the message on stdout, exactly as before;
    the instance is now also a real exception that carries a readable
    message (str(err)) and the offending URL (err.url).
    """
    def __init__(self, url):
        message = 'ERROR accessing PubDB at '+url
        Exception.__init__(self, message)
        # URL that could not be accessed, kept for the callers
        self.url = url
        # Same bytes the former print statement produced (leading newline,
        # message, blank line); sys.stdout.write does not depend on print
        # being a statement.
        sys.stdout.write('\n'+message+'\n\n')
 
# ####################################
class RefDBmapError(Exception):
    """
    Raised when the RefDB-PubDBs map cannot be contacted.

    Building the error prints the message on stdout, exactly as before;
    the instance is now also a real exception that carries a readable
    message (str(err)) and the offending URL (err.url).
    """
    def __init__(self, url):
        message = 'ERROR accessing RefDB-PubDBs map at '+url
        Exception.__init__(self, message)
        # URL that could not be accessed, kept for the callers
        self.url = url
        # Same bytes the former print statement produced (leading newline,
        # message, blank line); sys.stdout.write does not depend on print
        # being a statement.
        sys.stdout.write('\n'+message+'\n\n')

# ####################################
class NoPHPError(Exception):
    """
    Raised when the analysis PHP script of a PubDB cannot be used.

    Building the error prints the message on stdout, exactly as before;
    the instance is now also a real exception that carries a readable
    message (str(err)) and the offending URL (err.url).
    """
    def __init__(self, url):
        Exception.__init__(self, 'ERROR accessing PHP: %s' % (url,))
        # URL that could not be accessed, kept for the callers
        self.url = url
        # The former statement was: print 'ERROR accessing PHP: ',url,' \n'
        # i.e. the three items separated by single blanks plus the final
        # newline added by print; the line below writes the same bytes.
        sys.stdout.write('ERROR accessing PHP:  %s  \n\n' % (url,))
  
# ####################################
class pubDBResult:
    def __init__(self,
                 contents):
        self.contents=contents

    
    def dump(self):
        print 'Contents : ',self.contents
        pass

# ####################################
# class to access PubDBs
class PubDB:
    def __init__(self, owner, dataset, dataTiers, cfg_params):

#       Attributes
        self.owner = owner
        self.dataset = dataset
        self.dataTiers = dataTiers
        self.cfg_params = cfg_params
    
        self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
        self.RefDBphp_ = 'PubDB/GetIdCollection.php'
        self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'

        self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
        self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'

        self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
        self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'
    
##      link to the modified RefDB-PubDBs map script that allow the display option
        self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'

#       Costructor procedures

        CEBlackList = []
        try:
            tmpBad = string.split(self.cfg_params['USER.ce_black_list'],',')
            #tmpBad = ['fnal']
            for tmp in tmpBad:
                tmp=string.strip(tmp)
                CEBlackList.append(tmp)
        except KeyError:
            pass
        print 'CEBlackList: ',CEBlackList
        self.reCEBlackList=[]
        for bad in CEBlackList:
            self.reCEBlackList.append(re.compile( bad ))
        #print 'ReBad: ',self.reCEBlackList


########################################################################
    def findAllCollections(self):
        """
        Contact RefDB and find the CollID of all the user required collections

        Returns the list of collection IDs: the first collection reported by
        RefDB (logged as the primary one) followed by every further
        collection whose data tier is among self.dataTiers.
        Returns an empty list, after logging an error, when one of the
        requested data tiers is not reported by RefDB at all.
        Exits the process with status 10 if the RefDB query fails.
        """
        ## download from RefDB all the info about the given dataset-owner
        refdb=RefDBInfo(self.owner,self.dataset)
        try:
            collInfos=refdb.GetRefDBInfo()
        except :
            # Bare except kept on purpose: the errors raised by RefDBInfo may
            # not derive from Exception (TODO confirm in RefDBInfo), so a
            # narrower clause could let them escape.
            sys.exit(10)

        first=1
        NeededCollID=[]
        refdbdataTiers=[]
        for coll in collInfos:
            # element 0 is used as the collection ID, element 2 as its data tier
            collid=coll[0]
            tier=coll[2]
            refdbdataTiers.append(tier)
            if first:
                ## the first collection is always taken: it is the primary one
                NeededCollID.append(collid)
                common.logger.message("\n --> primary collection for owner "+self.owner+" is: ID="+collid+" DataTier="+tier)
                first=0
            elif self.dataTiers.count(tier):
                ## select only the parent collections corresponding to data-tiers requested by the user
                NeededCollID.append(collid)
                common.logger.message(" --> further collection required: ID="+collid+" DataTier="+tier)

        ## check that the user asks for Data Tiers really existing in RefDB, otherwise log an error
        for dt in self.dataTiers:
            if dt not in refdbdataTiers:
                # The message used to be assembled as a tuple and then added
                # to strings and to the dataTiers list, which raised TypeError
                # before anything could be logged.
                msg = "ERROR: Data Tier ( =>%s<= ) not existing for dataset/owner %s/%s!\n" \
                      % (dt, self.dataset, self.owner)
                msg = msg + "Check the data_tier variable in crab.cfg\n"
                msg = msg + 'Owner Dataset not published with asked dataTiers! '+\
                      self.owner+' '+self.dataset+' '+str(self.dataTiers)
                common.logger.message(msg)
                return []

        return NeededCollID
  
########################################################################
    def findPubDBsbyCollID(self,CollID):
        """
        Find the list of PubDB URLs having a given Collection

        CollID is appended as text to the query URL of the RefDB-PubDBs map.
        Returns the URLs found in the answer, without duplicates (the order
        is the one given by uniquelist).
        Raises RefDBmapError if the map cannot be opened.
        """
        ### contact the RefDB-PubDBs map to discover where the given CollID is
        url = self.PubDBCentralUrl_+self.RefDBPubDBsmapPhp_+'&CollID=' + CollID
        try:
            f = urllib.urlopen(url)
        except IOError:
            raise RefDBmapError(url)

        ### search for the PubDBURL string
        reURLLine=re.compile( r'PubDBURL=(\S*)' )

        PubDBURLs = []
        try:
            for line in f.readlines():
                found = reURLLine.search(line)
                if found:
                    # Keep the text between 'PubDBURL=' and the next '=' (if
                    # any), as the original code did.
                    # NOTE(review): the cut at a later '=' may be relied upon
                    # by findPubDBs to compare URLs across collections --
                    # confirm before switching to found.group(1).
                    PubDBURLs.append(found.group().split('=')[1])
        finally:
            # the connection was previously left open
            f.close()

        ### return the list of PubDBURL where the collection is present
        return  self.uniquelist(PubDBURLs)
  
################################################################
    def findPubDBs(self,CollIDs):
        """
        Return the PubDB URLs that publish every collection in CollIDs

        The URLs reported for each collection are gathered in one list; a
        URL is retained when it shows up as many times as there are
        collections.  The retained URLs are finally filtered through the
        CE black list.
        """
        gathered = []
        wanted = 0
        for collection in CollIDs:
            wanted += 1
            # one lookup per collection, all the answers piled together
            gathered += self.findPubDBsbyCollID(collection)

        # a site holding all the collections appears once per collection
        complete = [site for site in self.uniquelist(gathered)
                    if gathered.count(site) == wanted]

        # drop the sites excluded by the user through the CE black list
        return self.checkBlackList(complete)

#######################################################################
    def uniquelist(self, old):
        """
        Return the distinct elements of old

        The elements are used as dictionary keys, so they must be hashable;
        the result comes in dictionary-key order.
        """
        return dict.fromkeys(old, 0).keys()
 
#######################################################################
    def checkBlackList(self, pubDBUrls):
        """
        select PubDB URLs that are at site not excluded by the user (via CE black list) 
        """
        goodurls = []
        for url in pubDBUrls:
            print 'connecting to the URL ',url
            good=1
            for re in self.reCEBlackList:
                if re.search(url):
                    common.logger.message('CE in black list, skipping PubDB URL '+url)
                    good=0
                pass
            if good: goodurls.append(url)
        if len(goodurls) == 0:
            common.logger.debug(3,"No selected PubDB URLs")
        return goodurls
  
########################################################################
    def getPubDBData(self, CollIDs, url):
        """
         Contact a PubDB to collect all the relevant information
        """
        result = []
        for CollID in CollIDs:
            end=string.rfind(url,'/')
            lastEq=string.rfind(url,'=')
            urlphp=url[:end+1]+self.PubDBAnalysisPhp_+'?CollID='+CollID
            # print 'PHP URL: '+urlphp+' \n'
           
            reOld=re.compile( r'V24' )
            #print urlphp,'Old PubDB ',reOld.search(urlphp)
            if reOld.search(urlphp):
                raise NoPHPError(urlphp)
            else:
                try:
                    f = urllib2.urlopen(urlphp) 
                except urllib2.URLError, msg:
                    print "WARNING: ", msg 
                    raise PubDBError(urlphp)
                except urllib2.HTTPError, msg:
                    print "WARNING: ", msg
                    raise NoPHPError(urlphp)
                content = f.read()
                result.append(pubDBResult(content))
                #print "Coll",CollID," content ",content
                pass
            pass
        
        #print '.....'
        #for r in result:
        #     r.dump()
        #print '.....'
        return result
  
########################################################################
    def getAllPubDBData(self, CollIDs, urllist):
        """
        Query every PubDB of urllist for the collections in CollIDs

        Returns one entry per URL, in the order of urllist; each entry is
        what getPubDBData returned for that URL.
        """
        return [self.getPubDBData(CollIDs, site) for site in urllist]
####################################################################
Revision:	1.4
Committed:	Wed Aug 10 16:52:51 2005 UTC (19 years, 8 months ago) by slacapra
Content type:	text/x-python
Branch:	MAIN
Changes since 1.3:	+8 -10 lines
Log Message:	more check on DB plus other
#	Content
1	#!/usr/bin/env python2
2	import sys, os, string, re
3	import urllib, urllister
4	import urllib2
5	import common
6	from RefDBInfo import RefDBInfo
7
8	# ####################################
9	class PubDBError:
10	def __init__(self, url):
11	print '\nERROR accessing PubDB at '+url+'\n'
12	pass
13
14	# ####################################
15	class RefDBmapError:
16	def __init__(self, url):
17	print '\nERROR accessing RefDB-PubDBs map at '+url+'\n'
18	pass
19
20	# ####################################
21	class NoPHPError:
22	def __init__(self, url):
23	#print '\nERROR accessing PHP at '+url+' \n'
24	print 'ERROR accessing PHP: ',url,' \n'
25	pass
26
27	# ####################################
28	class pubDBResult:
29	def __init__(self,
30	contents):
31	self.contents=contents
32
33
34	def dump(self):
35	print 'Contents : ',self.contents
36	pass
37
38	# ####################################
39	# class to access PubDBs
40	class PubDB:
41	def __init__(self, owner, dataset, dataTiers, cfg_params):
42
43	# Attributes
44	self.owner = owner
45	self.dataset = dataset
46	self.dataTiers = dataTiers
47	self.cfg_params = cfg_params
48
49	self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
50	self.RefDBphp_ = 'PubDB/GetIdCollection.php'
51	self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'
52
53	self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
54	self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'
55
56	self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
57	self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'
58
59	## link to the modified RefDB-PubDBs map script that allow the display option
60	self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'
61
62	# Costructor procedures
63
64	CEBlackList = []
65	try:
66	tmpBad = string.split(self.cfg_params['USER.ce_black_list'],',')
67	#tmpBad = ['fnal']
68	for tmp in tmpBad:
69	tmp=string.strip(tmp)
70	CEBlackList.append(tmp)
71	except KeyError:
72	pass
73	print 'CEBlackList: ',CEBlackList
74	self.reCEBlackList=[]
75	for bad in CEBlackList:
76	self.reCEBlackList.append(re.compile( bad ))
77	#print 'ReBad: ',self.reCEBlackList
78
79
80	########################################################################
81	def findAllCollections(self):
82	"""
83	Contact RefDB and find the CollID of all the user required collections
84	"""
85	## download from RefDB all the info about the given dataset-owner
86	refdb=RefDBInfo(self.owner,self.dataset)
87	#print refdb.GetRefDBInfo()
88	try:
89	collInfos=refdb.GetRefDBInfo()
90	except :
91	sys.exit(10)
92	#print "collInfos=", collInfos
93
94	first=1
95	NeededCollID=[]
96	refdbdataTiers=[]
97	for coll in collInfos:
98	## select the primary collection
99	if first:
100	NeededCollID.append(coll[0])
101	refdbdataTiers.append(coll[2])
102	common.logger.message("\n --> primary collection for owner "+self.owner+" is: ID="+coll[0]+" DataTier="+coll[2])
103	first=0
104	else:
105	## select only the parents collections corresponding to data-tiers requested by the user
106	if self.dataTiers.count(coll[2]):
107	NeededCollID.append(coll[0])
108	common.logger.message(" --> further collection required: ID="+coll[0]+" DataTier="+coll[2])
109	refdbdataTiers.append(coll[2])
110
111	## check that the user asks for Data Tier really existing in RefDB, otherwise give a warning message
112	for dt in self.dataTiers:
113	if refdbdataTiers.count(dt)<=0:
114	msg = "ERROR: Data Tier ( =>",dt,"<= ) not existing for dataset/owner "+ self.dataset+"/"+self.owner+"!"
115	msg = msg + "Check the data_tier variable in crab.cfg"
116	msg = msg + 'Owner Dataset not published with asked dataTiers! '+\
117	self.owner+' '+ self.dataset+' '+self.dataTiers
118	common.logger.message(msg)
119	return []
120
121	#print 'Needed Collections are ', NeededCollID
122	#return collInfos
123	#print "NeededCollID= ", NeededCollID
124	return NeededCollID
125
126	########################################################################
127	def findPubDBsbyCollID(self,CollID):
128	"""
129	Find the list of PubDB URLs having a given Collection
130	"""
131	### contact the RefDB-PubDBs map to discovery where the given CollID is
132	url = self.PubDBCentralUrl_+self.RefDBPubDBsmapPhp_+'&CollID=' + CollID
133	# print "%s"%(url)
134	try:
135	f = urllib.urlopen(url)
136	except IOError:
137	# print 'Cannot access URL: '+url
138	raise RefDBmapError(url)
139
140	### search for the PubDBURL string
141	reURLLine=re.compile( r'PubDBURL=(\S*)' )
142
143	PubDBURLs = []
144	for line in f.readlines():
145	#print '#',line,'#'
146	if reURLLine.search(line) :
147	URLLine=reURLLine.search(line).group()
148	#print string.split(URLLine,'=')[1]
149	PubDBURLs.append(string.split(URLLine,'=')[1])
150
151	### return the list of PubDBURL where the collection is present
152	#return PubDBURLs
153	return self.uniquelist(PubDBURLs)
154
155	################################################################
156	def findPubDBs(self,CollIDs):
157	"""
158	Find the list of PubDB URLs having ALL the required collections
159	"""
160	### loop over all the required collections
161	#pubdbmap={}
162	allurls=[]
163	countColl=0
164	for CollID in CollIDs :
165	countColl=countColl+1
166	### map the CollectionID with the list of PubDB URLs
167	#pubdbmap[CollID]=self.findPubDBsbyCollID(CollID)
168	### prepare a list all PubDB urls for all collections
169	allurls.extend(self.findPubDBsbyCollID(CollID))
170	#print pubdbmap.values()
171
172	### select only PubDB urls that contains all the collections
173	unique_urls=self.uniquelist(allurls)
174	SelectedPubDBURLs=[]
175	# loop on a unique list of PubDB urls
176	for url in unique_urls :
177	# check that PubDBurl occurrance is the same as the number of collections
178	if ( allurls.count(url)==countColl ) :
179	SelectedPubDBURLs.append(url)
180
181	#print 'Required Collections',CollIDs,'are all present in PubDBURLs : ',SelectedPubDBURLs,'\n'
182	#return SelectedPubDBURLs
183	#### check based on CE black list: select only PubDB not in the CE black list
184	GoodPubDBURLs=self.checkBlackList(SelectedPubDBURLs)
185	return GoodPubDBURLs
186
187	#######################################################################
188	def uniquelist(self, old):
189	"""
190	remove duplicates from a list
191	"""
192	nd={}
193	for e in old:
194	nd[e]=0
195	return nd.keys()
196
197	#######################################################################
198	def checkBlackList(self, pubDBUrls):
199	"""
200	select PubDB URLs that are at site not excluded by the user (via CE black list)
201	"""
202	goodurls = []
203	for url in pubDBUrls:
204	print 'connecting to the URL ',url
205	good=1
206	for re in self.reCEBlackList:
207	if re.search(url):
208	common.logger.message('CE in black list, skipping PubDB URL '+url)
209	good=0
210	pass
211	if good: goodurls.append(url)
212	if len(goodurls) == 0:
213	common.logger.debug(3,"No selected PubDB URLs")
214	return goodurls
215
216	########################################################################
217	def getPubDBData(self, CollIDs, url):
218	"""
219	Contact a PubDB to collect all the relevant information
220	"""
221	result = []
222	for CollID in CollIDs:
223	end=string.rfind(url,'/')
224	lastEq=string.rfind(url,'=')
225	urlphp=url[:end+1]+self.PubDBAnalysisPhp_+'?CollID='+CollID
226	# print 'PHP URL: '+urlphp+' \n'
227
228	reOld=re.compile( r'V24' )
229	#print urlphp,'Old PubDB ',reOld.search(urlphp)
230	if reOld.search(urlphp):
231	raise NoPHPError(urlphp)
232	else:
233	try:
234	f = urllib2.urlopen(urlphp)
235	except urllib2.URLError, msg:
236	print "WARNING: ", msg
237	raise PubDBError(urlphp)
238	except urllib2.HTTPError, msg:
239	print "WARNING: ", msg
240	raise NoPHPError(urlphp)
241	content = f.read()
242	result.append(pubDBResult(content))
243	#print "Coll",CollID," content ",content
244	pass
245	pass
246
247	#print '.....'
248	#for r in result:
249	# r.dump()
250	#print '.....'
251	return result
252
253	########################################################################
254	def getAllPubDBData(self, CollIDs, urllist):
255	"""
256	Contact a list of PubDB to collect all the relevant information
257	"""
258	completeResult=[]
259	for pubdburl in urllist:
260	completeResult.append(self.getPubDBData(CollIDs,pubdburl))
261
262	## print for debugging purpose
263	#for result in completeResult:
264	# print '..... PubDB Site URL :',pubdburl
265	# for r in result:
266	# r.dump()
267	# print '.....................................'
268
269	return completeResult
270	####################################################################