ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/CRAB/python/PubDB.py
Revision: 1.11
Committed: Wed Nov 16 13:28:54 2005 UTC (19 years, 5 months ago) by afanfani
Content type: text/x-python
Branch: MAIN
CVS Tags: CRAB_1_0_7, CRAB_1_0_7_pre1, CRAB_1_0_6, CRAB_1_0_5, CRAB_1_0_4, CRAB_1_0_3
Changes since 1.10: +12 -7 lines
Log Message:
slight improvement in the error messages
fix to really skip CNAF in CEblacklist
fix to avoid counting the PU CollID multiple times

File Contents

# User Rev Content
1 slacapra 1.3 #!/usr/bin/env python2
2 nsmirnov 1.1 import sys, os, string, re
3     import urllib, urllister
4     import urllib2
5 nsmirnov 1.2 import common
6 slacapra 1.3 from RefDBInfo import RefDBInfo
7 slacapra 1.6 from PubDBInfo import *
8 nsmirnov 1.1
9     # ####################################
class PubDBError(Exception):
    """Raised when a PubDB instance cannot be contacted at the given URL."""
    def __init__(self, url):
        # Derive from Exception so 'raise PubDBError(url)' is a proper
        # exception (the original old-style class relied on deprecated
        # classic-class raising); keep the legacy stdout echo.
        Exception.__init__(self, 'ERROR accessing PubDB at '+url)
        print('\nERROR accessing PubDB at '+url+'\n')
14 slacapra 1.5
15     # ####################################
class PubDBGetAnalysisError(Exception):
    """Raised when collection info cannot be extracted from a PubDB."""
    def __init__(self, url, Collections):
        # Derive from Exception (old-style class raises are deprecated);
        # keep the legacy stdout echo.
        Exception.__init__(self, 'ERROR extracting info for collections '+Collections+' from PubDB '+url)
        print('\nERROR extracting info for collections '+Collections+' from PubDB '+url+'.\n')
20    
21 slacapra 1.3 # ####################################
class RefDBmapError(Exception):
    """Raised when the central RefDB-to-PubDBs map script is unreachable."""
    def __init__(self, url):
        # Derive from Exception (old-style class raises are deprecated);
        # keep the legacy stdout echo.
        Exception.__init__(self, 'ERROR accessing RefDB-PubDBs map at '+url)
        print('\nERROR accessing RefDB-PubDBs map at '+url+'\n')
26 nsmirnov 1.1
27     # ####################################
class NoPHPError(Exception):
    """Raised when the analysis PHP script is not available (old PubDB)."""
    def __init__(self, url):
        # Derive from Exception (old-style class raises are deprecated).
        # The message is built as one string: the original comma-style
        # 'print a, b, c' produced inconsistent spacing.
        Exception.__init__(self, 'ERROR accessing PHP: '+url)
        print('ERROR accessing PHP: '+url+' \n')
33 nsmirnov 1.1
34     # ####################################
class pubDBResult:
    """Container for the raw text returned by an old-style PubDB query
    (one instance per collection, filled in PubDB.getPubDBData)."""
    def __init__(self,
                 contents):
        # raw key-value text as downloaded from get-pubdb-analysisinfo.php
        self.contents=contents


    def dump(self):
        # Debugging aid: echo the raw PubDB reply to stdout.
        print 'Contents : ',self.contents
        pass
44 nsmirnov 1.1
45     # ####################################
46     # class to access PubDBs
47     class PubDB:
48 slacapra 1.3 def __init__(self, owner, dataset, dataTiers, cfg_params):
49    
50     # Attributes
51     self.owner = owner
52     self.dataset = dataset
53     self.dataTiers = dataTiers
54 slacapra 1.5 self.NeededdataTiers=[]
55 slacapra 1.3 self.cfg_params = cfg_params
56 nsmirnov 1.1
57 slacapra 1.3 self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
58     self.RefDBphp_ = 'PubDB/GetIdCollection.php'
59     self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'
60 nsmirnov 1.1
61 slacapra 1.3 self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
62     self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'
63 nsmirnov 1.1
64 slacapra 1.3 self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
65     self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'
66 nsmirnov 1.1
67 slacapra 1.3 ## link to the modified RefDB-PubDBs map script that allow the display option
68     self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'
69 nsmirnov 1.1
70 slacapra 1.3 # Costructor procedures
71 nsmirnov 1.1
72 slacapra 1.3 CEBlackList = []
73     try:
74 slacapra 1.5 tmpBad = string.split(self.cfg_params['EDG.ce_black_list'],',')
75 slacapra 1.3 #tmpBad = ['fnal']
76     for tmp in tmpBad:
77     tmp=string.strip(tmp)
78 afanfani 1.11 if (tmp == 'cnaf'): tmp = 'webserver' ########## warning: temp. patch
79 slacapra 1.3 CEBlackList.append(tmp)
80     except KeyError:
81     pass
82 slacapra 1.7
83     CEWhiteList = []
84     try:
85     tmpGood = string.split(self.cfg_params['EDG.ce_white_list'],',')
86     #tmpGood = ['cern']
87     for tmp in tmpGood:
88     tmp=string.strip(tmp)
89     #if (tmp == 'cnaf'): tmp = 'webserver' ########## warning: temp. patch
90     CEWhiteList.append(tmp)
91     except KeyError:
92     pass
93    
94     #print 'CEWhiteList: ',CEWhiteList
95     self.reCEWhiteList=[]
96     for Good in CEWhiteList:
97     self.reCEWhiteList.append(re.compile( Good ))
98     #print 'ReGood: ',self.reCEWhiteList
99    
100 slacapra 1.5 common.logger.debug(5,'CEBlackList: '+str(CEBlackList))
101 slacapra 1.7 common.logger.debug(5,'CEWhiteList: '+str(CEWhiteList))
102 slacapra 1.3 self.reCEBlackList=[]
103     for bad in CEBlackList:
104     self.reCEBlackList.append(re.compile( bad ))
105     #print 'ReBad: ',self.reCEBlackList
106 nsmirnov 1.1
107    
108     ########################################################################
109 slacapra 1.3 def findAllCollections(self):
110     """
111     Contact RefDB and find the CollID of all the user required collections
112     """
113     ## download from RefDB all the info about the given dataset-owner
114     refdb=RefDBInfo(self.owner,self.dataset)
115     #print refdb.GetRefDBInfo()
116     try:
117     collInfos=refdb.GetRefDBInfo()
118     except :
119     sys.exit(10)
120     #print "collInfos=", collInfos
121    
122     first=1
123     NeededCollID=[]
124     refdbdataTiers=[]
125     for coll in collInfos:
126     ## select the primary collection
127     if first:
128     NeededCollID.append(coll[0])
129 slacapra 1.5 self.NeededdataTiers.append(coll[2])
130 slacapra 1.3 refdbdataTiers.append(coll[2])
131     common.logger.message("\n --> primary collection for owner "+self.owner+" is: ID="+coll[0]+" DataTier="+coll[2])
132     first=0
133     else:
134     ## select only the parents collections corresponding to data-tiers requested by the user
135     if self.dataTiers.count(coll[2]):
136     NeededCollID.append(coll[0])
137 slacapra 1.5 self.NeededdataTiers.append(coll[2])
138 slacapra 1.3 common.logger.message(" --> further collection required: ID="+coll[0]+" DataTier="+coll[2])
139     refdbdataTiers.append(coll[2])
140    
141     ## check that the user asks for Data Tier really existing in RefDB, otherwise give a warning message
142     for dt in self.dataTiers:
143     if refdbdataTiers.count(dt)<=0:
144 slacapra 1.5 msg = "ERROR: Data Tier ( =>",dt,"<= ) not existing for dataset/owner "+self.dataset+"/"+self.owner+"! "
145     msg = str(msg) + 'Owner Dataset not published with asked dataTiers! '+\
146 slacapra 1.10 self.owner+' '+ self.dataset+' '+str(self.dataTiers)+'\n'
147 slacapra 1.5 msg = str(msg) + ' Check the data_tier variable in crab.cfg !\n'
148     common.logger.message(msg)
149 slacapra 1.3 return []
150    
151     #print 'Needed Collections are ', NeededCollID
152     #return collInfos
153     #print "NeededCollID= ", NeededCollID
154     return NeededCollID
155    
156 nsmirnov 1.1 ########################################################################
157 slacapra 1.3 def findPubDBsbyCollID(self,CollID):
158     """
159     Find the list of PubDB URLs having a given Collection
160     """
161     ### contact the RefDB-PubDBs map to discovery where the given CollID is
162     url = self.PubDBCentralUrl_+self.RefDBPubDBsmapPhp_+'&CollID=' + CollID
163     # print "%s"%(url)
164     try:
165     f = urllib.urlopen(url)
166     except IOError:
167     # print 'Cannot access URL: '+url
168     raise RefDBmapError(url)
169    
170     ### search for the PubDBURL string
171     reURLLine=re.compile( r'PubDBURL=(\S*)' )
172    
173     PubDBURLs = []
174     for line in f.readlines():
175     #print '#',line,'#'
176     if reURLLine.search(line) :
177     URLLine=reURLLine.search(line).group()
178     #print string.split(URLLine,'=')[1]
179     PubDBURLs.append(string.split(URLLine,'=')[1])
180 nsmirnov 1.1
181 slacapra 1.3 ### return the list of PubDBURL where the collection is present
182     #return PubDBURLs
183     return self.uniquelist(PubDBURLs)
184    
185     ################################################################
186     def findPubDBs(self,CollIDs):
187     """
188     Find the list of PubDB URLs having ALL the required collections
189     """
190     ### loop over all the required collections
191     #pubdbmap={}
192     allurls=[]
193     countColl=0
194     for CollID in CollIDs :
195     countColl=countColl+1
196     ### map the CollectionID with the list of PubDB URLs
197     #pubdbmap[CollID]=self.findPubDBsbyCollID(CollID)
198     ### prepare a list all PubDB urls for all collections
199     allurls.extend(self.findPubDBsbyCollID(CollID))
200     #print pubdbmap.values()
201    
202     ### select only PubDB urls that contains all the collections
203     unique_urls=self.uniquelist(allurls)
204     SelectedPubDBURLs=[]
205     # loop on a unique list of PubDB urls
206     for url in unique_urls :
207     # check that PubDBurl occurrance is the same as the number of collections
208     if ( allurls.count(url)==countColl ) :
209     SelectedPubDBURLs.append(url)
210 slacapra 1.7 common.logger.debug(5,'PubDBs '+str(SelectedPubDBURLs))
211 slacapra 1.3
212     #print 'Required Collections',CollIDs,'are all present in PubDBURLs : ',SelectedPubDBURLs,'\n'
213 slacapra 1.7 #### check based on CE black list: select only PubDB not in the CE black list
214     tmp=self.checkBlackList(SelectedPubDBURLs)
215     common.logger.debug(5,'PubDBs after black list '+str(tmp))
216    
217     ### check based on CE white list: select only PubDB defined by user
218     GoodPubDBURLs=self.checkWhiteList(tmp)
219 afanfani 1.11 if len(GoodPubDBURLs)>0 :
220     common.logger.debug(5,'PubDBs after white list '+str(GoodPubDBURLs))
221     common.logger.debug(3,'Selected sites via PubDB URLs are '+str(GoodPubDBURLs))
222 slacapra 1.3 return GoodPubDBURLs
223    
224     #######################################################################
225     def uniquelist(self, old):
226     """
227 slacapra 1.7 remove duplicates from a list
228 slacapra 1.3 """
229     nd={}
230     for e in old:
231     nd[e]=0
232     return nd.keys()
233 nsmirnov 1.1
234 slacapra 1.3 #######################################################################
235 slacapra 1.7 def checkWhiteList(self, pubDBUrls):
236     """
237     select PubDB URLs that are at site defined by the user (via CE white list)
238     """
239 slacapra 1.8 if len(self.reCEWhiteList)==0: return pubDBUrls
240 slacapra 1.7 goodurls = []
241     for url in pubDBUrls:
242     #print 'connecting to the URL ',url
243     good=0
244     for re in self.reCEWhiteList:
245     if re.search(url):
246     common.logger.debug(5,'CE in white list, adding PubDB URL '+url)
247     good=1
248     if not good: continue
249     goodurls.append(url)
250     if len(goodurls) == 0:
251 afanfani 1.11 common.logger.message("No sites found via PubDB \n")
252 slacapra 1.7 else:
253 afanfani 1.11 common.logger.debug(5,"Selected sites via PubDB URLs are "+str(goodurls)+"\n")
254 slacapra 1.7 return goodurls
255    
256     #######################################################################
257 slacapra 1.3 def checkBlackList(self, pubDBUrls):
258     """
259 slacapra 1.5 select PubDB URLs that are at site not exluded by the user (via CE black list)
260 slacapra 1.3 """
261 slacapra 1.8 if len(self.reCEBlackList)==0: return pubDBUrls
262 slacapra 1.3 goodurls = []
263     for url in pubDBUrls:
264 slacapra 1.5 common.logger.debug(10,'connecting to the URL '+url)
265 slacapra 1.3 good=1
266     for re in self.reCEBlackList:
267     if re.search(url):
268 slacapra 1.4 common.logger.message('CE in black list, skipping PubDB URL '+url)
269     good=0
270     pass
271     if good: goodurls.append(url)
272 slacapra 1.3 if len(goodurls) == 0:
273 afanfani 1.11 common.logger.debug(3,"No sites found via PubDB")
274 slacapra 1.3 return goodurls
275 slacapra 1.5
276     ########################################################################
277     def checkPubDBNewVersion(self, baseurl):
278     """
279     Check PubDB version to find out if it's new-style or old-style
280     """
281     ### check based on the existance of pubdb-get-version.php
282     urlversion=baseurl+'pubdb-get-version.php'
283     newversion=1;
284     try:
285     v = urllib2.urlopen(urlversion)
286     except urllib2.URLError, msg:
287     #print "WARNING: no URL to get PubDB version "
288     newversion=0;
289    
290     if (newversion) :
291     schemaversion = v.read()
292     #print schemaversion;
293    
294     return newversion
295    
296 slacapra 1.3 ########################################################################
297 slacapra 1.5 def getPubDBData(self, CollIDs, url , newversion):
298 slacapra 1.3 """
299     Contact a PubDB to collect all the relevant information
300     """
301     result = []
302 slacapra 1.5
303     ### get the base PubDb url
304     end=string.rfind(url,'/')
305     lastEq=string.rfind(url,'=')
306    
307     if (newversion) :
308     ### from PubDB V4 : get info for all the collections in one shot and unserialize the content
309     Collections=string.join(CollIDs,'-')
310     ## add the PU among the required Collections if the Digi are requested
311     # ( for the time being asking it directly to the PubDB so the RefDB
312     # level data discovery is bypassed..... in future when every site
313     # will have the new style it will be possible to ask for PU , at RefDB level, in method findAllCollections )
314     if ( self.NeededdataTiers.count('Digi') ):
315     PUCollID=self.getDatatierCollID(url[:end+1],Collections,"PU")
316 afanfani 1.11 if (PUCollID) :
317     if CollIDs.count(PUCollID)<=0:
318     CollIDs.append(PUCollID)
319 slacapra 1.5 ##
320     Collections=string.join(CollIDs,'-')
321     ### download from PubDB all the info about the given collections
322     pubdb_analysis=PubDBInfo(url[:end+1],Collections)
323     #print pubdb_analysis.GetPubDBInfo()
324     ok=0
325     try:
326     catInfos=pubdb_analysis.GetPubDBInfo()
327     ok=1
328     except :
329     #print "WARNING: can't get PubDB content out of "+url[:end+1]+"\n"
330 afanfani 1.11 print '\nERROR extracting info for collections '+Collections+' from PubDB '+url[:end+1]+'.'
331     print '>>>> Ask for help reporting that the failing PubDB script is: \n>>>> '+url[:end+1]+'pubdb-get-analysisinfo.php?collid='+Collections
332 slacapra 1.5 #raise PubDBGetAnalysisError(url[:end+1],Collections)
333     if (ok): result=catInfos;
334    
335     else:
336    
337     ### before PubDB V4 : get info for each collection and read the key-value pair text
338    
339     for CollID in CollIDs:
340 slacapra 1.3 urlphp=url[:end+1]+self.PubDBAnalysisPhp_+'?CollID='+CollID
341     # print 'PHP URL: '+urlphp+' \n'
342 slacapra 1.5
343 slacapra 1.3 reOld=re.compile( r'V24' )
344     #print urlphp,'Old PubDB ',reOld.search(urlphp)
345     if reOld.search(urlphp):
346     raise NoPHPError(urlphp)
347     else:
348     try:
349     f = urllib2.urlopen(urlphp)
350     except urllib2.URLError, msg:
351     print "WARNING: ", msg
352     raise PubDBError(urlphp)
353     except urllib2.HTTPError, msg:
354     print "WARNING: ", msg
355     raise NoPHPError(urlphp)
356     content = f.read()
357     result.append(pubDBResult(content))
358     #print "Coll",CollID," content ",content
359     pass
360     pass
361    
362     #print '.....'
363     #for r in result:
364     # r.dump()
365     #print '.....'
366     return result
367 slacapra 1.5
368     ########################################################################
369     def getDatatierCollID(self,urlbase,CollIDString,datatier):
370     """
371     Contact a script of PubDB to retrieve the collid a DataTier
372     """
373     try:
374     f = urllib.urlopen(urlbase+'pubdb-get-collidbydatatier.php?collid='+CollIDString+"&datatier="+datatier)
375     except IOError:
376     raise PubDBGetAnalysisError(url[:end+1]+'pubdb-get-collidbydatatier.php',CollIDString)
377     data = f.read()
378     colldata=re.compile(r'collid=(\S*)').search(data);
379     if colldata:
380     datatier_CollID=colldata.group(1)
381     # print " --> asking to PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier
382     common.logger.message(" --> asking to PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier)
383    
384     return datatier_CollID
385    
386 slacapra 1.3 ########################################################################
387 slacapra 1.5 def getAllPubDBData(self):
388 slacapra 1.3 """
389     Contact a list of PubDB to collect all the relevant information
390     """
391 slacapra 1.5 newPubDBResult=[]
392     oldPubDBResult=[]
393     Result={}
394    
395     ### find the user-required collection IDs
396     CollIDs = self.findAllCollections()
397     ### find the PubDB URLs publishing the needed data
398     urllist = self.findPubDBs(CollIDs)
399     ### collect information sparatelly from new-style PubDBs and old-style PubDBs
400 slacapra 1.3 for pubdburl in urllist:
401 slacapra 1.5 end=string.rfind(pubdburl,'/')
402     newversion=self.checkPubDBNewVersion(pubdburl[:end+1])
403     if (newversion):
404     res=self.getPubDBData(CollIDs,pubdburl,newversion)
405     if len(res)>0:
406     newPubDBResult.append(res)
407     else:
408     resold=self.getPubDBData(CollIDs,pubdburl,newversion)
409     if len(resold)>0:
410     oldPubDBResult.append(resold)
411     ### fill a dictionary with all the PubBDs results both old-style and new-style
412     Result['newPubDB']=newPubDBResult
413     Result['oldPubDB']=oldPubDBResult
414    
415 slacapra 1.3 ## print for debugging purpose
416 slacapra 1.5 #
417     #for PubDBversion in Result.keys():
418     #print ("key %s, val %s" %(PubDBversion,Result[PubDBversion]))
419     # if len(Result[PubDBversion])>0 :
420     #print (" key %s"%(PubDBversion))
421     # for result in Result[PubDBversion]:
422     # for r in result:
423     #r.dump()
424     # common.log.write('----------------- \n')
425     #print '.....................................'
426    
427     return Result
428    
429 slacapra 1.3 ####################################################################