ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/CRAB/python/PubDB.py
Revision: 1.12
Committed: Thu Apr 6 16:45:17 2006 UTC (19 years ago) by fanzago
Content type: text/x-python
Branch: MAIN
CVS Tags: CRAB_1_1_0, CRAB_1_1_0_pre4, CRAB_1_1_0_pre3, CRAB_1_1_0_pre1
Changes since 1.11: +0 -2 lines
Log Message:
removed some comments

File Contents

# User Rev Content
1 slacapra 1.3 #!/usr/bin/env python2
2 nsmirnov 1.1 import sys, os, string, re
3     import urllib, urllister
4     import urllib2
5 nsmirnov 1.2 import common
6 slacapra 1.3 from RefDBInfo import RefDBInfo
7 slacapra 1.6 from PubDBInfo import *
8 nsmirnov 1.1
9     # ####################################
# ####################################
class PubDBError(Exception):
    """Raised when a PubDB server cannot be accessed."""
    def __init__(self, url):
        # Derive from Exception so 'raise PubDBError(url)' keeps working on
        # interpreters where raising classic-class instances is forbidden.
        Exception.__init__(self, 'ERROR accessing PubDB at ' + url)
        # Preserve the historical behaviour of echoing the error on stdout.
        print('\nERROR accessing PubDB at '+url+'\n')
14 slacapra 1.5
15     # ####################################
# ####################################
class PubDBGetAnalysisError(Exception):
    """Raised when analysis info for a set of collections cannot be extracted from a PubDB."""
    def __init__(self, url, Collections):
        # Proper Exception subclass: these objects are raised by callers.
        Exception.__init__(self, 'ERROR extracting info for collections ' + Collections + ' from PubDB ' + url)
        # Preserve the historical behaviour of echoing the error on stdout.
        print('\nERROR extracting info for collections '+Collections+' from PubDB '+url+'.\n')
20    
21 slacapra 1.3 # ####################################
# ####################################
class RefDBmapError(Exception):
    """Raised when the RefDB-PubDBs map script cannot be accessed."""
    def __init__(self, url):
        # Proper Exception subclass: raised by findPubDBsbyCollID on IOError.
        Exception.__init__(self, 'ERROR accessing RefDB-PubDBs map at ' + url)
        # Preserve the historical behaviour of echoing the error on stdout.
        print('\nERROR accessing RefDB-PubDBs map at '+url+'\n')
26 nsmirnov 1.1
27     # ####################################
# ####################################
class NoPHPError(Exception):
    """Raised when the analysis-info PHP interface of a PubDB is unavailable."""
    def __init__(self, url):
        # Proper Exception subclass: raised for V24-style PubDBs and HTTP errors.
        Exception.__init__(self, 'ERROR accessing PHP: ' + url)
        # Single concatenated string instead of the Python-2-only
        # print-with-commas form used previously.
        print('ERROR accessing PHP: '+url+' \n')
33 nsmirnov 1.1
34     # ####################################
# ####################################
class pubDBResult:
    """Container for the raw text returned by an old-style PubDB query."""

    def __init__(self, contents):
        # contents: raw key-value text downloaded from the PubDB php script.
        self.contents = contents

    def dump(self):
        """Print the stored contents (debugging aid)."""
        # Single-argument print() works under both Python 2 and Python 3,
        # unlike the previous 'print a, b' statement form.
        print('Contents : '+str(self.contents))
44 nsmirnov 1.1
45     # ####################################
46     # class to access PubDBs
47     class PubDB:
48 slacapra 1.3 def __init__(self, owner, dataset, dataTiers, cfg_params):
49    
50     # Attributes
51     self.owner = owner
52     self.dataset = dataset
53     self.dataTiers = dataTiers
54 slacapra 1.5 self.NeededdataTiers=[]
55 slacapra 1.3 self.cfg_params = cfg_params
56 nsmirnov 1.1
57 slacapra 1.3 self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
58     self.RefDBphp_ = 'PubDB/GetIdCollection.php'
59     self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'
60 nsmirnov 1.1
61 slacapra 1.3 self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
62     self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'
63 nsmirnov 1.1
64 slacapra 1.3 self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
65     self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'
66 nsmirnov 1.1
67 slacapra 1.3 ## link to the modified RefDB-PubDBs map script that allow the display option
68     self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'
69 nsmirnov 1.1
70 slacapra 1.3 # Costructor procedures
71 nsmirnov 1.1
72 slacapra 1.3 CEBlackList = []
73     try:
74 slacapra 1.5 tmpBad = string.split(self.cfg_params['EDG.ce_black_list'],',')
75 slacapra 1.3 #tmpBad = ['fnal']
76     for tmp in tmpBad:
77     tmp=string.strip(tmp)
78 afanfani 1.11 if (tmp == 'cnaf'): tmp = 'webserver' ########## warning: temp. patch
79 slacapra 1.3 CEBlackList.append(tmp)
80     except KeyError:
81     pass
82 slacapra 1.7
83     CEWhiteList = []
84     try:
85     tmpGood = string.split(self.cfg_params['EDG.ce_white_list'],',')
86     for tmp in tmpGood:
87     tmp=string.strip(tmp)
88     CEWhiteList.append(tmp)
89     except KeyError:
90     pass
91    
92     #print 'CEWhiteList: ',CEWhiteList
93     self.reCEWhiteList=[]
94     for Good in CEWhiteList:
95     self.reCEWhiteList.append(re.compile( Good ))
96     #print 'ReGood: ',self.reCEWhiteList
97    
98 slacapra 1.5 common.logger.debug(5,'CEBlackList: '+str(CEBlackList))
99 slacapra 1.7 common.logger.debug(5,'CEWhiteList: '+str(CEWhiteList))
100 slacapra 1.3 self.reCEBlackList=[]
101     for bad in CEBlackList:
102     self.reCEBlackList.append(re.compile( bad ))
103     #print 'ReBad: ',self.reCEBlackList
104 nsmirnov 1.1
105    
106     ########################################################################
107 slacapra 1.3 def findAllCollections(self):
108     """
109     Contact RefDB and find the CollID of all the user required collections
110     """
111     ## download from RefDB all the info about the given dataset-owner
112     refdb=RefDBInfo(self.owner,self.dataset)
113     #print refdb.GetRefDBInfo()
114     try:
115     collInfos=refdb.GetRefDBInfo()
116     except :
117     sys.exit(10)
118     #print "collInfos=", collInfos
119    
120     first=1
121     NeededCollID=[]
122     refdbdataTiers=[]
123     for coll in collInfos:
124     ## select the primary collection
125     if first:
126     NeededCollID.append(coll[0])
127 slacapra 1.5 self.NeededdataTiers.append(coll[2])
128 slacapra 1.3 refdbdataTiers.append(coll[2])
129     common.logger.message("\n --> primary collection for owner "+self.owner+" is: ID="+coll[0]+" DataTier="+coll[2])
130     first=0
131     else:
132     ## select only the parents collections corresponding to data-tiers requested by the user
133     if self.dataTiers.count(coll[2]):
134     NeededCollID.append(coll[0])
135 slacapra 1.5 self.NeededdataTiers.append(coll[2])
136 slacapra 1.3 common.logger.message(" --> further collection required: ID="+coll[0]+" DataTier="+coll[2])
137     refdbdataTiers.append(coll[2])
138    
139     ## check that the user asks for Data Tier really existing in RefDB, otherwise give a warning message
140     for dt in self.dataTiers:
141     if refdbdataTiers.count(dt)<=0:
142 slacapra 1.5 msg = "ERROR: Data Tier ( =>",dt,"<= ) not existing for dataset/owner "+self.dataset+"/"+self.owner+"! "
143     msg = str(msg) + 'Owner Dataset not published with asked dataTiers! '+\
144 slacapra 1.10 self.owner+' '+ self.dataset+' '+str(self.dataTiers)+'\n'
145 slacapra 1.5 msg = str(msg) + ' Check the data_tier variable in crab.cfg !\n'
146     common.logger.message(msg)
147 slacapra 1.3 return []
148    
149     #print 'Needed Collections are ', NeededCollID
150     #return collInfos
151     #print "NeededCollID= ", NeededCollID
152     return NeededCollID
153    
154 nsmirnov 1.1 ########################################################################
155 slacapra 1.3 def findPubDBsbyCollID(self,CollID):
156     """
157     Find the list of PubDB URLs having a given Collection
158     """
159     ### contact the RefDB-PubDBs map to discovery where the given CollID is
160     url = self.PubDBCentralUrl_+self.RefDBPubDBsmapPhp_+'&CollID=' + CollID
161     # print "%s"%(url)
162     try:
163     f = urllib.urlopen(url)
164     except IOError:
165     # print 'Cannot access URL: '+url
166     raise RefDBmapError(url)
167    
168     ### search for the PubDBURL string
169     reURLLine=re.compile( r'PubDBURL=(\S*)' )
170    
171     PubDBURLs = []
172     for line in f.readlines():
173     #print '#',line,'#'
174     if reURLLine.search(line) :
175     URLLine=reURLLine.search(line).group()
176     #print string.split(URLLine,'=')[1]
177     PubDBURLs.append(string.split(URLLine,'=')[1])
178 nsmirnov 1.1
179 slacapra 1.3 ### return the list of PubDBURL where the collection is present
180     #return PubDBURLs
181     return self.uniquelist(PubDBURLs)
182    
183     ################################################################
184     def findPubDBs(self,CollIDs):
185     """
186     Find the list of PubDB URLs having ALL the required collections
187     """
188     ### loop over all the required collections
189     #pubdbmap={}
190     allurls=[]
191     countColl=0
192     for CollID in CollIDs :
193     countColl=countColl+1
194     ### map the CollectionID with the list of PubDB URLs
195     #pubdbmap[CollID]=self.findPubDBsbyCollID(CollID)
196     ### prepare a list all PubDB urls for all collections
197     allurls.extend(self.findPubDBsbyCollID(CollID))
198     #print pubdbmap.values()
199    
200     ### select only PubDB urls that contains all the collections
201     unique_urls=self.uniquelist(allurls)
202     SelectedPubDBURLs=[]
203     # loop on a unique list of PubDB urls
204     for url in unique_urls :
205     # check that PubDBurl occurrance is the same as the number of collections
206     if ( allurls.count(url)==countColl ) :
207     SelectedPubDBURLs.append(url)
208 slacapra 1.7 common.logger.debug(5,'PubDBs '+str(SelectedPubDBURLs))
209 slacapra 1.3
210     #print 'Required Collections',CollIDs,'are all present in PubDBURLs : ',SelectedPubDBURLs,'\n'
211 slacapra 1.7 #### check based on CE black list: select only PubDB not in the CE black list
212     tmp=self.checkBlackList(SelectedPubDBURLs)
213     common.logger.debug(5,'PubDBs after black list '+str(tmp))
214    
215     ### check based on CE white list: select only PubDB defined by user
216     GoodPubDBURLs=self.checkWhiteList(tmp)
217 afanfani 1.11 if len(GoodPubDBURLs)>0 :
218     common.logger.debug(5,'PubDBs after white list '+str(GoodPubDBURLs))
219     common.logger.debug(3,'Selected sites via PubDB URLs are '+str(GoodPubDBURLs))
220 slacapra 1.3 return GoodPubDBURLs
221    
222     #######################################################################
223     def uniquelist(self, old):
224     """
225 slacapra 1.7 remove duplicates from a list
226 slacapra 1.3 """
227     nd={}
228     for e in old:
229     nd[e]=0
230     return nd.keys()
231 nsmirnov 1.1
232 slacapra 1.3 #######################################################################
233 slacapra 1.7 def checkWhiteList(self, pubDBUrls):
234     """
235     select PubDB URLs that are at site defined by the user (via CE white list)
236     """
237 slacapra 1.8 if len(self.reCEWhiteList)==0: return pubDBUrls
238 slacapra 1.7 goodurls = []
239     for url in pubDBUrls:
240     #print 'connecting to the URL ',url
241     good=0
242     for re in self.reCEWhiteList:
243     if re.search(url):
244     common.logger.debug(5,'CE in white list, adding PubDB URL '+url)
245     good=1
246     if not good: continue
247     goodurls.append(url)
248     if len(goodurls) == 0:
249 afanfani 1.11 common.logger.message("No sites found via PubDB \n")
250 slacapra 1.7 else:
251 afanfani 1.11 common.logger.debug(5,"Selected sites via PubDB URLs are "+str(goodurls)+"\n")
252 slacapra 1.7 return goodurls
253    
254     #######################################################################
255 slacapra 1.3 def checkBlackList(self, pubDBUrls):
256     """
257 slacapra 1.5 select PubDB URLs that are at site not exluded by the user (via CE black list)
258 slacapra 1.3 """
259 slacapra 1.8 if len(self.reCEBlackList)==0: return pubDBUrls
260 slacapra 1.3 goodurls = []
261     for url in pubDBUrls:
262 slacapra 1.5 common.logger.debug(10,'connecting to the URL '+url)
263 slacapra 1.3 good=1
264     for re in self.reCEBlackList:
265     if re.search(url):
266 slacapra 1.4 common.logger.message('CE in black list, skipping PubDB URL '+url)
267     good=0
268     pass
269     if good: goodurls.append(url)
270 slacapra 1.3 if len(goodurls) == 0:
271 afanfani 1.11 common.logger.debug(3,"No sites found via PubDB")
272 slacapra 1.3 return goodurls
273 slacapra 1.5
274     ########################################################################
275     def checkPubDBNewVersion(self, baseurl):
276     """
277     Check PubDB version to find out if it's new-style or old-style
278     """
279     ### check based on the existance of pubdb-get-version.php
280     urlversion=baseurl+'pubdb-get-version.php'
281     newversion=1;
282     try:
283     v = urllib2.urlopen(urlversion)
284     except urllib2.URLError, msg:
285     #print "WARNING: no URL to get PubDB version "
286     newversion=0;
287    
288     if (newversion) :
289     schemaversion = v.read()
290     #print schemaversion;
291    
292     return newversion
293    
294 slacapra 1.3 ########################################################################
295 slacapra 1.5 def getPubDBData(self, CollIDs, url , newversion):
296 slacapra 1.3 """
297     Contact a PubDB to collect all the relevant information
298     """
299     result = []
300 slacapra 1.5
301     ### get the base PubDb url
302     end=string.rfind(url,'/')
303     lastEq=string.rfind(url,'=')
304    
305     if (newversion) :
306     ### from PubDB V4 : get info for all the collections in one shot and unserialize the content
307     Collections=string.join(CollIDs,'-')
308     ## add the PU among the required Collections if the Digi are requested
309     # ( for the time being asking it directly to the PubDB so the RefDB
310     # level data discovery is bypassed..... in future when every site
311     # will have the new style it will be possible to ask for PU , at RefDB level, in method findAllCollections )
312     if ( self.NeededdataTiers.count('Digi') ):
313     PUCollID=self.getDatatierCollID(url[:end+1],Collections,"PU")
314 afanfani 1.11 if (PUCollID) :
315     if CollIDs.count(PUCollID)<=0:
316     CollIDs.append(PUCollID)
317 slacapra 1.5 ##
318     Collections=string.join(CollIDs,'-')
319     ### download from PubDB all the info about the given collections
320     pubdb_analysis=PubDBInfo(url[:end+1],Collections)
321     #print pubdb_analysis.GetPubDBInfo()
322     ok=0
323     try:
324     catInfos=pubdb_analysis.GetPubDBInfo()
325     ok=1
326     except :
327     #print "WARNING: can't get PubDB content out of "+url[:end+1]+"\n"
328 afanfani 1.11 print '\nERROR extracting info for collections '+Collections+' from PubDB '+url[:end+1]+'.'
329     print '>>>> Ask for help reporting that the failing PubDB script is: \n>>>> '+url[:end+1]+'pubdb-get-analysisinfo.php?collid='+Collections
330 slacapra 1.5 #raise PubDBGetAnalysisError(url[:end+1],Collections)
331     if (ok): result=catInfos;
332    
333     else:
334    
335     ### before PubDB V4 : get info for each collection and read the key-value pair text
336    
337     for CollID in CollIDs:
338 slacapra 1.3 urlphp=url[:end+1]+self.PubDBAnalysisPhp_+'?CollID='+CollID
339     # print 'PHP URL: '+urlphp+' \n'
340 slacapra 1.5
341 slacapra 1.3 reOld=re.compile( r'V24' )
342     #print urlphp,'Old PubDB ',reOld.search(urlphp)
343     if reOld.search(urlphp):
344     raise NoPHPError(urlphp)
345     else:
346     try:
347     f = urllib2.urlopen(urlphp)
348     except urllib2.URLError, msg:
349     print "WARNING: ", msg
350     raise PubDBError(urlphp)
351     except urllib2.HTTPError, msg:
352     print "WARNING: ", msg
353     raise NoPHPError(urlphp)
354     content = f.read()
355     result.append(pubDBResult(content))
356     #print "Coll",CollID," content ",content
357     pass
358     pass
359    
360     #print '.....'
361     #for r in result:
362     # r.dump()
363     #print '.....'
364     return result
365 slacapra 1.5
366     ########################################################################
367     def getDatatierCollID(self,urlbase,CollIDString,datatier):
368     """
369     Contact a script of PubDB to retrieve the collid a DataTier
370     """
371     try:
372     f = urllib.urlopen(urlbase+'pubdb-get-collidbydatatier.php?collid='+CollIDString+"&datatier="+datatier)
373     except IOError:
374     raise PubDBGetAnalysisError(url[:end+1]+'pubdb-get-collidbydatatier.php',CollIDString)
375     data = f.read()
376     colldata=re.compile(r'collid=(\S*)').search(data);
377     if colldata:
378     datatier_CollID=colldata.group(1)
379     # print " --> asking to PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier
380     common.logger.message(" --> asking to PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier)
381    
382     return datatier_CollID
383    
384 slacapra 1.3 ########################################################################
385 slacapra 1.5 def getAllPubDBData(self):
386 slacapra 1.3 """
387     Contact a list of PubDB to collect all the relevant information
388     """
389 slacapra 1.5 newPubDBResult=[]
390     oldPubDBResult=[]
391     Result={}
392    
393     ### find the user-required collection IDs
394     CollIDs = self.findAllCollections()
395     ### find the PubDB URLs publishing the needed data
396     urllist = self.findPubDBs(CollIDs)
397     ### collect information sparatelly from new-style PubDBs and old-style PubDBs
398 slacapra 1.3 for pubdburl in urllist:
399 slacapra 1.5 end=string.rfind(pubdburl,'/')
400     newversion=self.checkPubDBNewVersion(pubdburl[:end+1])
401     if (newversion):
402     res=self.getPubDBData(CollIDs,pubdburl,newversion)
403     if len(res)>0:
404     newPubDBResult.append(res)
405     else:
406     resold=self.getPubDBData(CollIDs,pubdburl,newversion)
407     if len(resold)>0:
408     oldPubDBResult.append(resold)
409     ### fill a dictionary with all the PubBDs results both old-style and new-style
410     Result['newPubDB']=newPubDBResult
411     Result['oldPubDB']=oldPubDBResult
412    
413 slacapra 1.3 ## print for debugging purpose
414 slacapra 1.5 #
415     #for PubDBversion in Result.keys():
416     #print ("key %s, val %s" %(PubDBversion,Result[PubDBversion]))
417     # if len(Result[PubDBversion])>0 :
418     #print (" key %s"%(PubDBversion))
419     # for result in Result[PubDBversion]:
420     # for r in result:
421     #r.dump()
422     # common.log.write('----------------- \n')
423     #print '.....................................'
424    
425     return Result
426    
427 slacapra 1.3 ####################################################################