ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/CRAB/python/PubDB.py
Revision: 1.6
Committed: Tue Aug 23 11:14:24 2005 UTC (19 years, 8 months ago) by slacapra
Content type: text/x-python
Branch: MAIN
CVS Tags: CRAB_1_0_0_beta4, CRAB_1_0_0_pre1_boss_2, CRAB_1_0_0_pre1_boss, CRAB_1_0_0_pre3, CRAB_1_0_0_pre2
Changes since 1.5: +1 -0 lines
Log Message:
add PubDBInfo

File Contents

# User Rev Content
1 slacapra 1.3 #!/usr/bin/env python2
2 nsmirnov 1.1 import sys, os, string, re
3     import urllib, urllister
4     import urllib2
5 nsmirnov 1.2 import common
6 slacapra 1.3 from RefDBInfo import RefDBInfo
7 slacapra 1.6 from PubDBInfo import *
8 nsmirnov 1.1
# ####################################
class PubDBError(Exception):
    """Raised when a PubDB instance cannot be accessed at the given URL.

    The URL is kept on the instance (``self.url``) and an error line is
    printed immediately, matching the original best-effort reporting style.
    """
    def __init__(self, url):
        # Derive from Exception so 'raise PubDBError(url)' works on any
        # Python version (the original was an old-style classic class).
        Exception.__init__(self, url)
        self.url = url
        print('\nERROR accessing PubDB at ' + url + '\n')
14 slacapra 1.5
# ####################################
class PubDBGetAnalysisError(Exception):
    """Raised when the analysis info for a set of collections cannot be
    extracted from a PubDB.

    Keeps the offending URL and collection string on the instance and
    prints an error line, matching the original reporting style.
    """
    def __init__(self, url, Collections):
        # Derive from Exception so raising this class works on any
        # Python version (the original was an old-style classic class).
        Exception.__init__(self, url, Collections)
        self.url = url
        self.Collections = Collections
        print('\nERROR extracting info for collections ' + Collections + ' from PubDB ' + url + '.\n')
20    
# ####################################
class RefDBmapError(Exception):
    """Raised when the central RefDB-PubDBs map script cannot be accessed.

    Keeps the URL on the instance and prints an error line, matching the
    original reporting style.
    """
    def __init__(self, url):
        # Derive from Exception so 'raise RefDBmapError(url)' works on any
        # Python version (the original was an old-style classic class).
        Exception.__init__(self, url)
        self.url = url
        print('\nERROR accessing RefDB-PubDBs map at ' + url + '\n')
26 nsmirnov 1.1
# ####################################
class NoPHPError(Exception):
    """Raised when the PubDB analysis PHP script is missing or unreachable
    (e.g. old V24-style PubDBs that do not provide it).

    Keeps the URL on the instance and prints an error line, matching the
    original reporting style.
    """
    def __init__(self, url):
        # Derive from Exception so 'raise NoPHPError(url)' works on any
        # Python version (the original was an old-style classic class).
        Exception.__init__(self, url)
        self.url = url
        print('ERROR accessing PHP: ', url, ' \n')
33 nsmirnov 1.1
# ####################################
class pubDBResult:
    """Simple holder for the raw key-value text returned by an old-style
    PubDB analysis-info query (one instance per collection)."""

    def __init__(self, contents):
        # contents: the raw response body read from the PubDB PHP script
        self.contents = contents

    def dump(self):
        """Print the stored contents (debugging aid)."""
        print('Contents : ', self.contents)
44 nsmirnov 1.1
# ####################################
# class to access PubDBs
class PubDB:
    """Data-discovery helper for CRAB.

    Given a dataset owner, dataset name and the user-requested data tiers,
    it asks RefDB which collection IDs are needed, finds the PubDB sites
    that publish ALL of them (honouring the user's CE black list), and
    finally downloads the analysis information from each PubDB, handling
    both new-style (>= V4) and old-style instances.
    """

    def __init__(self, owner, dataset, dataTiers, cfg_params):
        """Store the request and compile the CE black-list patterns.

        owner/dataset/dataTiers: the user data request.
        cfg_params: CRAB configuration dictionary; the optional
        'EDG.ce_black_list' entry is a comma-separated list of CE name
        fragments to exclude.
        """
        # Attributes
        self.owner = owner
        self.dataset = dataset
        self.dataTiers = dataTiers
        self.NeededdataTiers = []   # data tiers actually selected in findAllCollections()
        self.cfg_params = cfg_params

        # RefDB and central-PubDB service locations
        self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
        self.RefDBphp_ = 'PubDB/GetIdCollection.php'
        self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'

        self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
        self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'

        self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
        self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'

        ## link to the modified RefDB-PubDBs map script that allows the display option
        self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'

        # Constructor procedures: parse the comma-separated CE black list
        CEBlackList = []
        try:
            tmpBad = string.split(self.cfg_params['EDG.ce_black_list'], ',')
            for tmp in tmpBad:
                tmp = string.strip(tmp)
                if tmp == 'cnaf': tmp = 'webserver'  ########## warning: temp. patch
                CEBlackList.append(tmp)
        except KeyError:
            # no black list configured: nothing to exclude
            pass
        common.logger.debug(5, 'CEBlackList: ' + str(CEBlackList))
        self.reCEBlackList = []
        for bad in CEBlackList:
            self.reCEBlackList.append(re.compile(bad))

    ########################################################################
    def findAllCollections(self):
        """
        Contact RefDB and find the CollID of all the user required collections.

        Returns the list of needed collection IDs (primary collection first),
        or [] if one of the requested data tiers does not exist in RefDB.
        Exits with code 10 if RefDB itself cannot be queried.
        """
        ## download from RefDB all the info about the given dataset-owner
        refdb = RefDBInfo(self.owner, self.dataset)
        try:
            collInfos = refdb.GetRefDBInfo()
        except Exception:
            # RefDB unreachable or unparsable: fatal for data discovery
            sys.exit(10)

        first = 1
        NeededCollID = []
        refdbdataTiers = []
        for coll in collInfos:
            ## the first entry is the primary collection: always needed
            if first:
                NeededCollID.append(coll[0])
                self.NeededdataTiers.append(coll[2])
                refdbdataTiers.append(coll[2])
                common.logger.message("\n --> primary collection for owner " + self.owner + " is: ID=" + coll[0] + " DataTier=" + coll[2])
                first = 0
            else:
                ## select only the parent collections corresponding to data-tiers requested by the user
                if self.dataTiers.count(coll[2]):
                    NeededCollID.append(coll[0])
                    self.NeededdataTiers.append(coll[2])
                    common.logger.message(" --> further collection required: ID=" + coll[0] + " DataTier=" + coll[2])
                    refdbdataTiers.append(coll[2])

        ## check that the user asks for Data Tiers really existing in RefDB,
        ## otherwise give a warning message and return an empty list
        for dt in self.dataTiers:
            if refdbdataTiers.count(dt) <= 0:
                # BUG FIX: the original built 'msg' as a tuple ("…",dt,"…") and
                # then str()-ed it, and concatenated the list self.dataTiers to
                # a string (TypeError); build a plain string instead.
                msg = "ERROR: Data Tier ( =>" + dt + "<= ) not existing for dataset/owner " + self.dataset + "/" + self.owner + "! "
                msg = msg + 'Owner Dataset not published with asked dataTiers! ' + \
                      self.owner + ' ' + self.dataset + ' ' + str(self.dataTiers)
                msg = msg + ' Check the data_tier variable in crab.cfg !\n'
                common.logger.message(msg)
                return []

        return NeededCollID

    ########################################################################
    def findPubDBsbyCollID(self, CollID):
        """
        Find the list of PubDB URLs having a given Collection.

        Queries the central RefDB-PubDBs map and greps the response for
        'PubDBURL=...' lines. Raises RefDBmapError if the map is unreachable.
        """
        ### contact the RefDB-PubDBs map to discover where the given CollID is
        url = self.PubDBCentralUrl_ + self.RefDBPubDBsmapPhp_ + '&CollID=' + CollID
        try:
            f = urllib.urlopen(url)
        except IOError:
            raise RefDBmapError(url)

        ### search for the PubDBURL string
        reURLLine = re.compile(r'PubDBURL=(\S*)')

        PubDBURLs = []
        for line in f.readlines():
            if reURLLine.search(line):
                URLLine = reURLLine.search(line).group()
                PubDBURLs.append(string.split(URLLine, '=')[1])

        ### return the unique list of PubDB URLs where the collection is present
        return self.uniquelist(PubDBURLs)

    ################################################################
    def findPubDBs(self, CollIDs):
        """
        Find the list of PubDB URLs having ALL the required collections,
        filtered through the CE black list.
        """
        ### collect the PubDB URLs for every required collection
        allurls = []
        countColl = 0
        for CollID in CollIDs:
            countColl = countColl + 1
            allurls.extend(self.findPubDBsbyCollID(CollID))

        ### select only PubDB urls that contain all the collections:
        ### a URL qualifies when it occurs once per required collection
        unique_urls = self.uniquelist(allurls)
        SelectedPubDBURLs = []
        for url in unique_urls:
            if allurls.count(url) == countColl:
                SelectedPubDBURLs.append(url)

        #### check based on CE black list: keep only PubDBs not black-listed
        GoodPubDBURLs = self.checkBlackList(SelectedPubDBURLs)
        return GoodPubDBURLs

    #######################################################################
    def uniquelist(self, old):
        """
        remove duplicates from a list (order is not preserved)
        """
        nd = {}
        for e in old:
            nd[e] = 0
        # return a real list so the result behaves identically whether
        # dict.keys() yields a list (Python 2) or a view (Python 3)
        return list(nd.keys())

    #######################################################################
    def checkBlackList(self, pubDBUrls):
        """
        select PubDB URLs that are at sites not excluded by the user (via CE black list)
        """
        goodurls = []
        for url in pubDBUrls:
            common.logger.debug(10, 'connecting to the URL ' + url)
            good = 1
            # BUG FIX: the loop variable was named 're', shadowing the re module
            for pattern in self.reCEBlackList:
                if pattern.search(url):
                    common.logger.message('CE in black list, skipping PubDB URL ' + url)
                    good = 0
            if good: goodurls.append(url)
        if len(goodurls) == 0:
            common.logger.debug(3, "No selected PubDB URLs")
        return goodurls

    ########################################################################
    def checkPubDBNewVersion(self, baseurl):
        """
        Check PubDB version to find out if it's new-style or old-style.

        Returns 1 (new-style, >= V4) or 0 (old-style), based on the
        existence of the pubdb-get-version.php script.
        """
        urlversion = baseurl + 'pubdb-get-version.php'
        newversion = 1
        try:
            v = urllib2.urlopen(urlversion)
        except urllib2.URLError as msg:
            # no version script published: old-style PubDB
            newversion = 0

        if newversion:
            # consume the response (schema version currently unused)
            schemaversion = v.read()

        return newversion

    ########################################################################
    def getPubDBData(self, CollIDs, url, newversion):
        """
        Contact a PubDB to collect all the relevant information.

        New-style PubDBs are queried once for all collections (via PubDBInfo);
        old-style PubDBs are queried per collection and each raw response is
        wrapped in a pubDBResult. Raises NoPHPError / PubDBError on access
        problems with old-style PubDBs.
        """
        result = []

        ### get the base PubDB url (strip the trailing script name)
        end = string.rfind(url, '/')

        if newversion:
            ### from PubDB V4: get info for all the collections in one shot and unserialize the content
            Collections = string.join(CollIDs, '-')
            ## add the PU among the required Collections if the Digi are requested
            # (for the time being asked directly to PubDB, so the RefDB-level data
            #  discovery is bypassed; once every site runs new-style PubDB this
            #  can be asked for at RefDB level in findAllCollections)
            if self.NeededdataTiers.count('Digi'):
                PUCollID = self.getDatatierCollID(url[:end + 1], Collections, "PU")
                if PUCollID: CollIDs.append(PUCollID)
            Collections = string.join(CollIDs, '-')
            ### download from PubDB all the info about the given collections
            pubdb_analysis = PubDBInfo(url[:end + 1], Collections)
            ok = 0
            try:
                catInfos = pubdb_analysis.GetPubDBInfo()
                ok = 1
            except Exception:
                # deliberate best-effort: the PU may simply not be published there
                print('\nERROR extracting info for collections ' + Collections + ' from PubDB ' + url[:end + 1] + '. The PU might not be published at that site.\n')
            if ok: result = catInfos
        else:
            ### before PubDB V4: get info for each collection and read the key-value pair text
            for CollID in CollIDs:
                urlphp = url[:end + 1] + self.PubDBAnalysisPhp_ + '?CollID=' + CollID

                reOld = re.compile(r'V24')
                if reOld.search(urlphp):
                    # V24 PubDBs do not provide the analysis PHP script
                    raise NoPHPError(urlphp)
                else:
                    try:
                        f = urllib2.urlopen(urlphp)
                    # BUG FIX: HTTPError is a subclass of URLError, so it must be
                    # caught FIRST; the original order made this branch unreachable
                    except urllib2.HTTPError as msg:
                        print("WARNING: ", msg)
                        raise NoPHPError(urlphp)
                    except urllib2.URLError as msg:
                        print("WARNING: ", msg)
                        raise PubDBError(urlphp)
                    content = f.read()
                    result.append(pubDBResult(content))

        return result

    ########################################################################
    def getDatatierCollID(self, urlbase, CollIDString, datatier):
        """
        Contact a script of PubDB to retrieve the CollID of a DataTier.

        Returns the collection ID string, or None when the PubDB response
        contains no 'collid=' match. Raises PubDBGetAnalysisError when the
        script cannot be reached.
        """
        try:
            f = urllib.urlopen(urlbase + 'pubdb-get-collidbydatatier.php?collid=' + CollIDString + "&datatier=" + datatier)
        except IOError:
            # BUG FIX: the original handler referenced the undefined names
            # 'url' and 'end' here, raising NameError instead of the intended error
            raise PubDBGetAnalysisError(urlbase + 'pubdb-get-collidbydatatier.php', CollIDString)
        data = f.read()
        # BUG FIX: initialise so a non-matching response returns None instead
        # of raising NameError at the final return
        datatier_CollID = None
        colldata = re.compile(r'collid=(\S*)').search(data)
        if colldata:
            datatier_CollID = colldata.group(1)
            common.logger.message(" --> asking to PubDB " + urlbase + " for an additional collection : ID= " + datatier_CollID + " DataTier= " + datatier)

        return datatier_CollID

    ########################################################################
    def getAllPubDBData(self):
        """
        Contact a list of PubDBs to collect all the relevant information.

        Returns a dict with two keys: 'newPubDB' (results from new-style
        PubDBs) and 'oldPubDB' (results from old-style PubDBs).
        """
        newPubDBResult = []
        oldPubDBResult = []
        Result = {}

        ### find the user-required collection IDs
        CollIDs = self.findAllCollections()
        ### find the PubDB URLs publishing the needed data
        urllist = self.findPubDBs(CollIDs)
        ### collect information separately from new-style and old-style PubDBs
        for pubdburl in urllist:
            end = string.rfind(pubdburl, '/')
            newversion = self.checkPubDBNewVersion(pubdburl[:end + 1])
            if newversion:
                res = self.getPubDBData(CollIDs, pubdburl, newversion)
                if len(res) > 0:
                    newPubDBResult.append(res)
            else:
                resold = self.getPubDBData(CollIDs, pubdburl, newversion)
                if len(resold) > 0:
                    oldPubDBResult.append(resold)

        ### fill a dictionary with all the PubDB results, both old- and new-style
        Result['newPubDB'] = newPubDBResult
        Result['oldPubDB'] = oldPubDBResult

        return Result

####################################################################