ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cvsroot/COMP/CRAB/python/PubDB.py
Revision: 1.10
Committed: Thu Nov 3 15:24:29 2005 UTC (19 years, 5 months ago) by slacapra
Content type: text/x-python
Branch: MAIN
CVS Tags: CRAB_1_0_2, CRAB_0_2_2, CRAB_1_0_1, CRAB_1_0_0_rc1
Changes since 1.9: +1 -1 lines
Log Message:
str again

File Contents

# Content
1 #!/usr/bin/env python2
2 import sys, os, string, re
3 import urllib, urllister
4 import urllib2
5 import common
6 from RefDBInfo import RefDBInfo
7 from PubDBInfo import *
8
9 # ####################################
class PubDBError(Exception):
    """
    Error raised when a PubDB page cannot be accessed.

    Constructing the error prints a diagnostic naming the URL (legacy
    behaviour, kept as is); the same text is stored as the exception
    message and the URL is available as the `url` attribute.
    """
    def __init__(self, url):
        msg = '\nERROR accessing PubDB at '+url+'\n'
        Exception.__init__(self, msg)
        self.url = url
        # single parenthesised argument: same output as the Python 2 print statement
        print(msg)
14
15 # ####################################
class PubDBGetAnalysisError(Exception):
    """
    Error raised when the information for a set of collections cannot be
    extracted from a PubDB.

    url         -- address of the PubDB (or PubDB script) that was contacted
    Collections -- string identifying the requested collections

    Constructing the error prints a diagnostic (legacy behaviour, kept as
    is); the same text is stored as the exception message.
    """
    def __init__(self, url,Collections):
        msg = '\nERROR extracting info for collections '+Collections+' from PubDB '+url+'.\n'
        Exception.__init__(self, msg)
        self.url = url
        self.Collections = Collections
        # single parenthesised argument: same output as the Python 2 print statement
        print(msg)
20
21 # ####################################
class RefDBmapError(Exception):
    """
    Error raised when the RefDB-PubDBs map page cannot be accessed.

    Constructing the error prints a diagnostic naming the URL (legacy
    behaviour, kept as is); the same text is stored as the exception
    message and the URL is available as the `url` attribute.
    """
    def __init__(self, url):
        msg = '\nERROR accessing RefDB-PubDBs map at '+url+'\n'
        Exception.__init__(self, msg)
        self.url = url
        # single parenthesised argument: same output as the Python 2 print statement
        print(msg)
26
27 # ####################################
class NoPHPError(Exception):
    """
    Error raised when a PubDB PHP script cannot be used.

    Constructing the error prints a diagnostic naming the URL (legacy
    behaviour, kept as is); the same text is stored as the exception
    message and the URL is available as the `url` attribute.
    """
    def __init__(self, url):
        # The original statement printed three comma-separated items; the
        # doubled blanks below reproduce the separators it inserted.
        msg = 'ERROR accessing PHP:  %s  \n' % (url,)
        Exception.__init__(self, msg)
        self.url = url
        print(msg)
33
34 # ####################################
class pubDBResult:
    """
    Thin wrapper around the raw text returned by one PubDB query.

    contents -- the value handed to the constructor, stored unchanged
    """
    def __init__(self, contents):
        self.contents = contents

    def dump(self):
        """
        Print the stored contents on standard output
        """
        # one pre-formatted string gives the same output as the original
        # two-item print statement (label, blank separator, value)
        print('Contents :  %s' % (self.contents,))
44
45 # ####################################
46 # class to access PubDBs
class PubDB:
    """
    Class to access PubDBs.

    For a given owner/dataset pair it asks RefDB for the collection IDs,
    asks the central RefDB-PubDBs map which PubDB URLs publish them,
    filters those URLs through the CE black/white lists found in
    cfg_params and finally downloads the information from each PubDB.

    This is still Python 2 code (urllib.urlopen, urllib2); only the syntax
    has been kept neutral.
    """
    def __init__(self, owner, dataset, dataTiers, cfg_params):
        """
        owner, dataset -- identify the data to look for in RefDB
        dataTiers      -- list of additional data tiers requested by the user
        cfg_params     -- mapping of configuration values; the optional keys
                          'EDG.ce_black_list' and 'EDG.ce_white_list' hold
                          comma-separated regular expressions
        """
        # Attributes
        self.owner = owner
        self.dataset = dataset
        self.dataTiers = dataTiers
        self.NeededdataTiers=[]
        self.cfg_params = cfg_params

        self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
        self.RefDBphp_ = 'PubDB/GetIdCollection.php'
        self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'

        self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
        self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'

        self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
        self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'

        ## link to the modified RefDB-PubDBs map script that allow the display option
        self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'

        # Constructor procedures
        CEBlackList = self._readCEList('EDG.ce_black_list')
        CEWhiteList = self._readCEList('EDG.ce_white_list')

        self.reCEWhiteList = [re.compile(good) for good in CEWhiteList]

        common.logger.debug(5,'CEBlackList: '+str(CEBlackList))
        common.logger.debug(5,'CEWhiteList: '+str(CEWhiteList))
        self.reCEBlackList = [re.compile(bad) for bad in CEBlackList]

    ########################################################################
    def _readCEList(self, key):
        """
        Return the comma-separated cfg_params entry 'key' as a list of
        stripped strings; an absent key gives an empty list
        """
        try:
            raw = self.cfg_params[key]
        except KeyError:
            return []
        entries = []
        for item in raw.split(','):
            item = item.strip()
            # an empty entry (empty option, trailing comma) would compile to
            # a pattern matching every URL, so it is skipped
            if item:
                entries.append(item)
        return entries

    ########################################################################
    def findAllCollections(self):
        """
        Contact RefDB and find the CollID of all the user required collections

        Returns the list of needed collection IDs, or an empty list when one
        of the requested data tiers is not among those reported by RefDB.
        Exits the process with status 10 if RefDB cannot be queried.
        """
        ## download from RefDB all the info about the given dataset-owner
        refdb=RefDBInfo(self.owner,self.dataset)
        try:
            collInfos=refdb.GetRefDBInfo()
        except:
            # catch-all kept on purpose: the exception types RefDBInfo may
            # raise are not visible from this file
            sys.exit(10)

        first=1
        NeededCollID=[]
        refdbdataTiers=[]
        for coll in collInfos:
            collid = coll[0]
            tier = coll[2]
            ## select the primary collection
            if first:
                NeededCollID.append(collid)
                self.NeededdataTiers.append(tier)
                common.logger.message("\n --> primary collection for owner "+self.owner+" is: ID="+collid+" DataTier="+tier)
                first=0
            ## select only the parents collections corresponding to data-tiers requested by the user
            elif self.dataTiers.count(tier):
                NeededCollID.append(collid)
                self.NeededdataTiers.append(tier)
                common.logger.message(" --> further collection required: ID="+collid+" DataTier="+tier)
            refdbdataTiers.append(tier)

        ## check that the user asks for Data Tier really existing in RefDB, otherwise give a warning message
        for dt in self.dataTiers:
            if refdbdataTiers.count(dt)<=0:
                # built by concatenation: the former comma-separated form
                # produced a tuple, which was logged as its repr
                msg = "ERROR: Data Tier ( => "+str(dt)+" <= ) not existing for dataset/owner "+self.dataset+"/"+self.owner+"! "
                msg = msg + 'Owner Dataset not published with asked dataTiers! '+\
                      self.owner+' '+ self.dataset+' '+str(self.dataTiers)+'\n'
                msg = msg + ' Check the data_tier variable in crab.cfg !\n'
                common.logger.message(msg)
                return []

        return NeededCollID

    ########################################################################
    def findPubDBsbyCollID(self,CollID):
        """
        Find the list of PubDB URLs having a given Collection

        Raises RefDBmapError if the map page cannot be opened.
        """
        ### contact the RefDB-PubDBs map to discovery where the given CollID is
        url = self.PubDBCentralUrl_+self.RefDBPubDBsmapPhp_+'&CollID=' + CollID
        try:
            f = urllib.urlopen(url)
        except IOError:
            raise RefDBmapError(url)

        ### search for the PubDBURL string
        reURLLine=re.compile( r'PubDBURL=(\S*)' )

        PubDBURLs = []
        try:
            for line in f.readlines():
                found = reURLLine.search(line)
                if found:
                    # keep the text between the first and the second '=',
                    # exactly as the original split did
                    PubDBURLs.append(found.group().split('=')[1])
        finally:
            f.close()

        ### return the list of PubDBURL where the collection is present
        return self.uniquelist(PubDBURLs)

    ################################################################
    def findPubDBs(self,CollIDs):
        """
        Find the list of PubDB URLs having ALL the required collections

        The URLs that survive are then filtered through the CE black list
        and the CE white list.
        """
        ### prepare a list all PubDB urls for all collections
        allurls=[]
        for CollID in CollIDs :
            allurls.extend(self.findPubDBsbyCollID(CollID))
        countColl=len(CollIDs)

        ### select only PubDB urls that contains all the collections
        SelectedPubDBURLs=[]
        for url in self.uniquelist(allurls) :
            # findPubDBsbyCollID returns each URL at most once per collection,
            # so a URL publishing everything occurs once per collection
            if allurls.count(url)==countColl :
                SelectedPubDBURLs.append(url)
        common.logger.debug(5,'PubDBs '+str(SelectedPubDBURLs))

        #### check based on CE black list: select only PubDB not in the CE black list
        tmp=self.checkBlackList(SelectedPubDBURLs)
        common.logger.debug(5,'PubDBs after black list '+str(tmp))

        ### check based on CE white list: select only PubDB defined by user
        GoodPubDBURLs=self.checkWhiteList(tmp)
        common.logger.debug(5,'PubDBs after white list '+str(GoodPubDBURLs))
        return GoodPubDBURLs

    #######################################################################
    def uniquelist(self, old):
        """
        remove duplicates from a list

        Returns a new list keeping the first occurrence of every element,
        in the order of appearance. Elements must be hashable.
        """
        seen={}
        unique=[]
        for e in old:
            if e not in seen:
                seen[e]=0
                unique.append(e)
        return unique

    #######################################################################
    def checkWhiteList(self, pubDBUrls):
        """
        select PubDB URLs that are at site defined by the user (via CE white list)

        With an empty white list the input list is returned untouched.
        """
        if not self.reCEWhiteList: return pubDBUrls
        goodurls = []
        for url in pubDBUrls:
            good=0
            # the loop variable used to be called 're', hiding the module
            for pattern in self.reCEWhiteList:
                if pattern.search(url):
                    common.logger.debug(5,'CE in white list, adding PubDB URL '+url)
                    good=1
            if good: goodurls.append(url)
        if len(goodurls) == 0:
            common.logger.message("No selected PubDB URLs \n")
        else:
            common.logger.debug(5,"Selected PubDB URLs are "+str(goodurls)+"\n")
        return goodurls

    #######################################################################
    def checkBlackList(self, pubDBUrls):
        """
        select PubDB URLs that are at site not exluded by the user (via CE black list)

        With an empty black list the input list is returned untouched.
        """
        if not self.reCEBlackList: return pubDBUrls
        goodurls = []
        for url in pubDBUrls:
            common.logger.debug(10,'connecting to the URL '+url)
            good=1
            # the loop variable used to be called 're', hiding the module
            for pattern in self.reCEBlackList:
                if pattern.search(url):
                    common.logger.message('CE in black list, skipping PubDB URL '+url)
                    good=0
            if good: goodurls.append(url)
        if len(goodurls) == 0:
            common.logger.debug(3,"No selected PubDB URLs")
        return goodurls

    ########################################################################
    def checkPubDBNewVersion(self, baseurl):
        """
        Check PubDB version to find out if it's new-style or old-style

        Returns 1 when pubdb-get-version.php can be opened under baseurl,
        0 otherwise.
        """
        ### check based on the existance of pubdb-get-version.php
        urlversion=baseurl+'pubdb-get-version.php'
        try:
            v = urllib2.urlopen(urlversion)
        except urllib2.URLError:
            return 0
        # only the existence of the page matters; its content was never used
        v.close()
        return 1

    ########################################################################
    def getPubDBData(self, CollIDs, url , newversion):
        """
        Contact a PubDB to collect all the relevant information

        CollIDs    -- list of collection ID strings (left unmodified)
        url        -- PubDB URL; everything up to its last '/' is the base
        newversion -- true for a new-style PubDB (one request for all the
                      collections), false for an old-style one (one request
                      per collection)

        Returns what PubDBInfo.GetPubDBInfo() gives for a new-style PubDB
        (an empty list if that call fails), or a list of pubDBResult for
        an old-style one.
        Raises NoPHPError / PubDBError for an old-style PubDB whose script
        is unusable or unreachable.
        """
        result = []

        ### get the base PubDb url
        end=url.rfind('/')
        baseurl=url[:end+1]

        if (newversion) :
            ### from PubDB V4 : get info for all the collections in one shot and unserialize the content
            # work on a copy: the caller reuses its list for every PubDB and
            # must not inherit the PU collection found at this site
            wanted=list(CollIDs)
            Collections='-'.join(wanted)
            ## add the PU among the required Collections if the Digi are requested
            # ( for the time being asking it directly to the PubDB so the RefDB
            # level data discovery is bypassed..... in future when every site
            # will have the new style it will be possible to ask for PU , at RefDB level, in method findAllCollections )
            if ( self.NeededdataTiers.count('Digi') ):
                PUCollID=self.getDatatierCollID(baseurl,Collections,"PU")
                if (PUCollID) :
                    wanted.append(PUCollID)
                    Collections='-'.join(wanted)
            ### download from PubDB all the info about the given collections
            pubdb_analysis=PubDBInfo(baseurl,Collections)
            try:
                result=pubdb_analysis.GetPubDBInfo()
            except :
                # catch-all kept on purpose: the exception types PubDBInfo may
                # raise are not visible from this file; best effort, the site
                # is simply reported and skipped
                result=[]
                print('\nERROR extracting info for collections '+Collections+' from PubDB '+baseurl+'. The PU might not be published at that site.\n')

        else:
            ### before PubDB V4 : get info for each collection and read the key-value pair text
            reOld=re.compile( r'V24' )
            for CollID in CollIDs:
                urlphp=baseurl+self.PubDBAnalysisPhp_+'?CollID='+CollID
                if reOld.search(urlphp):
                    raise NoPHPError(urlphp)
                try:
                    f = urllib2.urlopen(urlphp)
                # HTTPError derives from URLError, so it has to be tested
                # first or its handler can never run
                except urllib2.HTTPError:
                    print('WARNING:  %s' % (sys.exc_info()[1],))
                    raise NoPHPError(urlphp)
                except urllib2.URLError:
                    print('WARNING:  %s' % (sys.exc_info()[1],))
                    raise PubDBError(urlphp)
                try:
                    content = f.read()
                finally:
                    f.close()
                result.append(pubDBResult(content))

        return result

    ########################################################################
    def getDatatierCollID(self,urlbase,CollIDString,datatier):
        """
        Contact a script of PubDB to retrieve the collid a DataTier

        Returns the collection ID found in the answer, or None when the
        answer holds no 'collid=' field.
        Raises PubDBGetAnalysisError if the script cannot be opened.
        """
        scripturl=urlbase+'pubdb-get-collidbydatatier.php'
        try:
            f = urllib.urlopen(scripturl+'?collid='+CollIDString+"&datatier="+datatier)
        except IOError:
            # the original referred here to names that do not exist in this method
            raise PubDBGetAnalysisError(scripturl,CollIDString)
        try:
            data = f.read()
        finally:
            f.close()
        colldata=re.compile(r'collid=(\S*)').search(data)
        if not colldata:
            return None
        datatier_CollID=colldata.group(1)
        common.logger.message(" --> asking to PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier)
        return datatier_CollID

    ########################################################################
    def getAllPubDBData(self):
        """
        Contact a list of PubDB to collect all the relevant information

        Returns a dictionary with two keys, 'newPubDB' and 'oldPubDB', each
        holding the list of non-empty results obtained from the PubDBs of
        that style.
        """
        newPubDBResult=[]
        oldPubDBResult=[]
        Result={}

        ### find the user-required collection IDs
        CollIDs = self.findAllCollections()
        ### find the PubDB URLs publishing the needed data
        urllist = self.findPubDBs(CollIDs)
        ### collect information separately from new-style PubDBs and old-style PubDBs
        for pubdburl in urllist:
            end=pubdburl.rfind('/')
            newversion=self.checkPubDBNewVersion(pubdburl[:end+1])
            res=self.getPubDBData(CollIDs,pubdburl,newversion)
            if len(res)>0:
                if (newversion):
                    newPubDBResult.append(res)
                else:
                    oldPubDBResult.append(res)
        ### fill a dictionary with all the PubBDs results both old-style and new-style
        Result['newPubDB']=newPubDBResult
        Result['oldPubDB']=oldPubDBResult

        return Result
423
424 ####################################################################