root/cvsroot/COMP/CRAB/python/PubDB.py
Revision: 1.6
Committed: Tue Aug 23 11:14:24 2005 UTC by slacapra
Content type: text/x-python
Branch: MAIN
CVS Tags: CRAB_1_0_0_beta4, CRAB_1_0_0_pre1_boss_2, CRAB_1_0_0_pre1_boss, CRAB_1_0_0_pre3, CRAB_1_0_0_pre2
Changes since 1.5: +1 -0 lines
Log Message:
add PubDBInfo

File Contents

#!/usr/bin/env python2
import sys, os, string, re
import urllib, urllister
import urllib2
import common
from RefDBInfo import RefDBInfo
from PubDBInfo import *

# ####################################
class PubDBError:
    def __init__(self, url):
        print '\nERROR accessing PubDB at '+url+'\n'
        pass

# ####################################
class PubDBGetAnalysisError:
    def __init__(self, url, Collections):
        print '\nERROR extracting info for collections '+Collections+' from PubDB '+url+'.\n'
        pass

# ####################################
class RefDBmapError:
    def __init__(self, url):
        print '\nERROR accessing RefDB-PubDBs map at '+url+'\n'
        pass

# ####################################
class NoPHPError:
    def __init__(self, url):
        #print '\nERROR accessing PHP at '+url+' \n'
        print 'ERROR accessing PHP: ',url,' \n'
        pass

# ####################################
class pubDBResult:
    def __init__(self,
                 contents):
        self.contents=contents


    def dump(self):
        print 'Contents : ',self.contents
        pass

# ####################################
# class to access PubDBs
class PubDB:
    def __init__(self, owner, dataset, dataTiers, cfg_params):

        # Attributes
        self.owner = owner
        self.dataset = dataset
        self.dataTiers = dataTiers
        self.NeededdataTiers=[]
        self.cfg_params = cfg_params

        self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
        self.RefDBphp_ = 'PubDB/GetIdCollection.php'
        self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'

        self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
        self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'

        self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
        self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'

        ## link to the modified RefDB-PubDBs map script that allows the display option
        self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'

        # Constructor procedures

        CEBlackList = []
        try:
            tmpBad = string.split(self.cfg_params['EDG.ce_black_list'],',')
            #tmpBad = ['fnal']
            for tmp in tmpBad:
                tmp=string.strip(tmp)
                if (tmp == 'cnaf'): tmp = 'webserver' ########## warning: temp. patch
                CEBlackList.append(tmp)
        except KeyError:
            pass
        common.logger.debug(5,'CEBlackList: '+str(CEBlackList))
        self.reCEBlackList=[]
        for bad in CEBlackList:
            self.reCEBlackList.append(re.compile( bad ))
        #print 'ReBad: ',self.reCEBlackList

    ########################################################################
    def findAllCollections(self):
        """
        Contact RefDB and find the CollID of all the user-required collections
        """
        ## download from RefDB all the info about the given dataset-owner
        refdb=RefDBInfo(self.owner,self.dataset)
        #print refdb.GetRefDBInfo()
        try:
            collInfos=refdb.GetRefDBInfo()
        except :
            sys.exit(10)
        #print "collInfos=", collInfos

        first=1
        NeededCollID=[]
        refdbdataTiers=[]
        for coll in collInfos:
            ## select the primary collection
            if first:
                NeededCollID.append(coll[0])
                self.NeededdataTiers.append(coll[2])
                refdbdataTiers.append(coll[2])
                common.logger.message("\n --> primary collection for owner "+self.owner+" is: ID="+coll[0]+" DataTier="+coll[2])
                first=0
            else:
                ## select only the parent collections corresponding to the data tiers requested by the user
                if self.dataTiers.count(coll[2]):
                    NeededCollID.append(coll[0])
                    self.NeededdataTiers.append(coll[2])
                    common.logger.message(" --> further collection required: ID="+coll[0]+" DataTier="+coll[2])
                    refdbdataTiers.append(coll[2])

        ## check that the user asks for Data Tiers that really exist in RefDB, otherwise give a warning message
        for dt in self.dataTiers:
            if refdbdataTiers.count(dt)<=0:
                msg = 'ERROR: Data Tier ( => '+dt+' <= ) not existing for dataset/owner '+self.dataset+'/'+self.owner+'! '
                msg = msg + 'Owner Dataset not published with the requested dataTiers! '+\
                      self.owner+' '+self.dataset+' '+str(self.dataTiers)
                msg = msg + ' Check the data_tier variable in crab.cfg !\n'
                common.logger.message(msg)
                return []

        #print 'Needed Collections are ', NeededCollID
        #return collInfos
        #print "NeededCollID= ", NeededCollID
        return NeededCollID

    ########################################################################
    def findPubDBsbyCollID(self,CollID):
        """
        Find the list of PubDB URLs having a given collection
        """
        ### contact the RefDB-PubDBs map to discover where the given CollID is
        url = self.PubDBCentralUrl_+self.RefDBPubDBsmapPhp_+'&CollID=' + CollID
        # print "%s"%(url)
        try:
            f = urllib.urlopen(url)
        except IOError:
            # print 'Cannot access URL: '+url
            raise RefDBmapError(url)

        ### search for the PubDBURL string
        reURLLine=re.compile( r'PubDBURL=(\S*)' )

        PubDBURLs = []
        for line in f.readlines():
            #print '#',line,'#'
            if reURLLine.search(line) :
                URLLine=reURLLine.search(line).group()
                #print string.split(URLLine,'=')[1]
                PubDBURLs.append(string.split(URLLine,'=')[1])

        ### return the list of PubDB URLs where the collection is present
        #return PubDBURLs
        return self.uniquelist(PubDBURLs)

    ################################################################
    def findPubDBs(self,CollIDs):
        """
        Find the list of PubDB URLs having ALL the required collections
        """
        ### loop over all the required collections
        #pubdbmap={}
        allurls=[]
        countColl=0
        for CollID in CollIDs :
            countColl=countColl+1
            ### map the CollectionID with the list of PubDB URLs
            #pubdbmap[CollID]=self.findPubDBsbyCollID(CollID)
            ### prepare a list of all PubDB URLs for all collections
            allurls.extend(self.findPubDBsbyCollID(CollID))
        #print pubdbmap.values()

        ### select only PubDB URLs that contain all the collections
        unique_urls=self.uniquelist(allurls)
        SelectedPubDBURLs=[]
        # loop on a unique list of PubDB URLs
        for url in unique_urls :
            # check that the PubDB URL occurrence is the same as the number of collections
            if ( allurls.count(url)==countColl ) :
                SelectedPubDBURLs.append(url)

        #print 'Required Collections',CollIDs,'are all present in PubDBURLs : ',SelectedPubDBURLs,'\n'
        #### check based on CE black list: select only PubDBs not in the CE black list
        GoodPubDBURLs=self.checkBlackList(SelectedPubDBURLs)
        return GoodPubDBURLs

    #######################################################################
    def uniquelist(self, old):
        """
        remove duplicates from a list
        """
        nd={}
        for e in old:
            nd[e]=0
        return nd.keys()

    #######################################################################
    def checkBlackList(self, pubDBUrls):
        """
        select PubDB URLs that are at sites not excluded by the user (via CE black list)
        """
        goodurls = []
        for url in pubDBUrls:
            common.logger.debug(10,'connecting to the URL '+url)
            good=1
            for reBad in self.reCEBlackList:
                if reBad.search(url):
                    common.logger.message('CE in black list, skipping PubDB URL '+url)
                    good=0
                    pass
            if good: goodurls.append(url)
        if len(goodurls) == 0:
            common.logger.debug(3,"No selected PubDB URLs")
        return goodurls

    ########################################################################
    def checkPubDBNewVersion(self, baseurl):
        """
        Check the PubDB version to find out if it's new-style or old-style
        """
        ### check based on the existence of pubdb-get-version.php
        urlversion=baseurl+'pubdb-get-version.php'
        newversion=1
        try:
            v = urllib2.urlopen(urlversion)
        except urllib2.URLError, msg:
            #print "WARNING: no URL to get PubDB version "
            newversion=0

        if (newversion) :
            schemaversion = v.read()
            #print schemaversion

        return newversion

    ########################################################################
    def getPubDBData(self, CollIDs, url , newversion):
        """
        Contact a PubDB to collect all the relevant information
        """
        result = []

        ### get the base PubDB url
        end=string.rfind(url,'/')
        lastEq=string.rfind(url,'=')

        if (newversion) :
            ### from PubDB V4 on: get info for all the collections in one shot and unserialize the content
            Collections=string.join(CollIDs,'-')
            ## add the PU among the required collections if the Digis are requested
            #  ( for the time being this is asked directly to the PubDB, so the RefDB-level
            #    data discovery is bypassed..... in the future, when every site has the new style,
            #    it will be possible to ask for PU at RefDB level, in method findAllCollections )
            if ( self.NeededdataTiers.count('Digi') ):
                PUCollID=self.getDatatierCollID(url[:end+1],Collections,"PU")
                if (PUCollID) : CollIDs.append(PUCollID)
            ##
            Collections=string.join(CollIDs,'-')
            ### download from PubDB all the info about the given collections
            pubdb_analysis=PubDBInfo(url[:end+1],Collections)
            #print pubdb_analysis.GetPubDBInfo()
            ok=0
            try:
                catInfos=pubdb_analysis.GetPubDBInfo()
                ok=1
            except :
                #print "WARNING: can't get PubDB content out of "+url[:end+1]+"\n"
                print '\nERROR extracting info for collections '+Collections+' from PubDB '+url[:end+1]+'. The PU might not be published at that site.\n'
                #raise PubDBGetAnalysisError(url[:end+1],Collections)
            if (ok): result=catInfos

        else:

            ### before PubDB V4: get info for each collection and read the key-value pair text

            for CollID in CollIDs:
                urlphp=url[:end+1]+self.PubDBAnalysisPhp_+'?CollID='+CollID
                # print 'PHP URL: '+urlphp+' \n'

                reOld=re.compile( r'V24' )
                #print urlphp,'Old PubDB ',reOld.search(urlphp)
                if reOld.search(urlphp):
                    raise NoPHPError(urlphp)
                else:
                    try:
                        f = urllib2.urlopen(urlphp)
                    except urllib2.HTTPError, msg:
                        print "WARNING: ", msg
                        raise NoPHPError(urlphp)
                    except urllib2.URLError, msg:
                        print "WARNING: ", msg
                        raise PubDBError(urlphp)
                    content = f.read()
                    result.append(pubDBResult(content))
                    #print "Coll",CollID," content ",content
                    pass
                pass

        #print '.....'
        #for r in result:
        #    r.dump()
        #print '.....'
        return result

    ########################################################################
    def getDatatierCollID(self,urlbase,CollIDString,datatier):
        """
        Contact a script of PubDB to retrieve the CollID of a DataTier
        """
        try:
            f = urllib.urlopen(urlbase+'pubdb-get-collidbydatatier.php?collid='+CollIDString+"&datatier="+datatier)
        except IOError:
            raise PubDBGetAnalysisError(urlbase+'pubdb-get-collidbydatatier.php',CollIDString)
        data = f.read()
        colldata=re.compile(r'collid=(\S*)').search(data)
        if colldata:
            datatier_CollID=colldata.group(1)
            # print " --> asking to PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier
            common.logger.message(" --> asking PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier)

            return datatier_CollID

    ########################################################################
    def getAllPubDBData(self):
        """
        Contact a list of PubDBs to collect all the relevant information
        """
        newPubDBResult=[]
        oldPubDBResult=[]
        Result={}

        ### find the user-required collection IDs
        CollIDs = self.findAllCollections()
        ### find the PubDB URLs publishing the needed data
        urllist = self.findPubDBs(CollIDs)
        ### collect information separately from new-style PubDBs and old-style PubDBs
        for pubdburl in urllist:
            end=string.rfind(pubdburl,'/')
            newversion=self.checkPubDBNewVersion(pubdburl[:end+1])
            if (newversion):
                res=self.getPubDBData(CollIDs,pubdburl,newversion)
                if len(res)>0:
                    newPubDBResult.append(res)
            else:
                resold=self.getPubDBData(CollIDs,pubdburl,newversion)
                if len(resold)>0:
                    oldPubDBResult.append(resold)
        ### fill a dictionary with all the PubDB results, both old-style and new-style
        Result['newPubDB']=newPubDBResult
        Result['oldPubDB']=oldPubDBResult

        ## print for debugging purposes
        #
        #for PubDBversion in Result.keys():
        #    #print ("key %s, val %s" %(PubDBversion,Result[PubDBversion]))
        #    if len(Result[PubDBversion])>0 :
        #        #print (" key %s"%(PubDBversion))
        #        for result in Result[PubDBversion]:
        #            for r in result:
        #                #r.dump()
        #                common.log.write('----------------- \n')
        #    #print '.....................................'

        return Result

    ####################################################################
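
For orientation, here is a minimal usage sketch of this class, not part of the file above: the owner/dataset names and the cfg_params contents are hypothetical, and it assumes a CRAB session in which common.logger has been initialized and RefDBInfo/PubDBInfo are importable, as in this module.

# Minimal usage sketch (hypothetical owner/dataset names and CE black list;
# assumes common.logger is set up by the CRAB environment).
from PubDB import PubDB

cfg_params = {'EDG.ce_black_list': 'fnal,in2p3'}   # optional; omit the key to disable the black list
pubdb = PubDB('MyProductionOwner', 'MyDataset', ['Digi', 'Hit'], cfg_params)

# Query RefDB for the needed collections, locate the PubDBs publishing them,
# and collect their catalogue information.
data = pubdb.getAllPubDBData()
print len(data['newPubDB']), 'new-style PubDB replies'
print len(data['oldPubDB']), 'old-style PubDB replies'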