1 |
#!/usr/bin/env python2
|
2 |
import sys, os, string, re
|
3 |
import urllib, urllister
|
4 |
import urllib2
|
5 |
import common
|
6 |
from RefDBInfo import RefDBInfo
|
7 |
from PubDBInfo import *
|
8 |
|
9 |
# ####################################
|
10 |
class PubDBError(Exception):
    """
    Error signalling a failure while accessing a PubDB at a given URL.

    Derives from Exception so that it can be raised and caught like any
    other exception and carries its message. The message is still printed
    on construction, as it always was.

    Attributes:
        url -- the URL that could not be accessed
    """
    def __init__(self, url):
        msg = '\nERROR accessing PubDB at '+url+'\n'
        Exception.__init__(self, msg)
        self.url = url
        # Printing at construction time is the historical behaviour of this class.
        print(msg)
|
14 |
|
15 |
# ####################################
|
16 |
class PubDBGetAnalysisError(Exception):
    """
    Error signalling that the information for a set of collections could
    not be extracted from a PubDB.

    Derives from Exception so that it can be raised and caught like any
    other exception and carries its message. The message is still printed
    on construction, as it always was.

    Attributes:
        url         -- the PubDB URL that was contacted
        Collections -- the collections string that was asked for
    """
    def __init__(self, url,Collections):
        msg = '\nERROR extracting info for collections '+Collections+' from PubDB '+url+'.\n'
        Exception.__init__(self, msg)
        self.url = url
        self.Collections = Collections
        # Printing at construction time is the historical behaviour of this class.
        print(msg)
|
20 |
|
21 |
# ####################################
|
22 |
class RefDBmapError(Exception):
    """
    Error signalling a failure while accessing the RefDB-PubDBs map at a
    given URL.

    Derives from Exception so that it can be raised and caught like any
    other exception and carries its message. The message is still printed
    on construction, as it always was.

    Attributes:
        url -- the URL that could not be accessed
    """
    def __init__(self, url):
        msg = '\nERROR accessing RefDB-PubDBs map at '+url+'\n'
        Exception.__init__(self, msg)
        self.url = url
        # Printing at construction time is the historical behaviour of this class.
        print(msg)
|
26 |
|
27 |
# ####################################
|
28 |
class NoPHPError(Exception):
    """
    Error signalling a failure while accessing a PHP script at a given URL.

    Derives from Exception so that it can be raised and caught like any
    other exception and carries its message. The message is still printed
    on construction, as it always was.

    Attributes:
        url -- the URL of the PHP script that could not be accessed
    """
    def __init__(self, url):
        # Same text the former multi-item print statement produced
        # (items separated by a single space, trailing ' \n').
        msg = 'ERROR accessing PHP:  %s  \n' % (url,)
        Exception.__init__(self, msg)
        self.url = url
        print(msg)
|
33 |
|
34 |
# ####################################
|
35 |
class pubDBResult:
    """
    Simple holder for the raw contents obtained from a PubDB.
    """
    def __init__(self, contents):
        # Keep whatever was handed in, untouched.
        self.contents = contents

    def dump(self):
        """
        Print the stored contents on standard output.
        """
        print('Contents :  %s' % (self.contents,))
|
44 |
|
45 |
# ####################################
|
46 |
# class to access PubDBs
|
47 |
class PubDB:
    """
    Client for the RefDB / PubDB web services.

    Given an owner/dataset pair and the data tiers requested by the user,
    it finds the needed collection IDs (through RefDBInfo), discovers which
    PubDB URLs publish all of them, filters those URLs with the CE black
    and white lists found in cfg_params, and downloads the per-PubDB
    information.
    """

    def __init__(self, owner, dataset, dataTiers, cfg_params):
        """
        Store the request parameters and compile the CE black/white lists.

        owner, dataset -- passed to RefDBInfo to look up the collections
        dataTiers      -- data tiers requested by the user (iterated and
                          queried with .count(), so a list is expected)
        cfg_params     -- mapping; the optional keys 'EDG.ce_black_list'
                          and 'EDG.ce_white_list' hold comma-separated
                          regular expressions matched against PubDB URLs
        """
        # Attributes
        self.owner = owner
        self.dataset = dataset
        self.dataTiers = dataTiers
        self.NeededdataTiers = []
        self.cfg_params = cfg_params

        self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
        self.RefDBphp_ = 'PubDB/GetIdCollection.php'
        self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'

        self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
        self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'

        self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
        self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'

        ## link to the modified RefDB-PubDBs map script that allows the display option
        self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'

        # Constructor procedures
        CEBlackList = self._readCEList('EDG.ce_black_list')
        CEWhiteList = self._readCEList('EDG.ce_white_list')

        self.reCEWhiteList = [re.compile(good) for good in CEWhiteList]

        common.logger.debug(5,'CEBlackList: '+str(CEBlackList))
        common.logger.debug(5,'CEWhiteList: '+str(CEWhiteList))

        self.reCEBlackList = [re.compile(bad) for bad in CEBlackList]

    ########################################################################
    def _readCEList(self, key):
        """
        Return the stripped, non-empty comma-separated entries of
        cfg_params[key], or an empty list when the key is not defined.
        """
        try:
            raw = self.cfg_params[key]
        except KeyError:
            return []
        entries = []
        for item in raw.split(','):
            item = item.strip()
            # An empty entry (blank value, trailing comma) would compile to
            # an empty pattern, which matches every URL: skip it.
            if item:
                entries.append(item)
        return entries

    ########################################################################
    def findAllCollections(self):
        """
        Contact RefDB and find the CollID of all the user required collections

        Returns the list of needed collection IDs: the first collection
        reported by RefDB plus every further one whose data tier was
        requested. Returns an empty list (after logging a message) when a
        requested data tier is not among those reported by RefDB.
        Terminates the program with exit code 10 when RefDB cannot be read.
        """
        ## download from RefDB all the info about the given dataset-owner
        refdb = RefDBInfo(self.owner, self.dataset)
        try:
            collInfos = refdb.GetRefDBInfo()
        except:
            # Deliberately broad: any failure while reading RefDB is fatal.
            # NOTE(review): presumably RefDBInfo has already reported the
            # problem to the user -- confirm.
            sys.exit(10)

        first = 1
        NeededCollID = []
        refdbdataTiers = []
        for coll in collInfos:
            # coll[0] is used as the collection ID, coll[2] as its data tier
            if first:
                ## select the primary collection
                NeededCollID.append(coll[0])
                self.NeededdataTiers.append(coll[2])
                refdbdataTiers.append(coll[2])
                common.logger.message("\n --> primary collection for owner "+self.owner+" is: ID="+coll[0]+" DataTier="+coll[2])
                first = 0
            else:
                ## select only the parents collections corresponding to data-tiers requested by the user
                if self.dataTiers.count(coll[2]):
                    NeededCollID.append(coll[0])
                    self.NeededdataTiers.append(coll[2])
                    common.logger.message(" --> further collection required: ID="+coll[0]+" DataTier="+coll[2])
                refdbdataTiers.append(coll[2])

        ## check that the user asks for Data Tier really existing in RefDB, otherwise give a warning message
        for dt in self.dataTiers:
            if refdbdataTiers.count(dt) <= 0:
                # Built with explicit str() conversions: the former code
                # created a tuple and concatenated the dataTiers list to a
                # string, which failed instead of reporting the problem.
                msg = "ERROR: Data Tier ( => "+str(dt)+" <= ) not existing for dataset/owner "+self.dataset+"/"+self.owner+"! "
                msg = msg + 'Owner Dataset not published with asked dataTiers! '+\
                      self.owner+' '+self.dataset+' '+str(self.dataTiers)
                msg = msg + ' Check the data_tier variable in crab.cfg !\n'
                common.logger.message(msg)
                return []

        return NeededCollID

    ########################################################################
    def findPubDBsbyCollID(self,CollID):
        """
        Find the list of PubDB URLs having a given Collection

        Raises RefDBmapError when the RefDB-PubDBs map cannot be opened.
        """
        ### contact the RefDB-PubDBs map to discover where the given CollID is
        url = self.PubDBCentralUrl_+self.RefDBPubDBsmapPhp_+'&CollID=' + CollID
        try:
            f = urllib.urlopen(url)
        except IOError:
            raise RefDBmapError(url)

        ### search for the PubDBURL string
        reURLLine = re.compile( r'PubDBURL=(\S*)' )

        PubDBURLs = []
        try:
            for line in f.readlines():
                found = reURLLine.search(line)
                if found:
                    # Keep only the text between the first and the second
                    # '=' of the match. This truncation is left as it was:
                    # findPubDBs compares the resulting URLs across
                    # collections. NOTE(review): presumably it strips a
                    # per-collection query value -- confirm.
                    PubDBURLs.append(found.group().split('=')[1])
        finally:
            f.close()

        ### return the list of PubDBURL where the collection is present
        return self.uniquelist(PubDBURLs)

    ################################################################
    def findPubDBs(self,CollIDs):
        """
        Find the list of PubDB URLs having ALL the required collections

        The URLs are then filtered with the CE black list and the CE white
        list, in that order.
        """
        ### prepare a list of all PubDB urls for all the required collections
        allurls = []
        countColl = 0
        for CollID in CollIDs:
            countColl = countColl+1
            allurls.extend(self.findPubDBsbyCollID(CollID))

        ### select only PubDB urls that contain all the collections:
        ### each collection contributes a URL at most once, so a URL seen
        ### countColl times is published for every collection
        SelectedPubDBURLs = []
        for url in self.uniquelist(allurls):
            if allurls.count(url) == countColl:
                SelectedPubDBURLs.append(url)
        common.logger.debug(5,'PubDBs '+str(SelectedPubDBURLs))

        #### check based on CE black list: select only PubDB not in the CE black list
        tmp = self.checkBlackList(SelectedPubDBURLs)
        common.logger.debug(5,'PubDBs after black list '+str(tmp))

        ### check based on CE white list: select only PubDB defined by user
        GoodPubDBURLs = self.checkWhiteList(tmp)
        common.logger.debug(5,'PubDBs after white list '+str(GoodPubDBURLs))
        return GoodPubDBURLs

    #######################################################################
    def uniquelist(self, old):
        """
        remove duplicates from a list

        Returns a new list holding each element of old once, in order of
        first appearance. Elements must be hashable.
        """
        seen = {}
        unique = []
        for e in old:
            if e not in seen:
                seen[e] = 1
                unique.append(e)
        return unique

    #######################################################################
    def checkWhiteList(self, pubDBUrls):
        """
        select PubDB URLs that are at site defined by the user (via CE white list)

        When no white list is configured no restriction applies and all
        the URLs are returned.
        """
        if not self.reCEWhiteList:
            # Without this guard an absent white list rejected every URL.
            return list(pubDBUrls)

        goodurls = []
        for url in pubDBUrls:
            for pattern in self.reCEWhiteList:
                if pattern.search(url):
                    common.logger.debug(5,'CE in white list, adding PubDB URL '+url)
                    goodurls.append(url)
                    # one matching pattern is enough
                    break
        if len(goodurls) == 0:
            common.logger.message("No selected PubDB URLs \n")
        else:
            common.logger.debug(5,"Selected PubDB URLs are "+str(goodurls)+"\n")
        return goodurls

    #######################################################################
    def checkBlackList(self, pubDBUrls):
        """
        select PubDB URLs that are at site not excluded by the user (via CE black list)
        """
        goodurls = []
        for url in pubDBUrls:
            common.logger.debug(10,'connecting to the URL '+url)
            good = 1
            for pattern in self.reCEBlackList:
                if pattern.search(url):
                    common.logger.message('CE in black list, skipping PubDB URL '+url)
                    good = 0
                    # one matching pattern is enough
                    break
            if good:
                goodurls.append(url)
        if len(goodurls) == 0:
            common.logger.debug(3,"No selected PubDB URLs")
        return goodurls

    ########################################################################
    def checkPubDBNewVersion(self, baseurl):
        """
        Check PubDB version to find out if it's new-style or old-style

        Returns 1 when baseurl+'pubdb-get-version.php' can be opened,
        0 when opening it raises urllib2.URLError.
        """
        ### check based on the existence of pubdb-get-version.php
        urlversion = baseurl+'pubdb-get-version.php'
        try:
            v = urllib2.urlopen(urlversion)
        except urllib2.URLError:
            return 0

        try:
            # The schema version is read, as before, but its value is not used.
            v.read()
        finally:
            v.close()
        return 1

    ########################################################################
    def getPubDBData(self, CollIDs, url , newversion):
        """
        Contact a PubDB to collect all the relevant information

        CollIDs    -- collection IDs to ask for (not modified)
        url        -- PubDB URL; everything up to and including its last
                      '/' is used as the base URL
        newversion -- true for a new-style PubDB, false for an old-style one

        Returns what PubDBInfo.GetPubDBInfo() gives for a new-style PubDB
        (an empty list when that call fails), or a list of pubDBResult,
        one per collection, for an old-style PubDB.
        Raises NoPHPError or PubDBError when an old-style PubDB script
        cannot be read.
        """
        result = []

        # Work on a private copy: the PU collection appended below must not
        # leak into the caller's list, which is reused for other PubDBs.
        CollIDs = list(CollIDs)

        ### get the base PubDB url
        baseurl = url[:url.rfind('/')+1]

        if newversion:
            ### from PubDB V4 : get info for all the collections in one shot and unserialize the content
            Collections = '-'.join(CollIDs)
            ## add the PU among the required Collections if the Digi are requested
            # ( for the time being asking it directly to the PubDB so the RefDB
            #   level data discovery is bypassed..... in future when every site
            #   will have the new style it will be possible to ask for PU , at RefDB level, in method findAllCollections )
            if self.NeededdataTiers.count('Digi'):
                PUCollID = self.getDatatierCollID(baseurl,Collections,"PU")
                if PUCollID:
                    CollIDs.append(PUCollID)
            ##
            Collections = '-'.join(CollIDs)
            ### download from PubDB all the info about the given collections
            pubdb_analysis = PubDBInfo(baseurl,Collections)
            try:
                result = pubdb_analysis.GetPubDBInfo()
            except (KeyboardInterrupt, SystemExit):
                # never hide a user interrupt or an explicit exit
                raise
            except:
                # Best effort: report the problem and return an empty result.
                # Kept broad on purpose so that exception classes that do not
                # derive from Exception are still handled.
                print('\nERROR extracting info for collections '+Collections+' from PubDB '+baseurl+'. The PU might not be published at that site.\n')

        else:
            ### before PubDB V4 : get info for each collection and read the key-value pair text
            reOld = re.compile( r'V24' )
            for CollID in CollIDs:
                urlphp = baseurl+self.PubDBAnalysisPhp_+'?CollID='+CollID

                # URLs containing 'V24' are rejected without being contacted
                if reOld.search(urlphp):
                    raise NoPHPError(urlphp)

                try:
                    f = urllib2.urlopen(urlphp)
                except urllib2.HTTPError:
                    # HTTPError is a subclass of URLError, so it has to be
                    # handled first or this clause is never reached.
                    print("WARNING:  %s" % (sys.exc_info()[1],))
                    raise NoPHPError(urlphp)
                except urllib2.URLError:
                    print("WARNING:  %s" % (sys.exc_info()[1],))
                    raise PubDBError(urlphp)

                try:
                    content = f.read()
                finally:
                    f.close()
                result.append(pubDBResult(content))

        return result

    ########################################################################
    def getDatatierCollID(self,urlbase,CollIDString,datatier):
        """
        Contact a script of PubDB to retrieve the collid of a DataTier

        Returns the collection ID found in the answer (the text following
        'collid='), or None when the answer does not contain one.
        Raises PubDBGetAnalysisError when the script cannot be opened.
        """
        script = urlbase+'pubdb-get-collidbydatatier.php'
        try:
            f = urllib.urlopen(script+'?collid='+CollIDString+"&datatier="+datatier)
        except IOError:
            raise PubDBGetAnalysisError(script,CollIDString)

        try:
            data = f.read()
        finally:
            f.close()

        # None tells the caller that no collection was found
        datatier_CollID = None
        colldata = re.compile(r'collid=(\S*)').search(data)
        if colldata:
            datatier_CollID = colldata.group(1)
            common.logger.message(" --> asking to PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier)

        return datatier_CollID

    ########################################################################
    def getAllPubDBData(self):
        """
        Contact a list of PubDB to collect all the relevant information

        Returns a dictionary with two keys, 'newPubDB' and 'oldPubDB', each
        holding the list of non-empty results obtained from the new-style
        and the old-style PubDBs respectively.
        """
        newPubDBResult = []
        oldPubDBResult = []

        ### find the user-required collection IDs
        CollIDs = self.findAllCollections()
        ### find the PubDB URLs publishing the needed data
        urllist = self.findPubDBs(CollIDs)
        ### collect information separately from new-style PubDBs and old-style PubDBs
        for pubdburl in urllist:
            end = pubdburl.rfind('/')
            newversion = self.checkPubDBNewVersion(pubdburl[:end+1])
            res = self.getPubDBData(CollIDs,pubdburl,newversion)
            if len(res) > 0:
                if newversion:
                    newPubDBResult.append(res)
                else:
                    oldPubDBResult.append(res)

        ### fill a dictionary with all the PubDBs results both old-style and new-style
        Result = {}
        Result['newPubDB'] = newPubDBResult
        Result['oldPubDB'] = oldPubDBResult

        return Result
|
421 |
|
422 |
####################################################################
|