1 |
#!/usr/bin/env python2
|
2 |
import sys, os, string, re
|
3 |
import urllib, urllister
|
4 |
import urllib2
|
5 |
import common
|
6 |
from RefDBInfo import RefDBInfo
|
7 |
from PubDBInfo import *
|
8 |
|
9 |
# ####################################
|
10 |
class PubDBError(Exception):
    """
    Raised when a PubDB URL cannot be accessed.

    Derives from Exception so that it carries a message and can be caught
    by generic handlers; `except PubDBError` keeps working as before.

    Attributes:
      url -- the URL that could not be accessed
    """
    def __init__(self, url):
        message = 'ERROR accessing PubDB at ' + url
        Exception.__init__(self, message)
        self.url = url
        # The error is still reported on stdout at construction time, exactly
        # as the original class did.
        print('\n' + message + '\n')
|
14 |
|
15 |
# ####################################
|
16 |
class PubDBGetAnalysisError(Exception):
    """
    Raised when the information for some collections cannot be extracted
    from a PubDB.

    Derives from Exception so that it carries a message and can be caught
    by generic handlers; `except PubDBGetAnalysisError` keeps working.

    Attributes:
      url         -- the PubDB URL that was contacted
      Collections -- string naming the collections that were requested
    """
    def __init__(self, url,Collections):
        message = ('ERROR extracting info for collections ' + Collections +
                   ' from PubDB ' + url + '.')
        Exception.__init__(self, message)
        self.url = url
        self.Collections = Collections
        # The error is still reported on stdout at construction time, exactly
        # as the original class did.
        print('\n' + message + '\n')
|
20 |
|
21 |
# ####################################
|
22 |
class RefDBmapError(Exception):
    """
    Raised when the RefDB-PubDBs map cannot be accessed.

    Derives from Exception so that it carries a message and can be caught
    by generic handlers; `except RefDBmapError` keeps working as before.

    Attributes:
      url -- the map URL that could not be accessed
    """
    def __init__(self, url):
        message = 'ERROR accessing RefDB-PubDBs map at ' + url
        Exception.__init__(self, message)
        self.url = url
        # The error is still reported on stdout at construction time, exactly
        # as the original class did.
        print('\n' + message + '\n')
|
26 |
|
27 |
# ####################################
|
28 |
class NoPHPError(Exception):
    """
    Raised when a PubDB PHP script cannot be accessed.

    Derives from Exception so that it carries a message and can be caught
    by generic handlers; `except NoPHPError` keeps working as before.

    Attributes:
      url -- the URL of the PHP script that could not be accessed
    """
    def __init__(self, url):
        Exception.__init__(self, 'ERROR accessing PHP: ' + str(url))
        self.url = url
        # Single-string form of the original multi-item print statement: the
        # items were joined by one blank, hence the doubled spaces here.
        print('ERROR accessing PHP:  ' + str(url) + '  \n')
|
33 |
|
34 |
# ####################################
|
35 |
class pubDBResult:
    """
    Minimal container wrapping one `contents` value.
    """
    def __init__(self, contents):
        # kept verbatim; no parsing or validation is done here
        self.contents = contents

    def dump(self):
        """
        Print the stored contents on standard output.
        """
        # label and value are separated by a single blank
        print(' '.join(['Contents : ', str(self.contents)]))
|
44 |
|
45 |
# ####################################
|
46 |
# class to access PubDBs
|
47 |
class PubDB:
    """
    Data discovery through RefDB and the PubDB servers.

    For a dataset/owner pair and a list of requested data tiers the class
    asks RefDB for the needed collection IDs, asks the central RefDB-PubDBs
    map which PubDB URLs publish all of them, drops the URLs excluded by the
    CE black list and finally downloads the collection information from
    every remaining PubDB.
    """

    def __init__(self, owner, dataset, dataTiers, cfg_params):
        """
        Store the request and compile the CE black list.

        owner, dataset -- identify the data; handed to RefDBInfo
        dataTiers      -- sequence of data tier names requested by the user
        cfg_params     -- mapping of configuration values; the optional key
                          'EDG.ce_black_list' is a comma separated list of
                          regular expressions searched in the PubDB URLs
        """
        # Attributes
        self.owner = owner
        self.dataset = dataset
        self.dataTiers = dataTiers
        self.NeededdataTiers=[]
        self.cfg_params = cfg_params

        self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
        self.RefDBphp_ = 'PubDB/GetIdCollection.php'
        self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'

        self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
        self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'

        self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
        self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'

        ## link to the modified RefDB-PubDBs map script that allows the display option
        self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'

        # Constructor procedures: build the list of black-listed patterns
        try:
            rawBlackList = self.cfg_params['EDG.ce_black_list']
        except KeyError:
            rawBlackList = ''

        CEBlackList = []
        for entry in rawBlackList.split(','):
            entry = entry.strip()
            if not entry:
                # An empty pattern matches every URL and would therefore
                # exclude every PubDB (empty value, trailing comma, ...).
                continue
            if entry == 'cnaf':
                entry = 'webserver'  ########## warning: temp. patch
            CEBlackList.append(entry)
        common.logger.debug(5,'CEBlackList: '+str(CEBlackList))

        self.reCEBlackList = [re.compile(bad) for bad in CEBlackList]

    ########################################################################
    def findAllCollections(self):
        """
        Contact RefDB and find the CollID of all the user required collections.

        The first collection returned by RefDB is always selected (primary
        collection); the following ones are selected only when their data
        tier is among self.dataTiers.  The data tier of every selected
        collection is appended to self.NeededdataTiers.

        Returns the list of selected collection IDs, or an empty list (after
        logging an error) when a requested data tier is not known to RefDB.
        Exits the process with status 10 when RefDB cannot be queried.
        """
        ## download from RefDB all the info about the given dataset-owner
        refdb=RefDBInfo(self.owner,self.dataset)
        try:
            collInfos=refdb.GetRefDBInfo()
        except:
            # Deliberately a bare except: the error types raised by RefDBInfo
            # are not visible here and may not derive from Exception.
            sys.exit(10)

        NeededCollID=[]
        refdbdataTiers=[]
        primary = True
        for coll in collInfos:
            collID = coll[0]
            tier = coll[2]
            refdbdataTiers.append(tier)
            if primary:
                ## select the primary collection
                primary = False
                NeededCollID.append(collID)
                self.NeededdataTiers.append(tier)
                common.logger.message("\n --> primary collection for owner "+self.owner+" is: ID="+collID+" DataTier="+tier)
            elif self.dataTiers.count(tier):
                ## select only the parent collections corresponding to data-tiers requested by the user
                NeededCollID.append(collID)
                self.NeededdataTiers.append(tier)
                common.logger.message(" --> further collection required: ID="+collID+" DataTier="+tier)

        ## check that the user asks for Data Tiers really existing in RefDB, otherwise log an error
        for dt in self.dataTiers:
            if refdbdataTiers.count(dt) <= 0:
                # Every piece is converted explicitly: dt and self.dataTiers
                # are not guaranteed to be plain strings.
                msg = "ERROR: Data Tier ( => "+str(dt)+" <= ) not existing for dataset/owner "+self.dataset+"/"+self.owner+"! "
                msg = msg + 'Owner Dataset not published with asked dataTiers! '+\
                      self.owner+' '+ self.dataset+' '+str(self.dataTiers)
                msg = msg + ' Check the data_tier variable in crab.cfg !\n'
                common.logger.message(msg)
                return []

        return NeededCollID

    ########################################################################
    def findPubDBsbyCollID(self,CollID):
        """
        Find the list of PubDB URLs having a given Collection.

        CollID -- collection ID as a string

        Returns the URLs without duplicates.
        Raises RefDBmapError when the RefDB-PubDBs map cannot be opened.
        """
        ### contact the RefDB-PubDBs map to discover where the given CollID is
        url = self.PubDBCentralUrl_+self.RefDBPubDBsmapPhp_+'&CollID=' + CollID
        try:
            f = urllib.urlopen(url)
        except IOError:
            raise RefDBmapError(url)

        ### search for the PubDBURL string
        reURLLine=re.compile( r'PubDBURL=(\S*)' )

        PubDBURLs = []
        try:
            for line in f.readlines():
                found = reURLLine.search(line)
                if found:
                    # Field 1 of the '=' split: the text after 'PubDBURL=' up
                    # to the next '=' (if any).
                    # NOTE(review): presumably this strips a trailing
                    # '=<CollID>' so that one PubDB gives the same string for
                    # every collection (findPubDBs relies on equal strings).
                    PubDBURLs.append(found.group().split('=')[1])
        finally:
            f.close()

        ### return the list of PubDBURL where the collection is present
        return self.uniquelist(PubDBURLs)

    ################################################################
    def findPubDBs(self,CollIDs):
        """
        Find the list of PubDB URLs having ALL the required collections.

        CollIDs -- list of collection IDs (strings)

        Returns the URLs that publish every collection and are not excluded
        by the CE black list.
        """
        ### prepare a list of all PubDB urls for all collections
        allurls=[]
        for CollID in CollIDs:
            allurls.extend(self.findPubDBsbyCollID(CollID))
        countColl = len(CollIDs)

        ### select only PubDB urls that contain all the collections:
        ### findPubDBsbyCollID returns each url at most once per collection,
        ### so a url published for every collection occurs countColl times
        SelectedPubDBURLs = [url for url in self.uniquelist(allurls)
                             if allurls.count(url) == countColl]

        #### check based on CE black list: select only PubDB not in the CE black list
        return self.checkBlackList(SelectedPubDBURLs)

    #######################################################################
    def uniquelist(self, old):
        """
        Remove duplicates from a list.

        Returns a new list holding each element of `old` once, in order of
        first appearance.  Elements must be hashable.
        """
        seen = {}
        unique = []
        for element in old:
            if element not in seen:
                seen[element] = 0
                unique.append(element)
        return unique

    #######################################################################
    def checkBlackList(self, pubDBUrls):
        """
        Select PubDB URLs that are at sites not excluded by the user (via CE black list).

        pubDBUrls -- list of URLs (strings)

        Returns the URLs in which none of the black-list patterns is found.
        """
        goodurls = []
        for url in pubDBUrls:
            common.logger.debug(10,'connecting to the URL '+url)
            for pattern in self.reCEBlackList:
                if pattern.search(url):
                    common.logger.message('CE in black list, skipping PubDB URL '+url)
                    # one match is enough: report the URL only once
                    break
            else:
                # no pattern matched
                goodurls.append(url)
        if not goodurls:
            common.logger.debug(3,"No selected PubDB URLs")
        return goodurls

    ########################################################################
    def checkPubDBNewVersion(self, baseurl):
        """
        Check PubDB version to find out if it's new-style or old-style.

        baseurl -- PubDB URL up to and including the last '/'

        Returns 1 when pubdb-get-version.php can be opened at baseurl
        (new-style), 0 otherwise.
        """
        ### check based on the existence of pubdb-get-version.php
        urlversion=baseurl+'pubdb-get-version.php'
        try:
            v = urllib2.urlopen(urlversion)
        except urllib2.URLError:
            return 0
        # Only the existence of the script matters, its reply is not used.
        v.close()
        return 1

    ########################################################################
    def getPubDBData(self, CollIDs, url , newversion):
        """
        Contact a PubDB to collect all the relevant information.

        CollIDs    -- list of collection IDs (strings); not modified
        url        -- PubDB URL; only the part up to the last '/' is used
        newversion -- true for a new-style PubDB, false for an old-style one

        New-style: returns what PubDBInfo.GetPubDBInfo() gives for all the
        collections at once, or an empty list (after printing an error) when
        that call fails.  When 'Digi' is among self.NeededdataTiers the PU
        collection reported by the PubDB is added to the request.

        Old-style: returns a list with one pubDBResult per collection.
        Raises NoPHPError when the script URL contains 'V24' or the server
        answers with an HTTP error, PubDBError on any other URL error.
        """
        result = []

        ### get the base PubDb url
        baseurl = url[:url.rfind('/')+1]

        if newversion:
            ### from PubDB V4 : get info for all the collections in one shot and unserialize the content
            # Work on a copy: the caller reuses the same list for every PubDB
            # and must not see the PU collection added below.
            neededCollIDs = list(CollIDs)
            ## add the PU among the required Collections if the Digi are requested
            # ( for the time being asking it directly to the PubDB so the RefDB
            #   level data discovery is bypassed..... in future when every site
            #   will have the new style it will be possible to ask for PU , at RefDB level, in method findAllCollections )
            if self.NeededdataTiers.count('Digi'):
                PUCollID=self.getDatatierCollID(baseurl,'-'.join(neededCollIDs),"PU")
                if PUCollID:
                    neededCollIDs.append(PUCollID)
            Collections='-'.join(neededCollIDs)
            ### download from PubDB all the info about the given collections
            pubdb_analysis=PubDBInfo(baseurl,Collections)
            try:
                result=pubdb_analysis.GetPubDBInfo()
            except:
                # Deliberately a bare except (best effort): the error types
                # raised by PubDBInfo are not visible here and may not derive
                # from Exception.  result stays empty.
                print('\nERROR extracting info for collections '+Collections+' from PubDB '+baseurl+'. The PU might not be published at that site.\n')

        else:
            ### before PubDB V4 : get info for each collection and read the key-value pair text
            reOld=re.compile( r'V24' )
            for CollID in CollIDs:
                urlphp=baseurl+self.PubDBAnalysisPhp_+'?CollID='+CollID
                if reOld.search(urlphp):
                    raise NoPHPError(urlphp)
                try:
                    f = urllib2.urlopen(urlphp)
                except urllib2.HTTPError as msg:
                    # HTTPError is a subclass of URLError and has to be
                    # handled first, otherwise this branch is never reached.
                    print('WARNING:  '+str(msg))
                    raise NoPHPError(urlphp)
                except urllib2.URLError as msg:
                    print('WARNING:  '+str(msg))
                    raise PubDBError(urlphp)
                try:
                    content = f.read()
                finally:
                    f.close()
                result.append(pubDBResult(content))

        return result

    ########################################################################
    def getDatatierCollID(self,urlbase,CollIDString,datatier):
        """
        Contact a script of PubDB to retrieve the collid of a DataTier.

        urlbase      -- PubDB URL up to and including the last '/'
        CollIDString -- collection IDs joined by '-'
        datatier     -- name of the data tier whose collection is wanted

        Returns the text following 'collid=' in the reply, or None when the
        reply contains no 'collid='.
        Raises PubDBGetAnalysisError when the script cannot be opened.
        """
        scripturl = urlbase+'pubdb-get-collidbydatatier.php'
        try:
            f = urllib.urlopen(scripturl+'?collid='+CollIDString+"&datatier="+datatier)
        except IOError:
            raise PubDBGetAnalysisError(scripturl,CollIDString)
        try:
            data = f.read()
        finally:
            f.close()

        colldata=re.compile(r'collid=(\S*)').search(data)
        if not colldata:
            # nothing published for this data tier: let the caller skip it
            return None

        datatier_CollID=colldata.group(1)
        common.logger.message(" --> asking to PubDB "+urlbase+" for an additional collection : ID= "+datatier_CollID+" DataTier= "+datatier)
        return datatier_CollID

    ########################################################################
    def getAllPubDBData(self):
        """
        Contact a list of PubDB to collect all the relevant information.

        Returns a dictionary with two keys, 'newPubDB' and 'oldPubDB', each
        holding the list of non-empty getPubDBData results obtained from the
        new-style and old-style PubDBs respectively.
        """
        newPubDBResult=[]
        oldPubDBResult=[]

        ### find the user-required collection IDs
        CollIDs = self.findAllCollections()
        ### find the PubDB URLs publishing the needed data
        urllist = self.findPubDBs(CollIDs)
        ### collect information separately from new-style PubDBs and old-style PubDBs
        for pubdburl in urllist:
            baseurl = pubdburl[:pubdburl.rfind('/')+1]
            newversion=self.checkPubDBNewVersion(baseurl)
            res=self.getPubDBData(CollIDs,pubdburl,newversion)
            if len(res) <= 0:
                continue
            if newversion:
                newPubDBResult.append(res)
            else:
                oldPubDBResult.append(res)

        ### fill a dictionary with all the PubDBs results both old-style and new-style
        Result={}
        Result['newPubDB']=newPubDBResult
        Result['oldPubDB']=oldPubDBResult
        return Result
|
375 |
|
376 |
####################################################################
|