1 |
#!/usr/bin/env python2
|
2 |
import sys, os, string, re
|
3 |
import urllib, urllister
|
4 |
import urllib2
|
5 |
import common
|
6 |
from RefDBInfo import RefDBInfo
|
7 |
|
8 |
# ####################################
|
9 |
class PubDBError(Exception):
    """Error raised when a PubDB URL cannot be accessed.

    The failing URL is stored in the ``url`` attribute and is part of the
    exception message.  The error text is also echoed on stdout when the
    exception object is created.
    """

    def __init__(self, url):
        """url -- the PubDB URL that could not be accessed (a string)."""
        Exception.__init__(self, 'ERROR accessing PubDB at ' + url)
        self.url = url
        print('\nERROR accessing PubDB at '+url+'\n')
|
13 |
|
14 |
# ####################################
|
15 |
class RefDBmapError(Exception):
    """Error raised when the RefDB-PubDBs map cannot be accessed.

    The failing URL is stored in the ``url`` attribute and is part of the
    exception message.  The error text is also echoed on stdout when the
    exception object is created.
    """

    def __init__(self, url):
        """url -- the RefDB-PubDBs map URL that could not be accessed (a string)."""
        Exception.__init__(self, 'ERROR accessing RefDB-PubDBs map at ' + url)
        self.url = url
        print('\nERROR accessing RefDB-PubDBs map at '+url+'\n')
|
19 |
|
20 |
# ####################################
|
21 |
class NoPHPError(Exception):
    """Error raised when a PHP script cannot be accessed at the given URL.

    The failing URL is stored in the ``url`` attribute and is part of the
    exception message.  The error text is also echoed on stdout when the
    exception object is created.
    """

    def __init__(self, url):
        """url -- the URL of the PHP script that could not be accessed."""
        Exception.__init__(self, 'ERROR accessing PHP: %s' % (url,))
        self.url = url
        # Single formatted string so the output is the same whether print
        # is a statement or a function.
        print('%s %s %s' % ('ERROR accessing PHP: ', url, ' \n'))
|
26 |
|
27 |
# ####################################
|
28 |
class pubDBResult:
    """Simple holder for one result payload, kept in ``contents``."""

    def __init__(self, contents):
        # The payload is stored untouched.
        self.contents = contents

    def dump(self):
        """Print the stored contents on stdout."""
        print('%s %s' % ('Contents : ', self.contents))
|
37 |
|
38 |
# ####################################
|
39 |
# class to access PubDBs
|
40 |
class PubDB:
    """
    Locate and query the PubDBs that publish a given owner/dataset.

    The collection IDs are obtained from RefDB (findAllCollections), the
    PubDB URLs holding them from the central RefDB-PubDBs map (findPubDBs)
    and the per-collection information from each PubDB (getPubDBData,
    getAllPubDBData).  PubDB URLs matching the user's CE black list are
    discarded.
    """

    def __init__(self, owner, dataset, dataTiers, cfg_params):
        """
        owner, dataset -- owner and dataset names passed to RefDBInfo
        dataTiers      -- sequence of the data tier names requested by the user
        cfg_params     -- configuration mapping; the optional key
                          'USER.ce_black_list' is a comma-separated list of
                          regular expressions matched against PubDB URLs
        """
        # Attributes
        self.owner = owner
        self.dataset = dataset
        self.dataTiers = dataTiers
        self.cfg_params = cfg_params

        self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
        self.RefDBphp_ = 'PubDB/GetIdCollection.php'
        self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'

        self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
        self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'

        self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
        self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'

        ## link to the modified RefDB-PubDBs map script that allows the display option
        self.RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.mod.php?display=1'

        # Constructor procedures: build the CE black list.
        # Empty entries (empty option value, doubled or trailing comma) are
        # skipped: an empty pattern matches every URL and would black-list
        # all the PubDBs.
        CEBlackList = []
        try:
            tmpBad = self.cfg_params['USER.ce_black_list'].split(',')
        except KeyError:
            tmpBad = []
        for tmp in tmpBad:
            tmp = tmp.strip()
            if tmp:
                CEBlackList.append(tmp)
        print('%s %s' % ('CEBlackList: ', CEBlackList))
        self.reCEBlackList = [re.compile(bad) for bad in CEBlackList]

    ########################################################################
    def findAllCollections(self):
        """
        Contact RefDB and find the CollID of all the user required collections

        Returns the list of needed collection IDs: the first collection
        reported by RefDB (the primary one) followed by every further
        collection whose data tier was requested by the user.  Returns an
        empty list, after logging an error, when a requested data tier is
        not among the selected collections.  Exits the process with status
        10 when the RefDB information cannot be retrieved.
        """
        ## download from RefDB all the info about the given dataset-owner
        refdb = RefDBInfo(self.owner, self.dataset)
        try:
            collInfos = refdb.GetRefDBInfo()
        except:
            # Kept deliberately broad: whatever goes wrong while querying
            # RefDB ends the run with exit status 10.
            sys.exit(10)

        NeededCollID = []
        refdbdataTiers = []
        for index, coll in enumerate(collInfos):
            # coll[0] is the collection ID, coll[2] its data tier
            collID, dataTier = coll[0], coll[2]
            if index == 0:
                ## the primary collection is always selected
                common.logger.message("\n --> primary collection for owner "+self.owner+" is: ID="+collID+" DataTier="+dataTier)
            elif self.dataTiers.count(dataTier):
                ## select only the parent collections corresponding to data-tiers requested by the user
                common.logger.message(" --> further collection required: ID="+collID+" DataTier="+dataTier)
            else:
                continue
            NeededCollID.append(collID)
            refdbdataTiers.append(dataTier)

        ## check that the user asks for Data Tiers really existing in RefDB, otherwise log an error
        for dt in self.dataTiers:
            if dt not in refdbdataTiers:
                msg = "ERROR: Data Tier ( =>" + str(dt) + "<= ) not existing for dataset/owner "
                msg = msg + self.dataset + "/" + self.owner + "! "
                msg = msg + "Check the data_tier variable in crab.cfg. "
                msg = msg + 'Owner Dataset not published with asked dataTiers! '
                msg = msg + self.owner + ' ' + self.dataset + ' ' + str(self.dataTiers)
                common.logger.message(msg)
                return []

        return NeededCollID

    ########################################################################
    def findPubDBsbyCollID(self, CollID):
        """
        Find the list of PubDB URLs having a given Collection

        CollID -- collection ID, as a string (it is appended to the query URL)

        Returns the PubDB URLs found in the RefDB-PubDBs map for that
        collection, without duplicates.  Raises RefDBmapError when the map
        URL cannot be opened.
        """
        ### contact the RefDB-PubDBs map to discover where the given CollID is
        url = self.PubDBCentralUrl_ + self.RefDBPubDBsmapPhp_ + '&CollID=' + CollID
        try:
            f = urllib.urlopen(url)
        except IOError:
            raise RefDBmapError(url)

        ### search for the PubDBURL string
        reURLLine = re.compile(r'PubDBURL=(\S*)')
        PubDBURLs = []
        try:
            for line in f.readlines():
                found = reURLLine.search(line)
                if found:
                    # keep what follows 'PubDBURL=' up to the next '=' (if any)
                    PubDBURLs.append(found.group().split('=')[1])
        finally:
            f.close()

        ### return the list of PubDBURL where the collection is present
        return self.uniquelist(PubDBURLs)

    ################################################################
    def findPubDBs(self, CollIDs):
        """
        Find the list of PubDB URLs having ALL the required collections

        CollIDs -- iterable of collection IDs

        Returns the PubDB URLs that hold every collection and are not
        excluded by the CE black list.
        """
        ### collect the PubDB URLs of every required collection
        allurls = []
        countColl = 0
        for CollID in CollIDs:
            countColl = countColl + 1
            # one entry per collection and URL, so the count below is exact
            allurls.extend(self.uniquelist(self.findPubDBsbyCollID(CollID)))

        ### a PubDB holds all the collections when its URL occurs once per collection
        SelectedPubDBURLs = [url for url in self.uniquelist(allurls)
                             if allurls.count(url) == countColl]

        #### select only PubDBs not in the CE black list
        return self.checkBlackList(SelectedPubDBURLs)

    #######################################################################
    def uniquelist(self, old):
        """
        remove duplicates from a list

        Returns a new list holding the first occurrence of every element
        of old, in the original order.  Elements must be hashable.
        """
        seen = {}
        unique = []
        for e in old:
            if e not in seen:
                seen[e] = 0
                unique.append(e)
        return unique

    #######################################################################
    def checkBlackList(self, pubDBUrls):
        """
        select PubDB URLs that are at sites not excluded by the user (via CE black list)

        Returns the URLs matching none of the compiled black-list patterns;
        every skipped URL is reported through the logger.
        """
        goodurls = []
        for url in pubDBUrls:
            print('%s %s' % ('connecting to the URL ', url))
            for badCE in self.reCEBlackList:
                if badCE.search(url):
                    common.logger.message('CE in black list, skipping PubDB URL '+url)
                    break
            else:
                # no black-list pattern matched this URL
                goodurls.append(url)
        if not goodurls:
            common.logger.debug(3, "No selected PubDB URLs")
        return goodurls

    ########################################################################
    def getPubDBData(self, CollIDs, url):
        """
        Contact a PubDB to collect all the relevant information

        CollIDs -- iterable of collection IDs (strings)
        url     -- URL of the PubDB; the analysis script is looked up in
                   the same directory (everything up to the last '/')

        Returns a list with one pubDBResult per collection.  Raises
        NoPHPError when the script URL contains 'V24' (presumably an old
        PubDB release without the script -- TODO confirm) or the server
        answers with an HTTP error, and PubDBError when the server cannot
        be reached.
        """
        baseurl = url[:url.rfind('/')+1]
        reOld = re.compile(r'V24')
        result = []
        for CollID in CollIDs:
            urlphp = baseurl + self.PubDBAnalysisPhp_ + '?CollID=' + CollID
            if reOld.search(urlphp):
                raise NoPHPError(urlphp)
            try:
                f = urllib2.urlopen(urlphp)
            except urllib2.HTTPError as msg:
                # HTTPError is a subclass of URLError: it must be tested first
                print('%s %s' % ('WARNING: ', msg))
                raise NoPHPError(urlphp)
            except urllib2.URLError as msg:
                print('%s %s' % ('WARNING: ', msg))
                raise PubDBError(urlphp)
            try:
                content = f.read()
            finally:
                f.close()
            result.append(pubDBResult(content))
        return result

    ########################################################################
    def getAllPubDBData(self, CollIDs, urllist):
        """
        Contact a list of PubDB to collect all the relevant information

        Returns a list with, for every URL of urllist and in the same
        order, the result of getPubDBData for that URL.
        """
        return [self.getPubDBData(CollIDs, pubdburl) for pubdburl in urllist]
|
270 |
####################################################################
|