1 |
#!/usr/bin/env python
|
2 |
import sys, os, string, re
|
3 |
import urllib, urllister
|
4 |
import urllib2
|
5 |
|
6 |
# ####################################
|
7 |
# Exception with use of refDB
|
8 |
class RefDBError(Exception):
    """Raised when RefDB cannot be queried for a given Owner/Dataset pair.

    Subclasses Exception so the error can be raised/caught portably and
    carries the diagnostic text as the exception message; the historical
    console print is preserved for interactive use.
    """

    def __init__(self, owner, dataset):
        Exception.__init__(self, 'ERROR accessing RefDB for Owner/Dataset: ' + owner + '/' + dataset)
        # keep the offending identifiers available to callers
        self.owner = owner
        self.dataset = dataset
        # historical console diagnostic (same text as before)
        print('\nERROR accessing RefDB for Owner/Dataset: ' + owner + '/' + dataset + '\n')
|
12 |
|
13 |
# ####################################
|
14 |
class PubDBError(Exception):
    """Raised when a (central or local) PubDB URL cannot be accessed.

    Subclasses Exception so the error can be raised/caught portably and
    carries the diagnostic text as the exception message; the historical
    console print is preserved for interactive use.
    """

    def __init__(self, url):
        Exception.__init__(self, 'ERROR accessing PubDB at ' + url)
        # keep the failing URL available to callers
        self.url = url
        # historical console diagnostic (same text as before)
        print('\nERROR accessing PubDB at ' + url + '\n')
|
18 |
|
19 |
# ####################################
|
20 |
class NoPHPError(Exception):
    """Raised when a local PubDB does not expose the updated PHP interface.

    Subclasses Exception so the error can be raised/caught portably and
    carries the diagnostic text as the exception message; the historical
    console print is preserved for interactive use.
    """

    def __init__(self, url):
        Exception.__init__(self, 'ERROR accessing PHP: ' + url + " isn't updated version")
        # keep the failing URL available to callers
        self.url = url
        # historical console diagnostic (spacing matches the old comma-print)
        print('ERROR accessing PHP:  ' + url + " isn't updated version \n")
|
25 |
|
26 |
# ####################################
|
27 |
# ####################################
class pubDBResult:
    """Plain holder for the raw text returned by a local PubDB query."""

    def __init__(self, contents):
        # keep the payload untouched; callers read .contents directly
        self.contents = contents

    def dump(self):
        """Debug helper: echo the stored contents to stdout."""
        # single-argument form producing the same output as the old
        # comma-separated print statement
        print('Contents :  %s' % (self.contents,))
|
35 |
|
36 |
# ####################################
# class to access PubDBs
class PubDB:
    """
    Resolve a Dataset/Owner pair to its RefDB collection ids and query
    every local PubDB site that publishes those collections.
    """

    def __init__(self, owner, dataset, dataTiers):
        # Dataset/Owner requested by the user
        self.owner = owner
        self.dataset = dataset
        # data tiers whose parent collections are also needed; may be empty
        self.dataTiers = dataTiers

        # central RefDB location and its PHP entry points
        self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
        self.RefDBphp_ = 'PubDB/GetIdCollection.php'
        self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'

        # central PubDB used to discover the local PubDBs publishing a collection
        self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
        self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'

        # analysis-info script exposed by each local PubDB
        # NOTE(review): the "Old" variant is identical to the current one --
        # confirm whether a different (older) script name was intended
        self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
        self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'

        # resolve the primary collection id and its needed parents now;
        # a PubDB access failure at this stage is reported to the caller
        # as a RefDB failure for the requested Owner/Dataset
        try:
            self.collid=self.findAllCollId()
        except PubDBError:
            raise RefDBError(self.owner, self.dataset)

    ########################################################################
    def findAllCollId(self):
        """
        Return the list of collection ids to fetch: the primary collection
        of self.owner/self.dataset first, followed by the ids of the parent
        collections matching the requested data tiers.
        """
        collId=self.findCollId()

        NeededCollID = []
        NeededCollID.append(collId)
        #dataTypeReq = ['Digi' ]#, 'Digi', 'Hit', 'PU']

        if len(self.dataTiers)>0:
            dataTypeReq = self.dataTiers
            # walk up the collection tree until the 'PU' tier is reached
            # (or there are no more parents)
            CollInfos=self.findMotherCollId(collId)
            # NOTE(review): assumes findMotherCollId returned at least two
            # entries here; a collection with no parents would raise
            # IndexError before the len(CollInfos)==1 guard below -- confirm
            while (CollInfos[1][2]!='PU'):
                for TypeReq in dataTypeReq:
                    # keep the first parent whose tier matches this request
                    for CollInfo in CollInfos[1:]:
                        if TypeReq==CollInfo[2]:
                            NeededCollID.append(CollInfo[0])
                            break
                    pass
                # climb one level, starting from the last inspected parent
                CollInfos=self.findMotherCollId(CollInfo[0])
                ### no more parents
                if len(CollInfos)==1:
                    break

        print NeededCollID
        return NeededCollID

    ########################################################################
    def findCollId(self):
        """
        Contact RefDB and get CollId given Dataset and Owner
        """

        # this info also comes from the cfg; it is the central PubDB
        url = self.RefDBurl_+self.RefDBphp_+'?Owner=' + self.owner + '&Dataset=' + self.dataset

        try:
            f = urllib.urlopen(url)
        except IOError:
            # print 'Cannot access URL: '+url
            raise PubDBError(url)

        # the page is expected to contain "...: <collid><..."; any other
        # layout makes the splits below fail and is reported as a PubDB error
        line = f.read()
        try:
            collid = string.split(line,": ")
            #part = string.strip(part[1])
            collid = string.split(collid[1],"<")
            collid = string.strip(collid[0])
        except IndexError:
            raise PubDBError(url)

        print 'CollectionId: '+collid+' \n'
        return collid

    ########################################################################
    def findMotherCollId(self, collid):
        """
        Contact RefDB and get CollId of mother of current Dataset Owner (eg. Digi if DST, Hit if Digi)
        """

        url = self.RefDBurl_+self.RefDBMotherphp_+'?cid=' + collid

        try:
            f = urllib.urlopen(url)
        except IOError:
            # print 'Cannot access URL: '+url
            raise PubDBError(url)

        reEmptyLine = re.compile( r'^$' )

        # each non-empty line of the reply is a comma-separated list of
        # key=value pairs; keep only the values, one list per line
        collInfos = []
        for line in f.readlines():
            #print '#',line,'#'
            line = string.strip(line)
            if reEmptyLine.match(line):
                pass
            else:
                #print '#',line,'#'
                keys = string.split(line,',')
                #print '#',keys,'#'
                collInfo = []
                for key in keys:
                    collInfo.append(string.split(key, '=')[1])
                collInfos.append(collInfo)

        return collInfos

    ########################################################################
    def getPubDBInfo(self, url):
        """
        Contact a local PubDB to collect all the relevant information
        """

        # NOTE(review): 'result' is never used in this method
        result = []
        # rebuild the local PubDB URL around the analysis-info script,
        # reusing the CollID taken from the original link
        end=string.rfind(url,'/')
        lastEq=string.rfind(url,'=')
        urlphp=url[:end+1]+self.PubDBAnalysisPhp_+'?CollID='+url[lastEq+1:]
        # print 'PHP URL: '+urlphp+' \n'

        # 'V24' in the URL marks an old PubDB without the updated PHP interface
        reOld=re.compile( r'V24' )
        #print urlphp,'Old PubDB ',reOld.search(urlphp)
        if reOld.search(urlphp):
            raise NoPHPError(urlphp)
#        try:
#            urldev=string.replace(urlphp,'V24','V3_1')
#            print "urldev URL: ",urldev
#            f = urllib2.urlopen(urldev)
#        except urllib2.HTTPError, msg:
#            raise NoPHPError(urldev)
        else:
            # an HTTP error from the script means no usable PHP interface
            try:
                f = urllib2.urlopen(urlphp)
            except urllib2.HTTPError, msg:
                raise NoPHPError(urlphp)

        content = f.read()
        return pubDBResult(content)

    ########################################################################
    def findPubDBsUrls(self):
        """
        Find the URL of the PubDB of all the sites which publish the collid
        """

        completeResult = []

        ### first collId is the primary one, (Dataset/Owner asker by user)
        ### The other CollIDs are the parents

        ### Get all the pubDb's URL containig the primary collection _and_ the requested parents
        primaryCollId=self.collid[0]

        primaryUrl = self.PubDBCentralUrl_+self.PubDBCentralPhp_+'?CollID=' + primaryCollId
        #print "primaryUrl=", primaryUrl

        try:
            sock = urllib.urlopen(primaryUrl)
        except IOError:
            raise PubDBError(primaryUrl)

        # collect every href published on the central PubDB page
        parser = urllister.URLLister()
        parser.feed(sock.read())
        sock.close()
        parser.close()

        # this are all the href links found in the page
        for url in parser.linksList:
            # return only those which contains "CollID"
            result=[]
            if string.find(url, primaryCollId) != -1 :
                #print 'URL ',url
                result.append(url)

                # add the same site's URLs for the parent collections;
                # a site that misses any of them is skipped entirely
                try:
                    for tmp in self.checkPubDBs(url):
                        result.append(tmp)
                except PubDBError:
                    continue

                completeResult.append(result)

        return completeResult

    #########################################################
    def checkPubDBs(self, url):
        """
        Check if the given PubDB contains also the CollId's collections
        """

        result = []
        primaryCollId=self.collid[0]

        # error markers a local PubDB may embed in its page body
        reNotCollId=re.compile( r'no such collection' )
        reDBError=re.compile( r'DB Error: connect failed' )
        for collid in self.collid[1:]:
            # same site, same script, parent collection id
            newurl = string.replace(url,primaryCollId,collid)
            try:
                sock = urllib.urlopen(newurl)
                for line in sock.readlines():
                    line = string.strip(line)
                    # PubDBError is not an IOError, so these propagate
                    # out of the enclosing try to the caller
                    if reNotCollId.search(line):
                        raise PubDBError(newurl)
                    if reDBError.search(line):
                        raise PubDBError(newurl)
            except IOError:
                raise PubDBError(newurl)
            sock.close()
            result.append(newurl)

        return result

    #########################################################
    def getAllPubDBsInfo(self):
        """
        Prepare the file to send in InputSandbox, with the info retrieved by local PubDBs
        """

        # get all the URLs of PubDBs which publish CollId
        pubDBUrlsSet = self.findPubDBsUrls()
        # print 'pubDBUrls ',pubDBUrlsSet

        # get the contents of each PubDBs
        completeResult = []
        for pubDBUrls in pubDBUrlsSet:
            result = []
            for url in pubDBUrls:
                # print 'URL ',url
                # sites without the updated PHP interface are skipped
                try:
                    result.append(self.getPubDBInfo(url))
                except NoPHPError:
                    continue
#            for r in result:
#                r.dump()
            if len(result)>0: completeResult.append(result)

        # print 'getAllPubDBsInfo ',completeResult
        # for result in completeResult:
        #     print '.....'
        #     for r in result:
        #         r.dump()
        # print '.....'
        return completeResult
|