1 |
#!/usr/bin/env python
|
2 |
import sys, os, string, re
|
3 |
import urllib, urllister
|
4 |
import urllib2
|
5 |
|
6 |
# ####################################
|
7 |
# Exception with use of refDB
|
8 |
class RefDBError(Exception):
    """Raised when RefDB cannot be queried for a given Owner/Dataset pair.

    Subclasses Exception so the error can be raised/caught portably and
    carries the diagnostic text as the exception message; the historical
    console print is preserved for interactive use.
    """

    def __init__(self, owner, dataset):
        Exception.__init__(self, 'ERROR accessing RefDB for Owner/Dataset: ' + owner + '/' + dataset)
        # keep the offending identifiers available to callers
        self.owner = owner
        self.dataset = dataset
        # historical console diagnostic (same text as before)
        print('\nERROR accessing RefDB for Owner/Dataset: ' + owner + '/' + dataset + '\n')
|
12 |
|
13 |
# ####################################
|
14 |
class PubDBError(Exception):
    """Raised when a (central or local) PubDB URL cannot be accessed.

    Subclasses Exception so the error can be raised/caught portably and
    carries the diagnostic text as the exception message; the historical
    console print is preserved for interactive use.
    """

    def __init__(self, url):
        Exception.__init__(self, 'ERROR accessing PubDB at ' + url)
        # keep the failing URL available to callers
        self.url = url
        # historical console diagnostic (same text as before)
        print('\nERROR accessing PubDB at ' + url + '\n')
|
18 |
|
19 |
# ####################################
|
20 |
class NoPHPError(Exception):
    """Raised when a local PubDB does not expose the updated PHP interface.

    Subclasses Exception so the error can be raised/caught portably and
    carries the diagnostic text as the exception message; the historical
    console print is preserved for interactive use.
    """

    def __init__(self, url):
        Exception.__init__(self, 'ERROR accessing PHP: ' + url + " isn't updated version")
        # keep the failing URL available to callers
        self.url = url
        # historical console diagnostic (spacing matches the old comma-print)
        print('ERROR accessing PHP:  ' + url + " isn't updated version \n")
|
25 |
|
26 |
# ####################################
|
27 |
# ####################################
class pubDBResult:
    """Plain holder for the raw text returned by a local PubDB query."""

    def __init__(self, contents):
        # keep the payload untouched; callers read .contents directly
        self.contents = contents

    def dump(self):
        """Debug helper: echo the stored contents to stdout."""
        # single-argument form producing the same output as the old
        # comma-separated print statement
        print('Contents :  %s' % (self.contents,))
|
35 |
|
36 |
# ####################################
# class to access PubDBs
class PubDB:
    """
    Resolve a Dataset/Owner pair to its RefDB collection ids and query
    every local PubDB site that publishes those collections.
    """

    def __init__(self, owner, dataset, dataTiers):
        # Dataset/Owner requested by the user
        self.owner = owner
        self.dataset = dataset
        # data tiers whose parent collections are also needed; may be empty
        self.dataTiers = dataTiers

        # central RefDB location and its PHP entry points
        self.RefDBurl_ = 'http://cmsdoc.cern.ch/cms/production/www/'
        self.RefDBphp_ = 'PubDB/GetIdCollection.php'
        self.RefDBMotherphp_ = 'cgi/SQL/CollectionTree.php'

        # central PubDB used to discover the local PubDBs publishing a collection
        self.PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
        self.PubDBCentralPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'

        # analysis-info script exposed by each local PubDB
        # NOTE(review): the "Old" variant is identical to the current one --
        # confirm whether a different (older) script name was intended
        self.PubDBAnalysisPhp_ = 'get-pubdb-analysisinfo.php'
        self.PubDBAnalysisPhpOld_ = 'get-pubdb-analysisinfo.php'

        # resolve the primary collection id and its needed parents now;
        # a PubDB access failure at this stage is reported to the caller
        # as a RefDB failure for the requested Owner/Dataset
        try:
            self.collid=self.findAllCollId()
        except PubDBError:
            raise RefDBError(self.owner, self.dataset)

    ########################################################################
    def findAllCollId(self):
        """
        Return the list of collection ids to fetch: the primary collection
        of self.owner/self.dataset first, followed by the ids of the parent
        collections matching the requested data tiers.
        """
        collId=self.findCollId()

        NeededCollID = []
        NeededCollID.append(collId)
        #dataTypeReq = ['Digi' ]#, 'Digi', 'Hit', 'PU']

        if len(self.dataTiers)>0:
            dataTypeReq = self.dataTiers
            # walk up the collection tree until the 'PU' tier is reached
            # (or there are no more parents)
            CollInfos=self.findMotherCollId(collId)
            # NOTE(review): assumes findMotherCollId returned at least two
            # entries here; a collection with no parents would raise
            # IndexError before the len(CollInfos)==1 guard below -- confirm
            while (CollInfos[1][2]!='PU'):
                for TypeReq in dataTypeReq:
                    # keep the first parent whose tier matches this request
                    for CollInfo in CollInfos[1:]:
                        if TypeReq==CollInfo[2]:
                            NeededCollID.append(CollInfo[0])
                            break
                    pass
                # climb one level, starting from the last inspected parent
                CollInfos=self.findMotherCollId(CollInfo[0])
                ### no more parents
                if len(CollInfos)==1:
                    break

        print NeededCollID
        return NeededCollID

    ########################################################################
    def findCollId(self):
        """
        Contact RefDB and get CollId given Dataset and Owner
        """

        # this info also comes from the cfg; it is the central PubDB
        url = self.RefDBurl_+self.RefDBphp_+'?Owner=' + self.owner + '&Dataset=' + self.dataset

        try:
            f = urllib.urlopen(url)
        except IOError:
            # print 'Cannot access URL: '+url
            raise PubDBError(url)

        # the page is expected to contain "...: <collid><..."; any other
        # layout makes the splits below fail and is reported as a PubDB error
        line = f.read()
        try:
            collid = string.split(line,": ")
            #part = string.strip(part[1])
            collid = string.split(collid[1],"<")
            collid = string.strip(collid[0])
        except IndexError:
            raise PubDBError(url)

        print 'CollectionId: '+collid+' \n'
        return collid

    ########################################################################
    def findMotherCollId(self, collid):
        """
        Contact RefDB and get CollId of mother of current Dataset Owner (eg. Digi if DST, Hit if Digi)
        """

        url = self.RefDBurl_+self.RefDBMotherphp_+'?cid=' + collid

        try:
            f = urllib.urlopen(url)
        except IOError:
            # print 'Cannot access URL: '+url
            raise PubDBError(url)

        reEmptyLine = re.compile( r'^$' )

        # each non-empty line of the reply is a comma-separated list of
        # key=value pairs; keep only the values, one list per line
        collInfos = []
        for line in f.readlines():
            #print '#',line,'#'
            line = string.strip(line)
            if reEmptyLine.match(line):
                pass
            else:
                #print '#',line,'#'
                keys = string.split(line,',')
                #print '#',keys,'#'
                collInfo = []
                for key in keys:
                    collInfo.append(string.split(key, '=')[1])
                collInfos.append(collInfo)

        return collInfos

    ########################################################################
    def getPubDBInfo(self, url):
        """
        Contact a local PubDB to collect all the relevant information
        """

        # NOTE(review): 'result' is never used in this method
        result = []
        # rebuild the local PubDB URL around the analysis-info script,
        # reusing the CollID taken from the original link
        end=string.rfind(url,'/')
        lastEq=string.rfind(url,'=')
        urlphp=url[:end+1]+self.PubDBAnalysisPhp_+'?CollID='+url[lastEq+1:]
        # print 'PHP URL: '+urlphp+' \n'

        # 'V24' in the URL marks an old PubDB without the updated PHP interface
        reOld=re.compile( r'V24' )
        #print urlphp,'Old PubDB ',reOld.search(urlphp)
        if reOld.search(urlphp):
            raise NoPHPError(urlphp)
#        try:
#            urldev=string.replace(urlphp,'V24','V3_1')
#            print "urldev URL: ",urldev
#            f = urllib2.urlopen(urldev)
#        except urllib2.HTTPError, msg:
#            raise NoPHPError(urldev)
        else:
            # an HTTP error from the script means no usable PHP interface
            try:
                f = urllib2.urlopen(urlphp)
            except urllib2.HTTPError, msg:
                raise NoPHPError(urlphp)

        content = f.read()
        return pubDBResult(content)

    ########################################################################
    def findPubDBsUrls(self):
        """
        Find the URL of the PubDB of all the sites which publish the collid
        """

        completeResult = []

        ### first collId is the primary one, (Dataset/Owner asker by user)
        ### The other CollIDs are the parents

        ### Get all the pubDb's URL containig the primary collection _and_ the requested parents
        primaryCollId=self.collid[0]

        primaryUrl = self.PubDBCentralUrl_+self.PubDBCentralPhp_+'?CollID=' + primaryCollId
        #print "primaryUrl=", primaryUrl

        try:
            sock = urllib.urlopen(primaryUrl)
        except IOError:
            raise PubDBError(primaryUrl)

        # collect every href published on the central PubDB page
        parser = urllister.URLLister()
        parser.feed(sock.read())
        sock.close()
        parser.close()

        # this are all the href links found in the page
        for url in parser.linksList:
            # return only those which contains "CollID"
            result=[]
            if string.find(url, primaryCollId) != -1 :
                #print 'URL ',url
                result.append(url)

                # add the same site's URLs for the parent collections;
                # a site that misses any of them is skipped entirely
                try:
                    for tmp in self.checkPubDBs(url):
                        result.append(tmp)
                except PubDBError:
                    continue

                completeResult.append(result)

        return completeResult

    #########################################################
    def checkPubDBs(self, url):
        """
        Check if the given PubDB contains also the CollId's collections
        """

        result = []
        primaryCollId=self.collid[0]

        # error markers a local PubDB may embed in its page body
        reNotCollId=re.compile( r'no such collection' )
        reDBError=re.compile( r'DB Error: connect failed' )
        for collid in self.collid[1:]:
            # same site, same script, parent collection id
            newurl = string.replace(url,primaryCollId,collid)
            try:
                sock = urllib.urlopen(newurl)
                for line in sock.readlines():
                    line = string.strip(line)
                    # PubDBError is not an IOError, so these propagate
                    # out of the enclosing try to the caller
                    if reNotCollId.search(line):
                        raise PubDBError(newurl)
                    if reDBError.search(line):
                        raise PubDBError(newurl)
            except IOError:
                raise PubDBError(newurl)
            sock.close()
            result.append(newurl)

        return result

    #########################################################
    def getAllPubDBsInfo(self):
        """
        Prepare the file to send in InputSandbox, with the info retrieved by local PubDBs
        """

        # get all the URLs of PubDBs which publish CollId
        pubDBUrlsSet = self.findPubDBsUrls()
        # print 'pubDBUrls ',pubDBUrlsSet

        # get the contents of each PubDBs
        completeResult = []
        for pubDBUrls in pubDBUrlsSet:
            result = []
            for url in pubDBUrls:
                # print 'URL ',url
                # sites without the updated PHP interface are skipped
                try:
                    result.append(self.getPubDBInfo(url))
                except NoPHPError:
                    continue
#            for r in result:
#                r.dump()
            if len(result)>0: completeResult.append(result)

        # print 'getAllPubDBsInfo ',completeResult
        # for result in completeResult:
        #     print '.....'
        #     for r in result:
        #         r.dump()
        # print '.....'
        return completeResult
|